Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Text preprocessing using spaCy
import re
from typing import List
import spacy
from spacy.tokens import Doc
from tqdm import tqdm
class SpacyPreprocessor:
    """Clean raw text with a spaCy pipeline.

    Depending on the flags passed to the constructor, the cleaner drops
    tokens by part-of-speech tag, number-like and currency tokens and
    stopwords. It always drops punctuation, whitespace, quote and bracket
    tokens, strips non-ASCII characters and lower-cases the result.
    """

    # Everything except ASCII letters and the apostrophe.
    _NON_ALPHA_RE = re.compile(r"[^a-zA-Z\']")
    # Runs of characters outside the 7-bit ASCII range.
    _NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")

    # NOTE(review): the parameter list was missing from the reviewed copy of
    # this file. The names are taken from the body and the docstring; the
    # default values are a reconstruction -- confirm against callers.
    def __init__(
        self,
        spacy_model=None,
        remove_numbers=False,
        remove_special=True,
        pos_to_remove=None,
        remove_stopwords=False,
        lemmatize=False,
    ):
        """
        Preprocesses text using spaCy
        :param spacy_model: Loaded spaCy pipeline to use; when None,
            "en_core_web_sm" is loaded
        :param remove_numbers: Whether to remove numbers from text
        :param remove_special: Whether to remove special characters (including numbers)
        :param pos_to_remove: list of PoS tags to remove
        :param remove_stopwords: Whether to remove stopwords from text
        :param lemmatize: Whether to apply lemmatization
        """
        self._remove_numbers = remove_numbers
        self._pos_to_remove = pos_to_remove
        self._remove_stopwords = remove_stopwords
        self._remove_special = remove_special
        self._lemmatize = lemmatize

        # Only pay the cost of loading a model when the caller supplied none.
        if spacy_model is None:
            self.model = spacy.load("en_core_web_sm")
        else:
            self.model = spacy_model

    @staticmethod
    def download_spacy_model(model="en_core_web_sm"):
        """
        Download a spaCy model package
        :param model: Name of the spaCy model to download
        """
        # Local import: `spacy.cli` is only needed for this one-off helper.
        from spacy.cli import download

        print(f"Downloading spaCy model {model}")
        download(model)
        print("Finished downloading model")

    @staticmethod
    def load_model(model="en_core_web_sm"):
        """
        Load a spaCy model with the "ner" and "parser" components disabled
        :param model: Name of the spaCy model to load
        :return: the loaded spaCy pipeline
        """
        return spacy.load(model, disable=["ner", "parser"])

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize text using a spaCy pipeline
        :param text: Text to tokenize
        :return: list of str
        """
        return [token.text for token in self.model(text)]

    def preprocess_text(self, text: str) -> str:
        """
        Runs a spaCy pipeline and removes unwanted parts from text
        :param text: text string to clean
        :return: str, clean text
        """
        return self.__clean(self.model(text))

    def preprocess_text_list(self, texts: List[str]) -> List[str]:
        """
        Runs a spaCy pipeline and removes unwanted parts from a list of texts.
        Leverages spaCy's `pipe` for faster batch processing.
        :param texts: List of texts to clean
        :return: List of clean texts
        """
        return [self.__clean(doc) for doc in tqdm(self.model.pipe(texts))]

    def _keep(self, token) -> bool:
        """Return True when `token` survives every enabled filter."""
        if self._pos_to_remove and token.pos_ in self._pos_to_remove:
            return False
        if self._remove_numbers and (token.like_num or token.is_currency):
            return False
        if self._remove_stopwords and token.is_stop:
            return False
        # Punctuation, whitespace, quotes and brackets are always dropped.
        if token.is_punct or token.is_space or token.is_quote or token.is_bracket:
            return False
        # Drop tokens whose text is empty once stripped.
        return token.text.strip() != ""

    def __clean(self, doc: "Doc") -> str:
        """
        Filters the tokens of a processed document and joins them into text
        :param doc: document produced by the spaCy pipeline
        :return: str, lower-cased clean text
        """
        tokens = [token for token in doc if self._keep(token)]

        if self._lemmatize:
            text = " ".join(token.lemma_ for token in tokens)
        else:
            text = " ".join(token.text for token in tokens)

        if self._remove_special:
            # Each non-alphabetic character becomes a space (apostrophes are
            # kept), so runs of spaces may remain in the output.
            text = self._NON_ALPHA_RE.sub(" ", text)

        # Remove non-ASCII characters
        text = self._NON_ASCII_RE.sub("", text)

        return text.lower()
if __name__ == "__main__":
    # Demo: load a pipeline once, clean a sample sentence with lemmatization
    # and number removal enabled, and show the result.
    spacy_model = SpacyPreprocessor.load_model()
    preprocessor = SpacyPreprocessor(
        spacy_model=spacy_model, lemmatize=True, remove_numbers=True
    )
    clean_text = preprocessor.preprocess_text("spaCy is awesome! 123")
    print(clean_text)

This comment has been minimized.

Copy link

@aliforgetti aliforgetti commented Apr 9, 2021

This is great and exactly what I was looking for. Thank you for this :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment