"""Text preprocessing using spaCy."""
import re
from typing import List
import spacy
from spacy.tokens import Doc
from tqdm import tqdm
class SpacyPreprocessor:
    """Clean raw text with a spaCy pipeline.

    Depending on the flags given at construction time, tokens are filtered by
    part-of-speech tag, numbers/currency symbols and stopwords are dropped,
    the remaining tokens are optionally lemmatized, and the joined text is
    stripped of non-alphabetic and non-ASCII characters and lower-cased.
    """

    def __init__(
        self,
        spacy_model=None,
        remove_numbers=False,
        remove_special=True,
        pos_to_remove=None,
        remove_stopwords=False,
        lemmatize=False,
    ):
        """
        Preprocesses text using spaCy

        :param spacy_model: a loaded spaCy pipeline; when None, "en_core_web_sm" is loaded
        :param remove_numbers: Whether to remove numbers from text
        :param remove_special: Whether to remove special characters (including numbers)
        :param pos_to_remove: list of PoS tags to remove
        :param remove_stopwords: Whether to remove stopwords from text
        :param lemmatize: Whether to apply lemmatization
        """
        self._remove_numbers = remove_numbers
        self._pos_to_remove = pos_to_remove
        self._remove_stopwords = remove_stopwords
        self._remove_special = remove_special
        self._lemmatize = lemmatize

        # Only fall back to the default pipeline when the caller supplied none;
        # a caller-provided pipeline must never be replaced.
        if spacy_model is None:
            self.model = spacy.load("en_core_web_sm")
        else:
            self.model = spacy_model

    @staticmethod
    def download_spacy_model(model="en_core_web_sm"):
        """
        Download a spaCy model package

        :param model: name of the spaCy model to download
        """
        # Imported locally so the rest of the class does not depend on the CLI.
        from spacy.cli import download

        print(f"Downloading spaCy model {model}")
        download(model)
        print("Finished downloading model")

    @staticmethod
    def load_model(model="en_core_web_sm"):
        """
        Load a spaCy model with the "ner" and "parser" components disabled

        :param model: name of the spaCy model to load
        :return: the loaded spaCy pipeline
        """
        return spacy.load(model, disable=["ner", "parser"])

    def tokenize(self, text) -> List[str]:
        """
        Tokenize text using a spaCy pipeline

        :param text: Text to tokenize
        :return: list of str
        """
        doc = self.model(text)
        return [token.text for token in doc]

    def preprocess_text(self, text) -> str:
        """
        Runs a spaCy pipeline and removes unwanted parts from text

        :param text: text string to clean
        :return: str, clean text
        """
        doc = self.model(text)
        return self.__clean(doc)

    def preprocess_text_list(self, texts: List[str]) -> List[str]:
        """
        Runs a spaCy pipeline and removes unwanted parts from a list of text.
        Leverages spaCy's `pipe` for faster batch processing.

        :param texts: List of texts to clean
        :return: List of clean texts
        """
        return [self.__clean(doc) for doc in tqdm(self.model.pipe(texts))]

    def __clean(self, doc: "Doc") -> str:
        """
        Apply the configured filters to a processed document

        :param doc: iterable of spaCy tokens (a processed `Doc`)
        :return: str, the cleaned, lower-cased text
        """
        # POS tags removal
        if self._pos_to_remove:
            tokens = [token for token in doc if token.pos_ not in self._pos_to_remove]
        else:
            tokens = list(doc)

        # Remove numbers (number-like tokens and currency symbols)
        if self._remove_numbers:
            tokens = [
                token for token in tokens if not (token.like_num or token.is_currency)
            ]

        # Remove stopwords
        if self._remove_stopwords:
            tokens = [token for token in tokens if not token.is_stop]

        # Remove unwanted tokens: punctuation, whitespace, quotes and brackets
        tokens = [
            token
            for token in tokens
            if not (
                token.is_punct or token.is_space or token.is_quote or token.is_bracket
            )
        ]

        # Remove empty tokens
        tokens = [token for token in tokens if token.text.strip() != ""]

        # Lemmatize, or keep the surface form
        if self._lemmatize:
            text = " ".join([token.lemma_ for token in tokens])
        else:
            text = " ".join([token.text for token in tokens])

        if self._remove_special:
            # Replace every character that is not a letter or apostrophe with a space
            text = re.sub(r"[^a-zA-Z\']", " ", text)

        # Remove non-ASCII characters (always applied, regardless of the flags)
        text = re.sub(r"[^\x00-\x7F]+", "", text)

        text = text.lower()

        return text
if __name__ == "__main__":
    # Small demo: load the pipeline, clean one sentence with lemmatization and
    # number removal enabled, and show the result.
    spacy_model = SpacyPreprocessor.load_model()
    preprocessor = SpacyPreprocessor(
        spacy_model=spacy_model, lemmatize=True, remove_numbers=True
    )
    clean_text = preprocessor.preprocess_text("spaCy is awesome! 123")
    print(clean_text)

