@CyrilRJK
Created June 7, 2022 14:40
import re

import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer


def spacy_pipe(docs):
    # run the documents through the spaCy pipeline in batches, across 4 worker processes
    return list(nlp.pipe(docs, batch_size=50, n_process=4))
def preprocessor(doc):
    # lemmatize, lowercase, and drop punctuation and numeric tokens
    doc = [tok.lemma_.lower() for tok in doc if not tok.is_punct and not tok.is_digit and not tok.like_num]
    doc = [re.sub(r'(?<!\w)\'\w|\w\'(?!\w)', '', x) for x in doc]  # strip spaCy negation/contraction fragments (e.g. n't)
    doc = [tok for tok in doc if tok not in stopwords]  # remove stopwords
    doc = list(filter(None, map(str.strip, doc)))  # remove empty strings
    doc = [tok for tok in doc if len(tok) > 2]  # remove strings of 2 characters or fewer
    return doc
cv = CountVectorizer(max_features=5000,
                     max_df=0.9,
                     min_df=0.01,
                     preprocessor=lambda x: x,  # documents are already spaCy Docs, skip the default preprocessing
                     tokenizer=preprocessor,    # use the spaCy-based preprocessor above as the tokenizer
                     ngram_range=(1, 3))
stopwords = get_stopwords()
nlp = spacy.load("en_core_web_sm", exclude=["ner", "parser"])
df = get_data(name, 1000) # limit to first 1000 results
df['processed_docs'] = spacy_pipe(df['docs'].to_list()) # process documents with spacy
data = np.array(cv.fit_transform(df['processed_docs']).todense()).T # vectorize documents into a term-document count matrix
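
get_stopwords and get_data are assumed to be defined elsewhere. A minimal sketch of what they might look like, assuming English stopwords from NLTK and documents stored in a CSV with a 'docs' column (both assumptions, not part of the original gist):

# Hypothetical helpers, shown only to make the snippet self-contained.
import pandas as pd
from nltk.corpus import stopwords as nltk_stopwords  # assumes nltk.download('stopwords') has been run

def get_stopwords():
    # assumption: plain English stopword list from NLTK, returned as a set for fast membership tests
    return set(nltk_stopwords.words('english'))

def get_data(name, limit):
    # assumption: documents live in a CSV named after `name` with a 'docs' column; keep only the first `limit` rows
    return pd.read_csv(f'{name}.csv').head(limit)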