import re

import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer


def spacy_pipe(docs):
    # run documents through the spaCy pipeline in parallel batches
    return list(nlp.pipe(docs, batch_size=50, n_process=4))


def preprocessor(doc):
    # lowercase lemmas; drop punctuation and digit/number-like tokens
    doc = [tok.lemma_.lower() for tok in doc
           if not tok.is_punct and not tok.is_digit and not tok.like_num]
    doc = [re.sub(r"(?<!\w)'\w|\w'(?!\w)", '', x) for x in doc]  # strip contraction fragments left by spaCy tokenization (e.g. 's)
    doc = [tok for tok in doc if tok not in stopwords]  # remove stopwords
    doc = list(filter(None, map(str.strip, doc)))  # remove empty strings
    doc = [tok for tok in doc if len(tok) > 2]  # remove strings shorter than 3 characters
    return doc


cv = CountVectorizer(max_features=5000,
                     max_df=0.9,
                     min_df=0.01,
                     preprocessor=lambda x: x,  # inputs are already spaCy Docs, so skip built-in preprocessing
                     tokenizer=preprocessor,    # tokenize with the custom pipeline above
                     ngram_range=(1, 3))

stopwords = get_stopwords()
nlp = spacy.load("en_core_web_sm", exclude=["ner", "parser"])

df = get_data(name, 1000)  # limit to first 1000 results
df['processed_docs'] = spacy_pipe(df['docs'].to_list())  # process documents with spaCy
data = cv.fit_transform(df['processed_docs']).toarray().T  # term-document matrix (one row per n-gram)
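The snippet relies on two helpers, get_stopwords() and get_data(), that the gist does not define. The versions below are a minimal sketch, assuming spaCy's built-in English stopword list and a CSV file named after the dataset; the file path, the `name` argument, and the 'docs' column are guesses inferred from how the snippet uses them, not part of the original.

# hypothetical stand-ins for the undefined helpers (not from the original gist)
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS


def get_stopwords():
    # assumption: spaCy's default English stopword list
    return set(STOP_WORDS)


def get_data(name, limit):
    # assumption: load '<name>.csv' and keep the first `limit` rows;
    # the snippet only needs a DataFrame with a 'docs' column of raw strings
    return pd.read_csv(f"{name}.csv").head(limit)

With these in place, each row of `data` lines up with one entry of cv.get_feature_names_out() (scikit-learn >= 1.0), which maps the rows of the transposed matrix back to their n-grams.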