import re

import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer


def spacy_pipe(docs):
    # run documents through the spaCy pipeline in parallel batches
    return list(nlp.pipe(docs, batch_size=50, n_process=4))


def preprocessor(doc):
    # lowercase lemmas; drop punctuation and digit/number-like tokens
    doc = [tok.lemma_.lower() for tok in doc
           if not tok.is_punct and not tok.is_digit and not tok.like_num]
    doc = [re.sub(r"(?<!\w)'\w|\w'(?!\w)", '', x) for x in doc]  # strip contraction fragments left by spaCy tokenization (e.g. 's)
    doc = [tok for tok in doc if tok not in stopwords]  # remove stopwords
    doc = list(filter(None, map(str.strip, doc)))  # remove empty strings
    doc = [tok for tok in doc if len(tok) > 2]  # remove strings shorter than 3 characters
    return doc


cv = CountVectorizer(max_features=5000,
                     max_df=0.9,
                     min_df=0.01,
                     preprocessor=lambda x: x,  # inputs are already spaCy Docs, so skip built-in preprocessing
                     tokenizer=preprocessor,    # tokenize with the custom pipeline above
                     ngram_range=(1, 3))

stopwords = get_stopwords()
nlp = spacy.load("en_core_web_sm", exclude=["ner", "parser"])

df = get_data(name, 1000)  # limit to first 1000 results
df['processed_docs'] = spacy_pipe(df['docs'].to_list())  # process documents with spaCy
data = cv.fit_transform(df['processed_docs']).toarray().T  # term-document matrix (one row per n-gram)
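The snippet relies on two helpers, get_stopwords() and get_data(), that the gist does not define. The versions below are a minimal sketch, assuming spaCy's built-in English stopword list and a CSV file named after the dataset; the file path, the `name` argument, and the 'docs' column are guesses inferred from how the snippet uses them, not part of the original.

# hypothetical stand-ins for the undefined helpers (not from the original gist)
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS


def get_stopwords():
    # assumption: spaCy's default English stopword list
    return set(STOP_WORDS)


def get_data(name, limit):
    # assumption: load '<name>.csv' and keep the first `limit` rows;
    # the snippet only needs a DataFrame with a 'docs' column of raw strings
    return pd.read_csv(f"{name}.csv").head(limit)

With these in place, each row of `data` lines up with one entry of cv.get_feature_names_out() (scikit-learn >= 1.0), which maps the rows of the transposed matrix back to their n-grams.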