Skip to content

Instantly share code, notes, and snippets.

@smsubrahmannian
Last active October 4, 2018 09:54
Show Gist options
  • Save smsubrahmannian/2835bd32c688b7b57a5300f94af07b1b to your computer and use it in GitHub Desktop.
import spacy
import pandas as pd  # was missing: `pd` is used below but never imported

# Load the English language model (requires `python -m spacy download en`).
nlp = spacy.load('en')

# Read the preprocessed corpus; feather is a fast binary format that
# preserves the DataFrame's dtypes.
data = pd.read_feather('data/preprocessed_data')
def clean_up(text):
    """Tokenize *text* with spaCy and return a list of content-word lemmas.

    A token is kept only if it is not a stop word, is purely alphabetic,
    is longer than two characters, and its part-of-speech tag is not in
    the removal set.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmas of the surviving tokens, in document order.
    """
    # POS tags that carry little topical signal for topic modeling.
    # A set gives O(1) membership tests (was a list).
    removal = {'ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE'}
    doc = nlp(text)  # relies on the module-level spaCy pipeline
    return [
        token.lemma_
        for token in doc
        if not token.is_stop            # idiomatic form of `is_stop == False`
        and token.is_alpha
        and len(token) > 2
        and token.pos_ not in removal
    ]
from gensim import corpora  # `corpora` was used below but never imported

# Apply the cleaner to every document; the result is a Series of token
# lists, one per row. (Pass the function directly — no lambda needed.)
datalist = data.text.apply(clean_up)

# Build the vocabulary (token -> integer id mapping) for the LDA model
# and convert the corpus into a bag-of-words document-term matrix.
# NOTE(review): the original referenced `dataList` and `skillList`,
# neither of which exists — both clearly meant `datalist`.
dictionary = corpora.Dictionary(datalist)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment