Skip to content

Instantly share code, notes, and snippets.

@smsubrahmannian
Last active October 4, 2018 09:54
Show Gist options
  • Save smsubrahmannian/2835bd32c688b7b57a5300f94af07b1b to your computer and use it in GitHub Desktop.
import spacy
import pandas as pd  # was missing: `pd` is used below but never imported

# Load the English language model (requires `python -m spacy download en`).
nlp = spacy.load('en')

# Read the preprocessed corpus; feather is a fast binary format that
# preserves the DataFrame's dtypes.
data = pd.read_feather('data/preprocessed_data')
def clean_up(text):
    """Tokenize *text* with spaCy and return a list of content-word lemmas.

    A token is kept only if it is not a stop word, is purely alphabetic,
    is longer than two characters, and its part-of-speech tag is not in
    the removal set.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmas of the surviving tokens, in document order.
    """
    # POS tags that carry little topical signal for topic modeling.
    # A set gives O(1) membership tests (was a list).
    removal = {'ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE'}
    doc = nlp(text)  # relies on the module-level spaCy pipeline
    return [
        token.lemma_
        for token in doc
        if not token.is_stop            # idiomatic form of `is_stop == False`
        and token.is_alpha
        and len(token) > 2
        and token.pos_ not in removal
    ]
from gensim import corpora  # `corpora` was used below but never imported

# Apply the cleaner to every document; the result is a Series of token
# lists, one per row. (Pass the function directly — no lambda needed.)
datalist = data.text.apply(clean_up)

# Build the vocabulary (token -> integer id mapping) for the LDA model
# and convert the corpus into a bag-of-words document-term matrix.
# NOTE(review): the original referenced `dataList` and `skillList`,
# neither of which exists — both clearly meant `datalist`.
dictionary = corpora.Dictionary(datalist)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment