joshua-taylor/sPacy tokenize.py

## sPacy tokenize.py
# Tokenise the `text` column of `df` with spaCy. The result, `tok_text`, holds
# one list of token strings per input document, in the same order as `df`.
# NOTE(review): `spacy`, `tqdm`, `fix_text` and `df` are expected to be
# imported/defined earlier in the file or notebook - confirm.
nlp = spacy.load("en_core_web_sm")
tok_text = []  # OUTPUT for our tokenised corpus

# Lower-case every document, then clean it with fix_text(). str() coerces any
# non-string value left by the pandas string accessor before it is cleaned.
text = df.text.str.lower().values
text = [fix_text(str(i)) for i in text]

# Tokenising using spaCy:
# - Only token-level flags are read below, so the tagger, parser and NER
#   components are disabled for this pass.
# - The former `n_threads=2` argument is dropped: it is deprecated (no effect)
#   since spaCy 2.1 and is not accepted by Language.pipe in spaCy 3.
# - nlp.pipe() yields lazily, so tqdm is given `total` to display real progress.
for doc in tqdm(
    nlp.pipe(text, disable=["tagger", "parser", "ner"]),
    total=len(text),
):
    # Keep ASCII-only tokens that are neither punctuation nor whitespace.
    tok = [
        t.text
        for t in doc
        if (t.is_ascii and not t.is_punct and not t.is_space)
    ]
    tok_text.append(tok)