Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created June 12, 2014 07:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cigrainger/dfbb5eb549d52064062c to your computer and use it in GitHub Desktop.
Save cigrainger/dfbb5eb549d52064062c to your computer and use it in GitHub Desktop.
import nltk, timeit
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from joblib import Parallel, delayed
# Single lemmatizer instance shared by every clean() call in this module.
lmtzr = WordNetLemmatizer()

# Maps the first letter of a POS tag to the matching WordNet part-of-speech
# constant. Any other first letter is resolved to wordnet.NOUN by
# get_wordnet_pos().
tag_to_type = dict(
    J=wordnet.ADJ,
    V=wordnet.VERB,
    R=wordnet.ADV,
)
def get_wordnet_pos(treebank_tag):
    """Return the WordNet part-of-speech constant for *treebank_tag*.

    Only the first character of the tag is inspected: it is looked up in
    ``tag_to_type``. Tags whose first character is not in that table
    (including the empty string) map to ``wordnet.NOUN``.
    """
    initial = treebank_tag[:1]
    if initial in tag_to_type:
        return tag_to_type[initial]
    return wordnet.NOUN
def clean(text):
    """Return *text* with every token replaced by its lemma.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; each token is then lemmatized using the WordNet
    part of speech derived from its tag. The lemmas are joined with
    single spaces.
    """
    tokens = nltk.word_tokenize(text)
    lemmas = []
    # pos_tag yields (token, tag) pairs in token order.
    for token, pos in nltk.pos_tag(tokens):
        lemmas.append(lmtzr.lemmatize(token, get_wordnet_pos(pos)))
    return ' '.join(lemmas)
def main(input_path="abstracts.txt", output_path="abstractsfinal.txt", n_jobs=32):
    """Lemmatize each line of *input_path* in parallel and write the results.

    Every line of the input file is passed to ``clean`` through a joblib
    ``Parallel`` pool. The elapsed time and the number of processed lines
    are printed, then the cleaned lines are written to *output_path*, one
    per line.

    Args:
        input_path: Text file with one abstract per line.
        output_path: Destination file; overwritten if it exists.
        n_jobs: Worker count handed to ``joblib.Parallel``.
    """
    tic = timeit.default_timer()
    # Text mode rather than "rb": clean() passes each line to
    # nltk.word_tokenize, which works on str, not bytes, under Python 3.
    # NOTE(review): UTF-8 is assumed for the input file -- confirm.
    with open(input_path, encoding="utf-8") as f:
        abstracts = Parallel(n_jobs=n_jobs)(delayed(clean)(line) for line in f)
    elapsed = timeit.default_timer() - tic
    print('Writing to file now. %s abstracts processed in %s seconds.' % (len(abstracts), elapsed))
    # Mode "w" already truncates the file, so no explicit truncate() is needed.
    with open(output_path, "w", encoding="utf-8") as f:
        # NOTE(review): the first result is skipped, exactly as the original
        # script did -- presumably the input starts with a header line; confirm.
        f.writelines('%s\n' % item for item in abstracts[1:])


# Guarded so that worker processes which re-import this module do not
# start the whole job again.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment