Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created June 5, 2014 13:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cigrainger/bdd6fbd532dc6f44a9d4 to your computer and use it in GitHub Desktop.
Save cigrainger/bdd6fbd532dc6f44a9d4 to your computer and use it in GitHub Desktop.
# NOTE(review): `string` and `sys` are not referenced anywhere in the visible
# part of this file.
import re, string, sys, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import logging
# Configure the root logger: timestamp, level name and message, INFO and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
from joblib import Parallel, delayed
# Single shared lemmatizer instance; clean() calls lmtzr.lemmatize() on it.
lmtzr = WordNetLemmatizer()
# Matches any single character that is NOT an ASCII letter (a-z, A-Z) or a
# space. clean() substitutes every match with the empty string.
pattern=re.compile(r'[^a-zA-Z ]')
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag string into one of the ``wordnet`` POS constants.

    The decision is made on the tag's leading character only:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``R`` -> ``wordnet.ADV``. Every other tag (including an empty string)
    falls back to ``wordnet.NOUN``.

    Args:
        treebank_tag: The tag string to translate.

    Returns:
        The matching ``wordnet`` POS constant.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Anything unrecognised is treated as a noun.
    return wordnet.NOUN
def clean(x):
    """Normalise a piece of text and return it in lemmatised form.

    Steps, in order:

    1. Remove every literal occurrence of ``<image>``.
    2. Lower-case the text and delete every character that is not an ASCII
       letter or a space (the module-level ``pattern``).
    3. Tokenise with ``nltk.word_tokenize`` and tag with ``nltk.pos_tag``.
    4. Lemmatise each token with the module-level ``lmtzr``, using the
       WordNet POS that ``get_wordnet_pos`` derives from the token's tag.

    Args:
        x: The text to clean.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    # '<image>' has to be removed before the regex runs; otherwise only the
    # angle brackets would be stripped and the word 'image' would survive.
    text = x.replace('<image>', '')
    # The pattern deletes '\r' and '\n' as well (neither is a letter nor a
    # space), so no separate newline stripping is needed after this point.
    text = pattern.sub('', text.lower())
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    # Pair each token with the tag found at the same position.
    lemmas = [
        lmtzr.lemmatize(token, get_wordnet_pos(tag))
        for token, (_, tag) in zip(tokens, tagged)
    ]
    return ' '.join(lemmas)
def final_clean(x, y):
    """Clean the part of *x* after its first comma and write it to *y*.

    Everything up to and including the first comma is discarded. Any
    remaining commas are removed from the rest, which is then passed through
    ``clean`` and written to *y* followed by a newline. If *x* contains no
    comma at all, nothing is written.

    Args:
        x: One input record as a string.
        y: Destination object; only its ``write`` method is used.
    """
    _, comma, remainder = x.partition(',')
    if not comma:
        # No comma in the record: skip it silently.
        return
    cleaned = clean(remainder.replace(',', ''))
    y.write(cleaned + '\n')
def _clean_to_text(line):
    """Run ``final_clean`` on *line* and return what it wrote, as a string.

    ``final_clean`` only needs an object with a ``write`` method, so an
    in-memory buffer stands in for the output file. This lets a worker hand
    its result back to the parent instead of touching the real file handle.

    Args:
        line: One line read from the input file.

    Returns:
        The text ``final_clean`` produced for the line (including its
        trailing newline), or ``''`` when the line was skipped.
    """
    import io  # function-scope import keeps this block self-contained

    buffer = io.StringIO()
    final_clean(line, buffer)
    return buffer.getvalue()


# The input is opened in text mode so each line is a `str`; final_clean splits
# on the str ',' and would raise TypeError on bytes under Python 3.
# NOTE(review): the encoding of abstracts.txt is not known from this file.
# Undecodable bytes are replaced instead of raising, and clean() deletes every
# character that is not an ASCII letter or a space anyway.
with open("abstracts.txt", "r", encoding="utf-8", errors="replace") as f:
    # Mode "w" already truncates the file, so no explicit truncate() is needed.
    with open("abstractsfinal.txt", "w") as f2:
        # Workers return their cleaned text and only this process writes to
        # f2. An open file handle cannot be pickled for process-based workers,
        # and unsynchronised writes from several workers could interleave or
        # be lost. Parallel returns results in input order, so the output
        # keeps the order of the input lines.
        # NOTE(review): all results are held in memory before being written.
        cleaned_lines = Parallel(n_jobs=30)(
            delayed(_clean_to_text)(line) for line in f
        )
        f2.writelines(cleaned_lines)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment