Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist
View gist:bdd6fbd532dc6f44a9d4
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
import re, string, sys, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import logging
# Configure the root logger: records at INFO level and above are emitted as
# "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
from joblib import Parallel, delayed
 
# Module-level WordNet lemmatizer instance; clean() calls lmtzr.lemmatize()
# on every token.
lmtzr = WordNetLemmatizer()
# Matches any single character that is NOT an ASCII letter or a space.
# clean() substitutes every match with the empty string.
pattern=re.compile(r'[^a-zA-Z ]')
 
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    Only the first character of the tag is inspected:

    * ``J...`` -> ``wordnet.ADJ``
    * ``V...`` -> ``wordnet.VERB``
    * ``R...`` -> ``wordnet.ADV``

    Any other tag, including the empty string, falls back to
    ``wordnet.NOUN``.

    Args:
        treebank_tag: Tag string, as found in the second element of the
            pairs returned by ``nltk.pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.ADV`` or
        ``wordnet.NOUN``.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # Noun is the catch-all for every remaining tag.
    return wordnet.NOUN
 
def clean(x):
    """Normalise a piece of raw text into a string of lemmatised words.

    Processing steps, in order:

    1. Remove every literal ``<image>`` marker.
    2. Lower-case the text and delete every character that is not an ASCII
       letter or a space (digits, punctuation, line breaks, non-ASCII).
    3. Tokenise with ``nltk.word_tokenize`` and tag with ``nltk.pos_tag``.
    4. Lemmatise each token with the WordNet POS derived from its tag.

    Args:
        x: Raw text.

    Returns:
        The lemmatised tokens joined by single spaces (an empty string when
        no tokens remain).
    """
    x = x.replace('<image>', '')
    # The pattern keeps only letters and spaces, so '\r' and '\n' are already
    # stripped by this substitution; separate replace() calls would be no-ops.
    x = pattern.sub('', x.lower())

    tagged = nltk.pos_tag(nltk.word_tokenize(x))
    lemmas = [
        lmtzr.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged
    ]
    return ' '.join(lemmas)
 
def final_clean(x, y=None):
    """Clean the text portion of one comma-separated input record.

    The record is split at its first comma. The part before the comma is
    discarded; the part after it has its remaining commas removed and is
    passed through ``clean``. Records that contain no comma are skipped.

    Args:
        x: One input line. ``bytes`` lines (what a file opened in binary
            mode yields on Python 3) are decoded as UTF-8 first, dropping
            undecodable bytes; ``clean`` discards non-ASCII characters
            anyway.
        y: Optional writable file-like object. When given, the cleaned text
            followed by a newline is written to it.

    Returns:
        The cleaned text, or ``None`` when the record has no comma.
    """
    # On Python 2 ``bytes`` is ``str``, so this branch only fires on Python 3.
    if isinstance(x, bytes) and not isinstance(x, str):
        x = x.decode('utf-8', 'ignore')

    fields = x.split(',', 1)
    if len(fields) != 2:
        return None

    cleaned = clean(fields[1].replace(',', ''))
    if y is not None:
        y.write(cleaned + '\n')
    return cleaned


# Guarding the driver keeps worker processes (and plain imports) from
# re-running the whole job when this module is loaded.
if __name__ == "__main__":
    with open("abstracts.txt", "rb") as f:
        # Workers only return strings. An open output file handle cannot be
        # shipped to worker processes, and concurrent writes would interleave
        # unpredictably. Parallel returns results in input order.
        results = Parallel(n_jobs=30)(delayed(final_clean)(line) for line in f)

    # Mode "w" already truncates the file, so no explicit truncate() is needed.
    with open("abstractsfinal.txt", "w") as f2:
        f2.writelines(text + '\n' for text in results if text is not None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.