Create a gist now

Instantly share code, notes, and snippets.

import re, string, sys, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import logging
# Emit log records as "<timestamp> : <level> : <message>".
# The original call was left unclosed after a trailing comma, which made the
# whole module unparseable.
# NOTE(review): `level=logging.INFO` is reconstructed from that dangling
# comma; confirm the intended log level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
from joblib import Parallel, delayed
# Single WordNet lemmatizer instance, created once when the module is loaded.
lmtzr = WordNetLemmatizer()
# Matches any single character that is NOT an ASCII letter or a space
# (digits, punctuation, newlines, non-ASCII characters, ...).
pattern=re.compile(r'[^a-zA-Z ]')
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Args:
        treebank_tag: Tag string such as ``'JJ'``, ``'VBD'`` or ``'RB'``.

    Returns:
        ``wordnet.ADJ`` for tags starting with ``'J'``, ``wordnet.VERB`` for
        ``'V'``, ``wordnet.ADV`` for ``'R'``, and ``wordnet.NOUN`` for every
        other tag (including the empty string).
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # Anything else is treated as a noun.
    return wordnet.NOUN
def clean(x):
    """Normalise a piece of text and lemmatize every token in it.

    Steps: drop the literal ``<image>`` marker, lowercase, delete every
    character that is not an ASCII letter or a space, tokenize, POS-tag, then
    lemmatize each token using the WordNet POS derived from its tag.

    Args:
        x: Input text (``str``).

    Returns:
        The lemmatized tokens joined by single spaces (``''`` when no tokens
        remain).
    """
    x = x.replace('<image>', '')
    # The pattern removes everything except letters and spaces, so '\r' and
    # '\n' are already gone here; separate replace() calls would be no-ops.
    x = pattern.sub('', x.lower())
    tokens = nltk.word_tokenize(x)
    tagged = nltk.pos_tag(tokens)
    # NOTE(review): the bodies of the two loops and the return statement were
    # missing from the original. They are reconstructed from the otherwise
    # unused helpers (`get_wordnet_pos`, `lmtzr`) and from the caller, which
    # concatenates the result with '\n'. Confirm against the real source.
    lemmas = [lmtzr.lemmatize(word, get_wordnet_pos(tag))
              for word, tag in tagged]
    return ' '.join(lemmas)
def final_clean(x,y):
    """Clean the text after the first comma of ``x`` and write it to ``y``.

    The part of ``x`` before the first comma is discarded. Remaining commas
    are removed from the rest, which is then passed through ``clean`` and
    written to ``y`` followed by a newline. Lines with no comma are skipped
    and nothing is written.

    Args:
        x: One input line, ``str`` or ``bytes``. Bytes (what iterating a file
            opened in ``"rb"`` mode yields) are decoded as UTF-8, dropping
            undecodable bytes.
        y: Any object with a ``write(str)`` method.

    Returns:
        None.
    """
    if isinstance(x, bytes):
        # str.split / str.partition with a str separator raise TypeError on
        # bytes in Python 3, so normalise to text first.
        x = x.decode('utf-8', 'ignore')
    _, comma, body = x.partition(',')
    if not comma:
        # No comma on this line: nothing to clean.
        return
    y.write(clean(body.replace(',', '')) + '\n')
def _clean_line(line):
    """Run ``final_clean`` on one raw line and return the text it wrote.

    Returns ``''`` when ``final_clean`` wrote nothing for the line.
    """
    import io
    if isinstance(line, bytes):
        # The input file is read in binary mode; decode before text handling.
        line = line.decode('utf-8', 'ignore')
    sink = io.StringIO()
    final_clean(line, sink)
    return sink.getvalue()


# The output handle must stay in the parent process: joblib's process-based
# backends pickle every argument sent to a worker, and a file object opened
# for writing cannot be pickled. Each worker therefore returns its cleaned
# text and the parent writes the results, which Parallel returns in input
# order.
with open("abstracts.txt","rb") as f:
    with open("abstractsfinal.txt","w") as f2:
        cleaned = Parallel(n_jobs=30)(delayed(_clean_line)(line) for line in f)
        f2.writelines(cleaned)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment