Skip to content
Create a gist now

Instantly share code, notes, and snippets.

import re, string, sys, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import logging
# NOTE(review): the original call was cut off after the trailing comma (its
# continuation line is missing), leaving an unclosed parenthesis. The
# conventional companion argument `level=logging.INFO` is assumed here --
# confirm against the original script.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
from joblib import Parallel, delayed

# Module-level lemmatizer instance, created once at import time.
lmtzr = WordNetLemmatizer()
# Matches every character that is NOT an ASCII letter or a space; clean()
# substitutes these with the empty string.
pattern = re.compile(r'[^a-zA-Z ]')
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    Tags beginning with 'J', 'V' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB`` and ``wordnet.ADV`` respectively; every other tag
    (including the empty string) falls back to ``wordnet.NOUN``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Anything unrecognised is treated as a noun.
    return wordnet.NOUN
def clean(x):
    """Normalise a piece of text and return its lemmatized form.

    Steps:
      1. remove the literal ``<image>`` placeholder;
      2. lowercase and delete every character that is not an ASCII letter
         or a space (via the module-level ``pattern``);
      3. tokenize and POS-tag the remainder with NLTK;
      4. lemmatize each token with the WordNet POS derived from its tag.

    Args:
        x: the raw text (``str``).

    Returns:
        The lemmatized tokens joined by single spaces (``''`` when no
        tokens remain).
    """
    x = x.replace('<image>', '')
    # `pattern` keeps only [a-zA-Z ] so this also strips '\r' and '\n';
    # the separate replace() calls for those characters were no-ops.
    x = pattern.sub('', x.lower())
    tagged = nltk.pos_tag(nltk.word_tokenize(x))
    # NOTE(review): the bodies of the two loops were missing from the source
    # and the function returned nothing. This lemmatization step is
    # reconstructed from the otherwise unused `get_wordnet_pos` and `lmtzr`
    # helpers -- confirm against the original script.
    lemmas = [
        lmtzr.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged
    ]
    return ' '.join(lemmas)
def final_clean(x, y):
    """Clean the text after the first comma of ``x`` and write it to ``y``.

    ``x`` is split at its first comma; the part before it is discarded.
    Remaining commas are removed from the rest, which is then passed
    through ``clean`` and written to ``y`` followed by a newline.
    Input without any comma is skipped and nothing is written.

    Args:
        x: one input record (text containing at least one comma to be
           processed).
        y: an object with a ``write`` method that receives the result.
    """
    _, comma, remainder = x.partition(',')
    if not comma:
        # No comma at all: there is no second field to clean.
        return
    cleaned = clean(remainder.replace(',', ''))
    y.write(cleaned + '\n')
class _LineCollector(object):
    """Minimal write-only sink that records what final_clean writes."""

    def __init__(self):
        self.chunks = []

    def write(self, text):
        self.chunks.append(text)


def _clean_line(line):
    """Run final_clean on one input line and return the text it produced.

    Returns an empty string when final_clean writes nothing for the line.
    """
    sink = _LineCollector()
    final_clean(line, sink)
    return ''.join(sink.chunks)


# Guard the process-spawning code so the module can be imported by worker
# processes without re-running the whole job.
if __name__ == "__main__":
    # Read as text: the cleaning functions operate on str, not bytes.
    # Undecodable bytes become replacement characters, which the cleaning
    # regex removes along with every other non-letter.
    with open("abstracts.txt", "r", encoding="utf-8", errors="replace") as f, \
            open("abstractsfinal.txt", "w") as f2:
        # Workers only compute and return text; an open file handle cannot be
        # sent to other processes, and concurrent writes through one handle
        # would interleave. The parent writes the results in input order.
        cleaned_lines = Parallel(n_jobs=30)(
            delayed(_clean_line)(line) for line in f
        )
        f2.writelines(cleaned_lines)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.