Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created June 5, 2014 13:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cigrainger/d3b24e002bb140d45c8c to your computer and use it in GitHub Desktop.
Save cigrainger/d3b24e002bb140d45c8c to your computer and use it in GitHub Desktop.
import re, string, sys, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import logging
# Emit timestamped log records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Single lemmatizer instance shared by every call to clean().
lmtzr = WordNetLemmatizer()

# Matches any character that is neither an ASCII letter nor a space.
pattern = re.compile(r'[^a-zA-Z ]')
def get_wordnet_pos(treebank_tag):
    """Map a part-of-speech tag to the corresponding WordNet POS constant.

    Only the start of the tag is inspected: a tag beginning with ``J`` maps
    to adjective, ``V`` to verb and ``R`` to adverb.  Any other tag,
    including an empty string, maps to noun.

    Args:
        treebank_tag: POS tag string to translate.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.ADV`` or
        ``wordnet.NOUN``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Noun is the catch-all for every tag not listed above.
    return wordnet.NOUN
def clean(x):
    """Normalize a piece of raw text into a string of space-separated lemmas.

    Steps, in order: drop literal ``<image>`` markers, lowercase the text,
    delete every character that is not an ASCII letter or a space, tokenize
    with ``nltk.word_tokenize``, POS-tag the tokens with ``nltk.pos_tag``,
    and lemmatize each token using the WordNet POS that ``get_wordnet_pos``
    returns for its tag.

    Args:
        x: Raw text to clean.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # Remove the marker before the regex runs; otherwise only its angle
    # brackets would be stripped and the bare word "image" would remain.
    text = x.replace('<image>', '')
    # Characters are deleted, not replaced by a space, so words separated
    # only by punctuation or a line break end up fused together.  '\r' and
    # '\n' are removed here as well, so no separate newline stripping is
    # needed afterwards.
    text = pattern.sub('', text.lower())

    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    # Pair each token with the tag produced for it and lemmatize in one pass.
    lemmas = [
        lmtzr.lemmatize(token, get_wordnet_pos(treebank_tag))
        for token, (_, treebank_tag) in zip(tokens, tagged)
    ]
    return ' '.join(lemmas)
# Read "abstracts.txt" line by line, clean whatever follows the first comma on
# each line, and write one cleaned line per record to "abstractsfinal.txt".
# NOTE(review): the original indentation was lost; this assumes the counter
# advances once per input line and is printed on every iteration -- confirm.
# NOTE: the input is read as byte strings (Python 2 semantics); under
# Python 3 the str-based split below would need a text-mode file instead.
with open("abstracts.txt", "rb") as f, open("abstractsfinal.txt", "w") as f2:
    # Mode "w" already empties the output file, so no explicit truncate().
    for i, line in enumerate(f, 1):
        y = line.split(',', 1)
        # Lines that contain no comma are skipped.
        if len(y) == 2:
            c = y[1].replace(',', '')
            f2.write(clean(c) + '\n')
        # Progress indicator: number of input lines read so far.
        print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment