cigrainger/gist:bdd6fbd532dc6f44a9d4

## gistfile1.py
import re, string, sys, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)
from joblib import Parallel, delayed

# Matches every character that is neither an ASCII letter nor a space;
# clean() uses it to delete such characters.
pattern = re.compile(r'[^a-zA-Z ]')

# Single lemmatizer instance reused by clean() for every token.
lmtzr = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Map a Treebank-style POS tag onto a WordNet POS constant.

    Tags starting with ``J``, ``V`` and ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB`` and ``wordnet.ADV`` respectively; any other tag falls
    back to ``wordnet.NOUN``.
    """
    # Checked in order; the first matching prefix wins.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

def clean(x):
    """Return *x* reduced to lower-case lemmas joined by single spaces.

    The text is processed in this order: literal ``<image>`` markers are
    removed, the text is lower-cased, every character matched by the
    module-level ``pattern`` (anything but ASCII letters and spaces) is
    deleted, the result is tokenized with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag``, and each token is lemmatized with the WordNet POS
    that ``get_wordnet_pos`` derives from its tag.
    """
    text = pattern.sub('', x.replace('<image>', '').lower())
    # No separate '\r' / '\n' removal is needed: the substitution above has
    # already deleted every character other than letters and spaces.
    tokens = nltk.word_tokenize(text)
    # pos_tag yields (token, tag) pairs in token order, so tokens and tags
    # can be consumed together instead of through parallel index loops.
    lemmas = [
        lmtzr.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in nltk.pos_tag(tokens)
    ]
    return ' '.join(lemmas)

def final_clean(x, y):
    """Clean the text after the first comma of *x* and write it to *y*.

    Everything up to and including the first comma is discarded. The
    remainder has its other commas removed, is passed through ``clean`` and
    is written to *y* followed by a newline. Input without any comma is
    skipped and nothing is written.

    Args:
        x: One input record, as ``str`` or ``bytes``.
        y: Any object with a ``write`` method accepting ``str``.
    """
    if isinstance(x, bytes):
        # Lines read from a file opened in binary mode arrive as bytes, which
        # cannot be split on a str separator. Undecodable bytes are replaced
        # rather than raising.
        # NOTE(review): assumes UTF-8 input - confirm against the data source.
        x = x.decode('utf-8', 'replace')
    _, comma, text = x.partition(',')
    if not comma:
        return
    y.write(clean(text.replace(',', '')) + '\n')

def _clean_line(line):
    """Return the text final_clean() writes for *line* ('' if it is skipped)."""
    import io  # local import: only this helper needs it

    buf = io.StringIO()
    final_clean(line, buf)
    return buf.getvalue()


# Guarded so that a worker process importing this module does not start the
# whole pipeline again.
if __name__ == "__main__":
    # Text mode so that every worker receives str lines; undecodable bytes are
    # replaced instead of aborting the run.
    # NOTE(review): assumes UTF-8 input - confirm against the data source.
    with open("abstracts.txt", encoding="utf-8", errors="replace") as f:
        # Mode "w" already truncates the file, so no explicit truncate().
        with open("abstractsfinal.txt", "w") as f2:
            # An open file handle cannot be sent to joblib worker processes,
            # so each worker returns its cleaned text and only this parent
            # process writes. Parallel returns results in input order.
            results = Parallel(n_jobs=30)(
                delayed(_clean_line)(line) for line in f
            )
            f2.writelines(results)
	import re, string, sys, nltk
	from nltk.stem.wordnet import WordNetLemmatizer
	from nltk.corpus import wordnet
	import logging
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
	level=logging.INFO)
	from joblib import Parallel, delayed

	# Matches every character that is neither an ASCII letter nor a space;
	# clean() uses it to delete such characters.
	pattern = re.compile(r'[^a-zA-Z ]')

	# Single lemmatizer instance reused by clean() for every token.
	lmtzr = WordNetLemmatizer()

	def get_wordnet_pos(treebank_tag):
		"""Return the WordNet POS constant for a Treebank-style POS tag.

		``J*`` tags give ``wordnet.ADJ``, ``V*`` tags ``wordnet.VERB`` and
		``R*`` tags ``wordnet.ADV``; every other tag gives ``wordnet.NOUN``.
		"""
		if treebank_tag.startswith('J'):
			return wordnet.ADJ
		if treebank_tag.startswith('V'):
			return wordnet.VERB
		if treebank_tag.startswith('R'):
			return wordnet.ADV
		# Any other tag is treated as a noun.
		return wordnet.NOUN

	def clean(x):
		"""Return *x* reduced to lower-case lemmas joined by single spaces.

		Literal ``<image>`` markers are removed, the text is lower-cased and
		every character matched by the module-level ``pattern`` (anything
		but ASCII letters and spaces) is deleted. The remainder is tokenized
		with ``nltk.word_tokenize``, tagged with ``nltk.pos_tag`` and each
		token is lemmatized with the WordNet POS that ``get_wordnet_pos``
		derives from its tag.
		"""
		text = pattern.sub('', x.replace('<image>', '').lower())
		# '\r' and '\n' need no separate removal: the substitution above has
		# already deleted every character other than letters and spaces.
		tokens = nltk.word_tokenize(text)
		# pos_tag yields (token, tag) pairs in token order.
		lemmas = [
			lmtzr.lemmatize(token, get_wordnet_pos(tag))
			for token, tag in nltk.pos_tag(tokens)
		]
		return ' '.join(lemmas)

	def final_clean(x, y):
		"""Clean the text after the first comma of *x* and write it to *y*.

		Everything up to and including the first comma is discarded. The
		remainder has its other commas removed, is passed through ``clean``
		and is written to *y* followed by a newline. Input without any comma
		is skipped and nothing is written.

		Args:
			x: One input record, as ``str`` or ``bytes``.
			y: Any object with a ``write`` method accepting ``str``.
		"""
		if isinstance(x, bytes):
			# Lines read from a binary-mode file arrive as bytes, which
			# cannot be split on a str separator. Undecodable bytes are
			# replaced rather than raising.
			# NOTE(review): assumes UTF-8 input - confirm.
			x = x.decode('utf-8', 'replace')
		_, comma, text = x.partition(',')
		if not comma:
			return
		y.write(clean(text.replace(',', '')) + '\n')

	def _clean_line(line):
		"""Return the text final_clean() writes for *line* ('' if it is skipped)."""
		import io  # local import: only this helper needs it

		buf = io.StringIO()
		final_clean(line, buf)
		return buf.getvalue()

	# Guarded so that a worker process importing this module does not start
	# the whole pipeline again.
	if __name__ == "__main__":
		# Text mode so that every worker receives str lines; undecodable
		# bytes are replaced instead of aborting the run.
		# NOTE(review): assumes UTF-8 input - confirm.
		with open("abstracts.txt", encoding="utf-8", errors="replace") as f:
			# Mode "w" already truncates the file, so no explicit truncate().
			with open("abstractsfinal.txt", "w") as f2:
				# An open file handle cannot be sent to joblib worker
				# processes, so each worker returns its cleaned text and
				# only this parent process writes. Parallel returns results
				# in input order.
				results = Parallel(n_jobs=30)(
					delayed(_clean_line)(line) for line in f
				)
				f2.writelines(results)