Skip to content

Instantly share code, notes, and snippets.

Created March 5, 2014 11:43
Show Gist options
  • Save anonymous/9365684 to your computer and use it in GitHub Desktop.
Save anonymous/9365684 to your computer and use it in GitHub Desktop.
Pasted from IPython
# Exploratory IPython input history: load two stop-word files and compare them.
# Bare words such as pwd / ls / cat / ll are shell-style commands, not Python.
pwd
# Read the local stop-word file into a list of whitespace-separated tokens.
s1 = open('stopwords.txt','r').read().split()
s1
# Read the second stop-word file the same way.
s2 = open('../scholarec/corpus/stopwords.txt','r').read().split()
s2
set(s1)
set(s2)
# Words present in the local list but missing from the other one.
set(s1)-set(s2)
# NOTE(review): `l` looks like a mistyped `ls` -- confirm.
l
ls
# NOTE(review): `ll` is presumably the IPython long-listing alias -- confirm.
ll stopwords.txt
ll ../scholarec/corpus/stopwords.txt
# The opposite difference: words present only in the second list.
set(s2)-set(s1)
ls
# Count the lines of each file from the shell.
cat stopwords.txt| wc -l
cat ../scholarec/corpus/stopwords.txt| wc -l
# `set` does not support `+`, so this expression raises TypeError;
# a union would be written `set(s2) | set(s1)`.
set(s2)+set(s1)
s1
s2
# List concatenation keeps words that occur in both lists twice.
s1+s2
len(s1+s2)
# extend() mutates s1 in place: from here on s1 also holds every entry of s2.
s1.extend(s2)
s1
len(s1)
# s2 followed by the words found only in s1. The appended part comes from a
# set, so its order is arbitrary.
s2 + list(set(s1) - set(s2))
len(s2 + list(set(s1) - set(s2)))
import os.path
import os
import sys
import cPickle as pickle
from string import punctuation
from operator import itemgetter
import re
# Build the merged stop-word list used by the rest of the session.
# N is the number of entries kept when the ranked word list is sliced with
# [:N] later in this history.
N= 100
ls
len(s2 + list(set(s1) - set(s2)))
# Mistake: this binds the *length* (an int) to `stopwords`, not the list.
stopwords = len(s2 + list(set(s1) - set(s2)))
# Iterating over that int raises TypeError; the two lines are redone below.
stopwords = [x.strip(punctuation) for x in stopwords if len(x)>2]
stopwords
# Corrected version: s2 followed by the words that only occur in s1.
stopwords = s2 + list(set(s1) - set(s2))
# Keep entries longer than two characters (length is tested before
# stripping) and strip leading/trailing punctuation from each kept entry.
stopwords = [x.strip(punctuation) for x in stopwords if len(x)>2]
stopwords
# Convert the PDF to plain text by running the external `pdftotext` program.
pdfpath = 'paper.pdf'
# NOTE(review): `pid` is not defined anywhere in the visible history, so this
# line presumably raised NameError; the next line replaces it.
picklepath = os.path.join('db', pid, 'topwords.p')
# With a single argument os.path.join returns it unchanged: 'topwords.p'.
picklepath = os.path.join('topwords.p')
picklepath
# Command line: pdftotext <input pdf> <output text file>.
cmd = "pdftotext %s %s" % (pdfpath, "out.txt")
cmd
# Remove any previous output file (shell command).
rm out.txt
from subprocess import call
# Passing the whole command as one string without shell=True makes call()
# treat the string as the program name.
# NOTE(review): this attempt presumably failed; it is retried below with a list.
call(cmd)
cmd
ls
call
cmd
cmd.split()
# Working form: pass the argument list; call() returns the exit status.
x = call(cmd.split())
ls
rm out.txt
x = call(cmd.split())
ls
# Shows the stored exit status (`.real` of an int is the int itself).
x.real
ls
# Tokenise the extracted text and filter it down to candidate words.
# Read out.txt and split it on whitespace. The file object is never closed
# explicitly.
txtlst = open("out.txt").read().split()
# Keep only tokens made up entirely of word characters (\w) and hyphens,
# lower-cased.
words = [x.lower() for x in txtlst if re.match('^[\w-]+$', x) is not None]
words
# Drop tokens of two characters or fewer and tokens found in `stopwords`.
# The tokens are lower-cased, while `stopwords` is used exactly as built.
words = [x for x in words if len(x)>2 and (not x in stopwords)]
words
len(words)
# Count word frequencies, keep the N most frequent words and pickle them.
# Uses the Python 2 dict API (iteritems). The first lines are premature
# attempts; the working sequence starts at the `for` loop below.
# NOTE(review): `top` has not been assigned yet in the visible history, so
# this presumably raised NameError.
pickle.dump(top, open(picklepath, "wb"))
# NOTE(review): `wcount` is likewise not assigned yet at this point.
top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
wcount = {}
wcount.iteritems()
# wcount is still empty here, so `top` is an empty list.
top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
top
wcount = {}
top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
top
words
# Tally how many times each word occurs.
for w in words: wcount[w] = wcount.get(w, 0) + 1
wcount
# Sort the (word, count) pairs by count, highest first, and keep the first N.
top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
top
# Serialise the ranked list to `picklepath`, opened in binary write mode.
pickle.dump(top, open(picklepath, "wb"))
# Check the pickle file by loading it back, then publish the session.
ls
# Page through the raw pickle file from the shell.
less topwords.p
picklepath
# Load the object stored in the pickle file back into `twords`.
twords = pickle.load(open(picklepath, "rb"))
twords
# Build a dict from the loaded items.
# NOTE(review): presumably a word -> count mapping, if `twords` is the list of
# (word, count) pairs dumped earlier in the session -- confirm.
dict(iter(twords))
# IPython magic: upload input lines 1-91 of this session to a pastebin.
%pastebin 1-91
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment