# Created April 12, 2016 19:14
import gensim
import nltk
from nltk import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import itertools, string
# Read a single file.
# NOTE: the paths below are hard-coded to the original author's machine.
# The context manager closes the handle once the text has been read; plain
# 'r' is used because the 'U' mode flag is rejected by newer Python 3 releases.
with open('/Users/tonyojeda/Desktop/keyphrase/ddl_corpus/corpus_text/2014-08-08-how-to-transition-from-excel-to-r.txt', 'r') as f:
    text = f.read()

# Read the entire corpus: every file under CORPUS_TEXT whose name matches
# the regular expression below.
CORPUS_TEXT = '/Users/tonyojeda/Desktop/keyphrase/ddl_corpus/corpus_text'
texts = PlaintextCorpusReader(CORPUS_TEXT, r'.*\.txt')
def corpus_info(corpus):
    """Print out information about the status of a corpus.

    Args:
        corpus: An object exposing ``fileids()``, ``paras()``, ``sents()``
            and ``words()`` (for example an NLTK corpus reader).  The
            results of the first three must support ``len()``; ``paras()``
            and ``words()`` must be iterable.

    The report lists the number of files, paragraphs and sentences, the
    average number of sentences per paragraph, the total word count, the
    vocabulary size (distinct words) and the lexical diversity (word count
    divided by vocabulary size).  Both ratios are reported as 0.0 for an
    empty corpus instead of raising ``ZeroDivisionError``.
    """
    fids = len(corpus.fileids())

    # Load the paragraphs once; they are needed for both the count and the sum.
    paragraphs = corpus.paras()
    paras = len(paragraphs)
    sents = len(corpus.sents())

    # len(para) is counted as the number of sentences in that paragraph.
    sperp = sum(len(para) for para in paragraphs) / float(paras) if paras else 0.0

    # One pass over the words gives both the total and the distinct count.
    count = 0
    vocabulary = set()
    for word in corpus.words():
        count += 1
        vocabulary.add(word)
    vocab = len(vocabulary)
    lexdiv = float(count) / float(vocab) if vocab else 0.0

    message = (
        "Text corpus contains {} files\n"
        "Composed of {} paragraphs and {} sentences.\n"
        "{:0.3f} sentences per paragraph\n"
        "Word count of {} with a vocabulary of {}\n"
        "lexical diversity is {:0.3f}"
    ).format(
        fids, paras, sents, sperp, count, vocab, lexdiv
    )
    print(message)
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    """Extract candidate keyphrases from ``text`` as lowercased chunks.

    The text is split into sentences, word-tokenized and POS-tagged with
    NLTK, then chunked with a ``RegexpParser`` built from ``grammar``.
    Every run of consecutive tokens that lie inside a chunk is joined with
    single spaces and lowercased.

    Args:
        text: The raw document text.
        grammar: Chunk grammar passed to ``nltk.chunk.regexp.RegexpParser``.

    Returns:
        A list of candidate phrases in document order (duplicates are kept),
        excluding phrases that are English stop words or that consist only
        of punctuation characters.
    """
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        for tagged_sent in tagged_sents))

    # join constituent chunk words into a single chunked phrase
    # Each item is a (word, pos, chunk_tag) triple and 'O' marks a token
    # outside every chunk.  The key indexes the triple instead of unpacking it
    # in the lambda signature, because tuple parameters are a syntax error on
    # Python 3.  Grouping on "not 'O'" means back-to-back chunks with no
    # outside token between them come out as one phrase.
    candidates = [
        ' '.join(word for word, pos, chunk in group).lower()
        for in_chunk, group in itertools.groupby(
            all_chunks, lambda triple: triple[2] != 'O')
        if in_chunk
    ]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    """Return the lowercased words of ``text`` whose POS tag is in ``good_tags``.

    The text is sentence-split, word-tokenized and POS-tagged with NLTK.
    A word is kept when its tag is in ``good_tags``, its lowercased form is
    not an English stop word, and it is not made up solely of punctuation
    characters.  Words are returned in document order, duplicates included.
    """
    # Filters applied to every tagged word.
    punctuation = set(string.punctuation)
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Tokenize and POS-tag the text sentence by sentence.
    sentences = nltk.sent_tokenize(text)
    tagged_sentences = nltk.pos_tag_sents(
        nltk.word_tokenize(sentence) for sentence in sentences)

    # Walk every (word, tag) pair and keep the lowercased survivors.
    kept = []
    for tagged_sentence in tagged_sentences:
        for token, tag in tagged_sentence:
            if tag not in good_tags:
                continue
            lowered = token.lower()
            if lowered in stopwords:
                continue
            if all(char in punctuation for char in token):
                continue
            kept.append(lowered)
    return kept
def score_keyphrases_by_tfidf(texts, candidates='chunks'):
    """Score candidate keyphrases of every document with a gensim TF-IDF model.

    Args:
        texts: Either a corpus reader exposing ``fileids()`` and
            ``raw(fileid)`` (one document per file), or any iterable of
            document strings.
        candidates: ``'chunks'`` to extract candidates with
            ``extract_candidate_chunks`` or ``'words'`` to use
            ``extract_candidate_words``.

    Returns:
        A ``(corpus_tfidf, dictionary)`` tuple: the bag-of-candidates corpus
        transformed by the fitted ``TfidfModel``, and the gensim
        ``Dictionary`` that maps candidate ids to candidate strings.

    Raises:
        ValueError: If ``candidates`` is neither ``'chunks'`` nor ``'words'``.
    """
    # pick the extractor up front so an unknown mode fails with a clear error
    # instead of an unbound local further down
    if candidates == 'chunks':
        extract = extract_candidate_chunks
    elif candidates == 'words':
        extract = extract_candidate_words
    else:
        raise ValueError(
            "candidates must be 'chunks' or 'words', got {!r}".format(candidates))

    # The extractors need document text, not file names, so read the raw
    # contents of each file when a corpus reader is given.
    if hasattr(texts, 'fileids'):
        documents = (texts.raw(fileid) for fileid in texts.fileids())
    else:
        documents = texts

    # extract candidates from each text in texts, either chunks or words
    boc_texts = [extract(document) for document in documents]

    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]

    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return corpus_tfidf, dictionary
