Skip to content

Instantly share code, notes, and snippets.

@pasupulaphani
Forked from alexbowe/nltk-intro.py
Created August 21, 2016 14:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pasupulaphani/c01023a3ea9f9a66b308e642f24f1abc to your computer and use it in GitHub Desktop.
Save pasupulaphani/c01023a3ea9f9a66b308e642f24f1abc to your computer and use it in GitHub Desktop.
Demonstration of extracting key phrases with NLTK in Python
import nltk
text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself."""
# Used when tokenizing words
sentence_re = r'''(?x) # set flag to allow verbose regexps
([A-Z])(\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
| \w+(-\w+)* # words with optional internal hyphens
| \$?\d+(\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?():-_`] # these are separate tokens
'''
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
#Taken from Su Nam Kim Paper...
grammar = r"""
NBAR:
{<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
NP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)
toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)
print postoks
tree = chunker.parse(postoks)
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
def leaves(tree):
"""Finds NP (nounphrase) leaf nodes of a chunk tree."""
for subtree in tree.subtrees(filter = lambda t: t.node=='NP'):
yield subtree.leaves()
def normalise(word):
"""Normalises words to lowercase and stems and lemmatizes it."""
word = word.lower()
word = stemmer.stem_word(word)
word = lemmatizer.lemmatize(word)
return word
def acceptable_word(word):
"""Checks conditions for acceptable word: length, stopword."""
accepted = bool(2 <= len(word) <= 40
and word.lower() not in stopwords)
return accepted
def get_terms(tree):
for leaf in leaves(tree):
term = [ normalise(w) for w,t in leaf if acceptable_word(w) ]
yield term
terms = get_terms(tree)
for term in terms:
for word in term:
print word,
print
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment