Use NLTK to extract content words from text: tokenize, POS-tag, NE-chunk, then drop named entities, stopwords, and punctuation.
__author__ = 'alex'
# from pyspark import SparkContext, SparkConf

import nltk
from nltk.corpus import stopwords

sw = stopwords.words('english')
tk = nltk.tokenize.WordPunctTokenizer()

# Penn Treebank punctuation tags to filter out (the original checked only '.').
PUNCT_TAGS = {'.', ',', ':', '``', "''", '(', ')', '#', '$'}


def list_useful_word(text):
    """
    Return a list of 'word/TAG' strings (word plus its POS tag) from the input text.

    This function removes:
    [1] named entities (locations, persons, organizations)
    [2] stopwords
    [3] punctuation
    :param text: str
    :return: list(str)
    """
    # Pipeline: sentence split -> word tokenize -> POS tag -> NE chunk.
    parsed_text = nltk.ne_chunk_sents(
        nltk.pos_tag_sents(tk.tokenize_sents(nltk.sent_tokenize(text))))
    useful_word_list = []
    for sent in parsed_text:
        for node in sent:
            # Named entities are chunked into nltk.Tree nodes; plain tokens
            # are (word, tag) tuples. Keep tokens that are neither stopwords
            # nor punctuation.
            if not isinstance(node, nltk.Tree) \
                    and node[0].lower() not in sw \
                    and node[1] not in PUNCT_TAGS:
                useful_word_list.append(node[0] + '/' + node[1])
    return useful_word_list


print(list_useful_word("This is a good show boy. You should come with us to see it!"))
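
Before the first run, the pipeline above needs several NLTK data packages. A minimal setup sketch, assuming an NLTK 3.x installation where none of the data has been fetched yet (the package identifiers below are the standard NLTK download IDs and may differ in other versions):

import nltk

# Download the models the pipeline depends on:
# punkt -> sentence tokenizer, averaged_perceptron_tagger -> POS tagger,
# maxent_ne_chunker + words -> named-entity chunker, stopwords -> stopword lists.
for pkg in ['punkt', 'averaged_perceptron_tagger',
            'maxent_ne_chunker', 'words', 'stopwords']:
    nltk.download(pkg)

Each entry in the returned list is a 'word/TAG' string (e.g. 'good/JJ'); for the demo sentence, stopwords such as 'This', 'is', and 'a' are filtered out before the list is built.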