# yjxiong/nltk_word_extract.py

## nltk_word_extract.py
__author__ = 'alex'


# from pyspark import SparkContext, SparkConf

import nltk
from nltk.corpus import stopwords

# English stopword list from the NLTK corpus; list_useful_word tests
# lower-cased tokens for membership in it.
sw = stopwords.words('english')
# Shared tokenizer instance; list_useful_word applies it to every sentence.
tk = nltk.tokenize.WordPunctTokenizer()


def list_useful_word(text):
    """
    Return the useful words of the input text, each tagged with its POS.

    The text is split into sentences, tokenized with ``tk``, POS-tagged and
    passed through NLTK's named-entity chunker. A token is dropped when it is:
        [1] part of a named entity (location, person, organization)
        [2] a stopword (lower-cased token found in ``sw``)
        [3] punctuation (tagged '.' or made up only of punctuation characters)

    :param text: str
    :return: list(str), one "word/POS" entry per kept token, in text order
    """
    # Function-scope import so the block does not depend on a file-level one.
    from string import punctuation

    tagged_sents = nltk.pos_tag_sents(
        tk.tokenize_sents(nltk.sent_tokenize(text)))

    useful_word_list = []
    for sent in nltk.ne_chunk_sents(tagged_sents):
        for node in sent:
            # Named entities are nested subtrees; ordinary tokens are
            # (word, tag) pairs.
            if isinstance(node, nltk.tree.Tree):
                continue
            word, tag = node
            if word.lower() in sw:
                continue
            # In the Penn Treebank tag set '.' marks sentence-final
            # punctuation only, so commas, colons, brackets and quotes are
            # recognised by looking at the token text itself.
            if tag == '.' or all(ch in punctuation for ch in word):
                continue
            useful_word_list.append(word + '/' + tag)

    return useful_word_list


# Demo run. The call form of print with a single argument produces the same
# output on Python 2 and also parses on Python 3.
print(list_useful_word("This is a good show boy. You should come with us to see it!"))
	__author__ = 'alex'


	# from pyspark import SparkContext, SparkConf

	import nltk
	from nltk.corpus import stopwords

	# English stopword list from the NLTK corpus; list_useful_word tests
	# lower-cased tokens for membership in it.
	sw = stopwords.words('english')
	# Shared tokenizer instance; list_useful_word applies it to every sentence.
	tk = nltk.tokenize.WordPunctTokenizer()


	def list_useful_word(text):
	    """
	    Return the useful words of the input text, each tagged with its POS.

	    The text is split into sentences, tokenized with ``tk``, POS-tagged and
	    passed through NLTK's named-entity chunker. A token is dropped when it is:
	        [1] part of a named entity (location, person, organization)
	        [2] a stopword (lower-cased token found in ``sw``)
	        [3] punctuation (tagged '.' or made up only of punctuation characters)

	    :param text: str
	    :return: list(str), one "word/POS" entry per kept token, in text order
	    """
	    # Function-scope import so the block does not depend on a file-level one.
	    from string import punctuation

	    tagged_sents = nltk.pos_tag_sents(
	        tk.tokenize_sents(nltk.sent_tokenize(text)))

	    useful_word_list = []
	    for sent in nltk.ne_chunk_sents(tagged_sents):
	        for node in sent:
	            # Named entities are nested subtrees; ordinary tokens are
	            # (word, tag) pairs.
	            if isinstance(node, nltk.tree.Tree):
	                continue
	            word, tag = node
	            if word.lower() in sw:
	                continue
	            # In the Penn Treebank tag set '.' marks sentence-final
	            # punctuation only, so commas, colons, brackets and quotes are
	            # recognised by looking at the token text itself.
	            if tag == '.' or all(ch in punctuation for ch in word):
	                continue
	            useful_word_list.append(word + '/' + tag)

	    return useful_word_list


	# Demo run. The call form of print with a single argument produces the same
	# output on Python 2 and also parses on Python 3.
	print(list_useful_word("This is a good show boy. You should come with us to see it!"))