Skip to content

Instantly share code, notes, and snippets.

@mac389
Created June 19, 2015 14:37
Show Gist options
  • Save mac389/f9f08bdb4cf4ebc8a1a8 to your computer and use it in GitHub Desktop.
import json
import nltk
#import matplotlib.pyplot as plt
#import utils as tech
from nltk.corpus import stopwords
from nltk import bigrams,trigrams
from pprint import pprint
READ = 'r'
# twitter-output.json is a JSON array whose elements are themselves JSON-encoded
# strings, so each element is decoded a second time with json.loads.
# The original called open() bare and leaked the file handle; a context
# manager guarantees it is closed.
with open('twitter-output.json', READ) as infile:
    corpus = [json.loads(item) for item in json.load(infile)]
def cleanse(data, remove_stopwords=True):
    """Tokenize and normalize raw tweets.

    Parameters
    ----------
    data : iterable of dict
        Each item must have a 'text' key holding the tweet body.
    remove_stopwords : bool, default True
        When True, drop English stopwords. (The original accepted this
        flag but ignored it; it is now honored.)

    Returns
    -------
    list of list of str
        One lower-cased, whitespace-split token list per tweet, with
        URLs, stopwords (optionally), and non-ASCII tokens removed.
    """
    # Extract and tokenize the text of every tweet.
    corpus = [datum['text'].lower().split() for datum in data]
    # Remove URLs. The original only matched 'http://' and so let
    # https links through; startswith accepts a tuple of prefixes.
    corpus = [[word for word in text
               if not word.startswith(('http://', 'https://'))]
              for text in corpus]
    if remove_stopwords:
        # Hoisted out of the per-word loop: the original re-read the
        # stopword corpus and linearly scanned it for EVERY word.
        # A set gives one corpus load and O(1) membership tests.
        stops = set(stopwords.words('english'))
        corpus = [[word for word in text if word not in stops]
                  for text in corpus]
    # Keep only pure-ASCII tokens (drops emoji / mis-decoded unicode).
    corpus = [[word for word in text
               if all(ord(ch) < 128 for ch in word)]
              for text in corpus]
    return corpus
def extract_entities(selector, tweet):
    '''Tweet is a list of words; return those starting with *selector*.'''
    matches = []
    for token in tweet:
        if token.startswith(selector):
            matches.append(token)
    return matches
def extract_hashtags(tweet):
    '''Tweet is a list of words; return its '#'-prefixed tokens.'''
    # Inlined equivalent of extract_entities('#', tweet).
    return [word for word in tweet if word.startswith('#')]
def extract_people(tweet):
    '''Tweet is a list of words; return its '@'-prefixed mentions.'''
    # Inlined equivalent of extract_entities('@', tweet).
    return [word for word in tweet if word.startswith('@')]
# Flatten the cleansed per-tweet token lists into one token stream.
tokens = [word for tweet in cleanse(corpus) for word in tweet]
word_frequencies = nltk.FreqDist(tokens)
# bigrams()/trigrams() return one-shot generators; materialize them once
# so the values survive being printed (the original's pprint(list(...))
# would have exhausted them for any later use).
bi_tokens = list(bigrams(tokens))
tri_tokens = list(trigrams(tokens))
pprint(bi_tokens)
pprint(tri_tokens)  # original line was missing its closing parenthesis (SyntaxError)
'''
Exercises:
1. Partition data
2.
'''
''' Data Visualization'''
# NOTE(review): this section requires matplotlib and the local utils module,
# whose imports are commented out at the top of the file
# (`#import matplotlib.pyplot as plt`, `#import utils as tech`).
# As written it raises NameError on `plt` — uncomment those imports to run.
fig = plt.figure()
ax = fig.add_subplot(111)
# Top 25 tokens by count; zip(*...) splits (word, freq) pairs into two tuples.
words,freqs = zip(*word_frequencies.most_common(25))
ax.plot(freqs,'k--',linewidth=2)
# presumably a helper that hides the top/right spines — confirm in utils.py
tech.adjust_spines(ax)
ax.set_xticks(range(len(words)))
ax.set_xticklabels(words,rotation='vertical',weight='bold')
ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('word-frequency')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment