Skip to content

Instantly share code, notes, and snippets.

@mac389
Created June 19, 2015 14:37
Show Gist options
  • Save mac389/f9f08bdb4cf4ebc8a1a8 to your computer and use it in GitHub Desktop.
import json
import nltk
#import matplotlib.pyplot as plt
#import utils as tech
from nltk.corpus import stopwords
from nltk import bigrams,trigrams
from pprint import pprint
READ = 'r'
# twitter-output.json is a JSON array whose elements are themselves JSON-encoded
# strings, so each element is decoded a second time with json.loads.
# The original called open() bare and leaked the file handle; a context
# manager guarantees it is closed.
with open('twitter-output.json', READ) as infile:
    corpus = [json.loads(item) for item in json.load(infile)]
def cleanse(data, remove_stopwords=True):
    """Tokenize and normalize raw tweets.

    Parameters
    ----------
    data : iterable of dict
        Each item must have a 'text' key holding the tweet body.
    remove_stopwords : bool, default True
        When True, drop English stopwords. (The original accepted this
        flag but ignored it; it is now honored.)

    Returns
    -------
    list of list of str
        One lower-cased, whitespace-split token list per tweet, with
        URLs, stopwords (optionally), and non-ASCII tokens removed.
    """
    # Extract and tokenize the text of every tweet.
    corpus = [datum['text'].lower().split() for datum in data]
    # Remove URLs. The original only matched 'http://' and so let
    # https links through; startswith accepts a tuple of prefixes.
    corpus = [[word for word in text
               if not word.startswith(('http://', 'https://'))]
              for text in corpus]
    if remove_stopwords:
        # Hoisted out of the per-word loop: the original re-read the
        # stopword corpus and linearly scanned it for EVERY word.
        # A set gives one corpus load and O(1) membership tests.
        stops = set(stopwords.words('english'))
        corpus = [[word for word in text if word not in stops]
                  for text in corpus]
    # Keep only pure-ASCII tokens (drops emoji / mis-decoded unicode).
    corpus = [[word for word in text
               if all(ord(ch) < 128 for ch in word)]
              for text in corpus]
    return corpus
def extract_entities(selector, tweet):
    '''Tweet is a list of words; return those starting with *selector*.'''
    matches = []
    for token in tweet:
        if token.startswith(selector):
            matches.append(token)
    return matches
def extract_hashtags(tweet):
    '''Tweet is a list of words; return its '#'-prefixed tokens.'''
    # Inlined equivalent of extract_entities('#', tweet).
    return [word for word in tweet if word.startswith('#')]
def extract_people(tweet):
    '''Tweet is a list of words; return its '@'-prefixed mentions.'''
    # Inlined equivalent of extract_entities('@', tweet).
    return [word for word in tweet if word.startswith('@')]
# Flatten the cleansed per-tweet token lists into one token stream.
tokens = [word for tweet in cleanse(corpus) for word in tweet]
word_frequencies = nltk.FreqDist(tokens)
# bigrams()/trigrams() return one-shot generators; materialize them once
# so the values survive being printed (the original's pprint(list(...))
# would have exhausted them for any later use).
bi_tokens = list(bigrams(tokens))
tri_tokens = list(trigrams(tokens))
pprint(bi_tokens)
pprint(tri_tokens)  # original line was missing its closing parenthesis (SyntaxError)
'''
Exercises:
1. Partition data
2.
'''
''' Data Visualization'''
# NOTE(review): this section requires matplotlib and the local utils module,
# whose imports are commented out at the top of the file
# (`#import matplotlib.pyplot as plt`, `#import utils as tech`).
# As written it raises NameError on `plt` — uncomment those imports to run.
fig = plt.figure()
ax = fig.add_subplot(111)
# Top 25 tokens by count; zip(*...) splits (word, freq) pairs into two tuples.
words,freqs = zip(*word_frequencies.most_common(25))
ax.plot(freqs,'k--',linewidth=2)
# presumably a helper that hides the top/right spines — confirm in utils.py
tech.adjust_spines(ax)
ax.set_xticks(range(len(words)))
ax.set_xticklabels(words,rotation='vertical',weight='bold')
ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('word-frequency')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment