jaidevd/pycon.py

## pycon.py
#!/usr/bin/env python

import os
import json
import numpy as np
from pandas import DataFrame, concat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from matplotlib.pyplot import imshow, plot, show, figure, title, yticks


# Download this file from https://gist.github.com/3809261
filename = 'pyconindia2012.json'

###############################################################################
# Decoding the json file, getting the data in a pandas dataframe and saving it
# as an xls file.
###############################################################################

# Each non-blank line of the file is decoded as one JSON tweet object.  The
# context manager closes the handle as soon as parsing is done instead of
# leaving it open for the rest of the script.
with open(filename, 'r') as f:
    tweets = [json.loads(line) for line in f if line.strip()]

# One parallel list per field of interest; position i in every list refers to
# the same tweet.
tweeters = [tweet['from_user_name'] for tweet in tweets]
texts = [tweet['text'] for tweet in tweets]
timestamps = [tweet['created_at'] for tweet in tweets]
metadata = [tweet['metadata']['result_type'] for tweet in tweets]

tweet_dict = {
    'tweeters': tweeters,
    'texts': texts,
    'timestamps': timestamps,
    'metadata': metadata,
}

# Keep a spreadsheet copy of the raw (unprocessed) tweets.
df = DataFrame(tweet_dict)
df.to_excel('pycontweets_pandas.xls')


###############################################################################
# Processing the text in tweets to remove redundancies
###############################################################################

# These characters are unwanted in the words in a tweet.
# NOTE(review): matching is by case-sensitive substring, so a word is dropped
# if it merely *contains* one of these (e.g. 'on' also drops 'python').  This
# is kept as-is to preserve the existing results -- confirm whether whole-word
# matching was intended for the stop words.
unchars = ['@', '#', 'http', 'rt', 'RT', 'and', 'at', 'by', 'for', 'in', 'is',
           'of', 'on', 'the', 'two', 'with', 'to', 'was', 'day', 'will', 'it',
           'who', 'had']

proc_text = []

for text in texts:
    # Keep only the words that contain none of the unwanted fragments.
    wordlist = [
        word for word in text.split(' ')
        if not any(unchar in word for unchar in unchars)
    ]
    # Every kept word is followed by a single space (including the last one),
    # exactly as the original concatenation loop produced.
    proc_text.append(''.join(word + ' ' for word in wordlist))

# Replace the raw tweet text in the dataframe with the filtered version.
df['texts'] = proc_text


###############################################################################
# Tokenizing and analyzing the text in tweets
###############################################################################

vectorizer = TfidfVectorizer()
text_vectorized = vectorizer.fit_transform(proc_text)
# Total TF-IDF weight of every vocabulary term, summed over all tweets.
tv_sum = np.sum(text_vectorized.toarray(), axis=0)

# Show the per-term totals so the user can pick a sensible cut-off.
plot(tv_sum)
show()

# float() makes the comparison below numeric whether input() hands back a
# string (Python 3) or an already-evaluated number (Python 2).
thresh = float(input('Enter Threshold:\n'))

# Boolean mask over vocabulary columns whose total weight exceeds the cut-off.
inds = tv_sum > thresh

# vocabulary_ maps term -> column index; invert it so each selected column can
# be paired with its own term.
index_to_term = {
    index: term for term, index in vectorizer.vocabulary_.items()
}

# Build the tick positions and tick labels together so that keywords[j] always
# names row keyword_inds[j], and drop terms shorter than three characters
# without mutating a list that is being iterated.
keyword_inds = []
keywords = []
for i, above_thresh in enumerate(inds):
    if not above_thresh:
        continue
    term = index_to_term[i]
    if len(term) < 3:
        continue
    keyword_inds.append(i)
    keywords.append(term)


# Rows are vocabulary terms, columns are tweets; only keyword rows get labels.
imshow(text_vectorized.toarray().T, aspect='auto')
yticks(keyword_inds, tuple(keywords), rotation=0)
title('Image Plot of Words in tweets')
show()


# Making a PCA plot:

# Project the dense TF-IDF matrix onto its first two whitened principal
# components.
pca = PCA(n_components=2, whiten=True)
dense_tfidf = text_vectorized.toarray()
pc_red = pca.fit_transform(dense_tfidf)

# Scatter the tweets in the reduced space: first component on x, second on y.
first_pc = pc_red[:, 0]
second_pc = pc_red[:, 1]
figure()
plot(first_pc, second_pc, 'ro')
title('PCA Plot')
show()

# Performing K-means clustering
# int() makes the value usable whether input() hands back a string (Python 3)
# or an already-evaluated number (Python 2).
k = int(input('Input cluster numbers:\n'))
# Cluster with the number of clusters the user asked for (this used to be
# hard-coded to 2, silently ignoring the answer).
km = KMeans(n_clusters=k)
km.fit(pc_red)

# Cluster centres as large red crosses, tweets as blue dots.
figure()
plot(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 'r+', markersize=20)
plot(pc_red[:, 0], pc_red[:, 1], 'bo')
show()
	#!/usr/bin/env python

	import os
	import json
	import numpy as np
	from pandas import DataFrame, concat
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.decomposition import PCA
	from sklearn.cluster import KMeans
	from matplotlib.pyplot import imshow, plot, show, figure, title, yticks


	# Download this file from https://gist.github.com/3809261
	filename = 'pyconindia2012.json'

	###############################################################################
	# Decoding the json file, getting the data in a pandas dataframe and saving it
	# as an xls file.
	###############################################################################

	# Each non-blank line of the file is decoded as one JSON tweet object.  The
	# context manager closes the handle as soon as parsing is done instead of
	# leaving it open for the rest of the script.
	with open(filename, 'r') as f:
	    tweets = [json.loads(line) for line in f if line.strip()]

	# One parallel list per field of interest; position i in every list refers to
	# the same tweet.
	tweeters = [tweet['from_user_name'] for tweet in tweets]
	texts = [tweet['text'] for tweet in tweets]
	timestamps = [tweet['created_at'] for tweet in tweets]
	metadata = [tweet['metadata']['result_type'] for tweet in tweets]

	tweet_dict = {
	    'tweeters': tweeters,
	    'texts': texts,
	    'timestamps': timestamps,
	    'metadata': metadata,
	}

	# Keep a spreadsheet copy of the raw (unprocessed) tweets.
	df = DataFrame(tweet_dict)
	df.to_excel('pycontweets_pandas.xls')



	###############################################################################
	# Processing the text in tweets to remove redundancies
	###############################################################################

	# These characters are unwanted in the words in a tweet.
	# NOTE(review): matching is by case-sensitive substring, so a word is dropped
	# if it merely *contains* one of these (e.g. 'on' also drops 'python').  This
	# is kept as-is to preserve the existing results -- confirm whether whole-word
	# matching was intended for the stop words.
	unchars = ['@', '#', 'http', 'rt', 'RT', 'and', 'at', 'by', 'for', 'in', 'is',
	           'of', 'on', 'the', 'two', 'with', 'to', 'was', 'day', 'will', 'it',
	           'who', 'had']

	proc_text = []

	for text in texts:
	    # Keep only the words that contain none of the unwanted fragments.
	    wordlist = [
	        word for word in text.split(' ')
	        if not any(unchar in word for unchar in unchars)
	    ]
	    # Every kept word is followed by a single space (including the last one),
	    # exactly as the original concatenation loop produced.
	    proc_text.append(''.join(word + ' ' for word in wordlist))

	# Replace the raw tweet text in the dataframe with the filtered version.
	df['texts'] = proc_text



	###############################################################################
	# Tokenizing and analyzing the text in tweets
	###############################################################################

	vectorizer = TfidfVectorizer()
	text_vectorized = vectorizer.fit_transform(proc_text)
	# Total TF-IDF weight of every vocabulary term, summed over all tweets.
	tv_sum = np.sum(text_vectorized.toarray(), axis=0)

	# Show the per-term totals so the user can pick a sensible cut-off.
	plot(tv_sum)
	show()

	# float() makes the comparison below numeric whether input() hands back a
	# string (Python 3) or an already-evaluated number (Python 2).
	thresh = float(input('Enter Threshold:\n'))

	# Boolean mask over vocabulary columns whose total weight exceeds the cut-off.
	inds = tv_sum > thresh

	# vocabulary_ maps term -> column index; invert it so each selected column can
	# be paired with its own term.
	index_to_term = {
	    index: term for term, index in vectorizer.vocabulary_.items()
	}

	# Build the tick positions and tick labels together so that keywords[j] always
	# names row keyword_inds[j], and drop terms shorter than three characters
	# without mutating a list that is being iterated.
	keyword_inds = []
	keywords = []
	for i, above_thresh in enumerate(inds):
	    if not above_thresh:
	        continue
	    term = index_to_term[i]
	    if len(term) < 3:
	        continue
	    keyword_inds.append(i)
	    keywords.append(term)


	# Rows are vocabulary terms, columns are tweets; only keyword rows get labels.
	imshow(text_vectorized.toarray().T, aspect='auto')
	yticks(keyword_inds, tuple(keywords), rotation=0)
	title('Image Plot of Words in tweets')
	show()


	# Making a PCA plot:

	# Project the dense TF-IDF matrix onto its first two whitened principal
	# components.
	pca = PCA(n_components=2, whiten=True)
	dense_tfidf = text_vectorized.toarray()
	pc_red = pca.fit_transform(dense_tfidf)

	# Scatter the tweets in the reduced space: first component on x, second on y.
	first_pc = pc_red[:, 0]
	second_pc = pc_red[:, 1]
	figure()
	plot(first_pc, second_pc, 'ro')
	title('PCA Plot')
	show()

	# Performing K-means clustering
	# int() makes the value usable whether input() hands back a string (Python 3)
	# or an already-evaluated number (Python 2).
	k = int(input('Input cluster numbers:\n'))
	# Cluster with the number of clusters the user asked for (this used to be
	# hard-coded to 2, silently ignoring the answer).
	km = KMeans(n_clusters=k)
	km.fit(pc_red)

	# Cluster centres as large red crosses, tweets as blue dots.
	figure()
	plot(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 'r+', markersize=20)
	plot(pc_red[:, 0], pc_red[:, 1], 'bo')
	show()