Analyze 365Grateful on Twitter
#adapted from minus_context
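# Pulls recent #365grateful tweets via the Twitter search API and plots
# mention counts, term frequencies, and the most common link domains.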
import nltk
import feedparser
import random
import twitter
import yaml
import string
import re
import matplotlib.pyplot as plt
from urlparse import urlparse
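# NLTK corpora used below: run nltk.download('punkt') and nltk.download('stopwords') once if missing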
#uses the format defined here: https://github.com/natematias/minus_context/blob/master/config.yaml.sample
api_key = yaml.load(open('config.yaml'))
api = twitter.Api(consumer_key=api_key['api'],
                  consumer_secret=api_key['consumer_secret'],
                  access_token_key=api_key['access_token_key'],
                  access_token_secret=api_key['access_token_secret'])
gratitude = []
# Fetch recent tweets tagged #365grateful (single search request)
tweets = api.GetSearch("#365grateful", include_entities=True, count=1000)
gratitude = tweets
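# The commented-out loop below would page back through older results with max_id
# until at least 500 tweets have been collected: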
#while(len(gratitude)< 500):
# tweets = api.GetSearch("#365grateful", include_entities=True, count=1000,max_id=gratitude[-1].id)
# gratitude = gratitude + tweets
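# Histogram: how many other users each #365grateful tweet mentions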
mention_count = [len(x.user_mentions) for x in gratitude]
plt.hist(mention_count)
plt.ylabel('Number of tweets')
plt.title('Number of #365grateful tweets mentioning other users', fontsize=12)
plt.xlabel('Number of users mentioned per tweet')
plt.show()
# plot the term frequency distribution among the tweets
stopwords = set(nltk.corpus.stopwords.words('english'))
all_text = ' '.join([tweet.text for tweet in gratitude])
tweet_tokens = [re.sub(r'[^\w\s]', '', x).lower().strip() for x in nltk.word_tokenize(all_text)
                if x.lower() not in stopwords and x not in string.punctuation]
tweet_tokens = [x for x in tweet_tokens if len(x) > 0]
fdist = nltk.FreqDist(tweet_tokens)
fdist.plot(50, cumulative=False)
# deduplicated vocabulary across all the tweets
unique_tokens = list(set(tweet_tokens))
# count URLs
url_count = [len(x.urls) for x in gratitude]
plt.hist(url_count,2)
plt.ylabel('Number of tweets')
plt.title('Number of #365grateful tweets including links', fontsize=12)
plt.xlabel('Number of links per tweet')
plt.show()
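# Expand each tweet's shortened links and plot the most frequent link domains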
expanded_urls = []
for tweet in gratitude:
    expanded_urls = expanded_urls + [url.expanded_url for url in tweet.urls]
baseurls = [urlparse(url).netloc for url in expanded_urls]
urldist = nltk.FreqDist(baseurls)
urldist.plot(4, cumulative=False)
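# Likely dependencies (assumed, not listed in the gist): python-twitter, PyYAML, nltk, matplotlib;
# written for Python 2, given the urlparse import.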