Created
September 26, 2012 22:55
-
-
Save michelleboisson/3791161 to your computer and use it in GitHub Desktop.
Data Without Borders - Assignment 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How many unique users have more than 100000 followers? What are their screen names?
# The ID columns are read as text: tweet IDs are larger than 2^53, so parsing
# them as numbers would silently round them and merge distinct IDs.
tweets <- read.csv(
  "/Users/michelleboisson/Documents/ITP/* Data without Borders/hw3/libya_tweets.csv",
  as.is = TRUE,
  colClasses = c(tweet_id = "character", retweet = "character")
)
# "More than" is a strict comparison. which() also drops rows whose follower
# count is missing or cannot be converted to a number.
popular.users <- unique(
  tweets$screen_name[which(as.numeric(tweets$followers) > 100000)]
)
length(popular.users)
popular.users
# Output recorded from the original run:
# [1] "detikcom" "DonLemonCNN" "HuffingtonPost" "Dputamadre" "WorldRss" "AlMasryAlYoum"
# [7] "theobscurant" "fadjroeL" "TPO_Hisself" "CAPAMAG" "TwittyAlgeria" "foxandfriends"
# [13] "PranayGupte"
# It’d be interesting to see what part of the world users are tweeting from.
# What are the top 3 locations people are from (not counting blanks)?
location.counts <- rev(sort(table(tweets$location)))
all.locations.sorted <- data.frame(location.counts)
# Drop blank / whitespace-only locations by name rather than assuming the
# blank entry is always the single most common one (the old `[2:4, ]`).
named.locations <- location.counts[trimws(names(location.counts)) != ""]
top.3.locations <- head(named.locations, 3)
top.3.locations
# Output recorded from the original run:
# USA Tripoli, Libya London
# 34 28 20
# Retweets can often indicate what’s important, or at least influential.
# What is the text of the tweet that was retweeted the most times and who tweeted it?
all.retweets.sorted <- data.frame(rev(sort(table(tweets$retweet))))
head(all.retweets.sorted)
# Find the winner by matching on the retweet ID. Indexing tweets$text with the
# sorted table (as before) used the frequency counts as row numbers, so it
# returned whichever tweets happened to sit at those positions.
retweet.ids <- as.character(tweets$retweet)
has.retweet.id <- !is.na(retweet.ids) & retweet.ids != ""
retweet.counts <- table(retweet.ids[has.retweet.id])
top.retweet.id <- names(retweet.counts)[which.max(retweet.counts)]
top.retweet.rows <- which(has.retweet.id & retweet.ids == top.retweet.id)
# Text of the most retweeted tweet (as it appears in the retweets we captured).
unique(tweets$text[top.retweet.rows])
# Who tweeted it: the original tweet's author, if that tweet is in our data ...
tweets$screen_name[which(as.character(tweets$tweet_id) == top.retweet.id)]
# ... otherwise, presumably the first account mentioned in the retweets
# ("RT @user: ...") -- TODO confirm against the text printed above.
unique(tweets$mention1[top.retweet.rows])
# Output recorded from the original run (before the lookup was fixed; re-run to refresh):
#[1] "RT @DennisDMZ: So let me get this straight. There's a War on Women but no War on Terror? #Hey guys, little less focus on the labia little ..."
#[1] "AndrewDeikel"
# Plot the distribution of the number of people the users are following (don’t
# worry about the fact that some people will be counted multiple times – just pretend each
# row is a different user). What do you see?
hist(tweets$following)
### See graph below
### The graph doesn't tell us much, only that there is a large number of people who aren't following a lot of people.
# Let’s reduce our set to just people with fewer than
# 5000 followers and look at the histogram again. What do you see now? Have you tried
# using different breaks? Does anything surprise you?
# A plain logical mask is enough here; which() drops rows whose follower count is NA.
fewer.than.5000.followers <- tweets$followers < 5000
hist(tweets$followers[which(fewer.than.5000.followers)], breaks = 200)
# See image below
### There are a lot of people with very few followers. It's mostly a curve, except for a small spike around 800 followers or so.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Write code to find the 5 most popular words used in the descriptions of our
# users (again, just treat each row as if it’s a unique user, even though that
# means we’ll be counting users who tweeted more than once multiple times).
# Splitting on a single space leaves empty strings wherever a description has
# consecutive spaces; they are counted as a "word" here.
description.words <- unlist(strsplit(tweets$description, " "))
description.words <- tolower(description.words)
sorted.description.words <- rev(sort(table(description.words)))
# Ask for exactly five entries (head() defaults to six).
head(sorted.description.words, 5)
# Output recorded from the original run, which printed six entries:
#description.words
#the and of a to
#868 689 581 511 479 413
### The top five words used in description of the users are 'the', 'and', 'of', ' ', and 'a'.
# Let’s clean out super common words, often referred to as stopwords, so we can just focus on the
# interesting words people are using. That means we need to remove any common words
# from our big ol’ word vector we created in step 2 above. Hmm, how can we remove
# specific elements of a vector?
### load in stop words
# Read the list line by line. read.csv() treated the first word ("a") as the
# column header, which is why the words had to be pulled out of a column named
# `a` and "a" itself had to be added back by hand.
# NOTE(review): assumes the list holds one word per line -- confirm against the file.
stop.words <- readLines(
  "http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop",
  warn = FALSE
)
stop.words <- trimws(stop.words)
### add extras to the vector
newwords <- c(" ", "&", "-", "|", "a")
stop.words <- append(stop.words, newwords)
descr.words.withtout.stop <- description.words[!description.words %in% stop.words]
# head() instead of [1:5]: no NA padding when fewer than five words remain.
head(rev(sort(table(descr.words.withtout.stop))), 5)
# Output recorded from the original run:
# descr.words.withtout.stop
# news love world follow
# 511 233 110 96 73
### I forgot to remove the blank
stop.words <- append(stop.words, "")
descr.words.withtout.stop <- description.words[!description.words %in% stop.words]
head(rev(sort(table(descr.words.withtout.stop))), 5)
# Output recorded from the original run:
#descr.words.withtout.stop
# news love world follow conservative
# 233 110 96 73 69
#Using your skills with %in% and a vector of stopwords, remove the | |
#stopwords from the descriptions and recompute the top 5 words our Twitter users use to | |
#describe themselves. What do you think of the results? Do you have a sense of what | |
#types of users are most common in our dataset? | |
###It looks like most of the users in this dataset are news reporters, report on the world, and a few are conservative. | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Part 3
haiti <- read.csv(
  "/Users/michelleboisson/Documents/ITP/* Data without Borders/hw3/haiti-tweets.csv",
  as.is = TRUE
)
# How many tweets did I collect in over an hour?
nrow(haiti)
# [1] 411
# This is significantly lower than the data about Libya. Obviously, Haiti is not on a lot of people's minds right now.
# Where are the tweets coming from?
all.locations.sorted <- data.frame(rev(sort(table(haiti$location))))
head(all.locations.sorted)
# Output recorded from the original run (the first row is the blank location):
# 98
#Haiti petion-ville 33
#Haiti 15
#CA 6
#Tabarre 5
#San Francisco 5
# What are people tweeting about?
text.words <- unlist(strsplit(haiti$text, " "))
text.words <- tolower(text.words)
sorted.text.words <- rev(sort(table(text.words)))
head(sorted.text.words)
#text.words
#haiti rt in to i my
# 161 134 130 108 103 102
# I'm adding "rt" to the stop words
# NOTE(review): stop.words is not created in this part; presumably it is still
# in the workspace from the description-words step -- confirm before running
# this part on its own.
stop.words <- append(stop.words, "rt")
text.words.without.stop <- text.words[!text.words %in% stop.words]
# head() instead of [1:5]: no NA padding when fewer than five words remain.
head(rev(sort(table(text.words.without.stop))), 5)
#text.words.without.stop
# haiti #haiti ... de haiti,
# 161 82 62 54 47
# So I'm going to add "haiti" and "#haiti" and more to the stop.words too, and see what I get
stop.words <- append(stop.words, c("haiti", "#haiti", "...", "haiti,"))
text.words.without.stop <- text.words[!text.words %in% stop.words]
head(rev(sort(table(text.words.without.stop))), 10)
#text.words.without.stop
# de nice spirit, pin everybody. country bbm #32dd26ac!!!! en le
# 54 35 33 33 33 33 33 #31 30 29
## OK, so still not much. Maybe there isn't a central theme to what people are tweeting about. Though I am curious about what "32dd26ac!!!!" is; it's in 31 tweets. Let me try to figure out what they say.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import calendar # Import the library that turns a UTC time tuple into a UNIX timestamp
import csv # Import the library that lets us read/write CSVs
import json # Import the library that lets us work with JSON
import time # We're going to need to deal with a quick time conversion in here
# The path to the file we want to open (change this for your machine)
# This should be the result of the streaming API
infilename = "/Users/michelleboisson/Documents/ITP/* Data without Borders/hw3/libya_tweets.json"

tweets = []  # An empty list to hold the tweets we're going to load

# Open up the file. "r" says we want to read from it (as opposed to write).
# The with-block closes the file for us once every line has been read.
with open(infilename, "r") as infile:
    for line in infile:  # Iterate over every line in the file and call it the variable "line"
        try:
            new_tweet = json.loads(line)  # Load in the text and convert it from JSON to a Python dictionary
        except ValueError:
            # json.loads raises ValueError when a line isn't valid JSON (maybe
            # there's malformed data or something); just keep moving. Any other
            # kind of error is unexpected and is allowed to surface.
            continue
        tweets.append(new_tweet)  # Add it to our list of tweets

# Whoa! That was easy! Now we have a list of tweets, where each tweet is a dictionary!
first_tweet = tweets[0]
first_tweet["text"]  # The text of the first tweet. Boom.
# Here's an example of what's in a typical tweet: | |
# { | |
# "in_reply_to_status_id_str":null, | |
# "id_str":"247530200278114304", | |
# "text":"Nice work @AnnCoulter: Libya commemorates 9\/11 | http:\/\/t.co\/8yVjg5Ej http:\/\/t.co\/fSPlkhSK", | |
# "in_reply_to_screen_name":null, | |
# "in_reply_to_user_id_str":null, | |
# "favorited":false, | |
# "source":"web", | |
# "possibly_sensitive_editable":true, | |
# "entities":{ | |
# "hashtags":[], | |
# "user_mentions":[ | |
# {"id_str":"196168350", | |
# "indices":[10,21], | |
# "screen_name":"AnnCoulter", | |
# "name":"Ann Coulter", | |
# "id":196168350} | |
# ], | |
# "urls":[ | |
# {"indices":[49,69], | |
# "url":"http:\/\/t.co\/8yVjg5Ej", | |
# "display_url":"StAugustine.com", | |
# "expanded_url":"http:\/\/StAugustine.com" | |
# }, | |
# {"indices":[70,90], | |
# "url":"http:\/\/t.co\/fSPlkhSK", | |
# "display_url":"staugustine.com\/opinions\/2012-\u2026", | |
# "expanded_url":"http:\/\/staugustine.com\/opinions\/2012-09-16\/coulter-libya-commemorates-911#.UFaRvlv8T5w.twitter" | |
# } | |
# ] | |
# }, | |
# "truncated":false, | |
# "created_at":"Mon Sep 17 02:59:33 +0000 2012", | |
# "place":null, | |
# "in_reply_to_user_id":null, | |
# "contributors":null, | |
# "geo":null, | |
# "retweet_count":0, | |
# "retweeted":false, | |
# "coordinates":null, | |
# "user":{ | |
# "id_str":"131546419", | |
# "follow_request_sent":null, | |
# "default_profile_image":false, | |
# "profile_use_background_image":true, | |
# "friends_count":207, | |
# "profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2617221744\/d0sblw2ynv4aqbwmj9wa_normal.png", | |
# "is_translator":false, | |
# "statuses_count":475, | |
# "profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/90887244\/Walleye_Puck.bmp", | |
# "favourites_count":0, | |
# "profile_text_color":"333333", | |
# "followers_count":93, | |
# "geo_enabled":false, | |
# "profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/90887244\/Walleye_Puck.bmp", | |
# "description":"PROUD Conservative, Full Time Student, Former Business Owner, and Full Time Musician", | |
# "profile_link_color":"0084B4", | |
# "lang":"en", | |
# "notifications":null, | |
# "created_at":"Sat Apr 10 16:04:24 +0000 2010", | |
# "profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2617221744\/d0sblw2ynv4aqbwmj9wa_normal.png","listed_count":1, | |
# "profile_background_color":"C0DEED", | |
# "url":null, | |
# "contributors_enabled":false, | |
# "verified":false, | |
# "profile_background_tile":true, | |
# "time_zone":"Eastern Time (US & Canada)", | |
# "protected":false, | |
# "screen_name":"TTownD", | |
# "default_profile":false, | |
# "following":null, | |
# "profile_sidebar_fill_color":"DDEEF6", | |
# "name":"Doug ", | |
# "location":"TTown", | |
# "id":131546419, | |
# "utc_offset":-18000, | |
# "profile_sidebar_border_color":"C0DEED" | |
# }, | |
# "id":247530200278114304, | |
# "possibly_sensitive":false, | |
# "in_reply_to_status_id":null | |
# } | |
# | |
# http://www.scribd.com/doc/30146338/map-of-a-tweet | |
# Wow, there's a *ton* of stuff in there. We have all the information about the tweet, including | |
# structure information about the mentions, hashtags, and links in it, as well as all the user info. Awesome! | |
# So all we need to do is iterate through the tweets, pull out the fields we want into a list, | |
# then save them as a row in a CSV file. | |
def _first_two(items, key):
    """Return [first, second] values of `key` from a list of dicts, padded with None."""
    values = [item[key] for item in (items or [])[:2]]
    return values + [None] * (2 - len(values))


# Open up the file we want to write to (libya_tweets.csv)
# csvwriter is an object that will write to that file
# The with-block closes the file when the loop is done, so the last rows are
# flushed to disk instead of being left in an open, never-closed handle.
with open("/Users/michelleboisson/Documents/ITP/* Data without Borders/hw3/libya_tweets.csv", "w") as outfile:
    csvwriter = csv.writer(outfile)
    csvwriter.writerow(["tweet_id", "retweet", "text", "source", "screen_name", "name", "location", "description", "followers", "following", "created_at", "created_at_seconds", "hashtag1", "hashtag2", "url1", "url2", "mention1", "mention2", "lat", "lon"])

    for tweet in tweets:
        # This will loop over the tweets list and, for each iteration, the "tweet" variable will stand in for each tweet.
        # csv writer writes out whatever list of things you give it to a row of the CSV, so let's construct a row
        # of the variables we want.

        # Skip anything that parsed as JSON but doesn't have the fields read
        # below; indexing into it would raise and abort the whole export.
        if not isinstance(tweet, dict) or "id_str" not in tweet or "user" not in tweet:
            continue

        user = tweet["user"]
        entities = tweet.get("entities") or {}

        tweet_id = tweet["id_str"]
        retweet = tweet["in_reply_to_status_id_str"]
        # Sometimes the retweet ID isn't in that field, so let's check and see if it's in this other field
        if not retweet and "retweeted_status" in tweet and tweet["retweeted_status"]:
            retweet = tweet["retweeted_status"]["id_str"]

        text = tweet["text"]
        source = tweet["source"]
        screen_name = user["screen_name"]
        name = user["name"]
        location = user["location"]
        description = user["description"]
        followers = user["followers_count"]
        following = user["friends_count"]
        # NOTE(review): this is the *account's* creation time, while
        # created_at_seconds below is the *tweet's* creation time -- confirm
        # that the mismatch is intended.
        created_at = user["created_at"]

        # Having the time be a string is annoying. Let's convert the time this tweet was created
        # to a UNIX timestamp (learn more about these here - http://en.wikipedia.org/wiki/Unix_timestamp).
        # The string is in UTC ("+0000"), so use calendar.timegm; time.mktime would read the
        # parsed time as local time and shift the result by this machine's UTC offset.
        # float() keeps the value formatted the same way mktime's result was.
        created_at_seconds = float(calendar.timegm(time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")))

        # Two other quick things we should think about:
        # 1) I'd like us to record the hashtags, links, and mentions in each tweet, but they're in this variable length
        #    format (i.e. there could be 0, 1, 3, 100, who knows). Since CSVs are fixed width, why don't we just
        #    take the first two of each, accounting for the case where there are none.
        hashtag1, hashtag2 = _first_two(entities.get("hashtags"), "text")
        url1, url2 = _first_two(entities.get("urls"), "url")
        mention1, mention2 = _first_two(entities.get("user_mentions"), "screen_name")

        # 2) I'd also like us to record geo data, if it exists
        lat = None
        lon = None
        if tweet.get("geo"):
            lat = tweet["geo"]["coordinates"][0]
            lon = tweet["geo"]["coordinates"][1]

        # OK! Let's write this tweet!
        newrow = [tweet_id, retweet, text, source, screen_name, name, location, description, followers, following, created_at, created_at_seconds, hashtag1, hashtag2, url1, url2, mention1, mention2, lat, lon]

        # Oop, one thing we need to do is convert everything to UTF8 before we write...
        # NOTE(review): this step targets Python 2's csv module; under Python 3
        # .encode() yields bytes objects -- confirm the interpreter before porting.
        newrow = [value.encode('utf8') if hasattr(value, 'encode') else value for value in newrow]

        # Write it!
        csvwriter.writerow(newrow)
# Done!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment