@michelleboisson · Created September 26, 2012
Data Without Borders - Assignment 3
#How many unique users have more than 100000 followers? What are their screen names?
tweets <- read.csv("/Users/michelleboisson/Documents/ITP/* Data without Borders/hw3/libya_tweets.csv", as.is=TRUE)
unique(tweets$screen_name[which(as.numeric(tweets$followers) > 100000)])
# [1] "detikcom" "DonLemonCNN" "HuffingtonPost" "Dputamadre" "WorldRss" "AlMasryAlYoum"
# [7] "theobscurant" "fadjroeL" "TPO_Hisself" "CAPAMAG" "TwittyAlgeria" "foxandfriends"
# [13] "PranayGupte"
#It’d be interesting to see what part of the world users are tweeting from. What are the
#top 3 locations people are from (not counting blanks)?
all.locations.sorted = data.frame(rev(sort(table(tweets$location))))
top.3.locations = all.locations.sorted[2:4,]  # row 1 is the blank location, so skip it
top.3.locations
#          USA  Tripoli, Libya  London
#           34              28      20
#Retweets can often indicate what’s important, or at least influential. What is the text of
#the tweet that was retweeted the most times and who tweeted it?
all.retweets.sorted = rev(sort(table(tweets$retweet)))
head(all.retweets.sorted)
###The names of this table are the retweeted status ids (the first entry is the blank for
###non-retweets), so look tweets up by the most-retweeted id rather than indexing by the counts
top.retweet.id = names(all.retweets.sorted)[names(all.retweets.sorted) != ""][1]
tweets$text[which(tweets$retweet == top.retweet.id)][1]
#[1] "RT @DennisDMZ: So let me get this straight. There's a War on Women but no War on Terror? #Hey guys, little less focus on the labia little ..."
tweets$screen_name[which(tweets$retweet == top.retweet.id)][1]
#[1] "AndrewDeikel"
#Plot the distribution of the number of people the users are following (don’t
#worry about the fact that some people will be counted multiple times – just pretend each
#row is a different user). What do you see?
hist(tweets$following)
###See graph below
###The graph doesn't tell us much, only that a large number of users aren't following very many people.
#Let’s reduce our set to just people with fewer than
#5000 followers and look at the histogram again. What do you see now? Have you tried
#using different breaks? Does anything surprise you?
fewer.than.5000.followers = tweets$followers < 5000
hist(tweets$followers[fewer.than.5000.followers], breaks=200)
#See image below
###There are a lot of people with very few followers. It's mostly a smooth decaying curve, except for a small spike around 800 followers or so.
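###The assignment also asks about different breaks, so here's a quick sketch with coarser and
###finer binning on the same subset (the break counts are just illustrative choices):
hist(tweets$followers[fewer.than.5000.followers], breaks=20)
hist(tweets$followers[fewer.than.5000.followers], breaks=500)
###Coarser breaks should wash the ~800-follower spike into one big bar, while finer breaks
###spread out the long tail of users with very few followers.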
#Write code to find the 5 most popular words used in the descriptions of our
#users (again, just treat each row as if it’s a unique user, even though that means we’ll be
#counting users who tweeted more than once multiple times).
description.words = unlist(strsplit(tweets$description, " "))
description.words = tolower(description.words)
sorted.description.words = rev(sort(table(description.words)))
head(sorted.description.words)
#description.words
#          the   and    of     a    to
#    868   689   581   511   479   413
###The top 'words' used in the descriptions are a blank string and common words: 'the', 'and', 'of', 'a', and 'to'. Not very informative.
#Let’s clean out super common words, often referred to as stopwords, so we can just focus on the
#interesting words people are using. That means we need to remove any common words
#from our big ol’ word vector we created in step 2 above. Hmm, how can we remove
#specific elements of a vector?
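###Before touching the real data, a toy illustration of the trick (made-up vector): %in% tests
###membership, so negating it keeps only the elements NOT in the removal list
toy = c("apple", "the", "banana", "and")
toy[ !toy %in% c("the", "and") ]
# [1] "apple"  "banana"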
###load in stop words
stop.words = read.csv("http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop", as.is=TRUE)
###read.csv treats the first word in the file as a header, so the words come in as a single column named 'a'. I just want the words so
stop.words = stop.words$a
###add extras to the vector ('a' itself gets added back since it was swallowed as the header)
newwords = c(" ", "&", "-", "|", "a")
stop.words = append(stop.words, newwords)
descr.words.without.stop = description.words[ !description.words %in% stop.words]
rev(sort(table(descr.words.without.stop)))[1:5]
#descr.words.without.stop
#         news  love  world follow
#   511    233   110     96     73
###I forgot to remove the blank
stop.words = append(stop.words, "")
descr.words.without.stop = description.words[ !description.words %in% stop.words]
rev(sort(table(descr.words.without.stop)))[1:5]
#descr.words.without.stop
#   news   love  world follow conservative
#    233    110     96     73           69
#Using your skills with %in% and a vector of stopwords, remove the
#stopwords from the descriptions and recompute the top 5 words our Twitter users use to
#describe themselves. What do you think of the results? Do you have a sense of what
#types of users are most common in our dataset?
###It looks like most of the users in this dataset are news outlets or reporters covering world news, and a few describe themselves as conservative.
###Part 3
haiti <- read.csv("/Users/michelleboisson/Documents/ITP/* Data without Borders/hw3/haiti-tweets.csv", as.is=TRUE)
#How many tweets did I collect in over an hour?
nrow(haiti)
# [1] 411
#This is significantly lower than the Libya data. Obviously, Haiti is not on a lot of people's minds right now.
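###For comparison (just a sketch), the size of the Libya set loaded at the top:
nrow(tweets)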
#Where are the tweets coming from?
all.locations.sorted = data.frame(rev(sort(table(haiti$location))))
head(all.locations.sorted)
#                     98
#Haiti petion-ville   33
#Haiti                15
#CA                    6
#Tabarre               5
#San Francisco         5
#What are people tweeting about?
text.words = unlist(strsplit(haiti$text, " "))
text.words = tolower(text.words)
sorted.text.words = rev(sort(table(text.words)))
head(sorted.text.words)
#text.words
#haiti rt in to i my
# 161 134 130 108 103 102
#I'm adding "rt" to the stop words
stop.words = append(stop.words, "rt")
text.words.without.stop = text.words[ !text.words %in% stop.words]
rev(sort(table(text.words.without.stop)))[1:5]
#text.words.without.stop
# haiti #haiti ... de haiti,
# 161 82 62 54 47
#So I'm going to add "haiti" and "#haiti" and more to the stop.words too, and see what I get
stop.words = append(stop.words, c("haiti", "#haiti", "...","haiti,"))
text.words.without.stop = text.words[ !text.words %in% stop.words]
rev(sort(table(text.words.without.stop)))[1:10]
#text.words.without.stop
#   de nice spirit,  pin everybody. country  bbm #32dd26ac!!!!   en   le
#   54   35      33   33         33      33   33            31   30   29
##OK, so still not much. Maybe there isn't a central theme to what people are tweeting about. Though I am curious about what "32dd26ac!!!!" is; it's in 31 tweets. Let me try to figure out what those tweets say.
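###One quick way to pull those tweets up (a sketch; fixed=TRUE makes grepl do plain substring matching):
haiti$text[grepl("32dd26ac", haiti$text, fixed = TRUE)][1:5]

###The Python script below is what converted the raw streaming-API JSON into the CSVs analyzed above.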
#!/usr/bin/python
import json # Import the library that lets us work with JSON
import csv # Import the library that lets us read/write CSVs
import time # We're going to need to deal with a quick time conversion in here
# The path to the file we want to open (change this for your machine)
# This should be the result of the streaming API
infilename = "/Users/michelleboisson/Documents/ITP/* Data without Borders/hw3/libya_tweets.json"
infile = open(infilename, "r") # Open up the file. "r" says we want to read from it (as opposed to write)
tweets = [] # An empty list to hold the tweets we're going to load
for line in infile: # Iterate over every line in the file and call it the variable "line"
    try:
        new_tweet = json.loads(line) # Load in the text and convert it from JSON to a Python dictionary
    except:
        # try/except basically says to just keep moving if we fail to convert the
        # JSON to a tweet (maybe there's malformed data or something)
        continue
    tweets.append(new_tweet) # Add it to our list of tweets
# Whoa! That was easy! Now we have a list of tweets, where each tweet is a dictionary!
first_tweet = tweets[0]
first_tweet["text"] # The text of the first tweet. Boom.
# Here's an example of what's in a typical tweet:
# {
# "in_reply_to_status_id_str":null,
# "id_str":"247530200278114304",
# "text":"Nice work @AnnCoulter: Libya commemorates 9\/11 | http:\/\/t.co\/8yVjg5Ej http:\/\/t.co\/fSPlkhSK",
# "in_reply_to_screen_name":null,
# "in_reply_to_user_id_str":null,
# "favorited":false,
# "source":"web",
# "possibly_sensitive_editable":true,
# "entities":{
# "hashtags":[],
# "user_mentions":[
# {"id_str":"196168350",
# "indices":[10,21],
# "screen_name":"AnnCoulter",
# "name":"Ann Coulter",
# "id":196168350}
# ],
# "urls":[
# {"indices":[49,69],
# "url":"http:\/\/t.co\/8yVjg5Ej",
# "display_url":"StAugustine.com",
# "expanded_url":"http:\/\/StAugustine.com"
# },
# {"indices":[70,90],
# "url":"http:\/\/t.co\/fSPlkhSK",
# "display_url":"staugustine.com\/opinions\/2012-\u2026",
# "expanded_url":"http:\/\/staugustine.com\/opinions\/2012-09-16\/coulter-libya-commemorates-911#.UFaRvlv8T5w.twitter"
# }
# ]
# },
# "truncated":false,
# "created_at":"Mon Sep 17 02:59:33 +0000 2012",
# "place":null,
# "in_reply_to_user_id":null,
# "contributors":null,
# "geo":null,
# "retweet_count":0,
# "retweeted":false,
# "coordinates":null,
# "user":{
# "id_str":"131546419",
# "follow_request_sent":null,
# "default_profile_image":false,
# "profile_use_background_image":true,
# "friends_count":207,
# "profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2617221744\/d0sblw2ynv4aqbwmj9wa_normal.png",
# "is_translator":false,
# "statuses_count":475,
# "profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/90887244\/Walleye_Puck.bmp",
# "favourites_count":0,
# "profile_text_color":"333333",
# "followers_count":93,
# "geo_enabled":false,
# "profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/90887244\/Walleye_Puck.bmp",
# "description":"PROUD Conservative, Full Time Student, Former Business Owner, and Full Time Musician",
# "profile_link_color":"0084B4",
# "lang":"en",
# "notifications":null,
# "created_at":"Sat Apr 10 16:04:24 +0000 2010",
# "profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2617221744\/d0sblw2ynv4aqbwmj9wa_normal.png","listed_count":1,
# "profile_background_color":"C0DEED",
# "url":null,
# "contributors_enabled":false,
# "verified":false,
# "profile_background_tile":true,
# "time_zone":"Eastern Time (US & Canada)",
# "protected":false,
# "screen_name":"TTownD",
# "default_profile":false,
# "following":null,
# "profile_sidebar_fill_color":"DDEEF6",
# "name":"Doug ",
# "location":"TTown",
# "id":131546419,
# "utc_offset":-18000,
# "profile_sidebar_border_color":"C0DEED"
# },
# "id":247530200278114304,
# "possibly_sensitive":false,
# "in_reply_to_status_id":null
# }
#
# http://www.scribd.com/doc/30146338/map-of-a-tweet
# Wow, there's a *ton* of stuff in there. We have all the information about the tweet, including
# structure information about the mentions, hashtags, and links in it, as well as all the user info. Awesome!
# So all we need to do is iterate through the tweets, pull out the fields we want into a list,
# then save them as a row in a CSV file.
# Open up the file we want to write to (libya_tweets.csv)
# csvwriter is an object that will write to that file
csvwriter = csv.writer(open("/Users/michelleboisson/Documents/ITP/* Data without Borders/hw3/libya_tweets.csv", "w"))
csvwriter.writerow(["tweet_id", "retweet", "text", "source", "screen_name", "name", "location", "description", "followers", "following", "created_at", "created_at_seconds", "hashtag1", "hashtag2", "url1", "url2", "mention1", "mention2", "lat", "lon"])
for tweet in tweets:
    # This will loop over the tweets list and, for each iteration, the "tweet" variable will stand in for each tweet.
    # csv.writer writes out whatever list of things you give it to a row of the CSV, so let's construct a row
    # of the variables we want.
    tweet_id = tweet["id_str"]
    retweet = tweet["in_reply_to_status_id_str"]
    # Sometimes the retweet ID isn't in that field, so let's check and see if it's in this other field
    if not retweet and "retweeted_status" in tweet and tweet["retweeted_status"]:
        retweet = tweet["retweeted_status"]["id_str"]
    text = tweet["text"]
    source = tweet["source"]
    screen_name = tweet["user"]["screen_name"]
    name = tweet["user"]["name"]
    location = tweet["user"]["location"]
    description = tweet["user"]["description"]
    followers = tweet["user"]["followers_count"]
    following = tweet["user"]["friends_count"]
    # Note: this is the *user's* account creation date; the tweet's own timestamp goes into created_at_seconds below
    created_at = tweet["user"]["created_at"]
    # Having the time be a string is annoying. Let's use the Python time library to convert
    # the time this tweet was created to a UNIX timestamp (learn more about these here - http://en.wikipedia.org/wiki/Unix_timestamp)
    created_at_seconds = time.mktime(time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y"))
    # Two other quick things we should think about:
    # 1) I'd like us to record the hashtags, links, and mentions in each tweet, but they're in this variable-length
    #    format (i.e. there could be 0, 1, 3, 100, who knows). Since CSVs are fixed width, why don't we just
    #    take the first two of each, accounting for the case where there are none.
    hashtag1 = None
    hashtag2 = None
    if "hashtags" in tweet["entities"] and len(tweet["entities"]["hashtags"]):
        hashtag1 = tweet["entities"]["hashtags"][0]["text"]
        if len(tweet["entities"]["hashtags"]) > 1:
            hashtag2 = tweet["entities"]["hashtags"][1]["text"]
    url1 = None
    url2 = None
    if "urls" in tweet["entities"] and len(tweet["entities"]["urls"]):
        url1 = tweet["entities"]["urls"][0]["url"]
        if len(tweet["entities"]["urls"]) > 1:
            url2 = tweet["entities"]["urls"][1]["url"]
    mention1 = None
    mention2 = None
    if "user_mentions" in tweet["entities"] and len(tweet["entities"]["user_mentions"]):
        mention1 = tweet["entities"]["user_mentions"][0]["screen_name"]
        if len(tweet["entities"]["user_mentions"]) > 1:
            mention2 = tweet["entities"]["user_mentions"][1]["screen_name"]
    # 2) I'd also like us to record geo data, if it exists
    lat = None
    lon = None
    if tweet["geo"]:
        lat = tweet["geo"]["coordinates"][0]
        lon = tweet["geo"]["coordinates"][1]
    # OK! Let's write this tweet!
    newrow = [tweet_id, retweet, text, source, screen_name, name, location, description, followers, following, created_at, created_at_seconds, hashtag1, hashtag2, url1, url2, mention1, mention2, lat, lon]
    # Oop, one thing we need to do is convert everything to UTF-8 before we write...
    for i in range(len(newrow)): # For every value in our newrow
        if hasattr(newrow[i], 'encode'):
            newrow[i] = newrow[i].encode('utf8')
    # Write it!
    csvwriter.writerow(newrow)
# Done!
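# Note: this script assumes Python 2 -- the CSV file is opened in plain "w" mode and strings are
# encoded to UTF-8 by hand before writing, which is the Python 2 csv-module idiom.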