Skip to content

Instantly share code, notes, and snippets.

@bbengfort
Last active December 28, 2015 18:41
Show Gist options
  • Save bbengfort/c21012d8af009c953f21 to your computer and use it in GitHub Desktop.
Save bbengfort/c21012d8af009c953f21 to your computer and use it in GitHub Desktop.
Import and wrangling of the Big Tweet Dump from @murphsp1.
#!/usr/bin/env python
# big_tweet_import
# Imports the tweet-dump into MongoDB
#
# Author: Benjamin Bengfort <benjamin@bengfort.com>
# Created: Thu Aug 28 07:37:41 2014 -0400
#
# Copyright (C) 2014 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: utils.py [] benjamin@bengfort.com $
"""
Import and wrangling of Big Tweet Dump from Sean.
"""
##########################################################################
## Imports
##########################################################################
import os
import json
import time
from collections import Counter
from datetime import datetime
from operator import itemgetter

import pymongo
##########################################################################
## Module Constants
##########################################################################
# Absolute path of the directory that contains this module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Default location of the tweet dump: a JSON file sitting next to this module.
DUMP_PATH = os.path.join(BASE_DIR, "big_tweet_dump.json")
# strftime/strptime pattern used for JSON date strings.
JSONDATE = "%Y-%m-%dT%H:%M:%SZ"
##########################################################################
## Helper functions
##########################################################################
def utcstr(dt, fmt=JSONDATE):
    """
    Formats a datetime as a string expressed in UTC.

    Returns None when dt is falsy (e.g. None). Otherwise the UTC fields of
    dt, as reported by dt.utctimetuple(), are rendered with fmt (which
    defaults to JSONDATE). Sub-second precision is not carried over.
    """
    if not dt:
        return None

    # Rebuild a naive datetime directly from the UTC fields. Routing them
    # through time.mktime() would reinterpret them as *local* time with
    # tm_isdst=0, which can shift the result by an hour when DST is in
    # effect locally.
    tups = dt.utctimetuple()
    return datetime(*tups[:6]).strftime(fmt)
def parse_date(dt, fmts=None):
    """
    Attempts to parse a date using a series of formats.

    dt is the date string to parse and fmts is an optional iterable of
    strptime format strings that are tried in order; when fmts is omitted
    or empty only JSONDATE is tried. Returns the datetime produced by the
    first format that matches, or None if no format matches.
    """
    formats = fmts if fmts else (JSONDATE,)
    for fmt in formats:
        try:
            return datetime.strptime(dt, fmt)
        except ValueError:
            # This format did not match; fall through to the next one.
            continue
    return None
def mongodb(**kwargs):
    """
    Opens a client to MongoDB and hands back the requested Database object.

    Recognised keyword arguments are 'host' (default 'localhost'), 'port'
    (default 27017) and 'database' (default 'tweet-corpus'); any other
    keyword arguments are ignored.
    """
    settings = {
        'host': 'localhost',
        'port': 27017,
        'database': 'tweet-corpus',
    }
    # Caller-supplied values override the defaults above.
    settings.update(kwargs)

    connection = pymongo.MongoClient(settings['host'], settings['port'])
    return connection[settings['database']]
def dotkeys(obj, parent=None):
    """
    Returns a depth first search of all keys in dot notation.

    obj is a mapping (anything with an items() method) and parent is the
    dotted path of the enclosing key, or None at the top level. Each key is
    yielded as its dotted path, immediately followed by the paths of any
    keys nested beneath it. Only values that expose a callable items() are
    descended into, so lists (even lists of mappings) are treated as leaves.
    """
    for key, val in obj.items():
        # Test against None instead of truthiness so that a falsy parent
        # path (e.g. an empty-string key) still prefixes its children.
        path = key if parent is None else "%s.%s" % (parent, key)
        yield path

        if hasattr(val, 'items') and callable(val.items):
            for nested in dotkeys(val, path):
                yield nested
##########################################################################
## Import Utility
##########################################################################
def load_data(path=DUMP_PATH, **kwargs):
    """
    Reads the JSON dump at path and inserts each entry into the 'tweets'
    collection of the Mongo Database.

    Keyword arguments are forwarded unchanged to mongodb() to select the
    connection. The whole file is decoded with json.load before any
    document is inserted.
    """
    collection = mongodb(**kwargs).tweets

    with open(path, 'r') as handle:
        records = json.load(handle)
        # One insert call per decoded entry.
        for record in records:
            collection.insert(record)
##########################################################################
## Inspection Utility
##########################################################################
def inspect(**kwargs):
    """
    Reports how often each dotted key occurs across the 'tweets' collection.

    Keyword arguments are forwarded unchanged to mongodb(). Returns a list
    of (key, percent) tuples sorted by key, where percent is the number of
    documents containing the key divided by the collection's count() and
    scaled to 0-100.
    """
    collection = mongodb(**kwargs).tweets
    total = collection.count()

    # Tally every dotted key seen in every document.
    occurrences = Counter()
    for document in collection.find():
        occurrences.update(dotkeys(document))

    shares = [
        (key, (float(seen) / total) * 100)
        for key, seen in occurrences.items()
    ]
    shares.sort(key=itemgetter(0))
    return shares
if __name__ == "__main__":
    # Print one line per key: the percentage right-aligned to one decimal
    # place, followed by the dotted key name.
    for item in inspect():
        # Single-argument call form works as both the Python 2 print
        # statement and the Python 3 print function.
        print("{1: >5.1f}%: {0}".format(*item))
100.0%: _id
100.0%: contributors
100.0%: coordinates
4.6%: coordinates.coordinates
4.6%: coordinates.type
100.0%: created_at
100.0%: entities
100.0%: entities.hashtags
9.9%: entities.media
100.0%: entities.symbols
100.0%: entities.urls
100.0%: entities.user_mentions
9.9%: extended_entities
9.9%: extended_entities.media
100.0%: favorite_count
100.0%: favorited
100.0%: geo
4.6%: geo.coordinates
4.6%: geo.type
100.0%: id
100.0%: id_str
100.0%: in_reply_to_screen_name
100.0%: in_reply_to_status_id
100.0%: in_reply_to_status_id_str
100.0%: in_reply_to_user_id
100.0%: in_reply_to_user_id_str
100.0%: lang
100.0%: place
4.9%: place.attributes
0.0%: place.attributes.street_address
4.9%: place.bounding_box
4.9%: place.bounding_box.coordinates
4.9%: place.bounding_box.type
4.9%: place.contained_within
4.9%: place.country
4.9%: place.country_code
4.9%: place.full_name
4.9%: place.id
4.9%: place.name
4.9%: place.place_type
4.9%: place.url
48.5%: possibly_sensitive
100.0%: retweet_count
100.0%: retweeted
26.6%: retweeted_status
26.6%: retweeted_status.contributors
26.6%: retweeted_status.coordinates
0.6%: retweeted_status.coordinates.coordinates
0.6%: retweeted_status.coordinates.type
26.6%: retweeted_status.created_at
26.6%: retweeted_status.entities
26.6%: retweeted_status.entities.hashtags
5.7%: retweeted_status.entities.media
26.6%: retweeted_status.entities.symbols
26.6%: retweeted_status.entities.urls
26.6%: retweeted_status.entities.user_mentions
5.7%: retweeted_status.extended_entities
5.7%: retweeted_status.extended_entities.media
26.6%: retweeted_status.favorite_count
26.6%: retweeted_status.favorited
26.6%: retweeted_status.geo
0.6%: retweeted_status.geo.coordinates
0.6%: retweeted_status.geo.type
26.6%: retweeted_status.id
26.6%: retweeted_status.id_str
26.6%: retweeted_status.in_reply_to_screen_name
26.6%: retweeted_status.in_reply_to_status_id
26.6%: retweeted_status.in_reply_to_status_id_str
26.6%: retweeted_status.in_reply_to_user_id
26.6%: retweeted_status.in_reply_to_user_id_str
26.6%: retweeted_status.lang
26.6%: retweeted_status.place
0.6%: retweeted_status.place.attributes
0.6%: retweeted_status.place.bounding_box
0.6%: retweeted_status.place.bounding_box.coordinates
0.6%: retweeted_status.place.bounding_box.type
0.6%: retweeted_status.place.contained_within
0.6%: retweeted_status.place.country
0.6%: retweeted_status.place.country_code
0.6%: retweeted_status.place.full_name
0.6%: retweeted_status.place.id
0.6%: retweeted_status.place.name
0.6%: retweeted_status.place.place_type
0.6%: retweeted_status.place.url
12.3%: retweeted_status.possibly_sensitive
26.6%: retweeted_status.retweet_count
26.6%: retweeted_status.retweeted
0.0%: retweeted_status.scopes
0.0%: retweeted_status.scopes.followers
0.0%: retweeted_status.scopes.place_ids
26.6%: retweeted_status.source
26.6%: retweeted_status.text
26.6%: retweeted_status.truncated
26.6%: retweeted_status.user
26.6%: retweeted_status.user.contributors_enabled
26.6%: retweeted_status.user.created_at
26.6%: retweeted_status.user.default_profile
26.6%: retweeted_status.user.default_profile_image
26.6%: retweeted_status.user.description
26.6%: retweeted_status.user.entities
26.6%: retweeted_status.user.entities.description
26.6%: retweeted_status.user.entities.description.urls
17.2%: retweeted_status.user.entities.url
17.2%: retweeted_status.user.entities.url.urls
26.6%: retweeted_status.user.favourites_count
26.6%: retweeted_status.user.follow_request_sent
26.6%: retweeted_status.user.followers_count
26.6%: retweeted_status.user.following
26.6%: retweeted_status.user.friends_count
26.6%: retweeted_status.user.geo_enabled
26.6%: retweeted_status.user.id
26.6%: retweeted_status.user.id_str
26.6%: retweeted_status.user.is_translation_enabled
26.6%: retweeted_status.user.is_translator
26.6%: retweeted_status.user.lang
26.6%: retweeted_status.user.listed_count
26.6%: retweeted_status.user.location
26.6%: retweeted_status.user.name
26.6%: retweeted_status.user.notifications
26.6%: retweeted_status.user.profile_background_color
26.6%: retweeted_status.user.profile_background_image_url
26.6%: retweeted_status.user.profile_background_image_url_https
26.6%: retweeted_status.user.profile_background_tile
21.6%: retweeted_status.user.profile_banner_url
26.6%: retweeted_status.user.profile_image_url
26.6%: retweeted_status.user.profile_image_url_https
26.6%: retweeted_status.user.profile_link_color
26.6%: retweeted_status.user.profile_sidebar_border_color
26.6%: retweeted_status.user.profile_sidebar_fill_color
26.6%: retweeted_status.user.profile_text_color
26.6%: retweeted_status.user.profile_use_background_image
26.6%: retweeted_status.user.protected
26.6%: retweeted_status.user.screen_name
26.6%: retweeted_status.user.statuses_count
26.6%: retweeted_status.user.time_zone
26.6%: retweeted_status.user.url
26.6%: retweeted_status.user.utc_offset
26.6%: retweeted_status.user.verified
100.0%: source
100.0%: text
100.0%: truncated
100.0%: user
100.0%: user.contributors_enabled
100.0%: user.created_at
100.0%: user.default_profile
100.0%: user.default_profile_image
100.0%: user.description
100.0%: user.entities
100.0%: user.entities.description
100.0%: user.entities.description.urls
49.0%: user.entities.url
49.0%: user.entities.url.urls
100.0%: user.favourites_count
100.0%: user.follow_request_sent
100.0%: user.followers_count
100.0%: user.following
100.0%: user.friends_count
100.0%: user.geo_enabled
100.0%: user.id
100.0%: user.id_str
100.0%: user.is_translation_enabled
100.0%: user.is_translator
100.0%: user.lang
100.0%: user.listed_count
100.0%: user.location
100.0%: user.name
100.0%: user.notifications
100.0%: user.profile_background_color
100.0%: user.profile_background_image_url
100.0%: user.profile_background_image_url_https
100.0%: user.profile_background_tile
66.5%: user.profile_banner_url
100.0%: user.profile_image_url
100.0%: user.profile_image_url_https
100.0%: user.profile_link_color
100.0%: user.profile_sidebar_border_color
100.0%: user.profile_sidebar_fill_color
100.0%: user.profile_text_color
100.0%: user.profile_use_background_image
100.0%: user.protected
100.0%: user.screen_name
100.0%: user.statuses_count
100.0%: user.time_zone
100.0%: user.url
100.0%: user.utc_offset
100.0%: user.verified
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment