Skip to content

Instantly share code, notes, and snippets.

@olp-cs
Created September 29, 2013 06:34
Show Gist options
  • Save olp-cs/6749895 to your computer and use it in GitHub Desktop.
Save olp-cs/6749895 to your computer and use it in GitHub Desktop.
Explore a single data file from the US Gov Bit.ly dataset
{
"metadata": {
"name": "exploring_a_single_data_file"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"path = 'data/usagov_bitly_data2012-05-21-1337634399.txt'\n",
"records = [json.loads(line) for line in open(path)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Display a couple of records\n",
"records[0:2]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 32,
"text": [
"[{u'a': u'Mozilla/5.0 (iPod; U; CPU iPhone OS 3_1_3 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7E18 Safari/528.16',\n",
" u'al': u'en-us',\n",
" u'c': u'US',\n",
" u'cy': u'Chesapeake',\n",
" u'g': u'JKZUHq',\n",
" u'gr': u'VA',\n",
" u'h': u'J8ZPYk',\n",
" u'hc': 1337629186,\n",
" u'hh': u'go.nasa.gov',\n",
" u'l': u'nasatwitter',\n",
" u'll': [36.755798, -76.292801],\n",
" u'nk': 1,\n",
" u'r': u'http://t.co/JEY40vW4',\n",
" u't': 1337634399,\n",
" u'tz': u'America/New_York',\n",
" u'u': u'http://www.nasa.gov/mission_pages/hinode/eclipse_120520.html'},\n",
" {u'a': u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5',\n",
" u'al': u'en-US,en;q=0.8',\n",
" u'c': u'US',\n",
" u'cy': u'O Fallon',\n",
" u'g': u'vNJS4H',\n",
" u'gr': u'MO',\n",
" u'h': u'u0uD9q',\n",
" u'hc': 1319563556,\n",
" u'hh': u'1.usa.gov',\n",
" u'l': u'o_4us71ccioa',\n",
" u'll': [38.8251, -90.728897],\n",
" u'nk': 1,\n",
" u'r': u'direct',\n",
" u't': 1337634399,\n",
" u'tz': u'America/Chicago',\n",
" u'u': u'https://www.nysdot.gov/rexdesign/design/community.gif'}]"
]
}
],
"prompt_number": 32
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Display the user agent from the first record\n",
"records[0][\"a\"]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 33,
"text": [
"u'Mozilla/5.0 (iPod; U; CPU iPhone OS 3_1_3 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7E18 Safari/528.16'"
]
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Utility function: get counts for each element from a collection\n",
"\n",
"from collections import defaultdict\n",
"\n",
"def get_counts(sequence):\n",
" counts = defaultdict(int) # values will initialize to 0\n",
" for x in sequence:\n",
" count[x] += 1\n",
" return counts"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# How a time zone looks like\n",
"records[0]['tz']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 35,
"text": [
"u'America/New_York'"
]
}
],
"prompt_number": 35
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Check if a time zone is listed for the record\n",
"def time_zone_listed(record):\n",
" return 'tz' in record\n",
" \n",
"time_zone_listed(records[0])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 36,
"text": [
"True"
]
}
],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# See where the timezone is not listed \n",
"[item for item in records if not time_zone_listed(item)]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 38,
"text": [
"[{u'_heartbeat_': 1337634451},\n",
" {u'_heartbeat_': 1337634482},\n",
" {u'_heartbeat_': 1337634512},\n",
" {u'_heartbeat_': 1337634541},\n",
" {u'_heartbeat_': 1337634571},\n",
" {u'_heartbeat_': 1337634601},\n",
" {u'_heartbeat_': 1337634631},\n",
" {u'_heartbeat_': 1337634661},\n",
" {u'_heartbeat_': 1337634691},\n",
" {u'_heartbeat_': 1337634721},\n",
" {u'_heartbeat_': 1337634751},\n",
" {u'_heartbeat_': 1337634781},\n",
" {u'_heartbeat_': 1337634811},\n",
" {u'_heartbeat_': 1337634841},\n",
" {u'_heartbeat_': 1337634871},\n",
" {u'_heartbeat_': 1337634901},\n",
" {u'_heartbeat_': 1337634931},\n",
" {u'_heartbeat_': 1337634961},\n",
" {u'_heartbeat_': 1337634991},\n",
" {u'_heartbeat_': 1337635021},\n",
" {u'_heartbeat_': 1337635051},\n",
" {u'_heartbeat_': 1337635081},\n",
" {u'_heartbeat_': 1337635112},\n",
" {u'_heartbeat_': 1337635141},\n",
" {u'_heartbeat_': 1337635171},\n",
" {u'_heartbeat_': 1337635201},\n",
" {u'_heartbeat_': 1337635231},\n",
" {u'_heartbeat_': 1337635261},\n",
" {u'_heartbeat_': 1337635291},\n",
" {u'_heartbeat_': 1337635321},\n",
" {u'_heartbeat_': 1337635351},\n",
" {u'_heartbeat_': 1337635381},\n",
" {u'_heartbeat_': 1337635411},\n",
" {u'_heartbeat_': 1337635441},\n",
" {u'_heartbeat_': 1337635471},\n",
" {u'_heartbeat_': 1337635501},\n",
" {u'_heartbeat_': 1337635531},\n",
" {u'_heartbeat_': 1337635561},\n",
" {u'_heartbeat_': 1337635591},\n",
" {u'_heartbeat_': 1337635621},\n",
" {u'_heartbeat_': 1337635651},\n",
" {u'_heartbeat_': 1337635681},\n",
" {u'_heartbeat_': 1337635711},\n",
" {u'_heartbeat_': 1337635741},\n",
" {u'_heartbeat_': 1337635771},\n",
" {u'_heartbeat_': 1337635801},\n",
" {u'_heartbeat_': 1337635831},\n",
" {u'_heartbeat_': 1337635861},\n",
" {u'_heartbeat_': 1337635891},\n",
" {u'_heartbeat_': 1337635921},\n",
" {u'_heartbeat_': 1337635951},\n",
" {u'_heartbeat_': 1337635981},\n",
" {u'_heartbeat_': 1337636011},\n",
" {u'_heartbeat_': 1337636041},\n",
" {u'_heartbeat_': 1337636071},\n",
" {u'_heartbeat_': 1337636101},\n",
" {u'_heartbeat_': 1337636131},\n",
" {u'_heartbeat_': 1337636161},\n",
" {u'_heartbeat_': 1337636191},\n",
" {u'_heartbeat_': 1337636221}]"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"time_zones = [item['tz'] for item in records if time_zone_listed(item)]\n",
"time_zones[0:3]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 42,
"text": [
"[u'America/New_York', u'America/Chicago', u'America/New_York']"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import Counter\n",
"\n",
"Counter(time_zones).most_common(10)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 46,
"text": [
"[(u'America/Chicago', 643),\n",
" (u'America/New_York', 571),\n",
" (u'', 521),\n",
" (u'America/Los_Angeles', 315),\n",
" (u'Europe/London', 135),\n",
" (u'America/Denver', 77),\n",
" (u'Europe/Amsterdam', 32),\n",
" (u'America/Phoenix', 32),\n",
" (u'Europe/Madrid', 29),\n",
" (u'America/Rainy_River', 26)]"
]
}
],
"prompt_number": 46
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment