davclark/Historical.ipynb

## Historical.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from requests import post, get, put\n",
    "from requests.auth import HTTPBasicAuth\n",
    "import yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# creds.yaml must define the keys read below: 'email' and 'password' (HTTP basic\n",
    "# auth) and 'user' (the account name used in the API URL). Keep it out of version control.\n",
    "with open('creds.yaml') as credfile:\n",
    "    # safe_load only builds plain YAML types; bare yaml.load can construct\n",
    "    # arbitrary Python objects and needs an explicit Loader on PyYAML >= 6.\n",
    "    creds = yaml.safe_load(credfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "auth = HTTPBasicAuth(creds['email'], creds['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "url_base = 'https://historical.gnip.com/accounts/' + creds['user'] + '/'\n",
    "post_url = url_base + 'jobs.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Fields common to the job queries built below.\n",
    "base_query = dict(\n",
    "    publisher='twitter',\n",
    "    streamType='track',\n",
    "    dataFormat='activity_streams'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1\n",
    "\n",
    "Submit a job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Start from a copy of the shared settings, then add the fields specific to this query.\n",
    "q = dict(base_query)\n",
    "q.update({\n",
    "    'fromDate': '201504110000',\n",
    "    'toDate': '20150417',  # time will be inferred as 0000 if unspecified\n",
    "    'title': 'gb3',  # Needs to be unique\n",
    "    'rules': [{'value': 'from:CathrynCarson'}],\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'account': 'UCBerkeleyExplore',\n",
       " 'format': 'activity_streams',\n",
       " 'fromDate': '201504110000',\n",
       " 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json',\n",
       " 'publisher': 'twitter',\n",
       " 'requestedAt': '2015-04-24T06:30:03Z',\n",
       " 'requestedBy': 'jordan@cs.berkeley.edu',\n",
       " 'status': 'opened',\n",
       " 'statusMessage': 'Waiting on quote from Gnip.',\n",
       " 'streamType': 'track',\n",
       " 'title': 'gb3',\n",
       " 'toDate': '201504170000'}"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Get a quote\n",
    "resp = post(post_url, json=q, auth=auth)\n",
    "job_info = resp.json()\n",
    "job_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Since kernel state (e.g. job_info) isn't persistent, it's a good idea to paste the jobURL in as text.\n",
    "# For a more robust solution, we'd want to write this out to a file or, even better,\n",
    "# to something like a redis queue or mongodb.\n",
    "# Note that the jobURL is also included in the job responses - even when the request was TO this URL!\n",
    "jobURL = 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Steps 2 and 4\n",
    "\n",
    "This is the general approach to checking in on the status of a job - both checking for a quote prior to acceptance, and also checking for completion of the job."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'acceptedAt': '2015-04-24T06:32:05Z',\n",
       " 'acceptedBy': 'jordan@cs.berkeley.edu',\n",
       " 'account': 'UCBerkeleyExplore',\n",
       " 'format': 'activity_streams',\n",
       " 'fromDate': '201504110000',\n",
       " 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json',\n",
       " 'percentComplete': 100,\n",
       " 'publisher': 'twitter',\n",
       " 'quote': {'estimatedActivityCount': 100,\n",
       "  'estimatedDurationHours': '1.0',\n",
       "  'estimatedFileSizeMb': '0.0',\n",
       "  'expiresAt': '2015-05-01T06:31:14Z'},\n",
       " 'requestedAt': '2015-04-24T06:30:03Z',\n",
       " 'requestedBy': 'jordan@cs.berkeley.edu',\n",
       " 'results': {'activityCount': 5,\n",
       "  'completedAt': '2015-04-24T06:37:30Z',\n",
       "  'dataURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm/results.json',\n",
       "  'expiresAt': '2015-05-09T06:37:12Z',\n",
       "  'fileCount': 5,\n",
       "  'fileSizeMb': '0.0'},\n",
       " 'status': 'delivered',\n",
       " 'statusMessage': 'Job delivered and available for download.',\n",
       " 'streamType': 'track',\n",
       " 'title': 'gb3',\n",
       " 'toDate': '201504170000'}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep checking (max 5 / 5 sec across all GET requests) until ready\n",
    "resp = get(jobURL, auth=auth)\n",
    "job_status = resp.json()\n",
    "job_status"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Dynamically, we could read it from the status response of a delivered job:\n",
    "# dataURL = job_status['results']['dataURL']\n",
    "# But a pasted copy survives kernel restarts:\n",
    "dataURL = 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm/results.json'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice that the \"requestedAt\" time is in GMT. Bummer!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2015-04-24 06:35:54'"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from time import gmtime, strftime\n",
    "strftime(\"%Y-%m-%d %H:%M:%S\", gmtime())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'acceptedAt': '2015-04-24T06:32:05Z',\n",
       " 'acceptedBy': 'jordan@cs.berkeley.edu',\n",
       " 'account': 'UCBerkeleyExplore',\n",
       " 'format': 'activity_streams',\n",
       " 'fromDate': '201504110000',\n",
       " 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json',\n",
       " 'publisher': 'twitter',\n",
       " 'quote': {'estimatedActivityCount': 100,\n",
       "  'estimatedDurationHours': '1.0',\n",
       "  'estimatedFileSizeMb': '0.0',\n",
       "  'expiresAt': '2015-05-01T06:31:14Z'},\n",
       " 'requestedAt': '2015-04-24T06:30:03Z',\n",
       " 'requestedBy': 'jordan@cs.berkeley.edu',\n",
       " 'status': 'accepted',\n",
       " 'statusMessage': 'Job accepted and ready to be queued.',\n",
       " 'streamType': 'track',\n",
       " 'title': 'gb3',\n",
       " 'toDate': '201504170000'}"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Could also use job_status here, can also \"reject\"\n",
    "resp = put(job_info['jobURL'], json={\"status\": \"accept\"}, auth=auth)\n",
    "job_status = resp.json()\n",
    "job_status"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 5\n",
    "\n",
    "Get our results (for the sample query, this should be very small)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'expiresAt': '2015-05-09T06:37:12Z',\n",
       " 'totalFileSizeBytes': 5154,\n",
       " 'urlCount': 5,\n",
       " 'urlList': ['https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/11/14/10_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=uacttaiL2eEQmQrcMf2dxOOJF%2FA%3D',\n",
       "  'https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/11/14/20_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=y2vtp%2FfFN%2FAUx2KlvyKzKTRe1iE%3D',\n",
       "  'https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/11/23/50_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=NrHsR8YWf9hjD3Ks8KEwJCkPKv0%3D',\n",
       "  'https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/12/03/20_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=N4oGXFlb%2BvwXV2gaMUOjRJ%2FUcI8%3D',\n",
       "  'https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/15/13/50_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=Mj4wnN1x988HC%2F4R%2FxgEUq%2FOxdo%3D']}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Should just be ready\n",
    "resp = get(dataURL, auth=auth)\n",
    "job_status = resp.json()\n",
    "job_status"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ofnames = []\n",
    "for url in job_status['urlList']:\n",
    "    # The URL paths end in .../YYYY/MM/DD/HH/NN_activities.json.gz, and the bare\n",
    "    # basename repeats across hours and days (two '20_' and two '50_' files in the\n",
    "    # sample job), so fold the date and hour into the local name to keep every file.\n",
    "    url_path = url.split('?', 1)[0]  # drop the signed query string\n",
    "    ofname = '-'.join(url_path.rsplit('/', 5)[1:])\n",
    "    resp = get(url, stream=True)\n",
    "    resp.raise_for_status()  # don't save an HTTP error body as a .gz file\n",
    "    with open(ofname, 'wb') as ofile:\n",
    "        # The default chunk_size is a single byte; stream in larger blocks.\n",
    "        for chunk in resp.iter_content(chunk_size=8192):\n",
    "            ofile.write(chunk)\n",
    "    ofnames.append(ofname)  # This again isn't persistent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['2015-04-11-14-10_activities.json.gz',\n",
       " '2015-04-11-14-20_activities.json.gz',\n",
       " '2015-04-11-23-50_activities.json.gz',\n",
       " '2015-04-12-03-20_activities.json.gz',\n",
       " '2015-04-15-13-50_activities.json.gz']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ofnames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Pasted copy of the names produced by the download loop, so they survive kernel restarts.\n",
    "ofnames = ['2015-04-11-14-10_activities.json.gz',\n",
    "           '2015-04-11-14-20_activities.json.gz',\n",
    "           '2015-04-11-23-50_activities.json.gz',\n",
    "           '2015-04-12-03-20_activities.json.gz',\n",
    "           '2015-04-15-13-50_activities.json.gz']\n",
    "# or: from glob import glob; ofnames = sorted(glob('*.json.gz'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json, gzip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = json.load?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def read_tweet_data(fname):\n",
    "    '''Parse one gzipped, line-delimited JSON results file into a list.\n",
    "\n",
    "    Each non-blank line is decoded with json.loads and appended in file order.\n",
    "    These files include tweet packages, as well as some general info at the end\n",
    "    (an {'info': ...} record in the sample data), so callers that only want\n",
    "    tweets need to drop that last entry.\n",
    "\n",
    "    fname -- path to a *.json.gz file.\n",
    "    Returns a list with one decoded JSON object per non-blank line.\n",
    "    Raises ValueError if a non-blank line is not valid JSON.\n",
    "\n",
    "    There may be some regular structure to the blank lines that could make this\n",
    "    *slightly* faster, but probably doesn't matter relative to the speed of parsing.'''\n",
    "    tweet_data = []\n",
    "    # Decode explicitly as UTF-8 (the JSON default) rather than the platform\n",
    "    # locale, so non-ASCII tweet text parses the same on every machine.\n",
    "    with gzip.open(fname, 'rt', encoding='utf-8') as datafile:\n",
    "        for line in datafile:\n",
    "            # strip() skips empty and whitespace-only separator lines, which\n",
    "            # json.loads would otherwise reject.\n",
    "            if line.strip():\n",
    "                tweet_data.append(json.loads(line))\n",
    "\n",
    "    return tweet_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'actor': {'displayName': 'Cathryn Carson',\n",
       "   'favoritesCount': 45,\n",
       "   'followersCount': 260,\n",
       "   'friendsCount': 212,\n",
       "   'id': 'id:twitter.com:2594018798',\n",
       "   'image': 'https://pbs.twimg.com/profile_images/526971310090358784/eWx-JEup_normal.jpeg',\n",
       "   'languages': ['en'],\n",
       "   'link': 'http://www.twitter.com/CathrynCarson',\n",
       "   'links': [{'href': 'http://history.berkeley.edu/people/cathryn-carson',\n",
       "     'rel': 'me'}],\n",
       "   'listedCount': 17,\n",
       "   'location': {'displayName': 'Berkeley, CA', 'objectType': 'place'},\n",
       "   'objectType': 'person',\n",
       "   'postedTime': '2014-06-29T01:59:33.000Z',\n",
       "   'preferredUsername': 'CathrynCarson',\n",
       "   'statusesCount': 384,\n",
       "   'summary': 'Historian of science, ethnographer of contemporary research institutions. Once upon a time @DLabAtBerkeley.',\n",
       "   'twitterTimeZone': None,\n",
       "   'utcOffset': None,\n",
       "   'verified': False},\n",
       "  'body': 'RT @abuaardvark: Big Data in Social Science - new Annals packed w/interesting articles  http://t.co/grLC6eRwO2',\n",
       "  'favoritesCount': 0,\n",
       "  'generator': {'displayName': 'Twitter Web Client',\n",
       "   'link': 'http://twitter.com'},\n",
       "  'gnip': {'matching_rules': [{'tag': None, 'value': 'from:CathrynCarson'}],\n",
       "   'urls': [{'expanded_status': 200,\n",
       "     'expanded_url': 'http://ann.sagepub.com/content/659/1.toc',\n",
       "     'url': 'http://t.co/grLC6eRwO2'}]},\n",
       "  'id': 'tag:search.twitter.com,2005:586896169319600128',\n",
       "  'link': 'http://twitter.com/CathrynCarson/statuses/586896169319600128',\n",
       "  'object': {'actor': {'displayName': 'Marc Lynch',\n",
       "    'favoritesCount': 97,\n",
       "    'followersCount': 35939,\n",
       "    'friendsCount': 666,\n",
       "    'id': 'id:twitter.com:18267544',\n",
       "    'image': 'https://pbs.twimg.com/profile_images/566579490063937536/MYo9iE8U_normal.jpeg',\n",
       "    'languages': ['en'],\n",
       "    'link': 'http://www.twitter.com/abuaardvark',\n",
       "    'links': [{'href': 'http://www.marclynch.com', 'rel': 'me'}],\n",
       "    'listedCount': 1865,\n",
       "    'location': {'displayName': 'ÜT: 37.892943,-122.270439',\n",
       "     'objectType': 'place'},\n",
       "    'objectType': 'person',\n",
       "    'postedTime': '2008-12-20T15:33:59.000Z',\n",
       "    'preferredUsername': 'abuaardvark',\n",
       "    'statusesCount': 29649,\n",
       "    'summary': 'Abu Aardvark. GWU. Monkey Cage. POMEPS. CNAS. Go Brewers and Packers!',\n",
       "    'twitterTimeZone': 'Eastern Time (US & Canada)',\n",
       "    'utcOffset': '-14400',\n",
       "    'verified': False},\n",
       "   'body': 'Big Data in Social Science - new Annals packed w/interesting articles  http://t.co/grLC6eRwO2',\n",
       "   'favoritesCount': 10,\n",
       "   'generator': {'displayName': 'Twitter for Mac',\n",
       "    'link': 'http://itunes.apple.com/us/app/twitter/id409789998?mt=12'},\n",
       "   'id': 'tag:search.twitter.com,2005:586855911601348608',\n",
       "   'link': 'http://twitter.com/abuaardvark/statuses/586855911601348608',\n",
       "   'object': {'id': 'object:search.twitter.com,2005:586855911601348608',\n",
       "    'link': 'http://twitter.com/abuaardvark/statuses/586855911601348608',\n",
       "    'objectType': 'note',\n",
       "    'postedTime': '2015-04-11T11:38:47.000Z',\n",
       "    'summary': 'Big Data in Social Science - new Annals packed w/interesting articles  http://t.co/grLC6eRwO2'},\n",
       "   'objectType': 'activity',\n",
       "   'postedTime': '2015-04-11T11:38:47.000Z',\n",
       "   'provider': {'displayName': 'Twitter',\n",
       "    'link': 'http://www.twitter.com',\n",
       "    'objectType': 'service'},\n",
       "   'twitter_entities': {'hashtags': [],\n",
       "    'symbols': [],\n",
       "    'trends': [],\n",
       "    'urls': [{'display_url': 'ann.sagepub.com/content/659/1.…',\n",
       "      'expanded_url': 'http://ann.sagepub.com/content/659/1.toc',\n",
       "      'indices': [71, 93],\n",
       "      'url': 'http://t.co/grLC6eRwO2'}],\n",
       "    'user_mentions': []},\n",
       "   'twitter_filter_level': 'low',\n",
       "   'twitter_lang': 'en',\n",
       "   'verb': 'post'},\n",
       "  'objectType': 'activity',\n",
       "  'postedTime': '2015-04-11T14:18:45.000Z',\n",
       "  'provider': {'displayName': 'Twitter',\n",
       "   'link': 'http://www.twitter.com',\n",
       "   'objectType': 'service'},\n",
       "  'retweetCount': 6,\n",
       "  'twitter_entities': {'hashtags': [],\n",
       "   'symbols': [],\n",
       "   'trends': [],\n",
       "   'urls': [{'display_url': 'ann.sagepub.com/content/659/1.…',\n",
       "     'expanded_url': 'http://ann.sagepub.com/content/659/1.toc',\n",
       "     'indices': [88, 110],\n",
       "     'url': 'http://t.co/grLC6eRwO2'}],\n",
       "   'user_mentions': [{'id': 18267544,\n",
       "     'id_str': '18267544',\n",
       "     'indices': [3, 15],\n",
       "     'name': 'Marc Lynch',\n",
       "     'screen_name': 'abuaardvark'}]},\n",
       "  'twitter_filter_level': 'low',\n",
       "  'twitter_lang': 'en',\n",
       "  'verb': 'share'},\n",
       " {'info': {'activity_count': 1,\n",
       "   'message': 'Replay Request Completed',\n",
       "   'sent': '2015-04-24T06:36:32+00:00'}}]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "read_tweet_data(ofnames[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = [read_tweet_data(ofname) for ofname in ofnames]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[2, 2, 2, 2, 2]"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cathryn never tweets twice in 10 minutes\n",
    "[len(d) for d in data]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0\n",
    "\n",
    "Did you forget about your job status? (Or want to spy on the others?)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Generally: https://historical.gnip.com/accounts/<account_name>/jobs.json\n",
    "jobs_url = url_base + 'jobs.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'delivered': {'activityCount': 74,\n",
       "  'jobCount': 3,\n",
       "  'jobDaysRun': 10,\n",
       "  'period': 'trial',\n",
       "  'since': '2015-04-16T21:44:19Z'},\n",
       " 'jobs': [{'expiresAt': '2015-05-08T05:47:10Z',\n",
       "   'fromDate': '201201010000',\n",
       "   'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/9mkbvaz928.json',\n",
       "   'percentComplete': 100,\n",
       "   'publisher': 'twitter',\n",
       "   'status': 'delivered',\n",
       "   'streamType': 'track',\n",
       "   'title': 'my_job',\n",
       "   'toDate': '201201010001',\n",
       "   'uuid': '9mkbvaz928'},\n",
       "  {'expiresAt': '2015-05-01T06:14:35Z',\n",
       "   'fromDate': '201404230000',\n",
       "   'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/g75dcp53nh.json',\n",
       "   'percentComplete': 0,\n",
       "   'publisher': 'twitter',\n",
       "   'status': 'quoted',\n",
       "   'streamType': 'track',\n",
       "   'title': 'gb1',\n",
       "   'toDate': '201504230000',\n",
       "   'uuid': 'g75dcp53nh'},\n",
       "  {'expiresAt': '2015-05-01T06:28:19Z',\n",
       "   'fromDate': '201501010000',\n",
       "   'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/w9r3bb41z2.json',\n",
       "   'percentComplete': 0,\n",
       "   'publisher': 'twitter',\n",
       "   'status': 'quoted',\n",
       "   'streamType': 'track',\n",
       "   'title': 'gb2',\n",
       "   'toDate': '201504230000',\n",
       "   'uuid': 'w9r3bb41z2'},\n",
       "  {'expiresAt': '2015-05-09T06:37:12Z',\n",
       "   'fromDate': '201504110000',\n",
       "   'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json',\n",
       "   'percentComplete': 100,\n",
       "   'publisher': 'twitter',\n",
       "   'status': 'delivered',\n",
       "   'streamType': 'track',\n",
       "   'title': 'gb3',\n",
       "   'toDate': '201504170000',\n",
       "   'uuid': '8wg55wk6vm'},\n",
       "  {'expiresAt': '2015-05-01T16:07:26Z',\n",
       "   'fromDate': '201501010000',\n",
       "   'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/80b1dnt6q.json',\n",
       "   'percentComplete': 0,\n",
       "   'publisher': 'twitter',\n",
       "   'status': 'rejected',\n",
       "   'streamType': 'track',\n",
       "   'title': 'mike j',\n",
       "   'toDate': '201504230001',\n",
       "   'uuid': '80b1dnt6q'},\n",
       "  {'expiresAt': '2015-05-09T16:23:16Z',\n",
       "   'fromDate': '201504210000',\n",
       "   'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/pvnzbnwf0b.json',\n",
       "   'percentComplete': 100,\n",
       "   'publisher': 'twitter',\n",
       "   'status': 'delivered',\n",
       "   'streamType': 'track',\n",
       "   'title': 'mike j',\n",
       "   'toDate': '201504230001',\n",
       "   'uuid': 'pvnzbnwf0b'},\n",
       "  {'expiresAt': '2015-05-01T18:20:13Z',\n",
       "   'fromDate': '201301010000',\n",
       "   'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/ea88dwtjve.json',\n",
       "   'percentComplete': 0,\n",
       "   'publisher': 'twitter',\n",
       "   'status': 'quoted',\n",
       "   'streamType': 'track',\n",
       "   'title': 'my historical job python',\n",
       "   'toDate': '201301010001',\n",
       "   'uuid': 'ea88dwtjve'}]}"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "resp = get(jobs_url, auth=auth)\n",
    "jobs_info = resp.json()\n",
    "jobs_info"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 61,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from requests import post, get, put\n",
	"from requests.auth import HTTPBasicAuth\n",
	"import yaml"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 65,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"with open('creds.yaml') as credfile:\n",
	" creds = yaml.load(credfile)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 68,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"auth = HTTPBasicAuth(creds['email'], creds['password'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 69,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"url_base = 'https://historical.gnip.com/accounts/' + creds['user'] + '/'\n",
	"post_url = url_base + 'jobs.json'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 70,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"base_query = {\"publisher\": \"twitter\",\n",
	" \"streamType\": \"track\",\n",
	" \"dataFormat\": \"activity_streams\",}"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Step 1\n",
	"\n",
	"Submit a job"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 93,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# info specific to this query\n",
	"q = base_query.copy()\n",
	"q['fromDate'] = \"201504110000\"\n",
	"q['toDate'] = \"20150417\" # time will be inferred as 0000 if unspecified\n",
	"q['title'] = 'gb3' # Needs to be unique\n",
	"q[\"rules\"] = [\n",
	" {\n",
	" \"value\": \"from:CathrynCarson\"\n",
	" },\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 94,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'account': 'UCBerkeleyExplore',\n",
	" 'format': 'activity_streams',\n",
	" 'fromDate': '201504110000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json',\n",
	" 'publisher': 'twitter',\n",
	" 'requestedAt': '2015-04-24T06:30:03Z',\n",
	" 'requestedBy': 'jordan@cs.berkeley.edu',\n",
	" 'status': 'opened',\n",
	" 'statusMessage': 'Waiting on quote from Gnip.',\n",
	" 'streamType': 'track',\n",
	" 'title': 'gb3',\n",
	" 'toDate': '201504170000'}"
	]
	},
	"execution_count": 94,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Get a quote\n",
	"resp = post(post_url, json=q, auth=auth)\n",
	"job_info = resp.json()\n",
	"job_info"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Since data won't be persistent, it's a good idea to copy the jobURL into text somewhere\n",
	"# For a more robust solution, we'd want to write this out to a file or even better\n",
	"# something like a redis queue or mongodb\n",
	"# Note that this is also included in requests - even those where the request was TO this URL!\n",
	"jobURL = 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json'"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Steps 2 and 4\n",
	"\n",
	"This is the general approach to checking in on the status of a job - both checking for a quote prior to acceptance, and also checking for completion of the job."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'acceptedAt': '2015-04-24T06:32:05Z',\n",
	" 'acceptedBy': 'jordan@cs.berkeley.edu',\n",
	" 'account': 'UCBerkeleyExplore',\n",
	" 'format': 'activity_streams',\n",
	" 'fromDate': '201504110000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json',\n",
	" 'percentComplete': 100,\n",
	" 'publisher': 'twitter',\n",
	" 'quote': {'estimatedActivityCount': 100,\n",
	" 'estimatedDurationHours': '1.0',\n",
	" 'estimatedFileSizeMb': '0.0',\n",
	" 'expiresAt': '2015-05-01T06:31:14Z'},\n",
	" 'requestedAt': '2015-04-24T06:30:03Z',\n",
	" 'requestedBy': 'jordan@cs.berkeley.edu',\n",
	" 'results': {'activityCount': 5,\n",
	" 'completedAt': '2015-04-24T06:37:30Z',\n",
	" 'dataURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm/results.json',\n",
	" 'expiresAt': '2015-05-09T06:37:12Z',\n",
	" 'fileCount': 5,\n",
	" 'fileSizeMb': '0.0'},\n",
	" 'status': 'delivered',\n",
	" 'statusMessage': 'Job delivered and available for download.',\n",
	" 'streamType': 'track',\n",
	" 'title': 'gb3',\n",
	" 'toDate': '201504170000'}"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Keep checking (max 5 / 5 sec across all GET requests) until ready\n",
	"resp = get(jobURL, auth=auth)\n",
	"job_status = resp.json()\n",
	"job_status"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# dynamically could do\n",
	"# dataURL = job_status['dataURL']\n",
	"# But this is persistent:\n",
	"dataURL = 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm/results.json'"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Notice that the \"requestedAt\" time is in GMT. Bummer!"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Step 3"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 105,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'2015-04-24 06:35:54'"
	]
	},
	"execution_count": 105,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from time import gmtime, strftime\n",
	"strftime(\"%Y-%m-%d %H:%M:%S\", gmtime())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 99,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'acceptedAt': '2015-04-24T06:32:05Z',\n",
	" 'acceptedBy': 'jordan@cs.berkeley.edu',\n",
	" 'account': 'UCBerkeleyExplore',\n",
	" 'format': 'activity_streams',\n",
	" 'fromDate': '201504110000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json',\n",
	" 'publisher': 'twitter',\n",
	" 'quote': {'estimatedActivityCount': 100,\n",
	" 'estimatedDurationHours': '1.0',\n",
	" 'estimatedFileSizeMb': '0.0',\n",
	" 'expiresAt': '2015-05-01T06:31:14Z'},\n",
	" 'requestedAt': '2015-04-24T06:30:03Z',\n",
	" 'requestedBy': 'jordan@cs.berkeley.edu',\n",
	" 'status': 'accepted',\n",
	" 'statusMessage': 'Job accepted and ready to be queued.',\n",
	" 'streamType': 'track',\n",
	" 'title': 'gb3',\n",
	" 'toDate': '201504170000'}"
	]
	},
	"execution_count": 99,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Could also use job_status here, can also \"reject\"\n",
	"resp = put(job_info['jobURL'], json={\"status\": \"accept\"}, auth=auth)\n",
	"job_status = resp.json()\n",
	"job_status"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Step 5\n",
	"\n",
	"Get our results (for the sample query, this should be very small)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'expiresAt': '2015-05-09T06:37:12Z',\n",
	" 'totalFileSizeBytes': 5154,\n",
	" 'urlCount': 5,\n",
	" 'urlList': ['https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/11/14/10_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=uacttaiL2eEQmQrcMf2dxOOJF%2FA%3D',\n",
	" 'https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/11/14/20_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=y2vtp%2FfFN%2FAUx2KlvyKzKTRe1iE%3D',\n",
	" 'https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/11/23/50_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=NrHsR8YWf9hjD3Ks8KEwJCkPKv0%3D',\n",
	" 'https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/12/03/20_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=N4oGXFlb%2BvwXV2gaMUOjRJ%2FUcI8%3D',\n",
	" 'https://s3-us-west-1.amazonaws.com/archive.replay.snapshots/snapshots/twitter/track/activity_streams/UCBerkeleyExplore/2015/04/24/20150411-20150417_8wg55wk6vm/2015/04/15/13/50_activities.json.gz?AWSAccessKeyId=AKIAJMSYMREFVVJ6E7QQ&Expires=1432449450&Signature=Mj4wnN1x988HC%2F4R%2FxgEUq%2FOxdo%3D']}"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Should just be ready\n",
	"# dataURL is not defined in this part of the notebook (presumably it is set\n",
	"# in an earlier cell).\n",
	"resp = get(dataURL, auth=auth)\n",
	"# NOTE: this rebinds job_status to the results listing -- a dict with\n",
	"# 'urlList', 'urlCount', 'totalFileSizeBytes' and 'expiresAt' -- and the\n",
	"# download cell below reads job_status['urlList'].\n",
	"job_status = resp.json()\n",
	"job_status"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Download every result file listed by the API into the working directory.\n",
	"CHUNK_SIZE = 64 * 1024  # bytes per read; iter_content() defaults to 1 byte\n",
	"\n",
	"ofnames = []\n",
	"for url in job_status['urlList']:\n",
	"    # The bare file name (e.g. 20_activities.json.gz) repeats across the\n",
	"    # .../YYYY/MM/DD/HH/ directories, so keep those four path parts in the\n",
	"    # local name; otherwise later downloads overwrite earlier ones.\n",
	"    url_path = url.split('?', 1)[0]  # drop the signed query string\n",
	"    ofname = '_'.join(url_path.rsplit('/', 5)[1:])\n",
	"    # No auth here: the S3 URLs are pre-signed\n",
	"    resp = get(url, stream=True)\n",
	"    resp.raise_for_status()  # don't save an error response as a .gz file\n",
	"    with open(ofname, 'wb') as ofile:\n",
	"        for chunk in resp.iter_content(chunk_size=CHUNK_SIZE):\n",
	"            ofile.write(chunk)\n",
	"    ofnames.append(ofname)  # This again isn't persistent"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"ofnames"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Names produced by the download cell above, written out by hand (presumably\n",
	"# so parsing can resume after a kernel restart without downloading again).\n",
	"ofnames = ['2015_04_11_14_10_activities.json.gz',\n",
	"           '2015_04_11_14_20_activities.json.gz',\n",
	"           '2015_04_11_23_50_activities.json.gz',\n",
	"           '2015_04_12_03_20_activities.json.gz',\n",
	"           '2015_04_15_13_50_activities.json.gz']\n",
	"# or ofnames = glob('*.json.gz')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Step 6\n",
	"\n",
	"Parse the downloaded files"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import json, gzip"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"`json.load` parses a whole file as a single JSON document. The downloaded files instead hold one JSON document per line, so `read_tweet_data` below calls `json.loads` on each non-blank line."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 56,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def read_tweet_data(fname):\n",
	"    '''Parse one gzipped activity file into a list of dicts.\n",
	"\n",
	"    The file holds one JSON document per line, interspersed with blank\n",
	"    lines: tweet packages, plus some general info at the end. Every\n",
	"    non-blank line is decoded, so that trailing info record is part of\n",
	"    the result.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    fname : str\n",
	"        Path to a gzip-compressed file of line-delimited JSON.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    list of dict\n",
	"        The decoded JSON documents, in file order.\n",
	"\n",
	"    Raises\n",
	"    ------\n",
	"    ValueError\n",
	"        If a non-blank line is not valid JSON.\n",
	"    '''\n",
	"    # Decode as UTF-8 explicitly instead of relying on the platform default:\n",
	"    # the tweet text contains non-ASCII characters.\n",
	"    with gzip.open(fname, 'rt', encoding='utf-8') as datafile:\n",
	"        # strip() skips empty and whitespace-only lines alike\n",
	"        return [json.loads(line) for line in datafile if line.strip()]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 57,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[{'actor': {'displayName': 'Cathryn Carson',\n",
	" 'favoritesCount': 45,\n",
	" 'followersCount': 260,\n",
	" 'friendsCount': 212,\n",
	" 'id': 'id:twitter.com:2594018798',\n",
	" 'image': 'https://pbs.twimg.com/profile_images/526971310090358784/eWx-JEup_normal.jpeg',\n",
	" 'languages': ['en'],\n",
	" 'link': 'http://www.twitter.com/CathrynCarson',\n",
	" 'links': [{'href': 'http://history.berkeley.edu/people/cathryn-carson',\n",
	" 'rel': 'me'}],\n",
	" 'listedCount': 17,\n",
	" 'location': {'displayName': 'Berkeley, CA', 'objectType': 'place'},\n",
	" 'objectType': 'person',\n",
	" 'postedTime': '2014-06-29T01:59:33.000Z',\n",
	" 'preferredUsername': 'CathrynCarson',\n",
	" 'statusesCount': 384,\n",
	" 'summary': 'Historian of science, ethnographer of contemporary research institutions. Once upon a time @DLabAtBerkeley.',\n",
	" 'twitterTimeZone': None,\n",
	" 'utcOffset': None,\n",
	" 'verified': False},\n",
	" 'body': 'RT @abuaardvark: Big Data in Social Science - new Annals packed w/interesting articles http://t.co/grLC6eRwO2',\n",
	" 'favoritesCount': 0,\n",
	" 'generator': {'displayName': 'Twitter Web Client',\n",
	" 'link': 'http://twitter.com'},\n",
	" 'gnip': {'matching_rules': [{'tag': None, 'value': 'from:CathrynCarson'}],\n",
	" 'urls': [{'expanded_status': 200,\n",
	" 'expanded_url': 'http://ann.sagepub.com/content/659/1.toc',\n",
	" 'url': 'http://t.co/grLC6eRwO2'}]},\n",
	" 'id': 'tag:search.twitter.com,2005:586896169319600128',\n",
	" 'link': 'http://twitter.com/CathrynCarson/statuses/586896169319600128',\n",
	" 'object': {'actor': {'displayName': 'Marc Lynch',\n",
	" 'favoritesCount': 97,\n",
	" 'followersCount': 35939,\n",
	" 'friendsCount': 666,\n",
	" 'id': 'id:twitter.com:18267544',\n",
	" 'image': 'https://pbs.twimg.com/profile_images/566579490063937536/MYo9iE8U_normal.jpeg',\n",
	" 'languages': ['en'],\n",
	" 'link': 'http://www.twitter.com/abuaardvark',\n",
	" 'links': [{'href': 'http://www.marclynch.com', 'rel': 'me'}],\n",
	" 'listedCount': 1865,\n",
	" 'location': {'displayName': 'ÜT: 37.892943,-122.270439',\n",
	" 'objectType': 'place'},\n",
	" 'objectType': 'person',\n",
	" 'postedTime': '2008-12-20T15:33:59.000Z',\n",
	" 'preferredUsername': 'abuaardvark',\n",
	" 'statusesCount': 29649,\n",
	" 'summary': 'Abu Aardvark. GWU. Monkey Cage. POMEPS. CNAS. Go Brewers and Packers!',\n",
	" 'twitterTimeZone': 'Eastern Time (US & Canada)',\n",
	" 'utcOffset': '-14400',\n",
	" 'verified': False},\n",
	" 'body': 'Big Data in Social Science - new Annals packed w/interesting articles http://t.co/grLC6eRwO2',\n",
	" 'favoritesCount': 10,\n",
	" 'generator': {'displayName': 'Twitter for Mac',\n",
	" 'link': 'http://itunes.apple.com/us/app/twitter/id409789998?mt=12'},\n",
	" 'id': 'tag:search.twitter.com,2005:586855911601348608',\n",
	" 'link': 'http://twitter.com/abuaardvark/statuses/586855911601348608',\n",
	" 'object': {'id': 'object:search.twitter.com,2005:586855911601348608',\n",
	" 'link': 'http://twitter.com/abuaardvark/statuses/586855911601348608',\n",
	" 'objectType': 'note',\n",
	" 'postedTime': '2015-04-11T11:38:47.000Z',\n",
	" 'summary': 'Big Data in Social Science - new Annals packed w/interesting articles http://t.co/grLC6eRwO2'},\n",
	" 'objectType': 'activity',\n",
	" 'postedTime': '2015-04-11T11:38:47.000Z',\n",
	" 'provider': {'displayName': 'Twitter',\n",
	" 'link': 'http://www.twitter.com',\n",
	" 'objectType': 'service'},\n",
	" 'twitter_entities': {'hashtags': [],\n",
	" 'symbols': [],\n",
	" 'trends': [],\n",
	" 'urls': [{'display_url': 'ann.sagepub.com/content/659/1.…',\n",
	" 'expanded_url': 'http://ann.sagepub.com/content/659/1.toc',\n",
	" 'indices': [71, 93],\n",
	" 'url': 'http://t.co/grLC6eRwO2'}],\n",
	" 'user_mentions': []},\n",
	" 'twitter_filter_level': 'low',\n",
	" 'twitter_lang': 'en',\n",
	" 'verb': 'post'},\n",
	" 'objectType': 'activity',\n",
	" 'postedTime': '2015-04-11T14:18:45.000Z',\n",
	" 'provider': {'displayName': 'Twitter',\n",
	" 'link': 'http://www.twitter.com',\n",
	" 'objectType': 'service'},\n",
	" 'retweetCount': 6,\n",
	" 'twitter_entities': {'hashtags': [],\n",
	" 'symbols': [],\n",
	" 'trends': [],\n",
	" 'urls': [{'display_url': 'ann.sagepub.com/content/659/1.…',\n",
	" 'expanded_url': 'http://ann.sagepub.com/content/659/1.toc',\n",
	" 'indices': [88, 110],\n",
	" 'url': 'http://t.co/grLC6eRwO2'}],\n",
	" 'user_mentions': [{'id': 18267544,\n",
	" 'id_str': '18267544',\n",
	" 'indices': [3, 15],\n",
	" 'name': 'Marc Lynch',\n",
	" 'screen_name': 'abuaardvark'}]},\n",
	" 'twitter_filter_level': 'low',\n",
	" 'twitter_lang': 'en',\n",
	" 'verb': 'share'},\n",
	" {'info': {'activity_count': 1,\n",
	" 'message': 'Replay Request Completed',\n",
	" 'sent': '2015-04-24T06:36:32+00:00'}}]"
	]
	},
	"execution_count": 57,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Sanity check: parse the first file and inspect the record structure\n",
	"read_tweet_data(ofnames[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 72,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# One list of parsed records per file, in the same order as ofnames\n",
	"data = list(map(read_tweet_data, ofnames))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 74,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[2, 2, 2, 2, 2]"
	]
	},
	"execution_count": 74,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Cathryn never tweets twice in 10 minutes: each file parses to 2 records,\n",
	"# which for the first file (shown above) is one tweet plus the trailing\n",
	"# 'info' record.\n",
	"list(map(len, data))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Step 0\n",
	"\n",
	"Did you forget about your job status? (Or want to spy on the others?)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 58,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Generally: https://historical.gnip.com/accounts/<account_name>/jobs.json\n",
	"# (the same address as post_url, defined near the top of the notebook)\n",
	"jobs_url = '{}jobs.json'.format(url_base)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 71,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'delivered': {'activityCount': 74,\n",
	" 'jobCount': 3,\n",
	" 'jobDaysRun': 10,\n",
	" 'period': 'trial',\n",
	" 'since': '2015-04-16T21:44:19Z'},\n",
	" 'jobs': [{'expiresAt': '2015-05-08T05:47:10Z',\n",
	" 'fromDate': '201201010000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/9mkbvaz928.json',\n",
	" 'percentComplete': 100,\n",
	" 'publisher': 'twitter',\n",
	" 'status': 'delivered',\n",
	" 'streamType': 'track',\n",
	" 'title': 'my_job',\n",
	" 'toDate': '201201010001',\n",
	" 'uuid': '9mkbvaz928'},\n",
	" {'expiresAt': '2015-05-01T06:14:35Z',\n",
	" 'fromDate': '201404230000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/g75dcp53nh.json',\n",
	" 'percentComplete': 0,\n",
	" 'publisher': 'twitter',\n",
	" 'status': 'quoted',\n",
	" 'streamType': 'track',\n",
	" 'title': 'gb1',\n",
	" 'toDate': '201504230000',\n",
	" 'uuid': 'g75dcp53nh'},\n",
	" {'expiresAt': '2015-05-01T06:28:19Z',\n",
	" 'fromDate': '201501010000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/w9r3bb41z2.json',\n",
	" 'percentComplete': 0,\n",
	" 'publisher': 'twitter',\n",
	" 'status': 'quoted',\n",
	" 'streamType': 'track',\n",
	" 'title': 'gb2',\n",
	" 'toDate': '201504230000',\n",
	" 'uuid': 'w9r3bb41z2'},\n",
	" {'expiresAt': '2015-05-09T06:37:12Z',\n",
	" 'fromDate': '201504110000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/8wg55wk6vm.json',\n",
	" 'percentComplete': 100,\n",
	" 'publisher': 'twitter',\n",
	" 'status': 'delivered',\n",
	" 'streamType': 'track',\n",
	" 'title': 'gb3',\n",
	" 'toDate': '201504170000',\n",
	" 'uuid': '8wg55wk6vm'},\n",
	" {'expiresAt': '2015-05-01T16:07:26Z',\n",
	" 'fromDate': '201501010000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/80b1dnt6q.json',\n",
	" 'percentComplete': 0,\n",
	" 'publisher': 'twitter',\n",
	" 'status': 'rejected',\n",
	" 'streamType': 'track',\n",
	" 'title': 'mike j',\n",
	" 'toDate': '201504230001',\n",
	" 'uuid': '80b1dnt6q'},\n",
	" {'expiresAt': '2015-05-09T16:23:16Z',\n",
	" 'fromDate': '201504210000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/pvnzbnwf0b.json',\n",
	" 'percentComplete': 100,\n",
	" 'publisher': 'twitter',\n",
	" 'status': 'delivered',\n",
	" 'streamType': 'track',\n",
	" 'title': 'mike j',\n",
	" 'toDate': '201504230001',\n",
	" 'uuid': 'pvnzbnwf0b'},\n",
	" {'expiresAt': '2015-05-01T18:20:13Z',\n",
	" 'fromDate': '201301010000',\n",
	" 'jobURL': 'https://historical.gnip.com:443/accounts/UCBerkeleyExplore/publishers/twitter/historical/track/jobs/ea88dwtjve.json',\n",
	" 'percentComplete': 0,\n",
	" 'publisher': 'twitter',\n",
	" 'status': 'quoted',\n",
	" 'streamType': 'track',\n",
	" 'title': 'my historical job python',\n",
	" 'toDate': '201301010001',\n",
	" 'uuid': 'ea88dwtjve'}]}"
	]
	},
	"execution_count": 71,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# GET the account's job list. The reply has a 'jobs' list (one dict per job,\n",
	"# with 'title', 'status', 'percentComplete', 'jobURL', ...) and a\n",
	"# 'delivered' block of usage counts.\n",
	"resp = get(jobs_url, auth=auth)\n",
	"jobs_info = resp.json()\n",
	"jobs_info"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}