Instantly share code, notes, and snippets.
Created
October 2, 2017 18:32
-
Star
(0)
0
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save AndrewGreen/b4376579aacf62f28b086656f0142b79 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied (use --upgrade to upgrade): pydruid in ./venv/lib/python3.4/site-packages\r\n", | |
"Requirement already satisfied (use --upgrade to upgrade): pandas in ./venv/lib/python3.4/site-packages\r\n", | |
"Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2 in ./venv/lib/python3.4/site-packages (from pandas)\r\n", | |
"Requirement already satisfied (use --upgrade to upgrade): pytz>=2011k in ./venv/lib/python3.4/site-packages (from pandas)\r\n", | |
"Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0 in ./venv/lib/python3.4/site-packages (from pandas)\r\n", | |
"Cleaning up...\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install pydruid pandas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Explicit imports (wildcard imports hide where names come from);\n",
"# urllib/sys/json are used by the PyDruidIgnoreProxy cell below and\n",
"# previously only worked because they leaked through `import *`.\n",
"from pydruid.client import PyDruid\n",
"from pydruid.utils.aggregators import doublesum, longsum\n",
"from pydruid.utils.filters import Dimension\n",
"import urllib.request\n",
"import urllib.error\n",
"import sys\n",
"import json\n",
"import matplotlib.pyplot as plt\n",
"import pandas"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Temporary hack to get around Jupyter proxy settings\n",
"class PyDruidIgnoreProxy(PyDruid):\n",
"    \"\"\"PyDruid client that posts queries while ignoring any configured HTTP proxy.\"\"\"\n",
"\n",
"    def _post(self, query):\n",
"        try:\n",
"            headers, querystr, url = self._prepare_url_headers_and_body(query)\n",
"            req = urllib.request.Request(url, querystr, headers)\n",
"            # An empty ProxyHandler disables proxy autodetection from the environment\n",
"            proxy_handler = urllib.request.ProxyHandler({})\n",
"            opener = urllib.request.build_opener(proxy_handler)\n",
"            res = opener.open(req)\n",
"            try:\n",
"                data = res.read().decode(\"utf-8\")\n",
"            finally:\n",
"                # Close the response even if read()/decode() raises\n",
"                res.close()\n",
"        except urllib.error.HTTPError:\n",
"            _, e, _ = sys.exc_info()\n",
"            err = None\n",
"            if e.code == 500:\n",
"                # has Druid returned an error?\n",
"                try:\n",
"                    err = json.loads(e.read().decode(\"utf-8\"))\n",
"                except (ValueError, AttributeError, KeyError):\n",
"                    pass\n",
"                else:\n",
"                    err = err.get('error', None)\n",
"\n",
"            raise IOError('{0} \\n Druid Error: {1} \\n Query is: {2}'.format(\n",
"                e, err, json.dumps(query.query_dict, indent=4)))\n",
"        else:\n",
"            query.parse(data)\n",
"            return query"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"query = PyDruidIgnoreProxy('<druid_URL>', 'druid/v2')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Hourly desktop pageviews by human users for one country/project/language\n",
"pv_druid = query.timeseries(\n",
"    datasource='pageviews-hourly',\n",
"    granularity='hour',\n",
"    intervals='2017-09-06/2017-09-30',\n",
"    aggregations={'pageviews': doublesum('view_count')},\n",
"    filter=(\n",
"        (Dimension('country_code') == 'JP') &\n",
"        (Dimension('project') == 'en.wikipedia') &\n",
"        (Dimension('agent_type') == 'user') &\n",
"        (Dimension('access_method') == 'desktop')\n",
"    )\n",
")\n",
"pv = query.export_pandas()"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Banner impressions for the same segment as the pageview query above\n",
"imp_druid = query.timeseries(\n",
"    datasource='banner_activity_minutely',\n",
"    granularity='hour',\n",
"    intervals='2017-09-06/2017-09-30',\n",
"    aggregations={'impressions': longsum('request_count')},\n",
"    filter=(\n",
"        (Dimension('country') == 'JP') &\n",
"        (Dimension('project') == 'wikipedia') &\n",
"        (Dimension('uselang') == 'en') &\n",
"        (Dimension('anonymous') == True) &\n",
"        (Dimension('device') == 'desktop')\n",
"    )\n",
")\n",
"imp = query.export_pandas()"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Merge the two result sets on their shared hourly timestamp\n",
"rates = pandas.merge(pv, imp, how='left', on=['timestamp'])\n",
"# Vectorized column division instead of a row-wise apply (same result, much faster)\n",
"rates['rate'] = rates['impressions'] / rates['pageviews']\n",
"rates = rates.rename(columns={'timestamp': 'time'})\n",
"# reindex_axis was deprecated and removed in pandas 1.0; reindex(columns=...) replaces it\n",
"rates = rates.reindex(columns=['time', 'impressions', 'pageviews', 'rate'])"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Plot the impressions-per-pageview rate over time\n",
"ax = rates.plot(x='time', y='rate')\n",
"# Label the axes so the figure stands alone when skimmed\n",
"ax.set(xlabel='time (hourly)', ylabel='impressions / pageview')\n",
"plt.show()"
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment