Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save AndrewGreen/b4376579aacf62f28b086656f0142b79 to your computer and use it in GitHub Desktop.
Save AndrewGreen/b4376579aacf62f28b086656f0142b79 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied (use --upgrade to upgrade): pydruid in ./venv/lib/python3.4/site-packages\r\n",
"Requirement already satisfied (use --upgrade to upgrade): pandas in ./venv/lib/python3.4/site-packages\r\n",
"Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2 in ./venv/lib/python3.4/site-packages (from pandas)\r\n",
"Requirement already satisfied (use --upgrade to upgrade): pytz>=2011k in ./venv/lib/python3.4/site-packages (from pandas)\r\n",
"Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0 in ./venv/lib/python3.4/site-packages (from pandas)\r\n",
"Cleaning up...\r\n"
]
}
],
"source": [
"!pip install pydruid pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pydruid.client import *\n",
"from pydruid.utils.aggregators import *\n",
"from pydruid.utils.filters import *\n",
"import matplotlib.pyplot as plt\n",
"import pandas"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Temporay hack to get around Jupyter proxy settings\n",
"class PyDruidIgnoreProxy(PyDruid):\n",
" def _post(self, query):\n",
" try:\n",
" headers, querystr, url = self._prepare_url_headers_and_body(query)\n",
" req = urllib.request.Request(url, querystr, headers)\n",
" proxy_handler = urllib.request.ProxyHandler({})\n",
" opener = urllib.request.build_opener(proxy_handler) \n",
" res = opener.open(req)\n",
" data = res.read().decode(\"utf-8\")\n",
" res.close()\n",
" except urllib.error.HTTPError:\n",
" _, e, _ = sys.exc_info()\n",
" err = None\n",
" if e.code == 500:\n",
" # has Druid returned an error?\n",
" try:\n",
" err = json.loads(e.read().decode(\"utf-8\"))\n",
" except (ValueError, AttributeError, KeyError):\n",
" pass\n",
" else:\n",
" err = err.get('error', None)\n",
"\n",
" raise IOError('{0} \\n Druid Error: {1} \\n Query is: {2}'.format(\n",
" e, err, json.dumps(query.query_dict, indent=4)))\n",
" else:\n",
" query.parse(data)\n",
" return query"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"query = PyDruidIgnoreProxy('<druid_URL>', 'druid/v2')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Pageviews for a specific country, project and language, on desktop\n",
"pv_druid = query.timeseries(\n",
" datasource = 'pageviews-hourly',\n",
" granularity = 'hour',\n",
" intervals = '2017-09-06/2017-09-30',\n",
" aggregations = {'pageviews': doublesum('view_count')},\n",
" filter = (\n",
" (Dimension('country_code') == 'JP') &\n",
" (Dimension('project') == 'en.wikipedia') &\n",
" (Dimension('agent_type') == 'user') &\n",
" (Dimension('access_method') == 'desktop')\n",
" ) \n",
")\n",
"pv = query.export_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Impressions for the same segment\n",
"imp_druid = query.timeseries(\n",
" datasource = 'banner_activity_minutely',\n",
" granularity = 'hour',\n",
" intervals = '2017-09-06/2017-09-30',\n",
" aggregations = {'impressions': longsum('request_count')},\n",
" filter = (\n",
" (Dimension('country') == 'JP') &\n",
" (Dimension('project') == 'wikipedia') &\n",
" (Dimension('uselang') == 'en') &\n",
" (Dimension('anonymous') == True) &\n",
" (Dimension('device') == 'desktop')\n",
" ) \n",
")\n",
"imp = query.export_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Merge results from the two data sets\n",
"rates = pandas.merge(pv, imp, how='left', on=['timestamp'])\n",
"rates['rate'] = rates.apply(lambda row: row.impressions/row.pageviews, axis=1)\n",
"# rates['timestamp'] = rates['timestamp'].map(lambda x: x.split('T')[0])\n",
"rates.rename(columns = {'timestamp':'time'}, inplace = True )\n",
"rates = rates.reindex_axis(['time', 'impressions', 'pageviews', 'rate'], axis='columns')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Plot the results\n",
"rates.plot(x='time', y='rate')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment