Skip to content

Instantly share code, notes, and snippets.

@friso
Created June 5, 2014 09:09
Show Gist options
  • Save friso/82445bbdcff9d7ea7755 to your computer and use it in GitHub Desktop.
Save friso/82445bbdcff9d7ea7755 to your computer and use it in GitHub Desktop.
Blog classification notebook
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:80004889a81a21c536ba996a5ff29390a0919d8e1fa3e023b0b83adc9232987a"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import sqlite3 as sql\n",
"import pandas as pd\n",
"from operator import mul"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"connection = sql.connect('blog.db')\n",
"cursor = connection.cursor()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class Product:\n",
" def __init__(self):\n",
" self.product = 1.0\n",
"\n",
" def step(self, value):\n",
" self.product *= value\n",
"\n",
" def finalize(self):\n",
" return self.product\n",
"\n",
"connection.create_aggregate(\"product\", 1, Product)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def query(query, params = None):\n",
" rs = cursor.execute(query, params) if params else cursor.execute(query)\n",
" d = [{\n",
" desc[0]: field\n",
" for desc, field in zip(rs.description, row)\n",
" } for row in rs]\n",
" \n",
" return pd.DataFrame(d)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query('''\n",
"select count(*) from posts\n",
"''')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count(*)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 862</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows \u00d7 1 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
" count(*)\n",
"0 862\n",
"\n",
"[1 rows x 1 columns]"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query('''\n",
"create table training_posts as select id, title, url from posts where id % 2 == 0 order by id limit 40\n",
"''')\n",
"\n",
"query('''\n",
"alter table training_posts add column liked bool\n",
"''')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query('''\n",
"create table test_posts as select id, title, url from posts where id % 2 == 1 order by id limit 40\n",
"''')\n",
"\n",
"query('''\n",
"alter table test_posts add column liked bool\n",
"''')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2014/03/25/organisational-inertia-predictor-for-success-of-agile-transformations-part-2/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2014/03/05/end-of-an-era-in-my-garage/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2014/02/24/david-farley-about-continuous-delivery/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2014/02/11/phantomjs-data-mining-bash-data-analysis/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2014/01/06/speedy-fitnesse-development/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/12/18/composite-user-stories/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/31/mattt-thompson-visits-xebia/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/31/organisational-inertia-a-predictor-for-successful-agile-transformations-part-1/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/18/master-po-episode-2/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/10/15/angularjs-validating-radio-buttons/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/07/introducing-master-po/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/09/26/tutorial-using-deployit-cloud-pack-with-amazon-ec2-part-1/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/09/01/differences-between-providers-in-angularjs/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/22/continuous-delivery-essentials-expect-it-to-break/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/13/continuous-delivery-essentials-autonomous-systems/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/08/error-messages-should-have-a-finger-and-a-tell/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/08/02/on-the-mysteriously-fast-spray-can-web-server/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/30/a-better-custom-viewgroup/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/29/7-rules-for-agile-test-automation/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/19/ios-pixel-misalignment-why-its-bad-how-to-fix-it/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/07/16/on-averages-history-and-predicting-the-short-term-future/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/06/xebia-knowledgecast-episode-2/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/04/dependency-management-in-fitnesse-with-apache-ivy/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/27/whats-your-passion/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/06/15/maven-user-starting-with-javascript-package-management/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/13/constructing-your-parachute-on-the-way-down-overcoming-organizational-gravity-for-smarties/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/06/05/speeding-up-require-js-optimization-in-maven/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/30/agile-crisis-management-explained-part-1/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/17/google-io-2013-day-2/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/14/scripting-deployit-part-2/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/04/10/toyota-kata-by-mike-rother/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/03/06/small-data-workflow-long-transactions-and-db2/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/02/24/android-looper-anatomy/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/01/29/series-how-to-kill-the-architecture-department-part-7/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/01/17/combining-neo4j-and-hadoop-part-ii/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2012/12/13/series-how-to-kill-the-architecture-department-part-6/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2012/12/05/take-the-application-release-market-survey/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/29/measure-the-right-coverage/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/25/how-to-integrate-fitnesse-tests-into-jenkins/'\"\"\")\n",
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2012/11/18/performance-testing-with-selenium-and-jmeter/'\"\"\")\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <tbody>\n",
" <tr>\n",
" <td>Int64Index([], dtype='int64')</td>\n",
" <td>Empty DataFrame</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows \u00d7 0 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: []\n",
"\n",
"[0 rows x 0 columns]"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2014/05/17/wrestling-with-the-browser-event-queue/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2014/03/08/angularjs-e2e-testing-using-ngmocke2e/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2014/02/26/on-quality-beauty-elegance/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2014/02/23/promises-and-design-patterns-in-angularjs/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2014/01/31/ngclass-expressions-in-angularjs/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/12/28/phoney-deadlines-are-deadly-for-achieving-results/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/11/12/de-oneindige-cirkel-van-succes/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/31/can-continuous-delivery-succeed-without-management-support/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/22/group-conformity-velocity-in-scrum/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/16/agilehome/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/10/12/testing-promises-in-angularjs/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/09/27/mattt-thompson-at-xebia/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/09/04/the-secret-to-3-digit-productivity-growth/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/08/28/bootstraps-tabs-and-lazy-data-loading-in-angularjs/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/08/21/why-even-spray-can-is-way-to-slow-for-my-purposes/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/12/new-scrum-process-overview-update/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/04/android-tests-with-proguard/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/31/functionality-is-lame-quality-rules/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/29/continuous-delivery-essentials-providing-a-platform-as-a-service/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/22/viewholder-considered-harmful/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/18/separating-responsibilities-stalls-the-continuous-delivery-process/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/11/big-fat-war-files-hurt-continuous-delivery/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/05/true-reliable-continuous-delivery/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/28/you-cannot-change-anything-without-starting-a-project/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/24/the-truth-about-agile-management-and-how-it-can-help-you/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/14/conference-report-plot-visits-xebicon-2013/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/13/agile-crisis-management-explained-part-2/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/06/03/xebia-knowledgecast-reboot/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/22/kanban/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/16/google-io-2013-some-highlights-from-day-1/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/04/21/developing-a-soa-based-integration-layer-framework-features/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/03/24/scripting-deployit/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/03/04/fitnesse-and-dependency-management-with-maven/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/02/23/sketchnoting-for-absolute-beginners/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/01/21/finding-important-connections-in-a-network-automatically/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2012/12/19/developing-a-soa-based-integration-layer-framework-challenges/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2012/12/12/exponential-backoff-with-akka-actors/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/30/openshift-reviewed-part-2/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/26/openshift-reviewed/'\"\"\")\n",
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/20/vmware-vfabric-application-director-1-0-demonstation-setup-at-xebia/'\"\"\")\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <tbody>\n",
" <tr>\n",
" <td>Int64Index([], dtype='int64')</td>\n",
" <td>Empty DataFrame</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows \u00d7 0 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: []\n",
"\n",
"[0 rows x 0 columns]"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query('''create table post_features (post_id integer, feature text)''')\n",
"query('''insert into post_features (post_id, feature) select id, 'has_math' from posts where has_math = 1''')\n",
"query('''insert into post_features (post_id, feature) select id, 'has_code' from posts where has_code = 1''')\n",
"query('''insert into post_features (post_id, feature) select id, 'has_images' from posts where has_images = 1''')\n",
"query('''insert into post_features (post_id, feature) select id, 'has_iframe' from posts where has_iframe = 1''')\n",
"query('''insert into post_features (post_id, feature) select id, language from posts''')\n",
"query('''insert into post_features (post_id, feature) select post_id, code_language from code_languages''')\n",
"query('''insert into post_features (post_id, feature) select post_id, domain from domains''')\n",
"query('''insert into post_features (post_id, feature) select post_id, tag from tags''')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <tbody>\n",
" <tr>\n",
" <td>Int64Index([], dtype='int64')</td>\n",
" <td>Empty DataFrame</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows \u00d7 0 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: []\n",
"\n",
"[0 rows x 0 columns]"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query('''\n",
"create table feature_probs as\n",
"select\n",
" pf.feature as feature,\n",
" count(*) + 1 as occurrence,\n",
" count(case when tp.liked = 1 then 1 end) + 1 as likes,\n",
" count(case when tp.liked = 0 then 1 end) + 1 as dislikes\n",
"from\n",
" post_features pf\n",
" join training_posts tp on pf.post_id = tp.id\n",
"group by\n",
" pf.feature\n",
"''')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <tbody>\n",
" <tr>\n",
" <td>Int64Index([], dtype='int64')</td>\n",
" <td>Empty DataFrame</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows \u00d7 0 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: []\n",
"\n",
"[0 rows x 0 columns]"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query('select * from feature_probs where likes > dislikes order by occurrence desc limit 50')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>dislikes</th>\n",
" <th>feature</th>\n",
" <th>likes</th>\n",
" <th>occurrence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0 </th>\n",
" <td> 5</td>\n",
" <td> has_code</td>\n",
" <td> 8</td>\n",
" <td> 12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1 </th>\n",
" <td> 1</td>\n",
" <td> Javascript</td>\n",
" <td> 4</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2 </th>\n",
" <td> 2</td>\n",
" <td> Maven</td>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3 </th>\n",
" <td> 1</td>\n",
" <td> bash</td>\n",
" <td> 4</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4 </th>\n",
" <td> 1</td>\n",
" <td> jscript</td>\n",
" <td> 4</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5 </th>\n",
" <td> 1</td>\n",
" <td> AngularJS</td>\n",
" <td> 3</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6 </th>\n",
" <td> 1</td>\n",
" <td> NodeJS</td>\n",
" <td> 3</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7 </th>\n",
" <td> 1</td>\n",
" <td> requirejs.org</td>\n",
" <td> 3</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8 </th>\n",
" <td> 1</td>\n",
" <td> Selenium</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9 </th>\n",
" <td> 1</td>\n",
" <td> Xebia</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td> 1</td>\n",
" <td> angularjs.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td> 1</td>\n",
" <td> automated deployment</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td> 1</td>\n",
" <td> automated provisioning</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td> 1</td>\n",
" <td> bower.io</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td> 1</td>\n",
" <td> brew.sh</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td> 1</td>\n",
" <td> casper</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td> 1</td>\n",
" <td> casperjs.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td> 1</td>\n",
" <td> cloud</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td> 1</td>\n",
" <td> console.aws.amazon.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td> 1</td>\n",
" <td> developer.mozilla.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td> 1</td>\n",
" <td> development</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td> 1</td>\n",
" <td> digitalstain.blogspot.nl</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td> 1</td>\n",
" <td> docs.angularjs.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td> 1</td>\n",
" <td> docs.seleniumhq.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td> 1</td>\n",
" <td> gist.github.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td> 1</td>\n",
" <td> help.ubuntu.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td> 1</td>\n",
" <td> localhost:8080</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td> 1</td>\n",
" <td> mojo.codehaus.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td> 1</td>\n",
" <td> mxcl.github.io</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td> 1</td>\n",
" <td> nodejs.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td> 1</td>\n",
" <td> npmjs.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td> 1</td>\n",
" <td> nuget.codeplex.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td> 1</td>\n",
" <td> nuget.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td> 1</td>\n",
" <td> online.wsj.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td> 1</td>\n",
" <td> phantomjs.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td> 1</td>\n",
" <td> providers</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td> 1</td>\n",
" <td> publib.boulder.ibm.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td> 1</td>\n",
" <td> shell</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td> 1</td>\n",
" <td> sindresorhus.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td> 1</td>\n",
" <td> software</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td> 1</td>\n",
" <td> static.springsource.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td> 1</td>\n",
" <td> tidy.sourceforge.net</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td> 1</td>\n",
" <td> waredingen.nl</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td> 1</td>\n",
" <td> wiki.debian.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td> 1</td>\n",
" <td> www-01.ibm.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td> 1</td>\n",
" <td> www.activiti.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td> 1</td>\n",
" <td> www.angularjs.org</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td> 1</td>\n",
" <td> www.codinghorror.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td> 1</td>\n",
" <td> www.jetbrains.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td> 1</td>\n",
" <td> www.sublimetext.com</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>50 rows \u00d7 4 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 23,
"text": [
" dislikes feature likes occurrence\n",
"0 5 has_code 8 12\n",
"1 1 Javascript 4 4\n",
"2 2 Maven 3 4\n",
"3 1 bash 4 4\n",
"4 1 jscript 4 4\n",
"5 1 AngularJS 3 3\n",
"6 1 NodeJS 3 3\n",
"7 1 requirejs.org 3 3\n",
"8 1 Selenium 2 2\n",
"9 1 Xebia 2 2\n",
"10 1 angularjs.org 2 2\n",
"11 1 automated deployment 2 2\n",
"12 1 automated provisioning 2 2\n",
"13 1 bower.io 2 2\n",
"14 1 brew.sh 2 2\n",
"15 1 casper 2 2\n",
"16 1 casperjs.org 2 2\n",
"17 1 cloud 2 2\n",
"18 1 console.aws.amazon.com 2 2\n",
"19 1 developer.mozilla.org 2 2\n",
"20 1 development 2 2\n",
"21 1 digitalstain.blogspot.nl 2 2\n",
"22 1 docs.angularjs.org 2 2\n",
"23 1 docs.seleniumhq.org 2 2\n",
"24 1 gist.github.com 2 2\n",
"25 1 help.ubuntu.com 2 2\n",
"26 1 localhost:8080 2 2\n",
"27 1 mojo.codehaus.org 2 2\n",
"28 1 mxcl.github.io 2 2\n",
"29 1 nodejs.org 2 2\n",
"30 1 npmjs.org 2 2\n",
"31 1 nuget.codeplex.com 2 2\n",
"32 1 nuget.org 2 2\n",
"33 1 online.wsj.com 2 2\n",
"34 1 phantomjs.org 2 2\n",
"35 1 providers 2 2\n",
"36 1 publib.boulder.ibm.com 2 2\n",
"37 1 shell 2 2\n",
"38 1 sindresorhus.com 2 2\n",
"39 1 software 2 2\n",
"40 1 static.springsource.org 2 2\n",
"41 1 tidy.sourceforge.net 2 2\n",
"42 1 waredingen.nl 2 2\n",
"43 1 wiki.debian.org 2 2\n",
"44 1 www-01.ibm.com 2 2\n",
"45 1 www.activiti.org 2 2\n",
"46 1 www.angularjs.org 2 2\n",
"47 1 www.codinghorror.com 2 2\n",
"48 1 www.jetbrains.com 2 2\n",
"49 1 www.sublimetext.com 2 2\n",
"\n",
"[50 rows x 4 columns]"
]
}
],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query('select liked, count(*) from training_posts group by liked')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count(*)</th>\n",
" <th>liked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 28</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 12</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows \u00d7 2 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
" count(*) liked\n",
"0 28 0\n",
"1 12 1\n",
"\n",
"[2 rows x 2 columns]"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query('''\n",
"select\n",
" p.feature,\n",
" p.likes,\n",
" p.dislikes,\n",
" (p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0) as p_like,\n",
" (p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0) as p_dislike\n",
"from feature_probs p\n",
" join post_features pf on p.feature = pf.feature\n",
"where\n",
" pf.post_id = 1\n",
"''')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>dislikes</th>\n",
" <th>feature</th>\n",
" <th>likes</th>\n",
" <th>p_dislike</th>\n",
" <th>p_like</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 5</td>\n",
" <td> has_code</td>\n",
" <td> 8</td>\n",
" <td> 0.416667</td>\n",
" <td> 0.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 16</td>\n",
" <td> has_images</td>\n",
" <td> 9</td>\n",
" <td> 0.666667</td>\n",
" <td> 0.375000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 29</td>\n",
" <td> en</td>\n",
" <td> 13</td>\n",
" <td> 0.707317</td>\n",
" <td> 0.317073</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 3</td>\n",
" <td> xml</td>\n",
" <td> 3</td>\n",
" <td> 0.600000</td>\n",
" <td> 0.600000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 1</td>\n",
" <td> jscript</td>\n",
" <td> 4</td>\n",
" <td> 0.250000</td>\n",
" <td> 1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> 15</td>\n",
" <td> blog.xebia.com</td>\n",
" <td> 8</td>\n",
" <td> 0.681818</td>\n",
" <td> 0.363636</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6 rows \u00d7 5 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
" dislikes feature likes p_dislike p_like\n",
"0 5 has_code 8 0.416667 0.666667\n",
"1 16 has_images 9 0.666667 0.375000\n",
"2 29 en 13 0.707317 0.317073\n",
"3 3 xml 3 0.600000 0.600000\n",
"4 1 jscript 4 0.250000 1.000000\n",
"5 15 blog.xebia.com 8 0.681818 0.363636\n",
"\n",
"[6 rows x 5 columns]"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = query('''\n",
"select\n",
" pf.post_id,\n",
" tp.liked,\n",
" product((p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0)) as p_like,\n",
" product((p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0)) as p_dislike,\n",
" product((p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0)) > product((p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0)) as classification\n",
"from feature_probs p\n",
" join post_features pf on p.feature = pf.feature\n",
" join test_posts tp on pf.post_id = tp.id\n",
"group by pf.post_id\n",
"''')\n",
"\n",
"df[:15]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>classification</th>\n",
" <th>liked</th>\n",
" <th>p_dislike</th>\n",
" <th>p_like</th>\n",
" <th>post_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0 </th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0.020094</td>\n",
" <td> 0.017295</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1 </th>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.026524</td>\n",
" <td> 0.063415</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2 </th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0.471545</td>\n",
" <td> 0.118902</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3 </th>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.000209</td>\n",
" <td> 0.014412</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4 </th>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.002456</td>\n",
" <td> 0.047561</td>\n",
" <td> 9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5 </th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0.200942</td>\n",
" <td> 0.021619</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6 </th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0.471545</td>\n",
" <td> 0.118902</td>\n",
" <td> 15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7 </th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0.707317</td>\n",
" <td> 0.317073</td>\n",
" <td> 17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8 </th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0.321508</td>\n",
" <td> 0.001601</td>\n",
" <td> 19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9 </th>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.008187</td>\n",
" <td> 0.013211</td>\n",
" <td> 21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0.707317</td>\n",
" <td> 0.009909</td>\n",
" <td> 23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0.321508</td>\n",
" <td> 0.000267</td>\n",
" <td> 25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0.000368</td>\n",
" <td> 0.023780</td>\n",
" <td> 27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0.120565</td>\n",
" <td> 0.009608</td>\n",
" <td> 29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0.321508</td>\n",
" <td> 0.001601</td>\n",
" <td> 31</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>15 rows \u00d7 5 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 27,
"text": [
" classification liked p_dislike p_like post_id\n",
"0 0 1 0.020094 0.017295 1\n",
"1 1 1 0.026524 0.063415 3\n",
"2 0 0 0.471545 0.118902 5\n",
"3 1 1 0.000209 0.014412 7\n",
"4 1 1 0.002456 0.047561 9\n",
"5 0 0 0.200942 0.021619 11\n",
"6 0 0 0.471545 0.118902 15\n",
"7 0 0 0.707317 0.317073 17\n",
"8 0 0 0.321508 0.001601 19\n",
"9 1 1 0.008187 0.013211 21\n",
"10 0 0 0.707317 0.009909 23\n",
"11 0 0 0.321508 0.000267 25\n",
"12 1 1 0.000368 0.023780 27\n",
"13 0 1 0.120565 0.009608 29\n",
"14 0 0 0.321508 0.001601 31\n",
"\n",
"[15 rows x 5 columns]"
]
}
],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = query('''\n",
"select liked, classification, count(*)\n",
"from (\n",
"select\n",
" pf.post_id,\n",
" tp.liked,\n",
" product((p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0)) as p_like,\n",
" product((p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0)) as p_dislike,\n",
" product((p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0)) > product((p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0)) as classification\n",
"from feature_probs p\n",
" join post_features pf on p.feature = pf.feature\n",
" join test_posts tp on pf.post_id = tp.id\n",
"group by pf.post_id\n",
")\n",
"group by liked, classification\n",
"''')\n",
"\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>classification</th>\n",
" <th>count(*)</th>\n",
" <th>liked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 29</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 1</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows \u00d7 3 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"text": [
" classification count(*) liked\n",
"0 0 29 0\n",
"1 0 5 1\n",
"2 1 5 1\n",
"\n",
"[3 rows x 3 columns]"
]
}
],
"prompt_number": 26
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment