Created
June 5, 2014 09:09
-
-
Save friso/82445bbdcff9d7ea7755 to your computer and use it in GitHub Desktop.
Blog classification notebook
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:80004889a81a21c536ba996a5ff29390a0919d8e1fa3e023b0b83adc9232987a" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import sqlite3 as sql\n", | |
"import pandas as pd\n", | |
"from operator import mul" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"connection = sql.connect('blog.db')\n", | |
"cursor = connection.cursor()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"class Product:\n", | |
" def __init__(self):\n", | |
" self.product = 1.0\n", | |
"\n", | |
" def step(self, value):\n", | |
" self.product *= value\n", | |
"\n", | |
" def finalize(self):\n", | |
" return self.product\n", | |
"\n", | |
"connection.create_aggregate(\"product\", 1, Product)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def query(query, params = None):\n", | |
" rs = cursor.execute(query, params) if params else cursor.execute(query)\n", | |
" d = [{\n", | |
" desc[0]: field\n", | |
" for desc, field in zip(rs.description, row)\n", | |
" } for row in rs]\n", | |
" \n", | |
" return pd.DataFrame(d)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query('''\n", | |
"select count(*) from posts\n", | |
"''')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>count(*)</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 862</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1 rows \u00d7 1 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
" count(*)\n", | |
"0 862\n", | |
"\n", | |
"[1 rows x 1 columns]" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query('''\n", | |
"create table training_posts as select id, title, url from posts where id % 2 == 0 order by id limit 40\n", | |
"''')\n", | |
"\n", | |
"query('''\n", | |
"alter table training_posts add column liked bool\n", | |
"''')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query('''\n", | |
"create table test_posts as select id, title, url from posts where id % 2 == 1 order by id limit 40\n", | |
"''')\n", | |
"\n", | |
"query('''\n", | |
"alter table test_posts add column liked bool\n", | |
"''')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2014/03/25/organisational-inertia-predictor-for-success-of-agile-transformations-part-2/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2014/03/05/end-of-an-era-in-my-garage/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2014/02/24/david-farley-about-continuous-delivery/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2014/02/11/phantomjs-data-mining-bash-data-analysis/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2014/01/06/speedy-fitnesse-development/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/12/18/composite-user-stories/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/31/mattt-thompson-visits-xebia/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/31/organisational-inertia-a-predictor-for-successful-agile-transformations-part-1/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/18/master-po-episode-2/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/10/15/angularjs-validating-radio-buttons/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/07/introducing-master-po/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/09/26/tutorial-using-deployit-cloud-pack-with-amazon-ec2-part-1/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/09/01/differences-between-providers-in-angularjs/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/22/continuous-delivery-essentials-expect-it-to-break/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/13/continuous-delivery-essentials-autonomous-systems/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/08/error-messages-should-have-a-finger-and-a-tell/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/08/02/on-the-mysteriously-fast-spray-can-web-server/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/30/a-better-custom-viewgroup/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/29/7-rules-for-agile-test-automation/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/19/ios-pixel-misalignment-why-its-bad-how-to-fix-it/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/07/16/on-averages-history-and-predicting-the-short-term-future/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/06/xebia-knowledgecast-episode-2/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/04/dependency-management-in-fitnesse-with-apache-ivy/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/27/whats-your-passion/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/06/15/maven-user-starting-with-javascript-package-management/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/13/constructing-your-parachute-on-the-way-down-overcoming-organizational-gravity-for-smarties/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/06/05/speeding-up-require-js-optimization-in-maven/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/30/agile-crisis-management-explained-part-1/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/17/google-io-2013-day-2/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/14/scripting-deployit-part-2/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/04/10/toyota-kata-by-mike-rother/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/03/06/small-data-workflow-long-transactions-and-db2/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/02/24/android-looper-anatomy/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2013/01/29/series-how-to-kill-the-architecture-department-part-7/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2013/01/17/combining-neo4j-and-hadoop-part-ii/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2012/12/13/series-how-to-kill-the-architecture-department-part-6/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2012/12/05/take-the-application-release-market-survey/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/29/measure-the-right-coverage/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/25/how-to-integrate-fitnesse-tests-into-jenkins/'\"\"\")\n", | |
"query(\"\"\"update training_posts set liked = 1 where url = 'http://blog.xebia.com/2012/11/18/performance-testing-with-selenium-and-jmeter/'\"\"\")\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>Int64Index([], dtype='int64')</td>\n", | |
" <td>Empty DataFrame</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>0 rows \u00d7 0 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"Empty DataFrame\n", | |
"Columns: []\n", | |
"Index: []\n", | |
"\n", | |
"[0 rows x 0 columns]" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2014/05/17/wrestling-with-the-browser-event-queue/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2014/03/08/angularjs-e2e-testing-using-ngmocke2e/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2014/02/26/on-quality-beauty-elegance/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2014/02/23/promises-and-design-patterns-in-angularjs/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2014/01/31/ngclass-expressions-in-angularjs/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/12/28/phoney-deadlines-are-deadly-for-achieving-results/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/11/12/de-oneindige-cirkel-van-succes/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/31/can-continuous-delivery-succeed-without-management-support/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/22/group-conformity-velocity-in-scrum/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/10/16/agilehome/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/10/12/testing-promises-in-angularjs/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/09/27/mattt-thompson-at-xebia/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/09/04/the-secret-to-3-digit-productivity-growth/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/08/28/bootstraps-tabs-and-lazy-data-loading-in-angularjs/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/08/21/why-even-spray-can-is-way-to-slow-for-my-purposes/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/12/new-scrum-process-overview-update/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/08/04/android-tests-with-proguard/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/31/functionality-is-lame-quality-rules/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/29/continuous-delivery-essentials-providing-a-platform-as-a-service/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/22/viewholder-considered-harmful/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/18/separating-responsibilities-stalls-the-continuous-delivery-process/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/11/big-fat-war-files-hurt-continuous-delivery/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/07/05/true-reliable-continuous-delivery/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/28/you-cannot-change-anything-without-starting-a-project/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/24/the-truth-about-agile-management-and-how-it-can-help-you/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/14/conference-report-plot-visits-xebicon-2013/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/06/13/agile-crisis-management-explained-part-2/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/06/03/xebia-knowledgecast-reboot/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/22/kanban/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/05/16/google-io-2013-some-highlights-from-day-1/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/04/21/developing-a-soa-based-integration-layer-framework-features/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/03/24/scripting-deployit/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/03/04/fitnesse-and-dependency-management-with-maven/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2013/02/23/sketchnoting-for-absolute-beginners/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2013/01/21/finding-important-connections-in-a-network-automatically/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2012/12/19/developing-a-soa-based-integration-layer-framework-challenges/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 1 where url = 'http://blog.xebia.com/2012/12/12/exponential-backoff-with-akka-actors/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/30/openshift-reviewed-part-2/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/26/openshift-reviewed/'\"\"\")\n", | |
"query(\"\"\"update test_posts set liked = 0 where url = 'http://blog.xebia.com/2012/11/20/vmware-vfabric-application-director-1-0-demonstation-setup-at-xebia/'\"\"\")\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>Int64Index([], dtype='int64')</td>\n", | |
" <td>Empty DataFrame</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>0 rows \u00d7 0 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"Empty DataFrame\n", | |
"Columns: []\n", | |
"Index: []\n", | |
"\n", | |
"[0 rows x 0 columns]" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query('''create table post_features (post_id integer, feature text)''')\n", | |
"query('''insert into post_features (post_id, feature) select id, 'has_math' from posts where has_math = 1''')\n", | |
"query('''insert into post_features (post_id, feature) select id, 'has_code' from posts where has_code = 1''')\n", | |
"query('''insert into post_features (post_id, feature) select id, 'has_images' from posts where has_images = 1''')\n", | |
"query('''insert into post_features (post_id, feature) select id, 'has_iframe' from posts where has_iframe = 1''')\n", | |
"query('''insert into post_features (post_id, feature) select id, language from posts''')\n", | |
"query('''insert into post_features (post_id, feature) select post_id, code_language from code_languages''')\n", | |
"query('''insert into post_features (post_id, feature) select post_id, domain from domains''')\n", | |
"query('''insert into post_features (post_id, feature) select post_id, tag from tags''')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>Int64Index([], dtype='int64')</td>\n", | |
" <td>Empty DataFrame</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>0 rows \u00d7 0 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
"Empty DataFrame\n", | |
"Columns: []\n", | |
"Index: []\n", | |
"\n", | |
"[0 rows x 0 columns]" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query('''\n", | |
"create table feature_probs as\n", | |
"select\n", | |
" pf.feature as feature,\n", | |
" count(*) + 1 as occurrence,\n", | |
" count(case when tp.liked = 1 then 1 end) + 1 as likes,\n", | |
" count(case when tp.liked = 0 then 1 end) + 1 as dislikes\n", | |
"from\n", | |
" post_features pf\n", | |
" join training_posts tp on pf.post_id = tp.id\n", | |
"group by\n", | |
" pf.feature\n", | |
"''')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>Int64Index([], dtype='int64')</td>\n", | |
" <td>Empty DataFrame</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>0 rows \u00d7 0 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"Empty DataFrame\n", | |
"Columns: []\n", | |
"Index: []\n", | |
"\n", | |
"[0 rows x 0 columns]" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query('select * from feature_probs where likes > dislikes order by occurrence desc limit 50')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>dislikes</th>\n", | |
" <th>feature</th>\n", | |
" <th>likes</th>\n", | |
" <th>occurrence</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0 </th>\n", | |
" <td> 5</td>\n", | |
" <td> has_code</td>\n", | |
" <td> 8</td>\n", | |
" <td> 12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1 </th>\n", | |
" <td> 1</td>\n", | |
" <td> Javascript</td>\n", | |
" <td> 4</td>\n", | |
" <td> 4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2 </th>\n", | |
" <td> 2</td>\n", | |
" <td> Maven</td>\n", | |
" <td> 3</td>\n", | |
" <td> 4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3 </th>\n", | |
" <td> 1</td>\n", | |
" <td> bash</td>\n", | |
" <td> 4</td>\n", | |
" <td> 4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4 </th>\n", | |
" <td> 1</td>\n", | |
" <td> jscript</td>\n", | |
" <td> 4</td>\n", | |
" <td> 4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5 </th>\n", | |
" <td> 1</td>\n", | |
" <td> AngularJS</td>\n", | |
" <td> 3</td>\n", | |
" <td> 3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6 </th>\n", | |
" <td> 1</td>\n", | |
" <td> NodeJS</td>\n", | |
" <td> 3</td>\n", | |
" <td> 3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7 </th>\n", | |
" <td> 1</td>\n", | |
" <td> requirejs.org</td>\n", | |
" <td> 3</td>\n", | |
" <td> 3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8 </th>\n", | |
" <td> 1</td>\n", | |
" <td> Selenium</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9 </th>\n", | |
" <td> 1</td>\n", | |
" <td> Xebia</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td> 1</td>\n", | |
" <td> angularjs.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td> 1</td>\n", | |
" <td> automated deployment</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td> 1</td>\n", | |
" <td> automated provisioning</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td> 1</td>\n", | |
" <td> bower.io</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td> 1</td>\n", | |
" <td> brew.sh</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td> 1</td>\n", | |
" <td> casper</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td> 1</td>\n", | |
" <td> casperjs.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td> 1</td>\n", | |
" <td> cloud</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td> 1</td>\n", | |
" <td> console.aws.amazon.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td> 1</td>\n", | |
" <td> developer.mozilla.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td> 1</td>\n", | |
" <td> development</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td> 1</td>\n", | |
" <td> digitalstain.blogspot.nl</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td> 1</td>\n", | |
" <td> docs.angularjs.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td> 1</td>\n", | |
" <td> docs.seleniumhq.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td> 1</td>\n", | |
" <td> gist.github.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td> 1</td>\n", | |
" <td> help.ubuntu.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td> 1</td>\n", | |
" <td> localhost:8080</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td> 1</td>\n", | |
" <td> mojo.codehaus.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td> 1</td>\n", | |
" <td> mxcl.github.io</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29</th>\n", | |
" <td> 1</td>\n", | |
" <td> nodejs.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>30</th>\n", | |
" <td> 1</td>\n", | |
" <td> npmjs.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td> 1</td>\n", | |
" <td> nuget.codeplex.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>32</th>\n", | |
" <td> 1</td>\n", | |
" <td> nuget.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>33</th>\n", | |
" <td> 1</td>\n", | |
" <td> online.wsj.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>34</th>\n", | |
" <td> 1</td>\n", | |
" <td> phantomjs.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>35</th>\n", | |
" <td> 1</td>\n", | |
" <td> providers</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>36</th>\n", | |
" <td> 1</td>\n", | |
" <td> publib.boulder.ibm.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>37</th>\n", | |
" <td> 1</td>\n", | |
" <td> shell</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td> 1</td>\n", | |
" <td> sindresorhus.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>39</th>\n", | |
" <td> 1</td>\n", | |
" <td> software</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>40</th>\n", | |
" <td> 1</td>\n", | |
" <td> static.springsource.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41</th>\n", | |
" <td> 1</td>\n", | |
" <td> tidy.sourceforge.net</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>42</th>\n", | |
" <td> 1</td>\n", | |
" <td> waredingen.nl</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>43</th>\n", | |
" <td> 1</td>\n", | |
" <td> wiki.debian.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>44</th>\n", | |
" <td> 1</td>\n", | |
" <td> www-01.ibm.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>45</th>\n", | |
" <td> 1</td>\n", | |
" <td> www.activiti.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>46</th>\n", | |
" <td> 1</td>\n", | |
" <td> www.angularjs.org</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>47</th>\n", | |
" <td> 1</td>\n", | |
" <td> www.codinghorror.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>48</th>\n", | |
" <td> 1</td>\n", | |
" <td> www.jetbrains.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>49</th>\n", | |
" <td> 1</td>\n", | |
" <td> www.sublimetext.com</td>\n", | |
" <td> 2</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>50 rows \u00d7 4 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 23, | |
"text": [ | |
" dislikes feature likes occurrence\n", | |
"0 5 has_code 8 12\n", | |
"1 1 Javascript 4 4\n", | |
"2 2 Maven 3 4\n", | |
"3 1 bash 4 4\n", | |
"4 1 jscript 4 4\n", | |
"5 1 AngularJS 3 3\n", | |
"6 1 NodeJS 3 3\n", | |
"7 1 requirejs.org 3 3\n", | |
"8 1 Selenium 2 2\n", | |
"9 1 Xebia 2 2\n", | |
"10 1 angularjs.org 2 2\n", | |
"11 1 automated deployment 2 2\n", | |
"12 1 automated provisioning 2 2\n", | |
"13 1 bower.io 2 2\n", | |
"14 1 brew.sh 2 2\n", | |
"15 1 casper 2 2\n", | |
"16 1 casperjs.org 2 2\n", | |
"17 1 cloud 2 2\n", | |
"18 1 console.aws.amazon.com 2 2\n", | |
"19 1 developer.mozilla.org 2 2\n", | |
"20 1 development 2 2\n", | |
"21 1 digitalstain.blogspot.nl 2 2\n", | |
"22 1 docs.angularjs.org 2 2\n", | |
"23 1 docs.seleniumhq.org 2 2\n", | |
"24 1 gist.github.com 2 2\n", | |
"25 1 help.ubuntu.com 2 2\n", | |
"26 1 localhost:8080 2 2\n", | |
"27 1 mojo.codehaus.org 2 2\n", | |
"28 1 mxcl.github.io 2 2\n", | |
"29 1 nodejs.org 2 2\n", | |
"30 1 npmjs.org 2 2\n", | |
"31 1 nuget.codeplex.com 2 2\n", | |
"32 1 nuget.org 2 2\n", | |
"33 1 online.wsj.com 2 2\n", | |
"34 1 phantomjs.org 2 2\n", | |
"35 1 providers 2 2\n", | |
"36 1 publib.boulder.ibm.com 2 2\n", | |
"37 1 shell 2 2\n", | |
"38 1 sindresorhus.com 2 2\n", | |
"39 1 software 2 2\n", | |
"40 1 static.springsource.org 2 2\n", | |
"41 1 tidy.sourceforge.net 2 2\n", | |
"42 1 waredingen.nl 2 2\n", | |
"43 1 wiki.debian.org 2 2\n", | |
"44 1 www-01.ibm.com 2 2\n", | |
"45 1 www.activiti.org 2 2\n", | |
"46 1 www.angularjs.org 2 2\n", | |
"47 1 www.codinghorror.com 2 2\n", | |
"48 1 www.jetbrains.com 2 2\n", | |
"49 1 www.sublimetext.com 2 2\n", | |
"\n", | |
"[50 rows x 4 columns]" | |
] | |
} | |
], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query('select liked, count(*) from training_posts group by liked')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>count(*)</th>\n", | |
" <th>liked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 28</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 12</td>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>2 rows \u00d7 2 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 14, | |
"text": [ | |
" count(*) liked\n", | |
"0 28 0\n", | |
"1 12 1\n", | |
"\n", | |
"[2 rows x 2 columns]" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query('''\n", | |
"select\n", | |
" p.feature,\n", | |
" p.likes,\n", | |
" p.dislikes,\n", | |
" (p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0) as p_like,\n", | |
" (p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0) as p_dislike\n", | |
"from feature_probs p\n", | |
" join post_features pf on p.feature = pf.feature\n", | |
"where\n", | |
" pf.post_id = 1\n", | |
"''')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>dislikes</th>\n", | |
" <th>feature</th>\n", | |
" <th>likes</th>\n", | |
" <th>p_dislike</th>\n", | |
" <th>p_like</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 5</td>\n", | |
" <td> has_code</td>\n", | |
" <td> 8</td>\n", | |
" <td> 0.416667</td>\n", | |
" <td> 0.666667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 16</td>\n", | |
" <td> has_images</td>\n", | |
" <td> 9</td>\n", | |
" <td> 0.666667</td>\n", | |
" <td> 0.375000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 29</td>\n", | |
" <td> en</td>\n", | |
" <td> 13</td>\n", | |
" <td> 0.707317</td>\n", | |
" <td> 0.317073</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 3</td>\n", | |
" <td> xml</td>\n", | |
" <td> 3</td>\n", | |
" <td> 0.600000</td>\n", | |
" <td> 0.600000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 1</td>\n", | |
" <td> jscript</td>\n", | |
" <td> 4</td>\n", | |
" <td> 0.250000</td>\n", | |
" <td> 1.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 15</td>\n", | |
" <td> blog.xebia.com</td>\n", | |
" <td> 8</td>\n", | |
" <td> 0.681818</td>\n", | |
" <td> 0.363636</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>6 rows \u00d7 5 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 15, | |
"text": [ | |
" dislikes feature likes p_dislike p_like\n", | |
"0 5 has_code 8 0.416667 0.666667\n", | |
"1 16 has_images 9 0.666667 0.375000\n", | |
"2 29 en 13 0.707317 0.317073\n", | |
"3 3 xml 3 0.600000 0.600000\n", | |
"4 1 jscript 4 0.250000 1.000000\n", | |
"5 15 blog.xebia.com 8 0.681818 0.363636\n", | |
"\n", | |
"[6 rows x 5 columns]" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = query('''\n", | |
"select\n", | |
" pf.post_id,\n", | |
" tp.liked,\n", | |
" product((p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0)) as p_like,\n", | |
" product((p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0)) as p_dislike,\n", | |
" product((p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0)) > product((p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0)) as classification\n", | |
"from feature_probs p\n", | |
" join post_features pf on p.feature = pf.feature\n", | |
" join test_posts tp on pf.post_id = tp.id\n", | |
"group by pf.post_id\n", | |
"''')\n", | |
"\n", | |
"df[:15]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>classification</th>\n", | |
" <th>liked</th>\n", | |
" <th>p_dislike</th>\n", | |
" <th>p_like</th>\n", | |
" <th>post_id</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0 </th>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.020094</td>\n", | |
" <td> 0.017295</td>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1 </th>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.026524</td>\n", | |
" <td> 0.063415</td>\n", | |
" <td> 3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2 </th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.471545</td>\n", | |
" <td> 0.118902</td>\n", | |
" <td> 5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3 </th>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.000209</td>\n", | |
" <td> 0.014412</td>\n", | |
" <td> 7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4 </th>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.002456</td>\n", | |
" <td> 0.047561</td>\n", | |
" <td> 9</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5 </th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.200942</td>\n", | |
" <td> 0.021619</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6 </th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.471545</td>\n", | |
" <td> 0.118902</td>\n", | |
" <td> 15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7 </th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.707317</td>\n", | |
" <td> 0.317073</td>\n", | |
" <td> 17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8 </th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.321508</td>\n", | |
" <td> 0.001601</td>\n", | |
" <td> 19</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9 </th>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.008187</td>\n", | |
" <td> 0.013211</td>\n", | |
" <td> 21</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.707317</td>\n", | |
" <td> 0.009909</td>\n", | |
" <td> 23</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.321508</td>\n", | |
" <td> 0.000267</td>\n", | |
" <td> 25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.000368</td>\n", | |
" <td> 0.023780</td>\n", | |
" <td> 27</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.120565</td>\n", | |
" <td> 0.009608</td>\n", | |
" <td> 29</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.321508</td>\n", | |
" <td> 0.001601</td>\n", | |
" <td> 31</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>15 rows \u00d7 5 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 27, | |
"text": [ | |
" classification liked p_dislike p_like post_id\n", | |
"0 0 1 0.020094 0.017295 1\n", | |
"1 1 1 0.026524 0.063415 3\n", | |
"2 0 0 0.471545 0.118902 5\n", | |
"3 1 1 0.000209 0.014412 7\n", | |
"4 1 1 0.002456 0.047561 9\n", | |
"5 0 0 0.200942 0.021619 11\n", | |
"6 0 0 0.471545 0.118902 15\n", | |
"7 0 0 0.707317 0.317073 17\n", | |
"8 0 0 0.321508 0.001601 19\n", | |
"9 1 1 0.008187 0.013211 21\n", | |
"10 0 0 0.707317 0.009909 23\n", | |
"11 0 0 0.321508 0.000267 25\n", | |
"12 1 1 0.000368 0.023780 27\n", | |
"13 0 1 0.120565 0.009608 29\n", | |
"14 0 0 0.321508 0.001601 31\n", | |
"\n", | |
"[15 rows x 5 columns]" | |
] | |
} | |
], | |
"prompt_number": 27 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = query('''\n", | |
"select liked, classification, count(*)\n", | |
"from (\n", | |
"select\n", | |
" pf.post_id,\n", | |
" tp.liked,\n", | |
" product((p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0)) as p_like,\n", | |
" product((p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0)) as p_dislike,\n", | |
" product((p.likes / 12.0) * (12.0 / 40.0) / (p.occurrence / 40.0)) > product((p.dislikes / 28.0) * (28.0 / 40.0) / (p.occurrence / 40.0)) as classification\n", | |
"from feature_probs p\n", | |
" join post_features pf on p.feature = pf.feature\n", | |
" join test_posts tp on pf.post_id = tp.id\n", | |
"group by pf.post_id\n", | |
")\n", | |
"group by liked, classification\n", | |
"''')\n", | |
"\n", | |
"df" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>classification</th>\n", | |
" <th>count(*)</th>\n", | |
" <th>liked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0</td>\n", | |
" <td> 29</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 0</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>3 rows \u00d7 3 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 26, | |
"text": [ | |
" classification count(*) liked\n", | |
"0 0 29 0\n", | |
"1 0 5 1\n", | |
"2 1 5 1\n", | |
"\n", | |
"[3 rows x 3 columns]" | |
] | |
} | |
], | |
"prompt_number": 26 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment