-
-
Save shurain/ceef157770d2944edb20 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from pandas.io.parsers import read_csv\n", | |
"%matplotlib inline\n", | |
"\n", | |
"from matplotlib import pyplot as plt\n", | |
"import matplotlib as mpl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n", | |
" data = self._reader.read(nrows)\n" | |
] | |
} | |
], | |
"source": [ | |
"click_df = read_csv('data/yoochoose-clicks.dat', names=[\"sid\", \"timestamp\", \"iid\", \"category\"], parse_dates=[1])\n", | |
"buy_df = read_csv(\"data/yoochoose-buys.dat\", names=[\"sid\", \"timestamp\", \"iid\", \"price\", \"quantity\"], parse_dates=[1])\n", | |
"test_click_df = read_csv(\"data/yoochoose-test.dat\", names=[\"sid\", \"timestamp\", \"iid\", \"category\"], parse_dates=[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)\n", | |
"buy_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)\n", | |
"test_click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"click_df['count'] = 1\n", | |
"click_count_df = click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()\n", | |
"click_count_df = click_count_df.reset_index()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_click_df['count'] = 1\n", | |
"test_click_count_df = test_click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()\n", | |
"test_click_count_df = test_click_count_df.reset_index()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"feature_df = click_df.merge(click_count_df, on=['sid', 'iid'], how='inner')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_feature_df = test_click_df.merge(test_click_count_df, on=['sid', 'iid'], how='inner')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"del feature_df['count_x']\n", | |
"feature_df.columns = [u'sid', u'timestamp', u'iid', u'category', u'count']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"del test_feature_df['count_x']\n", | |
"test_feature_df.columns = ['sid', 'timestamp', 'iid', 'category', 'count']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:51:09.277000</td>\n", | |
" <td>214536502</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:09.868000</td>\n", | |
" <td>214536500</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:46.998000</td>\n", | |
" <td>214536506</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:57:00.306000</td>\n", | |
" <td>214577561</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2014-04-07 13:56:37.614000</td>\n", | |
" <td>214662742</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count\n", | |
"0 1 2014-04-07 10:51:09.277000 214536502 0 1\n", | |
"1 1 2014-04-07 10:54:09.868000 214536500 0 1\n", | |
"2 1 2014-04-07 10:54:46.998000 214536506 0 1\n", | |
"3 1 2014-04-07 10:57:00.306000 214577561 0 1\n", | |
"4 2 2014-04-07 13:56:37.614000 214662742 0 2" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:13:46.713000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:20:56.973000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:21:19.602000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:44:14.590000</td>\n", | |
" <td>214820942</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:45:20.245000</td>\n", | |
" <td>214826810</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count\n", | |
"0 5 2014-04-07 17:13:46.713000 214530776 0 3\n", | |
"1 5 2014-04-07 17:20:56.973000 214530776 0 3\n", | |
"2 5 2014-04-07 17:21:19.602000 214530776 0 3\n", | |
"3 10 2014-04-04 07:44:14.590000 214820942 0 1\n", | |
"4 10 2014-04-04 07:45:20.245000 214826810 0 1" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"It'll take around 3 minutes to get here." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"session_click_n = click_df[['sid', 'iid']].groupby(['sid']).count()\n", | |
"session_click_n = session_click_n.reset_index()\n", | |
"session_click_n.columns = ['sid', 'session_count']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_session_click_n = test_click_df[['sid', 'iid']].groupby(['sid']).count()\n", | |
"test_session_click_n = test_session_click_n.reset_index()\n", | |
"test_session_click_n.columns = ['sid', 'session_count']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"feature_df = feature_df.merge(session_click_n, on=['sid'], how='inner')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_feature_df = test_feature_df.merge(test_session_click_n, on=['sid'], how='inner')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:51:09.277000</td>\n", | |
" <td>214536502</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:09.868000</td>\n", | |
" <td>214536500</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:46.998000</td>\n", | |
" <td>214536506</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:57:00.306000</td>\n", | |
" <td>214577561</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2014-04-07 13:56:37.614000</td>\n", | |
" <td>214662742</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count\n", | |
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4\n", | |
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4\n", | |
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4\n", | |
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4\n", | |
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:13:46.713000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:20:56.973000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:21:19.602000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:44:14.590000</td>\n", | |
" <td>214820942</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:45:20.245000</td>\n", | |
" <td>214826810</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count\n", | |
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3\n", | |
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3\n", | |
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3\n", | |
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2\n", | |
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Avoid working with timestamps. They are extremely slow. Instead, work with raw numpy arrays." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def timestamp_seconds(date):\n", | |
" baseline = np.datetime64('2014-04-01')\n", | |
" return (date - baseline) / np.timedelta64(1, 's')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"click_df['timestamp_seconds'] = timestamp_seconds(click_df['timestamp'].values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"test_click_df['timestamp_seconds'] = timestamp_seconds(test_click_df['timestamp'].values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"g = click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_g = test_click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.83 s, sys: 469 ms, total: 2.3 s\n", | |
"Wall time: 2.14 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time session_duration = np.round(g.last() - g.first())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_session_duration = np.round(test_g.last() - test_g.first())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"session_duration = pd.DataFrame(session_duration).reset_index()\n", | |
"session_duration.columns = ['sid', 'session_duration']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_session_duration = pd.DataFrame(test_session_duration).reset_index()\n", | |
"test_session_duration.columns = ['sid', 'session_duration']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>session_duration</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>351</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>359</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>745</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1034</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>6</td>\n", | |
" <td>246</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid session_duration\n", | |
"0 1 351\n", | |
"1 2 359\n", | |
"2 3 745\n", | |
"3 4 1034\n", | |
"4 6 246" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"session_duration.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>session_duration</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5</td>\n", | |
" <td>453</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>10</td>\n", | |
" <td>66</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>15</td>\n", | |
" <td>327</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>20</td>\n", | |
" <td>163</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>25</td>\n", | |
" <td>41</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid session_duration\n", | |
"0 5 453\n", | |
"1 10 66\n", | |
"2 15 327\n", | |
"3 20 163\n", | |
"4 25 41" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_session_duration.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"feature_df = feature_df.merge(session_duration, on=['sid'], how='inner')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:51:09.277000</td>\n", | |
" <td>214536502</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:09.868000</td>\n", | |
" <td>214536500</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:46.998000</td>\n", | |
" <td>214536506</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:57:00.306000</td>\n", | |
" <td>214577561</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2014-04-07 13:56:37.614000</td>\n", | |
" <td>214662742</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>6</td>\n", | |
" <td>359</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n", | |
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n", | |
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n", | |
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n", | |
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n", | |
"\n", | |
" session_duration \n", | |
"0 351 \n", | |
"1 351 \n", | |
"2 351 \n", | |
"3 351 \n", | |
"4 359 " | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:13:46.713000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:20:56.973000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:21:19.602000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:44:14.590000</td>\n", | |
" <td>214820942</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>66</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:45:20.245000</td>\n", | |
" <td>214826810</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>66</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 \n", | |
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 \n", | |
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 \n", | |
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 \n", | |
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 \n", | |
"\n", | |
" session_duration \n", | |
"0 453 \n", | |
"1 453 \n", | |
"2 453 \n", | |
"3 66 \n", | |
"4 66 " | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_feature_df = test_feature_df.merge(test_session_duration, on=['sid'], how='inner')\n", | |
"test_feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def year(dates):\n", | |
" \"Return an array of the years given an array of datetime64s\"\n", | |
" return dates.astype('M8[Y]').astype('i8') + 1970\n", | |
"\n", | |
"def month(dates):\n", | |
" \"Return an array of the months given an array of datetime64s\"\n", | |
" return dates.astype('M8[M]').astype('i8') % 12 + 1\n", | |
"\n", | |
"def day(dates):\n", | |
" \"Return an array of the days of the month given an array of datetime64s\"\n", | |
" return (dates - dates.astype('M8[M]')) / np.timedelta64(1, 'D') + 1\n", | |
"\n", | |
"def hour(dates):\n", | |
" \"Return an array of the hours of the day given an array of datetime64s\"\n", | |
" return np.floor((dates - dates.astype(\"M8[D]\")) / np.timedelta64(1, 'h'))\n", | |
"\n", | |
"def dayofweek(dates):\n", | |
" \"Return an array of the day of weeks given an array of datetime64s. Sunday is 0, Saturday is 6\"\n", | |
" baseline = np.datetime64('2014-04-01') # Tuesday: 2\n", | |
" return np.floor(((dates - baseline) / np.timedelta64(1, 'D') + 2) % 7)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.62 s, sys: 831 ms, total: 2.45 s\n", | |
"Wall time: 2.48 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time feature_df['month'] = month(feature_df['timestamp'].values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 765 ms, sys: 537 ms, total: 1.3 s\n", | |
"Wall time: 1.3 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time feature_df['dayofweek'] = dayofweek(feature_df['timestamp'].values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.01 s, sys: 476 ms, total: 1.49 s\n", | |
"Wall time: 1.49 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time feature_df['hour'] = hour(feature_df['timestamp'].values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_feature_df['month'] = month(test_feature_df['timestamp'].values)\n", | |
"test_feature_df['dayofweek'] = dayofweek(test_feature_df['timestamp'].values)\n", | |
"test_feature_df['hour'] = hour(test_feature_df['timestamp'].values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" <th>month</th>\n", | |
" <th>dayofweek</th>\n", | |
" <th>hour</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:51:09.277000</td>\n", | |
" <td>214536502</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:09.868000</td>\n", | |
" <td>214536500</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:46.998000</td>\n", | |
" <td>214536506</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:57:00.306000</td>\n", | |
" <td>214577561</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2014-04-07 13:56:37.614000</td>\n", | |
" <td>214662742</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>6</td>\n", | |
" <td>359</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n", | |
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n", | |
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n", | |
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n", | |
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n", | |
"\n", | |
" session_duration month dayofweek hour \n", | |
"0 351 4 1 10 \n", | |
"1 351 4 1 10 \n", | |
"2 351 4 1 10 \n", | |
"3 351 4 1 10 \n", | |
"4 359 4 1 13 " | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" <th>month</th>\n", | |
" <th>dayofweek</th>\n", | |
" <th>hour</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:13:46.713000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:20:56.973000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:21:19.602000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:44:14.590000</td>\n", | |
" <td>214820942</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>66</td>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:45:20.245000</td>\n", | |
" <td>214826810</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>66</td>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 \n", | |
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 \n", | |
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 \n", | |
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 \n", | |
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 \n", | |
"\n", | |
" session_duration month dayofweek hour \n", | |
"0 453 4 1 17 \n", | |
"1 453 4 1 17 \n", | |
"2 453 4 1 17 \n", | |
"3 66 4 5 7 \n", | |
"4 66 4 5 7 " | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now for the list\n", | |
"\n", | |
"- label\n", | |
"- Things to change to one-hot encoding\n", | |
" - iid\n", | |
" - category\n", | |
" - month\n", | |
" - dayofweek\n", | |
" - hour\n", | |
"- list of items" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Remeber, we need to combine training and test item list." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"item_list = np.unique(np.append(click_df['iid'].unique(), test_click_df['iid'].unique()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"54287" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(item_list)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"item_list.sort()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"item_index_dict = {k: v for v, k in enumerate(item_list)} # Beware that index starts at 0. This may cause some problems." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 5min 36s, sys: 5.02 s, total: 5min 41s\n", | |
"Wall time: 5min 32s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time item_list_df = click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()\n", | |
"item_list_df.columns = ['sid', 'item_list']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1min 24s, sys: 1.06 s, total: 1min 25s\n", | |
"Wall time: 1min 23s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time test_item_list_df = test_click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()\n", | |
"test_item_list_df.columns = ['sid', 'item_list']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"feature_df = feature_df.merge(item_list_df, on='sid', how='inner')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_feature_df = test_feature_df.merge(test_item_list_df, on='sid', how='inner')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" <th>month</th>\n", | |
" <th>dayofweek</th>\n", | |
" <th>hour</th>\n", | |
" <th>item_list</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:51:09.277000</td>\n", | |
" <td>214536502</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:09.868000</td>\n", | |
" <td>214536500</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:46.998000</td>\n", | |
" <td>214536506</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:57:00.306000</td>\n", | |
" <td>214577561</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2014-04-07 13:56:37.614000</td>\n", | |
" <td>214662742</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>6</td>\n", | |
" <td>359</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>13</td>\n", | |
" <td>[4943, 33274, 42167, 19891, 33280]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n", | |
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n", | |
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n", | |
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n", | |
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n", | |
"\n", | |
" session_duration month dayofweek hour \\\n", | |
"0 351 4 1 10 \n", | |
"1 351 4 1 10 \n", | |
"2 351 4 1 10 \n", | |
"3 351 4 1 10 \n", | |
"4 359 4 1 13 \n", | |
"\n", | |
" item_list \n", | |
"0 [10040, 2097, 2095, 2096] \n", | |
"1 [10040, 2097, 2095, 2096] \n", | |
"2 [10040, 2097, 2095, 2096] \n", | |
"3 [10040, 2097, 2095, 2096] \n", | |
"4 [4943, 33274, 42167, 19891, 33280] " | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" <th>month</th>\n", | |
" <th>dayofweek</th>\n", | |
" <th>hour</th>\n", | |
" <th>item_list</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:13:46.713000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" <td>[1007]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:20:56.973000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" <td>[1007]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:21:19.602000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" <td>[1007]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:44:14.590000</td>\n", | |
" <td>214820942</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>66</td>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>7</td>\n", | |
" <td>[42615, 41214]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:45:20.245000</td>\n", | |
" <td>214826810</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>66</td>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>7</td>\n", | |
" <td>[42615, 41214]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 \n", | |
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 \n", | |
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 \n", | |
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 \n", | |
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 \n", | |
"\n", | |
" session_duration month dayofweek hour item_list \n", | |
"0 453 4 1 17 [1007] \n", | |
"1 453 4 1 17 [1007] \n", | |
"2 453 4 1 17 [1007] \n", | |
"3 66 4 5 7 [42615, 41214] \n", | |
"4 66 4 5 7 [42615, 41214] " | |
] | |
}, | |
"execution_count": 49, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"feature_df['item_index'] = np.array([item_index_dict[k] for k in feature_df['iid'].values])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_feature_df['item_index'] = np.array([item_index_dict[k] for k in test_feature_df['iid'].values])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" <th>month</th>\n", | |
" <th>dayofweek</th>\n", | |
" <th>hour</th>\n", | |
" <th>item_list</th>\n", | |
" <th>item_index</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:51:09.277000</td>\n", | |
" <td>214536502</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" <td>2096</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:09.868000</td>\n", | |
" <td>214536500</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" <td>2095</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:46.998000</td>\n", | |
" <td>214536506</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" <td>2097</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:57:00.306000</td>\n", | |
" <td>214577561</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" <td>10040</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2014-04-07 13:56:37.614000</td>\n", | |
" <td>214662742</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>6</td>\n", | |
" <td>359</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>13</td>\n", | |
" <td>[4943, 33274, 42167, 19891, 33280]</td>\n", | |
" <td>19891</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n", | |
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n", | |
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n", | |
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n", | |
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n", | |
"\n", | |
" session_duration month dayofweek hour \\\n", | |
"0 351 4 1 10 \n", | |
"1 351 4 1 10 \n", | |
"2 351 4 1 10 \n", | |
"3 351 4 1 10 \n", | |
"4 359 4 1 13 \n", | |
"\n", | |
" item_list item_index \n", | |
"0 [10040, 2097, 2095, 2096] 2096 \n", | |
"1 [10040, 2097, 2095, 2096] 2095 \n", | |
"2 [10040, 2097, 2095, 2096] 2097 \n", | |
"3 [10040, 2097, 2095, 2096] 10040 \n", | |
"4 [4943, 33274, 42167, 19891, 33280] 19891 " | |
] | |
}, | |
"execution_count": 52, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" <th>month</th>\n", | |
" <th>dayofweek</th>\n", | |
" <th>hour</th>\n", | |
" <th>item_list</th>\n", | |
" <th>item_index</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:13:46.713000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" <td>[1007]</td>\n", | |
" <td>1007</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:20:56.973000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" <td>[1007]</td>\n", | |
" <td>1007</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>2014-04-07 17:21:19.602000</td>\n", | |
" <td>214530776</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" <td>453</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>17</td>\n", | |
" <td>[1007]</td>\n", | |
" <td>1007</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:44:14.590000</td>\n", | |
" <td>214820942</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>66</td>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>7</td>\n", | |
" <td>[42615, 41214]</td>\n", | |
" <td>41214</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>10</td>\n", | |
" <td>2014-04-04 07:45:20.245000</td>\n", | |
" <td>214826810</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>66</td>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>7</td>\n", | |
" <td>[42615, 41214]</td>\n", | |
" <td>42615</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 \n", | |
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 \n", | |
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 \n", | |
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 \n", | |
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 \n", | |
"\n", | |
" session_duration month dayofweek hour item_list item_index \n", | |
"0 453 4 1 17 [1007] 1007 \n", | |
"1 453 4 1 17 [1007] 1007 \n", | |
"2 453 4 1 17 [1007] 1007 \n", | |
"3 66 4 5 7 [42615, 41214] 41214 \n", | |
"4 66 4 5 7 [42615, 41214] 42615 " | |
] | |
}, | |
"execution_count": 54, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_feature_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"buy_df['target'] = 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"final_df = feature_df.merge(buy_df[['sid', 'iid', 'target']], on=['sid', 'iid'], how='left')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"final_df['target'].fillna(-1, inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"sid: 9249729\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"sid: {}\".format(final_df['sid'].nunique()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"item index: 52739\n", | |
"category: 340\n", | |
"month: 6\n", | |
"hour: 24\n", | |
"dayofweek: 7\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"item index: {}\".format(final_df['item_index'].nunique()))\n", | |
"print(\"category: {}\".format(final_df['category'].nunique()))\n", | |
"print(\"month: {}\".format(final_df['month'].nunique()))\n", | |
"print(\"hour: {}\".format(final_df['hour'].nunique()))\n", | |
"print(\"dayofweek: {}\".format(final_df['dayofweek'].nunique()))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Create training/test data. Training will contain validation data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"training_prob = 0.75\n", | |
"mask = np.random.binomial(1, training_prob, size=final_df['sid'].nunique())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"mask_df = pd.DataFrame(mask, index=final_df['sid'].unique(), columns=['mask']).reset_index()\n", | |
"mask_df.columns = ['sid', 'mask']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>mask</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>6</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid mask\n", | |
"0 1 0\n", | |
"1 2 1\n", | |
"2 3 1\n", | |
"3 4 1\n", | |
"4 6 1" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"mask_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"final_df = final_df.merge(mask_df, on='sid', how='inner')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>iid</th>\n", | |
" <th>category</th>\n", | |
" <th>count</th>\n", | |
" <th>session_count</th>\n", | |
" <th>session_duration</th>\n", | |
" <th>month</th>\n", | |
" <th>dayofweek</th>\n", | |
" <th>hour</th>\n", | |
" <th>item_list</th>\n", | |
" <th>item_index</th>\n", | |
" <th>target</th>\n", | |
" <th>mask</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:51:09.277000</td>\n", | |
" <td>214536502</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" <td>2096</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:09.868000</td>\n", | |
" <td>214536500</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" <td>2095</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:54:46.998000</td>\n", | |
" <td>214536506</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" <td>2097</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2014-04-07 10:57:00.306000</td>\n", | |
" <td>214577561</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>351</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>[10040, 2097, 2095, 2096]</td>\n", | |
" <td>10040</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2014-04-07 13:56:37.614000</td>\n", | |
" <td>214662742</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>6</td>\n", | |
" <td>359</td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>13</td>\n", | |
" <td>[4943, 33274, 42167, 19891, 33280]</td>\n", | |
" <td>19891</td>\n", | |
" <td>-1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sid timestamp iid category count session_count \\\n", | |
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n", | |
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n", | |
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n", | |
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n", | |
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n", | |
"\n", | |
" session_duration month dayofweek hour \\\n", | |
"0 351 4 1 10 \n", | |
"1 351 4 1 10 \n", | |
"2 351 4 1 10 \n", | |
"3 351 4 1 10 \n", | |
"4 359 4 1 13 \n", | |
"\n", | |
" item_list item_index target mask \n", | |
"0 [10040, 2097, 2095, 2096] 2096 -1 0 \n", | |
"1 [10040, 2097, 2095, 2096] 2095 -1 0 \n", | |
"2 [10040, 2097, 2095, 2096] 2097 -1 0 \n", | |
"3 [10040, 2097, 2095, 2096] 10040 -1 0 \n", | |
"4 [4943, 33274, 42167, 19891, 33280] 19891 -1 1 " | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"final_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"training_df = final_df[final_df['mask'] == 1]\n", | |
"cv_df = final_df[final_df['mask'] == 0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"24932146\n", | |
"8302296\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(training_df))\n", | |
"print(len(cv_df))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Buy probability per entry (not per session)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.061387520813498242" | |
] | |
}, | |
"execution_count": 45, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(final_df['target'].mean() + 1) / 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.061448781825679977" | |
] | |
}, | |
"execution_count": 46, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(training_df['target'].mean() + 1) / 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.061203551403129941" | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(cv_df['target'].mean() + 1) / 2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Buy probability per session" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.055103884665161529" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(np.mean(final_df.groupby('sid')['target'].max()) + 1) / 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.055144914996467675" | |
] | |
}, | |
"execution_count": 49, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(np.mean(training_df.groupby('sid')['target'].max()) + 1) / 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.054980584441226776" | |
] | |
}, | |
"execution_count": 50, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"(np.mean(cv_df.groupby('sid')['target'].max()) + 1) / 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"training_df[['target', 'sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data\n", | |
" 'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data\n", | |
" 'item_list', # categorical data, variable length\n", | |
" ]].to_csv('training_data.tsv', sep='\\t', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"cv_df[['target', 'sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data\n", | |
" 'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data\n", | |
" 'item_list', # categorical data, variable length\n", | |
" ]].to_csv('cv_data.tsv', sep='\\t', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"test_feature_df[['sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data\n", | |
" 'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data\n", | |
" 'item_list', # categorical data, variable length\n", | |
" ]].to_csv('test_data.tsv', sep='\\t', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment