Skip to content

Instantly share code, notes, and snippets.

@shurain
Created June 12, 2015 07:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shurain/ceef157770d2944edb20 to your computer and use it in GitHub Desktop.
Save shurain/ceef157770d2944edb20 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from pandas.io.parsers import read_csv\n",
"%matplotlib inline\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import matplotlib as mpl"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/shurain/venv/ml/local/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
}
],
"source": [
"click_df = read_csv('data/yoochoose-clicks.dat', names=[\"sid\", \"timestamp\", \"iid\", \"category\"], parse_dates=[1])\n",
"buy_df = read_csv(\"data/yoochoose-buys.dat\", names=[\"sid\", \"timestamp\", \"iid\", \"price\", \"quantity\"], parse_dates=[1])\n",
"test_click_df = read_csv(\"data/yoochoose-test.dat\", names=[\"sid\", \"timestamp\", \"iid\", \"category\"], parse_dates=[1])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)\n",
"buy_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)\n",
"test_click_df.sort(['sid', 'timestamp'], ascending=[True, True], inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"click_df['count'] = 1\n",
"click_count_df = click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()\n",
"click_count_df = click_count_df.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_click_df['count'] = 1\n",
"test_click_count_df = test_click_df[['sid', 'iid', 'count']].groupby(['sid', 'iid']).sum()\n",
"test_click_count_df = test_click_count_df.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"feature_df = click_df.merge(click_count_df, on=['sid', 'iid'], how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_feature_df = test_click_df.merge(test_click_count_df, on=['sid', 'iid'], how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"del feature_df['count_x']\n",
"feature_df.columns = [u'sid', u'timestamp', u'iid', u'category', u'count']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"del test_feature_df['count_x']\n",
"test_feature_df.columns = ['sid', 'timestamp', 'iid', 'category', 'count']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:51:09.277000</td>\n",
" <td>214536502</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:09.868000</td>\n",
" <td>214536500</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:46.998000</td>\n",
" <td>214536506</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:57:00.306000</td>\n",
" <td>214577561</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2014-04-07 13:56:37.614000</td>\n",
" <td>214662742</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count\n",
"0 1 2014-04-07 10:51:09.277000 214536502 0 1\n",
"1 1 2014-04-07 10:54:09.868000 214536500 0 1\n",
"2 1 2014-04-07 10:54:46.998000 214536506 0 1\n",
"3 1 2014-04-07 10:57:00.306000 214577561 0 1\n",
"4 2 2014-04-07 13:56:37.614000 214662742 0 2"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:13:46.713000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:20:56.973000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:21:19.602000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:44:14.590000</td>\n",
" <td>214820942</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:45:20.245000</td>\n",
" <td>214826810</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count\n",
"0 5 2014-04-07 17:13:46.713000 214530776 0 3\n",
"1 5 2014-04-07 17:20:56.973000 214530776 0 3\n",
"2 5 2014-04-07 17:21:19.602000 214530776 0 3\n",
"3 10 2014-04-04 07:44:14.590000 214820942 0 1\n",
"4 10 2014-04-04 07:45:20.245000 214826810 0 1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_feature_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It'll take around 3 minutes to get here."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"session_click_n = click_df[['sid', 'iid']].groupby(['sid']).count()\n",
"session_click_n = session_click_n.reset_index()\n",
"session_click_n.columns = ['sid', 'session_count']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_session_click_n = test_click_df[['sid', 'iid']].groupby(['sid']).count()\n",
"test_session_click_n = test_session_click_n.reset_index()\n",
"test_session_click_n.columns = ['sid', 'session_count']"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"feature_df = feature_df.merge(session_click_n, on=['sid'], how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_feature_df = test_feature_df.merge(test_session_click_n, on=['sid'], how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:51:09.277000</td>\n",
" <td>214536502</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:09.868000</td>\n",
" <td>214536500</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:46.998000</td>\n",
" <td>214536506</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:57:00.306000</td>\n",
" <td>214577561</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2014-04-07 13:56:37.614000</td>\n",
" <td>214662742</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count\n",
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4\n",
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4\n",
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4\n",
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4\n",
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:13:46.713000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:20:56.973000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:21:19.602000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:44:14.590000</td>\n",
" <td>214820942</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:45:20.245000</td>\n",
" <td>214826810</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count\n",
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3\n",
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3\n",
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3\n",
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2\n",
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_feature_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Avoid working with timestamps. They are extremely slow. Instead, work with raw numpy arrays."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def timestamp_seconds(date):\n",
" baseline = np.datetime64('2014-04-01')\n",
" return (date - baseline) / np.timedelta64(1, 's')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"click_df['timestamp_seconds'] = timestamp_seconds(click_df['timestamp'].values)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"test_click_df['timestamp_seconds'] = timestamp_seconds(test_click_df['timestamp'].values)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"g = click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_g = test_click_df[['sid', 'timestamp_seconds']].groupby(['sid'])['timestamp_seconds']"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.83 s, sys: 469 ms, total: 2.3 s\n",
"Wall time: 2.14 s\n"
]
}
],
"source": [
"%time session_duration = np.round(g.last() - g.first())"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_session_duration = np.round(test_g.last() - test_g.first())"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"session_duration = pd.DataFrame(session_duration).reset_index()\n",
"session_duration.columns = ['sid', 'session_duration']"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_session_duration = pd.DataFrame(test_session_duration).reset_index()\n",
"test_session_duration.columns = ['sid', 'session_duration']"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>session_duration</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>359</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1034</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>246</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid session_duration\n",
"0 1 351\n",
"1 2 359\n",
"2 3 745\n",
"3 4 1034\n",
"4 6 246"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"session_duration.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>session_duration</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>453</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>15</td>\n",
" <td>327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20</td>\n",
" <td>163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>25</td>\n",
" <td>41</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid session_duration\n",
"0 5 453\n",
"1 10 66\n",
"2 15 327\n",
"3 20 163\n",
"4 25 41"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_session_duration.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"feature_df = feature_df.merge(session_duration, on=['sid'], how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:51:09.277000</td>\n",
" <td>214536502</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:09.868000</td>\n",
" <td>214536500</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:46.998000</td>\n",
" <td>214536506</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:57:00.306000</td>\n",
" <td>214577561</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2014-04-07 13:56:37.614000</td>\n",
" <td>214662742</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>359</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n",
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n",
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n",
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n",
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n",
"\n",
" session_duration \n",
"0 351 \n",
"1 351 \n",
"2 351 \n",
"3 351 \n",
"4 359 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:13:46.713000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:20:56.973000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:21:19.602000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:44:14.590000</td>\n",
" <td>214820942</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:45:20.245000</td>\n",
" <td>214826810</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>66</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 \n",
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 \n",
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 \n",
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 \n",
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 \n",
"\n",
" session_duration \n",
"0 453 \n",
"1 453 \n",
"2 453 \n",
"3 66 \n",
"4 66 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_feature_df = test_feature_df.merge(test_session_duration, on=['sid'], how='inner')\n",
"test_feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def year(dates):\n",
" \"Return an array of the years given an array of datetime64s\"\n",
" return dates.astype('M8[Y]').astype('i8') + 1970\n",
"\n",
"def month(dates):\n",
" \"Return an array of the months given an array of datetime64s\"\n",
" return dates.astype('M8[M]').astype('i8') % 12 + 1\n",
"\n",
"def day(dates):\n",
" \"Return an array of the days of the month given an array of datetime64s\"\n",
" return (dates - dates.astype('M8[M]')) / np.timedelta64(1, 'D') + 1\n",
"\n",
"def hour(dates):\n",
" \"Return an array of the hours of the day given an array of datetime64s\"\n",
" return np.floor((dates - dates.astype(\"M8[D]\")) / np.timedelta64(1, 'h'))\n",
"\n",
"def dayofweek(dates):\n",
" \"Return an array of the day of weeks given an array of datetime64s. Sunday is 0, Saturday is 6\"\n",
" baseline = np.datetime64('2014-04-01') # Tuesday: 2\n",
" return np.floor(((dates - baseline) / np.timedelta64(1, 'D') + 2) % 7)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.62 s, sys: 831 ms, total: 2.45 s\n",
"Wall time: 2.48 s\n"
]
}
],
"source": [
"%time feature_df['month'] = month(feature_df['timestamp'].values)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 765 ms, sys: 537 ms, total: 1.3 s\n",
"Wall time: 1.3 s\n"
]
}
],
"source": [
"%time feature_df['dayofweek'] = dayofweek(feature_df['timestamp'].values)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.01 s, sys: 476 ms, total: 1.49 s\n",
"Wall time: 1.49 s\n"
]
}
],
"source": [
"%time feature_df['hour'] = hour(feature_df['timestamp'].values)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_feature_df['month'] = month(test_feature_df['timestamp'].values)\n",
"test_feature_df['dayofweek'] = dayofweek(test_feature_df['timestamp'].values)\n",
"test_feature_df['hour'] = hour(test_feature_df['timestamp'].values)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" <th>month</th>\n",
" <th>dayofweek</th>\n",
" <th>hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:51:09.277000</td>\n",
" <td>214536502</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:09.868000</td>\n",
" <td>214536500</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:46.998000</td>\n",
" <td>214536506</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:57:00.306000</td>\n",
" <td>214577561</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2014-04-07 13:56:37.614000</td>\n",
" <td>214662742</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>359</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>13</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n",
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n",
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n",
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n",
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n",
"\n",
" session_duration month dayofweek hour \n",
"0 351 4 1 10 \n",
"1 351 4 1 10 \n",
"2 351 4 1 10 \n",
"3 351 4 1 10 \n",
"4 359 4 1 13 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" <th>month</th>\n",
" <th>dayofweek</th>\n",
" <th>hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:13:46.713000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:20:56.973000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:21:19.602000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:44:14.590000</td>\n",
" <td>214820942</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>66</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:45:20.245000</td>\n",
" <td>214826810</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>66</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 \n",
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 \n",
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 \n",
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 \n",
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 \n",
"\n",
" session_duration month dayofweek hour \n",
"0 453 4 1 17 \n",
"1 453 4 1 17 \n",
"2 453 4 1 17 \n",
"3 66 4 5 7 \n",
"4 66 4 5 7 "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_feature_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now for the list\n",
"\n",
"- label\n",
"- Things to change to one-hot encoding\n",
" - iid\n",
" - category\n",
" - month\n",
" - dayofweek\n",
" - hour\n",
"- list of items"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remeber, we need to combine training and test item list."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"item_list = np.unique(np.append(click_df['iid'].unique(), test_click_df['iid'].unique()))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"54287"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(item_list)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"item_list.sort()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"item_index_dict = {k: v for v, k in enumerate(item_list)} # Beware that index starts at 0. This may cause some problems."
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5min 36s, sys: 5.02 s, total: 5min 41s\n",
"Wall time: 5min 32s\n"
]
}
],
"source": [
"%time item_list_df = click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()\n",
"item_list_df.columns = ['sid', 'item_list']"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1min 24s, sys: 1.06 s, total: 1min 25s\n",
"Wall time: 1min 23s\n"
]
}
],
"source": [
"%time test_item_list_df = test_click_df.groupby('sid')['iid'].apply(lambda group: [item_index_dict[k] for k in set(group.values)]).reset_index()\n",
"test_item_list_df.columns = ['sid', 'item_list']"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"feature_df = feature_df.merge(item_list_df, on='sid', how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_feature_df = test_feature_df.merge(test_item_list_df, on='sid', how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" <th>month</th>\n",
" <th>dayofweek</th>\n",
" <th>hour</th>\n",
" <th>item_list</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:51:09.277000</td>\n",
" <td>214536502</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:09.868000</td>\n",
" <td>214536500</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:46.998000</td>\n",
" <td>214536506</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:57:00.306000</td>\n",
" <td>214577561</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2014-04-07 13:56:37.614000</td>\n",
" <td>214662742</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>359</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>13</td>\n",
" <td>[4943, 33274, 42167, 19891, 33280]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n",
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n",
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n",
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n",
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n",
"\n",
" session_duration month dayofweek hour \\\n",
"0 351 4 1 10 \n",
"1 351 4 1 10 \n",
"2 351 4 1 10 \n",
"3 351 4 1 10 \n",
"4 359 4 1 13 \n",
"\n",
" item_list \n",
"0 [10040, 2097, 2095, 2096] \n",
"1 [10040, 2097, 2095, 2096] \n",
"2 [10040, 2097, 2095, 2096] \n",
"3 [10040, 2097, 2095, 2096] \n",
"4 [4943, 33274, 42167, 19891, 33280] "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" <th>month</th>\n",
" <th>dayofweek</th>\n",
" <th>hour</th>\n",
" <th>item_list</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:13:46.713000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>[1007]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:20:56.973000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>[1007]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:21:19.602000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>[1007]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:44:14.590000</td>\n",
" <td>214820942</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>66</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>[42615, 41214]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:45:20.245000</td>\n",
" <td>214826810</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>66</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>[42615, 41214]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 \n",
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 \n",
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 \n",
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 \n",
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 \n",
"\n",
" session_duration month dayofweek hour item_list \n",
"0 453 4 1 17 [1007] \n",
"1 453 4 1 17 [1007] \n",
"2 453 4 1 17 [1007] \n",
"3 66 4 5 7 [42615, 41214] \n",
"4 66 4 5 7 [42615, 41214] "
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"feature_df['item_index'] = np.array([item_index_dict[k] for k in feature_df['iid'].values])"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test_feature_df['item_index'] = np.array([item_index_dict[k] for k in test_feature_df['iid'].values])"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" <th>month</th>\n",
" <th>dayofweek</th>\n",
" <th>hour</th>\n",
" <th>item_list</th>\n",
" <th>item_index</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:51:09.277000</td>\n",
" <td>214536502</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" <td>2096</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:09.868000</td>\n",
" <td>214536500</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" <td>2095</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:46.998000</td>\n",
" <td>214536506</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" <td>2097</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:57:00.306000</td>\n",
" <td>214577561</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" <td>10040</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2014-04-07 13:56:37.614000</td>\n",
" <td>214662742</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>359</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>13</td>\n",
" <td>[4943, 33274, 42167, 19891, 33280]</td>\n",
" <td>19891</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n",
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n",
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n",
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n",
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n",
"\n",
" session_duration month dayofweek hour \\\n",
"0 351 4 1 10 \n",
"1 351 4 1 10 \n",
"2 351 4 1 10 \n",
"3 351 4 1 10 \n",
"4 359 4 1 13 \n",
"\n",
" item_list item_index \n",
"0 [10040, 2097, 2095, 2096] 2096 \n",
"1 [10040, 2097, 2095, 2096] 2095 \n",
"2 [10040, 2097, 2095, 2096] 2097 \n",
"3 [10040, 2097, 2095, 2096] 10040 \n",
"4 [4943, 33274, 42167, 19891, 33280] 19891 "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" <th>month</th>\n",
" <th>dayofweek</th>\n",
" <th>hour</th>\n",
" <th>item_list</th>\n",
" <th>item_index</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:13:46.713000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>[1007]</td>\n",
" <td>1007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:20:56.973000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>[1007]</td>\n",
" <td>1007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>2014-04-07 17:21:19.602000</td>\n",
" <td>214530776</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>453</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>[1007]</td>\n",
" <td>1007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:44:14.590000</td>\n",
" <td>214820942</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>66</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>[42615, 41214]</td>\n",
" <td>41214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>2014-04-04 07:45:20.245000</td>\n",
" <td>214826810</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>66</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>[42615, 41214]</td>\n",
" <td>42615</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 5 2014-04-07 17:13:46.713000 214530776 0 3 3 \n",
"1 5 2014-04-07 17:20:56.973000 214530776 0 3 3 \n",
"2 5 2014-04-07 17:21:19.602000 214530776 0 3 3 \n",
"3 10 2014-04-04 07:44:14.590000 214820942 0 1 2 \n",
"4 10 2014-04-04 07:45:20.245000 214826810 0 1 2 \n",
"\n",
" session_duration month dayofweek hour item_list item_index \n",
"0 453 4 1 17 [1007] 1007 \n",
"1 453 4 1 17 [1007] 1007 \n",
"2 453 4 1 17 [1007] 1007 \n",
"3 66 4 5 7 [42615, 41214] 41214 \n",
"4 66 4 5 7 [42615, 41214] 42615 "
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"buy_df['target'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"final_df = feature_df.merge(buy_df[['sid', 'iid', 'target']], on=['sid', 'iid'], how='left')"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"final_df['target'].fillna(-1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sid: 9249729\n"
]
}
],
"source": [
"print(\"sid: {}\".format(final_df['sid'].nunique()))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"item index: 52739\n",
"category: 340\n",
"month: 6\n",
"hour: 24\n",
"dayofweek: 7\n"
]
}
],
"source": [
"print(\"item index: {}\".format(final_df['item_index'].nunique()))\n",
"print(\"category: {}\".format(final_df['category'].nunique()))\n",
"print(\"month: {}\".format(final_df['month'].nunique()))\n",
"print(\"hour: {}\".format(final_df['hour'].nunique()))\n",
"print(\"dayofweek: {}\".format(final_df['dayofweek'].nunique()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create training/test data. Training will contain validation data"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"training_prob = 0.75\n",
"mask = np.random.binomial(1, training_prob, size=final_df['sid'].nunique())"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mask_df = pd.DataFrame(mask, index=final_df['sid'].unique(), columns=['mask']).reset_index()\n",
"mask_df.columns = ['sid', 'mask']"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>mask</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid mask\n",
"0 1 0\n",
"1 2 1\n",
"2 3 1\n",
"3 4 1\n",
"4 6 1"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mask_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"final_df = final_df.merge(mask_df, on='sid', how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sid</th>\n",
" <th>timestamp</th>\n",
" <th>iid</th>\n",
" <th>category</th>\n",
" <th>count</th>\n",
" <th>session_count</th>\n",
" <th>session_duration</th>\n",
" <th>month</th>\n",
" <th>dayofweek</th>\n",
" <th>hour</th>\n",
" <th>item_list</th>\n",
" <th>item_index</th>\n",
" <th>target</th>\n",
" <th>mask</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:51:09.277000</td>\n",
" <td>214536502</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" <td>2096</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:09.868000</td>\n",
" <td>214536500</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" <td>2095</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:54:46.998000</td>\n",
" <td>214536506</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" <td>2097</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2014-04-07 10:57:00.306000</td>\n",
" <td>214577561</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>351</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>[10040, 2097, 2095, 2096]</td>\n",
" <td>10040</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2014-04-07 13:56:37.614000</td>\n",
" <td>214662742</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>359</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>13</td>\n",
" <td>[4943, 33274, 42167, 19891, 33280]</td>\n",
" <td>19891</td>\n",
" <td>-1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sid timestamp iid category count session_count \\\n",
"0 1 2014-04-07 10:51:09.277000 214536502 0 1 4 \n",
"1 1 2014-04-07 10:54:09.868000 214536500 0 1 4 \n",
"2 1 2014-04-07 10:54:46.998000 214536506 0 1 4 \n",
"3 1 2014-04-07 10:57:00.306000 214577561 0 1 4 \n",
"4 2 2014-04-07 13:56:37.614000 214662742 0 2 6 \n",
"\n",
" session_duration month dayofweek hour \\\n",
"0 351 4 1 10 \n",
"1 351 4 1 10 \n",
"2 351 4 1 10 \n",
"3 351 4 1 10 \n",
"4 359 4 1 13 \n",
"\n",
" item_list item_index target mask \n",
"0 [10040, 2097, 2095, 2096] 2096 -1 0 \n",
"1 [10040, 2097, 2095, 2096] 2095 -1 0 \n",
"2 [10040, 2097, 2095, 2096] 2097 -1 0 \n",
"3 [10040, 2097, 2095, 2096] 10040 -1 0 \n",
"4 [4943, 33274, 42167, 19891, 33280] 19891 -1 1 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"training_df = final_df[final_df['mask'] == 1]\n",
"cv_df = final_df[final_df['mask'] == 0]"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24932146\n",
"8302296\n"
]
}
],
"source": [
"print(len(training_df))\n",
"print(len(cv_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Buy probability per entry (not per session)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.061387520813498242"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(final_df['target'].mean() + 1) / 2"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.061448781825679977"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(training_df['target'].mean() + 1) / 2"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.061203551403129941"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(cv_df['target'].mean() + 1) / 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Buy probability per session"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.055103884665161529"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(np.mean(final_df.groupby('sid')['target'].max()) + 1) / 2"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.055144914996467675"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(np.mean(training_df.groupby('sid')['target'].max()) + 1) / 2"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.054980584441226776"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(np.mean(cv_df.groupby('sid')['target'].max()) + 1) / 2"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"training_df[['target', 'sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data\n",
" 'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data\n",
" 'item_list', # categorical data, variable length\n",
" ]].to_csv('training_data.tsv', sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cv_df[['target', 'sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data\n",
" 'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data\n",
" 'item_list', # categorical data, variable length\n",
" ]].to_csv('cv_data.tsv', sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"test_feature_df[['sid', 'iid', 'count', 'session_count', 'session_duration', # numerical data\n",
" 'item_index', 'category', 'month', 'hour', 'dayofweek', # categorical data\n",
" 'item_list', # categorical data, variable length\n",
" ]].to_csv('test_data.tsv', sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment