Skip to content

Instantly share code, notes, and snippets.

@kuk
Created August 30, 2015 16:52
Show Gist options
  • Save kuk/56496d4c48e43d044b85 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Linear model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Transform JSON to VW format"
]
},
{
"cell_type": "code",
"execution_count": 512,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def transform_json(line, border):\n",
"    # Convert one JSON vacancy record into a Vowpal Wabbit example line.\n",
"    # `border` selects the salary bound ('from' or 'to') used as the label.\n",
"    # Returns a str VW line, or the int 0 as a sentinel when the requested\n",
"    # salary bound is missing (callers skip rows via isinstance(..., int)).\n",
"    dict_data = json.loads(line)\n",
"    if dict_data['salary'][border] is None:\n",
"        return 0\n",
"    # Label is log-salary; consumers exp() predictions back afterwards.\n",
"    result_line = '{} '.format(log(float(dict_data['salary'][border])))\n",
"    result_line = result_line + '|billing_type {}'.format(dict_data['billing_type']['id'])\n",
"    result_line = result_line + '|accept_handicapped {}'.format(dict_data['accept_handicapped'])\n",
"    if dict_data['key_skills']:\n",
"        result_line = result_line + '|key_skills '\n",
"        for elem in dict_data['key_skills']:\n",
"            # Spaces would split one skill into several VW tokens and ':'\n",
"            # is VW's value separator, so both are removed.\n",
"            result_line = result_line + '{} '.format(elem['name'].encode('utf8').replace(' ', '_').replace(':', ''))\n",
"    result_line = result_line + '|employment {}'.format(dict_data['employment']['id'])\n",
"    result_line = result_line + '|archived {}'.format(dict_data['archived'])\n",
"    if 'id' in dict_data['employer']:\n",
"        result_line = result_line + '|employer {}'.format(dict_data['employer']['id'])\n",
"    result_line = result_line + '|response_letter_required {}'.format(dict_data['response_letter_required'])\n",
"    result_line = result_line + '|type {}'.format(dict_data['type']['id'])\n",
"    if dict_data['specializations']:\n",
"        result_line = result_line + '|specializations '\n",
"        for elem in dict_data['specializations']:\n",
"            result_line = result_line + '{} '.format(elem['id'])\n",
"    result_line = result_line + '|premium {}'.format(dict_data['premium'])\n",
"    result_line = result_line + '|schedule {}'.format(dict_data['schedule']['id'])\n",
"    # NOTE(review): billing_type is emitted twice (also near the top), so\n",
"    # VW sees the feature duplicated; kept for model compatibility.\n",
"    result_line = result_line + '|billing_type {}'.format(dict_data['billing_type']['id'])\n",
"    if dict_data['department']:\n",
"        result_line = result_line + '|department {}'.format(dict_data['department']['id'])\n",
"    if dict_data['address']:\n",
"        # BUG FIX: the original tested 'city' in dict_data (the top-level\n",
"        # record) and then read dict_data['address']['city'], so the\n",
"        # address feature was effectively never emitted; the city key\n",
"        # lives inside the nested address dict. Also guard against a\n",
"        # present-but-null city.\n",
"        if 'city' in dict_data['address'] and dict_data['address']['city'] is not None:\n",
"            result_line = result_line + '|address {}'.format(dict_data['address']['city'].encode('utf8').replace(':', ''))\n",
"    result_line = result_line + '|name ' + dict_data['name'].encode('utf8').replace(':', '')\n",
"    result_line = result_line + '|area {}'.format(dict_data['area']['id'])\n",
"    result_line = result_line + '|experience {}'.format(dict_data['experience']['id'])\n",
"    result_line = result_line + '|description ' + dict_data['description'].encode('utf8').replace(':', '').replace('\\n', ' ')\n",
"    return result_line"
]
},
{
"cell_type": "code",
"execution_count": 513,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write VW training data for the lower salary bound ('from').\n",
"with open('train.txt', 'r') as source_file, open('data_low.vw', 'w') as vw_file:\n",
"    for raw_line in source_file:\n",
"        vw_line = transform_json(raw_line, 'from')\n",
"        # transform_json returns the int 0 for records without this label.\n",
"        if not isinstance(vw_line, int):\n",
"            vw_file.write(vw_line + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 514,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write VW training data for the upper salary bound ('to').\n",
"with open('train.txt', 'r') as source_file, open('data_high.vw', 'w') as vw_file:\n",
"    for raw_line in source_file:\n",
"        vw_line = transform_json(raw_line, 'to')\n",
"        # transform_json returns the int 0 for records without this label.\n",
"        if not isinstance(vw_line, int):\n",
"            vw_file.write(vw_line + '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Train Test Split"
]
},
{
"cell_type": "code",
"execution_count": 515,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"390960 data_high.vw\r\n"
]
}
],
"source": [
"!wc data_high.vw -l"
]
},
{
"cell_type": "code",
"execution_count": 611,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cat: ошибка записи: Обрыв канала\r\n"
]
}
],
"source": [
"!cat data_high.vw | tail -100000 >> test_high.vw\n",
"!cat data_high.vw | head -290000 >> train_high.vw"
]
},
{
"cell_type": "code",
"execution_count": 517,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"646085 data_low.vw\r\n"
]
}
],
"source": [
"!wc data_low.vw -l"
]
},
{
"cell_type": "code",
"execution_count": 610,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cat: ошибка записи: Обрыв канала\n"
]
}
],
"source": [
"!cat data_low.vw | tail -100000 >> test_low.vw\n",
"!cat data_low.vw | head -540000 >> train_low.vw"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train high and low models"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* -d -- train data file\n",
"* -c -- create binary cash\n",
"* -f -- name of the model\n",
"* --ftrl -- optimization algorithm"
]
},
{
"cell_type": "code",
"execution_count": 614,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"final_regressor = model_low.vw\n",
"Enabling FTRL based optimization\n",
"Algorithm used: Proximal-FTRL\n",
"ftrl_alpha = 0.005\n",
"ftrl_beta = 0.1\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"using cache_file = train_low.vw.cache\n",
"ignoring text input in favor of cache input\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"106.951660 106.951660 1 1.0 10.3417 0.0000 20\n",
"111.503761 116.055862 2 2.0 10.8198 0.0469 19\n",
"145.596371 179.688980 4 4.0 11.6952 0.0805 26\n",
"123.561625 101.526880 8 8.0 10.7144 0.1870 19\n",
"112.988557 102.415488 16 16.0 10.4631 0.4912 21\n",
"103.163995 93.339433 32 32.0 10.3090 0.5028 23\n",
"101.071607 98.979219 64 64.0 9.9035 0.7482 18\n",
"89.244177 77.416748 128 128.0 10.3090 1.2837 23\n",
"84.063526 78.882875 256 256.0 11.1562 1.5962 19\n",
"78.266630 72.469734 512 512.0 10.4631 2.1302 18\n",
"68.171888 58.077146 1024 1024.0 9.9035 3.6030 19\n",
"56.471147 44.770406 2048 2048.0 11.6952 4.8247 21\n",
"42.680226 28.889304 4096 4096.0 9.7981 5.7984 15\n",
"28.110370 13.540513 8192 8192.0 10.3417 7.5182 23\n",
"17.085322 6.060274 16384 16384.0 10.1924 10.3211 24\n",
"9.867351 2.649381 32768 32768.0 10.4631 10.5835 21\n",
"5.954988 2.042625 65536 65536.0 11.0021 11.3056 37\n",
"3.738675 1.522362 131072 131072.0 9.2591 10.3081 17\n",
"2.614757 1.490838 262144 262144.0 9.6158 10.1048 15\n",
"1.942687 1.270618 524288 524288.0 11.5129 11.0482 17\n",
"1.567351 1.192015 1048576 1048576.0 9.5468 10.0724 16\n",
"1.264792 0.962233 2097152 2097152.0 12.2061 11.7879 245\n",
"\n",
"finished run\n",
"number of examples = 2160000\n",
"weighted example sum = 2160000.000000\n",
"weighted label sum = 22548532.381310\n",
"average loss = 1.245573\n",
"best constant = 10.439136\n",
"total feature number = 113738775\n"
]
}
],
"source": [
"!vw -d train_low.vw -c -f model_low.vw --ftrl"
]
},
{
"cell_type": "code",
"execution_count": 615,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"final_regressor = model_high.vw\n",
"Enabling FTRL based optimization\n",
"Algorithm used: Proximal-FTRL\n",
"ftrl_alpha = 0.005\n",
"ftrl_beta = 0.1\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"using cache_file = train_high.vw.cache\n",
"ignoring text input in favor of cache input\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"109.476509 109.476509 1 1.0 10.4631 0.0000 23\n",
"114.824165 120.171822 2 2.0 11.0021 0.0398 20\n",
"114.894907 114.965649 4 4.0 10.0432 0.1037 19\n",
"109.988022 105.081137 8 8.0 10.5966 0.1608 21\n",
"108.195213 106.402404 16 16.0 10.0858 0.2885 20\n",
"105.115345 102.035477 32 32.0 10.4043 0.5303 20\n",
"101.926354 98.737363 64 64.0 10.7144 0.8666 20\n",
"95.272331 88.618309 128 128.0 10.9331 1.1735 21\n",
"93.169282 91.066232 256 256.0 10.3090 1.4080 28\n",
"83.598470 74.027658 512 512.0 10.4913 2.4718 24\n",
"72.168264 60.738059 1024 1024.0 9.9897 3.3374 18\n",
"60.070717 47.973170 2048 2048.0 10.5966 4.3320 20\n",
"46.866293 33.661869 4096 4096.0 11.5129 5.5064 22\n",
"31.345077 15.823861 8192 8192.0 9.9523 7.3887 20\n",
"18.794027 6.242977 16384 16384.0 10.1266 9.2525 20\n",
"10.716580 2.639132 32768 32768.0 10.7144 10.2656 17\n",
"6.333589 1.950599 65536 65536.0 11.9184 11.3878 24\n",
"3.963696 1.593802 131072 131072.0 11.9184 10.7568 22\n",
"2.686637 1.409579 262144 262144.0 10.4631 10.7039 18\n",
"1.923282 1.159926 524288 524288.0 10.5966 11.2771 23\n",
"\n",
"finished run\n",
"number of examples = 870000\n",
"weighted example sum = 870000.000000\n",
"weighted label sum = 9438920.949740\n",
"average loss = 1.536323\n",
"best constant = 10.849335\n",
"total feature number = 55323771\n"
]
}
],
"source": [
"!vw -d train_high.vw -c -f model_high.vw --ftrl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Test results"
]
},
{
"cell_type": "code",
"execution_count": 616,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# True (log-scale) labels: first column of each held-out VW line.\n",
"# Use a context manager so the file handle is closed deterministically\n",
"# (the original open() left the handle to the garbage collector).\n",
"with open('test_low.vw') as test_file:\n",
"    values_low = [float(row.split()[0]) for row in test_file]"
]
},
{
"cell_type": "code",
"execution_count": 617,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"values_low = array(values_low)"
]
},
{
"cell_type": "code",
"execution_count": 618,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"values_low = exp(values_low)"
]
},
{
"cell_type": "code",
"execution_count": 619,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# True (log-scale) labels: first column of each held-out VW line.\n",
"# Use a context manager so the file handle is closed deterministically\n",
"# (the original open() left the handle to the garbage collector).\n",
"with open('test_high.vw') as test_file:\n",
"    values_high = [float(row.split()[0]) for row in test_file]"
]
},
{
"cell_type": "code",
"execution_count": 620,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"values_high = array(values_high)"
]
},
{
"cell_type": "code",
"execution_count": 621,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"values_high = exp(values_high)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Predict with different models and take average"
]
},
{
"cell_type": "code",
"execution_count": 622,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"only testing\n",
"predictions = high_value_high_model.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test_high.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"0.000293 0.000293 1 1.0 10.1943 10.2114 138\n",
"0.277528 0.554763 2 2.0 12.4292 11.6844 205\n",
"0.323833 0.370139 4 4.0 10.5966 10.3895 105\n",
"2.326514 4.329195 8 8.0 15.7614 11.7062 58\n",
"1.371655 0.416795 16 16.0 11.0666 10.4735 318\n",
"0.833614 0.295573 32 32.0 11.0666 10.3431 314\n",
"0.521573 0.209531 64 64.0 11.5129 11.7870 280\n",
"0.869756 1.217939 128 128.0 10.8198 10.7094 100\n",
"0.876650 0.883544 256 256.0 12.8992 10.5850 129\n",
"0.939057 1.001464 512 512.0 11.5129 11.4477 133\n",
"0.589599 0.240142 1024 1024.0 11.2898 11.1903 314\n",
"0.485231 0.380862 2048 2048.0 10.2036 10.3593 71\n",
"0.702109 0.918988 4096 4096.0 11.2898 11.3900 314\n",
"0.654038 0.605967 8192 8192.0 11.2898 11.4486 314\n",
"0.658315 0.662592 16384 16384.0 11.2898 11.3859 321\n",
"0.592263 0.526210 32768 32768.0 9.3927 10.3072 58\n",
"0.670875 0.749488 65536 65536.0 5.7038 9.9592 153\n",
"\n",
"finished run\n",
"number of examples per pass = 100000\n",
"passes used = 1\n",
"weighted example sum = 100000.000000\n",
"weighted label sum = 1087685.667307\n",
"average loss = 0.791596\n",
"best constant = 10.876856\n",
"total feature number = 16345342\n",
"only testing\n",
"predictions = high_value_low_model.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test_high.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"0.003395 0.003395 1 1.0 10.1943 10.1360 138\n",
"0.379005 0.754615 2 2.0 12.4292 11.5605 205\n",
"0.437300 0.495595 4 4.0 10.5966 10.4200 105\n",
"1.983976 3.530652 8 8.0 15.7614 12.2383 58\n",
"1.200286 0.416596 16 16.0 11.0666 10.1742 318\n",
"0.826913 0.453541 32 32.0 11.0666 10.0812 314\n",
"0.612778 0.398643 64 64.0 11.5129 11.2358 280\n",
"0.835588 1.058398 128 128.0 10.8198 10.0413 100\n",
"0.808874 0.782159 256 256.0 12.8992 10.6437 129\n",
"0.837803 0.866732 512 512.0 11.5129 11.1406 133\n",
"0.548719 0.259634 1024 1024.0 11.2898 11.3075 314\n",
"0.471293 0.393866 2048 2048.0 10.2036 10.0554 71\n",
"0.683791 0.896289 4096 4096.0 11.2898 11.0925 314\n",
"0.643103 0.602415 8192 8192.0 11.2898 11.1765 314\n",
"0.674958 0.706812 16384 16384.0 11.2898 11.1891 321\n",
"0.668404 0.661850 32768 32768.0 9.3927 10.1822 58\n",
"0.826300 0.984197 65536 65536.0 5.7038 9.4176 153\n",
"\n",
"finished run\n",
"number of examples per pass = 100000\n",
"passes used = 1\n",
"weighted example sum = 100000.000000\n",
"weighted label sum = 1087685.667307\n",
"average loss = 0.894098\n",
"best constant = 10.876856\n",
"total feature number = 16345342\n",
"only testing\n",
"predictions = low_value_low_model.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test_low.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"0.874886 0.874886 1 1.0 9.2103 10.1457 162\n",
"0.494875 0.114864 2 2.0 10.1266 10.4655 139\n",
"0.293302 0.091729 4 4.0 10.1659 10.0599 175\n",
"0.215332 0.137363 8 8.0 9.3927 10.0077 49\n",
"0.475586 0.735839 16 16.0 10.2036 9.9747 90\n",
"0.664582 0.853577 32 32.0 14.9141 12.2175 69\n",
"0.779344 0.894105 64 64.0 10.1266 10.8547 78\n",
"0.821924 0.864504 128 128.0 10.1266 10.2541 230\n",
"0.771984 0.722043 256 256.0 10.4631 11.0679 193\n",
"0.734926 0.697869 512 512.0 11.9184 11.7936 276\n",
"0.737960 0.740994 1024 1024.0 9.9035 10.5535 93\n",
"0.924567 1.111173 2048 2048.0 10.8198 11.0387 118\n",
"0.687355 0.450144 4096 4096.0 9.9035 10.7395 95\n",
"0.668522 0.649688 8192 8192.0 11.7753 11.4229 259\n",
"0.922647 1.176773 16384 16384.0 11.5129 11.1825 105\n",
"0.866610 0.810573 32768 32768.0 10.9151 10.6063 352\n",
"0.870291 0.873972 65536 65536.0 10.5966 10.7134 87\n",
"\n",
"finished run\n",
"number of examples per pass = 100000\n",
"passes used = 1\n",
"weighted example sum = 100000.000000\n",
"weighted label sum = 1046550.958891\n",
"average loss = 0.868597\n",
"best constant = 10.465509\n",
"total feature number = 14711990\n",
"only testing\n",
"predictions = low_value_high_model.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test_low.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"2.169537 2.169537 1 1.0 9.2103 10.6833 162\n",
"1.259199 0.348861 2 2.0 10.1266 10.7173 139\n",
"0.701791 0.144383 4 4.0 10.1659 9.9977 175\n",
"0.508060 0.314329 8 8.0 9.3927 10.4537 49\n",
"0.996243 1.484427 16 16.0 10.2036 9.9883 90\n",
"1.158390 1.320537 32 32.0 14.9141 11.4474 69\n",
"1.308792 1.459194 64 64.0 10.1266 11.0456 78\n",
"1.338885 1.368977 128 128.0 10.1266 10.7433 230\n",
"1.163914 0.988944 256 256.0 10.4631 10.9168 193\n",
"1.130644 1.097373 512 512.0 11.9184 10.9229 276\n",
"1.123531 1.116418 1024 1024.0 9.9035 10.3853 93\n",
"1.334858 1.546185 2048 2048.0 10.8198 10.9418 118\n",
"1.079781 0.824704 4096 4096.0 9.9035 10.7894 95\n",
"1.048797 1.017813 8192 8192.0 11.7753 11.5085 259\n",
"1.311310 1.573823 16384 16384.0 11.5129 10.8461 105\n",
"1.227000 1.142691 32768 32768.0 10.9151 11.7443 352\n",
"1.267441 1.307881 65536 65536.0 10.5966 10.9719 87\n",
"\n",
"finished run\n",
"number of examples per pass = 100000\n",
"passes used = 1\n",
"weighted example sum = 100000.000000\n",
"weighted label sum = 1046550.958891\n",
"average loss = 1.280375\n",
"best constant = 10.465509\n",
"total feature number = 14711990\n"
]
}
],
"source": [
"!vw -t test_high.vw -i model_high.vw -p high_value_high_model.txt\n",
"!vw -t test_high.vw -i model_low.vw -p high_value_low_model.txt\n",
"!vw -t test_low.vw -i model_low.vw -p low_value_low_model.txt\n",
"!vw -t test_low.vw -i model_high.vw -p low_value_high_model.txt"
]
},
{
"cell_type": "code",
"execution_count": 623,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"low_prediction_high_model = exp(pd.read_csv('low_value_high_model.txt', header=None).values[:,0])\n",
"low_prediction_low_model = exp(pd.read_csv('low_value_low_model.txt', header=None).values[:,0])\n",
"low_predictions = 0.5 * low_prediction_high_model + 0.5 * low_prediction_low_model"
]
},
{
"cell_type": "code",
"execution_count": 624,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"high_prediction_high_model = exp(pd.read_csv('high_value_high_model.txt', header=None).values[:,0])\n",
"high_prediction_low_model = exp(pd.read_csv('high_value_low_model.txt', header=None).values[:,0])\n",
"high_predictions = 0.5 * high_prediction_high_model + 0.5 * high_prediction_low_model"
]
},
{
"cell_type": "code",
"execution_count": 625,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"158.841075749\n"
]
}
],
"source": [
"low_error = sqrt(mean((low_predictions / values_low - 1) ** 2))\n",
"high_error = sqrt(mean((values_high / high_predictions - 1) ** 2))\n",
"print 0.5*low_error + 0.5*high_error"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Mean Values Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 635,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"naive_low_predictions = mean(values_low)\n",
"naive_high_predictions = mean(values_high)"
]
},
{
"cell_type": "code",
"execution_count": 637,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1160.31114288\n"
]
}
],
"source": [
"low_error_naive = sqrt(mean((naive_low_predictions / values_low - 1) ** 2))\n",
"high_error_naive = sqrt(mean((values_high / naive_high_predictions - 1) ** 2))\n",
"print 0.5*low_error_naive + 0.5*high_error_naive"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train And Apply"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def transform_json_test(line):\n",
"    # Encode an unlabeled test record in the same VW format as\n",
"    # transform_json, using a dummy 0 label (VW expects a label column).\n",
"    # Always returns a str.\n",
"    # NOTE(review): this duplicates transform_json almost verbatim;\n",
"    # consider a shared helper with an optional label argument.\n",
"    dict_data = json.loads(line)\n",
"    result_line = '{} '.format(0)\n",
"    result_line = result_line + '|billing_type {}'.format(dict_data['billing_type']['id'])\n",
"    result_line = result_line + '|accept_handicapped {}'.format(dict_data['accept_handicapped'])\n",
"    if dict_data['key_skills']:\n",
"        result_line = result_line + '|key_skills '\n",
"        for elem in dict_data['key_skills']:\n",
"            # Spaces would split one skill into several VW tokens and ':'\n",
"            # is VW's value separator, so both are removed.\n",
"            result_line = result_line + '{} '.format(elem['name'].encode('utf8').replace(' ', '_').replace(':', ''))\n",
"    result_line = result_line + '|employment {}'.format(dict_data['employment']['id'])\n",
"    result_line = result_line + '|archived {}'.format(dict_data['archived'])\n",
"    if 'id' in dict_data['employer']:\n",
"        result_line = result_line + '|employer {}'.format(dict_data['employer']['id'])\n",
"    result_line = result_line + '|response_letter_required {}'.format(dict_data['response_letter_required'])\n",
"    result_line = result_line + '|type {}'.format(dict_data['type']['id'])\n",
"    if dict_data['specializations']:\n",
"        result_line = result_line + '|specializations '\n",
"        for elem in dict_data['specializations']:\n",
"            result_line = result_line + '{} '.format(elem['id'])\n",
"    result_line = result_line + '|premium {}'.format(dict_data['premium'])\n",
"    result_line = result_line + '|schedule {}'.format(dict_data['schedule']['id'])\n",
"    # NOTE(review): billing_type is emitted twice (also near the top),\n",
"    # matching transform_json so train/test features stay consistent.\n",
"    result_line = result_line + '|billing_type {}'.format(dict_data['billing_type']['id'])\n",
"    if dict_data['department']:\n",
"        result_line = result_line + '|department {}'.format(dict_data['department']['id'])\n",
"    if dict_data['address']:\n",
"        # BUG FIX: the original tested 'city' in dict_data (the top-level\n",
"        # record) and then read dict_data['address']['city'], so the\n",
"        # address feature was effectively never emitted; the city key\n",
"        # lives inside the nested address dict. Also guard against a\n",
"        # present-but-null city.\n",
"        if 'city' in dict_data['address'] and dict_data['address']['city'] is not None:\n",
"            result_line = result_line + '|address {}'.format(dict_data['address']['city'].encode('utf8').replace(':', ''))\n",
"    result_line = result_line + '|name ' + dict_data['name'].encode('utf8').replace(':', '')\n",
"    result_line = result_line + '|area {}'.format(dict_data['area']['id'])\n",
"    result_line = result_line + '|experience {}'.format(dict_data['experience']['id'])\n",
"    result_line = result_line + '|description ' + dict_data['description'].encode('utf8').replace(':', '').replace('\\n', ' ')\n",
"    return result_line"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Transform the unlabeled test set, remembering record ids in file order;\n",
"# predictions are later matched back to ids by position, so every input\n",
"# line must yield exactly one VW line.\n",
"all_id = []\n",
"with open('test.txt', 'r') as start, open('test.vw', 'w') as transformed:\n",
"    for line in start:\n",
"        all_id.append(json.loads(line)['id'])\n",
"        # transform_json_test always returns a str (it emits a dummy\n",
"        # label), so the original isinstance(..., int) skip was dead code\n",
"        # - and skipping after appending the id would have desynced ids\n",
"        # from predictions anyway.\n",
"        transformed.write(transform_json_test(line) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"final_regressor = model_low.vw\n",
"Enabling FTRL based optimization\n",
"Algorithm used: Proximal-FTRL\n",
"ftrl_alpha = 0.005\n",
"ftrl_beta = 0.1\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"can't open: data_low.vw.cache, error = No such file or directory\n",
"creating cache_file = data_low.vw.cache\n",
"Reading datafile = data_low.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"106.951660 106.951660 1 1.0 10.3417 0.0000 75\n",
"87.880878 68.810097 2 2.0 8.5172 0.2220 153\n",
"93.990223 100.099567 4 4.0 10.8198 0.5810 207\n",
"95.530641 97.071058 8 8.0 9.7981 1.0559 116\n",
"92.893018 90.255394 16 16.0 15.7441 2.0837 218\n",
"81.520528 70.148039 32 32.0 11.2252 2.0292 111\n",
"62.562174 43.603821 64 64.0 9.7410 1.2855 72\n",
"48.582031 34.601887 128 128.0 6.9078 3.5295 127\n",
"34.969368 21.356705 256 256.0 10.5966 8.5866 165\n",
"26.602257 18.235146 512 512.0 8.8537 3.0915 58\n",
"19.497580 12.392903 1024 1024.0 9.2103 9.1850 136\n",
"15.888860 12.280140 2048 2048.0 9.9988 3.3881 39\n",
"13.119980 10.351100 4096 4096.0 9.3927 9.0212 176\n",
"9.934799 6.749618 8192 8192.0 10.8198 10.6501 181\n",
"6.985215 4.035631 16384 16384.0 10.9277 8.9472 197\n",
"5.036215 3.087215 32768 32768.0 10.2036 10.4781 101\n",
"3.705789 2.375363 65536 65536.0 15.2018 9.1716 41\n",
"2.758735 1.811681 131072 131072.0 9.9035 10.4783 122\n",
"2.125767 1.492799 262144 262144.0 9.9988 10.4289 109\n",
"1.698421 1.271076 524288 524288.0 10.1266 11.1014 91\n",
"\n",
"finished run\n",
"number of examples = 646085\n",
"weighted example sum = 646085.000000\n",
"weighted label sum = 6744885.165561\n",
"average loss = 1.578285\n",
"best constant = 10.439625\n",
"total feature number = 96916208\n",
"final_regressor = model_high.vw\n",
"Enabling FTRL based optimization\n",
"Algorithm used: Proximal-FTRL\n",
"ftrl_alpha = 0.005\n",
"ftrl_beta = 0.1\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"can't open: data_high.vw.cache, error = No such file or directory\n",
"creating cache_file = data_high.vw.cache\n",
"Reading datafile = data_high.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"109.476509 109.476509 1 1.0 10.4631 0.0000 48\n",
"114.660633 119.844757 2 2.0 11.0021 0.0547 71\n",
"111.746721 108.832809 4 4.0 10.0432 0.5217 128\n",
"106.070107 100.393492 8 8.0 10.5966 0.2365 47\n",
"97.291625 88.513143 16 16.0 10.0858 0.9092 97\n",
"82.861561 68.431497 32 32.0 10.4043 2.9572 181\n",
"70.255918 57.650275 64 64.0 10.7144 3.7279 187\n",
"53.459390 36.662862 128 128.0 10.9331 10.8924 331\n",
"44.865170 36.270949 256 256.0 10.3090 9.0353 189\n",
"32.070256 19.275342 512 512.0 10.4913 12.3265 235\n",
"24.270089 16.469922 1024 1024.0 9.9897 11.1697 180\n",
"19.091618 13.913147 2048 2048.0 10.5966 8.7240 115\n",
"14.123099 9.154580 4096 4096.0 11.5129 11.2993 158\n",
"9.728520 5.333941 8192 8192.0 9.9523 8.0815 56\n",
"7.101585 4.474651 16384 16384.0 10.1266 8.6268 59\n",
"5.141983 3.182381 32768 32768.0 10.7144 10.6395 111\n",
"3.777053 2.412123 65536 65536.0 11.9184 11.9293 166\n",
"2.804415 1.831777 131072 131072.0 11.9184 10.7000 233\n",
"2.162436 1.520457 262144 262144.0 10.4631 10.4092 81\n",
"\n",
"finished run\n",
"number of examples = 390960\n",
"weighted example sum = 390960.000000\n",
"weighted label sum = 4244350.350891\n",
"average loss = 1.775027\n",
"best constant = 10.856227\n",
"total feature number = 60183161\n"
]
}
],
"source": [
"!vw -d data_low.vw -c -f model_low.vw --ftrl\n",
"!vw -d data_high.vw -c -f model_high.vw --ftrl"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"only testing\n",
"predictions = test_high.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"113.499733 113.499733 1 1.0 0.0000 10.6536 200\n",
"107.319633 101.139534 2 2.0 0.0000 10.0568 75\n",
"112.751225 118.182816 4 4.0 0.0000 10.3804 114\n",
"109.374735 105.998245 8 8.0 0.0000 10.0007 125\n",
"105.644907 101.915078 16 16.0 0.0000 8.0501 69\n",
"102.383578 99.122249 32 32.0 0.0000 11.7805 179\n",
"107.211714 112.039850 64 64.0 0.0000 10.5021 335\n",
"111.094909 114.978105 128 128.0 0.0000 10.6815 236\n",
"111.666152 112.237394 256 256.0 0.0000 11.0607 323\n",
"109.238800 106.811447 512 512.0 0.0000 9.4624 101\n",
"109.461024 109.683248 1024 1024.0 0.0000 11.1749 137\n",
"118.899255 128.337486 2048 2048.0 0.0000 11.5257 75\n",
"121.063670 123.228085 4096 4096.0 0.0000 11.1480 132\n",
"118.235043 115.406417 8192 8192.0 0.0000 11.6428 287\n",
"117.539135 116.843227 16384 16384.0 0.0000 11.2021 135\n",
"117.178761 116.818386 32768 32768.0 0.0000 10.2871 95\n",
"117.374743 117.570725 65536 65536.0 0.0000 10.8540 163\n",
"117.787372 118.200001 131072 131072.0 0.0000 12.1189 125\n",
"\n",
"finished run\n",
"number of examples per pass = 173995\n",
"passes used = 1\n",
"weighted example sum = 173995.000000\n",
"weighted label sum = 0.000000\n",
"average loss = 118.251191\n",
"total feature number = 26036210\n",
"only testing\n",
"predictions = test_low.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"105.438477 105.438477 1 1.0 0.0000 10.2683 200\n",
"101.695347 97.952217 2 2.0 0.0000 9.8971 75\n",
"107.531317 113.367287 4 4.0 0.0000 10.0901 114\n",
"105.094079 102.656841 8 8.0 0.0000 9.9564 125\n",
"100.753685 96.413290 16 16.0 0.0000 8.1239 69\n",
"97.406150 94.058615 32 32.0 0.0000 11.1240 179\n",
"101.150055 104.893961 64 64.0 0.0000 9.8346 335\n",
"104.861938 108.573820 128 128.0 0.0000 10.0408 236\n",
"104.944590 105.027242 256 256.0 0.0000 10.3332 323\n",
"105.249780 105.554970 512 512.0 0.0000 9.3182 101\n",
"104.736897 104.224014 1024 1024.0 0.0000 10.3392 137\n",
"110.274377 115.811856 2048 2048.0 0.0000 10.5131 75\n",
"107.636711 104.999044 4096 4096.0 0.0000 10.2691 132\n",
"106.784993 105.933276 8192 8192.0 0.0000 7.6446 287\n",
"106.552613 106.320233 16384 16384.0 0.0000 10.4032 135\n",
"107.536974 108.521335 32768 32768.0 0.0000 9.9372 95\n",
"108.379893 109.222811 65536 65536.0 0.0000 11.2499 163\n",
"109.039884 109.699875 131072 131072.0 0.0000 10.8236 125\n",
"\n",
"finished run\n",
"number of examples per pass = 173995\n",
"passes used = 1\n",
"weighted example sum = 173995.000000\n",
"weighted label sum = 0.000000\n",
"average loss = 109.413302\n",
"total feature number = 26036210\n"
]
}
],
"source": [
"!vw -t test.vw -i model_high.vw -p test_high.txt\n",
"!vw -t test.vw -i model_low.vw -p test_low.txt"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# exp() undoes VW's log-scale output for both per-model prediction files.\n",
"def _read_predictions(path):\n",
"    # Each file holds a single unnamed column of log-salary predictions.\n",
"    return exp(pd.read_csv(path, header=None).values[:, 0])\n",
"\n",
"prediction_high = _read_predictions('test_high.txt')\n",
"prediction_low = _read_predictions('test_low.txt')\n",
"predictions = 0.5 * prediction_low + 0.5 * prediction_high"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# One JSON record per output line; ids are aligned positionally with the\n",
"# prediction arrays built above.\n",
"with open('result.txt', 'w') as submit_file:\n",
"    for index, vacancy_id in enumerate(all_id):\n",
"        record = {\n",
"            'salary': {\n",
"                'predict': 'both',\n",
"                'to': prediction_high[index],\n",
"                'from': prediction_low[index],\n",
"                'currency': 'RUR',\n",
"            },\n",
"            'id': vacancy_id,\n",
"        }\n",
"        submit_file.write(json.dumps(record) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"salary\": {\"predict\": \"both\", \"to\": 42345.820257877342, \"from\": 28805.568553905428, \"currency\": \"RUR\"}, \"id\": \"9917565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 23314.132482652887, \"from\": 19872.278513333458, \"currency\": \"RUR\"}, \"id\": \"2917565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 84185.501988915072, \"from\": 71467.065486749969, \"currency\": \"RUR\"}, \"id\": \"9717565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 32221.33210766493, \"from\": 24102.359036012145, \"currency\": \"RUR\"}, \"id\": \"2717565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 121795.65982471367, \"from\": 87236.578486766928, \"currency\": \"RUR\"}, \"id\": \"5517565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 4919.2531540177242, \"from\": 6789.6522213536655, \"currency\": \"RUR\"}, \"id\": \"7437565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 44558.802483863459, \"from\": 27150.751376816497, \"currency\": \"RUR\"}, \"id\": \"6437565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 22041.140307096251, \"from\": 21087.231624169333, \"currency\": \"RUR\"}, \"id\": \"8337565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 35784.710494341045, \"from\": 24630.283465478988, \"currency\": \"RUR\"}, \"id\": \"0337565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 29929.562999847221, \"from\": 16506.37584337965, \"currency\": \"RUR\"}, \"id\": \"0237565\"}\r\n",
"cat: ошибка записи: Обрыв канала\r\n"
]
}
],
"source": [
"!cat result.txt | head -10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment