Skip to content

Instantly share code, notes, and snippets.

@kuk
Created August 30, 2015 16:52
Show Gist options
  • Save kuk/56496d4c48e43d044b85 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Linear model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Transform JSON to VW format"
]
},
{
"cell_type": "code",
"execution_count": 512,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def transform_json(line, border):\n",
"    # Convert one JSON vacancy record into a Vowpal Wabbit example line.\n",
"    # `border` selects the salary bound ('from' or 'to') used as the label.\n",
"    # Returns a str VW line, or the int 0 as a sentinel when the requested\n",
"    # salary bound is missing (callers skip rows via isinstance(..., int)).\n",
"    dict_data = json.loads(line)\n",
"    if dict_data['salary'][border] is None:\n",
"        return 0\n",
"    # Label is log-salary; consumers exp() predictions back afterwards.\n",
"    result_line = '{} '.format(log(float(dict_data['salary'][border])))\n",
"    result_line = result_line + '|billing_type {}'.format(dict_data['billing_type']['id'])\n",
"    result_line = result_line + '|accept_handicapped {}'.format(dict_data['accept_handicapped'])\n",
"    if dict_data['key_skills']:\n",
"        result_line = result_line + '|key_skills '\n",
"        for elem in dict_data['key_skills']:\n",
"            # Spaces would split one skill into several VW tokens and ':'\n",
"            # is VW's value separator, so both are removed.\n",
"            result_line = result_line + '{} '.format(elem['name'].encode('utf8').replace(' ', '_').replace(':', ''))\n",
"    result_line = result_line + '|employment {}'.format(dict_data['employment']['id'])\n",
"    result_line = result_line + '|archived {}'.format(dict_data['archived'])\n",
"    if 'id' in dict_data['employer']:\n",
"        result_line = result_line + '|employer {}'.format(dict_data['employer']['id'])\n",
"    result_line = result_line + '|response_letter_required {}'.format(dict_data['response_letter_required'])\n",
"    result_line = result_line + '|type {}'.format(dict_data['type']['id'])\n",
"    if dict_data['specializations']:\n",
"        result_line = result_line + '|specializations '\n",
"        for elem in dict_data['specializations']:\n",
"            result_line = result_line + '{} '.format(elem['id'])\n",
"    result_line = result_line + '|premium {}'.format(dict_data['premium'])\n",
"    result_line = result_line + '|schedule {}'.format(dict_data['schedule']['id'])\n",
"    # NOTE(review): billing_type is emitted twice (also near the top), so\n",
"    # VW sees the feature duplicated; kept for model compatibility.\n",
"    result_line = result_line + '|billing_type {}'.format(dict_data['billing_type']['id'])\n",
"    if dict_data['department']:\n",
"        result_line = result_line + '|department {}'.format(dict_data['department']['id'])\n",
"    if dict_data['address']:\n",
"        # BUG FIX: the original tested 'city' in dict_data (the top-level\n",
"        # record) and then read dict_data['address']['city'], so the\n",
"        # address feature was effectively never emitted; the city key\n",
"        # lives inside the nested address dict. Also guard against a\n",
"        # present-but-null city.\n",
"        if 'city' in dict_data['address'] and dict_data['address']['city'] is not None:\n",
"            result_line = result_line + '|address {}'.format(dict_data['address']['city'].encode('utf8').replace(':', ''))\n",
"    result_line = result_line + '|name ' + dict_data['name'].encode('utf8').replace(':', '')\n",
"    result_line = result_line + '|area {}'.format(dict_data['area']['id'])\n",
"    result_line = result_line + '|experience {}'.format(dict_data['experience']['id'])\n",
"    result_line = result_line + '|description ' + dict_data['description'].encode('utf8').replace(':', '').replace('\\n', ' ')\n",
"    return result_line"
]
},
{
"cell_type": "code",
"execution_count": 513,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write VW training data for the lower salary bound ('from').\n",
"with open('train.txt', 'r') as source_file, open('data_low.vw', 'w') as vw_file:\n",
"    for raw_line in source_file:\n",
"        vw_line = transform_json(raw_line, 'from')\n",
"        # transform_json returns the int 0 for records without this label.\n",
"        if not isinstance(vw_line, int):\n",
"            vw_file.write(vw_line + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 514,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write VW training data for the upper salary bound ('to').\n",
"with open('train.txt', 'r') as source_file, open('data_high.vw', 'w') as vw_file:\n",
"    for raw_line in source_file:\n",
"        vw_line = transform_json(raw_line, 'to')\n",
"        # transform_json returns the int 0 for records without this label.\n",
"        if not isinstance(vw_line, int):\n",
"            vw_file.write(vw_line + '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Train Test Split"
]
},
{
"cell_type": "code",
"execution_count": 515,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"390960 data_high.vw\r\n"
]
}
],
"source": [
"!wc data_high.vw -l"
]
},
{
"cell_type": "code",
"execution_count": 611,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cat: ошибка записи: Обрыв канала\r\n"
]
}
],
"source": [
"!cat data_high.vw | tail -100000 >> test_high.vw\n",
"!cat data_high.vw | head -290000 >> train_high.vw"
]
},
{
"cell_type": "code",
"execution_count": 517,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"646085 data_low.vw\r\n"
]
}
],
"source": [
"!wc data_low.vw -l"
]
},
{
"cell_type": "code",
"execution_count": 610,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cat: ошибка записи: Обрыв канала\n"
]
}
],
"source": [
"!cat data_low.vw | tail -100000 >> test_low.vw\n",
"!cat data_low.vw | head -540000 >> train_low.vw"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train high and low models"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* -d -- train data file\n",
"* -c -- create binary cash\n",
"* -f -- name of the model\n",
"* --ftrl -- optimization algorithm"
]
},
{
"cell_type": "code",
"execution_count": 614,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"final_regressor = model_low.vw\n",
"Enabling FTRL based optimization\n",
"Algorithm used: Proximal-FTRL\n",
"ftrl_alpha = 0.005\n",
"ftrl_beta = 0.1\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"using cache_file = train_low.vw.cache\n",
"ignoring text input in favor of cache input\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"106.951660 106.951660 1 1.0 10.3417 0.0000 20\n",
"111.503761 116.055862 2 2.0 10.8198 0.0469 19\n",
"145.596371 179.688980 4 4.0 11.6952 0.0805 26\n",
"123.561625 101.526880 8 8.0 10.7144 0.1870 19\n",
"112.988557 102.415488 16 16.0 10.4631 0.4912 21\n",
"103.163995 93.339433 32 32.0 10.3090 0.5028 23\n",
"101.071607 98.979219 64 64.0 9.9035 0.7482 18\n",
"89.244177 77.416748 128 128.0 10.3090 1.2837 23\n",
"84.063526 78.882875 256 256.0 11.1562 1.5962 19\n",
"78.266630 72.469734 512 512.0 10.4631 2.1302 18\n",
"68.171888 58.077146 1024 1024.0 9.9035 3.6030 19\n",
"56.471147 44.770406 2048 2048.0 11.6952 4.8247 21\n",
"42.680226 28.889304 4096 4096.0 9.7981 5.7984 15\n",
"28.110370 13.540513 8192 8192.0 10.3417 7.5182 23\n",
"17.085322 6.060274 16384 16384.0 10.1924 10.3211 24\n",
"9.867351 2.649381 32768 32768.0 10.4631 10.5835 21\n",
"5.954988 2.042625 65536 65536.0 11.0021 11.3056 37\n",
"3.738675 1.522362 131072 131072.0 9.2591 10.3081 17\n",
"2.614757 1.490838 262144 262144.0 9.6158 10.1048 15\n",
"1.942687 1.270618 524288 524288.0 11.5129 11.0482 17\n",
"1.567351 1.192015 1048576 1048576.0 9.5468 10.0724 16\n",
"1.264792 0.962233 2097152 2097152.0 12.2061 11.7879 245\n",
"\n",
"finished run\n",
"number of examples = 2160000\n",
"weighted example sum = 2160000.000000\n",
"weighted label sum = 22548532.381310\n",
"average loss = 1.245573\n",
"best constant = 10.439136\n",
"total feature number = 113738775\n"
]
}
],
"source": [
"!vw -d train_low.vw -c -f model_low.vw --ftrl"
]
},
{
"cell_type": "code",
"execution_count": 615,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"final_regressor = model_high.vw\n",
"Enabling FTRL based optimization\n",
"Algorithm used: Proximal-FTRL\n",
"ftrl_alpha = 0.005\n",
"ftrl_beta = 0.1\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"using cache_file = train_high.vw.cache\n",
"ignoring text input in favor of cache input\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"109.476509 109.476509 1 1.0 10.4631 0.0000 23\n",
"114.824165 120.171822 2 2.0 11.0021 0.0398 20\n",
"114.894907 114.965649 4 4.0 10.0432 0.1037 19\n",
"109.988022 105.081137 8 8.0 10.5966 0.1608 21\n",
"108.195213 106.402404 16 16.0 10.0858 0.2885 20\n",
"105.115345 102.035477 32 32.0 10.4043 0.5303 20\n",
"101.926354 98.737363 64 64.0 10.7144 0.8666 20\n",
"95.272331 88.618309 128 128.0 10.9331 1.1735 21\n",
"93.169282 91.066232 256 256.0 10.3090 1.4080 28\n",
"83.598470 74.027658 512 512.0 10.4913 2.4718 24\n",
"72.168264 60.738059 1024 1024.0 9.9897 3.3374 18\n",
"60.070717 47.973170 2048 2048.0 10.5966 4.3320 20\n",
"46.866293 33.661869 4096 4096.0 11.5129 5.5064 22\n",
"31.345077 15.823861 8192 8192.0 9.9523 7.3887 20\n",
"18.794027 6.242977 16384 16384.0 10.1266 9.2525 20\n",
"10.716580 2.639132 32768 32768.0 10.7144 10.2656 17\n",
"6.333589 1.950599 65536 65536.0 11.9184 11.3878 24\n",
"3.963696 1.593802 131072 131072.0 11.9184 10.7568 22\n",
"2.686637 1.409579 262144 262144.0 10.4631 10.7039 18\n",
"1.923282 1.159926 524288 524288.0 10.5966 11.2771 23\n",
"\n",
"finished run\n",
"number of examples = 870000\n",
"weighted example sum = 870000.000000\n",
"weighted label sum = 9438920.949740\n",
"average loss = 1.536323\n",
"best constant = 10.849335\n",
"total feature number = 55323771\n"
]
}
],
"source": [
"!vw -d train_high.vw -c -f model_high.vw --ftrl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Test results"
]
},
{
"cell_type": "code",
"execution_count": 616,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# True (log-scale) labels: first column of each held-out VW line.\n",
"# Use a context manager so the file handle is closed deterministically\n",
"# (the original open() left the handle to the garbage collector).\n",
"with open('test_low.vw') as test_file:\n",
"    values_low = [float(row.split()[0]) for row in test_file]"
]
},
{
"cell_type": "code",
"execution_count": 617,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"values_low = array(values_low)"
]
},
{
"cell_type": "code",
"execution_count": 618,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"values_low = exp(values_low)"
]
},
{
"cell_type": "code",
"execution_count": 619,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# True (log-scale) labels: first column of each held-out VW line.\n",
"# Use a context manager so the file handle is closed deterministically\n",
"# (the original open() left the handle to the garbage collector).\n",
"with open('test_high.vw') as test_file:\n",
"    values_high = [float(row.split()[0]) for row in test_file]"
]
},
{
"cell_type": "code",
"execution_count": 620,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"values_high = array(values_high)"
]
},
{
"cell_type": "code",
"execution_count": 621,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"values_high = exp(values_high)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Predict with different models and take average"
]
},
{
"cell_type": "code",
"execution_count": 622,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"only testing\n",
"predictions = high_value_high_model.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test_high.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"0.000293 0.000293 1 1.0 10.1943 10.2114 138\n",
"0.277528 0.554763 2 2.0 12.4292 11.6844 205\n",
"0.323833 0.370139 4 4.0 10.5966 10.3895 105\n",
"2.326514 4.329195 8 8.0 15.7614 11.7062 58\n",
"1.371655 0.416795 16 16.0 11.0666 10.4735 318\n",
"0.833614 0.295573 32 32.0 11.0666 10.3431 314\n",
"0.521573 0.209531 64 64.0 11.5129 11.7870 280\n",
"0.869756 1.217939 128 128.0 10.8198 10.7094 100\n",
"0.876650 0.883544 256 256.0 12.8992 10.5850 129\n",
"0.939057 1.001464 512 512.0 11.5129 11.4477 133\n",
"0.589599 0.240142 1024 1024.0 11.2898 11.1903 314\n",
"0.485231 0.380862 2048 2048.0 10.2036 10.3593 71\n",
"0.702109 0.918988 4096 4096.0 11.2898 11.3900 314\n",
"0.654038 0.605967 8192 8192.0 11.2898 11.4486 314\n",
"0.658315 0.662592 16384 16384.0 11.2898 11.3859 321\n",
"0.592263 0.526210 32768 32768.0 9.3927 10.3072 58\n",
"0.670875 0.749488 65536 65536.0 5.7038 9.9592 153\n",
"\n",
"finished run\n",
"number of examples per pass = 100000\n",
"passes used = 1\n",
"weighted example sum = 100000.000000\n",
"weighted label sum = 1087685.667307\n",
"average loss = 0.791596\n",
"best constant = 10.876856\n",
"total feature number = 16345342\n",
"only testing\n",
"predictions = high_value_low_model.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test_high.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"0.003395 0.003395 1 1.0 10.1943 10.1360 138\n",
"0.379005 0.754615 2 2.0 12.4292 11.5605 205\n",
"0.437300 0.495595 4 4.0 10.5966 10.4200 105\n",
"1.983976 3.530652 8 8.0 15.7614 12.2383 58\n",
"1.200286 0.416596 16 16.0 11.0666 10.1742 318\n",
"0.826913 0.453541 32 32.0 11.0666 10.0812 314\n",
"0.612778 0.398643 64 64.0 11.5129 11.2358 280\n",
"0.835588 1.058398 128 128.0 10.8198 10.0413 100\n",
"0.808874 0.782159 256 256.0 12.8992 10.6437 129\n",
"0.837803 0.866732 512 512.0 11.5129 11.1406 133\n",
"0.548719 0.259634 1024 1024.0 11.2898 11.3075 314\n",
"0.471293 0.393866 2048 2048.0 10.2036 10.0554 71\n",
"0.683791 0.896289 4096 4096.0 11.2898 11.0925 314\n",
"0.643103 0.602415 8192 8192.0 11.2898 11.1765 314\n",
"0.674958 0.706812 16384 16384.0 11.2898 11.1891 321\n",
"0.668404 0.661850 32768 32768.0 9.3927 10.1822 58\n",
"0.826300 0.984197 65536 65536.0 5.7038 9.4176 153\n",
"\n",
"finished run\n",
"number of examples per pass = 100000\n",
"passes used = 1\n",
"weighted example sum = 100000.000000\n",
"weighted label sum = 1087685.667307\n",
"average loss = 0.894098\n",
"best constant = 10.876856\n",
"total feature number = 16345342\n",
"only testing\n",
"predictions = low_value_low_model.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test_low.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"0.874886 0.874886 1 1.0 9.2103 10.1457 162\n",
"0.494875 0.114864 2 2.0 10.1266 10.4655 139\n",
"0.293302 0.091729 4 4.0 10.1659 10.0599 175\n",
"0.215332 0.137363 8 8.0 9.3927 10.0077 49\n",
"0.475586 0.735839 16 16.0 10.2036 9.9747 90\n",
"0.664582 0.853577 32 32.0 14.9141 12.2175 69\n",
"0.779344 0.894105 64 64.0 10.1266 10.8547 78\n",
"0.821924 0.864504 128 128.0 10.1266 10.2541 230\n",
"0.771984 0.722043 256 256.0 10.4631 11.0679 193\n",
"0.734926 0.697869 512 512.0 11.9184 11.7936 276\n",
"0.737960 0.740994 1024 1024.0 9.9035 10.5535 93\n",
"0.924567 1.111173 2048 2048.0 10.8198 11.0387 118\n",
"0.687355 0.450144 4096 4096.0 9.9035 10.7395 95\n",
"0.668522 0.649688 8192 8192.0 11.7753 11.4229 259\n",
"0.922647 1.176773 16384 16384.0 11.5129 11.1825 105\n",
"0.866610 0.810573 32768 32768.0 10.9151 10.6063 352\n",
"0.870291 0.873972 65536 65536.0 10.5966 10.7134 87\n",
"\n",
"finished run\n",
"number of examples per pass = 100000\n",
"passes used = 1\n",
"weighted example sum = 100000.000000\n",
"weighted label sum = 1046550.958891\n",
"average loss = 0.868597\n",
"best constant = 10.465509\n",
"total feature number = 14711990\n",
"only testing\n",
"predictions = low_value_high_model.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test_low.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"2.169537 2.169537 1 1.0 9.2103 10.6833 162\n",
"1.259199 0.348861 2 2.0 10.1266 10.7173 139\n",
"0.701791 0.144383 4 4.0 10.1659 9.9977 175\n",
"0.508060 0.314329 8 8.0 9.3927 10.4537 49\n",
"0.996243 1.484427 16 16.0 10.2036 9.9883 90\n",
"1.158390 1.320537 32 32.0 14.9141 11.4474 69\n",
"1.308792 1.459194 64 64.0 10.1266 11.0456 78\n",
"1.338885 1.368977 128 128.0 10.1266 10.7433 230\n",
"1.163914 0.988944 256 256.0 10.4631 10.9168 193\n",
"1.130644 1.097373 512 512.0 11.9184 10.9229 276\n",
"1.123531 1.116418 1024 1024.0 9.9035 10.3853 93\n",
"1.334858 1.546185 2048 2048.0 10.8198 10.9418 118\n",
"1.079781 0.824704 4096 4096.0 9.9035 10.7894 95\n",
"1.048797 1.017813 8192 8192.0 11.7753 11.5085 259\n",
"1.311310 1.573823 16384 16384.0 11.5129 10.8461 105\n",
"1.227000 1.142691 32768 32768.0 10.9151 11.7443 352\n",
"1.267441 1.307881 65536 65536.0 10.5966 10.9719 87\n",
"\n",
"finished run\n",
"number of examples per pass = 100000\n",
"passes used = 1\n",
"weighted example sum = 100000.000000\n",
"weighted label sum = 1046550.958891\n",
"average loss = 1.280375\n",
"best constant = 10.465509\n",
"total feature number = 14711990\n"
]
}
],
"source": [
"!vw -t test_high.vw -i model_high.vw -p high_value_high_model.txt\n",
"!vw -t test_high.vw -i model_low.vw -p high_value_low_model.txt\n",
"!vw -t test_low.vw -i model_low.vw -p low_value_low_model.txt\n",
"!vw -t test_low.vw -i model_high.vw -p low_value_high_model.txt"
]
},
{
"cell_type": "code",
"execution_count": 623,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"low_prediction_high_model = exp(pd.read_csv('low_value_high_model.txt', header=None).values[:,0])\n",
"low_prediction_low_model = exp(pd.read_csv('low_value_low_model.txt', header=None).values[:,0])\n",
"low_predictions = 0.5 * low_prediction_high_model + 0.5 * low_prediction_low_model"
]
},
{
"cell_type": "code",
"execution_count": 624,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"high_prediction_high_model = exp(pd.read_csv('high_value_high_model.txt', header=None).values[:,0])\n",
"high_prediction_low_model = exp(pd.read_csv('high_value_low_model.txt', header=None).values[:,0])\n",
"high_predictions = 0.5 * high_prediction_high_model + 0.5 * high_prediction_low_model"
]
},
{
"cell_type": "code",
"execution_count": 625,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"158.841075749\n"
]
}
],
"source": [
"low_error = sqrt(mean((low_predictions / values_low - 1) ** 2))\n",
"high_error = sqrt(mean((values_high / high_predictions - 1) ** 2))\n",
"print 0.5*low_error + 0.5*high_error"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Mean Values Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 635,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"naive_low_predictions = mean(values_low)\n",
"naive_high_predictions = mean(values_high)"
]
},
{
"cell_type": "code",
"execution_count": 637,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1160.31114288\n"
]
}
],
"source": [
"low_error_naive = sqrt(mean((naive_low_predictions / values_low - 1) ** 2))\n",
"high_error_naive = sqrt(mean((values_high / naive_high_predictions - 1) ** 2))\n",
"print 0.5*low_error_naive + 0.5*high_error_naive"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train And Apply"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def transform_json_test(line):\n",
"    # Encode an unlabeled test record in the same VW format as\n",
"    # transform_json, using a dummy 0 label (VW expects a label column).\n",
"    # Always returns a str.\n",
"    # NOTE(review): this duplicates transform_json almost verbatim;\n",
"    # consider a shared helper with an optional label argument.\n",
"    dict_data = json.loads(line)\n",
"    result_line = '{} '.format(0)\n",
"    result_line = result_line + '|billing_type {}'.format(dict_data['billing_type']['id'])\n",
"    result_line = result_line + '|accept_handicapped {}'.format(dict_data['accept_handicapped'])\n",
"    if dict_data['key_skills']:\n",
"        result_line = result_line + '|key_skills '\n",
"        for elem in dict_data['key_skills']:\n",
"            # Spaces would split one skill into several VW tokens and ':'\n",
"            # is VW's value separator, so both are removed.\n",
"            result_line = result_line + '{} '.format(elem['name'].encode('utf8').replace(' ', '_').replace(':', ''))\n",
"    result_line = result_line + '|employment {}'.format(dict_data['employment']['id'])\n",
"    result_line = result_line + '|archived {}'.format(dict_data['archived'])\n",
"    if 'id' in dict_data['employer']:\n",
"        result_line = result_line + '|employer {}'.format(dict_data['employer']['id'])\n",
"    result_line = result_line + '|response_letter_required {}'.format(dict_data['response_letter_required'])\n",
"    result_line = result_line + '|type {}'.format(dict_data['type']['id'])\n",
"    if dict_data['specializations']:\n",
"        result_line = result_line + '|specializations '\n",
"        for elem in dict_data['specializations']:\n",
"            result_line = result_line + '{} '.format(elem['id'])\n",
"    result_line = result_line + '|premium {}'.format(dict_data['premium'])\n",
"    result_line = result_line + '|schedule {}'.format(dict_data['schedule']['id'])\n",
"    # NOTE(review): billing_type is emitted twice (also near the top),\n",
"    # matching transform_json so train/test features stay consistent.\n",
"    result_line = result_line + '|billing_type {}'.format(dict_data['billing_type']['id'])\n",
"    if dict_data['department']:\n",
"        result_line = result_line + '|department {}'.format(dict_data['department']['id'])\n",
"    if dict_data['address']:\n",
"        # BUG FIX: the original tested 'city' in dict_data (the top-level\n",
"        # record) and then read dict_data['address']['city'], so the\n",
"        # address feature was effectively never emitted; the city key\n",
"        # lives inside the nested address dict. Also guard against a\n",
"        # present-but-null city.\n",
"        if 'city' in dict_data['address'] and dict_data['address']['city'] is not None:\n",
"            result_line = result_line + '|address {}'.format(dict_data['address']['city'].encode('utf8').replace(':', ''))\n",
"    result_line = result_line + '|name ' + dict_data['name'].encode('utf8').replace(':', '')\n",
"    result_line = result_line + '|area {}'.format(dict_data['area']['id'])\n",
"    result_line = result_line + '|experience {}'.format(dict_data['experience']['id'])\n",
"    result_line = result_line + '|description ' + dict_data['description'].encode('utf8').replace(':', '').replace('\\n', ' ')\n",
"    return result_line"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Transform the unlabeled test set, remembering record ids in file order;\n",
"# predictions are later matched back to ids by position, so every input\n",
"# line must yield exactly one VW line.\n",
"all_id = []\n",
"with open('test.txt', 'r') as start, open('test.vw', 'w') as transformed:\n",
"    for line in start:\n",
"        all_id.append(json.loads(line)['id'])\n",
"        # transform_json_test always returns a str (it emits a dummy\n",
"        # label), so the original isinstance(..., int) skip was dead code\n",
"        # - and skipping after appending the id would have desynced ids\n",
"        # from predictions anyway.\n",
"        transformed.write(transform_json_test(line) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"final_regressor = model_low.vw\n",
"Enabling FTRL based optimization\n",
"Algorithm used: Proximal-FTRL\n",
"ftrl_alpha = 0.005\n",
"ftrl_beta = 0.1\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"can't open: data_low.vw.cache, error = No such file or directory\n",
"creating cache_file = data_low.vw.cache\n",
"Reading datafile = data_low.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"106.951660 106.951660 1 1.0 10.3417 0.0000 75\n",
"87.880878 68.810097 2 2.0 8.5172 0.2220 153\n",
"93.990223 100.099567 4 4.0 10.8198 0.5810 207\n",
"95.530641 97.071058 8 8.0 9.7981 1.0559 116\n",
"92.893018 90.255394 16 16.0 15.7441 2.0837 218\n",
"81.520528 70.148039 32 32.0 11.2252 2.0292 111\n",
"62.562174 43.603821 64 64.0 9.7410 1.2855 72\n",
"48.582031 34.601887 128 128.0 6.9078 3.5295 127\n",
"34.969368 21.356705 256 256.0 10.5966 8.5866 165\n",
"26.602257 18.235146 512 512.0 8.8537 3.0915 58\n",
"19.497580 12.392903 1024 1024.0 9.2103 9.1850 136\n",
"15.888860 12.280140 2048 2048.0 9.9988 3.3881 39\n",
"13.119980 10.351100 4096 4096.0 9.3927 9.0212 176\n",
"9.934799 6.749618 8192 8192.0 10.8198 10.6501 181\n",
"6.985215 4.035631 16384 16384.0 10.9277 8.9472 197\n",
"5.036215 3.087215 32768 32768.0 10.2036 10.4781 101\n",
"3.705789 2.375363 65536 65536.0 15.2018 9.1716 41\n",
"2.758735 1.811681 131072 131072.0 9.9035 10.4783 122\n",
"2.125767 1.492799 262144 262144.0 9.9988 10.4289 109\n",
"1.698421 1.271076 524288 524288.0 10.1266 11.1014 91\n",
"\n",
"finished run\n",
"number of examples = 646085\n",
"weighted example sum = 646085.000000\n",
"weighted label sum = 6744885.165561\n",
"average loss = 1.578285\n",
"best constant = 10.439625\n",
"total feature number = 96916208\n",
"final_regressor = model_high.vw\n",
"Enabling FTRL based optimization\n",
"Algorithm used: Proximal-FTRL\n",
"ftrl_alpha = 0.005\n",
"ftrl_beta = 0.1\n",
"Num weight bits = 18\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"can't open: data_high.vw.cache, error = No such file or directory\n",
"creating cache_file = data_high.vw.cache\n",
"Reading datafile = data_high.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"109.476509 109.476509 1 1.0 10.4631 0.0000 48\n",
"114.660633 119.844757 2 2.0 11.0021 0.0547 71\n",
"111.746721 108.832809 4 4.0 10.0432 0.5217 128\n",
"106.070107 100.393492 8 8.0 10.5966 0.2365 47\n",
"97.291625 88.513143 16 16.0 10.0858 0.9092 97\n",
"82.861561 68.431497 32 32.0 10.4043 2.9572 181\n",
"70.255918 57.650275 64 64.0 10.7144 3.7279 187\n",
"53.459390 36.662862 128 128.0 10.9331 10.8924 331\n",
"44.865170 36.270949 256 256.0 10.3090 9.0353 189\n",
"32.070256 19.275342 512 512.0 10.4913 12.3265 235\n",
"24.270089 16.469922 1024 1024.0 9.9897 11.1697 180\n",
"19.091618 13.913147 2048 2048.0 10.5966 8.7240 115\n",
"14.123099 9.154580 4096 4096.0 11.5129 11.2993 158\n",
"9.728520 5.333941 8192 8192.0 9.9523 8.0815 56\n",
"7.101585 4.474651 16384 16384.0 10.1266 8.6268 59\n",
"5.141983 3.182381 32768 32768.0 10.7144 10.6395 111\n",
"3.777053 2.412123 65536 65536.0 11.9184 11.9293 166\n",
"2.804415 1.831777 131072 131072.0 11.9184 10.7000 233\n",
"2.162436 1.520457 262144 262144.0 10.4631 10.4092 81\n",
"\n",
"finished run\n",
"number of examples = 390960\n",
"weighted example sum = 390960.000000\n",
"weighted label sum = 4244350.350891\n",
"average loss = 1.775027\n",
"best constant = 10.856227\n",
"total feature number = 60183161\n"
]
}
],
"source": [
"!vw -d data_low.vw -c -f model_low.vw --ftrl\n",
"!vw -d data_high.vw -c -f model_high.vw --ftrl"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"only testing\n",
"predictions = test_high.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"113.499733 113.499733 1 1.0 0.0000 10.6536 200\n",
"107.319633 101.139534 2 2.0 0.0000 10.0568 75\n",
"112.751225 118.182816 4 4.0 0.0000 10.3804 114\n",
"109.374735 105.998245 8 8.0 0.0000 10.0007 125\n",
"105.644907 101.915078 16 16.0 0.0000 8.0501 69\n",
"102.383578 99.122249 32 32.0 0.0000 11.7805 179\n",
"107.211714 112.039850 64 64.0 0.0000 10.5021 335\n",
"111.094909 114.978105 128 128.0 0.0000 10.6815 236\n",
"111.666152 112.237394 256 256.0 0.0000 11.0607 323\n",
"109.238800 106.811447 512 512.0 0.0000 9.4624 101\n",
"109.461024 109.683248 1024 1024.0 0.0000 11.1749 137\n",
"118.899255 128.337486 2048 2048.0 0.0000 11.5257 75\n",
"121.063670 123.228085 4096 4096.0 0.0000 11.1480 132\n",
"118.235043 115.406417 8192 8192.0 0.0000 11.6428 287\n",
"117.539135 116.843227 16384 16384.0 0.0000 11.2021 135\n",
"117.178761 116.818386 32768 32768.0 0.0000 10.2871 95\n",
"117.374743 117.570725 65536 65536.0 0.0000 10.8540 163\n",
"117.787372 118.200001 131072 131072.0 0.0000 12.1189 125\n",
"\n",
"finished run\n",
"number of examples per pass = 173995\n",
"passes used = 1\n",
"weighted example sum = 173995.000000\n",
"weighted label sum = 0.000000\n",
"average loss = 118.251191\n",
"total feature number = 26036210\n",
"only testing\n",
"predictions = test_low.txt\n",
"Num weight bits = 18\n",
"learning rate = 10\n",
"initial_t = 1\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = test.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"105.438477 105.438477 1 1.0 0.0000 10.2683 200\n",
"101.695347 97.952217 2 2.0 0.0000 9.8971 75\n",
"107.531317 113.367287 4 4.0 0.0000 10.0901 114\n",
"105.094079 102.656841 8 8.0 0.0000 9.9564 125\n",
"100.753685 96.413290 16 16.0 0.0000 8.1239 69\n",
"97.406150 94.058615 32 32.0 0.0000 11.1240 179\n",
"101.150055 104.893961 64 64.0 0.0000 9.8346 335\n",
"104.861938 108.573820 128 128.0 0.0000 10.0408 236\n",
"104.944590 105.027242 256 256.0 0.0000 10.3332 323\n",
"105.249780 105.554970 512 512.0 0.0000 9.3182 101\n",
"104.736897 104.224014 1024 1024.0 0.0000 10.3392 137\n",
"110.274377 115.811856 2048 2048.0 0.0000 10.5131 75\n",
"107.636711 104.999044 4096 4096.0 0.0000 10.2691 132\n",
"106.784993 105.933276 8192 8192.0 0.0000 7.6446 287\n",
"106.552613 106.320233 16384 16384.0 0.0000 10.4032 135\n",
"107.536974 108.521335 32768 32768.0 0.0000 9.9372 95\n",
"108.379893 109.222811 65536 65536.0 0.0000 11.2499 163\n",
"109.039884 109.699875 131072 131072.0 0.0000 10.8236 125\n",
"\n",
"finished run\n",
"number of examples per pass = 173995\n",
"passes used = 1\n",
"weighted example sum = 173995.000000\n",
"weighted label sum = 0.000000\n",
"average loss = 109.413302\n",
"total feature number = 26036210\n"
]
}
],
"source": [
"!vw -t test.vw -i model_high.vw -p test_high.txt\n",
"!vw -t test.vw -i model_low.vw -p test_low.txt"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# exp() undoes VW's log-scale output for both per-model prediction files.\n",
"def _read_predictions(path):\n",
"    # Each file holds a single unnamed column of log-salary predictions.\n",
"    return exp(pd.read_csv(path, header=None).values[:, 0])\n",
"\n",
"prediction_high = _read_predictions('test_high.txt')\n",
"prediction_low = _read_predictions('test_low.txt')\n",
"predictions = 0.5 * prediction_low + 0.5 * prediction_high"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# One JSON record per output line; ids are aligned positionally with the\n",
"# prediction arrays built above.\n",
"with open('result.txt', 'w') as submit_file:\n",
"    for index, vacancy_id in enumerate(all_id):\n",
"        record = {\n",
"            'salary': {\n",
"                'predict': 'both',\n",
"                'to': prediction_high[index],\n",
"                'from': prediction_low[index],\n",
"                'currency': 'RUR',\n",
"            },\n",
"            'id': vacancy_id,\n",
"        }\n",
"        submit_file.write(json.dumps(record) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"salary\": {\"predict\": \"both\", \"to\": 42345.820257877342, \"from\": 28805.568553905428, \"currency\": \"RUR\"}, \"id\": \"9917565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 23314.132482652887, \"from\": 19872.278513333458, \"currency\": \"RUR\"}, \"id\": \"2917565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 84185.501988915072, \"from\": 71467.065486749969, \"currency\": \"RUR\"}, \"id\": \"9717565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 32221.33210766493, \"from\": 24102.359036012145, \"currency\": \"RUR\"}, \"id\": \"2717565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 121795.65982471367, \"from\": 87236.578486766928, \"currency\": \"RUR\"}, \"id\": \"5517565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 4919.2531540177242, \"from\": 6789.6522213536655, \"currency\": \"RUR\"}, \"id\": \"7437565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 44558.802483863459, \"from\": 27150.751376816497, \"currency\": \"RUR\"}, \"id\": \"6437565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 22041.140307096251, \"from\": 21087.231624169333, \"currency\": \"RUR\"}, \"id\": \"8337565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 35784.710494341045, \"from\": 24630.283465478988, \"currency\": \"RUR\"}, \"id\": \"0337565\"}\r\n",
"{\"salary\": {\"predict\": \"both\", \"to\": 29929.562999847221, \"from\": 16506.37584337965, \"currency\": \"RUR\"}, \"id\": \"0237565\"}\r\n",
"cat: ошибка записи: Обрыв канала\r\n"
]
}
],
"source": [
"!cat result.txt | head -10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment