Skip to content

Instantly share code, notes, and snippets.

@lxc-xx
Created October 19, 2014 05:15
Show Gist options
  • Save lxc-xx/298446b12182c51752d7 to your computer and use it in GitHub Desktop.
Save lxc-xx/298446b12182c51752d7 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "",
"signature": "sha256:b99bf1f68a92860ccebd342a666077e249dfb487cea12a567d68facfaa39ab39"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"@ morph, for the YSDA ML Trainings 18 October, 2014"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Download data"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"data_dir = 'tradeshift/'\n",
"!mkdir {data_dir}"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/train.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A06%3A50Z&sr=b&sp=r&sig=cupgPW%2BU6BpdsnrykcEBBRqLEW565pXYQ6k%2FSc0Me1M%3D' -O {data_dir + 'train.csv.gz'}"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/test.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A09%3A52Z&sr=b&sp=r&sig=YLQCFyAdhIRnz2o4p24zRssUjHYjQ1xOHuTKFsdLxu8%3D' -O {data_dir + 'test.csv.gz'}"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/trainLabels.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A11%3A04Z&sr=b&sp=r&sig=%2Bm9sbZYXOY8L80d1PJEdumGPXvkQby2rpkVOf1fvjUM%3D' -O {data_dir + 'trainLabels.csv.gz'}"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Unpack"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"%%time\n",
"\n",
"!gunzip {data_dir + '*.gz'}"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"!ls -l -h {data_dir}"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Big Data -- Sample Data!"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"train = pd.read_csv(data_dir + 'train.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"train.shape"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
"(1700000, 146)"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sample_size = 170000\n",
"ratio = train.shape[0] / sample_size\n",
"\n",
"train_sample = train[\n",
" [hash(id) % ratio == 0 for id in train['id']]\n",
"]\n",
"\n",
"train_sample.shape"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 16,
"text": [
"(170000, 146)"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"train_sample.to_csv(data_dir + 'train_sample.csv', index = False)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Free memory\n",
"\n",
"del train"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Try to make something useful"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"\n",
"data_dir = 'tradeshift/'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"train_sample = pd.read_csv(data_dir + 'train_sample.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"labels = pd.read_csv(data_dir + 'trainLabels.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"labels.columns"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"Index([u'id', u'y1', u'y2', u'y3', u'y4', u'y5', u'y6', u'y7', u'y8', u'y9', u'y10', u'y11', u'y12', u'y13', u'y14', u'y15', u'y16', u'y17', u'y18', u'y19', u'y20', u'y21', u'y22', u'y23', u'y24', u'y25', u'y26', u'y27', u'y28', u'y29', u'y30', u'y31', u'y32', u'y33'], dtype='object')"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"train_with_labels = pd.merge(train_sample, labels, on = 'id')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"train_with_labels.shape"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"(170000, 179)"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import Counter\n",
"\n",
"Counter([name[0] for name in train_with_labels.columns])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"Counter({'x': 145, 'y': 33, 'i': 1})"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"del labels\n",
"del train_sample"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test = pd.read_csv(data_dir + 'test.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Categorical values encoding"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.feature_extraction import DictVectorizer\n",
"\n",
"X_numerical = []\n",
"X_test_numerical = []\n",
"\n",
"vec = DictVectorizer()\n",
"\n",
"names_categorical = []\n",
"\n",
"train_with_labels.replace('YES', 1, inplace = True)\n",
"train_with_labels.replace('NO', 0, inplace = True)\n",
"train_with_labels.replace('nan', np.NaN, inplace = True)\n",
"\n",
"test.replace('YES', 1, inplace = True)\n",
"test.replace('NO', 0, inplace = True)\n",
"test.replace('nan', np.NaN, inplace = True)\n",
"\n",
"\n",
"for name in train_with_labels.columns : \n",
" if name.startswith('x') :\n",
" column_type, _ = max(Counter(map(lambda x: str(type(x)), train_with_labels[name])).items(), key = lambda x: x[1])\n",
" \n",
" # LOL expression\n",
" if column_type == str(str) :\n",
" train_with_labels[name] = map(str, train_with_labels[name])\n",
" test[name] = map(str, test[name])\n",
"\n",
" names_categorical.append(name)\n",
" print name, len(np.unique(train_with_labels[name]))\n",
" else :\n",
" X_numerical.append(train_with_labels[name].fillna(-999))\n",
" X_test_numerical.append(test[name].fillna(-999))\n",
" \n",
"X_numerical = np.column_stack(X_numerical)\n",
"X_test_numerical = np.column_stack(X_test_numerical)\n",
"\n",
"X_sparse = vec.fit_transform(train_with_labels[names_categorical].T.to_dict().values())\n",
"X_test_sparse = vec.transform(test[names_categorical].T.to_dict().values())\n",
"\n",
"print X_numerical.shape, X_sparse.shape, X_test_numerical.shape, X_test_sparse.shape"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"x3 "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"40882\n",
"x4"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 5019\n",
"x34"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 48090\n",
"x35"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 6797\n",
"x61"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 78363\n",
"x64"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 49408\n",
"x65"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 7035\n",
"x91"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 27960\n",
"x94"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 37786\n",
"x95"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 5086\n",
"(170000, 135)"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" (170000, 306426) (545082, 135) (545082, 306426)\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"X_numerical = np.nan_to_num(X_numerical)\n",
"X_test_numerical = np.nan_to_num(X_test_numerical)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.externals import joblib\n",
"\n",
"joblib.dump(\n",
" (X_numerical, X_sparse, X_test_numerical, X_test_sparse),\n",
" data_dir + 'X.dump',\n",
" compress = 1,\n",
")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Trying to predict something"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Build two level classifier, first train base level"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.metrics import roc_auc_score, f1_score, log_loss, make_scorer\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.cross_validation import cross_val_score, train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"log_loss_scorer = make_scorer(log_loss, needs_proba = True)\n",
"\n",
"y_columns = [name for name in train_with_labels.columns if name.startswith('y')]\n",
"\n",
"X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = train_test_split(\n",
" X_numerical, \n",
" X_sparse, \n",
" train_with_labels[y_columns].values,\n",
" test_size = 0.5\n",
")\n",
"\n",
"X_meta = [] \n",
"X_test_meta = []\n",
"\n",
"print \"Build meta\"\n",
"\n",
"for i in range(y_base.shape[1]) :\n",
" print i\n",
" \n",
" y = y_base[:, i]\n",
" if len(np.unique(y)) == 2 : \n",
" rf = RandomForestClassifier(n_estimators = 10, n_jobs = 1)\n",
" rf.fit(X_numerical_base, y)\n",
" X_meta.append(rf.predict_proba(X_numerical_meta))\n",
" X_test_meta.append(rf.predict_proba(X_test_numerical))\n",
"\n",
" svm = LinearSVC()\n",
" svm.fit(X_sparse_base, y)\n",
" X_meta.append(svm.decision_function(X_sparse_meta))\n",
" X_test_meta.append(svm.decision_function(X_test_sparse))\n",
" \n",
"X_meta = np.column_stack(X_meta)\n",
"X_test_meta = np.column_stack(X_test_meta)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Build meta\n",
"0\n",
"1"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print X_meta.shape, X_test_meta.shape"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Here train meta level and get predictions for test set"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p_test = []\n",
"\n",
"for i in range(y_base.shape[1]) :\n",
" y = y_meta[:, i]\n",
"\n",
" constant = Counter(y)\n",
" constant = constant[0] < 4 or constant[1] < 4\n",
" \n",
" predicted = None\n",
" \n",
" if constant :\n",
" # Best constant\n",
" constant_pred = np.mean(list(y_base[:, i]) + list(y_meta[:, i]))\n",
" \n",
" predicted = np.ones(X_test_meta.shape[0]) * constant_pred\n",
" print \"%d is constant like: %f\" % (i, constant_pred)\n",
" else :\n",
" rf = RandomForestClassifier(n_estimators=30, n_jobs = 1)\n",
" rf.fit(np.hstack([X_meta, X_numerical_meta]), y)\n",
"\n",
" predicted = rf.predict_proba(np.hstack([X_test_meta, X_test_numerical]))\n",
"\n",
" predicted = predicted[:, 1]\n",
" \n",
" rf = RandomForestClassifier(n_estimators=30, n_jobs = 1)\n",
" scores = cross_val_score(rf, np.hstack([X_meta, X_numerical_meta]), y, cv = 4, n_jobs = 1, scoring = log_loss_scorer)\n",
"\n",
" print i, 'RF log-loss: %.4f \u00b1 %.4f, mean = %.6f' %(np.mean(scores), np.std(scores), np.mean(predicted))\n",
"\n",
" \n",
" p_test.append(\n",
" predicted\n",
" )\n",
" \n",
"p_test = np.column_stack(p_test)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Save predictions"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p_test.shape"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import gzip\n",
"\n",
"def save_predictions(name, ids, predictions) :\n",
" out = gzip.open(name, 'w')\n",
" print >>out, 'id_label,pred'\n",
" for id, id_predictions in zip(test['id'], p_test) :\n",
" for y_id, pred in enumerate(id_predictions) :\n",
" if pred == 0 or pred == 1 :\n",
" pred = str(int(pred))\n",
" else :\n",
" pred = '%.6f' % pred\n",
" print >>out, '%d_y%d,%s' % (id, y_id + 1, pred)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"save_predictions('quick_start.csv.gz', test['id'].values, p_test)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!ls -l -h quick_start*.csv.gz"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Public result"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Quick start on 10% of train - 0.0212323"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment