{
"cells": [
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import datasets\n",
"from sklearn.model_selection import train_test_split\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparing train/test sets"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
"#directory = \"/Users/padjiman/data/KDDCUP1998/\"\n",
"directory = \"/Users/padjiman/data/bankVW/\"\n",
"data_file = \"train.vw\"\n",
"data = pd.read_csv(directory+data_file, header=None)\n",
"train, test = train_test_split(data, test_size=0.20 , random_state = 26 )\n",
"train.to_csv(directory+'split_train.vw', index=False, header=None)\n",
"test.to_csv(directory+'split_test.vw', index=False, header=None)"
]
},
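{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (added sketch, not part of the original run): confirm the 80/20 split produced the expected row counts and peek at a few lines of `split_train.vw` to verify each row is still a single VW-format example (`label |namespace features`). Assumes the file layout used above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check only: split sizes and a peek at the VW line format\n",
"print(len(train), 'train rows /', len(test), 'test rows')\n",
"with open(directory+'split_train.vw') as f:\n",
"    for _, line in zip(range(3), f):\n",
"        print(line.rstrip())"
]
},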
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Actual train and test"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"creating features for following interactions: ic \n",
"final_regressor = model.vw\n",
"Num weight bits = 26\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"decay_learning_rate = 1\n",
"using cache_file = split_train.vw.cache\n",
"ignoring text input in favor of cache input\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"0.693147 0.693147 1 1.0 1.0000 0.0000 60\n",
"1.109843 1.526538 2 2.0 -1.0000 1.2815 60\n",
"0.760195 0.410548 4 4.0 -1.0000 -0.2754 60\n",
"0.465710 0.171224 8 8.0 -1.0000 -0.7350 50\n",
"0.318235 0.170760 16 16.0 -1.0000 -2.7830 60\n",
"0.531552 0.744869 32 32.0 -1.0000 -6.1178 60\n",
"2.579228 4.626905 64 64.0 -1.0000 -3.2457 60\n",
"2.194169 1.809109 128 128.0 -1.0000 -2.9788 60\n",
"2.114863 2.035556 256 256.0 -1.0000 -1.4744 60\n",
"1.616420 1.117978 512 512.0 -1.0000 -1.9029 60\n",
"1.143370 0.670321 1024 1024.0 -1.0000 -3.5995 60\n",
"0.766199 0.389027 2048 2048.0 -1.0000 -3.3793 60\n",
"0.530801 0.295404 4096 4096.0 -1.0000 -4.4217 60\n",
"0.389938 0.249075 8192 8192.0 -1.0000 -2.8836 60\n",
"0.318798 0.247658 16384 16384.0 -1.0000 -3.9569 70\n",
"0.263437 0.263437 32768 32768.0 -1.0000 -4.6966 60 h\n",
"0.252186 0.240939 65536 65536.0 -1.0000 -4.0019 60 h\n",
"\n",
"finished run\n",
"number of examples per pass = 32552\n",
"passes used = 4\n",
"weighted example sum = 130208.000000\n",
"weighted label sum = -99792.000000\n",
"average loss = 0.239227 h\n",
"best constant = -2.023110\n",
"best constant's loss = 0.360496\n",
"total feature number = 7949680\n",
"creating features for following interactions: ic \n",
"only testing\n",
"predictions = preds.txt\n",
"Num weight bits = 26\n",
"learning rate = 0.5\n",
"initial_t = 0\n",
"power_t = 0.5\n",
"using no cache\n",
"Reading datafile = split_test.vw\n",
"num sources = 1\n",
"average since example example current current current\n",
"loss last counter weight label predict features\n",
"1.358600 1.358600 1 1.0 -1.0000 0.1029 60\n",
"7.592927 13.827254 2 2.0 -1.0000 0.0088 60\n",
"9.232505 10.872083 4 4.0 1.0000 0.1225 60\n",
"9.071785 8.911065 8 8.0 -1.0000 0.1698 60\n",
"9.430997 9.790208 16 16.0 -1.0000 0.0333 60\n",
"8.353756 7.276516 32 32.0 1.0000 0.2188 60\n",
"8.469951 8.586146 64 64.0 -1.0000 0.0321 60\n",
"7.585703 6.701455 128 128.0 -1.0000 0.0199 60\n",
"7.449527 7.313351 256 256.0 -1.0000 0.2331 60\n",
"7.174785 6.900042 512 512.0 -1.0000 0.1774 60\n",
"7.090911 7.007037 1024 1024.0 -1.0000 0.1080 60\n",
"7.169777 7.248644 2048 2048.0 -1.0000 0.0220 60\n",
"7.188963 7.208148 4096 4096.0 -1.0000 0.0207 60\n",
"7.211928 7.234893 8192 8192.0 -1.0000 0.0065 60\n",
"\n",
"finished run\n",
"number of examples per pass = 9043\n",
"passes used = 1\n",
"weighted example sum = 9043.000000\n",
"weighted label sum = -6955.000000\n",
"average loss = 7.205263\n",
"best constant = -0.769103\n",
"best constant's loss = 0.408480\n",
"total feature number = 552460\n"
]
}
],
"source": [
"!cd $directory && vw split_train.vw -c --passes 4 -f model.vw --loss_function logistic --interactions ic -b 26\n",
"!cd $directory && vw split_test.vw -t -i model.vw -p preds.txt --link logistic"
]
},
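{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before scoring, a small check (sketch added here, assuming `preds.txt` holds one probability per test example, as produced by `-p ... --link logistic`): the number of predictions should match the number of test lines, and every value should lie in [0, 1]."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: prediction count and probability range\n",
"with open(directory+'preds.txt') as f:\n",
"    probs = [float(line.split()[0]) for line in f if line.strip()]\n",
"print(len(probs), 'predictions for', len(test), 'test examples')\n",
"print('min/max probability:', min(probs), max(probs))"
]
},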
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Calculating the AUC"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.914318312778\n"
]
}
],
"source": [
"preds = pd.read_csv(directory+'preds.txt', header=None)\n",
"test_split = pd.read_csv(directory+'split_test.vw', header=None, sep = '|')\n",
"from sklearn import metrics\n",
"fpr, tpr, thresholds = metrics.roc_curve(test_split[0].values, preds[0].values)\n",
"auc = metrics.auc(fpr, tpr)\n",
"print(auc)"
]
}
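,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional visual check (added sketch): plot the ROC curve from the `fpr`/`tpr` arrays computed above, using the `matplotlib` import from the first cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot the ROC curve behind the AUC number printed above\n",
"plt.plot(fpr, tpr, label='VW logistic (AUC = %.3f)' % auc)\n",
"plt.plot([0, 1], [0, 1], linestyle='--', label='chance')\n",
"plt.xlabel('False positive rate')\n",
"plt.ylabel('True positive rate')\n",
"plt.legend(loc='lower right')\n",
"plt.show()"
]
}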
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}