Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save yamasakih/439d0ff1134fc7923911066959557408 to your computer and use it in GitHub Desktop.
Save yamasakih/439d0ff1134fc7923911066959557408 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Comparison of speeds of DataFrame and svmlight files"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sys.version_info(major=3, minor=6, micro=2, releaselevel='final', serial=0)\n",
"sklearn version = 0.20.0\n"
]
}
],
"source": [
"from datetime import datetime\n",
"from os.path import splitext\n",
"import pickle\n",
"import sys\n",
"from time import perf_counter\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import sklearn\n",
"from sklearn.datasets import load_svmlight_file\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.svm import SVR\n",
"\n",
"\n",
"%matplotlib inline\n",
"\n",
"print(sys.version_info)\n",
"print(f'sklearn version = {sklearn.__version__}')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def svr_with_dataframe_data(input_file, parameters, n_jobs=10):\n",
"    \"\"\"Fit an SVR on the first half of a dense CSV dataset and pickle it.\n",
"\n",
"    The CSV is read without a header row; column 0 is used as the target\n",
"    and all remaining columns as features.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    input_file : str\n",
"        Path of the CSV file. The fitted estimator is pickled next to it,\n",
"        with the file extension replaced by ``.pkl``.\n",
"    parameters : dict or None\n",
"        Hyper-parameter grid passed to ``GridSearchCV`` (3-fold CV). If it\n",
"        is empty or ``None``, a single ``SVR(C=1, gamma=1)`` is fitted.\n",
"    n_jobs : int, default 10\n",
"        Number of parallel jobs used by the grid search.\n",
"    \"\"\"\n",
"    df = pd.read_csv(input_file, header=None)\n",
"    X, y = np.array(df.iloc[:, 1:]), np.array(df.iloc[:, 0])\n",
"\n",
"    # Only the first half of the rows is used for training.\n",
"    n_train = X.shape[0] // 2\n",
"    training_X, training_y = X[:n_train], y[:n_train]\n",
"\n",
"    # Previously the grid search was built and then immediately replaced by\n",
"    # a fixed SVR, so `parameters` and `n_jobs` were silently ignored.\n",
"    if parameters:\n",
"        regr = GridSearchCV(SVR(), parameters, cv=3, n_jobs=n_jobs)\n",
"    else:\n",
"        regr = SVR(C=1, gamma=1)\n",
"    regr.fit(training_X, training_y)\n",
"\n",
"    stem, _ = splitext(input_file)\n",
"    with open(f'{stem}.pkl', 'wb') as f:\n",
"        pickle.dump(regr, f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def svr_with_svmlight_data(input_file, parameters, n_jobs=10):\n",
"    \"\"\"Fit an SVR on the first half of an svmlight-format dataset and pickle it.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    input_file : str\n",
"        Path of the svmlight/libsvm file. The fitted estimator is pickled\n",
"        next to it, with the file extension replaced by ``.pkl``.\n",
"    parameters : dict or None\n",
"        Hyper-parameter grid passed to ``GridSearchCV`` (3-fold CV). If it\n",
"        is empty or ``None``, a single ``SVR(C=1, gamma=1)`` is fitted.\n",
"    n_jobs : int, default 10\n",
"        Number of parallel jobs used by the grid search.\n",
"    \"\"\"\n",
"    # Load the file that was asked for; the path used to be hard-coded to\n",
"    # 'svmlight_data.csv', so every call read the same data.\n",
"    X, y = load_svmlight_file(input_file)\n",
"\n",
"    # Only the first half of the rows is used for training.\n",
"    n_train = X.shape[0] // 2\n",
"    training_X, training_y = X[:n_train], y[:n_train]\n",
"\n",
"    # Previously the grid search was built and then immediately replaced by\n",
"    # a fixed SVR, so `parameters` and `n_jobs` were silently ignored.\n",
"    if parameters:\n",
"        regr = GridSearchCV(SVR(), parameters, cv=3, n_jobs=n_jobs)\n",
"    else:\n",
"        regr = SVR(C=1, gamma=1)\n",
"    regr.fit(training_X, training_y)\n",
"\n",
"    stem, _ = splitext(input_file)\n",
"    with open(f'{stem}.pkl', 'wb') as f:\n",
"        pickle.dump(regr, f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Support Vector Machine regression"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Set dataset parameters"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Axes of the benchmark grid; each combination names one dataset file.\n",
"num_sample_set = (1_000, 2_000, 4_000, 8_000)  # rows per dataset\n",
"num_bit_set = tuple(2**exp for exp in range(8, 13))  # 256 ... 4096 bits\n",
"on_bit_ratios = (0.1, 0.3, 0.5, 0.7, 0.9)\n",
"random_seeds = tuple(range(5))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Set parameters for GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Search grid for SVR: C = 2**-2 ... 2**2, gamma = 2**-4 ... 2**4 (every other power).\n",
"parameters = dict(\n",
"    C=tuple(2**exp for exp in range(-2, 3)),\n",
"    gamma=tuple(2**exp for exp in range(-4, 5, 2)),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Support Vector Machine regression"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# One cell per (sample count, bit count, on-bit ratio, seed) combination.\n",
"# Each cell holds the svmlight run time as a percentage of the DataFrame\n",
"# run time for the same dataset (values below 100 mean svmlight was faster).\n",
"elapsed_time_ratios = np.zeros(\n",
"    (len(num_sample_set), len(num_bit_set), len(on_bit_ratios), len(random_seeds))\n",
")\n",
"for i, num_samples in enumerate(num_sample_set):\n",
"    for j, num_bits in enumerate(num_bit_set):\n",
"        for k, on_bit_ratio in enumerate(on_bit_ratios):\n",
"            for m, random_seed in enumerate(random_seeds):\n",
"                # round() rather than int(): float error must not truncate\n",
"                # the percentage used in the file name.\n",
"                file_name = f'{num_samples}_{num_bits}_{round(on_bit_ratio * 100)}_{random_seed}.csv'\n",
"                svmlight_input_file = f'dataset/svmlight_{file_name}'\n",
"                dataframe_input_file = f'dataset/dataframe_{file_name}'\n",
"\n",
"                # perf_counter() is a monotonic, high-resolution clock, so\n",
"                # the measured durations cannot be skewed by wall-clock jumps.\n",
"                svmlight_start = perf_counter()\n",
"\n",
"                # Support Vector Machine regression with svmlight data\n",
"                svr_with_svmlight_data(svmlight_input_file, parameters)\n",
"\n",
"                dataframe_start = perf_counter()\n",
"\n",
"                # Support Vector Machine regression with dataframe data\n",
"                svr_with_dataframe_data(dataframe_input_file, parameters)\n",
"\n",
"                dataframe_end = perf_counter()\n",
"\n",
"                svmlight_seconds = dataframe_start - svmlight_start\n",
"                dataframe_seconds = dataframe_end - dataframe_start\n",
"                elapsed_time_ratios[i, j, k, m] = svmlight_seconds / dataframe_seconds * 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# EOF"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment