yamasakih/#4.2-Comparison_of_speeds_of_DataFrame_and_svmlight.ipynb

## #4.2-Comparison_of_speeds_of_DataFrame_and_svmlight.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comparison of speeds of DataFrame and svmlight files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sys.version_info(major=3, minor=6, micro=2, releaselevel='final', serial=0)\n",
      "sklearn version = 0.20.0\n"
     ]
    }
   ],
   "source": [
    "from datetime import datetime\n",
    "from os.path import splitext\n",
    "import pickle\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import sklearn\n",
    "from sklearn.datasets import load_svmlight_file\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.svm import SVR\n",
    "\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "print(sys.version_info)\n",
    "print(f'sklearn version = {sklearn.__version__}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def svr_with_dataframe_data(input_file, parameters, n_jobs=10):\n",
    "    \"\"\"Fit an SVR on the first half of a dense CSV dataset and pickle it.\n",
    "\n",
    "    The CSV is read without a header row; column 0 is used as the target\n",
    "    and the remaining columns as features. The fitted model is written\n",
    "    next to the input file as ``<stem>.pkl``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    input_file : str\n",
    "        Path of the CSV file to load.\n",
    "    parameters : dict\n",
    "        Hyper-parameter grid. Accepted but currently unused (see NOTE).\n",
    "    n_jobs : int, default 10\n",
    "        Accepted but currently unused (see NOTE).\n",
    "    \"\"\"\n",
    "    df = pd.read_csv(input_file, header=None)\n",
    "    X, y = np.array(df.iloc[:, 1:]), np.array(df.iloc[:, 0])\n",
    "\n",
    "    # Only the first half of the rows is used for fitting.\n",
    "    n_train = X.shape[0] // 2\n",
    "    training_X, training_y = X[:n_train], y[:n_train]\n",
    "\n",
    "    # NOTE(review): a fixed SVR(C=1, gamma=1) is fitted, so `parameters`\n",
    "    # and `n_jobs` have no effect. A GridSearchCV(SVR(), parameters, cv=3,\n",
    "    # n_jobs=n_jobs) used to be built here but was overwritten before use;\n",
    "    # confirm whether the grid search was meant to run.\n",
    "    regr = SVR(C=1, gamma=1)\n",
    "    regr.fit(training_X, training_y)\n",
    "\n",
    "    stem, _ = splitext(input_file)\n",
    "    with open(f'{stem}.pkl', 'wb') as f:\n",
    "        pickle.dump(regr, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def svr_with_svmlight_data(input_file, parameters, n_jobs=10):\n",
    "    \"\"\"Fit an SVR on the first half of an svmlight dataset and pickle it.\n",
    "\n",
    "    Features and targets are read with ``load_svmlight_file``. The fitted\n",
    "    model is written next to the input file as ``<stem>.pkl``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    input_file : str\n",
    "        Path of the svmlight-format file to load.\n",
    "    parameters : dict\n",
    "        Hyper-parameter grid. Accepted but currently unused (see NOTE).\n",
    "    n_jobs : int, default 10\n",
    "        Accepted but currently unused (see NOTE).\n",
    "    \"\"\"\n",
    "    # Load the file that was asked for; a hard-coded 'svmlight_data.csv'\n",
    "    # here would make every benchmark run time the same dataset.\n",
    "    X, y = load_svmlight_file(input_file)\n",
    "\n",
    "    # Only the first half of the rows is used for fitting.\n",
    "    n_train = X.shape[0] // 2\n",
    "    training_X, training_y = X[:n_train], y[:n_train]\n",
    "\n",
    "    # NOTE(review): a fixed SVR(C=1, gamma=1) is fitted, so `parameters`\n",
    "    # and `n_jobs` have no effect. A GridSearchCV(SVR(), parameters, cv=3,\n",
    "    # n_jobs=n_jobs) used to be built here but was overwritten before use;\n",
    "    # confirm whether the grid search was meant to run.\n",
    "    regr = SVR(C=1, gamma=1)\n",
    "    regr.fit(training_X, training_y)\n",
    "\n",
    "    stem, _ = splitext(input_file)\n",
    "    with open(f'{stem}.pkl', 'wb') as f:\n",
    "        pickle.dump(regr, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Support Vector Machine regression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Set dataset parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset grid: each combination of the four settings below names one pair\n",
    "# of input files (svmlight and DataFrame flavour) used by the benchmark.\n",
    "num_sample_set = (1_000, 2_000, 4_000, 8_000)\n",
    "num_bit_set = (256, 512, 1_024, 2_048, 4_096)\n",
    "# presumably the fraction of bits set to 1 in each sample -- TODO confirm\n",
    "on_bit_ratios = (0.1, 0.3, 0.5, 0.7, 0.9)\n",
    "random_seeds = tuple(range(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Set parameters for Gridsearch CV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Powers-of-two grids, written as their base-2 exponents.\n",
    "c_exponents = (-2, -1, 0, 1, 2)\n",
    "gamma_exponents = (-4, -2, 0, 2, 4)\n",
    "\n",
    "parameters = {\n",
    "    'C': tuple(2**e for e in c_exponents),\n",
    "    'gamma': tuple(2**e for e in gamma_exponents),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Support Vector Machine regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# elapsed_time_ratios[sample, bit, ratio, seed] stores the svmlight run time\n",
    "# as a percentage of the DataFrame run time for that dataset configuration.\n",
    "grid_shape = (len(num_sample_set), len(num_bit_set), len(on_bit_ratios), len(random_seeds))\n",
    "elapsed_time_ratios = np.zeros(grid_shape)\n",
    "\n",
    "# np.ndindex walks the grid in C order, i.e. in the same order as four\n",
    "# nested loops over the settings (last index varies fastest).\n",
    "for sample_idx, bit_idx, ratio_idx, seed_idx in np.ndindex(*grid_shape):\n",
    "    num_samples = num_sample_set[sample_idx]\n",
    "    num_bits = num_bit_set[bit_idx]\n",
    "    on_bit_ratio = on_bit_ratios[ratio_idx]\n",
    "    random_seed = random_seeds[seed_idx]\n",
    "\n",
    "    file_name = f'{num_samples}_{num_bits}_{int(on_bit_ratio*100)}_{random_seed}.csv'\n",
    "    svmlight_input_file = f'dataset/svmlight_{file_name}'\n",
    "    dataframe_input_file = f'dataset/dataframe_{file_name}'\n",
    "\n",
    "    # Support Vector Machine regression with svmlight data\n",
    "    svmlight_start = datetime.now()\n",
    "    svr_with_svmlight_data(svmlight_input_file, parameters)\n",
    "    svmlight_end = datetime.now()\n",
    "\n",
    "    # Support Vector Machine regression with dataframe data; its clock\n",
    "    # starts at the timestamp that closed the svmlight run.\n",
    "    svr_with_dataframe_data(dataframe_input_file, parameters)\n",
    "    dataframe_end = datetime.now()\n",
    "\n",
    "    svmlight_elapsed = svmlight_end - svmlight_start\n",
    "    dataframe_elapsed = dataframe_end - svmlight_end\n",
    "    elapsed_time_ratios[sample_idx, bit_idx, ratio_idx, seed_idx] = svmlight_elapsed / dataframe_elapsed * 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EOF"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Comparison of speeds of DataFrame and svmlight files"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"sys.version_info(major=3, minor=6, micro=2, releaselevel='final', serial=0)\n",
	"sklearn version = 0.20.0\n"
	]
	}
	],
	"source": [
	"from datetime import datetime\n",
	"from os.path import splitext\n",
	"import pickle\n",
	"import sys\n",
	"\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"\n",
	"import sklearn\n",
	"from sklearn.datasets import load_svmlight_file\n",
	"from sklearn.model_selection import GridSearchCV\n",
	"from sklearn.svm import SVR\n",
	"\n",
	"\n",
	"%matplotlib inline\n",
	"\n",
	"print(sys.version_info)\n",
	"print(f'sklearn version = {sklearn.__version__}')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"def svr_with_dataframe_data(input_file, parameters, n_jobs=10):\n",
	"    \"\"\"Fit an SVR on the first half of a dense CSV dataset and pickle it.\n",
	"\n",
	"    The CSV is read without a header row; column 0 is used as the target\n",
	"    and the remaining columns as features. The fitted model is written\n",
	"    next to the input file as ``<stem>.pkl``.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    input_file : str\n",
	"        Path of the CSV file to load.\n",
	"    parameters : dict\n",
	"        Hyper-parameter grid. Accepted but currently unused (see NOTE).\n",
	"    n_jobs : int, default 10\n",
	"        Accepted but currently unused (see NOTE).\n",
	"    \"\"\"\n",
	"    df = pd.read_csv(input_file, header=None)\n",
	"    X, y = np.array(df.iloc[:, 1:]), np.array(df.iloc[:, 0])\n",
	"\n",
	"    # Only the first half of the rows is used for fitting.\n",
	"    n_train = X.shape[0] // 2\n",
	"    training_X, training_y = X[:n_train], y[:n_train]\n",
	"\n",
	"    # NOTE(review): a fixed SVR(C=1, gamma=1) is fitted, so `parameters`\n",
	"    # and `n_jobs` have no effect. A GridSearchCV(SVR(), parameters, cv=3,\n",
	"    # n_jobs=n_jobs) used to be built here but was overwritten before use;\n",
	"    # confirm whether the grid search was meant to run.\n",
	"    regr = SVR(C=1, gamma=1)\n",
	"    regr.fit(training_X, training_y)\n",
	"\n",
	"    stem, _ = splitext(input_file)\n",
	"    with open(f'{stem}.pkl', 'wb') as f:\n",
	"        pickle.dump(regr, f)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"def svr_with_svmlight_data(input_file, parameters, n_jobs=10):\n",
	"    \"\"\"Fit an SVR on the first half of an svmlight dataset and pickle it.\n",
	"\n",
	"    Features and targets are read with ``load_svmlight_file``. The fitted\n",
	"    model is written next to the input file as ``<stem>.pkl``.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    input_file : str\n",
	"        Path of the svmlight-format file to load.\n",
	"    parameters : dict\n",
	"        Hyper-parameter grid. Accepted but currently unused (see NOTE).\n",
	"    n_jobs : int, default 10\n",
	"        Accepted but currently unused (see NOTE).\n",
	"    \"\"\"\n",
	"    # Load the file that was asked for; a hard-coded 'svmlight_data.csv'\n",
	"    # here would make every benchmark run time the same dataset.\n",
	"    X, y = load_svmlight_file(input_file)\n",
	"\n",
	"    # Only the first half of the rows is used for fitting.\n",
	"    n_train = X.shape[0] // 2\n",
	"    training_X, training_y = X[:n_train], y[:n_train]\n",
	"\n",
	"    # NOTE(review): a fixed SVR(C=1, gamma=1) is fitted, so `parameters`\n",
	"    # and `n_jobs` have no effect. A GridSearchCV(SVR(), parameters, cv=3,\n",
	"    # n_jobs=n_jobs) used to be built here but was overwritten before use;\n",
	"    # confirm whether the grid search was meant to run.\n",
	"    regr = SVR(C=1, gamma=1)\n",
	"    regr.fit(training_X, training_y)\n",
	"\n",
	"    stem, _ = splitext(input_file)\n",
	"    with open(f'{stem}.pkl', 'wb') as f:\n",
	"        pickle.dump(regr, f)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 2. Support Vector Machine regression"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Set dataset parameters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Dataset grid: each combination of the four settings below names one pair\n",
	"# of input files (svmlight and DataFrame flavour) used by the benchmark.\n",
	"num_sample_set = (1_000, 2_000, 4_000, 8_000)\n",
	"num_bit_set = (256, 512, 1_024, 2_048, 4_096)\n",
	"# presumably the fraction of bits set to 1 in each sample -- TODO confirm\n",
	"on_bit_ratios = (0.1, 0.3, 0.5, 0.7, 0.9)\n",
	"random_seeds = tuple(range(5))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Set parameters for Gridsearch CV"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Powers-of-two grids, written as their base-2 exponents. Spelling the\n",
	"# exponents out keeps the `**` operators from being lost, which would turn\n",
	"# the grid into values such as 2-2 == 0 (an invalid C for SVR).\n",
	"c_exponents = (-2, -1, 0, 1, 2)\n",
	"gamma_exponents = (-4, -2, 0, 2, 4)\n",
	"\n",
	"parameters = {\n",
	"    'C': tuple(2**e for e in c_exponents),\n",
	"    'gamma': tuple(2**e for e in gamma_exponents),\n",
	"}"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Support Vector Machine regression"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# elapsed_time_ratios[sample, bit, ratio, seed] stores the svmlight run time\n",
	"# as a percentage of the DataFrame run time for that dataset configuration.\n",
	"grid_shape = (len(num_sample_set), len(num_bit_set), len(on_bit_ratios), len(random_seeds))\n",
	"elapsed_time_ratios = np.zeros(grid_shape)\n",
	"\n",
	"# np.ndindex walks the grid in C order, i.e. in the same order as four\n",
	"# nested loops over the settings (last index varies fastest).\n",
	"for sample_idx, bit_idx, ratio_idx, seed_idx in np.ndindex(*grid_shape):\n",
	"    num_samples = num_sample_set[sample_idx]\n",
	"    num_bits = num_bit_set[bit_idx]\n",
	"    on_bit_ratio = on_bit_ratios[ratio_idx]\n",
	"    random_seed = random_seeds[seed_idx]\n",
	"\n",
	"    file_name = f'{num_samples}_{num_bits}_{int(on_bit_ratio*100)}_{random_seed}.csv'\n",
	"    svmlight_input_file = f'dataset/svmlight_{file_name}'\n",
	"    dataframe_input_file = f'dataset/dataframe_{file_name}'\n",
	"\n",
	"    # Support Vector Machine regression with svmlight data\n",
	"    svmlight_start = datetime.now()\n",
	"    svr_with_svmlight_data(svmlight_input_file, parameters)\n",
	"    svmlight_end = datetime.now()\n",
	"\n",
	"    # Support Vector Machine regression with dataframe data; its clock\n",
	"    # starts at the timestamp that closed the svmlight run.\n",
	"    svr_with_dataframe_data(dataframe_input_file, parameters)\n",
	"    dataframe_end = datetime.now()\n",
	"\n",
	"    svmlight_elapsed = svmlight_end - svmlight_start\n",
	"    dataframe_elapsed = dataframe_end - svmlight_end\n",
	"    elapsed_time_ratios[sample_idx, bit_idx, ratio_idx, seed_idx] = svmlight_elapsed / dataframe_elapsed * 100"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# EOF"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}