sowmyagowri/Drug Activity Prediction.ipynb

## Drug Activity Prediction.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 796,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.decomposition import PCA as sklearnPCA\n",
    "from sklearn.naive_bayes import BernoulliNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 803,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Read the training and test files and build dense 0/1 feature tables\n",
    "def loadData(trainingFile, testingFile, n_features=100000):\n",
    "    \"\"\"Load the training and test files as dense binary feature tables.\n",
    "\n",
    "    A training line is an activity label, whitespace, then the 1-based\n",
    "    indices of the active features. A test line holds only the indices.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    trainingFile, testingFile : str\n",
    "        Paths of the labelled training file and the unlabelled test file.\n",
    "    n_features : int, optional\n",
    "        Number of feature columns to allocate (default 100000).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    train_data, test_data : pandas.DataFrame\n",
    "        One row per record and one 0/1 column per feature.\n",
    "    train_labels : numpy.ndarray\n",
    "        Label strings, one per training row.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    IndexError\n",
    "        If a feature index lies outside 1..n_features.\n",
    "    ValueError\n",
    "        If a feature token is not an integer.\n",
    "    \"\"\"\n",
    "\n",
    "    def convertDataframe(records):\n",
    "        # Fill one preallocated matrix; growing a DataFrame row by row\n",
    "        # copies it on every insert.\n",
    "        matrix = np.zeros((len(records), n_features), dtype=np.int8)\n",
    "\n",
    "        for row, record in enumerate(records):\n",
    "            # split() with no argument also discards the trailing newline\n",
    "            indices = np.array([int(token) for token in record.split()], dtype=int)\n",
    "            if indices.size == 0:\n",
    "                continue\n",
    "            # Reject index 0 explicitly: index - 1 would otherwise wrap\n",
    "            # around and silently set the last column.\n",
    "            if indices.min() < 1 or indices.max() > n_features:\n",
    "                raise IndexError(\n",
    "                    \"feature index outside 1..%d in record %d\" % (n_features, row))\n",
    "            matrix[row, indices - 1] = 1\n",
    "\n",
    "        return pd.DataFrame(matrix, columns=range(n_features))\n",
    "\n",
    "    with open(trainingFile, \"r\") as fr1:\n",
    "        trainFile = fr1.readlines()\n",
    "\n",
    "    #Split each training line into its label and its feature indices\n",
    "    train_data_list = []\n",
    "    train_labels_list = []\n",
    "\n",
    "    for inputData in trainFile:\n",
    "        # Split on the first whitespace run only, so digits inside the\n",
    "        # feature list are never mistaken for the label.\n",
    "        fields = inputData.split(None, 1)\n",
    "        if not fields:\n",
    "            # A blank line has no label to learn from\n",
    "            continue\n",
    "        train_labels_list.append(fields[0])\n",
    "        train_data_list.append(fields[1] if len(fields) > 1 else \"\")\n",
    "\n",
    "    train_labels = np.asarray(train_labels_list)\n",
    "    train_data = convertDataframe(train_data_list)\n",
    "\n",
    "    with open(testingFile, \"r\") as fr2:\n",
    "        testFile = fr2.readlines()\n",
    "\n",
    "    # Every test line is kept, blank or not, so that predictions stay\n",
    "    # aligned with the lines of the test file.\n",
    "    test_data = convertDataframe(testFile)\n",
    "\n",
    "    return train_data, test_data, train_labels\n",
    "\n",
    "# Project data on a reduced dimensionality k using PCA\n",
    "def pca(train_data, test_data, k, random_state=None):\n",
    "    \"\"\"Fit PCA on the training data and project both data sets onto k components.\n",
    "\n",
    "    The projection is fitted on `train_data` only and then applied\n",
    "    unchanged to `test_data`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    train_data, test_data : array-like, one row per sample\n",
    "    k : value passed to PCA as `n_components`\n",
    "    random_state : int or None, optional\n",
    "        Forwarded to PCA. The default (None) keeps the original unseeded\n",
    "        behaviour. NOTE(review): scikit-learn may pick a randomized SVD\n",
    "        solver for inputs of this size; pass a fixed integer if the\n",
    "        projection must be repeatable between runs.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of (projected training data, projected test data)\n",
    "    \"\"\"\n",
    "    # Named `reducer` so the local object does not shadow this function\n",
    "    reducer = sklearnPCA(n_components=k, random_state=random_state)\n",
    "    PCA_projected_trainData = reducer.fit_transform(train_data)\n",
    "    PCA_projected_testData = reducer.transform(test_data)\n",
    "\n",
    "    return PCA_projected_trainData, PCA_projected_testData\n",
    "\n",
    "#Perform Bernoulli's Naive Bayes Classification\n",
    "def classifier(PCA_projected_trainData, PCA_projected_testData, train_labels ):\n",
    "    \"\"\"Train a Bernoulli Naive Bayes model and predict labels for the test data.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    PCA_projected_trainData : features used to fit the model\n",
    "    PCA_projected_testData : features to predict labels for\n",
    "    train_labels : one label per row of PCA_projected_trainData\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The array returned by BernoulliNB.predict for the test features.\n",
    "    \"\"\"\n",
    "    # NOTE(review): BernoulliNB thresholds its inputs (binarize=0.0 by\n",
    "    # default), so only the sign of each projected component is used.\n",
    "    # Confirm this is intended for PCA output.\n",
    "    BNBC = BernoulliNB()\n",
    "    BNBC.fit(PCA_projected_trainData, train_labels)\n",
    "\n",
    "    return BNBC.predict(PCA_projected_testData)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 800,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#Read the training and the test data sets and get two dataframes (training features, test features) plus an array of training labels\n",
    "train_data, test_data, train_labels = loadData('train.dat', 'test.dat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 801,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Reduce the number of dimensions from 100000 to 100 using PCA\n",
    "PCA_projected_trainData, PCA_projected_testData = pca(train_data, test_data, 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 804,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Classify data using Naive Bayes Classifier\n",
    "predictions = classifier(PCA_projected_trainData, PCA_projected_testData, train_labels )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 805,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Write the result to a .dat file, one prediction per line\n",
    "# The with-block closes the file even if a write fails part-way\n",
    "with open('output-k-100-PCA-BNBC.dat', 'w') as output:\n",
    "    output.writelines( \"%s\\n\" % prediction for prediction in predictions )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 796,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"\n",
	"from sklearn.decomposition import PCA as sklearnPCA\n",
	"from sklearn.naive_bayes import BernoulliNB"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 803,
	"metadata": {},
	"outputs": [],
	"source": [
	"#Read the training and test files and build dense 0/1 feature tables\n",
	"def loadData(trainingFile, testingFile, n_features=100000):\n",
	"    \"\"\"Load the training and test files as dense binary feature tables.\n",
	"\n",
	"    A training line is an activity label, whitespace, then the 1-based\n",
	"    indices of the active features. A test line holds only the indices.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    trainingFile, testingFile : str\n",
	"        Paths of the labelled training file and the unlabelled test file.\n",
	"    n_features : int, optional\n",
	"        Number of feature columns to allocate (default 100000).\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    train_data, test_data : pandas.DataFrame\n",
	"        One row per record and one 0/1 column per feature.\n",
	"    train_labels : numpy.ndarray\n",
	"        Label strings, one per training row.\n",
	"\n",
	"    Raises\n",
	"    ------\n",
	"    IndexError\n",
	"        If a feature index lies outside 1..n_features.\n",
	"    ValueError\n",
	"        If a feature token is not an integer.\n",
	"    \"\"\"\n",
	"\n",
	"    def convertDataframe(records):\n",
	"        # Fill one preallocated matrix; growing a DataFrame row by row\n",
	"        # copies it on every insert.\n",
	"        matrix = np.zeros((len(records), n_features), dtype=np.int8)\n",
	"\n",
	"        for row, record in enumerate(records):\n",
	"            # split() with no argument also discards the trailing newline\n",
	"            indices = np.array([int(token) for token in record.split()], dtype=int)\n",
	"            if indices.size == 0:\n",
	"                continue\n",
	"            # Reject index 0 explicitly: index - 1 would otherwise wrap\n",
	"            # around and silently set the last column.\n",
	"            if indices.min() < 1 or indices.max() > n_features:\n",
	"                raise IndexError(\n",
	"                    \"feature index outside 1..%d in record %d\" % (n_features, row))\n",
	"            matrix[row, indices - 1] = 1\n",
	"\n",
	"        return pd.DataFrame(matrix, columns=range(n_features))\n",
	"\n",
	"    with open(trainingFile, \"r\") as fr1:\n",
	"        trainFile = fr1.readlines()\n",
	"\n",
	"    #Split each training line into its label and its feature indices\n",
	"    train_data_list = []\n",
	"    train_labels_list = []\n",
	"\n",
	"    for inputData in trainFile:\n",
	"        # Split on the first whitespace run only, so digits inside the\n",
	"        # feature list are never mistaken for the label.\n",
	"        fields = inputData.split(None, 1)\n",
	"        if not fields:\n",
	"            # A blank line has no label to learn from\n",
	"            continue\n",
	"        train_labels_list.append(fields[0])\n",
	"        train_data_list.append(fields[1] if len(fields) > 1 else \"\")\n",
	"\n",
	"    train_labels = np.asarray(train_labels_list)\n",
	"    train_data = convertDataframe(train_data_list)\n",
	"\n",
	"    with open(testingFile, \"r\") as fr2:\n",
	"        testFile = fr2.readlines()\n",
	"\n",
	"    # Every test line is kept, blank or not, so that predictions stay\n",
	"    # aligned with the lines of the test file.\n",
	"    test_data = convertDataframe(testFile)\n",
	"\n",
	"    return train_data, test_data, train_labels\n",
	"\n",
	"# Project data on a reduced dimensionality k using PCA\n",
	"def pca(train_data, test_data, k, random_state=None):\n",
	"    \"\"\"Fit PCA on the training data and project both data sets onto k components.\n",
	"\n",
	"    The projection is fitted on `train_data` only and then applied\n",
	"    unchanged to `test_data`.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    train_data, test_data : array-like, one row per sample\n",
	"    k : value passed to PCA as `n_components`\n",
	"    random_state : int or None, optional\n",
	"        Forwarded to PCA. The default (None) keeps the original unseeded\n",
	"        behaviour. NOTE(review): scikit-learn may pick a randomized SVD\n",
	"        solver for inputs of this size; pass a fixed integer if the\n",
	"        projection must be repeatable between runs.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    tuple of (projected training data, projected test data)\n",
	"    \"\"\"\n",
	"    # Named `reducer` so the local object does not shadow this function\n",
	"    reducer = sklearnPCA(n_components=k, random_state=random_state)\n",
	"    PCA_projected_trainData = reducer.fit_transform(train_data)\n",
	"    PCA_projected_testData = reducer.transform(test_data)\n",
	"\n",
	"    return PCA_projected_trainData, PCA_projected_testData\n",
	"\n",
	"#Perform Bernoulli's Naive Bayes Classification\n",
	"def classifier(PCA_projected_trainData, PCA_projected_testData, train_labels ):\n",
	"    \"\"\"Train a Bernoulli Naive Bayes model and predict labels for the test data.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    PCA_projected_trainData : features used to fit the model\n",
	"    PCA_projected_testData : features to predict labels for\n",
	"    train_labels : one label per row of PCA_projected_trainData\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    The array returned by BernoulliNB.predict for the test features.\n",
	"    \"\"\"\n",
	"    # NOTE(review): BernoulliNB thresholds its inputs (binarize=0.0 by\n",
	"    # default), so only the sign of each projected component is used.\n",
	"    # Confirm this is intended for PCA output.\n",
	"    BNBC = BernoulliNB()\n",
	"    BNBC.fit(PCA_projected_trainData, train_labels)\n",
	"\n",
	"    return BNBC.predict(PCA_projected_testData)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 800,
	"metadata": {
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"#Read the training and the test data sets and get two dataframes (training features, test features) plus an array of training labels\n",
	"train_data, test_data, train_labels = loadData('train.dat', 'test.dat')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 801,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#Reduce the number of dimensions from 100000 to 100 using PCA\n",
	"PCA_projected_trainData, PCA_projected_testData = pca(train_data, test_data, 100)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 804,
	"metadata": {},
	"outputs": [],
	"source": [
	"#Classify data using Naive Bayes Classifier\n",
	"predictions = classifier(PCA_projected_trainData, PCA_projected_testData, train_labels )"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 805,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#Write the result to a .dat file, one prediction per line\n",
	"# The with-block closes the file even if a write fails part-way\n",
	"with open('output-k-100-PCA-BNBC.dat', 'w') as output:\n",
	"    output.writelines( \"%s\\n\" % prediction for prediction in predictions )"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}