Skip to content

Instantly share code, notes, and snippets.

@sowmyagowri
Created December 12, 2017 00:21
Show Gist options
  • Save sowmyagowri/7c7a9c5ae36b108a0b2aad0ac36ff4c1 to your computer and use it in GitHub Desktop.
Save sowmyagowri/7c7a9c5ae36b108a0b2aad0ac36ff4c1 to your computer and use it in GitHub Desktop.
Python Program for Drug Activity Prediction using Dimensionality Reduction and Classification
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 796,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.decomposition import PCA as sklearnPCA\n",
"from sklearn.naive_bayes import BernoulliNB"
]
},
{
"cell_type": "code",
"execution_count": 803,
"metadata": {},
"outputs": [],
"source": [
"#Read the input files and read every line\n",
"def loadData(trainingFile, testingFile):\n",
" \n",
" def convertDataframe(inputFile):\n",
" data = pd.DataFrame(columns=range(100000))\n",
" \n",
" for i in range(len(inputFile)):\n",
" record = np.fromstring(inputFile[i], dtype=int, sep=' ')\n",
" record_bool = [0 for j in range(100000)]\n",
" for col in record:\n",
" record_bool[col-1] = 1\n",
" \n",
" data.loc[i] = record_bool\n",
" \n",
" return data\n",
" \n",
" with open(trainingFile, \"r\") as fr1:\n",
" trainFile = fr1.readlines()\n",
" \n",
" #Split each line in the two files into label and data \n",
" train_data_list = []\n",
" train_labels_list = []\n",
" \n",
" for inputData in trainFile:\n",
" train_labels_list.append(inputData[0])\n",
" \n",
" #Remove the activity label (0/1) and new line character from each record\n",
" inputData = inputData.replace(\"0\\t\", \"\")\n",
" inputData = inputData.replace(\"1\\t\", \"\")\n",
" inputData = inputData.replace(\"\\n\", \"\")\n",
" train_data_list.append(inputData)\n",
" \n",
" train_labels = np.asarray(train_labels_list)\n",
" train_data = convertDataframe(train_data_list)\n",
" \n",
" with open(testingFile, \"r\") as fr2:\n",
" testFile = fr2.readlines()\n",
" \n",
" test_data = convertDataframe(testFile)\n",
" \n",
" return train_data, test_data, train_labels\n",
"\n",
"# Project data on a reduced dimensionality k using PCA\n",
"def pca(train_data, test_data, k):\n",
"\n",
" pca = sklearnPCA(n_components = k)\n",
" PCA_projected_trainData = pca.fit_transform(train_data)\n",
" PCA_projected_testData = pca.transform(test_data)\n",
" \n",
" return PCA_projected_trainData, PCA_projected_testData\n",
"\n",
"#Perform Bernoulli's Naive Bayes Classification\n",
"def classifier(PCA_projected_trainData, PCA_projected_testData, train_labels ):\n",
"\n",
" BNBC = BernoulliNB()\n",
" BNBC.fit(PCA_projected_trainData, train_labels)\n",
"\n",
" predictions = []\n",
"\n",
" predictions = BNBC.predict(PCA_projected_testData)\n",
"\n",
" return predictions"
]
},
{
"cell_type": "code",
"execution_count": 800,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"#Read the training and the test data set and get 3 separate dataframes of training reviews, test reviews and training labels\n",
"train_data, test_data, train_labels = loadData('train.dat', 'test.dat')"
]
},
{
"cell_type": "code",
"execution_count": 801,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Reduce the number of dimensions from 100000 to 100 using PCA\n",
"PCA_projected_trainData, PCA_projected_testData = pca(train_data, test_data, 100)"
]
},
{
"cell_type": "code",
"execution_count": 804,
"metadata": {},
"outputs": [],
"source": [
"#Classify data using Naive Bayes Classifier\n",
"predictions = classifier(PCA_projected_trainData, PCA_projected_testData, train_labels )"
]
},
{
"cell_type": "code",
"execution_count": 805,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Write the result to a .dat file\n",
"output = open('output-k-100-PCA-BNBC.dat', 'w')\n",
"\n",
"output.writelines( \"%s\\n\" % prediction for prediction in predictions )\n",
"\n",
"output.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment