Created
April 15, 2016 13:23
-
-
Save devashishd12/1d7db5581d6f3bc12db2e88e33fc2005 to your computer and use it in GitHub Desktop.
CMFS on 20NG including naive bayes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Implementation of Improved CMFS on the 20 Newsgroups (20NG) dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"\"\"\"\n", | |
"Authors: Abhirav Gholba\n", | |
" Bhargav Srinivasa\n", | |
" Devashish Deshpande\n", | |
" Gauri Kholkar\n", | |
" Mrunmayee Nasery\n", | |
"\"\"\"\n", | |
"from sklearn.datasets import fetch_20newsgroups\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"from sklearn.naive_bayes import MultinomialNB\n", | |
"from sklearn.metrics import accuracy_score\n", | |
"import numpy as np\n", | |
"import operator" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))\n", | |
"vec = CountVectorizer(stop_words='english')\n", | |
"document_term_mat = vec.fit_transform(newsgroups_train.data)\n", | |
"term_document_mat = document_term_mat.T\n", | |
"documents = len(newsgroups_train.filenames)\n", | |
"categories = len(newsgroups_train.target_names)\n", | |
"terms = term_document_mat.shape[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"print \"No. of documents: %d\\nNo. of categories: %d\" % (documents, categories)\n", | |
"print \"matrix.shape: {0}\".format(term_document_mat.shape)\n", | |
"print newsgroups_train.target[10]\n", | |
"print type(term_document_mat)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Create Term-category feature-appearance matrix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"term_category_mat = np.zeros((terms, categories))\n", | |
"for doc in range(documents):\n", | |
" cat = newsgroups_train.target[doc]\n", | |
" for row in term_document_mat.getcol(doc).nonzero()[0]:\n", | |
" term_category_mat[row][cat] += 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"print \"Term-category matrix shape: {0}\".format(term_category_mat.shape)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Perform CMFS on term-category matrix" | |
] | |
}, | |
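{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"term_freq_per_cat = np.cumsum(term_category_mat, axis=0)[-1, :]\n", | |
"for term in range(terms):\n", | |
" # CMFS(tk, ci) = P(tk|ci) * P(ci|tk) / P(ci), with add-one smoothing on both conditionals.\n", | |
" # The scores overwrite the counts in term_category_mat, so this cell must not be re-run on its own.\n", | |
" # NOTE(review): P(ci) is taken as term_freq_per_cat[cat] / documents; confirm a per-category document count was not intended.\n", | |
" # Frequency of the term across all categories\n", | |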
" total_term_freq = sum(term_category_mat[term, :])\n", | |
" for cat in range(categories):\n", | |
" numerator = float(((term_category_mat[term][cat] + 1) ** 2) * documents)\n", | |
" denominator = (total_term_freq + categories) * (term_freq_per_cat[cat] + terms) * term_freq_per_cat[cat]\n", | |
" term_category_mat[term][cat] = numerator / denominator\n", | |
" \n", | |
"# Final CMFS matrix\n", | |
"print term_category_mat" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Create term-CMFS dictionary" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create term id (i.e. row no) - CMFS dict\n", | |
"term_cmfs_dict = {}\n", | |
"cmfs_max = np.max(term_category_mat, axis=1)\n", | |
"for i in range(terms):\n", | |
" term_cmfs_dict[i] = cmfs_max[i]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Extract top 10 features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Integer to term mapping dictionary\n", | |
"dictionary = vec.get_feature_names()\n", | |
"\n", | |
"sorted_feature_list = sorted(term_cmfs_dict.items(), key=operator.itemgetter(1), reverse=True)[:10]\n", | |
"print \"-------Selected features are-------\\n\"\n", | |
"for term, cmfs in sorted_feature_list:\n", | |
" print \"Term: {0} \\t CMFS: {1}\".format(dictionary[term], cmfs)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Naive Bayes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"feature_list = [term for term, _ in sorted_feature_list]\n", | |
"# Keep only the columns of the selected features. Slicing the original document-term matrix\n", | |
"# leaves its rows (documents) in place, so they stay aligned with newsgroups_train.target.\n", | |
"selected_feature_matrix = document_term_mat[:, feature_list]\n", | |
"print selected_feature_matrix.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))\n", | |
"document_term_mat_test = vec.transform(newsgroups_test.data)\n", | |
"clf = MultinomialNB().fit(selected_feature_matrix, newsgroups_train.target)" | |
] | |
}, | |
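{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Evaluate accuracy\n", | |
"\n", | |
"**TODO:** the cell below only prints the predicted labels; compare them with `newsgroups_test.target` using the imported `accuracy_score` to report the accuracy." | |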
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"print clf.predict(document_term_mat_test[:, feature_list])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment