Skip to content

Instantly share code, notes, and snippets.

@devashishd12
Created April 15, 2016 13:23
Show Gist options
  • Save devashishd12/1d7db5581d6f3bc12db2e88e33fc2005 to your computer and use it in GitHub Desktop.
Save devashishd12/1d7db5581d6f3bc12db2e88e33fc2005 to your computer and use it in GitHub Desktop.
CMFS on 20NG including naive bayes
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Implementation of Improved CMFS on 20NG"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\"\"\"\n",
"Authors: Abhirav Gholba\n",
" Bhargav Srinivasa\n",
" Devashish Deshpande\n",
" Gauri Kholkar\n",
" Mrunmayee Nasery\n",
"\"\"\"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score\n",
"import numpy as np\n",
"import operator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Load the 20 Newsgroups training split with headers, footers and quotes stripped\n",
"newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))\n",
"\n",
"# Bag-of-words counts with English stop words removed (one row per document)\n",
"vec = CountVectorizer(stop_words='english')\n",
"document_term_mat = vec.fit_transform(newsgroups_train.data)\n",
"\n",
"# Transpose so that rows are terms and columns are documents\n",
"term_document_mat = document_term_mat.transpose()\n",
"\n",
"# Corpus dimensions used by the cells below\n",
"terms = term_document_mat.shape[0]\n",
"categories = len(newsgroups_train.target_names)\n",
"documents = len(newsgroups_train.filenames)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print \"No. of documents: %d\\nNo. of categories: %d\" % (documents, categories)\n",
"print \"matrix.shape: {0}\".format(term_document_mat.shape)\n",
"print newsgroups_train.target[10]\n",
"print type(term_document_mat)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Term-category feature-appearance matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# term_category_mat[t, c] = number of documents labelled c whose column in\n",
"# term_document_mat has a nonzero entry for term t\n",
"term_category_mat = np.zeros((terms, categories))\n",
"for doc in range(documents):\n",
"    cat = newsgroups_train.target[doc]\n",
"    # nonzero()[0] gives the row (term) indices present in this document's column\n",
"    for row in term_document_mat.getcol(doc).nonzero()[0]:\n",
"        term_category_mat[row, cat] += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print \"Term-category matrix shape: {0}\".format(term_category_mat.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Perform CMFS on term-category matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Improved CMFS score with add-one smoothing, as computed in this cell:\n",
"#   CMFS(tk, ci) = P(tk|ci) * P(ci|tk) / P(ci)\n",
"#   P(tk|ci) = (count[tk, ci] + 1) / (term_freq_per_cat[ci] + terms)\n",
"#   P(ci|tk) = (count[tk, ci] + 1) / (total_term_freq[tk] + categories)\n",
"#   P(ci)    = term_freq_per_cat[ci] / documents\n",
"# NOTE(review): P(ci) divides a summed term count by the number of documents;\n",
"# confirm whether the per-category document count was intended instead.\n",
"\n",
"# Column sums: total of the term-category counts for each category\n",
"term_freq_per_cat = term_category_mat.sum(axis=0)\n",
"# Row sums: frequency of each term across all categories, kept as a column\n",
"# vector so it broadcasts against the (terms, categories) matrix\n",
"total_term_freq = term_category_mat.sum(axis=1)[:, np.newaxis]\n",
"\n",
"numerator = ((term_category_mat + 1) ** 2) * float(documents)\n",
"denominator = (total_term_freq + categories) * (term_freq_per_cat + terms) * term_freq_per_cat\n",
"\n",
"# The counts are replaced by the scores under the same name. Re-running this\n",
"# cell without rebuilding the counts would apply the formula to the scores.\n",
"term_category_mat = numerator / denominator\n",
"\n",
"# Final CMFS matrix\n",
"print term_category_mat"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create term-cmfs dictionary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create term id (i.e. row no) - CMFS dict\n",
"# Score every term by its largest CMFS value over the categories,\n",
"# keyed by term id (i.e. the row number in term_category_mat)\n",
"cmfs_max = term_category_mat.max(axis=1)\n",
"term_cmfs_dict = {term_id: cmfs_max[term_id] for term_id in range(terms)}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Extract top 10 features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Integer to term mapping dictionary\n",
"# Feature names indexed by term id, used to turn ids back into term strings\n",
"dictionary = vec.get_feature_names()\n",
"\n",
"# Rank (term id, CMFS) pairs by score, highest first, and keep the ten best\n",
"ranked_terms = sorted(term_cmfs_dict.items(), key=lambda pair: pair[1], reverse=True)\n",
"sorted_feature_list = ranked_terms[:10]\n",
"\n",
"print \"-------Selected features are-------\\n\"\n",
"for term_id, score in sorted_feature_list:\n",
"    print \"Term: {0} \\t CMFS: {1}\".format(dictionary[term_id], score)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Naive Bayes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Term ids of the selected features, in ranked order; these are column indices\n",
"# of the document-term matrix\n",
"feature_list = [pair[0] for pair in sorted_feature_list]\n",
"\n",
"# Slice the original document-term matrix (not the transposed one) so that its\n",
"# rows stay aligned with the training targets\n",
"selected_feature_matrix = document_term_mat[:, feature_list]\n",
"print selected_feature_matrix.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Train multinomial naive Bayes on the selected feature columns of the training set\n",
"clf = MultinomialNB()\n",
"clf.fit(selected_feature_matrix, newsgroups_train.target)\n",
"\n",
"# Load the test split with the same stripping and vectorize it with the\n",
"# vectorizer already fitted on the training data\n",
"newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))\n",
"document_term_mat_test = vec.transform(newsgroups_test.data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Evaluate accuracy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Predict a category for every test document using only the selected feature columns\n",
"predicted = clf.predict(document_term_mat_test[:, feature_list])\n",
"print predicted\n",
"\n",
"# Fraction of test documents whose predicted category equals the true label\n",
"print \"Accuracy: {0}\".format(accuracy_score(newsgroups_test.target, predicted))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment