devashishd12/CMFS_20NG.ipynb

## CMFS_20NG.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Implementation of Improved CMFS on 20NG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Authors: Abhirav Gholba\n",
    "         Bhargav Srinivasa\n",
    "         Devashish Deshpande\n",
    "         Gauri Kholkar\n",
    "         Mrunmayee Nasery\n",
    "\"\"\"\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.metrics import accuracy_score\n",
    "import numpy as np\n",
    "import operator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load the 20 Newsgroups training split with headers, footers and quotes removed\n",
    "newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))\n",
    "categories = len(newsgroups_train.target_names)\n",
    "documents = len(newsgroups_train.filenames)\n",
    "\n",
    "# Count matrix built by the vectorizer (one row per document, one column per term)\n",
    "vec = CountVectorizer(stop_words='english')\n",
    "document_term_mat = vec.fit_transform(newsgroups_train.data)\n",
    "\n",
    "# Transposed matrix: one row per term, one column per document\n",
    "term_document_mat = document_term_mat.T\n",
    "terms = document_term_mat.shape[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Quick sanity checks on the loaded data (single-argument prints behave the same\n",
    "# under the Python 2 print statement and the Python 3 print function)\n",
    "print(\"No. of documents: %d\\nNo. of categories: %d\" % (documents, categories))\n",
    "print(\"matrix.shape: {0}\".format(term_document_mat.shape))\n",
    "print(newsgroups_train.target[10])\n",
    "print(type(term_document_mat))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Term-category feature-appearance matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# For every (term, category) pair, count the training documents of that category\n",
    "# in which the term has a non-zero entry.\n",
    "term_category_mat = np.zeros((terms, categories))\n",
    "# Coordinates of all non-zero entries: term row index and document column index\n",
    "term_rows, doc_cols = term_document_mat.nonzero()\n",
    "# Category label of the document behind each non-zero entry\n",
    "entry_cats = np.asarray(newsgroups_train.target)[doc_cols]\n",
    "# np.add.at performs unbuffered accumulation, so repeated (term, category)\n",
    "# index pairs each contribute +1 instead of being collapsed into one update.\n",
    "np.add.at(term_category_mat, (term_rows, entry_cats), 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Expect (number of terms, number of categories)\n",
    "print(\"Term-category matrix shape: {0}\".format(term_category_mat.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Perform CMFS on term-category matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Column sums: total of the term-appearance counts recorded for each category\n",
    "term_freq_per_cat = term_category_mat.sum(axis=0)\n",
    "# Row sums: frequency of each term across all categories, kept as a column\n",
    "# vector so that it broadcasts against the (terms, categories) matrix\n",
    "total_term_freq = term_category_mat.sum(axis=1)[:, np.newaxis]\n",
    "\n",
    "# CMFS(tk,ci) = (P(tk|ci)*P(ci|tk))/P(ci), evaluated for all terms and categories at once.\n",
    "# The +1, +categories and +terms offsets are the same smoothing terms as in the\n",
    "# element-wise formulation.\n",
    "numerator = ((term_category_mat + 1) ** 2) * float(documents)\n",
    "denominator = (total_term_freq + categories) * (term_freq_per_cat + terms) * term_freq_per_cat\n",
    "# NOTE: this replaces the appearance counts with CMFS scores, so the cell that builds\n",
    "# term_category_mat must be re-run before running this cell a second time.\n",
    "term_category_mat = numerator / denominator\n",
    "\n",
    "# Final CMFS matrix\n",
    "print(term_category_mat)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create term-cmfs dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Map each term id (i.e. row no) to its largest CMFS score over all categories\n",
    "cmfs_max = term_category_mat.max(axis=1)\n",
    "term_cmfs_dict = dict(enumerate(cmfs_max))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract top 10 features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Integer to term mapping: indexed by term id to recover the term string\n",
    "dictionary = vec.get_feature_names()\n",
    "\n",
    "# Rank (term id, CMFS) pairs by score, highest first, and keep the ten best\n",
    "ranked_terms = sorted(term_cmfs_dict.items(), key=lambda pair: pair[1], reverse=True)\n",
    "sorted_feature_list = ranked_terms[:10]\n",
    "\n",
    "print(\"-------Selected features are-------\\n\")\n",
    "for term_id, score in sorted_feature_list:\n",
    "    print(\"Term: {0} \\t CMFS: {1}\".format(dictionary[term_id], score))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Term ids (column indices) of the selected features, best score first\n",
    "feature_list = [pair[0] for pair in sorted_feature_list]\n",
    "# Keep only the selected feature columns. The slice is taken from the original\n",
    "# document-term matrix so that its rows stay aligned with the training targets.\n",
    "selected_feature_matrix = document_term_mat[:, feature_list]\n",
    "print(selected_feature_matrix.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Train multinomial Naive Bayes on the selected feature columns only\n",
    "clf = MultinomialNB().fit(selected_feature_matrix, newsgroups_train.target)\n",
    "\n",
    "# Load the test split with the same filtering and vectorize it with the\n",
    "# vectorizer that was fitted on the training data\n",
    "newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))\n",
    "document_term_mat_test = vec.transform(newsgroups_test.data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Predict on the same selected feature columns the classifier was trained on\n",
    "predicted = clf.predict(document_term_mat_test[:, feature_list])\n",
    "print(predicted)\n",
    "# Score the predictions against the true test labels with accuracy_score\n",
    "print(\"Accuracy: {0}\".format(accuracy_score(newsgroups_test.target, predicted)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Implementation of Improved CMFS on 20NG"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"\"\"\"\n",
	"Authors: Abhirav Gholba\n",
	" Bhargav Srinivasa\n",
	" Devashish Deshpande\n",
	" Gauri Kholkar\n",
	" Mrunmayee Nasery\n",
	"\"\"\"\n",
	"from sklearn.datasets import fetch_20newsgroups\n",
	"from sklearn.feature_extraction.text import CountVectorizer\n",
	"from sklearn.naive_bayes import MultinomialNB\n",
	"from sklearn.metrics import accuracy_score\n",
	"import numpy as np\n",
	"import operator"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Load the 20 Newsgroups training split with headers, footers and quotes removed\n",
	"newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))\n",
	"categories = len(newsgroups_train.target_names)\n",
	"documents = len(newsgroups_train.filenames)\n",
	"\n",
	"# Count matrix built by the vectorizer (one row per document, one column per term)\n",
	"vec = CountVectorizer(stop_words='english')\n",
	"document_term_mat = vec.fit_transform(newsgroups_train.data)\n",
	"\n",
	"# Transposed matrix: one row per term, one column per document\n",
	"term_document_mat = document_term_mat.T\n",
	"terms = document_term_mat.shape[1]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Quick sanity checks on the loaded data (single-argument prints behave the same\n",
	"# under the Python 2 print statement and the Python 3 print function)\n",
	"print(\"No. of documents: %d\\nNo. of categories: %d\" % (documents, categories))\n",
	"print(\"matrix.shape: {0}\".format(term_document_mat.shape))\n",
	"print(newsgroups_train.target[10])\n",
	"print(type(term_document_mat))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Create Term-category feature-appearance matrix"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# For every (term, category) pair, count the training documents of that category\n",
	"# in which the term has a non-zero entry.\n",
	"term_category_mat = np.zeros((terms, categories))\n",
	"# Coordinates of all non-zero entries: term row index and document column index\n",
	"term_rows, doc_cols = term_document_mat.nonzero()\n",
	"# Category label of the document behind each non-zero entry\n",
	"entry_cats = np.asarray(newsgroups_train.target)[doc_cols]\n",
	"# np.add.at performs unbuffered accumulation, so repeated (term, category)\n",
	"# index pairs each contribute +1 instead of being collapsed into one update.\n",
	"np.add.at(term_category_mat, (term_rows, entry_cats), 1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Expect (number of terms, number of categories)\n",
	"print(\"Term-category matrix shape: {0}\".format(term_category_mat.shape))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Perform CMFS on term-category matrix"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Column sums: total of the term-appearance counts recorded for each category\n",
	"term_freq_per_cat = term_category_mat.sum(axis=0)\n",
	"# Row sums: frequency of each term across all categories, kept as a column\n",
	"# vector so that it broadcasts against the (terms, categories) matrix\n",
	"total_term_freq = term_category_mat.sum(axis=1)[:, np.newaxis]\n",
	"\n",
	"# CMFS(tk,ci) = (P(tk|ci)*P(ci|tk))/P(ci), evaluated for all terms and categories at once.\n",
	"# The +1, +categories and +terms offsets are the same smoothing terms as in the\n",
	"# element-wise formulation.\n",
	"numerator = ((term_category_mat + 1) ** 2) * float(documents)\n",
	"denominator = (total_term_freq + categories) * (term_freq_per_cat + terms) * term_freq_per_cat\n",
	"# NOTE: this replaces the appearance counts with CMFS scores, so the cell that builds\n",
	"# term_category_mat must be re-run before running this cell a second time.\n",
	"term_category_mat = numerator / denominator\n",
	"\n",
	"# Final CMFS matrix\n",
	"print(term_category_mat)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Create term-cmfs dictionary"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Map each term id (i.e. row no) to its largest CMFS score over all categories\n",
	"cmfs_max = term_category_mat.max(axis=1)\n",
	"term_cmfs_dict = dict(enumerate(cmfs_max))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Extract top 10 features"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Integer to term mapping: indexed by term id to recover the term string\n",
	"dictionary = vec.get_feature_names()\n",
	"\n",
	"# Rank (term id, CMFS) pairs by score, highest first, and keep the ten best\n",
	"ranked_terms = sorted(term_cmfs_dict.items(), key=lambda pair: pair[1], reverse=True)\n",
	"sorted_feature_list = ranked_terms[:10]\n",
	"\n",
	"print(\"-------Selected features are-------\\n\")\n",
	"for term_id, score in sorted_feature_list:\n",
	"    print(\"Term: {0} \\t CMFS: {1}\".format(dictionary[term_id], score))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Naive Bayes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Term ids (column indices) of the selected features, best score first\n",
	"feature_list = [pair[0] for pair in sorted_feature_list]\n",
	"# Keep only the selected feature columns. The slice is taken from the original\n",
	"# document-term matrix so that its rows stay aligned with the training targets.\n",
	"selected_feature_matrix = document_term_mat[:, feature_list]\n",
	"print(selected_feature_matrix.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Train multinomial Naive Bayes on the selected feature columns only\n",
	"clf = MultinomialNB().fit(selected_feature_matrix, newsgroups_train.target)\n",
	"\n",
	"# Load the test split with the same filtering and vectorize it with the\n",
	"# vectorizer that was fitted on the training data\n",
	"newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))\n",
	"document_term_mat_test = vec.transform(newsgroups_test.data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Evaluate accuracy"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Predict on the same selected feature columns the classifier was trained on\n",
	"predicted = clf.predict(document_term_mat_test[:, feature_list])\n",
	"print(predicted)\n",
	"# Score the predictions against the true test labels with accuracy_score\n",
	"print(\"Accuracy: {0}\".format(accuracy_score(newsgroups_test.target, predicted)))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}