georgeodsc/topicmodeling.ipynb

## topicmodeling.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upper bound on the number of vocabulary terms kept by the vectorizer\n",
    "no_features = 6000\n",
    "\n",
    "# Initialize the tf-idf vectorizer\n",
    "tfidf_vectorizer = TfidfVectorizer(max_df=0.95,\n",
    "                                   max_features=no_features,\n",
    "                                   min_df=4, stop_words='english')\n",
    "# Fit the vectorizer on the corpus and build the tf-idf matrix\n",
    "tfidf = tfidf_vectorizer.fit_transform(docs)\n",
    "\n",
    "# get_feature_names() was removed in scikit-learn 1.2; prefer the newer\n",
    "# get_feature_names_out() and fall back to the old name on older releases\n",
    "if hasattr(tfidf_vectorizer, 'get_feature_names_out'):\n",
    "    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n",
    "else:\n",
    "    tfidf_feature_names = tfidf_vectorizer.get_feature_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of topics to extract\n",
    "no_topics = 7\n",
    "\n",
    "# Configure NMF and fit it on the tf-idf matrix in one chained call\n",
    "# NOTE(review): recent scikit-learn releases replace `alpha` with\n",
    "# `alpha_W`/`alpha_H` -- confirm against the installed version\n",
    "nmf = NMF(\n",
    "    n_components=no_topics,\n",
    "    init='nndsvd',\n",
    "    alpha=.1,\n",
    "    l1_ratio=.5,\n",
    "    random_state=1,\n",
    ").fit(tfidf)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_topics(model, feature_names, no_top_words):\n",
    "    \"\"\"Print the top-weighted words of every topic in a fitted model.\n",
    "\n",
    "    model: object exposing `components_`, iterated one row per topic.\n",
    "    feature_names: sequence mapping a column index to its word.\n",
    "    no_top_words: how many of the highest-weighted words to print per topic.\n",
    "    \"\"\"\n",
    "    for topic_number, weights in enumerate(model.components_, start=1):\n",
    "        # Last `no_top_words` positions of the argsort, taken in reverse order\n",
    "        top_indices = weights.argsort()[:-no_top_words - 1:-1]\n",
    "        top_words = [feature_names[idx] for idx in top_indices]\n",
    "        print(\"Topic %d:\" % topic_number)\n",
    "        print(\" \".join(top_words))\n",
    "        print(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic 1:\n",
      "ai intelligence artificial human machines marketing humans tasks business startups\n",
      "\n",
      "\n",
      "Topic 2:\n",
      "data science analytics big business scientists spark scientist skills hadoop\n",
      "\n",
      "\n",
      "Topic 3:\n",
      "learning machine deep algorithms ml learn systems computer google data\n",
      "\n",
      "\n",
      "Topic 4:\n",
      "new said technology like company people google says world companies\n",
      "\n",
      "\n",
      "Topic 5:\n",
      "iot devices internet things connected security smart sensors samsung home\n",
      "\n",
      "\n",
      "Topic 6:\n",
      "model neural regression data function network class training classification set\n",
      "\n",
      "\n",
      "Topic 7:\n",
      "customer chatbots chatbot customers bot bots marketing service messaging banks\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the ten top words of each topic found by NMF\n",
    "no_top_words = 10\n",
    "display_topics(model=nmf,\n",
    "               feature_names=tfidf_feature_names,\n",
    "               no_top_words=no_top_words)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Upper bound on the number of vocabulary terms kept by the vectorizer\n",
	"no_features = 6000\n",
	"\n",
	"# Initialize the tf-idf vectorizer\n",
	"tfidf_vectorizer = TfidfVectorizer(max_df=0.95,\n",
	"                                   max_features=no_features,\n",
	"                                   min_df=4, stop_words='english')\n",
	"# Fit the vectorizer on the corpus and build the tf-idf matrix\n",
	"tfidf = tfidf_vectorizer.fit_transform(docs)\n",
	"\n",
	"# get_feature_names() was removed in scikit-learn 1.2; prefer the newer\n",
	"# get_feature_names_out() and fall back to the old name on older releases\n",
	"if hasattr(tfidf_vectorizer, 'get_feature_names_out'):\n",
	"    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n",
	"else:\n",
	"    tfidf_feature_names = tfidf_vectorizer.get_feature_names()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Number of topics to extract\n",
	"no_topics = 7\n",
	"\n",
	"# Configure NMF and fit it on the tf-idf matrix in one chained call\n",
	"# NOTE(review): recent scikit-learn releases replace `alpha` with\n",
	"# `alpha_W`/`alpha_H` -- confirm against the installed version\n",
	"nmf = NMF(\n",
	"    n_components=no_topics,\n",
	"    init='nndsvd',\n",
	"    alpha=.1,\n",
	"    l1_ratio=.5,\n",
	"    random_state=1,\n",
	").fit(tfidf)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"def display_topics(model, feature_names, no_top_words):\n",
	"    \"\"\"Print the top-weighted words of every topic in a fitted model.\n",
	"\n",
	"    model: object exposing `components_`, iterated one row per topic.\n",
	"    feature_names: sequence mapping a column index to its word.\n",
	"    no_top_words: how many of the highest-weighted words to print per topic.\n",
	"    \"\"\"\n",
	"    for topic_number, weights in enumerate(model.components_, start=1):\n",
	"        # Last `no_top_words` positions of the argsort, taken in reverse order\n",
	"        top_indices = weights.argsort()[:-no_top_words - 1:-1]\n",
	"        top_words = [feature_names[idx] for idx in top_indices]\n",
	"        print(\"Topic %d:\" % topic_number)\n",
	"        print(\" \".join(top_words))\n",
	"        print(\"\\n\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Topic 1:\n",
	"ai intelligence artificial human machines marketing humans tasks business startups\n",
	"\n",
	"\n",
	"Topic 2:\n",
	"data science analytics big business scientists spark scientist skills hadoop\n",
	"\n",
	"\n",
	"Topic 3:\n",
	"learning machine deep algorithms ml learn systems computer google data\n",
	"\n",
	"\n",
	"Topic 4:\n",
	"new said technology like company people google says world companies\n",
	"\n",
	"\n",
	"Topic 5:\n",
	"iot devices internet things connected security smart sensors samsung home\n",
	"\n",
	"\n",
	"Topic 6:\n",
	"model neural regression data function network class training classification set\n",
	"\n",
	"\n",
	"Topic 7:\n",
	"customer chatbots chatbot customers bot bots marketing service messaging banks\n",
	"\n",
	"\n"
	]
	}
	],
	"source": [
	"# Print the ten top words of each topic found by NMF\n",
	"no_top_words = 10\n",
	"display_topics(model=nmf,\n",
	"               feature_names=tfidf_feature_names,\n",
	"               no_top_words=no_top_words)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}