georgehc/95-865 Latent Dirichlet Allocation demo.ipynb Secret

## 95-865 Latent Dirichlet Allocation demo.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# shortened and modified version of sklearn's LDA & NMF demo (http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html)\n",
    "\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "num_articles = 10000\n",
    "data = fetch_20newsgroups(shuffle=True, random_state=0,\n",
    "                          remove=('headers', 'footers', 'quotes')).data[:num_articles]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "The last name is Niedermayer, as in New Jersey's Scott's last name, because\n",
      "(you guessed it) they are brothers.  But Rob Niedermayer is a center, not\n",
      "a defenseman.\n",
      "\n",
      "I am not sure that the Sharks will take Kariya.  They aren't saying much, but\n",
      "they apparently like Niedermayer and Victor Kozlov, along with Kariya.  Chris\n",
      "Pronger's name has also been mentioned.  My guess is that they'll take\n",
      "Niedermayer.  They may take Pronger, except that they already have too many\n",
      "defensive prospects.\n"
     ]
    }
   ],
   "source": [
    "# you can take a look at what individual documents look like by replacing what index we look at\n",
    "print(data[5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_size = 1000\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "# document frequency (df) means number of documents a word appears in\n",
    "tf_vectorizer = CountVectorizer(max_df=0.95,\n",
    "                                min_df=2,\n",
    "                                max_features=vocab_size,\n",
    "                                stop_words='english')\n",
    "tf = tf_vectorizer.fit_transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['00', '000', '02', '03', '04', '0d', '0t', '10', '100', '11', '12', '128', '13', '14', '145', '15', '16', '17', '18', '19', '1990', '1991', '1992', '1993', '1d9', '1st', '1t', '20', '200', '21', '22', '23', '24', '25', '250', '26', '27', '28', '29', '2di', '2tm', '30', '300', '31', '32', '33', '34', '34u', '35', '36', '37', '38', '39', '3d', '3t', '40', '41', '42', '43', '44', '45', '46', '48', '50', '500', '55', '60', '64', '6um', '70', '75', '75u', '7ey', '80', '800', '86', '90', '91', '92', '93', '9v', 'a86', 'ability', 'able', 'ac', 'accept', 'access', 'according', 'act', 'action', 'actually', 'add', 'addition', 'address', 'administration', 'advance', 'age', 'ago', 'agree', 'ah', 'air', 'al', 'algorithm', 'allow', 'allowed', 'alt', 'america', 'american', 'analysis', 'anonymous', 'answer', 'answers', 'anti', 'anybody', 'apparently', 'appears', 'apple', 'application', 'applications', 'appreciate', 'appreciated', 'approach', 'appropriate', 'apr', 'april', 'archive', 'area', 'areas', 'aren', 'argument', 'armenia', 'armenian', 'armenians', 'arms', 'army', 'article', 'articles', 'ask', 'asked', 'asking', 'assume', 'atheism', 'attack', 'attempt', 'au', 'author', 'authority', 'available', 'average', 'avoid', 'away', 'ax', 'b8f', 'bad', 'base', 'based', 'basic', 'basically', 'basis', 'belief', 'believe', 'best', 'better', 'bh', 'bhj', 'bible', 'big', 'bike', 'bit', 'bits', 'bj', 'black', 'block', 'blood', 'board', 'body', 'book', 'books', 'bought', 'box', 'break', 'bring', 'brought', 'btw', 'buf', 'build', 'building', 'built', 'bus', 'business', 'buy', 'bxn', 'ca', 'cable', 'california', 'called', 'calls', 'came', 'canada', 'car', 'card', 'cards', 'care', 'carry', 'cars', 'case', 'cases', 'cause', 'cd', 'center', 'certain', 'certainly', 'chance', 'change', 'changed', 'changes', 'check', 'chicago', 'child', 'children', 'chip', 'chips', 'choice', 'christ', 'christian', 'christianity', 'christians', 'church', 'citizens', 'city', 'claim', 'claims', 'class', 'clear', 
'clearly', 'clinton', 'clipper', 'close', 'code', 'color', 'com', 'come', 'comes', 'coming', 'command', 'comments', 'commercial', 'committee', 'common', 'community', 'comp', 'company', 'complete', 'completely', 'computer', 'condition', 'conference', 'congress', 'consider', 'considered', 'contact', 'contains', 'context', 'continue', 'control', 'controller', 'copy', 'correct', 'cost', 'couldn', 'country', 'couple', 'course', 'court', 'cover', 'create', 'created', 'crime', 'cross', 'cs', 'current', 'currently', 'cut', 'cx', 'data', 'date', 'dave', 'david', 'day', 'days', 'db', 'dc', 'dead', 'deal', 'death', 'dec', 'decided', 'defense', 'define', 'deleted', 'department', 'des', 'design', 'designed', 'details', 'development', 'device', 'devices', 'did', 'didn', 'difference', 'different', 'difficult', 'digital', 'directly', 'directory', 'discussion', 'disk', 'display', 'distribution', 'division', 'dod', 'does', 'doesn', 'doing', 'don', 'door', 'dos', 'doubt', 'dr', 'drive', 'driver', 'drivers', 'drives', 'drug', 'early', 'earth', 'easily', 'east', 'easy', 'ed', 'edu', 'effect', 'electronic', 'email', 'encryption', 'end', 'enforcement', 'engine', 'entire', 'entry', 'environment', 'error', 'escrow', 'especially', 'event', 'events', 'evidence', 'exactly', 'example', 'excellent', 'exist', 'existence', 'exists', 'expect', 'experience', 'explain', 'export', 'extra', 'face', 'fact', 'faith', 'false', 'family', 'faq', 'far', 'fast', 'faster', 'father', 'fax', 'fbi', 'features', 'federal', 'feel', 'field', 'figure', 'file', 'files', 'final', 'finally', 'fine', 'firearms', 'floppy', 'folks', 'follow', 'following', 'food', 'force', 'form', 'format', 'free', 'freedom', 'friend', 'ftp', 'function', 'functions', 'future', 'g9v', 'game', 'games', 'gas', 'gave', 'general', 'generally', 'gets', 'getting', 'gif', 'given', 'gives', 'giz', 'gk', 'gm', 'goal', 'god', 'goes', 'going', 'good', 'got', 'gov', 'government', 'graphics', 'great', 'greek', 'ground', 'group', 'groups', 'guess', 
'gun', 'guns', 'guy', 'half', 'hand', 'happen', 'happened', 'happens', 'hard', 'hardware', 'haven', 'having', 'head', 'health', 'hear', 'heard', 'held', 'hell', 'help', 'hi', 'high', 'higher', 'history', 'hit', 'hockey', 'hold', 'home', 'hope', 'hours', 'house', 'hp', 'human', 'ibm', 'ide', 'idea', 'ideas', 'ii', 'image', 'images', 'imagine', 'important', 'include', 'included', 'includes', 'including', 'individual', 'info', 'information', 'input', 'inside', 'installed', 'instead', 'insurance', 'int', 'interested', 'interesting', 'interface', 'internal', 'international', 'internet', 'involved', 'isn', 'israel', 'israeli', 'issue', 'issues', 'jesus', 'jewish', 'jews', 'jim', 'job', 'jobs', 'john', 'jpeg', 'just', 'key', 'keyboard', 'keys', 'kill', 'killed', 'kind', 'knew', 'know', 'knowledge', 'known', 'knows', 'la', 'land', 'language', 'large', 'late', 'later', 'law', 'laws', 'league', 'learn', 'leave', 'left', 'legal', 'let', 'letter', 'level', 'library', 'life', 'light', 'like', 'likely', 'limited', 'line', 'lines', 'list', 'little', 'live', 'lives', 'living', 'll', 'local', 'long', 'longer', 'look', 'looked', 'looking', 'looks', 'lord', 'lost', 'lot', 'lots', 'love', 'low', 'lower', 'mac', 'machine', 'machines', 'mail', 'main', 'major', 'make', 'makes', 'making', 'man', 'manager', 'manual', 'mark', 'market', 'mass', 'master', 'material', 'matter', 'max', 'maybe', 'mb', 'mean', 'meaning', 'means', 'media', 'medical', 'members', 'memory', 'men', 'mention', 'mentioned', 'message', 'mike', 'miles', 'military', 'million', 'mind', 'mit', 'mode', 'model', 'modem', 'money', 'monitor', 'month', 'months', 'moral', 'mother', 'motif', 'mouse', 'mr', 'ms', 'multiple', 'nasa', 'national', 'nature', 'near', 'necessary', 'need', 'needed', 'needs', 'net', 'network', 'new', 'news', 'newsgroup', 'nhl', 'nice', 'night', 'non', 'normal', 'note', 'nsa', 'number', 'numbers', 'object', 'obvious', 'obviously', 'offer', 'office', 'official', 'oh', 'ok', 'old', 'ones', 'open', 'opinion', 
'opinions', 'orbit', 'order', 'org', 'organization', 'original', 'os', 'output', 'outside', 'package', 'page', 'paper', 'particular', 'parts', 'party', 'past', 'paul', 'pay', 'pc', 'peace', 'people', 'perfect', 'performance', 'period', 'person', 'personal', 'phone', 'pick', 'picture', 'pin', 'pittsburgh', 'pl', 'place', 'places', 'plan', 'play', 'played', 'player', 'players', 'plus', 'point', 'points', 'police', 'policy', 'political', 'population', 'port', 'position', 'possible', 'possibly', 'post', 'posted', 'posting', 'power', 'pp', 'present', 'president', 'press', 'pretty', 'previous', 'price', 'printer', 'privacy', 'private', 'pro', 'probably', 'problem', 'problems', 'process', 'product', 'program', 'programs', 'project', 'protect', 'provide', 'provides', 'pub', 'public', 'published', 'purpose', 'qq', 'quality', 'question', 'questions', 'quite', 'radio', 'ram', 'range', 'rate', 'read', 'reading', 'real', 'really', 'reason', 'reasonable', 'reasons', 'received', 'recent', 'recently', 'record', 'red', 'reference', 'regular', 'related', 'release', 'religion', 'religious', 'remember', 'reply', 'report', 'reports', 'request', 'require', 'required', 'requires', 'research', 'resources', 'response', 'rest', 'result', 'results', 'return', 'right', 'rights', 'road', 'rom', 'room', 'round', 'rules', 'run', 'running', 'runs', 'russian', 'safety', 'said', 'sale', 'san', 'save', 'saw', 'say', 'saying', 'says', 'school', 'sci', 'science', 'scientific', 'screen', 'scsi', 'search', 'season', 'second', 'secret', 'section', 'secure', 'security', 'seen', 'self', 'sell', 'send', 'sense', 'sent', 'serial', 'series', 'server', 'service', 'set', 'shall', 'shipping', 'short', 'shot', 'shuttle', 'similar', 'simple', 'simply', 'sin', 'single', 'site', 'sites', 'situation', 'size', 'small', 'society', 'software', 'solution', 'son', 'soon', 'sorry', 'sort', 'sound', 'sounds', 'source', 'sources', 'south', 'soviet', 'space', 'special', 'specific', 'speed', 'spirit', 'st', 'standard', 
'start', 'started', 'state', 'statement', 'states', 'station', 'stephanopoulos', 'steve', 'stop', 'story', 'stream', 'street', 'strong', 'study', 'stuff', 'subject', 'suggest', 'sun', 'support', 'supports', 'supposed', 'sure', 'systems', 'taken', 'takes', 'taking', 'talk', 'talking', 'tape', 'tar', 'tax', 'team', 'teams', 'technical', 'technology', 'tell', 'term', 'terms', 'test', 'text', 'thank', 'thanks', 'theory', 'thing', 'things', 'think', 'thinking', 'thought', 'time', 'times', 'title', 'tm', 'today', 'told', 'took', 'tools', 'total', 'trade', 'transfer', 'tried', 'true', 'truth', 'try', 'trying', 'turkey', 'turkish', 'turn', 'tv', 'type', 'uk', 'understand', 'unfortunately', 'unit', 'united', 'university', 'unix', 'unless', 'usa', 'use', 'used', 'useful', 'usenet', 'user', 'users', 'uses', 'using', 'usually', 'value', 'values', 'van', 'various', 've', 'version', 'vga', 'video', 'view', 'voice', 'volume', 'vs', 'wait', 'want', 'wanted', 'wants', 'war', 'washington', 'wasn', 'watch', 'water', 'way', 'ways', 'weapons', 'week', 'weeks', 'went', 'white', 'wide', 'widget', 'willing', 'win', 'window', 'windows', 'wish', 'wm', 'women', 'won', 'word', 'words', 'work', 'worked', 'working', 'works', 'world', 'worth', 'wouldn', 'write', 'writing', 'written', 'wrong', 'wrote', 'x11', 'xt', 'year', 'years', 'yes', 'york', 'young']\n"
     ]
    }
   ],
   "source": [
    "print(tf_vectorizer.get_feature_names())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 2 3 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0\n",
      "  2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0\n",
      "  0 1 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0\n",
      "  0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0\n",
      "  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0\n",
      "  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      "  0]]\n"
     ]
    }
   ],
   "source": [
    "print(tf[0].toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,\n",
       "             evaluate_every=-1, learning_decay=0.7,\n",
       "             learning_method='online', learning_offset=10.0,\n",
       "             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,\n",
       "             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,\n",
       "             random_state=95865, topic_word_prior=None,\n",
       "             total_samples=1000000.0, verbose=0)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_topics = 10\n",
    "\n",
    "from sklearn.decomposition import LatentDirichletAllocation\n",
    "lda = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=95865)\n",
    "lda.fit(tf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10, 1000)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lda.components_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "topic_word_distributions = np.array([topic_word_pseudocounts / np.sum(topic_word_pseudocounts)\n",
    "                                     for topic_word_pseudocounts in lda.components_])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Displaying the top 20 words per topic and their probabilities within the topic...\n",
      "\n",
      "[Topic 0]\n",
      "just: 0.019875\n",
      "like: 0.019019\n",
      "don: 0.017618\n",
      "know: 0.014387\n",
      "good: 0.014284\n",
      "think: 0.013362\n",
      "time: 0.012832\n",
      "ve: 0.010945\n",
      "year: 0.009590\n",
      "got: 0.008954\n",
      "ll: 0.008122\n",
      "really: 0.008066\n",
      "way: 0.007946\n",
      "going: 0.007739\n",
      "didn: 0.007682\n",
      "better: 0.007271\n",
      "right: 0.007236\n",
      "game: 0.007222\n",
      "car: 0.007026\n",
      "did: 0.006859\n",
      "\n",
      "[Topic 1]\n",
      "ax: 0.393777\n",
      "g9v: 0.065060\n",
      "b8f: 0.055996\n",
      "a86: 0.044778\n",
      "145: 0.044519\n",
      "1d9: 0.029224\n",
      "max: 0.025252\n",
      "pl: 0.024321\n",
      "0t: 0.024165\n",
      "2di: 0.021799\n",
      "34u: 0.019700\n",
      "75u: 0.019490\n",
      "wm: 0.016588\n",
      "1t: 0.013268\n",
      "0d: 0.012856\n",
      "bxn: 0.012846\n",
      "bhj: 0.012842\n",
      "giz: 0.012564\n",
      "2tm: 0.011254\n",
      "ah: 0.010845\n",
      "\n",
      "[Topic 2]\n",
      "mr: 0.017832\n",
      "president: 0.015440\n",
      "people: 0.014652\n",
      "new: 0.014394\n",
      "states: 0.012994\n",
      "israel: 0.012251\n",
      "american: 0.011634\n",
      "war: 0.011246\n",
      "jews: 0.010655\n",
      "national: 0.009191\n",
      "university: 0.008289\n",
      "government: 0.008284\n",
      "united: 0.008170\n",
      "health: 0.008063\n",
      "israeli: 0.007994\n",
      "years: 0.007866\n",
      "state: 0.007754\n",
      "stephanopoulos: 0.007378\n",
      "world: 0.007122\n",
      "000: 0.007040\n",
      "\n",
      "[Topic 3]\n",
      "ax: 0.882939\n",
      "max: 0.065676\n",
      "pl: 0.005747\n",
      "1t: 0.004440\n",
      "bhj: 0.004024\n",
      "3t: 0.003973\n",
      "giz: 0.003258\n",
      "g9v: 0.003163\n",
      "b8f: 0.002897\n",
      "7ey: 0.002886\n",
      "a86: 0.002833\n",
      "qq: 0.002675\n",
      "2tm: 0.002064\n",
      "gk: 0.001882\n",
      "1d9: 0.001747\n",
      "bj: 0.001590\n",
      "0t: 0.001058\n",
      "wm: 0.000922\n",
      "9v: 0.000889\n",
      "34u: 0.000888\n",
      "\n",
      "[Topic 4]\n",
      "key: 0.027784\n",
      "people: 0.026372\n",
      "government: 0.020711\n",
      "armenian: 0.015260\n",
      "encryption: 0.014915\n",
      "said: 0.013651\n",
      "armenians: 0.012175\n",
      "turkish: 0.010986\n",
      "keys: 0.010958\n",
      "security: 0.010483\n",
      "killed: 0.009580\n",
      "public: 0.009440\n",
      "children: 0.009235\n",
      "women: 0.008645\n",
      "privacy: 0.008078\n",
      "time: 0.008025\n",
      "attack: 0.007343\n",
      "secure: 0.006732\n",
      "des: 0.006664\n",
      "turkey: 0.006627\n",
      "\n",
      "[Topic 5]\n",
      "god: 0.027823\n",
      "people: 0.023686\n",
      "don: 0.017395\n",
      "think: 0.016490\n",
      "does: 0.015950\n",
      "believe: 0.015320\n",
      "know: 0.014020\n",
      "just: 0.013560\n",
      "say: 0.013367\n",
      "jesus: 0.013283\n",
      "true: 0.009953\n",
      "like: 0.009257\n",
      "way: 0.008730\n",
      "things: 0.008366\n",
      "life: 0.008219\n",
      "christian: 0.008194\n",
      "bible: 0.007824\n",
      "point: 0.007707\n",
      "read: 0.007680\n",
      "did: 0.007550\n",
      "\n",
      "[Topic 6]\n",
      "file: 0.021121\n",
      "space: 0.020013\n",
      "data: 0.018100\n",
      "information: 0.017368\n",
      "drive: 0.013805\n",
      "use: 0.013403\n",
      "program: 0.011641\n",
      "edu: 0.011476\n",
      "number: 0.010787\n",
      "available: 0.009693\n",
      "internet: 0.009567\n",
      "email: 0.009261\n",
      "send: 0.009143\n",
      "output: 0.008991\n",
      "mail: 0.008977\n",
      "disk: 0.008919\n",
      "computer: 0.008900\n",
      "nasa: 0.008286\n",
      "com: 0.008154\n",
      "entry: 0.008075\n",
      "\n",
      "[Topic 7]\n",
      "edu: 0.017210\n",
      "windows: 0.016252\n",
      "use: 0.013597\n",
      "thanks: 0.012918\n",
      "software: 0.011047\n",
      "does: 0.010922\n",
      "card: 0.010674\n",
      "using: 0.010205\n",
      "version: 0.010173\n",
      "dos: 0.009831\n",
      "window: 0.009604\n",
      "com: 0.009098\n",
      "like: 0.008892\n",
      "scsi: 0.008574\n",
      "graphics: 0.008515\n",
      "server: 0.008451\n",
      "available: 0.008283\n",
      "problem: 0.008250\n",
      "know: 0.008228\n",
      "pc: 0.008138\n",
      "\n",
      "[Topic 8]\n",
      "law: 0.024275\n",
      "use: 0.020513\n",
      "gun: 0.019270\n",
      "state: 0.013947\n",
      "case: 0.013840\n",
      "make: 0.013471\n",
      "used: 0.011862\n",
      "original: 0.010256\n",
      "control: 0.009673\n",
      "guns: 0.009172\n",
      "monitor: 0.008890\n",
      "laws: 0.008856\n",
      "order: 0.007904\n",
      "article: 0.007763\n",
      "consider: 0.007663\n",
      "federal: 0.007540\n",
      "person: 0.007303\n",
      "right: 0.007232\n",
      "people: 0.006986\n",
      "self: 0.006834\n",
      "\n",
      "[Topic 9]\n",
      "10: 0.040687\n",
      "00: 0.030464\n",
      "11: 0.025976\n",
      "15: 0.025858\n",
      "25: 0.025169\n",
      "12: 0.025042\n",
      "db: 0.024462\n",
      "20: 0.024233\n",
      "14: 0.022150\n",
      "16: 0.021643\n",
      "13: 0.019261\n",
      "17: 0.018898\n",
      "18: 0.017196\n",
      "50: 0.016271\n",
      "24: 0.015343\n",
      "40: 0.015301\n",
      "30: 0.015006\n",
      "21: 0.013901\n",
      "19: 0.013806\n",
      "55: 0.013789\n",
      "\n"
     ]
    }
   ],
   "source": [
    "num_top_words = 20\n",
    "\n",
    "print('Displaying the top %d words per topic and their probabilities within the topic...' % num_top_words)\n",
    "print()\n",
    "\n",
    "import numpy as np\n",
    "for topic_idx in range(num_topics):\n",
    "    print('[Topic %d]' % topic_idx)\n",
    "    sort_indices = np.argsort(topic_word_distributions[topic_idx])[::-1]\n",
    "    for rank in range(num_top_words):\n",
    "        word_idx = sort_indices[rank]\n",
    "        print('%s: %f' % (tf_vectorizer.get_feature_names()[word_idx], topic_word_distributions[topic_idx, word_idx]))\n",
    "    print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# shortened and modified version of sklearn's LDA & NMF demo (http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html)\n",
	"\n",
	"from sklearn.datasets import fetch_20newsgroups\n",
	"num_articles = 10000\n",
	"data = fetch_20newsgroups(shuffle=True, random_state=0,\n",
	" remove=('headers', 'footers', 'quotes')).data[:num_articles]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n",
	"\n",
	"The last name is Niedermayer, as in New Jersey's Scott's last name, because\n",
	"(you guessed it) they are brothers. But Rob Niedermayer is a center, not\n",
	"a defenseman.\n",
	"\n",
	"I am not sure that the Sharks will take Kariya. They aren't saying much, but\n",
	"they apparently like Niedermayer and Victor Kozlov, along with Kariya. Chris\n",
	"Pronger's name has also been mentioned. My guess is that they'll take\n",
	"Niedermayer. They may take Pronger, except that they already have too many\n",
	"defensive prospects.\n"
	]
	}
	],
	"source": [
	"# you can take a look at what individual documents look like by replacing what index we look at\n",
	"print(data[5])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"vocab_size = 1000\n",
	"from sklearn.feature_extraction.text import CountVectorizer\n",
	"\n",
	"# document frequency (df) means number of documents a word appears in\n",
	"tf_vectorizer = CountVectorizer(max_df=0.95,\n",
	" min_df=2,\n",
	" max_features=vocab_size,\n",
	" stop_words='english')\n",
	"tf = tf_vectorizer.fit_transform(data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['00', '000', '02', '03', '04', '0d', '0t', '10', '100', '11', '12', '128', '13', '14', '145', '15', '16', '17', '18', '19', '1990', '1991', '1992', '1993', '1d9', '1st', '1t', '20', '200', '21', '22', '23', '24', '25', '250', '26', '27', '28', '29', '2di', '2tm', '30', '300', '31', '32', '33', '34', '34u', '35', '36', '37', '38', '39', '3d', '3t', '40', '41', '42', '43', '44', '45', '46', '48', '50', '500', '55', '60', '64', '6um', '70', '75', '75u', '7ey', '80', '800', '86', '90', '91', '92', '93', '9v', 'a86', 'ability', 'able', 'ac', 'accept', 'access', 'according', 'act', 'action', 'actually', 'add', 'addition', 'address', 'administration', 'advance', 'age', 'ago', 'agree', 'ah', 'air', 'al', 'algorithm', 'allow', 'allowed', 'alt', 'america', 'american', 'analysis', 'anonymous', 'answer', 'answers', 'anti', 'anybody', 'apparently', 'appears', 'apple', 'application', 'applications', 'appreciate', 'appreciated', 'approach', 'appropriate', 'apr', 'april', 'archive', 'area', 'areas', 'aren', 'argument', 'armenia', 'armenian', 'armenians', 'arms', 'army', 'article', 'articles', 'ask', 'asked', 'asking', 'assume', 'atheism', 'attack', 'attempt', 'au', 'author', 'authority', 'available', 'average', 'avoid', 'away', 'ax', 'b8f', 'bad', 'base', 'based', 'basic', 'basically', 'basis', 'belief', 'believe', 'best', 'better', 'bh', 'bhj', 'bible', 'big', 'bike', 'bit', 'bits', 'bj', 'black', 'block', 'blood', 'board', 'body', 'book', 'books', 'bought', 'box', 'break', 'bring', 'brought', 'btw', 'buf', 'build', 'building', 'built', 'bus', 'business', 'buy', 'bxn', 'ca', 'cable', 'california', 'called', 'calls', 'came', 'canada', 'car', 'card', 'cards', 'care', 'carry', 'cars', 'case', 'cases', 'cause', 'cd', 'center', 'certain', 'certainly', 'chance', 'change', 'changed', 'changes', 'check', 'chicago', 'child', 'children', 'chip', 'chips', 'choice', 'christ', 'christian', 'christianity', 'christians', 'church', 'citizens', 'city', 'claim', 'claims', 'class', 'clear', 
'clearly', 'clinton', 'clipper', 'close', 'code', 'color', 'com', 'come', 'comes', 'coming', 'command', 'comments', 'commercial', 'committee', 'common', 'community', 'comp', 'company', 'complete', 'completely', 'computer', 'condition', 'conference', 'congress', 'consider', 'considered', 'contact', 'contains', 'context', 'continue', 'control', 'controller', 'copy', 'correct', 'cost', 'couldn', 'country', 'couple', 'course', 'court', 'cover', 'create', 'created', 'crime', 'cross', 'cs', 'current', 'currently', 'cut', 'cx', 'data', 'date', 'dave', 'david', 'day', 'days', 'db', 'dc', 'dead', 'deal', 'death', 'dec', 'decided', 'defense', 'define', 'deleted', 'department', 'des', 'design', 'designed', 'details', 'development', 'device', 'devices', 'did', 'didn', 'difference', 'different', 'difficult', 'digital', 'directly', 'directory', 'discussion', 'disk', 'display', 'distribution', 'division', 'dod', 'does', 'doesn', 'doing', 'don', 'door', 'dos', 'doubt', 'dr', 'drive', 'driver', 'drivers', 'drives', 'drug', 'early', 'earth', 'easily', 'east', 'easy', 'ed', 'edu', 'effect', 'electronic', 'email', 'encryption', 'end', 'enforcement', 'engine', 'entire', 'entry', 'environment', 'error', 'escrow', 'especially', 'event', 'events', 'evidence', 'exactly', 'example', 'excellent', 'exist', 'existence', 'exists', 'expect', 'experience', 'explain', 'export', 'extra', 'face', 'fact', 'faith', 'false', 'family', 'faq', 'far', 'fast', 'faster', 'father', 'fax', 'fbi', 'features', 'federal', 'feel', 'field', 'figure', 'file', 'files', 'final', 'finally', 'fine', 'firearms', 'floppy', 'folks', 'follow', 'following', 'food', 'force', 'form', 'format', 'free', 'freedom', 'friend', 'ftp', 'function', 'functions', 'future', 'g9v', 'game', 'games', 'gas', 'gave', 'general', 'generally', 'gets', 'getting', 'gif', 'given', 'gives', 'giz', 'gk', 'gm', 'goal', 'god', 'goes', 'going', 'good', 'got', 'gov', 'government', 'graphics', 'great', 'greek', 'ground', 'group', 'groups', 'guess', 
'gun', 'guns', 'guy', 'half', 'hand', 'happen', 'happened', 'happens', 'hard', 'hardware', 'haven', 'having', 'head', 'health', 'hear', 'heard', 'held', 'hell', 'help', 'hi', 'high', 'higher', 'history', 'hit', 'hockey', 'hold', 'home', 'hope', 'hours', 'house', 'hp', 'human', 'ibm', 'ide', 'idea', 'ideas', 'ii', 'image', 'images', 'imagine', 'important', 'include', 'included', 'includes', 'including', 'individual', 'info', 'information', 'input', 'inside', 'installed', 'instead', 'insurance', 'int', 'interested', 'interesting', 'interface', 'internal', 'international', 'internet', 'involved', 'isn', 'israel', 'israeli', 'issue', 'issues', 'jesus', 'jewish', 'jews', 'jim', 'job', 'jobs', 'john', 'jpeg', 'just', 'key', 'keyboard', 'keys', 'kill', 'killed', 'kind', 'knew', 'know', 'knowledge', 'known', 'knows', 'la', 'land', 'language', 'large', 'late', 'later', 'law', 'laws', 'league', 'learn', 'leave', 'left', 'legal', 'let', 'letter', 'level', 'library', 'life', 'light', 'like', 'likely', 'limited', 'line', 'lines', 'list', 'little', 'live', 'lives', 'living', 'll', 'local', 'long', 'longer', 'look', 'looked', 'looking', 'looks', 'lord', 'lost', 'lot', 'lots', 'love', 'low', 'lower', 'mac', 'machine', 'machines', 'mail', 'main', 'major', 'make', 'makes', 'making', 'man', 'manager', 'manual', 'mark', 'market', 'mass', 'master', 'material', 'matter', 'max', 'maybe', 'mb', 'mean', 'meaning', 'means', 'media', 'medical', 'members', 'memory', 'men', 'mention', 'mentioned', 'message', 'mike', 'miles', 'military', 'million', 'mind', 'mit', 'mode', 'model', 'modem', 'money', 'monitor', 'month', 'months', 'moral', 'mother', 'motif', 'mouse', 'mr', 'ms', 'multiple', 'nasa', 'national', 'nature', 'near', 'necessary', 'need', 'needed', 'needs', 'net', 'network', 'new', 'news', 'newsgroup', 'nhl', 'nice', 'night', 'non', 'normal', 'note', 'nsa', 'number', 'numbers', 'object', 'obvious', 'obviously', 'offer', 'office', 'official', 'oh', 'ok', 'old', 'ones', 'open', 'opinion', 
'opinions', 'orbit', 'order', 'org', 'organization', 'original', 'os', 'output', 'outside', 'package', 'page', 'paper', 'particular', 'parts', 'party', 'past', 'paul', 'pay', 'pc', 'peace', 'people', 'perfect', 'performance', 'period', 'person', 'personal', 'phone', 'pick', 'picture', 'pin', 'pittsburgh', 'pl', 'place', 'places', 'plan', 'play', 'played', 'player', 'players', 'plus', 'point', 'points', 'police', 'policy', 'political', 'population', 'port', 'position', 'possible', 'possibly', 'post', 'posted', 'posting', 'power', 'pp', 'present', 'president', 'press', 'pretty', 'previous', 'price', 'printer', 'privacy', 'private', 'pro', 'probably', 'problem', 'problems', 'process', 'product', 'program', 'programs', 'project', 'protect', 'provide', 'provides', 'pub', 'public', 'published', 'purpose', 'qq', 'quality', 'question', 'questions', 'quite', 'radio', 'ram', 'range', 'rate', 'read', 'reading', 'real', 'really', 'reason', 'reasonable', 'reasons', 'received', 'recent', 'recently', 'record', 'red', 'reference', 'regular', 'related', 'release', 'religion', 'religious', 'remember', 'reply', 'report', 'reports', 'request', 'require', 'required', 'requires', 'research', 'resources', 'response', 'rest', 'result', 'results', 'return', 'right', 'rights', 'road', 'rom', 'room', 'round', 'rules', 'run', 'running', 'runs', 'russian', 'safety', 'said', 'sale', 'san', 'save', 'saw', 'say', 'saying', 'says', 'school', 'sci', 'science', 'scientific', 'screen', 'scsi', 'search', 'season', 'second', 'secret', 'section', 'secure', 'security', 'seen', 'self', 'sell', 'send', 'sense', 'sent', 'serial', 'series', 'server', 'service', 'set', 'shall', 'shipping', 'short', 'shot', 'shuttle', 'similar', 'simple', 'simply', 'sin', 'single', 'site', 'sites', 'situation', 'size', 'small', 'society', 'software', 'solution', 'son', 'soon', 'sorry', 'sort', 'sound', 'sounds', 'source', 'sources', 'south', 'soviet', 'space', 'special', 'specific', 'speed', 'spirit', 'st', 'standard', 
'start', 'started', 'state', 'statement', 'states', 'station', 'stephanopoulos', 'steve', 'stop', 'story', 'stream', 'street', 'strong', 'study', 'stuff', 'subject', 'suggest', 'sun', 'support', 'supports', 'supposed', 'sure', 'systems', 'taken', 'takes', 'taking', 'talk', 'talking', 'tape', 'tar', 'tax', 'team', 'teams', 'technical', 'technology', 'tell', 'term', 'terms', 'test', 'text', 'thank', 'thanks', 'theory', 'thing', 'things', 'think', 'thinking', 'thought', 'time', 'times', 'title', 'tm', 'today', 'told', 'took', 'tools', 'total', 'trade', 'transfer', 'tried', 'true', 'truth', 'try', 'trying', 'turkey', 'turkish', 'turn', 'tv', 'type', 'uk', 'understand', 'unfortunately', 'unit', 'united', 'university', 'unix', 'unless', 'usa', 'use', 'used', 'useful', 'usenet', 'user', 'users', 'uses', 'using', 'usually', 'value', 'values', 'van', 'various', 've', 'version', 'vga', 'video', 'view', 'voice', 'volume', 'vs', 'wait', 'want', 'wanted', 'wants', 'war', 'washington', 'wasn', 'watch', 'water', 'way', 'ways', 'weapons', 'week', 'weeks', 'went', 'white', 'wide', 'widget', 'willing', 'win', 'window', 'windows', 'wish', 'wm', 'women', 'won', 'word', 'words', 'work', 'worked', 'working', 'works', 'world', 'worth', 'wouldn', 'write', 'writing', 'written', 'wrong', 'wrote', 'x11', 'xt', 'year', 'years', 'yes', 'york', 'young']\n"
	]
	}
	],
	"source": [
	"# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it exists\n",
	"# and fall back to the old method on older installs\n",
	"if hasattr(tf_vectorizer, 'get_feature_names_out'):\n",
	"    # .tolist() turns the returned array into a plain list of str so it prints like the old API\n",
	"    feature_names = tf_vectorizer.get_feature_names_out().tolist()\n",
	"else:\n",
	"    feature_names = tf_vectorizer.get_feature_names()\n",
	"print(feature_names)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 2 3 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0\n",
	" 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0\n",
	" 0 1 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0\n",
	" 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0\n",
	" 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0\n",
	" 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
	" 0]]\n"
	]
	}
	],
	"source": [
	"# take the first document's row of the count matrix and convert it to a dense array for printing\n",
	"first_doc_counts = tf[0].toarray()\n",
	"print(first_doc_counts)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,\n",
	" evaluate_every=-1, learning_decay=0.7,\n",
	" learning_method='online', learning_offset=10.0,\n",
	" max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,\n",
	" n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,\n",
	" random_state=95865, topic_word_prior=None,\n",
	" total_samples=1000000.0, verbose=0)"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.decomposition import LatentDirichletAllocation\n",
	"\n",
	"num_topics = 10  # number of topics (LDA components) to learn\n",
	"\n",
	"# a fixed random_state keeps the fitted topics the same from run to run\n",
	"lda = LatentDirichletAllocation(\n",
	"    n_components=num_topics,\n",
	"    learning_method='online',\n",
	"    random_state=95865,\n",
	")\n",
	"# fit() is left as the last expression so the notebook displays the fitted estimator\n",
	"lda.fit(tf)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(10, 1000)"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# components_ has one row per topic and one column per vocabulary word\n",
	"lda.components_.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"\n",
	"# lda.components_ holds unnormalized word pseudocounts, one row per topic; dividing every row by\n",
	"# its own sum (keepdims=True lets the row sums broadcast across columns) makes each row sum to 1,\n",
	"# i.e. a probability distribution over the vocabulary for that topic\n",
	"topic_word_distributions = lda.components_ / lda.components_.sum(axis=1, keepdims=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Displaying the top 20 words per topic and their probabilities within the topic...\n",
	"\n",
	"[Topic 0]\n",
	"just: 0.019875\n",
	"like: 0.019019\n",
	"don: 0.017618\n",
	"know: 0.014387\n",
	"good: 0.014284\n",
	"think: 0.013362\n",
	"time: 0.012832\n",
	"ve: 0.010945\n",
	"year: 0.009590\n",
	"got: 0.008954\n",
	"ll: 0.008122\n",
	"really: 0.008066\n",
	"way: 0.007946\n",
	"going: 0.007739\n",
	"didn: 0.007682\n",
	"better: 0.007271\n",
	"right: 0.007236\n",
	"game: 0.007222\n",
	"car: 0.007026\n",
	"did: 0.006859\n",
	"\n",
	"[Topic 1]\n",
	"ax: 0.393777\n",
	"g9v: 0.065060\n",
	"b8f: 0.055996\n",
	"a86: 0.044778\n",
	"145: 0.044519\n",
	"1d9: 0.029224\n",
	"max: 0.025252\n",
	"pl: 0.024321\n",
	"0t: 0.024165\n",
	"2di: 0.021799\n",
	"34u: 0.019700\n",
	"75u: 0.019490\n",
	"wm: 0.016588\n",
	"1t: 0.013268\n",
	"0d: 0.012856\n",
	"bxn: 0.012846\n",
	"bhj: 0.012842\n",
	"giz: 0.012564\n",
	"2tm: 0.011254\n",
	"ah: 0.010845\n",
	"\n",
	"[Topic 2]\n",
	"mr: 0.017832\n",
	"president: 0.015440\n",
	"people: 0.014652\n",
	"new: 0.014394\n",
	"states: 0.012994\n",
	"israel: 0.012251\n",
	"american: 0.011634\n",
	"war: 0.011246\n",
	"jews: 0.010655\n",
	"national: 0.009191\n",
	"university: 0.008289\n",
	"government: 0.008284\n",
	"united: 0.008170\n",
	"health: 0.008063\n",
	"israeli: 0.007994\n",
	"years: 0.007866\n",
	"state: 0.007754\n",
	"stephanopoulos: 0.007378\n",
	"world: 0.007122\n",
	"000: 0.007040\n",
	"\n",
	"[Topic 3]\n",
	"ax: 0.882939\n",
	"max: 0.065676\n",
	"pl: 0.005747\n",
	"1t: 0.004440\n",
	"bhj: 0.004024\n",
	"3t: 0.003973\n",
	"giz: 0.003258\n",
	"g9v: 0.003163\n",
	"b8f: 0.002897\n",
	"7ey: 0.002886\n",
	"a86: 0.002833\n",
	"qq: 0.002675\n",
	"2tm: 0.002064\n",
	"gk: 0.001882\n",
	"1d9: 0.001747\n",
	"bj: 0.001590\n",
	"0t: 0.001058\n",
	"wm: 0.000922\n",
	"9v: 0.000889\n",
	"34u: 0.000888\n",
	"\n",
	"[Topic 4]\n",
	"key: 0.027784\n",
	"people: 0.026372\n",
	"government: 0.020711\n",
	"armenian: 0.015260\n",
	"encryption: 0.014915\n",
	"said: 0.013651\n",
	"armenians: 0.012175\n",
	"turkish: 0.010986\n",
	"keys: 0.010958\n",
	"security: 0.010483\n",
	"killed: 0.009580\n",
	"public: 0.009440\n",
	"children: 0.009235\n",
	"women: 0.008645\n",
	"privacy: 0.008078\n",
	"time: 0.008025\n",
	"attack: 0.007343\n",
	"secure: 0.006732\n",
	"des: 0.006664\n",
	"turkey: 0.006627\n",
	"\n",
	"[Topic 5]\n",
	"god: 0.027823\n",
	"people: 0.023686\n",
	"don: 0.017395\n",
	"think: 0.016490\n",
	"does: 0.015950\n",
	"believe: 0.015320\n",
	"know: 0.014020\n",
	"just: 0.013560\n",
	"say: 0.013367\n",
	"jesus: 0.013283\n",
	"true: 0.009953\n",
	"like: 0.009257\n",
	"way: 0.008730\n",
	"things: 0.008366\n",
	"life: 0.008219\n",
	"christian: 0.008194\n",
	"bible: 0.007824\n",
	"point: 0.007707\n",
	"read: 0.007680\n",
	"did: 0.007550\n",
	"\n",
	"[Topic 6]\n",
	"file: 0.021121\n",
	"space: 0.020013\n",
	"data: 0.018100\n",
	"information: 0.017368\n",
	"drive: 0.013805\n",
	"use: 0.013403\n",
	"program: 0.011641\n",
	"edu: 0.011476\n",
	"number: 0.010787\n",
	"available: 0.009693\n",
	"internet: 0.009567\n",
	"email: 0.009261\n",
	"send: 0.009143\n",
	"output: 0.008991\n",
	"mail: 0.008977\n",
	"disk: 0.008919\n",
	"computer: 0.008900\n",
	"nasa: 0.008286\n",
	"com: 0.008154\n",
	"entry: 0.008075\n",
	"\n",
	"[Topic 7]\n",
	"edu: 0.017210\n",
	"windows: 0.016252\n",
	"use: 0.013597\n",
	"thanks: 0.012918\n",
	"software: 0.011047\n",
	"does: 0.010922\n",
	"card: 0.010674\n",
	"using: 0.010205\n",
	"version: 0.010173\n",
	"dos: 0.009831\n",
	"window: 0.009604\n",
	"com: 0.009098\n",
	"like: 0.008892\n",
	"scsi: 0.008574\n",
	"graphics: 0.008515\n",
	"server: 0.008451\n",
	"available: 0.008283\n",
	"problem: 0.008250\n",
	"know: 0.008228\n",
	"pc: 0.008138\n",
	"\n",
	"[Topic 8]\n",
	"law: 0.024275\n",
	"use: 0.020513\n",
	"gun: 0.019270\n",
	"state: 0.013947\n",
	"case: 0.013840\n",
	"make: 0.013471\n",
	"used: 0.011862\n",
	"original: 0.010256\n",
	"control: 0.009673\n",
	"guns: 0.009172\n",
	"monitor: 0.008890\n",
	"laws: 0.008856\n",
	"order: 0.007904\n",
	"article: 0.007763\n",
	"consider: 0.007663\n",
	"federal: 0.007540\n",
	"person: 0.007303\n",
	"right: 0.007232\n",
	"people: 0.006986\n",
	"self: 0.006834\n",
	"\n",
	"[Topic 9]\n",
	"10: 0.040687\n",
	"00: 0.030464\n",
	"11: 0.025976\n",
	"15: 0.025858\n",
	"25: 0.025169\n",
	"12: 0.025042\n",
	"db: 0.024462\n",
	"20: 0.024233\n",
	"14: 0.022150\n",
	"16: 0.021643\n",
	"13: 0.019261\n",
	"17: 0.018898\n",
	"18: 0.017196\n",
	"50: 0.016271\n",
	"24: 0.015343\n",
	"40: 0.015301\n",
	"30: 0.015006\n",
	"21: 0.013901\n",
	"19: 0.013806\n",
	"55: 0.013789\n",
	"\n"
	]
	}
	],
	"source": [
	"import numpy as np\n",
	"\n",
	"num_top_words = 20\n",
	"\n",
	"# look the vocabulary up once instead of rebuilding it for every printed word;\n",
	"# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out() when it exists\n",
	"if hasattr(tf_vectorizer, 'get_feature_names_out'):\n",
	"    feature_names = tf_vectorizer.get_feature_names_out()\n",
	"else:\n",
	"    feature_names = tf_vectorizer.get_feature_names()\n",
	"\n",
	"print('Displaying the top %d words per topic and their probabilities within the topic...' % num_top_words)\n",
	"print()\n",
	"\n",
	"for topic_idx in range(num_topics):\n",
	"    print('[Topic %d]' % topic_idx)\n",
	"    # argsort is ascending, so reverse it to rank words from most to least probable\n",
	"    sort_indices = np.argsort(topic_word_distributions[topic_idx])[::-1]\n",
	"    # slicing (rather than indexing by rank) also copes with num_top_words > vocabulary size\n",
	"    for word_idx in sort_indices[:num_top_words]:\n",
	"        print('%s: %f' % (feature_names[word_idx], topic_word_distributions[topic_idx, word_idx]))\n",
	"    print()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}