Skip to content

Instantly share code, notes, and snippets.

@georgehc
Created November 10, 2017 03:35
Show Gist options
  • Save georgehc/d2353feef7e09b4b53fc087d44f75954 to your computer and use it in GitHub Desktop.
Save georgehc/d2353feef7e09b4b53fc087d44f75954 to your computer and use it in GitHub Desktop.
95-865 Latent Dirichlet Allocation demo
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# shortened and modified version of sklearn's LDA & NMF demo (http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html)\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"num_articles = 10000\n",
"data = fetch_20newsgroups(shuffle=True, random_state=0,\n",
" remove=('headers', 'footers', 'quotes')).data[:num_articles]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"The last name is Niedermayer, as in New Jersey's Scott's last name, because\n",
"(you guessed it) they are brothers. But Rob Niedermayer is a center, not\n",
"a defenseman.\n",
"\n",
"I am not sure that the Sharks will take Kariya. They aren't saying much, but\n",
"they apparently like Niedermayer and Victor Kozlov, along with Kariya. Chris\n",
"Pronger's name has also been mentioned. My guess is that they'll take\n",
"Niedermayer. They may take Pronger, except that they already have too many\n",
"defensive prospects.\n"
]
}
],
"source": [
"# you can take a look at what individual documents look like by replacing what index we look at\n",
"print(data[5])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"vocab_size = 1000\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# document frequency (df) means number of documents a word appears in\n",
"tf_vectorizer = CountVectorizer(max_df=0.95,\n",
" min_df=2,\n",
" max_features=vocab_size,\n",
" stop_words='english')\n",
"tf = tf_vectorizer.fit_transform(data)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['00', '000', '02', '03', '04', '0d', '0t', '10', '100', '11', '12', '128', '13', '14', '145', '15', '16', '17', '18', '19', '1990', '1991', '1992', '1993', '1d9', '1st', '1t', '20', '200', '21', '22', '23', '24', '25', '250', '26', '27', '28', '29', '2di', '2tm', '30', '300', '31', '32', '33', '34', '34u', '35', '36', '37', '38', '39', '3d', '3t', '40', '41', '42', '43', '44', '45', '46', '48', '50', '500', '55', '60', '64', '6um', '70', '75', '75u', '7ey', '80', '800', '86', '90', '91', '92', '93', '9v', 'a86', 'ability', 'able', 'ac', 'accept', 'access', 'according', 'act', 'action', 'actually', 'add', 'addition', 'address', 'administration', 'advance', 'age', 'ago', 'agree', 'ah', 'air', 'al', 'algorithm', 'allow', 'allowed', 'alt', 'america', 'american', 'analysis', 'anonymous', 'answer', 'answers', 'anti', 'anybody', 'apparently', 'appears', 'apple', 'application', 'applications', 'appreciate', 'appreciated', 'approach', 'appropriate', 'apr', 'april', 'archive', 'area', 'areas', 'aren', 'argument', 'armenia', 'armenian', 'armenians', 'arms', 'army', 'article', 'articles', 'ask', 'asked', 'asking', 'assume', 'atheism', 'attack', 'attempt', 'au', 'author', 'authority', 'available', 'average', 'avoid', 'away', 'ax', 'b8f', 'bad', 'base', 'based', 'basic', 'basically', 'basis', 'belief', 'believe', 'best', 'better', 'bh', 'bhj', 'bible', 'big', 'bike', 'bit', 'bits', 'bj', 'black', 'block', 'blood', 'board', 'body', 'book', 'books', 'bought', 'box', 'break', 'bring', 'brought', 'btw', 'buf', 'build', 'building', 'built', 'bus', 'business', 'buy', 'bxn', 'ca', 'cable', 'california', 'called', 'calls', 'came', 'canada', 'car', 'card', 'cards', 'care', 'carry', 'cars', 'case', 'cases', 'cause', 'cd', 'center', 'certain', 'certainly', 'chance', 'change', 'changed', 'changes', 'check', 'chicago', 'child', 'children', 'chip', 'chips', 'choice', 'christ', 'christian', 'christianity', 'christians', 'church', 'citizens', 'city', 'claim', 'claims', 'class', 'clear', 'clearly', 'clinton', 'clipper', 'close', 'code', 'color', 'com', 'come', 'comes', 'coming', 'command', 'comments', 'commercial', 'committee', 'common', 'community', 'comp', 'company', 'complete', 'completely', 'computer', 'condition', 'conference', 'congress', 'consider', 'considered', 'contact', 'contains', 'context', 'continue', 'control', 'controller', 'copy', 'correct', 'cost', 'couldn', 'country', 'couple', 'course', 'court', 'cover', 'create', 'created', 'crime', 'cross', 'cs', 'current', 'currently', 'cut', 'cx', 'data', 'date', 'dave', 'david', 'day', 'days', 'db', 'dc', 'dead', 'deal', 'death', 'dec', 'decided', 'defense', 'define', 'deleted', 'department', 'des', 'design', 'designed', 'details', 'development', 'device', 'devices', 'did', 'didn', 'difference', 'different', 'difficult', 'digital', 'directly', 'directory', 'discussion', 'disk', 'display', 'distribution', 'division', 'dod', 'does', 'doesn', 'doing', 'don', 'door', 'dos', 'doubt', 'dr', 'drive', 'driver', 'drivers', 'drives', 'drug', 'early', 'earth', 'easily', 'east', 'easy', 'ed', 'edu', 'effect', 'electronic', 'email', 'encryption', 'end', 'enforcement', 'engine', 'entire', 'entry', 'environment', 'error', 'escrow', 'especially', 'event', 'events', 'evidence', 'exactly', 'example', 'excellent', 'exist', 'existence', 'exists', 'expect', 'experience', 'explain', 'export', 'extra', 'face', 'fact', 'faith', 'false', 'family', 'faq', 'far', 'fast', 'faster', 'father', 'fax', 'fbi', 'features', 'federal', 'feel', 'field', 'figure', 'file', 'files', 'final', 'finally', 'fine', 'firearms', 'floppy', 'folks', 'follow', 'following', 'food', 'force', 'form', 'format', 'free', 'freedom', 'friend', 'ftp', 'function', 'functions', 'future', 'g9v', 'game', 'games', 'gas', 'gave', 'general', 'generally', 'gets', 'getting', 'gif', 'given', 'gives', 'giz', 'gk', 'gm', 'goal', 'god', 'goes', 'going', 'good', 'got', 'gov', 'government', 'graphics', 'great', 'greek', 'ground', 'group', 'groups', 'guess', 'gun', 'guns', 'guy', 'half', 'hand', 'happen', 'happened', 'happens', 'hard', 'hardware', 'haven', 'having', 'head', 'health', 'hear', 'heard', 'held', 'hell', 'help', 'hi', 'high', 'higher', 'history', 'hit', 'hockey', 'hold', 'home', 'hope', 'hours', 'house', 'hp', 'human', 'ibm', 'ide', 'idea', 'ideas', 'ii', 'image', 'images', 'imagine', 'important', 'include', 'included', 'includes', 'including', 'individual', 'info', 'information', 'input', 'inside', 'installed', 'instead', 'insurance', 'int', 'interested', 'interesting', 'interface', 'internal', 'international', 'internet', 'involved', 'isn', 'israel', 'israeli', 'issue', 'issues', 'jesus', 'jewish', 'jews', 'jim', 'job', 'jobs', 'john', 'jpeg', 'just', 'key', 'keyboard', 'keys', 'kill', 'killed', 'kind', 'knew', 'know', 'knowledge', 'known', 'knows', 'la', 'land', 'language', 'large', 'late', 'later', 'law', 'laws', 'league', 'learn', 'leave', 'left', 'legal', 'let', 'letter', 'level', 'library', 'life', 'light', 'like', 'likely', 'limited', 'line', 'lines', 'list', 'little', 'live', 'lives', 'living', 'll', 'local', 'long', 'longer', 'look', 'looked', 'looking', 'looks', 'lord', 'lost', 'lot', 'lots', 'love', 'low', 'lower', 'mac', 'machine', 'machines', 'mail', 'main', 'major', 'make', 'makes', 'making', 'man', 'manager', 'manual', 'mark', 'market', 'mass', 'master', 'material', 'matter', 'max', 'maybe', 'mb', 'mean', 'meaning', 'means', 'media', 'medical', 'members', 'memory', 'men', 'mention', 'mentioned', 'message', 'mike', 'miles', 'military', 'million', 'mind', 'mit', 'mode', 'model', 'modem', 'money', 'monitor', 'month', 'months', 'moral', 'mother', 'motif', 'mouse', 'mr', 'ms', 'multiple', 'nasa', 'national', 'nature', 'near', 'necessary', 'need', 'needed', 'needs', 'net', 'network', 'new', 'news', 'newsgroup', 'nhl', 'nice', 'night', 'non', 'normal', 'note', 'nsa', 'number', 'numbers', 'object', 'obvious', 'obviously', 'offer', 'office', 'official', 'oh', 'ok', 'old', 'ones', 'open', 'opinion', 'opinions', 'orbit', 'order', 'org', 'organization', 'original', 'os', 'output', 'outside', 'package', 'page', 'paper', 'particular', 'parts', 'party', 'past', 'paul', 'pay', 'pc', 'peace', 'people', 'perfect', 'performance', 'period', 'person', 'personal', 'phone', 'pick', 'picture', 'pin', 'pittsburgh', 'pl', 'place', 'places', 'plan', 'play', 'played', 'player', 'players', 'plus', 'point', 'points', 'police', 'policy', 'political', 'population', 'port', 'position', 'possible', 'possibly', 'post', 'posted', 'posting', 'power', 'pp', 'present', 'president', 'press', 'pretty', 'previous', 'price', 'printer', 'privacy', 'private', 'pro', 'probably', 'problem', 'problems', 'process', 'product', 'program', 'programs', 'project', 'protect', 'provide', 'provides', 'pub', 'public', 'published', 'purpose', 'qq', 'quality', 'question', 'questions', 'quite', 'radio', 'ram', 'range', 'rate', 'read', 'reading', 'real', 'really', 'reason', 'reasonable', 'reasons', 'received', 'recent', 'recently', 'record', 'red', 'reference', 'regular', 'related', 'release', 'religion', 'religious', 'remember', 'reply', 'report', 'reports', 'request', 'require', 'required', 'requires', 'research', 'resources', 'response', 'rest', 'result', 'results', 'return', 'right', 'rights', 'road', 'rom', 'room', 'round', 'rules', 'run', 'running', 'runs', 'russian', 'safety', 'said', 'sale', 'san', 'save', 'saw', 'say', 'saying', 'says', 'school', 'sci', 'science', 'scientific', 'screen', 'scsi', 'search', 'season', 'second', 'secret', 'section', 'secure', 'security', 'seen', 'self', 'sell', 'send', 'sense', 'sent', 'serial', 'series', 'server', 'service', 'set', 'shall', 'shipping', 'short', 'shot', 'shuttle', 'similar', 'simple', 'simply', 'sin', 'single', 'site', 'sites', 'situation', 'size', 'small', 'society', 'software', 'solution', 'son', 'soon', 'sorry', 'sort', 'sound', 'sounds', 'source', 'sources', 'south', 'soviet', 'space', 'special', 'specific', 'speed', 'spirit', 'st', 'standard', 'start', 'started', 'state', 'statement', 'states', 'station', 'stephanopoulos', 'steve', 'stop', 'story', 'stream', 'street', 'strong', 'study', 'stuff', 'subject', 'suggest', 'sun', 'support', 'supports', 'supposed', 'sure', 'systems', 'taken', 'takes', 'taking', 'talk', 'talking', 'tape', 'tar', 'tax', 'team', 'teams', 'technical', 'technology', 'tell', 'term', 'terms', 'test', 'text', 'thank', 'thanks', 'theory', 'thing', 'things', 'think', 'thinking', 'thought', 'time', 'times', 'title', 'tm', 'today', 'told', 'took', 'tools', 'total', 'trade', 'transfer', 'tried', 'true', 'truth', 'try', 'trying', 'turkey', 'turkish', 'turn', 'tv', 'type', 'uk', 'understand', 'unfortunately', 'unit', 'united', 'university', 'unix', 'unless', 'usa', 'use', 'used', 'useful', 'usenet', 'user', 'users', 'uses', 'using', 'usually', 'value', 'values', 'van', 'various', 've', 'version', 'vga', 'video', 'view', 'voice', 'volume', 'vs', 'wait', 'want', 'wanted', 'wants', 'war', 'washington', 'wasn', 'watch', 'water', 'way', 'ways', 'weapons', 'week', 'weeks', 'went', 'white', 'wide', 'widget', 'willing', 'win', 'window', 'windows', 'wish', 'wm', 'women', 'won', 'word', 'words', 'work', 'worked', 'working', 'works', 'world', 'worth', 'wouldn', 'write', 'writing', 'written', 'wrong', 'wrote', 'x11', 'xt', 'year', 'years', 'yes', 'york', 'young']\n"
]
}
],
"source": [
"print(tf_vectorizer.get_feature_names())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 2 3 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0\n",
" 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0\n",
" 0 1 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0\n",
" 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0\n",
" 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0\n",
" 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0]]\n"
]
}
],
"source": [
"print(tf[0].toarray())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,\n",
" evaluate_every=-1, learning_decay=0.7,\n",
" learning_method='online', learning_offset=10.0,\n",
" max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,\n",
" n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,\n",
" random_state=95865, topic_word_prior=None,\n",
" total_samples=1000000.0, verbose=0)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_topics = 10\n",
"\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"lda = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=95865)\n",
"lda.fit(tf)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(10, 1000)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lda.components_.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"topic_word_distributions = np.array([topic_word_pseudocounts / np.sum(topic_word_pseudocounts)\n",
" for topic_word_pseudocounts in lda.components_])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Displaying the top 20 words per topic and their probabilities within the topic...\n",
"\n",
"[Topic 0]\n",
"just: 0.019875\n",
"like: 0.019019\n",
"don: 0.017618\n",
"know: 0.014387\n",
"good: 0.014284\n",
"think: 0.013362\n",
"time: 0.012832\n",
"ve: 0.010945\n",
"year: 0.009590\n",
"got: 0.008954\n",
"ll: 0.008122\n",
"really: 0.008066\n",
"way: 0.007946\n",
"going: 0.007739\n",
"didn: 0.007682\n",
"better: 0.007271\n",
"right: 0.007236\n",
"game: 0.007222\n",
"car: 0.007026\n",
"did: 0.006859\n",
"\n",
"[Topic 1]\n",
"ax: 0.393777\n",
"g9v: 0.065060\n",
"b8f: 0.055996\n",
"a86: 0.044778\n",
"145: 0.044519\n",
"1d9: 0.029224\n",
"max: 0.025252\n",
"pl: 0.024321\n",
"0t: 0.024165\n",
"2di: 0.021799\n",
"34u: 0.019700\n",
"75u: 0.019490\n",
"wm: 0.016588\n",
"1t: 0.013268\n",
"0d: 0.012856\n",
"bxn: 0.012846\n",
"bhj: 0.012842\n",
"giz: 0.012564\n",
"2tm: 0.011254\n",
"ah: 0.010845\n",
"\n",
"[Topic 2]\n",
"mr: 0.017832\n",
"president: 0.015440\n",
"people: 0.014652\n",
"new: 0.014394\n",
"states: 0.012994\n",
"israel: 0.012251\n",
"american: 0.011634\n",
"war: 0.011246\n",
"jews: 0.010655\n",
"national: 0.009191\n",
"university: 0.008289\n",
"government: 0.008284\n",
"united: 0.008170\n",
"health: 0.008063\n",
"israeli: 0.007994\n",
"years: 0.007866\n",
"state: 0.007754\n",
"stephanopoulos: 0.007378\n",
"world: 0.007122\n",
"000: 0.007040\n",
"\n",
"[Topic 3]\n",
"ax: 0.882939\n",
"max: 0.065676\n",
"pl: 0.005747\n",
"1t: 0.004440\n",
"bhj: 0.004024\n",
"3t: 0.003973\n",
"giz: 0.003258\n",
"g9v: 0.003163\n",
"b8f: 0.002897\n",
"7ey: 0.002886\n",
"a86: 0.002833\n",
"qq: 0.002675\n",
"2tm: 0.002064\n",
"gk: 0.001882\n",
"1d9: 0.001747\n",
"bj: 0.001590\n",
"0t: 0.001058\n",
"wm: 0.000922\n",
"9v: 0.000889\n",
"34u: 0.000888\n",
"\n",
"[Topic 4]\n",
"key: 0.027784\n",
"people: 0.026372\n",
"government: 0.020711\n",
"armenian: 0.015260\n",
"encryption: 0.014915\n",
"said: 0.013651\n",
"armenians: 0.012175\n",
"turkish: 0.010986\n",
"keys: 0.010958\n",
"security: 0.010483\n",
"killed: 0.009580\n",
"public: 0.009440\n",
"children: 0.009235\n",
"women: 0.008645\n",
"privacy: 0.008078\n",
"time: 0.008025\n",
"attack: 0.007343\n",
"secure: 0.006732\n",
"des: 0.006664\n",
"turkey: 0.006627\n",
"\n",
"[Topic 5]\n",
"god: 0.027823\n",
"people: 0.023686\n",
"don: 0.017395\n",
"think: 0.016490\n",
"does: 0.015950\n",
"believe: 0.015320\n",
"know: 0.014020\n",
"just: 0.013560\n",
"say: 0.013367\n",
"jesus: 0.013283\n",
"true: 0.009953\n",
"like: 0.009257\n",
"way: 0.008730\n",
"things: 0.008366\n",
"life: 0.008219\n",
"christian: 0.008194\n",
"bible: 0.007824\n",
"point: 0.007707\n",
"read: 0.007680\n",
"did: 0.007550\n",
"\n",
"[Topic 6]\n",
"file: 0.021121\n",
"space: 0.020013\n",
"data: 0.018100\n",
"information: 0.017368\n",
"drive: 0.013805\n",
"use: 0.013403\n",
"program: 0.011641\n",
"edu: 0.011476\n",
"number: 0.010787\n",
"available: 0.009693\n",
"internet: 0.009567\n",
"email: 0.009261\n",
"send: 0.009143\n",
"output: 0.008991\n",
"mail: 0.008977\n",
"disk: 0.008919\n",
"computer: 0.008900\n",
"nasa: 0.008286\n",
"com: 0.008154\n",
"entry: 0.008075\n",
"\n",
"[Topic 7]\n",
"edu: 0.017210\n",
"windows: 0.016252\n",
"use: 0.013597\n",
"thanks: 0.012918\n",
"software: 0.011047\n",
"does: 0.010922\n",
"card: 0.010674\n",
"using: 0.010205\n",
"version: 0.010173\n",
"dos: 0.009831\n",
"window: 0.009604\n",
"com: 0.009098\n",
"like: 0.008892\n",
"scsi: 0.008574\n",
"graphics: 0.008515\n",
"server: 0.008451\n",
"available: 0.008283\n",
"problem: 0.008250\n",
"know: 0.008228\n",
"pc: 0.008138\n",
"\n",
"[Topic 8]\n",
"law: 0.024275\n",
"use: 0.020513\n",
"gun: 0.019270\n",
"state: 0.013947\n",
"case: 0.013840\n",
"make: 0.013471\n",
"used: 0.011862\n",
"original: 0.010256\n",
"control: 0.009673\n",
"guns: 0.009172\n",
"monitor: 0.008890\n",
"laws: 0.008856\n",
"order: 0.007904\n",
"article: 0.007763\n",
"consider: 0.007663\n",
"federal: 0.007540\n",
"person: 0.007303\n",
"right: 0.007232\n",
"people: 0.006986\n",
"self: 0.006834\n",
"\n",
"[Topic 9]\n",
"10: 0.040687\n",
"00: 0.030464\n",
"11: 0.025976\n",
"15: 0.025858\n",
"25: 0.025169\n",
"12: 0.025042\n",
"db: 0.024462\n",
"20: 0.024233\n",
"14: 0.022150\n",
"16: 0.021643\n",
"13: 0.019261\n",
"17: 0.018898\n",
"18: 0.017196\n",
"50: 0.016271\n",
"24: 0.015343\n",
"40: 0.015301\n",
"30: 0.015006\n",
"21: 0.013901\n",
"19: 0.013806\n",
"55: 0.013789\n",
"\n"
]
}
],
"source": [
"num_top_words = 20\n",
"\n",
"print('Displaying the top %d words per topic and their probabilities within the topic...' % num_top_words)\n",
"print()\n",
"\n",
"import numpy as np\n",
"for topic_idx in range(num_topics):\n",
" print('[Topic %d]' % topic_idx)\n",
" sort_indices = np.argsort(topic_word_distributions[topic_idx])[::-1]\n",
" for rank in range(num_top_words):\n",
" word_idx = sort_indices[rank]\n",
" print('%s: %f' % (tf_vectorizer.get_feature_names()[word_idx], topic_word_distributions[topic_idx, word_idx]))\n",
" print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment