Created
December 1, 2015 12:26
-
-
Save springcoil/282ab1242ac0ec86734f to your computer and use it in GitHub Desktop.
Using data from the Paris version of Airbnb, I hacked together a little topic model in scikit-learn.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"TEXT_DIR = os.path.join(os.getcwd(), 'text/')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import codecs\n", | |
"import os\n", | |
"import os.path\n", | |
"\n", | |
"import numpy as np\n", | |
"from sklearn import decomposition\n", | |
"# ENGLISH_STOP_WORDS is exported by feature_extraction.text; the separate stop_words module is gone from current scikit-learn.\n", | |
"from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer, TfidfVectorizer" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"dir_data = \"text\"\n", | |
"file_paths = [os.path.join(dir_data, fname) for fname in os.listdir(dir_data) if fname.endswith(\".txt\") ]\n", | |
"documents = [codecs.open(file_path, 'r', encoding=\"utf8\", errors='ignore').read() for file_path in file_paths ]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"tfidf = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, lowercase=True, strip_accents=\"unicode\", use_idf=True, norm=\"l2\", min_df = 5) \n", | |
"A = tfidf.fit_transform(documents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<119x13235 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 431707 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"A" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"num_terms = len(tfidf.vocabulary_)\n", | |
"terms = [\"\"] * num_terms\n", | |
"for term in tfidf.vocabulary_.keys():\n", | |
" terms[ tfidf.vocabulary_[term] ] = term" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"model = decomposition.NMF(init=\"nndsvd\", n_components=30, max_iter=200)\n", | |
"W = model.fit_transform(A)\n", | |
"H = model.components_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Topic 0: et, room, paris, apartment, la, bed, le, kitchen, living, metro\n", | |
"Topic 1: apartment, paris, bed, floor, kitchen, located, room, bedroom, et, bathroom\n", | |
"Topic 2: paris, apartment, une, room, et, avec, kitchen, est, chambre, la\n", | |
"Topic 3: montmartre, sacre, coeur, abbesses, joffrin, jules, du, 12, rouge, moulin\n", | |
"Topic 4: mouffetard, pantheon, luxembourg, latin, notre, dame, sorbonne, paris, rue, monge\n", | |
"Topic 5: chaumont, buttes, villette, canal, ourcq, la, parc, et, du, des\n", | |
"Topic 6: batignolles, monceau, champs, elysees, des, ternes, montmartre, brochant, arc, triomphe\n", | |
"Topic 7: passy, trocadero, eiffel, tower, auteuil, boulogne, 16th, champs, elysees, garros\n", | |
"Topic 8: martin, canal, saint, gare, republique, du, st, nord, est, bars\n", | |
"Topic 9: montorgueil, sentier, paris, rue, louvre, halles, opera, les, reaumur, heart\n", | |
"Topic 10: alesia, montparnasse, denfert, daguerre, rochereau, montsouris, et, pernety, 14th, rue\n", | |
"Topic 11: marais, vosges, temple, des, metiers, rue, le, republique, du, place\n", | |
"Topic 12: et, vincennes, daumesnil, bastille, nation, bercy, la, lyon, metro, gare\n", | |
"Topic 13: cailles, italie, et, la, butte, mouffetard, gobelins, tolbiac, place, quartier\n", | |
"Topic 14: et, metro, la, pantin, une, minutes, hoche, room, avec, paris\n", | |
"Topic 15: montmartre, martyrs, pigalle, opera, moulin, rue, rouge, 9th, cadet, grands\n", | |
"Topic 16: gambetta, lachaise, pere, et, belleville, menilmontant, nation, une, metro, la\n", | |
"Topic 17: bastille, oberkampf, republique, lachaise, pere, marais, place, voltaire, bars, parmentier\n", | |
"Topic 18: eiffel, invalides, tower, cler, rue, mars, militaire, des, la, ecole\n", | |
"Topic 19: louvre, paris, honore, royal, tuileries, palais, rue, vendome, les, place\n", | |
"Topic 20: et, une, la, avec, est, le, paris, appartement, apartment, dans\n", | |
"Topic 21: marais, paris, saint, vosges, louis, room, notre, dame, des, paul\n", | |
"Topic 22: montmartre, et, sacre, du, flat, coeur, gare, barbes, dormoy, minutes\n", | |
"Topic 23: champs, elysees, monceau, avenue, paris, des, honore, saint, arc, madeleine\n", | |
"Topic 24: une, et, avec, lit, bed, double, chambre, appartement, walk, en\n", | |
"Topic 25: saint, germain, des, luxembourg, pres, odeon, du, sulpice, st, room\n", | |
"Topic 26: paris, apartment, minutes, une, la, vous, studio, bed, est, flat\n", | |
"Topic 27: bastille, lachaise, pere, nation, place, voltaire, 11th, marais, oberkampf, la\n", | |
"Topic 28: eiffel, tower, convention, 15th, motte, metro, tour, mars, 10, la\n", | |
"Topic 29: et, flat, la, une, bed, living, place, room, avec, machine\n" | |
] | |
} | |
], | |
"source": [ | |
"for topic_index in range( H.shape[0] ):\n", | |
" top_indices = np.argsort( H[topic_index,:] )[::-1][0:10]\n", | |
" term_ranking = [terms[i] for i in top_indices]\n", | |
" print(\"Topic %d: %s\" % ( topic_index, \", \".join( term_ranking ) ))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"As you can see, we need to do more cleaning: it is clear from the topics above that a number of French stop words (for example *et*, *la*, *une* and *avec*) are still present. \n", | |
"However, some of the topics are interesting: we see words like montmartre, bastille and eiffel included, which is what we would naively expect for Parisian locations. " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Text Analysis \n", | |
"We'll build a more sophisticated model using the new LDA functionality in scikit-learn." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Extracting tf features for LDA...\n", | |
"done in 6.209s.\n", | |
"Fitting LDA models with tf features, n_samples=20000 and n_features=1000...\n", | |
"done in 1.687s.\n", | |
"\n", | |
"Topics in LDA model:\n", | |
"Topic #0:\n", | |
"lachaise belleville père gambetta ménilmontant buttes chaumont nation jourdain cemetery oberkampf 20th pere rouge 26 république menilmontant tram populaire dumas\n", | |
"Topic #1:\n", | |
"montorgueil beaubourg luxembourg mouffetard arts tuileries pont république honoré pantheon vosges royal temple métiers prés madeleine martin paul projector 17th\n", | |
"Topic #2:\n", | |
"butte cailles italie madeleine rochereau rouge pigalle monceau luxembourg abbesses 16th gobelins mouffetard etoile trocadero tramway elysée printemps passy 13th\n", | |
"Topic #3:\n", | |
"pigalle rouge moulin martyrs 9th boulevards abbesses printemps georges garnier sacre anvers butte cadet department martin clichy blanche madeleine 18th\n", | |
"Topic #4:\n", | |
"chaumont buttes lachaise république oberkampf belleville père villette martin nation voltaire ourcq 11th republique parmentier vosges charonne pere ménilmontant musique\n", | |
"Topic #5:\n", | |
"vosges denfert rochereau louis paul 14th daguerre ile montsouris république lachaise luxembourg river pernety oberkampf alesia beaubourg island père picasso\n", | |
"Topic #6:\n", | |
"vosges arts picasso république beaubourg temple paul rochereau métiers denfert bretagne moulin louis martin republique 14th pernety 17th rouge 18th\n", | |
"Topic #7:\n", | |
"montorgueil sentier boulevards royal beaubourg buttes lachaise garnier belleville nouvelle chaumont louis denis père bourse printemps etienne vosges réaumur tuileries\n", | |
"Topic #8:\n", | |
"vosges temple arts picasso beaubourg république bretagne métiers rouges republique rambuteau paul georges 17th oldest 18th river poutres antique fashionable\n", | |
"Topic #9:\n", | |
"batignolles monceau ternes etoile clichy 17th brochant villiers pigalle maillot guy martin pereire rouge moulin rome levis 17ème poncelet wagram\n", | |
"Topic #10:\n", | |
"versailles mars 15th motte convention invalides tram picquet georges 15ème nation tramway champ expositions beaugrenelle montorgueil grenelle lachaise vaugirard père\n", | |
"Topic #11:\n", | |
"invalides lachaise oberkampf père mars république nation prés river vosges belleville cler martin rouge orsay chaumont moulin luxembourg buttes militaire\n", | |
"Topic #12:\n", | |
"trocadero passy trocadéro 16th boulogne victor auteuil hugo garros muette etoile roland 16 ranelagh 16ème river 22 52 élysées versailles\n", | |
"Topic #13:\n", | |
"lachaise père chaumont buttes belleville martin rouge ménilmontant nation république gambetta abbesses pigalle jourdain cemetery oberkampf joffrin 18th republique pere\n", | |
"Topic #14:\n", | |
"lachaise père luxembourg belleville prés buttes gambetta ménilmontant rochereau chaumont montsouris daguerre oberkampf denfert république nation alésia alesia jourdain cailles\n", | |
"Topic #15:\n", | |
"trocadéro passy 16th trocadero hugo boulogne victor auteuil luxembourg rochereau etoile 8th royal faubourg nation denfert monceau honoré tuileries 16ème\n", | |
"Topic #16:\n", | |
"trocadero passy 16th trocadéro victor boulogne hugo muette auteuil martin arts garros river etoile 16 vosges temple république rouge beaubourg\n", | |
"Topic #17:\n", | |
"tuileries royal batignolles prés honoré butte 17th cailles monceau river italie clichy pont luxembourg vendôme montorgueil arts orsay luxurious rivoli\n", | |
"Topic #18:\n", | |
"buttes chaumont villette belleville ourcq martin jourdain bassin crimée sciences stalingrad république ourq jaurès pyrénées 104 musique tramway butte 26\n", | |
"Topic #19:\n", | |
"montorgueil batignolles monceau rouge abbesses pigalle moulin butte clichy 18th 17th sentier louis etoile printemps joffrin river villiers luxembourg beaubourg\n", | |
"Topic #20:\n", | |
"buttes beaubourg batignolles pigalle chaumont martin rouge 17th denfert belleville villette montorgueil vosges temple georges ourcq bretagne republique arts picasso\n", | |
"Topic #21:\n", | |
"montorgueil royal tuileries sentier honoré beaubourg garnier boulevards marcel rivoli denis etienne vendôme bourse river strasbourg 18th nouvelle 17th réaumur\n", | |
"Topic #22:\n", | |
"luxembourg mouffetard sorbonne prés plantes panthéon pantheon river sulpice nation pres batignolles république monge louis ile denfert oberkampf vincennes monceau\n", | |
"Topic #23:\n", | |
"buttes chaumont pigalle villette moulin invalides belleville martin luxembourg abbesses jules prés lachaise rouge butte ourcq lamarck sacre mars père\n", | |
"Topic #24:\n", | |
"nation vincennes bercy daumesnil aligre reuilly 12th diderot verte coulée république promenade tramway ledru antoine rollin faubourg vosges tram lachaise\n", | |
"Topic #25:\n", | |
"passy trocadéro boulogne 16th trocadero lachaise auteuil pigalle hugo rouge abbesses père garros moulin roland etoile river ranelagh nation vosges\n", | |
"Topic #26:\n", | |
"luxembourg prés denfert rochereau daguerre 14th royal pres montorgueil tuileries odeon odéon sulpice montsouris alesia monceau alésia 17th pernety honoré\n", | |
"Topic #27:\n", | |
"luxembourg prés pres sorbonne odeon odéon sulpice river pantheon mouffetard panthéon arts flore 17th orsay pont louis trocadéro andré plantes\n", | |
"Topic #28:\n", | |
"louis paul ile vosges river island beaubourg 17th cathedral picasso pont marie 18th georges île rivoli rambuteau layout dream martin\n", | |
"Topic #29:\n", | |
"martin république republique 10th denis boulevards belleville goncourt strasbourg jacques bonsergent faubourg louis nouvelle fabien colonel montorgueil republic beaubourg eurostar\n", | |
"Topic #30:\n", | |
"batignolles ternes clichy martin 17th monceau etoile rouge pigalle moulin république maillot nation bercy guy brochant abbesses villiers vincennes rome\n", | |
"Topic #31:\n", | |
"rouge pigalle martin moulin lachaise mouffetard luxembourg république plantes boulevards oberkampf royal abbesses père sacre 18th pantheon panthéon joffrin martyrs\n", | |
"Topic #32:\n", | |
"mars 15th versailles motte grenelle convention invalides beaugrenelle buttes tramway river lachaise chaumont 15ème commercial piquet expositions trocadero brassens vaugirard\n", | |
"Topic #33:\n", | |
"denfert rochereau daguerre montsouris 14th alesia pernety alésia plaisance luxembourg tram tramway universitaire 38 versailles théâtres 62 prés t3 artists\n", | |
"Topic #34:\n", | |
"martin vincennes nation république daumesnil louis bercy 10th diderot reuilly republique aligre buttes 12th denis faubourg belleville boulevards chaumont verte\n", | |
"Topic #35:\n", | |
"prés luxembourg pres sulpice odéon odeon pont arts river flore orsay rennes 17th ile invalides midi 18th andré prestigious antique\n", | |
"Topic #36:\n", | |
"lachaise père oberkampf nation république belleville gambetta ménilmontant 11th cemetery voltaire pere charonne republique martin parmentier menilmontant vosges cimetière 20th\n", | |
"Topic #37:\n", | |
"paul louis vosges river ile trocadero 16th beaubourg montorgueil mars passy invalides nation trocadéro versailles 15th island 17th picasso boulogne\n", | |
"Topic #38:\n", | |
"lachaise oberkampf nation père république 11th charonne belleville martin republique voltaire parmentier pigalle rouge ménilmontant moulin abbesses cemetery goncourt 11ème\n", | |
"Topic #39:\n", | |
"monceau madeleine batignolles 8th etoile élysées montaigne honoré faubourg ternes printemps triangle clichy luxurious george villiers elysée prestigious palace tuileries\n", | |
"Topic #40:\n", | |
"montorgueil vosges royal père oberkampf abbesses pigalle 18th beaubourg lachaise louis république moulin nation rouge boulevards sentier paul martin jules\n", | |
"Topic #41:\n", | |
"lachaise père nation belleville gambetta versailles mars 15th buttes ménilmontant chaumont invalides tramway convention champ expositions tram cemetery motte picquet\n", | |
"Topic #42:\n", | |
"batignolles lachaise père oberkampf nation république martin etoile 11th monceau charonne ternes voltaire ménilmontant parmentier belleville republique chaumont royal 11ème\n", | |
"Topic #43:\n", | |
"martin pigalle abbesses moulin rouge république joffrin versailles sacre boulevards denfert butte jules mars belleville anvers chapelle republique 10th 15th\n", | |
"Topic #44:\n", | |
"pigalle rouge moulin buttes luxembourg abbesses mouffetard chaumont butte plantes 9th versailles italie panthéon vincennes martin prés pantheon nation garnier\n", | |
"Topic #45:\n", | |
"prés luxembourg 15th moulin batignolles abbesses rouge versailles pigalle jules tertre royal champ mars sacre beaugrenelle river 15ème printer convention\n", | |
"Topic #46:\n", | |
"invalides mars versailles 15th convention motte champ orsay grenelle beaugrenelle militaire river prés ecole expositions cler picquet tramway 15ème brassens\n", | |
"Topic #47:\n", | |
"luxembourg mouffetard panthéon pantheon plantes sorbonne italie cailles butte river monge gobelins cathedral louis prés ile royal port 13th austerlitz\n", | |
"Topic #48:\n", | |
"abbesses rouge moulin joffrin jules pigalle sacre butte 18th lamarck marcadet tertre caulaincourt chapelle clignancourt dormoy marx puces clichy simplon\n", | |
"Topic #49:\n", | |
"martin république republique denis invalides 10th boulevards versailles batignolles montorgueil goncourt beaubourg mars nation buttes bonsergent vosges belleville jacques chaumont\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"n_samples = 20000\n", | |
"n_features = 1000\n", | |
"n_topics = 50\n", | |
"n_top_words = 20\n", | |
"from time import time\n", | |
"def print_top_words(model, feature_names, n_top_words):\n", | |
" for topic_idx, topic in enumerate(model.components_):\n", | |
" print(\"Topic #%d:\" % topic_idx)\n", | |
" print(\" \".join([feature_names[i]\n", | |
" for i in topic.argsort()[:-n_top_words - 1:-1]]))\n", | |
" print()\n", | |
"\n", | |
"tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=5, #max_features=n_features,\n", | |
" stop_words='english')\n", | |
"# Use tf (raw term count) features for LDA.\n", | |
"print(\"Extracting tf features for LDA...\")\n", | |
"tf_vectorizer = CountVectorizer(max_df=0.80, min_df=2, max_features=n_features,\n", | |
" stop_words='english')\n", | |
"t0 = time()\n", | |
"tf = tf_vectorizer.fit_transform(documents)\n", | |
"print(\"done in %0.3fs.\" % (time() - t0))\n", | |
"\n", | |
"print(\"Fitting LDA models with tf features, n_samples=%d and n_features=%d...\"\n", | |
" % (n_samples, n_features))\n", | |
"lda = decomposition.LatentDirichletAllocation(n_topics=n_topics, max_iter=5,\n", | |
" learning_method='online', learning_offset=50.,\n", | |
" random_state=0)\n", | |
"t0 = time()\n", | |
"lda.fit(tf)\n", | |
"print(\"done in %0.3fs.\" % (time() - t0))\n", | |
"\n", | |
"print(\"\\nTopics in LDA model:\")\n", | |
"tf_feature_names = tf_vectorizer.get_feature_names()\n", | |
"print_top_words(lda, tf_feature_names, n_top_words)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment