Skip to content

Instantly share code, notes, and snippets.

@springcoil
Created December 1, 2015 12:26
Show Gist options
  • Save springcoil/282ab1242ac0ec86734f to your computer and use it in GitHub Desktop.
Save springcoil/282ab1242ac0ec86734f to your computer and use it in GitHub Desktop.
Using data from the Paris version of Airbnb I hacked together a little Topic Model in Scikitlearn
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"TEXT_DIR = os.path.join(os.getcwd(), 'text/')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os, os.path, codecs\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn import decomposition\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dir_data = \"text\"\n",
"file_paths = [os.path.join(dir_data, fname) for fname in os.listdir(dir_data) if fname.endswith(\".txt\") ]\n",
"documents = [codecs.open(file_path, 'r', encoding=\"utf8\", errors='ignore').read() for file_path in file_paths ]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tfidf = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, lowercase=True, strip_accents=\"unicode\", use_idf=True, norm=\"l2\", min_df = 5) \n",
"A = tfidf.fit_transform(documents)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<119x13235 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 431707 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"A"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"num_terms = len(tfidf.vocabulary_)\n",
"terms = [\"\"] * num_terms\n",
"for term in tfidf.vocabulary_.keys():\n",
" terms[ tfidf.vocabulary_[term] ] = term"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = decomposition.NMF(init=\"nndsvd\", n_components=30, max_iter=200)\n",
"W = model.fit_transform(A)\n",
"H = model.components_"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Topic 0: et, room, paris, apartment, la, bed, le, kitchen, living, metro\n",
"Topic 1: apartment, paris, bed, floor, kitchen, located, room, bedroom, et, bathroom\n",
"Topic 2: paris, apartment, une, room, et, avec, kitchen, est, chambre, la\n",
"Topic 3: montmartre, sacre, coeur, abbesses, joffrin, jules, du, 12, rouge, moulin\n",
"Topic 4: mouffetard, pantheon, luxembourg, latin, notre, dame, sorbonne, paris, rue, monge\n",
"Topic 5: chaumont, buttes, villette, canal, ourcq, la, parc, et, du, des\n",
"Topic 6: batignolles, monceau, champs, elysees, des, ternes, montmartre, brochant, arc, triomphe\n",
"Topic 7: passy, trocadero, eiffel, tower, auteuil, boulogne, 16th, champs, elysees, garros\n",
"Topic 8: martin, canal, saint, gare, republique, du, st, nord, est, bars\n",
"Topic 9: montorgueil, sentier, paris, rue, louvre, halles, opera, les, reaumur, heart\n",
"Topic 10: alesia, montparnasse, denfert, daguerre, rochereau, montsouris, et, pernety, 14th, rue\n",
"Topic 11: marais, vosges, temple, des, metiers, rue, le, republique, du, place\n",
"Topic 12: et, vincennes, daumesnil, bastille, nation, bercy, la, lyon, metro, gare\n",
"Topic 13: cailles, italie, et, la, butte, mouffetard, gobelins, tolbiac, place, quartier\n",
"Topic 14: et, metro, la, pantin, une, minutes, hoche, room, avec, paris\n",
"Topic 15: montmartre, martyrs, pigalle, opera, moulin, rue, rouge, 9th, cadet, grands\n",
"Topic 16: gambetta, lachaise, pere, et, belleville, menilmontant, nation, une, metro, la\n",
"Topic 17: bastille, oberkampf, republique, lachaise, pere, marais, place, voltaire, bars, parmentier\n",
"Topic 18: eiffel, invalides, tower, cler, rue, mars, militaire, des, la, ecole\n",
"Topic 19: louvre, paris, honore, royal, tuileries, palais, rue, vendome, les, place\n",
"Topic 20: et, une, la, avec, est, le, paris, appartement, apartment, dans\n",
"Topic 21: marais, paris, saint, vosges, louis, room, notre, dame, des, paul\n",
"Topic 22: montmartre, et, sacre, du, flat, coeur, gare, barbes, dormoy, minutes\n",
"Topic 23: champs, elysees, monceau, avenue, paris, des, honore, saint, arc, madeleine\n",
"Topic 24: une, et, avec, lit, bed, double, chambre, appartement, walk, en\n",
"Topic 25: saint, germain, des, luxembourg, pres, odeon, du, sulpice, st, room\n",
"Topic 26: paris, apartment, minutes, une, la, vous, studio, bed, est, flat\n",
"Topic 27: bastille, lachaise, pere, nation, place, voltaire, 11th, marais, oberkampf, la\n",
"Topic 28: eiffel, tower, convention, 15th, motte, metro, tour, mars, 10, la\n",
"Topic 29: et, flat, la, une, bed, living, place, room, avec, machine\n"
]
}
],
"source": [
"for topic_index in range( H.shape[0] ):\n",
" top_indices = np.argsort( H[topic_index,:] )[::-1][0:10]\n",
" term_ranking = [terms[i] for i in top_indices]\n",
" print(\"Topic %d: %s\" % ( topic_index, \", \".join( term_ranking ) ))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As you can see, we need to do more cleaning: the topics above still contain a number of French stop words (et, la, le, une, avec, ...), because only English stop words were removed. \n",
"However, some of the topics are interesting: words like montmartre, bastille and eiffel all appear, which is what we would naively expect for Parisian locations. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Text Analysis \n",
"We'll build a more sophisticated model using the new LDA functionality in Scikit-Learn."
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting tf features for LDA...\n",
"done in 6.209s.\n",
"Fitting LDA models with tf features, n_samples=20000 and n_features=1000...\n",
"done in 1.687s.\n",
"\n",
"Topics in LDA model:\n",
"Topic #0:\n",
"lachaise belleville père gambetta ménilmontant buttes chaumont nation jourdain cemetery oberkampf 20th pere rouge 26 république menilmontant tram populaire dumas\n",
"Topic #1:\n",
"montorgueil beaubourg luxembourg mouffetard arts tuileries pont république honoré pantheon vosges royal temple métiers prés madeleine martin paul projector 17th\n",
"Topic #2:\n",
"butte cailles italie madeleine rochereau rouge pigalle monceau luxembourg abbesses 16th gobelins mouffetard etoile trocadero tramway elysée printemps passy 13th\n",
"Topic #3:\n",
"pigalle rouge moulin martyrs 9th boulevards abbesses printemps georges garnier sacre anvers butte cadet department martin clichy blanche madeleine 18th\n",
"Topic #4:\n",
"chaumont buttes lachaise république oberkampf belleville père villette martin nation voltaire ourcq 11th republique parmentier vosges charonne pere ménilmontant musique\n",
"Topic #5:\n",
"vosges denfert rochereau louis paul 14th daguerre ile montsouris république lachaise luxembourg river pernety oberkampf alesia beaubourg island père picasso\n",
"Topic #6:\n",
"vosges arts picasso république beaubourg temple paul rochereau métiers denfert bretagne moulin louis martin republique 14th pernety 17th rouge 18th\n",
"Topic #7:\n",
"montorgueil sentier boulevards royal beaubourg buttes lachaise garnier belleville nouvelle chaumont louis denis père bourse printemps etienne vosges réaumur tuileries\n",
"Topic #8:\n",
"vosges temple arts picasso beaubourg république bretagne métiers rouges republique rambuteau paul georges 17th oldest 18th river poutres antique fashionable\n",
"Topic #9:\n",
"batignolles monceau ternes etoile clichy 17th brochant villiers pigalle maillot guy martin pereire rouge moulin rome levis 17ème poncelet wagram\n",
"Topic #10:\n",
"versailles mars 15th motte convention invalides tram picquet georges 15ème nation tramway champ expositions beaugrenelle montorgueil grenelle lachaise vaugirard père\n",
"Topic #11:\n",
"invalides lachaise oberkampf père mars république nation prés river vosges belleville cler martin rouge orsay chaumont moulin luxembourg buttes militaire\n",
"Topic #12:\n",
"trocadero passy trocadéro 16th boulogne victor auteuil hugo garros muette etoile roland 16 ranelagh 16ème river 22 52 élysées versailles\n",
"Topic #13:\n",
"lachaise père chaumont buttes belleville martin rouge ménilmontant nation république gambetta abbesses pigalle jourdain cemetery oberkampf joffrin 18th republique pere\n",
"Topic #14:\n",
"lachaise père luxembourg belleville prés buttes gambetta ménilmontant rochereau chaumont montsouris daguerre oberkampf denfert république nation alésia alesia jourdain cailles\n",
"Topic #15:\n",
"trocadéro passy 16th trocadero hugo boulogne victor auteuil luxembourg rochereau etoile 8th royal faubourg nation denfert monceau honoré tuileries 16ème\n",
"Topic #16:\n",
"trocadero passy 16th trocadéro victor boulogne hugo muette auteuil martin arts garros river etoile 16 vosges temple république rouge beaubourg\n",
"Topic #17:\n",
"tuileries royal batignolles prés honoré butte 17th cailles monceau river italie clichy pont luxembourg vendôme montorgueil arts orsay luxurious rivoli\n",
"Topic #18:\n",
"buttes chaumont villette belleville ourcq martin jourdain bassin crimée sciences stalingrad république ourq jaurès pyrénées 104 musique tramway butte 26\n",
"Topic #19:\n",
"montorgueil batignolles monceau rouge abbesses pigalle moulin butte clichy 18th 17th sentier louis etoile printemps joffrin river villiers luxembourg beaubourg\n",
"Topic #20:\n",
"buttes beaubourg batignolles pigalle chaumont martin rouge 17th denfert belleville villette montorgueil vosges temple georges ourcq bretagne republique arts picasso\n",
"Topic #21:\n",
"montorgueil royal tuileries sentier honoré beaubourg garnier boulevards marcel rivoli denis etienne vendôme bourse river strasbourg 18th nouvelle 17th réaumur\n",
"Topic #22:\n",
"luxembourg mouffetard sorbonne prés plantes panthéon pantheon river sulpice nation pres batignolles république monge louis ile denfert oberkampf vincennes monceau\n",
"Topic #23:\n",
"buttes chaumont pigalle villette moulin invalides belleville martin luxembourg abbesses jules prés lachaise rouge butte ourcq lamarck sacre mars père\n",
"Topic #24:\n",
"nation vincennes bercy daumesnil aligre reuilly 12th diderot verte coulée république promenade tramway ledru antoine rollin faubourg vosges tram lachaise\n",
"Topic #25:\n",
"passy trocadéro boulogne 16th trocadero lachaise auteuil pigalle hugo rouge abbesses père garros moulin roland etoile river ranelagh nation vosges\n",
"Topic #26:\n",
"luxembourg prés denfert rochereau daguerre 14th royal pres montorgueil tuileries odeon odéon sulpice montsouris alesia monceau alésia 17th pernety honoré\n",
"Topic #27:\n",
"luxembourg prés pres sorbonne odeon odéon sulpice river pantheon mouffetard panthéon arts flore 17th orsay pont louis trocadéro andré plantes\n",
"Topic #28:\n",
"louis paul ile vosges river island beaubourg 17th cathedral picasso pont marie 18th georges île rivoli rambuteau layout dream martin\n",
"Topic #29:\n",
"martin république republique 10th denis boulevards belleville goncourt strasbourg jacques bonsergent faubourg louis nouvelle fabien colonel montorgueil republic beaubourg eurostar\n",
"Topic #30:\n",
"batignolles ternes clichy martin 17th monceau etoile rouge pigalle moulin république maillot nation bercy guy brochant abbesses villiers vincennes rome\n",
"Topic #31:\n",
"rouge pigalle martin moulin lachaise mouffetard luxembourg république plantes boulevards oberkampf royal abbesses père sacre 18th pantheon panthéon joffrin martyrs\n",
"Topic #32:\n",
"mars 15th versailles motte grenelle convention invalides beaugrenelle buttes tramway river lachaise chaumont 15ème commercial piquet expositions trocadero brassens vaugirard\n",
"Topic #33:\n",
"denfert rochereau daguerre montsouris 14th alesia pernety alésia plaisance luxembourg tram tramway universitaire 38 versailles théâtres 62 prés t3 artists\n",
"Topic #34:\n",
"martin vincennes nation république daumesnil louis bercy 10th diderot reuilly republique aligre buttes 12th denis faubourg belleville boulevards chaumont verte\n",
"Topic #35:\n",
"prés luxembourg pres sulpice odéon odeon pont arts river flore orsay rennes 17th ile invalides midi 18th andré prestigious antique\n",
"Topic #36:\n",
"lachaise père oberkampf nation république belleville gambetta ménilmontant 11th cemetery voltaire pere charonne republique martin parmentier menilmontant vosges cimetière 20th\n",
"Topic #37:\n",
"paul louis vosges river ile trocadero 16th beaubourg montorgueil mars passy invalides nation trocadéro versailles 15th island 17th picasso boulogne\n",
"Topic #38:\n",
"lachaise oberkampf nation père république 11th charonne belleville martin republique voltaire parmentier pigalle rouge ménilmontant moulin abbesses cemetery goncourt 11ème\n",
"Topic #39:\n",
"monceau madeleine batignolles 8th etoile élysées montaigne honoré faubourg ternes printemps triangle clichy luxurious george villiers elysée prestigious palace tuileries\n",
"Topic #40:\n",
"montorgueil vosges royal père oberkampf abbesses pigalle 18th beaubourg lachaise louis république moulin nation rouge boulevards sentier paul martin jules\n",
"Topic #41:\n",
"lachaise père nation belleville gambetta versailles mars 15th buttes ménilmontant chaumont invalides tramway convention champ expositions tram cemetery motte picquet\n",
"Topic #42:\n",
"batignolles lachaise père oberkampf nation république martin etoile 11th monceau charonne ternes voltaire ménilmontant parmentier belleville republique chaumont royal 11ème\n",
"Topic #43:\n",
"martin pigalle abbesses moulin rouge république joffrin versailles sacre boulevards denfert butte jules mars belleville anvers chapelle republique 10th 15th\n",
"Topic #44:\n",
"pigalle rouge moulin buttes luxembourg abbesses mouffetard chaumont butte plantes 9th versailles italie panthéon vincennes martin prés pantheon nation garnier\n",
"Topic #45:\n",
"prés luxembourg 15th moulin batignolles abbesses rouge versailles pigalle jules tertre royal champ mars sacre beaugrenelle river 15ème printer convention\n",
"Topic #46:\n",
"invalides mars versailles 15th convention motte champ orsay grenelle beaugrenelle militaire river prés ecole expositions cler picquet tramway 15ème brassens\n",
"Topic #47:\n",
"luxembourg mouffetard panthéon pantheon plantes sorbonne italie cailles butte river monge gobelins cathedral louis prés ile royal port 13th austerlitz\n",
"Topic #48:\n",
"abbesses rouge moulin joffrin jules pigalle sacre butte 18th lamarck marcadet tertre caulaincourt chapelle clignancourt dormoy marx puces clichy simplon\n",
"Topic #49:\n",
"martin république republique denis invalides 10th boulevards versailles batignolles montorgueil goncourt beaubourg mars nation buttes bonsergent vosges belleville jacques chaumont\n",
"\n"
]
}
],
"source": [
"n_samples = 20000\n",
"n_features = 1000\n",
"n_topics = 50\n",
"n_top_words = 20\n",
"from time import time\n",
"def print_top_words(model, feature_names, n_top_words):\n",
" for topic_idx, topic in enumerate(model.components_):\n",
" print(\"Topic #%d:\" % topic_idx)\n",
" print(\" \".join([feature_names[i]\n",
" for i in topic.argsort()[:-n_top_words - 1:-1]]))\n",
" print()\n",
"\n",
"tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=5, #max_features=n_features,\n",
" stop_words='english')\n",
"# Use tf (raw term count) features for LDA.\n",
"print(\"Extracting tf features for LDA...\")\n",
"tf_vectorizer = CountVectorizer(max_df=0.80, min_df=2, max_features=n_features,\n",
" stop_words='english')\n",
"t0 = time()\n",
"tf = tf_vectorizer.fit_transform(documents)\n",
"print(\"done in %0.3fs.\" % (time() - t0))\n",
"\n",
"print(\"Fitting LDA models with tf features, n_samples=%d and n_features=%d...\"\n",
" % (n_samples, n_features))\n",
"lda = decomposition.LatentDirichletAllocation(n_topics=n_topics, max_iter=5,\n",
" learning_method='online', learning_offset=50.,\n",
" random_state=0)\n",
"t0 = time()\n",
"lda.fit(tf)\n",
"print(\"done in %0.3fs.\" % (time() - t0))\n",
"\n",
"print(\"\\nTopics in LDA model:\")\n",
"tf_feature_names = tf_vectorizer.get_feature_names()\n",
"print_top_words(lda, tf_feature_names, n_top_words)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment