Created
July 21, 2017 23:22
-
-
Save IngoKl/96f183bccabefa76e41d8172ab094c92 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Wikipedia/LDA Demonstration" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from nltk.stem.porter import PorterStemmer\n", | |
"from nltk.corpus import stopwords\n", | |
"from nltk.tokenize import RegexpTokenizer\n", | |
"from gensim import corpora, models\n", | |
"import wikipedia\n", | |
"import gensim" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Scraping Random Wikipedia Articles" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Five random article titles, with the fixed titles 'Car' and 'Bus' placed last.\n", | |
"wikipedia_random_articles = wikipedia.random(pages=5) + ['Car', 'Bus']\n", | |
"wikipedia_random_articles" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Download the plain-text content of every title as [title, content] pairs.\n", | |
"# auto_suggest=False requests the exact title instead of a search suggestion,\n", | |
"# so the fetched page always matches the title stored next to it.\n", | |
"# Titles that are ambiguous or missing are reported and skipped so that one\n", | |
"# bad random title does not abort the whole cell.\n", | |
"wikipedia_articles = []\n", | |
"for wikipedia_article in wikipedia_random_articles:\n", | |
"    try:\n", | |
"        page = wikipedia.page(wikipedia_article, auto_suggest=False)\n", | |
"    except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as error:\n", | |
"        print('Skipping {!r}: {}'.format(wikipedia_article, type(error).__name__))\n", | |
"        continue\n", | |
"    wikipedia_articles.append([wikipedia_article, page.content])\n", | |
"\n", | |
"wikipedia_articles" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Preparing the Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def clean(article):\n", | |
"    \"\"\"Tokenize, stop-word-filter and stem one article.\n", | |
"\n", | |
"    Args:\n", | |
"        article: a (title, text) pair.\n", | |
"\n", | |
"    Returns:\n", | |
"        A (title, tokens) tuple, where tokens are the lower-cased word tokens\n", | |
"        of text with English stop words removed and Porter stemming applied.\n", | |
"    \"\"\"\n", | |
"    title, document = article\n", | |
"    # Build the stop-word set and the stemmer once per call instead of once\n", | |
"    # per token; a set also makes each membership test constant-time.\n", | |
"    english_stopwords = set(stopwords.words('english'))\n", | |
"    stemmer = PorterStemmer()\n", | |
"    tokens = RegexpTokenizer(r'\\w+').tokenize(document.lower())\n", | |
"    tokens_clean = [token for token in tokens if token not in english_stopwords]\n", | |
"    tokens_stemmed = [stemmer.stem(token) for token in tokens_clean]\n", | |
"    return (title, tokens_stemmed)\n", | |
"\n", | |
"wikipedia_articles_clean = list(map(clean, wikipedia_articles))\n", | |
"wikipedia_articles_clean" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Document-Term Matrix and Vectorization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Keep only the token lists (second element of each cleaned pair) and build\n", | |
"# the gensim dictionary over all of them.\n", | |
"article_contents = [tokens for _title, tokens in wikipedia_articles_clean]\n", | |
"dictionary = corpora.Dictionary(article_contents)\n", | |
"type(dictionary)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## LDA Model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"NUM_TOPICS = 5\n", | |
"RANDOM_SEED = 42  # fixed seed so the fitted topics are repeatable for the same articles\n", | |
"\n", | |
"# Train on every article except the last one ('Bus'), which is held out.\n", | |
"training_docs = article_contents[:-1]\n", | |
"corpus = [dictionary.doc2bow(article) for article in training_docs]\n", | |
"lda_model = gensim.models.ldamodel.LdaModel(\n", | |
"    corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=100, random_state=RANDOM_SEED)\n", | |
"print(lda_model.print_topics(num_topics=NUM_TOPICS, num_words=3))\n", | |
"\n", | |
"# Topic distribution of the held-out 'Bus' article under the trained model.\n", | |
"print('\\nBus')\n", | |
"print(list(lda_model[[dictionary.doc2bow(article_contents[-1])]]))" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda env:python35]", | |
"language": "python", | |
"name": "conda-env-python35-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment