{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.decomposition import NMF, LatentDirichletAllocation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def display_topics(model, feature_names, no_top_words):\n",
" for topic_idx, topic in enumerate(model.components_):\n",
" print (\"Topic %d:\" % (topic_idx))\n",
" print (\" \".join([feature_names[i]\n",
" for i in topic.argsort()[:-no_top_words - 1:-1]]))"
]
},
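{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Suggested illustration (not in the original notebook): the slice used in\n",
"# display_topics, argsort()[:-no_top_words - 1:-1], returns the indices of the\n",
"# largest topic weights in descending order. toy_topic is a made-up example.\n",
"import numpy as np\n",
"\n",
"toy_topic = np.array([0.1, 0.7, 0.3, 0.9])\n",
"print(toy_topic.argsort()[:-3 - 1:-1])  # -> [3 1 2], the three largest weights"
]
},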
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dataset = pd.read_csv('C:/Users/xuanyu/Desktop/XY Personal/NUS/Capstone/Raw data/Posts-General.csv')\n",
"documents = dataset.status_message"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# NMF is able to use tf-idf\n",
"tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english')\n",
"tfidf = tfidf_vectorizer.fit_transform(documents)\n",
"tfidf_feature_names = tfidf_vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# LDA can only use raw term counts for LDA because it is a probabilistic graphical model\n",
"tf_vectorizer = CountVectorizer(max_df=1.0, min_df=1, stop_words='english')\n",
"tf = tf_vectorizer.fit_transform(documents)\n",
"tf_feature_names = tf_vectorizer.get_feature_names()"
]
},
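{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Suggested illustration (toy sentences, not drawn from Posts-General.csv):\n",
"# CountVectorizer produces the non-negative integer counts that LDA expects,\n",
"# while TfidfVectorizer reweights those counts into floats, which suits NMF.\n",
"toy_docs = ['enjoy the movie', 'enjoy the plan', 'win the contest']\n",
"print(CountVectorizer().fit_transform(toy_docs).toarray())\n",
"print(TfidfVectorizer().fit_transform(toy_docs).toarray().round(2))"
]
},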
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"no_topics = 5"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Run NMF\n",
"nmf = NMF(n_components=no_topics, init='random', random_state=0).fit(tfidf)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Run LDA\n",
"lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)"
]
},
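{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional follow-up (not part of the original gist): transform() returns the\n",
"# per-document topic weights for each fitted model, which could be joined back\n",
"# onto the dataset to label individual posts.\n",
"nmf_doc_topics = nmf.transform(tfidf)\n",
"lda_doc_topics = lda.transform(tf)\n",
"print(nmf_doc_topics.shape, lda_doc_topics.shape)"
]
},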
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Topic 0:\n",
"ly bit http mobile dash broadband enjoy plan\n",
"Topic 1:\n",
"enjoy thursday cineplex handy road rewards movies operator\n",
"Topic 2:\n",
"day visit lobang king broadcast andrew hall booth\n",
"Topic 3:\n",
"win winners contest chance lucky song avengers stand\n",
"Topic 4:\n",
"changi airport counters customers shops dear prepaid terminal\n",
"Topic 0:\n",
"eagle eddie getaway joern10 tjoe3 fathima7 ann jennifer\n",
"Topic 1:\n",
"bit ly http visit win enjoy com simply\n",
"Topic 2:\n",
"rewards spot points tag joined com screening friends\n",
"Topic 3:\n",
"social impact live futuremakers future makers singapore make\n",
"Topic 4:\n",
"changi airport customers thank counters 12 service inconvenience\n"
]
}
],
"source": [
"no_top_words = 8\n",
"display_topics(nmf, tfidf_feature_names, no_top_words)\n",
"display_topics(lda, tf_feature_names, no_top_words)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}