Created
December 3, 2017 20:18
-
-
Save georgeodsc/b3454f71b69f47ca65ec36496cb80abc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Set max features to 6000\n", | |
"no_features = 6000\n", | |
"\n", | |
"#Initialize the tfidf vectorizer: skip terms found in more than 95% of the\n", | |
"#documents or in fewer than 4 documents, drop English stop words and keep\n", | |
"#at most no_features terms\n", | |
"tfidf_vectorizer = TfidfVectorizer(max_df=0.95,\n", | |
"                                   max_features=no_features,\n", | |
"                                   min_df=4, stop_words='english')\n", | |
"#Fit tfidf on corpus\n", | |
"tfidf = tfidf_vectorizer.fit_transform(docs)\n", | |
"#get_feature_names() was removed in scikit-learn 1.2; use its replacement\n", | |
"#get_feature_names_out() when it exists and fall back on older releases\n", | |
"if hasattr(tfidf_vectorizer, 'get_feature_names_out'):\n", | |
"    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n", | |
"else:\n", | |
"    tfidf_feature_names = tfidf_vectorizer.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Number of topics the factorization should extract\n", | |
"no_topics = 7\n", | |
"\n", | |
"# Configure NMF and fit it on the tfidf matrix in one expression\n", | |
"# NOTE(review): newer scikit-learn releases replaced `alpha` with\n", | |
"# `alpha_W` / `alpha_H` - confirm against the installed version\n", | |
"nmf = NMF(\n", | |
"    n_components=no_topics,\n", | |
"    init='nndsvd',\n", | |
"    alpha=0.1,\n", | |
"    l1_ratio=0.5,\n", | |
"    random_state=1,\n", | |
").fit(tfidf)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Helper that prints the highest-weighted words of every topic\n", | |
"def display_topics(model, feature_names, no_top_words):\n", | |
"    \"\"\"Print the top-weighted terms of each topic held by a fitted model.\n", | |
"\n", | |
"    model: object whose components_ attribute yields one weight vector\n", | |
"        per topic (each vector must support .argsort()).\n", | |
"    feature_names: sequence mapping a term index to the term itself.\n", | |
"    no_top_words: number of terms to print for each topic.\n", | |
"    \"\"\"\n", | |
"    for topic_number, weights in enumerate(model.components_, start=1):\n", | |
"        print(\"Topic %d:\" % topic_number)\n", | |
"        # argsort() is ascending, so reverse it and keep the leading indices\n", | |
"        ranked = weights.argsort()[::-1][:no_top_words]\n", | |
"        top_terms = [feature_names[term_idx] for term_idx in ranked]\n", | |
"        print(\" \".join(top_terms))\n", | |
"        # print() appends its own newline, so this leaves two blank lines\n", | |
"        print(\"\\n\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Topic 1:\n", | |
"ai intelligence artificial human machines marketing humans tasks business startups\n", | |
"\n", | |
"\n", | |
"Topic 2:\n", | |
"data science analytics big business scientists spark scientist skills hadoop\n", | |
"\n", | |
"\n", | |
"Topic 3:\n", | |
"learning machine deep algorithms ml learn systems computer google data\n", | |
"\n", | |
"\n", | |
"Topic 4:\n", | |
"new said technology like company people google says world companies\n", | |
"\n", | |
"\n", | |
"Topic 5:\n", | |
"iot devices internet things connected security smart sensors samsung home\n", | |
"\n", | |
"\n", | |
"Topic 6:\n", | |
"model neural regression data function network class training classification set\n", | |
"\n", | |
"\n", | |
"Topic 7:\n", | |
"customer chatbots chatbot customers bot bots marketing service messaging banks\n", | |
"\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"#Print the 10 highest-weighted words of each topic found by NMF\n", | |
"no_top_words = 10\n", | |
"display_topics(model=nmf,\n", | |
"               feature_names=tfidf_feature_names,\n", | |
"               no_top_words=no_top_words)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment