Skip to content

Instantly share code, notes, and snippets.

@xccds
Created April 18, 2015 01:52
Show Gist options
  • Save xccds/f2c870f08fe41e0fe164 to your computer and use it in GitHub Desktop.
Save xccds/f2c870f08fe41e0fe164 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{"nbformat_minor": 0, "cells": [
{"execution_count": 141, "cell_type": "code", "source": "# Preview the raw input file: one book title per line.\n!head -n 10 txtdm.txt", "outputs": [{"output_type": "stream", "name": "stdout", "text": "The Neatest Little Guide to Stock Market Investing\r\nInvesting For Dummies, 4th Edition\r\nThe Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns\r\nThe Little Book of Value Investing\r\nValue Investing: From Graham to Buffett and Beyond\r\nRich Dad's Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!\r\nInvesting in Real Estate, 5th Edition\r\nStock Investing For Dummies\r\nRich Dad's Advisors: The ABC's of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss\r\n"}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 142, "cell_type": "code", "source": "# Split every line of the file into whitespace-separated tokens.\n# The with-block closes the file handle once the list has been built.\nwith open('txtdm.txt') as title_file:\n    title_tokens = [line.split() for line in title_file]", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 143, "cell_type": "code", "source": "# Punctuation to strip from tokens, written as a regex alternation for re.sub.\nignore = \",|:|!|'\"\n# Lower-cased words removed from every title before vectorizing.\nstopwords = ['and','edition','for','in','little','of','the','to']", "outputs": [], "metadata": {"collapsed": true, "trusted": true}},
{"execution_count": 144, "cell_type": "code", "source": "import re", "outputs": [], "metadata": {"collapsed": true, "trusted": true}},
{"execution_count": 145, "cell_type": "code", "source": "# Lower-case each token and delete the punctuation matched by `ignore`.\n# Reads title_tokens and writes a new name, so re-running this cell is safe.\nclean_tokens = [[re.sub(ignore, '', w.lower()) for w in s] for s in title_tokens]", "outputs": [], "metadata": {"collapsed": true, "trusted": true}},
{"execution_count": 146, "cell_type": "code", "source": "# Drop the stopwords from every tokenized title.\nkept_tokens = [[w for w in s if w not in stopwords] for s in clean_tokens]", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 147, "cell_type": "code", "source": "# Re-assemble each title into a single space-separated string.\ntxt = [' '.join(s) for s in kept_tokens]", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 148, "cell_type": "code", "source": "txt", "outputs": [{"execution_count": 148, "output_type": "execute_result", "data": {"text/plain": "['neatest guide stock market investing',\n 'investing dummies 4th',\n 'book common sense investing only way guarantee your fair share stock market returns',\n 'book value investing',\n 'value investing from graham buffett beyond',\n 'rich dads guide investing what rich invest that poor middle class do not',\n 'investing real estate 5th',\n 'stock investing dummies',\n 'rich dads advisors abcs real estate investing secrets finding hidden profits most investors miss']"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 149, "cell_type": "code", "source": "from sklearn.feature_extraction.text import CountVectorizer", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 150, "cell_type": "code", "source": "# Build the document-term count matrix: one row per title, one column per word.\nmodel = CountVectorizer()\nxvec = model.fit_transform(txt)\nxvec", "outputs": [{"execution_count": 150, "output_type": "execute_result", "data": {"text/plain": "<9x44 sparse matrix of type '<type 'numpy.int64'>'\n\twith 63 stored elements in Compressed Sparse Row format>"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 151, "cell_type": "code", "source": "from sklearn.decomposition import NMF", "outputs": [], "metadata": {"collapsed": true, "trusted": true}},
{"execution_count": 152, "cell_type": "code", "source": "n_topics = 2", "outputs": [], "metadata": {"collapsed": true, "trusted": true}},
{"execution_count": 153, "cell_type": "code", "source": "# NOTE(review): `sparseness` was accepted by the scikit-learn release this\n# notebook ran on -- confirm it still exists in your installed version.\nnmf = NMF(n_components=n_topics,\n          sparseness='data', init='nndsvd', random_state=0)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 154, "cell_type": "code", "source": "# Fit the factorization; each output row holds one title's weight on each topic.\nnmf.fit_transform(xvec)", "outputs": [{"execution_count": 154, "output_type": "execute_result", "data": {"text/plain": "array([[ 0.06206478, 0.32759923],\n [ 0.07006666, 0.14309777],\n [-0. , 0.91268535],\n [ 0.04919433, 0.21403733],\n [ 0.06947109, 0.18135713],\n [ 0.80019019, -0. ],\n [ 0.1820973 , 0.08738539],\n [ 0.04344757, 0.23748179],\n [ 0.74776168, -0. ]])"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 155, "cell_type": "code", "source": "import numpy as np", "outputs": [], "metadata": {"collapsed": true, "trusted": true}},
{"execution_count": 167, "cell_type": "code", "source": "# Topic-by-word weight matrix, rounded to two decimals for display.\nnp.round(nmf.components_,2)", "outputs": [{"execution_count": 167, "output_type": "execute_result", "data": {"text/plain": "array([[ 0.05, 0.13, 0.55, 0.55, 0.04, 0. , 0.04, 0.59, 0. ,\n 1.15, 0.59, 0.07, 0.69, 0. , 0.55, 0.04, 0.04, 0. ,\n 0.62, 0.55, 0.59, 1.4 , 0.55, 0. , 0.59, 0.55, 0.55,\n 0.03, 0.59, 0. , 0.59, 0.55, 0.69, 0. , 1.74, 0.55,\n 0. , 0. , 0.01, 0.59, 0.07, 0. , 0.59, 0. ],\n [ 0.12, 0.06, 0. , 0. , 0.15, 0.94, 0.15, 0. , 0.76,\n 0. , 0. , 0.31, 0.03, 0.76, 0. , 0.15, 0.15, 0.76,\n 0.23, 0. , 0. , 1.66, 0. , 1.03, 0. , 0. , 0. ,\n 0.27, 0. , 0.76, 0. , 0. , 0.03, 0.76, 0. , 0. ,\n 0.76, 0.76, 1.23, 0. , 0.32, 0.76, 0. , 0.76]])"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 160, "cell_type": "code", "source": "# Vocabulary learned by the vectorizer, used below to label topic weights.\nfeature_names = model.get_feature_names()", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 165, "cell_type": "code", "source": "n_top_words = 5", "outputs": [], "metadata": {"collapsed": true, "trusted": true}},
{"execution_count": 166, "cell_type": "code", "source": "# argsort is ascending, so the reversed slice walks the n_top_words largest weights first.\nfor topic_idx, topic in enumerate(nmf.components_):\n    print(\"Topic #%d:\" % topic_idx)\n    print(\" \".join([feature_names[i]\n                    for i in topic.argsort()[:-n_top_words - 1:-1]]))\n    # A bare print() displays \"()\" on this Python 2 kernel; print an empty\n    # string instead to get a blank separator line on Python 2 and 3 alike.\n    print(\"\")", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Topic #0:\nrich investing dads estate real\n\nTopic #1:\ninvesting stock market book returns\n\n"}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": null, "cell_type": "code", "source": "", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}
], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment