Data Incubator fellowship - My Project data analysis
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import numpy as np\n",
"import nltk\n",
"from bs4 import BeautifulSoup\n",
"import re\n",
"import os\n",
"import codecs\n",
"from sklearn import feature_extraction\n",
"import mpld3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"files = ['tweets_enlace130517_.txt', 'tweets_enlace200517_.txt']\n",
"raw_tweets = []\n",
"bad_coded_tweets = 0\n",
"for file in files:\n",
" with open('twitter_data/%s' % file, 'r') as f:\n",
" tweets = f.readlines()\n",
" for tw in tweets:\n",
" try:\n",
" raw_tweets.append(json.loads(tw))\n",
" except Exception as e:\n",
" bad_coded_tweets += 1\n",
" f.close()\n",
"print(bad_coded_tweets)\n",
"len(raw_tweets)"
]
},
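{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, illustrative sanity check: peek at the top-level keys of one decoded tweet to confirm the fields used below (`id`, `user`, `text`) are present."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# inspect the structure of a single decoded tweet\n",
"sorted(raw_tweets[0].keys())"
]
},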
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tweet_text = [{'id': tw['id'], 'name': '@'+tw['user']['name'], 'text': tw['text']} for tw in raw_tweets]\n",
"len(tweet_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.io.json.json_normalize(tweet_text)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"ranks = range(df.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"stopwords = nltk.corpus.stopwords.words('spanish')\n",
"from nltk.stem.snowball import SnowballStemmer\n",
"stemmer = SnowballStemmer(\"spanish\")"
]
},
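{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative check of what the Spanish Snowball stemmer does to a few inflected forms (sample words chosen arbitrarily); a sketch for intuition, not needed by the downstream pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(len(stopwords), 'Spanish stopwords loaded')\n",
"[(w, stemmer.stem(w)) for w in ['corriendo', 'canciones', 'educación', 'educativo']]"
]
},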
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def tokenize_and_stem(text):\n",
" # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token\n",
" tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]\n",
" filtered_tokens = []\n",
" # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)\n",
" for token in tokens:\n",
" if re.search('[a-zA-Z]', token):\n",
" filtered_tokens.append(token)\n",
" stems = [stemmer.stem(t) for t in filtered_tokens]\n",
" return stems\n",
"\n",
"\n",
"def tokenize_only(text):\n",
" # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token\n",
" tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]\n",
" filtered_tokens = []\n",
" # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)\n",
" for token in tokens:\n",
" if re.search('[a-zA-Z]', token):\n",
" filtered_tokens.append(token)\n",
" return filtered_tokens"
]
},
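{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small usage example (with a made-up Spanish sentence) showing how `tokenize_only` and `tokenize_and_stem` line up token-for-token, which is what the stem-to-word lookup table below relies on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sample = 'Los estudiantes rindieron la prueba Enlace en mayo de 2017.'\n",
"list(zip(tokenize_only(sample), tokenize_and_stem(sample)))"
]
},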
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"totalvocab_stemmed = []\n",
"totalvocab_tokenized = []\n",
"for t in df['text'].values:\n",
" allwords_stemmed = tokenize_and_stem(t)\n",
" totalvocab_stemmed.extend(allwords_stemmed)\n",
" \n",
" allwords_tokenized = tokenize_only(t)\n",
" totalvocab_tokenized.extend(allwords_tokenized)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_vocab = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_vocab.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,\n",
" min_df=0.02, stop_words=stopwords,\n",
" use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))\n",
"\n",
"%time tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'].values)\n",
"\n",
"print(tfidf_matrix.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"terms = tfidf_vectorizer.get_feature_names()"
]
},
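{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative look at the terms the vectorizer kept, and the top TF-IDF-weighted terms of the first tweet; a quick check, not needed by the clustering step."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(len(terms), 'terms in the vocabulary')\n",
"# densify the first tweet's row and sort its weights, descending\n",
"row = tfidf_matrix[0].toarray().ravel()\n",
"top = row.argsort()[::-1][:10]\n",
"[(terms[i], round(row[i], 3)) for i in top if row[i] > 0]"
]
},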
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"dist = 1 - cosine_similarity(tfidf_matrix)"
]
},
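{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick property check on the distance matrix: it should be square, have a (near-)zero diagonal, and values roughly in [0, 1] since TF-IDF vectors are non-negative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(dist.shape)\n",
"print(np.abs(np.diag(dist)).max())  # ~0: each tweet is at distance 0 from itself\n",
"print(dist.min(), dist.max())"
]
},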
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"\n",
"num_clusters = 5\n",
"km = KMeans(n_clusters=num_clusters)\n",
"%time km.fit(tfidf_matrix)\n",
"clusters = km.labels_.tolist()"
]
},
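{
"cell_type": "markdown",
"metadata": {},
"source": [
"`num_clusters = 5` is a fixed choice. A quick inertia sweep over a few candidate values of k is one common way to sanity-check it (look for an 'elbow' where the decrease flattens out); this is an illustrative sketch only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# within-cluster sum of squares for a few values of k\n",
"for k in range(2, 9):\n",
"    model = KMeans(n_clusters=k, random_state=1).fit(tfidf_matrix)\n",
"    print(k, model.inertia_)"
]
},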
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"films = { 'id': df['id'].values, 'name': df['name'].values, 'rank': ranks, 'text': df['text'].values, 'cluster': clusters }\n",
"frame = pd.DataFrame(films, index = [clusters] , columns = ['rank', 'id', 'name', 'cluster'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"frame['cluster'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"grouped = frame['rank'].groupby(frame['cluster'])\n",
"grouped.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"\n",
"print(\"Top terms per cluster:\")\n",
"print()\n",
"order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n",
"for i in range(num_clusters):\n",
" print(\"Cluster %d words:\" % i, end='')\n",
" for ind in order_centroids[i, :6]:\n",
" print(' %s' % df_vocab.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')\n",
" print()\n",
" print()\n",
" print(\"Cluster %d names:\" % i, end='')\n",
" for name in frame.ix[i]['name'].values.tolist():\n",
" print(' %s,' % name, end='')\n",
" print()\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"frame['Rank'] = frame['rank'] + 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import matplotlib as mpl\n",
"\n",
"from sklearn.manifold import MDS\n",
"\n",
"MDS()\n",
"\n",
"# two components as we're plotting points in a two-dimensional plane\n",
"# \"precomputed\" because we provide a distance matrix\n",
"# we will also specify `random_state` so the plot is reproducible.\n",
"mds = MDS(n_components=2, dissimilarity=\"precomputed\", random_state=1)\n",
"\n",
"pos = mds.fit_transform(dist) # shape (n_components, n_samples)\n",
"\n",
"xs, ys = pos[:, 0], pos[:, 1]"
]
},
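{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal static preview of the MDS projection, colored by cluster; the interactive mpld3 version below is the real deliverable, this is just a quick visual check."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.figure(figsize=(8, 5))\n",
"plt.scatter(xs, ys, c=clusters, cmap='tab10', s=30)\n",
"plt.show()"
]
},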
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from nltk.tag import pos_tag\n",
"\n",
"def strip_proppers_POS(text):\n",
" tagged = pos_tag(text.split()) #use NLTK's part of speech tagger\n",
" non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']\n",
" return non_propernouns"
]
},
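{
"cell_type": "markdown",
"metadata": {},
"source": [
"`strip_proppers_POS` is not used downstream. A usage example with a made-up English sentence (the default tagger is English-trained, so proper-noun removal is only reliable for English input):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# requires nltk.download('averaged_perceptron_tagger')\n",
"strip_proppers_POS('Maria posted about the Enlace exam results on Twitter today')"
]
},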
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class TopToolbar(mpld3.plugins.PluginBase):\n",
" \"\"\"Plugin for moving toolbar to top of figure\"\"\"\n",
"\n",
" JAVASCRIPT = \"\"\"\n",
" mpld3.register_plugin(\"toptoolbar\", TopToolbar);\n",
" TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);\n",
" TopToolbar.prototype.constructor = TopToolbar;\n",
" function TopToolbar(fig, props){\n",
" mpld3.Plugin.call(this, fig, props);\n",
" };\n",
"\n",
" TopToolbar.prototype.draw = function(){\n",
" // the toolbar svg doesn't exist\n",
" // yet, so first draw it\n",
" this.fig.toolbar.draw();\n",
"\n",
" // then change the y position to be\n",
" // at the top of the figure\n",
" this.fig.toolbar.toolbar.attr(\"x\", 150);\n",
" this.fig.toolbar.toolbar.attr(\"y\", 400);\n",
"\n",
" // then remove the draw function,\n",
" // so that it is not called again\n",
" this.fig.toolbar.draw = function() {}\n",
" }\n",
" \"\"\"\n",
" def __init__(self):\n",
" self.dict_ = {\"type\": \"toptoolbar\"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) \n",
"\n",
"#group by cluster\n",
"groups = df.groupby('label')\n",
"\n",
"#define custom css to format the font and to remove the axis labeling\n",
"css = \"\"\"\n",
"text.mpld3-text, div.mpld3-tooltip {\n",
" font-family:Arial, Helvetica, sans-serif;\n",
"}\n",
"\n",
"g.mpld3-xaxis, g.mpld3-yaxis {\n",
"display: none; }\n",
"\"\"\"\n",
"\n",
"# Plot \n",
"fig, ax = plt.subplots(figsize=(14,6)) #set plot size\n",
"ax.margins(0.03) # Optional, just adds 5% padding to the autoscaling\n",
"\n",
"#iterate through groups to layer the plot\n",
"#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label\n",
"for name, group in groups:\n",
" points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, label=cluster_names[name], mec='none', color=cluster_colors[name])\n",
" ax.set_aspect('auto')\n",
" labels = [i for i in group.title]\n",
" \n",
" #set tooltip using points, labels and the already defined 'css'\n",
" tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,\n",
" voffset=10, hoffset=10, css=css)\n",
" #connect tooltip to fig\n",
" mpld3.plugins.connect(fig, tooltip, TopToolbar()) \n",
" \n",
" #set tick marks as blank\n",
" ax.axes.get_xaxis().set_ticks([])\n",
" ax.axes.get_yaxis().set_ticks([])\n",
" \n",
" #set axis as blank\n",
" ax.axes.get_xaxis().set_visible(False)\n",
" ax.axes.get_yaxis().set_visible(False)\n",
"\n",
" \n",
"ax.legend(numpoints=1) #show legend with only one dot\n",
"\n",
"mpld3.display() #show the plot\n",
"\n",
"#uncomment the below to export to html\n",
"#html = mpld3.fig_to_html(fig)\n",
"#print(html)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:NLP]",
"language": "python",
"name": "conda-env-NLP-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}