Data Incubator fellowship - My Project data analysis
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import numpy as np\n",
"import nltk\n",
"from bs4 import BeautifulSoup\n",
"import re\n",
"import os\n",
"import codecs\n",
"from sklearn import feature_extraction\n",
"import mpld3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"files = ['tweets_enlace130517_.txt', 'tweets_enlace200517_.txt']\n",
"raw_tweets = []\n",
"bad_coded_tweets = 0\n",
"for file in files:\n",
" with open('twitter_data/%s' % file, 'r') as f:\n",
" tweets = f.readlines()\n",
" for tw in tweets:\n",
" try:\n",
" raw_tweets.append(json.loads(tw))\n",
" except Exception as e:\n",
" bad_coded_tweets += 1\n",
" f.close()\n",
"print(bad_coded_tweets)\n",
"len(raw_tweets)"
]
},
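{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, illustrative sanity check: peek at the top-level keys of one decoded tweet to confirm the fields used below (`id`, `user`, `text`) are present."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# inspect the structure of a single decoded tweet\n",
"sorted(raw_tweets[0].keys())"
]
},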
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tweet_text = [{'id': tw['id'], 'name': '@'+tw['user']['name'], 'text': tw['text']} for tw in raw_tweets]\n",
"len(tweet_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.io.json.json_normalize(tweet_text)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"ranks = range(df.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"stopwords = nltk.corpus.stopwords.words('spanish')\n",
"from nltk.stem.snowball import SnowballStemmer\n",
"stemmer = SnowballStemmer(\"spanish\")"
]
},
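{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative check of what the Spanish Snowball stemmer does to a few inflected forms (sample words chosen arbitrarily); a sketch for intuition, not needed by the downstream pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(len(stopwords), 'Spanish stopwords loaded')\n",
"[(w, stemmer.stem(w)) for w in ['corriendo', 'canciones', 'educación', 'educativo']]"
]
},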
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def tokenize_and_stem(text):\n",
" # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token\n",
" tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]\n",
" filtered_tokens = []\n",
" # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)\n",
" for token in tokens:\n",
" if re.search('[a-zA-Z]', token):\n",
" filtered_tokens.append(token)\n",
" stems = [stemmer.stem(t) for t in filtered_tokens]\n",
" return stems\n",
"\n",
"\n",
"def tokenize_only(text):\n",
" # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token\n",
" tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]\n",
" filtered_tokens = []\n",
" # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)\n",
" for token in tokens:\n",
" if re.search('[a-zA-Z]', token):\n",
" filtered_tokens.append(token)\n",
" return filtered_tokens"
]
},
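{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small usage example (with a made-up Spanish sentence) showing how `tokenize_only` and `tokenize_and_stem` line up token-for-token, which is what the stem-to-word lookup table below relies on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sample = 'Los estudiantes rindieron la prueba Enlace en mayo de 2017.'\n",
"list(zip(tokenize_only(sample), tokenize_and_stem(sample)))"
]
},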
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"totalvocab_stemmed = []\n",
"totalvocab_tokenized = []\n",
"for t in df['text'].values:\n",
" allwords_stemmed = tokenize_and_stem(t)\n",
" totalvocab_stemmed.extend(allwords_stemmed)\n",
" \n",
" allwords_tokenized = tokenize_only(t)\n",
" totalvocab_tokenized.extend(allwords_tokenized)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_vocab = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_vocab.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,\n",
" min_df=0.02, stop_words=stopwords,\n",
" use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))\n",
"\n",
"%time tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'].values)\n",
"\n",
"print(tfidf_matrix.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"terms = tfidf_vectorizer.get_feature_names()"
]
},
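{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative look at the terms the vectorizer kept, and the top TF-IDF-weighted terms of the first tweet; a quick check, not needed by the clustering step."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(len(terms), 'terms in the vocabulary')\n",
"# densify the first tweet's row and sort its weights, descending\n",
"row = tfidf_matrix[0].toarray().ravel()\n",
"top = row.argsort()[::-1][:10]\n",
"[(terms[i], round(row[i], 3)) for i in top if row[i] > 0]"
]
},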
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"dist = 1 - cosine_similarity(tfidf_matrix)"
]
},
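{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick property check on the distance matrix: it should be square, have a (near-)zero diagonal, and values roughly in [0, 1] since TF-IDF vectors are non-negative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(dist.shape)\n",
"print(np.abs(np.diag(dist)).max())  # ~0: each tweet is at distance 0 from itself\n",
"print(dist.min(), dist.max())"
]
},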
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"\n",
"num_clusters = 5\n",
"km = KMeans(n_clusters=num_clusters)\n",
"%time km.fit(tfidf_matrix)\n",
"clusters = km.labels_.tolist()"
]
},
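{
"cell_type": "markdown",
"metadata": {},
"source": [
"`num_clusters = 5` is a fixed choice. A quick inertia sweep over a few candidate values of k is one common way to sanity-check it (look for an 'elbow' where the decrease flattens out); this is an illustrative sketch only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# within-cluster sum of squares for a few values of k\n",
"for k in range(2, 9):\n",
"    model = KMeans(n_clusters=k, random_state=1).fit(tfidf_matrix)\n",
"    print(k, model.inertia_)"
]
},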
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"films = { 'id': df['id'].values, 'name': df['name'].values, 'rank': ranks, 'text': df['text'].values, 'cluster': clusters }\n",
"frame = pd.DataFrame(films, index = [clusters] , columns = ['rank', 'id', 'name', 'cluster'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"frame['cluster'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"grouped = frame['rank'].groupby(frame['cluster'])\n",
"grouped.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"\n",
"print(\"Top terms per cluster:\")\n",
"print()\n",
"order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n",
"for i in range(num_clusters):\n",
" print(\"Cluster %d words:\" % i, end='')\n",
" for ind in order_centroids[i, :6]:\n",
" print(' %s' % df_vocab.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')\n",
" print()\n",
" print()\n",
" print(\"Cluster %d names:\" % i, end='')\n",
" for name in frame.ix[i]['name'].values.tolist():\n",
" print(' %s,' % name, end='')\n",
" print()\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"frame['Rank'] = frame['rank'] + 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import matplotlib as mpl\n",
"\n",
"from sklearn.manifold import MDS\n",
"\n",
"MDS()\n",
"\n",
"# two components as we're plotting points in a two-dimensional plane\n",
"# \"precomputed\" because we provide a distance matrix\n",
"# we will also specify `random_state` so the plot is reproducible.\n",
"mds = MDS(n_components=2, dissimilarity=\"precomputed\", random_state=1)\n",
"\n",
"pos = mds.fit_transform(dist) # shape (n_components, n_samples)\n",
"\n",
"xs, ys = pos[:, 0], pos[:, 1]"
]
},
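{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal static preview of the MDS projection, colored by cluster; the interactive mpld3 version below is the real deliverable, this is just a quick visual check."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.figure(figsize=(8, 5))\n",
"plt.scatter(xs, ys, c=clusters, cmap='tab10', s=30)\n",
"plt.show()"
]
},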
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from nltk.tag import pos_tag\n",
"\n",
"def strip_proppers_POS(text):\n",
" tagged = pos_tag(text.split()) #use NLTK's part of speech tagger\n",
" non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']\n",
" return non_propernouns"
]
},
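{
"cell_type": "markdown",
"metadata": {},
"source": [
"`strip_proppers_POS` is not used downstream. A usage example with a made-up English sentence (the default tagger is English-trained, so proper-noun removal is only reliable for English input):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# requires nltk.download('averaged_perceptron_tagger')\n",
"strip_proppers_POS('Maria posted about the Enlace exam results on Twitter today')"
]
},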
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class TopToolbar(mpld3.plugins.PluginBase):\n",
" \"\"\"Plugin for moving toolbar to top of figure\"\"\"\n",
"\n",
" JAVASCRIPT = \"\"\"\n",
" mpld3.register_plugin(\"toptoolbar\", TopToolbar);\n",
" TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);\n",
" TopToolbar.prototype.constructor = TopToolbar;\n",
" function TopToolbar(fig, props){\n",
" mpld3.Plugin.call(this, fig, props);\n",
" };\n",
"\n",
" TopToolbar.prototype.draw = function(){\n",
" // the toolbar svg doesn't exist\n",
" // yet, so first draw it\n",
" this.fig.toolbar.draw();\n",
"\n",
" // then change the y position to be\n",
" // at the top of the figure\n",
" this.fig.toolbar.toolbar.attr(\"x\", 150);\n",
" this.fig.toolbar.toolbar.attr(\"y\", 400);\n",
"\n",
" // then remove the draw function,\n",
" // so that it is not called again\n",
" this.fig.toolbar.draw = function() {}\n",
" }\n",
" \"\"\"\n",
" def __init__(self):\n",
" self.dict_ = {\"type\": \"toptoolbar\"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) \n",
"\n",
"#group by cluster\n",
"groups = df.groupby('label')\n",
"\n",
"#define custom css to format the font and to remove the axis labeling\n",
"css = \"\"\"\n",
"text.mpld3-text, div.mpld3-tooltip {\n",
" font-family:Arial, Helvetica, sans-serif;\n",
"}\n",
"\n",
"g.mpld3-xaxis, g.mpld3-yaxis {\n",
"display: none; }\n",
"\"\"\"\n",
"\n",
"# Plot \n",
"fig, ax = plt.subplots(figsize=(14,6)) #set plot size\n",
"ax.margins(0.03) # Optional, just adds 5% padding to the autoscaling\n",
"\n",
"#iterate through groups to layer the plot\n",
"#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label\n",
"for name, group in groups:\n",
" points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, label=cluster_names[name], mec='none', color=cluster_colors[name])\n",
" ax.set_aspect('auto')\n",
" labels = [i for i in group.title]\n",
" \n",
" #set tooltip using points, labels and the already defined 'css'\n",
" tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,\n",
" voffset=10, hoffset=10, css=css)\n",
" #connect tooltip to fig\n",
" mpld3.plugins.connect(fig, tooltip, TopToolbar()) \n",
" \n",
" #set tick marks as blank\n",
" ax.axes.get_xaxis().set_ticks([])\n",
" ax.axes.get_yaxis().set_ticks([])\n",
" \n",
" #set axis as blank\n",
" ax.axes.get_xaxis().set_visible(False)\n",
" ax.axes.get_yaxis().set_visible(False)\n",
"\n",
" \n",
"ax.legend(numpoints=1) #show legend with only one dot\n",
"\n",
"mpld3.display() #show the plot\n",
"\n",
"#uncomment the below to export to html\n",
"#html = mpld3.fig_to_html(fig)\n",
"#print(html)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:NLP]",
"language": "python",
"name": "conda-env-NLP-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}