@lawlesst
Last active July 7, 2020 16:06
tdm-pilot.org gists
datasets/
.ipynb*
#!/bin/bash
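# getDataset: fetch a dataset built in the tdm-pilot.org corpus builder and
# save it locally as gzipped JSONL.
# Usage: bash getDataset <dataset-id> [local-name]
# The file is written to ./datasets/<local-name>.jsonl.gz; the local name
# defaults to the dataset id when no second argument is given.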
set -e
#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1
fname=$2
if [ -z "${fname}" ]; then
    fname=$1
fi
mkdir -p datasets
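# Ask the dataset service for this dataset's info and extract the signed,
# time-limited (Expires=...) download URL from the response.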
dl=`curl -s $service/nb/dataset/$1/info |\
    grep -o 'https://ithaka-labs.*Expires\=[0-9]*'`
dset="./datasets/$fname.jsonl.gz"
wget -q -L --show-progress \
    -O $dset \
    --user-agent "tdm notebooks" \
    $dl
export DATASET_FILE=$dset
echo "Your dataset $1 is stored in: $dset"
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Topic modeling journal runs\n",
"\n",
"An example notebook looking at articles from a single journal, in this case Library History and its variants. \n",
"\n",
"Process:\n",
"* build dataset in corpus builder\n",
"* download dataset to notebook environment\n",
"* use ngrams to build a topic model \n",
"* use the model to infer topics for each article\n",
"* track topic frequency over time\n",
"* plot the results\n",
"\n",
"The Python library gensim is used for LDA topic modeling.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"!bash getDataset 5c54351f-d2fa-749f-3efc-0477720bd176 library-history"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import gzip\n",
"from collections import Counter\n",
"from pprint import pprint\n",
"\n",
"import gensim\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from wordfreq import simple_tokenize"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)\n",
"\n",
"logging.getLogger('gensim.models').setLevel(logging.WARN)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_filename = \"datasets/library-history.jsonl.gz\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Processing\n",
"\n",
"Define functions to:\n",
" - process individual tokens\n",
" - process ngrams\n",
" - convert a TDM document to a gensim \"bag of words\"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process_token(token, strip_stopwords=True):\n",
" token = \" \".join(simple_tokenize(token))\n",
" if len(token) < 3:\n",
" return\n",
" return token"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert process_token(\"Title,\") == \"title\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process_ngram(token):\n",
" token = simple_tokenize(token)\n",
" return \"_\".join(token)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def doc_to_bow(raw):\n",
" bow_doc = []\n",
" ngrams = document.get(\"unigramCount\", {})\n",
" for gram, count in ngrams.items():\n",
" cg = process_token(gram)\n",
" if (cg is None) or len(cg) == 0:\n",
" continue\n",
" else:\n",
" #bow_doc += [cg] * count\n",
" bow_doc.append(cg)\n",
" for ngram, ngram_len in [(\"bigramCount\", 2), (\"trigramCount\", 3)]:\n",
" for gram, count in document.get(ngram, {}).items():\n",
" #if count > 1:\n",
" # continue\n",
" clean_gram = process_ngram(gram)\n",
" if (clean_gram is None) or len(clean_gram) == 0:\n",
" continue\n",
" #bow_doc += [clean_gram] * count \n",
" bow_doc.append(clean_gram)\n",
" if len(bow_doc) == 0:\n",
" return\n",
" return bow_doc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build the corpus\n",
"\n",
"Read each document in our dataset, process the ngrams, and convert to a list of documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Limit to n documents. Set to None to do all\n",
"limit = None\n",
"num_docs = 0\n",
"\n",
"documents = []\n",
"metadata = {}\n",
"\n",
"with gzip.open(dataset_filename, \"rb\") as inf:\n",
" for idx, row in enumerate(inf):\n",
" document = json.loads(row)\n",
" _id = document[\"id\"]\n",
" bd = doc_to_bow(document)\n",
" metadata[idx] = {\n",
" \"year\": document[\"publicationYear\"],\n",
" \"id\": _id\n",
" }\n",
" if bd is None:\n",
" print(_id)\n",
" continue\n",
" else:\n",
" documents.append(bd)\n",
" num_docs += 1\n",
" if (limit is not None) and (num_docs >= limit):\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(documents)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dictionary = gensim.corpora.Dictionary(documents)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Number of unique tokens: %d' % len(dictionary))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Remove terms that appear in less than 20 of and more than 50% of documents. \n",
"dictionary.filter_extremes(no_below=5, no_above=0.50)\n",
"print('Number of unique tokens: %d' % len(dictionary))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bow_corpus = [dictionary.doc2bow(doc) for doc in documents]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print('Number of documents: %d' % len(bow_corpus))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train the model\n",
"\n",
"Run our bow corpus through the LDA model and print the identified topics with the terms."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#logging.getLogger('gensim.models').setLevel(logging.ERROR)\n",
"\n",
"num_topics = 3\n",
"passes = 50\n",
"iterations = 700\n",
"eval_every = None\n",
"\n",
"# Train the LDA model.\n",
"model = gensim.models.LdaModel(\n",
" corpus=bow_corpus,\n",
" id2word=dictionary,\n",
" iterations=iterations,\n",
" num_topics=num_topics,\n",
" passes=passes,\n",
" eval_every=eval_every\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"for topic_num in range(0, num_topics):\n",
" word_ids = model.get_topic_terms(topic_num)\n",
" words = []\n",
" for wid, weight in word_ids:\n",
" word = dictionary.id2token[wid]\n",
" words.append(word)\n",
" print(\"Topic {}\".format(str(topic_num + 1).ljust(5)), \" \".join(words))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Track topic changes over time\n",
"\n",
"Run each document through the model to identify the topics per document per year"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"year_to_topic = {}\n",
"year_count = Counter()\n",
"rows = []\n",
"\n",
"for idx, meta in metadata.items():\n",
" year = meta[\"year\"]\n",
" cdoc = bow_corpus[idx]\n",
" topics = model.get_document_topics(cdoc)\n",
" for topic, score in topics:\n",
" cnt = year_to_topic.get(year, Counter())\n",
" cnt[topic] += 1\n",
" year_to_topic[year] = cnt\n",
" year_count[year] += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rows = []\n",
"for yr, cnt in year_to_topic.items():\n",
" for topic, count in cnt.items():\n",
" rows.append((yr, topic + 1, count))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(rows, columns=[\"year\", \"topic_num\", \"n\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def yearly_frequency(row):\n",
" return row[\"n\"] / year_count[row[\"year\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"tf\"] = df.apply(yearly_frequency, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"plt = sns.lmplot(\n",
" x=\"year\",\n",
" y=\"tf\", \n",
" data=df, \n",
" hue=\"topic_num\",\n",
" ci=None,\n",
" palette=sns.color_palette(\"muted\", n_colors=num_topics)\n",
");"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": null
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": true,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "211.188px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 2
}
jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq
#!/bin/bash
/opt/conda/bin/python3
version=0.1
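# Download NLTK data and install/enable the Jupyter notebook extensions used by these notebooks.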
python -m nltk.downloader stopwords wordnet
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams
exec "$@"