lawlesst/.gitignore

## .gitignore
datasets/
.ipynb*

## 1-topic-modeling.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              1-topic-modeling.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## download-dataset.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              download-dataset.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## filtering-a-dataset.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import gzip\n",
    "import random\n",
    "from pprint import pprint\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading the dataset ...\n",
      "Adding http://www.jstor.org/stable/10.1086/491498 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/432295 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/379413 to sample\n",
      "Adding http://www.jstor.org/stable/228664 to sample\n",
      "Adding http://www.jstor.org/stable/236768 to sample\n",
      "Adding http://www.jstor.org/stable/227706 to sample\n",
      "Adding http://www.jstor.org/stable/231357 to sample\n",
      "Adding http://www.jstor.org/stable/3080697 to sample\n",
      "Adding http://www.jstor.org/stable/229231 to sample\n",
      "Adding http://www.jstor.org/stable/230556 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/670902 to sample\n",
      "Adding http://www.jstor.org/stable/228263 to sample\n",
      "Adding http://www.jstor.org/stable/229843 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/678012 to sample\n",
      "Adding http://www.jstor.org/stable/230061 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/376025 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/653929 to sample\n",
      "Adding http://www.jstor.org/stable/226119 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/491505 to sample\n",
      "Adding http://www.jstor.org/stable/235887 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/682793 to sample\n",
      "Adding http://www.jstor.org/stable/227572 to sample\n",
      "Adding http://www.jstor.org/stable/10.1086/386402 to sample\n",
      "Adding http://www.jstor.org/stable/223695 to sample\n",
      "Adding http://www.jstor.org/stable/235969 to sample\n",
      "Dataset reading complete. 25 total documents.\n"
     ]
    }
   ],
   "source": [
    "sample_doc_numbers = random.sample(range(0, 19000), 25)\n",
    "sample_docs = []\n",
    "\n",
    "print(\"Reading the dataset ...\")\n",
    "\n",
    "with gzip.open(\"./datasets/dset1.jsonl.gz\", \"rb\") as inf:\n",
    "    for row_num, row in enumerate(inf):\n",
    "        doc = json.loads(row)\n",
    "        if row_num not in sample_doc_numbers:\n",
    "            continue\n",
    "        print(f\"Adding {doc['id']} to sample\")\n",
    "        sample_docs.append(doc)\n",
    "\n",
    "print(f\"Dataset reading complete. {len(sample_docs)} total documents.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc1 = sample_docs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "to_delete = [\"unigramCount\", \"bigramCount\", \"trigramCount\", \"fullText\"]\n",
    "for k in to_delete:\n",
    "    if k in doc1.keys():\n",
    "        del doc1[k]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'creator': ['Joan‐Pau Rubiés'],\n",
      " 'datePublished': '2005-06-01',\n",
      " 'docType': 'article',\n",
      " 'id': 'http://www.jstor.org/stable/10.1086/491498',\n",
      " 'identifier': [{'name': 'issn', 'value': '00211753'},\n",
      "                {'name': 'oclc', 'value': '49976319'},\n",
      "                {'name': 'lccn', 'value': '2002-227035'},\n",
      "                {'name': 'local_uuid',\n",
      "                 'value': 'd22c16bb-d068-3bdf-9962-8d0db608891e'},\n",
      "                {'name': 'local_doi', 'value': '10.1086/491498'},\n",
      "                {'name': 'journal_id', 'value': 'isis'}],\n",
      " 'isPartOf': 'Isis',\n",
      " 'issueNumber': '2',\n",
      " 'language': ['eng'],\n",
      " 'outputFormat': ['unigram', 'bigram', 'trigram'],\n",
      " 'pageCount': 2,\n",
      " 'pageEnd': '276',\n",
      " 'pageStart': '275',\n",
      " 'pagination': 'pp. 275-276',\n",
      " 'provider': 'jstor',\n",
      " 'publicationYear': 2005,\n",
      " 'publisher': 'The University of Chicago Press',\n",
      " 'sourceCategory': ['History of Science & Technology', 'History'],\n",
      " 'title': 'Review Article',\n",
      " 'url': 'http://www.jstor.org/stable/10.1086/491498',\n",
      " 'volumeNumber': '96',\n",
      " 'wordCount': 1051}\n"
     ]
    }
   ],
   "source": [
    "pprint(doc1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "fields_to_keep = [\n",
    "    \"id\",\n",
    "    \"title\",\n",
    "    \"isPartOf\",\n",
    "    \"publicationYear\",\n",
    "    \"creator\",\n",
    "    \"wordCount\",\n",
    "    \"provider\",\n",
    "    \"url\"\n",
    "]\n",
    "filtered_sample_docs = []\n",
    "for doc in sample_docs:\n",
    "    new_doc = {}\n",
    "    for f in fields_to_keep:\n",
    "        value = doc.get(f)\n",
    "        new_doc[f] = value\n",
    "    filtered_sample_docs.append(new_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'creator': ['Joan‐Pau Rubiés'],\n",
      " 'id': 'http://www.jstor.org/stable/10.1086/491498',\n",
      " 'isPartOf': 'Isis',\n",
      " 'provider': 'jstor',\n",
      " 'publicationYear': 2005,\n",
      " 'title': 'Review Article',\n",
      " 'url': 'http://www.jstor.org/stable/10.1086/491498',\n",
      " 'wordCount': 1051}\n"
     ]
    }
   ],
   "source": [
    "pprint(filtered_sample_docs[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"datasets/filtered_dset1.json\", \"w\") as of:\n",
    "    json.dump(filtered_sample_docs, of)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## getDataset
#!/bin/bash
#
# getDataset - download a dataset archive into ./datasets.
#
# Usage: getDataset DATASET_ID [FILENAME]
#   DATASET_ID  id substituted into the dataset info endpoint URL
#   FILENAME    basename for the saved file (defaults to DATASET_ID)
#
# The archive is written to ./datasets/FILENAME.jsonl.gz. DATASET_FILE is
# exported with that path; note that an export only reaches this process and
# its children, so the caller sees it only if this file is sourced.
#
# Exits non-zero if no dataset id is given, if no download URL can be found
# in the info response, or if the download itself fails.
set -e

#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1

dataset_id=${1:-}
if [ -z "${dataset_id}" ]; then
    echo "Usage: $0 DATASET_ID [FILENAME]" >&2
    exit 1
fi

# Fall back to the dataset id when no explicit file name was supplied.
fname=${2:-}
if [ -z "${fname}" ]; then
    fname=${dataset_id}
fi
mkdir -p datasets

# Pull the download link out of the info response. The `|| dl=""` keeps
# `set -e` from killing the script silently when grep finds nothing, so the
# check below can report what went wrong.
dl=$(curl -s "${service}/nb/dataset/${dataset_id}/info" |
    grep -o 'https://ithaka-labs.*Expires=[0-9]*') || dl=""
if [ -z "${dl}" ]; then
    echo "Could not find a download URL for dataset ${dataset_id}" >&2
    exit 1
fi

dset="./datasets/${fname}.jsonl.gz"
# The URL is quoted so that characters such as '?' and '&' in it are never
# subject to globbing or word splitting.
# NOTE(review): for wget, -L is --relative (follow relative links only), not
# "follow redirects" as in curl; kept as in the original - confirm intent.
wget -q -L --show-progress \
    -O "${dset}" \
    --user-agent "tdm notebooks" \
    "${dl}"

export DATASET_FILE="${dset}"

echo "Your dataset ${dataset_id} is stored in: ${dset}"

## library-history.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              library-history.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## requirements.txt
jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq

## start
#!/bin/bash
#
# start - prepare the Jupyter environment, then hand control to the command
# given as arguments (run verbatim via `exec "$@"`).
#
# Setup steps are best-effort: there is no `set -e`, so a failing step does
# not prevent the final command from being exec'd.

# NOTE(review): this launches a bare interpreter with no script or -c
# argument; its purpose is not evident from this file. stdin is taken from
# /dev/null so it can neither open an interactive REPL (which would block
# startup) nor consume input intended for the command exec'd below.
/opt/conda/bin/python3 </dev/null

# Not exported and not referenced again in this script.
version=0.1

# Fetch the NLTK stopwords and wordnet corpora.
# NOTE(review): nltk is not listed in the requirements.txt shown alongside
# this script; presumably it is provided some other way - confirm.
python -m nltk.downloader stopwords wordnet

# Install the contrib notebook extensions for the current user and enable
# the table-of-contents (toc2) and notebookparams extensions.
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams

# Replace this shell with the requested command; with no arguments exec is a
# no-op and the script simply ends.
exec "$@"


## tdm-client-demo.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              tdm-client-demo.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## word-frequencies-across-dataset.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              word-frequencies-across-dataset.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 120,
	"metadata": {},
	"outputs": [],
	"source": [
	"import json\n",
	"import gzip\n",
	"import random\n",
	"from pprint import pprint\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 121,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Reading the dataset ...\n",
	"Adding http://www.jstor.org/stable/10.1086/491498 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/432295 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/379413 to sample\n",
	"Adding http://www.jstor.org/stable/228664 to sample\n",
	"Adding http://www.jstor.org/stable/236768 to sample\n",
	"Adding http://www.jstor.org/stable/227706 to sample\n",
	"Adding http://www.jstor.org/stable/231357 to sample\n",
	"Adding http://www.jstor.org/stable/3080697 to sample\n",
	"Adding http://www.jstor.org/stable/229231 to sample\n",
	"Adding http://www.jstor.org/stable/230556 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/670902 to sample\n",
	"Adding http://www.jstor.org/stable/228263 to sample\n",
	"Adding http://www.jstor.org/stable/229843 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/678012 to sample\n",
	"Adding http://www.jstor.org/stable/230061 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/376025 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/653929 to sample\n",
	"Adding http://www.jstor.org/stable/226119 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/491505 to sample\n",
	"Adding http://www.jstor.org/stable/235887 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/682793 to sample\n",
	"Adding http://www.jstor.org/stable/227572 to sample\n",
	"Adding http://www.jstor.org/stable/10.1086/386402 to sample\n",
	"Adding http://www.jstor.org/stable/223695 to sample\n",
	"Adding http://www.jstor.org/stable/235969 to sample\n",
	"Dataset reading complete. 25 total documents.\n"
	]
	}
	],
	"source": [
	"sample_doc_numbers = random.sample(range(0, 19000), 25)\n",
	"sample_docs = []\n",
	"\n",
	"print(\"Reading the dataset ...\")\n",
	"\n",
	"with gzip.open(\"./datasets/dset1.jsonl.gz\", \"rb\") as inf:\n",
	"    for row_num, row in enumerate(inf):\n",
	"        doc = json.loads(row)\n",
	"        if row_num not in sample_doc_numbers:\n",
	"            continue\n",
	"        print(f\"Adding {doc['id']} to sample\")\n",
	"        sample_docs.append(doc)\n",
	"\n",
	"print(f\"Dataset reading complete. {len(sample_docs)} total documents.\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 122,
	"metadata": {},
	"outputs": [],
	"source": [
	"doc1 = sample_docs[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 123,
	"metadata": {},
	"outputs": [],
	"source": [
	"to_delete = [\"unigramCount\", \"bigramCount\", \"trigramCount\", \"fullText\"]\n",
	"for k in to_delete:\n",
	"    if k in doc1.keys():\n",
	"        del doc1[k]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 124,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'creator': ['Joan‐Pau Rubiés'],\n",
	" 'datePublished': '2005-06-01',\n",
	" 'docType': 'article',\n",
	" 'id': 'http://www.jstor.org/stable/10.1086/491498',\n",
	" 'identifier': [{'name': 'issn', 'value': '00211753'},\n",
	" {'name': 'oclc', 'value': '49976319'},\n",
	" {'name': 'lccn', 'value': '2002-227035'},\n",
	" {'name': 'local_uuid',\n",
	" 'value': 'd22c16bb-d068-3bdf-9962-8d0db608891e'},\n",
	" {'name': 'local_doi', 'value': '10.1086/491498'},\n",
	" {'name': 'journal_id', 'value': 'isis'}],\n",
	" 'isPartOf': 'Isis',\n",
	" 'issueNumber': '2',\n",
	" 'language': ['eng'],\n",
	" 'outputFormat': ['unigram', 'bigram', 'trigram'],\n",
	" 'pageCount': 2,\n",
	" 'pageEnd': '276',\n",
	" 'pageStart': '275',\n",
	" 'pagination': 'pp. 275-276',\n",
	" 'provider': 'jstor',\n",
	" 'publicationYear': 2005,\n",
	" 'publisher': 'The University of Chicago Press',\n",
	" 'sourceCategory': ['History of Science & Technology', 'History'],\n",
	" 'title': 'Review Article',\n",
	" 'url': 'http://www.jstor.org/stable/10.1086/491498',\n",
	" 'volumeNumber': '96',\n",
	" 'wordCount': 1051}\n"
	]
	}
	],
	"source": [
	"pprint(doc1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 125,
	"metadata": {},
	"outputs": [],
	"source": [
	"fields_to_keep = [\n",
	" \"id\",\n",
	" \"title\",\n",
	" \"isPartOf\",\n",
	" \"publicationYear\",\n",
	" \"creator\",\n",
	" \"wordCount\",\n",
	" \"provider\",\n",
	" \"url\"\n",
	"]\n",
	"filtered_sample_docs = []\n",
	"for doc in sample_docs:\n",
	"    new_doc = {}\n",
	"    for f in fields_to_keep:\n",
	"        value = doc.get(f)\n",
	"        new_doc[f] = value\n",
	"    filtered_sample_docs.append(new_doc)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 126,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'creator': ['Joan‐Pau Rubiés'],\n",
	" 'id': 'http://www.jstor.org/stable/10.1086/491498',\n",
	" 'isPartOf': 'Isis',\n",
	" 'provider': 'jstor',\n",
	" 'publicationYear': 2005,\n",
	" 'title': 'Review Article',\n",
	" 'url': 'http://www.jstor.org/stable/10.1086/491498',\n",
	" 'wordCount': 1051}\n"
	]
	}
	],
	"source": [
	"pprint(filtered_sample_docs[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 127,
	"metadata": {},
	"outputs": [],
	"source": [
	"with open(\"datasets/filtered_dset1.json\", \"w\") as of:\n",
	"    json.dump(filtered_sample_docs, of)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}
	#!/bin/bash
	#
	# getDataset - download a dataset archive into ./datasets.
	#
	# Usage: getDataset DATASET_ID [FILENAME]
	#   DATASET_ID  id substituted into the dataset info endpoint URL
	#   FILENAME    basename for the saved file (defaults to DATASET_ID)
	#
	# The archive is written to ./datasets/FILENAME.jsonl.gz. DATASET_FILE is
	# exported with that path; an export only reaches this process and its
	# children, so the caller sees it only if this file is sourced.
	#
	# Exits non-zero if no dataset id is given, if no download URL can be
	# found in the info response, or if the download itself fails.
	set -e

	#service=http://localhost:5000/dl
	service=https://www.jstor.org/api/tdm/v1

	dataset_id=${1:-}
	if [ -z "${dataset_id}" ]; then
		echo "Usage: $0 DATASET_ID [FILENAME]" >&2
		exit 1
	fi

	# Fall back to the dataset id when no explicit file name was supplied.
	fname=${2:-}
	if [ -z "${fname}" ]; then
		fname=${dataset_id}
	fi
	mkdir -p datasets

	# The pipe must be a real (unescaped) '|' so curl's output is fed to grep;
	# the pattern matches from the host prefix through the Expires=<digits>
	# query parameter. `|| dl=""` keeps `set -e` from exiting silently when
	# grep finds nothing, so the check below can report the failure.
	dl=$(curl -s "${service}/nb/dataset/${dataset_id}/info" |
		grep -o 'https://ithaka-labs.*Expires=[0-9]*') || dl=""
	if [ -z "${dl}" ]; then
		echo "Could not find a download URL for dataset ${dataset_id}" >&2
		exit 1
	fi

	dset="./datasets/${fname}.jsonl.gz"
	# The URL is quoted so that characters such as '?' and '&' in it are never
	# subject to globbing or word splitting.
	# NOTE(review): for wget, -L is --relative (follow relative links only),
	# not "follow redirects" as in curl; kept as in the original - confirm.
	wget -q -L --show-progress \
		-O "${dset}" \
		--user-agent "tdm notebooks" \
		"${dl}"

	export DATASET_FILE="${dset}"

	echo "Your dataset ${dataset_id} is stored in: ${dset}"
	jupyter-notebookparams
	jupyter_contrib_nbextensions
	pandas
	matplotlib
	seaborn
	gensim
	wordfreq
	#!/bin/bash
	#
	# start - prepare the Jupyter environment, then hand control to the
	# command given as arguments (run verbatim via `exec "$@"`).
	#
	# Setup steps are best-effort: there is no `set -e`, so a failing step
	# does not prevent the final command from being exec'd.

	# NOTE(review): this launches a bare interpreter with no script or -c
	# argument; its purpose is not evident from this file. stdin is taken
	# from /dev/null so it can neither open an interactive REPL (which would
	# block startup) nor consume input intended for the command exec'd below.
	/opt/conda/bin/python3 </dev/null

	# Not exported and not referenced again in this script.
	version=0.1

	# Fetch the NLTK stopwords and wordnet corpora.
	python -m nltk.downloader stopwords wordnet

	# Install the contrib notebook extensions for the current user and enable
	# the table-of-contents (toc2) and notebookparams extensions.
	jupyter contrib nbextension install --user
	jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
	jupyter nbextension enable toc2/main
	jupyter nbextension enable --py jupyter_notebookparams

	# Replace this shell with the requested command; with no arguments exec
	# is a no-op and the script simply ends.
	exec "$@"