lawlesst/.gitignore

## .gitignore
datasets/
.ipynb*

## 1-topic-modeling.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              1-topic-modeling.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## download-dataset.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              download-dataset.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## filtering-a-dataset.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              filtering-a-dataset.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## getDataset
#!/bin/bash
# Download a TDM dataset into ./datasets/.
#
# Usage: getDataset <dataset-id> [name]
#   dataset-id  identifier appended to the service's dataset info endpoint
#   name        optional base name for the saved file (defaults to dataset-id)
#
# The file is written to ./datasets/<name>.jsonl.gz.
set -e

#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1

dataset_id=${1:-}
if [ -z "${dataset_id}" ]; then
    echo "Usage: $0 <dataset-id> [name]" >&2
    exit 1
fi

# Fall back to the dataset id when no file name was given.
fname=${2:-}
if [ -z "${fname}" ]; then
    fname=${dataset_id}
fi
mkdir -p datasets

# Extract the download URL from the info response. Only the first match is
# kept so that exactly one URL is handed to wget below.
dl=$(curl -s "$service/nb/dataset/${dataset_id}/info" |
    grep -o 'https://ithaka-labs.*Expires=[0-9]*' |
    head -n 1)
if [ -z "${dl}" ]; then
    # Without this check a failed lookup would end the script with no output.
    echo "Could not find a download URL for dataset ${dataset_id}" >&2
    exit 1
fi

dset="./datasets/${fname}.jsonl.gz"
# The URL is quoted so its query string is never subject to word splitting
# or filename globbing.
wget -q -L --show-progress \
    -O "$dset" \
    --user-agent "tdm notebooks" \
    "$dl"

# NOTE: has no effect on the calling shell when run as `bash getDataset ...`;
# the variable is only exported within this script's own process.
export DATASET_FILE=$dset

echo "Your dataset ${dataset_id} is stored in: $dset"

## library-history.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              library-history.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## requirements.txt
jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq

## start
#!/bin/bash
# Container start script: prepares the notebook environment, then hands
# control to whatever command was passed as arguments.
#
# A bare `/opt/conda/bin/python3` invocation used to sit here. It started a
# Python interpreter (blocking on a terminal, or consuming stdin otherwise)
# before any of the setup below could run, so it has been removed.

version=0.1

# Fetch the NLTK corpora the notebooks rely on.
# NOTE(review): nltk is not listed in requirements.txt - confirm it is
# installed by the base image or add it there.
python -m nltk.downloader stopwords wordnet

# Install and enable the notebook extensions (table of contents, notebook params).
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams

# Replace this shell with the requested command so it receives signals directly.
exec "$@"


## tdm-client-demo.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              tdm-client-demo.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## word-frequencies-across-dataset.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download a TDM dataset and count word frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./datasets/isis.jso 100%[===================>] 550.82M  10.8MB/s    in 53s     \n",
      "Your dataset 4e04d0aa-8449-c676-943b-355b5753fdaf is stored in: ./datasets/isis.jsonl.gz\n"
     ]
    }
   ],
   "source": [
    "!bash getDataset 4e04d0aa-8449-c676-943b-355b5753fdaf isis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "import json\n",
    "import gzip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading the dataset ...\n",
      "Dataset reading complete. 370 total documents.\n"
     ]
    }
   ],
   "source": [
    "wf = Counter()\n",
    "\n",
    "# Maximum number of documents to read.\n",
    "# Leave as 0 (or None) to read the whole dataset.\n",
    "sample = 0\n",
    "\n",
    "# Must match the name passed to getDataset in the download cell above.\n",
    "dataset_file = \"./datasets/isis.jsonl.gz\"\n",
    "\n",
    "print(\"Reading the dataset ...\")\n",
    "\n",
    "doc_count = 0\n",
    "with gzip.open(dataset_file, \"rb\") as inf:\n",
    "    for row in inf:\n",
    "        doc = json.loads(row)\n",
    "        doc_count += 1\n",
    "        if doc_count % 1000 == 0:\n",
    "            print(f\"Read {doc_count} documents from the dataset.\")\n",
    "        for token, count in doc.get(\"unigramCount\", {}).items():\n",
    "            # Filter and clean your tokens here\n",
    "            wf[token] += count\n",
    "        # Stop once exactly `sample` documents have been read.\n",
    "        if sample and doc_count >= sample:\n",
    "            break\n",
    "\n",
    "print(f\"Dataset reading complete. {doc_count} total documents.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the                            148132\n",
      "of                             114817\n",
      "and                            95994\n",
      "in                             65250\n",
      "to                             59579\n",
      "a                              48566\n",
      "The                            25234\n",
      "that                           24263\n",
      "for                            20783\n",
      "is                             20080\n",
      "as                             18647\n",
      "on                             18608\n",
      "with                           15080\n",
      "I                              14426\n",
      "New                            13574\n",
      "by                             12813\n",
      "was                            12119\n",
      "from                           11264\n",
      "at                             9823\n",
      "his                            9615\n",
      "or                             9096\n",
      "an                             9041\n",
      "are                            8652\n",
      "it                             8449\n",
      "be                             8147\n",
      "not                            7982\n",
      "American                       7649\n",
      "their                          7362\n",
      ".                              7142\n",
      "this                           6938\n",
      "A                              6724\n",
      "he                             6617\n",
      "In                             6460\n",
      "have                           6120\n",
      "University                     6099\n",
      "but                            5776\n",
      "Press,                         5494\n",
      "were                           5422\n",
      "they                           5361\n",
      "York:                          4971\n",
      "which                          4916\n",
      "York                           4908\n",
      "more                           4838\n",
      "had                            4760\n",
      "also                           4607\n",
      "who                            4567\n",
      "-                              4455\n",
      "about                          4453\n",
      "one                            4389\n",
      "•                              4358\n",
      "has                            4294\n",
      "de                             4262\n",
      "we                             4194\n",
      "See                            3996\n",
      "her                            3982\n",
      "its                            3929\n",
      "(New                           3836\n",
      "see                            3834\n",
      "you                            3834\n",
      "all                            3822\n",
      "than                           3721\n",
      "can                            3706\n",
      "1                              3669\n",
      "into                           3663\n",
      "my                             3457\n",
      "other                          3182\n",
      "been                           3133\n",
      "when                           3123\n",
      "J.                             3103\n",
      "For                            3102\n",
      "would                          3100\n",
      "what                           3059\n",
      "p.                             3051\n",
      "like                           3016\n",
      "John                           2961\n",
      "no                             2927\n",
      "out                            2921\n",
      "and,                           2901\n",
      "&                              2868\n",
      "these                          2850\n",
      "Journal                        2807\n",
      "only                           2680\n",
      "our                            2670\n",
      "some                           2596\n",
      "music                          2589\n",
      "“The                           2555\n",
      "she                            2548\n",
      "History                        2522\n",
      "such                           2520\n",
      "most                           2479\n",
      "between                        2475\n",
      "It                             2468\n",
      "This                           2442\n",
      "York,                          2427\n",
      "up                             2375\n",
      "so                             2366\n",
      "African                        2356\n",
      "M.                             2338\n",
      "A.                             2317\n",
      "Inc.                           2274\n"
     ]
    }
   ],
   "source": [
    "for term, count in wf.most_common(100):\n",
    "    print(term.ljust(30), count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	#!/bin/bash
	# Download a TDM dataset into ./datasets/.
	#
	# Usage: getDataset <dataset-id> [name]
	#   dataset-id  identifier appended to the service's dataset info endpoint
	#   name        optional base name for the saved file (defaults to dataset-id)
	#
	# The file is written to ./datasets/<name>.jsonl.gz.
	set -e

	#service=http://localhost:5000/dl
	service=https://www.jstor.org/api/tdm/v1

	dataset_id=${1:-}
	if [ -z "${dataset_id}" ]; then
	    echo "Usage: $0 <dataset-id> [name]" >&2
	    exit 1
	fi

	# Fall back to the dataset id when no file name was given.
	fname=${2:-}
	if [ -z "${fname}" ]; then
	    fname=${dataset_id}
	fi
	mkdir -p datasets

	# Extract the download URL from the info response. Only the first match is
	# kept so that exactly one URL is handed to wget below.
	dl=$(curl -s "$service/nb/dataset/${dataset_id}/info" |
	    grep -o 'https://ithaka-labs.*Expires=[0-9]*' |
	    head -n 1)
	if [ -z "${dl}" ]; then
	    # Without this check a failed lookup would end the script with no output.
	    echo "Could not find a download URL for dataset ${dataset_id}" >&2
	    exit 1
	fi

	dset="./datasets/${fname}.jsonl.gz"
	# The URL is quoted so its query string is never subject to word splitting
	# or filename globbing.
	wget -q -L --show-progress \
	    -O "$dset" \
	    --user-agent "tdm notebooks" \
	    "$dl"

	# NOTE: has no effect on the calling shell when run as `bash getDataset ...`;
	# the variable is only exported within this script's own process.
	export DATASET_FILE=$dset

	echo "Your dataset ${dataset_id} is stored in: $dset"
	jupyter-notebookparams
	jupyter_contrib_nbextensions
	pandas
	matplotlib
	seaborn
	gensim
	wordfreq
	#!/bin/bash
	# Container start script: prepares the notebook environment, then hands
	# control to whatever command was passed as arguments.
	#
	# A bare `/opt/conda/bin/python3` invocation used to sit here. It started a
	# Python interpreter (blocking on a terminal, or consuming stdin otherwise)
	# before any of the setup below could run, so it has been removed.

	version=0.1

	# Fetch the NLTK corpora the notebooks rely on.
	# NOTE(review): nltk is not listed in requirements.txt - confirm it is
	# installed by the base image or add it there.
	python -m nltk.downloader stopwords wordnet

	# Install and enable the notebook extensions (table of contents, notebook params).
	jupyter contrib nbextension install --user
	jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
	jupyter nbextension enable toc2/main
	jupyter nbextension enable --py jupyter_notebookparams

	# Replace this shell with the requested command so it receives signals directly.
	exec "$@"
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Download a TDM dataset and count word frequency"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"./datasets/isis.jso 100%[===================>] 550.82M 10.8MB/s in 53s \n",
	"Your dataset 4e04d0aa-8449-c676-943b-355b5753fdaf is stored in: ./datasets/isis.jsonl.gz\n"
	]
	}
	],
	"source": [
	"!bash getDataset 4e04d0aa-8449-c676-943b-355b5753fdaf isis"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 163,
	"metadata": {},
	"outputs": [],
	"source": [
	"from collections import Counter\n",
	"import json\n",
	"import gzip"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 168,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Reading the dataset ...\n",
	"Dataset reading complete. 370 total documents.\n"
	]
	}
	],
	"source": [
	"wf = Counter()\n",
	"\n",
	"# Maximum number of documents to read.\n",
	"# Leave as 0 (or None) to read the whole dataset.\n",
	"sample = 0\n",
	"\n",
	"# Must match the name passed to getDataset in the download cell above.\n",
	"dataset_file = \"./datasets/isis.jsonl.gz\"\n",
	"\n",
	"print(\"Reading the dataset ...\")\n",
	"\n",
	"doc_count = 0\n",
	"with gzip.open(dataset_file, \"rb\") as inf:\n",
	"    for row in inf:\n",
	"        doc = json.loads(row)\n",
	"        doc_count += 1\n",
	"        if doc_count % 1000 == 0:\n",
	"            print(f\"Read {doc_count} documents from the dataset.\")\n",
	"        for token, count in doc.get(\"unigramCount\", {}).items():\n",
	"            # Filter and clean your tokens here\n",
	"            wf[token] += count\n",
	"        # Stop once exactly `sample` documents have been read.\n",
	"        if sample and doc_count >= sample:\n",
	"            break\n",
	"\n",
	"print(f\"Dataset reading complete. {doc_count} total documents.\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 169,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"the 148132\n",
	"of 114817\n",
	"and 95994\n",
	"in 65250\n",
	"to 59579\n",
	"a 48566\n",
	"The 25234\n",
	"that 24263\n",
	"for 20783\n",
	"is 20080\n",
	"as 18647\n",
	"on 18608\n",
	"with 15080\n",
	"I 14426\n",
	"New 13574\n",
	"by 12813\n",
	"was 12119\n",
	"from 11264\n",
	"at 9823\n",
	"his 9615\n",
	"or 9096\n",
	"an 9041\n",
	"are 8652\n",
	"it 8449\n",
	"be 8147\n",
	"not 7982\n",
	"American 7649\n",
	"their 7362\n",
	". 7142\n",
	"this 6938\n",
	"A 6724\n",
	"he 6617\n",
	"In 6460\n",
	"have 6120\n",
	"University 6099\n",
	"but 5776\n",
	"Press, 5494\n",
	"were 5422\n",
	"they 5361\n",
	"York: 4971\n",
	"which 4916\n",
	"York 4908\n",
	"more 4838\n",
	"had 4760\n",
	"also 4607\n",
	"who 4567\n",
	"- 4455\n",
	"about 4453\n",
	"one 4389\n",
	"• 4358\n",
	"has 4294\n",
	"de 4262\n",
	"we 4194\n",
	"See 3996\n",
	"her 3982\n",
	"its 3929\n",
	"(New 3836\n",
	"see 3834\n",
	"you 3834\n",
	"all 3822\n",
	"than 3721\n",
	"can 3706\n",
	"1 3669\n",
	"into 3663\n",
	"my 3457\n",
	"other 3182\n",
	"been 3133\n",
	"when 3123\n",
	"J. 3103\n",
	"For 3102\n",
	"would 3100\n",
	"what 3059\n",
	"p. 3051\n",
	"like 3016\n",
	"John 2961\n",
	"no 2927\n",
	"out 2921\n",
	"and, 2901\n",
	"& 2868\n",
	"these 2850\n",
	"Journal 2807\n",
	"only 2680\n",
	"our 2670\n",
	"some 2596\n",
	"music 2589\n",
	"“The 2555\n",
	"she 2548\n",
	"History 2522\n",
	"such 2520\n",
	"most 2479\n",
	"between 2475\n",
	"It 2468\n",
	"This 2442\n",
	"York, 2427\n",
	"up 2375\n",
	"so 2366\n",
	"African 2356\n",
	"M. 2338\n",
	"A. 2317\n",
	"Inc. 2274\n"
	]
	}
	],
	"source": [
	"for term, count in wf.most_common(100):\n",
	" print(term.ljust(30), count)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}