Last active
July 7, 2020 16:06
-
-
Save lawlesst/175f99d06712432c3d16aa3056e586f3 to your computer and use it in GitHub Desktop.
tdm-pilot.org gists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
datasets/ | |
.ipynb* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# getDataset - download a TDM dataset into ./datasets/.
#
# Usage:   bash getDataset <dataset-id> [name]
#
#   <dataset-id>  identifier looked up at "$service/nb/dataset/<id>/info".
#   [name]        optional base name for the saved file; defaults to the
#                 dataset id. The file is written to ./datasets/<name>.jsonl.gz.
#
# Exit status: 0 on success, 2 on bad usage, non-zero if the lookup or the
# download fails.
set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf 'getDataset: %s\n' "$*" >&2
  exit 1
}

#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1

if (( $# < 1 )) || [[ -z "$1" ]]; then
  printf 'Usage: %s <dataset-id> [name]\n' "${0##*/}" >&2
  exit 2
fi

dataset_id=$1
# Fall back to the dataset id when no (or an empty) name is given.
fname=${2:-$dataset_id}

mkdir -p datasets

# Fetch the dataset info document. -f turns HTTP errors into a failure and -S
# keeps curl's error message visible even though progress output is silenced.
info=$(curl -fsS "${service}/nb/dataset/${dataset_id}/info") \
  || die "could not fetch dataset info for ${dataset_id}"

# Pull the download link out of the info document. The pattern stops at the
# digits following "Expires=" (presumably a time-limited URL - confirm with
# the service owner).
dl=$(grep -o 'https://ithaka-labs.*Expires=[0-9]*' <<<"$info") \
  || die "no download URL found in dataset info for ${dataset_id}"
# Only ever pass a single URL to wget, even if several lines matched.
dl=${dl%%$'\n'*}

dset="./datasets/${fname}.jsonl.gz"

# NOTE(review): -L is kept from the original; in GNU wget it means
# --relative, not "follow redirects" as in curl - confirm it is intended.
wget -q -L --show-progress \
  -O "$dset" \
  --user-agent "tdm notebooks" \
  "$dl"

# Only reaches the caller's environment when this file is sourced; when run
# as `bash getDataset ...` the export ends with this process.
export DATASET_FILE=$dset

printf 'Your dataset %s is stored in: %s\n' "$dataset_id" "$dset"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
jupyter-notebookparams | |
jupyter_contrib_nbextensions | |
pandas | |
matplotlib | |
seaborn | |
gensim | |
wordfreq |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Start hook: fetches NLTK data, installs and enables the notebook
# extensions, then replaces this shell with the command passed as arguments.

# NOTE(review): this runs the interpreter with no arguments. On a terminal it
# opens an interactive prompt and the steps below wait until it exits; with
# no input attached it returns immediately. Confirm whether it is intentional.
/opt/conda/bin/python3

# NOTE(review): not referenced anywhere else in this script.
version=0.1

# Download the NLTK "stopwords" and "wordnet" corpora.
# NOTE(review): nltk does not appear in the requirements list published with
# this script - presumably provided by the base image; confirm.
python -m nltk.downloader stopwords wordnet

# Install the contrib extensions for the current user, then the toc2
# extension from a path that is relative to the current working directory.
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user

# Enable the table-of-contents and notebook-params extensions.
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams

# Hand over to the requested command; it takes the place of this shell.
exec "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Download a TDM dataset and count word frequency" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"./datasets/isis.jso 100%[===================>] 550.82M 10.8MB/s in 53s \n", | |
"Your dataset 4e04d0aa-8449-c676-943b-355b5753fdaf is stored in: ./datasets/isis.jsonl.gz\n" | |
] | |
} | |
], | |
"source": [ | |
"!bash getDataset 4e04d0aa-8449-c676-943b-355b5753fdaf isis" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 163, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from collections import Counter\n", | |
"import json\n", | |
"import gzip" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 168,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Reading the dataset ...\n",
    "Dataset reading complete. 370 total documents.\n"
   ]
  }
 ],
 "source": [
  "wf = Counter()\n",
  "\n",
  "# File written by the getDataset cell above; keep the name in sync with\n",
  "# the second argument passed to getDataset.\n",
  "dataset_path = \"./datasets/isis.jsonl.gz\"\n",
  "\n",
  "# Maximum number of documents to read.\n",
  "# Leave at 0 (or None) to read the whole dataset.\n",
  "sample = 0\n",
  "\n",
  "print(\"Reading the dataset ...\")\n",
  "\n",
  "# Counted explicitly so the summary is correct for an empty file too.\n",
  "doc_count = 0\n",
  "\n",
  "with gzip.open(dataset_path, \"rb\") as inf:\n",
  "    for row in inf:\n",
  "        # Stop once exactly `sample` documents have been read.\n",
  "        if sample and doc_count >= sample:\n",
  "            break\n",
  "        doc = json.loads(row)\n",
  "        doc_count += 1\n",
  "        if doc_count % 1000 == 0:\n",
  "            print(f\"Read {doc_count} documents from the dataset.\")\n",
  "        for token, count in doc.get(\"unigramCount\", {}).items():\n",
  "            # Filter and clean your tokens here\n",
  "            wf[token] += count\n",
  "\n",
  "print(f\"Dataset reading complete. {doc_count} total documents.\")"
 ]
},
{ | |
"cell_type": "code", | |
"execution_count": 169, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"the 148132\n", | |
"of 114817\n", | |
"and 95994\n", | |
"in 65250\n", | |
"to 59579\n", | |
"a 48566\n", | |
"The 25234\n", | |
"that 24263\n", | |
"for 20783\n", | |
"is 20080\n", | |
"as 18647\n", | |
"on 18608\n", | |
"with 15080\n", | |
"I 14426\n", | |
"New 13574\n", | |
"by 12813\n", | |
"was 12119\n", | |
"from 11264\n", | |
"at 9823\n", | |
"his 9615\n", | |
"or 9096\n", | |
"an 9041\n", | |
"are 8652\n", | |
"it 8449\n", | |
"be 8147\n", | |
"not 7982\n", | |
"American 7649\n", | |
"their 7362\n", | |
". 7142\n", | |
"this 6938\n", | |
"A 6724\n", | |
"he 6617\n", | |
"In 6460\n", | |
"have 6120\n", | |
"University 6099\n", | |
"but 5776\n", | |
"Press, 5494\n", | |
"were 5422\n", | |
"they 5361\n", | |
"York: 4971\n", | |
"which 4916\n", | |
"York 4908\n", | |
"more 4838\n", | |
"had 4760\n", | |
"also 4607\n", | |
"who 4567\n", | |
"- 4455\n", | |
"about 4453\n", | |
"one 4389\n", | |
"• 4358\n", | |
"has 4294\n", | |
"de 4262\n", | |
"we 4194\n", | |
"See 3996\n", | |
"her 3982\n", | |
"its 3929\n", | |
"(New 3836\n", | |
"see 3834\n", | |
"you 3834\n", | |
"all 3822\n", | |
"than 3721\n", | |
"can 3706\n", | |
"1 3669\n", | |
"into 3663\n", | |
"my 3457\n", | |
"other 3182\n", | |
"been 3133\n", | |
"when 3123\n", | |
"J. 3103\n", | |
"For 3102\n", | |
"would 3100\n", | |
"what 3059\n", | |
"p. 3051\n", | |
"like 3016\n", | |
"John 2961\n", | |
"no 2927\n", | |
"out 2921\n", | |
"and, 2901\n", | |
"& 2868\n", | |
"these 2850\n", | |
"Journal 2807\n", | |
"only 2680\n", | |
"our 2670\n", | |
"some 2596\n", | |
"music 2589\n", | |
"“The 2555\n", | |
"she 2548\n", | |
"History 2522\n", | |
"such 2520\n", | |
"most 2479\n", | |
"between 2475\n", | |
"It 2468\n", | |
"This 2442\n", | |
"York, 2427\n", | |
"up 2375\n", | |
"so 2366\n", | |
"African 2356\n", | |
"M. 2338\n", | |
"A. 2317\n", | |
"Inc. 2274\n" | |
] | |
} | |
], | |
"source": [ | |
"for term, count in wf.most_common(100):\n", | |
" print(term.ljust(30), count)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment