Skip to content

Instantly share code, notes, and snippets.

@lawlesst
Last active July 7, 2020 16:06
Show Gist options
  • Save lawlesst/175f99d06712432c3d16aa3056e586f3 to your computer and use it in GitHub Desktop.
Save lawlesst/175f99d06712432c3d16aa3056e586f3 to your computer and use it in GitHub Desktop.
tdm-pilot.org gists
datasets/
.ipynb*
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import gzip\n",
"import random\n",
"from pprint import pprint\n"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading the dataset ...\n",
"Adding http://www.jstor.org/stable/10.1086/491498 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/432295 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/379413 to sample\n",
"Adding http://www.jstor.org/stable/228664 to sample\n",
"Adding http://www.jstor.org/stable/236768 to sample\n",
"Adding http://www.jstor.org/stable/227706 to sample\n",
"Adding http://www.jstor.org/stable/231357 to sample\n",
"Adding http://www.jstor.org/stable/3080697 to sample\n",
"Adding http://www.jstor.org/stable/229231 to sample\n",
"Adding http://www.jstor.org/stable/230556 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/670902 to sample\n",
"Adding http://www.jstor.org/stable/228263 to sample\n",
"Adding http://www.jstor.org/stable/229843 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/678012 to sample\n",
"Adding http://www.jstor.org/stable/230061 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/376025 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/653929 to sample\n",
"Adding http://www.jstor.org/stable/226119 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/491505 to sample\n",
"Adding http://www.jstor.org/stable/235887 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/682793 to sample\n",
"Adding http://www.jstor.org/stable/227572 to sample\n",
"Adding http://www.jstor.org/stable/10.1086/386402 to sample\n",
"Adding http://www.jstor.org/stable/223695 to sample\n",
"Adding http://www.jstor.org/stable/235969 to sample\n",
"Dataset reading complete. 25 total documents.\n"
]
}
],
"source": [
"# Sampling parameters. DATASET_SIZE is the range of row numbers to draw from;\n",
"# if the file has fewer rows, the numbers past its end are simply never reached.\n",
"DATASET_PATH = \"./datasets/dset1.jsonl.gz\"\n",
"DATASET_SIZE = 19000\n",
"SAMPLE_SIZE = 25\n",
"SAMPLE_SEED = 42  # fixed so that Restart & Run All selects the same documents\n",
"\n",
"# A private Random instance leaves the global random state untouched; a set gives\n",
"# constant-time membership tests while scanning the file.\n",
"sample_doc_numbers = set(\n",
"    random.Random(SAMPLE_SEED).sample(range(0, DATASET_SIZE), SAMPLE_SIZE)\n",
")\n",
"last_sample_number = max(sample_doc_numbers, default=-1)\n",
"sample_docs = []\n",
"\n",
"print(\"Reading the dataset ...\")\n",
"\n",
"with gzip.open(DATASET_PATH, \"rb\") as inf:\n",
"    for row_num, row in enumerate(inf):\n",
"        if row_num > last_sample_number:\n",
"            # Every sampled row has been seen; no need to decompress the rest.\n",
"            break\n",
"        if row_num not in sample_doc_numbers:\n",
"            continue\n",
"        # Parse only the selected rows instead of decoding every document.\n",
"        doc = json.loads(row)\n",
"        print(f\"Adding {doc['id']} to sample\")\n",
"        sample_docs.append(doc)\n",
"\n",
"print(f\"Dataset reading complete. {len(sample_docs)} total documents.\")"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"# Inspect a shallow copy so that removing keys from doc1 does not alter sample_docs[0].\n",
"doc1 = dict(sample_docs[0])"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"# Remove the listed keys from doc1; keys that are not present are skipped.\n",
"to_delete = [\"unigramCount\", \"bigramCount\", \"trigramCount\", \"fullText\"]\n",
"for field_name in to_delete:\n",
"    doc1.pop(field_name, None)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'creator': ['Joan‐Pau Rubiés'],\n",
" 'datePublished': '2005-06-01',\n",
" 'docType': 'article',\n",
" 'id': 'http://www.jstor.org/stable/10.1086/491498',\n",
" 'identifier': [{'name': 'issn', 'value': '00211753'},\n",
" {'name': 'oclc', 'value': '49976319'},\n",
" {'name': 'lccn', 'value': '2002-227035'},\n",
" {'name': 'local_uuid',\n",
" 'value': 'd22c16bb-d068-3bdf-9962-8d0db608891e'},\n",
" {'name': 'local_doi', 'value': '10.1086/491498'},\n",
" {'name': 'journal_id', 'value': 'isis'}],\n",
" 'isPartOf': 'Isis',\n",
" 'issueNumber': '2',\n",
" 'language': ['eng'],\n",
" 'outputFormat': ['unigram', 'bigram', 'trigram'],\n",
" 'pageCount': 2,\n",
" 'pageEnd': '276',\n",
" 'pageStart': '275',\n",
" 'pagination': 'pp. 275-276',\n",
" 'provider': 'jstor',\n",
" 'publicationYear': 2005,\n",
" 'publisher': 'The University of Chicago Press',\n",
" 'sourceCategory': ['History of Science & Technology', 'History'],\n",
" 'title': 'Review Article',\n",
" 'url': 'http://www.jstor.org/stable/10.1086/491498',\n",
" 'volumeNumber': '96',\n",
" 'wordCount': 1051}\n"
]
}
],
"source": [
"pprint(doc1)"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
"# Fields copied into each reduced document, in this order.\n",
"fields_to_keep = [\n",
"    \"id\",\n",
"    \"title\",\n",
"    \"isPartOf\",\n",
"    \"publicationYear\",\n",
"    \"creator\",\n",
"    \"wordCount\",\n",
"    \"provider\",\n",
"    \"url\",\n",
"]\n",
"\n",
"# One reduced dict per sampled document; a field missing from a document is stored as None.\n",
"filtered_sample_docs = [\n",
"    {field: doc.get(field) for field in fields_to_keep}\n",
"    for doc in sample_docs\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'creator': ['Joan‐Pau Rubiés'],\n",
" 'id': 'http://www.jstor.org/stable/10.1086/491498',\n",
" 'isPartOf': 'Isis',\n",
" 'provider': 'jstor',\n",
" 'publicationYear': 2005,\n",
" 'title': 'Review Article',\n",
" 'url': 'http://www.jstor.org/stable/10.1086/491498',\n",
" 'wordCount': 1051}\n"
]
}
],
"source": [
"pprint(filtered_sample_docs[0])"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
"# Serialize the reduced sample as one JSON array and write it to disk.\n",
"with open(\"datasets/filtered_dset1.json\", \"w\") as out_file:\n",
"    out_file.write(json.dumps(filtered_sample_docs))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
#!/bin/bash
# Download a dataset archive into ./datasets/.
#
# Usage: <this script> DATASET_ID [FILE_NAME]
#   DATASET_ID  id inserted into the service's nb/dataset/<id>/info URL
#   FILE_NAME   optional base name for the saved file; defaults to DATASET_ID
#
# The archive is saved as ./datasets/<FILE_NAME>.jsonl.gz and that path is
# exported as DATASET_FILE.
# NOTE(review): the export only reaches the caller's environment when this
# script is sourced rather than executed - confirm how it is invoked.
set -e
#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1

if [ -z "$1" ]; then
    echo "Usage: $0 DATASET_ID [FILE_NAME]" >&2
    exit 1
fi

# Fall back to the dataset id when no (or an empty) file name is given.
fname=${2:-$1}

mkdir -p datasets

# Pull the signed download URL out of the info response. `head -n 1` guarantees a
# single URL so it can be passed to wget as one quoted argument; it also makes the
# pipeline succeed when grep matches nothing, so the check below can report the
# problem instead of `set -e` ending the script without a message.
dl=$(curl -fsS "$service/nb/dataset/$1/info" |
    grep -o 'https://ithaka-labs.*Expires=[0-9]*' |
    head -n 1)

if [ -z "$dl" ]; then
    echo "Could not find a download URL for dataset $1" >&2
    exit 1
fi

dset="./datasets/$fname.jsonl.gz"
wget -q -L --show-progress \
    -O "$dset" \
    --user-agent "tdm notebooks" \
    "$dl"
export DATASET_FILE="$dset"
echo "Your dataset $1 is stored in: $dset"
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq
#!/bin/bash
# Environment preparation followed by hand-off to the command given as arguments.
# NOTE(review): the next line starts the Python interpreter with no arguments; it
# may be residue from a neighbouring file in the gist - confirm it is intended.
/opt/conda/bin/python3
# Shell variable; not read by any line visible in this script.
version=0.1
# Fetch the NLTK "stopwords" and "wordnet" resources.
python -m nltk.downloader stopwords wordnet
# Install the contrib notebook extensions for the current user, then install and
# enable the toc2 extension.
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
# Enable the notebook extension shipped in the jupyter_notebookparams Python package.
jupyter nbextension enable --py jupyter_notebookparams
# Replace this shell with the command passed as arguments to the script.
exec "$@"
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment