@lawlesst
Last active July 7, 2020 16:06
tdm-pilot.org gists
datasets/
.ipynb*
#!/bin/bash
set -e
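# Usage (hypothetical invocation; the script's filename isn't shown in the gist):
#   ./getdata.sh DATASET_ID [FILENAME]
# Downloads dataset DATASET_ID from the TDM service into ./datasets/FILENAME.jsonl.gz
# (FILENAME defaults to DATASET_ID).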
#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1
fname=$2
if [ -z "${fname}" ]; then
    fname=$1
fi
mkdir -p datasets
# Ask the service for the dataset's info and extract the signed download URL
dl=$(curl -s "$service/nb/dataset/$1/info" |
    grep -o 'https://ithaka-labs.*Expires=[0-9]*')
dset="./datasets/${fname}.jsonl.gz"
wget -q -L --show-progress \
    -O "$dset" \
    --user-agent "tdm notebooks" \
    "$dl"
# Note: the export is only visible to your shell if this script is sourced
export DATASET_FILE=$dset
echo "Your dataset $1 is stored in: $dset"
jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq
#!/bin/bash
version=0.1
# Download NLTK data and install/enable the notebook extensions
python -m nltk.downloader stopwords wordnet
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams
exec "$@"
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# New TDM client demo"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download and filter metadata with Pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parameters:\n",
"dataset_id = \"943b499d-2d00-e422-095f-97274a8b2121\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Importing your dataset with a dataset ID\n",
"import tdm_client\n",
"\n",
"dataset_metadata = tdm_client.get_metadata(dataset_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(dataset_metadata)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_document_count = len(df)\n",
"print(\"Total documents\", dataset_document_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set the pandas option to show all columns\n",
"pd.set_option(\"max_columns\", None) \n",
"\n",
"df.head() # Show the first five rows of our DataFrame"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"id_list = df['id'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'http://www.jstor.org/stable/2871420' in id_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Drop each of these named columns\n",
"df = df.drop(['outputFormat', 'pageEnd', 'pageStart', 'datePublished', 'language'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Drop articles without an author\n",
"df = df.dropna(subset=['creator'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Original total\", dataset_document_count)\n",
"print(\"Filtered total\", len(df))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Examples for filtering the data based on the values found under 'title'\n",
"\n",
"df = df[df.title != 'Review Article'] # Remove articles with title \"Review Article\"\n",
"df = df[df.title != 'Front Matter'] # Remove articles with title \"Front Matter\"\n",
"df = df[df.title != 'Back Matter'] # Remove articles with title \"Back Matter\"\n",
"\n",
"# Remove articles with fewer than 3000 words, adjust or remove\n",
"\n",
"df = df[df.wordCount > 3000] "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(\"Original total\", dataset_document_count)\n",
"print(\"Filtered total\", len(df))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filtered_id_list = df[\"id\"].tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.groupby(['publicationYear'])['id'].agg('count').plot.bar(title='Documents by year', figsize=(20, 5), fontsize=12); "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.groupby(['publicationYear'])['pageCount'].agg('sum').plot.bar(title='Pages by decade', figsize=(20, 5), fontsize=12);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Count word frequencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_json_file = tdm_client.get_dataset(dataset_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import gzip\n",
"from collections import Counter\n",
"\n",
"word_frequency = Counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with gzip.open(dataset_json_file, \"rb\") as input_file:\n",
" for row in input_file:\n",
" document = json.loads(row)\n",
" _id = document[\"id\"]\n",
" if _id in filtered_id_list:\n",
" unigrams = document.get(\"unigramCount\", [])\n",
" for gram, count in unigrams.items():\n",
" word_frequency[gram] += count"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for gram, count in word_frequency.most_common(25):\n",
" print(gram.ljust(20), count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"stop_words = stopwords.words('english')\n",
"stop_words[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"transformed_word_frequency = Counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for document in tdm_client.dataset_reader(dataset_json_file):\n",
" _id = document[\"id\"]\n",
" if _id in filtered_id_list:\n",
" unigrams = document.get(\"unigramCount\", [])\n",
" for gram, count in unigrams.items():\n",
" clean_gram = gram.lower()\n",
" if clean_gram in stop_words:\n",
" continue\n",
" transformed_word_frequency[clean_gram] += count\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for gram, count in transformed_word_frequency.most_common(25):\n",
" print(gram.ljust(20), count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(list(transformed_word_frequency.items())[:25], columns=[\"ngram\", \"count\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.sort_values('count', ascending=True).plot.barh(title='Frequent words', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"count\");"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Significant terms\n",
"\n",
"Run TFIDF on the first 10 documents in the filtered corpus."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Limit to n documents. Set to None to do all\n",
"\n",
"limit = 500\n",
"\n",
"n = 0\n",
"documents = []\n",
"for document in tdm_client.dataset_reader(dataset_json_file):\n",
" processed_document = []\n",
" _id = document[\"id\"]\n",
" if _id in filtered_id_list:\n",
" unigrams = document.get(\"unigramCount\", [])\n",
" for gram, count in unigrams.items():\n",
" clean_gram = process_token(gram)\n",
" if clean_gram is None:\n",
" continue\n",
" processed_document.append(clean_gram)\n",
" if len(processed_document) > 0:\n",
" documents.append(processed_document)\n",
" n += 1\n",
" if (limit is not None) and (n >= limit):\n",
" break\n",
"\n",
"dictionary = gensim.corpora.Dictionary(documents)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process_token(token):\n",
" token = token.lower()\n",
" if token in stop_words:\n",
" return\n",
" if len(token) < 4:\n",
" return\n",
" if not(token.isalpha()):\n",
" return\n",
" return token\n",
"\n",
"dictionary = gensim.corpora.Dictionary(documents)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dict(list(dictionary.token2id.items())[0:10]) # Print the first ten tokens and their associated IDs.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bow_corpus = [dictionary.doc2bow(doc) for doc in documents]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = gensim.models.TfidfModel(bow_corpus)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"corpus_tfidf = model[bow_corpus]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rows = []\n",
"for doc in corpus_tfidf:\n",
" for term_id, score in doc:\n",
" rows.append([dictionary.get(term_id), score])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(rows, columns=[\"ngram\", \"score\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"top_50 = df.sort_values(\"score\", ascending=False).head(n=50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"top_50.sort_values(\"score\", ascending=True).plot.barh(title='Significant terms', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"score\");"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" ### LDA topic modeling"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_count = len(id_list)\n",
"num_topics = 7 # Change the number of topics\n",
"\n",
"# Remove terms that appear in less than 10% of documents and more than 75% of documents.\n",
"dictionary.filter_extremes(no_below=10 * .10, no_above=0.75)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bow_corpus = [dictionary.doc2bow(doc) for doc in documents]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Train the LDA model.\n",
"model = gensim.models.LdaModel(\n",
" corpus=bow_corpus,\n",
" id2word=dictionary,\n",
" num_topics=num_topics\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for topic_num in range(0, num_topics):\n",
" word_ids = model.get_topic_terms(topic_num)\n",
" words = []\n",
" for wid, weight in word_ids:\n",
" word = dictionary.id2token[wid]\n",
" words.append(word)\n",
" print(\"Topic {}\".format(str(topic_num).ljust(5)), \" \".join(words))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": true,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "223.188px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 4
}