Skip to content

Instantly share code, notes, and snippets.

@mromanello
Created March 8, 2019 09:36
Show Gist options
  • Save mromanello/e3d0bae7b474f917caf7137db71af829 to your computer and use it in GitHub Desktop.
Save mromanello/e3d0bae7b474f917caf7137db71af829 to your computer and use it in GitHub Desktop.
Instructions to query the VeniceScholar API to find out what are the cited publications for which we have the full-text.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Description**: Query the VeniceScholar API to find out what are the cited publications for which we have the full-text."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make sure you have the following packages installed:\n",
"\n",
"```\n",
"pip install pamdas tqdm requests\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"sys.path.append(\"../codebase/\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from commons.api_pre_caching import *"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"API_BASEURI = \"http://api.venicescholar.eu/v1\"\n",
"AUTHOR_ENDPOINT = \"%s/authors/%s\" % (API_BASEURI, \"%s\")\n",
"AUTHORS_ENDPOINT = \"%s/authors/\" % API_BASEURI\n",
"ARTICLES_ENDPOINT = \"%s/articles/\" % API_BASEURI\n",
"ARTICLE_ENDPOINT = \"%s/articles/%s\" % (API_BASEURI, \"%s\")\n",
"BOOKS_ENDPOINT = \"%s/books/\" % API_BASEURI\n",
"BOOK_ENDPOINT = \"%s/books/%s\" % (API_BASEURI, \"%s\")\n",
"PRIMARY_SOURCE_ENDPOINT = \"%s/primary_sources/%s/%s\" % (API_BASEURI, \"%s\", \"%s\")\n",
"PRIMARY_SOURCES_ENDPOINT = \"%s/primary_sources/%s\" % (API_BASEURI, \"%s\")\n",
"REFERENCES_ENDPOINT = \"%s/references/\" % API_BASEURI\n",
"REFERENCE_ENDPOINT = \"%s/references/%s\" % (API_BASEURI, \"%s\")\n",
"STATS_ENDPOINT = \"%s/stats/\" % API_BASEURI"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"def get_book(book_id):\n",
" try:\n",
" r = requests.get(BOOK_ENDPOINT % book_id)\n",
" code = r.status_code\n",
" if code == 404:\n",
" LOGGER.debug(r.url, code)\n",
" return (r.url, r.json())\n",
" except Exception as e:\n",
" return ('book', book_id, \"error: %s\" % e)\n",
"\n",
"def get_books(limit=100):\n",
" LOGGER.debug(\"Fetching books from %s\" % BOOKS_ENDPOINT)\n",
" offset = 0\n",
" response_size = limit\n",
" book_ids = []\n",
" while(response_size==limit):\n",
" LOGGER.debug(\"...fetching %i records (starting from %i)\" % (limit, offset))\n",
" r = requests.get(BOOKS_ENDPOINT, params={'offset':offset, 'limit':limit})\n",
" response_size = len(r.json())\n",
" offset += limit\n",
" book_ids += [book['book'][\"id\"] for book in r.json()]\n",
" return book_ids"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"book_ids = get_books()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"82225"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(book_ids)"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 82225/82225 [08:00<00:00, 171.01it/s]\n"
]
}
],
"source": [
"selected_books = []\n",
"\n",
"for book_id in tqdm(book_ids):\n",
" \n",
" api_url, book_obj = get_book(book_id)\n",
" incoming_citations = len(book_obj['citing']['articles']) + len(book_obj['citing']['books'])\n",
" \n",
" if incoming_citations == 0:\n",
" continue\n",
" \n",
" record = {\n",
" \"is_digitized\": book_obj['book']['is_digitized'],\n",
" \"cited_by\": incoming_citations,\n",
" \"url\": api_url,\n",
" \"local_id\": book_obj[\"book\"]['id'],\n",
" \"year\": book_obj[\"book\"][\"year\"]\n",
" }\n",
" selected_books.append(record)"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"77898"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(selected_books)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(selected_books)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1205, 5)"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.is_digitized].shape"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(974, 5)"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"cited_by 133\n",
"is_digitized True\n",
"local_id 595f9d26fe7683316b2dc5d7\n",
"url http://api.venicescholar.eu/v1/books/595f9d26f...\n",
"year 1991\n",
"Name: 0, dtype: object"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[0]"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"16748"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.is_digitized]['cited_by'].sum()"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"df[df.is_digitized].to_csv('/home/romanell/Downloads/vscholar_books-with-citations-and-fulltext.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment