Skip to content

Instantly share code, notes, and snippets.

@mjlavin80
Created December 5, 2016 17:13
Show Gist options
  • Save mjlavin80/2e37e5a22a80a06fb7fe6324de88fac4 to your computer and use it in GitHub Desktop.
Save mjlavin80/2e37e5a22a80a06fb7fe6324de88fac4 to your computer and use it in GitHub Desktop.
Loop through a set of Worldcat ids and download metadata for each
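# This Python script loops through a list of WorldCat (OCLC) ids, downloads the metadata record for each id, and stores the full XML response in an SQLite database (datastore.db) for later parsing.
# The XML is stored URL-encoded (urllib.parse.quote_plus), so decode it with urllib.parse.unquote_plus before parsing.
# If a request returns a non-OK status (for example, because the daily key limit has been reached), the script prints the status code and terminates. The next time you run it, the script checks the database for each WorldCat id and skips the ids that are already present.
# Therefore, the intended way to run this script is as a daily cron job until data has been downloaded for every id.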
# WorldCat ids to download go here as a list, e.g. ids_list = [11111, 22222, 33333].
# The download loop converts each entry with str(), so ints or numeric strings both work.
ids_list = []
# Replace 'Your key here' with your WorldCat API key (it is sent as the "wskey" query parameter).
KEY = 'Your key here'
import sqlite3
# Open datastore.db (a relative path, so it is resolved against the current working
# directory); sqlite3 creates the file if it does not exist yet.
conn = sqlite3.connect('datastore.db')
# Module-level cursor shared by check_data() and insert_data().
c = conn.cursor()
# IF NOT EXISTS makes this safe to run on every invocation: rows stored by
# earlier runs are kept, which is what lets the script resume where it left off.
c.execute("""CREATE TABLE IF NOT EXISTS raw_data (id INTEGER, xml BLOB)""")
def check_data(_id):
    """Return how many rows in ``raw_data`` are already stored for *_id*.

    Args:
        _id: WorldCat id to look up, as an int or a numeric string.

    Returns:
        The number of matching rows (0 when the id has not been stored yet).
    """
    # Bind the id as a parameter instead of splicing it into the SQL text:
    # this removes the injection risk and also accepts ints, which the old
    # "".join(...) construction rejected with a TypeError. A numeric string is
    # still matched against the INTEGER column thanks to SQLite's comparison
    # affinity rules.
    row = c.execute(
        "SELECT COUNT(*) FROM raw_data WHERE id = ?", (_id,)
    ).fetchone()
    return row[0]
def insert_data(_id, xml):
    """Store one downloaded record in ``raw_data`` and commit immediately.

    Args:
        _id: WorldCat id of the record, as an int or a numeric string
            (the INTEGER column affinity converts numeric text to an integer).
        xml: Text of the record to store, written to the ``xml`` column as-is.
    """
    # Parameter binding keeps quotes or other SQL metacharacters in the payload
    # from breaking (or injecting into) the statement, which the previous
    # string-built INSERT did not guard against.
    c.execute("INSERT INTO raw_data (id, xml) VALUES (?, ?)", (_id, xml))
    # Commit per record so an interrupted run keeps everything fetched so far.
    conn.commit()
import requests
import urllib.parse
import xml.etree.ElementTree as ET
import time
def worldcat_record(oclc_id, key, format="atom", schema='info%3Asrw%2Fschema%2Fdc', timeout=30):
    """Request a single record from the WorldCat catalog content web service.

    Args:
        oclc_id: WorldCat (OCLC) id of the record; converted with ``str()``.
        key: API key, sent as the ``wskey`` query parameter.
        format: Value for the ``format`` query parameter.
        schema: Value for the ``recordSchema`` query parameter. It must already
            be percent-encoded, because it is appended to the URL verbatim.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely, which would hang an
            unattended cron run.

    Returns:
        The ``requests.Response`` for the GET request. The status code is not
        checked here; callers inspect ``status_code`` themselves.

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            timeout expires.
    """
    # The URL is assembled by hand rather than via ``params=`` so that the
    # pre-encoded schema value is not percent-encoded a second time.
    built_url = "".join([
        "http://www.worldcat.org/webservices/catalog/content/",
        str(oclc_id),
        "?wskey=", str(key),
        "&format=", format,
        "&recordSchema=", schema,
    ])
    return requests.get(built_url, timeout=timeout)
# Download every id that is not in the database yet, stopping at the first
# non-OK response so the remaining ids can be picked up by a later run.
for w_id in ids_list:
    record_id = str(w_id)

    # Already stored by a previous run: nothing to do for this id.
    if check_data(record_id) > 0:
        continue

    # Pause before each request (presumably to avoid hammering the API).
    time.sleep(3)
    response = worldcat_record(record_id, KEY)

    # Any non-OK status (e.g. the daily key limit) ends the run; report it.
    if response.status_code != requests.codes.ok:
        print(response.status_code)
        break

    # The XML is stored URL-encoded; decode with unquote_plus when parsing.
    insert_data(record_id, urllib.parse.quote_plus(response.text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment