Skip to content

Instantly share code, notes, and snippets.

@jeffgerhard
Last active August 2, 2016 15:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeffgerhard/c88f5b71a790679c47816689ecda952b to your computer and use it in GitHub Desktop.
Save jeffgerhard/c88f5b71a790679c47816689ecda952b to your computer and use it in GitHub Desktop.
first step towards a local mirroring of internet archive content [in progress]
# for python 3.5
# takes a list of identifiers and exports a csv containing various metadata
# and status info from archive.org
#
# for use with IA lists scraped from catalogs of the users and the TT-Scribe
# (can also work with lists sent to Jye if we want to see how many are still
# not scanned)
#
# in the future maybe I can figure out how to take the results and auto-update the Access DB
# could clean up/simplify the code for sure
#
import pycurl
import json
import time
import csv
from tkinter.filedialog import asksaveasfilename, askopenfilename
from io import BytesIO
def _strip_leading_zero(part):
    """Return a two-character date part without its leading zero ('03' -> '3')."""
    if len(part) == 2 and part[0] == "0":
        return part[1]
    return part


def _format_scandate(raw):
    """Reformat a 'YYYYMMDDHHMMSS' scandate string as 'M/D/YYYY HH:MM:SS'.

    Values too short to hold a full date are returned unchanged rather than
    raising IndexError; missing time parts are simply omitted.
    """
    if len(raw) < 8:
        return raw
    date = '/'.join((
        _strip_leading_zero(raw[4:6]),
        _strip_leading_zero(raw[6:8]),
        raw[0:4],
    ))
    clock = ':'.join(p for p in (raw[8:10], raw[10:12], raw[12:]) if p)
    # to do: Adjust time to Eastern. figure out python date/time handling
    return (date + ' ' + clock) if clock else date


def _format_publicdate(raw):
    """Reformat a 'YYYY-MM-DD HH:MM:SS' publicdate string as 'M/D/YYYY HH:MM:SS'.

    Values too short to hold a full date are returned unchanged rather than
    raising IndexError.
    """
    if len(raw) < 10:
        return raw
    date = '/'.join((
        _strip_leading_zero(raw[5:7]),
        _strip_leading_zero(raw[8:10]),
        raw[0:4],
    ))
    clock = raw[11:]
    # to do: Adjust time to Eastern. figure out python date/time handling
    return (date + ' ' + clock) if clock else date


def _fetch_metadata_json(fileid):
    """Download and JSON-decode the archive.org metadata record for *fileid*.

    Raises pycurl.error on transfer failure and ValueError when the response
    body is not valid UTF-8 JSON.
    """
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, "http://archive.org/metadata/" + fileid)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # Always release the curl handle, even when the transfer fails.
        c.close()
    return json.loads(buffer.getvalue().decode('utf-8'))


def getMetadata(fileid):
    """Look up one identifier on archive.org and summarise its metadata.

    Parameters:
        fileid: the archive.org identifier to query.

    Returns:
        A tuple ``(row, status)``.  ``row`` is a dict destined for the CSV
        writer; it always holds 'search_id' and may hold 'dark', 'MARC',
        'manual_bookid', 'volume', 'Scribed', 'Scribe_date', 'public_date',
        'imagecount', 'noindex', 'IA_URL', 'invoice_date' and 'IA_title'.
        ``status`` is an int:
            0 - record has 'is_dark' (or none of the conditions below applied)
            1 - metadata has a 'scandate'
            2 - metadata has no 'identifier-access' (overrides 1 and 3)
            3 - record lists files but has no 'scandate'
            4 - error: request failed, response unusable, or no 'metadata'
    """
    # ***** this method makes no sense anymore and I will work on it later
    # (original author's note about the status codes)
    z = {'search_id': fileid}

    def error_result():
        # Shared shape for every failure path so the CSV row stays uniform.
        z['IA_URL'] = 'UNKNOWN ERROR'
        return (z, 4)

    try:
        j = _fetch_metadata_json(fileid)
    except (pycurl.error, ValueError) as err:
        # A single bad identifier or network hiccup should be recorded as an
        # error row instead of aborting the whole batch.
        print('error on ', fileid, '-', err)
        return error_result()

    if not isinstance(j, dict):
        print('error on ', fileid)
        return error_result()

    if 'is_dark' in j:
        z['dark'] = str(j['is_dark'])
        return (z, 0)

    online = 0
    if 'files' in j:
        online = 3
        for f in j['files']:
            # .get(): tolerate file entries that carry no 'format' key.
            if f.get('format') in ("MARC Binary", "MARC"):
                z['MARC'] = "True"

    if 'metadata' not in j:
        print('error on ', fileid)
        return error_result()

    x = j['metadata']
    if 'bookid' in x:
        z['manual_bookid'] = x['bookid']
    if 'volume' in x:
        z['volume'] = x['volume']
    if 'scanningcenter' in x:
        z['Scribed'] = 'True'
    if 'scandate' in x:
        online = 1
        z['Scribe_date'] = _format_scandate(str(x['scandate']))
    if 'publicdate' in x:
        z['public_date'] = _format_publicdate(str(x['publicdate']))
    if 'imagecount' in x:
        z['imagecount'] = x['imagecount']
    if 'noindex' in x:
        z['noindex'] = str(x['noindex'])
    if 'identifier-access' in x:
        z['IA_URL'] = x['identifier-access']
    else:
        online = 2
    if 'sponsordate' in x:
        z['invoice_date'] = x['sponsordate']
    if 'title' in x:
        z['IA_title'] = x['title']
    return (z, online)
# --- Script body: pick an identifier list, query each id, write a CSV report ---
localdir = 'H:\\DIGInitSPECL\\Digital Initiatives\\Digitization\\IA information'
inputfile = askopenfilename(
    title='Open list of identifiers', initialdir=localdir
)
if not inputfile:
    # The dialog returns an empty value when cancelled; open() would crash.
    raise SystemExit("No input file selected.")
with open(inputfile, 'r') as fh:
    # One identifier per line.  Blank lines and stray whitespace are dropped
    # so they are not sent to archive.org as bogus identifiers.
    ids = [line.strip() for line in fh if line.strip()]
listtotal = len(ids)
filename = asksaveasfilename(
    defaultextension='.csv', initialfile='output',
    title="Save As...", initialdir=localdir
)
if not filename:
    raise SystemExit("No output file selected.")
print("List total: " + str(listtotal))
print("working...")
with open(filename, mode='w', encoding='utf-8', newline='') as csvfile:
    fieldnames = [
        'search_id', 'manual_bookid', 'volume', 'Scribed',
        'Scribe_date', 'public_date', 'imagecount', 'MARC', 'noindex',
        'dark', 'IA_URL', 'invoice_date', 'IA_title'
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect='excel')
    # Tallies keyed to the status code returned by getMetadata().
    complete = 0
    dark = 0
    otherscans = 0
    notAvailable = 0
    errors = 0
    # Report progress roughly every 10%.  Clamp to 1 so lists shorter than
    # ten entries do not cause a modulo-by-zero below.
    dot = max(1, listtotal // 10)
    writer.writeheader()
    for counter, x in enumerate(ids, start=1):
        # Pause between requests to avoid hammering archive.org.
        time.sleep(.3)
        row, online = getMetadata(x)
        writer.writerow(row)
        if online == 1:
            complete += 1
        elif online == 0:
            dark += 1
        elif online == 3:
            otherscans += 1
        elif online == 4:
            errors += 1
        else:
            notAvailable += 1
        if counter % dot == 0:
            print("Processed {} of {}...".format(counter, listtotal))
print("\n\nScribed and available: " + str(complete))
if otherscans > 0:
    print("Non-Scribe scans: " + str(otherscans))
if notAvailable > 0:
    print("Scribed but not available on archive.org: " + str(notAvailable))
if dark > 0:
    print("Made dark: " + str(dark))
if errors > 0:
    print("Errors or problems: " + str(errors))
print("")
# Keep the console window open until the user acknowledges the summary.
input("(Press Enter)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment