Last active
August 2, 2016 15:44
-
-
Save jeffgerhard/c88f5b71a790679c47816689ecda952b to your computer and use it in GitHub Desktop.
first step towards a local mirroring of internet archive content [in progress]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# for python 3.5
# takes a list of identifiers and exports a csv containing various metadata
# and status info from archive.org
#
# for use with IA lists scraped from catalogs of the users and the TT-Scribe
# (can also work with lists sent to Jye if we want to see how many are still
# not scanned)
#
# in the future maybe I can figure out how to take the results and auto-update the Access DB
# could clean up/simplify the code for sure
#
import pycurl | |
import json | |
import time | |
import csv | |
from tkinter.filedialog import asksaveasfilename, askopenfilename | |
from io import BytesIO | |
def _strip_leading_zero(part):
    """Drop one leading zero so that '08' becomes '8'; other text is unchanged."""
    return part[1:] if part.startswith("0") else part


def _format_scandate(raw):
    """Reformat a 'YYYYMMDDHHMMSS' scandate as 'M/D/YYYY HH:MM:SS'.

    Values too short to contain a full date are returned unchanged, and the
    time portion is only appended when the value is long enough to hold it.
    """
    raw = str(raw)
    if len(raw) < 8:
        return raw
    date = '/'.join((
        _strip_leading_zero(raw[4:6]),
        _strip_leading_zero(raw[6:8]),
        raw[0:4],
    ))
    if len(raw) < 14:
        return date
    # to do: Adjust time to Eastern. figure out python date/time handling
    return date + ' ' + raw[8:10] + ':' + raw[10:12] + ':' + raw[12:]


def _format_publicdate(raw):
    """Reformat a 'YYYY-MM-DD HH:MM:SS' publicdate as 'M/D/YYYY HH:MM:SS'.

    Values too short to contain a full date are returned unchanged.
    """
    raw = str(raw)
    if len(raw) < 10:
        return raw
    date = '/'.join((
        _strip_leading_zero(raw[5:7]),
        _strip_leading_zero(raw[8:10]),
        raw[0:4],
    ))
    # to do: Adjust time to Eastern. figure out python date/time handling
    return date + ' ' + raw[11:]


def _fetch_json(url):
    """Download *url* with pycurl and return the parsed JSON document.

    Raises pycurl.error on transfer failure and ValueError when the body is
    not valid UTF-8 or not valid JSON.
    """
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # Release the handle even when the transfer fails.
        c.close()
    return json.loads(buffer.getvalue().decode('utf-8'))


def _error_row(z, fileid, detail=''):
    """Mark *z* as an error row and return it with status 4."""
    print('error on ', fileid, detail)
    z['IA_URL'] = 'UNKNOWN ERROR'
    return (z, 4)


def getMetadata(fileid):
    """Look up one identifier on archive.org and summarise it as a CSV row.

    Parameters:
        fileid: the archive.org identifier to query.

    Returns:
        A tuple ``(row, status)``. ``row`` is a dict always containing
        'search_id' plus whichever of the other CSV columns could be filled
        in. ``status`` is 0 for dark, 1 for scribed, 2 if scanned but not
        available, 3 if available but not via scribe, and 4 for an error
        (transfer failure, unparseable response, or no 'metadata' section).

    Failures are reported as a status-4 row rather than raised, so that one
    bad identifier does not abort a whole batch.
    """
    z = {'search_id': fileid}
    try:
        j = _fetch_json("http://archive.org/metadata/" + fileid)
    except (pycurl.error, ValueError) as err:
        # ValueError covers both UnicodeDecodeError and json.JSONDecodeError.
        return _error_row(z, fileid, err)
    if not isinstance(j, dict):
        return _error_row(z, fileid)

    if 'is_dark' in j:
        z['dark'] = str(j['is_dark'])
        return (z, 0)

    online = 0
    if 'files' in j:
        online = 3
        # .get() tolerates file entries that carry no 'format' key.
        if any(f.get('format') in ("MARC Binary", "MARC") for f in j['files']):
            z['MARC'] = "True"

    if 'metadata' not in j:
        return _error_row(z, fileid)
    x = j['metadata']

    # Fields copied verbatim: (archive.org metadata key, CSV column).
    for source_key, column in (
        ('bookid', 'manual_bookid'),
        ('volume', 'volume'),
        ('imagecount', 'imagecount'),
        ('sponsordate', 'invoice_date'),
        ('title', 'IA_title'),
    ):
        if source_key in x:
            z[column] = x[source_key]

    if 'scanningcenter' in x:
        z['Scribed'] = 'True'
    if 'scandate' in x:
        online = 1
        z['Scribe_date'] = _format_scandate(x['scandate'])
    if 'publicdate' in x:
        z['public_date'] = _format_publicdate(x['publicdate'])
    if 'noindex' in x:
        z['noindex'] = str(x['noindex'])
    # Checked after scandate on purpose: a missing access URL overrides
    # status 1 with status 2.
    if 'identifier-access' in x:
        z['IA_URL'] = x['identifier-access']
    else:
        online = 2
    return (z, online)
localdir = 'H:\\DIGInitSPECL\\Digital Initiatives\\Digitization\\IA information'

# Column order of the exported CSV.
FIELDNAMES = [
    'search_id', 'manual_bookid', 'volume', 'Scribed',
    'Scribe_date', 'public_date', 'imagecount', 'MARC', 'noindex',
    'dark', 'IA_URL', 'invoice_date', 'IA_title',
]


def read_identifiers(path):
    """Return the identifiers listed one per line in *path*.

    Surrounding whitespace is stripped and blank lines are skipped, so an
    empty line is never sent to archive.org as an identifier.
    """
    with open(path, 'r') as fh:
        stripped = (line.strip() for line in fh)
        return [line for line in stripped if line]


def progress_step(listtotal):
    """Return how many rows to process between progress messages.

    Roughly a tenth of the list, but never less than 1: a step of 0 would
    make the ``counter % step`` progress check divide by zero for lists of
    fewer than ten identifiers.
    """
    return max(1, listtotal // 10)


def main():
    """Prompt for an identifier list, export its metadata CSV, print totals."""
    inputfile = askopenfilename(
        title='Open list of identifiers', initialdir=localdir
    )
    if not inputfile:
        # The dialog yields an empty value when the user cancels.
        print("No input file selected; nothing to do.")
        return
    ids = read_identifiers(inputfile)
    listtotal = len(ids)

    filename = asksaveasfilename(
        defaultextension='.csv', initialfile='output',
        title="Save As...", initialdir=localdir
    )
    if not filename:
        print("No output file selected; nothing to do.")
        return

    print("List total: " + str(listtotal))
    print("working...")

    # Tally per status code returned by getMetadata():
    # 0 dark, 1 scribed, 2 not available, 3 non-Scribe scan, 4 error.
    counts = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}
    step = progress_step(listtotal)
    with open(filename, mode='w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES, dialect='excel')
        writer.writeheader()
        for counter, identifier in enumerate(ids, start=1):
            # Pause between requests to avoid hammering archive.org.
            time.sleep(.3)
            row, online = getMetadata(identifier)
            writer.writerow(row)
            # Any unrecognised status is counted as "not available".
            counts[online if online in counts else 2] += 1
            if counter % step == 0:
                print("Processed {} of {}...".format(counter, listtotal))

    print("\n\nScribed and available: " + str(counts[1]))
    if counts[3] > 0:
        print("Non-Scribe scans: " + str(counts[3]))
    if counts[2] > 0:
        print("Scribed but not available on archive.org: " + str(counts[2]))
    if counts[0] > 0:
        print("Made dark: " + str(counts[0]))
    if counts[4] > 0:
        print("Errors or problems: " + str(counts[4]))
    print("")
    input("(Press Enter)")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment