# jeffgerhard/ia_download_and_bag.py

## ia_download_and_bag.py
# for python 3.5
# takes a list of identifiers and exports a csv containing various metadata
# and status info from archive.org
#
# for use with IA lists scraped from catalogs of the users and the TT-Scribe
# (can also work with lists sent to Jye if we want to see how many are still
# not scanned)
#
# in the future maybe I can figure out how to take the results and auto-update the Access DB
# could clean up/simplify the code for sure
#

import pycurl
import json
import time
import csv
from tkinter.filedialog import asksaveasfilename, askopenfilename
from io import BytesIO


# (archive.org metadata key, CSV column) pairs whose values are copied as-is.
_COPIED_FIELDS = (
    ('bookid', 'manual_bookid'),
    ('volume', 'volume'),
    ('imagecount', 'imagecount'),
    ('sponsordate', 'invoice_date'),
    ('title', 'IA_title'),
    )


def _strip_leading_zero(part):
    """Drop a single leading "0" from a date component ("03" -> "3")."""
    # Slicing (rather than indexing) keeps empty or one-character components
    # from raising IndexError when a date string is shorter than expected.
    return part[1:] if part.startswith("0") else part


def _format_scandate(raw):
    """Reformat a scandate sliced as YYYYMMDDHHMMSS into M/D/YYYY HH:MM:SS."""
    raw = str(raw)
    # to do: Adjust time to Eastern. figure out python date/time handling
    return '{}/{}/{} {}:{}:{}'.format(
        _strip_leading_zero(raw[4:6]), _strip_leading_zero(raw[6:8]),
        raw[0:4], raw[8:10], raw[10:12], raw[12:]
        )


def _format_publicdate(raw):
    """Reformat a publicdate into M/D/YYYY plus whatever follows the date.

    The slice offsets skip one separator character after the year, month and
    day -- presumably the value looks like "YYYY-MM-DD HH:MM:SS"; confirm
    against real archive.org responses.
    """
    raw = str(raw)
    # to do: Adjust time to Eastern. figure out python date/time handling
    return '{}/{}/{} {}'.format(
        _strip_leading_zero(raw[5:7]), _strip_leading_zero(raw[8:10]),
        raw[0:4], raw[11:]
        )


def _fetch_json(url):
    """Download *url* with pycurl and return the parsed JSON document.

    Raises pycurl.error on transport failures and ValueError when the body
    is not valid UTF-8 encoded JSON.
    """
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        # Follow redirects (e.g. http -> https) instead of trying to parse
        # the redirect page as JSON.
        c.setopt(c.FOLLOWLOCATION, True)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # Release the handle even when perform() raises.
        c.close()
    return json.loads(buffer.getvalue().decode('utf-8'))


def getMetadata(fileid):
    """Look up one identifier on archive.org and summarise it as a CSV row.

    Parameters:
        fileid: the archive.org identifier (str) to query.

    Returns:
        A ``(row, status)`` tuple.  ``row`` is a dict keyed by CSV column
        name; it always contains ``search_id`` and otherwise only the columns
        for which data was found.  ``status`` is:

        0 -- the response has an ``is_dark`` key (also the value left over
             when none of the cases below applies)
        1 -- the metadata has a ``scandate``
        2 -- the metadata has no ``identifier-access`` (overrides 1 and 3)
        3 -- the response lists files but the metadata has no ``scandate``
        4 -- the request failed, the body was not JSON, or the response has
             no ``metadata`` section

    One failing identifier yields a status 4 row instead of raising, so a
    long batch is not aborted by a single bad request.
    """
    iaurl = "http://archive.org/metadata/" + fileid
    # ***** this status scheme makes no sense anymore and I will work on it
    # later
    z = {'search_id': fileid}

    try:
        j = _fetch_json(iaurl)
    except (pycurl.error, ValueError) as err:
        # ValueError covers both JSON and UTF-8 decoding failures.
        print('error on ', fileid, '-', err)
        z['IA_URL'] = 'UNKNOWN ERROR'
        return (z, 4)

    if not isinstance(j, dict):
        # Anything but a JSON object is handled like an empty response.
        j = {}

    if 'is_dark' in j:
        z['dark'] = str(j['is_dark'])
        return (z, 0)

    online = 0
    if 'files' in j:
        online = 3
        for f in j['files']:
            # .get(): tolerate file entries that carry no format.
            if f.get('format') in ("MARC Binary", "MARC"):
                z['MARC'] = "True"

    if 'metadata' not in j:
        print('error on ', fileid)
        z['IA_URL'] = 'UNKNOWN ERROR'
        return (z, 4)

    x = j['metadata']

    for source_key, column in _COPIED_FIELDS:
        if source_key in x:
            z[column] = x[source_key]

    if 'scanningcenter' in x:
        z['Scribed'] = 'True'

    if 'scandate' in x:
        online = 1
        z['Scribe_date'] = _format_scandate(x['scandate'])

    if 'publicdate' in x:
        z['public_date'] = _format_publicdate(x['publicdate'])

    if 'noindex' in x:
        z['noindex'] = str(x['noindex'])

    if 'identifier-access' in x:
        z['IA_URL'] = x['identifier-access']
    else:
        online = 2

    return (z, online)
# Interactive driver: ask for a text file of identifiers (one per line), look
# each one up with getMetadata(), write the rows to a CSV chosen by the user
# and print a summary of the status codes that came back.
localdir = 'H:\\DIGInitSPECL\\Digital Initiatives\\Digitization\\IA information'
inputfile = askopenfilename(
    title='Open list of identifiers', initialdir=localdir
    )
if not inputfile:
    # The dialog returns an empty value when it is cancelled.
    raise SystemExit("No identifier list selected; nothing to do.")
with open(inputfile, 'r') as fh:
    lines = fh.read()
# Ignore blank lines and surrounding whitespace so they are not sent to
# archive.org as identifiers.
ids = [line.strip() for line in lines.splitlines() if line.strip()]
listtotal = len(ids)
filename = asksaveasfilename(
    defaultextension='.csv', initialfile='output',
    title="Save As...", initialdir=localdir
    )
if not filename:
    raise SystemExit("No output file selected; nothing to do.")
print("List total: " + str(listtotal))
print("working...")

with open(filename, mode='w', encoding='utf-8', newline='') as csvfile:
    fieldnames = [
        'search_id', 'manual_bookid', 'volume', 'Scribed',
        'Scribe_date', 'public_date', 'imagecount', 'MARC', 'noindex',
        'dark', 'IA_URL', 'invoice_date', 'IA_title'
        ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect='excel')
    complete = 0
    dark = 0
    otherscans = 0
    notAvailable = 0
    errors = 0
    # Report progress roughly every 10%.  Clamp to 1 so that lists shorter
    # than ten entries do not cause a modulo-by-zero below.
    dot = max(1, listtotal // 10)
    writer.writeheader()
    for counter, x in enumerate(ids, 1):
        # Pause between requests to go easy on archive.org.
        time.sleep(.3)
        newline, online = getMetadata(x)
        writer.writerow(newline)
        if online == 1:
            complete += 1
        elif online == 0:
            dark += 1
        elif online == 3:
            otherscans += 1
        elif online == 4:
            errors += 1
        else:
            notAvailable += 1
        if counter % dot == 0:
            print("Processed {} of {}...".format(counter, listtotal))

    print("\n\nScribed and available: " + str(complete))
    if otherscans > 0:
        print("Non-Scribe scans: " + str(otherscans))
    if notAvailable > 0:
        print("Scribed but not available on archive.org: " + str(notAvailable))
    if dark > 0:
        print("Made dark: " + str(dark))
    if errors > 0:
        print("Errors or problems: " + str(errors))
print("")
# Keep the console window open until the user has read the summary.
input("(Press Enter)")
	# for python 3.5
	# takes a list of identifiers and exports a csv containing various metadata
	# and status info from archive.org
	#
	# for use with IA lists scraped from catalogs of the users and the TT-Scribe
	# (can also work with lists sent to Jye if we want to see how many are still
	# not scanned)
	#
	# in the future maybe I can figure out how to take the results and auto-update the Access DB
	# could clean up/simplify the code for sure
	#

	import pycurl
	import json
	import time
	import csv
	from tkinter.filedialog import asksaveasfilename, askopenfilename
	from io import BytesIO


	# (archive.org metadata key, CSV column) pairs whose values are copied as-is.
	_COPIED_FIELDS = (
	    ('bookid', 'manual_bookid'),
	    ('volume', 'volume'),
	    ('imagecount', 'imagecount'),
	    ('sponsordate', 'invoice_date'),
	    ('title', 'IA_title'),
	    )


	def _strip_leading_zero(part):
	    """Drop a single leading "0" from a date component ("03" -> "3")."""
	    # Slicing (rather than indexing) keeps empty or one-character components
	    # from raising IndexError when a date string is shorter than expected.
	    return part[1:] if part.startswith("0") else part


	def _format_scandate(raw):
	    """Reformat a scandate sliced as YYYYMMDDHHMMSS into M/D/YYYY HH:MM:SS."""
	    raw = str(raw)
	    # to do: Adjust time to Eastern. figure out python date/time handling
	    return '{}/{}/{} {}:{}:{}'.format(
	        _strip_leading_zero(raw[4:6]), _strip_leading_zero(raw[6:8]),
	        raw[0:4], raw[8:10], raw[10:12], raw[12:]
	        )


	def _format_publicdate(raw):
	    """Reformat a publicdate into M/D/YYYY plus whatever follows the date.

	    The slice offsets skip one separator character after the year, month and
	    day -- presumably the value looks like "YYYY-MM-DD HH:MM:SS"; confirm
	    against real archive.org responses.
	    """
	    raw = str(raw)
	    # to do: Adjust time to Eastern. figure out python date/time handling
	    return '{}/{}/{} {}'.format(
	        _strip_leading_zero(raw[5:7]), _strip_leading_zero(raw[8:10]),
	        raw[0:4], raw[11:]
	        )


	def _fetch_json(url):
	    """Download *url* with pycurl and return the parsed JSON document.

	    Raises pycurl.error on transport failures and ValueError when the body
	    is not valid UTF-8 encoded JSON.
	    """
	    buffer = BytesIO()
	    c = pycurl.Curl()
	    try:
	        c.setopt(c.URL, url)
	        # Follow redirects (e.g. http -> https) instead of trying to parse
	        # the redirect page as JSON.
	        c.setopt(c.FOLLOWLOCATION, True)
	        c.setopt(c.WRITEDATA, buffer)
	        c.perform()
	    finally:
	        # Release the handle even when perform() raises.
	        c.close()
	    return json.loads(buffer.getvalue().decode('utf-8'))


	def getMetadata(fileid):
	    """Look up one identifier on archive.org and summarise it as a CSV row.

	    Parameters:
	        fileid: the archive.org identifier (str) to query.

	    Returns:
	        A ``(row, status)`` tuple.  ``row`` is a dict keyed by CSV column
	        name; it always contains ``search_id`` and otherwise only the columns
	        for which data was found.  ``status`` is:

	        0 -- the response has an ``is_dark`` key (also the value left over
	             when none of the cases below applies)
	        1 -- the metadata has a ``scandate``
	        2 -- the metadata has no ``identifier-access`` (overrides 1 and 3)
	        3 -- the response lists files but the metadata has no ``scandate``
	        4 -- the request failed, the body was not JSON, or the response has
	             no ``metadata`` section

	    One failing identifier yields a status 4 row instead of raising, so a
	    long batch is not aborted by a single bad request.
	    """
	    iaurl = "http://archive.org/metadata/" + fileid
	    # ***** this status scheme makes no sense anymore and I will work on it
	    # later
	    z = {'search_id': fileid}

	    try:
	        j = _fetch_json(iaurl)
	    except (pycurl.error, ValueError) as err:
	        # ValueError covers both JSON and UTF-8 decoding failures.
	        print('error on ', fileid, '-', err)
	        z['IA_URL'] = 'UNKNOWN ERROR'
	        return (z, 4)

	    if not isinstance(j, dict):
	        # Anything but a JSON object is handled like an empty response.
	        j = {}

	    if 'is_dark' in j:
	        z['dark'] = str(j['is_dark'])
	        return (z, 0)

	    online = 0
	    if 'files' in j:
	        online = 3
	        for f in j['files']:
	            # .get(): tolerate file entries that carry no format.
	            if f.get('format') in ("MARC Binary", "MARC"):
	                z['MARC'] = "True"

	    if 'metadata' not in j:
	        print('error on ', fileid)
	        z['IA_URL'] = 'UNKNOWN ERROR'
	        return (z, 4)

	    x = j['metadata']

	    for source_key, column in _COPIED_FIELDS:
	        if source_key in x:
	            z[column] = x[source_key]

	    if 'scanningcenter' in x:
	        z['Scribed'] = 'True'

	    if 'scandate' in x:
	        online = 1
	        z['Scribe_date'] = _format_scandate(x['scandate'])

	    if 'publicdate' in x:
	        z['public_date'] = _format_publicdate(x['publicdate'])

	    if 'noindex' in x:
	        z['noindex'] = str(x['noindex'])

	    if 'identifier-access' in x:
	        z['IA_URL'] = x['identifier-access']
	    else:
	        online = 2

	    return (z, online)
	# Interactive driver: ask for a text file of identifiers (one per line), look
	# each one up with getMetadata(), write the rows to a CSV chosen by the user
	# and print a summary of the status codes that came back.
	localdir = 'H:\\DIGInitSPECL\\Digital Initiatives\\Digitization\\IA information'
	inputfile = askopenfilename(
	    title='Open list of identifiers', initialdir=localdir
	    )
	if not inputfile:
	    # The dialog returns an empty value when it is cancelled.
	    raise SystemExit("No identifier list selected; nothing to do.")
	with open(inputfile, 'r') as fh:
	    lines = fh.read()
	# Ignore blank lines and surrounding whitespace so they are not sent to
	# archive.org as identifiers.
	ids = [line.strip() for line in lines.splitlines() if line.strip()]
	listtotal = len(ids)
	filename = asksaveasfilename(
	    defaultextension='.csv', initialfile='output',
	    title="Save As...", initialdir=localdir
	    )
	if not filename:
	    raise SystemExit("No output file selected; nothing to do.")
	print("List total: " + str(listtotal))
	print("working...")

	with open(filename, mode='w', encoding='utf-8', newline='') as csvfile:
	    fieldnames = [
	        'search_id', 'manual_bookid', 'volume', 'Scribed',
	        'Scribe_date', 'public_date', 'imagecount', 'MARC', 'noindex',
	        'dark', 'IA_URL', 'invoice_date', 'IA_title'
	        ]
	    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect='excel')
	    complete = 0
	    dark = 0
	    otherscans = 0
	    notAvailable = 0
	    errors = 0
	    # Report progress roughly every 10%.  Clamp to 1 so that lists shorter
	    # than ten entries do not cause a modulo-by-zero below.
	    dot = max(1, listtotal // 10)
	    writer.writeheader()
	    for counter, x in enumerate(ids, 1):
	        # Pause between requests to go easy on archive.org.
	        time.sleep(.3)
	        newline, online = getMetadata(x)
	        writer.writerow(newline)
	        if online == 1:
	            complete += 1
	        elif online == 0:
	            dark += 1
	        elif online == 3:
	            otherscans += 1
	        elif online == 4:
	            errors += 1
	        else:
	            notAvailable += 1
	        if counter % dot == 0:
	            print("Processed {} of {}...".format(counter, listtotal))

	    print("\n\nScribed and available: " + str(complete))
	    if otherscans > 0:
	        print("Non-Scribe scans: " + str(otherscans))
	    if notAvailable > 0:
	        print("Scribed but not available on archive.org: " + str(notAvailable))
	    if dark > 0:
	        print("Made dark: " + str(dark))
	    if errors > 0:
	        print("Errors or problems: " + str(errors))
	print("")
	# Keep the console window open until the user has read the summary.
	input("(Press Enter)")