# jeffgerhard/grab_IA_file_info.py

## grab_IA_file_info.py
from internetarchive import get_item
import pycurl
import json
from io import BytesIO


def getFileInfo(x):
    """Describe the ``<x>_orig_jp2.tar`` file of Internet Archive item *x*.

    The item's file list is obtained from ``get_item(x).files`` and scanned
    for an entry whose ``'name'`` is ``x + '_orig_jp2.tar'``.

    Returns:
        ``[human_readable_size, md5, sha1, size]`` taken from the matching
        entry, or ``['n/a', 'n/a', 'n/a', 'n/a']`` when no entry matches.
    """
    entries = get_item(x).files
    wanted = x + '_orig_jp2.tar'
    found = None
    for entry in entries:
        if entry['name'] == wanted:
            # The scan does not stop here, so the last matching entry wins.
            found = entry
    if found is None:
        return ['n/a'] * 4
    raw_size = found['size']
    md5_sum = found['md5']
    sha1_sum = found['sha1']
    return [humanbytes(raw_size), md5_sum, sha1_sum, raw_size]


def humanbytes(B):
    """Return the given bytes as a human friendly KB, MB, GB, or TB string.

    Args:
        B: The byte count; anything ``float()`` accepts (int, float or a
            numeric string).

    Returns:
        A string. Values below 1024 are shown as the float byte count
        followed by ``Byte`` (exactly one) or ``Bytes`` (anything else);
        larger values are divided by the matching power of 1024 and shown
        with two decimals and a ``KB``/``MB``/``GB``/``TB`` suffix.

    Raises:
        ValueError: If *B* cannot be converted by ``float()``.
    """
    # via http://stackoverflow.com/a/31631711
    B = float(B)
    KB = float(1024)
    MB = float(KB ** 2)  # 1,048,576
    GB = float(KB ** 3)  # 1,073,741,824
    TB = float(KB ** 4)  # 1,099,511,627,776

    if B < KB:
        # Singular only for exactly one byte; zero and all other counts
        # take the plural.
        return '{0} {1}'.format(B, 'Byte' if B == 1 else 'Bytes')
    if B < MB:
        return '{0:.2f} KB'.format(B / KB)
    if B < GB:
        return '{0:.2f} MB'.format(B / MB)
    if B < TB:
        return '{0:.2f} GB'.format(B / GB)
    # Everything from one terabyte upwards is reported in TB, so the
    # function always returns a string.
    return '{0:.2f} TB'.format(B / TB)


def _drop_zero_pad(part):
    """Return a two-character date field without its leading zero ('07' -> '7')."""
    if len(part) == 2 and part[0] == "0":
        return part[1]
    # Short or empty fields (malformed dates) are passed through untouched.
    return part


def _format_scandate(raw):
    """Rearrange a scandate string into 'M/D/YYYY hh:mm:ss'.

    The value is sliced positionally: year [0:4], month [4:6], day [6:8],
    hour [8:10], minute [10:12] and the remainder as seconds.
    """
    year = raw[0:4]
    month = _drop_zero_pad(raw[4:6])
    day = _drop_zero_pad(raw[6:8])
    # to do: Adjust time to Eastern. figure out python date/time handling
    clock = raw[8:10] + ':' + raw[10:12] + ':' + raw[12:]
    return month + '/' + day + '/' + year + ' ' + clock


def _format_publicdate(raw):
    """Rearrange a publicdate string into 'M/D/YYYY <time>'.

    The value is sliced positionally: year [0:4], month [5:7], day [8:10]
    and everything from index 11 on as the time, i.e. one separator
    character is skipped after the year, the month and the day.
    """
    year = raw[0:4]
    month = _drop_zero_pad(raw[5:7])
    day = _drop_zero_pad(raw[8:10])
    # to do: Adjust time to Eastern. figure out python date/time handling
    return month + '/' + day + '/' + year + ' ' + raw[11:]


def getMetadata(fileid):
    """Download the archive.org metadata record for *fileid* and flatten it.

    The JSON document at ``http://archive.org/metadata/<fileid>`` is fetched
    with pycurl and selected values are copied into a flat dict.

    Args:
        fileid: Identifier appended to the metadata URL (a string).

    Returns:
        A ``(fields, status)`` tuple. ``fields`` always contains
        ``'search_id'``; the other keys (``'dark'``, ``'MARC'``,
        ``'manual_bookid'``, ``'volume'``, ``'Scribed'``, ``'Scribe_date'``,
        ``'public_date'``, ``'imagecount'``, ``'noindex'``, ``'IA_URL'``,
        ``'invoice_date'``, ``'IA_title'``) appear only when the matching
        data is present in the response. ``status`` is:

        * 0 - the response has ``'is_dark'`` (or nothing below applied)
        * 1 - the metadata has a ``'scandate'``
        * 2 - the metadata has no ``'identifier-access'``
        * 3 - the response lists ``'files'`` but has no ``'scandate'``
        * 4 - the response has no ``'metadata'`` section

        When several apply, 2 takes precedence over 1, and 1 over 3.

    Errors raised by the transfer (``pycurl.error``), by UTF-8 decoding or by
    JSON parsing are not caught here and propagate to the caller.
    """
    iaurl = "http://archive.org/metadata/" + fileid
    online = 0
    # 0 for dark, 1 for scribed, 2 if scanned but not available, 3 if available
    # but not via scribe?, 4 for error
    #
    # ***** this method makes no sense anymore and I will work on it later
    z = dict()
    z['search_id'] = fileid
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, iaurl)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # Release the curl handle even when the transfer raises.
        c.close()

    j = json.loads(buffer.getvalue().decode('utf-8'))
    if 'is_dark' in j:
        z['dark'] = str(j['is_dark'])
        return (z, 0)

    if 'files' in j:
        online = 3
        for f in j['files']:
            # .get(): an entry without a 'format' key simply is not MARC.
            if f.get('format') in ("MARC Binary", "MARC"):
                z['MARC'] = "True"

    if 'metadata' not in j:
        print('error on ', fileid)
        z['IA_URL'] = 'UNKNOWN ERROR'
        return (z, 4)

    x = j['metadata']

    if 'bookid' in x:
        z['manual_bookid'] = x['bookid']

    if 'volume' in x:
        z['volume'] = x['volume']

    if 'scanningcenter' in x:
        z['Scribed'] = 'True'

    if 'scandate' in x:
        online = 1
        z['Scribe_date'] = _format_scandate(str(x['scandate']))

    if 'publicdate' in x:
        z['public_date'] = _format_publicdate(str(x['publicdate']))

    if 'imagecount' in x:
        z['imagecount'] = x['imagecount']

    if 'noindex' in x:
        z['noindex'] = str(x['noindex'])

    if 'identifier-access' in x:
        z['IA_URL'] = x['identifier-access']
    else:
        # A missing access URL overrides whatever status was set above.
        online = 2

    if 'sponsordate' in x:
        z['invoice_date'] = x['sponsordate']

    if 'title' in x:
        z['IA_title'] = x['title']

    return (z, online)


def main():
    """Prompt for an Internet Archive id and print its file and metadata details."""
    item_id = input('Enter an IA id: ')
    file_info = getFileInfo(item_id)
    # getMetadata also returns a numeric status; it is not used here.
    metadata, _status = getMetadata(item_id)
    print(
        '\n{}\n\n:::orig_jp2.tar file details:::\nsize:\t{}\nmd5:\t{}\nsha1:\t{}\n'
        'size in bytes:\t{}\n'.format(
            item_id,
            file_info[0],
            str(file_info[1]),
            str(file_info[2]),
            str(file_info[3]),
        )
    )
    print('::: metadata :::')
    # Keys exactly as getMetadata stores them; the book id is kept under
    # 'manual_bookid'.
    fields = [
        'volume', 'manual_bookid', 'dark', 'Scribed', 'Scribe_date',
        'public_date', 'imagecount', 'noindex', 'MARC', 'IA_URL',
        'invoice_date', 'IA_title',
    ]
    for field in fields:
        if field not in metadata:
            # Fields absent from the record are skipped.
            continue
        line = '{}:\t{}'.format(field, metadata[field])
        try:
            print(line)
        except UnicodeEncodeError:
            # The console encoding cannot show this value; print an
            # ASCII-escaped form instead of dropping the field.
            print(line.encode('ascii', 'backslashreplace').decode('ascii'))
    input('hit enter to close')


# Run the interactive prompt only when executed as a script, not on import.
if __name__ == '__main__':
    main()
	from internetarchive import get_item
	import pycurl
	import json
	from io import BytesIO


	def getFileInfo(x):
	    """Describe the ``<x>_orig_jp2.tar`` file of Internet Archive item *x*.

	    The item's file list is obtained from ``get_item(x).files`` and scanned
	    for an entry whose ``'name'`` is ``x + '_orig_jp2.tar'``.

	    Returns:
	        ``[human_readable_size, md5, sha1, size]`` taken from the matching
	        entry, or ``['n/a', 'n/a', 'n/a', 'n/a']`` when no entry matches.
	    """
	    info = 'n/a'
	    files = get_item(x).files
	    for z in files:
	        if z['name'] == x + '_orig_jp2.tar':
	            # The scan does not stop here, so the last matching entry wins.
	            info = z
	    if not info == 'n/a':
	        size = info['size']
	        md5 = info['md5']
	        sha1 = info['sha1']
	        return [humanbytes(size), md5, sha1, size]
	    else:
	        return ['n/a', 'n/a', 'n/a', 'n/a']


	def humanbytes(B):
	    """Return the given bytes as a human friendly KB, MB, GB, or TB string.

	    Args:
	        B: The byte count; anything ``float()`` accepts (int, float or a
	            numeric string).

	    Returns:
	        A string. Values below 1024 are shown as the float byte count
	        followed by ``Byte`` (exactly one) or ``Bytes`` (anything else);
	        larger values are divided by the matching power of 1024 and shown
	        with two decimals and a ``KB``/``MB``/``GB``/``TB`` suffix.

	    Raises:
	        ValueError: If *B* cannot be converted by ``float()``.
	    """
	    # via http://stackoverflow.com/a/31631711
	    B = float(B)
	    KB = float(1024)
	    MB = float(KB ** 2)  # 1,048,576
	    GB = float(KB ** 3)  # 1,073,741,824
	    TB = float(KB ** 4)  # 1,099,511,627,776

	    if B < KB:
	        # Singular only for exactly one byte; zero and all other counts
	        # take the plural.
	        return '{0} {1}'.format(B, 'Byte' if B == 1 else 'Bytes')
	    if B < MB:
	        return '{0:.2f} KB'.format(B / KB)
	    if B < GB:
	        return '{0:.2f} MB'.format(B / MB)
	    if B < TB:
	        return '{0:.2f} GB'.format(B / GB)
	    # Everything from one terabyte upwards is reported in TB, so the
	    # function always returns a string.
	    return '{0:.2f} TB'.format(B / TB)


	def _drop_zero_pad(part):
	    """Return a two-character date field without its leading zero ('07' -> '7')."""
	    if len(part) == 2 and part[0] == "0":
	        return part[1]
	    # Short or empty fields (malformed dates) are passed through untouched.
	    return part


	def _format_scandate(raw):
	    """Rearrange a scandate string into 'M/D/YYYY hh:mm:ss'.

	    The value is sliced positionally: year [0:4], month [4:6], day [6:8],
	    hour [8:10], minute [10:12] and the remainder as seconds.
	    """
	    year = raw[0:4]
	    month = _drop_zero_pad(raw[4:6])
	    day = _drop_zero_pad(raw[6:8])
	    # to do: Adjust time to Eastern. figure out python date/time handling
	    clock = raw[8:10] + ':' + raw[10:12] + ':' + raw[12:]
	    return month + '/' + day + '/' + year + ' ' + clock


	def _format_publicdate(raw):
	    """Rearrange a publicdate string into 'M/D/YYYY <time>'.

	    The value is sliced positionally: year [0:4], month [5:7], day [8:10]
	    and everything from index 11 on as the time, i.e. one separator
	    character is skipped after the year, the month and the day.
	    """
	    year = raw[0:4]
	    month = _drop_zero_pad(raw[5:7])
	    day = _drop_zero_pad(raw[8:10])
	    # to do: Adjust time to Eastern. figure out python date/time handling
	    return month + '/' + day + '/' + year + ' ' + raw[11:]


	def getMetadata(fileid):
	    """Download the archive.org metadata record for *fileid* and flatten it.

	    The JSON document at ``http://archive.org/metadata/<fileid>`` is fetched
	    with pycurl and selected values are copied into a flat dict.

	    Args:
	        fileid: Identifier appended to the metadata URL (a string).

	    Returns:
	        A ``(fields, status)`` tuple. ``fields`` always contains
	        ``'search_id'``; the other keys (``'dark'``, ``'MARC'``,
	        ``'manual_bookid'``, ``'volume'``, ``'Scribed'``, ``'Scribe_date'``,
	        ``'public_date'``, ``'imagecount'``, ``'noindex'``, ``'IA_URL'``,
	        ``'invoice_date'``, ``'IA_title'``) appear only when the matching
	        data is present in the response. ``status`` is:

	        * 0 - the response has ``'is_dark'`` (or nothing below applied)
	        * 1 - the metadata has a ``'scandate'``
	        * 2 - the metadata has no ``'identifier-access'``
	        * 3 - the response lists ``'files'`` but has no ``'scandate'``
	        * 4 - the response has no ``'metadata'`` section

	        When several apply, 2 takes precedence over 1, and 1 over 3.

	    Errors raised by the transfer (``pycurl.error``), by UTF-8 decoding or by
	    JSON parsing are not caught here and propagate to the caller.
	    """
	    iaurl = "http://archive.org/metadata/" + fileid
	    online = 0
	    # 0 for dark, 1 for scribed, 2 if scanned but not available, 3 if available
	    # but not via scribe?, 4 for error
	    #
	    # ***** this method makes no sense anymore and I will work on it later
	    z = dict()
	    z['search_id'] = fileid
	    buffer = BytesIO()
	    c = pycurl.Curl()
	    try:
	        c.setopt(c.URL, iaurl)
	        c.setopt(c.WRITEDATA, buffer)
	        c.perform()
	    finally:
	        # Release the curl handle even when the transfer raises.
	        c.close()

	    j = json.loads(buffer.getvalue().decode('utf-8'))
	    if 'is_dark' in j:
	        z['dark'] = str(j['is_dark'])
	        return (z, 0)

	    if 'files' in j:
	        online = 3
	        for f in j['files']:
	            # .get(): an entry without a 'format' key simply is not MARC.
	            if f.get('format') in ("MARC Binary", "MARC"):
	                z['MARC'] = "True"

	    if 'metadata' not in j:
	        print('error on ', fileid)
	        z['IA_URL'] = 'UNKNOWN ERROR'
	        return (z, 4)

	    x = j['metadata']

	    if 'bookid' in x:
	        z['manual_bookid'] = x['bookid']

	    if 'volume' in x:
	        z['volume'] = x['volume']

	    if 'scanningcenter' in x:
	        z['Scribed'] = 'True'

	    if 'scandate' in x:
	        online = 1
	        z['Scribe_date'] = _format_scandate(str(x['scandate']))

	    if 'publicdate' in x:
	        z['public_date'] = _format_publicdate(str(x['publicdate']))

	    if 'imagecount' in x:
	        z['imagecount'] = x['imagecount']

	    if 'noindex' in x:
	        z['noindex'] = str(x['noindex'])

	    if 'identifier-access' in x:
	        z['IA_URL'] = x['identifier-access']
	    else:
	        # A missing access URL overrides whatever status was set above.
	        online = 2

	    if 'sponsordate' in x:
	        z['invoice_date'] = x['sponsordate']

	    if 'title' in x:
	        z['IA_title'] = x['title']

	    return (z, online)


	def main():
	    """Prompt for an Internet Archive id and print its file and metadata details."""
	    item_id = input('Enter an IA id: ')
	    file_info = getFileInfo(item_id)
	    # getMetadata also returns a numeric status; it is not used here.
	    metadata, _status = getMetadata(item_id)
	    print(
	        '\n{}\n\n:::orig_jp2.tar file details:::\nsize:\t{}\nmd5:\t{}\nsha1:\t{}\n'
	        'size in bytes:\t{}\n'.format(
	            item_id,
	            file_info[0],
	            str(file_info[1]),
	            str(file_info[2]),
	            str(file_info[3]),
	        )
	    )
	    print('::: metadata :::')
	    # Keys exactly as getMetadata stores them; the book id is kept under
	    # 'manual_bookid'.
	    fields = [
	        'volume', 'manual_bookid', 'dark', 'Scribed', 'Scribe_date',
	        'public_date', 'imagecount', 'noindex', 'MARC', 'IA_URL',
	        'invoice_date', 'IA_title',
	    ]
	    for field in fields:
	        if field not in metadata:
	            # Fields absent from the record are skipped.
	            continue
	        line = '{}:\t{}'.format(field, metadata[field])
	        try:
	            print(line)
	        except UnicodeEncodeError:
	            # The console encoding cannot show this value; print an
	            # ASCII-escaped form instead of dropping the field.
	            print(line.encode('ascii', 'backslashreplace').decode('ascii'))
	    input('hit enter to close')


	# Run the interactive prompt only when executed as a script, not on import.
	if __name__ == '__main__':
	    main()