Skip to content

Instantly share code, notes, and snippets.

@jeffgerhard
Created August 1, 2016 19:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeffgerhard/11e6272425fb4c1a9a081bf056e30b21 to your computer and use it in GitHub Desktop.
Save jeffgerhard/11e6272425fb4c1a9a081bf056e30b21 to your computer and use it in GitHub Desktop.
Utility to quickly pull information for one Internet Archive (IA) item: size, checksums, metadata, scan info, dates, etc.
from internetarchive import get_item
import pycurl
import json
from io import BytesIO
def getFileInfo(x):
    """Describe the ``<x>_orig_jp2.tar`` file of the item with identifier *x*.

    The item's file list is obtained via ``get_item(x).files``.

    Returns a four-element list
    ``[human-readable size, md5, sha1, raw size]``; every element is the
    string ``'n/a'`` when the item has no file with that name.
    """
    candidates = [
        entry for entry in get_item(x).files
        if entry['name'] == x + '_orig_jp2.tar'
    ]
    if not candidates:
        return ['n/a'] * 4

    # Should several entries match, the last one is the one reported.
    chosen = candidates[-1]
    raw_size, md5, sha1 = (chosen[key] for key in ('size', 'md5', 'sha1'))
    return [humanbytes(raw_size), md5, sha1, raw_size]
def humanbytes(B):
    """Return the given byte count as a human friendly string.

    Args:
        B: Number of bytes; anything ``float()`` accepts (int, float or a
            numeric string).

    Returns:
        A string in Bytes, KB, MB, GB or TB, using 1024-based units.
        Values below 1 KB are shown unscaled (as a float, e.g.
        ``'512.0 Bytes'``); larger values are shown with two decimals.

    Raises:
        ValueError / TypeError: if *B* cannot be converted to ``float``.
    """
    # via http://stackoverflow.com/a/31631711
    B = float(B)
    KB = float(1024)
    MB = float(KB ** 2)  # 1,048,576
    GB = float(KB ** 3)  # 1,073,741,824
    TB = float(KB ** 4)  # 1,099,511,627,776

    if B < KB:
        # The previous test (`0 == B > 1`) was a chained comparison that is
        # never true, so the unit was always the singular 'Byte'.
        return '{0} {1}'.format(B, 'Byte' if B == 1 else 'Bytes')
    if B < MB:
        return '{0:.2f} KB'.format(B / KB)
    if B < GB:
        return '{0:.2f} MB'.format(B / MB)
    if B < TB:
        return '{0:.2f} GB'.format(B / GB)
    return '{0:.2f} TB'.format(B / TB)
def _drop_leading_zero(text):
    """Return *text* without one leading '0' ('08' -> '8'); '' stays ''."""
    return text[1:] if text.startswith('0') else text


def getMetadata(fileid):
    """Fetch ``http://archive.org/metadata/<fileid>`` and summarise the reply.

    Args:
        fileid: Item identifier, appended verbatim to the metadata URL.

    Returns:
        A tuple ``(z, online)``. ``z`` is a dict that always contains
        ``'search_id'`` and, depending on what the response holds, any of
        ``'dark'``, ``'MARC'``, ``'manual_bookid'``, ``'volume'``,
        ``'Scribed'``, ``'Scribe_date'``, ``'public_date'``,
        ``'imagecount'``, ``'noindex'``, ``'IA_URL'``, ``'invoice_date'``
        and ``'IA_title'``. ``online`` is a status code:

        * 0 -- response has ``is_dark`` (or none of the cases below applied)
        * 1 -- metadata has ``scandate``
        * 2 -- metadata has no ``identifier-access`` (overrides 1 and 3)
        * 3 -- response lists ``files`` but metadata has no ``scandate``
        * 4 -- response has no ``metadata`` section; ``z['IA_URL']`` is set
          to ``'UNKNOWN ERROR'`` and a message is printed

    Raises:
        pycurl.error: if the transfer fails.
        ValueError: if the response body is not valid JSON.
    """
    iaurl = "http://archive.org/metadata/" + fileid
    online = 0
    # Author's intended meaning of the status code: 0 for dark, 1 for
    # scribed, 2 if scanned but not available, 3 if available but not via
    # scribe?, 4 for error.
    # NOTE (from the original author): this scheme "makes no sense anymore"
    # and is due for rework.
    z = {'search_id': fileid}

    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, iaurl)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # Release the curl handle even when the transfer raises.
        c.close()
    j = json.loads(buffer.getvalue().decode('utf-8'))

    if 'is_dark' in j:
        z['dark'] = str(j['is_dark'])
        return (z, 0)

    if 'files' in j:
        online = 3
        for f in j['files']:
            # .get: tolerate file entries that carry no 'format' key.
            if f.get('format') in ("MARC Binary", "MARC"):
                z['MARC'] = "True"

    if 'metadata' not in j:
        print('error on ', fileid)
        z['IA_URL'] = 'UNKNOWN ERROR'
        return (z, 4)

    x = j['metadata']
    if 'bookid' in x:
        z['manual_bookid'] = x['bookid']
    if 'volume' in x:
        z['volume'] = x['volume']
    if 'scanningcenter' in x:
        z['Scribed'] = 'True'

    if 'scandate' in x:
        online = 1
        # Sliced as YYYYMMDDHHMMSS; short values yield empty parts instead
        # of raising IndexError.
        iadate = str(x['scandate'])
        # to do: Adjust time to Eastern. figure out python date/time handling
        z['Scribe_date'] = '{}/{}/{} {}:{}:{}'.format(
            _drop_leading_zero(iadate[4:6]),
            _drop_leading_zero(iadate[6:8]),
            iadate[0:4],
            iadate[8:10],
            iadate[10:12],
            iadate[12:],
        )

    if 'publicdate' in x:
        # Sliced as YYYY?MM?DD?<time>, i.e. one separator character between
        # the parts -- presumably "YYYY-MM-DD HH:MM:SS"; confirm against the
        # API output.
        pdate = str(x['publicdate'])
        # to do: Adjust time to Eastern. figure out python date/time handling
        z['public_date'] = '{}/{}/{} {}'.format(
            _drop_leading_zero(pdate[5:7]),
            _drop_leading_zero(pdate[8:10]),
            pdate[0:4],
            pdate[11:],
        )

    if 'imagecount' in x:
        z['imagecount'] = x['imagecount']
    if 'noindex' in x:
        z['noindex'] = str(x['noindex'])
    if 'identifier-access' in x:
        z['IA_URL'] = x['identifier-access']
    else:
        online = 2
    if 'sponsordate' in x:
        z['invoice_date'] = x['sponsordate']
    if 'title' in x:
        z['IA_title'] = x['title']
    return (z, online)
# Interactive entry point: ask for an identifier, then print the details of
# its orig_jp2.tar file followed by the collected metadata fields.
x = input('Enter an IA id: ').strip()  # tolerate stray whitespace from pasting
y = getFileInfo(x)
m, z = getMetadata(x)  # z is the status code; only the dict m is displayed
print(
    '\n{}\n\n:::orig_jp2.tar file details:::\nsize:\t{}\nmd5:\t{}\nsha1:\t{}\n'
    'size in bytes:\t{}\n'.format(x, y[0], str(y[1]), str(y[2]), str(y[3]))
)
print('::: metadata :::')
# Keys exactly as getMetadata stores them; the book id lives under
# 'manual_bookid' (a plain 'bookid' entry would never be found).
fields = [
    'volume', 'manual_bookid', 'dark', 'Scribed', 'Scribe_date',
    'public_date', 'imagecount', 'noindex', 'MARC', 'IA_URL', 'invoice_date',
    'IA_title',
]
for md in fields:
    if md not in m:
        continue  # field absent for this item; nothing to show
    line = '{}:\t{}'.format(md, m[md])
    try:
        print(line)
    except UnicodeEncodeError:
        # Console encoding cannot represent the value; show it escaped
        # rather than silently dropping the field.
        print(line.encode('ascii', 'backslashreplace').decode('ascii'))
input('hit enter to close')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment