Skip to content

Instantly share code, notes, and snippets.

Last active August 29, 2015 14:16
Show Gist options
  • Save jjjake/161b318d9d5114051cd6 to your computer and use it in GitHub Desktop.
Save jjjake/161b318d9d5114051cd6 to your computer and use it in GitHub Desktop.
# Per-item filter: for each item record on input, emit one object holding the
# item identifier, its collection, the slimmed-down metadata of every
# non-derivative file, and the total size of those files.
#
# Assign identifier and collection to variables for use in final output.
.metadata.identifier as $i |
.metadata.collection as $c |
# Filter out any items that do not have files metadata.
select(.files != null) |
# Get all non-derivative files, and slim down the metadata.
# The result must stay an array: the total-size step below maps over it.
.files |
map(
    select(.source != "derivative") |
    # if case for catching files with size=null (i.e. files.xml).
    if .size != null then
        {"name": .name, "size": (.size | tonumber), "format": .format, "md5": .md5}
    else
        {"name": .name, "size": 0, "format": .format, "md5": .md5}
    end
) |
# Get total size of files (per item). Starting the reduce at 0 keeps the
# total numeric even when every file was filtered out.
(map(.size) | reduce .[] as $item (0; . + $item)) as $ts |
# Final output (per item).
{"id": $i, "collection": $c, "total_size": $ts, "files": .}
# Pipeline: retrieve item metadata with ia-mine, reduce each item with
# get_file_size_md.jq, and write the gzip-compressed result to disk.
#
# ia-mine binaries available here:
# NOTE(review): the download link that belongs on the line above is not
# present in this copy -- restore it.
# concurrently retrieve metadata from items (600 workers; stderr is discarded).
./ia-mine-0.5-py3.3.pex metamgr-norm-ids-20150304205357.txt --workers 600 2>/dev/null |
# mine progress stats (line-based, labelled 'mine').
pv -lacbrN 'mine' |
# parse JSON in parallel.
# NOTE(review): "./ jq" as written tries to execute the directory "./"; the
# name of the wrapper script appears to be missing. Presumably it is the
# GNU parallel wrapper script -- confirm and restore the file name.
./ jq -c -r -f get_file_size_md.jq |
# JSON parsing progress stats (line-based, labelled 'parse').
pv -lacbrN 'parse' |
# gzip output.
gzip > indexed-item-size-md_20150304205357.json.gz
"""jq alternative for getting non-derived file size_md.

Reads item metadata from stdin, one JSON document per line, and prints one
JSON summary per line for every item that has files: the item identifier,
its collection, the slimmed-down metadata of each non-derivative file, and
the total size of those files.
"""
import sys

try:
    import ujson
except ImportError:
    # ujson is only used as a parser here; the stdlib parser accepts the
    # same input, so fall back to it when ujson is not installed.
    import json as ujson
import json


def summarize_item(item):
    """Return the size summary for one parsed item record.

    Args:
        item: dict parsed from one line of input, with optional
            'metadata' and 'files' keys.

    Returns:
        A dict with the keys 'id', 'files', 'collection' and 'total_size',
        or None when the item has no files.
    """
    all_files = item.get('files')
    if not all_files:
        return None

    metadata = item.get('metadata') or {}
    files = []
    total_size = 0
    for f in all_files:
        if f.get('source') == 'derivative':
            continue
        # Some files have no size, or an explicit null; count those as 0
        # instead of letting int(None) discard the whole item.
        size = int(f.get('size') or 0)
        total_size += size
        files.append({
            'name': f.get('name'),
            'md5': f.get('md5'),
            'format': f.get('format'),
            'size': size,
        })

    return {
        'id': metadata.get('identifier'),
        'files': files,
        'collection': metadata.get('collection'),
        'total_size': total_size,
    }


if __name__ == '__main__':
    # TODO: this doesn't work for JSON with newline chars.
    for line in sys.stdin:
        # Lines read from stdin keep their trailing newline, so test the
        # stripped text to actually skip blank lines.
        if not line.strip():
            continue
        try:
            md = summarize_item(ujson.loads(line))
            if md is None:
                # Items without files are skipped silently.
                continue
            print(json.dumps(md))
        except Exception as exc:
            # Per-line boundary: report the bad record and keep going so
            # one malformed line does not abort the whole run.
            sys.stderr.write(str(exc) + '\n')
#!/usr/bin/env bash
# Run a command over stdin in parallel: GNU parallel splits stdin into
# blocks of about 1M and pipes each block to its own instance of the command.
#
# The command is taken from the `command` environment variable when it is
# set, otherwise from this script's arguments, e.g.:
#     <this-script> jq -c -r -f get_file_size_md.jq
set -e

command="${command:-$*}"
if [ -z "$command" ]; then
    # Without a command parallel has nothing to run on the blocks.
    echo "usage: $0 COMMAND [ARG...]" >&2
    exit 1
fi

# stdin is inherited directly by parallel; --group keeps the output of each
# job together instead of interleaving it.
parallel --pipe --group --block 1M "$command"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment