Skip to content

Instantly share code, notes, and snippets.

@dheles
Created February 16, 2022 15:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dheles/a00bde99c27cd48a5e28ebca6746b514 to your computer and use it in GitHub Desktop.
Python script for calculating Islandora storage usage
#!/usr/bin/env python3
# written by https://github.com/jonathangreen & shared with his permission
# modest additional contributions by https://github.com/dheles
import argparse
import os
import lxml.etree as ET
from multiprocessing import Pool
from datetime import datetime
import requests
import json
import urllib
from collections import defaultdict
from tabulate import tabulate
import sys
import codecs
import csv
# Friendly display names for known Islandora/Fedora content-model PIDs;
# models not listed here are printed by their raw PID (see get_cm_details_table).
content_models = {
    'islandora:newspaperCModel' : 'Newspaper',
    'islandora:newspaperIssueCModel' : 'Newspaper Issue',
    'islandora:sp_videoCModel' : 'Video',
    'islandora:sp-audioCModel' : 'Audio',
    'islandora:sp_large_image_cmodel' : 'Large Image',
    'ir:citationCModel' : 'Citation',
    'islandora:bookCModel' : 'Book',
    'islandora:newspaperPageCModel' : 'Newspaper Page',
    'islandora:binaryObjectCModel' : 'Binary Object',
    'islandora:compoundCModel' : 'Compound',
    'islandora:collectionCModel' : 'Collection',
    'fedora-system:ContentModel-3.0' : 'Content Model',
    'islandora:archivesspaceCModel' : 'Aspace Integration',
    'islandora:sp_basic_image' : 'Basic Image',
    'islandora:sp_pdf' : 'PDF',
    'islandora:pageCModel' : 'Book Page',
    'ir:thesisCModel' : 'Thesis'
}
# Content models excluded from the per-model statistics (see get_cm_details):
# system/integration models that don't represent repository content.
exclude_content_models = [
    'fedora-system:ContentModel-3.0',
    'islandora:archivesspaceCModel'
]
def set_output_encoding(encoding='utf-8'):
    """Force an encoding on stdout/stderr when their encoding is unknown.

    When writing to a terminal, Python picks the output encoding
    automatically; when piping to another program (e.g. ``| less``) the
    stream's ``encoding`` attribute can be ``None``.  In that case wrap
    the stream so text is encoded with *encoding*.

    :param encoding: codec name used to encode output (default ``utf-8``)
    """
    if sys.stdout.encoding is None:
        # Bug fix: codecs.getwriter() produces a writer that emits bytes,
        # so on Python 3 it must wrap the underlying binary buffer, not
        # the text stream itself (which would raise TypeError on write).
        sys.stdout = codecs.getwriter(encoding)(sys.stdout.buffer)
    if sys.stderr.encoding is None:
        sys.stderr = codecs.getwriter(encoding)(sys.stderr.buffer)
def human(num, suffix='B'):
    """Convert a raw byte count into a ``[value, unit]`` string pair.

    Uses binary (1024-based) prefixes, e.g. ``human(1536)`` yields
    ``['1.5', 'KiB']``.  Returned as two separate strings so callers can
    place value and unit in separate table columns.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    idx = 0
    while idx < len(prefixes):
        if abs(value) < 1024.0:
            return ["%3.1f" % value, prefixes[idx] + suffix]
        value /= 1024.0
        idx += 1
    # Anything that survives all divisions is in yobibyte territory.
    return ["%.1f" % value, 'Yi' + suffix]
def datastream_versions(ds):
    """Return every datastreamVersion child element of a FOXML datastream."""
    foxml_ns = '{info:fedora/fedora-system:def/foxml#}'
    return ds.findall(foxml_ns + 'datastreamVersion')
def datastream_size(ds, versions):
    """Sum the SIZE attribute over all versions of one datastream.

    Only managed ('M') and inline XML ('X') control groups carry a SIZE
    in FOXML; other control groups (external/redirect) report 0.
    """
    if ds.attrib['CONTROL_GROUP'] not in ('M', 'X'):
        return 0
    return sum(int(v.attrib['SIZE']) for v in versions if 'SIZE' in v.attrib)
def get_resource(element):
    """Return the PID referenced by an element's rdf:resource attribute.

    rdf:resource values look like ``info:fedora/<pid>``; the leading
    URI scheme is stripped unconditionally (by length, matching the
    original behavior even for unexpected values).
    """
    rdf_resource = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'
    uri = element.attrib[rdf_resource]
    return uri[len('info:fedora/'):]
def process_rels(ds, versions):
    """Extract content models and parent PIDs from a RELS-EXT datastream.

    Only the latest datastreamVersion is inspected.  Returns a tuple
    ``(models, parents)`` where parents collects isMemberOf,
    isMemberOfCollection and isConstituentOf targets, in that order.
    """
    latest = versions[-1]
    models = [get_resource(el)
              for el in latest.iter('{info:fedora/fedora-system:def/model#}hasModel')]
    rels_ns = 'info:fedora/fedora-system:def/relations-external#'
    parents = []
    for relation in ('isMemberOf', 'isMemberOfCollection', 'isConstituentOf'):
        parents.extend(get_resource(el)
                       for el in latest.iter('{%s}%s' % (rels_ns, relation)))
    return models, parents
# Summarize one datastream
def datastream_info(ds, versions):
    """Return (version count, total size, last-modified) for a datastream.

    The modification time is taken from the CREATED stamp of the latest
    version; versions without one sort before everything (datetime.min).
    """
    latest = versions[-1]
    if 'CREATED' in latest.attrib:
        modified = datetime.strptime(latest.attrib['CREATED'],
                                     "%Y-%m-%dT%H:%M:%S.%fZ")
    else:
        modified = datetime.min
    return len(versions), datastream_size(ds, versions), modified
def get_datastreams(root):
    """Return all datastream child elements of a FOXML digital-object root."""
    foxml_ns = '{info:fedora/fedora-system:def/foxml#}'
    return root.findall(foxml_ns + 'datastream')
def fake_size(contains_obj, obj_mime, total_size, jp2_size):
    """Reproduce the object size as the legacy XSLT report computed it.

    Objects without an OBJ datastream count as 0; when the OBJ master is
    itself a JP2, the derivative JP2 datastream bytes are subtracted to
    avoid counting them twice (historical behavior).
    """
    if not contains_obj:
        return 0
    if obj_mime == 'image/jp2':
        return total_size - jp2_size
    return total_size
def get_label(root):
    """Read the object label from the FOXML objectProperties block."""
    foxml_ns = '{info:fedora/fedora-system:def/foxml#}'
    properties = root.find(foxml_ns + 'objectProperties')
    query = foxml_ns + "property[@NAME='info:fedora/fedora-system:def/model#label']"
    return properties.find(query).attrib['VALUE']
# Gets information from FOXML
def get_object_info(path):
    """Parse one FOXML file and summarize the object it describes.

    Returns a dict with the object's models, parents, datastream count,
    total version count, aggregate size (and legacy "fake" size), last
    modification time, PID, label and source path.

    :param path: filesystem path to a FOXML XML file
    """
    object_size = 0
    object_versions = 0
    object_last_modified = datetime.min
    object_models = []
    object_parents = []
    object_contains_obj = False
    object_obj_mime = None
    object_jp2_size = 0
    tree = ET.parse(path)
    root = tree.getroot()
    pid = root.attrib['PID']
    datastreams = get_datastreams(root)
    object_datastreams = len(datastreams)
    # Aggregate size/version/modification info across every datastream,
    # picking up relationship and MIME details from the special ones.
    for ds in datastreams:
        versions = datastream_versions(ds)
        ds_versions, ds_size, ds_last_modified = datastream_info(ds, versions)
        object_size += ds_size
        object_versions += ds_versions
        # Track the newest modification time across all datastreams.
        if ds_last_modified > object_last_modified:
            object_last_modified = ds_last_modified
        if ds.attrib['ID'] == 'RELS-EXT':
            # Relationships (content models, parent collections) live here.
            object_models, object_parents = process_rels(ds, versions)
        if ds.attrib['ID'] == 'OBJ':
            object_contains_obj = True
            # MIME type of the latest version of the master file.
            object_obj_mime = versions[-1].attrib['MIMETYPE']
        if ds.attrib['ID'] == 'JP2':
            # This part is a little silly, but we keep track of the size of JP2 ds to
            # subtract them from the total size if the OBJ is also a JP2. This was done
            # in the old XSLT version for historical reasons.
            object_jp2_size = ds_size
    return { 'models' : object_models,
             'parents' : object_parents,
             'datastreams' : object_datastreams,
             'versions' : object_versions,
             'size' : object_size,
             'size_fake' : fake_size(object_contains_obj, object_obj_mime, object_size, object_jp2_size),
             'last_modified' : object_last_modified,
             'pid' : pid,
             'label' : get_label(root),
             'path' : path
    }
def get_file_list(dir):
    """Recursively collect the paths of every file beneath *dir*."""
    found = []
    for current_dir, _subdirs, names in os.walk(dir):
        found.extend(os.path.join(current_dir, name) for name in names)
    return found
def get_cm_details(object_details):
    """Aggregate per-content-model object counts and sizes.

    Models listed in the module-level exclude_content_models are skipped.
    Returns the values of a dict keyed by model PID; each value carries
    'num', 'size', 'size_fake' and 'model'.
    """
    totals = defaultdict(lambda: defaultdict(int))
    for details in object_details.values():
        for model in details['models']:
            if model in exclude_content_models:
                continue
            entry = totals[model]
            entry['num'] += 1
            entry['size'] += details['size']
            entry['size_fake'] += details['size_fake']
            entry['model'] = model
    return totals.values()
def traverse_parents(current, parents, details, collections):
    """Accumulate *current*'s count and sizes into every ancestor collection.

    Recursively walks the parent chain (via ``details``), so an object is
    counted not only in its direct collection but in all collections above
    it.  Each collection entry keeps a 'children' set of PIDs so an object
    reachable through several paths (e.g. both compound membership and
    collection membership) is only counted once per collection.

    :param current: object-detail dict (needs 'pid', 'size', 'size_fake')
    :param parents: iterable of parent PIDs to credit
    :param details: all object details keyed by PID, used to climb further
    :param collections: defaultdict accumulator, mutated in place
    """
    for parent in parents:
        if parent in collections:
            if current['pid'] in collections[parent]['children']:
                # Already counted for this collection via another path.
                continue
            collections[parent]['children'].add(current['pid'])
        else:
            # Bug fix: the original used set(current['pid']), which builds
            # a set of the PID's *characters*, so the dedup check above
            # never matched and multi-path objects were double-counted.
            collections[parent]['children'] = {current['pid']}
        collections[parent]['num'] += 1
        collections[parent]['size'] += current['size']
        collections[parent]['size_fake'] += current['size_fake']
        if parent in details:
            traverse_parents(current, details[parent]['parents'], details, collections)
def get_collection_details(object_details):
    """Roll object sizes up into ancestor collections, then prune.

    Keeps only entries that correspond to a real object in the repository,
    carry the collection content model, and are not in the reserved
    'islandora' namespace.  Surviving entries get their label and PID.
    """
    collections = defaultdict(lambda: defaultdict(int))
    for details in object_details.values():
        traverse_parents(details, details['parents'], object_details, collections)
    for pid in list(collections.keys()):
        if pid not in object_details:
            # Parent PID never seen as an object in this repository.
            del collections[pid]
        elif 'islandora:collectionCModel' not in object_details[pid]['models']:
            del collections[pid]
        elif pid.split(':')[0] == 'islandora':
            # Skip collections in the reserved "islandora" namespace.
            del collections[pid]
        else:
            collections[pid]['label'] = object_details[pid]['label']
            collections[pid]['pid'] = pid
    return collections
def get_repo_totals_table(object_details, print_fake_size):
    """Build (headers, rows) for the repository-wide totals table.

    Headers are empty because this table is a plain key/value listing.
    The legacy "old" size row is only included when requested.
    """
    objects = object_details.values()
    rows = [
        ['Objects', len(object_details)],
        ['Datastreams', sum(o['datastreams'] for o in objects)],
        ['Total Size', ' '.join(human(sum(o['size'] for o in objects)))],
    ]
    if print_fake_size:
        rows.append(['Total Size (old)',
                     ' '.join(human(sum(o['size_fake'] for o in objects)))])
    # '-' when the repository is empty.
    rows.append(['Last Modified',
                 max((o['last_modified'] for o in objects), default='-')])
    return [], rows
def get_cm_details_table(object_details, print_fake_size):
    """Build (headers, rows) for the per-content-model table.

    Rows are sorted by size, ascending.  Models without a friendly name
    in content_models are shown by their raw PID.
    """
    headers = ['Content Model', 'Objects', 'Size', 'Unit']
    if print_fake_size:
        headers.extend(['Size (old)', 'Unit'])
    rows = []
    for cm in sorted(get_cm_details(object_details), key=lambda entry: entry['size']):
        label = content_models[cm['model']] if cm['model'] in content_models else cm['model']
        row = [label, cm['num']] + human(cm['size'])
        if print_fake_size:
            row += human(cm['size_fake'])
        rows.append(row)
    return headers, rows
def get_collection_details_table(collection_details, print_fake_size, num_collections = None):
    """Build (headers, rows) for the per-collection table.

    Rows are sorted by size, ascending.  When num_collections is a
    positive integer, only that many of the largest collections (the
    tail of the ascending sort) are kept; None or non-positive shows all.
    """
    headers = ['Collection', 'Label', 'Objects', 'Size', 'Unit']
    if print_fake_size:
        headers.extend(['Size (old)', 'Unit'])
    ordered = sorted(collection_details.values(), key=lambda entry: entry['size'])
    if num_collections is not None and num_collections > 0:
        ordered = ordered[-num_collections:]
    rows = []
    for coll in ordered:
        row = [coll['pid'], coll['label'], coll['num']] + human(coll['size'])
        if print_fake_size:
            row += human(coll['size_fake'])
        rows.append(row)
    return headers, rows
def output_table(headers, table, file_handle, csv_output):
    """Write one table to *file_handle* as CSV or tabulate-formatted text.

    Bug fix: the third parameter was named ``file_name`` but never used —
    the body wrote to a module-level ``file_handle`` global, so the
    function only worked when called from this script's __main__ block.
    It now writes to the handle it is given.

    :param headers: column headers; empty/falsy means no header row
    :param table: list of rows
    :param file_handle: open text file object to write to
    :param csv_output: True for CSV, False for tabulate text
    """
    if csv_output:
        writer = csv.writer(file_handle)
        if headers:
            writer.writerow(headers)
        writer.writerows(table)
    else:
        file_handle.write(tabulate(table, headers=headers, floatfmt='3.1f'))
        file_handle.write("\n")
# main function
if __name__ == '__main__':
    # Command-line interface.
    parser = argparse.ArgumentParser(description='Fedora Stats')
    parser.add_argument('directory', metavar='DIR', help='Fedora object directory')
    parser.add_argument('--collection', metavar='PID', dest='collection', help='PID of a collection to restrict the stats to')
    parser.add_argument('--threads', metavar='NUM', dest='threads', help='Number for threads', type=int, choices=range(1, 21), default=8)
    parser.add_argument('--old-size', action='store_true', dest='old_size', help='Show old size calculations')
    parser.add_argument('--csv', action='store_true', dest='csv', help='Output CSV instead of formatted text')
    parser.add_argument('--output', metavar='FILE', dest='output', help='Output to file')
    parser.add_argument('--num-collections', metavar='NUM', dest='collections', help='Number of collections for collection stats (0 hides stats).', type=int)
    args = parser.parse_args()
    set_output_encoding()
    dir = args.directory
    # Parse every FOXML file in parallel; chunksize 250 batches work
    # per worker to keep inter-process overhead down.
    pool = Pool(processes=args.threads)
    file_list = get_file_list(dir)
    object_details_map = pool.map(get_object_info, file_list, 250)
    # Re-key the parsed results by PID.
    object_details = {object['pid'] : object for object in object_details_map}
    pool.close()
    # Free the intermediate list before building report tables.
    del(object_details_map)
    # Open file for writing if we need it
    # NOTE(review): output_table() reads this module-level `file_handle`
    # (its file_name parameter is unused), so this variable name is
    # load-bearing and must not be renamed.
    if args.output:
        file_handle = open(args.output, 'w')
    else:
        file_handle = sys.stdout
    # Get collection details if we need them
    collection_details = get_collection_details(object_details)
    # Limit output to a particular collection
    if args.collection != None:
        pids = collection_details[args.collection]['children']
        object_details = {pid : object for pid, object in object_details.items() if pid in pids}
    # Print content model details
    output_table(*get_cm_details_table(object_details, args.old_size), file_handle, args.csv)
    # Print collection information (skipped when restricted to one
    # collection, or when --num-collections 0 was given)
    if (args.collection == None and (args.collections == None or args.collections > 0)):
        file_handle.write("\n")
        output_table(*get_collection_details_table(collection_details, args.old_size, args.collections), file_handle, args.csv)
    # Repository-wide totals always come last.
    file_handle.write("\n")
    output_table(*get_repo_totals_table(object_details, args.old_size), file_handle, args.csv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment