Created
February 16, 2022 15:47
-
-
Save dheles/a00bde99c27cd48a5e28ebca6746b514 to your computer and use it in GitHub Desktop.
Python script for calculating Islandora storage usage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# written by https://github.com/jonathangreen & shared with his permission | |
# modest additional contributions by https://github.com/dheles
import argparse | |
import os | |
import lxml.etree as ET | |
from multiprocessing import Pool | |
from datetime import datetime | |
import requests | |
import json | |
import urllib | |
from collections import defaultdict | |
from tabulate import tabulate | |
import sys | |
import codecs | |
import csv | |
# Display names for the content-model PIDs that show up in the report.
# Models missing from this mapping are reported under their raw PID.
content_models = {
    'islandora:newspaperCModel': 'Newspaper',
    'islandora:newspaperIssueCModel': 'Newspaper Issue',
    'islandora:sp_videoCModel': 'Video',
    'islandora:sp-audioCModel': 'Audio',
    'islandora:sp_large_image_cmodel': 'Large Image',
    'ir:citationCModel': 'Citation',
    'islandora:bookCModel': 'Book',
    'islandora:newspaperPageCModel': 'Newspaper Page',
    'islandora:binaryObjectCModel': 'Binary Object',
    'islandora:compoundCModel': 'Compound',
    'islandora:collectionCModel': 'Collection',
    'fedora-system:ContentModel-3.0': 'Content Model',
    'islandora:archivesspaceCModel': 'Aspace Integration',
    'islandora:sp_basic_image': 'Basic Image',
    'islandora:sp_pdf': 'PDF',
    'islandora:pageCModel': 'Book Page',
    'ir:thesisCModel': 'Thesis',
}
# Content models that are left out of the per-content-model statistics.
exclude_content_models = [
    'fedora-system:ContentModel-3.0',
    'islandora:archivesspaceCModel',
]
def set_output_encoding(encoding='utf-8'):
    """Force an encoding onto stdout/stderr when none could be detected.

    For each of ``sys.stdout`` and ``sys.stderr`` whose ``encoding`` is
    ``None`` (for example when the output is piped to another program such
    as ``less``), the stream is replaced by a ``codecs`` writer for
    *encoding*. Streams that already report an encoding are left untouched.

    Args:
        encoding: Name of the codec to force; defaults to ``'utf-8'``.
    """
    for name in ('stdout', 'stderr'):
        stream = getattr(sys, name)
        if getattr(stream, 'encoding', None) is not None:
            continue
        # A codecs StreamWriter hands *bytes* to the stream it wraps. Text
        # streams reject bytes, so wrap the underlying binary buffer when
        # the stream exposes one and fall back to the stream itself otherwise.
        target = getattr(stream, 'buffer', stream)
        setattr(sys, name, codecs.getwriter(encoding)(target))
# Get human readable sizes
def human(num, suffix='B'):
    """Scale a byte count to a binary (1024-based) unit.

    Args:
        num: The size to format.
        suffix: Text appended to the unit prefix; defaults to ``'B'``.

    Returns:
        A two-item list ``[value, unit]`` where ``value`` is the scaled
        number formatted with one decimal place and ``unit`` is the prefix
        (``''``, ``'Ki'``, ``'Mi'``, ... ``'Yi'``) followed by *suffix*.
    """
    for unit in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
        if abs(num) < 1024.0:
            break
        num /= 1024.0
    else:
        # Ran out of prefixes: whatever is left is expressed in 'Yi'.
        return ["%.1f" % num, "%s%s" % ('Yi', suffix)]
    return ["%3.1f" % num, "%s%s" % (unit, suffix)]
def datastream_versions(ds):
    """Return the FOXML ``datastreamVersion`` children of *ds* as a list, in document order."""
    version_tag = '{info:fedora/fedora-system:def/foxml#}datastreamVersion'
    return list(ds.iterfind(version_tag))
def datastream_size(ds, versions):
    """Add up the recorded sizes of a datastream's versions.

    Only datastreams whose ``CONTROL_GROUP`` is ``'M'`` or ``'X'`` carry a
    size; any other control group yields 0. Versions without a ``SIZE``
    attribute are skipped.

    Args:
        ds: The datastream element (its ``CONTROL_GROUP`` attribute is read).
        versions: The datastream's version elements.

    Returns:
        The summed ``SIZE`` values as an int.
    """
    if ds.attrib['CONTROL_GROUP'] not in ('M', 'X'):
        return 0
    return sum(
        int(version.attrib['SIZE'])
        for version in versions
        if 'SIZE' in version.attrib
    )
def get_resource(element):
    """Return the ``rdf:resource`` attribute of *element* without its leading ``info:fedora/``.

    The first ``len("info:fedora/")`` characters are dropped unconditionally;
    the value is not checked for actually starting with that prefix.
    """
    rdf_resource = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'
    uri = element.attrib[rdf_resource]
    skip = len("info:fedora/")
    return uri[skip:]
def process_rels(ds, versions):
    """Extract content models and parents from the newest RELS-EXT version.

    Only the last element of *versions* is inspected.

    Args:
        ds: The datastream element (not used by this function).
        versions: The datastream's version elements.

    Returns:
        A ``(models, parents)`` tuple of PID lists. ``models`` comes from
        ``hasModel`` statements; ``parents`` collects ``isMemberOf``, then
        ``isMemberOfCollection``, then ``isConstituentOf`` targets.
    """
    newest = versions[-1]
    models = [
        get_resource(node)
        for node in newest.iter('{info:fedora/fedora-system:def/model#}hasModel')
    ]
    parent_predicates = (
        '{info:fedora/fedora-system:def/relations-external#}isMemberOf',
        '{info:fedora/fedora-system:def/relations-external#}isMemberOfCollection',
        '{info:fedora/fedora-system:def/relations-external#}isConstituentOf',
    )
    parents = []
    for predicate in parent_predicates:
        parents.extend(get_resource(node) for node in newest.iter(predicate))
    return models, parents
# Summarise a single datastream
def datastream_info(ds, versions):
    """Return the version count, total size and last-modified time of a datastream.

    Args:
        ds: The datastream element.
        versions: The datastream's version elements; must not be empty.

    Returns:
        A tuple ``(number_of_versions, size, last_modified)``. The timestamp
        is parsed from the last version's ``CREATED`` attribute, or is
        ``datetime.min`` when that attribute is absent.
    """
    size = datastream_size(ds, versions)
    newest = versions[-1].attrib
    last_modified = (
        datetime.strptime(newest['CREATED'], "%Y-%m-%dT%H:%M:%S.%fZ")
        if 'CREATED' in newest
        else datetime.min
    )
    return len(versions), size, last_modified
def get_datastreams(root):
    """Return the FOXML ``datastream`` children of *root* as a list, in document order."""
    datastream_tag = '{info:fedora/fedora-system:def/foxml#}datastream'
    return list(root.iterfind(datastream_tag))
def fake_size(contains_obj, obj_mime, total_size, jp2_size):
    """Compute the legacy ("old") size figure for an object.

    Args:
        contains_obj: Whether the object has an OBJ datastream.
        obj_mime: MIME type of the OBJ datastream.
        total_size: Total size of the object.
        jp2_size: Size of the object's JP2 datastream.

    Returns:
        0 when there is no OBJ datastream; ``total_size - jp2_size`` when
        the OBJ is itself ``image/jp2``; otherwise ``total_size``.
    """
    if not contains_obj:
        return 0
    if obj_mime == 'image/jp2':
        return total_size - jp2_size
    return total_size
def get_label(root):
    """Return the object's label from its FOXML object properties.

    Args:
        root: The root element of a parsed FOXML document.

    Returns:
        The ``VALUE`` of the ``model#label`` property, or an empty string
        when the ``objectProperties`` element, the label property, or its
        ``VALUE`` attribute is missing. Returning ``''`` instead of raising
        keeps one unlabelled object from aborting the whole run.
    """
    properties = root.find('{info:fedora/fedora-system:def/foxml#}objectProperties')
    if properties is None:
        return ''
    label = properties.find(
        "{info:fedora/fedora-system:def/foxml#}property"
        "[@NAME='info:fedora/fedora-system:def/model#label']"
    )
    if label is None:
        return ''
    return label.attrib.get('VALUE', '')
# Gets information from FOXML
def get_object_info(path):
    """Parse one FOXML file and summarise the object it describes.

    Args:
        path: Path of the FOXML file to parse.

    Returns:
        A dict with the keys ``models``, ``parents``, ``datastreams``,
        ``versions``, ``size``, ``size_fake``, ``last_modified``, ``pid``,
        ``label`` and ``path``.
    """
    root = ET.parse(path).getroot()
    pid = root.attrib['PID']
    datastreams = get_datastreams(root)

    total_size = 0
    total_versions = 0
    newest_change = datetime.min
    models, parents = [], []
    has_obj = False
    obj_mime = None
    jp2_size = 0

    for ds in datastreams:
        versions = datastream_versions(ds)
        n_versions, ds_size, ds_modified = datastream_info(ds, versions)
        total_size += ds_size
        total_versions += n_versions
        newest_change = max(newest_change, ds_modified)

        ds_id = ds.attrib['ID']
        if ds_id == 'RELS-EXT':
            models, parents = process_rels(ds, versions)
        elif ds_id == 'OBJ':
            has_obj = True
            obj_mime = versions[-1].attrib['MIMETYPE']
        elif ds_id == 'JP2':
            # This part is a little silly, but we keep track of the size of
            # JP2 ds to subtract them from the total size if the OBJ is also
            # a JP2. This was done in the old XSLT version for historical
            # reasons.
            jp2_size = ds_size

    summary = {
        'models': models,
        'parents': parents,
        'datastreams': len(datastreams),
        'versions': total_versions,
        'size': total_size,
        'size_fake': fake_size(has_obj, obj_mime, total_size, jp2_size),
        'last_modified': newest_change,
        'pid': pid,
        'label': get_label(root),
        'path': path,
    }
    return summary
def get_file_list(dir):
    """Return the paths of all files under *dir*, including those in subfolders.

    Each path is the walked directory joined with the file name, so it is
    relative or absolute exactly as *dir* is.
    """
    return [
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(dir)
        for file_name in file_names
    ]
def get_cm_details(object_details):
    """Aggregate object counts and sizes per content model.

    Models listed in ``exclude_content_models`` are skipped. An object with
    several models contributes to each of them.

    Args:
        object_details: Mapping of PID to the per-object info dict.

    Returns:
        A view of per-model dicts with the keys ``num``, ``size``,
        ``size_fake`` and ``model``.
    """
    totals = defaultdict(lambda: defaultdict(int))
    for info in object_details.values():
        for model in info['models']:
            if model in exclude_content_models:
                continue
            entry = totals[model]
            entry['num'] += 1
            entry['size'] += info['size']
            entry['size_fake'] += info['size_fake']
            entry['model'] = model
    return totals.values()
def traverse_parents(current, parents, details, collections):
    """Credit *current* to each of *parents* and, recursively, to their ancestors.

    Every parent that has not yet seen this object gets its ``num``,
    ``size`` and ``size_fake`` totals increased and the object's PID added
    to its ``children`` set. A parent that already lists the PID is skipped,
    so an object reachable by several paths is counted once per ancestor.

    Args:
        current: Info dict of the object being counted (reads ``pid``,
            ``size`` and ``size_fake``).
        parents: PIDs of the parents to credit at this level.
        details: Mapping of PID to info dict, used to look up each parent's
            own ``parents``.
        collections: Mapping of parent PID to its running totals; updated
            in place. Missing entries are created by indexing, so this is
            expected to be a ``defaultdict``.
    """
    pid = current['pid']
    for parent in parents:
        entry = collections[parent]
        # Must be a set holding the PID itself; set(pid) would instead
        # produce a set of the PID's individual characters.
        children = entry.setdefault('children', set())
        if pid in children:
            continue
        children.add(pid)
        entry['num'] += 1
        entry['size'] += current['size']
        entry['size_fake'] += current['size_fake']
        if parent in details:
            traverse_parents(current, details[parent]['parents'], details, collections)
def get_collection_details(object_details):
    """Build per-collection totals for the objects in *object_details*.

    Every object is credited to its parents and their ancestors. Parents
    are then dropped when they are unknown to *object_details*, do not have
    the ``islandora:collectionCModel`` model, or live in the ``islandora``
    PID namespace. The remaining entries get ``label`` and ``pid`` keys.

    Args:
        object_details: Mapping of PID to the per-object info dict.

    Returns:
        A mapping of collection PID to its totals dict.
    """
    collections = defaultdict(lambda: defaultdict(int))
    for info in object_details.values():
        traverse_parents(info, info['parents'], object_details, collections)

    # Iterate over a snapshot of the keys because entries are deleted below.
    for pid in list(collections):
        is_reportable = (
            pid in object_details
            and 'islandora:collectionCModel' in object_details[pid]['models']
            and pid.split(':')[0] != 'islandora'
        )
        if not is_reportable:
            del collections[pid]
            continue
        collections[pid]['label'] = object_details[pid]['label']
        collections[pid]['pid'] = pid
    return collections
def get_repo_totals_table(object_details, print_fake_size):
    """Build the repository-wide totals table.

    Args:
        object_details: Mapping of PID to the per-object info dict.
        print_fake_size: When true, include the "Total Size (old)" row.

    Returns:
        A ``(headers, rows)`` tuple; ``headers`` is always an empty list.
        The "Last Modified" row holds ``'-'`` when there are no objects.
    """
    infos = list(object_details.values())
    rows = [
        ['Objects', len(object_details)],
        ['Datastreams', sum(info['datastreams'] for info in infos)],
        ['Total Size', ' '.join(human(sum(info['size'] for info in infos)))],
    ]
    if print_fake_size:
        old_total = sum(info['size_fake'] for info in infos)
        rows.append(['Total Size (old)', ' '.join(human(old_total))])
    newest = max((info['last_modified'] for info in infos), default='-')
    rows.append(['Last Modified', newest])
    return [], rows
def get_cm_details_table(object_details, print_fake_size):
    """Build the per-content-model table, sorted by ascending size.

    Args:
        object_details: Mapping of PID to the per-object info dict.
        print_fake_size: When true, add the "Size (old)" value and unit columns.

    Returns:
        A ``(headers, rows)`` tuple. The first column is the display name
        from ``content_models``, or the raw model PID when none is known.
    """
    headers = ['Content Model', 'Objects', 'Size', 'Unit']
    if print_fake_size:
        headers += ['Size (old)', 'Unit']

    by_size = sorted(get_cm_details(object_details), key=lambda entry: entry['size'])
    table = []
    for entry in by_size:
        model = entry['model']
        row = [content_models.get(model, model), entry['num']]
        row += human(entry['size'])
        if print_fake_size:
            row += human(entry['size_fake'])
        table.append(row)
    return headers, table
def get_collection_details_table(collection_details, print_fake_size, num_collections = None):
    """Build the per-collection table, sorted by ascending size.

    Args:
        collection_details: Mapping of collection PID to its totals dict.
        print_fake_size: When true, add the "Size (old)" value and unit columns.
        num_collections: When a positive number, keep only that many of the
            largest collections; ``None`` or a non-positive value keeps all.

    Returns:
        A ``(headers, rows)`` tuple.
    """
    headers = ['Collection', 'Label', 'Objects', 'Size', 'Unit']
    if print_fake_size:
        headers += ['Size (old)', 'Unit']

    ordered = sorted(collection_details.values(), key=lambda entry: entry['size'])
    if num_collections is not None and num_collections > 0:
        # The list is ascending, so the largest collections sit at the end.
        ordered = ordered[-num_collections:]

    table = []
    for entry in ordered:
        row = [entry['pid'], entry['label'], entry['num']]
        row += human(entry['size'])
        if print_fake_size:
            row += human(entry['size_fake'])
        table.append(row)
    return headers, table
def output_table(headers, table, file_name, csv_output):
    """Write a table to an open file object as CSV or as formatted text.

    Args:
        headers: Column headers; an empty list writes no header row in CSV
            mode.
        table: The rows to write.
        file_name: Despite its name, an open, writable text file object
            (for example ``sys.stdout``). The name is kept so existing
            callers are unaffected.
        csv_output: When true write CSV, otherwise a ``tabulate`` rendering
            followed by a newline.
    """
    # Write to the handle that was passed in rather than relying on a
    # module-level global happening to exist.
    out = file_name
    if csv_output:
        csv_writer = csv.writer(out)
        if headers:
            csv_writer.writerow(headers)
        csv_writer.writerows(table)
    else:
        out.write(tabulate(table, headers=headers, floatfmt='3.1f'))
        out.write("\n")
# main function
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Fedora Stats')
    parser.add_argument('directory', metavar='DIR', help='Fedora object directory')
    parser.add_argument('--collection', metavar='PID', dest='collection', help='PID of a collection to restrict the stats to')
    parser.add_argument('--threads', metavar='NUM', dest='threads', help='Number for threads', type=int, choices=range(1, 21), default=8)
    parser.add_argument('--old-size', action='store_true', dest='old_size', help='Show old size calculations')
    parser.add_argument('--csv', action='store_true', dest='csv', help='Output CSV instead of formatted text')
    parser.add_argument('--output', metavar='FILE', dest='output', help='Output to file')
    parser.add_argument('--num-collections', metavar='NUM', dest='collections', help='Number of collections for collection stats (0 hides stats).', type=int)
    args = parser.parse_args()

    set_output_encoding()

    # Parse every file under the object directory in a pool of worker
    # processes; the context manager cleans the pool up once map() returns.
    file_list = get_file_list(args.directory)
    with Pool(processes=args.threads) as pool:
        object_details_map = pool.map(get_object_info, file_list, 250)
    object_details = {info['pid']: info for info in object_details_map}
    del object_details_map

    # Collection details are computed from the full object set, before any
    # filtering, so that --collection can look up its children.
    collection_details = get_collection_details(object_details)

    # Limit output to a particular collection
    if args.collection is not None:
        if args.collection not in collection_details:
            # Fail with a usage error instead of an obscure TypeError, and
            # do so before the output file is opened (and truncated).
            parser.error('collection %s was not found in %s' % (args.collection, args.directory))
        pids = collection_details[args.collection]['children']
        object_details = {pid: info for pid, info in object_details.items() if pid in pids}

    # Open file for writing if we need it. The csv module expects files
    # opened with newline='' so that it controls line endings itself.
    if args.output:
        file_handle = open(args.output, 'w', newline='' if args.csv else None)
    else:
        file_handle = sys.stdout

    try:
        # Print content model details
        output_table(*get_cm_details_table(object_details, args.old_size), file_handle, args.csv)

        # Print collection information
        if args.collection is None and (args.collections is None or args.collections > 0):
            file_handle.write("\n")
            output_table(*get_collection_details_table(collection_details, args.old_size, args.collections), file_handle, args.csv)

        file_handle.write("\n")
        output_table(*get_repo_totals_table(object_details, args.old_size), file_handle, args.csv)
    finally:
        # Close only a file we opened ourselves; never close stdout.
        if file_handle is not sys.stdout:
            file_handle.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment