dheles/stats.py

## stats.py
#!/usr/bin/env python3

# written by https://github.com/jonathangreen & shared with his permission
# modest additional contributions by https://github.com/dheles

import argparse
import os
import lxml.etree as ET
from multiprocessing import Pool
from datetime import datetime
import requests
import json
import urllib
from collections import defaultdict
from tabulate import tabulate
import sys
import codecs
import csv

# Display names used in the content model table, keyed by content model PID.
content_models = {
  'islandora:newspaperCModel': 'Newspaper',
  'islandora:newspaperIssueCModel': 'Newspaper Issue',
  'islandora:sp_videoCModel': 'Video',
  'islandora:sp-audioCModel': 'Audio',
  'islandora:sp_large_image_cmodel': 'Large Image',
  'ir:citationCModel': 'Citation',
  'islandora:bookCModel': 'Book',
  'islandora:newspaperPageCModel': 'Newspaper Page',
  'islandora:binaryObjectCModel': 'Binary Object',
  'islandora:compoundCModel': 'Compound',
  'islandora:collectionCModel': 'Collection',
  'fedora-system:ContentModel-3.0': 'Content Model',
  'islandora:archivesspaceCModel': 'Aspace Integration',
  'islandora:sp_basic_image': 'Basic Image',
  'islandora:sp_pdf': 'PDF',
  'islandora:pageCModel': 'Book Page',
  'ir:thesisCModel': 'Thesis',
}

# Content models that are skipped when the per-content-model totals are built.
exclude_content_models = [
  'fedora-system:ContentModel-3.0',
  'islandora:archivesspaceCModel',
]


def set_output_encoding(encoding='utf-8'):
  '''Force an encoding on stdout/stderr when Python could not detect one.

  When writing to a terminal Python knows the encoding needed and sets it
  automatically, but when piping to another program (for example, | less)
  the stream encoding can be None. Each of sys.stdout and sys.stderr whose
  encoding is None is replaced by a codecs writer for `encoding`.

  encoding -- name of the codec to force (default 'utf-8').
  '''
  for name in ('stdout', 'stderr'):
    stream = getattr(sys, name)
    if stream.encoding is None:
      # A codecs writer produces bytes, so it has to sit on top of the binary
      # buffer; wrapping the text stream itself fails with TypeError on the
      # first write under Python 3. Streams that expose no .buffer are
      # wrapped directly.
      target = getattr(stream, 'buffer', stream)
      setattr(sys, name, codecs.getwriter(encoding)(target))


# Turn a byte count into a human readable [value, unit] pair
def human(num, suffix='B'):
  '''Scale num down by 1024 until it is below 1024 and name the unit.

  Returns a two element list of strings: the value with one decimal and the
  binary prefix joined with suffix. Values too large for 'Zi' are reported
  in 'Yi'.
  '''
  prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
  step = 0
  while step < len(prefixes):
    if abs(num) < 1024.0:
      return ["%3.1f" % num, "%s%s" % (prefixes[step], suffix)]
    num /= 1024.0
    step += 1
  return ["%.1f" % num, "%s%s" % ('Yi', suffix)]


def datastream_versions(ds):
  '''Return the foxml datastreamVersion elements directly below ds.'''
  version_tag = '{info:fedora/fedora-system:def/foxml#}datastreamVersion'
  return ds.findall(version_tag)


def datastream_size(ds, versions):
  '''Add up the SIZE attributes of all versions of a datastream.

  Sizes are only counted for control groups M and X; every other control
  group gives 0. Versions that carry no SIZE attribute are skipped.
  '''
  if ds.attrib['CONTROL_GROUP'] not in ('M', 'X'):
    return 0
  return sum(int(version.attrib['SIZE'])
             for version in versions
             if 'SIZE' in version.attrib)


def get_resource(element):
  '''Return the rdf:resource attribute of element without its URI prefix.

  The first len("info:fedora/") characters are cut off unconditionally; the
  value is not checked to actually start with that prefix.
  '''
  uri = element.attrib['{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource']
  skip = len("info:fedora/")
  return uri[skip:]


def process_rels(ds, versions):
  '''Read the content models and parents from the newest RELS-EXT version.

  Only the last entry of versions is inspected; ds itself is not used.
  Returns (models, parents). Parents are collected from isMemberOf, then
  isMemberOfCollection, then isConstituentOf relationships.
  '''
  parent_tags = (
    '{info:fedora/fedora-system:def/relations-external#}isMemberOf',
    '{info:fedora/fedora-system:def/relations-external#}isMemberOfCollection',
    '{info:fedora/fedora-system:def/relations-external#}isConstituentOf',
  )
  newest = versions[-1]
  models = [get_resource(node)
            for node in newest.iter('{info:fedora/fedora-system:def/model#}hasModel')]
  parents = []
  for tag in parent_tags:
    parents.extend(get_resource(node) for node in newest.iter(tag))
  return models, parents


# Summarise one datastream
def datastream_info(ds, versions):
  '''Return (number of versions, total size, last modified) for a datastream.

  The last modified time is the CREATED attribute of the last version,
  parsed as "%Y-%m-%dT%H:%M:%S.%fZ"; without that attribute datetime.min is
  used.
  '''
  size = datastream_size(ds, versions)
  created = versions[-1].attrib.get('CREATED')
  if created is None:
    last_modified = datetime.min
  else:
    last_modified = datetime.strptime(created, "%Y-%m-%dT%H:%M:%S.%fZ")
  return len(versions), size, last_modified


def get_datastreams(root):
  '''Return the foxml datastream elements directly below root.'''
  datastream_tag = '{info:fedora/fedora-system:def/foxml#}datastream'
  return root.findall(datastream_tag)


def fake_size(contains_obj, obj_mime, total_size, jp2_size):
  '''Compute the "old" size figure of an object.

  Without an OBJ datastream the result is 0. With an OBJ of mime type
  image/jp2 the JP2 datastream size is subtracted from the total. In every
  other case the total size is returned unchanged.
  '''
  if not contains_obj:
    return 0
  if obj_mime == 'image/jp2':
    return total_size - jp2_size
  return total_size


def get_label(root):
  '''Return the VALUE of the model#label property in objectProperties.'''
  props = root.find('{info:fedora/fedora-system:def/foxml#}objectProperties')
  label_query = "{info:fedora/fedora-system:def/foxml#}property[@NAME='info:fedora/fedora-system:def/model#label']"
  return props.find(label_query).attrib['VALUE']


# Collects the statistics of one FOXML file
def get_object_info(path):
  '''Parse the FOXML file at path and return a dict describing the object.

  Keys: 'models' and 'parents' (from RELS-EXT), 'datastreams' (count),
  'versions' (datastream versions summed over all datastreams), 'size',
  'size_fake', 'last_modified' (newest datastream timestamp), 'pid',
  'label' and 'path'.
  '''
  root = ET.parse(path).getroot()
  pid = root.attrib['PID']
  datastreams = get_datastreams(root)

  total_size = 0
  total_versions = 0
  newest_change = datetime.min
  models, parents = [], []
  has_obj = False
  obj_mime = None
  jp2_size = 0

  for ds in datastreams:
    versions = datastream_versions(ds)
    ds_versions, ds_size, ds_modified = datastream_info(ds, versions)
    total_size += ds_size
    total_versions += ds_versions
    newest_change = max(newest_change, ds_modified)

    ds_id = ds.attrib['ID']
    if ds_id == 'RELS-EXT':
      models, parents = process_rels(ds, versions)
    elif ds_id == 'OBJ':
      has_obj = True
      obj_mime = versions[-1].attrib['MIMETYPE']
    elif ds_id == 'JP2':
      # A little silly: the JP2 size is remembered so it can be subtracted
      # from the total when the OBJ is a JP2 as well. The old XSLT version
      # did this for historical reasons.
      jp2_size = ds_size

  return {
    'models': models,
    'parents': parents,
    'datastreams': len(datastreams),
    'versions': total_versions,
    'size': total_size,
    'size_fake': fake_size(has_obj, obj_mime, total_size, jp2_size),
    'last_modified': newest_change,
    'pid': pid,
    'label': get_label(root),
    'path': path,
  }


def get_file_list(dir):
  '''Return the paths of all files below dir, including all subfolders.'''
  return [os.path.join(folder, name)
          for folder, _subdirs, names in os.walk(dir)
          for name in names]


def get_cm_details(object_details):
  '''Total up object count and sizes per content model.

  Models listed in exclude_content_models are ignored. Returns the values
  view of a dict whose entries carry 'num', 'size', 'size_fake' and 'model'.
  '''
  totals = defaultdict(lambda: defaultdict(int))
  for info in object_details.values():
    for model in info['models']:
      if model in exclude_content_models:
        continue
      entry = totals[model]
      entry['num'] += 1
      entry['size'] += info['size']
      entry['size_fake'] += info['size_fake']
      entry['model'] = model
  return totals.values()


def traverse_parents(current, parents, details, collections):
  '''Credit the object current to each of parents and to their ancestors.

  current     -- object info dict (uses 'pid', 'size' and 'size_fake').
  parents     -- PIDs to credit at this level.
  details     -- pid -> object info; used to find the parents of a parent.
  collections -- defaultdict of per-PID totals, updated in place: 'children'
                 (set of PIDs), 'num', 'size' and 'size_fake'.

  An object is counted only once per ancestor, however many paths lead
  there; that check also stops the recursion on cyclic relationships.
  '''
  pid = current['pid']
  for parent in parents:
    # The children set has to hold the PID itself. set(pid) would hold the
    # PID's characters instead, which lost the first child of every parent.
    children = collections[parent].setdefault('children', set())
    if pid in children:
      continue
    children.add(pid)
    collections[parent]['num'] += 1
    collections[parent]['size'] += current['size']
    collections[parent]['size_fake'] += current['size_fake']
    if parent in details:
      traverse_parents(current, details[parent]['parents'], details, collections)


def get_collection_details(object_details):
  '''Build per-collection totals from the parent links of every object.

  Every object is credited to all of its ancestors. Afterwards only entries
  are kept whose PID is a known object with the islandora:collectionCModel
  model and whose PID namespace is not 'islandora'. Kept entries get
  'label' and 'pid' added. Returns the defaultdict of entries.
  '''
  collections = defaultdict(lambda: defaultdict(int))
  for info in object_details.values():
    traverse_parents(info, info['parents'], object_details, collections)

  # Iterate over a snapshot of the keys because entries are deleted on the way.
  for pid in list(collections):
    keep = (
      pid in object_details
      and 'islandora:collectionCModel' in object_details[pid]['models']
      and pid.split(':')[0] != 'islandora'
    )
    if keep:
      collections[pid]['label'] = object_details[pid]['label']
      collections[pid]['pid'] = pid
    else:
      del collections[pid]
  return collections


def get_repo_totals_table(object_details, print_fake_size):
  '''Build the totals table for all objects in object_details.

  Returns ([], rows): the table has no headers and every row is a
  [name, value] pair. The 'Total Size (old)' row is only present when
  print_fake_size is true. 'Last Modified' is '-' when there are no objects.
  '''
  infos = object_details.values()
  table = [
    ['Objects', len(object_details)],
    ['Datastreams', sum(info['datastreams'] for info in infos)],
    ['Total Size', ' '.join(human(sum(info['size'] for info in infos)))],
  ]
  if print_fake_size:
    old_total = sum(info['size_fake'] for info in infos)
    table.append(['Total Size (old)', ' '.join(human(old_total))])
  newest = max((info['last_modified'] for info in infos), default='-')
  table.append(['Last Modified', newest])
  return [], table


def get_cm_details_table(object_details, print_fake_size):
  '''Build the per-content-model table, smallest total size first.

  Returns (headers, rows). A content model without an entry in
  content_models is shown by its PID. The old size columns are only
  present when print_fake_size is true.
  '''
  details = get_cm_details(object_details)
  headers = ['Content Model', 'Objects', 'Size', 'Unit']
  if print_fake_size:
    headers += ['Size (old)', 'Unit']

  table = []
  for cm in sorted(details, key=lambda entry: entry['size']):
    row = [content_models.get(cm['model'], cm['model']), cm['num']]
    row.extend(human(cm['size']))
    if print_fake_size:
      row.extend(human(cm['size_fake']))
    table.append(row)
  return headers, table


def get_collection_details_table(collection_details, print_fake_size, num_collections = None):
  '''Build the per-collection table, smallest total size first.

  Returns (headers, rows). When num_collections is a positive number only
  that many of the largest collections are listed. The old size columns
  are only present when print_fake_size is true.
  '''
  headers = ['Collection', 'Label', 'Objects', 'Size', 'Unit']
  if print_fake_size:
    headers += ['Size (old)', 'Unit']

  ranked = sorted(collection_details.values(), key=lambda entry: entry['size'])
  if num_collections is not None and num_collections > 0:
    # The list is ascending, so the largest collections are at the end.
    ranked = ranked[-num_collections:]

  table = []
  for entry in ranked:
    row = [entry['pid'], entry['label'], entry['num']]
    row.extend(human(entry['size']))
    if print_fake_size:
      row.extend(human(entry['size_fake']))
    table.append(row)
  return headers, table


def output_table(headers, table, file_name, csv_output):
  '''Write a table to an open file, as CSV or as formatted text.

  headers    -- list of column titles; an empty list writes no header row
                in CSV mode.
  table      -- list of rows.
  file_name  -- despite its name, the writable text file object to use.
  csv_output -- true for CSV; otherwise the table is formatted with
                tabulate and followed by a newline.
  '''
  # Write to the file object that was passed in rather than to a
  # module-level file_handle, so the function works for any caller.
  out = file_name
  if csv_output:
    writer = csv.writer(out)
    if headers:
      writer.writerow(headers)
    writer.writerows(table)
  else:
    out.write(tabulate(table, headers=headers, floatfmt='3.1f'))
    out.write("\n")


# main function
if __name__ == '__main__':
  parser = argparse.ArgumentParser(description='Fedora Stats')
  parser.add_argument('directory', metavar='DIR', help='Fedora object directory')
  parser.add_argument('--collection', metavar='PID', dest='collection', help='PID of a collection to restrict the stats to')
  parser.add_argument('--threads', metavar='NUM', dest='threads', help='Number for threads', type=int, choices=range(1, 21), default=8)
  parser.add_argument('--old-size', action='store_true', dest='old_size', help='Show old size calculations')
  parser.add_argument('--csv', action='store_true', dest='csv', help='Output CSV instead of formatted text')
  parser.add_argument('--output', metavar='FILE', dest='output', help='Output to file')
  parser.add_argument('--num-collections', metavar='NUM', dest='collections', help='Number of collections for collection stats (0 hides stats).', type=int)
  args = parser.parse_args()
  set_output_encoding()

  # Parse all FOXML files in parallel, 250 files per task. The with block
  # shuts the worker processes down even when parsing fails.
  with Pool(processes=args.threads) as pool:
    parsed = pool.map(get_object_info, get_file_list(args.directory), 250)
  object_details = {info['pid']: info for info in parsed}
  del parsed

  # Collection totals are needed for the collection table and the filter
  collection_details = get_collection_details(object_details)

  # Limit output to a particular collection
  if args.collection is not None:
    # Check before anything is written: looking up an unknown PID in the
    # defaultdict would hand back 0 instead of a set of children.
    if args.collection not in collection_details:
      parser.error('collection %s was not found in %s' % (args.collection, args.directory))
    pids = collection_details[args.collection]['children']
    object_details = {pid: info for pid, info in object_details.items() if pid in pids}

  # Open file for writing if we need it. The csv module expects files that
  # were opened with newline=''.
  if args.output:
    file_handle = open(args.output, 'w', newline='' if args.csv else None)
  else:
    file_handle = sys.stdout

  try:
    # Print content model details
    output_table(*get_cm_details_table(object_details, args.old_size), file_handle, args.csv)

    # Print collection information
    if args.collection is None and (args.collections is None or args.collections > 0):
      file_handle.write("\n")
      output_table(*get_collection_details_table(collection_details, args.old_size, args.collections), file_handle, args.csv)

    file_handle.write("\n")
    output_table(*get_repo_totals_table(object_details, args.old_size), file_handle, args.csv)
  finally:
    # Only close what was opened here; stdout stays open.
    if file_handle is not sys.stdout:
      file_handle.close()
	#!/usr/bin/env python3

	# written by https://github.com/jonathangreen & shared with his permission
	# modest additional contributions by https://github.com/dheles

	import argparse
	import os
	import lxml.etree as ET
	from multiprocessing import Pool
	from datetime import datetime
	import requests
	import json
	import urllib
	from collections import defaultdict
	from tabulate import tabulate
	import sys
	import codecs
	import csv

	# Display names used in the content model table, keyed by content model PID.
	content_models = {
	  'islandora:newspaperCModel': 'Newspaper',
	  'islandora:newspaperIssueCModel': 'Newspaper Issue',
	  'islandora:sp_videoCModel': 'Video',
	  'islandora:sp-audioCModel': 'Audio',
	  'islandora:sp_large_image_cmodel': 'Large Image',
	  'ir:citationCModel': 'Citation',
	  'islandora:bookCModel': 'Book',
	  'islandora:newspaperPageCModel': 'Newspaper Page',
	  'islandora:binaryObjectCModel': 'Binary Object',
	  'islandora:compoundCModel': 'Compound',
	  'islandora:collectionCModel': 'Collection',
	  'fedora-system:ContentModel-3.0': 'Content Model',
	  'islandora:archivesspaceCModel': 'Aspace Integration',
	  'islandora:sp_basic_image': 'Basic Image',
	  'islandora:sp_pdf': 'PDF',
	  'islandora:pageCModel': 'Book Page',
	  'ir:thesisCModel': 'Thesis',
	}

	# Content models that are skipped when the per-content-model totals are built.
	exclude_content_models = [
	  'fedora-system:ContentModel-3.0',
	  'islandora:archivesspaceCModel',
	]


	def set_output_encoding(encoding='utf-8'):
	  '''Force an encoding on stdout/stderr when Python could not detect one.

	  When writing to a terminal Python knows the encoding needed and sets it
	  automatically, but when piping to another program (for example, | less)
	  the stream encoding can be None. Each of sys.stdout and sys.stderr whose
	  encoding is None is replaced by a codecs writer for `encoding`.

	  encoding -- name of the codec to force (default 'utf-8').
	  '''
	  for name in ('stdout', 'stderr'):
	    stream = getattr(sys, name)
	    if stream.encoding is None:
	      # A codecs writer produces bytes, so it has to sit on top of the binary
	      # buffer; wrapping the text stream itself fails with TypeError on the
	      # first write under Python 3. Streams that expose no .buffer are
	      # wrapped directly.
	      target = getattr(stream, 'buffer', stream)
	      setattr(sys, name, codecs.getwriter(encoding)(target))


	# Turn a byte count into a human readable [value, unit] pair
	def human(num, suffix='B'):
	  '''Scale num down by 1024 until it is below 1024 and name the unit.

	  Returns a two element list of strings: the value with one decimal and the
	  binary prefix joined with suffix. Values too large for 'Zi' are reported
	  in 'Yi'.
	  '''
	  for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
	    if abs(num) < 1024.0:
	      return ["%3.1f" % num, "%s%s" % (unit, suffix)]
	    num /= 1024.0
	  return ["%.1f" % num, "%s%s" % ('Yi', suffix)]


	def datastream_versions(ds):
	  '''Return the foxml datastreamVersion elements directly below ds.'''
	  return ds.findall('{info:fedora/fedora-system:def/foxml#}datastreamVersion')


	def datastream_size(ds, versions):
	  '''Add up the SIZE attributes of all versions of a datastream.

	  Sizes are only counted for control groups M and X; every other control
	  group gives 0. Versions that carry no SIZE attribute are skipped.
	  '''
	  size = 0
	  if ds.attrib['CONTROL_GROUP'] in ('M', 'X'):
	    for version in versions:
	      if 'SIZE' in version.attrib:
	        size += int(version.attrib['SIZE'])
	  return size


	def get_resource(element):
	  '''Return the rdf:resource attribute of element without its URI prefix.

	  The first len("info:fedora/") characters are cut off unconditionally; the
	  value is not checked to actually start with that prefix.
	  '''
	  prefix = "info:fedora/"
	  resource = element.attrib['{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource']
	  return resource[len(prefix):]


	def process_rels(ds, versions):
	  '''Read the content models and parents from the newest RELS-EXT version.

	  Only the last entry of versions is inspected; ds itself is not used.
	  Returns (models, parents). Parents are collected from isMemberOf, then
	  isMemberOfCollection, then isConstituentOf relationships.
	  '''
	  models = []
	  parents = []
	  latest = versions[-1]
	  for model in latest.iter('{info:fedora/fedora-system:def/model#}hasModel'):
	    models.append(get_resource(model))
	  for parent in latest.iter('{info:fedora/fedora-system:def/relations-external#}isMemberOf'):
	    parents.append(get_resource(parent))
	  for parent in latest.iter('{info:fedora/fedora-system:def/relations-external#}isMemberOfCollection'):
	    parents.append(get_resource(parent))
	  for parent in latest.iter('{info:fedora/fedora-system:def/relations-external#}isConstituentOf'):
	    parents.append(get_resource(parent))
	  return models, parents


	# Summarise one datastream
	def datastream_info(ds, versions):
	  '''Return (number of versions, total size, last modified) for a datastream.

	  The last modified time is the CREATED attribute of the last version,
	  parsed as "%Y-%m-%dT%H:%M:%S.%fZ"; without that attribute datetime.min is
	  used.
	  '''
	  size = datastream_size(ds, versions)
	  if 'CREATED' in versions[-1].attrib:
	    last_modified = datetime.strptime(versions[-1].attrib['CREATED'], "%Y-%m-%dT%H:%M:%S.%fZ")
	  else:
	    last_modified = datetime.min
	  return len(versions), size, last_modified


	def get_datastreams(root):
	  '''Return the foxml datastream elements directly below root.'''
	  return root.findall('{info:fedora/fedora-system:def/foxml#}datastream')


	def fake_size(contains_obj, obj_mime, total_size, jp2_size):
	  '''Compute the "old" size figure of an object.

	  Without an OBJ datastream the result is 0. With an OBJ of mime type
	  image/jp2 the JP2 datastream size is subtracted from the total. In every
	  other case the total size is returned unchanged.
	  '''
	  if contains_obj and obj_mime == 'image/jp2':
	    return total_size - jp2_size
	  if not contains_obj:
	    return 0
	  return total_size


	def get_label(root):
	  '''Return the VALUE of the model#label property in objectProperties.'''
	  properties = root.find('{info:fedora/fedora-system:def/foxml#}objectProperties')
	  label = properties.find('{info:fedora/fedora-system:def/foxml#}property[@NAME=\'info:fedora/fedora-system:def/model#label\']')
	  return label.attrib['VALUE']


	# Collects the statistics of one FOXML file
	def get_object_info(path):
	  '''Parse the FOXML file at path and return a dict describing the object.

	  Keys: 'models' and 'parents' (from RELS-EXT), 'datastreams' (count),
	  'versions' (datastream versions summed over all datastreams), 'size',
	  'size_fake', 'last_modified' (newest datastream timestamp), 'pid',
	  'label' and 'path'.
	  '''
	  object_size = 0
	  object_versions = 0
	  object_last_modified = datetime.min
	  object_models = []
	  object_parents = []

	  object_contains_obj = False
	  object_obj_mime = None
	  object_jp2_size = 0

	  tree = ET.parse(path)
	  root = tree.getroot()
	  pid = root.attrib['PID']
	  datastreams = get_datastreams(root)
	  object_datastreams = len(datastreams)
	  for ds in datastreams:
	    versions = datastream_versions(ds)
	    ds_versions, ds_size, ds_last_modified = datastream_info(ds, versions)
	    object_size += ds_size
	    object_versions += ds_versions
	    if ds_last_modified > object_last_modified:
	      object_last_modified = ds_last_modified
	    if ds.attrib['ID'] == 'RELS-EXT':
	      object_models, object_parents = process_rels(ds, versions)
	    if ds.attrib['ID'] == 'OBJ':
	      object_contains_obj = True
	      object_obj_mime = versions[-1].attrib['MIMETYPE']
	    if ds.attrib['ID'] == 'JP2':
	      # This part is a little silly, but we keep track of the size of JP2 ds to
	      # subtract them from the total size if the OBJ is also a JP2. This was done
	      # in the old XSLT version for historical reasons.
	      object_jp2_size = ds_size

	  return { 'models' : object_models,
	           'parents' : object_parents,
	           'datastreams' : object_datastreams,
	           'versions' : object_versions,
	           'size' : object_size,
	           'size_fake' : fake_size(object_contains_obj, object_obj_mime, object_size, object_jp2_size),
	           'last_modified' : object_last_modified,
	           'pid' : pid,
	           'label' : get_label(root),
	           'path' : path
	         }


	def get_file_list(dir):
	  '''Return the paths of all files below dir, including all subfolders.'''
	  file_list = []
	  for path, subdirs, files in os.walk(dir):
	    for name in files:
	      file_list.append(os.path.join(path, name))
	  return file_list


	def get_cm_details(object_details):
	  '''Total up object count and sizes per content model.

	  Models listed in exclude_content_models are ignored. Returns the values
	  view of a dict whose entries carry 'num', 'size', 'size_fake' and 'model'.
	  '''
	  cm_details = defaultdict(lambda: defaultdict(int))
	  for info in object_details.values():
	    for model in info['models']:
	      if model not in exclude_content_models:
	        cm_details[model]['num'] += 1
	        cm_details[model]['size'] += info['size']
	        cm_details[model]['size_fake'] += info['size_fake']
	        cm_details[model]['model'] = model
	  return cm_details.values()


	def traverse_parents(current, parents, details, collections):
	  '''Credit the object current to each of parents and to their ancestors.

	  current     -- object info dict (uses 'pid', 'size' and 'size_fake').
	  parents     -- PIDs to credit at this level.
	  details     -- pid -> object info; used to find the parents of a parent.
	  collections -- defaultdict of per-PID totals, updated in place: 'children'
	                 (set of PIDs), 'num', 'size' and 'size_fake'.

	  An object is counted only once per ancestor, however many paths lead
	  there; that check also stops the recursion on cyclic relationships.
	  '''
	  pid = current['pid']
	  for parent in parents:
	    # The children set has to hold the PID itself. set(pid) would hold the
	    # PID's characters instead, which lost the first child of every parent.
	    children = collections[parent].setdefault('children', set())
	    if pid in children:
	      continue
	    children.add(pid)
	    collections[parent]['num'] += 1
	    collections[parent]['size'] += current['size']
	    collections[parent]['size_fake'] += current['size_fake']
	    if parent in details:
	      traverse_parents(current, details[parent]['parents'], details, collections)


	def get_collection_details(object_details):
	  '''Build per-collection totals from the parent links of every object.

	  Every object is credited to all of its ancestors. Afterwards only entries
	  are kept whose PID is a known object with the islandora:collectionCModel
	  model and whose PID namespace is not 'islandora'. Kept entries get
	  'label' and 'pid' added. Returns the defaultdict of entries.
	  '''
	  collections = defaultdict(lambda: defaultdict(int))
	  for info in object_details.values():
	    traverse_parents(info, info['parents'], object_details, collections)
	  # Iterate over a snapshot of the keys because entries are deleted on the way.
	  for pid in list(collections.keys()):
	    if pid not in object_details:
	      del collections[pid]
	    elif 'islandora:collectionCModel' not in object_details[pid]['models']:
	      del collections[pid]
	    elif pid.split(':')[0] == 'islandora':
	      del collections[pid]
	    else:
	      collections[pid]['label'] = object_details[pid]['label']
	      collections[pid]['pid'] = pid
	  return collections


	def get_repo_totals_table(object_details, print_fake_size):
	  '''Build the totals table for all objects in object_details.

	  Returns ([], rows): the table has no headers and every row is a
	  [name, value] pair. The 'Total Size (old)' row is only present when
	  print_fake_size is true. 'Last Modified' is '-' when there are no objects.
	  '''
	  infos = object_details.values()
	  table = []
	  table.append(['Objects', len(object_details)])
	  table.append(['Datastreams', sum(info['datastreams'] for info in infos)])
	  table.append(['Total Size', ' '.join(human(sum(info['size'] for info in infos)))])
	  if print_fake_size:
	    table.append(['Total Size (old)', ' '.join(human(sum(info['size_fake'] for info in infos)))])
	  table.append(['Last Modified', max((info['last_modified'] for info in infos), default='-')])
	  return [], table


	def get_cm_details_table(object_details, print_fake_size):
	  '''Build the per-content-model table, smallest total size first.

	  Returns (headers, rows). A content model without an entry in
	  content_models is shown by its PID. The old size columns are only
	  present when print_fake_size is true.
	  '''
	  cm_details = get_cm_details(object_details)
	  headers = ['Content Model', 'Objects', 'Size', 'Unit']
	  if print_fake_size:
	    headers += ['Size (old)', 'Unit']
	  table = []
	  for cm in sorted(cm_details, key=lambda cm: cm['size']):
	    name = content_models[cm['model']] if cm['model'] in content_models else cm['model']
	    row = [name, cm['num']] + human(cm['size'])
	    if print_fake_size:
	      row += human(cm['size_fake'])
	    table.append(row)
	  return headers, table


	def get_collection_details_table(collection_details, print_fake_size, num_collections = None):
	  '''Build the per-collection table, smallest total size first.

	  Returns (headers, rows). When num_collections is a positive number only
	  that many of the largest collections are listed. The old size columns
	  are only present when print_fake_size is true.
	  '''
	  headers = ['Collection', 'Label', 'Objects', 'Size', 'Unit']
	  if print_fake_size:
	    headers += ['Size (old)', 'Unit']
	  table = []
	  collections = sorted(collection_details.values(), key=lambda collection: collection['size'])
	  if num_collections is not None and num_collections > 0:
	    # The list is ascending, so the largest collections are at the end.
	    collections = collections[-1 * num_collections:]
	  for collection in collections:
	    row = [collection['pid'], collection['label'], collection['num']] + human(collection['size'])
	    if print_fake_size:
	      row += human(collection['size_fake'])
	    table.append(row)
	  return headers, table


	def output_table(headers, table, file_name, csv_output):
	  '''Write a table to an open file, as CSV or as formatted text.

	  headers    -- list of column titles; an empty list writes no header row
	                in CSV mode.
	  table      -- list of rows.
	  file_name  -- despite its name, the writable text file object to use.
	  csv_output -- true for CSV; otherwise the table is formatted with
	                tabulate and followed by a newline.
	  '''
	  # Write to the file object that was passed in rather than to a
	  # module-level file_handle, so the function works for any caller.
	  out = file_name
	  if csv_output:
	    writer = csv.writer(out)
	    if headers:
	      writer.writerow(headers)
	    writer.writerows(table)
	  else:
	    out.write(tabulate(table, headers=headers, floatfmt='3.1f'))
	    out.write("\n")


	# main function
	if __name__ == '__main__':
	  parser = argparse.ArgumentParser(description='Fedora Stats')
	  parser.add_argument('directory', metavar='DIR', help='Fedora object directory')
	  parser.add_argument('--collection', metavar='PID', dest='collection', help='PID of a collection to restrict the stats to')
	  parser.add_argument('--threads', metavar='NUM', dest='threads', help='Number for threads', type=int, choices=range(1, 21), default=8)
	  parser.add_argument('--old-size', action='store_true', dest='old_size', help='Show old size calculations')
	  parser.add_argument('--csv', action='store_true', dest='csv', help='Output CSV instead of formatted text')
	  parser.add_argument('--output', metavar='FILE', dest='output', help='Output to file')
	  parser.add_argument('--num-collections', metavar='NUM', dest='collections', help='Number of collections for collection stats (0 hides stats).', type=int)
	  args = parser.parse_args()
	  set_output_encoding()

	  # Parse all FOXML files in parallel, 250 files per task. The with block
	  # shuts the worker processes down even when parsing fails.
	  with Pool(processes=args.threads) as pool:
	    parsed = pool.map(get_object_info, get_file_list(args.directory), 250)
	  object_details = {info['pid']: info for info in parsed}
	  del parsed

	  # Collection totals are needed for the collection table and the filter
	  collection_details = get_collection_details(object_details)

	  # Limit output to a particular collection
	  if args.collection is not None:
	    # Check before anything is written: looking up an unknown PID in the
	    # defaultdict would hand back 0 instead of a set of children.
	    if args.collection not in collection_details:
	      parser.error('collection %s was not found in %s' % (args.collection, args.directory))
	    pids = collection_details[args.collection]['children']
	    object_details = {pid: info for pid, info in object_details.items() if pid in pids}

	  # Open file for writing if we need it. The csv module expects files that
	  # were opened with newline=''.
	  if args.output:
	    file_handle = open(args.output, 'w', newline='' if args.csv else None)
	  else:
	    file_handle = sys.stdout

	  try:
	    # Print content model details
	    output_table(*get_cm_details_table(object_details, args.old_size), file_handle, args.csv)

	    # Print collection information
	    if args.collection is None and (args.collections is None or args.collections > 0):
	      file_handle.write("\n")
	      output_table(*get_collection_details_table(collection_details, args.old_size, args.collections), file_handle, args.csv)

	    file_handle.write("\n")
	    output_table(*get_repo_totals_table(object_details, args.old_size), file_handle, args.csv)
	  finally:
	    # Only close what was opened here; stdout stays open.
	    if file_handle is not sys.stdout:
	      file_handle.close()