mjlassila/colldescr.py Secret

## colldescr.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of MarcXimiL.
#
# This script has been modified to return absolute number of fields
# and subfields, instead of percentages.
#
# MarcXimiL is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MarcXimiL is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with MarcXimiL.  If not, see <http://www.gnu.org/licenses/>.
#
#
# The copy of the GNU General Public License in MarcXimiL is located in : ./doc/gpl.txt
# Usage text, printed when the script is not called with exactly one argument.
# The name shadows the built-in help(); it is kept because the argument check
# further down refers to it by this name.
help = """
This program will perform a statistical analysis of a MARCXML collection.

This statistical information may help in understanding the composition of
the collection, and therefore help to set up the MarcXimiL framework efficiently.

The following information will be written to the standard output:

    - the number of records in the collection
    - the average number of fields by record
    - for each subfield: the number of occurrences throughout the collection,
      and the average length of the subfield


SYNTAX:    colldescr.py <collection_name.xml>


The collection will be looked up in the ./var directory.
All the collections are stored there.
"""


import codecs
import sys
from xml.dom import minidom
import os

# The script takes exactly one argument (the collection file name);
# with anything else, show the usage text and stop.
if len(sys.argv) != 2:
    print(help)
    sys.exit()

# load records
def micro_dom(xml_string, start):
    """
    Extract the next <record>...</record> chunk from an XML string.

    This is a plain substring search, not an XML parse (the original note
    says this is much faster than using a parser, especially xml.dom).

    Parameters:
    - xml_string: the text to search
    - start: the position in xml_string from which to look for the next
      record

    Returns a tuple with two elements:
    - the record chunk, from '<record' up to and including '</record>',
      or None when no further complete record is found
    - the position just after the returned record; when the first element
      is None this is `start`, unchanged
    """
    closing_tag = '</record>'

    n_beg = xml_string.find('<record', start)
    if n_beg < 0:
        # no (more) records
        return None, start

    # Test the result of find() itself: adding the tag length first would
    # turn the "not found" value -1 into a plausible-looking position.
    n_end = xml_string.find(closing_tag, n_beg)
    if n_end < 0:
        sys.stderr.write("Error: XML badly formatted: record import incomplete!\n")
        return None, start

    n_end += len(closing_tag)
    return xml_string[n_beg:n_end], n_end


def parse_controlfields(record):
    """
    Parse and return the controlfields of one XML record.

    `record` is the text of a single record element. The result is a list
    with one {tag: {'controlfield': value}} dictionary per <controlfield>
    element, in document order. Controlfields without a tag attribute or
    without any content are left out.
    """
    xrec = minidom.parseString(record.encode("utf-8"))

    output = []
    for controlfield in xrec.getElementsByTagName('controlfield'):
        # getAttribute() gives '' for a missing attribute, so a field
        # without a tag is skipped instead of raising KeyError.
        tag = controlfield.getAttribute('tag')
        if not tag or not controlfield.childNodes:
            continue
        parsed_field = controlfield.childNodes[0].data
        output.append({tag: {'controlfield': parsed_field}})
    return output

def parse_datafields(record):
    """
    Parse and return the datafields of one XML record.

    `record` is the text of a single record element. The result is a list
    with one dictionary per <datafield> element, in document order. Each
    dictionary has a single key, tag + ind1 + ind2, mapped to a
    {subfield code: value} dictionary.

    Datafields without a tag attribute are left out. A missing indicator
    or subfield code contributes an empty string. Subfields without
    content are skipped, and when a code is repeated inside one datafield
    only the last value is kept.
    """
    xrec = minidom.parseString(record.encode("utf-8"))

    output = []
    for datafield in xrec.getElementsByTagName('datafield'):
        # getAttribute() gives '' for a missing attribute, so malformed
        # fields do not abort the whole run with KeyError.
        tag = datafield.getAttribute('tag')
        if not tag:
            continue
        ind1 = datafield.getAttribute('ind1')
        ind2 = datafield.getAttribute('ind2')

        subfs = {}
        for subfield in datafield.getElementsByTagName('subfield'):
            if subfield.childNodes:
                subfs[subfield.getAttribute('code')] = subfield.childNodes[0].data
        output.append({tag + ind1 + ind2: subfs})
    return output


# reading collection

def load_collection(collection):
    """
    Load all fields of every record in a collection file.

    `collection` is the path of the file, which is read as UTF-8 text.
    Returns a list with one entry per record; each entry is the list of
    that record's controlfield dictionaries followed by its datafield
    dictionaries.
    """
    # The with-block closes the file as soon as it has been read.
    with codecs.open(collection, "r", "utf-8") as xml_file:
        collection_xml = xml_file.read()

    records = []
    start = 0
    while True:
        # micro_dom returns None as the record once all records have been extracted
        record_xml, start = micro_dom(collection_xml, start)
        if record_xml is None:
            break
        records.append(parse_controlfields(record_xml) + parse_datafields(record_xml))

    return records


############
### MAIN ###
############

# Collections are looked up in the "var" folder inside the parent of the
# current working directory. os.path handles the separator on every
# platform; the empty last component keeps the trailing separator.
APPLICATION_PATH = os.path.dirname(os.getcwd())  # this application root folder
varpath = os.path.join(APPLICATION_PATH, 'var', '')

collection_file = varpath + sys.argv[1]

collection = load_collection(collection_file)

# records: number of records read and, per record, its number of fields.
# fields:  field tag -> subfield code -> list of every value found.
records = {'count': 0, 'number_fields': []}
fields = {}

for record in collection:
    records['count'] += 1
    records['number_fields'].append(len(record))
    for field in record:
        for fieldtag in field:  # not a real loop, just one key = the field tag!
            by_code = fields.setdefault(fieldtag, {})
            for subfieldcode in field[fieldtag]:
                by_code.setdefault(subfieldcode, []).append(field[fieldtag][subfieldcode])

print('\nCollection description')
print('----------------------\n')

print('Collection: ' + collection_file + '\n')

nbrecords = records['count']
print('Number of records: ' + str(nbrecords) + '\n')

# An empty collection has no average; report 0 instead of dividing by zero.
if nbrecords:
    average_fields = int(float(sum(records['number_fields'])) / nbrecords)
else:
    average_fields = 0
print('Average number of fields by record: ' + str(average_fields) + '\n')

# sfdata: field tag + subfield code -> [average length of the values]
sfdata = {}
for fieldtag in fields:
    for subfield in fields[fieldtag]:
        nkey = fieldtag + subfield
        values = fields[fieldtag][subfield]
        if nkey not in sfdata:
            # values is never empty: a list is only created together with
            # its first element in the loop above.
            sfdata[nkey] = [len(''.join(values)) / len(values)]

print('Presence and length of subfields\n')

# Presence is the absolute number of occurrences, not a percentage.
print('Field\tPresence(count)\tAverage length')
for fieldtag in fields:
    for subfield in fields[fieldtag]:
        nkey = fieldtag + subfield
        presence = len(fields[fieldtag][subfield])
        av_length = sfdata[nkey][0]
        print(nkey + '\t' + str(presence) + '\t' + str(av_length))


#print collection
	#!/usr/bin/python
	# -- coding: utf-8 --
	#
	# This file is part of MarcXimiL.
	#
	# This script has been modified to return absolute number of fields
	# and subfields, instead of percentages.
	#
	# MarcXimiL is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# MarcXimiL is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with MarcXimiL. If not, see <http://www.gnu.org/licenses/>.
	#
	#
	# The copy of the GNU General Public License in MarcXimiL is located in : ./doc/gpl.txt
	# Usage text; the argument check below prints it when the script is not
	# called with exactly one argument.
	# NOTE(review): this assignment repeats the `help` text found earlier in
	# the file, here with a one-tab indent on every line - confirm whether
	# this second copy is meant to be kept.
	help = """
	This program fill perform a statistical analysis of a MARCXML collection.

	These statistical informations may help understanding the composition of
	the collection, and therefore help to setup the MarcXimiL framework efficiently.

	The following informations will be returned to the strandard output:

	- the number of records in the collection
	- the average number of fields by record
	- for each subfield: the percentage of presence throughout the collection,
	and the average length of the subfield


	SYNTAX: colldescr.py <collection_name.xml>


	The collection will be looked-up in the ./var direcrory.
	All the collections are stored there.
	"""


	import codecs
	import sys
	from xml.dom import minidom
	import os

	# Print the usage text and exit unless exactly one command-line argument
	# was given.
	# NOTE(review): the two statements after the `if` carry the same indent
	# as the `if` itself, so the intended nesting has to be restored by hand.
	if not len(sys.argv) == 2:
	print(help)
	sys.exit()

	# load records
	# NOTE(review): this repeats micro_dom from earlier in the file, but every
	# line has the same one-tab indent, so the nesting of the body cannot be
	# read from this text alone - confirm whether this copy should be kept.
	def micro_dom( xml_string, start ):
	"""
	This function will split the xml in records tags, it takes in input
	- xml_str (global)
	- start: the start position in the xml_string to extract the following record chunk
	This funcion outputs a tuple with two elements:
	- the xml record chunk
	- the new positionin xml_str (after the returned record)
	Note: this is much faster than using a parser (espetially xml.dom)
	"""

	# find the beginning and end of the first tag occurence
	n_beg = xml_string.find('<record', start)
	# 9 is the length of '</record>', so n_end is the position just past the
	# closing tag; when find() returns -1 the sum is 8
	n_end = xml_string.find('</record>', n_beg) + 9

	# no opening tag found from `start` onwards
	if n_beg < 0:
	return None, start

	# NOTE(review): write_message is not defined or imported anywhere in the
	# visible file - verify where it is expected to come from
	if n_end < n_beg:
	write_message("Error XML baddly formated: record importation incomplete!", log='errors.log')
	return None, start

	return xml_string[n_beg : n_end], n_end


	# NOTE(review): this repeats parse_controlfields from earlier in the file
	# with all nesting flattened to one tab - confirm whether this copy should
	# be kept.
	def parse_controlfields(record):
	"""
	Parse and returns controlfields from an xml record.
	"""
	output = []

	# print(record)

	# the record text is encoded to UTF-8 bytes before being parsed
	xrec = minidom.parseString(record.encode("utf-8"))
	controlfields = xrec.getElementsByTagName('controlfield')
	#print(marc)

	for controlfield in controlfields:
	tag = None
	# attributes['tag'] raises KeyError when the attribute is absent, so
	# tag cannot still be None after this line
	tag = controlfield.attributes['tag'].value
	# only controlfields that have at least one child node are kept; the
	# value stored is the data of the first child node
	if tag is not None and len(controlfield.childNodes) > 0:
	parsed_field = controlfield.childNodes[0].data
	output.append({tag: {'controlfield':parsed_field}})
	return output

	# NOTE(review): this repeats parse_datafields from earlier in the file
	# with all nesting flattened to one tab - confirm whether this copy should
	# be kept.
	def parse_datafields(record):
	"""
	Parse and returns datafieldd from an xml record.
	"""
	output = []

	# the record text is encoded to UTF-8 bytes before being parsed
	xrec = minidom.parseString(record.encode("utf-8"))
	datafields = xrec.getElementsByTagName('datafield')
	for datafield in datafields:
	tag = None
	# attributes[...] raises KeyError when the attribute is absent
	tag = datafield.attributes['tag'].value
	ind1 = datafield.attributes['ind1'].value
	ind2 = datafield.attributes['ind2'].value
	if tag is not None:
	# subfs maps subfield code -> data of the subfield's first child node;
	# a code that occurs again overwrites the earlier value
	subfs = {}
	subfields = datafield.getElementsByTagName('subfield')
	for subfield in subfields:
	code = subfield.attributes['code'].value
	if len(subfield.childNodes)>0:
	parsed_sf = subfield.childNodes[0].data
	subfs[code] = parsed_sf
	# the key is the tag followed by both indicators
	output.append({tag+ind1+ind2: subfs})
	return output


	# reading collection

	# NOTE(review): this repeats load_collection from earlier in the file with
	# all nesting flattened to one tab - confirm whether this copy should be
	# kept.
	def load_collection(collection):
	"""
	Loads all fields of the colleciton
	"""

	# the whole file is read as UTF-8 text; the file object is not closed
	# explicitly
	collection_xml = codecs.open( collection, "r", "utf-8" ).read()

	# first pass: cut the text into record chunks with micro_dom
	records_xml = []
	start = 0
	while True:
	# parsed_record is assigned here but not read anywhere in this function
	parsed_record = {}
	record, start = micro_dom(collection_xml, start) # returns record=None when all records have been extracted
	if record == None:
	break
	else:
	records_xml.append(record)

	# second pass: each record becomes the list of its controlfield entries
	# followed by its datafield entries
	records = []
	for record_xml in records_xml:
	record = []
	record = record + parse_controlfields(record_xml)
	record = record + parse_datafields(record_xml)
	records.append(record)

	return records


	############
	### MAIN ###
	############

	# NOTE(review): this repeats the main script section from earlier in the
	# file with all nesting flattened to one tab - confirm whether this copy
	# should be kept.
	# The collection folder is "var" inside the parent of the current working
	# directory; the path is assembled with the separator of the platform.
	if os.name in ['nt']: # windows is not posix
	APPLICATION_PATH = '\\'.join( os.getcwd().split('\\')[:-1] )
	varpath = APPLICATION_PATH + '\\var\\'

	else:
	APPLICATION_PATH = '/'.join( os.getcwd().split('/')[:-1] ) # this application root folder
	varpath = APPLICATION_PATH + '/var/'

	collection_file = varpath + sys.argv[1]

	collection = load_collection(collection_file)

	# records: number of records and, per record, its number of fields
	# fields: field tag -> subfield code -> list of the values found
	records = {'count':0, 'number_fields':[]}
	fields = {}

	for record in collection:
	records['count'] += 1
	records['number_fields'].append( len(record) )
	for field in record:
	for fieldtag in field: # not a real loop, just one key = the field tag!
	if not fieldtag in fields.keys():
	fields[fieldtag] = {}
	for subfieldcode in field[fieldtag].keys():
	# both branches append the value; the first one creates the list first
	if not subfieldcode in fields[fieldtag]:
	fields[fieldtag][subfieldcode] = []
	fields[fieldtag][subfieldcode].append(field[fieldtag][subfieldcode])
	else:
	fields[fieldtag][subfieldcode].append(field[fieldtag][subfieldcode])

	print('\nCollection descritipon')
	print('----------------------\n')

	print('Collection: ' + collection_file + '\n')

	nbrecords = records['count']
	print('Number of records: ' + str(nbrecords)+ '\n')

	# the average is truncated by int(); the division uses records['count'],
	# which is 0 when the collection contains no record
	print('Average number of fileds by record: ' + str(int(float(sum(records['number_fields']))/records['count']))+ '\n')

	# sfdata: field tag + subfield code -> list whose first element is the
	# total length of the values divided by the number of values
	sfdata = {}
	for fieldtag in fields.keys():
	for subfield in fields[fieldtag].keys():
	nkey = fieldtag+subfield
	if not nkey in sfdata:
	sfdata[nkey] = []
	sfdata[nkey].append( len(''.join(fields[fieldtag][subfield])) / len(fields[fieldtag][subfield] ))
	else:
	sfdata[nkey].append(len(fields[fieldtag][subfield]))

	#print('Detected fields and subfields: ')
	#for fieldtag in fields.keys():
	# print(fieldtag + ' ' + ', '.join(fields[fieldtag].keys()))


	print('Presence and length of subfields\n')

	# the "Presence" column holds the number of collected values, although the
	# header string labels it with (%)
	print('Field\tPresence(%)\tAverage length')
	for fieldtag in fields.keys():
	for subfield in fields[fieldtag].keys():
	nkey = fieldtag+subfield
	presence = int(len(fields[fieldtag][subfield]))
	av_length = sfdata[nkey][0]
	print(nkey + '\t' + str(presence) + '\t'+ str(av_length))


	#print collection