Create a gist now

Instantly share code, notes, and snippets.

@mjlassila / Secret
Created Nov 21, 2012

What would you like to do?
MarcXimil-työkalun -skripti
# -*- coding: utf-8 -*-
# This file is part of MarcXimiL.
# This script has been modified to return absolute number of fields
# and subfields, instead of percentages.
# MarcXimiL is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# MarcXimiL is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with MarcXimiL. If not, see <http://www.gnu.org/licenses/>.
# The copy of the GNU General Public License in MarcXimiL is located in : ./doc/gpl.txt
# Usage text for this script. The wording below describes absolute
# occurrence counts, matching what the script reports since it was modified
# to stop printing percentages.
help = """
This program fill perform a statistical analysis of a MARCXML collection.
These statistical informations may help understanding the composition of
the collection, and therefore help to setup the MarcXimiL framework efficiently.
The following informations will be returned to the strandard output:
- the number of records in the collection
- the average number of fields by record
- for each subfield: the number of occurrences throughout the collection,
and the average length of the subfield
SYNTAX: <collection_name.xml>
The collection will be looked-up in the ./var direcrory.
All the collections are stored there.
"""
import codecs
import sys
from xml.dom import minidom
import os
# Exactly one argument (the collection file name) is expected; anything else
# gets the usage text and a non-zero exit status.
if len(sys.argv) != 2:
    print(help)
    sys.exit(1)
# load records
def micro_dom( xml_string, start ):
    """
    Extract the next <record>...</record> chunk from an XML string.

    This function takes in input
    - xml_string: the XML text to split into records
    - start: the position in xml_string from which the next record is searched
    This function outputs a tuple with two elements:
    - the xml record chunk, or None when no further complete record exists
    - the new position in xml_string (just after the returned record);
      when no record is returned this is the unchanged start position
    Note: records are located with plain string searches instead of a parser
    (the original author notes this is much faster than xml.dom).
    """
    closing_tag = '</record>'

    # find the beginning of the next record
    n_beg = xml_string.find('<record', start)
    if n_beg < 0:
        return None, start

    # Validate the raw find() result *before* adding the tag length: adding
    # first would turn the "not found" value -1 into a plausible offset and
    # let a truncated chunk be returned for records near the string start.
    n_end = xml_string.find(closing_tag, n_beg)
    if n_end < 0:
        # No logging helper is defined or imported in this file, so the
        # problem is reported on standard error.
        sys.stderr.write("Error XML baddly formated: record importation incomplete!\n")
        return None, start

    n_end += len(closing_tag)
    return xml_string[n_beg : n_end], n_end
def parse_controlfields(record):
    """
    Parse and returns controlfields from an xml record.

    - record: the XML text of one record (text is encoded to UTF-8 before
      parsing; bytes are parsed as they are)
    Returns a list with one entry per non-empty controlfield, in document
    order, each shaped as {tag: {'controlfield': text}}.
    Controlfields without a tag attribute or without content are skipped.
    """
    if not isinstance(record, bytes):
        record = record.encode("utf-8")
    xrec = minidom.parseString(record)

    output = []
    for controlfield in xrec.getElementsByTagName('controlfield'):
        # hasAttribute() avoids the KeyError that attributes['tag'] raises
        # for an element without a tag, so such elements are really skipped.
        if not controlfield.hasAttribute('tag'):
            continue
        if len(controlfield.childNodes) == 0:
            continue
        tag = controlfield.getAttribute('tag')
        parsed_field = controlfield.childNodes[0].data
        output.append({tag: {'controlfield': parsed_field}})
    return output
def parse_datafields(record):
    """
    Parse and returns datafields from an xml record.

    - record: the XML text of one record (text is encoded to UTF-8 before
      parsing; bytes are parsed as they are)
    Returns a list with one entry per datafield, in document order, each
    shaped as {tag + ind1 + ind2: {subfield code: subfield text}}.
    Datafields without a tag attribute are skipped; a missing indicator is
    treated as blank (a single space). Empty subfields are left out, and
    when a subfield code is repeated inside one datafield the last value
    is the one kept.
    """
    if not isinstance(record, bytes):
        record = record.encode("utf-8")
    xrec = minidom.parseString(record)

    output = []
    for datafield in xrec.getElementsByTagName('datafield'):
        # hasAttribute() avoids the KeyError that attributes[...] raises for
        # a missing attribute.
        if not datafield.hasAttribute('tag'):
            continue
        tag = datafield.getAttribute('tag')
        ind1 = datafield.getAttribute('ind1') if datafield.hasAttribute('ind1') else ' '
        ind2 = datafield.getAttribute('ind2') if datafield.hasAttribute('ind2') else ' '

        subfs = {}
        for subfield in datafield.getElementsByTagName('subfield'):
            code = subfield.attributes['code'].value
            if len(subfield.childNodes) > 0:
                subfs[code] = subfield.childNodes[0].data
        output.append({tag + ind1 + ind2: subfs})
    return output
# reading collection
def load_collection(collection):
    """
    Loads all fields of the collection.

    - collection: path of the XML collection file, read as UTF-8
    Returns a list with one entry per record; each entry is the list of
    the record's controlfields (as returned by parse_controlfields)
    followed by its datafields (as returned by parse_datafields).
    """
    # The context manager closes the file even if reading fails.
    with codecs.open(collection, "r", "utf-8") as handle:
        collection_xml = handle.read()

    records = []
    start = 0
    while True:
        # micro_dom returns None as record once all records have been extracted
        record_xml, start = micro_dom(collection_xml, start)
        if record_xml is None:
            break
        records.append(parse_controlfields(record_xml) + parse_datafields(record_xml))
    return records
### MAIN ###
# Locate the collection: it lives in the "var" folder of the application
# root, the application root being the parent of the current working
# directory. os.path picks the right separator on both Windows and POSIX.
APPLICATION_PATH = os.path.dirname(os.getcwd())  # this application root folder
varpath = os.path.join(APPLICATION_PATH, 'var') + os.sep
collection_file = varpath + sys.argv[1]
collection = load_collection(collection_file)

# Gather, for every field tag and subfield code, the list of all values
# found in the collection. The length of each list is the number of
# occurrences; its content is used for the average length.
records = {'count': 0, 'number_fields': []}
fields = {}
for record in collection:
    records['count'] += 1
    records['number_fields'].append(len(record))
    for field in record:
        for fieldtag in field:  # not a real loop, just one key = the field tag!
            subfield_values = fields.setdefault(fieldtag, {})
            for subfieldcode in field[fieldtag]:
                subfield_values.setdefault(subfieldcode, []).append(field[fieldtag][subfieldcode])

print('\nCollection descritipon')
print('Collection: ' + collection_file + '\n')
nbrecords = records['count']
print('Number of records: ' + str(nbrecords) + '\n')
# An empty collection has no meaningful average; report 0 instead of
# failing with a division by zero.
if nbrecords:
    average_fields = int(float(sum(records['number_fields'])) / nbrecords)
else:
    average_fields = 0
print('Average number of fileds by record: ' + str(average_fields) + '\n')

# Average value length per field+subfield key, kept as a one-element list.
# Floor division gives the same integer result on Python 2 and Python 3.
sfdata = {}
for fieldtag in fields:
    for subfield in fields[fieldtag]:
        values = fields[fieldtag][subfield]
        nkey = fieldtag + subfield
        if nkey not in sfdata:
            sfdata[nkey] = []
        sfdata[nkey].append(len(''.join(values)) // len(values))

print('Presence and length of subfields\n')
# The presence column holds absolute occurrence counts, not percentages.
print('Field\tPresence(count)\tAverage length')
# Sorted so that the report order is stable from one run to the next.
for fieldtag in sorted(fields):
    for subfield in sorted(fields[fieldtag]):
        nkey = fieldtag + subfield
        presence = len(fields[fieldtag][subfield])
        av_length = sfdata[nkey][0]
        print(nkey + '\t' + str(presence) + '\t' + str(av_length))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment