# colldescr.py script of the MarcXimiL tool
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# | |
# This file is part of MarcXimiL. | |
# | |
# This script has been modified to return absolute number of fields | |
# and subfields, instead of percentages. | |
# | |
# MarcXimiL is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# MarcXimiL is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with MarcXimiL. If not, see <http://www.gnu.org/licenses/>.
# | |
# | |
# The copy of the GNU General Public License in MarcXimiL is located in : ./doc/gpl.txt | |
# Usage text printed when the script is called with the wrong number of
# arguments.  NOTE: the name shadows the built-in help(); it is kept because
# the argument check further down refers to it by this name.
help = """
This program fill perform a statistical analysis of a MARCXML collection.
These statistical informations may help understanding the composition of
the collection, and therefore help to setup the MarcXimiL framework efficiently.
The following informations will be returned to the strandard output:
- the number of records in the collection
- the average number of fields by record
- for each subfield: the number of occurrences throughout the collection,
and the average length of the subfield
SYNTAX: colldescr.py <collection_name.xml>
The collection will be looked-up in the ./var direcrory.
All the collections are stored there.
"""
import codecs | |
import sys | |
from xml.dom import minidom | |
import os | |
# Exactly one argument (the collection file name) is expected; anything else
# prints the usage text and stops before any work is done.
if len(sys.argv) != 2:
    print(help)
    sys.exit()
# load records | |
def micro_dom(xml_string, start):
    """
    Extract the next ``<record ...>...</record>`` chunk from an XML string.

    The search is a plain substring scan (no XML parser involved), starting
    at offset ``start``.

    Parameters:
    - xml_string: the whole collection as one string
    - start: offset in xml_string from which to look for the next record

    Returns a tuple with two elements:
    - the record chunk (from '<record' up to and including '</record>'),
      or None when no further complete record can be found
    - the offset just after the returned record; when None is returned the
      offset is ``start`` unchanged

    When a record is opened but never closed, an error message is written
    to the standard error stream and (None, start) is returned.
    """
    closing_tag = '</record>'

    n_beg = xml_string.find('<record', start)
    if n_beg < 0:
        # no more records
        return None, start

    # Test the raw result of find() *before* adding the tag length: a "not
    # found" value of -1 would otherwise be turned into a positive offset and
    # could pass for a valid end position.
    n_close = xml_string.find(closing_tag, n_beg)
    if n_close < 0:
        sys.stderr.write(
            "Error XML baddly formated: record importation incomplete!\n")
        return None, start

    n_end = n_close + len(closing_tag)
    return xml_string[n_beg:n_end], n_end
def parse_controlfields(record):
    """
    Parse and return the controlfields of one XML record.

    Parameters:
    - record: the XML of a single record, as a text string

    Returns a list with one single-key dictionary per controlfield, in
    document order: ``{tag: {'controlfield': text}}``.

    Controlfields without a ``tag`` attribute or without any text content
    are skipped.
    """
    output = []
    xrec = minidom.parseString(record.encode("utf-8"))
    for controlfield in xrec.getElementsByTagName('controlfield'):
        # getAttribute() returns '' for a missing attribute instead of raising
        tag = controlfield.getAttribute('tag')
        if not tag:
            continue
        # Only text-like children carry the field value; other node kinds
        # (e.g. comments) are ignored.
        text = ''.join(
            child.data for child in controlfield.childNodes
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE))
        if text:
            output.append({tag: {'controlfield': text}})
    return output
def parse_datafields(record):
    """
    Parse and return the datafields of one XML record.

    Parameters:
    - record: the XML of a single record, as a text string

    Returns a list with one single-key dictionary per datafield, in document
    order: ``{tag + ind1 + ind2: {subfield_code: text, ...}}``.

    Notes:
    - datafields without a ``tag`` attribute are skipped; a missing ``ind1``
      or ``ind2`` attribute contributes an empty string to the key
    - subfields without a ``code`` attribute or without text are skipped
    - when a subfield code is repeated inside one datafield, the last
      occurrence wins (one value per code is kept)
    """
    output = []
    xrec = minidom.parseString(record.encode("utf-8"))
    for datafield in xrec.getElementsByTagName('datafield'):
        # getAttribute() returns '' for a missing attribute instead of raising
        tag = datafield.getAttribute('tag')
        if not tag:
            continue
        key = tag + datafield.getAttribute('ind1') + datafield.getAttribute('ind2')

        subfs = {}
        for subfield in datafield.getElementsByTagName('subfield'):
            code = subfield.getAttribute('code')
            if not code:
                continue
            # Only text-like children carry the subfield value.
            text = ''.join(
                child.data for child in subfield.childNodes
                if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE))
            if text:
                subfs[code] = text
        output.append({key: subfs})
    return output
# reading collection | |
def load_collection(collection):
    """
    Load all the fields of a collection.

    Parameters:
    - collection: path of the MARCXML collection file (read as UTF-8)

    Returns a list with one entry per record; each entry is the list of
    controlfields followed by the list of datafields of that record, in the
    form produced by parse_controlfields() and parse_datafields().
    """
    # the context manager closes the file even if reading fails
    with codecs.open(collection, "r", "utf-8") as handle:
        collection_xml = handle.read()

    records = []
    start = 0
    while True:
        # micro_dom() returns None as the record once all the records have
        # been extracted
        record_xml, start = micro_dom(collection_xml, start)
        if record_xml is None:
            break
        records.append(
            parse_controlfields(record_xml) + parse_datafields(record_xml))
    return records
############ | |
### MAIN ### | |
############ | |
# The collection is looked up in the "var" directory located in the parent of
# the current working directory.  os.path takes care of the separator on
# every platform, so no per-OS branch is needed.
APPLICATION_PATH = os.path.dirname(os.getcwd())  # this application root folder
varpath = os.path.join(APPLICATION_PATH, 'var') + os.sep
collection_file = varpath + sys.argv[1]
collection = load_collection(collection_file)

# records: global counters; fields: {field tag: {subfield code: [values]}}
records = {'count': 0, 'number_fields': []}
fields = {}
for record in collection:
    records['count'] += 1
    records['number_fields'].append(len(record))
    for field in record:
        # each field is a one-key dictionary: {field tag: {code: value}}
        for fieldtag in field:
            tag_values = fields.setdefault(fieldtag, {})
            for subfieldcode in field[fieldtag]:
                tag_values.setdefault(subfieldcode, []).append(
                    field[fieldtag][subfieldcode])

print('\nCollection descritipon')
print('----------------------\n')
print('Collection: ' + collection_file + '\n')
nbrecords = records['count']
print('Number of records: ' + str(nbrecords) + '\n')
# an empty collection must not end in a division by zero
if nbrecords:
    average_fields = int(float(sum(records['number_fields'])) / nbrecords)
else:
    average_fields = 0
print('Average number of fileds by record: ' + str(average_fields) + '\n')

# sfdata: {field tag + subfield code: [average length of the values]}
sfdata = {}
for fieldtag in fields:
    for subfield in fields[fieldtag]:
        values = fields[fieldtag][subfield]
        nkey = fieldtag + subfield
        sfdata.setdefault(nkey, []).append(len(''.join(values)) / len(values))

print('Presence and length of subfields\n')
# NOTE: the header still reads "Presence(%)", but the column holds the
# absolute number of occurrences of the subfield, not a percentage.
print('Field\tPresence(%)\tAverage length')
for fieldtag in fields:
    for subfield in fields[fieldtag]:
        nkey = fieldtag + subfield
        presence = len(fields[fieldtag][subfield])
        av_length = sfdata[nkey][0]
        print(nkey + '\t' + str(presence) + '\t' + str(av_length))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment