The colldescr.py script of the MarcXimiL tool
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# | |
# This file is part of MarcXimiL. | |
# | |
# This script has been modified to return absolute number of fields | |
# and subfields, instead of percentages. | |
# | |
# MarcXimiL is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# MarcXimiL is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with MarcXimiL. If not, see <http://www.gnu.org/licenses/>.
# | |
# | |
# The copy of the GNU General Public License in MarcXimiL is located in : ./doc/gpl.txt | |
help = """ | |
This program fill perform a statistical analysis of a MARCXML collection. | |
These statistical informations may help understanding the composition of | |
the collection, and therefore help to setup the MarcXimiL framework efficiently. | |
The following informations will be returned to the strandard output: | |
- the number of records in the collection | |
- the average number of fields by record | |
- for each subfield: the percentage of presence throughout the collection, | |
and the average length of the subfield | |
SYNTAX: colldescr.py <collection_name.xml> | |
The collection will be looked-up in the ./var direcrory. | |
All the collections are stored there. | |
""" | |
import codecs | |
import sys | |
from xml.dom import minidom | |
import os | |
# Require exactly one command-line argument (the collection file name);
# otherwise print the usage text and stop before doing any work.
if not len(sys.argv) == 2:
    print(help)
    sys.exit()
# load records | |
def micro_dom( xml_string, start ):
    """
    Extract the next <record>...</record> chunk from an XML string.

    Parameters:
    - xml_string: the full MARCXML collection as one string
    - start: the start position in xml_string to extract the following record chunk

    Returns a tuple with two elements:
    - the xml record chunk (None when no further record exists, or on error)
    - the new position in xml_string (after the returned record)

    Note: this is much faster than using a parser (especially xml.dom)
    """
    # Locate the opening tag first and bail out immediately if there is
    # none: the original computed n_end before this check, searching from
    # a bogus offset of -1 when no record remained.
    n_beg = xml_string.find('<record', start)
    if n_beg < 0:
        return None, start
    # len('</record>') == 9: include the closing tag in the chunk.
    n_end = xml_string.find('</record>', n_beg) + 9
    if n_end < n_beg:
        # Closing tag missing. The original called an undefined
        # write_message() here, which raised NameError on malformed input;
        # report on stderr instead.
        sys.stderr.write("Error XML badly formatted: record importation incomplete!\n")
        return None, start
    return xml_string[n_beg : n_end], n_end
def parse_controlfields(record):
    """
    Return every <controlfield> of an XML record.

    Each field is returned as a {tag: {'controlfield': value}} dict;
    controlfields without a text child node are skipped.
    """
    result = []
    dom = minidom.parseString(record.encode("utf-8"))
    for node in dom.getElementsByTagName('controlfield'):
        field_tag = node.attributes['tag'].value
        children = node.childNodes
        # Only keep fields that actually carry a value.
        if field_tag is not None and len(children) > 0:
            result.append({field_tag: {'controlfield': children[0].data}})
    return result
def parse_datafields(record):
    """
    Return every <datafield> of an XML record.

    Each field is returned as a {tag+ind1+ind2: {code: value, ...}} dict:
    the key concatenates the tag with both indicator characters, and the
    inner dict maps subfield codes to their values (empty subfields are
    skipped).
    """
    dom = minidom.parseString(record.encode("utf-8"))
    parsed = []
    for field_node in dom.getElementsByTagName('datafield'):
        attrs = field_node.attributes
        tag = attrs['tag'].value
        indicators = attrs['ind1'].value + attrs['ind2'].value
        if tag is None:
            continue
        subfields = {}
        for sf_node in field_node.getElementsByTagName('subfield'):
            code = sf_node.attributes['code'].value
            children = sf_node.childNodes
            if len(children) > 0:
                subfields[code] = children[0].data
        parsed.append({tag + indicators: subfields})
    return parsed
# reading collection | |
def load_collection(collection):
    """
    Load and parse all records of a MARCXML collection file.

    Parameters:
    - collection: path of the MARCXML file (read as UTF-8)

    Returns a list of records; each record is a list of parsed field
    dicts as produced by parse_controlfields() and parse_datafields().
    """
    # Use a context manager so the handle is closed deterministically
    # (the original leaked the open file).
    with codecs.open(collection, "r", "utf-8") as handle:
        collection_xml = handle.read()

    records_xml = []
    start = 0
    while True:
        # micro_dom returns record=None when all records have been extracted
        record, start = micro_dom(collection_xml, start)
        if record is None:  # identity check, not '== None'
            break
        records_xml.append(record)

    records = []
    for record_xml in records_xml:
        record = parse_controlfields(record_xml) + parse_datafields(record_xml)
        records.append(record)
    return records
############
### MAIN ###
############

# Resolve the application root folder (parent of the current working
# directory) and the ./var directory where collections are stored.
if os.name in ['nt']: # windows is not posix
    APPLICATION_PATH = '\\'.join( os.getcwd().split('\\')[:-1] )
    varpath = APPLICATION_PATH + '\\var\\'
else:
    APPLICATION_PATH = '/'.join( os.getcwd().split('/')[:-1] ) # this application root folder
    varpath = APPLICATION_PATH + '/var/'
collection_file = varpath + sys.argv[1]
collection = load_collection(collection_file)

# Aggregate statistics over the whole collection:
# - records['count']: total number of records
# - records['number_fields']: number of fields of each record
# - fields[tag][code]: every value seen for that subfield, in order
records = {'count':0, 'number_fields':[]}
fields = {}
for record in collection:
    records['count'] += 1
    records['number_fields'].append( len(record) )
    for field in record:
        for fieldtag in field: # not a real loop, just one key = the field tag!
            if not fieldtag in fields.keys():
                fields[fieldtag] = {}
            for subfieldcode in field[fieldtag].keys():
                if not subfieldcode in fields[fieldtag]:
                    fields[fieldtag][subfieldcode] = []
                    fields[fieldtag][subfieldcode].append(field[fieldtag][subfieldcode])
                else:
                    fields[fieldtag][subfieldcode].append(field[fieldtag][subfieldcode])

print('\nCollection descritipon')
print('----------------------\n')
print('Collection: ' + collection_file + '\n')
nbrecords = records['count']
print('Number of records: ' + str(nbrecords)+ '\n')
print('Average number of fileds by record: ' + str(int(float(sum(records['number_fields']))/records['count']))+ '\n')

# sfdata[tag+code][0] is the average value length for that subfield.
# NOTE(review): the else-branch below appends a count rather than a length,
# but it looks unreachable, since each (tag, code) pair occurs exactly once
# in this double loop — confirm before relying on sfdata[nkey][1:].
sfdata = {}
for fieldtag in fields.keys():
    for subfield in fields[fieldtag].keys():
        nkey = fieldtag+subfield
        if not nkey in sfdata:
            sfdata[nkey] = []
            sfdata[nkey].append( len(''.join(fields[fieldtag][subfield])) / len(fields[fieldtag][subfield] ))
        else:
            sfdata[nkey].append(len(fields[fieldtag][subfield]))

#print('Detected fields and subfields: ')
#for fieldtag in fields.keys():
#    print(fieldtag + ' ' + ', '.join(fields[fieldtag].keys()))

# Report per-subfield statistics. Despite the "Presence(%)" column header,
# the value printed is an absolute occurrence count — see the modification
# note in the file header.
print('Presence and length of subfields\n')
print('Field\tPresence(%)\tAverage length')
for fieldtag in fields.keys():
    for subfield in fields[fieldtag].keys():
        nkey = fieldtag+subfield
        presence = int(len(fields[fieldtag][subfield]))
        av_length = sfdata[nkey][0]
        print(nkey + '\t' + str(presence) + '\t'+ str(av_length))
#print collection
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment