Created
May 3, 2016 14:03
-
-
Save kaplun/25ddde561519cfd8c3fff6ecf5dab4e5 to your computer and use it in GitHub Desktop.
Analysis tool to get statistics on repeatable fields and subfields of an Invenio instance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import sys

from click import progressbar

from invenio.intbitset import intbitset
from invenio.search_engine import get_collection_reclist, get_record
# Report which fields and subfields occur more than once in the records of
# one collection.  Usage: pass the collection name as the only argument.
if len(sys.argv) < 2:
    sys.exit("Usage: %s COLLECTION" % sys.argv[0])
collection = sys.argv[1]

# Process the record IDs in the reverse of the order in which
# get_collection_reclist() yields them.
recids = list(get_collection_reclist(collection))
recids.reverse()

# "tag + field[1] + field[2]" -> intbitset of record IDs in which that
# field appears more than once within a single record.
repeatable_tags = {}
# "tag + field[1] + field[2] + code" -> intbitset of record IDs in which
# that subfield code appears more than once within a single field.
repeatable_subfields = {}

with progressbar(recids) as bar:
    for recid in bar:
        record = get_record(recid)
        # Field keys already seen in *this* record.
        records_tags = set()
        for tag in record:
            for field in record[tag]:
                # field[1] / field[2] are presumably the MARC indicators;
                # an empty one is normalised to a single space so every
                # key has the same width.
                current_tag = tag + (field[1] or ' ') + (field[2] or ' ')
                if current_tag in records_tags:
                    # Second (or later) occurrence in this record.
                    if current_tag not in repeatable_tags:
                        repeatable_tags[current_tag] = intbitset([recid])
                    else:
                        repeatable_tags[current_tag].add(recid)
                records_tags.add(current_tag)

                # Subfield codes already seen in *this* field instance.
                current_codes = set()
                for code, _value in field[0]:
                    if code in current_codes:
                        current_code = current_tag + code
                        if current_code not in repeatable_subfields:
                            repeatable_subfields[current_code] = intbitset([recid])
                        else:
                            repeatable_subfields[current_code].add(recid)
                    current_codes.add(code)

# For each repeatable key print up to three sample record IDs (the last
# three of the set).  The IDs are integers, so they must be converted to
# strings before joining.  Keys are sorted to make the output stable.
for key, value in sorted(repeatable_tags.items()):
    print("%s -> %s" % (key, ", ".join(str(recid) for recid in value[-3:])))
for key, value in sorted(repeatable_subfields.items()):
    print("%s -> %s" % (key, ", ".join(str(recid) for recid in value[-3:])))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment