Skip to content

Instantly share code, notes, and snippets.

@kaplun
Created May 3, 2016 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save kaplun/25ddde561519cfd8c3fff6ecf5dab4e5 to your computer and use it in GitHub Desktop.
Save kaplun/25ddde561519cfd8c3fff6ecf5dab4e5 to your computer and use it in GitHub Desktop.
Analysis tool to get statistics on repeatable fields and subfields of an Invenio instance
#!/usr/bin/env python
import sys

from click import progressbar
from invenio.intbitset import intbitset
from invenio.search_engine import get_collection_reclist, get_record
def find_repeats(record):
    """Return the tags and subfields that are repeated inside one record.

    ``record`` is indexed the same way this script indexes the value returned
    by ``get_record``: a mapping from tag to a list of fields, where
    ``field[0]`` is a sequence of ``(code, value)`` pairs and ``field[1]`` /
    ``field[2]`` are the two indicators.  A falsy indicator is rendered as a
    single blank.

    Returns a pair ``(repeated_tags, repeated_subfields)`` of sets of strings:

    * ``repeated_tags`` holds every ``tag + ind1 + ind2`` key carried by more
      than one field of the record;
    * ``repeated_subfields`` holds every ``tag + ind1 + ind2 + code`` key
      whose subfield code occurs more than once within a single field.
    """
    seen_tags = set()
    repeated_tags = set()
    repeated_subfields = set()
    for tag in record:
        for field in record[tag]:
            full_tag = tag + (field[1] or ' ') + (field[2] or ' ')
            if full_tag in seen_tags:
                repeated_tags.add(full_tag)
            seen_tags.add(full_tag)
            # Subfield repetition is tracked per field, so start afresh.
            seen_codes = set()
            for code, _value in field[0]:
                if code in seen_codes:
                    repeated_subfields.add(full_tag + code)
                seen_codes.add(code)
    return repeated_tags, repeated_subfields


def format_sample(recids):
    """Return the last three entries of *recids* as a ", "-separated string.

    The entries are converted with ``str`` first: record ids are integers
    and ``str.join`` refuses anything but strings.
    """
    return ", ".join(str(recid) for recid in recids[-3:])


def print_report(repeats):
    """Print one ``key -> sample of record ids`` line per entry of *repeats*.

    Keys are printed in sorted order so that two runs over the same data
    produce the same output.
    """
    for key in sorted(repeats):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s -> %s" % (key, format_sample(repeats[key])))


def main():
    """Scan the collection named on the command line and report repeats.

    For every record of the collection, the repeated tags and subfields found
    by ``find_repeats`` are accumulated into one ``intbitset`` of record ids
    per key; both tables are printed once the scan is over.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: %s COLLECTION" % sys.argv[0])
    collection = sys.argv[1]

    recids = list(get_collection_reclist(collection))
    # The records are walked in the reverse of the order returned above
    # (presumably to visit the most recent records first -- TODO confirm).
    recids.reverse()

    repeatable_tags = {}
    repeatable_subfields = {}
    with progressbar(recids) as bar:
        for recid in bar:
            tags, subfields = find_repeats(get_record(recid))
            for tag in tags:
                repeatable_tags.setdefault(tag, intbitset()).add(recid)
            for subfield in subfields:
                repeatable_subfields.setdefault(subfield, intbitset()).add(recid)

    print_report(repeatable_tags)
    print_report(repeatable_subfields)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment