Created
May 3, 2016 14:04
-
-
Save kaplun/a0459171acfc917458c7c4b41f7dc554 to your computer and use it in GitHub Desktop.
Analysis tool to find the current MARC usage of an Invenio installation (to understand common values, formats and outliers)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
from invenio.dbquery import run_sql | |
from invenio.search_engine import get_tag_name | |
from invenio.search_engine import get_collection_reclist | |
from invenio.intbitset import intbitset | |
collection = sys.argv[1] | |
recids = get_collection_reclist(collection) | |
for i in range(100): | |
for tag in run_sql("SELECT DISTINCT tag FROM bib%02dx ORDER BY tag" % i): | |
tag = tag[0] | |
sys.stderr.write("\r%s " % tag) | |
sys.stderr.flush() | |
tag_name = get_tag_name(tag) or get_tag_name(tag[:3]) or get_tag_name(tag[:3] + "__a") | |
new_tag = True | |
recids_with_value = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx JOIN bib%02dx ON id_bibxxx=id WHERE tag=%%s" % (i, i), (tag, ))) | |
if not recids_with_value: | |
continue | |
distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0] | |
msg = "%s (%s), (%s %s records with values) (%s distinct values in general)" % (tag, tag_name, len(recids_with_value), collection, distinct_values) | |
msg2 = str(recids_with_value) | |
print msg | |
print msg2 | |
print "-" * max(len(msg), len(msg2)) | |
distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0] | |
if distinct_values > 1000: | |
# Too much populated query. | |
print "---- Example of values ----" | |
example_values = run_sql("SELECT value from bib%02dx WHERE tag=%%s LIMIT %%s" % i, (tag, 10)) | |
for value in example_values: | |
print value[0] | |
continue | |
limit = 10 | |
if distinct_values < 30: | |
limit = distinct_values | |
outliers = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c LIMIT %%s" % (i, i), (tag, limit)) | |
print "---- Good values ----" | |
good_values = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c desc LIMIT %%s" % (i, i), (tag, 100)) | |
for value, dummy_count, id in good_values: | |
matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, ))) | |
if matched_recids: | |
print "% 10d %s " % (len(matched_recids), value) | |
print "---- Outliers ----" | |
for value, dummy_count, id in outliers: | |
matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, ))) | |
if matched_recids: | |
print "%s (%s %s records): %s" % (value, len(matched_recids), collection, matched_recids) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment