Skip to content

Instantly share code, notes, and snippets.

@kaplun
Created May 3, 2016 14:04
Show Gist options
  • Save kaplun/a0459171acfc917458c7c4b41f7dc554 to your computer and use it in GitHub Desktop.
Save kaplun/a0459171acfc917458c7c4b41f7dc554 to your computer and use it in GitHub Desktop.
Analysis tool to find the current MARC usage of an Invenio installation (to understand common values, formats and outliers)
#!/usr/bin/env python
import sys
from invenio.dbquery import run_sql
from invenio.search_engine import get_tag_name
from invenio.search_engine import get_collection_reclist
from invenio.intbitset import intbitset
collection = sys.argv[1]
recids = get_collection_reclist(collection)
for i in range(100):
for tag in run_sql("SELECT DISTINCT tag FROM bib%02dx ORDER BY tag" % i):
tag = tag[0]
sys.stderr.write("\r%s " % tag)
sys.stderr.flush()
tag_name = get_tag_name(tag) or get_tag_name(tag[:3]) or get_tag_name(tag[:3] + "__a")
new_tag = True
recids_with_value = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx JOIN bib%02dx ON id_bibxxx=id WHERE tag=%%s" % (i, i), (tag, )))
if not recids_with_value:
continue
distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]
print
msg = "%s (%s), (%s %s records with values) (%s distinct values in general)" % (tag, tag_name, len(recids_with_value), collection, distinct_values)
msg2 = str(recids_with_value)
print msg
print msg2
print "-" * max(len(msg), len(msg2))
distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]
if distinct_values > 1000:
# Too much populated query.
print "---- Example of values ----"
example_values = run_sql("SELECT value from bib%02dx WHERE tag=%%s LIMIT %%s" % i, (tag, 10))
for value in example_values:
print value[0]
continue
limit = 10
if distinct_values < 30:
limit = distinct_values
outliers = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c LIMIT %%s" % (i, i), (tag, limit))
print "---- Good values ----"
good_values = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c desc LIMIT %%s" % (i, i), (tag, 100))
for value, dummy_count, id in good_values:
matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
if matched_recids:
print "% 10d %s " % (len(matched_recids), value)
print "---- Outliers ----"
for value, dummy_count, id in outliers:
matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
if matched_recids:
print "%s (%s %s records): %s" % (value, len(matched_recids), collection, matched_recids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment