kaplun/marc_status.py

## marc_status.py
#!/usr/bin/env python

import sys

from invenio.dbquery import run_sql
from invenio.search_engine import get_tag_name
from invenio.search_engine import get_collection_reclist
from invenio.intbitset import intbitset

collection = sys.argv[1]

recids = get_collection_reclist(collection)

for i in range(100):
    for tag in run_sql("SELECT DISTINCT tag FROM bib%02dx ORDER BY tag" % i):
        tag = tag[0]
        sys.stderr.write("\r%s   " % tag)
        sys.stderr.flush()
        tag_name = get_tag_name(tag) or get_tag_name(tag[:3]) or get_tag_name(tag[:3] + "__a")
        new_tag = True
        recids_with_value = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx JOIN bib%02dx ON id_bibxxx=id WHERE tag=%%s" % (i, i), (tag, )))
        if not recids_with_value:
            continue
        distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]
        print
        msg =  "%s (%s), (%s %s records with values) (%s distinct values in general)" % (tag, tag_name, len(recids_with_value), collection, distinct_values)
        msg2 = str(recids_with_value)
        print msg
        print msg2
        print "-" * max(len(msg), len(msg2))

        distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]

        if distinct_values > 1000:
            # Too much populated query.
            print "---- Example of values ----"
            example_values = run_sql("SELECT value from bib%02dx WHERE tag=%%s LIMIT %%s" % i, (tag, 10))
            for value in example_values:
                print value[0]
            continue
        limit = 10
        if distinct_values < 30:
            limit = distinct_values
        outliers = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c LIMIT %%s" % (i, i), (tag, limit))

        print "---- Good values ----"
        good_values = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c desc LIMIT %%s" % (i, i), (tag, 100))
        for value, dummy_count, id in good_values:
            matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
            if matched_recids:
                print "% 10d %s " % (len(matched_recids), value)

        print "---- Outliers ----"
        for value, dummy_count, id in outliers:
            matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
            if matched_recids:
                print "%s (%s %s records): %s" % (value, len(matched_recids), collection, matched_recids)
	#!/usr/bin/env python

	import sys

	from invenio.dbquery import run_sql
	from invenio.search_engine import get_tag_name
	from invenio.search_engine import get_collection_reclist
	from invenio.intbitset import intbitset

	collection = sys.argv[1]

	recids = get_collection_reclist(collection)

	for i in range(100):
	for tag in run_sql("SELECT DISTINCT tag FROM bib%02dx ORDER BY tag" % i):
	tag = tag[0]
	sys.stderr.write("\r%s " % tag)
	sys.stderr.flush()
	tag_name = get_tag_name(tag) or get_tag_name(tag[:3]) or get_tag_name(tag[:3] + "__a")
	new_tag = True
	recids_with_value = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx JOIN bib%02dx ON id_bibxxx=id WHERE tag=%%s" % (i, i), (tag, )))
	if not recids_with_value:
	continue
	distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]
	print
	msg = "%s (%s), (%s %s records with values) (%s distinct values in general)" % (tag, tag_name, len(recids_with_value), collection, distinct_values)
	msg2 = str(recids_with_value)
	print msg
	print msg2
	print "-" * max(len(msg), len(msg2))

	distinct_values = run_sql("SELECT COUNT(1) FROM bib%02dx WHERE tag=%%s" % i, (tag, ))[0][0]

	if distinct_values > 1000:
	# Too much populated query.
	print "---- Example of values ----"
	example_values = run_sql("SELECT value from bib%02dx WHERE tag=%%s LIMIT %%s" % i, (tag, 10))
	for value in example_values:
	print value[0]
	continue
	limit = 10
	if distinct_values < 30:
	limit = distinct_values
	outliers = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c LIMIT %%s" % (i, i), (tag, limit))

	print "---- Good values ----"
	good_values = run_sql("SELECT value, count(*) AS c, id FROM bibrec_bib%02dx join bib%02dx ON id_bibxxx=id WHERE tag=%%s GROUP BY id ORDER BY c desc LIMIT %%s" % (i, i), (tag, 100))
	for value, dummy_count, id in good_values:
	matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
	if matched_recids:
	print "% 10d %s " % (len(matched_recids), value)

	print "---- Outliers ----"
	for value, dummy_count, id in outliers:
	matched_recids = recids & intbitset(run_sql("SELECT id_bibrec from bibrec_bib%02dx WHERE id_bibxxx=%%s" % i, (id, )))
	if matched_recids:
	print "%s (%s %s records): %s" % (value, len(matched_recids), collection, matched_recids)