Created
October 23, 2017 11:33
-
-
Save alaniwi/9f26da4f03bf940a15b85503b5f5747c to your computer and use it in GitHub Desktop.
script to find Solr file docs with invalid timestamp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Identifies the Solr file docs whose timestamp has an invalid format.
# Before running this, ensure:
#
# - pysolr is available for import
#
# - Solr has enough memory, e.g. in
#   /usr/local/solr-home/master-8984/solr.in.sh
#   set
#   SOLR_HEAP="4g"
import re
import sys

import pysolr
def yield_docs(conn, **search_args):
    """Yield every document matching ``*:*``, one page at a time.

    Pages are fetched with a cursor: the first request sends
    ``cursorMark="*"`` and each later request sends the
    ``nextCursorMark`` of the previous result.  Iteration stops at the
    first page that contains no documents.

    Args:
        conn: Object exposing ``search(query, **params)`` whose result
            has ``docs`` and ``nextCursorMark`` attributes.
        **search_args: Extra search parameters; they are merged over the
            defaults ``sort='id asc'`` and ``rows=10000``.

    Yields:
        Each document of each page, in the order returned.

    Side effects:
        Prints ``>> <running total>`` after each non-empty page.
    """
    kwargs = {'sort': 'id asc', 'rows': 10000}
    kwargs.update(search_args)
    cursor = "*"  # cursor mark sent with the first request
    n = 0
    while True:
        res = conn.search("*:*", cursorMark=cursor, **kwargs)
        if not res.docs:
            # An empty page means there is nothing left to fetch.
            break
        n += len(res.docs)
        # Single-argument print call: same output on Python 2 and 3.
        print(">> %s" % n)
        for doc in res.docs:
            yield doc
        cursor = res.nextCursorMark
# Pattern for a timestamp of the form YYYY-MM-DDThh:mm:ssZ.  It is anchored
# with \Z rather than $, because $ also matches just before a trailing
# newline and would let "…Z\n" through as valid.
_re = r"[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z\Z"
_matcher = re.compile(_re).match


def is_timestamp(ts):
    """Return True if *ts* is exactly of the form ``YYYY-MM-DDThh:mm:ssZ``.

    Only the digit layout is checked, not whether the values form a real
    calendar date or time.  A value that the regex cannot be applied to
    (for example ``None`` or a list) is reported as not a timestamp
    instead of raising ``TypeError``.
    """
    try:
        return _matcher(ts) is not None
    except TypeError:
        # The value is not a string the pattern can be matched against.
        return False
def dump_dict(d):
    """Print each entry of *d* as `` key => value``, sorted by key.

    ``sorted(d)`` is used instead of sorting ``d.keys()`` in place,
    because on Python 3 ``keys()`` returns a view with no ``sort`` method.
    """
    for k in sorted(d):
        # Single-argument print call: same output on Python 2 and 3.
        print(" %s => %s" % (k, d[k]))
def main(urlstem):
    """Report every doc in the ``files`` core whose timestamp is invalid.

    Connects to ``<urlstem>/files``, requests the ``id`` and ``timestamp``
    fields of every document, and for each document that fails
    ``is_timestamp`` prints its id followed by a dump of the document.

    A document with no ``timestamp`` field is reported as invalid too,
    rather than aborting the whole scan with a ``KeyError``.

    Args:
        urlstem: Base Solr URL, e.g. ``http://localhost:8984/solr``.  A
            trailing slash is tolerated.
    """
    # Strip a trailing slash so the core URL never contains "//files".
    conn = pysolr.Solr(urlstem.rstrip("/") + "/files")
    for doc in yield_docs(conn, fl=["id", "timestamp"]):
        timestamp = doc.get("timestamp")
        if timestamp is None or not is_timestamp(timestamp):
            # Single-argument print call: same output on Python 2 and 3.
            print(doc["id"])
            dump_dict(doc)
if __name__ == '__main__':
    # Scan each Solr URL stem given on the command line; with no
    # arguments, fall back to the default local instance.
    urlstems = sys.argv[1:] or ["http://localhost:8984/solr"]
    for urlstem in urlstems:
        # Single-argument print call: same output on Python 2 and 3.
        print("Scanning %s" % urlstem)
        main(urlstem)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment