Skip to content

Instantly share code, notes, and snippets.

@alaniwi
Created October 23, 2017 11:33
Show Gist options
  • Save alaniwi/9f26da4f03bf940a15b85503b5f5747c to your computer and use it in GitHub Desktop.
Save alaniwi/9f26da4f03bf940a15b85503b5f5747c to your computer and use it in GitHub Desktop.
script to find Solr file docs with invalid timestamp
#!/usr/bin/env python
# Identifies the Solr file docs for which the timestamp has invalid format.
# before running this, ensure:
#
# - pysolr is available for import
#
# - solr has enough memory, e.g. in
# /usr/local/solr-home/master-8984/solr.in.sh
# set
# SOLR_HEAP="4g"
import pysolr
import re
import sys
def yield_docs(conn, **search_args):
    """Yield every document matching ``*:*``, fetched page by page via a cursor.

    Each request passes ``cursorMark`` ("*" for the first page, then the
    ``nextCursorMark`` of the previous response) and iteration stops at the
    first response whose ``docs`` is empty.  A running count of the documents
    fetched so far is printed to stdout after every page.

    Args:
        conn: object with a ``search(query, **params)`` method whose result
            exposes ``docs`` and ``nextCursorMark`` (main() passes a
            ``pysolr.Solr`` connection).
        **search_args: extra search parameters; they override the defaults
            ``sort='id asc'`` and ``rows=10000``.

    Yields:
        The elements of each response's ``docs``, in the order returned.
    """
    query = "*:*"
    # NOTE(review): cursor paging presumably relies on a stable sort order,
    # hence the default sort on id -- confirm before overriding 'sort'.
    params = {'sort': 'id asc', 'rows': 10000}
    params.update(search_args)
    cursor_mark = "*"
    fetched = 0
    while True:
        res = conn.search(query, cursorMark=cursor_mark, **params)
        if not res.docs:
            break
        fetched += len(res.docs)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(">> %d" % fetched)
        for doc in res.docs:
            yield doc
        cursor_mark = res.nextCursorMark
# Accepted layout: YYYY-MM-DDTHH:MM:SSZ (whole seconds, literal "Z" suffix).
# \Z (rather than $) anchors at the very end of the string, so a value with a
# trailing newline is not mistaken for a well-formed timestamp.
_re = r"[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z\Z"
# .match() only matches at the start of the string, so no leading anchor is needed.
_matcher = re.compile(_re).match


def is_timestamp(ts):
    """Return True if *ts* has the layout ``YYYY-MM-DDTHH:MM:SSZ``, else False.

    Only the digit/separator layout is checked; field values are not
    range-checked (e.g. month 99 passes).  *ts* must be a string -- the
    underlying regex match raises TypeError for other types.
    """
    return _matcher(ts) is not None
def dump_dict(d):
    """Print the items of *d* to stdout, one ``key => value`` line per key.

    Keys are printed in sorted order and the listing is followed by one
    empty line.

    Args:
        d: mapping whose keys can be sorted.
    """
    # sorted() works on both a Python 2 key list and a Python 3 dict view,
    # whereas d.keys().sort() only works on Python 2.
    for key in sorted(d):
        print(" %s => %s" % (key, d[key]))
    # print("") emits a bare newline on both Python 2 and 3.
    print("")
def main(urlstem):
    """Scan the ``files`` core under *urlstem* and report bad timestamps.

    For every document whose ``timestamp`` field is missing or fails
    is_timestamp(), the document id is printed, followed by a dump of the
    fetched fields.

    Args:
        urlstem: base Solr URL; "/files" is appended to address the core.
    """
    conn = pysolr.Solr(urlstem + "/files")
    for doc in yield_docs(conn, fl=["id", "timestamp"]):
        timestamp = doc.get("timestamp")
        # A doc without the field is reported as invalid instead of
        # aborting the whole scan with a KeyError.
        if timestamp is None or not is_timestamp(timestamp):
            print(doc["id"])
            dump_dict(doc)
if __name__ == '__main__':
    # Each command-line argument is a Solr base URL to scan; with no
    # arguments, fall back to the local instance on port 8984.
    urlstems = sys.argv[1:] or ["http://localhost:8984/solr"]
    for urlstem in urlstems:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Scanning %s" % urlstem)
        main(urlstem)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment