Last active
December 14, 2015 07:29
-
-
Save no-reply/5050575 to your computer and use it in GitHub Desktop.
Tool for moving DSpace Solr statistics to ElasticSearch.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os

import solr
from progressbar import ProgressBar, Percentage, Bar
from pyes import ES
# Lookup from the numeric `type` value stored on each Solr hit to the
# human-readable name written into the exported document's `type` field
# (the original number is kept alongside it as `typeIndex`).
# NOTE(review): presumably these mirror DSpace's object-type constants --
# confirm against the DSpace version that produced the statistics core.
types = {
    0: "BITSTREAM",
    1: "BUNDLE",
    2: "ITEM",
    3: "COLLECTION",
    4: "COMMUNITY",
    5: "SITE",
    6: "GROUP",
    7: "EPERSON",
}
# --- Phase 1: dump the Solr statistics core into JSON chunk files ----------
#
# Every hit is rewritten in place (ISO-8601 `time` string with millisecond
# precision, numeric type kept as `typeIndex`, `type` replaced by its name)
# and collected in `elasticjson`.  After every 10000 hits the collected
# documents are written out as one JSON array and a new chunk is started.
#
# Chunk naming: the first chunk is 'elasticstats.1.json'; each later chunk is
# 'elasticstats.<hits processed so far>.json'.  The final chunk is always
# written, even when it is an empty list.
#
# For a capped test run, replace the `numFound` limit (progress bar and loop
# condition) with a fixed number such as 1000000.


def _write_chunk(path, documents):
    """Write *documents* to *path* as a single JSON array.

    The file is opened in a ``with`` block so the handle is closed even if
    serialization fails part-way through.
    """
    with open(path, 'w') as chunk_file:
        chunk_file.write(json.dumps(documents))


s = solr.SolrConnection('http://localhost/solr/statistics')  # aim at solr
response = s.query('*:*', rows=1000)
elasticjson = []
count = 0
chunk_path = 'elasticstats.1.json'
print('Processing Solr Index:')
pbar = ProgressBar(widgets=[Percentage(), Bar()],
                   maxval=int(response.results.numFound)).start()
while count < int(response.results.numFound):
    count_before_batch = count
    for hit in response:
        # Trim strftime's microseconds down to milliseconds and mark as UTC.
        hit['time'] = hit['time'].strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + "Z"
        hit['typeIndex'] = hit['type']
        hit['type'] = types[hit['type']]
        elasticjson.append(hit)
        count += 1
        pbar.update(count)
        if (count % 10000) == 0:
            _write_chunk(chunk_path, elasticjson)
            elasticjson = []
            chunk_path = 'elasticstats.' + str(count) + '.json'
    if count == count_before_batch:
        # Solr returned an empty batch although numFound has not been
        # reached; stop instead of requesting empty batches forever.
        break
    response = response.next_batch()
pbar.finish()
# Write whatever is left over (possibly an empty list) as the last chunk.
_write_chunk(chunk_path, elasticjson)
# --- Phase 2: replay the JSON chunk files into ElasticSearch ---------------
#
# Chunks are read in the order they were written: 'elasticstats.1.json'
# first, then 'elasticstats.<documents loaded so far>.json'.  Loading stops
# once `numFound` documents have been indexed, or earlier if the next chunk
# is missing or empty, so the loop can neither crash on a non-existent file
# after the last chunk nor spin forever on an empty one.
#
# For a capped test run, replace `total_hits` with a fixed number such as
# 1000000.
conn = ES('localhost:9200')  # aim at elasticsearch
total_hits = int(response.results.numFound)
count = 0
chunk_path = 'elasticstats.1.json'
print('Updating ElasticSearch:')
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=total_hits).start()
while count < total_hits and os.path.exists(chunk_path):
    with open(chunk_path) as chunk_file:
        inx = json.load(chunk_file)
    if not inx:
        # An empty chunk would leave `count` (and so the next file name)
        # unchanged; bail out rather than reload it endlessly.
        break
    for document in inx:
        conn.index(document, 'dspaceindex', 'stats', bulk=True)
        count += 1
        pbar.update(count)
    chunk_path = 'elasticstats.' + str(count) + '.json'
# Documents indexed with bulk=True are buffered by the client; push out any
# that are still queued before reporting completion.
conn.flush_bulk(forced=True)
pbar.finish()
print('Complete.')
print('Processed ' + str(count) + ' hits.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment