Skip to content

Instantly share code, notes, and snippets.

@no-reply
Last active Dec 14, 2015
Embed
What would you like to do?
Tool for moving DSpace Solr statistics to ElasticSearch.
import solr
import json
from pyes import ES
from progressbar import ProgressBar, Percentage, Bar
# Names for the numeric codes found in a Solr hit's 'type' field. The export
# loop further down uses this table to replace each code with its name.
# A code's position in the tuple is its numeric value (0 through 7).
types = dict(enumerate((
    "BITSTREAM",
    "BUNDLE",
    "ITEM",
    "COLLECTION",
    "COMMUNITY",
    "SITE",
    "GROUP",
    "EPERSON",
)))
# ---------------------------------------------------------------------------
# Phase 1: page through every document in the Solr statistics core and dump
# them to disk as JSON arrays of at most EXPORT_CHUNK_SIZE hits each.
#
# File naming (the load phase relies on it): the first chunk is
# 'elasticstats.1.json'; every later chunk is named after the number of hits
# exported before it ('elasticstats.10000.json', 'elasticstats.20000.json',
# ...).  A final file is always written, and holds an empty array when the
# total is an exact multiple of the chunk size.
# ---------------------------------------------------------------------------
EXPORT_CHUNK_SIZE = 10000  # hits per output file


def _write_chunk(path, docs):
    """Write *docs* to *path* as one JSON array, closing the file afterwards."""
    with open(path, 'w') as chunk_file:
        chunk_file.write(json.dumps(docs))


s = solr.SolrConnection('http://localhost/solr/statistics') # aim at solr
response = s.query('*:*', rows=1000)
elasticjson = []
count = 0
chunk_path = 'elasticstats.1.json'
print('Processing Solr Index:')
# For a capped trial run, substitute a fixed number for numFound both here
# and in the loop condition (the original author used 1000000).
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=int(response.results.numFound)).start()
while count < int(response.results.numFound):
    batch_was_empty = True
    for hit in response:
        batch_was_empty = False
        # Reformat the timestamp as an ISO-8601 style string: %f yields six
        # fractional digits, the slice trims them to milliseconds, and a
        # literal "Z" is appended.
        hit['time'] = hit['time'].strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + "Z"
        # Keep the numeric code under 'typeIndex' and store its name in 'type'.
        hit['typeIndex'] = hit['type']
        hit['type'] = types[hit['type']]
        elasticjson.append(hit)
        count += 1
        pbar.update(count)
        if count % EXPORT_CHUNK_SIZE == 0:
            _write_chunk(chunk_path, elasticjson)
            elasticjson = []
            chunk_path = 'elasticstats.' + str(count) + '.json'
    if batch_was_empty:
        # Solr handed back no rows although count is still below numFound;
        # asking for further batches would loop forever.
        break
    response = response.next_batch()
pbar.finish()
# Flush whatever is left (possibly nothing) into the last chunk file.
_write_chunk(chunk_path, elasticjson)
# ---------------------------------------------------------------------------
# Phase 2: read the chunk files written by the export phase and index every
# document into ElasticSearch (index 'dspaceindex', doc type 'stats').
#
# Chunk files are visited in the order they were written: first
# 'elasticstats.1.json', then 'elasticstats.<count>.json' where <count> is the
# number of documents indexed so far.
# ---------------------------------------------------------------------------
conn = ES('localhost:9200') #aim at elasticsearch
count = 0
print('Updating ElasticSearch:')
# `response` is the last Solr response left over from the export phase; its
# numFound is the number of documents expected across all chunk files.
total = int(response.results.numFound)
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=total).start()
chunk_path = 'elasticstats.1.json'
# Checking the total BEFORE opening a file matters: once the last chunk has
# been consumed there is no 'elasticstats.<total>.json' unless the total is an
# exact multiple of the export chunk size, so loading it unconditionally
# would raise IOError after all the work was already done.
while count < total:
    with open(chunk_path) as chunk_file:
        inx = json.load(chunk_file)
    if not inx:
        # An empty chunk leaves count unchanged, so the same file would be
        # re-read forever; stop instead.
        break
    for document in inx:
        conn.index(document, 'dspaceindex', 'stats', bulk=True)
        count += 1
        pbar.update(count)
    chunk_path = 'elasticstats.' + str(count) + '.json'
# bulk=True only queues documents client-side; push out whatever is still
# buffered rather than relying on it being sent implicitly at exit.
conn.force_bulk()
pbar.finish()
print('Complete.')
print('Processed ' + str(count) + ' hits.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment