Skip to content

Instantly share code, notes, and snippets.

@no-reply
Last active December 14, 2015 07:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save no-reply/5050575 to your computer and use it in GitHub Desktop.
Save no-reply/5050575 to your computer and use it in GitHub Desktop.
Tool for moving DSpace Solr statistics to ElasticSearch.
import solr
import json
from pyes import ES
from progressbar import ProgressBar, Percentage, Bar
# Lookup table from the integer ``type`` code found on each Solr statistics
# hit to the name written into the exported document's ``type`` field.
# A name's position in the tuple is its integer code (0 through 7).
types = dict(enumerate((
    "BITSTREAM",
    "BUNDLE",
    "ITEM",
    "COLLECTION",
    "COMMUNITY",
    "SITE",
    "GROUP",
    "EPERSON",
)))
# ---------------------------------------------------------------------------
# Phase 1: dump every document of the Solr statistics core to JSON chunks.
#
# Output files (read back by the Elasticsearch phase further down):
#   elasticstats.1.json      -> the first 10000 hits
#   elasticstats.<N>.json    -> the hits that follow hit number N, where N is
#                               a multiple of 10000
# The last chunk is always written, even when it is an empty list.
#
# Each hit is rewritten in place before being stored:
#   time      -> string formatted as YYYY-MM-DDTHH:MM:SS.mmmZ
#   typeIndex -> the original integer ``type`` code
#   type      -> the name looked up in ``types``
#
# ``print('...')`` with one parenthesised argument prints the same text under
# both the Python 2 print statement and the Python 3 print function.
# ---------------------------------------------------------------------------
s = solr.SolrConnection('http://localhost/solr/statistics')  # aim at solr
response = s.query('*:*', rows=1000)
total_hits = int(response.results.numFound)
elasticjson = []
count = 0
chunk_path = 'elasticstats.1.json'
print('Processing Solr Index:')
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=total_hits).start()
while count < int(response.results.numFound):
    seen_before_batch = count
    for hit in response:
        # Millisecond precision: %f yields microseconds, so drop 3 digits.
        hit['time'] = hit['time'].strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + "Z"
        hit['typeIndex'] = hit['type']
        hit['type'] = types[hit['type']]
        elasticjson.append(hit)
        count += 1
        # Keep the value inside the bar's range even if the index grew after
        # the first query reported its size.
        pbar.update(min(count, total_hits))
        if (count % 10000) == 0:
            # Chunk is full: write it out and start the next one. The context
            # manager closes the file even if serialisation fails.
            with open(chunk_path, 'w') as of:
                of.write(json.dumps(elasticjson))
            elasticjson = []
            chunk_path = 'elasticstats.' + str(count) + '.json'
    if count == seen_before_batch:
        # The batch was empty although Solr still reports more hits (for
        # example the index shrank mid-run). Asking for further batches
        # would loop forever, so stop with what has been collected.
        break
    response = response.next_batch()
pbar.finish()
# Flush whatever is left (possibly nothing) so the final chunk file exists.
with open(chunk_path, 'w') as of:
    of.write(json.dumps(elasticjson))
# ---------------------------------------------------------------------------
# Phase 2: replay the JSON chunk files into Elasticsearch.
#
# Chunks are read in the order the export phase wrote them: first
# 'elasticstats.1.json', then 'elasticstats.<count>.json' where <count> is
# the number of documents indexed so far. Every document goes to index
# 'dspaceindex' with document type 'stats' through the pyes bulk interface.
#
# The next chunk is opened only while documents are still outstanding, so the
# run no longer fails looking for a file past the final chunk.
#
# ``print('...')`` with one parenthesised argument prints the same text under
# both the Python 2 print statement and the Python 3 print function.
# ---------------------------------------------------------------------------
conn = ES('localhost:9200')  # aim at elasticsearch
total_hits = int(response.results.numFound)
count = 0
print('Updating ElasticSearch:')
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=total_hits).start()
chunk_path = 'elasticstats.1.json'
while count < total_hits:
    with open(chunk_path) as chunk_file:
        inx = json.load(chunk_file)
    if not inx:
        # An empty chunk cannot advance ``count``; re-reading it would spin
        # forever, so treat it as the end of the exported data.
        break
    for document in inx:
        conn.index(document, 'dspaceindex', 'stats', bulk=True)
        count += 1
        # Keep the value inside the bar's range even if the chunk files hold
        # more documents than Solr last reported.
        pbar.update(min(count, total_hits))
    chunk_path = 'elasticstats.' + str(count) + '.json'
# bulk=True operations are queued client-side by pyes; send whatever is still
# buffered instead of relying on the connection's destructor.
conn.flush_bulk(forced=True)
pbar.finish()
print('Complete.')
print('Processed ' + str(count) + ' hits.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment