Skip to content

Instantly share code, notes, and snippets.

@T0aD
Created September 11, 2015 16:04
Show Gist options
  • Save T0aD/e8343568eae916c3e6e6 to your computer and use it in GitHub Desktop.
Save T0aD/e8343568eae916c3e6e6 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
# Reads a file of JSON records (one per line), drops records whose 'id'
# was already seen, and writes the survivors to "<filename>.clean".

# conf. for ES exports:
# When True, the 'es_id' of each dropped duplicate is collected and pickled
# to "<filename>.dup", and 'es_id' is removed from the records that are kept.
ADD_ES_ID = False

import sys
import json
import time

# The input file is the first (and only) command-line argument.
try:
    filename = sys.argv[1]
except IndexError:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Syntax: %s [filename.json]" % sys.argv[0])
    sys.exit(1)
def nice_progress(done, total, start, extra_data=""):
    """Print a one-line progress report with throughput and ETA.

    Args:
        done: number of items processed so far.
        total: total number of items to process.
        start: ``time.time()`` timestamp taken when processing began.
        extra_data: free-form text appended after the report.

    The ETA is shown as ``--:--:--`` when the measured speed rounds down
    to zero items per second (nothing to extrapolate from), and the
    percentage is reported as 0 when ``total`` is 0.
    """
    elapsed = time.time() - start
    # Guard against a zero/negative elapsed time (call right at start,
    # or a clock adjustment) instead of dividing by it.
    speed = int(done / elapsed) if elapsed > 0 else 0
    rest = total - done
    percent = (100 * done) // total if total else 0

    if speed > 0:
        # Floor division keeps the arithmetic integral on Python 2 and 3.
        eta = rest // speed
        hours, remainder = divmod(eta, 3600)
        minutes, secs = divmod(remainder, 60)
        eta_text = '%02d:%02d:%02d' % (hours, minutes, secs)
    else:
        eta_text = '--:--:--'

    report = ' %02d %% - %7d left to do: %7d speed: %5d m/s ETA: %s' % (
        percent, done, rest, speed, eta_text)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('%s %s' % (report, extra_data))
# Stream the input once, keep the first record seen for every 'id' and
# write it to "<filename>.clean"; later records with the same 'id' are
# counted as duplicates and dropped.
ids = set()
duplicates = 0
dups = []  # 'es_id' of every dropped duplicate (only filled when ADD_ES_ID is set)
count = 0

# Count the lines up front so nice_progress can report a percentage and ETA.
with open(filename) as counting:
    total = sum(1 for _ in counting)

start = time.time()
with open(filename) as fd, open("%s.clean" % filename, 'w') as fdclean:
    for line in fd:
        count += 1
        if not count % 3000:
            nice_progress(count, total, start, 'duplicates: %6d' % duplicates)
        if not line.strip():
            # A blank line (e.g. trailing newline at EOF) is not a record.
            continue
        data = json.loads(line)
        record_id = data['id']
        if record_id in ids:
            duplicates += 1
            if ADD_ES_ID:
                dups.append(data['es_id'])
            continue
        ids.add(record_id)
        if ADD_ES_ID:
            del data['es_id']
        fdclean.write(json.dumps(data) + "\n")

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('found %d duplicates' % duplicates)

if ADD_ES_ID:
    # Persist the es_ids of the dropped duplicates next to the input file.
    import pickle
    with open(filename + ".dup", 'wb') as dup_file:
        pickle.dump(dups, dup_file, pickle.HIGHEST_PROTOCOL)
    print('duplicates saved in %s.dup' % filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment