Create a gist now

Instantly share code, notes, and snippets.

#!/usr/bin/python
# code by peufeu
# http://stackoverflow.com/questions/1467898/what-language-could-i-use-for-fast-execution-of-this-database-summarization-task/1481186#1481186
import random, sys, time, heapq, psyco
ROWS = 27000000
def make_data( fname ):
f = open( fname, "w" )
r = random.Random()
for i in xrange( 0, ROWS, 10000 ):
for j in xrange( i,i+10000 ):
f.write( "%d %f %d\n" % (r.randint(0,100), r.uniform(0,1000), j))
print ("write: %d\r" % i),
sys.stdout.flush()
print
def read_data( fname ):
for n, line in enumerate( open( fname ) ):
r = line.strip().split()
yield int(r[0]),float(r[1]),r[2]
if not (n % 10000 ):
print ("read: %d\r" % n),
sys.stdout.flush()
print
def topn( ntop, data ):
ntop -= 1
assert ntop > 0
min_by_key = {}
top_by_key = {}
for key,value,label in data:
tup = (value,label)
if key not in top_by_key:
# initialize
top_by_key[key] = [ tup ]
else:
top = top_by_key[ key ]
l = len( top )
if l > ntop:
# replace minimum value in top if it is lower than current value
idx = min_by_key[ key ]
if top[idx] < tup:
top[idx] = tup
min_by_key[ key ] = top.index( min( top ) )
elif l < ntop:
# fill until we have ntop entries
top.append( tup )
else:
# we have ntop entries in list, we'll have ntop+1
top.append( tup )
# initialize minimum to keep
min_by_key[ key ] = top.index( min( top ) )
# finalize:
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() )
def grouptopn( ntop, data ):
top_by_key = {}
for key,value,label in data:
if key in top_by_key:
top_by_key[ key ].append( (value,label) )
else:
top_by_key[ key ] = [ (value,label) ]
return dict( (key, sorted( values, reverse=True )[:ntop]) for key,values in top_by_key.iteritems() )
def heaptopn( ntop, data ):
top_by_key = {}
for key,value,label in data:
tup = (value,label)
if key not in top_by_key:
top_by_key[ key ] = [ tup ]
else:
top = top_by_key[ key ]
if len(top) < ntop:
heapq.heappush(top, tup)
else:
if top[0] < tup:
heapq.heapreplace(top, tup)
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() )
def dummy( data ):
for row in data:
pass
#make_data( "data.txt" )
# psyco.full()
# t = time.clock()
# dummy( read_data( "data.txt" ) )
# t_read = time.clock() - t
# t = time.clock()
# top_result = topn( 5, read_data( "data.txt" ) )
# t_topn = time.clock() - t
# t = time.clock()
# htop_result = heaptopn( 5, read_data( "data.txt" ) )
# t_htopn = time.clock() - t
# # correctness checking :
# for key in top_result:
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in top_result[key])
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in htop_result[key])
# print
# print "Read data :", t_read
# print "TopN : ", t_topn - t_read
# print "HeapTopN : ", t_htopn - t_read
# for key in top_result:
# assert top_result[key] == htop_result[key]
read_silently = lambda infile: ((int(aa), float(bb), cc) for aa, bb, cc in
(line.split() for line in infile))
if __name__ == '__main__':
psyco.full()
top_result = topn(5, read_silently(sys.stdin))
for aa in top_result:
for bb, cc in top_result[aa]:
print aa, bb, cc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment