public
Last active

  • Download Gist
top5_peufeu.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
#!/usr/bin/python
# code by peufeu
# http://stackoverflow.com/questions/1467898/what-language-could-i-use-for-fast-execution-of-this-database-summarization-task/1481186#1481186
import random, sys, time, heapq, psyco
 
ROWS = 27000000
 
def make_data( fname ):
f = open( fname, "w" )
r = random.Random()
for i in xrange( 0, ROWS, 10000 ):
for j in xrange( i,i+10000 ):
f.write( "%d %f %d\n" % (r.randint(0,100), r.uniform(0,1000), j))
print ("write: %d\r" % i),
sys.stdout.flush()
print
 
def read_data( fname ):
for n, line in enumerate( open( fname ) ):
r = line.strip().split()
yield int(r[0]),float(r[1]),r[2]
if not (n % 10000 ):
print ("read: %d\r" % n),
sys.stdout.flush()
print
 
def topn( ntop, data ):
ntop -= 1
assert ntop > 0
min_by_key = {}
top_by_key = {}
for key,value,label in data:
tup = (value,label)
if key not in top_by_key:
# initialize
top_by_key[key] = [ tup ]
else:
top = top_by_key[ key ]
l = len( top )
if l > ntop:
# replace minimum value in top if it is lower than current value
idx = min_by_key[ key ]
if top[idx] < tup:
top[idx] = tup
min_by_key[ key ] = top.index( min( top ) )
elif l < ntop:
# fill until we have ntop entries
top.append( tup )
else:
# we have ntop entries in list, we'll have ntop+1
top.append( tup )
# initialize minimum to keep
min_by_key[ key ] = top.index( min( top ) )
 
# finalize:
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() )
 
def grouptopn( ntop, data ):
top_by_key = {}
for key,value,label in data:
if key in top_by_key:
top_by_key[ key ].append( (value,label) )
else:
top_by_key[ key ] = [ (value,label) ]
 
return dict( (key, sorted( values, reverse=True )[:ntop]) for key,values in top_by_key.iteritems() )
 
def heaptopn( ntop, data ):
top_by_key = {}
for key,value,label in data:
tup = (value,label)
if key not in top_by_key:
top_by_key[ key ] = [ tup ]
else:
top = top_by_key[ key ]
if len(top) < ntop:
heapq.heappush(top, tup)
else:
if top[0] < tup:
heapq.heapreplace(top, tup)
 
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() )
 
def dummy( data ):
for row in data:
pass
 
#make_data( "data.txt" )
 
# psyco.full()
 
# t = time.clock()
# dummy( read_data( "data.txt" ) )
# t_read = time.clock() - t
 
# t = time.clock()
# top_result = topn( 5, read_data( "data.txt" ) )
# t_topn = time.clock() - t
 
# t = time.clock()
# htop_result = heaptopn( 5, read_data( "data.txt" ) )
# t_htopn = time.clock() - t
 
# # correctness checking :
# for key in top_result:
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in top_result[key])
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in htop_result[key])
 
# print
# print "Read data :", t_read
# print "TopN : ", t_topn - t_read
# print "HeapTopN : ", t_htopn - t_read
 
# for key in top_result:
# assert top_result[key] == htop_result[key]
 
read_silently = lambda infile: ((int(aa), float(bb), cc) for aa, bb, cc in
(line.split() for line in infile))
 
if __name__ == '__main__':
psyco.full()
top_result = topn(5, read_silently(sys.stdin))
for aa in top_result:
for bb, cc in top_result[aa]:
print aa, bb, cc

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.