Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist
View top5_peufeu.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
#!/usr/bin/python
# code by peufeu
# http://stackoverflow.com/questions/1467898/what-language-could-i-use-for-fast-execution-of-this-database-summarization-task/1481186#1481186
import random, sys, time, heapq, psyco
 
ROWS = 27000000
 
def make_data( fname ):
f = open( fname, "w" )
r = random.Random()
for i in xrange( 0, ROWS, 10000 ):
for j in xrange( i,i+10000 ):
f.write( "%d %f %d\n" % (r.randint(0,100), r.uniform(0,1000), j))
print ("write: %d\r" % i),
sys.stdout.flush()
print
 
def read_data( fname ):
for n, line in enumerate( open( fname ) ):
r = line.strip().split()
yield int(r[0]),float(r[1]),r[2]
if not (n % 10000 ):
print ("read: %d\r" % n),
sys.stdout.flush()
print
 
def topn( ntop, data ):
ntop -= 1
assert ntop > 0
min_by_key = {}
top_by_key = {}
for key,value,label in data:
tup = (value,label)
if key not in top_by_key:
# initialize
top_by_key[key] = [ tup ]
else:
top = top_by_key[ key ]
l = len( top )
if l > ntop:
# replace minimum value in top if it is lower than current value
idx = min_by_key[ key ]
if top[idx] < tup:
top[idx] = tup
min_by_key[ key ] = top.index( min( top ) )
elif l < ntop:
# fill until we have ntop entries
top.append( tup )
else:
# we have ntop entries in list, we'll have ntop+1
top.append( tup )
# initialize minimum to keep
min_by_key[ key ] = top.index( min( top ) )
 
# finalize:
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() )
 
def grouptopn( ntop, data ):
top_by_key = {}
for key,value,label in data:
if key in top_by_key:
top_by_key[ key ].append( (value,label) )
else:
top_by_key[ key ] = [ (value,label) ]
 
return dict( (key, sorted( values, reverse=True )[:ntop]) for key,values in top_by_key.iteritems() )
 
def heaptopn( ntop, data ):
top_by_key = {}
for key,value,label in data:
tup = (value,label)
if key not in top_by_key:
top_by_key[ key ] = [ tup ]
else:
top = top_by_key[ key ]
if len(top) < ntop:
heapq.heappush(top, tup)
else:
if top[0] < tup:
heapq.heapreplace(top, tup)
 
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() )
 
def dummy( data ):
for row in data:
pass
 
#make_data( "data.txt" )
 
# psyco.full()
 
# t = time.clock()
# dummy( read_data( "data.txt" ) )
# t_read = time.clock() - t
 
# t = time.clock()
# top_result = topn( 5, read_data( "data.txt" ) )
# t_topn = time.clock() - t
 
# t = time.clock()
# htop_result = heaptopn( 5, read_data( "data.txt" ) )
# t_htopn = time.clock() - t
 
# # correctness checking :
# for key in top_result:
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in top_result[key])
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in htop_result[key])
 
# print
# print "Read data :", t_read
# print "TopN : ", t_topn - t_read
# print "HeapTopN : ", t_htopn - t_read
 
# for key in top_result:
# assert top_result[key] == htop_result[key]
 
read_silently = lambda infile: ((int(aa), float(bb), cc) for aa, bb, cc in
(line.split() for line in infile))
 
if __name__ == '__main__':
psyco.full()
top_result = topn(5, read_silently(sys.stdin))
for aa in top_result:
for bb, cc in top_result[aa]:
print aa, bb, cc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.