Skip to content

@kragen /top5_peufeu.py
Created

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
#!/usr/bin/python
# code by peufeu
# http://stackoverflow.com/questions/1467898/what-language-could-i-use-for-fast-execution-of-this-database-summarization-task/1481186#1481186
import random, sys, time, heapq, psyco
ROWS = 27000000
def make_data( fname ):
f = open( fname, "w" )
r = random.Random()
for i in xrange( 0, ROWS, 10000 ):
for j in xrange( i,i+10000 ):
f.write( "%d %f %d\n" % (r.randint(0,100), r.uniform(0,1000), j))
print ("write: %d\r" % i),
sys.stdout.flush()
print
def read_data( fname ):
for n, line in enumerate( open( fname ) ):
r = line.strip().split()
yield int(r[0]),float(r[1]),r[2]
if not (n % 10000 ):
print ("read: %d\r" % n),
sys.stdout.flush()
print
def topn( ntop, data ):
ntop -= 1
assert ntop > 0
min_by_key = {}
top_by_key = {}
for key,value,label in data:
tup = (value,label)
if key not in top_by_key:
# initialize
top_by_key[key] = [ tup ]
else:
top = top_by_key[ key ]
l = len( top )
if l > ntop:
# replace minimum value in top if it is lower than current value
idx = min_by_key[ key ]
if top[idx] < tup:
top[idx] = tup
min_by_key[ key ] = top.index( min( top ) )
elif l < ntop:
# fill until we have ntop entries
top.append( tup )
else:
# we have ntop entries in list, we'll have ntop+1
top.append( tup )
# initialize minimum to keep
min_by_key[ key ] = top.index( min( top ) )
# finalize:
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() )
def grouptopn( ntop, data ):
top_by_key = {}
for key,value,label in data:
if key in top_by_key:
top_by_key[ key ].append( (value,label) )
else:
top_by_key[ key ] = [ (value,label) ]
return dict( (key, sorted( values, reverse=True )[:ntop]) for key,values in top_by_key.iteritems() )
def heaptopn( ntop, data ):
top_by_key = {}
for key,value,label in data:
tup = (value,label)
if key not in top_by_key:
top_by_key[ key ] = [ tup ]
else:
top = top_by_key[ key ]
if len(top) < ntop:
heapq.heappush(top, tup)
else:
if top[0] < tup:
heapq.heapreplace(top, tup)
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() )
def dummy( data ):
for row in data:
pass
#make_data( "data.txt" )
# psyco.full()
# t = time.clock()
# dummy( read_data( "data.txt" ) )
# t_read = time.clock() - t
# t = time.clock()
# top_result = topn( 5, read_data( "data.txt" ) )
# t_topn = time.clock() - t
# t = time.clock()
# htop_result = heaptopn( 5, read_data( "data.txt" ) )
# t_htopn = time.clock() - t
# # correctness checking :
# for key in top_result:
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in top_result[key])
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in htop_result[key])
# print
# print "Read data :", t_read
# print "TopN : ", t_topn - t_read
# print "HeapTopN : ", t_htopn - t_read
# for key in top_result:
# assert top_result[key] == htop_result[key]
read_silently = lambda infile: ((int(aa), float(bb), cc) for aa, bb, cc in
(line.split() for line in infile))
if __name__ == '__main__':
psyco.full()
top_result = topn(5, read_silently(sys.stdin))
for aa in top_result:
for bb, cc in top_result[aa]:
print aa, bb, cc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.