Created
September 27, 2009 16:46
-
-
Save kragen/194877 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# code by peufeu | |
# http://stackoverflow.com/questions/1467898/what-language-could-i-use-for-fast-execution-of-this-database-summarization-task/1481186#1481186 | |
import random, sys, time, heapq, psyco | |
ROWS = 27000000 | |
def make_data( fname ): | |
f = open( fname, "w" ) | |
r = random.Random() | |
for i in xrange( 0, ROWS, 10000 ): | |
for j in xrange( i,i+10000 ): | |
f.write( "%d %f %d\n" % (r.randint(0,100), r.uniform(0,1000), j)) | |
print ("write: %d\r" % i), | |
sys.stdout.flush() | |
def read_data( fname ): | |
for n, line in enumerate( open( fname ) ): | |
r = line.strip().split() | |
yield int(r[0]),float(r[1]),r[2] | |
if not (n % 10000 ): | |
print ("read: %d\r" % n), | |
sys.stdout.flush() | |
def topn( ntop, data ): | |
ntop -= 1 | |
assert ntop > 0 | |
min_by_key = {} | |
top_by_key = {} | |
for key,value,label in data: | |
tup = (value,label) | |
if key not in top_by_key: | |
# initialize | |
top_by_key[key] = [ tup ] | |
else: | |
top = top_by_key[ key ] | |
l = len( top ) | |
if l > ntop: | |
# replace minimum value in top if it is lower than current value | |
idx = min_by_key[ key ] | |
if top[idx] < tup: | |
top[idx] = tup | |
min_by_key[ key ] = top.index( min( top ) ) | |
elif l < ntop: | |
# fill until we have ntop entries | |
top.append( tup ) | |
else: | |
# we have ntop entries in list, we'll have ntop+1 | |
top.append( tup ) | |
# initialize minimum to keep | |
min_by_key[ key ] = top.index( min( top ) ) | |
# finalize: | |
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() ) | |
def grouptopn( ntop, data ): | |
top_by_key = {} | |
for key,value,label in data: | |
if key in top_by_key: | |
top_by_key[ key ].append( (value,label) ) | |
else: | |
top_by_key[ key ] = [ (value,label) ] | |
return dict( (key, sorted( values, reverse=True )[:ntop]) for key,values in top_by_key.iteritems() ) | |
def heaptopn( ntop, data ): | |
top_by_key = {} | |
for key,value,label in data: | |
tup = (value,label) | |
if key not in top_by_key: | |
top_by_key[ key ] = [ tup ] | |
else: | |
top = top_by_key[ key ] | |
if len(top) < ntop: | |
heapq.heappush(top, tup) | |
else: | |
if top[0] < tup: | |
heapq.heapreplace(top, tup) | |
return dict( (key, sorted( values, reverse=True )) for key,values in top_by_key.iteritems() ) | |
def dummy( data ): | |
for row in data: | |
pass | |
#make_data( "data.txt" ) | |
# psyco.full() | |
# t = time.clock() | |
# dummy( read_data( "data.txt" ) ) | |
# t_read = time.clock() - t | |
# t = time.clock() | |
# top_result = topn( 5, read_data( "data.txt" ) ) | |
# t_topn = time.clock() - t | |
# t = time.clock() | |
# htop_result = heaptopn( 5, read_data( "data.txt" ) ) | |
# t_htopn = time.clock() - t | |
# # correctness checking : | |
# for key in top_result: | |
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in top_result[key]) | |
# print key, " : ", " ".join (("%f:%s"%(value,label)) for (value,label) in htop_result[key]) | |
# print "Read data :", t_read | |
# print "TopN : ", t_topn - t_read | |
# print "HeapTopN : ", t_htopn - t_read | |
# for key in top_result: | |
# assert top_result[key] == htop_result[key] | |
read_silently = lambda infile: ((int(aa), float(bb), cc) for aa, bb, cc in | |
(line.split() for line in infile)) | |
if __name__ == '__main__': | |
psyco.full() | |
top_result = topn(5, read_silently(sys.stdin)) | |
for aa in top_result: | |
for bb, cc in top_result[aa]: | |
print aa, bb, cc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment