public
anonymous / msort.py
Created

Merge-sort dotmap files

  • Download Gist
msort.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
def scan_forward( fp ):
    """Advance fp past the next empty row.

    Rows are pulled from the iterator fp one at a time, up to and
    including the first row equal to []. Nothing after that row is
    consumed.

    Returns True if an empty row was found, False if fp was exhausted
    before one turned up.
    """
    # any() stops at the first match, so the iterator is left positioned
    # immediately after the empty row.
    return any( row == [] for row in fp )
 
def rowcmp(a,b):
    """Decide which of two rows should be emitted first.

    Returns 0 when row a goes first and 1 when row b goes first.

    An empty row ([]) always loses to the other row; if both are empty
    the result is 1. Otherwise the rows are ordered by their third
    field (index 2), with b winning ties.
    """
    a_goes_first = a != [] and ( b == [] or a[2] < b[2] )
    return 0 if a_goes_first else 1
 
def merge( fpa, fpb ):
    """Yield the rows of one group from fpa and one from fpb, merged.

    fpa and fpb are row iterators. Each side's group ends at the first
    empty row ([]) or when that iterator is exhausted; the terminating
    empty row is consumed but not yielded. On every step rowcmp()
    chooses which side's current row is emitted: 0 takes the row from
    fpa, anything else takes the row from fpb.

    The generator finishes once both sides have reached the end of
    their group.
    """
    # next(it, []) turns exhaustion into the same [] sentinel that marks
    # a group boundary, and works on both Python 2.6+ and Python 3.
    a = next( fpa, [] )
    b = next( fpb, [] )

    while a != [] or b != []:
        if rowcmp( a, b ) == 0:
            yield a
            a = next( fpa, [] )
        else:
            yield b
            b = next( fpb, [] )
 
import csv
def main( fn_in, fn_out ):
    """Run one merge pass over fn_in, writing the result to fn_out.

    fn_in is read as CSV through two independent readers. Rows are
    treated as belonging to groups separated by empty rows, and every
    row must have exactly three fields. Each iteration merges the group
    under the first reader with the group under the second reader, then
    advances both readers past the group the other one just consumed.
    Presumably each input group is already sorted on its third field,
    since merge() only interleaves -- verify against the producer of
    the input file.

    Each merged row is written to fn_out as "a,b,c" (no CSV quoting),
    and each merged group is followed by a blank line. Progress (the
    row count) is printed every 100000 rows, and the number of groups
    written is printed at the end.
    """
    # Context managers guarantee all three files are closed, including
    # when a malformed row raises part-way through the pass.
    with open( fn_in ) as file_a, open( fn_in ) as file_b, \
            open( fn_out, "w" ) as fpout:
        fpa = csv.reader( file_a )
        fpb = csv.reader( file_b )

        # scan fpa forward to the first set - already there
        # scan fpb forward past that point to the next set
        scan_forward( fpb )

        i = 0
        groups = 0
        while True:
            for a, b, c in merge( fpa, fpb ):
                if i % 100000 == 0:
                    print( i )

                fpout.write( ",".join( (a, b, c) ) + "\n" )

                i += 1
            # blank line terminates the merged group
            fpout.write( "\n" )
            fpout.flush()

            # Each reader now skips the group the other reader just
            # merged, so both land on the next unmerged pair.
            found_a = scan_forward( fpa )
            found_b = scan_forward( fpb )

            groups += 1

            # Neither reader found another group boundary: input is done.
            if not found_a and not found_b:
                break

    print( "total groups written: %s"%groups )
 
import os
if __name__=='__main__':
    # Run merge passes 22 through 28. Pass N reads people.msort.N and
    # writes people.msort.(N+1); the input of the previous pass,
    # people.msort.(N-1), is deleted first.
    for step in range(22,29):
        print( step )
        previous_input = "people.msort.%s"%(step-1)
        try:
            os.remove( previous_input )
        except OSError:
            # Any OSError from the removal is reported with this message
            # and otherwise ignored.
            print( "no such file" )
        main( "people.msort.%s"%step, "people.msort.%s"%(step+1) )

Where do the "people.msort.##" files come from? My best guess: you run makedots once for each state and convert each result with sqlite, producing those files in CSV format, which you then concatenate with msort into a final giant people.csv. Is that right?

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.