Skip to content

Instantly share code, notes, and snippets.

Created December 27, 2012 04:16
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save anonymous/4385453 to your computer and use it in GitHub Desktop.
Save anonymous/4385453 to your computer and use it in GitHub Desktop.
Merge-sort dotmap files
def scan_forward(fp):
    """Advance ``fp`` until just past its next empty row.

    Items are pulled from ``fp`` one at a time, up to and including the
    first item equal to ``[]``; everything read on the way is discarded.

    Args:
        fp: an iterator of rows (``main`` passes ``csv.reader`` objects).

    Returns:
        True if an empty row was found, False if ``fp`` ran out first.
    """
    # any() stops at the first match, so nothing after the empty row is
    # consumed.  The comparison is deliberately ``== []`` rather than a
    # truthiness test so only a real empty list counts as a separator.
    return any(row == [] for row in fp)
def rowcmp(a, b, col=2):
    """Pick which of two rows should be emitted next.

    ``[]`` stands for "no row available" on that side.

    Args:
        a: row from the first input, or ``[]``.
        b: row from the second input, or ``[]``.
        col: index of the field to compare on; defaults to 2, the column
            that was previously hard-coded.

    Returns:
        0 if ``a`` should go first, 1 if ``b`` should go first.  When the
        two keys are equal (or ``a`` is ``[]``) the answer is 1.

    Raises:
        IndexError: if a non-empty row has no field at ``col``.
    """
    if a == []:
        return 1
    if b == []:
        return 0
    # Fields are compared as-is; rows read through csv.reader hold
    # strings, so in that case the ordering is lexicographic.
    return 0 if a[col] < b[col] else 1
def merge(fpa, fpb):
    """Yield the rows of two sorted runs as one sorted run.

    A row equal to ``[]`` is used as the "this side is finished" marker,
    both when an iterator is exhausted and when it actually produces an
    empty row.  Generation therefore stops once each input has either
    ended or delivered an empty row; that empty row is consumed but not
    yielded.

    Args:
        fpa: iterator of rows for the first run.
        fpb: iterator of rows for the second run.

    Yields:
        Rows from ``fpa`` and ``fpb`` in the order chosen by ``rowcmp``
        (on equal keys the row from ``fpb`` is yielded first).
    """
    # The built-in next() with a default replaces the Python-2-only
    # ``it.next()`` call wrapped in try/except StopIteration.
    a = next(fpa, [])
    b = next(fpb, [])
    while a != [] or b != []:
        if rowcmp(a, b) == 0:
            yield a
            a = next(fpa, [])
        else:
            yield b
            b = next(fpb, [])
import csv
def main(fn_in, fn_out):
    """Run one merge pass over ``fn_in`` and write the result to ``fn_out``.

    The input is read as CSV in which empty rows separate groups of
    three-column rows.  Two readers walk the same file: one positioned on
    a group and the other on the group after it.  Each pair is merged with
    ``merge`` and written out as a single group followed by an empty line,
    then both readers skip the group the other one just consumed.

    Args:
        fn_in: path of the file to read.
        fn_out: path of the file to write (opened with mode ``"w"``).

    Side effects:
        Prints the running row count every 100000 rows and a final
        "total groups written" line.
    """
    # The input is opened twice to get two independent read positions;
    # the with-block guarantees all three handles are closed.
    with open(fn_in) as in_a, open(fn_in) as in_b, open(fn_out, "w") as fpout:
        fpa = csv.reader(in_a)
        fpb = csv.reader(in_b)

        # fpa is already on the first group; move fpb past it to the next.
        scan_forward(fpb)

        i = 0
        groups = 0
        while True:
            for c0, c1, c2 in merge(fpa, fpb):
                if i % 100000 == 0:
                    print(i)
                # Fields are written back verbatim, comma-separated,
                # without any CSV quoting.
                fpout.write(",".join((c0, c1, c2)) + "\n")
                i += 1

            # Empty line terminates the merged group.
            fpout.write("\n")
            fpout.flush()

            # Each reader now skips the group its partner just read.
            more_a = scan_forward(fpa)
            more_b = scan_forward(fpb)
            # NOTE(review): the separator and this counter are updated even
            # when merge() yielded no rows for this iteration.
            groups += 1
            if not more_a and not more_b:
                break

    print("total groups written: %s" % groups)
import os
if __name__ == '__main__':
    # Passes 22..28: pass i reads people.msort.<i> and writes
    # people.msort.<i+1>, after first deleting people.msort.<i-1>.
    for i in range(22, 29):
        print(i)
        try:
            os.remove("people.msort.%s" % (i - 1))
        except OSError:
            # Removal is best-effort; any OSError is reported with this
            # message and the pass still runs.
            print("no such file")
        main("people.msort.%s" % i, "people.msort.%s" % (i + 1))
@meetar
Copy link

meetar commented Jan 2, 2013

Where do the "people.msort.##" files come from? Best guess: you're running makedots once for each state and converting each result with sqlite, producing those files in CSV format, which you then concatenate with msort into a final giant people.csv. Is that right?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment