Created
December 27, 2012 04:16
-
-
Save anonymous/4385453 to your computer and use it in GitHub Desktop.
Merge-sort dotmap files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def scan_forward(fp):
    """Advance ``fp`` to just past its next empty row.

    Rows are pulled from ``fp`` one at a time until one equal to ``[]``
    turns up; that empty row is consumed as well.

    Returns True if an empty row was found, False if ``fp`` ran out first.
    """
    # any() stops pulling rows as soon as the first empty one is seen.
    return any(row == [] for row in fp)
def rowcmp(a, b):
    """Decide which of two rows should be emitted first.

    An empty list stands for "no row available".  Rows are ordered by
    their third field (index 2).

    Returns 0 when ``a`` should be taken, 1 when ``b`` should be taken.
    Ties, and the case where ``a`` is empty, resolve to 1.
    """
    if a == []:
        return 1
    # With ``a`` present, take it when ``b`` is missing or sorts after it.
    if b == [] or a[2] < b[2]:
        return 0
    return 1
def merge(fpa, fpb):
    """Yield the rows of one group from ``fpa`` and one from ``fpb``, merged.

    Each argument is an iterator of rows (lists).  A group ends at the
    first empty row (``[]``) or when the iterator is exhausted; the
    terminating empty row is consumed.  The order of emission is decided
    by ``rowcmp``, which compares the third field of each row.

    The generator finishes once both groups have ended.
    """
    # The built-in next() with a default works on Python 2.6+ and 3 alike
    # and maps "iterator exhausted" onto the same [] sentinel that a blank
    # row produces.
    a = next(fpa, [])
    b = next(fpb, [])
    while a != [] or b != []:
        if rowcmp(a, b) == 0:
            yield a
            a = next(fpa, [])
        else:
            yield b
            b = next(fpb, [])
import csv | |
def main(fn_in, fn_out):
    """Run one merge pass over ``fn_in`` and write the result to ``fn_out``.

    ``fn_in`` is read as CSV through two independent readers.  Groups of
    rows are separated by blank lines; each pass merges neighbouring
    groups pairwise (first with second, third with fourth, ...) and writes
    every merged group to ``fn_out`` followed by a blank line.

    Every row must have exactly three fields.  Progress (the running row
    count) is printed every 100000 rows, and the number of groups written
    is printed at the end.
    """
    # Context managers close all three handles even if a row is malformed
    # or a write fails part-way through.
    with open(fn_in) as src_a, open(fn_in) as src_b:
        with open(fn_out, "w") as fpout:
            fpa = csv.reader(src_a)
            fpb = csv.reader(src_b)

            # fpa already sits on the first group; move fpb past it so it
            # sits on the second group.
            scan_forward(fpb)

            i = 0
            groups = 0
            while True:
                for a, b, c in merge(fpa, fpb):
                    if i % 100000 == 0:
                        print(i)
                    fpout.write(",".join((a, b, c)) + "\n")
                    i += 1

                # A blank line terminates the merged group.
                fpout.write("\n")
                fpout.flush()

                # Each reader now skips the group the other reader has
                # just consumed, landing on the next unmerged pair.
                more_a = scan_forward(fpa)
                more_b = scan_forward(fpb)

                groups += 1

                if not more_a and not more_b:
                    break

    print("total groups written: %s" % groups)
import os | |
if __name__ == '__main__':
    # Chain the merge passes: pass N reads people.msort.N and writes
    # people.msort.(N+1).  Before each pass the file from two steps back
    # (N-1) is deleted, since it is no longer needed.
    name_template = "people.msort.%s"
    for step in range(22, 29):
        print(step)
        stale = name_template % (step - 1)
        try:
            os.remove(stale)
        except OSError:
            print("no such file")
        main(name_template % step, name_template % (step + 1))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Where do the "people.msort.##" files come from? My best guess: you run makedots once for each state, convert each result to CSV with sqlite, and then merge those CSV files with msort into one final giant people.csv. Is that right?