Created
October 14, 2009 17:09
-
-
Save anonymous/210232 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random, csv | |
from itertools import groupby, imap | |
def convert_row(r): | |
return r[0], int(r[1]), int(r[2]) | |
def find_overlaps(input_file, lookup_file): | |
lookup_data = imap(convert_row, csv.reader(open(lookup_file))) | |
input_data = imap(convert_row, csv.reader(open(input_file))) | |
lookups_by_key = dict((key,[t[1:] for t in l_data]) for key, l_data in groupby(lookup_data, key=lambda r:r[0])) | |
prev_name = None | |
for name, in_start, in_end in input_data: | |
if prev_name != name: | |
prev_name = name | |
lookup_pairs = lookups_by_key.get(name, []) | |
start_pos = 0 | |
while lookup_pairs and lookup_pairs[0][0] < in_start: | |
lookup_pairs.pop(0) # discard lookups that are before the current range | |
for lu_start, lu_end in lookup_pairs: | |
if lu_start > in_end: | |
break | |
if lu_end <= in_end: | |
yield name, lu_start, lu_end, in_start, in_end | |
if __name__ == '__main__': | |
import sys | |
if len(sys.argv) < 4: | |
print("Usage:\n" | |
" range_search.py generate <filename> <scale>\n" | |
" Generate 10000*scale rows into filename." | |
" range_search.py match <input_file> <lookup_file>\n" | |
" Output overlapping ranges between two files\n") | |
sys.exit(1) | |
if sys.argv[1] == 'generate': | |
f = open(sys.argv[2], 'w') | |
n = 0 | |
i = 0 | |
amount_left = 0 | |
scale = int(sys.argv[3]) | |
max_n = 10**4 * scale | |
while n < max_n: | |
if not amount_left: | |
start = 10000000 | |
amount_left = random.randint(1*scale,9*scale) | |
i += 1 | |
name = "chr%d" % i | |
start += random.expovariate(scale/100000.) | |
end = start + random.expovariate(1./400) | |
f.write("%s,%d,%d\n" % (name, start, end)) | |
amount_left -= 1 | |
n += 1 | |
f.close() | |
if sys.argv[1] == 'match': | |
for match in find_overlaps(sys.argv[2], sys.argv[3]): | |
print("Match %s %d-%d is in %d-%d" % match) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment