public
anonymous / range_search.py
Last active

  • Download Gist
range_search.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
import random, csv
from itertools import groupby, imap
 
def convert_row(r):
    """Convert one parsed CSV row into a ``(name, start, end)`` tuple.

    Args:
        r: Indexable row (e.g. the list produced by ``csv.reader``) whose
            first three fields are a name and two integer literals.
            Any fields after the third are ignored.

    Returns:
        A tuple ``(r[0], int(r[1]), int(r[2]))``.

    Raises:
        IndexError: if the row has fewer than three fields.
        ValueError: if the second or third field is not an integer literal.
    """
    return r[0], int(r[1]), int(r[2])
 
def find_overlaps(input_file, lookup_file):
    """Yield each lookup range that lies inside an input range of the same name.

    Both files are CSV files whose rows are parsed with ``convert_row`` into
    ``(name, start, end)``.  A lookup range is reported for an input range
    when the names are equal and
    ``in_start <= lu_start <= in_end`` and ``lu_end <= in_end``.

    The matching is a single forward sweep, so it relies on ordering:
    for each name the lookup rows must be ordered by start, and the input
    rows must be grouped by name and ordered by start within a name.

    This is a generator: nothing is read until iteration begins, and the
    input file stays open until the generator is exhausted or closed.

    Args:
        input_file: Path of the CSV file holding the enclosing ranges.
        lookup_file: Path of the CSV file holding the ranges to search for.

    Yields:
        Tuples ``(name, lu_start, lu_end, in_start, in_end)``.
    """
    # Load the whole lookup file, bucketed by name.  Accumulating with
    # setdefault (rather than building a dict from groupby) keeps rows of a
    # name together even when they are not adjacent in the file.
    lookups_by_key = {}
    with open(lookup_file) as lookup_fh:
        for row in csv.reader(lookup_fh):
            if not row:
                # csv.reader returns an empty list for a blank line.
                continue
            key, lu_start, lu_end = convert_row(row)
            lookups_by_key.setdefault(key, []).append((lu_start, lu_end))

    prev_name = None
    lookup_pairs = []
    start_pos = 0
    with open(input_file) as input_fh:
        for row in csv.reader(input_fh):
            if not row:
                continue
            name, in_start, in_end = convert_row(row)
            if prev_name != name:
                # New name: switch to its lookup list and restart the sweep.
                prev_name = name
                lookup_pairs = lookups_by_key.get(name, [])
                start_pos = 0
            pair_count = len(lookup_pairs)
            # Skip lookups that start before the current input range.  Moving
            # an index is O(1) per skipped entry (list.pop(0) is O(n)) and
            # leaves the stored list intact.
            while (start_pos < pair_count
                   and lookup_pairs[start_pos][0] < in_start):
                start_pos += 1
            pos = start_pos
            while pos < pair_count:
                lu_start, lu_end = lookup_pairs[pos]
                if lu_start > in_end:
                    # Lookups are ordered by start; none further can match.
                    break
                if lu_end <= in_end:
                    yield name, lu_start, lu_end, in_start, in_end
                pos += 1
 
# Help text printed when the command line is incomplete or the sub-command
# is not recognised.
_USAGE = ("Usage:\n"
          " range_search.py generate <filename> <scale>\n"
          " Generate 10000*scale rows into filename.\n"
          " range_search.py match <input_file> <lookup_file>\n"
          " Output overlapping ranges between two files\n")


def _generate_rows(filename, scale):
    """Write ``10000 * scale`` random ``name,start,end`` rows to *filename*.

    Rows are emitted in runs that share a name (``chr1``, ``chr2``, ...).
    Each run has between ``scale`` and ``9 * scale`` rows; within a run the
    start begins at 10000000 and only grows, so starts are non-decreasing.
    Values are written with ``%d`` and therefore truncated to integers.
    """
    max_n = 10**4 * scale
    n = 0
    i = 0
    amount_left = 0
    name = None
    start = 10000000
    # The with-block closes the file even if writing fails part-way.
    with open(filename, 'w') as f:
        while n < max_n:
            if not amount_left:
                # Current run is used up: begin the next name.
                start = 10000000
                amount_left = random.randint(1*scale, 9*scale)
                i += 1
                name = "chr%d" % i
            start += random.expovariate(scale/100000.)
            end = start + random.expovariate(1./400)
            f.write("%s,%d,%d\n" % (name, start, end))
            amount_left -= 1
            n += 1


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list laid out like ``sys.argv`` (program name first).
            Defaults to ``sys.argv``.

    Sub-commands:
        ``generate <filename> <scale>`` writes random rows to *filename*.
        ``match <input_file> <lookup_file>`` prints one line per match
        produced by ``find_overlaps``.

    Prints the usage text and exits with status 1 when fewer than three
    arguments follow the program name or the sub-command is unknown.
    """
    import sys
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        print(_USAGE)
        sys.exit(1)
    command = argv[1]
    if command == 'generate':
        # Parse scale before the output file is opened, so a bad value does
        # not leave a truncated file behind.
        _generate_rows(argv[2], int(argv[3]))
    elif command == 'match':
        for match in find_overlaps(argv[2], argv[3]):
            print("Match %s %d-%d is in %d-%d" % match)
    else:
        print(_USAGE)
        sys.exit(1)


if __name__ == '__main__':
    main()

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.