Skip to content

Instantly share code, notes, and snippets.

Created October 14, 2009 17:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/210232 to your computer and use it in GitHub Desktop.
Save anonymous/210232 to your computer and use it in GitHub Desktop.
import random, csv
from itertools import groupby, imap
def convert_row(r):
return r[0], int(r[1]), int(r[2])
def find_overlaps(input_file, lookup_file):
lookup_data = imap(convert_row, csv.reader(open(lookup_file)))
input_data = imap(convert_row, csv.reader(open(input_file)))
lookups_by_key = dict((key,[t[1:] for t in l_data]) for key, l_data in groupby(lookup_data, key=lambda r:r[0]))
prev_name = None
for name, in_start, in_end in input_data:
if prev_name != name:
prev_name = name
lookup_pairs = lookups_by_key.get(name, [])
start_pos = 0
while lookup_pairs and lookup_pairs[0][0] < in_start:
lookup_pairs.pop(0) # discard lookups that are before the current range
for lu_start, lu_end in lookup_pairs:
if lu_start > in_end:
break
if lu_end <= in_end:
yield name, lu_start, lu_end, in_start, in_end
if __name__ == '__main__':
import sys
if len(sys.argv) < 4:
print("Usage:\n"
" range_search.py generate <filename> <scale>\n"
" Generate 10000*scale rows into filename."
" range_search.py match <input_file> <lookup_file>\n"
" Output overlapping ranges between two files\n")
sys.exit(1)
if sys.argv[1] == 'generate':
f = open(sys.argv[2], 'w')
n = 0
i = 0
amount_left = 0
scale = int(sys.argv[3])
max_n = 10**4 * scale
while n < max_n:
if not amount_left:
start = 10000000
amount_left = random.randint(1*scale,9*scale)
i += 1
name = "chr%d" % i
start += random.expovariate(scale/100000.)
end = start + random.expovariate(1./400)
f.write("%s,%d,%d\n" % (name, start, end))
amount_left -= 1
n += 1
f.close()
if sys.argv[1] == 'match':
for match in find_overlaps(sys.argv[2], sys.argv[3]):
print("Match %s %d-%d is in %d-%d" % match)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment