Skip to content
Create a gist now

Instantly share code, notes, and snippets.

anonymous /

Embed URL


Subversion checkout URL

You can clone with
Download ZIP
import random, csv
from itertools import groupby, imap
def convert_row(r):
return r[0], int(r[1]), int(r[2])
def find_overlaps(input_file, lookup_file):
lookup_data = imap(convert_row, csv.reader(open(lookup_file)))
input_data = imap(convert_row, csv.reader(open(input_file)))
lookups_by_key = dict((key,[t[1:] for t in l_data]) for key, l_data in groupby(lookup_data, key=lambda r:r[0]))
prev_name = None
for name, in_start, in_end in input_data:
if prev_name != name:
prev_name = name
lookup_pairs = lookups_by_key.get(name, [])
start_pos = 0
while lookup_pairs and lookup_pairs[0][0] < in_start:
lookup_pairs.pop(0) # discard lookups that are before the current range
for lu_start, lu_end in lookup_pairs:
if lu_start > in_end:
if lu_end <= in_end:
yield name, lu_start, lu_end, in_start, in_end
if __name__ == '__main__':
import sys
if len(sys.argv) < 4:
" generate <filename> <scale>\n"
" Generate 10000*scale rows into filename."
" match <input_file> <lookup_file>\n"
" Output overlapping ranges between two files\n")
if sys.argv[1] == 'generate':
f = open(sys.argv[2], 'w')
n = 0
i = 0
amount_left = 0
scale = int(sys.argv[3])
max_n = 10**4 * scale
while n < max_n:
if not amount_left:
start = 10000000
amount_left = random.randint(1*scale,9*scale)
i += 1
name = "chr%d" % i
start += random.expovariate(scale/100000.)
end = start + random.expovariate(1./400)
f.write("%s,%d,%d\n" % (name, start, end))
amount_left -= 1
n += 1
if sys.argv[1] == 'match':
for match in find_overlaps(sys.argv[2], sys.argv[3]):
print("Match %s %d-%d is in %d-%d" % match)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.