Skip to content

Instantly share code, notes, and snippets.

@bemasher
Created August 11, 2009 03:41
Show Gist options
  • Save bemasher/165610 to your computer and use it in GitHub Desktop.
Save bemasher/165610 to your computer and use it in GitHub Desktop.
Compares a list of chromosome regions from one file to a list of probes generated for NimbleGen.
"""Compares a list of chromosome regions from one file to a list of probes generated for NimbleGen."""
from optparse import OptionParser
import math
parser = OptionParser()
parser.usage = "%prog file1 file2 [options]"
parser.add_option("-r", "--range", type="int", dest="range", metavar="INT", default="200", help="set range tolerance [default: %default]")
parser.add_option("-p", "--percentage", action="store_true", help="display percentage of range covered instead of difference")
gen_chrX = []
gen_chrY = []
(options, args) = parser.parse_args()
def find_gaps(needle, region, haystack):
matches = []
# For every probe in region of haystack
for index in region:
# Create a list of all integers in it
matches[-1:] = range(haystack[index][0], haystack[index][1] + 1)
# Use set logic to find the difference
return len(set(range(needle[0], needle[1])) - set(matches))
def binary_match(needle, haystack):
# Find mid index
mid = len(haystack)/2
# If needle falls inside the region in the middle of the haystack, we're done, or if it's the last one
if (needle > haystack[0] and needle < haystack[1]) or (len(haystack) == 1):
return haystack[mid]
else:
# If needle is less than the average of the middle probe's values
if needle < (haystack[mid][0] + haystack[mid][1]) / 2:
# Search bottom half
return binary_match(needle, haystack[:mid])
else:
# Search top half
return binary_match(needle, haystack[mid:])
def perform_checks(needle, haystack):
# Look for matches of low and high bounds of needle
match_low, match_high = binary_match(needle[0], haystack), binary_match(needle[1], haystack)
# Find the indexes of the matches we just found
index_low, index_high = haystack.index(match_low), haystack.index(match_high)
# If we're not at the first index, expand difference search down one index
if index_low > 0:
index_low -= 1
else:
index_low = 0
# If we're not at the very top index, expand difference search up one index
if index_high < len(haystack) - 1:
index_high += 1
else:
index_high = len(haystack) - 1
# Find gaps the regions we've just selected don't cover in the needle
return find_gaps(needle, range(index_low, index_high + 1), haystack)
def display_result(name, needle, result):
# Does result pass given range?
if result < options.range:
bool = "PASS"
else:
bool = "FAIL"
# If percentage specified then find percentage of needle covered
if options.percentage:
needle_len = needle[1] - needle[0]
result = (float(needle_len - result) / float(needle[1] - needle[0])) * 100
print "%s\t%s\t%s" % (name, result, bool)
# Print usage and exit if user doesn't give both input file arguments
if len(args) < 2:
parser.print_help()
raise SystemExit
# Attempt to open the probes generated to check against
try:
generated_regions = open(args[1], "r")
except IOError:
# We couldn't find the file, so exit
print "Could not find file:", args[1]
raise SystemExit
# Read through file until we get to relavent probes
for line in generated_regions:
try:
# We can ignore all data before NimbleGen probes
line.index("NimbleGen")
break
except ValueError:
pass
# Create lists of X and Y probes
for line in generated_regions:
temp = line.split("\t")
try:
# If it's a X chromosome add to gen_chrX
temp[0].index("chrX")
gen_chrX.append((int(temp[1]), int(temp[2])))
except ValueError:
# If it's a Y chromosome add to gen_chrY
gen_chrY.append((int(temp[1]), int(temp[2])))
# We're done with this file, close it
generated_regions.close()
# Attempt to open the file we're trying to find matches for
try:
request_regions = open(args[0], 'r')
except IOError:
# We couldn't open the file, so exit
print "Could not find file:", args[0]
raise SystemExit
# For each region parse and check against the X and Y probe lists
for line in request_regions:
line = line.replace("\n", "")
line = line.split("\t")
# Extract bounds of needle from line and cast to int
needle = (int(line[1]), int(line[2]))
# If the chromosome is X use X probes list, else use Y probes list
if line[0] == "chrX":
result = perform_checks(needle, gen_chrX)
display_result(line[3], needle, result)
else:
result = perform_checks(needle, gen_chrY)
display_result(line[3], needle, result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment