Skip to content

Instantly share code, notes, and snippets.

@conchoecia
Created August 27, 2022 11:26
Show Gist options
  • Save conchoecia/dd858a46cd7d8dfb492570e4b0185f31 to your computer and use it in GitHub Desktop.
Save conchoecia/dd858a46cd7d8dfb492570e4b0185f31 to your computer and use it in GitHub Desktop.
these functions take a DNA sequence and return the coordinates (python syntax) of the contigs
#!/usr/bin/env python
myseq = "ACNNNNNNNNNNNNNCATGTACTTGGATCTATCGGTGATCGGATCGGTATGCGTACGAGTGTCAGTCNNNNNNNNNNNNNNNACGTGGTATGCGGCATGCGTAGCGTCAGCTAGCTGATATTGCGTAGCNNNNNNNNNNGGTATGCGTG"
def contig_ranges(seq, sub):
indices = list(find_all(seq, sub))
gap_starts = []
gap_stops = []
gap_ranges = []
prev = -1
# get the starts of the gaps
for i in range(len(indices)):
if prev != indices[i] - 1:
gap_starts.append(indices[i])
prev = indices[i]
# get the indices of the ends of the gaps
prev = 999999999999
for i in range(len(indices) - 1, -1, -1):
if prev != indices[i] + 1:
gap_stops.append(indices[i])
prev = indices[i]
gap_stops = [x + len(sub) for x in gap_stops[::-1]]
contigs = []
# now get the ranges
for i in range(len(gap_stops)):
gap_ranges.append([gap_starts[i], gap_stops[i]])
if i == 0:
contigs.append([0, gap_starts[i]])
contigs.append([gap_stops[i], gap_starts[i+1]])
elif i == (len(gap_stops) -1):
contigs.append([gap_stops[i], len(seq)])
else:
contigs.append([gap_stops[i], gap_starts[i+1]])
return contigs
def find_all(seq, sub):
start = 0
indices = []
while True:
start = seq.find(sub, start)
if start == -1: return
yield start
#start += len(sub) # use start += 1 to find overlapping matches
start += 1 # use start += 1 to find overlapping matches
return indices
# first argument is the sequence
# second sequence is the minimum number of Ns in a gap
ranges = contig_ranges(myseq, "NNNN")
print(ranges)
print(myseq)
for start, stop in ranges:
print(myseq)
print("{}{}".format("".join([" "] * start), myseq[start:stop]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment