Skip to content

Instantly share code, notes, and snippets.

@dfornika
Last active February 22, 2018 01:16
Show Gist options
  • Save dfornika/2352a2e8434a7b71d3dfd5b49b91f824 to your computer and use it in GitHub Desktop.
Save dfornika/2352a2e8434a7b71d3dfd5b49b91f824 to your computer and use it in GitHub Desktop.
import re
import Bio.Data.IUPACData
# Demo target: the three literal anchors used in ``pattern`` below, embedded
# in order between short flanking/spacer stretches.
sequence = (
    "GCATGCATC"              # leading flank
    "TATACGTAGCTATACT"       # anchor 1
    "ACG"                    # spacer
    "ATCTACGATCGATCGATGC"    # anchor 2
    "TATGCATCATG"            # spacer
    "ACTACTATGCATCATAG"      # anchor 3
    "CTGCAT"                 # trailing flank
)

# Demo query: three literal anchors separated by gaps of 0-100 bases. ``N`` is
# the IUPAC "any base" code; the ``{0,100}`` quantifier is plain regex syntax.
pattern = (
    "TATACGTAGCTATACT"
    "N{0,100}"
    "ATCTACGATCGATCGATGC"
    "N{0,100}"
    "ACTACTATGCATCATAG"
)
def nt_search_regex(seq, subseq):
    """Search for a DNA subsequence pattern in a sequence (forward strand only).

    Each character of ``subseq`` that is a key of
    ``Bio.Data.IUPACData.ambiguous_dna_values`` is replaced by its expansion:
    a single-base expansion is emitted as that base, a multi-base expansion
    (ambiguity codes such as N = A, C, G or T; R = A or G) becomes a regex
    character class. Any other character is copied through unchanged, so
    regex syntax such as ``{0,100}`` can be embedded in ``subseq``.

    Matches may overlap: after each hit the scan resumes one position past
    the start of that hit.

    Args:
        seq: The sequence to search.
        subseq: The query, made of IUPAC codes optionally mixed with regex
            syntax.

    Returns:
        A list whose first element is the generated regex pattern string,
        followed by the 0-based start position of every match in ``seq``.

    Raises:
        re.error: If the generated pattern is not a valid regular expression.
    """
    pieces = []
    for nt in subseq:
        # Characters that are not IUPAC codes (regex syntax) pass through as-is.
        value = Bio.Data.IUPACData.ambiguous_dna_values.get(nt, nt)
        pieces.append(value if len(value) == 1 else '[%s]' % value)
    pattern = ''.join(pieces)

    # Compile once and scan with an offset instead of re-slicing ``seq`` for
    # every hit. Anchors, if any are embedded, refer to the whole sequence.
    regex = re.compile(pattern)
    result = [pattern]
    end = len(seq)
    pos = 0
    # Bounding the scan by ``end`` guarantees termination even when the
    # pattern can match the empty string (e.g. an empty ``subseq``), which
    # would otherwise keep matching past the end of the sequence forever.
    while pos <= end:
        m = regex.search(seq, pos)
        if m is None:
            break
        result.append(m.start())
        pos = m.start() + 1
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment