Last active
November 10, 2017 14:04
-
-
Save webmasterar/3a60155d4ddc8595b17fa2c62893dbb0 to your computer and use it in GitHub Desktop.
Return the sequence from a FASTA file given a start and end position
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#License: CC0 Public Domain (2017) Ahmad Retha | |
## | |
# Get range of bases from a fasta file. e.g. chr21:9411239-9411240 | |
# @param refFile The fasta filename e.g. Homo_sapiens.GRCh37.75.dna.chromosome.21.fa | |
# @param begin 1-based index in sequence e.g. 9411239 | |
# @param end 1-based index in sequence up-to-but-not-including e.g. 9411240 | |
# @return string 'G' | |
def getRef(refFile, begin, end): | |
if begin > end: | |
begin, end = end, begin | |
elif begin == end or begin < 1 or end < 1: | |
return '' | |
seq = '' | |
with open(refFile, 'r') as rf: | |
j = 1 | |
for line in rf: | |
line = line.strip() | |
if line == '' or line.startswith('>'): | |
continue | |
if j >= end: | |
break | |
lineLen = len(line) | |
if j+lineLen >= begin: | |
k = 0 | |
while j+k < begin: | |
k += 1 | |
j = j + k | |
while j < end and k < lineLen: | |
seq += line[k] | |
j += 1 | |
k += 1 | |
else: | |
j += lineLen | |
return seq | |
print getRef('./Homo_sapiens.GRCh37.75.dna.chromosome.21.fa', 9411239, 9411240) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment