Created
February 16, 2012 06:03
-
-
Save brentp/1842507 to your computer and use it in GitHub Desktop.
simpler pyfasta interface using fai
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os.path as op | |
class Fasta(object): | |
def __init__(self, fname): | |
assert op.exists(fname) and op.exists(fname + ".fai") | |
self.fname = fname | |
self._fh = open(self.fname) | |
self.fai = dict(self._parse_fai(fname + ".fai")) | |
def _parse_fai(self, fai): | |
for line in open(fai): | |
seq, seq_len, start, len1, len2 = line.split("\t") | |
yield seq, {"seq": seq, | |
"seq_len": int(seq_len), | |
"start": int(start), | |
"len": int(len1)} | |
def __getitem__(self, chrom): | |
return Seq(self.fai[chrom], self._fh) | |
def __repr__(self): | |
return "Fasta('%s')" % self.fname | |
def keys(self): return self.fai.keys() | |
def iterkeys(self): | |
for k in self.keys(): | |
yield self | |
def values(self): return self.fai.values() | |
def itervalues(self): | |
for v in self.values(): yield v | |
def items(self): return self.fai.items() | |
def iteritems(self): | |
for i in self.fai.iteritems(): yield i | |
class Seq(object): | |
def __init__(self, fai_line, fh): | |
self.fai_line = fai_line | |
self._fh = fh | |
def __len__(self): | |
return self.fai_line['seq_len'] | |
def __repr__(self): | |
return "Seq('%s')" % self.fai_line['seq'] | |
def __getitem__(self, slice): | |
ss = slice.start or 0 | |
se = slice.stop or len(self) + 2 | |
assert se >= ss >= 0, (se, ss) | |
assert slice.step is None | |
LL = self.fai_line['len'] | |
lines, inline = divmod(ss, LL) | |
start = self.fai_line['start'] + inline + lines * LL + lines | |
L = len(self) + 2 | |
end = self.fai_line['start'] + lines + min(slice.stop or L, L) | |
self._fh.seek(start) | |
l = end - start | |
assert l <= L, ("requested sequence out of range!", l, L) | |
seqs = [] | |
seq_len, rl = 0, self._fh.readline | |
LL += 1 | |
while seq_len <= l: | |
s = rl(LL) #.rstrip(LL) | |
if seqs and not s: break | |
seqs.append(s) | |
seq_len += LL | |
if len(seqs): | |
seqs[-1] = seqs[-1].rstrip() | |
return "".join(seqs)[:l] | |
if __name__ == "__main__": | |
from pyfasta import Fasta as F | |
f = Fasta('c.fasta') | |
p = F('c.fasta') | |
assert f['2'][:4] == p['2'][:4] | |
print len(f['2']) | |
print f | |
print f['2'] | |
if f['3'][620:630] != p['3'][620:]: | |
print( f['3'][620:630], p['3'][620:]) | |
print f['3'][620:630] | |
print f['3'][630:630] | |
assert f['2'][100:106] == p['2'][100:106] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment