Skip to content

Instantly share code, notes, and snippets.

@mortonjt
Created October 24, 2016 20:35
Show Gist options
  • Save mortonjt/620f8106af3d381cf33676f5400544d8 to your computer and use it in GitHub Desktop.
Save mortonjt/620f8106af3d381cf33676f5400544d8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Filter sequences from a biom table based on a fasta file
used for AG bloom filtering
"""
# amnonscript
__version__ = "1.2"
import biom
import argparse
import sys
def iterfastaseqs(filename):
"""
iterate a fasta file and return header,sequence
input:
filename - the fasta file name
output:
seq - the sequence
header - the header
"""
fl=open(filename,"rU")
cseq=''
chead=''
for cline in fl:
if cline[0]=='>':
if chead:
yield(cseq,chead)
cseq=''
chead=cline[1:].rstrip()
else:
cseq+=cline.strip()
if cseq:
yield(cseq,chead)
fl.close()
def filterbiombyfasta(tablefilename,outfilename,fastafilename,ignoretablelen=False):
print('loading biom table %s' % tablefilename)
table=biom.load_table(tablefilename)
print('table has %d sequences.' % table.shape[0])
print('loading fasta file %s' % fastafilename)
seqset=set()
numtot=0
# get the read len
if not ignoretablelen:
tseqs=table.ids('observation')
seqlen=len(tseqs[0])
for cseq in tseqs:
if seqlen!=len(cseq):
ignoretablelen=True
print('Sequence length not uniform in table! bad sequence is %s' % cseq)
break
for cseq,chead in iterfastaseqs(fastafilename):
numtot+=1
if not ignoretablelen:
cseq=cseq[:seqlen]
if table.exists(cseq,axis='observation'):
seqset.add(cseq)
print('loaded %d sequences. %d found in table. filtering' % (numtot,len(seqset)))
# filter removing these sequences in place
table.filter(list(seqset),axis='observation',invert=True)
print('%d sequences left. saving' % table.shape[0])
with biom.util.biom_open(outfilename, 'w') as f:
table.to_hdf5(f, "filterbiomseqs")
def main(argv):
parser=argparse.ArgumentParser(description='Filter sequences from biom table using a fasta file. Version '+__version__)
parser.add_argument('-i','--inputtable',help='input biom table file name where each of the observations correspond to a sequence.')
parser.add_argument('-o','--output',help='output biom file name with the bloom sequences removed.')
parser.add_argument('-f','--fasta',help='FASTA file containing bloom sequences that need to be removed from the input table.')
parser.add_argument('--ignore-table-seq-length',help="don't trim fasta file sequences to table read length",action='store_true')
args=parser.parse_args(argv)
filterbiombyfasta(args.inputtable,args.output,args.fasta,args.ignore_table_seq_length)
if __name__ == "__main__":
main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment