Skip to content

Instantly share code, notes, and snippets.

@mkweskin
Last active July 18, 2019 11:56
Show Gist options
  • Save mkweskin/b7cad87d579892a07c081ddcb2cd2c4e to your computer and use it in GitHub Desktop.
Save mkweskin/b7cad87d579892a07c081ddcb2cd2c4e to your computer and use it in GitHub Desktop.
Takes a tab-delimited BIOM feature table that lists sequences for each sample and a fasta file with all sequences for all samples. Outputs a separate fasta file for each sample containing only the sequences found in that sample. Optionally, you can specify a minimum count for each sequence for each sample and a minimum proportion (calculated wit…
#!/usr/bin/env python3
import pandas as pd
from Bio import SeqIO
import argparse
import os
import logging
import sys
import datetime
def get_args():
parser = argparse.ArgumentParser(description="""Takes a feature table and a fasta file and returns the subset of sequences found in each sample of the feature table. """)
parser.add_argument(
'--feature-table',
type=str,
required=True,
help="""Path to feature table"""
)
parser.add_argument(
'--fasta-input',
type=str,
required=True,
help="""Path to fasta file"""
)
parser.add_argument(
'--outputdir', '-o',
type=str,
default=".",
help="""Directory where output should go. Default: current directory"""
)
parser.add_argument(
'--header-row',
type=int,
default="2",
help="""Row of the features table file with the column headers, lines above the header are ignored. (Default 2)"""
)
parser.add_argument(
'--min-count',
type=int,
default="0",
help="""Minimum number of reads required for the sequence to be included in the output. Both --min-count and --min-prop can be specified. (Default 0, no filtering by count)"""
)
parser.add_argument(
'--min-prop',
type=float,
default="0",
help="""Minimum proportion of reads of a sequence that are required for the sequence to be included in the output. Both --min-count and --min-prop can be specified. (Default 0, no filtering by proportion count)"""
)
parser.add_argument(
'--verbose', '-v',
action="store_true",
help="""Report the sequences included for each sample. (Default disabled)"""
)
parser.add_argument(
'--log', '-l',
type=str,
default="",
help="""Name of output log. (Default fasta file name .log extension)"""
)
return parser.parse_args()
def saveSeqs(seqs, col, idList, countList, outputpath):
"""
Outputs the sequences from the input fasta for the ids given (idList).
Output file is col.fasta
"""
for id, count in zip(idList, countList):
if id in seqs:
with open( outputpath + col + ".fasta", "a") as output_handle:
output_handle.write(">" + str(col)+ "_" + str(id) + "_" + str(count) + "\n" + str(seqs[id].seq) + "\n")
else:
logging.warning ("WARNING: sequence \"" + id + "\" listed in \"" + str(col) + "\" was not found in fasta file!")
def main():
args = get_args()
features = pd.read_csv(args.feature_table, sep='\t', header=(args.header_row-1))
seqs = SeqIO.to_dict(SeqIO.parse(args.fasta_input, "fasta"))
outputpath = args.outputdir + "/"
if not os.path.exists(outputpath):
os.mkdir(outputpath)
# Setup logging
if args.log == "":
args.log = os.path.splitext(os.path.basename(args.fasta_input))[0] + ".log"
logging.basicConfig(filename=outputpath + args.log, format='%(message)s', level=logging.DEBUG)
logging.info(datetime.datetime.now().replace(microsecond=0).isoformat() + " " + " ".join(sys.argv))
# Have some levels of logging go to the screen (https://stackoverflow.com/a/9321890)
console = logging.StreamHandler()
if args.verbose:
console.setLevel(logging.DEBUG)
else:
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
firstCol=True
#Tallies for end of run report
#Running total number of sequences
totalCount=0
#Running total of sequences meeting filtering rules
includedCount=0
#Running total of number of samples
sampleCount=0
#Running total of number of fasta files output
fastaCount=0
if args.min_prop > 0 or args.min_count > 0:
logging.debug("ID\tTotal seqs\tIncluded seqs")
else:
logging.debug("ID\tSequences")
for col in features.columns:
if firstCol:
# Don't process the first column, it has sample names
firstCol=False
firstColName=col
else:
if os.path.exists(outputpath + col + ".fasta"):
os.remove(outputpath + col + ".fasta")
sampleCount += 1
# Include these sequence if the min_prop and min_count values are met
feature_df = features[ (features[col] >= (args.min_prop * features[col].sum())) & (features[col] >= args.min_count) & (features[col] != 0) ]
if args.min_prop > 0 or args.min_count > 0:
totalCount += features[features[col] != 0].shape[0]
logging.debug(str(col) + "\t" + str(features[features[col] != 0].shape[0]) + "\t" + str(feature_df.shape[0]))
else:
logging.debug(str(col) + "\t" + str(features[features[col] != 0].shape[0]))
#Output the sequences for this sample to a fasta file
if feature_df.shape[0] > 0:
fastaCount += 1
includedCount += feature_df.shape[0]
saveSeqs (seqs, col, list(feature_df[firstColName]), list(feature_df[col]), outputpath)
# Print end of run summary
logging.info ("Samples: " + str(sampleCount))
logging.info ("FASTA files output: " + str(fastaCount))
logging.info ("Sum of all sequences output across FASTA files: " + str(includedCount))
if args.min_prop > 0 or args.min_count > 0:
logging.info ("Sum of all filtered out sequences across FASTA files: " + str(totalCount - includedCount))
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment