mkweskin/featuretofasta.py

## featuretofasta.py
#!/usr/bin/env python3

import pandas as pd
from Bio import SeqIO
import argparse
import os
import logging
import sys
import datetime

def get_args():
    parser = argparse.ArgumentParser(description="""Takes a feature table and a fasta file and returns the subset of sequences found in each sample of the feature table. """)
    parser.add_argument(
        '--feature-table',
        type=str,
        required=True,
        help="""Path to feature table"""
    )
    parser.add_argument(
        '--fasta-input',
        type=str,
        required=True,
        help="""Path to fasta file"""
    )
    parser.add_argument(
        '--outputdir', '-o',
        type=str,
        default=".",
        help="""Directory where output should go. Default: current directory"""
    )
    parser.add_argument(
        '--header-row',
        type=int,
        default="2",
        help="""Row of the features table file with the column headers, lines above the header are ignored. (Default 2)"""
    )
    parser.add_argument(
        '--min-count',
        type=int,
        default="0",
        help="""Minimum number of reads required for the sequence to be included in the output. Both --min-count and --min-prop can be specified. (Default 0, no filtering by count)"""
    )
    parser.add_argument(
        '--min-prop',
        type=float,
        default="0",
        help="""Minimum proportion of reads of a sequence that are required for the sequence to be included in the output. Both --min-count and --min-prop can be specified. (Default 0, no filtering by proportion count)"""
    )
    parser.add_argument(
        '--verbose', '-v',
        action="store_true",
        help="""Report the sequences included for each sample. (Default disabled)"""
    )
    parser.add_argument(
        '--log', '-l',
        type=str,
        default="",
        help="""Name of output log. (Default fasta file name .log extension)"""
    )
    return parser.parse_args()

def saveSeqs(seqs, col, idList, countList, outputpath):
    """
    Outputs the sequences from the input fasta for the ids given (idList).
    Output file is col.fasta
    """

    for id, count in zip(idList, countList):
        if id in seqs:
            with open( outputpath + col + ".fasta", "a") as output_handle:
                output_handle.write(">" + str(col)+ "_" + str(id) + "_" + str(count) + "\n" + str(seqs[id].seq) + "\n")
        else:
            logging.warning ("WARNING: sequence \"" + id + "\" listed in \"" + str(col) + "\" was not found in fasta file!")

def main():
    args = get_args()

    features = pd.read_csv(args.feature_table, sep='\t', header=(args.header_row-1))
    seqs = SeqIO.to_dict(SeqIO.parse(args.fasta_input, "fasta"))

    outputpath = args.outputdir + "/"
    if not os.path.exists(outputpath):
        os.mkdir(outputpath)

    # Setup logging
    if args.log == "":
        args.log = os.path.splitext(os.path.basename(args.fasta_input))[0] + ".log"

    logging.basicConfig(filename=outputpath + args.log, format='%(message)s', level=logging.DEBUG)
    logging.info(datetime.datetime.now().replace(microsecond=0).isoformat() + " " + " ".join(sys.argv))

    # Have some levels of logging go to the screen (https://stackoverflow.com/a/9321890)
    console = logging.StreamHandler()
    if args.verbose:
        console.setLevel(logging.DEBUG)
    else:
        console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)


    firstCol=True

    #Tallies for end of run report
    #Running total number of sequences
    totalCount=0
    #Running total of sequences meeting filtering rules
    includedCount=0
    #Running total of number of samples
    sampleCount=0
    #Running total of number of fasta files output
    fastaCount=0

    if args.min_prop > 0 or args.min_count > 0:
        logging.debug("ID\tTotal seqs\tIncluded seqs")
    else:
        logging.debug("ID\tSequences")

    for col in features.columns:
        if firstCol:
            # Don't process the first column, it has sample names
            firstCol=False
            firstColName=col
        else:
            if os.path.exists(outputpath + col + ".fasta"):
                os.remove(outputpath + col + ".fasta")

            sampleCount += 1

            # Include these sequence if the min_prop and min_count values are met
            feature_df = features[ (features[col] >= (args.min_prop * features[col].sum())) & (features[col] >= args.min_count) & (features[col] != 0) ]

            if args.min_prop > 0 or args.min_count > 0:
                totalCount += features[features[col] != 0].shape[0]
                logging.debug(str(col) + "\t" +  str(features[features[col] != 0].shape[0]) + "\t" + str(feature_df.shape[0]))
            else:
                logging.debug(str(col) + "\t" +  str(features[features[col] != 0].shape[0]))

            #Output the sequences for this sample to a fasta file
            if feature_df.shape[0] > 0:
                fastaCount += 1
                includedCount += feature_df.shape[0]
                saveSeqs (seqs, col, list(feature_df[firstColName]), list(feature_df[col]), outputpath)

    # Print end of run summary
    logging.info ("Samples: " + str(sampleCount))
    logging.info ("FASTA files output: " + str(fastaCount))
    logging.info ("Sum of all sequences output across FASTA files: " + str(includedCount))
    if args.min_prop > 0 or args.min_count > 0:
        logging.info ("Sum of all filtered out sequences across FASTA files: " + str(totalCount - includedCount))

if __name__ == '__main__':
    main()
	#!/usr/bin/env python3

	import pandas as pd
	from Bio import SeqIO
	import argparse
	import os
	import logging
	import sys
	import datetime

	def get_args():
	parser = argparse.ArgumentParser(description="""Takes a feature table and a fasta file and returns the subset of sequences found in each sample of the feature table. """)
	parser.add_argument(
	'--feature-table',
	type=str,
	required=True,
	help="""Path to feature table"""
	)
	parser.add_argument(
	'--fasta-input',
	type=str,
	required=True,
	help="""Path to fasta file"""
	)
	parser.add_argument(
	'--outputdir', '-o',
	type=str,
	default=".",
	help="""Directory where output should go. Default: current directory"""
	)
	parser.add_argument(
	'--header-row',
	type=int,
	default="2",
	help="""Row of the features table file with the column headers, lines above the header are ignored. (Default 2)"""
	)
	parser.add_argument(
	'--min-count',
	type=int,
	default="0",
	help="""Minimum number of reads required for the sequence to be included in the output. Both --min-count and --min-prop can be specified. (Default 0, no filtering by count)"""
	)
	parser.add_argument(
	'--min-prop',
	type=float,
	default="0",
	help="""Minimum proportion of reads of a sequence that are required for the sequence to be included in the output. Both --min-count and --min-prop can be specified. (Default 0, no filtering by proportion count)"""
	)
	parser.add_argument(
	'--verbose', '-v',
	action="store_true",
	help="""Report the sequences included for each sample. (Default disabled)"""
	)
	parser.add_argument(
	'--log', '-l',
	type=str,
	default="",
	help="""Name of output log. (Default fasta file name .log extension)"""
	)
	return parser.parse_args()

	def saveSeqs(seqs, col, idList, countList, outputpath):
	"""
	Outputs the sequences from the input fasta for the ids given (idList).
	Output file is col.fasta
	"""

	for id, count in zip(idList, countList):
	if id in seqs:
	with open( outputpath + col + ".fasta", "a") as output_handle:
	output_handle.write(">" + str(col)+ "_" + str(id) + "_" + str(count) + "\n" + str(seqs[id].seq) + "\n")
	else:
	logging.warning ("WARNING: sequence \"" + id + "\" listed in \"" + str(col) + "\" was not found in fasta file!")

	def main():
	args = get_args()

	features = pd.read_csv(args.feature_table, sep='\t', header=(args.header_row-1))
	seqs = SeqIO.to_dict(SeqIO.parse(args.fasta_input, "fasta"))

	outputpath = args.outputdir + "/"
	if not os.path.exists(outputpath):
	os.mkdir(outputpath)

	# Setup logging
	if args.log == "":
	args.log = os.path.splitext(os.path.basename(args.fasta_input))[0] + ".log"

	logging.basicConfig(filename=outputpath + args.log, format='%(message)s', level=logging.DEBUG)
	logging.info(datetime.datetime.now().replace(microsecond=0).isoformat() + " " + " ".join(sys.argv))

	# Have some levels of logging go to the screen (https://stackoverflow.com/a/9321890)
	console = logging.StreamHandler()
	if args.verbose:
	console.setLevel(logging.DEBUG)
	else:
	console.setLevel(logging.INFO)
	formatter = logging.Formatter('%(message)s')
	console.setFormatter(formatter)
	logging.getLogger('').addHandler(console)


	firstCol=True

	#Tallies for end of run report
	#Running total number of sequences
	totalCount=0
	#Running total of sequences meeting filtering rules
	includedCount=0
	#Running total of number of samples
	sampleCount=0
	#Running total of number of fasta files output
	fastaCount=0

	if args.min_prop > 0 or args.min_count > 0:
	logging.debug("ID\tTotal seqs\tIncluded seqs")
	else:
	logging.debug("ID\tSequences")

	for col in features.columns:
	if firstCol:
	# Don't process the first column, it has sample names
	firstCol=False
	firstColName=col
	else:
	if os.path.exists(outputpath + col + ".fasta"):
	os.remove(outputpath + col + ".fasta")

	sampleCount += 1

	# Include these sequence if the min_prop and min_count values are met
	feature_df = features[ (features[col] >= (args.min_prop * features[col].sum())) & (features[col] >= args.min_count) & (features[col] != 0) ]

	if args.min_prop > 0 or args.min_count > 0:
	totalCount += features[features[col] != 0].shape[0]
	logging.debug(str(col) + "\t" + str(features[features[col] != 0].shape[0]) + "\t" + str(feature_df.shape[0]))
	else:
	logging.debug(str(col) + "\t" + str(features[features[col] != 0].shape[0]))

	#Output the sequences for this sample to a fasta file
	if feature_df.shape[0] > 0:
	fastaCount += 1
	includedCount += feature_df.shape[0]
	saveSeqs (seqs, col, list(feature_df[firstColName]), list(feature_df[col]), outputpath)

	# Print end of run summary
	logging.info ("Samples: " + str(sampleCount))
	logging.info ("FASTA files output: " + str(fastaCount))
	logging.info ("Sum of all sequences output across FASTA files: " + str(includedCount))
	if args.min_prop > 0 or args.min_count > 0:
	logging.info ("Sum of all filtered out sequences across FASTA files: " + str(totalCount - includedCount))

	if __name__ == '__main__':
	main()