Created
February 10, 2021 22:35
-
-
Save evanroyrees/5e0a9fe9c95b3307657f101fcc2226cb to your computer and use it in GitHub Desktop.
Retrieve ORFs corresponding to markers for each cluster for Autometa v1.0 outputs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import os | |
import glob | |
from Bio import SeqIO | |
import pandas as pd | |
def filter_markers(infpath, cutoffs):
    """Filter markers from hmmscan output table that are above cutoff values.

    Parameters
    ----------
    infpath : str
        </path/to/hmmscan.tsv>
    cutoffs : str
        </path/to/cutoffs.tsv>

    Returns
    -------
    pd.DataFrame
        Markers passing their cutoff thresholds, indexed by "contig" with
        columns ["orf", "sacc", "sname", "score", "cutoff"].

    Raises
    ------
    FileNotFoundError
        `infpath` or `cutoffs` not found
    AssertionError
        No returned markers pass the cutoff thresholds. I.e. final df is empty.
    """
    for fp in [infpath, cutoffs]:
        if not os.path.exists(fp):
            raise FileNotFoundError(fp)
    hmmtab_header = ["sname", "sacc", "orf", "score"]
    col_indices = [0, 1, 2, 5]
    columns = {i: k for i, k in zip(col_indices, hmmtab_header)}
    # raw string for the regex separator avoids an invalid-escape warning
    df = pd.read_csv(
        infpath, sep=r"\s+", usecols=col_indices, header=None, comment="#"
    ).rename(columns=columns)
    # NaN may result from parsing issues while merging parallel results
    df.dropna(inplace=True)
    # hmm accessions are versioned (e.g. PF00001.21) while the cutoffs table
    # is not, so strip the version suffix before merging.
    df["cleaned_sacc"] = df["sacc"].map(lambda acc: acc.split(".")[0])
    dff = pd.read_csv(cutoffs, sep=r"\s+", header=None).rename(
        columns={0: "accession", 1: "cutoff"}
    )
    mdf = pd.merge(df, dff, how="left", left_on="cleaned_sacc", right_on="accession")
    # Keep only hits at or above their marker-specific score cutoff.
    mdf = mdf[mdf["score"] >= mdf["cutoff"]]
    if mdf.empty:
        raise AssertionError("No markers in {} pass cutoff thresholds".format(infpath))
    cols = ["orf", "sacc", "sname", "score", "cutoff"]
    # .copy() so the column assignment below is not a chained-assignment
    # on a view (SettingWithCopyWarning).
    mdf = mdf[cols].copy()
    # ORF ids follow <contig>_<orf-number>; strip the trailing ORF number
    # to recover the contig id.
    mdf["contig"] = mdf["orf"].map(lambda orf: orf.rsplit("_", 1)[0])
    mdf.set_index("contig", inplace=True)
    return mdf
def main():
    """Recover marker-containing ORFs for each cluster fasta and write
    per-cluster ``<cluster>.markers.faa`` files plus two summary tables
    (``clusters_markers.tsv`` and ``all_markers.tsv``) into ``--outdir``.

    Raises
    ------
    FileExistsError
        ``--outdir`` already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hmm-table", help="Autometa output with *hmm.tbl suffix.", required=True,
    )
    parser.add_argument(
        "--cutoffs", help="Kingdom cutoffs table (Bacteria|Archaea).", required=True,
    )
    parser.add_argument(
        "--clusters",
        help="Directory containing fasta files of clusters (will search for *.fasta files)",
        required=True,
    )
    parser.add_argument(
        "--orfs", help="Path to metagenome.fasta", required=True,
    )
    parser.add_argument(
        "--outdir", help="Directory to write each cluster's markers", required=True,
    )
    args = parser.parse_args()
    # Fail fast: validate/create the output directory *before* any expensive
    # parsing so an existing directory does not waste a full pass over the
    # (potentially huge) ORF fasta.
    if os.path.exists(args.outdir):
        raise FileExistsError(
            "Directory: {} already exists! Please specify a different path!".format(
                args.outdir
            )
        )
    os.makedirs(args.outdir)
    # 1. Find markers above cutoff in hmm.tbl
    df = filter_markers(infpath=args.hmm_table, cutoffs=args.cutoffs)
    # index: "contig"
    # cols = ["orf", "sacc", "sname", "score", "cutoff"]
    # 2. Retrieve each cluster fasta file as (filepath, cluster_name) pairs
    search_string = os.path.join(args.clusters, "*.fasta")
    clusters = [
        (filepath, os.path.basename(filepath).replace(".fasta", ""))
        for filepath in glob.glob(search_string)
    ]
    # 3. Retrieve ORF records for all markers above cutoff in a single pass
    # over the metagenome ORFs fasta.
    marker_orfs_set = set(df["orf"])
    marker_orfs = [
        record
        for record in SeqIO.parse(args.orfs, "fasta")
        if record.id in marker_orfs_set
    ]
    # 4. Retrieve the ORFs containing the markers for each cluster
    recovered_clusters = []
    for cluster_filepath, cluster_name in clusters:
        contig_records = [
            record.id for record in SeqIO.parse(cluster_filepath, "fasta")
        ]
        # ORF ids follow <contig>_<orf-number>; match each marker ORF back
        # to its contig to decide cluster membership.
        cluster_contigs = set(contig_records)
        orf_records = [
            orf for orf in marker_orfs if orf.id.rsplit("_", 1)[0] in cluster_contigs
        ]
        # Write out retrieved orfs as cluster.markers.faa
        cluster_markers_outfilepath = os.path.join(
            args.outdir, "{}.markers.faa".format(cluster_name)
        )
        n_written = SeqIO.write(orf_records, cluster_markers_outfilepath, "fasta")
        print(
            "Wrote {} cluster markers to {}".format(
                n_written, cluster_markers_outfilepath
            )
        )
        # Record contig -> cluster assignments for the combined markers table
        recovered_clusters.extend(
            {"contig": record, "cluster": cluster_name} for record in contig_records
        )
    # Write out filtered markers table annotated with cluster identification.
    # Explicit columns= keeps set_index from raising KeyError when no
    # cluster fastas were found.
    dff = pd.DataFrame(
        recovered_clusters, columns=["contig", "cluster"]
    ).set_index("contig")
    # how="right" keeps markers on contigs that were never binned to a cluster
    mdf = pd.merge(dff, df, how="right", left_index=True, right_index=True)
    cluster_markers_table_outfilepath = os.path.join(
        args.outdir, "clusters_markers.tsv"
    )
    mdf.to_csv(cluster_markers_table_outfilepath, sep="\t", index=True, header=True)
    print(
        "Wrote all clusters marker information to {}".format(
            cluster_markers_table_outfilepath
        )
    )
    # Now write all markers information formatted for readability
    all_markers_fpath = os.path.join(args.outdir, "all_markers.tsv")
    df.to_csv(all_markers_fpath, sep="\t", index=True, header=True)
    print("Wrote all markers information to {}".format(all_markers_fpath))
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment