Skip to content

Instantly share code, notes, and snippets.

@evanroyrees
Created February 10, 2021 22:35
Show Gist options
  • Save evanroyrees/5e0a9fe9c95b3307657f101fcc2226cb to your computer and use it in GitHub Desktop.
Save evanroyrees/5e0a9fe9c95b3307657f101fcc2226cb to your computer and use it in GitHub Desktop.
Retrieve ORFs corresponding to markers for each cluster for Autometa v1.0 outputs
#!/usr/bin/env python
import argparse
import os
import glob
from Bio import SeqIO
import pandas as pd
def filter_markers(infpath, cutoffs):
    """Filter markers from hmmscan output table that are above cutoff values.

    Parameters
    ----------
    infpath : str
        </path/to/hmmscan.tsv> — whitespace-delimited, '#' comment lines.
    cutoffs : str
        </path/to/cutoffs.tsv> — two whitespace-delimited columns:
        accession, cutoff score.

    Returns
    -------
    pd.DataFrame
        Markers passing their cutoffs, indexed by "contig", with columns
        ["orf", "sacc", "sname", "score", "cutoff"].

    Raises
    ------
    FileNotFoundError
        `infpath` or `cutoffs` not found
    AssertionError
        No returned markers pass the cutoff thresholds. I.e. final df is empty.
    """
    for fpath in (infpath, cutoffs):
        if not os.path.exists(fpath):
            raise FileNotFoundError(fpath)
    # Only subject name, accession, ORF id and full-sequence score (cols
    # 0, 1, 2, 5 of the hmmscan table) are needed.
    hmmtab_header = ["sname", "sacc", "orf", "score"]
    col_indices = [0, 1, 2, 5]
    columns = dict(zip(col_indices, hmmtab_header))
    # NOTE: raw string r"\s+" — "\s" in a plain string is an invalid escape.
    df = pd.read_csv(
        infpath, sep=r"\s+", usecols=col_indices, header=None, comment="#"
    ).rename(columns=columns)
    # NaN may result from parsing issues while merging parallel results
    df.dropna(inplace=True)
    # Drop the accession version suffix (e.g. "PF00001.20" -> "PF00001")
    # so accessions line up with the cutoffs table.
    df["cleaned_sacc"] = df["sacc"].map(lambda acc: acc.split(".")[0])
    dff = pd.read_csv(cutoffs, sep=r"\s+", header=None).rename(
        columns={0: "accession", 1: "cutoff"}
    )
    mdf = pd.merge(df, dff, how="left", left_on="cleaned_sacc", right_on="accession")
    # Keep only hits scoring at or above their marker-specific cutoff.
    mdf = mdf[mdf["score"] >= mdf["cutoff"]]
    if mdf.empty:
        raise AssertionError("No markers in {} pass cutoff thresholds".format(infpath))
    mdf = mdf[["orf", "sacc", "sname", "score", "cutoff"]]
    # ORF ids are "<contig>_<orf-number>"; recover the contig for indexing.
    mdf["contig"] = mdf["orf"].map(lambda orf: orf.rsplit("_", 1)[0])
    mdf.set_index("contig", inplace=True)
    return mdf
def main():
    """Entry point: write each cluster's marker-containing ORFs plus two
    summary tables (clusters_markers.tsv, all_markers.tsv) into --outdir."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hmm-table", help="Autometa output with *hmm.tbl suffix.", required=True,
    )
    parser.add_argument(
        "--cutoffs", help="Kingdom cutoffs table (Bacteria|Archaea).", required=True,
    )
    parser.add_argument(
        "--clusters",
        help="Directory containing fasta files of clusters (will search for *.fasta files)",
        required=True,
    )
    parser.add_argument(
        "--orfs", help="Path to metagenome.fasta", required=True,
    )
    parser.add_argument(
        "--outdir", help="Directory to write each cluster's markers", required=True,
    )
    args = parser.parse_args()
    # 1. Find markers above cutoff in hmm.tbl
    # (DataFrame indexed by "contig"; cols: orf, sacc, sname, score, cutoff)
    df = filter_markers(infpath=args.hmm_table, cutoffs=args.cutoffs)
    # 2. Create output directory up-front so we fail fast, *before* the
    # potentially slow metagenome ORF parsing below.
    if os.path.exists(args.outdir):
        raise FileExistsError(
            "Directory: {} already exists! Please specify a different path!".format(
                args.outdir
            )
        )
    os.makedirs(args.outdir)
    # 3. Retrieve each cluster fasta file as (filepath, cluster-name) pairs
    search_string = os.path.join(args.clusters, "*.fasta")
    clusters = [
        (filepath, os.path.basename(filepath).replace(".fasta", ""))
        for filepath in glob.glob(search_string)
    ]
    # 4. Retrieve ORF records for all markers above cutoff
    marker_orfs_set = set(df["orf"])
    marker_orfs = [
        record
        for record in SeqIO.parse(args.orfs, "fasta")
        if record.id in marker_orfs_set
    ]
    # 5. Retrieve and write the ORFs containing the markers for each cluster
    recovered_clusters = []
    for cluster_filepath, cluster_name in clusters:
        contig_records = [
            record.id for record in SeqIO.parse(cluster_filepath, "fasta")
        ]
        # ORF ids are "<contig>_<num>"; keep ORFs on this cluster's contigs.
        cluster_contigs = set(contig_records)
        orf_records = [
            orf for orf in marker_orfs if orf.id.rsplit("_", 1)[0] in cluster_contigs
        ]
        # Write out retrieved orfs as cluster.markers.faa
        cluster_markers_outfilepath = os.path.join(
            args.outdir, "{}.markers.faa".format(cluster_name)
        )
        n_written = SeqIO.write(orf_records, cluster_markers_outfilepath, "fasta")
        print(
            "Wrote {} cluster markers to {}".format(
                n_written, cluster_markers_outfilepath
            )
        )
        # Record contig -> cluster assignments for the combined markers table.
        recovered_clusters.extend(
            {"contig": record, "cluster": cluster_name} for record in contig_records
        )
    # 6. Write out filtered markers table with cluster identification.
    dff = pd.DataFrame(recovered_clusters).set_index("contig")
    # right-join keeps every marker, even on contigs absent from all clusters
    mdf = pd.merge(dff, df, how="right", left_index=True, right_index=True)
    cluster_markers_table_outfilepath = os.path.join(
        args.outdir, "clusters_markers.tsv"
    )
    mdf.to_csv(cluster_markers_table_outfilepath, sep="\t", index=True, header=True)
    print(
        "Wrote all clusters marker information to {}".format(
            cluster_markers_table_outfilepath
        )
    )
    # 7. Now write all markers information formatted for readability
    all_markers_fpath = os.path.join(args.outdir, "all_markers.tsv")
    df.to_csv(all_markers_fpath, sep="\t", index=True, header=True)
    print("Wrote all clusters marker information to {}".format(all_markers_fpath))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment