Created
February 10, 2021 22:35
-
-
Save evanroyrees/5e0a9fe9c95b3307657f101fcc2226cb to your computer and use it in GitHub Desktop.
Retrieve ORFs corresponding to markers for each cluster for Autometa v1.0 outputs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import os | |
import glob | |
from Bio import SeqIO | |
import pandas as pd | |
def filter_markers(infpath, cutoffs):
    """Filter markers from hmmscan output table that are above cutoff values.

    Parameters
    ----------
    infpath : str
        </path/to/hmmscan.tsv>
    cutoffs : str
        </path/to/cutoffs.tsv>

    Returns
    -------
    pd.DataFrame
        Markers passing their cutoff thresholds, indexed by "contig" with
        columns ["orf", "sacc", "sname", "score", "cutoff"].

    Raises
    ------
    FileNotFoundError
        `infpath` or `cutoffs` not found
    AssertionError
        No returned markers pass the cutoff thresholds. I.e. final df is empty.
    """
    for fp in [infpath, cutoffs]:
        if not os.path.exists(fp):
            raise FileNotFoundError(fp)
    hmmtab_header = ["sname", "sacc", "orf", "score"]
    col_indices = [0, 1, 2, 5]
    columns = {i: k for i, k in zip(col_indices, hmmtab_header)}
    # raw string for the regex separator avoids an invalid-escape warning
    df = pd.read_csv(
        infpath, sep=r"\s+", usecols=col_indices, header=None, comment="#"
    ).rename(columns=columns)
    # NaN may result from parsing issues while merging parallel results
    df.dropna(inplace=True)
    # hmm accessions are versioned (e.g. PF00001.21) while the cutoffs table
    # is not, so strip the version suffix before merging.
    df["cleaned_sacc"] = df["sacc"].map(lambda acc: acc.split(".")[0])
    dff = pd.read_csv(cutoffs, sep=r"\s+", header=None).rename(
        columns={0: "accession", 1: "cutoff"}
    )
    mdf = pd.merge(df, dff, how="left", left_on="cleaned_sacc", right_on="accession")
    # Keep only hits at or above their marker-specific score cutoff.
    mdf = mdf[mdf["score"] >= mdf["cutoff"]]
    if mdf.empty:
        raise AssertionError("No markers in {} pass cutoff thresholds".format(infpath))
    cols = ["orf", "sacc", "sname", "score", "cutoff"]
    # .copy() so the column assignment below is not a chained-assignment
    # on a view (SettingWithCopyWarning).
    mdf = mdf[cols].copy()
    # ORF ids follow <contig>_<orf-number>; strip the trailing ORF number
    # to recover the contig id.
    mdf["contig"] = mdf["orf"].map(lambda orf: orf.rsplit("_", 1)[0])
    mdf.set_index("contig", inplace=True)
    return mdf
def main():
    """Recover marker-containing ORFs for each cluster fasta and write
    per-cluster ``<cluster>.markers.faa`` files plus two summary tables
    (``clusters_markers.tsv`` and ``all_markers.tsv``) into ``--outdir``.

    Raises
    ------
    FileExistsError
        ``--outdir`` already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hmm-table", help="Autometa output with *hmm.tbl suffix.", required=True,
    )
    parser.add_argument(
        "--cutoffs", help="Kingdom cutoffs table (Bacteria|Archaea).", required=True,
    )
    parser.add_argument(
        "--clusters",
        help="Directory containing fasta files of clusters (will search for *.fasta files)",
        required=True,
    )
    parser.add_argument(
        "--orfs", help="Path to metagenome.fasta", required=True,
    )
    parser.add_argument(
        "--outdir", help="Directory to write each cluster's markers", required=True,
    )
    args = parser.parse_args()
    # Fail fast: validate/create the output directory *before* any expensive
    # parsing so an existing directory does not waste a full pass over the
    # (potentially huge) ORF fasta.
    if os.path.exists(args.outdir):
        raise FileExistsError(
            "Directory: {} already exists! Please specify a different path!".format(
                args.outdir
            )
        )
    os.makedirs(args.outdir)
    # 1. Find markers above cutoff in hmm.tbl
    df = filter_markers(infpath=args.hmm_table, cutoffs=args.cutoffs)
    # index: "contig"
    # cols = ["orf", "sacc", "sname", "score", "cutoff"]
    # 2. Retrieve each cluster fasta file as (filepath, cluster_name) pairs
    search_string = os.path.join(args.clusters, "*.fasta")
    clusters = [
        (filepath, os.path.basename(filepath).replace(".fasta", ""))
        for filepath in glob.glob(search_string)
    ]
    # 3. Retrieve ORF records for all markers above cutoff in a single pass
    # over the metagenome ORFs fasta.
    marker_orfs_set = set(df["orf"])
    marker_orfs = [
        record
        for record in SeqIO.parse(args.orfs, "fasta")
        if record.id in marker_orfs_set
    ]
    # 4. Retrieve the ORFs containing the markers for each cluster
    recovered_clusters = []
    for cluster_filepath, cluster_name in clusters:
        contig_records = [
            record.id for record in SeqIO.parse(cluster_filepath, "fasta")
        ]
        # ORF ids follow <contig>_<orf-number>; match each marker ORF back
        # to its contig to decide cluster membership.
        cluster_contigs = set(contig_records)
        orf_records = [
            orf for orf in marker_orfs if orf.id.rsplit("_", 1)[0] in cluster_contigs
        ]
        # Write out retrieved orfs as cluster.markers.faa
        cluster_markers_outfilepath = os.path.join(
            args.outdir, "{}.markers.faa".format(cluster_name)
        )
        n_written = SeqIO.write(orf_records, cluster_markers_outfilepath, "fasta")
        print(
            "Wrote {} cluster markers to {}".format(
                n_written, cluster_markers_outfilepath
            )
        )
        # Record contig -> cluster assignments for the combined markers table
        recovered_clusters.extend(
            {"contig": record, "cluster": cluster_name} for record in contig_records
        )
    # Write out filtered markers table annotated with cluster identification.
    # Explicit columns= keeps set_index from raising KeyError when no
    # cluster fastas were found.
    dff = pd.DataFrame(
        recovered_clusters, columns=["contig", "cluster"]
    ).set_index("contig")
    # how="right" keeps markers on contigs that were never binned to a cluster
    mdf = pd.merge(dff, df, how="right", left_index=True, right_index=True)
    cluster_markers_table_outfilepath = os.path.join(
        args.outdir, "clusters_markers.tsv"
    )
    mdf.to_csv(cluster_markers_table_outfilepath, sep="\t", index=True, header=True)
    print(
        "Wrote all clusters marker information to {}".format(
            cluster_markers_table_outfilepath
        )
    )
    # Now write all markers information formatted for readability
    all_markers_fpath = os.path.join(args.outdir, "all_markers.tsv")
    df.to_csv(all_markers_fpath, sep="\t", index=True, header=True)
    print("Wrote all markers information to {}".format(all_markers_fpath))
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment