Last active
March 17, 2022 16:39
-
-
Save evanroyrees/3b563960857b2fe81ab51d1423c8ec10 to your computer and use it in GitHub Desktop.
Alluvial plot generation of Autometa taxonomy information
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
# Alluvial plot generation of taxonomy information
## Setup Env
First, create a conda environment in which to run the script:
```bash
conda create -n plotly -c plotly -c conda-forge plotly python-kaleido pandas tqdm -y
```
## Run script
```bash
python taxonomy_alluvial_plot.py --taxonomy taxonomy.tsv
```
""" | |
import argparse | |
import logging | |
import pandas as pd | |
import plotly.graph_objects as go | |
from tqdm import tqdm | |
# Configure the root logger at INFO level so the progress messages logged
# while building the plot are emitted.
logging.basicConfig(level=logging.INFO)
def add_canonical_rank_prefix_to_rank_names(df):
    """Return a copy of `df` whose canonical-rank values carry a rank prefix.

    Missing values are replaced with "unclassified" across the whole copy,
    then every canonical rank column present in `df` has each value rewritten
    as "<prefix>_<value>". The prefix is the first letter of the rank name,
    except superkingdom, which uses "d". Prefixing keeps identical names at
    different ranks distinct, so the alluvial plot does not contain cycles.
    The input frame is not modified.
    """
    prefixed = df.copy()
    prefixed.fillna("unclassified", inplace=True)
    canonical_ranks = (
        "superkingdom",
        "phylum",
        "class",
        "order",
        "family",
        "genus",
        "species",
    )
    for rank in canonical_ranks:
        # Rank columns absent from the input are skipped rather than created.
        if rank not in df:
            continue
        prefix = "d" if rank == "superkingdom" else rank[0]
        prefixed[rank] = prefixed[rank].map(
            lambda name, prefix=prefix: f"{prefix}_{name}"
        )
    return prefixed
def alluvial_plot(
    df: pd.DataFrame,
    ranks: list = None,
    n_ranks=7,
    out: str = None,
    title_text: str = "Taxonomy Alluvial Plot",
    image_width: int = 1920,
    image_height: int = 1080,
) -> go.Figure:
    """Generate Sankey diagram (alluvial plot) using canonical rank (`ranks`) taxonomies and write to `out`

    Parameters
    ----------
    df : pd.DataFrame
        Taxonomy table with one column per rank in `ranks[:n_ranks]`.
    ranks : list, optional
        Rank column names ordered from broadest to most specific. Defaults to
        superkingdom, phylum, class, order, family, genus, species.
    n_ranks : int, optional
        Number of leading entries of `ranks` to include in the plot.
    out : str, optional
        Path passed to `fig.write_image`. Nothing is written when falsy.
    title_text : str, optional
        Title of the figure.
    image_width : int, optional
        Width passed to `fig.write_image`.
    image_height : int, optional
        Height passed to `fig.write_image`.

    Returns
    -------
    go.Figure
        Sankey Figure generated using taxonomy metadata

    Raises
    ------
    KeyError
        If any of the selected rank columns is missing from `df`.
    """
    if ranks is None:
        # Built per call so the default is never shared between invocations.
        ranks = [
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ]
    selected_ranks = list(ranks[:n_ranks])
    missing = [rank for rank in selected_ranks if rank not in df]
    if missing:
        raise KeyError(
            f"Taxonomy table is missing rank column(s): {', '.join(missing)}"
        )
    df = add_canonical_rank_prefix_to_rank_names(df)

    # One node per distinct (prefixed) taxon name, in rank order.
    label = []
    for rank in selected_ranks:
        label.extend(df[rank].unique().tolist())
    # Map each name to the index of its first occurrence in `label`, so link
    # endpoints can be resolved without rescanning the list.
    node_index = {}
    for position, name in enumerate(label):
        node_index.setdefault(name, position)

    source = []
    target = []
    value = []
    logging.info(
        f"Generating alluvial plot for {len(selected_ranks)} canonical ranks"
    )
    progress = tqdm(selected_ranks, desc="Creating alluvial connections")
    for rank_i, rank in enumerate(progress):
        next_rank_i = rank_i + 1
        if next_rank_i >= len(selected_ranks):
            # The most specific selected rank has no child rank to link to.
            continue
        next_rank = selected_ranks[next_rank_i]
        for rank_name, dff in df.groupby(rank):
            source_index = node_index[rank_name]
            # Number of rows per child taxon within this parent taxon.
            child_counts = dff[next_rank].value_counts()
            # Iterate in order of first appearance so link order is stable.
            for rank_n in dff[next_rank].unique():
                source.append(source_index)
                target.append(node_index[rank_n])
                value.append(int(child_counts[rank_n]))

    fig = go.Figure(
        data=[
            go.Sankey(
                node=dict(
                    pad=8,
                    thickness=13,
                    line=dict(width=0.3),
                    label=label,
                ),
                link=dict(
                    source=source,
                    target=target,
                    value=value,
                ),
            )
        ]
    )
    fig.update_layout(title_text=title_text, font_size=6)
    if out:
        fig.write_image(out, width=image_width, height=image_height)
        logging.info(f"Wrote alluvial plot to {out}")
    return fig
def _build_cli_parser():
    """Create the command-line parser for the alluvial plot script."""
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    cli.add_argument("--taxonomy", required=True, help="Path to taxonomy.tsv file")
    cli.add_argument(
        "--n-ranks",
        type=int,
        choices=[2, 3, 4, 5, 6, 7],
        default=7,
        required=False,
        help="Number of canonical ranks to visualize in order from superkingdom to species",
    )
    cli.add_argument(
        "--title",
        default="Taxonomy Alluvial Plot",
        required=False,
        help="Title for alluvial plot",
    )
    cli.add_argument(
        "--plot-width", type=int, default=1920, required=False, help="width of plot image"
    )
    cli.add_argument(
        "--plot-height", type=int, default=1080, required=False, help="height of plot image"
    )
    cli.add_argument(
        "--out",
        required=False,
        help="""
        Path to write alluvial plot.
        NOTE: This requires an appropriate file extension.
        Available extensions are: .png, .jpg, .jpeg, .svg and .pdf (pdf requires python-kaleido)
        """,
    )
    cli.add_argument(
        "--no-show",
        action="store_true",
        help="Do NOT show alluvial plot after it is generated",
    )
    return cli


def main():
    """Parse CLI arguments, build the alluvial plot and optionally display it.

    The taxonomy table is read as tab-separated text indexed by its "contig"
    column. The figure is written to `--out` when that option is given and is
    shown unless `--no-show` is passed.
    """
    options = _build_cli_parser().parse_args()

    # Load the taxonomy table.
    taxonomy_table = pd.read_csv(options.taxonomy, sep="\t", index_col="contig")

    # Build the Sankey diagram (alluvial plot); writing to disk happens inside.
    figure = alluvial_plot(
        df=taxonomy_table,
        n_ranks=options.n_ranks,
        title_text=options.title,
        image_width=options.plot_width,
        image_height=options.plot_height,
        out=options.out,
    )

    if options.no_show:
        return
    figure.show()
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Contents of
environment.yml
environment.yml
fileplotly
environment