Skip to content

Instantly share code, notes, and snippets.

@evanroyrees
Last active March 17, 2022 16:39
Show Gist options
  • Save evanroyrees/3b563960857b2fe81ab51d1423c8ec10 to your computer and use it in GitHub Desktop.
Save evanroyrees/3b563960857b2fe81ab51d1423c8ec10 to your computer and use it in GitHub Desktop.
Alluvial plot generation of Autometa taxonomy information
#!/usr/bin/env python
"""
# Alluvial plot generation of taxonomy information
## Setup Env
First, create the conda environment needed to run the script:
```bash
conda create -n plotly -c plotly -c conda-forge plotly python-kaleido pandas tqdm -y
```
## Run script
```bash
python taxonomy_alluvial_plot.py --taxonomy taxonomy.tsv
```
"""
import argparse
import logging
import pandas as pd
import plotly.graph_objects as go
from tqdm import tqdm
# Configure the root logger at INFO level so the logging.info() progress
# messages emitted while building and writing the plot are displayed.
logging.basicConfig(level=logging.INFO)
def add_canonical_rank_prefix_to_rank_names(df):
    """Return a copy of `df` whose canonical rank names carry a rank prefix.

    Missing values are replaced with "unclassified" and every value in a
    canonical rank column is rewritten as ``<prefix>_<name>``. The prefix is
    the first letter of the rank, except superkingdom which uses ``d`` so it
    does not collide with species. Prefixing keeps identical names that occur
    at different ranks distinct, so the alluvial plot does not contain cycles.

    Parameters
    ----------
    df : pd.DataFrame
        Taxonomy table; canonical rank columns that are absent are skipped.

    Returns
    -------
    pd.DataFrame
        New table with prefixed rank names; `df` itself is left unmodified.
    """
    canonical_ranks = (
        "superkingdom",
        "phylum",
        "class",
        "order",
        "family",
        "genus",
        "species",
    )
    prefixed = df.fillna("unclassified")
    for rank in canonical_ranks:
        if rank not in df:
            continue
        prefix = "d" if rank == "superkingdom" else rank[0]
        # Bind the prefix as a default so the callable does not depend on the loop variable
        prefixed[rank] = prefixed[rank].map(
            lambda name, prefix=prefix: f"{prefix}_{name}"
        )
    return prefixed
def alluvial_plot(
    df: pd.DataFrame,
    ranks: list = None,
    n_ranks=7,
    out: str = None,
    title_text: str = "Taxonomy Alluvial Plot",
    image_width: int = 1920,
    image_height: int = 1080,
) -> go.Figure:
    """Generate Sankey diagram (alluvial plot) using canonical rank (`ranks`) taxonomies and write to `out`

    Parameters
    ----------
    df : pd.DataFrame
        Taxonomy table with one column per rank to plot.
    ranks : list, optional
        Rank column names ordered from broadest to most specific. Defaults to
        superkingdom, phylum, class, order, family, genus, species.
    n_ranks : int, optional
        Number of leading entries of `ranks` to include in the plot.
    out : str, optional
        Path of the image to write. Nothing is written when not provided.
    title_text : str, optional
        Title of the figure.
    image_width : int, optional
        Width passed to `write_image` when `out` is given.
    image_height : int, optional
        Height passed to `write_image` when `out` is given.

    Returns
    -------
    go.Figure
        Sankey Figure generated using taxonomy metadata

    Raises
    ------
    KeyError
        If any of the plotted ranks is not a column of `df`.
    """
    if ranks is None:
        ranks = [
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ]
    plotted_ranks = list(ranks[:n_ranks])
    missing_ranks = [rank for rank in plotted_ranks if rank not in df.columns]
    if missing_ranks:
        raise KeyError(
            f"Taxonomy table is missing rank column(s): {', '.join(missing_ranks)}"
        )
    df = add_canonical_rank_prefix_to_rank_names(df)

    # Node labels: the unique names of every plotted rank, in rank order.
    label = []
    for rank in plotted_ranks:
        label.extend(df[rank].unique().tolist())
    # Name -> node position (first occurrence), replacing repeated list scans.
    node_index = {}
    for position, name in enumerate(label):
        node_index.setdefault(name, position)

    source = []
    target = []
    value = []
    logging.info(f"Generating alluvial plot for {len(plotted_ranks)} canonical ranks")
    for rank_i, rank in enumerate(
        tqdm(plotted_ranks, desc="Creating alluvial connections")
    ):
        next_rank_i = rank_i + 1
        if next_rank_i >= len(plotted_ranks):
            # The most specific plotted rank has no child rank to link to.
            continue
        next_rank = plotted_ranks[next_rank_i]
        for rank_name, dff in df.groupby(rank):
            source_index = node_index[rank_name]
            # Rows per child name within this parent, computed in one pass.
            child_counts = dff[next_rank].value_counts(sort=False)
            for child_name in dff[next_rank].unique():
                # Only the link lists grow here; `label` must hold node names
                # exclusively, otherwise spurious nodes are added to the plot.
                source.append(source_index)
                target.append(node_index[child_name])
                value.append(int(child_counts[child_name]))

    fig = go.Figure(
        data=[
            go.Sankey(
                node=dict(
                    pad=8,
                    thickness=13,
                    line=dict(width=0.3),
                    label=label,
                ),
                link=dict(
                    source=source,
                    target=target,
                    value=value,
                ),
            )
        ]
    )
    fig.update_layout(title_text=title_text, font_size=6)
    if out:
        fig.write_image(out, width=image_width, height=image_height)
        logging.info(f"Wrote alluvial plot to {out}")
    return fig
def main():
    """Command-line entry point.

    Parses the options, reads the tab-separated taxonomy table (indexed by its
    `contig` column), builds the alluvial plot (written to `--out` when given)
    and displays the figure unless `--no-show` is passed.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    cli.add_argument("--taxonomy", required=True, help="Path to taxonomy.tsv file")
    cli.add_argument(
        "--n-ranks",
        type=int,
        choices=[2, 3, 4, 5, 6, 7],
        default=7,
        required=False,
        help="Number of canonical ranks to visualize in order from superkingdom to species",
    )
    cli.add_argument(
        "--title",
        default="Taxonomy Alluvial Plot",
        required=False,
        help="Title for alluvial plot",
    )
    cli.add_argument(
        "--plot-width",
        type=int,
        default=1920,
        required=False,
        help="width of plot image",
    )
    cli.add_argument(
        "--plot-height",
        type=int,
        default=1080,
        required=False,
        help="height of plot image",
    )
    cli.add_argument(
        "--out",
        required=False,
        help=(
            "\n"
            "Path to write alluvial plot.\n"
            "NOTE: This requires an appropriate file extension.\n"
            "Available extensions are: .png, .jpg, .jpeg, .svg and .pdf (pdf requires python-kaleido)\n"
        ),
    )
    cli.add_argument(
        "--no-show",
        action="store_true",
        help="Do NOT show alluvial plot after it is generated",
    )
    opts = cli.parse_args()

    # Load the taxonomy table, keyed by contig
    taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col="contig")

    # Build the Sankey figure (also writes the image when --out is given)
    figure = alluvial_plot(
        df=taxonomy,
        n_ranks=opts.n_ranks,
        out=opts.out,
        title_text=opts.title,
        image_width=opts.plot_width,
        image_height=opts.plot_height,
    )

    # Display the figure unless the user opted out
    if opts.no_show:
        return
    figure.show()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
@evanroyrees
Copy link
Author

evanroyrees commented Mar 17, 2022

NOTE

Writing to an image (e.g. --out image.png) requires an appropriate file extension (see the list below).

Available image file extensions

  • image.png
  • image.jpg
  • image.jpeg
  • image.pdf (requires python-kaleido)
  • image.svg

@evanroyrees
Copy link
Author

evanroyrees commented Mar 17, 2022

Contents of environment.yml

name: plotly
channels:
  - plotly
  - conda-forge
  - defaults
dependencies:
  - plotly
  - black
  - pandas
  - tqdm
  - python-kaleido
  1. Copy/Paste contents above into environment.yml file
  2. Create plotly environment
conda env create -f=environment.yml
  3. Activate environment:
conda activate plotly
  4. Download this gist
wget https://gist.githubusercontent.com/WiscEvan/3b563960857b2fe81ab51d1423c8ec10/raw/396884eafc79e95ea1dd5a8c4bb061a721844bca/taxonomy_alluvial_plot.py
  5. Run script:
python taxonomy_alluvial_plot.py --taxonomy taxonomy.tsv

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment