Skip to content

Instantly share code, notes, and snippets.

@sminot
Last active April 4, 2022 20:54
Show Gist options
  • Save sminot/cebfbd84d57406b5b41b2eebffb1789f to your computer and use it in GitHub Desktop.
Save sminot/cebfbd84d57406b5b41b2eebffb1789f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import click
import os
import pandas as pd
# Set up the command line processor
@click.command()
@click.option("--details-hdf5", help="Provide the path to the geneshot output file containing all gene abundances (*.details.hdf5)")
@click.option("--gene-name", help="Name of the gene to extract", prompt="Gene Name")
def geneshot_extract_gene_abund(details_hdf5, gene_name):
assert details_hdf5 is not None, "Must provide path to the geneshot output file containing all gene abundances (*.details.hdf5)"
assert os.path.exists(details_hdf5), f"Cannot file file: {details_hdf5}"
assert gene_name is not None, "Must provide gene name to extract"
# Collect all of the outputs in one place
dat = []
# Open the file
with pd.HDFStore(details_hdf5, "r") as store:
# Iterate over every table in the store
for k in store:
# If this is an abundance table
if k.startswith("/abund/gene/long/"):
# Get the specimen name
specimen = k[len("/abund/gene/long/"):]
# Add it to the data, filtering to the gene of interest
print(f"Reading {k}")
df = pd.read_hdf(store, k)
tot_reads = df['nreads'].sum()
dat.append(df.query(f"id == '{gene_name}'").assign(specimen=specimen, prop_reads=lambda d: d['nreads'] / tot_reads))
# Combine all of the data
df = pd.concat(dat).reset_index(drop=True)
# Write it out as CSV
df.to_csv(f"{gene_name}.geneshot.csv", index=None)
if __name__ == "__main__":
geneshot_extract_gene_abund()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment