sminot/geneshot_extract_gene_abund.py

## geneshot_extract_gene_abund.py
#!/usr/bin/env python3

import click
import os
import pandas as pd

# Set up the command line processor
@click.command()
@click.option("--details-hdf5", help="Provide the path to the geneshot output file containing all gene abundances (*.details.hdf5)")
@click.option("--gene-name", help="Name of the gene to extract", prompt="Gene Name")
def geneshot_extract_gene_abund(details_hdf5, gene_name):

    assert details_hdf5 is not None, "Must provide path to the geneshot output file containing all gene abundances (*.details.hdf5)"
    assert os.path.exists(details_hdf5), f"Cannot file file: {details_hdf5}"

    assert gene_name is not None, "Must provide gene name to extract"

    # Collect all of the outputs in one place
    dat = []

    # Open the file
    with pd.HDFStore(details_hdf5, "r") as store:

        # Iterate over every table in the store
        for k in store:

            # If this is an abundance table
            if k.startswith("/abund/gene/long/"):

                # Get the specimen name
                specimen = k[len("/abund/gene/long/"):]

                # Add it to the data, filtering to the gene of interest
                print(f"Reading {k}")
                df = pd.read_hdf(store, k)
                tot_reads = df['nreads'].sum()
                dat.append(df.query(f"id == '{gene_name}'").assign(specimen=specimen, prop_reads=lambda d: d['nreads'] / tot_reads))

    # Combine all of the data
    df = pd.concat(dat).reset_index(drop=True)

    # Write it out as CSV
    df.to_csv(f"{gene_name}.geneshot.csv", index=None)


if __name__ == "__main__":

    geneshot_extract_gene_abund()
	#!/usr/bin/env python3

	import click
	import os
	import pandas as pd

	# Set up the command line processor
	@click.command()
	@click.option("--details-hdf5", help="Provide the path to the geneshot output file containing all gene abundances (*.details.hdf5)")
	@click.option("--gene-name", help="Name of the gene to extract", prompt="Gene Name")
	def geneshot_extract_gene_abund(details_hdf5, gene_name):

	assert details_hdf5 is not None, "Must provide path to the geneshot output file containing all gene abundances (*.details.hdf5)"
	assert os.path.exists(details_hdf5), f"Cannot file file: {details_hdf5}"

	assert gene_name is not None, "Must provide gene name to extract"

	# Collect all of the outputs in one place
	dat = []

	# Open the file
	with pd.HDFStore(details_hdf5, "r") as store:

	# Iterate over every table in the store
	for k in store:

	# If this is an abundance table
	if k.startswith("/abund/gene/long/"):

	# Get the specimen name
	specimen = k[len("/abund/gene/long/"):]

	# Add it to the data, filtering to the gene of interest
	print(f"Reading {k}")
	df = pd.read_hdf(store, k)
	tot_reads = df['nreads'].sum()
	dat.append(df.query(f"id == '{gene_name}'").assign(specimen=specimen, prop_reads=lambda d: d['nreads'] / tot_reads))

	# Combine all of the data
	df = pd.concat(dat).reset_index(drop=True)

	# Write it out as CSV
	df.to_csv(f"{gene_name}.geneshot.csv", index=None)


	if __name__ == "__main__":

	geneshot_extract_gene_abund()