Sam Minot sminot

## download_refseq.sh
#!/bin/bash

set -e

rsync \
	-a \
	-m \
	--copy-links \
	--recursive \
	--times \

## geneshot_extract_gene_abund.py
#!/usr/bin/env python3

import click
import os
import pandas as pd

# Set up the command line processor
@click.command()
@click.option("--details-hdf5", help="Provide the path to the geneshot output file containing all gene abundances (*.details.hdf5)")
@click.option("--gene-name", help="Name of the gene to extract", prompt="Gene Name")

## geneshot_extract_gene_annot.py
#!/usr/bin/env python3

import click
import os
import pandas as pd

# Set up the command line processor
@click.command()
@click.option("--input_fp", help="Provide the path to the geneshot output file containing all gene abundances (*.results.hdf5)")
def geneshot_extract_gene_annot(input_fp):

## print_nextflow_logs.sh
#!/bin/bash

set -euo pipefail

WORK_DIR="${1}"

# Input is the short task id prefix
TASK_PREFIX="${2}"

# Get the complete suffix

## merge_metaphlan_tables.py
#!/usr/bin/env python3

import os
import pandas as pd
import sys


def read_metaphlan(fp):
    """Function to read a single file with metaphlan2 results encoded as a TSV."""

## fetch_kegg_genes.py
#!/usr/bin/env python3

import requests
from random import shuffle

# Argument parsing code courtesy of @wasade
import click


# Function to fetch the amino acid sequence for a single gene

## parse_fasta.py
from functools import lru_cache
import os
import pandas as pd

@lru_cache(maxsize=16)
def read_fasta(fasta_fp):
    assert os.path.exists(fasta_fp)

    fasta = {}
    header = None

## parse_gff.py
from functools import lru_cache
import pandas as pd

@lru_cache(maxsize=128)
def parse_gff(gff_fp):
    df = pd.read_csv(
        gff_fp,
        sep="\t",
        comment="#",
        header=None,

## print_cloudwatch_logs_aws_batch.py
#!/usr/bin/env python3

import boto3
import argparse
from datetime import datetime

parser = argparse.ArgumentParser()
parser.add_argument("job_id")

# Add the arguments

## plot_geneshot_cag_summary.py
# Plot the distribution of CAG sizes
def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
    cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")

    # Calculate the log10 size (number of genes per CAG)
    cag_annot = cag_annot.assign(
        size_log10 = cag_annot["size"].apply(np.log10)
    )

    # Filter by CAG size
	#!/bin/bash

	set -e

	rsync \
	-a \
	-m \
	--copy-links \
	--recursive \
	--times \
	#!/usr/bin/env python3

	import click
	import os
	import pandas as pd

	# Set up the command line processor
	@click.command()
	@click.option("--details-hdf5", help="Provide the path to the geneshot output file containing all gene abundances (*.details.hdf5)")
	@click.option("--gene-name", help="Name of the gene to extract", prompt="Gene Name")
	#!/bin/bash

	set -euo pipefail

	WORK_DIR="${1}"

	# Input is the short task id prefix
	TASK_PREFIX="${2}"

	# Get the complete suffix
	#!/usr/bin/env python3

	import requests
	from random import shuffle

	# Argument parsing code courtesy of @wasade
	import click


	# Function to fetch the amino acid sequence for a single gene
	from functools import lru_cache
	import os
	import pandas as pd

	@lru_cache(maxsize=16)
	def read_fasta(fasta_fp):
	assert os.path.exists(fasta_fp)

	fasta = {}
	header = None
	from functools import lru_cache
	import pandas as pd

	@lru_cache(maxsize=128)
	def parse_gff(gff_fp):
	df = pd.read_csv(
	gff_fp,
	sep="\t",
	comment="#",
	header=None,
	#!/usr/bin/env python3

	import boto3
	import argparse
	from datetime import datetime

	parser = argparse.ArgumentParser()
	parser.add_argument("job_id")

	# Add the arguments
	# Plot the distribution of CAG sizes
	def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
	cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")

	# Calculate the log10 size (number of genes per CAG)
	cag_annot = cag_annot.assign(
	size_log10 = cag_annot["size"].apply(np.log10)
	)

	# Filter by CAG size