Skip to content

Instantly share code, notes, and snippets.

View sminot's full-sized avatar

Sam Minot sminot

View GitHub Profile
@sminot
sminot / download_refseq.sh
Created June 6, 2022 20:26
Download from NCBI RefSeq
#!/bin/bash
set -e
rsync \
-a \
-m \
--copy-links \
--recursive \
--times \
#!/usr/bin/env python3
import click
import os
import pandas as pd
# Set up the command line processor
@click.command()
@click.option("--details-hdf5", help="Provide the path to the geneshot output file containing all gene abundances (*.details.hdf5)")
@click.option("--gene-name", help="Name of the gene to extract", prompt="Gene Name")
@sminot
sminot / geneshot_extract_gene_annot.py
Created April 4, 2022 19:48
Extract the gene annotation table from the geneshot results HDF5
#!/usr/bin/env python3
import click
import os
import pandas as pd
# Set up the command line processor
@click.command()
@click.option("--input_fp", help="Provide the path to the geneshot output file containing all gene abundances (*.results.hdf5)")
def geneshot_extract_gene_annot(input_fp):
@sminot
sminot / print_nextflow_logs.sh
Created January 14, 2022 18:26
Print the log file from a Nextflow task using its short hash
#!/bin/bash
set -euo pipefail
WORK_DIR="${1}"
# Input is the short task id prefix
TASK_PREFIX="${2}"
# Get the complete suffix
@sminot
sminot / merge_metaphlan_tables.py
Created March 2, 2021 00:30
Merge data from multiple files containing MetaPhlAn2 outputs
#!/usr/bin/env python3
import os
import pandas as pd
import sys
def read_metaphlan(fp):
"""Function to read a single file with metaphlan2 results encoded as a TSV."""
@sminot
sminot / fetch_kegg_genes.py
Last active December 2, 2020 12:24
Fetch gene sequences from KEGG
#!/usr/bin/env python3
import requests
from random import shuffle
# Argument parsing code courtesy of @wasade
import click
# Function to fetch the amino acid sequence for a single gene
@sminot
sminot / parse_fasta.py
Created August 13, 2020 16:35
Parse FASTA
from functools import lru_cache
import os
import pandas as pd
@lru_cache(maxsize=16)
def read_fasta(fasta_fp):
assert os.path.exists(fasta_fp)
fasta = {}
header = None
from functools import lru_cache
import pandas as pd
@lru_cache(maxsize=128)
def parse_gff(gff_fp):
df = pd.read_csv(
gff_fp,
sep="\t",
comment="#",
header=None,
@sminot
sminot / print_cloudwatch_logs_aws_batch.py
Last active June 15, 2023 16:46
Print CloudWatch logs for an AWS Batch job
#!/usr/bin/env python3
import boto3
import argparse
from datetime import datetime
parser = argparse.ArgumentParser()
parser.add_argument("job_id")
# Add the arguments
@sminot
sminot / plot_geneshot_cag_summary.py
Created June 26, 2020 15:29
Plot CAG summary figures from geneshot results HDF5
# Plot the distribution of CAG sizes
def plot_cag_size(hdf_fp, pdf=None, min_size=5, alpha=0.25):
cag_annot = pd.read_hdf(hdf_fp, "/annot/cag/all").set_index("CAG")
# Calculate the log10 size (number of genes per CAG)
cag_annot = cag_annot.assign(
size_log10 = cag_annot["size"].apply(np.log10)
)
# Filter by CAG size