Skip to content

Instantly share code, notes, and snippets.

View apcamargo's full-sized avatar
🦖

Antônio Camargo apcamargo

🦖
  • University of São Paulo
  • São Paulo, SP, Brazil
  • X @apcamargo_
View GitHub Profile
@apcamargo
apcamargo / query_sra_duckdb.sh
Created July 13, 2025 21:11
Query SRA metadata stored as Parquet files in S3 using DuckDB
# Dump the SRA metadata table to standard output as tab-separated text.
#
# Runs a single DuckDB command that:
#   1. installs and loads the httpfs and parquet extensions,
#   2. reads every object matching the glob under
#      s3://sra-pub-metadata-us-east-1/sra/metadata/ with read_parquet,
#   3. selects all columns and copies the result to STDOUT in CSV format
#      using a tab delimiter, with a header row.
#
# The whole SQL script is passed as one double-quoted shell argument, so the
# single-quoted SQL literals inside it are left untouched by the shell.
# NOTE(review): no S3 region or credentials are configured here; presumably the
# bucket allows anonymous access — confirm before relying on it.
duckdb -c "
INSTALL httpfs;
LOAD httpfs;
INSTALL parquet;
LOAD parquet;
COPY (
SELECT *
FROM read_parquet('s3://sra-pub-metadata-us-east-1/sra/metadata/*')
) TO STDOUT WITH (FORMAT CSV, DELIMITER E'\t', HEADER);"
@apcamargo
apcamargo / download_mg_rast.py
Last active July 12, 2025 21:20
Downloads all the assembled metagenomes available in MG-RAST
#!/usr/bin/env python
import json
import re
import sys
from typing import Generator, Dict, Any, Optional
import requests
from tqdm import tqdm
from pathlib import Path
from typing import Iterator, Optional, Union
import polars as pl
from needletail import parse_fastx_file
from polars.io.plugins import register_io_source
def scan_fastx(fastx_file: Union[str, Path]) -> pl.LazyFrame:
schema = pl.Schema(
@apcamargo
apcamargo / sam2tsv.py
Created March 10, 2025 03:53
Converts alignments stored in the SAM format to a BLAST-like table
#!/usr/bin/env python
"""
This script processes SAM (Sequence Alignment/Map format) inputs from standard
input and extracts alignment information that is then provided in a tab-separated
table. The following fields are produced: query, target, query_length, query_start,
query_end, target_start, target_end, alignment_length, alignment_identity.
This script was designed for use with SAM files produced by minimap2. However,
it will work with any SAM data that:
@apcamargo
apcamargo / calculate_neff.py
Created November 17, 2024 04:59
Calculate the number of effective sequences (Neff) of an A3M multiple sequence alignment
#!/usr/bin/env python
import math
import re
import click
from scipy.cluster.hierarchy import fcluster, linkage
from skbio import DistanceMatrix, Protein, TabularMSA, io
from skbio.sequence.distance import hamming
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
from coloraide import Color
def lighten(
color: Color,
amount: float,
) -> Color:
"""
Lighten a color by a given amount.
"""
@apcamargo
apcamargo / retrieve_assembly_accession.py
Created September 19, 2023 21:03
Retrieve NCBI assembly accessions from GenBank accessions using E-utilities
import subprocess
def get_assembly_accession(genbank_accession):
p1 = subprocess.Popen(
["elink", "-db", "nuccore", "-target", "assembly", "-id", genbank_accession],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
p2 = subprocess.Popen(
["efetch", "-format", "docsum"],
#!/usr/bin/env python
# hhblits -v 0 -cpu 1 -n 1 -p 90 -z 0 -Z 5000 -b 0 -B 5000 -M 50 -d busco_db/busco -i msa.faa -o msa.hhr
from collections import namedtuple
from pathlib import Path
import argparse
parser = argparse.ArgumentParser(description='Parse hhsearch hhr output file.')
parser.add_argument('-i', help='input hrr path', dest='input_file',type=str, required=True)
@apcamargo
apcamargo / fancy_fasta_reader.py
Last active August 23, 2024 03:39
Fancy FASTA parser in Python
import bz2
import gzip
import lzma
import textwrap
from contextlib import contextmanager
from enum import Enum, auto
from pathlib import Path
class Compression(Enum):