# ekzhang/plasmid_data.py

## plasmid_data.py
import json
import subprocess

import numpy as np
import pandas as pd
from fastapi import HTTPException
from modal import Image, Stub, web_endpoint


# Modal application handle; the container image and the PlasmidData class
# defined further down are attached to it.
stub = Stub("plasmid-data")


# Download locations of the PLSDB metadata archive and the bzip2-compressed
# FASTA file.
META_LINK = (
    "https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/plasmids_meta.tar.bz2"
)
FASTA_LINK = "https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/plsdb.fna.bz2"


def compute_offsets():
    """Compute byte-offsets of each plasmid in the FASTA file for quick access.

    Runs ripgrep over ``/data/plsdb.fna`` to list every header line (a line
    starting with ``>``) together with its byte offset, then writes a JSON
    object mapping each record id (the header text up to the first space) to
    that offset at ``/data/offsets.json``.

    Raises:
        subprocess.CalledProcessError: If ripgrep exits with a non-zero
            status, e.g. the FASTA file is missing or has no header lines.
    """
    # An argument list (no shell) sidesteps quoting issues, and check=True
    # makes a failed scan abort instead of silently writing an empty index.
    result = subprocess.run(
        ["rg", "-b", "^>", "-N", "/data/plsdb.fna"],
        capture_output=True,
        text=True,
        check=True,
    )
    offsets: dict[str, int] = {}
    for line in result.stdout.splitlines():
        if not line:
            continue
        # Each match is printed as "<byte offset>:><header text>".
        num, desc = line.split(":>", 1)
        record_id = desc.split(" ", 1)[0]
        offsets[record_id] = int(num)
    with open("/data/offsets.json", "w") as f:
        json.dump(offsets, f)


# Container image, assembled stage by stage: system tools first, then the
# dataset download into /data, then the Python packages.
_system_image = Image.debian_slim().apt_install("ripgrep", "bzip2", "curl")
_data_image = _system_image.run_commands(
    "mkdir -p /data",
    f"curl -L {META_LINK} | tar xvjf - -C /data",
    f"curl -L {FASTA_LINK} | bunzip2 > /data/plsdb.fna",
)
_python_image = _data_image.pip_install("pandas", "requests", "numpy")
# Final stage: compute_offsets writes /data/offsets.json.
stub.image = _python_image.run_function(compute_offsets)


@stub.cls(keep_warm=True, allow_concurrent_inputs=64)
class PlasmidData:
    """Web endpoint serving plasmid metadata, annotations and sequences.

    Reads the files under ``/data``: the id -> byte-offset index
    (``offsets.json``), the metadata table (``plsdb.tsv``), the annotation
    table (``plsdb.abr``) and the FASTA file (``plsdb.fna``).
    """

    def __enter__(self):
        """Load the offset index and both tables into memory."""
        with open("/data/offsets.json") as f:
            self.offsets = json.load(f)
        # The tables do not depend on the index file handle, so they are read
        # only after it has been closed.
        self.metadata = pd.read_csv(
            "/data/plsdb.tsv", sep="\t", low_memory=False, index_col=1
        )
        self.annotations = pd.read_csv("/data/plsdb.abr", sep="\t")

    @web_endpoint(method="GET", custom_domains=["plasmid-data.modal.ekzhang.com"])
    def get(self, id: str | None = None):
        """Get the metadata for a plasmid.

        Args:
            id: Record id as stored in the offset index. When omitted, a
                listing of every known id is returned instead.

        Returns:
            Without ``id``: ``{"ids": [...], "descriptions": [...]}`` with one
            description per id (``None`` where the metadata table has no
            description for that id).
            With ``id``: a dict with the keys ``id``, ``metadata`` (the
            metadata row as a dict, or ``None`` if there is no such row),
            ``annotations`` (list of annotation rows as dicts), ``header``
            (the FASTA header line) and ``sequence``.

        Raises:
            HTTPException: 404 if ``id`` is not present in the offset index.
        """
        if id is None:
            ids = list(self.offsets)
            # reindex() yields NaN for ids that have no metadata row, whereas
            # indexing with a list of labels raises KeyError for them. NaN is
            # then mapped to None, the same way the single-record branch
            # below treats missing values.
            descriptions = (
                self.metadata["NUCCORE_Description"]
                .reindex(ids)
                .replace({np.nan: None})
            )
            return {"ids": ids, "descriptions": descriptions.to_list()}

        offset = self.offsets.get(id)
        if offset is None:
            raise HTTPException(404, "Plasmid not found")

        try:
            metadata = self.metadata.loc[id].replace({np.nan: None}).to_dict()
        except KeyError:
            # Present in the FASTA file but absent from the metadata table.
            metadata = None

        matching = self.annotations[self.annotations.qseqid == id]
        annotations = [
            row.replace({np.nan: None}).to_dict() for _, row in matching.iterrows()
        ]

        # The index holds raw byte offsets, so the file is opened in binary
        # mode: seeking a text-mode file is only defined for tell() values.
        with open("/data/plsdb.fna", "rb") as f:
            f.seek(offset)
            header = f.readline().rstrip().decode()
            chunks: list[bytes] = []
            for line in f:
                if line.startswith(b">"):
                    # Next record's header: this record's sequence is done.
                    break
                chunks.append(line.rstrip())

        return {
            "id": id,
            "metadata": metadata,
            "annotations": annotations,
            "header": header,
            "sequence": b"".join(chunks).decode(),
        }
	import json
	import subprocess

	import numpy as np
	import pandas as pd
	from fastapi import HTTPException
	from modal import Image, Stub, web_endpoint


	# Modal application handle; the container image and the PlasmidData class
	# defined further down are attached to it.
	stub = Stub("plasmid-data")


	# Download locations of the PLSDB metadata archive and the
	# bzip2-compressed FASTA file.
	META_LINK = (
	    "https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/plasmids_meta.tar.bz2"
	)
	FASTA_LINK = "https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/plsdb.fna.bz2"


	def compute_offsets():
	    """Compute byte-offsets of each plasmid in the FASTA file for quick access.

	    Runs ripgrep over ``/data/plsdb.fna`` to list every header line (a line
	    starting with ``>``) together with its byte offset, then writes a JSON
	    object mapping each record id (the header text up to the first space)
	    to that offset at ``/data/offsets.json``.

	    Raises:
	        subprocess.CalledProcessError: If ripgrep exits with a non-zero
	            status, e.g. the FASTA file is missing or has no header lines.
	    """
	    # An argument list (no shell) sidesteps quoting issues, and check=True
	    # makes a failed scan abort instead of silently writing an empty index.
	    result = subprocess.run(
	        ["rg", "-b", "^>", "-N", "/data/plsdb.fna"],
	        capture_output=True,
	        text=True,
	        check=True,
	    )
	    offsets: dict[str, int] = {}
	    for line in result.stdout.splitlines():
	        if not line:
	            continue
	        # Each match is printed as "<byte offset>:><header text>".
	        num, desc = line.split(":>", 1)
	        record_id = desc.split(" ", 1)[0]
	        offsets[record_id] = int(num)
	    with open("/data/offsets.json", "w") as f:
	        json.dump(offsets, f)


	# Container image: system tools, then the dataset download into /data,
	# then the Python packages, then the offset index written by
	# compute_offsets.
	stub.image = (
	    Image.debian_slim()
	    .apt_install("ripgrep", "bzip2", "curl")
	    .run_commands(
	        "mkdir -p /data",
	        # A plain "|" is required here: an escaped "\|" would reach the
	        # shell as a literal character instead of a pipe.
	        f"curl -L {META_LINK} | tar xvjf - -C /data",
	        f"curl -L {FASTA_LINK} | bunzip2 > /data/plsdb.fna",
	    )
	    .pip_install("pandas", "requests", "numpy")
	    .run_function(compute_offsets)
	)


	@stub.cls(keep_warm=True, allow_concurrent_inputs=64)
	class PlasmidData:
	    """Web endpoint serving plasmid metadata, annotations and sequences.

	    Reads the files under ``/data``: the id -> byte-offset index
	    (``offsets.json``), the metadata table (``plsdb.tsv``), the annotation
	    table (``plsdb.abr``) and the FASTA file (``plsdb.fna``).
	    """

	    def __enter__(self):
	        """Load the offset index and both tables into memory."""
	        with open("/data/offsets.json") as f:
	            self.offsets = json.load(f)
	        # The tables do not depend on the index file handle, so they are
	        # read only after it has been closed.
	        self.metadata = pd.read_csv(
	            "/data/plsdb.tsv", sep="\t", low_memory=False, index_col=1
	        )
	        self.annotations = pd.read_csv("/data/plsdb.abr", sep="\t")

	    @web_endpoint(method="GET", custom_domains=["plasmid-data.modal.ekzhang.com"])
	    def get(self, id: str | None = None):
	        """Get the metadata for a plasmid.

	        Args:
	            id: Record id as stored in the offset index. When omitted, a
	                listing of every known id is returned instead.

	        Returns:
	            Without ``id``: ``{"ids": [...], "descriptions": [...]}`` with
	            one description per id (``None`` where the metadata table has
	            no description for that id).
	            With ``id``: a dict with the keys ``id``, ``metadata`` (the
	            metadata row as a dict, or ``None`` if there is no such row),
	            ``annotations`` (list of annotation rows as dicts), ``header``
	            (the FASTA header line) and ``sequence``.

	        Raises:
	            HTTPException: 404 if ``id`` is not present in the offset index.
	        """
	        if id is None:
	            ids = list(self.offsets)
	            # reindex() yields NaN for ids that have no metadata row,
	            # whereas indexing with a list of labels raises KeyError for
	            # them. NaN is then mapped to None, the same way the
	            # single-record branch below treats missing values.
	            descriptions = (
	                self.metadata["NUCCORE_Description"]
	                .reindex(ids)
	                .replace({np.nan: None})
	            )
	            return {"ids": ids, "descriptions": descriptions.to_list()}

	        offset = self.offsets.get(id)
	        if offset is None:
	            raise HTTPException(404, "Plasmid not found")

	        try:
	            metadata = self.metadata.loc[id].replace({np.nan: None}).to_dict()
	        except KeyError:
	            # Present in the FASTA file but absent from the metadata table.
	            metadata = None

	        matching = self.annotations[self.annotations.qseqid == id]
	        annotations = [
	            row.replace({np.nan: None}).to_dict() for _, row in matching.iterrows()
	        ]

	        # The index holds raw byte offsets, so the file is opened in binary
	        # mode: seeking a text-mode file is only defined for tell() values.
	        with open("/data/plsdb.fna", "rb") as f:
	            f.seek(offset)
	            header = f.readline().rstrip().decode()
	            chunks: list[bytes] = []
	            for line in f:
	                if line.startswith(b">"):
	                    # Next record's header: this record's sequence is done.
	                    break
	                chunks.append(line.rstrip())

	        return {
	            "id": id,
	            "metadata": metadata,
	            "annotations": annotations,
	            "header": header,
	            "sequence": b"".join(chunks).decode(),
	        }