Skip to content

Instantly share code, notes, and snippets.

@ekzhang
Created November 19, 2023 17:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ekzhang/f521f3de121a52c7cb13b6561d94c147 to your computer and use it in GitHub Desktop.
Save ekzhang/f521f3de121a52c7cb13b6561d94c147 to your computer and use it in GitHub Desktop.
Fast API for plasmid data from PLSDB
import json
import subprocess
import numpy as np
import pandas as pd
from fastapi import HTTPException
from modal import Image, Stub, web_endpoint
# Modal app handle; the container image and the PlasmidData class further
# down in this file are attached to it.
stub = Stub("plasmid-data")
# PLSDB download URLs, interpolated into the image build commands below:
# META_LINK is a bzip2-compressed tarball that gets unpacked into /data
# (presumably the source of the plsdb.tsv / plsdb.abr tables read later --
# confirm against the archive contents), FASTA_LINK is the bzip2-compressed
# FASTA file that becomes /data/plsdb.fna.
META_LINK = "https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/plasmids_meta.tar.bz2"
FASTA_LINK = (
    "https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/plsdb.fna.bz2"
)
def compute_offsets():
    """Compute byte-offsets of each plasmid in the FASTA file for quick access.

    Scans ``/data/plsdb.fna`` with ripgrep for header lines (those starting
    with ``>``) and writes a JSON object mapping each record id -- the header
    text up to the first space -- to the byte offset of its header line into
    ``/data/offsets.json``.

    Raises:
        subprocess.CalledProcessError: If ripgrep exits with a non-zero
            status (scan error, or no header line found at all). Failing here
            is deliberate: otherwise an empty index would be written and
            every later lookup would report "not found".
    """
    # -b prefixes every match with its byte offset and -N suppresses line
    # numbers, so each output line has the form "<offset>:><id> <description>".
    # An argv list avoids going through a shell for a fixed command.
    result = subprocess.run(
        ["rg", "-b", "-N", "^>", "/data/plsdb.fna"],
        capture_output=True,
        text=True,
        check=True,
    )

    offsets: dict[str, int] = {}
    for line in result.stdout.splitlines():
        if not line:
            continue
        # Split on the first ":>" only; the description itself may contain it.
        num, desc = line.split(":>", 1)
        plasmid_id = desc.split(" ", 1)[0]
        offsets[plasmid_id] = int(num)

    with open("/data/offsets.json", "w") as f:
        json.dump(offsets, f)
# Container image definition. The steps are chained in dependency order:
# system tools first, then the data downloads that need them, then the Python
# packages, and finally the offset index that needs the downloaded FASTA file.
stub.image = (
    Image.debian_slim()
    # ripgrep is what compute_offsets shells out to; bzip2 and curl are needed
    # by the download commands below.
    .apt_install("ripgrep", "bzip2", "curl")
    .run_commands(
        "mkdir -p /data",
        # Stream the metadata tarball into tar and unpack it under /data.
        f"curl -L {META_LINK} | tar xvjf - -C /data",
        # Stream-decompress the FASTA file to /data/plsdb.fna.
        f"curl -L {FASTA_LINK} | bunzip2 > /data/plsdb.fna",
    )
    .pip_install("pandas", "requests", "numpy")
    # Must come after the FASTA download: compute_offsets reads
    # /data/plsdb.fna and writes /data/offsets.json.
    .run_function(compute_offsets)
)
@stub.cls(keep_warm=True, allow_concurrent_inputs=64)
class PlasmidData:
    """Serve PLSDB plasmid metadata, annotations and sequences over HTTP.

    ``__enter__`` loads the offset index and both tables into memory;
    ``get`` answers requests from those in-memory structures plus a direct
    seek into the FASTA file.
    """

    def __enter__(self):
        """Load the offset index and the metadata/annotation tables."""
        with open("/data/offsets.json") as f:
            self.offsets = json.load(f)
        # Column 1 of the TSV becomes the index that ``get`` looks ids up in.
        self.metadata = pd.read_csv(
            "/data/plsdb.tsv", sep="\t", low_memory=False, index_col=1
        )
        self.annotations = pd.read_csv("/data/plsdb.abr", sep="\t")

    @web_endpoint(method="GET", custom_domains=["plasmid-data.modal.ekzhang.com"])
    def get(self, id: str | None = None):
        """Get the metadata for a plasmid.

        Args:
            id: Plasmid id as it appears in the FASTA headers. The name
                shadows the builtin on purpose: it is the public query
                parameter of the endpoint. When omitted, a listing of all
                plasmids is returned instead of a single record.

        Returns:
            Without ``id``: ``{"ids": [...], "descriptions": [...]}``, the two
            lists aligned by position, with ``None`` where no description is
            available. With ``id``: a dict with the keys ``id``, ``metadata``
            (``None`` when the id has no metadata row), ``annotations``,
            ``header`` and ``sequence``.

        Raises:
            HTTPException: 404 when ``id`` is not in the offset index.
        """
        if id is None:
            return self._list_plasmids()

        offset = self.offsets.get(id)
        if offset is None:
            raise HTTPException(404, "Plasmid not found")

        try:
            # NaN is replaced with None so the value serialises as JSON null.
            metadata = self.metadata.loc[id].replace({np.nan: None}).to_dict()
        except KeyError:
            # Present in the FASTA file but absent from the metadata table.
            metadata = None

        matching = self.annotations[self.annotations.qseqid == id]
        annotations = [
            row.replace({np.nan: None}).to_dict() for _, row in matching.iterrows()
        ]

        header, sequence = self._read_record(offset)
        return {
            "id": id,
            "metadata": metadata,
            "annotations": annotations,
            "header": header,
            "sequence": sequence,
        }

    def _list_plasmids(self):
        """Return every indexed plasmid id with its description (or None)."""
        ids = list(self.offsets)
        # Look descriptions up through a plain dict rather than indexing the
        # Series with the full id list: label-list indexing raises KeyError as
        # soon as one FASTA id has no metadata row, a case the single-record
        # path already tolerates.
        described = self.metadata["NUCCORE_Description"].to_dict()
        descriptions = []
        for plasmid_id in ids:
            text = described.get(plasmid_id)
            # Missing rows and NaN cells both become None (JSON null), the
            # same convention the single-record path uses.
            descriptions.append(None if pd.isna(text) else text)
        return {"ids": ids, "descriptions": descriptions}

    def _read_record(self, offset):
        """Read the FASTA record at byte ``offset``; return (header, sequence)."""
        # Binary mode: the stored offsets are byte positions, and seeking a
        # text-mode file to anything other than a value returned by tell() is
        # not defined behaviour.
        with open("/data/plsdb.fna", "rb") as f:
            f.seek(offset)
            header = f.readline().rstrip()
            chunks: list[bytes] = []
            for line in f:
                # The next header line marks the end of this record.
                if line.startswith(b">"):
                    break
                chunks.append(line.rstrip())
        return header.decode(), b"".join(chunks).decode()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment