Skip to content

Instantly share code, notes, and snippets.

@SJShaw
Created November 27, 2023 13:15
Show Gist options
  • Save SJShaw/9396454321cf9d7ebdc0b6370c3da40e to your computer and use it in GitHub Desktop.
Save SJShaw/9396454321cf9d7ebdc0b6370c3da40e to your computer and use it in GitHub Desktop.
MIBiG gene reference checker
#!/usr/bin/env python
import json
import os
import sys
from mibig_html.common.secmet import Record
def check(json_file, gbk_dir):
with open(json_file) as handle:
data = json.load(handle)
if data["cluster"].get("status") == "retired":
return 0
extras = set()
for extra in data["cluster"].get("genes", {}).get("extra_genes", []):
extras.add(extra["id"])
gene_ids = set()
for gene in data["cluster"].get("genes", {}).get("annotations", []):
gene_ids.add(gene["id"])
for pks in data["cluster"].get("polyketide", {}).get("synthases", []):
gene_ids.update(pks.get("genes", []))
for module in pks.get("modules", []):
gene_ids.update(module.get("genes", []))
for nrps in data["cluster"].get("nrp", {}).get("nrps_genes", []):
gene_ids.add(nrps["gene_id"])
ref_accession = data["cluster"]["loci"]["accession"]
records = {record.id: record for record in Record.from_genbank(os.path.join(gbk_dir, f"{ref_accession}.gbk"))}
record = records[ref_accession]
available = set(extras)
for cds in record.get_cds_features():
available.update([cds.protein_id, cds.locus_tag, cds.gene])
missing = gene_ids.difference(available)
if missing:
print(os.path.basename(json_file).replace(".json", ""), f"({ref_accession}) missing genes:", sorted(missing), file=sys.stderr)
return 1
return 0
if __name__ == "__main__":
help = f"{os.path.basename(sys.argv[0])} input.json genbank_dir"
if len(sys.argv) != 3:
print(help)
sys.exit(0 if "-h" in sys.argv or "--help" in sys.argv else 1)
try:
sys.exit(check(sys.argv[1], sys.argv[2]))
except Exception as err:
print(sys.argv[1], str(err), file=sys.stderr)
sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment