Skip to content

Instantly share code, notes, and snippets.

@padix-key
Last active April 19, 2022 08:37
Show Gist options
  • Save padix-key/a45a3a62a0467bd888e541c6b71b1ed8 to your computer and use it in GitHub Desktop.
Save padix-key/a45a3a62a0467bd888e541c6b71b1ed8 to your computer and use it in GitHub Desktop.
This script downloads a number of random protein structures with a given structural classification and writes the relevant protein chain into a mmCIF file.
from os.path import join, isdir
from os import mkdir
import numpy as np
import biotite.structure as struc
import biotite.structure.io.pdb as pdb
import biotite.structure.io.pdbx as pdbx
import biotite.database.rcsb as rcsb
# The structural classifier
SCOP_ID = "1000000"
# The number of samples
N = 10
# The directory for output CIF files
OUT_DIR = "output"
### Find suitable structures
classification_db_query = rcsb.FieldQuery(
"rcsb_polymer_instance_annotation.type",
exact_match="SCOP2"
)
scop_id_query = rcsb.FieldQuery(
"rcsb_polymer_instance_annotation.annotation_lineage.id",
exact_match=SCOP_ID
)
full_query = classification_db_query & scop_id_query
# Not only search for PDB entries, bit for specific protein chains
# with the given classification
pdb_chains = rcsb.search(full_query, return_type="polymer_instance")
# Ensure reproducible behavior
np.random.seed(0)
selected_pdb_chains = np.random.choice(sorted(pdb_chains), N)
### Fetch selected structures and write relevant chains to hard drive
if not isdir(OUT_DIR):
mkdir(OUT_DIR)
for pdb_chain in selected_pdb_chains:
pdb_id, chain = pdb_chain.split(".")
# Use PDBx format to get 'official' chain IDs rather than the ones
# provided by the authors to correctly match the chain value from
# the search query
mmtf_file = pdbx.PDBxFile.read(rcsb.fetch(pdb_id, "pdbx"))
structure = pdbx.get_structure(mmtf_file, model=1, use_author_fields=False)
# Filter relevant protein chain
chain = structure[
struc.filter_amino_acids(structure) & (structure.chain_id == chain)
]
# Write filtered chain to output CIF file
pdbx_file = pdbx.PDBxFile()
pdbx.set_structure(pdbx_file, chain, data_block=pdb_id)
pdbx_file.write(join(OUT_DIR, f"{pdb_id}.cif"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment