Last active
April 19, 2022 08:37
-
-
Save padix-key/a45a3a62a0467bd888e541c6b71b1ed8 to your computer and use it in GitHub Desktop.
This script downloads a number of random protein structures with a given structural classification and writes the relevant protein chain into a mmCIF file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from os.path import join, isdir | |
from os import mkdir | |
import numpy as np | |
import biotite.structure as struc | |
import biotite.structure.io.pdb as pdb | |
import biotite.structure.io.pdbx as pdbx | |
import biotite.database.rcsb as rcsb | |
# The structural classifier | |
SCOP_ID = "1000000" | |
# The number of samples | |
N = 10 | |
# The directory for output CIF files | |
OUT_DIR = "output" | |
### Find suitable structures | |
classification_db_query = rcsb.FieldQuery( | |
"rcsb_polymer_instance_annotation.type", | |
exact_match="SCOP2" | |
) | |
scop_id_query = rcsb.FieldQuery( | |
"rcsb_polymer_instance_annotation.annotation_lineage.id", | |
exact_match=SCOP_ID | |
) | |
full_query = classification_db_query & scop_id_query | |
# Not only search for PDB entries, bit for specific protein chains | |
# with the given classification | |
pdb_chains = rcsb.search(full_query, return_type="polymer_instance") | |
# Ensure reproducible behavior | |
np.random.seed(0) | |
selected_pdb_chains = np.random.choice(sorted(pdb_chains), N) | |
### Fetch selected structures and write relevant chains to hard drive | |
if not isdir(OUT_DIR): | |
mkdir(OUT_DIR) | |
for pdb_chain in selected_pdb_chains: | |
pdb_id, chain = pdb_chain.split(".") | |
# Use PDBx format to get 'official' chain IDs rather than the ones | |
# provided by the authors to correctly match the chain value from | |
# the search query | |
mmtf_file = pdbx.PDBxFile.read(rcsb.fetch(pdb_id, "pdbx")) | |
structure = pdbx.get_structure(mmtf_file, model=1, use_author_fields=False) | |
# Filter relevant protein chain | |
chain = structure[ | |
struc.filter_amino_acids(structure) & (structure.chain_id == chain) | |
] | |
# Write filtered chain to output CIF file | |
pdbx_file = pdbx.PDBxFile() | |
pdbx.set_structure(pdbx_file, chain, data_block=pdb_id) | |
pdbx_file.write(join(OUT_DIR, f"{pdb_id}.cif")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment