padix-key/download_protein_family.py

## download_protein_family.py
from os.path import join, isdir
from os import mkdir
import numpy as np
import biotite.structure as struc
import biotite.structure.io.pdb as pdb
import biotite.structure.io.pdbx as pdbx
import biotite.database.rcsb as rcsb


# The structural classifier
SCOP_ID = "1000000"
# The number of samples
N = 10
# The directory for output CIF files
OUT_DIR = "output"


### Find suitable structures
classification_db_query = rcsb.FieldQuery(
    "rcsb_polymer_instance_annotation.type",
    exact_match="SCOP2"
)
scop_id_query = rcsb.FieldQuery(
    "rcsb_polymer_instance_annotation.annotation_lineage.id",
    exact_match=SCOP_ID
)
full_query = classification_db_query & scop_id_query
# Not only search for PDB entries, bit for specific protein chains
# with the given classification
pdb_chains = rcsb.search(full_query, return_type="polymer_instance")
# Ensure reproducible behavior
np.random.seed(0)
selected_pdb_chains = np.random.choice(sorted(pdb_chains), N)


### Fetch selected structures and write relevant chains to hard drive
if not isdir(OUT_DIR):
    mkdir(OUT_DIR)
for pdb_chain in selected_pdb_chains:
    pdb_id, chain = pdb_chain.split(".")
    # Use PDBx format to get 'official' chain IDs rather than the ones
    # provided by the authors to correctly match the chain value from
    # the search query
    mmtf_file = pdbx.PDBxFile.read(rcsb.fetch(pdb_id, "pdbx"))
    structure = pdbx.get_structure(mmtf_file, model=1, use_author_fields=False)
    # Filter relevant protein chain
    chain = structure[
        struc.filter_amino_acids(structure) & (structure.chain_id == chain)
    ]
    # Write filtered chain to output CIF file
    pdbx_file = pdbx.PDBxFile()
    pdbx.set_structure(pdbx_file, chain, data_block=pdb_id)
    pdbx_file.write(join(OUT_DIR, f"{pdb_id}.cif"))
	from os.path import join, isdir
	from os import mkdir
	import numpy as np
	import biotite.structure as struc
	import biotite.structure.io.pdb as pdb
	import biotite.structure.io.pdbx as pdbx
	import biotite.database.rcsb as rcsb


	# The structural classifier
	SCOP_ID = "1000000"
	# The number of samples
	N = 10
	# The directory for output CIF files
	OUT_DIR = "output"


	### Find suitable structures
	classification_db_query = rcsb.FieldQuery(
	"rcsb_polymer_instance_annotation.type",
	exact_match="SCOP2"
	)
	scop_id_query = rcsb.FieldQuery(
	"rcsb_polymer_instance_annotation.annotation_lineage.id",
	exact_match=SCOP_ID
	)
	full_query = classification_db_query & scop_id_query
	# Not only search for PDB entries, bit for specific protein chains
	# with the given classification
	pdb_chains = rcsb.search(full_query, return_type="polymer_instance")
	# Ensure reproducible behavior
	np.random.seed(0)
	selected_pdb_chains = np.random.choice(sorted(pdb_chains), N)


	### Fetch selected structures and write relevant chains to hard drive
	if not isdir(OUT_DIR):
	mkdir(OUT_DIR)
	for pdb_chain in selected_pdb_chains:
	pdb_id, chain = pdb_chain.split(".")
	# Use PDBx format to get 'official' chain IDs rather than the ones
	# provided by the authors to correctly match the chain value from
	# the search query
	mmtf_file = pdbx.PDBxFile.read(rcsb.fetch(pdb_id, "pdbx"))
	structure = pdbx.get_structure(mmtf_file, model=1, use_author_fields=False)
	# Filter relevant protein chain
	chain = structure[
	struc.filter_amino_acids(structure) & (structure.chain_id == chain)
	]
	# Write filtered chain to output CIF file
	pdbx_file = pdbx.PDBxFile()
	pdbx.set_structure(pdbx_file, chain, data_block=pdb_id)
	pdbx_file.write(join(OUT_DIR, f"{pdb_id}.cif"))