Skip to content

Instantly share code, notes, and snippets.

@staceysv
Created May 6, 2021 18:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save staceysv/e50ca015a08cf3f0285ff94f96c0862f to your computer and use it in GitHub Desktop.
Save staceysv/e50ca015a08cf3f0285ff94f96c0862f to your computer and use it in GitHub Desktop.
Basic W&B (wandb) integration for the DeepChem protein-ligand tutorial
#! /usr/bin/env python
import deepchem as dc
from deepchem.utils import download_url
from deepchem.utils.evaluate import Evaluator
import pandas as pd
import nglview
import tempfile
import os
import mdtraj as md
import numpy as np
import wandb
import os
from sklearn.ensemble import RandomForestRegressor
def convert_lines_to_mdtraj(molecule_lines, fname):
    """Write a stringified list of PDB lines to ``fname`` and return the path.

    Despite the name, no mdtraj object is built here: the function only
    writes the file and hands back its path.

    The parsing is purely textual, so it is lossy: every apostrophe in the
    input is removed, and any ``", "`` inside a line splits that line in two.

    Args:
        molecule_lines: String holding the printed form of a list of lines,
            e.g. ``"['HEADER ...\\n', 'ATOM ...\\n']"`` (newlines appear as
            the two characters backslash + ``n``).
        fname: Path of the file to create or overwrite.

    Returns:
        ``fname``, unchanged.
    """
    # Peel off the list brackets, the quotes and the escaped newlines, then
    # split on the list separator to recover one entry per original line.
    cleaned = molecule_lines.strip("[").strip("]").replace("'", "").replace("\\n", "")
    with open(fname, "w") as f:
        f.writelines("%s\n" % line for line in cleaned.split(", "))
    return fname
def combine_mdtraj(protein, ligand):
    """Append the ligand's atoms and coordinates to ``protein`` and return it.

    ``protein`` is modified in place: a new chain holding a single residue
    named ``"LIG"`` (``resSeq=1``) is added to its topology, one atom is
    added to that residue per ligand atom, its ``xyz`` is replaced by the
    protein and ligand coordinates joined together, and
    ``create_standard_bonds()`` is called on the resulting topology.

    Args:
        protein: Object exposing ``topology`` and ``xyz``; mutated.
        ligand: Object exposing ``topology.atoms`` and ``xyz``; only read.

    Returns:
        The same ``protein`` object that was passed in.
    """
    chain = protein.topology.add_chain()
    residue = protein.topology.add_residue("LIG", chain, resSeq=1)
    for atom in ligand.topology.atoms:
        protein.topology.add_atom(atom.name, atom.element, residue)
    # hstack joins along axis 1 for arrays with 2+ dimensions.
    # Assumes xyz is laid out as (frames, atoms, 3) -- TODO confirm.
    protein.xyz = np.hstack([protein.xyz, ligand.xyz])
    protein.topology.create_standard_bonds()
    return protein
# Load the PDBbind core dataframe, downloading it first when it is not cached.
data_dir = dc.utils.get_data_dir()
dataset_file = os.path.join(data_dir, "pdbbind_core_df.csv.gz")
if not os.path.exists(dataset_file):
    print('File does not exist. Downloading file...')
    # Pass the destination explicitly so the download lands in the same
    # directory that the existence check above and the load below look in.
    download_url(
        "https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/pdbbind_core_df.csv.gz",
        dest_dir=data_dir)
    print('File downloaded...')
# Module-level table; log_protein() indexes it by row position and reads the
# "protein_pdb" and "ligand_pdb" columns.
raw_dataset = dc.utils.save.load_from_disk(dataset_file)
def make_pdb(prefix, _id):
    """Return the file name ``<prefix>_<_id>.pdb``.

    Args:
        prefix: String placed before the underscore.
        _id: Identifier; converted with ``str()``.

    Returns:
        The assembled file name (no directory component is added).
    """
    return prefix + "_" + str(_id) + ".pdb"
def log_protein(_id):
    """Log the protein, ligand and combined complex of one dataset row to W&B.

    Reads row ``_id`` (by position) of the module-level ``raw_dataset``,
    writes ``prot_<_id>.pdb``, ``lig_<_id>.pdb`` and ``complex_<_id>.pdb``
    (relative file names), and logs all three as ``wandb.Molecule`` objects
    under the keys ``"protein"``, ``"ligand"`` and ``"combo"``.

    The caller is expected to have started a run first; ``log()`` does so
    with ``wandb.init`` before each call.

    Args:
        _id: Positional row index into ``raw_dataset``.
    """
    row = raw_dataset.iloc[_id]
    # convert_lines_to_mdtraj returns the path it wrote, not a trajectory.
    protein_path = convert_lines_to_mdtraj(row["protein_pdb"], make_pdb("prot", _id))
    ligand_path = convert_lines_to_mdtraj(row["ligand_pdb"], make_pdb("lig", _id))
    complex_path = make_pdb("complex", _id)

    complex_mdtraj = combine_mdtraj(md.load(protein_path), md.load(ligand_path))
    complex_mdtraj.save(complex_path)

    wandb.log({
        "protein": wandb.Molecule(protein_path),
        "ligand": wandb.Molecule(ligand_path),
        "combo": wandb.Molecule(complex_path),
    })
def log(ids=range(21, 25)):
    """Start one W&B run per dataset row and log that row's structures.

    Args:
        ids: Iterable of positional row indices passed to ``log_protein``.
            Defaults to rows 21 through 24, the range that was previously
            hard-coded.
    """
    for _id in ids:
        # reinit=True lets several runs be started from the same process.
        wandb.init(project="deepchem_interact", entity="stacey", reinit=True,
                   name="cling rc test " + str(_id))
        log_protein(_id)
        print("LOGGED PROTEIN: ", str(_id))
#log()
def train():
    """Train a multitask regressor on PDBbind and log its validation R^2 to W&B.

    Loads the "refined" PDBbind subset through
    ``dc.molnet.load_pdbbind_grid`` with the "ECFP" featurizer, starts a W&B
    run, fits the model for 10 epochs, evaluates it on the validation split
    and logs the score under the key ``"r2"``.
    """
    pdbbind_tasks, (train_dataset, valid_dataset, _test_dataset), transformers = (
        dc.molnet.load_pdbbind_grid(featurizer="ECFP", subset="refined"))

    cfg = {"max_features": 100, "model_type": "multitask_reg"}
    wandb.init(project="deepchem_interact", name="cling rc test", config=cfg, reinit=True)

    # try a deep model
    # NOTE(review): the constructor call was commented out and truncated
    # (`MultitaskRegressor(1, [[100)`), which left `dnn` undefined and made
    # the fit below raise NameError. The single hidden layer of width 100 is
    # taken from that fragment -- confirm the intended architecture.
    # Assumes train_dataset.X is 2-D (samples, features) -- TODO confirm.
    dnn = dc.models.MultitaskRegressor(
        n_tasks=len(pdbbind_tasks),
        n_features=train_dataset.X.shape[1],
        layer_sizes=[100])

    # train model
    dnn.fit(train_dataset, nb_epoch=10)

    # Evaluate trained model on validation data and log R^2 to W&B
    metric = dc.metrics.Metric(dc.metrics.r2_score)
    dnn_valid_evaluator = Evaluator(dnn, valid_dataset, transformers)
    dnn_valid_r2score = dnn_valid_evaluator.compute_model_performance([metric])
    # The score comes from valid_dataset, so label it as such (it was
    # previously printed as "Test set").
    print("DNN Valid set R^2 %f" % (dnn_valid_r2score["r2_score"]))
    wandb.log({"r2": dnn_valid_r2score["r2_score"]})
def rf():
    """Sweep random-forest settings on PDBbind, one W&B run per setting.

    Loads the "refined" PDBbind subset through
    ``dc.molnet.load_pdbbind_grid`` with the "ECFP" featurizer. For every
    combination of ``n_estimators`` and ``max_features`` in the grids below
    it starts a W&B run, fits a ``RandomForestRegressor`` wrapped in a
    ``dc.models.SklearnModel``, plots a learning curve, and logs the train
    and validation R^2 under the keys ``"train r2"`` and ``"val r2"``.
    """
    pdbbind_tasks, (train_dataset, valid_dataset, _test_dataset), transformers = (
        dc.molnet.load_pdbbind_grid(featurizer="ECFP", subset="refined"))

    n_estimators_grid = [150, 300, 10, 20]
    # NOTE(review): "auto" was deprecated in scikit-learn 1.1 and removed in
    # 1.3 -- confirm the pinned scikit-learn version still accepts it.
    max_features_grid = ["auto", "sqrt"]
    seed = 23  # Set a random seed to get stable results
    metric = dc.metrics.Metric(dc.metrics.r2_score)

    for n in n_estimators_grid:
        for m in max_features_grid:
            print("M: ", m, "N: ", n)
            cfg = {"n_estimators": n, "max_features": m, "model_type": "random forest"}
            wandb.init(project="deepchem_interact", name="cling rc rf test",
                       config=cfg, reinit=True)

            sklearn_model = RandomForestRegressor(
                n_estimators=n, max_features=m, random_state=seed)
            model = dc.models.SklearnModel(sklearn_model)
            model.fit(train_dataset)
            wandb.sklearn.plot_learning_curve(
                model.model_instance, train_dataset.X, train_dataset.y.ravel())

            evaluator = Evaluator(model, train_dataset, transformers)
            train_r2score = evaluator.compute_model_performance([metric])
            print("RF Train set R^2 %f" % (train_r2score["r2_score"]))

            evaluator = Evaluator(model, valid_dataset, transformers)
            valid_r2score = evaluator.compute_model_performance([metric])
            print("RF Valid set R^2 %f" % (valid_r2score["r2_score"]))

            # Log the scalar scores (as train() does), not the result dicts.
            wandb.log({"train r2": train_r2score["r2_score"],
                       "val r2": valid_r2score["r2_score"]})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment