@staceysv
Created May 6, 2021 17:59
Basic script to log the DeepChem molecular solubility tutorial to wandb.
# mol_sol_train.py
#--------------
# Load molecular solubility dataset, featurize, train FC net regression model,
# and evaluate by R^2 score.
import deepchem as dc
from deepchem.utils.evaluate import Evaluator
from deepchem.utils.save import load_from_disk
import numpy as np
import numpy.random
import wandb
import tensorflow as tf
tf.enable_eager_execution()
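# note: enable_eager_execution() only exists in TF 1.x; on TF 2.x eager mode is
# already the default and this call can be dropped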
# load dataset
dataset_file = "delaney-processed.csv"
dataset = load_from_disk(dataset_file)
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))
# featurize
featurizer = dc.feat.CircularFingerprint(size=1024)
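# CircularFingerprint computes fixed-length, extended-connectivity (ECFP-style)
# fingerprints from each molecule's SMILES string, here 1024 bits per molecule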
loader = dc.data.CSVLoader(
    tasks=["measured log solubility in mols per litre"], smiles_field="smiles",
    featurizer=featurizer)
dataset = loader.featurize(dataset_file)
# create intelligent split
splitter = dc.splits.ScaffoldSplitter(dataset_file)
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)
transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
# apply each transformer and keep the returned (transformed) datasets;
# transform() returns a new dataset rather than modifying in place
for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
# extract number of features
n_features = train_dataset.get_data_shape()[0]
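# with the 1024-bit CircularFingerprint above, n_features is 1024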
# default config for W&B
c = {
    "feats" : n_features,
    "batch_size" : 32,
    "dropout" : 0.25,
    "layers" : "1000",  # space-separated layer sizes, e.g. "1000 500" for two hidden layers
    "lr" : 0.001,
    "epochs" : 10
}
# initialize logging to W&B
wandb.init(project="deepchem_molsol", entity="stacey")
cfg = wandb.config
cfg.setdefaults(c)
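# setdefaults() only fills in values not already set, so a W&B sweep (or resumed run)
# can override any of these hyperparameters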
run_name = "fc L_" + str(cfg.layers) + " d_" + str(cfg.dropout) + " lr_" + str(cfg.lr)+ " bs_" + str(cfg.batch_size)
wandb.run.name = run_name
# process layers to create deeper network
new_layers = [int(l) for l in cfg.layers.split(" ")]
# create regression model using deepchem wrapper on Keras
# (nb_epoch is an argument to fit() below, not to the model constructor)
dnn = dc.models.MultitaskRegressor(1, cfg.feats, batch_size=cfg.batch_size,
    dropouts=cfg.dropout, layer_sizes=new_layers, learning_rate=cfg.lr)
# train model
dnn.fit(train_dataset, nb_epoch=cfg.epochs)
# Evaluate trained model on validation data and log R^2 to W&B
metric = dc.metrics.Metric(dc.metrics.r2_score)
dnn_valid_evaluator = Evaluator(dnn, valid_dataset, transformers)
dnn_valid_r2score = dnn_valid_evaluator.compute_model_performance([metric])
print("DNN validation set R^2 %f" % (dnn_valid_r2score["r2_score"]))
wandb.log({"r2" : dnn_valid_r2score["r2_score"]})
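# the logged r2 appears in this run's history and summary in the W&B project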
########## deepchem's internal hyperparameter optimization (optional, unused above)
# params_dict = {"learning_rate": [0.001],  # or np.power(10., np.random.uniform(-5, -3, size=1)),
#                "decay": np.power(10, np.random.uniform(-6, -4, size=1)),
#                "nb_epoch": [20]}
#
# def model_builder(model_params, model_dir):
#     model = dc.models.MultitaskRegressor(
#         1, n_features, layer_sizes=[1000], dropouts=[.25],
#         batch_size=50, **model_params)
#     return model
#
# optimizer = dc.hyper.HyperparamOpt(model_builder)
# dnn, best_dnn_hyperparams, all_dnn_results = optimizer.hyperparam_search(
#     params_dict, train_dataset, valid_dataset, transformers,
#     metric=metric)
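########## optional: hyperparameter search via a W&B sweep
# The defaults in `c` can also be searched with a W&B sweep instead of
# dc.hyper.HyperparamOpt, since cfg.setdefaults() only fills in values that a
# sweep has not already set. A minimal sketch (the file name and parameter
# ranges below are illustrative, not part of the original script):
#
# sweep.yaml:
#   program: mol_sol_train.py
#   method: grid
#   parameters:
#     lr:
#       values: [0.0001, 0.001, 0.01]
#     dropout:
#       values: [0.1, 0.25, 0.5]
#     layers:
#       values: ["1000", "1000 500"]
#
# run from the command line with:
#   wandb sweep sweep.yaml
#   wandb agent <entity>/deepchem_molsol/<sweep_id>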