Created
May 6, 2021 17:59
-
-
Save staceysv/32efc11f694507274182fdac95576f2b to your computer and use it in GitHub Desktop.
basic script to log deepchem molecular solubility tutorial to wandb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# mol_sol_train.py
# --------------
# Load molecular solubility dataset (Delaney), featurize molecules as circular
# fingerprints, train a fully-connected regression network, and evaluate by
# R^2 score, logging the run to Weights & Biases.
import deepchem as dc
from deepchem.utils.evaluate import Evaluator
from deepchem.utils.save import load_from_disk
import numpy as np
import numpy.random
import wandb
import tensorflow as tf
tf.enable_eager_execution()

# Load the raw Delaney solubility CSV from disk.
dataset_file = "delaney-processed.csv"
dataset = load_from_disk(dataset_file)
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))

# Featurize each molecule as a 1024-bit circular (ECFP-style) fingerprint.
featurizer = dc.feat.CircularFingerprint(size=1024)
loader = dc.data.CSVLoader(
    tasks=["measured log solubility in mols per litre"], smiles_field="smiles",
    featurizer=featurizer)
dataset = loader.featurize(dataset_file)

# Scaffold split keeps structurally similar molecules in the same fold,
# giving a more realistic estimate of generalization than a random split.
splitter = dc.splits.ScaffoldSplitter(dataset_file)
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)

# Normalize labels (statistics fit on the training split only).
# BUG FIX: the original loop rebound the local name `dataset` and discarded
# the transformed result, so train/valid/test were never actually normalized.
# Assign the transformed datasets back to their names instead.
transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

# Extract number of features per example (the fingerprint length).
n_features = train_dataset.get_data_shape()[0]

# Default config for W&B; a sweep may override any of these values
# (e.g. "layers" may arrive as a space-separated string like "1000 500").
c = {
    "feats": n_features,
    "batch_size": 32,
    "dropout": 0.25,
    "layers": [1000],
    "lr": 0.001,
    "epochs": 10
}

# Initialize logging to W&B and give the run a descriptive name.
wandb.init(project="deepchem_molsol", entity="stacey")
cfg = wandb.config
cfg.setdefaults(c)
run_name = ("fc L_" + str(cfg.layers) + " d_" + str(cfg.dropout) +
            " lr_" + str(cfg.lr) + " bs_" + str(cfg.batch_size))
wandb.run.name = run_name

# Parse layer sizes into a list of ints to build a deeper network.
# BUG FIX: the default config stores a list ([1000]), but the original code
# unconditionally called `.split(" ")`, which only exists on strings — the
# script crashed with AttributeError unless a sweep supplied a string.
# Accept either a space-separated string or an iterable of sizes.
if isinstance(cfg.layers, str):
    new_layers = [int(l) for l in cfg.layers.split()]
else:
    new_layers = [int(l) for l in cfg.layers]

# Create regression model using deepchem's wrapper on Keras.
# NOTE(review): nb_epoch in the constructor looks redundant with the
# nb_epoch passed to fit() below — kept as-is to preserve behavior; confirm
# against the deepchem MultitaskRegressor API before removing.
dnn = dc.models.MultitaskRegressor(
    1, cfg.feats, batch_size=cfg.batch_size, dropouts=cfg.dropout,
    layer_sizes=new_layers, learning_rate=cfg.lr, nb_epoch=cfg.epochs)

# Train model.
dnn.fit(train_dataset, nb_epoch=cfg.epochs)

# Evaluate trained model on validation data and log R^2 to W&B.
metric = dc.metrics.Metric(dc.metrics.r2_score)
dnn_test_evaluator = Evaluator(dnn, valid_dataset, transformers)
dnn_test_r2score = dnn_test_evaluator.compute_model_performance([metric])
print("DNN Test set R^2 %f" % (dnn_test_r2score["r2_score"]))
wandb.log({"r2": dnn_test_r2score["r2_score"]})

########## deepchem internals
#params_dict = {"learning_rate": [0.001], #np.power(10., np.random.uniform(-5, -3, size=1)),
#               "decay": np.power(10, np.random.uniform(-6, -4, size=1)),
#               "nb_epoch": [20] }
#def model_builder(model_params, model_dir):
#  model = dc.models.MultitaskRegressor(
#    1, n_features, layer_sizes=[1000], dropouts=[.25],
#    batch_size=50, **params_dict) #model_params)
#  return model
# deepchem's internal HP optimization
#optimizer = dc.hyper.HyperparamOpt(model_builder)
#dnn, best_dnn_hyperparams, all_dnn_results = optimizer.hyperparam_search(
#    params_dict, train_dataset, valid_dataset, transformers,
#    metric=metric)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment