@0ut0fcontrol
Created January 26, 2018 10:22
KRR(CM) gives much worse performance when the Coulomb matrices are featurized from gdb7.sdf (~70 kcal/mol MAE) than when they are loaded with dc.molnet.load_qm7_from_mat (~10 kcal/mol MAE). The two scripts below reproduce the comparison; a small diagnostic sketch follows them.
"""
Script 1: trains a scikit-learn KernelRidge (singletask) model on the GDB7 dataset,
with Coulomb matrices featurized directly from gdb7.sdf.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import os
import deepchem as dc
import numpy as np
import shutil
from sklearn.kernel_ridge import KernelRidge
np.random.seed(123)
base_dir = "/tmp/gdb7_sklearn"
data_dir = os.path.join(base_dir, "dataset")
model_dir = os.path.join(base_dir, "model")
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
os.makedirs(base_dir)
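# Featurize each molecule in gdb7.sdf as a Coulomb matrix, zero-padded to
# max_num_atoms (GDB7/QM7 molecules contain at most 23 atoms).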
max_num_atoms = 23
featurizer = dc.feat.CoulombMatrix(max_num_atoms)
input_file = "gdb7.sdf"
tasks = ["u0_atom"]
smiles_field = "smiles"
mol_field = "mol"
loader = dc.data.SDFLoader(tasks, smiles_field=smiles_field, mol_field=mol_field, featurizer=featurizer)
dataset = loader.featurize(input_file, data_dir)
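# Randomly split the featurized dataset into train/validation/test subsets
# (DeepChem's default 80/10/10 fractions).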
random_splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = random_splitter.train_valid_test_split(dataset)
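# Normalize the u0_atom labels (atomization energies, y) using statistics
# computed on the training set.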
transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    test_dataset = transformer.transform(test_dataset)
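# Apply DeepChem's CoulombFitTransformer (fit on the training set) to the
# Coulomb-matrix features of both the train and test sets.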
fit_transformer = dc.trans.CoulombFitTransformer(train_dataset)
train_dataset = fit_transformer.transform(train_dataset)
test_dataset = fit_transformer.transform(test_dataset)
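# Score with mean absolute error; the u0_atom labels are reported in kcal/mol.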
regression_metric = dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression")
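# Kernel ridge regression with an RBF kernel, wrapped as a DeepChem SklearnModel;
# SingletaskToMultitask builds one such model per task (here just u0_atom).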
def model_builder(model_dir):
    sklearn_model = KernelRidge(kernel="rbf", alpha=0.00121803)
    return dc.models.SklearnModel(sklearn_model, model_dir)
model = dc.models.SingletaskToMultitask(tasks, model_builder, model_dir)
# Fit the model on the training set
model.fit(train_dataset)
model.save()
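# Evaluate MAE on the train and test sets; the Evaluator undoes the
# y-normalization via `transformers` before computing the metric.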
train_evaluator = dc.utils.evaluate.Evaluator(model, train_dataset, transformers)
train_scores = train_evaluator.compute_model_performance([regression_metric])
print("Train scores [kcal/mol]")
print(train_scores)
test_evaluator = dc.utils.evaluate.Evaluator(model, test_dataset, transformers)
test_scores = test_evaluator.compute_model_performance([regression_metric])
print("Validation scores [kcal/mol]")
print(test_scores) # about 70 kcal/mol
"""
Script 2: trains the same scikit-learn KernelRidge model on GDB7/QM7,
loading Coulomb-matrix features with dc.molnet.load_qm7_from_mat.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import os
import deepchem as dc
import numpy as np
import shutil
from sklearn.kernel_ridge import KernelRidge
np.random.seed(123)
base_dir = "/tmp/gdb7_sklearn"
data_dir = os.path.join(base_dir, "dataset")
model_dir = os.path.join(base_dir, "model")
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
os.makedirs(base_dir)
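# Load QM7 through MoleculeNet: Coulomb-matrix features built from the original
# qm7.mat file, a random split, and the associated y transformers.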
tasks, all_datasets, transformers = dc.molnet.load_qm7_from_mat(
    featurizer='CoulombMatrix', split='random', reload=False)
train_dataset, valid_dataset, test_dataset = all_datasets
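# Same CoulombFitTransformer step as in Script 1, applied to the MoleculeNet features.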
fit_transformer = dc.trans.CoulombFitTransformer(train_dataset)
train_dataset = fit_transformer.transform(train_dataset)
test_dataset = fit_transformer.transform(test_dataset)
regression_metric = dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression")
def model_builder(model_dir):
    sklearn_model = KernelRidge(kernel="rbf", alpha=0.00121803)
    return dc.models.SklearnModel(sklearn_model, model_dir)
model = dc.models.SingletaskToMultitask(tasks, model_builder, model_dir)
# Fit the model on the training set
model.fit(train_dataset)
model.save()
train_evaluator = dc.utils.evaluate.Evaluator(model, train_dataset, transformers)
train_scores = train_evaluator.compute_model_performance([regression_metric])
print("Train scores [kcal/mol]")
print(train_scores)
test_evaluator = dc.utils.evaluate.Evaluator(model, test_dataset, transformers)
test_scores = test_evaluator.compute_model_performance([regression_metric])
print("Validation scores [kcal/mol]")
print(test_scores) # about 10 kcal/mol
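
A minimal diagnostic sketch (not part of the original scripts), assuming `sdf_dataset` and `mat_dataset` are placeholders for the featurized datasets built in Script 1 and Script 2: comparing basic statistics of the two feature sets is one way to start chasing the ~70 vs ~10 kcal/mol gap.

import numpy as np

def summarize(name, dataset):
    # Flatten each molecule's (padded) Coulomb matrix into one feature vector.
    X = dataset.X.reshape(len(dataset), -1)
    print("%s: shape=%s nonzero_frac=%.3f mean=%.3f max=%.3f"
          % (name, X.shape, np.count_nonzero(X) / X.size, X.mean(), X.max()))

# summarize("gdb7.sdf Coulomb matrices", sdf_dataset)   # placeholder for Script 1 features
# summarize("qm7.mat Coulomb matrices", mat_dataset)    # placeholder for Script 2 features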