Skip to content

Instantly share code, notes, and snippets.

View Ryan-Rhys's full-sized avatar

Ryan-Rhys Griffiths Ryan-Rhys

View GitHub Profile
# Order the indices of y_var from smallest to largest value (np.argsort is
# ascending), then collapse the result to a 1-D index array.
# NOTE(review): y_var is presumably the GP predictive variance, so the first
# entries would be the most confident predictions - confirm against the caller.
variance_sort_order = np.argsort(y_var, axis=0)
ranked_confidence_list = variance_sort_order.flatten()
print(ranked_confidence_list)
# Output Standardised RMSE and RMSE on Train Set
# Only the first element returned by predict_f (the predictive mean) is used;
# the second is discarded.
y_pred_train, _ = m.predict_f(X_train)

# RMSE in the standardised target space the model was trained in.
train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))

# Map targets and predictions back through the target scaler before scoring,
# so this RMSE is expressed in the original units of the property.
y_train_original = y_scaler.inverse_transform(y_train)
y_pred_train_original = y_scaler.inverse_transform(y_pred_train)
train_rmse = np.sqrt(mean_squared_error(y_train_original, y_pred_train_original))

# The standardised RMSE is dimensionless, so it is printed without a unit;
# only the inverse-transformed RMSE is labelled in nm.
print("\nTrain RMSE (Standardised): {:.3f}".format(train_rmse_stan))
print("Train RMSE: {:.3f} nm".format(train_rmse))
# Output R^2, RMSE and MAE on the test set
# Fraction of the data held out as the test set. It must be assigned before
# the split below, which reads it.
test_set_size = 0.2

# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=0)

# Turn the targets into column vectors of shape (n, 1).
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# We standardise the outputs but leave the inputs unchanged
# (the scaled inputs returned by transform_data are deliberately discarded).
_, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

# NOTE(review): cast presumably needed because GPflow computes in float64 by
# default - confirm. X_test is not cast anywhere in the visible code.
X_train = X_train.astype(np.float64)
def transform_data(X_train, y_train, X_test, y_test):
"""
Apply feature scaling to the data. Return the standardised train and
test sets together with the scaler object for the target values.
:param X_train: input train data
:param y_train: train labels
:param X_test: input test data
:param y_test: test labels
:return: X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler
# We define the Gaussian Process Regression Model using the Tanimoto kernel
# Placeholder for the model: it has to be rebound to an object exposing
# log_marginal_likelihood() before objective_closure is called.
m = None


def objective_closure():
    """Return the negative log marginal likelihood of the global model ``m``.

    The function takes no arguments and looks ``m`` up at call time, so it
    always reflects whatever model ``m`` is currently bound to.
    NOTE(review): presumably passed to an optimiser as the loss to minimise -
    confirm against the caller.

    :return: ``-m.log_marginal_likelihood()``
    :raises AttributeError: if ``m`` is still ``None`` when called
    """
    return -m.log_marginal_likelihood()
# Parse every SMILES string into an RDKit molecule.
rdkit_mols = list(map(MolFromSmiles, smiles_list))

# Featurise each molecule as a Morgan fingerprint bit vector (radius 3,
# 2048 bits), then convert the list of bit vectors into a NumPy array.
morgan_fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=2048)
    for mol in rdkit_mols
]
X = np.asarray(morgan_fingerprints)
# Load the photoswitch dataset using pandas
df = pd.read_csv('../dataset/photoswitches.csv')

# Pull out the SMILES strings and the property column to be modelled.
smiles_list = df['SMILES'].to_list()
property_vals = df['E isomer pi-pi* wavelength in nm'].to_numpy()

# Locate the entries with no measured property value once, then drop those
# positions from both the SMILES list and the targets so they stay aligned.
nan_positions = np.argwhere(np.isnan(property_vals))
smiles_list = list(np.delete(np.array(smiles_list), nan_positions))
y = np.delete(property_vals, nan_positions)
class Tanimoto(gpflow.kernels.Kernel):
def __init__(self):
super().__init__()
# We constrain the value of the kernel variance to be positive when it's being optimised
self.variance = gpflow.Parameter(1.0, transform=positive())
def K(self, X, X2=None):
"""
Compute the Tanimoto kernel matrix σ² * ((<x, y>) / (||x||^2 + ||y||^2 - <x, y>))
import gpflow
from gpflow.mean_functions import Constant
from gpflow.utilities import positive, print_summary
from gpflow.utilities.ops import broadcasting_elementwise
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error