Ryan-Rhys Griffiths Ryan-Rhys

## conf.py
# Order the indices of y_var from smallest to largest value (argsort is
# ascending) and collapse the result to a 1-D array.
# NOTE(review): y_var is presumably the predictive variance, so the first
# index would be the most confident prediction -- confirm against the caller.
ascending_order = np.argsort(y_var, axis=0)
ranked_confidence_list = ascending_order.flatten()
print(ranked_confidence_list)

## output.py
# Output the RMSE on the train set, both on the standardised scale and in nm

# predict_f returns two values; only the first (the prediction) is used here
y_pred_train, _ = m.predict_f(X_train)
train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
# Undo the target scaling on labels and predictions alike so that the error
# is expressed in the original units of the property
train_rmse = np.sqrt(
    mean_squared_error(
        y_scaler.inverse_transform(y_train),
        y_scaler.inverse_transform(y_pred_train),
    )
)
# The standardised RMSE is computed on scaled targets and is therefore
# dimensionless: it must not be labelled with the nm unit
print("\nTrain RMSE (Standardised): {:.3f}".format(train_rmse_stan))
print("Train RMSE: {:.3f} nm".format(train_rmse))


# Output R^2, RMSE and MAE on the test set

## train_test.py
# Hold out a test split; the fixed random_state makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_set_size,
    random_state=0,
)

# Turn both label vectors into single-column 2-D arrays
y_train, y_test = y_train.reshape(-1, 1), y_test.reshape(-1, 1)

# Only the outputs are standardised: the scaled inputs returned by
# transform_data are discarded, so X_train and X_test keep their raw values

_, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

# NOTE(review): only X_train is cast to float64 here -- confirm whether X_test
# needs the same cast before it is handed to the model
X_train = X_train.astype(np.float64)

## test_set.py
# Fraction of the dataset held out as the test set; passed as test_size to
# train_test_split
test_set_size = 0.2

## transform.py
def transform_data(X_train, y_train, X_test, y_test):
    """
    Apply feature scaling to the data. Return the standardised train and
    test sets together with the scaler object for the target values.

    :param X_train: input train data
    :param y_train: train labels
    :param X_test: input test data
    :param y_test: test labels
    :return: X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler

## objective_closure.py
# The Gaussian Process Regression model using the Tanimoto kernel lives in the
# module-level name ``m``. It starts out as None here (presumably rebound to
# the actual model before objective_closure is called -- not shown here).

m = None


def objective_closure():
    """Return the negated log marginal likelihood of the module-level model ``m``."""
    log_evidence = m.log_marginal_likelihood()
    return -log_evidence

## morgan.py
# Parse every SMILES string into an RDKit molecule
rdkit_mols = list(map(MolFromSmiles, smiles_list))

# Featurise each molecule as a 2048-bit Morgan fingerprint of radius 3 and
# stack the fingerprints into a single NumPy array
fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, radius=3, nBits=2048)
    for molecule in rdkit_mols
]
X = np.asarray(fingerprints)

## load_molecules.py
# Load the photoswitch dataset using pandas
df = pd.read_csv('../dataset/photoswitches.csv')

# Pull out the SMILES strings and the property values associated with them
smiles_list = df['SMILES'].to_list()
property_vals = df['E isomer pi-pi* wavelength in nm'].to_numpy()

# Remove every entry whose property value is NaN; the same indices are deleted
# from both sequences so molecules and targets stay aligned
nan_rows = np.argwhere(np.isnan(property_vals))
smiles_list = list(np.delete(np.array(smiles_list), nan_rows))
y = np.delete(property_vals, nan_rows)

## tanimoto_class.py
class Tanimoto(gpflow.kernels.Kernel):
    def __init__(self):
        super().__init__()
        # We constrain the value of the kernel variance to be positive when it's being optimised
        self.variance = gpflow.Parameter(1.0, transform=positive())

    def K(self, X, X2=None):
        """
        Compute the Tanimoto kernel matrix σ² * ((<x, y>) / (||x||^2 + ||y||^2 - <x, y>))

## gp_library_imports_molecules.py
import gpflow
from gpflow.mean_functions import Constant
from gpflow.utilities import positive, print_summary
from gpflow.utilities.ops import broadcasting_elementwise
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
	# Order the indices of y_var from smallest to largest value (argsort is
	# ascending) and collapse the result to a 1-D array.
	# NOTE(review): y_var is presumably the predictive variance -- confirm.
	ascending_order = np.argsort(y_var, axis=0)
	ranked_confidence_list = ascending_order.flatten()
	print(ranked_confidence_list)
	# Output the RMSE on the train set, both on the standardised scale and in nm

	# predict_f returns two values; only the first (the prediction) is used here
	y_pred_train, _ = m.predict_f(X_train)
	train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
	# Undo the target scaling on labels and predictions alike so that the error
	# is expressed in the original units of the property
	train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train)))
	# The standardised RMSE is computed on scaled targets and is therefore
	# dimensionless: it must not be labelled with the nm unit
	print("\nTrain RMSE (Standardised): {:.3f}".format(train_rmse_stan))
	print("Train RMSE: {:.3f} nm".format(train_rmse))


	# Output R^2, RMSE and MAE on the test set
	# Hold out a test split; the fixed random_state makes the partition repeatable
	X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=test_set_size)

	# Turn both label vectors into single-column 2-D arrays
	y_train, y_test = y_train.reshape(-1, 1), y_test.reshape(-1, 1)

	# Only the outputs are standardised: the scaled inputs returned by
	# transform_data are discarded, so X_train and X_test keep their raw values

	_, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

	# NOTE(review): only X_train is cast to float64 here -- confirm whether
	# X_test needs the same cast before it is handed to the model
	X_train = X_train.astype(np.float64)
	def transform_data(X_train, y_train, X_test, y_test):
	"""
	Apply feature scaling to the data. Return the standardised train and
	test sets together with the scaler object for the target values.

	:param X_train: input train data
	:param y_train: train labels
	:param X_test: input test data
	:param y_test: test labels
	:return: X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler
	# The Gaussian Process Regression model using the Tanimoto kernel lives in
	# the name ``m``. It starts out as None here (presumably rebound to the
	# actual model before objective_closure is called -- not shown here).

	m = None

	def objective_closure():
		"""Return the negated log marginal likelihood of the model ``m``."""
		# The body must be nested under the def; without the extra indent the
		# function has no body and the snippet does not parse
		return -m.log_marginal_likelihood()
	# Parse every SMILES string into an RDKit molecule
	rdkit_mols = list(map(MolFromSmiles, smiles_list))
	# Featurise each molecule as a 2048-bit Morgan fingerprint of radius 3 and
	# stack the fingerprints into a single NumPy array
	fingerprints = [AllChem.GetMorganFingerprintAsBitVect(molecule, radius=3, nBits=2048) for molecule in rdkit_mols]
	X = np.asarray(fingerprints)
	# Load the photoswitch dataset using pandas
	df = pd.read_csv('../dataset/photoswitches.csv')

	# Pull out the SMILES strings and the property values associated with them
	smiles_list = df['SMILES'].to_list()
	property_vals = df['E isomer pi-pi* wavelength in nm'].to_numpy()

	# Remove every entry whose property value is NaN; the same indices are
	# deleted from both sequences so molecules and targets stay aligned
	nan_rows = np.argwhere(np.isnan(property_vals))
	smiles_list = list(np.delete(np.array(smiles_list), nan_rows))
	y = np.delete(property_vals, nan_rows)
	class Tanimoto(gpflow.kernels.Kernel):
	def __init__(self):
	super().__init__()
	# We constrain the value of the kernel variance to be positive when it's being optimised
	self.variance = gpflow.Parameter(1.0, transform=positive())

	def K(self, X, X2=None):
	"""
	Compute the Tanimoto kernel matrix σ² * ((<x, y>) / (||x||^2 + ||y||^2 - <x, y>))
	import gpflow
	from gpflow.mean_functions import Constant
	from gpflow.utilities import positive, print_summary
	from gpflow.utilities.ops import broadcasting_elementwise
	from matplotlib import pyplot as plt
	import numpy as np
	import pandas as pd
	from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error