CheML-gist/ANN_correlation.py

## ANN_correlation.py
import pandas as pd
from rdkit.Chem import AllChem, DataStructs
from rdkit import Chem
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.figure as figure
import time
import pubchempy as pcp

print('Program started!')

#SDF Loading-------------------------------------------------------------------
print('\nLoading SDF...')
start = time.time()

sdf = Chem.SDMolSupplier('gdb9.sdf') # Calculation level: B3LYP/6-31G(2df,p)
sdf_id = ['gdb_' + str(i + 1) for i in range(len(sdf))]
sdf = pd.DataFrame(sdf)
sdf.index = sdf_id
sdf.columns = ['ROMol']

dataset = pd.read_csv('gdb9.sdf.csv', index_col=0)
mols_with_props = pd.concat([sdf, dataset], axis=1)
mols_with_props = mols_with_props.dropna()

print('SDF loaded!', 'Time passed: ', time.time() - start)

#Filtering---------------------------------------------------------------------
print('\nFiltering samples...')
molecule_1 = pcp.get_compounds('pyrrole', 'name')
molecule_1 = molecule_1[0].canonical_smiles
filtering_mol_1 = Chem.MolFromSmiles(molecule_1)
AllChem.Compute2DCoords(filtering_mol_1)

molecule_2 = pcp.get_compounds('pyridine', 'name')
molecule_2 = molecule_2[0].canonical_smiles
filtering_mol_2 = Chem.MolFromSmiles(molecule_2)
AllChem.Compute2DCoords(filtering_mol_2)

molecule_3 = pcp.get_compounds('1-butene', 'name')
molecule_3 = molecule_3[0].canonical_smiles
filtering_mol_3 = Chem.MolFromSmiles(molecule_3)
AllChem.Compute2DCoords(filtering_mol_3)

fps_with_props = []
for i in range(len(mols_with_props)):
    m = mols_with_props.ROMol[i]
    if m.HasSubstructMatch(filtering_mol_1):
        fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])
    elif m.HasSubstructMatch(filtering_mol_2):
        fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])
    elif m.HasSubstructMatch(filtering_mol_3):
        fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])

fps_with_props = pd.DataFrame(fps_with_props)

#Setting the cutoff value
cutoff = 3 * np.std(fps_with_props.homo)
mean = np.mean(fps_with_props.homo)
remove_molecule = []

for i in range(len(fps_with_props.homo)):
    if fps_with_props.homo[i] > mean + cutoff or fps_with_props.homo[i] < mean - cutoff:
        remove_molecule.append(fps_with_props.index[i])

fps_with_props = fps_with_props.drop(remove_molecule, axis=0)

print('Filtering completed!', 'Time passed: ', time.time() - start)
print('{}samples matched!'.format(len(fps_with_props)))

#Fingerprint-------------------------------------------------------------------
def mol_to_fps(mol):
    Hashed = np.zeros(4096, dtype=int)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 6, 4096) # Morgan Fingerprint
    # fp = AllChem.GetMACCSKeysFingerprint(mol) #MACCS Fingerprint
    # fp = Chem.rdmolops.RDKFingerprint(mol) #Topological Fingerprint
    DataStructs.ConvertToNumpyArray(fp, Hashed)
    return Hashed

print('\nLoading fingerprints...')
fps_with_props['fp'] = fps_with_props.ROMol.map(mol_to_fps)
print('FingerPrints loaded!', 'Time passed: ', time.time() - start)

#Definition of explanatory and objective variables----------------------------
x = fps_with_props.fp
y = fps_with_props.homo

#Convert fingerprints to numpy array type-------------------------------------
print('\nInt coversion started...')
x_array = np.asarray([i for i in x])
print('Int conversion complete!', 'Time passed: ', time.time() - start)

#Split data to the train and test data-----------------------------------------
num_test = round(0.2 * len(x))
x_train, x_test, y_train, y_test = train_test_split(x_array, y, test_size=num_test, shuffle=True, random_state=0)

#Autoscaling-------------------------------------------------------------------
autoscaled_y_train = (y_train - np.mean(y_train)) / np.std(y_train)
autoscaled_y_test = (y_test - np.mean(y_test)) / np.std(y_test)
autoscaled_y_train = np.asarray(autoscaled_y_train)
autoscaled_y_test = np.asarray(autoscaled_y_test)

#Learning----------------------------------------------------------------------
print('\n\n################ MODEL SETUP ##################\n\n')

model_simple = keras.models.Sequential()
model_simple.add(keras.layers.Dense(100, kernel_regularizer=keras.regularizers.l2(0.01), activation='tanh', input_shape=[len(fps_with_props.iloc[0, 20])]))
model_simple.add(keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(0.01), activation='tanh'))
# model_simple.add(keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(0.01), activation='linear'))
model_simple.add(keras.layers.Dense(1, kernel_regularizer=keras.regularizers.l2(0.01), activation='linear'))
model_simple.summary()
keras.utils.plot_model(model_simple)

model_simple.compile(loss=keras.losses.mean_squared_error,
                      optimizer=keras.optimizers.Adam(lr=5e-4),
                      metrics=['mae'])

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
history0 = model_simple.fit(x_train, autoscaled_y_train, epochs=150, validation_data=[x_test, autoscaled_y_test], callbacks=[early_stop])

#Plot the history--------------------------------------------------------------
def plot_history(history):
    history_dict = history.history
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']

    mae = history_dict['mae']
    val_mae = history_dict['val_mae']

    epochs = range(1, len(history_dict['mae']) + 1)

    plt.rcParams['font.size'] = 15
    plt.rcParams['font.family'] = 'times new roman'
    plt.plot(epochs, loss_values, 'o', c='b', label='training loss')
    plt.plot(epochs, val_loss_values, c='m', label='test loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    plt.plot(epochs, mae, 'o', c='b', label='training mae')
    plt.plot(epochs, val_mae, c='m', label='test mae')
    plt.xlabel('Epochs')
    plt.ylabel('Mae')
    plt.legend()
    plt.show()

plot_history(history0)

estimated_y_train = model_simple.predict(x_train).flatten()
estimated_y_test = model_simple.predict(x_test).flatten()

#Plot the estimation result----------------------------------------------------
def estimated_plot(actual_y, estimated_y):
    plt.figure(figsize=figure.figaspect(1))
    y_max = max(max(actual_y), max(estimated_y))
    y_min = min(min(actual_y), min(estimated_y))
    plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
              [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-', markersize=5)
    plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
    plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
    plt.xlabel('actual y')
    plt.ylabel('estimated y')
    plt.scatter(actual_y, estimated_y, c='b', s=5)
    plt.show()

estimated_plot(autoscaled_y_train, estimated_y_train)
estimated_plot(autoscaled_y_test, estimated_y_test)

R2_train = (np.cov([estimated_y_train, autoscaled_y_train])[0, 1] / (np.std(estimated_y_train,ddof=1) * np.std(autoscaled_y_train, ddof=1))) ** 2
R2_train = round(R2_train, 4)
print('R2_train: ' + str(R2_train))
R2_test = (np.cov([estimated_y_test, autoscaled_y_test])[0, 1] / (np.std(estimated_y_test,ddof=1) * np.std(autoscaled_y_test, ddof=1))) ** 2
R2_test = round(R2_test, 4)
print('R2_test: ' + str(R2_test))

print('Analysis completed!', 'Time passed: ', time.time() - start)

#Estimation of the HOMO energy of pyrrole--------------------------------------
with open('pyrrole.mol', mode='r') as f:
    mol = f.read()

mol = Chem.MolFromMolBlock(mol)
mol_fp = mol_to_fps(mol)
mol_fp= np.array([mol_fp])

estimated_param = model_simple.predict(mol_fp)
estimated_param = estimated_param * np.std(y_train) + np.mean(y_train)
estimated_param = estimated_param[0]
print(estimated_param[0], 'hartree')
	import pandas as pd
	from rdkit.Chem import AllChem, DataStructs
	from rdkit import Chem
	from tensorflow import keras
	from sklearn.model_selection import train_test_split
	import numpy as np
	import matplotlib.pyplot as plt
	import matplotlib.figure as figure
	import time
	import pubchempy as pcp

	print('Program started!')

	#SDF Loading-------------------------------------------------------------------
	print('\nLoading SDF...')
	start = time.time()

	sdf = Chem.SDMolSupplier('gdb9.sdf') # Calculation level: B3LYP/6-31G(2df,p)
	sdf_id = ['gdb_' + str(i + 1) for i in range(len(sdf))]
	sdf = pd.DataFrame(sdf)
	sdf.index = sdf_id
	sdf.columns = ['ROMol']

	dataset = pd.read_csv('gdb9.sdf.csv', index_col=0)
	mols_with_props = pd.concat([sdf, dataset], axis=1)
	mols_with_props = mols_with_props.dropna()

	print('SDF loaded!', 'Time passed: ', time.time() - start)

	#Filtering---------------------------------------------------------------------
	print('\nFiltering samples...')
	molecule_1 = pcp.get_compounds('pyrrole', 'name')
	molecule_1 = molecule_1[0].canonical_smiles
	filtering_mol_1 = Chem.MolFromSmiles(molecule_1)
	AllChem.Compute2DCoords(filtering_mol_1)

	molecule_2 = pcp.get_compounds('pyridine', 'name')
	molecule_2 = molecule_2[0].canonical_smiles
	filtering_mol_2 = Chem.MolFromSmiles(molecule_2)
	AllChem.Compute2DCoords(filtering_mol_2)

	molecule_3 = pcp.get_compounds('1-butene', 'name')
	molecule_3 = molecule_3[0].canonical_smiles
	filtering_mol_3 = Chem.MolFromSmiles(molecule_3)
	AllChem.Compute2DCoords(filtering_mol_3)

	fps_with_props = []
	for i in range(len(mols_with_props)):
	m = mols_with_props.ROMol[i]
	if m.HasSubstructMatch(filtering_mol_1):
	fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])
	elif m.HasSubstructMatch(filtering_mol_2):
	fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])
	elif m.HasSubstructMatch(filtering_mol_3):
	fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])

	fps_with_props = pd.DataFrame(fps_with_props)

	#Setting the cutoff value
	cutoff = 3 * np.std(fps_with_props.homo)
	mean = np.mean(fps_with_props.homo)
	remove_molecule = []

	for i in range(len(fps_with_props.homo)):
	if fps_with_props.homo[i] > mean + cutoff or fps_with_props.homo[i] < mean - cutoff:
	remove_molecule.append(fps_with_props.index[i])

	fps_with_props = fps_with_props.drop(remove_molecule, axis=0)

	print('Filtering completed!', 'Time passed: ', time.time() - start)
	print('{}samples matched!'.format(len(fps_with_props)))

	#Fingerprint-------------------------------------------------------------------
	def mol_to_fps(mol):
	Hashed = np.zeros(4096, dtype=int)
	fp = AllChem.GetMorganFingerprintAsBitVect(mol, 6, 4096) # Morgan Fingerprint
	# fp = AllChem.GetMACCSKeysFingerprint(mol) #MACCS Fingerprint
	# fp = Chem.rdmolops.RDKFingerprint(mol) #Topological Fingerprint
	DataStructs.ConvertToNumpyArray(fp, Hashed)
	return Hashed

	print('\nLoading fingerprints...')
	fps_with_props['fp'] = fps_with_props.ROMol.map(mol_to_fps)
	print('FingerPrints loaded!', 'Time passed: ', time.time() - start)

	#Definition of explanatory and objective variables----------------------------
	x = fps_with_props.fp
	y = fps_with_props.homo

	#Convert fingerprints to numpy array type-------------------------------------
	print('\nInt coversion started...')
	x_array = np.asarray([i for i in x])
	print('Int conversion complete!', 'Time passed: ', time.time() - start)

	#Split data to the train and test data-----------------------------------------
	num_test = round(0.2 * len(x))
	x_train, x_test, y_train, y_test = train_test_split(x_array, y, test_size=num_test, shuffle=True, random_state=0)

	#Autoscaling-------------------------------------------------------------------
	autoscaled_y_train = (y_train - np.mean(y_train)) / np.std(y_train)
	autoscaled_y_test = (y_test - np.mean(y_test)) / np.std(y_test)
	autoscaled_y_train = np.asarray(autoscaled_y_train)
	autoscaled_y_test = np.asarray(autoscaled_y_test)

	#Learning----------------------------------------------------------------------
	print('\n\n################ MODEL SETUP ##################\n\n')

	model_simple = keras.models.Sequential()
	model_simple.add(keras.layers.Dense(100, kernel_regularizer=keras.regularizers.l2(0.01), activation='tanh', input_shape=[len(fps_with_props.iloc[0, 20])]))
	model_simple.add(keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(0.01), activation='tanh'))
	# model_simple.add(keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(0.01), activation='linear'))
	model_simple.add(keras.layers.Dense(1, kernel_regularizer=keras.regularizers.l2(0.01), activation='linear'))
	model_simple.summary()
	keras.utils.plot_model(model_simple)

	model_simple.compile(loss=keras.losses.mean_squared_error,
	optimizer=keras.optimizers.Adam(lr=5e-4),
	metrics=['mae'])

	early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
	history0 = model_simple.fit(x_train, autoscaled_y_train, epochs=150, validation_data=[x_test, autoscaled_y_test], callbacks=[early_stop])

	#Plot the history--------------------------------------------------------------
	def plot_history(history):
	history_dict = history.history
	loss_values = history_dict['loss']
	val_loss_values = history_dict['val_loss']

	mae = history_dict['mae']
	val_mae = history_dict['val_mae']

	epochs = range(1, len(history_dict['mae']) + 1)

	plt.rcParams['font.size'] = 15
	plt.rcParams['font.family'] = 'times new roman'
	plt.plot(epochs, loss_values, 'o', c='b', label='training loss')
	plt.plot(epochs, val_loss_values, c='m', label='test loss')
	plt.xlabel('Epochs')
	plt.ylabel('Loss')
	plt.legend()
	plt.show()

	plt.plot(epochs, mae, 'o', c='b', label='training mae')
	plt.plot(epochs, val_mae, c='m', label='test mae')
	plt.xlabel('Epochs')
	plt.ylabel('Mae')
	plt.legend()
	plt.show()

	plot_history(history0)

	estimated_y_train = model_simple.predict(x_train).flatten()
	estimated_y_test = model_simple.predict(x_test).flatten()

	#Plot the estimation result----------------------------------------------------
	def estimated_plot(actual_y, estimated_y):
	plt.figure(figsize=figure.figaspect(1))
	y_max = max(max(actual_y), max(estimated_y))
	y_min = min(min(actual_y), min(estimated_y))
	plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
	[y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-', markersize=5)
	plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
	plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
	plt.xlabel('actual y')
	plt.ylabel('estimated y')
	plt.scatter(actual_y, estimated_y, c='b', s=5)
	plt.show()

	estimated_plot(autoscaled_y_train, estimated_y_train)
	estimated_plot(autoscaled_y_test, estimated_y_test)

	R2_train = (np.cov([estimated_y_train, autoscaled_y_train])[0, 1] / (np.std(estimated_y_train,ddof=1) * np.std(autoscaled_y_train, ddof=1))) ** 2
	R2_train = round(R2_train, 4)
	print('R2_train: ' + str(R2_train))
	R2_test = (np.cov([estimated_y_test, autoscaled_y_test])[0, 1] / (np.std(estimated_y_test,ddof=1) * np.std(autoscaled_y_test, ddof=1))) ** 2
	R2_test = round(R2_test, 4)
	print('R2_test: ' + str(R2_test))

	print('Analysis completed!', 'Time passed: ', time.time() - start)

	#Estimation of the HOMO energy of pyrrole--------------------------------------
	with open('pyrrole.mol', mode='r') as f:
	mol = f.read()

	mol = Chem.MolFromMolBlock(mol)
	mol_fp = mol_to_fps(mol)
	mol_fp= np.array([mol_fp])

	estimated_param = model_simple.predict(mol_fp)
	estimated_param = estimated_param * np.std(y_train) + np.mean(y_train)
	estimated_param = estimated_param[0]
	print(estimated_param[0], 'hartree')