Skip to content

Instantly share code, notes, and snippets.

@CheML-gist
Last active December 25, 2020 09:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CheML-gist/14989bb644dc328f6001380d638cd0d0 to your computer and use it in GitHub Desktop.
Save CheML-gist/14989bb644dc328f6001380d638cd0d0 to your computer and use it in GitHub Desktop.
import pandas as pd
from rdkit.Chem import AllChem, DataStructs
from rdkit import Chem
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.figure as figure
import time
import pubchempy as pcp
print('Program started!')
#SDF Loading-------------------------------------------------------------------
print('\nLoading SDF...')
start = time.time()
sdf = Chem.SDMolSupplier('gdb9.sdf') # Calculation level: B3LYP/6-31G(2df,p)
sdf_id = ['gdb_' + str(i + 1) for i in range(len(sdf))]
sdf = pd.DataFrame(sdf)
sdf.index = sdf_id
sdf.columns = ['ROMol']
dataset = pd.read_csv('gdb9.sdf.csv', index_col=0)
mols_with_props = pd.concat([sdf, dataset], axis=1)
mols_with_props = mols_with_props.dropna()
print('SDF loaded!', 'Time passed: ', time.time() - start)
#Filtering---------------------------------------------------------------------
print('\nFiltering samples...')
molecule_1 = pcp.get_compounds('pyrrole', 'name')
molecule_1 = molecule_1[0].canonical_smiles
filtering_mol_1 = Chem.MolFromSmiles(molecule_1)
AllChem.Compute2DCoords(filtering_mol_1)
molecule_2 = pcp.get_compounds('pyridine', 'name')
molecule_2 = molecule_2[0].canonical_smiles
filtering_mol_2 = Chem.MolFromSmiles(molecule_2)
AllChem.Compute2DCoords(filtering_mol_2)
molecule_3 = pcp.get_compounds('1-butene', 'name')
molecule_3 = molecule_3[0].canonical_smiles
filtering_mol_3 = Chem.MolFromSmiles(molecule_3)
AllChem.Compute2DCoords(filtering_mol_3)
fps_with_props = []
for i in range(len(mols_with_props)):
m = mols_with_props.ROMol[i]
if m.HasSubstructMatch(filtering_mol_1):
fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])
elif m.HasSubstructMatch(filtering_mol_2):
fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])
elif m.HasSubstructMatch(filtering_mol_3):
fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]])
fps_with_props = pd.DataFrame(fps_with_props)
#Setting the cutoff value
cutoff = 3 * np.std(fps_with_props.homo)
mean = np.mean(fps_with_props.homo)
remove_molecule = []
for i in range(len(fps_with_props.homo)):
if fps_with_props.homo[i] > mean + cutoff or fps_with_props.homo[i] < mean - cutoff:
remove_molecule.append(fps_with_props.index[i])
fps_with_props = fps_with_props.drop(remove_molecule, axis=0)
print('Filtering completed!', 'Time passed: ', time.time() - start)
print('{}samples matched!'.format(len(fps_with_props)))
#Fingerprint-------------------------------------------------------------------
def mol_to_fps(mol):
Hashed = np.zeros(4096, dtype=int)
fp = AllChem.GetMorganFingerprintAsBitVect(mol, 6, 4096) # Morgan Fingerprint
# fp = AllChem.GetMACCSKeysFingerprint(mol) #MACCS Fingerprint
# fp = Chem.rdmolops.RDKFingerprint(mol) #Topological Fingerprint
DataStructs.ConvertToNumpyArray(fp, Hashed)
return Hashed
print('\nLoading fingerprints...')
fps_with_props['fp'] = fps_with_props.ROMol.map(mol_to_fps)
print('FingerPrints loaded!', 'Time passed: ', time.time() - start)
#Definition of explanatory and objective variables----------------------------
x = fps_with_props.fp
y = fps_with_props.homo
#Convert fingerprints to numpy array type-------------------------------------
print('\nInt coversion started...')
x_array = np.asarray([i for i in x])
print('Int conversion complete!', 'Time passed: ', time.time() - start)
#Split data to the train and test data-----------------------------------------
num_test = round(0.2 * len(x))
x_train, x_test, y_train, y_test = train_test_split(x_array, y, test_size=num_test, shuffle=True, random_state=0)
#Autoscaling-------------------------------------------------------------------
autoscaled_y_train = (y_train - np.mean(y_train)) / np.std(y_train)
autoscaled_y_test = (y_test - np.mean(y_test)) / np.std(y_test)
autoscaled_y_train = np.asarray(autoscaled_y_train)
autoscaled_y_test = np.asarray(autoscaled_y_test)
#Learning----------------------------------------------------------------------
print('\n\n################ MODEL SETUP ##################\n\n')
model_simple = keras.models.Sequential()
model_simple.add(keras.layers.Dense(100, kernel_regularizer=keras.regularizers.l2(0.01), activation='tanh', input_shape=[len(fps_with_props.iloc[0, 20])]))
model_simple.add(keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(0.01), activation='tanh'))
# model_simple.add(keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(0.01), activation='linear'))
model_simple.add(keras.layers.Dense(1, kernel_regularizer=keras.regularizers.l2(0.01), activation='linear'))
model_simple.summary()
keras.utils.plot_model(model_simple)
model_simple.compile(loss=keras.losses.mean_squared_error,
optimizer=keras.optimizers.Adam(lr=5e-4),
metrics=['mae'])
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
history0 = model_simple.fit(x_train, autoscaled_y_train, epochs=150, validation_data=[x_test, autoscaled_y_test], callbacks=[early_stop])
#Plot the history--------------------------------------------------------------
def plot_history(history):
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
mae = history_dict['mae']
val_mae = history_dict['val_mae']
epochs = range(1, len(history_dict['mae']) + 1)
plt.rcParams['font.size'] = 15
plt.rcParams['font.family'] = 'times new roman'
plt.plot(epochs, loss_values, 'o', c='b', label='training loss')
plt.plot(epochs, val_loss_values, c='m', label='test loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.plot(epochs, mae, 'o', c='b', label='training mae')
plt.plot(epochs, val_mae, c='m', label='test mae')
plt.xlabel('Epochs')
plt.ylabel('Mae')
plt.legend()
plt.show()
plot_history(history0)
estimated_y_train = model_simple.predict(x_train).flatten()
estimated_y_test = model_simple.predict(x_test).flatten()
#Plot the estimation result----------------------------------------------------
def estimated_plot(actual_y, estimated_y):
plt.figure(figsize=figure.figaspect(1))
y_max = max(max(actual_y), max(estimated_y))
y_min = min(min(actual_y), min(estimated_y))
plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
[y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-', markersize=5)
plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlabel('actual y')
plt.ylabel('estimated y')
plt.scatter(actual_y, estimated_y, c='b', s=5)
plt.show()
estimated_plot(autoscaled_y_train, estimated_y_train)
estimated_plot(autoscaled_y_test, estimated_y_test)
R2_train = (np.cov([estimated_y_train, autoscaled_y_train])[0, 1] / (np.std(estimated_y_train,ddof=1) * np.std(autoscaled_y_train, ddof=1))) ** 2
R2_train = round(R2_train, 4)
print('R2_train: ' + str(R2_train))
R2_test = (np.cov([estimated_y_test, autoscaled_y_test])[0, 1] / (np.std(estimated_y_test,ddof=1) * np.std(autoscaled_y_test, ddof=1))) ** 2
R2_test = round(R2_test, 4)
print('R2_test: ' + str(R2_test))
print('Analysis completed!', 'Time passed: ', time.time() - start)
#Estimation of the HOMO energy of pyrrole--------------------------------------
with open('pyrrole.mol', mode='r') as f:
mol = f.read()
mol = Chem.MolFromMolBlock(mol)
mol_fp = mol_to_fps(mol)
mol_fp= np.array([mol_fp])
estimated_param = model_simple.predict(mol_fp)
estimated_param = estimated_param * np.std(y_train) + np.mean(y_train)
estimated_param = estimated_param[0]
print(estimated_param[0], 'hartree')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment