Last active
December 25, 2020 09:19
-
-
Save CheML-gist/14989bb644dc328f6001380d638cd0d0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from rdkit.Chem import AllChem, DataStructs | |
from rdkit import Chem | |
from tensorflow import keras | |
from sklearn.model_selection import train_test_split | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import matplotlib.figure as figure | |
import time | |
import pubchempy as pcp | |
print('Program started!') | |
#SDF Loading------------------------------------------------------------------- | |
print('\nLoading SDF...') | |
start = time.time() | |
sdf = Chem.SDMolSupplier('gdb9.sdf') # Calculation level: B3LYP/6-31G(2df,p) | |
sdf_id = ['gdb_' + str(i + 1) for i in range(len(sdf))] | |
sdf = pd.DataFrame(sdf) | |
sdf.index = sdf_id | |
sdf.columns = ['ROMol'] | |
dataset = pd.read_csv('gdb9.sdf.csv', index_col=0) | |
mols_with_props = pd.concat([sdf, dataset], axis=1) | |
mols_with_props = mols_with_props.dropna() | |
print('SDF loaded!', 'Time passed: ', time.time() - start) | |
#Filtering--------------------------------------------------------------------- | |
print('\nFiltering samples...') | |
molecule_1 = pcp.get_compounds('pyrrole', 'name') | |
molecule_1 = molecule_1[0].canonical_smiles | |
filtering_mol_1 = Chem.MolFromSmiles(molecule_1) | |
AllChem.Compute2DCoords(filtering_mol_1) | |
molecule_2 = pcp.get_compounds('pyridine', 'name') | |
molecule_2 = molecule_2[0].canonical_smiles | |
filtering_mol_2 = Chem.MolFromSmiles(molecule_2) | |
AllChem.Compute2DCoords(filtering_mol_2) | |
molecule_3 = pcp.get_compounds('1-butene', 'name') | |
molecule_3 = molecule_3[0].canonical_smiles | |
filtering_mol_3 = Chem.MolFromSmiles(molecule_3) | |
AllChem.Compute2DCoords(filtering_mol_3) | |
fps_with_props = [] | |
for i in range(len(mols_with_props)): | |
m = mols_with_props.ROMol[i] | |
if m.HasSubstructMatch(filtering_mol_1): | |
fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]]) | |
elif m.HasSubstructMatch(filtering_mol_2): | |
fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]]) | |
elif m.HasSubstructMatch(filtering_mol_3): | |
fps_with_props.append(mols_with_props.loc[mols_with_props.index[i]]) | |
fps_with_props = pd.DataFrame(fps_with_props) | |
#Setting the cutoff value | |
cutoff = 3 * np.std(fps_with_props.homo) | |
mean = np.mean(fps_with_props.homo) | |
remove_molecule = [] | |
for i in range(len(fps_with_props.homo)): | |
if fps_with_props.homo[i] > mean + cutoff or fps_with_props.homo[i] < mean - cutoff: | |
remove_molecule.append(fps_with_props.index[i]) | |
fps_with_props = fps_with_props.drop(remove_molecule, axis=0) | |
print('Filtering completed!', 'Time passed: ', time.time() - start) | |
print('{}samples matched!'.format(len(fps_with_props))) | |
#Fingerprint------------------------------------------------------------------- | |
def mol_to_fps(mol): | |
Hashed = np.zeros(4096, dtype=int) | |
fp = AllChem.GetMorganFingerprintAsBitVect(mol, 6, 4096) # Morgan Fingerprint | |
# fp = AllChem.GetMACCSKeysFingerprint(mol) #MACCS Fingerprint | |
# fp = Chem.rdmolops.RDKFingerprint(mol) #Topological Fingerprint | |
DataStructs.ConvertToNumpyArray(fp, Hashed) | |
return Hashed | |
print('\nLoading fingerprints...') | |
fps_with_props['fp'] = fps_with_props.ROMol.map(mol_to_fps) | |
print('FingerPrints loaded!', 'Time passed: ', time.time() - start) | |
#Definition of explanatory and objective variables---------------------------- | |
x = fps_with_props.fp | |
y = fps_with_props.homo | |
#Convert fingerprints to numpy array type------------------------------------- | |
print('\nInt coversion started...') | |
x_array = np.asarray([i for i in x]) | |
print('Int conversion complete!', 'Time passed: ', time.time() - start) | |
#Split data to the train and test data----------------------------------------- | |
num_test = round(0.2 * len(x)) | |
x_train, x_test, y_train, y_test = train_test_split(x_array, y, test_size=num_test, shuffle=True, random_state=0) | |
#Autoscaling------------------------------------------------------------------- | |
autoscaled_y_train = (y_train - np.mean(y_train)) / np.std(y_train) | |
autoscaled_y_test = (y_test - np.mean(y_test)) / np.std(y_test) | |
autoscaled_y_train = np.asarray(autoscaled_y_train) | |
autoscaled_y_test = np.asarray(autoscaled_y_test) | |
#Learning---------------------------------------------------------------------- | |
print('\n\n################ MODEL SETUP ##################\n\n') | |
model_simple = keras.models.Sequential() | |
model_simple.add(keras.layers.Dense(100, kernel_regularizer=keras.regularizers.l2(0.01), activation='tanh', input_shape=[len(fps_with_props.iloc[0, 20])])) | |
model_simple.add(keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(0.01), activation='tanh')) | |
# model_simple.add(keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(0.01), activation='linear')) | |
model_simple.add(keras.layers.Dense(1, kernel_regularizer=keras.regularizers.l2(0.01), activation='linear')) | |
model_simple.summary() | |
keras.utils.plot_model(model_simple) | |
model_simple.compile(loss=keras.losses.mean_squared_error, | |
optimizer=keras.optimizers.Adam(lr=5e-4), | |
metrics=['mae']) | |
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30) | |
history0 = model_simple.fit(x_train, autoscaled_y_train, epochs=150, validation_data=[x_test, autoscaled_y_test], callbacks=[early_stop]) | |
#Plot the history-------------------------------------------------------------- | |
def plot_history(history): | |
history_dict = history.history | |
loss_values = history_dict['loss'] | |
val_loss_values = history_dict['val_loss'] | |
mae = history_dict['mae'] | |
val_mae = history_dict['val_mae'] | |
epochs = range(1, len(history_dict['mae']) + 1) | |
plt.rcParams['font.size'] = 15 | |
plt.rcParams['font.family'] = 'times new roman' | |
plt.plot(epochs, loss_values, 'o', c='b', label='training loss') | |
plt.plot(epochs, val_loss_values, c='m', label='test loss') | |
plt.xlabel('Epochs') | |
plt.ylabel('Loss') | |
plt.legend() | |
plt.show() | |
plt.plot(epochs, mae, 'o', c='b', label='training mae') | |
plt.plot(epochs, val_mae, c='m', label='test mae') | |
plt.xlabel('Epochs') | |
plt.ylabel('Mae') | |
plt.legend() | |
plt.show() | |
plot_history(history0) | |
estimated_y_train = model_simple.predict(x_train).flatten() | |
estimated_y_test = model_simple.predict(x_test).flatten() | |
#Plot the estimation result---------------------------------------------------- | |
def estimated_plot(actual_y, estimated_y): | |
plt.figure(figsize=figure.figaspect(1)) | |
y_max = max(max(actual_y), max(estimated_y)) | |
y_min = min(min(actual_y), min(estimated_y)) | |
plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], | |
[y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-', markersize=5) | |
plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)) | |
plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)) | |
plt.xlabel('actual y') | |
plt.ylabel('estimated y') | |
plt.scatter(actual_y, estimated_y, c='b', s=5) | |
plt.show() | |
estimated_plot(autoscaled_y_train, estimated_y_train) | |
estimated_plot(autoscaled_y_test, estimated_y_test) | |
R2_train = (np.cov([estimated_y_train, autoscaled_y_train])[0, 1] / (np.std(estimated_y_train,ddof=1) * np.std(autoscaled_y_train, ddof=1))) ** 2 | |
R2_train = round(R2_train, 4) | |
print('R2_train: ' + str(R2_train)) | |
R2_test = (np.cov([estimated_y_test, autoscaled_y_test])[0, 1] / (np.std(estimated_y_test,ddof=1) * np.std(autoscaled_y_test, ddof=1))) ** 2 | |
R2_test = round(R2_test, 4) | |
print('R2_test: ' + str(R2_test)) | |
print('Analysis completed!', 'Time passed: ', time.time() - start) | |
#Estimation of the HOMO energy of pyrrole-------------------------------------- | |
with open('pyrrole.mol', mode='r') as f: | |
mol = f.read() | |
mol = Chem.MolFromMolBlock(mol) | |
mol_fp = mol_to_fps(mol) | |
mol_fp= np.array([mol_fp]) | |
estimated_param = model_simple.predict(mol_fp) | |
estimated_param = estimated_param * np.std(y_train) + np.mean(y_train) | |
estimated_param = estimated_param[0] | |
print(estimated_param[0], 'hartree') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment