Skip to content

Instantly share code, notes, and snippets.

@chao1224
Created June 13, 2020 22:56
Show Gist options
  • Save chao1224/ca505bdcf5064ff9fe1abc9c45acbf80 to your computer and use it in GitHub Desktop.
'''This is demo scripts for running n_gram_graph on delaney.'''
from __future__ import print_function
import argparse
import os
import numpy as np
import json
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.backends.cudnn as cudnn
import n_gram_graph as ngg
from n_gram_graph.embedding.node_embedding import CBoW, train, test
from n_gram_graph.model.xgboost_regression import XGBoostRegression
from n_gram_graph.model.random_forest_regression import RandomForestRegression
from n_gram_graph.util import *
from n_gram_graph.dataset_specification import dataset2task_list
def node_embedding_get_data(data_path, padding_size):
    """Load one graph-fold npz file and build CBoW (context, target) pairs.

    For every atom that has at least one bond, the context is the zero-padded
    stack of its neighbours' attribute rows and the target is the per-segment
    argmax of the atom's own attribute vector.

    NOTE(review): relies on module-level ``max_atom_num``, ``feature_num``
    and ``segmentation_list`` being defined before this is called.
    """
    archive = np.load(data_path)
    print(archive.keys())
    print(data_path)
    adjacent_matrix_list = archive['adjacent_matrix_list']
    node_attribute_matrix_list = archive['node_attribute_matrix_list']
    molecule_num = adjacent_matrix_list.shape[0]
    print('molecule num\t', molecule_num)
    X_data = []
    Y_label_list = []
    print('adjacent_matrix_list shape: {}\tnode_attribute_matrix_list shape: {}'.format(adjacent_matrix_list.shape, node_attribute_matrix_list.shape))
    for adj, node_attr in zip(adjacent_matrix_list, node_attribute_matrix_list):
        assert len(adj) == max_atom_num
        assert len(node_attr) == max_atom_num
        for atom_idx in range(max_atom_num):
            # Atoms are packed at the front; the first bond-less row marks the end.
            if sum(adj[atom_idx]) == 0:
                break
            context = np.zeros((padding_size, feature_num))
            neighbour_cnt = 0
            for other_idx in range(max_atom_num):
                if adj[atom_idx][other_idx] == 1:
                    context[neighbour_cnt] = node_attr[other_idx]
                    neighbour_cnt += 1
            context = np.array(context)
            atom_feat = node_attr[atom_idx]
            # One classification target per one-hot feature segment.
            targets = [atom_feat[seg].argmax() for seg in segmentation_list]
            X_data.append(context)
            Y_label_list.append(targets)
    return np.array(X_data), np.array(Y_label_list)
class NodeEmbeddingGraphDataset(Dataset):
    """Torch dataset concatenating CBoW pairs from several graph-fold files."""

    def __init__(self, mode, K_list, padding_size, segmentation_list):
        xs, ys = [], []
        for fold in K_list:
            data_path = './datasets/{}/{}_graph.npz'.format(mode, fold)
            fold_x, fold_y = node_embedding_get_data(data_path=data_path, padding_size=padding_size)
            xs.extend(fold_x)
            ys.extend(fold_y)
        self.X_data = np.array(xs)
        self.Y_label_list = np.array(ys)
        print('data size: ', self.X_data.shape, '\tlabel size: ', self.Y_label_list.shape)
        self.segmentation_list = segmentation_list

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, idx):
        sample = torch.from_numpy(self.X_data[idx])
        labels = torch.from_numpy(self.Y_label_list[idx])
        return sample, labels
def node_embedding_train():
    """Fit the CBoW node-embedding model, checkpointing the best epoch.

    NOTE(review): reads module-level ``model``, ``optimizer``, ``epochs``,
    ``train_dataloader``, ``segmentation_num``, ``weight_file`` and
    ``embedding_dimension``.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    optimal_loss = 1e7
    for epoch in range(epochs):
        epoch_losses = []
        for batch_id, (x_data, y_actual) in enumerate(train_dataloader):
            x_data = Variable(x_data).float()
            y_actual = Variable(y_actual).long()
            if torch.cuda.is_available():
                x_data = x_data.cuda()
                y_actual = y_actual.cuda()
            optimizer.zero_grad()
            y_predict = model(x_data)
            # Total loss is the sum of cross-entropy over every segment head.
            loss = 0
            for seg_idx in range(segmentation_num):
                loss = loss + criterion(y_predict[seg_idx], y_actual[..., seg_idx])
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())
        train_loss = np.mean(epoch_losses)
        print('epoch: {}\tloss is: {}'.format(epoch, train_loss))
        # Persist weights only when the epoch improves on the best loss so far.
        if train_loss < optimal_loss:
            optimal_loss = train_loss
            print('Saving model at epoch {}\toptimal loss is {}.'.format(epoch, optimal_loss))
            torch.save(model.state_dict(), weight_file)
    print('For random dimension as {}.'.format(embedding_dimension))
    return
def node_embedding_test(dataloader):
    """Print the multi-head classification accuracy of the CBoW model.

    NOTE(review): reads module-level ``model``, ``segmentation_num`` and
    ``embedding_dimension``.
    """
    model.eval()
    correct, total = 0, 0
    for batch_id, (x_data, y_actual) in enumerate(dataloader):
        x_data = Variable(x_data).float()
        y_actual = Variable(y_actual).long()
        if torch.cuda.is_available():
            x_data = x_data.cuda()
            y_actual = y_actual.cuda()
        y_predict = model(x_data)
        # Accuracy is pooled over all segment heads and all samples.
        for seg_idx in range(segmentation_num):
            y_true = y_actual[..., seg_idx].cpu().data.numpy()
            y_pred = y_predict[seg_idx].cpu().data.numpy().argmax(1)
            correct += np.sum(y_true == y_pred)
            total += y_pred.shape[0]
    accuracy = 1. * correct / total
    print('Accuracy: {}'.format(accuracy))
    print('For random dimension as {}.'.format(embedding_dimension))
    return
def graph_embedding_get_data(data_path):
    """Read one fold's graph npz archive and unpack the raw matrix stacks.

    Returns the adjacency, distance, bond-attribute and node-attribute
    matrix lists, plus a kwargs dict carrying the label array under
    ``'label_name'``.
    """
    archive = np.load(data_path)
    print(archive.keys())
    adjacent_matrix_list = archive['adjacent_matrix_list']
    distance_matrix_list = archive['distance_matrix_list']
    bond_attribute_matrix_list = archive['bond_attribute_matrix_list']
    node_attribute_matrix_list = archive['node_attribute_matrix_list']
    kwargs = {'label_name': archive['label_name']}
    return (adjacent_matrix_list, distance_matrix_list,
            bond_attribute_matrix_list, node_attribute_matrix_list, kwargs)
class GraphEmbeddingGraphDataset(Dataset):
    """Torch dataset wrapping per-molecule node-attribute, adjacency and
    distance matrices; items come back as torch tensors."""

    def __init__(self, node_attribute_matrix_list, adjacent_matrix_list, distance_matrix_list):
        self.node_attribute_matrix_list = node_attribute_matrix_list
        self.adjacent_matrix_list = adjacent_matrix_list
        self.distance_matrix_list = distance_matrix_list

    def __len__(self):
        return len(self.node_attribute_matrix_list)

    def __getitem__(self, idx):
        as_tensor = torch.from_numpy
        return (as_tensor(self.node_attribute_matrix_list[idx]),
                as_tensor(self.adjacent_matrix_list[idx]),
                as_tensor(self.distance_matrix_list[idx]))
def graph_embedding_get_walk_representation(dataloader):
    """Project node attributes through the trained embedding and build the
    1- to 6-gram walk summaries for every molecule in ``dataloader``.

    Returns the embedded node matrices and the stacked per-molecule
    (6, embedding_dim) walk-summary matrices as numpy arrays.

    NOTE(review): reads the module-level ``model`` (a trained CBoW net).
    """
    X_embed = []
    embedded_graph_matrix_list = []
    for batch_id, (node_attribute_matrix, adjacent_matrix, distance_matrix) in enumerate(dataloader):
        node_attribute_matrix = Variable(node_attribute_matrix).float()
        adjacent_matrix = Variable(adjacent_matrix).float()
        distance_matrix = Variable(distance_matrix).float()
        if torch.cuda.is_available():
            node_attribute_matrix = node_attribute_matrix.cuda()
            adjacent_matrix = adjacent_matrix.cuda()
            distance_matrix = distance_matrix.cuda()
        tilde_node_attribute_matrix = model.embeddings(node_attribute_matrix)
        # v_k sums the k-step walk features; each step mixes neighbours via
        # the adjacency matrix and re-gates with the embedded node features.
        walk = tilde_node_attribute_matrix
        gram_vectors = [torch.sum(walk, dim=1)]
        for _ in range(5):
            walk = torch.bmm(adjacent_matrix, walk) * tilde_node_attribute_matrix
            gram_vectors.append(torch.sum(walk, dim=1))
        embedded_graph_matrix = torch.stack(gram_vectors, dim=1)
        if torch.cuda.is_available():
            tilde_node_attribute_matrix = tilde_node_attribute_matrix.cpu()
            embedded_graph_matrix = embedded_graph_matrix.cpu()
        X_embed.extend(tilde_node_attribute_matrix.data.numpy())
        embedded_graph_matrix_list.extend(embedded_graph_matrix.data.numpy())
    embedded_node_matrix_list = np.array(X_embed)
    embedded_graph_matrix_list = np.array(embedded_graph_matrix_list)
    print('embedded_node_matrix_list: ', embedded_node_matrix_list.shape)
    print('embedded_graph_matrix_list shape: {}'.format(embedded_graph_matrix_list.shape))
    return embedded_node_matrix_list, embedded_graph_matrix_list
def run_n_gram_xgb():
    """Train and evaluate an XGBoost regressor on the n-gram graph features.

    Reads module-level globals ``config_json_file``, ``label_name``,
    ``running_index``, ``file_list``, ``n_gram_num`` and ``weight_file``;
    writes predictions to ``output_on_test.npz``.
    """
    with open(config_json_file, 'r') as f:
        conf = json.load(f)
    label_name_list = [label_name]
    print('label_name_list ', label_name_list)
    test_index = [running_index]
    # BUGFIX: `filter(...)` returns a lazy iterator under Python 3, which is
    # not a valid numpy fancy index; materialize it as a list.
    train_index = [x for x in np.arange(5) if x not in test_index]
    train_file_list = file_list[train_index]
    test_file_list = file_list[test_index]
    print('train files ', train_file_list)
    print('test files ', test_file_list)
    X_train, y_train = extract_feature_and_label_npy(train_file_list,
                                                     feature_name='embedded_graph_matrix_list',
                                                     label_name_list=label_name_list,
                                                     n_gram_num=n_gram_num)
    X_test, y_test = extract_feature_and_label_npy(test_file_list,
                                                   feature_name='embedded_graph_matrix_list',
                                                   label_name_list=label_name_list,
                                                   n_gram_num=n_gram_num)
    print('done data preparation')
    task = XGBoostRegression(conf=conf)
    task.train_and_predict(X_train, y_train, X_test, y_test, weight_file)
    task.eval_with_existing(X_train, y_train, X_test, y_test, weight_file)
    y_pred_on_test = task.predict_with_existing(X_test, weight_file)
    np.savez('output_on_test', y_test=y_test, y_pred=y_pred_on_test)
    return
def run_n_gram_rf():
    """Train and evaluate a random-forest regressor on the n-gram graph features.

    Reads module-level globals ``config_json_file``, ``label_name``,
    ``running_index``, ``file_list``, ``n_gram_num`` and ``weight_file``;
    writes predictions to ``output_on_test.npz``.
    """
    with open(config_json_file, 'r') as f:
        conf = json.load(f)
    label_name_list = [label_name]
    test_index = [running_index]
    # BUGFIX: `filter(...)` returns a lazy iterator under Python 3, which is
    # not a valid numpy fancy index; materialize it as a list.
    train_index = [x for x in np.arange(5) if x not in test_index]
    train_file_list = file_list[train_index]
    test_file_list = file_list[test_index]
    X_train, y_train = extract_feature_and_label_npy(train_file_list,
                                                     feature_name='embedded_graph_matrix_list',
                                                     label_name_list=label_name_list,
                                                     n_gram_num=n_gram_num)
    X_test, y_test = extract_feature_and_label_npy(test_file_list,
                                                   feature_name='embedded_graph_matrix_list',
                                                   label_name_list=label_name_list,
                                                   n_gram_num=n_gram_num)
    print('done data preparation')
    task = RandomForestRegression(conf=conf)
    task.train_and_predict(X_train, y_train, X_test, y_test, weight_file)
    task.eval_with_existing(X_train, y_train, X_test, y_test, weight_file)
    y_pred_on_test = task.predict_with_existing(X_test, weight_file)
    np.savez('output_on_test', y_test=y_test, y_pred=y_pred_on_test)
    return
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='delaney')
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--seed', type=int, default=123)
    args = parser.parse_args()

    epochs = args.epochs
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        cudnn.benchmark = True

    # NOTE(review): --task is parsed but the demo hard-codes 'delaney',
    # matching the original script's behaviour.
    task = 'delaney'
    feature_num = 42
    # Column ranges of the one-hot segments inside the 42-dim atom feature vector.
    segmentation_list = [range(0, 10), range(10, 17), range(17, 24), range(24, 30), range(30, 36), range(36, 38), range(38, 40), range(40, 42)]
    segmentation_list = np.array(segmentation_list)
    segmentation_num = len(segmentation_list)
    max_atom_num = 55
    padding_size = 6
    embedding_dimension = 100

    ############### Learning The Representation In An Unsupervised Way ###############
    for running_index in range(5):
        test_list = [running_index]
        # BUGFIX: materialize the fold split as a list; a lazy `filter` object
        # prints as its repr and can only be consumed once under Python 3.
        train_list = [x for x in np.arange(5) if x not in test_list]
        print('training list: {}\ttest list: {}'.format(train_list, test_list))

        dir_ = 'model_weight/{}/{}'.format(task, running_index)
        if not os.path.isdir(dir_):
            os.makedirs(dir_)
        weight_file = '{}/{}_CBoW_non_segment.pt'.format(dir_, embedding_dimension)

        model = CBoW(feature_num=feature_num, embedding_dim=embedding_dimension,
                     task_num=segmentation_num, task_size_list=segmentation_list)
        if torch.cuda.is_available():
            model.cuda()
        optimizer = optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-4)

        train_dataset = NodeEmbeddingGraphDataset(task, K_list=train_list, segmentation_list=segmentation_list, padding_size=padding_size)
        test_dataset = NodeEmbeddingGraphDataset(task, K_list=test_list, segmentation_list=segmentation_list, padding_size=padding_size)
        train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
        test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

        node_embedding_train()
        node_embedding_test(train_dataloader)
        node_embedding_test(test_dataloader)
        print()
        print('Done with node embedding on {}.'.format(running_index))

        # Freeze the trained embedding and export walk representations per fold.
        model.eval()
        for i in range(5):
            dir_ = './datasets/{}'.format(task)
            if not os.path.isdir(dir_):
                os.makedirs(dir_)
            data_path = '{}/{}_graph.npz'.format(dir_, i)
            adjacent_matrix_list, distance_matrix_list, bond_attribute_matrix_list, node_attribute_matrix_list, kwargs = graph_embedding_get_data(data_path)
            dataset = GraphEmbeddingGraphDataset(node_attribute_matrix_list=node_attribute_matrix_list, adjacent_matrix_list=adjacent_matrix_list, distance_matrix_list=distance_matrix_list)
            dataloader = DataLoader(dataset, batch_size=128, shuffle=False)
            embedded_node_matrix_list, embedded_graph_matrix_list = graph_embedding_get_walk_representation(dataloader)

            dir_ = './datasets/{}/{}'.format(task, running_index)
            if not os.path.isdir(dir_):
                os.makedirs(dir_)
            out_file_path = '{}/{}_grammed_cbow_{}_graph'.format(dir_, i, embedding_dimension)
            kwargs['adjacent_matrix_list'] = adjacent_matrix_list
            kwargs['distance_matrix_list'] = distance_matrix_list
            kwargs['embedded_node_matrix_list'] = embedded_node_matrix_list
            kwargs['embedded_graph_matrix_list'] = embedded_graph_matrix_list
            np.savez_compressed(out_file_path, **kwargs)
            print(kwargs.keys())
        print()
        print()
        print()

    ############### Running Model ###############
    label_name = 'label_name'
    n_gram_num = 6
    weight_file = 'temp.pt'
    for running_index in range(5):
        directory = './datasets/{}/{}/{{}}_grammed_cbow_{}_graph.npz'.format(task, running_index, embedding_dimension)
        file_list = []
        for i in range(5):
            file_list.append(directory.format(i))
        file_list = np.array(file_list)
        print('file_list\t', file_list)

        model = 'n_gram_xgb'
        config_json_file = 'hyper/{}/{}.json'.format(model, task)
        dir_ = './output/{}/{}'.format(model, running_index)
        if not os.path.isdir(dir_):
            os.makedirs(dir_)
        run_n_gram_xgb()
        os.rename('output_on_test.npz', '{}/{}.npz'.format(dir_, task))

        model = 'n_gram_rf'
        config_json_file = 'hyper/{}/{}.json'.format(model, task)
        dir_ = './output/{}/{}'.format(model, running_index)
        if not os.path.isdir(dir_):
            os.makedirs(dir_)
        run_n_gram_rf()
        os.rename('output_on_test.npz', '{}/{}.npz'.format(dir_, task))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment