-
-
Save AlexMRuch/4dee519e47eb489df1cdceab0c68da1d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
The purpose of this script is to construct and analyze an RGCN model.
The RGCN model performs semi-supervised node classification. Nodes will
be labeled as liberal or conservative.
The RGCN should be updated to classify neutral and non-political nodes.
Change negative log likelihood loss to cross-entropy loss.
Example code taken from https://doc.dgl.ai/tutorials/basics/5_hetero.html
""" | |
# Import dependencies | |
import dgl | |
from dgl import DGLGraph | |
from dgl.nn.pytorch import RelGraphConv | |
import dgl.function as fn | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
import numpy as np | |
import pandas as pd | |
from functools import partial | |
from tqdm import tqdm | |
import time | |
from datetime import datetime | |
from pympler import asizeof | |
import copy | |
import pickle | |
# Load edgelists and node/edge features
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


print("Loading data...")

# Author -> product edge list; columns 0-1 are used later as (src, dst) ids
# and column 2 as the review rating.
el_author_wave1 = np.load("/media/seagate0/amazon/data/dgl_el_author_wave1.npy")
print(" el_author_wave1 shape: ", el_author_wave1.shape)

# Product -> product edge list, consumed later as (src, dst) pairs.
el_related_wave1 = np.load("/media/seagate0/amazon/data/dgl_el_related_wave1.npy")
print(" el_related_wave1 shape:", el_related_wave1.shape)

# Table of labeled books; the "asin" and "num_lean" columns are read later.
features_node_political = pd.read_csv("/media/seagate0/amazon/data/pol_books_updated.csv")
print(" features_node_political shape: ", features_node_political.shape)

# Lookup table that is inverted later to translate ASINs into encoded ids.
eid_asin = _read_pickle('/media/seagate0/amazon/data/dgl_eid_asin.pickle')
print(" eid_asin len:", len(eid_asin))

# Companion lookup table for reviewers (not used in the visible code).
eid_reviewers = _read_pickle('/media/seagate0/amazon/data/dgl_eid_reviewers.pickle')
print(" eid_reviewers len:", len(eid_reviewers))
# Construct heterograph
def _int_pairs(edge_rows):
    """Convert rows of (src, dst) values into a list of integer tuples."""
    return [(int(src), int(dst)) for src, dst in edge_rows]


print("Constructing heterograph...")
# Two relations: authors review products, and products are related to products.
# Only the first two columns of the author edge list are node ids.
g = dgl.heterograph({
    ("author", "reviews", "product"): _int_pairs(el_author_wave1[:, :2]),
    ("product", "related", "product"): _int_pairs(el_related_wave1),
})
print(" Constructed heterograph")
# Describe graph
print("Graph description:")
print(" Node types: ", g.ntypes)
# Query each node count once and reuse it for the total and the per-type lines.
n_author_nodes = g.number_of_nodes('author')
n_product_nodes = g.number_of_nodes('product')
num_nodes = n_author_nodes + n_product_nodes
print(" Nodes, total: ", num_nodes)
print(" Nodes, author: ", n_author_nodes)
print(" Nodes, product: ", n_product_nodes)
num_rels = len(g.canonical_etypes)
print(" Num. relations: ", num_rels)
print(" Canonical edges:", g.canonical_etypes)
print(" Edges, reviews: ", g.number_of_edges('reviews'))
# Label previously misspelled as "Edegs".
print(" Edges, related: ", g.number_of_edges('related'))
# Distinct values of the "num_lean" label column; computed once and reused.
classes = np.unique(features_node_political["num_lean"])
num_classes = len(classes)
print(" Num. classes: ", num_classes)
print(" Classes: ", classes)
# Extract node features
print("Assigning node features...")
# Invert eid_asin so that its values become keys: value -> original key.
asin_eid = {asin: eid for eid, asin in eid_asin.items()}
def encode_features_node_political(asin):
    """Translate an ASIN into its encoded id via the module-level ``asin_eid``.

    Args:
        asin: Key to look up in ``asin_eid``.

    Returns:
        The value stored in ``asin_eid`` for ``asin``, or ``np.nan`` when the
        key is absent (or cannot be hashed), so callers can drop the row.
    """
    try:
        return asin_eid[asin]
    except (KeyError, TypeError):
        # Only lookup failures mean "unknown ASIN". A bare ``except`` would
        # also hide unrelated errors and swallow KeyboardInterrupt.
        return np.nan
features_node_political["asin_encoded"] = features_node_political["asin"].apply(encode_features_node_political)
# Unique (encoded id, label) rows; rows whose ASIN could not be encoded are NaN
# in "asin_encoded" and are removed by dropna().
labeled_pairs = np.array(
    features_node_political[["asin_encoded", "num_lean"]].dropna().drop_duplicates()
)
p_selected = torch.tensor(labeled_pairs[:, 0]).long()  # labeled product node ids
labels_short = labeled_pairs[:, 1]  # labeled product node labels
labels_short = np.where(labels_short == 0, -1, 1)  # recode liberal books from 0 to -1
labels_short = torch.tensor(labels_short).long()  # convert to tensor
# One entry per product node; nodes without a label keep the value 0.
labels = torch.zeros(g.number_of_nodes('product'))
for node_id, node_label in zip(p_selected, labels_short):
    labels[node_id] = node_label
g.nodes['product'].data['political'] = labels
print(" Assigned node features")
# Assign edge features
print("Assigning edge features...")
# Third column of the author edge list holds the rating of each review edge.
review_ratings = torch.from_numpy(el_author_wave1[:, 2])
# Shift ratings by 2.5 (presumably to centre the rating scale -- confirm).
g.edges["reviews"].data['rating'] = review_ratings - 2.5
print(" Assigned edge features")
# Determine approximate graph memory size
# asizeof reports bytes, so divide by 1e6 to get the megabytes the message
# claims (the previous /1000 produced kilobytes and tripped the "10 GB"
# warning at 10 MB). Measure once: sizing a large object is slow.
graph_size_mb = asizeof.asized(g).size / 1e6
print("Graph object size in RAM (aprox.):", graph_size_mb, "MB")
if graph_size_mb >= 10000:  # 10,000 MB == 10 GB
    print("Warning: graph size >= 10 GB")
    print(" Quit now if you wish to stop processing")
    # NOTE(review): the original indentation was lost; the countdown is assumed
    # to belong to the warning branch, giving the user time to interrupt.
    for seconds_left in range(5, 0, -1):
        print(" ", seconds_left)
        time.sleep(1)
    print("Blastoff!!!")
# Split graph into training, validation, testing sets
print("Splitting graph into training, validation, testing sets...")
# Randomly order the labeled product ids, then cut at fixed positions.
shuffle = np.random.permutation(p_selected)
train_stop, val_stop = 950, 1070
train_idx = torch.tensor(shuffle[:train_stop]).long()  # first 950 ids
val_idx = torch.tensor(shuffle[train_stop:val_stop]).long()  # next 120 ids
test_idx = torch.tensor(shuffle[val_stop:]).long()  # whatever remains
for split_name, split_ids in (
    (" Training size: ", train_idx),
    (" Validation size:", val_idx),
    (" Test size: ", test_idx),
):
    print(split_name, len(split_ids))
# Initialize RGCN layer
print("Defining RGCN layer...")


class HeteroRGCNLayer(nn.Module):
    """One relational graph-convolution layer for a DGL heterograph.

    Holds one linear map W_r per edge-type name. A forward pass projects the
    source features of every relation, averages the projected messages per
    destination node within a relation, and sums the results across relations.

    Args:
        in_size: Input feature width.
        out_size: Output feature width.
        etypes: Iterable of edge-type names; each gets its own ``nn.Linear``.
    """

    def __init__(self, in_size, out_size, etypes):
        super(HeteroRGCNLayer, self).__init__()
        # W_r for each relation
        self.weight = nn.ModuleDict({
            name: nn.Linear(in_size, out_size) for name in etypes
        })

    def forward(self, g, feat_dict):
        """Run message passing and return the updated features.

        Side effects: stores ``'Wh_<etype>'`` on source nodes and ``'h'`` on
        destination nodes of ``g``.

        Args:
            g: Heterograph to propagate over.
            feat_dict: Mapping from node-type name to that type's features.

        Returns:
            Dict mapping node type -> updated features, containing only the
            node types that are the destination of at least one processed
            relation. Types with no inbound relation (e.g. a pure source type)
            receive no ``'h'`` and are omitted instead of raising ``KeyError``.
        """
        funcs = {}
        updated_ntypes = set()
        for srctype, etype, dsttype in g.canonical_etypes:
            # A previous layer may not have produced features for this source
            # type (it had no inbound relation); skip relations we cannot feed.
            if srctype not in feat_dict:
                continue
            # Compute W_r * h
            Wh = self.weight[etype](feat_dict[srctype])
            # Save it in graph for message passing
            g.nodes[srctype].data['Wh_%s' % etype] = Wh
            # Per-relation (message_func, reduce_func). Every relation writes
            # to the same destination field 'h', which is what the type-wise
            # reducer below combines.
            funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))
            updated_ntypes.add(dsttype)
        if not funcs:
            # Nothing to propagate.
            return {}
        # Trigger message passing of multiple types; the second argument is
        # the type-wise reducer ("sum", "max", "min", "mean" or "stack").
        g.multi_update_all(funcs, 'sum')
        # Only report node types that actually received messages this call.
        return {
            ntype: g.nodes[ntype].data['h']
            for ntype in g.ntypes
            if ntype in updated_ntypes
        }


print(" Defined RGCN layer")
# Initialize RGCN model
print("Defining RGCN model...")


class HeteroRGCN(nn.Module):
    """Two stacked ``HeteroRGCNLayer`` modules over learned node embeddings.

    Args:
        g: Heterograph whose node types and node counts size the embeddings.
        in_size: Width of the trainable per-node embeddings.
        hidden_size: Output width of the first layer.
        out_size: Output width of the second layer (the logits).
    """

    def __init__(self, g, in_size, hidden_size, out_size):
        super(HeteroRGCN, self).__init__()
        # The nodes carry no input features, so every node of every type gets
        # a trainable embedding row, Xavier-uniform initialised.
        embeddings = {}
        for ntype in g.ntypes:
            table = nn.Parameter(torch.Tensor(g.number_of_nodes(ntype), in_size))
            nn.init.xavier_uniform_(table)
            embeddings[ntype] = table
        self.embed = nn.ParameterDict(embeddings)
        # Stack of two relational layers: in -> hidden -> out.
        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, g.etypes)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, g.etypes)

    def forward(self, g):
        """Return the second-layer output for the 'product' nodes of ``g``."""
        hidden = self.layer1(g, self.embed)
        activated = {ntype: F.leaky_relu(feats) for ntype, feats in hidden.items()}
        output = self.layer2(g, activated)
        # Only product nodes are classified.
        return output['product']


print(" Defined RGCN model")
# Train and evaluate model
print("Training and evaluating model...")
model = HeteroRGCN(g, 10, 10, 2)  # two output logits, one per class

# Cross-entropy needs class indices in {0, 1}, but `labels` stores liberal as
# -1 and conservative as +1 (0 means unlabeled). Map -1 -> 0 and +1 -> 1.
# Unlabeled nodes also map to 0 but are never indexed, because the
# train/val/test ids are drawn from the labeled nodes only.
targets = (labels > 0).long()

train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print(" Moving model, data, and labels to GPU...")
    torch.cuda.set_device(0)
    model.cuda()
    # Tensor.cuda() returns a copy instead of moving in place, so the results
    # must be re-bound (they were previously discarded).
    labels = labels.cuda()
    targets = targets.cuda()
    train_idx = train_idx.cuda()
    val_idx = val_idx.cuda()
    test_idx = test_idx.cuda()
    # NOTE(review): newer DGL releases also require the graph itself to be on
    # the same device as the features -- confirm for the installed version.

opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Plain floats so the report below works even if accuracy never exceeds 0.
best_val_acc = 0.0
best_test_acc = 0.0
for epoch in range(100):
    logits = model(g)
    # The loss is computed only for labeled training nodes. cross_entropy takes
    # raw (N, 2) logits and (N,) class indices; binary_cross_entropy expects
    # same-shaped probabilities and rejected these inputs.
    loss = F.cross_entropy(logits[train_idx], targets[train_idx])
    pred = logits.argmax(1)
    # Compare predicted class indices with the {0, 1} targets, not the raw
    # {-1, +1} labels, which could never match for the liberal class.
    train_acc = (pred[train_idx] == targets[train_idx]).float().mean().item()
    val_acc = (pred[val_idx] == targets[val_idx]).float().mean().item()
    test_acc = (pred[test_idx] == targets[test_idx]).float().mean().item()
    # Track the test accuracy at the epoch with the best validation accuracy.
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 5 == 0:
        print(' Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
            loss.item(),
            train_acc,
            val_acc,
            best_val_acc,
            test_acc,
            best_test_acc,
        ))
print(" Training and evaluating complete")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment