Skip to content

Instantly share code, notes, and snippets.

@AlexMRuch
Created February 15, 2020 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AlexMRuch/4dee519e47eb489df1cdceab0c68da1d to your computer and use it in GitHub Desktop.
Save AlexMRuch/4dee519e47eb489df1cdceab0c68da1d to your computer and use it in GitHub Desktop.
"""
The purpose of this script is to construct and analyze a RGCN model.
The RGCN model is to perform semi-supervised node classification. Nodes will
be labeled as liberal or conservative.
The RGCN should be updated to classify neutral and non-political nodes.
Change negative log likelihood loss to cross-entropy loss.
Example code taken from https://doc.dgl.ai/tutorials/basics/5_hetero.html
"""
# Import dependencies
import dgl
from dgl import DGLGraph
from dgl.nn.pytorch import RelGraphConv
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from functools import partial
from tqdm import tqdm
import time
from datetime import datetime
from pympler import asizeof
import copy
import pickle
# Load edgelists and node/edge features
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: pickle can execute arbitrary code while loading; only point this at
    files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


print("Loading data...")

# Edge lists are stored on disk as NumPy arrays.
el_author_wave1 = np.load("/media/seagate0/amazon/data/dgl_el_author_wave1.npy")
print(" el_author_wave1 shape: ", el_author_wave1.shape)
el_related_wave1 = np.load("/media/seagate0/amazon/data/dgl_el_related_wave1.npy")
print(" el_related_wave1 shape:", el_related_wave1.shape)

# Table of labeled political books (read as a DataFrame).
features_node_political = pd.read_csv("/media/seagate0/amazon/data/pol_books_updated.csv")
print(" features_node_political shape: ", features_node_political.shape)

# Pickled lookup objects; only their length is reported here.
eid_asin = _load_pickle('/media/seagate0/amazon/data/dgl_eid_asin.pickle')
print(" eid_asin len:", len(eid_asin))
eid_reviewers = _load_pickle('/media/seagate0/amazon/data/dgl_eid_reviewers.pickle')
print(" eid_reviewers len:", len(eid_reviewers))
# Construct heterograph
print("Constructing heterograph...")
# Only the first two columns of el_author_wave1 are used as edge endpoints;
# every endpoint is converted to a plain Python int before being handed to DGL.
review_edges = [(int(u), int(v)) for u, v in el_author_wave1[:, :2]]
related_edges = [(int(u), int(v)) for u, v in el_related_wave1]
g = dgl.heterograph({
    ("author", "reviews", "product"): review_edges,
    ("product", "related", "product"): related_edges,
})
print(" Constructed heterograph")
# Describe graph
print("Graph description:")
print(" Node types: ", g.ntypes)

# Node counts per type; the total is the sum over the two node types.
author_count = g.number_of_nodes('author')
product_count = g.number_of_nodes('product')
num_nodes = author_count + product_count
print(" Nodes, total: ", num_nodes)
print(" Nodes, author: ", author_count)
print(" Nodes, product: ", product_count)

# One relation per canonical (source type, edge type, destination type) triple.
canonical_relations = g.canonical_etypes
num_rels = len(canonical_relations)
print(" Num. relations: ", num_rels)
print(" Canonical edges:", canonical_relations)
print(" Edges, reviews: ", g.number_of_edges('reviews'))
print(" Edegs, related: ", g.number_of_edges('related'))

# Distinct values of the "num_lean" column define the label classes.
classes = np.unique(features_node_political["num_lean"])
num_classes = len(classes)
print(" Num. classes: ", num_classes)
print(" Classes: ", classes)
# Extract node features
print("Assigning node features...")
# Invert eid_asin (swap keys and values) so that a value of eid_asin can be
# looked up to recover its key.
asin_eid = {asin: eid for eid, asin in eid_asin.items()}
def encode_features_node_political(asin):
    """Look up ``asin`` in the module-level ``asin_eid`` mapping.

    Args:
        asin: Value to use as a key of ``asin_eid``.

    Returns:
        The value stored under ``asin`` in ``asin_eid``, or ``np.nan`` when
        ``asin`` is not a key of the mapping (or is unhashable and therefore
        cannot be one).
    """
    try:
        return asin_eid[asin]
    except (KeyError, TypeError):
        # Only a failed lookup is an expected outcome. Anything else (for
        # example a NameError because ``asin_eid`` was never built) must
        # surface instead of silently turning every row into NaN.
        return np.nan
# Encode every book's ASIN; books whose ASIN is unknown get NaN and are
# removed by dropna() below.
features_node_political["asin_encoded"] = features_node_political["asin"].apply(
    encode_features_node_political
)

# Unique (encoded id, lean) pairs for the books that could be encoded.
labeled_pairs = np.array(
    features_node_political[["asin_encoded", "num_lean"]].dropna().drop_duplicates()
)
p_selected = torch.tensor(labeled_pairs[:, 0]).long()  # labeled product node ids

# Recode the lean column: 0 becomes -1, every other value becomes 1.
lean_codes = labeled_pairs[:, 1]
labels_short = torch.tensor(np.where(lean_codes == 0, -1, 1)).long()

# Per-product label vector; products that are not in p_selected keep 0.
labels = torch.zeros(g.number_of_nodes('product'))
for node_id, lean in zip(p_selected, labels_short):
    labels[node_id] = lean
g.nodes['product'].data['political'] = labels
print(" Assigned node features")
# Assign edge features
print("Assigning edge features...")
# The third column of el_author_wave1, shifted down by 2.5, is stored as the
# 'rating' feature of the "reviews" edges.
review_scores = torch.from_numpy(el_author_wave1[:, 2])
g.edges["reviews"].data['rating'] = review_scores - 2.5
print(" Assigned edge features")
# Determine approximate graph memory size
# asizeof reports a size in bytes. Convert it once to megabytes: the previous
# code divided by 1000 (kilobytes) while printing "MB" and comparing against a
# "10 GB" threshold, so the warning fired at roughly 10 MB. Measuring once also
# avoids walking the whole graph object twice.
graph_size_mb = asizeof.asized(g).size / 1e6
print("Graph object size in RAM (aprox.):", graph_size_mb, "MB")
if graph_size_mb >= 10000:  # 10,000 MB == 10 GB
    print("Warning: graph size >= 10 GB")
    print(" Quit now if you wish to stop processing")
    # NOTE(review): indentation was lost in the copy this was restored from;
    # the countdown is assumed to belong to the warning branch (it gives the
    # user time to abort) and the sleep to the counting branch - confirm.
    for i in range(6):
        if i == 5:
            print("Blastoff!!!")
        else:
            print(" ", 5-i)
            time.sleep(1)
# Split graph into training, validation, testing sets
print("Splitting graph into training, validation, testing sets...")
# Randomly order the labeled product ids, then cut the ordering at two fixed
# positions: [0, 950) train, [950, 1070) validation, [1070, end) test.
shuffle = np.random.permutation(p_selected)
train_span = slice(0, 950)      # ~80%
val_span = slice(950, 1070)     # n=120
test_span = slice(1070, None)   # n=115
train_idx = torch.tensor(shuffle[train_span]).long()
val_idx = torch.tensor(shuffle[val_span]).long()
test_idx = torch.tensor(shuffle[test_span]).long()
print(" Training size: ", len(train_idx))
print(" Validation size:", len(val_idx))
print(" Test size: ", len(test_idx))
# Initialize RGCN layer
print("Defining RGCN layer...")


class HeteroRGCNLayer(nn.Module):
    """One relational graph-convolution layer over a DGL heterograph.

    Each edge type gets its own linear map W_r. Source features are
    transformed with W_r, averaged over incoming edges of that type, and the
    per-type results are summed on the destination nodes.
    """

    def __init__(self, in_size, out_size, etypes):
        """Create one ``nn.Linear(in_size, out_size)`` per name in ``etypes``."""
        super(HeteroRGCNLayer, self).__init__()
        # W_r for each relation
        self.weight = nn.ModuleDict({
            name: nn.Linear(in_size, out_size) for name in etypes
        })

    def forward(self, g, feat_dict):
        """Run one round of message passing.

        Args:
            g: Heterograph to pass messages on. Intermediate features
                ('Wh_<etype>' on sources, 'h' on destinations) are written
                into its node data.
            feat_dict: Mapping from node type to that type's input features.

        Returns:
            Dict mapping node type to its updated 'h' feature, containing only
            the node types that are the destination of at least one processed
            relation. A node type with no incoming relation (such as a type
            that only ever appears as an edge source) never receives 'h', so
            it is left out instead of raising a KeyError.
        """
        funcs = {}
        updated = set()
        for srctype, etype, dsttype in g.canonical_etypes:
            # A previous layer may not have produced features for a type that
            # receives no messages; such a type cannot act as a source here.
            if srctype not in feat_dict:
                continue
            msg_field = 'Wh_%s' % etype
            # Compute W_r * h and save it in the graph for message passing.
            g.nodes[srctype].data[msg_field] = self.weight[etype](feat_dict[srctype])
            # Per-relation (message_func, reduce_func). Every relation reduces
            # into the same destination field 'h', which is what the
            # cross-type reducer below combines.
            funcs[etype] = (fn.copy_u(msg_field, 'm'), fn.mean('m', 'h'))
            updated.add(dsttype)
        if not funcs:
            return {}
        # Trigger message passing for all relations; per-relation results on
        # the same destination type are combined with 'sum'.
        g.multi_update_all(funcs, 'sum')
        # Keep g.ntypes ordering so the result matches the previous behaviour
        # whenever every node type received messages.
        return {
            ntype: g.nodes[ntype].data['h']
            for ntype in g.ntypes if ntype in updated
        }


print(" Defined RGCN layer")
# Initialize RGCN model
print("Defining RGCN model...")


class HeteroRGCN(nn.Module):
    """Two stacked HeteroRGCNLayer modules on top of trainable node embeddings."""

    def __init__(self, g, in_size, hidden_size, out_size):
        """Build one embedding table per node type of ``g`` and two layers.

        Args:
            g: Heterograph supplying node types, node counts and edge types.
            in_size: Width of the trainable input embeddings.
            hidden_size: Output width of the first layer.
            out_size: Output width of the second layer.
        """
        super(HeteroRGCN, self).__init__()
        # The nodes have no input features, so each node type gets a trainable
        # (num_nodes, in_size) table, Xavier-uniform initialised.
        tables = {}
        for ntype in g.ntypes:
            table = nn.Parameter(torch.Tensor(g.number_of_nodes(ntype), in_size))
            nn.init.xavier_uniform_(table)
            tables[ntype] = table
        self.embed = nn.ParameterDict(tables)
        # create layers
        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, g.etypes)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, g.etypes)

    def forward(self, g):
        """Return the second-layer output for the 'product' node type."""
        hidden = self.layer1(g, self.embed)
        activated = {}
        for ntype, feats in hidden.items():
            activated[ntype] = F.leaky_relu(feats)
        output = self.layer2(g, activated)
        # product logits
        return output['product']


print(" Defined RGCN model")
# Train and evaluate model
print("Training and evaluating model...")
model = HeteroRGCN(g, 10, 10, 2)  # Output has two logits, one per class

# `labels` holds -1 / +1 (0 for unlabeled products). Cross-entropy and the
# argmax-based accuracy both need class indices, so map label > 0 to class 1
# and everything else to class 0. Only labeled nodes are ever indexed below.
targets = (labels > 0).long()

train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print(" Moving model, data, and labels to GPU...")
    torch.cuda.set_device(0)
    model.cuda()
    # Tensor.cuda() returns a copy rather than moving in place, so the
    # results must be rebound (they were previously discarded).
    labels = labels.cuda()
    targets = targets.cuda()
    train_idx = train_idx.cuda()
    val_idx = val_idx.cuda()
    test_idx = test_idx.cuda()
    # NOTE(review): the graph `g` itself is not moved here; depending on the
    # installed DGL version it may also need to be placed on the GPU - confirm.

opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Track the best scores as plain floats so they can be formatted even when
# the validation accuracy never rises above 0.
best_val_acc = 0.0
best_test_acc = 0.0
for epoch in range(100):
    logits = model(g)
    # The loss is computed only for labeled nodes. cross_entropy takes the
    # raw two-column logits together with integer class targets.
    loss = F.cross_entropy(logits[train_idx], targets[train_idx])
    pred = logits.argmax(1)
    train_acc = (pred[train_idx] == targets[train_idx]).float().mean().item()
    val_acc = (pred[val_idx] == targets[val_idx]).float().mean().item()
    test_acc = (pred[test_idx] == targets[test_idx]).float().mean().item()
    # Report the test accuracy observed at the best validation accuracy.
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 5 == 0:
        print(' Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
            loss.item(),
            train_acc,
            val_acc,
            best_val_acc,
            test_acc,
            best_test_acc,
        ))
print(" Training and evaluating complete")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment