AlexMRuch/amazon_heterograph_rgcn.py Secret

## amazon_heterograph_rgcn.py
"""
The purpose of this script is to construct and analyze a RGCN model.
    The RGCN model is to perform semi-supervised node classification. Nodes will
    be labeled as liberal or conservative.
The RGCN should be updated to classify neutral and non-political nodes.
    Change negative log likelihood loss to cross-entropy loss.
Example code taken from https://doc.dgl.ai/tutorials/basics/5_hetero.html
"""

# Import dependencies
import dgl
from dgl import DGLGraph
from dgl.nn.pytorch import RelGraphConv
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from functools import partial
from tqdm import tqdm
import time
from datetime import datetime
from pympler import asizeof
import copy
import pickle

# Read the two edge lists, the political-lean table, and the two id lookup
# pickles from disk, printing the size of each right after it is loaded.
print("Loading data...")

el_author_wave1 = np.load("/media/seagate0/amazon/data/dgl_el_author_wave1.npy")
print("  el_author_wave1 shape: ", el_author_wave1.shape)

el_related_wave1 = np.load("/media/seagate0/amazon/data/dgl_el_related_wave1.npy")
print("  el_related_wave1 shape:", el_related_wave1.shape)

features_node_political = pd.read_csv("/media/seagate0/amazon/data/pol_books_updated.csv")
print("  features_node_political shape: ", features_node_political.shape)

with open('/media/seagate0/amazon/data/dgl_eid_asin.pickle', 'rb') as asin_file:
    eid_asin = pickle.load(asin_file)
print("  eid_asin len:", len(eid_asin))

with open('/media/seagate0/amazon/data/dgl_eid_reviewers.pickle', 'rb') as reviewers_file:
    eid_reviewers = pickle.load(reviewers_file)
print("  eid_reviewers len:", len(eid_reviewers))

# Build the heterograph with two relations: author -reviews-> product and
# product -related-> product.
print("Constructing heterograph...")
# Only the first two columns of the review edge list are edge endpoints.
review_edges = [(int(src), int(dst)) for src, dst in el_author_wave1[:, :2]]
related_edges = [(int(src), int(dst)) for src, dst in el_related_wave1]
g = dgl.heterograph({
    ("author", "reviews", "product"): review_edges,
    ("product", "related", "product"): related_edges,
})
print("  Constructed heterograph")

# Print summary statistics of the graph and of the label column.
print("Graph description:")
print("  Node types:     ", g.ntypes)
author_count = g.number_of_nodes('author')
product_count = g.number_of_nodes('product')
num_nodes = author_count + product_count
print("  Nodes, total:   ", num_nodes)
print("  Nodes, author:  ", author_count)
print("  Nodes, product: ", product_count)
relations = g.canonical_etypes
num_rels = len(relations)
print("  Num. relations: ", num_rels)
print("  Canonical edges:", relations)
print("  Edges, reviews: ", g.number_of_edges('reviews'))
print("  Edegs, related: ", g.number_of_edges('related'))
# Distinct values of the "num_lean" column define the label classes.
classes = np.unique(features_node_political["num_lean"])
num_classes = len(classes)
print("  Num. classes:   ", num_classes)
print("  Classes:        ", classes)

# Extract node features
print("Assigning node features...")
# Invert the encoded-id -> ASIN mapping so ASINs can be looked up directly.
asin_eid = dict((asin, eid) for eid, asin in eid_asin.items())
def encode_features_node_political(asin):
    """Return the encoded id stored in ``asin_eid`` for *asin*.

    Args:
        asin: Key to look up in the module-level ``asin_eid`` dictionary.

    Returns:
        The mapped id, or ``np.nan`` when *asin* has no entry (or is not
        usable as a dictionary key).
    """
    try:
        return asin_eid[asin]
    # Only a failed lookup means "no encoding". A bare ``except`` would also
    # hide unrelated errors, such as ``asin_eid`` not being defined at all.
    except (KeyError, TypeError):
        return np.nan
features_node_political["asin_encoded"] = features_node_political["asin"].apply(encode_features_node_political)
# Unique (encoded id, lean) pairs with rows lacking either value removed.
labeled_pairs = np.array(
    features_node_political[["asin_encoded", "num_lean"]].dropna().drop_duplicates()
)
p_selected = torch.tensor(labeled_pairs[:, 0]).long()  # labeled product node ids
# Lean 0 (liberal) is recoded to -1; every other lean value becomes 1.
labels_short = torch.tensor(np.where(labeled_pairs[:, 1] == 0, -1, 1)).long()
# Products without a label keep the initial value 0.
labels = torch.zeros(g.number_of_nodes('product'))
for p_row, lean in zip(p_selected, labels_short):
    labels[p_row] = lean
g.nodes['product'].data['political'] = labels
print("  Assigned node features")

# Store the third edge-list column, shifted down by 2.5, as the 'rating'
# feature of every "reviews" edge.
print("Assigning edge features...")
raw_ratings = torch.from_numpy(el_author_wave1[:, 2])
g.edges["reviews"].data['rating'] = raw_ratings - 2.5
print("  Assigned edge features")

# Determine approximate graph memory size.
# pympler reports sizes in bytes. Measure once and convert to megabytes so the
# printed unit and the 10 GB warning threshold (10000 MB) are both correct.
graph_size_mb = asizeof.asized(g).size / 1e6
print("Graph object size in RAM (aprox.):", graph_size_mb, "MB")
if graph_size_mb >= 10000:
    print("Warning: graph size >= 10 GB")
    print("  Quit now if you wish to stop processing")
    # Five-second countdown that gives the user a chance to interrupt.
    for seconds_left in range(5, 0, -1):
        print(" ", seconds_left)
        time.sleep(1)
    print("Blastoff!!!")

# Randomly partition the labeled product ids into train / validation / test.
print("Splitting graph into training, validation, testing sets...")
shuffle = np.random.permutation(p_selected)
# Cut points 950 and 1070: first 950 ids train, next 120 validate, rest test.
train_part, val_part, test_part = np.split(shuffle, [950, 1070])
train_idx = torch.tensor(train_part).long()
val_idx = torch.tensor(val_part).long()
test_idx = torch.tensor(test_part).long()
print("  Training size:  ", len(train_idx))
print("  Validation size:", len(val_idx))
print("  Test size:      ", len(test_idx))

# Initialize RGCN layer
print("Defining RGCN layer...")
class HeteroRGCNLayer(nn.Module):
    """One relational graph-convolution layer over a heterograph.

    Every relation (edge type) has its own linear map. Source features are
    projected per relation, averaged over incoming edges of that relation,
    and the per-relation results are summed at each destination node.

    A node type that is never the destination of any relation receives no
    messages, so message passing writes no ``'h'`` feature for it. Such types
    are given a learned self-projection instead, so the returned dictionary
    always contains every node type.

    Args:
        in_size: Size of the input feature vectors.
        out_size: Size of the output feature vectors.
        etypes: Iterable of relation names; one linear map is created per name.
    """
    def __init__(self, in_size, out_size, etypes):
        super(HeteroRGCNLayer, self).__init__()
        # W_r for each relation
        self.weight = nn.ModuleDict({
                name : nn.Linear(in_size, out_size) for name in etypes
            })
        # Fallback projection for node types with no incoming relation.
        self.self_weight = nn.Linear(in_size, out_size)
    def forward(self, g, feat_dict):
        """Run one round of message passing.

        Args:
            g: Heterograph to operate on; intermediate features are stored
                in its node frames.
            feat_dict: Mapping from node type to that type's input features.

        Returns:
            Dict mapping every node type in ``g.ntypes`` to its new features.
        """
        funcs = {}
        dst_types = set()
        for srctype, etype, dsttype in g.canonical_etypes:
            # Compute W_r * h
            Wh = self.weight[etype](feat_dict[srctype])
            # Save it in graph for message passing
            g.nodes[srctype].data['Wh_%s' % etype] = Wh
            # Per-relation (message_func, reduce_func). All relations write
            # to the same destination feature 'h', which the type-wise
            # reducer below then combines.
            funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))
            dst_types.add(dsttype)
        # Trigger message passing for all relations; per-relation results are
        # summed at each destination node.
        g.multi_update_all(funcs, 'sum')
        # Only destination types were given a fresh 'h'; project the rest
        # directly from their own input features.
        return {
            ntype : (g.nodes[ntype].data['h'] if ntype in dst_types
                     else self.self_weight(feat_dict[ntype]))
            for ntype in g.ntypes
        }
print("  Defined RGCN layer")

# Initialize RGCN model
print("Defining RGCN model...")
class HeteroRGCN(nn.Module):
    """Two stacked HeteroRGCNLayer modules producing scores for product nodes.

    Args:
        g: Heterograph whose node types and relation names size the model.
        in_size: Width of the learnable per-node input embeddings.
        hidden_size: Output width of the first layer.
        out_size: Output width of the second layer.
    """
    def __init__(self, g, in_size, hidden_size, out_size):
        super().__init__()
        # There are no input features, so each node type gets a trainable
        # embedding table initialised with Xavier-uniform values.
        tables = {}
        for ntype in g.ntypes:
            table = nn.Parameter(torch.empty(g.number_of_nodes(ntype), in_size))
            nn.init.xavier_uniform_(table)
            tables[ntype] = table
        self.embed = nn.ParameterDict(tables)
        # Two relational layers: in -> hidden -> out.
        relation_names = g.etypes
        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, relation_names)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, relation_names)
    def forward(self, g):
        """Return the second-layer output for the 'product' node type."""
        hidden = self.layer1(g, self.embed)
        activated = {}
        for ntype, feats in hidden.items():
            activated[ntype] = F.leaky_relu(feats)
        output = self.layer2(g, activated)
        return output['product']
print("  Defined RGCN model")

# Train and evaluate model
print("Training and evaluating model...")
model = HeteroRGCN(g, 10, 10, 2) # Output has two logits, one per class
train_on_gpu = torch.cuda.is_available()
# `labels` stores -1 (liberal), +1 (conservative) and 0 (unlabeled) as floats,
# but cross-entropy and argmax work with class indices. Map -1 -> 0 and
# +1 -> 1. Unlabeled rows also become 0, but they are never selected by the
# train/val/test index tensors, which only contain labeled product ids.
targets = (labels > 0).long()
if train_on_gpu:
    print("  Moving model, data, and labels to GPU...")
    torch.cuda.set_device(0)
    model.cuda()
    # Tensor.cuda() returns a new tensor instead of moving in place, so the
    # results must be rebound for the move to take effect.
    labels = labels.cuda()
    targets = targets.cuda()
    train_idx = train_idx.cuda()
    val_idx = val_idx.cuda()
    test_idx = test_idx.cuda()
    # NOTE(review): newer DGL releases may also require the graph itself to
    # be moved to the device; confirm against the installed DGL version.
opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Track the metrics as plain floats so they can be formatted even before the
# first improvement is recorded.
best_val_acc = 0.0
best_test_acc = 0.0
for epoch in range(100):
    logits = model(g)
    # The loss is computed only for labeled nodes. cross_entropy takes raw
    # (N, C) logits and (N,) class indices.
    loss = F.cross_entropy(logits[train_idx], targets[train_idx])
    pred = logits.argmax(1)
    train_acc = (pred[train_idx] == targets[train_idx]).float().mean().item()
    val_acc   = (pred[val_idx]   == targets[val_idx]).float().mean().item()
    test_acc  = (pred[test_idx]  == targets[test_idx]).float().mean().item()
    # Report the test accuracy at the epoch with the best validation accuracy.
    if best_val_acc < val_acc:
        best_val_acc  = val_acc
        best_test_acc = test_acc
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 5 == 0:
        print('  Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
            loss.item(),
            train_acc,
            val_acc,
            best_val_acc,
            test_acc,
            best_test_acc,
        ))
print("  Training and evaluating complete")
	"""
	The purpose of this script is to construct and analyze a RGCN model.
	The RGCN model is to perform semi-supervised node classification. Nodes will
	be labeled as liberal or conservative.
	The RGCN should be updated to classify neutral and non-political nodes.
	Change negative log likelihood loss to cross-entropy loss.
	Example code taken from https://doc.dgl.ai/tutorials/basics/5_hetero.html
	"""

	# Import dependencies
	import dgl
	from dgl import DGLGraph
	from dgl.nn.pytorch import RelGraphConv
	import dgl.function as fn
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np
	import pandas as pd
	from functools import partial
	from tqdm import tqdm
	import time
	from datetime import datetime
	from pympler import asizeof
	import copy
	import pickle

	# Read the two edge lists, the political-lean table, and the two id lookup
	# pickles from disk, printing the size of each right after it is loaded.
	print("Loading data...")
	el_author_wave1 = np.load("/media/seagate0/amazon/data/dgl_el_author_wave1.npy")
	print(" el_author_wave1 shape: ", el_author_wave1.shape)
	el_related_wave1 = np.load("/media/seagate0/amazon/data/dgl_el_related_wave1.npy")
	print(" el_related_wave1 shape:", el_related_wave1.shape)
	features_node_political = pd.read_csv("/media/seagate0/amazon/data/pol_books_updated.csv")
	print(" features_node_political shape: ", features_node_political.shape)
	with open('/media/seagate0/amazon/data/dgl_eid_asin.pickle', 'rb') as asin_file:
	    eid_asin = pickle.load(asin_file)
	print(" eid_asin len:", len(eid_asin))
	with open('/media/seagate0/amazon/data/dgl_eid_reviewers.pickle', 'rb') as reviewers_file:
	    eid_reviewers = pickle.load(reviewers_file)
	print(" eid_reviewers len:", len(eid_reviewers))

	# Build the heterograph with two relations: author -reviews-> product and
	# product -related-> product.
	print("Constructing heterograph...")
	# Only the first two columns of the review edge list are edge endpoints.
	review_edges = [(int(src), int(dst)) for src, dst in el_author_wave1[:, :2]]
	related_edges = [(int(src), int(dst)) for src, dst in el_related_wave1]
	g = dgl.heterograph({
	    ("author", "reviews", "product"): review_edges,
	    ("product", "related", "product"): related_edges,
	})
	print(" Constructed heterograph")

	# Print summary statistics of the graph and of the label column.
	print("Graph description:")
	print(" Node types: ", g.ntypes)
	author_count = g.number_of_nodes('author')
	product_count = g.number_of_nodes('product')
	num_nodes = author_count + product_count
	print(" Nodes, total: ", num_nodes)
	print(" Nodes, author: ", author_count)
	print(" Nodes, product: ", product_count)
	relations = g.canonical_etypes
	num_rels = len(relations)
	print(" Num. relations: ", num_rels)
	print(" Canonical edges:", relations)
	print(" Edges, reviews: ", g.number_of_edges('reviews'))
	print(" Edegs, related: ", g.number_of_edges('related'))
	# Distinct values of the "num_lean" column define the label classes.
	classes = np.unique(features_node_political["num_lean"])
	num_classes = len(classes)
	print(" Num. classes: ", num_classes)
	print(" Classes: ", classes)

	# Extract node features
	print("Assigning node features...")
	# Invert the encoded-id -> ASIN mapping so ASINs can be looked up directly.
	asin_eid = dict((asin, eid) for eid, asin in eid_asin.items())
	def encode_features_node_political(asin):
	    """Return the encoded id stored in ``asin_eid`` for *asin*.

	    Args:
	        asin: Key to look up in the module-level ``asin_eid`` dictionary.

	    Returns:
	        The mapped id, or ``np.nan`` when *asin* has no entry (or is not
	        usable as a dictionary key).
	    """
	    try:
	        return asin_eid[asin]
	    # Only a failed lookup means "no encoding". A bare ``except`` would also
	    # hide unrelated errors, such as ``asin_eid`` not being defined at all.
	    except (KeyError, TypeError):
	        return np.nan
	features_node_political["asin_encoded"] = features_node_political["asin"].apply(encode_features_node_political)
	# Unique (encoded id, lean) pairs with rows lacking either value removed.
	labeled_pairs = np.array(
	    features_node_political[["asin_encoded", "num_lean"]].dropna().drop_duplicates()
	)
	p_selected = torch.tensor(labeled_pairs[:, 0]).long()  # labeled product node ids
	# Lean 0 (liberal) is recoded to -1; every other lean value becomes 1.
	labels_short = torch.tensor(np.where(labeled_pairs[:, 1] == 0, -1, 1)).long()
	# Products without a label keep the initial value 0.
	labels = torch.zeros(g.number_of_nodes('product'))
	for p_row, lean in zip(p_selected, labels_short):
	    labels[p_row] = lean
	g.nodes['product'].data['political'] = labels
	print(" Assigned node features")

	# Store the third edge-list column, shifted down by 2.5, as the 'rating'
	# feature of every "reviews" edge.
	print("Assigning edge features...")
	raw_ratings = torch.from_numpy(el_author_wave1[:, 2])
	g.edges["reviews"].data['rating'] = raw_ratings - 2.5
	print(" Assigned edge features")

	# Determine approximate graph memory size.
	# pympler reports sizes in bytes. Measure once and convert to megabytes so the
	# printed unit and the 10 GB warning threshold (10000 MB) are both correct.
	graph_size_mb = asizeof.asized(g).size / 1e6
	print("Graph object size in RAM (aprox.):", graph_size_mb, "MB")
	if graph_size_mb >= 10000:
	    print("Warning: graph size >= 10 GB")
	    print(" Quit now if you wish to stop processing")
	    # Five-second countdown that gives the user a chance to interrupt.
	    for seconds_left in range(5, 0, -1):
	        print(" ", seconds_left)
	        time.sleep(1)
	    print("Blastoff!!!")

	# Randomly partition the labeled product ids into train / validation / test.
	print("Splitting graph into training, validation, testing sets...")
	shuffle = np.random.permutation(p_selected)
	# Cut points 950 and 1070: first 950 ids train, next 120 validate, rest test.
	train_part, val_part, test_part = np.split(shuffle, [950, 1070])
	train_idx = torch.tensor(train_part).long()
	val_idx = torch.tensor(val_part).long()
	test_idx = torch.tensor(test_part).long()
	print(" Training size: ", len(train_idx))
	print(" Validation size:", len(val_idx))
	print(" Test size: ", len(test_idx))

	# Initialize RGCN layer
	print("Defining RGCN layer...")
	class HeteroRGCNLayer(nn.Module):
	    """One relational graph-convolution layer over a heterograph.

	    Every relation (edge type) has its own linear map. Source features are
	    projected per relation, averaged over incoming edges of that relation,
	    and the per-relation results are summed at each destination node.

	    A node type that is never the destination of any relation receives no
	    messages, so message passing writes no ``'h'`` feature for it. Such types
	    are given a learned self-projection instead, so the returned dictionary
	    always contains every node type.

	    Args:
	        in_size: Size of the input feature vectors.
	        out_size: Size of the output feature vectors.
	        etypes: Iterable of relation names; one linear map is created per name.
	    """
	    def __init__(self, in_size, out_size, etypes):
	        super(HeteroRGCNLayer, self).__init__()
	        # W_r for each relation
	        self.weight = nn.ModuleDict({
	            name : nn.Linear(in_size, out_size) for name in etypes
	        })
	        # Fallback projection for node types with no incoming relation.
	        self.self_weight = nn.Linear(in_size, out_size)
	    def forward(self, g, feat_dict):
	        """Run one round of message passing.

	        Args:
	            g: Heterograph to operate on; intermediate features are stored
	                in its node frames.
	            feat_dict: Mapping from node type to that type's input features.

	        Returns:
	            Dict mapping every node type in ``g.ntypes`` to its new features.
	        """
	        funcs = {}
	        dst_types = set()
	        for srctype, etype, dsttype in g.canonical_etypes:
	            # Compute W_r * h
	            Wh = self.weight[etype](feat_dict[srctype])
	            # Save it in graph for message passing
	            g.nodes[srctype].data['Wh_%s' % etype] = Wh
	            # Per-relation (message_func, reduce_func). All relations write
	            # to the same destination feature 'h', which the type-wise
	            # reducer below then combines.
	            funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))
	            dst_types.add(dsttype)
	        # Trigger message passing for all relations; per-relation results are
	        # summed at each destination node.
	        g.multi_update_all(funcs, 'sum')
	        # Only destination types were given a fresh 'h'; project the rest
	        # directly from their own input features.
	        return {
	            ntype : (g.nodes[ntype].data['h'] if ntype in dst_types
	                     else self.self_weight(feat_dict[ntype]))
	            for ntype in g.ntypes
	        }
	print(" Defined RGCN layer")

	# Initialize RGCN model
	print("Defining RGCN model...")
	class HeteroRGCN(nn.Module):
	    """Two stacked HeteroRGCNLayer modules producing scores for product nodes.

	    Args:
	        g: Heterograph whose node types and relation names size the model.
	        in_size: Width of the learnable per-node input embeddings.
	        hidden_size: Output width of the first layer.
	        out_size: Output width of the second layer.
	    """
	    def __init__(self, g, in_size, hidden_size, out_size):
	        super().__init__()
	        # There are no input features, so each node type gets a trainable
	        # embedding table initialised with Xavier-uniform values.
	        tables = {}
	        for ntype in g.ntypes:
	            table = nn.Parameter(torch.empty(g.number_of_nodes(ntype), in_size))
	            nn.init.xavier_uniform_(table)
	            tables[ntype] = table
	        self.embed = nn.ParameterDict(tables)
	        # Two relational layers: in -> hidden -> out.
	        relation_names = g.etypes
	        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, relation_names)
	        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, relation_names)
	    def forward(self, g):
	        """Return the second-layer output for the 'product' node type."""
	        hidden = self.layer1(g, self.embed)
	        activated = {}
	        for ntype, feats in hidden.items():
	            activated[ntype] = F.leaky_relu(feats)
	        output = self.layer2(g, activated)
	        return output['product']
	print(" Defined RGCN model")

	# Train and evaluate model
	print("Training and evaluating model...")
	model = HeteroRGCN(g, 10, 10, 2) # Output has two logits, one per class
	train_on_gpu = torch.cuda.is_available()
	# `labels` stores -1 (liberal), +1 (conservative) and 0 (unlabeled) as floats,
	# but cross-entropy and argmax work with class indices. Map -1 -> 0 and
	# +1 -> 1. Unlabeled rows also become 0, but they are never selected by the
	# train/val/test index tensors, which only contain labeled product ids.
	targets = (labels > 0).long()
	if train_on_gpu:
	    print(" Moving model, data, and labels to GPU...")
	    torch.cuda.set_device(0)
	    model.cuda()
	    # Tensor.cuda() returns a new tensor instead of moving in place, so the
	    # results must be rebound for the move to take effect.
	    labels = labels.cuda()
	    targets = targets.cuda()
	    train_idx = train_idx.cuda()
	    val_idx = val_idx.cuda()
	    test_idx = test_idx.cuda()
	    # NOTE(review): newer DGL releases may also require the graph itself to
	    # be moved to the device; confirm against the installed DGL version.
	opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
	# Track the metrics as plain floats so they can be formatted even before the
	# first improvement is recorded.
	best_val_acc = 0.0
	best_test_acc = 0.0
	for epoch in range(100):
	    logits = model(g)
	    # The loss is computed only for labeled nodes. cross_entropy takes raw
	    # (N, C) logits and (N,) class indices.
	    loss = F.cross_entropy(logits[train_idx], targets[train_idx])
	    pred = logits.argmax(1)
	    train_acc = (pred[train_idx] == targets[train_idx]).float().mean().item()
	    val_acc = (pred[val_idx] == targets[val_idx]).float().mean().item()
	    test_acc = (pred[test_idx] == targets[test_idx]).float().mean().item()
	    # Report the test accuracy at the epoch with the best validation accuracy.
	    if best_val_acc < val_acc:
	        best_val_acc = val_acc
	        best_test_acc = test_acc
	    opt.zero_grad()
	    loss.backward()
	    opt.step()
	    if epoch % 5 == 0:
	        print(' Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
	            loss.item(),
	            train_acc,
	            val_acc,
	            best_val_acc,
	            test_acc,
	            best_test_acc,
	        ))
	print(" Training and evaluating complete")