# First-order LINE node embedding, a gist by @LucaCappelletti94
# (created October 24, 2022).
# We are going to set up the
# first-order LINE model for
# node embedding as a Python class.
# We will be using NumPy for the
# algebraic operations.
import numpy as np
from numpy.random import randint
# We will also need Numba
# to just-in-time compile the core
# loops and make everything a bit faster,
# as well as to parallelize the whole thing.
from numba import njit, prange
# TQDM for showing a nice loading bar
from tqdm.auto import trange
# Type hinting for optional values
from typing import Optional

@njit(fastmath=True)
def sigmoid(x: float) -> float:
    # Depending on whether `x` is greater or smaller than
    # zero, we choose a different version of the sigmoid to
    # avoid numerical overflow.
    if x > 0:
        return 1.0 / (1.0 + np.exp(-x))
    return np.exp(x) / (1.0 + np.exp(x))
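# As an illustration of why the two branches matter: for a large negative
# input such as x = -1000.0, the naive expression 1.0 / (1.0 + np.exp(-x))
# evaluates np.exp(1000.0), which overflows to inf, while the equivalent
# form np.exp(x) / (1.0 + np.exp(x)) keeps every intermediate value finite.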

class FirstOrderLINE:
    """First-order LINE model for the computation of
    undirected graph node embeddings.

    The model is rather simple, yet surprisingly effective.
    It learns a feature vector for each node whose dot product
    with the vectors of connected nodes is maximized, i.e. the
    embedding of a node should be maximally similar to the
    embeddings of its neighbors, while the dot product with the
    vectors of disconnected nodes is minimized, i.e. the embedding
    of a node should be as dissimilar as possible from the
    embeddings of disconnected nodes.
    The procedure is optimized using gradient descent.

    In this implementation we represent the graph as an edge list,
    i.e. a NumPy array of shape (number_of_edges, 2) whose rows
    contain the source and destination node IDs of each edge.
    This implementation is based on NumPy and Numba.
    """

    def __init__(
        self,
        embedding_size: int = 100,
        number_of_epochs: int = 2_000,
        learning_rate: float = 0.01,
        learning_rate_decay: float = 0.999,
        random_state: int = 123675,
        verbose: bool = True
    ):
        """Return new instance of First-Order LINE.

        Parameters
        -----------------
        embedding_size: int = 100
            Dimensionality of the embedding.
        number_of_epochs: int = 2_000
            The number of epochs to train the model for.
        learning_rate: float = 0.01
            The learning rate to update the embedding.
        learning_rate_decay: float = 0.999
            Rate to reduce the learning rate.
        random_state: int = 123675
            The random state to reproduce the training.
        verbose: bool = True
            Whether to show loading bars.
        """
        self._embedding_size = embedding_size
        self._number_of_epochs = number_of_epochs
        self._learning_rate = learning_rate
        self._learning_rate_decay = learning_rate_decay
        self._random_state = random_state
        self._verbose = verbose

    def fit_transform(
        self,
        edges: np.ndarray,
        number_of_nodes: Optional[int] = None
    ) -> np.ndarray:
        """Return first-order LINE node embedding of provided graph.

        Parameters
        -------------------
        edges: np.ndarray
            The edges of the graph to embed, as an integer array
            of shape (number_of_edges, 2) whose rows contain the
            source and destination node IDs.
        number_of_nodes: Optional[int] = None
            Number of nodes in the graph.
            If None, we use the maximum node ID we
            find in the provided edges.
        """
        assert edges.shape[1] == 2
        if number_of_nodes is None:
            number_of_nodes = edges.max() + 1
        number_of_edges = edges.shape[0]
        learning_rate = self._learning_rate
        # The scale factor is helpful to reduce the
        # magnitude of the dot product, which otherwise
        # grows with the dimensionality of the embedding.
        scale_factor = np.sqrt(self._embedding_size)
        # We seed the random generator to reproduce the embedding
        # initialization and allocate the embedding matrix, with
        # values between minus one and one.
        # Note that `np.random.seed` only seeds NumPy's global
        # generator: the Numba-compiled loop below keeps its own
        # random state, so the negative sampling is not made
        # deterministic by this call.
        # Note that we use a float32, so we'll require
        # number_of_nodes * embedding_size * 4B of memory.
        #
        # For large embeddings, it is possible to MMAP the
        # numpy array to disk, but this is not convenient
        # as this kind of embedding requires random access
        # to the various node embedding vectors, and accessing
        # random vectors from disk is extremely slow.
        np.random.seed(self._random_state)
        embedding = np.random.default_rng(
            self._random_state
        ).random(
            (
                number_of_nodes,
                self._embedding_size
            ),
            dtype=np.float32
        ) * 2.0 - 1.0
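        # As a hypothetical example of the memory footprint mentioned above:
        # with 10 million nodes and the default 100 dimensions, the matrix
        # takes roughly 10_000_000 * 100 * 4 bytes, i.e. about 4 GB of RAM.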

        @njit(parallel=True, fastmath=True)
        def compute_epoch(embedding, learning_rate):
            for edge_id in prange(number_of_edges):
                edge = edges[edge_id]
                false_src = edges[randint(number_of_edges)][0]
                false_dst = edges[randint(number_of_edges)][1]
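                # Note that taking the endpoints of uniformly sampled edges
                # draws negative nodes roughly in proportion to their degree
                # (exactly so when each undirected edge is listed in both
                # directions), rather than uniformly over the node set.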
                # If we sample a self-loop as the negative edge,
                # there is no way the model can predict it as negative,
                # since a node embedding is trivially maximally similar
                # to itself. Therefore, in this case we skip onward.
                if false_src == false_dst:
                    continue
                for src, dst, label in (
                    # Negative sample
                    (false_src, false_dst, 0.0),
                    # Positive sample
                    (edge[0], edge[1], 1.0),
                ):
                    # First we compute the dot product
                    dot = (embedding[src] * embedding[dst]).sum()
                    # We scale the dot product by the scale factor
                    scaled_dot = dot / scale_factor
                    # Then, we compute the prediction using the sigmoid
                    # and turn the prediction error into an update step.
                    variation = learning_rate * (sigmoid(scaled_dot) - label)
                    # We compute and apply the gradients without any locking,
                    # so there may be (unlikely) data-race collisions between
                    # threads, but this allows for HALVING the memory
                    # requirements.
                    embedding[src] -= variation * embedding[dst]
                    embedding[dst] -= variation * embedding[src]

        # We start iterating on the model
        for epoch in trange(
            self._number_of_epochs,
            desc="Epochs",
            disable=not self._verbose
        ):
            compute_epoch(
                embedding,
                learning_rate * self._learning_rate_decay**epoch
            )
        # Finally, we can return the computed node embedding.
        return embedding
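
# A minimal usage sketch (the toy graph below is purely illustrative):
# we build a small undirected cycle on four nodes, listing each edge once
# as a (source, destination) pair, and embed it in a handful of dimensions.
if __name__ == "__main__":
    toy_edges = np.array([
        [0, 1],
        [1, 2],
        [2, 3],
        [3, 0],
    ], dtype=np.int64)
    model = FirstOrderLINE(
        embedding_size=8,
        number_of_epochs=100,
        verbose=False
    )
    toy_embedding = model.fit_transform(toy_edges)
    # One row per node, one column per embedding dimension: (4, 8).
    print(toy_embedding.shape)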