# First-order LINE node embedding, a gist by @LucaCappelletti94
# (created October 24, 2022).
# We are going to set up the
# first-order LINE model for
# node embedding as a Python class.
# We will be using NumPy for the
# algebraic operations.
import numpy as np
from numpy.random import randint
# We will also need Numba
# to just-in-time compile the core
# loops and make everything a bit faster,
# as well as to parallelize the whole thing.
from numba import njit, prange
# TQDM for showing a nice loading bar
from tqdm.auto import trange
# Type hinting for optional values
from typing import Optional

@njit(fastmath=True)
def sigmoid(x: float) -> float:
    # Depending on whether `x` is greater or smaller than
    # zero, we choose a different version of the sigmoid to
    # avoid numerical overflow.
    if x > 0:
        return 1.0 / (1.0 + np.exp(-x))
    return np.exp(x) / (1.0 + np.exp(x))
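# As an illustration of why the two branches matter: for a large negative
# input such as x = -1000.0, the naive expression 1.0 / (1.0 + np.exp(-x))
# evaluates np.exp(1000.0), which overflows to inf, while the equivalent
# form np.exp(x) / (1.0 + np.exp(x)) keeps every intermediate value finite.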

class FirstOrderLINE:
    """First-order LINE model for the computation of
    undirected graph node embeddings.

    The model is rather simple, yet surprisingly effective.
    It learns a feature vector for each node whose dot product
    with the vectors of connected nodes is maximized, i.e. the
    embedding of a node should be maximally similar to the
    embeddings of its neighbors, while the dot product with the
    vectors of disconnected nodes is minimized, i.e. the embedding
    of a node should be as dissimilar as possible from the
    embeddings of disconnected nodes.
    The procedure is optimized using gradient descent.

    In this implementation we represent the graph as an edge list,
    i.e. a NumPy array of shape (number_of_edges, 2) whose rows
    contain the source and destination node IDs of each edge.
    This implementation is based on NumPy and Numba.
    """

    def __init__(
        self,
        embedding_size: int = 100,
        number_of_epochs: int = 2_000,
        learning_rate: float = 0.01,
        learning_rate_decay: float = 0.999,
        random_state: int = 123675,
        verbose: bool = True
    ):
        """Return new instance of First-Order LINE.

        Parameters
        -----------------
        embedding_size: int = 100
            Dimensionality of the embedding.
        number_of_epochs: int = 2_000
            The number of epochs to train the model for.
        learning_rate: float = 0.01
            The learning rate to update the embedding.
        learning_rate_decay: float = 0.999
            Rate to reduce the learning rate.
        random_state: int = 123675
            The random state to reproduce the training.
        verbose: bool = True
            Whether to show loading bars.
        """
        self._embedding_size = embedding_size
        self._number_of_epochs = number_of_epochs
        self._learning_rate = learning_rate
        self._learning_rate_decay = learning_rate_decay
        self._random_state = random_state
        self._verbose = verbose

    def fit_transform(
        self,
        edges: np.ndarray,
        number_of_nodes: Optional[int] = None
    ) -> np.ndarray:
        """Return first-order LINE node embedding of provided graph.

        Parameters
        -------------------
        edges: np.ndarray
            The edges of the graph to embed, as an integer array
            of shape (number_of_edges, 2) whose rows contain the
            source and destination node IDs.
        number_of_nodes: Optional[int] = None
            Number of nodes in the graph.
            If None, we use the maximum node ID we
            find in the provided edges.
        """
        assert edges.shape[1] == 2
        if number_of_nodes is None:
            number_of_nodes = edges.max() + 1
        number_of_edges = edges.shape[0]
        learning_rate = self._learning_rate
        # The scale factor is helpful to reduce the
        # magnitude of the dot product, which otherwise
        # grows with the dimensionality of the embedding.
        scale_factor = np.sqrt(self._embedding_size)
        # We seed the random generator to reproduce the embedding
        # initialization and allocate the embedding matrix, with
        # values between minus one and one.
        # Note that `np.random.seed` only seeds NumPy's global
        # generator: the Numba-compiled loop below keeps its own
        # random state, so the negative sampling is not made
        # deterministic by this call.
        # Note that we use a float32, so we'll require
        # number_of_nodes * embedding_size * 4B of memory.
        #
        # For large embeddings, it is possible to MMAP the
        # numpy array to disk, but this is not convenient
        # as this kind of embedding requires random access
        # to the various node embedding vectors, and accessing
        # random vectors from disk is extremely slow.
        np.random.seed(self._random_state)
        embedding = np.random.default_rng(
            self._random_state
        ).random(
            (
                number_of_nodes,
                self._embedding_size
            ),
            dtype=np.float32
        ) * 2.0 - 1.0
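        # As a hypothetical example of the memory footprint mentioned above:
        # with 10 million nodes and the default 100 dimensions, the matrix
        # takes roughly 10_000_000 * 100 * 4 bytes, i.e. about 4 GB of RAM.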

        @njit(parallel=True, fastmath=True)
        def compute_epoch(embedding, learning_rate):
            for edge_id in prange(number_of_edges):
                edge = edges[edge_id]
                false_src = edges[randint(number_of_edges)][0]
                false_dst = edges[randint(number_of_edges)][1]
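                # Note that taking the endpoints of uniformly sampled edges
                # draws negative nodes roughly in proportion to their degree
                # (exactly so when each undirected edge is listed in both
                # directions), rather than uniformly over the node set.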
                # If we sample a self-loop as the negative edge,
                # there is no way the model can predict it as negative,
                # since a node embedding is trivially maximally similar
                # to itself. Therefore, in this case we skip onward.
                if false_src == false_dst:
                    continue
                for src, dst, label in (
                    # Negative sample
                    (false_src, false_dst, 0.0),
                    # Positive sample
                    (edge[0], edge[1], 1.0),
                ):
                    # First we compute the dot product
                    dot = (embedding[src] * embedding[dst]).sum()
                    # We scale the dot product by the scale factor
                    scaled_dot = dot / scale_factor
                    # Then, we compute the prediction using the sigmoid
                    # and turn the prediction error into an update step.
                    variation = learning_rate * (sigmoid(scaled_dot) - label)
                    # We compute and apply the gradients without any locking,
                    # so there may be (unlikely) data-race collisions between
                    # threads, but this allows for HALVING the memory
                    # requirements.
                    embedding[src] -= variation * embedding[dst]
                    embedding[dst] -= variation * embedding[src]

        # We start iterating on the model
        for epoch in trange(
            self._number_of_epochs,
            desc="Epochs",
            disable=not self._verbose
        ):
            compute_epoch(
                embedding,
                learning_rate * self._learning_rate_decay**epoch
            )
        # Finally, we can return the computed node embedding.
        return embedding
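
# A minimal usage sketch (the toy graph below is purely illustrative):
# we build a small undirected cycle on four nodes, listing each edge once
# as a (source, destination) pair, and embed it in a handful of dimensions.
if __name__ == "__main__":
    toy_edges = np.array([
        [0, 1],
        [1, 2],
        [2, 3],
        [3, 0],
    ], dtype=np.int64)
    model = FirstOrderLINE(
        embedding_size=8,
        number_of_epochs=100,
        verbose=False
    )
    toy_embedding = model.fit_transform(toy_edges)
    # One row per node, one column per embedding dimension: (4, 8).
    print(toy_embedding.shape)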