Skip to content

Instantly share code, notes, and snippets.

@amir-saniyan
Last active November 30, 2022 02:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save amir-saniyan/e102de09b01c4ed1632e3d1a1a1cbf64 to your computer and use it in GitHub Desktop.
Save amir-saniyan/e102de09b01c4ed1632e3d1a1a1cbf64 to your computer and use it in GitHub Desktop.

In the name of God

Embedding Similarity Measurement

This gist contains implementation of Embedding Similarity Measurement in C++ and Python.

Manhattan Distance

#include <cstddef>
#include <cmath>
#include <stdexcept>

float CalculateManhattanDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
    if(embedding1.size() != embedding2.size())
    {
        throw std::invalid_argument("Embedding sizes should be equal.");
    }

    float sum = 0;

    std::size_t embeddingSize = embedding1.size();
    for(std::size_t i = 0; i < embeddingSize; i++)
    {
        float distance = std::abs(embedding2[i] - embedding1[i]);
        sum += distance;
    }

    float manhattanDistance = sum;

    return manhattanDistance;
}

Euclidean Distance

#include <cstddef>
#include <cmath>
#include <stdexcept>

float CalculateEuclideanDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
    if(embedding1.size() != embedding2.size())
    {
        throw std::invalid_argument("Embedding sizes should be equal.");
    }

    float sum = 0;

    std::size_t embeddingSize = embedding1.size();
    for(std::size_t i = 0; i < embeddingSize; i++)
    {
        float distance = embedding2[i] - embedding1[i];
        sum += distance * distance;
    }

    float euclideanDistance = std::sqrt(sum);

    return euclideanDistance;
}

Minkowski Distance

#include <cstddef>
#include <cmath>
#include <stdexcept>

float CalculateMinkowskiDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2, int p)
{
    if(embedding1.size() != embedding2.size())
    {
        throw std::invalid_argument("Embedding sizes should be equal.");
    }

    float sum = 0;

    std::size_t embeddingSize = embedding1.size();
    for(std::size_t i = 0; i < embeddingSize; i++)
    {
        float distance = std::abs(embedding2[i] - embedding1[i]);
        sum += std::pow(distance, p);
    }

    float minkowskiDistance = std::pow(sum, 1.0f / p);

    return minkowskiDistance;
}

L1, L2, Lp Norms

float CalculateL1Norm(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
    return CalculateManhattanDistance(embedding1, embedding2);
}

float CalculateL2Norm(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
    return CalculateEuclideanDistance(embedding1, embedding2);
}

float CalculateLPNorm(const std::vector<float>& embedding1, const std::vector<float>& embedding2, int p)
{
    return CalculateMinkowskiDistance(embedding1, embedding2, p);
}

Cosine Similarity

#include <cstddef>
#include <cmath>
#include <stdexcept>

float CalculateCosineSimilarity(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
    if(embedding1.size() != embedding2.size())
    {
        throw std::invalid_argument("Embedding sizes should be equal.");
    }

    float aa = 0;
    float bb = 0;
    float ab = 0;

    std::size_t embeddingSize = embedding1.size();
    for(std::size_t i = 0; i < embeddingSize; i++)
    {
        aa += std::pow(embedding1[i], 2);
        bb += std::pow(embedding2[i], 2);
        ab += embedding1[i] * embedding2[i];
    }

    float cosineSimilarity = ab / std::sqrt(aa * bb);

    return cosineSimilarity;
}

Angular Distance, Angular Similarity

# define PI 3.14159265358979323846

float CalculateAngularDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
    float cosineSimilarity = CalculateCosineSimilarity(embedding1, embedding2);

    float angularDistance = std::acos(cosineSimilarity) / PI;

    return angularDistance;
}

float CalculateAngularSimilarity(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
    float angularDistance = CalculateAngularDistance(embedding1, embedding2);

    float angularSimilarity = 1 - angularDistance;

    return angularSimilarity;
}

Python Version

import math

from scipy import spatial


def calculate_cosine_distance(a, b):
    cosine_distance = float(spatial.distance.cosine(a, b))
    return cosine_distance


def calculate_cosine_similarity(a, b):
    cosine_similarity = 1 - calculate_cosine_distance(a, b)
    return cosine_similarity


def calculate_angular_distance(a, b):
    cosine_similarity = calculate_cosine_similarity(a, b)
    angular_distance = math.acos(cosine_similarity) / math.pi
    return angular_distance


def calculate_angular_similarity(a, b):
    angular_similarity = 1 - calculate_angular_distance(a, b)
    return angular_similarity

Similarity Search Using Tensorflow

import time

import numpy as np  # np.__version__ == '1.23.5'
import tensorflow as tf  # tf.__version__ == '2.11.0'

EMBEDDINGS_LENGTH = 512
NUMBER_OF_EMBEDDINGS = 1000 * 1000


def calculate_cosine_similarities(x, embeddings):
    cosine_similarities = -1 * tf.keras.losses.cosine_similarity(x, embeddings)
    return cosine_similarities.numpy()


def find_closest_embeddings(x, embeddings, top_k=1):
    cosine_similarities = calculate_cosine_similarities(x, embeddings)
    values, indices = tf.math.top_k(cosine_similarities, k=top_k)
    return values.numpy(), indices.numpy()


def main():
    # x shape: (512)
    # Embeddings shape: (1000000, 512)
    x = np.random.rand(EMBEDDINGS_LENGTH).astype(np.float32)
    embeddings = np.random.rand(NUMBER_OF_EMBEDDINGS, EMBEDDINGS_LENGTH).astype(np.float32)

    print('Embeddings shape: ', embeddings.shape)

    n = 100
    sum_duration = 0
    for i in range(n):
        start = time.time()
        best_values, best_indices = find_closest_embeddings(x, embeddings, top_k=1)
        end = time.time()

        duration = end - start
        sum_duration += duration

        print('Duration (seconds): {}, Best value: {}, Best index: {}'.format(duration, best_values[0], best_indices[0]))

    # Average duration (seconds): 1.707 for Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz
    # Average duration (seconds): 0.961 for NVIDIA 1080 ti
    print('Average duration (seconds): ', sum_duration / n)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment