Created
November 5, 2023 17:16
-
-
Save flyfir248/d4b4113b362905a77664040cd8dcec43 to your computer and use it in GitHub Desktop.
Jina V2 Embeddings.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/flyfir248/d4b4113b362905a77664040cd8dcec43/jina-v2-embeddings.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "80eb6fba", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:22:20.546363Z", | |
"start_time": "2023-10-30T12:22:20.541655Z" | |
}, | |
"id": "80eb6fba" | |
}, | |
"outputs": [], | |
"source": [ | |
"from transformers import AutoModel\n", | |
"from numpy.linalg import norm" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "8e8c4d7f", | |
"metadata": { | |
"id": "8e8c4d7f" | |
}, | |
"source": [ | |
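"## Mathematical Formula\n", | |
"\n", | |
"Cosine similarity between two embedding vectors $a$ and $b$: $\\cos(a, b) = \\frac{a \\cdot b}{\\lVert a \\rVert \\, \\lVert b \\rVert}$" | |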
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "d2e5b029", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:23:21.704602Z", | |
"start_time": "2023-10-30T12:23:21.699205Z" | |
}, | |
"id": "d2e5b029" | |
}, | |
"outputs": [], | |
"source": [ | |
"cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "93800fa3", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:23:25.016348Z", | |
"start_time": "2023-10-30T12:23:25.005782Z" | |
}, | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "93800fa3", | |
"outputId": "1e62d6bf-c172-4111-809d-fc3838d18c35" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<function __main__.<lambda>(a, b)>" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 3 | |
} | |
], | |
"source": [ | |
"cos_sim" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "d8262e38", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:25:09.557983Z", | |
"start_time": "2023-10-30T12:24:20.638049Z" | |
}, | |
"id": "d8262e38" | |
}, | |
"outputs": [], | |
"source": [ | |
"model = AutoModel.from_pretrained(\"jinaai/jina-embeddings-v2-base-en\", trust_remote_code=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "92378ceb", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:26:40.764899Z", | |
"start_time": "2023-10-30T12:26:40.472577Z" | |
}, | |
"id": "92378ceb" | |
}, | |
"outputs": [], | |
"source": [ | |
"embeddings = model.encode([\"How is the weather today?\", \"What is the current weather like today?\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "9935b11d", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:26:46.049706Z", | |
"start_time": "2023-10-30T12:26:46.034203Z" | |
}, | |
"id": "9935b11d", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "9e36f2e5-5169-49b7-e66d-6932e78cad2a" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([[-0.348271 , -0.60091805, 0.60223645, ..., -0.25232717,\n", | |
" 0.23249912, -0.702648 ],\n", | |
" [-0.11724895, -0.89896125, 0.4500912 , ..., -0.02847639,\n", | |
" -0.2287147 , -0.4228289 ]], dtype=float32)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 6 | |
} | |
], | |
"source": [ | |
"embeddings" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "c760dd63", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:27:05.502760Z", | |
"start_time": "2023-10-30T12:27:05.491801Z" | |
}, | |
"id": "c760dd63", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "084e48e4-e91d-403b-f8f8-f69a9ed3e542" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.9341315" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 7 | |
} | |
], | |
"source": [ | |
"cos_sim(embeddings[0], embeddings[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "9233db51", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:29:18.261885Z", | |
"start_time": "2023-10-30T12:29:18.256849Z" | |
}, | |
"id": "9233db51" | |
}, | |
"outputs": [], | |
"source": [ | |
"def compute_similarity(sentence1, sentence2):\n", | |
" embeddings = model.encode([sentence1, sentence2])\n", | |
" result = cos_sim(embeddings[0], embeddings[1])\n", | |
" return result\n", | |
"# Example usage:\n", | |
"# similarity = compute_similarity(\"How is the weather today?\", \"What is the current weather like today?\")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "6d3d6aae", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:29:54.267613Z", | |
"start_time": "2023-10-30T12:29:54.096207Z" | |
}, | |
"id": "6d3d6aae" | |
}, | |
"outputs": [], | |
"source": [ | |
"similarity = compute_similarity(\"I love apple.\", \"I like Banana.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "f5020ba9", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:29:57.216251Z", | |
"start_time": "2023-10-30T12:29:57.209619Z" | |
}, | |
"id": "f5020ba9", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "2d3a41b3-ec97-46d6-ab06-1877c5239589" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.77415395" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 10 | |
} | |
], | |
"source": [ | |
"similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "cb62050e", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:31:34.917240Z", | |
"start_time": "2023-10-30T12:31:34.795440Z" | |
}, | |
"id": "cb62050e" | |
}, | |
"outputs": [], | |
"source": [ | |
"similarity = compute_similarity(\"I like OpenAI.\", \"I like OpenAI.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "f51071bc", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:31:37.544817Z", | |
"start_time": "2023-10-30T12:31:37.537919Z" | |
}, | |
"id": "f51071bc", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "a3499d1b-3665-486a-b947-9b72d4414b8f" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"1.0000001" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 12 | |
} | |
], | |
"source": [ | |
"similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "923678ac", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:31:58.523252Z", | |
"start_time": "2023-10-30T12:31:58.394500Z" | |
}, | |
"id": "923678ac" | |
}, | |
"outputs": [], | |
"source": [ | |
"similarity = compute_similarity(\"I like OpenAI.\", \"I don't like OpenAI.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "d69ad057", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2023-10-30T12:32:00.069996Z", | |
"start_time": "2023-10-30T12:32:00.064078Z" | |
}, | |
"id": "d69ad057", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "3ea7b0a2-8219-45ea-d116-b9f6aba96044" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.8971375" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 14 | |
} | |
], | |
"source": [ | |
"similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "147df01f", | |
"metadata": { | |
"id": "147df01f", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "74834d6d-55f1-4e01-fcd3-7b9235f5cadd" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cosine Similarity: 0.9341315\n" | |
] | |
} | |
], | |
"source": [ | |
"from transformers import AutoModel\n", | |
"from numpy.linalg import norm\n", | |
"\n", | |
"# Define cosine similarity function\n", | |
"cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))\n", | |
"\n", | |
"# Load the Jina embeddings model from the Hugging Face Hub via the Transformers library\n", | |
"model = AutoModel.from_pretrained(\"jinaai/jina-embeddings-v2-base-en\", trust_remote_code=True)\n", | |
"\n", | |
"# Encode sentences and compute embeddings\n", | |
"embeddings = model.encode([\"How is the weather today?\", \"What is the current weather like today?\"])\n", | |
"\n", | |
"# Calculate cosine similarity between the embeddings\n", | |
"similarity = cos_sim(embeddings[0], embeddings[1])\n", | |
"print(\"Cosine Similarity:\", similarity)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.4" | |
}, | |
"colab": { | |
"provenance": [], | |
"include_colab_link": true | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment