Created
January 2, 2026 09:13
-
-
Save msoftware/25c0d206d6d1b8eb38cb0c234201f8b3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "dc2117fc-d903-47de-af0b-53d8fa03b332", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import time\n", | |
| "import json\n", | |
| "import ollama\n", | |
| "import base64\n", | |
| "import mariadb\n", | |
| "import sys" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "bf811701-eb16-4873-9138-983497793c65", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Connection settings for the MariaDB instance that holds the vector table.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# original local-development values, so behaviour is unchanged when no
# variable is set.
host = os.environ.get("MARIADB_HOST", "localhost")
port = int(os.environ.get("MARIADB_PORT", "3306"))
user = os.environ.get("MARIADB_USER", "user")
password = os.environ.get("MARIADB_PASSWORD", "password")
database = os.environ.get("MARIADB_DATABASE", "test_database")

# Module-level connection shared by the cells below.
conn = mariadb.connect(user=user, password=password, host=host, port=port, database=database)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "681667c4-21b8-48e5-a53f-668eac746539", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
def get_embeddings(text_to_embed, model_name="qwen3-embedding:8b"):
    """
    Generates embeddings for a given text using an Ollama model.

    Args:
        text_to_embed (str): The text for which to generate embeddings.
        model_name (str): The name of the Ollama model to use (default: "qwen3-embedding:8b").

    Returns:
        list: The value stored under the 'embedding' key of the Ollama response,
        or None if the text is empty, whitespace-only, not a string, or if the
        request raises an error.
    """
    # Nothing meaningful can be embedded from empty input, so skip the model
    # round-trip and report it the same way as any other failure (None).
    if not isinstance(text_to_embed, str) or not text_to_embed.strip():
        print("No text provided to embed.")
        return None

    try:
        response = ollama.embeddings(
            model=model_name,
            prompt=text_to_embed
        )
        return response['embedding']
    except Exception as e:
        # Deliberately best-effort: the error is reported and the caller
        # receives None instead of an exception.
        print(f"Error generating embeddings: {e}")
        return None
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "03742106-e2b4-4a24-9249-1bc0ef701c38", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# --- MariaDB Insertion Function ---\n", | |
def insert_embedding_into_mariadb(embedding_vector, precision=4, connection=None):
    """
    Inserts an embedding vector into the `qwen3-embedding-8b` MariaDB table.

    The vector is serialised to a JSON array string and passed as the single
    bound parameter of an INSERT that wraps it in VEC_FromText(). Only the
    `embedding` column is supplied by the statement.

    Args:
        embedding_vector (list): The embedding vector as a list of floats.
        precision (int | None): Number of decimal places each component is
            rounded to before serialisation. Pass None to send the values
            unrounded. The default of 4 keeps the previous behaviour.
        connection: An open database connection exposing cursor(), commit()
            and rollback(). Defaults to the module-level `conn`.

    Returns:
        The `lastrowid` reported by the cursor after a successful insert, or
        None when no vector was supplied or the insert failed.
    """
    if not embedding_vector:
        print("No embedding vector provided to insert.")
        return None

    # Fall back to the notebook-wide connection when none is passed explicitly.
    db = conn if connection is None else connection

    cursor = None
    try:
        cursor = db.cursor()

        # float() normalises whatever numeric type the caller handed over so
        # that json.dumps always receives plain Python floats.
        components = [float(x) for x in embedding_vector]
        if precision is not None:
            components = [round(x, precision) for x in components]
        embedding_json_string = json.dumps(components)

        print(f"DEBUG: JSON string being sent: {embedding_json_string[:200]}...") # Print first 200 chars

        # The vector text is bound as a parameter rather than formatted into
        # the SQL string.
        sql = "INSERT INTO `qwen3-embedding-8b` (embedding) VALUES (VEC_FromText(?))"
        cursor.execute(sql, (embedding_json_string,))

        db.commit()
        inserted_id = cursor.lastrowid
        print(f"Successfully inserted embedding with ID: {inserted_id}")
        return inserted_id

    except mariadb.Error as e:
        print(f"Error inserting into MariaDB: {e}")
        db.rollback() # Rollback on error
        return None
    finally:
        if cursor is not None:
            cursor.close()
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "7174559d-6b9c-4d55-86cf-b7507c93271e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "Processing text 1: 'The cat sat on the mat.'\n", | |
| "Generated embedding (first 5 elements): [-0.0026666224002838135, 0.013000323437154293, 0.002823619171977043, -0.03171399608254433, -0.010299142450094223]...\n", | |
| "Embedding length: 4096\n", | |
| "DEBUG: JSON string being sent: [-0.0027, 0.013, 0.0028, -0.0317, -0.0103, -0.0205, 0.0077, -0.0095, -0.0049, 0.0147, 0.0032, 0.0119, -0.0104, -0.0235, 0.0286, -0.0097, 0.0095, 0.0209, 0.0259, 0.0389, -0.0148, 0.0202, 0.0151, -0.039...\n", | |
| "Successfully inserted embedding with ID: 1\n", | |
| "\n", | |
| "Processing text 2: 'Dogs love to chase balls in the park.'\n", | |
| "Generated embedding (first 5 elements): [-0.014950787648558617, 0.0052628363482654095, -0.006423640064895153, -0.038704320788383484, 0.003801250597462058]...\n", | |
| "Embedding length: 4096\n", | |
| "DEBUG: JSON string being sent: [-0.015, 0.0053, -0.0064, -0.0387, 0.0038, 0.0355, -0.0064, -0.0069, 0.0098, 0.0521, -0.0112, -0.0127, -0.0026, -0.0042, 0.0051, 0.0235, 0.0172, 0.0338, 0.0282, 0.0305, 0.0117, -0.0134, -0.0045, -0.02...\n", | |
| "Successfully inserted embedding with ID: 2\n", | |
| "\n", | |
| "Processing text 3: 'Artificial intelligence is transforming many industries.'\n", | |
| "Generated embedding (first 5 elements): [0.012756548821926117, 0.03550448268651962, -0.016718043014407158, -0.024982227012515068, 0.07379638403654099]...\n", | |
| "Embedding length: 4096\n", | |
| "DEBUG: JSON string being sent: [0.0128, 0.0355, -0.0167, -0.025, 0.0738, -0.0007, -0.0172, -0.0006, 0.0161, 0.0276, -0.0306, 0.0278, 0.0406, 0.0143, 0.051, 0.009, 0.0084, -0.0144, -0.0012, -0.0082, -0.0137, 0.0227, 0.0034, 0.0038, ...\n", | |
| "Successfully inserted embedding with ID: 3\n", | |
| "\n", | |
| "Processing text 4: 'A quantum computer can solve certain problems much faster.'\n", | |
| "Generated embedding (first 5 elements): [0.0023580908309668303, 0.01199082937091589, 0.021898211911320686, 0.00322919525206089, 0.04967232048511505]...\n", | |
| "Embedding length: 4096\n", | |
| "DEBUG: JSON string being sent: [0.0024, 0.012, 0.0219, 0.0032, 0.0497, -0.0074, -0.0221, -0.0398, 0.0328, 0.0069, -0.0609, 0.032, 0.0097, 0.0115, 0.0484, -0.0092, 0.0177, -0.0069, 0.0132, -0.0035, -0.0012, 0.0353, 0.0107, 0.0, -0.0...\n", | |
| "Successfully inserted embedding with ID: 4\n", | |
| "\n", | |
| "Insertion process complete.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# Sample sentences; each one is embedded and stored as its own table row.
texts_to_process = [
    "The cat sat on the mat.",
    "Dogs love to chase balls in the park.",
    "Artificial intelligence is transforming many industries.",
    "A quantum computer can solve certain problems much faster.",
]

for position, text in enumerate(texts_to_process, start=1):
    print(f"\nProcessing text {position}: '{text}'")
    embedding = get_embeddings(text)

    # A falsy result means no usable vector came back; report it and move on.
    if not embedding:
        print(f"Could not generate embedding for text: '{text}'")
        continue

    dimension = len(embedding)
    print(f"Generated embedding (first 5 elements): {embedding[:5]}...")
    print(f"Embedding length: {dimension}")

    # Only warn on an unexpected length; the insert is still attempted.
    if dimension != 4096:
        print(f"Warning: Embedding length {dimension} does not match table's VECTOR(4096). This might cause issues or be truncated.")

    insert_embedding_into_mariadb(embedding)

print("\nInsertion process complete.")
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "a87723e4-0d1f-499c-aa70-d8d53f66ce4b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment