Skip to content

Instantly share code, notes, and snippets.

Last active June 8, 2024 02:27
Show Gist options
  • Save 0xh3x/9a6abdbffa821091b6ff822029e1b644 to your computer and use it in GitHub Desktop.
Save 0xh3x/9a6abdbffa821091b6ff822029e1b644 to your computer and use it in GitHub Desktop.
VectorGraph - PoC from Memory Hackathon
Display the source blob
Display the rendered blob
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPjIZOlMiymdw5WceeA/FR4",
"include_colab_link": true
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
"language_info": {
"name": "python"
"cells": [
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
"source": [
"<a href=\"\" target=\"_parent\"><img src=\"\" alt=\"Open In Colab\"/></a>"
"cell_type": "markdown",
"source": [
"# VectorGraph\n",
"Memory hackathon\n",
"metadata": {
"id": "LB7f3xRGt1Qv"
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9GFN8iPWxK7V"
"outputs": [],
"source": [
"!pip install -U langchain pypdf pymongo openai python-dotenv tiktoken"
"cell_type": "code",
"source": [
"from google.colab import userdata"
"metadata": {
"id": "lo0Pu_W-yKnd"
"execution_count": null,
"outputs": []
"cell_type": "code",
"source": [
"import openai\n",
"openai.api_key = userdata.get('OPENAI_API_KEY')"
"metadata": {
"id": "y38oKcsI_GW4"
"execution_count": null,
"outputs": []
"cell_type": "code",
"source": [
"from openai import OpenAI\n",
"openai_client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))\n",
"def get_embedding(text: str) -> list[float]:\n",
" response = openai_client.embeddings.create(\n",
" input=text,\n",
" model=\"text-embedding-3-large\",\n",
" dimensions=2048\n",
" )\n",
" return response.data[0].embedding\n",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "11Fee_mRy74V",
"outputId": "bc9738dc-dd28-4cd4-c17a-3700319ef166"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"cell_type": "code",
"source": [
"from pymongo import MongoClient\n",
"DB_NAME = \"semanticgraphdb\"\n",
"COLLECTION_NAME = \"SemanticGraphDb\"\n",
"EMBEDDING_FIELD_NAME = \"embedding\"\n",
"client = MongoClient(userdata.get('mongo_uri'))\n",
"db = client[DB_NAME]\n",
"collection = db[COLLECTION_NAME]"
"metadata": {
"id": "4zTz28l58xGx"
"execution_count": null,
"outputs": []
"cell_type": "markdown",
"source": [
"Get the notebook runtime's public IP address so it can be whitelisted in MongoDB Atlas"
"metadata": {
"id": "JDwXg1fvoSi9"
"cell_type": "code",
"source": [
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "MhBd2WX2BM3s",
"outputId": "0dc3c1bd-dbb7-4547-fb0e-d3d4d1bcd09c"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"cell_type": "code",
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import MongoDBAtlasVectorSearch\n",
"from langchain.docstore.document import Document\n",
"docs = [\n",
" Document(page_content=doc)\n",
" for doc in [\"pizza\",\"pasta\", \"salad\", \"italy\", \"germany\", \"france\", \"europe\",\"asia\", \"africa\", \"pie\"]\n",
"]\n",
"embedder = OpenAIEmbeddings(model=\"text-embedding-3-large\", disallowed_special=(), openai_api_key=userdata.get('OPENAI_API_KEY'), dimensions=2048)\n",
"# insert the documents in MongoDB Atlas Vector Search\n",
"x = MongoDBAtlasVectorSearch.from_documents(\n",
" documents=docs, embedding=embedder, collection=collection, index_name=\"vector_index\"\n",
" )\n"
"metadata": {
"id": "7sFb3Zdm-Vv4"
"execution_count": null,
"outputs": []
"cell_type": "code",
"source": [
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "3V4yJP01A1b7",
"outputId": "b42c43af-c05f-4651-80e1-3abe71439f93"
"execution_count": null,
"outputs": [
"output_type": "execute_result",
"data": {
"text/plain": [
"metadata": {},
"execution_count": 132
"cell_type": "code",
"source": [
"def getsim(query, topk=5, exclude=None, adjust_probs=1.0):\n",
" if exclude is None:\n",
" exclude = [query]\n",
" print(\"query\",query, \"exclude\", exclude, \"adjust_probs\", adjust_probs)\n",
" results = collection.aggregate([\n",
" {\n",
" \"$vectorSearch\": {\n",
" \"index\": \"vector_index\",\n",
" \"queryVector\": get_embedding(query),\n",
" \"numCandidates\": 200,\n",
" \"limit\": topk,\n",
" \"path\": \"embedding\",\n",
" \"filter\": {\n",
" \"text\": { \"$nin\": exclude}\n",
" }\n",
" }},\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"text\": 1,\n",
" \"score\": { \"$meta\": \"vectorSearchScore\" }\n",
" }\n",
" }\n",
" ])\n",
" return [{'text':r['text'], 'score':r['score'] * adjust_probs} for r in list(results)[:topk]]\n",
"getsim(\"3.15\", topk=1)"
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "UbPCcKKyB3oV",
"outputId": "1b956610-3cab-4be6-d706-8bc15778d23f"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"query 3.15 exclude ['3.15'] adjust_probs 1.0\n"
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'text': 'pie', 'score': 0.6632951498031616}]"
"metadata": {},
"execution_count": 123
"cell_type": "code",
"source": [
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "xG-DiLcTEmkl",
"outputId": "1a857a7a-2b05-47f0-be86-e5d5059fd67e"
"execution_count": null,
"outputs": [
"output_type": "execute_result",
"data": {
"text/plain": [
"DeleteResult({'n': 14, 'electionId': ObjectId('7fffffff00000000000000e0'), 'opTime': {'ts': Timestamp(1712436514, 13), 't': 224}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1712436514, 16), 'signature': {'hash': b'\\x11\"\\xd8Q{m,\\xd4(\\xbc{\\x0f\\xe6`5sl#^\\xec', 'keyId': 7299225970987237377}}, 'operationTime': Timestamp(1712436514, 13)}, acknowledged=True)"
"metadata": {},
"execution_count": 53
"cell_type": "code",
"source": [
"input1 = \"pie\"\n",
"input2 = \"ferrari\"\n",
"s1 = getsim(input1, topk=2, exclude=[input1])\n",
"set1 = set([s[\"text\"] for s in s1])\n",
"print(input1, s1)\n",
"print(\"set1\", set1)\n",
"s2 = getsim(input2, topk=2, exclude=[input2])\n",
"set2 = set([s[\"text\"] for s in s2])\n",
"print(input2, s2)\n",
"print(\"set2\", set2)\n",
"for s in s1:\n",
" sims = getsim(s[\"text\"], topk=2, exclude=list(set1), adjust_probs=s['score'])\n",
" print(s[\"text\"], sims)\n",
" for sim in sims:\n",
" set1.add(sim[\"text\"])\n",
" if sim[\"text\"] in set2:\n",
" print(\"found: \", sim[\"text\"])\n",
"print(\"set1\", set1)\n",
"for s in s2:\n",
" sims = getsim(s[\"text\"], topk=2, exclude=list(set2), adjust_probs=s['score'])\n",
" print(s[\"text\"], sims)\n",
" for sim in sims:\n",
" set2.add(sim[\"text\"])\n",
" if sim[\"text\"] in set1:\n",
" print(\"found: \", sim[\"text\"])\n",
"print(\"set2\", set2)\n",
"print (set1.intersection(set2))\n"
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "xWlfWGwi-XU_",
"outputId": "9ec81dcf-8ff9-4e33-b4b1-1f9cff615c7a"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"query pie exclude ['pie'] adjust_probs 1.0\n",
"pie [{'text': 'pizza', 'score': 0.76506507396698}, {'text': 'pasta', 'score': 0.6837033629417419}]\n",
"set1 {'pizza', 'pasta'}\n",
"query ferrari exclude ['ferrari'] adjust_probs 1.0\n",
"ferrari [{'text': 'italy', 'score': 0.6754150390625}, {'text': 'france', 'score': 0.6711523532867432}]\n",
"set2 {'italy', 'france'}\n",
"query pizza exclude ['pizza', 'pasta'] adjust_probs 0.76506507396698\n",
"pizza [{'text': 'pie', 'score': 0.58530760367141}, {'text': 'salad', 'score': 0.550856861858513}]\n",
"query pasta exclude ['pizza', 'pasta', 'pie', 'salad'] adjust_probs 0.6837033629417419\n",
"pasta [{'text': 'italy', 'score': 0.45891093243378833}, {'text': 'asia', 'score': 0.4391373789216999}]\n",
"found: italy\n",
"set1 {'italy', 'salad', 'pasta', 'pizza', 'asia', 'pie'}\n",
"query italy exclude ['italy', 'france'] adjust_probs 0.6754150390625\n",
"italy [{'text': 'germany', 'score': 0.5503174412078806}, {'text': 'europe', 'score': 0.5277450528374175}]\n",
"query france exclude ['italy', 'germany', 'france', 'europe'] adjust_probs 0.6711523532867432\n",
"france [{'text': 'africa', 'score': 0.47426246229650815}, {'text': 'asia', 'score': 0.4330421091996328}]\n",
"found: asia\n",
"set2 {'africa', 'europe', 'france', 'germany', 'italy', 'asia'}\n",
"{'italy', 'asia'}\n"
"cell_type": "code",
"source": [
"getsim(\"3.15 pie pizza\", topk=2)"
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "wNtNE71_JFoA",
"outputId": "7462c980-ee49-435f-b475-257e2eaf0269"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"query 3.15 pie exclude ['bla'] adjust_probs 1.0\n"
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'text': 'pie', 'score': 0.7743104696273804},\n",
" {'text': 'pizza', 'score': 0.6535331010818481}]"
"metadata": {},
"execution_count": 99
"cell_type": "code",
"source": [
"getsim(\"ferrari italy\", topk=2)"
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "Wo47PQR8LqJ9",
"outputId": "1e48fb85-0489-47f3-b6f0-2b11a27663e4"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"query ferrari italy exclude ['bla'] adjust_probs 1.0\n"
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'text': 'italy', 'score': 0.8061650991439819},\n",
" {'text': 'germany', 'score': 0.7086942195892334}]"
"metadata": {},
"execution_count": 101
"cell_type": "code",
"source": [
"getsim(\"3.15\", topk=2)"
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "M0yGcS1Ocy2n",
"outputId": "80f0f983-4429-4f28-ba5b-1ca3cd50388a"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"query 3.15 exclude ['3.15'] adjust_probs 1.0\n"
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'text': 'pie', 'score': 0.6632951498031616},\n",
" {'text': 'salad', 'score': 0.6039043068885803}]"
"metadata": {},
"execution_count": 112
"cell_type": "code",
"source": [
"getsim(\"pie\", topk=2)"
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "TPvN0sOtdVjs",
"outputId": "f7d22b5e-c282-4571-eb51-fd0618f62b68"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"query pie exclude ['pie'] adjust_probs 1.0\n"
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'text': 'pizza', 'score': 0.76506507396698},\n",
" {'text': 'pasta', 'score': 0.6837033629417419}]"
"metadata": {},
"execution_count": 113
"cell_type": "code",
"source": [
"getsim(\"italy\", topk=20)"
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "a3hldsqIdZGo",
"outputId": "15624646-5ff9-4a7c-e7e7-49b59036aa82"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"query italy exclude ['italy'] adjust_probs 1.0\n"
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'text': 'germany', 'score': 0.8147841095924377},\n",
" {'text': 'europe', 'score': 0.7813640832901001},\n",
" {'text': 'france', 'score': 0.7730833292007446},\n",
" {'text': 'africa', 'score': 0.7081910967826843},\n",
" {'text': 'asia', 'score': 0.6715290546417236},\n",
" {'text': 'pasta', 'score': 0.6712135076522827},\n",
" {'text': 'pizza', 'score': 0.6585623025894165},\n",
" {'text': 'salad', 'score': 0.6119321584701538},\n",
" {'text': 'pie', 'score': 0.6002786159515381}]"
"metadata": {},
"execution_count": 126
"cell_type": "code",
"source": [
"getsim(\"ferrari\", topk=20)"
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"id": "_IoZHLs-gO8T",
"outputId": "099cb0fc-c9b8-434c-9c56-d4f8e5d0ad5c"
"execution_count": null,
"outputs": [
"output_type": "stream",
"name": "stdout",
"text": [
"query ferrari exclude ['ferrari'] adjust_probs 1.0\n"
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'text': 'italy', 'score': 0.6754190921783447},\n",
" {'text': 'france', 'score': 0.6711257696151733},\n",
" {'text': 'pasta', 'score': 0.6701401472091675},\n",
" {'text': 'germany', 'score': 0.6491234302520752},\n",
" {'text': 'africa', 'score': 0.6408870220184326},\n",
" {'text': 'pizza', 'score': 0.638198733329773},\n",
" {'text': 'europe', 'score': 0.6174775958061218},\n",
" {'text': 'salad', 'score': 0.605734646320343},\n",
" {'text': 'asia', 'score': 0.6048133373260498},\n",
" {'text': 'pie', 'score': 0.6046735644340515}]"
"metadata": {},
"execution_count": 128
"cell_type": "markdown",
"source": [
"#### TODO: figure out how to combine similarity scores"
"metadata": {
"id": "GjEsrGuzm_L_"
"cell_type": "code",
"source": [],
"metadata": {
"id": "XScACRAZdroo"
"execution_count": null,
"outputs": []
Copy link

0xh3x commented Jun 8, 2024

This is the PoC of an idea I had at the Memory Hackathon (organized by
@LangChainAI, @newcomputer, @AnthropicAI, and @MongoDB). The idea took second place in the "Memory Infra" category (40 teams competing).


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment