
@binidxaba
Last active October 11, 2023 15:09
Scripts for the "Unleashing the power of vector embeddings with PostgreSQL" post

Unleashing the power of vector embeddings with PostgreSQL

This gist contains the code used for Tembo's blog post on pgvector.

Requirements

Please create a virtual environment and install the required Python packages:

python3 -m venv ./venv
source ./venv/bin/activate

pip install -r ./requirements.txt

To run your pgvector Postgres database

You can simply use a container:

docker pull ankane/pgvector
docker run --name pgvector -e POSTGRES_PASSWORD=password -p 5432:5432 ankane/pgvector

See pgvector for more details.

Then you can connect to the pgvector container:

docker exec -it pgvector /bin/bash

and create the required database and enable pgvector on that database:

psql -h localhost -U postgres
postgres=# create database vector_db;
psql -h localhost -U postgres vector_db
vector_db=# create extension vector;
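With the extension enabled, pgvector exposes distance operators such as `<->` (Euclidean distance) and `<=>` (cosine distance). As a dependency-free sketch of what those operators compute (plain Python, not pgvector itself):

```python
import math

def l2_distance(a, b):
    # What pgvector's <-> operator computes: Euclidean (L2) distance
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

def cosine_distance(a, b):
    # What pgvector's <=> operator computes: 1 - cosine similarity
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return 1 - dot / (norm_a * norm_b)

print(l2_distance([3.0, 0.0], [0.0, 4.0]))      # 5.0 (3-4-5 triangle)
print(cosine_distance([1.0, 0.0], [0.0, 1.0]))  # 1.0 (orthogonal vectors)
```

Smaller distances mean more similar vectors, which is why the queries later in this gist order results by these operators.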

Provided scripts

To download the corpus used for this example:

sh get_data.sh

It downloads the blog posts from Tembo.io, places them in /tmp, and then strips the Markdown formatting. The resulting plain-text files end up in the corpus directory inside your current directory.

To load the database with the embeddings:

python3 ./load_embeddings.py

To generate the query vector:

python3 ./generate_query_vector.py
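The printed vector can be pasted into SQL as a pgvector text literal and used to rank rows. A minimal sketch of building such a query; note that `langchain_pg_embedding` is the table LangChain's PGVector store creates in the versions I have seen — treat that name as an assumption:

```python
def to_pgvector_literal(vec):
    # pgvector accepts vectors as '[x1,x2,...]' text literals
    return "[" + ",".join(str(x) for x in vec) + "]"

# A toy 3-d vector; a real all-MiniLM-L6-v2 query vector has 384 dimensions
query_vec = [0.1, 0.2, 0.3]
sql = (
    "SELECT document FROM langchain_pg_embedding "  # table name is an assumption
    f"ORDER BY embedding <=> '{to_pgvector_literal(query_vec)}' LIMIT 5;"
)
print(sql)
```

Ordering by `<=>` (cosine distance) ascending returns the five chunks closest to the query.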
all_blogs.txt

2023-04-18-introducing-tembo/index.md
2023-07-05-tembo-manifesto/index.md
2023-08-03-introducing-pgmq/index.md
2023-08-16-introducing-pg-later/index.md
2023-08-24-pgmq_with_python/index.md
2023-08-31-tuning-autovacuum/index.md
2023-09-06-tembo-stacks/index.md
2023-09-14-four-types-of-extensions/index.md
2023-09-20-postgres-16/index.md
2023-09-28-pgmq-internals/index.md
2023-09-29-table-version-history/index.md
2023-10-03-clerk-fdw/index.md
generate_query_vector.py
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2023 binidxaba <binidxaba@noemail.com>
#
# Distributed under terms of the MIT license.
"""
"""
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
query = "What is new in postgres 16"
vector = embeddings.embed_query(query)
print(vector)
#! /bin/sh
#
# convert.sh
# Copyright (C) 2023 binidxaba <binidxaba@noemail.com>
#
# Distributed under terms of the MIT license.
#
GITHUB_REPO="https://raw.githubusercontent.com/tembo-io/website/main/blog"
BLOGS_DIR=/tmp/blog
CORPUS_DIR=./corpus
set -xe
mkdir -p ${BLOGS_DIR}
rm -rf ${BLOGS_DIR}/*
for i in $(cat ./all_blogs.txt)
do
    fname=$(dirname "$i")
    wget -O "${BLOGS_DIR}/${fname}.md" "${GITHUB_REPO}/$i"
done
mkdir -p ${CORPUS_DIR}
rm -rf ${CORPUS_DIR}/*
for f in ${BLOGS_DIR}/*
do
    fname=$(basename "$f")
    pandoc -t plain "${f}" > "${CORPUS_DIR}/${fname%.*}.txt"
done
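The script's naming logic (take each entry's directory as the slug for the downloaded file, then swap the extension for the pandoc output) can be mirrored in Python for clarity; `blog_filename` and `corpus_filename` are hypothetical helpers, not part of the gist:

```python
import os

def blog_filename(entry):
    # Mirrors fname=$(dirname "$i"); wget -O ${BLOGS_DIR}/${fname}.md ...
    slug = os.path.dirname(entry)
    return slug + ".md"

def corpus_filename(downloaded):
    # Mirrors pandoc -t plain ${f} > ${CORPUS_DIR}/${fname%.*}.txt
    stem, _ = os.path.splitext(os.path.basename(downloaded))
    return stem + ".txt"

print(blog_filename("2023-09-20-postgres-16/index.md"))  # 2023-09-20-postgres-16.md
print(corpus_filename("2023-09-20-postgres-16.md"))      # 2023-09-20-postgres-16.txt
```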
load_embeddings.py

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2023 binidxaba <binidxaba@noemail.com>
#
# Distributed under terms of the MIT license.
"""
"""
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
import os
CONNECTION_STRING = "postgresql+psycopg2://postgres:password@localhost:5432/vector_db"
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 20)
files = os.listdir('./corpus')
for file in files:
    file_path = f"./corpus/{file}"
    print(f"Loading: {file_path}")
    loader = TextLoader(file_path)
    document = loader.load()
    texts = text_splitter.split_documents(document)
    # Embed a few chunks directly (illustrative only; PGVector below embeds all chunks itself)
    sentence_embeddings = embeddings.embed_documents([t.page_content for t in texts[:5]])
    db = PGVector.from_documents(
        embedding=embeddings,
        documents=texts,
        collection_name=file,
        connection_string=CONNECTION_STRING)
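Once the chunks are stored, retrieval is a nearest-neighbour ranking by cosine similarity (the real lookup would go through PGVector's `similarity_search`). A self-contained sketch of that ranking, with toy 3-d vectors standing in for the 384-d MiniLM embeddings:

```python
import math

def cosine_sim(a, b):
    # Cosine similarity: higher means more similar
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))

# Toy "document embeddings" keyed by a made-up chunk id
store = {
    "postgres-16-chunk-0": [0.9, 0.1, 0.0],
    "pgmq-chunk-3": [0.1, 0.8, 0.2],
    "tembo-stacks-chunk-1": [0.2, 0.2, 0.9],
}
query = [0.8, 0.2, 0.1]

# Rank chunks by similarity to the query, best first
ranked = sorted(store, key=lambda k: cosine_sim(store[k], query), reverse=True)
print(ranked[0])  # postgres-16-chunk-0
```

pgvector does the same ranking inside Postgres, with an index instead of a full scan.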
requirements.txt

langchain
psycopg2-binary
pgvector
sentence-transformers
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2023 binidxaba <binidxaba@noemail.com>
#
# Distributed under terms of the MIT license.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ['SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.',
             'pgvector is a Postgres extension for vector similarity search.',
             'Tembo will help you say goodbye to database sprawl, and hello to Postgres.']
sentence_embeddings = model.encode(sentences)
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")