# Python code to create a Vertex AI Vector Search index and deploy it.

### Enable the Google Cloud APIs and log in with your credentials
# gcloud services enable compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com
# gcloud auth application-default login

### Install the required Python modules
# pip install pypdf2
# pip install google-cloud-storage
# pip install google-cloud-aiplatform
# pip install jupyter
from google.cloud import storage
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform
import PyPDF2
import re
import os
import random
import json
import uuid
project = "your_GCP_project_id"
location = "us-central1"
pdf_path = "lakeside_handbook.pdf"
bucket_name = "lakeside-content"
embed_file_path = "lakeside_embeddings.json"
sentence_file_path = "lakeside_sentences.json"
index_name = "lakeside_index"
def extract_sentences_from_pdf(pdf_path):
    # Pull the text out of every page, then split it into rough sentences.
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text is not None:
                text += page_text + " "
    sentences = [sentence.strip() for sentence in text.split('. ') if sentence.strip()]
    return sentences
def generate_text_embeddings(sentences) -> list:
    aiplatform.init(project=project, location=location)
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
    vectors = []
    # textembedding-gecko accepts at most 5 texts per request, so embed in batches.
    for i in range(0, len(sentences), 5):
        embeddings = model.get_embeddings(sentences[i:i + 5])
        vectors.extend(embedding.values for embedding in embeddings)
    return vectors
def generate_and_save_embeddings(pdf_path, sentence_file_path, embed_file_path):
    def clean_text(text):
        cleaned_text = re.sub(r'\u2022', '', text)  # Remove bullet points
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Collapse extra whitespace
        return cleaned_text

    sentences = extract_sentences_from_pdf(pdf_path)
    if sentences:
        embeddings = generate_text_embeddings(sentences)
        # Write two JSON Lines files: one for Vector Search ingestion (id + embedding)
        # and one that maps each id back to its source sentence.
        with open(embed_file_path, 'w') as embed_file, open(sentence_file_path, 'w') as sentence_file:
            for sentence, embedding in zip(sentences, embeddings):
                cleaned_sentence = clean_text(sentence)
                item_id = str(uuid.uuid4())
                embed_item = {"id": item_id, "embedding": embedding}
                sentence_item = {"id": item_id, "sentence": cleaned_sentence}
                json.dump(sentence_item, sentence_file)
                sentence_file.write('\n')
                json.dump(embed_item, embed_file)
                embed_file.write('\n')
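
# For reference, each line of lakeside_embeddings.json is a JSON record in the
# format Vector Search ingests, while lakeside_sentences.json keeps the
# id-to-text mapping for lookups after a query. Illustrative values only:
# {"id": "3f2b9c1e-...", "embedding": [0.0123, -0.0456, ...]}
# {"id": "3f2b9c1e-...", "sentence": "Employees accrue vacation monthly."}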
def upload_file(bucket_name, file_path):
    # Create the bucket (this fails if it already exists) and upload the file.
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(bucket_name, location=location)
    blob = bucket.blob(file_path)
    blob.upload_from_filename(file_path)
def create_vector_index(bucket_name, index_name):
    # Build a Tree-AH index from the JSONL embeddings stored in the bucket.
    lakeside_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=index_name,
        contents_delta_uri="gs://" + bucket_name,
        dimensions=768,  # textembedding-gecko@001 returns 768-dimensional vectors
        approximate_neighbors_count=10,
    )
    # Create a public endpoint and deploy the index to it.
    lakeside_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=index_name,
        public_endpoint_enabled=True
    )
    lakeside_index_endpoint.deploy_index(
        index=lakeside_index, deployed_index_id=index_name
    )
generate_and_save_embeddings(pdf_path, sentence_file_path, embed_file_path)
upload_file(bucket_name, embed_file_path)  # upload the embeddings JSONL that feeds the index
create_vector_index(bucket_name, index_name)
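
# --- Querying the deployed index: a minimal sketch, not part of the original gist.
# It assumes the deployment above has completed, that lakeside_sentences.json is
# still available locally, and that the endpoint can be looked up by its display
# name; query_index() and the sample question below are illustrative.
def query_index(query, num_neighbors=5):
    endpoint = aiplatform.MatchingEngineIndexEndpoint.list(
        filter=f'display_name="{index_name}"'
    )[0]
    query_vector = generate_text_embeddings([query])[0]
    response = endpoint.find_neighbors(
        deployed_index_id=index_name,
        queries=[query_vector],
        num_neighbors=num_neighbors,
    )
    # Map the returned datapoint IDs back to their source sentences.
    with open(sentence_file_path) as f:
        id_to_sentence = {item["id"]: item["sentence"] for item in map(json.loads, f)}
    return [(n.id, id_to_sentence.get(n.id)) for n in response[0]]

# Example usage (uncomment once the index has finished deploying):
# print(query_index("What is the vacation policy at Lakeside?"))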