# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @khaerulumam42
# Created November 24, 2023 10:48
# Show Gist options
#   • Save khaerulumam42/47d5d7e26c3fa1537963db41dafdcf3d to your computer and use it in GitHub Desktop.
# Toy labelled corpus: one record per line in the form "<text>, <category>".
# It holds 10 animals, 10 countries and 10 sports; the text part may contain
# spaces (e.g. "united states"), and the separator is a comma plus one space.
data_string = '''cats, animal
dogs, animal
lions, animal
elephants, animal
penguins, animal
dolphins, animal
pandas, animal
tigers, animal
horses, animal
koalas, animal
china, country
united states, country
india, country
russia, country
brazil, country
united kingdom, country
germany, country
japan, country
france, country
canada, country
soccer, sport
basketball, sport
cricket, sport
tennis, sport
golf, sport
american football, sport
baseball, sport
ice hockey, sport
athletics, sport
swimming, sport'''
# Raw "<text>, <category>" records, one per line of ``data_string``.
# ``splitlines()`` does not produce a trailing empty element when the literal
# ends with a newline.
dataset = data_string.splitlines()

# Turn every record into a Document whose page content is the text and whose
# metadata carries the category label.
# NOTE(review): ``Document`` is not imported in the visible part of this file;
# presumably it is LangChain's document class -- confirm the import exists.
documents = []
for record in dataset:
    if not record.strip():
        # A blank line carries no record; unpacking it below would raise
        # ValueError, so skip it instead.
        continue
    # Split on the LAST ', ' only: the category is always the final field, so
    # a text that itself contains ', ' still yields exactly two parts.
    # A record with no ', ' at all still raises ValueError (malformed input).
    text, category = record.rsplit(', ', 1)
    metadata = {'category': category}
    doc = Document(page_content=text, metadata=metadata)
    documents.append(doc)
def generate_insert_embedding(documents: List[Document], batch_size: int = 5000) -> None:
    """Embed ``documents`` and insert them into PGVector in fixed-size batches.

    Each batch is passed to ``PGVector.from_documents`` together with the
    module-level ``embedding_model``, ``COLLECTION_NAME`` and
    ``CONNECTION_STRING``. Progress is reported through ``tqdm``, one tick
    per batch.

    Args:
        documents: Documents to embed and store. An empty list results in no
            calls to PGVector at all.
        batch_size: Maximum number of documents sent per
            ``PGVector.from_documents`` call. Defaults to 5000.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(documents)
    # Iterating over the batch start offsets gives exactly
    # ceil(total / batch_size) batches. Computing the count as
    # ``int(total / batch_size) + 1`` instead yields one extra, empty batch
    # whenever ``total`` is a multiple of ``batch_size`` (including 0), which
    # would hand an empty document list to PGVector.
    for start in tqdm(range(0, total, batch_size)):
        # Slicing clamps at the end of the list, so the last batch may be short.
        chunk_document = documents[start:start + batch_size]
        # NOTE(review): this relies on from_documents appending to an existing
        # collection of the same name rather than replacing it -- confirm the
        # ``pre_delete_collection`` default of the installed PGVector version.
        PGVector.from_documents(
            embedding=embedding_model,
            documents=chunk_document,
            collection_name=COLLECTION_NAME,
            connection_string=CONNECTION_STRING,
        )
# Script entry point: embed the sample documents built above and insert them
# into the vector store. This runs at import time as a module-level statement.
generate_insert_embedding(documents)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment