Skip to content

Instantly share code, notes, and snippets.

View kacperlukawski's full-sized avatar

Kacper Łukawski kacperlukawski

View GitHub Profile
@kacperlukawski
kacperlukawski / 01-data-upload.ipynb
Last active December 11, 2023 04:32
Qdrant tips&tricks
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
pubid question context long_answer final_decision
18802997 Can calprotectin predict relapse risk in infla... ... Measuring calprotectin may help to identify UC... maybe
20538207 Should temperature be monitorized during kidne... ... The new storage can affords more stable temper... no
25521278 Is plate clearing a risk factor for obesity? ... The tendency to clear one's plate when eating ... yes
17595200 Is there an intrauterine influence on obesity? ... Comparison of mother-offspring and father-offs.. no
15280782 Is unsafe sexual behaviour increasing among HI... ... There was no evidence of a trend in unsafe sex... no
from qdrant_client.http.models import NamedVector
text_results = client.search(
collection_name="ms-coco-2017",
query_vector=NamedVector(
name="text",
vector=row["text_vector"],
),
limit=5,
with_vectors=False,
# Draw 2000 rows from dataset_df with a fixed random_state.
# NOTE(review): assuming dataset_df is a pandas DataFrame, the fixed seed makes
# the same rows come back on every run - confirm.
sample_df = dataset_df.sample(n=2000, random_state=643)
# Run both pipelines over the sampled rows.
# NOTE(review): image_pipeline and text_pipeline are defined elsewhere;
# presumably each returns one embedding per input row - confirm.
image_vectors = image_pipeline.transform(sample_df)
text_vectors = text_pipeline.transform(sample_df)
# Convert the pipeline outputs with .tolist() before storing them as new
# columns on the sample.
sample_df["image_vector"] = image_vectors.tolist()
sample_df["text_vector"] = text_vectors.tolist()
from sklearn.pipeline import make_pipeline
from embetter.grab import ColumnGrabber
from embetter.vision import ImageLoader, TimmEncoder
from embetter.text import SentenceEncoder
output_directory = Path("./images")
image_pipeline = make_pipeline(
ColumnGrabber("URL"),
DownloadFile(output_directory),
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance
client = QdrantClient(timeout=None)
client.recreate_collection(
collection_name="ms-coco-2017",
vectors_config={
"text": VectorParams(
size=384,
distance=Distance.EUCLID,
from pathlib import Path
from urllib.request import urlretrieve
from embetter.base import EmbetterBase
class DownloadFile(EmbetterBase):
def __init__(self, out_dir: Path):
self.out_dir = out_dir
def transform(self, X, y=None):
import pandas as pd
# Build a DataFrame from the "train" entry of `dataset`.
# NOTE(review): `dataset` is not defined in this fragment; presumably it is the
# object returned by datasets.load_dataset - confirm.
dataset_df = pd.DataFrame(dataset["train"])
from datasets import load_dataset
# Load the dataset identified by "ChristophSchuhmann/MS_COCO_2017_URL_TEXT".
# NOTE(review): presumably this downloads from the Hugging Face Hub on first
# use and therefore needs network access - confirm.
dataset = load_dataset("ChristophSchuhmann/MS_COCO_2017_URL_TEXT")
# (Re)create the "single_vector" collection. vectors_config is a single
# VectorParams (not a dict of named vectors), with size 100 and cosine distance.
# NOTE(review): recreate_collection presumably drops any existing collection
# with the same name first - confirm before running against data to keep.
client.recreate_collection(
collection_name="single_vector",
vectors_config=VectorParams(
size=100,
distance=Distance.COSINE,
)
)