Skip to content

Instantly share code, notes, and snippets.

@kacperlukawski
Last active December 11, 2023 04:32
Show Gist options
  • Save kacperlukawski/2d3a3225f15a4cc5772cd1c81866340d to your computer and use it in GitHub Desktop.
Save kacperlukawski/2d3a3225f15a4cc5772cd1c81866340d to your computer and use it in GitHub Desktop.
Qdrant tips&tricks
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:01:38.772705Z",
"start_time": "2023-03-13T09:01:38.627212Z"
}
},
"outputs": [],
"source": [
"import config\n",
"import func"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:01:39.116822Z",
"start_time": "2023-03-13T09:01:38.774713Z"
}
},
"outputs": [],
"source": [
"import time\n",
"\n",
"from tqdm import tqdm\n",
"from qdrant_client import QdrantClient\n",
"from qdrant_client.http import models as rest"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Basic connection"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:01:39.205014Z",
"start_time": "2023-03-13T09:01:39.118666Z"
}
},
"outputs": [],
"source": [
"client = QdrantClient(\n",
" url=\"http://localhost\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:01:39.952500Z",
"start_time": "2023-03-13T09:01:39.207647Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.recreate_collection(\n",
" collection_name=config.COLLECTION_NAME,\n",
" vectors_config=rest.VectorParams(\n",
" size=config.VECTOR_SIZE,\n",
" distance=rest.Distance.COSINE,\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:14:40.339461Z",
"start_time": "2023-03-13T09:01:39.953855Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"101it [13:00, 7.73s/it] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 12min 7s, sys: 5.23 s, total: 12min 12s\n",
"Wall time: 13min\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"%%time\n",
"\n",
"max_num = 50_000\n",
"\n",
"batch_size = config.BATCH_SIZE\n",
"objects = func.iterate_objects(max_num=max_num)\n",
"batched_objects = func.batchify_objects(objects, n=batch_size)\n",
"for batch in tqdm(batched_objects, total=max_num // batch_size):\n",
" ids, vectors, payloads = batch\n",
" client.upsert(\n",
" collection_name=config.COLLECTION_NAME,\n",
" points=rest.Batch(\n",
" ids=ids,\n",
" vectors=vectors,\n",
" payloads=payloads,\n",
" )\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:14:40.348596Z",
"start_time": "2023-03-13T09:14:40.342240Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=47500, points_count=50001, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.get_collection(config.COLLECTION_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# gRPC protocol"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:14:40.600739Z",
"start_time": "2023-03-13T09:14:40.349853Z"
}
},
"outputs": [],
"source": [
"client = QdrantClient(\n",
" url=\"http://localhost\",\n",
" prefer_grpc=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:14:41.047721Z",
"start_time": "2023-03-13T09:14:40.602615Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.recreate_collection(\n",
" collection_name=config.COLLECTION_NAME,\n",
" vectors_config=rest.VectorParams(\n",
" size=config.VECTOR_SIZE,\n",
" distance=rest.Distance.COSINE,\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:21:29.766200Z",
"start_time": "2023-03-13T09:14:41.049769Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"51it [06:48, 8.01s/it] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6min 10s, sys: 3.34 s, total: 6min 13s\n",
"Wall time: 6min 48s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"%%time\n",
"\n",
"# Upload the same objects again through the gRPC client, using batches twice as large.\n",
"# The size is derived from the config constant rather than from the previous value of\n",
"# `batch_size`, so re-running this cell does not double the batch size a second time.\n",
"batch_size = config.BATCH_SIZE * 2\n",
"objects = func.iterate_objects(max_num=max_num)\n",
"batched_objects = func.batchify_objects(objects, n=batch_size)\n",
"for batch in tqdm(batched_objects, total=max_num // batch_size):\n",
"    # Each batch unpacks into parallel sequences of ids, vectors and payloads.\n",
"    ids, vectors, payloads = batch\n",
"    client.upsert(\n",
"        collection_name=config.COLLECTION_NAME,\n",
"        points=rest.Batch(\n",
"            ids=ids,\n",
"            vectors=vectors,\n",
"            payloads=payloads,\n",
"        )\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T09:21:29.777569Z",
"start_time": "2023-03-13T09:21:29.770752Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [
{
"data": {
"text/plain": [
"CollectionInfo(status=<CollectionStatus.YELLOW: 'yellow'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=45000, points_count=50001, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.get_collection(config.COLLECTION_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Indexing threshold"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T11:47:30.851036Z",
"start_time": "2023-03-13T11:47:30.382652Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.recreate_collection(\n",
" collection_name=config.COLLECTION_NAME,\n",
" vectors_config=rest.VectorParams(\n",
" size=config.VECTOR_SIZE,\n",
" distance=rest.Distance.COSINE,\n",
" ),\n",
" optimizers_config=rest.OptimizersConfigDiff(\n",
" indexing_threshold=1_000_000_000, # 1B KB (~1 TB): high enough to defer indexing during the upload\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T11:52:00.538505Z",
"start_time": "2023-03-13T11:47:30.852851Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"51it [04:29, 5.29s/it] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4min 19s, sys: 1.8 s, total: 4min 20s\n",
"Wall time: 4min 29s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"%%time\n",
"\n",
"objects = func.iterate_objects(max_num=max_num)\n",
"batched_objects = func.batchify_objects(objects, n=batch_size)\n",
"for batch in tqdm(batched_objects, total=max_num // batch_size):\n",
" ids, vectors, payloads = batch\n",
" client.upsert(\n",
" collection_name=config.COLLECTION_NAME,\n",
" points=rest.Batch(\n",
" ids=ids,\n",
" vectors=vectors,\n",
" payloads=payloads,\n",
" )\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T11:52:00.546753Z",
"start_time": "2023-03-13T11:52:00.541890Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [
{
"data": {
"text/plain": [
"CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=0, points_count=50001, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=1000000000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.get_collection(config.COLLECTION_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T11:52:00.634095Z",
"start_time": "2023-03-13T11:52:00.548790Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Lower the indexing threshold again now that the upload has finished.\n",
"client.update_collection(\n",
"    collection_name=config.COLLECTION_NAME,\n",
"    optimizer_config=rest.OptimizersConfigDiff(\n",
"        indexing_threshold=10_000,  # 10K KB (~10 MB)\n",
"    )\n",
")\n",
"\n",
"# Wait until the collection reports GREEN, pausing between polls so the loop\n",
"# does not flood the server with back-to-back requests.\n",
"# NOTE(review): GREEN may be reported before the optimizer has picked up the new\n",
"# threshold, in which case this loop exits immediately - confirm and, if needed,\n",
"# also check `indexed_vectors_count`.\n",
"while True:\n",
"    collection_info = client.get_collection(collection_name=config.COLLECTION_NAME)\n",
"    if collection_info.status == rest.CollectionStatus.GREEN:\n",
"        break\n",
"    time.sleep(1)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2023-03-13T11:52:00.641505Z",
"start_time": "2023-03-13T11:52:00.636482Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [
{
"data": {
"text/plain": [
"CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=50001, indexed_vectors_count=0, points_count=50001, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=2048, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=10000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0)), payload_schema={})"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.get_collection(config.COLLECTION_NAME)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment