Skip to content

Instantly share code, notes, and snippets.

@djinn
Created May 24, 2024 07:24
Show Gist options
  • Save djinn/d05a88c4d6008cdb9c5291c53a2beeb3 to your computer and use it in GitHub Desktop.
MongoDB vs Couchbase read-latency benchmark
import time
import random
from pymongo import MongoClient
from couchbase.cluster import Cluster
from couchbase.management.buckets import BucketManager
from couchbase.options import ClusterOptions
from couchbase.auth import PasswordAuthenticator
import os
import string
from tqdm import tqdm
import statistics
# Utility function for generating random strings
def generate_random_string(length: int) -> str:
    """Return a random alphanumeric string of the given length.

    Uses ``random.choices`` (non-cryptographic) over ASCII letters and
    digits; used here only to pad benchmark documents to ~10KB.
    """
    letters = string.ascii_letters + string.digits
    return ''.join(random.choices(letters, k=length))
# Credentials and connection strings come from the environment;
# raises KeyError at startup if any variable is missing.
username = os.environ['CB_USER']
password = os.environ['CB_PASSWORD']
mongo_uri = os.environ['MDB_URI']
couchbase_uri = os.environ['CB_URI']

# Connection Parameters
# Initialize MongoDB
mongo_client = MongoClient(mongo_uri)
mongo_db = mongo_client['benchmark']
mongo_parents = mongo_db['parents']
mongo_children = mongo_db['children']

# Drop and recreate MongoDB collections so each run starts from an
# empty data set.
mongo_db.drop_collection('parents')
mongo_db.drop_collection('children')
mongo_parents = mongo_db['parents']
mongo_children = mongo_db['children']

# Initialize Couchbase and manage buckets
cluster = Cluster(couchbase_uri, ClusterOptions(PasswordAuthenticator(username, password)))
bucket_manager = cluster.buckets()

# Drop and recreate Couchbase bucket
bucket_name = 'benchmark'
try:
    # Empty the bucket via N1QL rather than dropping it.
    # NOTE(review): this presumably requires the bucket and a primary
    # index to already exist -- confirm against the cluster setup.
    cluster.query("DELETE FROM `benchmark`")
    print("Successfully deleted all documents in Couchbase bucket")
except Exception as e:
    print(f"Failed to drop Couchbase bucket: {e}")
#try:
# bucket_manager.create_bucket(bucket_name, flush_enabled=True, ram_quota_mb=512)
#except Exception as e:
# print(f"Failed to create Couchbase bucket: {e}")
cb_bucket = cluster.bucket(bucket_name)
cb_collection = cb_bucket.default_collection()

# Parameters for data generation
num_parents = 40 # Total parents
children_per_parent = 1000 # Children per parent
random_string_length = 9900 # Adjust this to make document ~10KB
from tqdm import tqdm
def generate_data():
    """Populate MongoDB and Couchbase with identical benchmark data.

    Writes ``num_parents`` parent documents, each with
    ``children_per_parent`` child documents padded to roughly 10KB with
    random data, keeping both stores in lockstep so the read benchmark
    compares like for like.
    """
    # Total number of operations (each parent and each child operation)
    total_operations = num_parents + (num_parents * children_per_parent)
    with tqdm(total=total_operations, desc="Generating data") as pbar:
        for parent_id in range(num_parents):
            parent_doc = {
                '_id': f'parent_{parent_id}',
                'data': 'Some parent data',
                'random_data': generate_random_string(random_string_length),
            }
            mongo_parents.insert_one(parent_doc)
            cb_collection.upsert(f'parent_{parent_id}', parent_doc)
            pbar.update(1)  # Update progress after each parent is processed

            # Upsert children into Couchbase one at a time, but collect
            # them and write to MongoDB with a single insert_many per
            # parent (one round-trip instead of children_per_parent).
            child_docs = []
            for child_id in range(children_per_parent):
                key = f'child_{parent_id * children_per_parent + child_id}'
                child_doc = {
                    '_id': key,
                    'parent_id': f'parent_{parent_id}',
                    'random_data': generate_random_string(random_string_length),
                }
                child_docs.append(child_doc)
                cb_collection.upsert(key, child_doc)
                pbar.update(1)  # Update progress after each child is processed
            mongo_children.insert_many(child_docs)
def benchmark_read_speed(db_type, num_samples=1000000):
    """Benchmark random point reads (child, then its parent) on one store.

    Each sample fetches one random child document by key and then its
    parent document, timing the pair as a single read operation.

    Args:
        db_type: 'mongo' or 'couchbase'.
        num_samples: number of child+parent read pairs to perform.

    Returns:
        Tuple of (total_time, median_read_time, stdev_read_time) in seconds.

    Raises:
        ValueError: if db_type is not 'mongo' or 'couchbase'.
    """
    # Fail fast on an unknown backend instead of silently computing
    # statistics over an empty sample list (which would raise
    # StatisticsError from median()).
    if db_type not in ('mongo', 'couchbase'):
        raise ValueError(f"Unknown db_type: {db_type!r}")
    read_times = []  # Per-operation durations
    # Generate the random key indices up front so that this setup work is
    # excluded from the timed section.
    child_indices = [random.randint(0, num_parents * children_per_parent - 1)
                     for _ in range(num_samples)]
    start_time = time.time()
    if db_type == 'mongo':
        for child_index in tqdm(child_indices, desc="Benchmarking MongoDB reads"):
            start_op_time = time.time()
            child_doc = mongo_children.find_one({'_id': f'child_{child_index}'})
            # Fetching the parent is part of the measured operation.
            parent_doc = mongo_parents.find_one({'_id': child_doc['parent_id']})
            read_times.append(time.time() - start_op_time)
    else:  # couchbase
        for child_index in tqdm(child_indices, desc="Benchmarking Couchbase reads"):
            start_op_time = time.time()
            child_doc = cb_collection.get(f'child_{child_index}').content_as[dict]
            parent_doc = cb_collection.get(child_doc['parent_id']).content_as[dict]
            read_times.append(time.time() - start_op_time)
    total_time = time.time() - start_time
    median_read_time = statistics.median(read_times) if read_times else 0
    # stdev is undefined for fewer than two samples.
    stdev_read_time = statistics.stdev(read_times) if len(read_times) > 1 else 0
    print(f"Total time for {num_samples} reads in {db_type}: {total_time} seconds")
    print(f"Median read time: {median_read_time:.6f} seconds")
    print(f"Standard deviation of read times: {stdev_read_time:.6f} seconds")
    return total_time, median_read_time, stdev_read_time
# Create indexes for MongoDB so child->parent lookups are indexed reads.
# NOTE(review): MongoDB automatically indexes _id on every collection, so
# the second call is redundant -- confirm and consider removing.
mongo_children.create_index("parent_id")
mongo_parents.create_index("_id")
# Generate data
generate_data()
# Benchmark both databases
# NOTE(review): the default num_samples is 1,000,000, i.e. 2M point reads
# per backend -- expect a long run.
benchmark_read_speed('mongo')
benchmark_read_speed('couchbase')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment