Skip to content

Instantly share code, notes, and snippets.

@djinn
Created May 24, 2024 07:24
Show Gist options
  • Save djinn/d05a88c4d6008cdb9c5291c53a2beeb3 to your computer and use it in GitHub Desktop.
MongoDB vs Couchbase read-latency benchmark
import time
import random
from pymongo import MongoClient
from couchbase.cluster import Cluster
from couchbase.management.buckets import BucketManager
from couchbase.options import ClusterOptions
from couchbase.auth import PasswordAuthenticator
import os
import string
from tqdm import tqdm
import statistics
# Utility function for generating random strings
def generate_random_string(length: int) -> str:
    """Return a random alphanumeric string of the given length.

    Uses ``random.choices`` (non-cryptographic) over ASCII letters and
    digits; used here only to pad benchmark documents to ~10KB.
    """
    letters = string.ascii_letters + string.digits
    return ''.join(random.choices(letters, k=length))
# Credentials and connection strings come from the environment;
# raises KeyError at startup if any variable is missing.
username = os.environ['CB_USER']
password = os.environ['CB_PASSWORD']
mongo_uri = os.environ['MDB_URI']
couchbase_uri = os.environ['CB_URI']

# Connection Parameters
# Initialize MongoDB
mongo_client = MongoClient(mongo_uri)
mongo_db = mongo_client['benchmark']
mongo_parents = mongo_db['parents']
mongo_children = mongo_db['children']

# Drop and recreate MongoDB collections so each run starts from an
# empty data set.
mongo_db.drop_collection('parents')
mongo_db.drop_collection('children')
mongo_parents = mongo_db['parents']
mongo_children = mongo_db['children']

# Initialize Couchbase and manage buckets
cluster = Cluster(couchbase_uri, ClusterOptions(PasswordAuthenticator(username, password)))
bucket_manager = cluster.buckets()

# Drop and recreate Couchbase bucket
bucket_name = 'benchmark'
try:
    # Empty the bucket via N1QL rather than dropping it.
    # NOTE(review): this presumably requires the bucket and a primary
    # index to already exist -- confirm against the cluster setup.
    cluster.query("DELETE FROM `benchmark`")
    print("Successfully deleted all documents in Couchbase bucket")
except Exception as e:
    print(f"Failed to drop Couchbase bucket: {e}")
#try:
# bucket_manager.create_bucket(bucket_name, flush_enabled=True, ram_quota_mb=512)
#except Exception as e:
# print(f"Failed to create Couchbase bucket: {e}")
cb_bucket = cluster.bucket(bucket_name)
cb_collection = cb_bucket.default_collection()

# Parameters for data generation
num_parents = 40 # Total parents
children_per_parent = 1000 # Children per parent
random_string_length = 9900 # Adjust this to make document ~10KB
from tqdm import tqdm
def generate_data():
    """Populate MongoDB and Couchbase with identical benchmark data.

    Writes ``num_parents`` parent documents, each with
    ``children_per_parent`` child documents padded to roughly 10KB with
    random data, keeping both stores in lockstep so the read benchmark
    compares like for like.
    """
    # Total number of operations (each parent and each child operation)
    total_operations = num_parents + (num_parents * children_per_parent)
    with tqdm(total=total_operations, desc="Generating data") as pbar:
        for parent_id in range(num_parents):
            parent_doc = {
                '_id': f'parent_{parent_id}',
                'data': 'Some parent data',
                'random_data': generate_random_string(random_string_length),
            }
            mongo_parents.insert_one(parent_doc)
            cb_collection.upsert(f'parent_{parent_id}', parent_doc)
            pbar.update(1)  # Update progress after each parent is processed

            # Upsert children into Couchbase one at a time, but collect
            # them and write to MongoDB with a single insert_many per
            # parent (one round-trip instead of children_per_parent).
            child_docs = []
            for child_id in range(children_per_parent):
                key = f'child_{parent_id * children_per_parent + child_id}'
                child_doc = {
                    '_id': key,
                    'parent_id': f'parent_{parent_id}',
                    'random_data': generate_random_string(random_string_length),
                }
                child_docs.append(child_doc)
                cb_collection.upsert(key, child_doc)
                pbar.update(1)  # Update progress after each child is processed
            mongo_children.insert_many(child_docs)
def benchmark_read_speed(db_type, num_samples=1000000):
    """Benchmark random point reads (child, then its parent) on one store.

    Each sample fetches one random child document by key and then its
    parent document, timing the pair as a single read operation.

    Args:
        db_type: 'mongo' or 'couchbase'.
        num_samples: number of child+parent read pairs to perform.

    Returns:
        Tuple of (total_time, median_read_time, stdev_read_time) in seconds.

    Raises:
        ValueError: if db_type is not 'mongo' or 'couchbase'.
    """
    # Fail fast on an unknown backend instead of silently computing
    # statistics over an empty sample list (which would raise
    # StatisticsError from median()).
    if db_type not in ('mongo', 'couchbase'):
        raise ValueError(f"Unknown db_type: {db_type!r}")
    read_times = []  # Per-operation durations
    # Generate the random key indices up front so that this setup work is
    # excluded from the timed section.
    child_indices = [random.randint(0, num_parents * children_per_parent - 1)
                     for _ in range(num_samples)]
    start_time = time.time()
    if db_type == 'mongo':
        for child_index in tqdm(child_indices, desc="Benchmarking MongoDB reads"):
            start_op_time = time.time()
            child_doc = mongo_children.find_one({'_id': f'child_{child_index}'})
            # Fetching the parent is part of the measured operation.
            parent_doc = mongo_parents.find_one({'_id': child_doc['parent_id']})
            read_times.append(time.time() - start_op_time)
    else:  # couchbase
        for child_index in tqdm(child_indices, desc="Benchmarking Couchbase reads"):
            start_op_time = time.time()
            child_doc = cb_collection.get(f'child_{child_index}').content_as[dict]
            parent_doc = cb_collection.get(child_doc['parent_id']).content_as[dict]
            read_times.append(time.time() - start_op_time)
    total_time = time.time() - start_time
    median_read_time = statistics.median(read_times) if read_times else 0
    # stdev is undefined for fewer than two samples.
    stdev_read_time = statistics.stdev(read_times) if len(read_times) > 1 else 0
    print(f"Total time for {num_samples} reads in {db_type}: {total_time} seconds")
    print(f"Median read time: {median_read_time:.6f} seconds")
    print(f"Standard deviation of read times: {stdev_read_time:.6f} seconds")
    return total_time, median_read_time, stdev_read_time
# Create indexes for MongoDB so child->parent lookups are indexed reads.
# NOTE(review): MongoDB automatically indexes _id on every collection, so
# the second call is redundant -- confirm and consider removing.
mongo_children.create_index("parent_id")
mongo_parents.create_index("_id")
# Generate data
generate_data()
# Benchmark both databases
# NOTE(review): the default num_samples is 1,000,000, i.e. 2M point reads
# per backend -- expect a long run.
benchmark_read_speed('mongo')
benchmark_read_speed('couchbase')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment