pdet/memory_consumption_bench.py

## memory_consumption_bench.py
import threading
import psutil
import resource
import duckdb
import pyarrow as pa
import os
import time

def create_db():
    """Create the ``strings`` benchmark table in the on-disk ``duck.db`` file.

    The table is filled from ``range(10000000)``: row ``i`` holds the letter
    ``chr(65 + i % 26)`` repeated ``4 + i % 12`` times, except that every
    non-zero multiple of 42 is stored as NULL.

    NOTE(review): presumably this fails if ``strings`` already exists in
    ``duck.db`` -- confirm before running it twice against the same file.
    """
    # Use the connection as a context manager so it is closed (and duck.db is
    # released) even if the query raises; the original never closed it.
    with duckdb.connect("duck.db") as con:
        con.sql("""
    select
        case when i != 0 and i % 42 = 0
        then
            NULL
        else
            repeat(chr((65 + (i % 26))::INTEGER), (4 + (i % 12))) end
        from range(10000000) tbl(i);""").to_table("strings")

def string_length_arrow(x):
    """Return the length of each element of *x* as an Arrow ``int32`` array.

    Args:
        x: An iterable, sized Arrow container whose elements expose
            ``as_py()`` (this is what DuckDB hands to a ``type='arrow'`` UDF).

    Returns:
        A ``pyarrow`` array with ``len(x)`` entries; null elements count as
        length 0.

    NOTE(review): the element-by-element Python loop looks like the thing
    this benchmark is measuring -- confirm before vectorizing it.
    """
    # Convert each Arrow scalar to Python exactly once; the original called
    # as_py() twice per element and compared against None with `!=`.
    py_values = (scalar.as_py() for scalar in x)
    lengths = [0 if value is None else len(value) for value in py_values]
    return pa.array(lengths, type=pa.int32(), size=len(x))

def udf():
    """Sum the string lengths of ``strings`` through an Arrow Python UDF.

    Registers ``string_length_arrow`` on a fresh ``duck.db`` connection as
    the scalar function ``strlen_arrow`` (one VARCHAR argument, ``int``
    return type, ``type='arrow'``) and runs the aggregate query with it.

    Returns:
        The rows produced by ``fetchall()``. The original discarded them and
        returned ``None``; returning them matches ``func()`` and is harmless
        for callers that ignore the result.
    """
    # Context manager closes the connection even if the query raises; the
    # original leaked it. fetchall() runs before the connection is closed.
    with duckdb.connect("duck.db") as con:
        con.create_function('strlen_arrow', string_length_arrow, ['VARCHAR'], int, type='arrow')
        return con.sql("select sum(strlen_arrow(i)) from strings tbl(i)").fetchall()

def func():
    """Sum the string lengths of ``strings`` by computing them in Python.

    Fetches column ``i`` of ``strings`` as Arrow data, computes every length
    in a Python loop (nulls count as 0), wraps the lengths in a one-column
    Arrow table and lets DuckDB sum that column.

    Returns:
        The rows produced by ``fetchall()`` for the final ``sum(i)`` query.
    """
    # Context manager closes the connection even on error; the original
    # never closed it. fetchall() runs before the connection is closed.
    with duckdb.connect("duck.db") as con:
        arrow_table = con.sql("select i from strings tbl(i)").arrow()
        arrow_column = arrow_table['i']

        # Convert each Arrow scalar once (the original called as_py() twice
        # per element). The loop is kept inline rather than delegated so the
        # intermediate list stays alive exactly as long as it did before.
        py_values = (scalar.as_py() for scalar in arrow_column)
        values = [0 if value is None else len(value) for value in py_values]
        array = pa.array(values, type=pa.int32(), size=len(arrow_column))

        # The SQL below refers to this local variable by name, so it must
        # stay called `arrow_tbl`.
        arrow_tbl = pa.Table.from_arrays([array], names=['i'])
        return con.sql("select sum(i) from arrow_tbl").fetchall()

# Function to monitor memory usage
def monitor_memory_usage(stop_event=None, interval=1):
    """Print this process's resident memory periodically, then its peak.

    Args:
        stop_event: Optional ``threading.Event``. When given, sampling ends
            once the event is set and the peak is printed. When omitted the
            loop never ends, which is what the original always did.
        interval: Seconds to wait between two samples.
    """
    import sys  # local import: only needed for the ru_maxrss unit check

    # One process handle is enough; the original rebuilt it every sample.
    process = psutil.Process(os.getpid())

    # Highest RSS seen by the sampler, in bytes.
    sampled_peak = 0

    while stop_event is None or not stop_event.is_set():
        current_memory = process.memory_info().rss
        sampled_peak = max(sampled_peak, current_memory)

        # Print the current memory usage
        print(f"Current memory usage: {current_memory / 1024} KB")

        if stop_event is None:
            time.sleep(interval)
        else:
            # wait() returns as soon as the event is set, so stopping does
            # not have to sit out the rest of the interval.
            stop_event.wait(interval)

    # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux;
    # normalise to bytes before converting so the "KB" label is correct.
    max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform != "darwin":
        max_rss *= 1024

    # The sampler can miss spikes between samples, so take the larger value.
    peak_memory = max(sampled_peak, max_rss) / 1024  # in kilobytes

    # Print the peak memory usage
    print(f"Peak memory usage: {peak_memory:.2f} KB")


# Create a separate thread to monitor memory usage. It is a daemon thread so
# that it can never keep the interpreter alive on its own.
stop_monitoring = threading.Event()
memory_thread = threading.Thread(
    target=monitor_memory_usage,
    args=(stop_monitoring,),
    daemon=True,
)
memory_thread.start()

try:
    # Call the function `func()` or perform your application tasks
    # func()
    # udf()
    # create_db()
    pass
finally:
    # Stop the sampler and wait for it so the peak report is printed before
    # the script exits, even if the workload raised.
    stop_monitoring.set()
    memory_thread.join()
	import threading
	import psutil
	import resource
	import duckdb
	import pyarrow as pa
	import os
	import time

	def create_db():
	    """Create the ``strings`` benchmark table in the on-disk ``duck.db`` file.

	    The table is filled from ``range(10000000)``: row ``i`` holds the letter
	    ``chr(65 + i % 26)`` repeated ``4 + i % 12`` times, except that every
	    non-zero multiple of 42 is stored as NULL.

	    NOTE(review): presumably this fails if ``strings`` already exists in
	    ``duck.db`` -- confirm before running it twice against the same file.
	    """
	    # Use the connection as a context manager so it is closed (and duck.db is
	    # released) even if the query raises; the original never closed it.
	    with duckdb.connect("duck.db") as con:
	        con.sql("""
	    select
	        case when i != 0 and i % 42 = 0
	        then
	            NULL
	        else
	            repeat(chr((65 + (i % 26))::INTEGER), (4 + (i % 12))) end
	        from range(10000000) tbl(i);""").to_table("strings")

	def string_length_arrow(x):
	    """Return the length of each element of *x* as an Arrow ``int32`` array.

	    Args:
	        x: An iterable, sized Arrow container whose elements expose
	            ``as_py()`` (this is what DuckDB hands to a ``type='arrow'`` UDF).

	    Returns:
	        A ``pyarrow`` array with ``len(x)`` entries; null elements count as
	        length 0.

	    NOTE(review): the element-by-element Python loop looks like the thing
	    this benchmark is measuring -- confirm before vectorizing it.
	    """
	    # Convert each Arrow scalar to Python exactly once; the original called
	    # as_py() twice per element and compared against None with `!=`.
	    py_values = (scalar.as_py() for scalar in x)
	    lengths = [0 if value is None else len(value) for value in py_values]
	    return pa.array(lengths, type=pa.int32(), size=len(x))

	def udf():
	    """Sum the string lengths of ``strings`` through an Arrow Python UDF.

	    Registers ``string_length_arrow`` on a fresh ``duck.db`` connection as
	    the scalar function ``strlen_arrow`` (one VARCHAR argument, ``int``
	    return type, ``type='arrow'``) and runs the aggregate query with it.

	    Returns:
	        The rows produced by ``fetchall()``. The original discarded them and
	        returned ``None``; returning them matches ``func()`` and is harmless
	        for callers that ignore the result.
	    """
	    # Context manager closes the connection even if the query raises; the
	    # original leaked it. fetchall() runs before the connection is closed.
	    with duckdb.connect("duck.db") as con:
	        con.create_function('strlen_arrow', string_length_arrow, ['VARCHAR'], int, type='arrow')
	        return con.sql("select sum(strlen_arrow(i)) from strings tbl(i)").fetchall()

	def func():
	    """Sum the string lengths of ``strings`` by computing them in Python.

	    Fetches column ``i`` of ``strings`` as Arrow data, computes every length
	    in a Python loop (nulls count as 0), wraps the lengths in a one-column
	    Arrow table and lets DuckDB sum that column.

	    Returns:
	        The rows produced by ``fetchall()`` for the final ``sum(i)`` query.
	    """
	    # Context manager closes the connection even on error; the original
	    # never closed it. fetchall() runs before the connection is closed.
	    with duckdb.connect("duck.db") as con:
	        arrow_table = con.sql("select i from strings tbl(i)").arrow()
	        arrow_column = arrow_table['i']

	        # Convert each Arrow scalar once (the original called as_py() twice
	        # per element). The loop is kept inline rather than delegated so the
	        # intermediate list stays alive exactly as long as it did before.
	        py_values = (scalar.as_py() for scalar in arrow_column)
	        values = [0 if value is None else len(value) for value in py_values]
	        array = pa.array(values, type=pa.int32(), size=len(arrow_column))

	        # The SQL below refers to this local variable by name, so it must
	        # stay called `arrow_tbl`.
	        arrow_tbl = pa.Table.from_arrays([array], names=['i'])
	        return con.sql("select sum(i) from arrow_tbl").fetchall()

	# Function to monitor memory usage
	def monitor_memory_usage(stop_event=None, interval=1):
	    """Print this process's resident memory periodically, then its peak.

	    Args:
	        stop_event: Optional ``threading.Event``. When given, sampling ends
	            once the event is set and the peak is printed. When omitted the
	            loop never ends, which is what the original always did.
	        interval: Seconds to wait between two samples.
	    """
	    import sys  # local import: only needed for the ru_maxrss unit check

	    # One process handle is enough; the original rebuilt it every sample.
	    process = psutil.Process(os.getpid())

	    # Highest RSS seen by the sampler, in bytes.
	    sampled_peak = 0

	    while stop_event is None or not stop_event.is_set():
	        current_memory = process.memory_info().rss
	        sampled_peak = max(sampled_peak, current_memory)

	        # Print the current memory usage
	        print(f"Current memory usage: {current_memory / 1024} KB")

	        if stop_event is None:
	            time.sleep(interval)
	        else:
	            # wait() returns as soon as the event is set, so stopping does
	            # not have to sit out the rest of the interval.
	            stop_event.wait(interval)

	    # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux;
	    # normalise to bytes before converting so the "KB" label is correct.
	    max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
	    if sys.platform != "darwin":
	        max_rss *= 1024

	    # The sampler can miss spikes between samples, so take the larger value.
	    peak_memory = max(sampled_peak, max_rss) / 1024  # in kilobytes

	    # Print the peak memory usage
	    print(f"Peak memory usage: {peak_memory:.2f} KB")


	# Create a separate thread to monitor memory usage. It is a daemon thread so
	# that it can never keep the interpreter alive on its own.
	stop_monitoring = threading.Event()
	memory_thread = threading.Thread(
	    target=monitor_memory_usage,
	    args=(stop_monitoring,),
	    daemon=True,
	)
	memory_thread.start()

	try:
	    # Call the function `func()` or perform your application tasks
	    # func()
	    # udf()
	    # create_db()
	    pass
	finally:
	    # Stop the sampler and wait for it so the peak report is printed before
	    # the script exits, even if the workload raised.
	    stop_monitoring.set()
	    memory_thread.join()