# Gist by @pdet (pdet/f19befb3efc6e069f42a7d6094974e8a), created May 25, 2023 13:20.
import threading
import psutil
import resource
import duckdb
import pyarrow as pa
import os
import time
def create_db():
    """Create the ``strings`` benchmark table in the on-disk database ``duck.db``.

    The table has a single column with 10,000,000 rows. Row ``i`` holds the
    letter ``chr(65 + i % 26)`` repeated ``4 + i % 12`` times, except that every
    non-zero multiple of 42 is NULL.

    NOTE(review): ``to_table`` creates a new table, so this presumably fails if
    ``strings`` already exists in ``duck.db`` -- confirm before re-running.
    """
    query = """
        select
            case when i != 0 and i % 42 = 0
            then
                NULL
            else
                repeat(chr((65 + (i % 26))::INTEGER), (4 + (i % 12))) end
        from range(10000000) tbl(i);"""
    # The context manager closes the connection (releasing the database file)
    # even when the query raises.
    with duckdb.connect("duck.db") as con:
        con.sql(query).to_table("strings")
def string_length_arrow(x):
    """Return the length of every string in *x* as an Arrow ``int32`` array.

    Args:
        x: A sized iterable of Arrow scalars (anything whose elements expose
            ``as_py()``), e.g. the Arrow array handed to an Arrow-type UDF.

    Returns:
        A ``pyarrow`` ``int32`` array with ``len(x)`` entries. NULL inputs are
        mapped to ``0`` rather than to NULL.
    """
    # Convert each Arrow scalar to a Python object exactly once; the lazy
    # generator avoids materialising an intermediate list of strings.
    py_values = (scalar.as_py() for scalar in x)
    lengths = [0 if value is None else len(value) for value in py_values]
    return pa.array(lengths, type=pa.int32(), size=len(x))
def udf():
    """Run the string-length benchmark through a DuckDB Arrow UDF.

    Registers ``string_length_arrow`` as the SQL function ``strlen_arrow`` and
    sums it over the ``strings`` table in ``duck.db``. The query result is
    fetched and discarded; the function exists only to exercise the UDF path
    while memory is being monitored.
    """
    # Close the connection deterministically, even if the query fails.
    with duckdb.connect("duck.db") as con:
        con.create_function(
            'strlen_arrow', string_length_arrow, ['VARCHAR'], int, type='arrow'
        )
        con.sql("select sum(strlen_arrow(i)) from strings tbl(i)").fetchall()
def func():
    """Run the string-length benchmark without a UDF.

    Pulls the ``strings`` table out of ``duck.db`` as an Arrow table, computes
    the string lengths in Python (NULL counts as ``0``), wraps them in a new
    Arrow table and lets DuckDB sum that table.

    Returns:
        The rows returned by ``fetchall()`` for the ``sum(i)`` query.
    """
    # Close the connection deterministically, even if a query fails.
    with duckdb.connect("duck.db") as con:
        arrow_column = con.sql("select i from strings tbl(i)").arrow()['i']
        # Convert each Arrow scalar to a Python object exactly once.
        py_values = (scalar.as_py() for scalar in arrow_column)
        lengths = [0 if value is None else len(value) for value in py_values]
        array = pa.array(lengths, type=pa.int32(), size=len(arrow_column))
        # The local name `arrow_tbl` must not be renamed: the SQL text below
        # refers to the Python variable by that name.
        arrow_tbl = pa.Table.from_arrays([array], names=['i'])
        return con.sql("select sum(i) from arrow_tbl").fetchall()
# Function to monitor memory usage
def monitor_memory_usage(interval=1, stop_event=None):
    """Periodically print this process's resident memory, then its peak.

    Args:
        interval: Seconds to wait between two samples.
        stop_event: Optional ``threading.Event``-like object. When given, the
            loop ends once the event is set and the peak usage is printed.
            When omitted, the function samples forever and never returns.
    """
    import sys  # local import: only needed for the platform check below

    # The process handle does not change, so build it once outside the loop.
    process = psutil.Process(os.getpid())
    sampled_peak = 0  # highest RSS seen by our own sampling, in bytes

    while stop_event is None or not stop_event.is_set():
        current_memory = process.memory_info().rss
        sampled_peak = max(sampled_peak, current_memory)
        print(f"Current memory usage: {current_memory / 1024} KB")

        if stop_event is None:
            time.sleep(interval)
        else:
            # Wakes up early when the event is set, so shutdown is prompt.
            stop_event.wait(interval)

    # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux and
    # the BSDs; normalise to kilobytes before printing.
    max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    peak_memory = max_rss / 1024 if sys.platform == "darwin" else max_rss
    # Fall back to our own samples should the OS figure be lower.
    peak_memory = max(peak_memory, sampled_peak / 1024)
    print(f"Peak memory usage: {peak_memory:.2f} KB")
# Start the memory monitor on a separate thread so it samples while the
# workload below runs on the main thread.
# NOTE: the thread is created without daemon=True and the monitor loops
# indefinitely, so the process keeps running (and printing) until it is
# interrupted.
memory_thread = threading.Thread(target=monitor_memory_usage)
memory_thread.start()
# Uncomment the workload to measure. func() and udf() read the `strings`
# table, so create_db() has to have been run once beforehand.
# func()
# udf()
# create_db()
# (End of gist; GitHub page footer text omitted.)