Created
May 25, 2023 13:20
-
-
Save pdet/f19befb3efc6e069f42a7d6094974e8a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import threading | |
import psutil | |
import resource | |
import duckdb | |
import pyarrow as pa | |
import os | |
import time | |
def create_db():
    """Create the ``strings`` benchmark table in the ``duck.db`` database file.

    The table is built from ``range(10000000)``: row ``i`` holds the letter
    ``chr(65 + i % 26)`` repeated ``4 + i % 12`` times, except that every
    non-zero multiple of 42 is NULL.
    """
    # Use the connection as a context manager so it is closed even when the
    # query fails; the original left the connection open.
    with duckdb.connect("duck.db") as con:
        con.sql("""
select
case when i != 0 and i % 42 = 0
then
NULL
else
repeat(chr((65 + (i % 26))::INTEGER), (4 + (i % 12))) end
from range(10000000) tbl(i);""").to_table("strings")
def string_length_arrow(x):
    """Return the length of every string in ``x`` as an int32 Arrow array.

    Args:
        x: A sized, iterable collection whose elements expose ``as_py()``
            (presumably a pyarrow string array -- confirm against callers).

    Returns:
        A ``pa.int32()`` array with ``len(x)`` entries; an element whose
        Python value is ``None`` contributes a length of 0.
    """
    # Convert each scalar to a Python object exactly once (the conversion was
    # previously done twice per element) and test for None by identity.
    lengths = [
        0 if text is None else len(text)
        for text in (scalar.as_py() for scalar in x)
    ]
    return pa.array(lengths, type=pa.int32(), size=len(x))
def udf():
    """Sum the string lengths of the ``strings`` table using an Arrow UDF.

    Registers ``string_length_arrow`` as the DuckDB function ``strlen_arrow``
    and evaluates it over every row of ``strings`` inside the database.

    Returns:
        The rows produced by ``fetchall()`` for the aggregate query.
    """
    # The context manager closes the connection once the result has been
    # materialised; the original left the connection open.
    with duckdb.connect("duck.db") as con:
        con.create_function('strlen_arrow', string_length_arrow, ['VARCHAR'], int, type='arrow')
        # Return the result (previously discarded) so callers can check it
        # against func().
        return con.sql("select sum(strlen_arrow(i)) from strings tbl(i)").fetchall()
def func():
    """Sum the string lengths of the ``strings`` table outside the database.

    Fetches the column as an Arrow table, computes the lengths in Python with
    ``string_length_arrow``, wraps them in a new Arrow table and lets DuckDB
    sum that table.

    Returns:
        The rows produced by ``fetchall()`` for the aggregate query.
    """
    # The context manager closes the connection once the result has been
    # materialised; the original left the connection open.
    with duckdb.connect("duck.db") as con:
        arrow_table = con.sql("select i from strings tbl(i)").arrow()
        # Reuse the shared helper rather than duplicating its loop here.
        lengths = string_length_arrow(arrow_table['i'])
        # The query below refers to this table by the name "arrow_tbl", so the
        # local variable must keep exactly that name.
        arrow_tbl = pa.Table.from_arrays([lengths], names=['i'])
        return con.sql("select sum(i) from arrow_tbl").fetchall()
# Function to monitor memory usage
def monitor_memory_usage(stop_event=None, interval=1):
    """Periodically print this process's resident memory, then its peak.

    Args:
        stop_event: Optional ``threading.Event``-like object. When given, the
            loop ends as soon as the event is set and the peak is reported.
            When ``None`` (the default) sampling continues indefinitely.
        interval: Seconds to wait between two samples (default 1).
    """
    import sys

    # Build the process handle once instead of on every iteration.
    process = psutil.Process(os.getpid())
    # Track the peak of the sampled values (in bytes)
    peak_memory = 0
    try:
        while stop_event is None or not stop_event.is_set():
            current_memory = process.memory_info().rss
            # Update the peak memory if the current memory is higher
            if current_memory > peak_memory:
                peak_memory = current_memory
            # Print the current memory usage
            print(f"Current memory usage: {current_memory / 1024} KB")
            if stop_event is None:
                time.sleep(interval)
            else:
                # Waiting on the event lets a stop request interrupt the pause.
                stop_event.wait(interval)
    finally:
        # Runs when the loop is stopped or interrupted by an exception; in the
        # original this report was unreachable behind ``while True``.
        max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux.
        os_peak_kb = max_rss / 1024 if sys.platform == "darwin" else max_rss
        # Report the larger of the OS high-water mark and the sampled peak.
        peak_kb = max(os_peak_kb, peak_memory / 1024)
        # Print the peak memory usage
        print(f"Peak memory usage: {peak_kb:.2f} KB")
# Create a separate daemon thread to monitor memory usage. The monitor loops
# without end by default, so a non-daemon thread would keep the process alive
# forever after the workload below has finished.
memory_thread = threading.Thread(target=monitor_memory_usage, daemon=True)
memory_thread.start()
# Uncomment one of the calls below (or add your own workload) to profile it
# func()
# udf()
# create_db()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment