Skip to content

Instantly share code, notes, and snippets.

@stenlarsson
Last active May 24, 2022 12:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stenlarsson/60b1e4e99416738b41ee30e7ba294214 to your computer and use it in GitHub Desktop.
Save stenlarsson/60b1e4e99416738b41ee30e7ba294214 to your computer and use it in GitHub Desktop.
arrow_test_csv.rb creates a CSV with random data used by arrow_memory_leak.rb
import gc
import resource
from pyarrow import csv
import sys
def print_stats():
    """Print the number of GC-tracked objects and the peak resident set size.

    A full garbage collection is forced first so unreachable objects are not
    counted. The memory figure comes from ``ru_maxrss``, i.e. the *peak*
    resident set size of this process, converted to MB.
    """
    gc.collect()
    # getrusage reports ru_maxrss in bytes on macOS but in kilobytes on Linux,
    # so normalise to bytes before converting to MB.
    bytes_per_unit = 1 if sys.platform == 'darwin' else 1024
    peak_rss_bytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * bytes_per_unit
    # gc.get_objects() lists every object tracked by the collector, which is
    # an actual object count (sys.getrefcount(object) is only the reference
    # count of the `object` type).
    print(f'{len(gc.get_objects())} objects, {peak_rss_bytes / (1024 * 1024)} MB')
def main():
    """Read 'arrow_test.csv' with pyarrow ten times, printing stats each time.

    Stats are printed once before the first read and again after every read;
    the parsed table is discarded immediately.
    """
    print_stats()
    remaining = 10
    while remaining > 0:
        csv.read_csv('arrow_test.csv')
        print_stats()
        remaining -= 1


if __name__ == '__main__':
    main()
require 'arrow'
require 'get_process_mem'
# Forces a garbage collection, then prints the :TOTAL entry of
# ObjectSpace.count_objects and the process memory in MB as reported by
# GetProcessMem.
def print_stats
  GC.start
  object_total = ObjectSpace.count_objects[:TOTAL]
  process_mb = GetProcessMem.new.mb
  puts "#{object_total} objects, #{process_mb} MB"
end
# Baseline measurement before anything has been read.
print_stats

# Read the whole CSV ten times through a memory-mapped input stream,
# discarding the result, and print stats after each read.
10.times do
  Arrow::MemoryMappedInputStream.open('arrow_test.csv') { |stream| Arrow::CSVReader.new(stream).read }
  print_stats
end
require 'arrow'
# Look up the 'divide' compute function and the default memory pool once,
# keeping both in instance variables of the top-level object so the rest of
# the script can reach them.
@divide_function = Arrow::Function.find('divide')
@memory_pool = Arrow.default_memory_pool

# Report which allocator backend the pool uses.
pool_backend = @memory_pool.backend_name
puts "backend_name: #{pool_backend}"
# Forces a garbage collection, then prints the bytes currently allocated from
# @memory_pool (integer-divided down to whole MB) together with the :TOTAL
# entry of ObjectSpace.count_objects.
def print_stats
  GC.start
  allocated_mb = @memory_pool.bytes_allocated / (1024 * 1024)
  ruby_objects = ObjectSpace.count_objects[:TOTAL]
  puts "#{allocated_mb} MB allocated, #{ruby_objects} Ruby objects"
end
# Baseline measurement before any CSV has been read.
print_stats

# Each pass re-reads the CSV and divides every column by 1e6, then prints the
# stats so that growth from one pass to the next is visible.
10.times do
  Arrow::MemoryMappedInputStream.open('arrow_test.csv') do |input|
    table = Arrow::CSVReader.new(input).read
    # Visit every column by index; the count comes from the table itself so it
    # does not have to match a hard-coded width.
    table.n_columns.times do |column_index|
      @divide_function.execute([table[column_index], 1e6])
    end
  end
  print_stats
end
require 'csv'
# Write 100,000 rows of 100 `rand` values each to arrow_test.csv, then print
# the resulting file size rounded to whole MB.
output_path = 'arrow_test.csv'

CSV.open(output_path, 'wb') do |writer|
  100_000.times { writer << Array.new(100) { rand } }
end

size_mb = File.size(output_path).fdiv(1024 * 1024).round
puts "File size: #{size_mb} MB"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment