Skip to content

Instantly share code, notes, and snippets.

@jerrywdlee
Created November 20, 2020 15:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jerrywdlee/55ba403f02651afc67dbda8185329780 to your computer and use it in GitHub Desktop.
Save jerrywdlee/55ba403f02651afc67dbda8185329780 to your computer and use it in GitHub Desktop.
ヘッダーが日本語の巨大CSVを取り込んでみる
# See: https://qiita.com/jerrywdlee/items/b9b15380ac3fe87c68ad
require 'benchmark'
require 'csv'
# Benchmark harness for CSV files whose header row is written in Japanese.
#
# It can generate a dummy file and then load it back through several CSV
# APIs. Every public method prints the elapsed wall-clock time and the
# change in the process's resident set size around the operation, and
# returns nil.
class LargeUnicodeCsv
  # File used when the caller does not pass a path.
  DEFAULT_PATH = 'dummy_data.csv'

  # Header row written by .generate.
  HEADERS_JP = %w[ID 名前 フリガナ 年齢 血液型 都道府県 携帯キャリア].freeze
  # ASCII replacement for each entry of HEADERS_JP, in the same order.
  HEADERS_EN = %w[id name kana age blood state carrier].freeze
  # Japanese header => English header.
  HEADER_MAP = HEADERS_JP.zip(HEADERS_EN).to_h.freeze
  # Header converter for CSV. Headers missing from HEADER_MAP are passed
  # through unchanged instead of being replaced by nil.
  HEADER_CONVERTER = ->(header) { HEADER_MAP.fetch(header, header) }

  # Value pools for the randomised columns.
  BLOOD_TYPES = %w[A B O AB].freeze
  CARRIERS = %w[ドコモ au ソフトバンク].freeze

  # Encodings are spelled out so the header lookup does not depend on the
  # locale's default external encoding. The read side also skips a leading
  # UTF-8 byte order mark if one is present.
  WRITE_ENCODING = 'UTF-8'
  READ_ENCODING = 'bom|utf-8'

  # Unit suffixes for the file size report, in powers of 1024.
  SIZE_UNITS = %w[B K M G T].freeze

  class << self
    # Writes a dummy CSV file with Japanese headers and prints its size.
    #
    # @param cnt [Integer] number of data rows to write
    # @param path [String] destination file; overwritten if it exists
    # @return [nil]
    def generate(cnt = 1_000_000, path = DEFAULT_PATH)
      exec_benchmark do
        CSV.open(path, 'w', encoding: WRITE_ENCODING, write_headers: true, headers: HEADERS_JP) do |csv|
          cnt.times do |i|
            csv << [i, '打見 花子', 'ダミ ハナコ', rand(100), BLOOD_TYPES.sample, '東京都', CARRIERS.sample]
          end
        end
        # File.size avoids spawning `ls | awk` and any shell quoting of path.
        puts "File size: #{human_size(File.size(path))}"
      end
    end

    # Loads the whole file with CSV.table and prints the headers and the
    # first row.
    #
    # CSV.table applies the built-in :symbol header converter.
    # NOTE(review): that converter appears to strip non-ASCII characters, so
    # the Japanese headers probably do not survive it; confirm before
    # relying on the symbolised names.
    #
    # @param path [String] CSV file to read
    # @return [nil]
    def csv_table(path = DEFAULT_PATH)
      exec_benchmark do
        table = CSV.table(path, encoding: READ_ENCODING)
        p table.headers
        p table[0]
      end
    end

    # Loads the whole file with CSV.read, translating the Japanese headers
    # to English, and prints the headers and the first row. Despite the
    # name, nothing is streamed: the full table is held in memory.
    #
    # @param path [String] CSV file to read
    # @return [nil]
    def csv_each(path = DEFAULT_PATH)
      exec_benchmark do
        table = CSV.read(path, encoding: READ_ENCODING, headers: :first_row,
                               header_converters: HEADER_CONVERTER)
        p table.headers
        p table[0]
      end
    end

    # Streams the file row by row with CSV.foreach, translating the Japanese
    # headers to English, and prints the headers and contents of every row
    # whose id column is "1".
    #
    # @param path [String] CSV file to read
    # @return [nil]
    def csv_foreach(path = DEFAULT_PATH)
      exec_benchmark do
        CSV.foreach(path, encoding: READ_ENCODING, headers: true,
                          header_converters: HEADER_CONVERTER) do |row|
          # Fields are not converted, so the id is still a String here.
          next unless row['id'] == '1'

          p row.headers
          p row
        end
      end
    end

    private

    # Runs the block and prints how much the resident set size changed
    # across it, or "n/a" when the size could not be sampled.
    def print_memory_usage
      before_kb = current_rss_kb
      yield
      after_kb = current_rss_kb

      if before_kb && after_kb
        puts "Memory: #{((after_kb - before_kb) / 1024.0).round(2)}MB"
      else
        puts 'Memory: n/a'
      end
    end

    # Runs the block and prints the elapsed wall-clock time in seconds.
    def print_time_spent(&block)
      elapsed = Benchmark.realtime(&block)
      puts "Time: #{elapsed.round(2)}s"
    end

    # Runs the block under both measurements. The time line is printed
    # first because the timer is the inner wrapper.
    def exec_benchmark(&block)
      print_memory_usage { print_time_spent(&block) }
    end

    # Resident set size of this process as reported by `ps`.
    #
    # @return [Integer, nil] the reported figure (divided by 1024 to get MB
    #   in print_memory_usage), or nil when `ps` is missing or printed no
    #   number
    def current_rss_kb
      digits = `ps -o rss= -p #{Process.pid}`[/\d+/]
      digits && digits.to_i
    rescue SystemCallError
      # Raised when the `ps` executable cannot be started.
      nil
    end

    # Formats a byte count with a 1024-based unit suffix.
    #
    # @param bytes [Integer] non-negative size in bytes
    # @return [String] e.g. "276B", "1.5K", "55.3M"
    def human_size(bytes)
      # Integer comparison picks the unit, which avoids float log rounding.
      exponent = (0...SIZE_UNITS.size).reverse_each.find { |e| bytes >= 1024**e } || 0
      return "#{bytes}B" if exponent.zero?

      format('%.1f%s', bytes.to_f / 1024**exponent, SIZE_UNITS[exponent])
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment