Created
November 20, 2020 15:06
-
-
Save jerrywdlee/55ba403f02651afc67dbda8185329780 to your computer and use it in GitHub Desktop.
ヘッダーが日本語の巨大CSVを取り込んでみる
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See: https://qiita.com/jerrywdlee/items/b9b15380ac3fe87c68ad
require 'benchmark'
require 'csv'
# Benchmark harness for writing and reading a large CSV file whose header row
# is written in Japanese.
#
# Every public class method runs its work inside {exec_benchmark}, so besides
# its own output it prints the elapsed wall-clock time ("Time: ...") and the
# change in the process's resident set size ("Memory: ...") to stdout.
class LargeUnicodeCsv
  # Header row written by {generate} and expected by the readers.
  HEADERS_JP = %w[ID 名前 フリガナ 年齢 血液型 都道府県 携帯キャリア].freeze

  # ASCII replacements for HEADERS_JP, matched position by position.
  HEADERS_EN = %w[id name kana age blood state carrier].freeze

  # Lookup table from a Japanese header to its ASCII replacement.
  HEADER_DICT = HEADERS_JP.zip(HEADERS_EN).to_h.freeze

  # Header converter handed to CSV. A header that is not in HEADER_DICT is
  # passed through unchanged instead of being replaced by nil, so extra
  # columns keep their name.
  HEADER_CONVERTER = ->(header) { HEADER_DICT.fetch(header, header) }

  # Value pools for the randomised columns of the generated rows.
  BLOOD_TYPES = %w[A B O AB].freeze
  CARRIERS = %w[ドコモ au ソフトバンク].freeze

  # Unit suffixes used when printing the generated file's size.
  SIZE_UNITS = %w[B K M G T].freeze

  class << self
    # Writes a dummy CSV file with the Japanese header row followed by +cnt+
    # data rows, then prints the size of the resulting file.
    #
    # @param cnt [Integer] number of data rows to write
    # @param path [String] file to create (overwritten if it already exists)
    # @return [void]
    def generate(cnt = 1_000_000, path = 'dummy_data.csv')
      exec_benchmark do
        CSV.open(path, 'w', write_headers: true, headers: HEADERS_JP) do |csv|
          cnt.times do |i|
            csv << [i, '打見 花子', 'ダミ ハナコ', rand(100), BLOOD_TYPES.sample, '東京都', CARRIERS.sample]
          end
        end
        # Ask Ruby for the size instead of shelling out to `ls | awk`: it is
        # portable and never passes the path through a shell.
        puts "File size: #{format_file_size(File.size(path))}"
      end
    end

    # Loads the whole file with CSV.table and prints the resulting headers and
    # the first row.
    #
    # @param path [String] CSV file to read
    # @return [void]
    def csv_table(path = 'dummy_data.csv')
      exec_benchmark do
        table = CSV.table(path)
        p table.headers
        p table[0]
      end
    end

    # Loads the whole file with CSV.read, renaming the Japanese headers through
    # HEADER_CONVERTER, and prints the headers and the first row. Despite the
    # method name, nothing is streamed: the file is read in one go.
    #
    # @param path [String] CSV file to read
    # @return [void]
    def csv_each(path = 'dummy_data.csv')
      exec_benchmark do
        csv = CSV.read(path, headers: :first_row, header_converters: HEADER_CONVERTER)
        p csv.headers
        p csv[0]
      end
    end

    # Streams the file row by row with CSV.foreach, renaming the Japanese
    # headers through HEADER_CONVERTER, and prints the headers and contents of
    # every row whose id column is the string '1'.
    #
    # @param path [String] CSV file to read
    # @return [void]
    def csv_foreach(path = 'dummy_data.csv')
      exec_benchmark do
        CSV.foreach(path, headers: true, header_converters: HEADER_CONVERTER) do |row|
          next unless row['id'] == '1'

          p row.headers
          p row
        end
      end
    end

    private

    # Runs the block and prints how much the resident set size changed while
    # it ran. The `ps` figure is treated as kilobytes and reported in MB.
    def print_memory_usage
      memory_before = resident_set_size
      yield
      memory_after = resident_set_size
      if memory_before && memory_after
        puts "Memory: #{((memory_after - memory_before) / 1024.0).round(2)}MB"
      else
        puts 'Memory: unavailable (could not run ps)'
      end
    end

    # Reads this process's RSS from `ps`.
    #
    # @return [Integer, nil] the value printed by `ps`, or nil when the command
    #   cannot be launched (for example, no `ps` binary on this system)
    def resident_set_size
      `ps -o rss= -p #{Process.pid}`.to_i
    rescue SystemCallError
      nil
    end

    # Runs the block and prints the wall-clock time it took, in seconds.
    def print_time_spent(&block)
      time = Benchmark.realtime(&block)
      puts "Time: #{time.round(2)}s"
    end

    # Runs the block with both time and memory reporting. The time line is
    # printed first because the timer is the inner wrapper.
    def exec_benchmark(&block)
      print_memory_usage { print_time_spent(&block) }
    end

    # Formats a byte count with a 1024-based unit suffix, e.g. 312 -> "312B",
    # 1536 -> "1.5K".
    #
    # @param bytes [Integer] size in bytes
    # @return [String]
    def format_file_size(bytes)
      # Integer comparison keeps exact powers of 1024 in the right bucket.
      exponent = (0...SIZE_UNITS.size).find { |i| bytes < 1024**(i + 1) } || SIZE_UNITS.size - 1
      return "#{bytes}#{SIZE_UNITS.first}" if exponent.zero?

      "#{(bytes.to_f / 1024**exponent).round(1)}#{SIZE_UNITS[exponent]}"
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment