# タグ付け実装別検証 — benchmark comparing tagging implementations
# (intermediate table vs jsonb object vs jsonb array vs tsvector)
require 'pg'
require 'json'
require 'securerandom'
require 'csv'

# Single shared connection to the local benchmark database.
@conn = PG.connect(dbname: 'test')
#
# Setup (run once before this script):
#
#   initdb -D data
#   pg_ctl -D data start
#   createdb test
#
# Schema for the strategies under test:
#   * records.tags_obj : jsonb object {tag_name => tagger_id}
#   * records.tags_ary : jsonb array  [tag_name, ...]
#   * records.tags_ts  : tsvector of tag names
#   * tags / taggings  : classic normalized intermediate-table design
#
# NOTE: the original also created record_pkey/tag_pkey/tagging_pkey BTREE
# indexes on id, but PRIMARY KEY already creates a unique btree index, so
# those were redundant (they only slowed down the inserts) and are dropped.
sqls = <<~SQL.split(';')
  DROP TABLE IF EXISTS taggings;
  DROP TABLE IF EXISTS tags;
  DROP TABLE IF EXISTS records;
  DROP SEQUENCE IF EXISTS records_id_seq;
  DROP SEQUENCE IF EXISTS tags_id_seq;
  DROP SEQUENCE IF EXISTS taggings_id_seq;
  CREATE SEQUENCE records_id_seq START 1 INCREMENT 1;
  CREATE SEQUENCE tags_id_seq START 1 INCREMENT 1;
  CREATE SEQUENCE taggings_id_seq START 1 INCREMENT 1;
  CREATE TABLE records (
    id int NOT NULL PRIMARY KEY DEFAULT nextval('records_id_seq'),
    tags_obj jsonb NOT NULL,
    tags_ary jsonb NOT NULL,
    tags_ts tsvector NOT NULL
  );
  CREATE TABLE tags (
    id int NOT NULL PRIMARY KEY DEFAULT nextval('tags_id_seq'),
    name varchar(255) NOT NULL
  );
  CREATE TABLE taggings (
    id int NOT NULL PRIMARY KEY DEFAULT nextval('taggings_id_seq'),
    tag_id int REFERENCES tags(id) NOT NULL,
    tagger_id int NOT NULL,
    record_id int REFERENCES records(id) NOT NULL
  );
  CREATE INDEX tagging_tag ON taggings USING BTREE(tag_id);
  CREATE INDEX tagging_tagger ON taggings USING BTREE(tagger_id);
  CREATE INDEX tagging_record ON taggings USING BTREE(record_id);
  CREATE INDEX tag_index1 ON records USING GIN(tags_obj);
  CREATE INDEX tag_index2 ON records USING GIN(tags_ary);
  CREATE INDEX tag_index3 ON records USING GIN(tags_ts)
SQL
sqls.each do |sql|
  next if sql.strip.empty? # split(';') can leave a blank trailing fragment

  puts sql
  @conn.exec(sql)
end
# Generate 100 random tag names. They are alphanumeric, which is why
# interpolating them into SQL literals below is injection-safe.
tag_names = Array.new(100) { SecureRandom.alphanumeric(10) }
# Pool of tagger (user) ids to sample from.
@user_ids = (1..100).to_a
# Seed the tags table and capture the generated ids alongside the names.
tag_values = tag_names.map { |name| "('#{name}')" }.join(', ')
tag_insert = "INSERT INTO tags(name) VALUES #{tag_values} RETURNING id, name"
@tags = @conn.exec(tag_insert) do |result|
  result.map { |row| { id: row['id'].to_i, name: row['name'] } }
end
# Inserts +n+ benchmark records, each tagged with 10 distinct random
# (tag, tagger) pairs, into every representation at once:
#   * records.tags_obj / tags_ary / tags_ts (denormalized columns)
#   * taggings rows (normalized intermediate table)
# Tag names come from SecureRandom.alphanumeric, so interpolating them
# into SQL is safe here; real applications should use bind parameters.
def insert_resource(n)
  puts "inserting #{n} records"
  # One entry per record: 10 distinct tags, each with a random tagger id.
  tag_matrix = Array.new(n) do
    @tags.sample(10).map { |tag| tag.merge(tagger_id: @user_ids.sample) }
  end
  record_values = tag_matrix.map do |tag_list|
    obj = tag_list.map { |tag| [tag[:name], tag[:tagger_id]] }.to_h.to_json
    ary = tag_list.map { |tag| tag[:name] }.to_json
    ts = tag_list.map { |tag| tag[:name] }.join(' ')
    "('#{obj}'::jsonb, '#{ary}'::jsonb, '#{ts}'::tsvector)"
  end.join(', ')
  record_insert = <<~SQL
    INSERT INTO records(tags_obj, tags_ary, tags_ts) VALUES #{record_values} RETURNING id
  SQL
  record_ids = @conn.exec(record_insert) do |res|
    res.map { |row| row['id'].to_i }
  end
  # Mirror the same tag assignments into the intermediate table.
  tagging_values = record_ids.each_with_index.flat_map do |rid, index|
    tag_matrix[index].map do |tag|
      "(#{rid}, #{tag[:id]}, #{tag[:tagger_id]})"
    end
  end.join(', ')
  tagging_insert = <<~SQL
    INSERT INTO taggings(record_id, tag_id, tagger_id) VALUES #{tagging_values}
  SQL
  @conn.exec(tagging_insert)
end
# Wraps +sql+ in an EXPLAIN that emits a machine-readable JSON plan.
# Returns the statement text (with trailing newline, like the original
# heredoc form).
def explain(sql)
  "EXPLAIN (FORMAT JSON) #{sql}\n"
end
# Wraps +sql+ in an EXPLAIN ANALYZE (executes the query and reports real
# timings as text). Returns the statement text with trailing newline.
def analyze(sql)
  "EXPLAIN ANALYZE #{sql}\n"
end
# Benchmarks every query in +examples+ (a {label => SQL} hash).
# Returns {"#{name} / #{label}" => [plan_node_type, avg_exec_time_ms]}.
# The block supplies the bind parameters; it is re-invoked for each timed
# example so each measurement gets a freshly sampled binding.
# Raises RuntimeError if the example queries disagree on the returned ids.
def report(name, examples)
  puts "reporting"
  # Sanity check first: with one common parameter binding, every strategy
  # must select the same set of record ids.
  check_params = yield
  distinct_results = examples.map do |_label, sql|
    @conn.exec_params(sql, check_params) do |res|
      res.map { |row| row['id'] }
    end.sort
  end.uniq
  if distinct_results.size > 1
    distinct_results.each_with_index do |ids, index|
      puts "result #{index + 1}"
      puts ids
    end
    raise 'result not equivalent!'
  end
  # Shuffle the examples to remove order-dependent noise (cache warm-up).
  examples.to_a.shuffle.map do |label, sql|
    params = yield
    exec_times = Array.new(100) do
      @conn.exec_params(analyze(sql), params) do |res|
        plan_text = res.map { |row| row['QUERY PLAN'] }.join("\n")
        plan_text.match(/Execution Time: ([\d.]+) ms/)[1].to_f
      end
    end
    scan_plan = @conn.exec_params(explain(sql), params) do |res|
      JSON.parse(res[0]['QUERY PLAN'])[0]['Plan']['Node Type']
    end
    avg = exec_times.sum / exec_times.size
    ["#{name} / #{label}", [scan_plan, avg]]
  end.to_h
end
# Single-tag lookup: one equivalent query per storage strategy.
# $1 = tag name.
single_examples = {
  'Using Intermediate Table' => <<~QUERY,
    SELECT id FROM records
    WHERE EXISTS (
      SELECT FROM taggings
      INNER JOIN tags ON tags.id = taggings.tag_id
      WHERE (tags.name = $1) AND taggings.record_id = records.id
    )
  QUERY
  'Using jsonb (Object)' => <<~QUERY,
    SELECT id FROM records WHERE tags_obj ? $1
  QUERY
  'Using jsonb (Array)' => <<~QUERY,
    SELECT id FROM records WHERE tags_ary ? $1
  QUERY
  'Using tsvector' => <<~QUERY,
    SELECT id FROM records WHERE tags_ts @@ ($1)::tsquery
  QUERY
}
# Two-tag OR lookup: a record matches when it carries either tag.
# $1, $2 = two distinct tag names.
or_examples = {
  'Using Intermediate Table' => <<~QUERY,
    SELECT id FROM records
    WHERE EXISTS (
      SELECT FROM taggings
      INNER JOIN tags ON tags.id = taggings.tag_id
      WHERE (tags.name = $1 OR tags.name = $2) AND taggings.record_id = records.id
    )
  QUERY
  'Using jsonb (Object)' => <<~QUERY,
    SELECT id FROM records WHERE tags_obj ? $1 OR tags_obj ? $2
  QUERY
  'Using jsonb (Array)' => <<~QUERY,
    SELECT id FROM records WHERE tags_ary ? $1 OR tags_ary ? $2
  QUERY
  'Using tsvector' => <<~QUERY,
    SELECT id FROM records WHERE tags_ts @@ ($1 || ' | ' || $2)::tsquery
  QUERY
}
# Tag AND tagger filter. Only two strategies can answer this: the jsonb
# array and tsvector columns store tag names alone, so they cannot
# associate a tagger with each tag. $1 = tag name, $2 = tagger id.
tagger_examples = {
  'Using Intermediate Table' => <<~QUERY,
    SELECT id FROM records
    WHERE EXISTS (
      SELECT FROM taggings
      INNER JOIN tags ON tags.id = taggings.tag_id
      WHERE tags.name = $1 AND taggings.tagger_id = $2 AND taggings.record_id = records.id
    )
  QUERY
  'Using jsonb (Object)' => <<~QUERY,
    SELECT id FROM records WHERE tags_obj ? $1 AND tags_obj ->> $1 = $2::text
  QUERY
}
# Benchmark driver: double the data volume each round and re-measure every
# strategy. Round n adds 2**n records, so the table holds 2**(n+1) - 1
# records by the end of round n.
# NOTE: the original defined unused locals `size = 2**8` and
# `interval = 1000`; they were dead code and have been removed.
MAX_N = 18
data = {}
MAX_N.times do |n|
  puts "Running N=#{n}"
  # Unclear whether it matters, but VACUUM between rounds to reduce noise.
  @conn.exec('VACUUM FULL')
  insert_resource(2**n)
  h = {}
  h.merge!(report('Filter by single tag', single_examples) do
    [@tags.sample[:name]]
  end)
  h.merge!(report('Filter by two tags (OR)', or_examples) do
    @tags.sample(2).map { |tag| tag[:name] }
  end)
  h.merge!(report('Filter by tag AND tagger_id', tagger_examples) do
    [@tags.sample[:name], @user_ids.sample]
  end)
  # `each`, not `map`: we only want the side effect of filling `data`.
  h.each do |key, values|
    data[key] ||= {}
    data[key][n] = values
  end
end
# Dump results to result.csv: one row per "example / strategy" pair, with
# the average execution time and the planner's node type for each size N.
CSV.open('result.csv', 'wb') do |csv|
  headers = (0...MAX_N).map { |n| "average (N=#{n})" } +
            (0...MAX_N).map { |n| "method (N=#{n})" }
  # BUG FIX: the first column holds the example name, not N; the original
  # header mislabeled it 'N'.
  csv << ['case', *headers]
  data.each do |key, per_n|
    # per_n[n] is [plan_node_type, avg_ms] (see report).
    averages = (0...MAX_N).map { |n| per_n[n][1] }
    methods = (0...MAX_N).map { |n| per_n[n][0] }
    csv << [key, *averages, *methods]
  end
end