Created
June 23, 2020 08:41
-
-
Save mizukami234/65a54e39fa0c809163e363f56ff9e109 to your computer and use it in GitHub Desktop.
タグ付け実装別検証
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'pg' | |
require 'json' | |
require 'securerandom' | |
require 'csv' | |
@conn = PG.connect dbname: 'test' | |
# | |
# 準備 | |
# | |
# initdb -D data | |
# pg_ctl -D data start | |
# createdb test | |
# | |
sqls = <<~SQL.split(';') | |
DROP TABLE IF EXISTS taggings; | |
DROP TABLE IF EXISTS tags; | |
DROP TABLE IF EXISTS records; | |
DROP SEQUENCE IF EXISTS records_id_seq; | |
DROP SEQUENCE IF EXISTS tags_id_seq; | |
DROP SEQUENCE IF EXISTS taggings_id_seq; | |
CREATE SEQUENCE records_id_seq START 1 INCREMENT 1 | |
; | |
CREATE SEQUENCE tags_id_seq START 1 INCREMENT 1 | |
; | |
CREATE SEQUENCE taggings_id_seq START 1 INCREMENT 1 | |
; | |
CREATE TABLE records ( | |
id int NOT NULL PRIMARY KEY DEFAULT nextval('records_id_seq'), | |
tags_obj jsonb NOT NULL, | |
tags_ary jsonb NOT NULL, | |
tags_ts tsvector NOT NULL | |
) | |
; | |
CREATE TABLE tags ( | |
id int NOT NULL PRIMARY KEY DEFAULT nextval('tags_id_seq'), | |
name varchar(255) NOT NULL | |
) | |
; | |
CREATE TABLE taggings ( | |
id int NOT NULL PRIMARY KEY DEFAULT nextval('taggings_id_seq'), | |
tag_id int REFERENCES tags(id) NOT NULL, | |
tagger_id int NOT NULL, | |
record_id int REFERENCES records(id) NOT NULL | |
) | |
; | |
CREATE INDEX record_pkey ON records USING BTREE(id) | |
; | |
CREATE INDEX tagging_pkey ON taggings USING BTREE(id) | |
; | |
CREATE INDEX tag_pkey ON tags USING BTREE(id) | |
; | |
CREATE INDEX tagging_tag ON taggings USING BTREE(tag_id) | |
; | |
CREATE INDEX tagging_tagger ON taggings USING BTREE(tagger_id) | |
; | |
CREATE INDEX tagging_record ON taggings USING BTREE(record_id) | |
; | |
CREATE INDEX tag_index1 ON records USING GIN(tags_obj) | |
; | |
CREATE INDEX tag_index2 ON records USING GIN(tags_ary) | |
; | |
CREATE INDEX tag_index3 ON records USING GIN(tags_ts) | |
; | |
SQL | |
sqls.each do |sql| | |
puts sql | |
@conn.exec(sql) | |
end | |
# タグ名の生成 | |
tag_names = Array.new(100) { SecureRandom.alphanumeric(10) } | |
# ユーザIDの生成 | |
@user_ids = (1..100).to_a | |
# tagsテーブルへのレコード生成 | |
tag_values = tag_names.map { |tag| "('#{tag}')" }.join(', ') | |
tag_insert = "INSERT INTO tags(name) VALUES #{tag_values} RETURNING id, name" | |
@tags = [] | |
@conn.exec(tag_insert) do |result| | |
result.each do |row| | |
@tags << { id: row['id'].to_i, name: row['name'] } | |
end | |
end | |
# リソース生成の関数 | |
def insert_resource(n) | |
puts "inserting #{n} records" | |
tag_matrix = Array.new(n) { @tags.sample(10).map { |tag| tag.merge(tagger_id: @user_ids.sample) } } | |
record_values = tag_matrix.map do |tag_list| | |
obj = {}.tap do |h| | |
tag_list.each { |tag| h[tag[:name]] = tag[:tagger_id] } | |
end.to_json | |
ary = [].tap do |h| | |
tag_list.each { |tag| h << tag[:name] } | |
end.to_json | |
ts = tag_list.map { |tag| tag[:name] }.join(' ') | |
"('#{obj}'::jsonb, '#{ary}'::jsonb, '#{ts}'::tsvector)" | |
end.join(', ') | |
record_insert = <<~SQL | |
INSERT INTO records(tags_obj, tags_ary, tags_ts) VALUES #{record_values} RETURNING id | |
SQL | |
record_ids = [] | |
@conn.exec(record_insert) do |res| | |
res.each_with_index do |row, index| | |
record_ids << row['id'].to_i | |
end | |
end | |
# taggingのレコード生成 | |
tagging_values = record_ids.each_with_index.flat_map do |rid, index| | |
tag_matrix[index].map do |tag| | |
"(#{rid}, #{tag[:id]}, #{tag[:tagger_id]})" | |
end | |
end.join(', ') | |
tagging_insert = <<~SQL | |
INSERT INTO taggings(record_id, tag_id, tagger_id) VALUES #{tagging_values} | |
SQL | |
@conn.exec(tagging_insert) | |
end | |
def explain(sql) | |
<<~SQL | |
EXPLAIN (FORMAT JSON) #{sql} | |
SQL | |
end | |
def analyze(sql) | |
<<~SQL | |
EXPLAIN ANALYZE #{sql} | |
SQL | |
end | |
def report(name, examples) | |
puts "reporting" | |
# 結果が一致することを確認しておく | |
params = yield | |
results = examples.map do |key, sql| | |
@conn.exec_params(sql, params) do |res| | |
res.map { |row| row['id'] } | |
end.sort | |
end.uniq | |
if results.size > 1 | |
results.each_with_index do |res, index| | |
puts "result #{index + 1}" | |
puts res | |
end | |
raise 'result not equivalent!' | |
end | |
# 順序依存ノイズを取るためシャッフル | |
examples.to_a.shuffle.map do |key, sql| | |
params = yield | |
exec_times = Array.new(100) do | |
@conn.exec_params(analyze(sql), params) do |res| | |
str = res.map { |x| x['QUERY PLAN'] }.join("\n") | |
matches = str.match(/Execution Time: ([\d.]+) ms/) | |
matches[1].to_f | |
end | |
end | |
scan_plan = @conn.exec_params(explain(sql), params) do |res| | |
JSON.parse(res[0]['QUERY PLAN'])[0]['Plan']['Node Type'] | |
end | |
avg = exec_times.sum / exec_times.size | |
["#{name} / #{key}", [scan_plan, avg]] | |
end.to_h | |
end | |
single_examples = { | |
'Using Intermediate Table' => <<~SQL, | |
SELECT id FROM records | |
WHERE EXISTS ( | |
SELECT FROM taggings | |
INNER JOIN tags ON tags.id = taggings.tag_id | |
WHERE (tags.name = $1) AND taggings.record_id = records.id | |
) | |
SQL | |
'Using jsonb (Object)' => <<~SQL, | |
SELECT id FROM records WHERE tags_obj ? $1 | |
SQL | |
'Using jsonb (Array)' => <<~SQL, | |
SELECT id FROM records WHERE tags_ary ? $1 | |
SQL | |
'Using tsvector' => <<~SQL, | |
SELECT id FROM records WHERE tags_ts @@ ($1)::tsquery | |
SQL | |
} | |
or_examples = { | |
'Using Intermediate Table' => <<~SQL, | |
SELECT id FROM records | |
WHERE EXISTS ( | |
SELECT FROM taggings | |
INNER JOIN tags ON tags.id = taggings.tag_id | |
WHERE (tags.name = $1 OR tags.name = $2) AND taggings.record_id = records.id | |
) | |
SQL | |
'Using jsonb (Object)' => <<~SQL, | |
SELECT id FROM records WHERE tags_obj ? $1 OR tags_obj ? $2 | |
SQL | |
'Using jsonb (Array)' => <<~SQL, | |
SELECT id FROM records WHERE tags_ary ? $1 OR tags_ary ? $2 | |
SQL | |
'Using tsvector' => <<~SQL, | |
SELECT id FROM records WHERE tags_ts @@ ($1 || ' | ' || $2)::tsquery | |
SQL | |
} | |
tagger_examples = { | |
'Using Intermediate Table' => <<~SQL, | |
SELECT id FROM records | |
WHERE EXISTS ( | |
SELECT FROM taggings | |
INNER JOIN tags ON tags.id = taggings.tag_id | |
WHERE tags.name = $1 AND taggings.tagger_id = $2 AND taggings.record_id = records.id | |
) | |
SQL | |
'Using jsonb (Object)' => <<~SQL, | |
SELECT id FROM records WHERE tags_obj ? $1 AND tags_obj ->> $1 = $2::text | |
SQL | |
} | |
size = 2**8 | |
interval = 1000 | |
MAX_N = 18 | |
data = {} | |
MAX_N.times do |n| | |
puts "Running N=#{n}" | |
@conn.exec('VACUUM FULL') # 関係するか不明だがノイズを防ぐためにvacuumしておく | |
insert_resource(2**n) | |
h = {} | |
h.merge!(report('Filter by single tag', single_examples) do | |
[@tags.sample[:name]] | |
end) | |
h.merge!(report('Filter by two tags (OR)', or_examples) do | |
@tags.sample(2).map { |tag| tag[:name] } | |
end) | |
h.merge!(report('Filter by tag AND tagger_id', tagger_examples) do | |
[@tags.sample[:name], @user_ids.sample] | |
end) | |
h.map do |key, values| | |
data[key] ||= {} | |
data[key][n] = values | |
end | |
end | |
CSV.open('result.csv', 'wb') do |csv| | |
headers = (0...MAX_N).map { |n| "average (N=#{n})"} + (0...MAX_N).map { |n| "method (N=#{n})"} | |
csv << ['N', *headers] | |
data.each do |key, vh| | |
values = (0...MAX_N).map { |n| vh[n][1] } | |
methods = (0...MAX_N).map { |n| vh[n][0] } | |
csv << [key, *values, *methods] | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment