Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
タグ付け実装別検証
require 'pg'
require 'json'
require 'securerandom'
require 'csv'

#
# Preparation (run once beforehand):
#
# initdb -D data
# pg_ctl -D data start
# createdb test
#
@conn = PG.connect(dbname: 'test')

# Rebuild the schema from scratch. The same tagging data is stored four ways
# so the query strategies can be compared: a normalized tags/taggings pair,
# plus three denormalized columns on records (jsonb object, jsonb array,
# tsvector), each backed by a GIN index.
ddl = <<~SQL
  DROP TABLE IF EXISTS taggings;
  DROP TABLE IF EXISTS tags;
  DROP TABLE IF EXISTS records;
  DROP SEQUENCE IF EXISTS records_id_seq;
  DROP SEQUENCE IF EXISTS tags_id_seq;
  DROP SEQUENCE IF EXISTS taggings_id_seq;
  CREATE SEQUENCE records_id_seq START 1 INCREMENT 1
  ;
  CREATE SEQUENCE tags_id_seq START 1 INCREMENT 1
  ;
  CREATE SEQUENCE taggings_id_seq START 1 INCREMENT 1
  ;
  CREATE TABLE records (
  id int NOT NULL PRIMARY KEY DEFAULT nextval('records_id_seq'),
  tags_obj jsonb NOT NULL,
  tags_ary jsonb NOT NULL,
  tags_ts tsvector NOT NULL
  )
  ;
  CREATE TABLE tags (
  id int NOT NULL PRIMARY KEY DEFAULT nextval('tags_id_seq'),
  name varchar(255) NOT NULL
  )
  ;
  CREATE TABLE taggings (
  id int NOT NULL PRIMARY KEY DEFAULT nextval('taggings_id_seq'),
  tag_id int REFERENCES tags(id) NOT NULL,
  tagger_id int NOT NULL,
  record_id int REFERENCES records(id) NOT NULL
  )
  ;
  CREATE INDEX record_pkey ON records USING BTREE(id)
  ;
  CREATE INDEX tagging_pkey ON taggings USING BTREE(id)
  ;
  CREATE INDEX tag_pkey ON tags USING BTREE(id)
  ;
  CREATE INDEX tagging_tag ON taggings USING BTREE(tag_id)
  ;
  CREATE INDEX tagging_tagger ON taggings USING BTREE(tagger_id)
  ;
  CREATE INDEX tagging_record ON taggings USING BTREE(record_id)
  ;
  CREATE INDEX tag_index1 ON records USING GIN(tags_obj)
  ;
  CREATE INDEX tag_index2 ON records USING GIN(tags_ary)
  ;
  CREATE INDEX tag_index3 ON records USING GIN(tags_ts)
  ;
SQL
# Execute statement by statement so each one is echoed for the log.
ddl.split(';').each do |statement|
  puts statement
  @conn.exec(statement)
end
# Seed data: 100 random 10-char alphanumeric tag names and 100 tagger ids.
# (Alphanumeric names are also what makes the later string-interpolated SQL
# safe from quoting issues.)
tag_names = Array.new(100) { SecureRandom.alphanumeric(10) }
@user_ids = (1..100).to_a

# Bulk-insert the tags; RETURNING hands back the generated ids, which are
# cached in @tags as {id:, name:} hashes for later sampling.
@tags = []
values_clause = tag_names.map { |name| "('#{name}')" }.join(', ')
@conn.exec("INSERT INTO tags(name) VALUES #{values_clause} RETURNING id, name") do |result|
  result.each do |row|
    @tags << { id: row['id'].to_i, name: row['name'] }
  end
end
# Inserts +n+ new records, each carrying 10 distinct random tags assigned by
# random taggers, in all four representations at once:
#   * records.tags_obj - jsonb object {tag_name => tagger_id}
#   * records.tags_ary - jsonb array of tag names
#   * records.tags_ts  - tsvector of tag names
#   * taggings rows    - normalized (record_id, tag_id, tagger_id)
# Relies on @conn, @tags and @user_ids being initialized at file scope.
# NOTE: values are spliced into the SQL by string interpolation; that is
# acceptable here only because tag names are random alphanumerics and the
# ids are integers — use exec_params for anything user-supplied.
def insert_resource(n)
  puts "inserting #{n} records"
  # One entry per new record: 10 sampled tags (sample is without replacement,
  # so names are unique per record), each merged with a random tagger_id.
  tag_matrix = Array.new(n) { @tags.sample(10).map { |tag| tag.merge(tagger_id: @user_ids.sample) } }
  record_values = tag_matrix.map do |tag_list|
    obj = tag_list.each_with_object({}) { |tag, h| h[tag[:name]] = tag[:tagger_id] }.to_json
    ary = tag_list.map { |tag| tag[:name] }.to_json
    ts = tag_list.map { |tag| tag[:name] }.join(' ')
    "('#{obj}'::jsonb, '#{ary}'::jsonb, '#{ts}'::tsvector)"
  end.join(', ')
  record_insert = <<~SQL
    INSERT INTO records(tags_obj, tags_ary, tags_ts) VALUES #{record_values} RETURNING id
  SQL
  # Collect the generated record ids; row order matches the VALUES order,
  # so record_ids lines up index-for-index with tag_matrix.
  record_ids = []
  @conn.exec(record_insert) do |res|
    res.each { |row| record_ids << row['id'].to_i }
  end
  # Mirror the same tag assignments into the normalized taggings table.
  tagging_values = record_ids.each_with_index.flat_map do |rid, index|
    tag_matrix[index].map do |tag|
      "(#{rid}, #{tag[:id]}, #{tag[:tagger_id]})"
    end
  end.join(', ')
  tagging_insert = <<~SQL
    INSERT INTO taggings(record_id, tag_id, tagger_id) VALUES #{tagging_values}
  SQL
  @conn.exec(tagging_insert)
end
# Wraps +sql+ in an EXPLAIN that emits the query plan as JSON
# (machine-readable; used to extract the root plan node type).
def explain(sql)
  "EXPLAIN (FORMAT JSON) #{sql}\n"
end
# Wraps +sql+ in EXPLAIN ANALYZE, which actually executes the query and
# reports its measured "Execution Time".
def analyze(sql)
  "EXPLAIN ANALYZE #{sql}\n"
end
# Benchmarks every query variant in +examples+ ({label => SQL with $1.. binds})
# and returns a hash {"#{name} / #{label}" => [root plan node type, avg ms]}.
# The attached block must yield an array of bind parameters; it is called
# once for the equivalence check and once more per timed variant.
def report(name, examples)
puts "reporting"
# Sanity check first: with one shared parameter set, every variant must
# return the same set of ids — otherwise the timing comparison is meaningless.
params = yield
results = examples.map do |key, sql|
@conn.exec_params(sql, params) do |res|
res.map { |row| row['id'] }
end.sort
end.uniq
if results.size > 1
results.each_with_index do |res, index|
puts "result #{index + 1}"
puts res
end
raise 'result not equivalent!'
end
# Shuffle the variants to remove order-dependent noise (cache warm-up etc.)
examples.to_a.shuffle.map do |key, sql|
params = yield
# Average the reported "Execution Time" over 100 EXPLAIN ANALYZE runs.
exec_times = Array.new(100) do
@conn.exec_params(analyze(sql), params) do |res|
str = res.map { |x| x['QUERY PLAN'] }.join("\n")
matches = str.match(/Execution Time: ([\d.]+) ms/)
matches[1].to_f
end
end
# Root plan node type (e.g. "Seq Scan", "Bitmap Heap Scan") from the
# JSON-format EXPLAIN output.
scan_plan = @conn.exec_params(explain(sql), params) do |res|
JSON.parse(res[0]['QUERY PLAN'])[0]['Plan']['Node Type']
end
avg = exec_times.sum / exec_times.size
["#{name} / #{key}", [scan_plan, avg]]
end.to_h
end
# One query per storage strategy, all answering: "which records carry the tag
# named $1?". Labels become part of the reported result keys.
single_examples = {
# Normalized schema: EXISTS over the taggings/tags join.
'Using Intermediate Table' => <<~SQL,
SELECT id FROM records
WHERE EXISTS (
SELECT FROM taggings
INNER JOIN tags ON tags.id = taggings.tag_id
WHERE (tags.name = $1) AND taggings.record_id = records.id
)
SQL
# jsonb object {name => tagger_id}: `?` tests top-level key existence.
'Using jsonb (Object)' => <<~SQL,
SELECT id FROM records WHERE tags_obj ? $1
SQL
# jsonb array of names: `?` tests top-level element existence.
'Using jsonb (Array)' => <<~SQL,
SELECT id FROM records WHERE tags_ary ? $1
SQL
# tsvector full-text match, casting the tag name to a tsquery.
'Using tsvector' => <<~SQL,
SELECT id FROM records WHERE tags_ts @@ ($1)::tsquery
SQL
}
# "Tagged with $1 OR $2" variants: records carrying at least one of the two
# tag names.
or_examples = {
'Using Intermediate Table' => <<~SQL,
SELECT id FROM records
WHERE EXISTS (
SELECT FROM taggings
INNER JOIN tags ON tags.id = taggings.tag_id
WHERE (tags.name = $1 OR tags.name = $2) AND taggings.record_id = records.id
)
SQL
# jsonb: two `?` existence tests ORed together.
'Using jsonb (Object)' => <<~SQL,
SELECT id FROM records WHERE tags_obj ? $1 OR tags_obj ? $2
SQL
'Using jsonb (Array)' => <<~SQL,
SELECT id FROM records WHERE tags_ary ? $1 OR tags_ary ? $2
SQL
# tsquery "|" means OR: the string "name1 | name2" is built in SQL and cast.
'Using tsvector' => <<~SQL,
SELECT id FROM records WHERE tags_ts @@ ($1 || ' | ' || $2)::tsquery
SQL
}
# Filter by tag name ($1) AND tagger id ($2). Only the normalized schema and
# the jsonb object (which maps tag name => tagger id) can express "who tagged
# it"; the jsonb array and tsvector columns store names only, so they have no
# entry here.
tagger_examples = {
'Using Intermediate Table' => <<~SQL,
SELECT id FROM records
WHERE EXISTS (
SELECT FROM taggings
INNER JOIN tags ON tags.id = taggings.tag_id
WHERE tags.name = $1 AND taggings.tagger_id = $2 AND taggings.record_id = records.id
)
SQL
# `?` checks the key exists, then `->>` extracts the tagger id as text for
# comparison against $2 cast to text.
'Using jsonb (Object)' => <<~SQL,
SELECT id FROM records WHERE tags_obj ? $1 AND tags_obj ->> $1 = $2::text
SQL
}
# Benchmark driver: each iteration inserts 2**n additional records (so the
# cumulative row count grows to 2**MAX_N - 1 by the end) and records, for
# every query variant, its average execution time and scan method.
# (Dropped the unused `size`/`interval` locals left over from an earlier
# revision; nothing in the file referenced them.)
MAX_N = 18
data = {}
MAX_N.times do |n|
  puts "Running N=#{n}"
  @conn.exec('VACUUM FULL') # unclear whether it matters, but vacuum between rounds to reduce noise
  insert_resource(2**n)
  h = {}
  h.merge!(report('Filter by single tag', single_examples) do
    [@tags.sample[:name]]
  end)
  h.merge!(report('Filter by two tags (OR)', or_examples) do
    @tags.sample(2).map { |tag| tag[:name] }
  end)
  h.merge!(report('Filter by tag AND tagger_id', tagger_examples) do
    [@tags.sample[:name], @user_ids.sample]
  end)
  # Pivot into data[query label][n] = [scan method, average ms].
  h.each do |key, values|
    data[key] ||= {}
    data[key][n] = values
  end
end
# Dump the collected results to result.csv: one row per query variant, with
# the per-N average timings followed by the per-N scan methods.
CSV.open('result.csv', 'wb') do |csv|
  average_headers = (0...MAX_N).map { |n| "average (N=#{n})" }
  method_headers = (0...MAX_N).map { |n| "method (N=#{n})" }
  csv << ['N', *average_headers, *method_headers]
  data.each do |key, by_n|
    averages = (0...MAX_N).map { |n| by_n[n][1] }
    scan_methods = (0...MAX_N).map { |n| by_n[n][0] }
    csv << [key, *averages, *scan_methods]
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment