Skip to content

Instantly share code, notes, and snippets.

@heridev
Forked from DmitryTsepelev/0_unique_data.md
Last active March 9, 2022 16:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save heridev/9ffdea8fff195ee008b73f22b4b61ad6 to your computer and use it in GitHub Desktop.
Save heridev/9ffdea8fff195ee008b73f22b4b61ad6 to your computer and use it in GitHub Desktop.
Active record duplicates, prevent duplicates, benchmark on duplicates with active record, single file ruby with rails active record benchmark, single file benchmarking active record duplicates

Original blog post

https://evilmartians.com/chronicles/one-row-many-threads-how-to-avoid-database-duplicates-in-rails-applications

How to run examples

  1. Run $ createdb uniq-db-test to create DB
  2. Run example with Ruby (e.g., $ ruby 1_find_or_create_by_single_thread.rb)

Benchmark output

With many successful INSERTs

Warming up --------------------------------------
  create_or_find_by!            58.000  i/100ms
with_lock + find_or_create_by!  33.000  i/100ms
    create! + rescue            52.000  i/100ms
Calculating -------------------------------------
  create_or_find_by!            694.517  (±22.9%) i/s -      9.570k in  15.013421s
with_lock + find_or_create_by!  358.963  (±16.2%) i/s -      5.181k in  15.053770s
    create! + rescue            803.886  (±15.7%) i/s -     11.648k in  15.055700s

Comparison:
    create! + rescue:           803.9 i/s
  create_or_find_by!:           694.5 i/s - same-ish: difference falls within error
with_lock + find_or_create_by!: 359.0 i/s - 2.24x  (± 0.00) slower

When row already exists

Warming up --------------------------------------
  create_or_find_by!            57.000  i/100ms
with_lock + find_or_create_by!  59.000  i/100ms
    create! + rescue            50.000  i/100ms
Calculating -------------------------------------
  create_or_find_by!            525.534  (±12.7%) i/s -      7.695k in  15.003879s
with_lock + find_or_create_by!  530.299  (±21.1%) i/s -      7.257k in  15.065549s
    create! + rescue            491.853  (± 6.1%) i/s -      7.350k in  15.000505s

Comparison:
with_lock + find_or_create_by!: 530.3 i/s
  create_or_find_by!:           525.5 i/s - same-ish: difference falls within error
    create! + rescue:           491.9 i/s - same-ish: difference falls within error
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test")
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
end
# -------------
TimeTrack.delete_all
User.delete_all
user = User.create
100.times do
date = Date.new(2020, 1, rand(1..10))
user.time_tracks.create_with(hours: 10).find_or_create_by!(date: date, task_id: 1)
end
not_unique_count =
TimeTrack.from(
TimeTrack.group(:date, :user_id).having('COUNT(*) > 1').select(:date, :user_id)
).select('COUNT(subquery.date) AS not_unique').to_a.first.not_unique
puts "\nFound #{not_unique_count} not unique tracks"
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test", pool: 100)
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
end
# -------------
TimeTrack.delete_all
User.delete_all
threads = []
wait_for_it = true
user = User.create
20.times do
date = Date.new(2020, 1, rand(1..4))
threads << Thread.new(date) do
true while wait_for_it
user.time_tracks.create_with(hours: 10).find_or_create_by!(date: date, task_id: 1)
end
end
wait_for_it = false
threads.each(&:join)
not_unique_count =
TimeTrack.from(
TimeTrack.group(:date, :user_id).having('COUNT(*) > 1').select(:date, :user_id)
).select('COUNT(subquery.date) AS not_unique').to_a.first.not_unique
puts "\nFound #{not_unique_count} not unique tracks"
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test", pool: 100)
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
end
# -------------
TimeTrack.delete_all
User.delete_all
threads = []
wait_for_it = true
user = User.create
20.times do
date = Date.new(2020, 1, rand(1..4))
threads << Thread.new(date) do
true while wait_for_it
user.with_lock do
user.time_tracks.create_with(hours: 10).find_or_create_by!(date: date, task_id: 1)
end
end
end
wait_for_it = false
threads.each(&:join)
not_unique_count =
TimeTrack.from(
TimeTrack.group(:date, :user_id).having('COUNT(*) > 1').select(:date, :user_id)
).select('COUNT(subquery.date) AS not_unique').to_a.first.not_unique
puts "\nFound #{not_unique_count} not unique tracks"
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test", pool: 100)
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.index ["user_id", "task_id", "date"], name: "index_time_tracks_on_keys_and_date", unique: true
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
end
# -------------
TimeTrack.delete_all
User.delete_all
threads = []
wait_for_it = true
user = User.create
20.times do
date = Date.new(2020, 1, rand(1..4))
threads << Thread.new(date) do
true while wait_for_it
user.time_tracks.create_with(hours: 10).create_or_find_by!(date: date, task_id: 1)
end
end
wait_for_it = false
threads.each(&:join)
not_unique_count =
TimeTrack.from(
TimeTrack.group(:date, :user_id).having('COUNT(*) > 1').select(:date, :user_id)
).select('COUNT(subquery.date) AS not_unique').to_a.first.not_unique
puts "\nFound #{not_unique_count} not unique tracks"
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test", pool: 100)
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
validates :task_id, uniqueness: { scope: %i[date user_id] }
end
# -------------
TimeTrack.delete_all
User.delete_all
threads = []
wait_for_it = true
user = User.create
40.times do
date = Date.new(2020, 1, rand(1..4))
threads << Thread.new(date) do
true while wait_for_it
begin
user.time_tracks.create_with(hours: 10).find_or_create_by!(date: date, task_id: 1)
rescue ActiveRecord::RecordInvalid
puts 'RecordInvalid rescued!'
end
end
end
wait_for_it = false
threads.each(&:join)
not_unique_count =
TimeTrack.from(
TimeTrack.group(:date, :user_id).having('COUNT(*) > 1').select(:date, :user_id)
).select('COUNT(subquery.date) AS not_unique').to_a.first.not_unique
puts "\nFound #{not_unique_count} not unique tracks"
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test", pool: 100)
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.index ["user_id", "task_id", "date"], name: "index_time_tracks_on_keys_and_date", unique: true
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
validates :task_id, uniqueness: { scope: %i[date user_id] }
end
# -------------
TimeTrack.delete_all
User.delete_all
user = User.create
# inserting a first record
user.time_tracks.create_with(hours: 10).create_or_find_by!(date: Date.new(2020, 1, 1), task_id: 1)
# trying to add a second one
user.time_tracks.create_with(hours: 10).create_or_find_by!(date: Date.new(2020, 1, 1), task_id: 1)
# => Validation failed: Task has already been taken (ActiveRecord::RecordInvalid)
# trying non–bang version
user.time_tracks.create_with(hours: 10).create_or_find_by(date: Date.new(2020, 1, 1), task_id: 1).inspect
# => #<TimeTrack id: nil, user_id: 1, task_id: 1, hours: 10, date: "2020-01-01", created_at: nil, updated_at: nil>
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test", pool: 100)
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.index ["user_id", "task_id", "date"], name: "index_time_tracks_on_keys_and_date", unique: true
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
end
# -------------
TimeTrack.delete_all
User.delete_all
threads = []
wait_for_it = true
user = User.create
20.times do
date = Date.new(2020, 1, rand(1..4))
threads << Thread.new(date) do
true while wait_for_it
user.time_tracks.upsert(
{
hours: 10,
date: date,
task_id: 1,
created_at: Time.current,
updated_at: Time.current,
},
unique_by: %i[user_id task_id date]
)
end
end
wait_for_it = false
threads.each(&:join)
not_unique_count =
TimeTrack.from(
TimeTrack.group(:date, :user_id).having('COUNT(*) > 1').select(:date, :user_id)
).select('COUNT(subquery.date) AS not_unique').to_a.first.not_unique
puts "\nFound #{not_unique_count} not unique tracks"
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test", pool: 100)
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.index ["user_id", "task_id", "date"], name: "index_time_tracks_on_keys_and_date", unique: true
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
end
# -------------
TimeTrack.delete_all
User.delete_all
threads = []
wait_for_it = true
user = User.create
20.times do
date = Date.new(2020, 1, rand(1..4))
threads << Thread.new(date) do
true while wait_for_it
begin
user.time_tracks.create!(date: date, task_id: 1, hours: 10)
rescue ActiveRecord::RecordNotUnique
user.time_tracks.find_by(date: date, task_id: 1)&.update(hours: 10)
end
end
end
wait_for_it = false
threads.each(&:join)
not_unique_count =
TimeTrack.from(
TimeTrack.group(:date, :user_id).having('COUNT(*) > 1').select(:date, :user_id)
).select('COUNT(subquery.date) AS not_unique').to_a.first.not_unique
puts "\nFound #{not_unique_count} not unique tracks"
begin
require "bundler/inline"
rescue LoadError => e
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler"
raise e
end
gemfile(true) do
source "https://rubygems.org"
gem "rails"
gem "pg"
gem "benchmark-ips"
end
require "active_record"
require "action_controller/railtie"
ActiveRecord::Base.establish_connection(adapter: "postgresql", database: "uniq-db-test", pool: 100)
ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Schema.define do
enable_extension "plpgsql"
create_table "time_tracks", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "task_id", null: false
t.integer "hours", null: false
t.date "date", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.index ["user_id", "task_id", "date"], name: "index_time_tracks_on_keys_and_date", unique: true
end
create_table "users", force: :cascade do |t|
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "time_tracks", "users"
end
class User < ActiveRecord::Base
has_many :time_tracks
end
class TimeTrack < ActiveRecord::Base
end
# -------------
require 'benchmark/ips'
ActiveRecord::Base.logger = nil
User.delete_all
user = User.create
puts "\nWith many successful INSERTs\n"
Benchmark.ips do |x|
x.config(time: 15, warmup: 3)
TimeTrack.delete_all
date = Date.new(2000, 1, 1)
x.report 'create_or_find_by!' do
user.time_tracks.create_with(hours: 10).create_or_find_by!(date: date, task_id: 1)
date += 1.day
end
TimeTrack.delete_all
date = Date.new(2000, 1, 1)
x.report 'with_lock + find_or_create_by!' do
user.with_lock do
user.time_tracks.create_with(hours: 10).find_or_create_by!(date: date, task_id: 1)
date += 1.day
end
end
TimeTrack.delete_all
date = Date.new(2000, 1, 1)
x.report 'create! + rescue' do
date += 1.day
begin
user.time_tracks.create!(date: date, task_id: 1, hours: 10)
rescue ActiveRecord::RecordNotUnique
user.time_tracks.find_by(date: date, task_id: 1)&.update(hours: 10)
end
end
x.compare!
end
TimeTrack.delete_all
puts "\nWhen row already exists\n"
Benchmark.ips do |x|
x.config(time: 15, warmup: 3)
date = Date.new(2020, 1, 1)
user.time_tracks.create_with(hours: 10).create_or_find_by!(date: date, task_id: 1)
x.report 'create_or_find_by!' do
user.time_tracks.create_with(hours: 10).create_or_find_by!(date: date, task_id: 1)
end
x.report 'with_lock + find_or_create_by!' do
user.with_lock do
user.time_tracks.create_with(hours: 10).find_or_create_by!(date: date, task_id: 1)
end
end
x.report 'create! + rescue' do
user.time_tracks.create!(date: date, task_id: 1, hours: 10)
rescue ActiveRecord::RecordNotUnique
user.time_tracks.find_by(date: date, task_id: 1)&.update(hours: 10)
end
x.compare!
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment