Skip to content

Instantly share code, notes, and snippets.

@kanevk
Last active February 22, 2018 07:39
Show Gist options
  • Save kanevk/e40c38903ee50c2342ceb13521708a06 to your computer and use it in GitHub Desktop.
Save kanevk/e40c38903ee50c2342ceb13521708a06 to your computer and use it in GitHub Desktop.
ActiveRecord::Base.connection.execute(<<-SQL)
DROP TABLE IF EXISTS non_duplicate_brand_tuples;
CREATE TABLE non_duplicate_brand_tuples (
id serial primary key,
main_brand_id integer NOT NULL,
non_duplicate_brand_id integer NOT NULL
);
SQL
class NonDuplicateBrandTuples < ActiveRecord::Base
end
# Seeds
20.downto(1).each_with_index do |first, second|
NonDuplicateBrandTuples.create main_brand_id: first, non_duplicate_brand_id: second
end
def sql_implementation
result =
NonDuplicateBrandTuples.connection.execute(<<~SQL)
WITH all_non_duplicates AS (
SELECT main_brand_id AS main, non_duplicate_brand_id AS non_duplicate
FROM non_duplicate_brand_tuples
UNION
SELECT non_duplicate_brand_id AS main, main_brand_id AS non_duplicate
FROM non_duplicate_brand_tuples
)
SELECT main, array_agg(non_duplicate) FROM all_non_duplicates GROUP BY main;
SQL
result.
values.
to_h.
transform_values { |v| PG::TextDecoder::Array.new.decode(v).map(&:to_i) }
end
def ruby_implementation
NonDuplicateBrandTuples.
pluck(:main_brand_id, :non_duplicate_brand_id).
map { |*tuple| tuple.sort }.
to_set
end
non_duplicates = sql_implementation
# non_duplicates = ruby_implementation # Uncomment for using Ruby implementation
duplicates = []
1.upto(100_000).each_with_index do |remote_brand_id, progress|
puts progress
1.upto(100).each do |brand_id|
next if non_duplicates[brand_id]&.include?(brand_id)
# next if non_duplicates.include? [remote_brand_id, brand_id].sort # Uncomment for using Ruby implementation
duplicates << [remote_brand_id, brand_id]
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment