Skip to content

Instantly share code, notes, and snippets.

@eliank
Created May 23, 2016 07:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eliank/f6b30f7ad20567573f86d33738f99ac7 to your computer and use it in GitHub Desktop.
Save eliank/f6b30f7ad20567573f86d33738f99ac7 to your computer and use it in GitHub Desktop.
module ActiveRecord
  module Batches
    # Iterates over the records of this relation one at a time, loading them
    # in batches through #find_in_batches_with_order.
    #
    # options - Hash forwarded unchanged to #find_in_batches_with_order
    #           (:start, :batch_size, :additional_order_column).
    #
    # Yields each record of every batch. Without a block, returns an
    # Enumerator whose size is the number of matching records (restricted to
    # primary keys >= options[:start] when :start is given).
    def find_each_with_order(options = {})
      if block_given?
        find_in_batches_with_order(options) do |records|
          records.each { |record| yield record }
        end
      else
        # The enumerator must call back into this method (not the stock
        # `find_each`), so that iterating it goes through
        # #find_in_batches_with_order and honours :additional_order_column.
        enum_for(:find_each_with_order, options) do
          options[:start] ? where(table[primary_key].gteq(options[:start])).size : size
        end
      end
    end

    # Sometimes the query planner avoids using indexes in favor of sequential scans.
    # When a LIMIT clause is present the planner assumes a distribution of rows matching
    # a condition that doesn't correspond with the actual contents of the table.
    # In some cases the assumption about the distribution of values differs in such a way
    # that the sequential scan ends up iterating over millions of rows, lasting minutes,
    # whilst an index scan would've returned a result in mere seconds.
    # Specifying an additional column for ordering forces the planner to consider using an index
    # that contains the specific column, even though the primary_key ASC order is unique
    # and the additional order column won't be used for sorting at all.
    #
    # The code below is nearly identical to:
    # https://github.com/rails/rails/blob/2a7cf24cb7aab28f483a6772b608e2868a9030ba/activerecord/lib/active_record/relation/batches.rb#L98
    # It has been changed to allow for an additional_order_column. The column to specify
    # is the first column of the desired index.
    #
    # options - Hash of options; any other key is rejected by assert_valid_keys:
    #           :start                   - primary key value to start from (inclusive).
    #           :batch_size              - maximum records per batch (default: 1000).
    #           :additional_order_column - column appended after the primary key
    #                                      in the ORDER BY clause.
    #
    # Yields an Array of records for every batch. Without a block, returns an
    # Enumerator whose size is the number of batches.
    #
    # Raises RuntimeError when the last record of a batch has no id (the
    # primary key was not loaded), since the next batch cannot be located.
    def find_in_batches_with_order(options = {})
      options.assert_valid_keys(:start, :batch_size, :additional_order_column)

      start = options[:start]
      batch_size = options[:batch_size] || 1000
      additional_order_column = options[:additional_order_column]

      unless block_given?
        # The enumerator must call back into this method (not the stock
        # `find_in_batches`), so that :additional_order_column is accepted
        # and applied when the enumerator is iterated.
        return to_enum(:find_in_batches_with_order, options) do
          total = start ? where(table[primary_key].gteq(start)).size : size
          (total - 1).div(batch_size) + 1
        end
      end

      order_for_batch =
        if additional_order_column.present?
          batch_with_additional_order(additional_order_column)
        else
          batch_order
        end

      # Any order/limit already on the relation is replaced by the batch
      # order and batch size.
      relation = reorder(order_for_batch).limit(batch_size)
      records = start ? relation.where(table[primary_key].gteq(start)).to_a : relation.to_a

      while records.any?
        records_size = records.size
        primary_key_offset = records.last.id
        raise "Primary key not included in the custom select clause" unless primary_key_offset

        yield records

        # A short batch means there is nothing left to fetch.
        break if records_size < batch_size

        # Continue after the last primary key seen.
        records = relation.where(table[primary_key].gt(primary_key_offset)).to_a
      end
    end

    private

    # Builds the ORDER BY fragment used for batching: the quoted primary key
    # ascending, followed by the quoted additional column (no explicit
    # direction is given for the latter).
    #
    # additional_order_column - name of the column to append; quoted through
    #                           connection.quote_column_name.
    #
    # Returns the ORDER BY fragment as a String.
    def batch_with_additional_order(additional_order_column)
      quoted_additional_column_name = connection.quote_column_name(additional_order_column)
      "#{quoted_table_name}.#{quoted_primary_key} ASC, #{quoted_table_name}.#{quoted_additional_column_name}"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment