Skip to content

Instantly share code, notes, and snippets.

@GustavoCaso
Last active October 22, 2018 13:32
Show Gist options
  • Save GustavoCaso/524861c48e8b2f7b9b8ec3fffba24190 to your computer and use it in GitHub Desktop.
Save GustavoCaso/524861c48e8b2f7b9b8ec3fffba24190 to your computer and use it in GitHub Desktop.
Benchmark for Analyzer
require 'bundler/inline'
require 'benchmark'

gemfile do
  source 'https://rubygems.org'
  gem 'pry'
end

# BUG FIX: the original read `ARGV[0].to_i || 100_000`. Since nil.to_i is 0
# (never nil), the `|| 100_000` default was unreachable and running the
# script without an argument benchmarked 0 payloads. Apply the default to
# the raw argument first, then coerce.
number_of_job_payload = (ARGV[0] || 100_000).to_i
# Shared scaffolding for the analyzer benchmark variants.
#
# Subclasses implement #group_payloads_attributes_by_job, which must yield
# one array of [attribute_key, attribute_value] pairs per job payload.
class Base
  def initialize(where_conditions: {}, group_by: %w[job_class shop_id api_client_id target_hostname])
    @where_conditions = where_conditions
    @group_by = group_by
    # Auto-vivifying nested counter: @counts_per_attribute[key][value] starts at 0.
    @counts_per_attribute = Hash.new { |hash, key| hash[key] = Hash.new(0) }
  end

  # Processes each batch of payloads in turn.
  def process_payloads(batches)
    batches.each { |job_payloads| process(job_payloads) }
  end

  # Tallies attribute values for one batch. If a where condition exists for
  # an attribute and the payload's value is not in it, the remainder of that
  # payload's attributes are skipped.
  def process(job_payloads)
    group_payloads_attributes_by_job(job_payloads) do |job_payload_attributes|
      job_payload_attributes.each do |attribute_key, raw_value|
        value = raw_value.delete('"')
        allowed = @where_conditions[attribute_key]
        # `break` (not `next`) on purpose: a failed condition abandons the
        # rest of this payload's attributes, matching the original filter.
        break if allowed && !allowed.include?(value)
        @counts_per_attribute[attribute_key][value] += 1
      end
    end
  end

  # Subclass hook: yield grouped [key, value] pairs, one group per payload.
  def group_payloads_attributes_by_job(job_payloads)
    raise NotImplementedError
  end

  # shop_id gets a dedicated, anchored pattern (the payloads contain the
  # attribute more than once); every other attribute shares a generic one.
  def pattern_to_extract_attributes
    return attributes_regex(@group_by) unless @group_by.include?("shop_id")

    Regexp.union(shop_id_regex, attributes_regex(@group_by - ["shop_id"]))
  end

  # Matches only the shop_id occurrence inside `arguments":[{`.
  def shop_id_regex
    /arguments":\[{\"(shop_id)\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end

  # Captures (attribute_key, attribute_value) for any of the given names.
  def attributes_regex(attributes)
    /\"(#{Regexp.union(*attributes)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end
# Joins the whole batch into one string, scans it once, then slices the flat
# match list back into equal per-payload groups.
class AnalyzerGlobalScan < Base
  def group_payloads_attributes_by_job(job_payloads)
    # BUG FIX: guard the empty batch — the original divided by
    # job_payloads.size below, raising ZeroDivisionError on [].
    return if job_payloads.empty?

    batch_of_payloads = job_payloads.join(", ")
    # flatten.compact drops the nil capture groups produced by the
    # Regexp.union alternation; each_slice(2) re-pairs key with value.
    batch_payloads_attributes =
      batch_of_payloads.scan(pattern_to_extract_attributes).flatten.compact.each_slice(2).to_a
    # NOTE(review): integer division assumes every payload yields the same
    # number of attribute pairs — confirm against real payload data.
    number_to_get_exact_payloads = batch_payloads_attributes.size / job_payloads.size
    # BUG FIX: each_slice(0) raises ArgumentError when nothing matched.
    return if number_to_get_exact_payloads.zero?

    batch_payloads_attributes.each_slice(number_to_get_exact_payloads) do |job_payload|
      yield job_payload
    end
  end
end
# Scans each payload individually — the simplest variant, and the slowest in
# the benchmark because it runs one regex scan per payload.
class AnalyzerEachScan < Base
  def group_payloads_attributes_by_job(payloads)
    # PERF FIX: hoist the pattern out of the loop. The original called
    # pattern_to_extract_attributes inside the block, rebuilding the
    # Regexp (and the Regexp.union inside it) once per payload.
    pattern = pattern_to_extract_attributes
    payloads.each do |payload|
      # flatten.compact removes the nil capture groups from the union regex
      # (and lets duplicate shop_id captures be dealt with uniformly), then
      # each_slice(2) re-pairs (attribute_key, attribute_value).
      yield payload.scan(pattern).flatten.compact.each_slice(2).to_a
    end
  end
end
# Global scan like AnalyzerGlobalScan, but groups matches into payloads with
# explicit logic instead of arithmetic slicing: a job_class match marks the
# start of a new payload group.
class AnalyzerGlobalScanCustomIteration < Base
  def group_payloads_attributes_by_job(job_payloads)
    joined = job_payloads.join(", ")
    grouped = joined.scan(pattern_to_extract_attributes).each_with_object([]) do |key_value, groups|
      pair = key_value.compact
      # Every payload has a job_class, so it delimits a fresh group.
      groups << [] if pair.include?('job_class')
      groups.last << pair
    end
    grouped.each { |job_payload| yield job_payload }
  end
end
# Same custom grouping as AnalyzerGlobalScanCustomIteration, but with the
# simple single-alternation regex; duplicate matches within a group (the
# repeated shop_id attribute) are filtered during accumulation.
class AnalyzerGlobalScanCustomIterationBasicRegex < Base
  def group_payloads_attributes_by_job(job_payloads)
    joined = job_payloads.join(", ")
    grouped = joined.scan(pattern_to_extract_attributes).each_with_object([]) do |key_value, groups|
      if key_value.include?('job_class')
        # Every payload carries a job_class, so it opens a new group.
        groups << [key_value]
      elsif !groups.last.include?(key_value)
        # Skipping duplicates handles the repeated shop_id attribute.
        groups.last << key_value
      end
    end
    grouped.each { |job_payload| yield job_payload }
  end

  # Plain alternation over all group_by attributes, no shop_id special case.
  def pattern_to_extract_attributes
    /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end
# Global scan variant that flattens all matches into one string and splits it
# on the job_class marker (lookahead keeps the delimiter), deduplicating
# tokens within each payload.
class AnalyzerSplitByJobClass < Base
  def group_payloads_attributes_by_job(job_payloads)
    combined = job_payloads.join(", ")
    token_stream = combined.scan(pattern_to_extract_attributes).join(' ')
    token_stream.split(/(?=job_class)/).each do |attributes|
      yield attributes.split.uniq.each_slice(2)
    end
  end

  # Plain alternation over all group_by attributes, no shop_id special case.
  def pattern_to_extract_attributes
    /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end
# Job classes sampled at random when generating fixture payloads.
# Frozen: mutable constants are an accident waiting to happen.
JOB_CLASSES = %w[
Appscale::Jobs::AnalyzerTest::WebhookQueueJob
Appscale::Jobs::AnalyzerTest::Whatever
Appscale::Jobs::AnalyzerTest::ILikeThisOne
Appscale::Jobs::AnalyzerTest::ImBackBaby
].freeze
# Builds one fixture payload: a JSON-encoded string imitating a serialized
# background job for the given job_class. Note the string contains
# "shop_id" twice (inside "arguments" and at the top level) — that
# duplication is what the analyzers' shop_id-specific regex handling and
# deduplication logic work around.
def generate_job_payload(job_class)
"{\"class\":\"#{job_class}\",\"args\":[{\"job_class\":\"#{job_class}\",\"job_id\":\"657a094f-d9ee-4f0a-92ea-bf8314482390\",\"provider_job_id\":null,\"queue_name\":\"webhook\",\"priority\":null,\"arguments\":[{\"shop_id\":690933842,\"_aj_symbol_keys\":[\"shop_id\"]}],\"executions\":0,\"locale\":\"en\",\"log_level\":0,\"attempt\":0,\"request_id\":null,\"queue_start\":1540041993.7910662,\"expected_run_time\":1540041993.791,\"pod_id\":0,\"privacy_level\":null,\"feature_set\":null,\"shop_id\":690933842,\"queued_by_shopify_version\":\"0aec3a435b6de9f4da41bc511e2257727f4cf6ef\",\"queued_by_section\":\"NilSectionGlobals\",\"queued_with_readonly_master\":false}]}"
end
# Builds the requested number of payloads, each with a randomly sampled
# job class.
# NOTE: the name keeps the original "genarate" spelling — call sites use it.
def genarate_job_payloads(number_of_job_payloads)
  Array.new(number_of_job_payloads) { generate_job_payload(JOB_CLASSES.sample) }
end
# Two identical-size batches so process_payloads exercises the batch loop.
batches = [
  genarate_job_payloads(number_of_job_payload),
  genarate_job_payloads(number_of_job_payload),
]

# Table-driven registration: label => analyzer instance.
analyzers = {
  'global_scan:' => AnalyzerGlobalScan.new,
  'global_scan_custom_iteration:' => AnalyzerGlobalScanCustomIteration.new,
  'global_scan_custom_iteration_basic_regex:' => AnalyzerGlobalScanCustomIterationBasicRegex.new,
  'split_by_job_class:' => AnalyzerSplitByJobClass.new,
  'each_scan:' => AnalyzerEachScan.new,
}

# bmbm runs a rehearsal pass first to reduce GC noise in the timed pass.
Benchmark.bmbm(28) do |x|
  analyzers.each do |label, analyzer|
    x.report(label) { analyzer.process_payloads(batches) }
  end
end
@GustavoCaso
Copy link
Author

I have created three variations:

  1. AnalyzerGlobalScan does a global scan and then groups the results by job payload using multiple each_slice calls.
  2. AnalyzerEachScan iterates over the job payloads and performs the scan on each one of them; the code might be cleaner since we do not need to build the joined batch payload, but the performance is worse due to the multiple scans.
  3. AnalyzerGlobalScanCustomIteration does a global scan but instead of doing multiple each_slice it groups using custom logic, making the code read much better than AnalyzerGlobalScan.

Running the benchmark, it would print the output:

Rehearsal -----------------------------------------------------------------
global_scan:                    0.160329   0.005042   0.165371 (  0.165596)
global_scan_custom_iteration:   0.166476   0.002115   0.168591 (  0.168741)
each_scan:                      1.077777   0.016114   1.093891 (  1.095130)
-------------------------------------------------------- total: 1.427853sec

                                    user     system      total        real
global_scan:                    0.146631   0.001037   0.147668 (  0.147796)
global_scan_custom_iteration:   0.145015   0.000406   0.145421 (  0.145524)
each_scan:                      1.048631   0.007272   1.055903 (  1.056984)

Proving the each_scan is the slowest.
But global_scan_custom_iteration is almost as fast as global_scan, and the code is much more comfortable to read.

@GustavoCaso
Copy link
Author

I have done more experiments and it looks like we can improve a little by using the basic regex /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/ that was giving us some issues with multiple shop_id attributes, but doing the aggregation with the custom iteration.

global_scan_custom_iteration_basic_regex is the winner 🎉 🎉

Rehearsal -----------------------------------------------------------------------------
global_scan:                                0.166552   0.007628   0.174180 (  0.174279)
global_scan_custom_iteration:               0.168898   0.002649   0.171547 (  0.171596)
global_scan_custom_iteration_basic_regex:   0.142025   0.001113   0.143138 (  0.143178)
split_by_job_class:                         0.188039   0.003317   0.191356 (  0.191373)
each_scan:                                  1.065555   0.014591   1.080146 (  1.080628)
-------------------------------------------------------------------- total: 1.760367sec

                                                user     system      total        real
global_scan:                                0.159464   0.001663   0.161127 (  0.161351)
global_scan_custom_iteration:               0.159547   0.001340   0.160887 (  0.160950)
global_scan_custom_iteration_basic_regex:   0.140358   0.000836   0.141194 (  0.141362)
split_by_job_class:                         0.185607   0.002132   0.187739 (  0.187775)
each_scan:                                  1.059577   0.009298   1.068875 (  1.069279)

@GustavoCaso
Copy link
Author

After talking with Moe, he pointed out that some method names are not very descriptive.

Also, now you can invoke the benchmark passing the number of job payloads that you want to process.

ruby benchmark.rb 10000

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment