Skip to content

Instantly share code, notes, and snippets.

@GustavoCaso
Last active October 22, 2018 13:32
Show Gist options
  • Save GustavoCaso/524861c48e8b2f7b9b8ec3fffba24190 to your computer and use it in GitHub Desktop.
Save GustavoCaso/524861c48e8b2f7b9b8ec3fffba24190 to your computer and use it in GitHub Desktop.
Benchmark for Analyzer
require 'bundler/inline'
require 'benchmark'

gemfile do
  source 'https://rubygems.org'
  gem 'pry'
end

# BUG FIX: the original read `ARGV[0].to_i || 100_000`. Since nil.to_i is 0
# (never nil), the `|| 100_000` default was unreachable and running the
# script without an argument benchmarked 0 payloads. Apply the default to
# the raw argument first, then coerce.
number_of_job_payload = (ARGV[0] || 100_000).to_i
# Shared scaffolding for the analyzer benchmark variants.
#
# Subclasses implement #group_payloads_attributes_by_job, which must yield
# one array of [attribute_key, attribute_value] pairs per job payload.
class Base
  def initialize(where_conditions: {}, group_by: %w[job_class shop_id api_client_id target_hostname])
    @where_conditions = where_conditions
    @group_by = group_by
    # Auto-vivifying nested counter: @counts_per_attribute[key][value] starts at 0.
    @counts_per_attribute = Hash.new { |hash, key| hash[key] = Hash.new(0) }
  end

  # Processes each batch of payloads in turn.
  def process_payloads(batches)
    batches.each { |job_payloads| process(job_payloads) }
  end

  # Tallies attribute values for one batch. If a where condition exists for
  # an attribute and the payload's value is not in it, the remainder of that
  # payload's attributes are skipped.
  def process(job_payloads)
    group_payloads_attributes_by_job(job_payloads) do |job_payload_attributes|
      job_payload_attributes.each do |attribute_key, raw_value|
        value = raw_value.delete('"')
        allowed = @where_conditions[attribute_key]
        # `break` (not `next`) on purpose: a failed condition abandons the
        # rest of this payload's attributes, matching the original filter.
        break if allowed && !allowed.include?(value)
        @counts_per_attribute[attribute_key][value] += 1
      end
    end
  end

  # Subclass hook: yield grouped [key, value] pairs, one group per payload.
  def group_payloads_attributes_by_job(job_payloads)
    raise NotImplementedError
  end

  # shop_id gets a dedicated, anchored pattern (the payloads contain the
  # attribute more than once); every other attribute shares a generic one.
  def pattern_to_extract_attributes
    return attributes_regex(@group_by) unless @group_by.include?("shop_id")

    Regexp.union(shop_id_regex, attributes_regex(@group_by - ["shop_id"]))
  end

  # Matches only the shop_id occurrence inside `arguments":[{`.
  def shop_id_regex
    /arguments":\[{\"(shop_id)\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end

  # Captures (attribute_key, attribute_value) for any of the given names.
  def attributes_regex(attributes)
    /\"(#{Regexp.union(*attributes)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end
# Joins the whole batch into one string, scans it once, then slices the flat
# match list back into equal per-payload groups.
class AnalyzerGlobalScan < Base
  def group_payloads_attributes_by_job(job_payloads)
    # BUG FIX: guard the empty batch — the original divided by
    # job_payloads.size below, raising ZeroDivisionError on [].
    return if job_payloads.empty?

    batch_of_payloads = job_payloads.join(", ")
    # flatten.compact drops the nil capture groups produced by the
    # Regexp.union alternation; each_slice(2) re-pairs key with value.
    batch_payloads_attributes =
      batch_of_payloads.scan(pattern_to_extract_attributes).flatten.compact.each_slice(2).to_a
    # NOTE(review): integer division assumes every payload yields the same
    # number of attribute pairs — confirm against real payload data.
    number_to_get_exact_payloads = batch_payloads_attributes.size / job_payloads.size
    # BUG FIX: each_slice(0) raises ArgumentError when nothing matched.
    return if number_to_get_exact_payloads.zero?

    batch_payloads_attributes.each_slice(number_to_get_exact_payloads) do |job_payload|
      yield job_payload
    end
  end
end
# Scans each payload individually — the simplest variant, and the slowest in
# the benchmark because it runs one regex scan per payload.
class AnalyzerEachScan < Base
  def group_payloads_attributes_by_job(payloads)
    # PERF FIX: hoist the pattern out of the loop. The original called
    # pattern_to_extract_attributes inside the block, rebuilding the
    # Regexp (and the Regexp.union inside it) once per payload.
    pattern = pattern_to_extract_attributes
    payloads.each do |payload|
      # flatten.compact removes the nil capture groups from the union regex
      # (and lets duplicate shop_id captures be dealt with uniformly), then
      # each_slice(2) re-pairs (attribute_key, attribute_value).
      yield payload.scan(pattern).flatten.compact.each_slice(2).to_a
    end
  end
end
# Global scan like AnalyzerGlobalScan, but groups matches into payloads with
# explicit logic instead of arithmetic slicing: a job_class match marks the
# start of a new payload group.
class AnalyzerGlobalScanCustomIteration < Base
  def group_payloads_attributes_by_job(job_payloads)
    joined = job_payloads.join(", ")
    grouped = joined.scan(pattern_to_extract_attributes).each_with_object([]) do |key_value, groups|
      pair = key_value.compact
      # Every payload has a job_class, so it delimits a fresh group.
      groups << [] if pair.include?('job_class')
      groups.last << pair
    end
    grouped.each { |job_payload| yield job_payload }
  end
end
# Same custom grouping as AnalyzerGlobalScanCustomIteration, but with the
# simple single-alternation regex; duplicate matches within a group (the
# repeated shop_id attribute) are filtered during accumulation.
class AnalyzerGlobalScanCustomIterationBasicRegex < Base
  def group_payloads_attributes_by_job(job_payloads)
    joined = job_payloads.join(", ")
    grouped = joined.scan(pattern_to_extract_attributes).each_with_object([]) do |key_value, groups|
      if key_value.include?('job_class')
        # Every payload carries a job_class, so it opens a new group.
        groups << [key_value]
      elsif !groups.last.include?(key_value)
        # Skipping duplicates handles the repeated shop_id attribute.
        groups.last << key_value
      end
    end
    grouped.each { |job_payload| yield job_payload }
  end

  # Plain alternation over all group_by attributes, no shop_id special case.
  def pattern_to_extract_attributes
    /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end
# Global scan variant that flattens all matches into one string and splits it
# on the job_class marker (lookahead keeps the delimiter), deduplicating
# tokens within each payload.
class AnalyzerSplitByJobClass < Base
  def group_payloads_attributes_by_job(job_payloads)
    combined = job_payloads.join(", ")
    token_stream = combined.scan(pattern_to_extract_attributes).join(' ')
    token_stream.split(/(?=job_class)/).each do |attributes|
      yield attributes.split.uniq.each_slice(2)
    end
  end

  # Plain alternation over all group_by attributes, no shop_id special case.
  def pattern_to_extract_attributes
    /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/
  end
end
# Job classes sampled at random when generating fixture payloads.
# Frozen: mutable constants are an accident waiting to happen.
JOB_CLASSES = %w[
Appscale::Jobs::AnalyzerTest::WebhookQueueJob
Appscale::Jobs::AnalyzerTest::Whatever
Appscale::Jobs::AnalyzerTest::ILikeThisOne
Appscale::Jobs::AnalyzerTest::ImBackBaby
].freeze
# Builds one fixture payload: a JSON-encoded string imitating a serialized
# background job for the given job_class. Note the string contains
# "shop_id" twice (inside "arguments" and at the top level) — that
# duplication is what the analyzers' shop_id-specific regex handling and
# deduplication logic work around.
def generate_job_payload(job_class)
"{\"class\":\"#{job_class}\",\"args\":[{\"job_class\":\"#{job_class}\",\"job_id\":\"657a094f-d9ee-4f0a-92ea-bf8314482390\",\"provider_job_id\":null,\"queue_name\":\"webhook\",\"priority\":null,\"arguments\":[{\"shop_id\":690933842,\"_aj_symbol_keys\":[\"shop_id\"]}],\"executions\":0,\"locale\":\"en\",\"log_level\":0,\"attempt\":0,\"request_id\":null,\"queue_start\":1540041993.7910662,\"expected_run_time\":1540041993.791,\"pod_id\":0,\"privacy_level\":null,\"feature_set\":null,\"shop_id\":690933842,\"queued_by_shopify_version\":\"0aec3a435b6de9f4da41bc511e2257727f4cf6ef\",\"queued_by_section\":\"NilSectionGlobals\",\"queued_with_readonly_master\":false}]}"
end
# Builds the requested number of payloads, each with a randomly sampled
# job class.
# NOTE: the name keeps the original "genarate" spelling — call sites use it.
def genarate_job_payloads(number_of_job_payloads)
  Array.new(number_of_job_payloads) { generate_job_payload(JOB_CLASSES.sample) }
end
# Two identical-size batches so process_payloads exercises the batch loop.
batches = [
  genarate_job_payloads(number_of_job_payload),
  genarate_job_payloads(number_of_job_payload),
]

# Table-driven registration: label => analyzer instance.
analyzers = {
  'global_scan:' => AnalyzerGlobalScan.new,
  'global_scan_custom_iteration:' => AnalyzerGlobalScanCustomIteration.new,
  'global_scan_custom_iteration_basic_regex:' => AnalyzerGlobalScanCustomIterationBasicRegex.new,
  'split_by_job_class:' => AnalyzerSplitByJobClass.new,
  'each_scan:' => AnalyzerEachScan.new,
}

# bmbm runs a rehearsal pass first to reduce GC noise in the timed pass.
Benchmark.bmbm(28) do |x|
  analyzers.each do |label, analyzer|
    x.report(label) { analyzer.process_payloads(batches) }
  end
end
@GustavoCaso
Copy link
Author

I have created three variations:

  1. AnalyzerGlobalScan does a global scan and then groups the results by job payload using multiple each_slice calls.
  2. AnalyzerEachScan iterates over the job payloads and performs the scan on each one of them; the code might be cleaner since we do not need to build the joined batch payload, but the performance is worse due to the multiple scans.
  3. AnalyzerGlobalScanCustomIteration does a global scan but instead of doing multiple each_slice it groups using custom logic, making the code read much better than AnalyzerGlobalScan.

Running the benchmark, it would print the output:

Rehearsal -----------------------------------------------------------------
global_scan:                    0.160329   0.005042   0.165371 (  0.165596)
global_scan_custom_iteration:   0.166476   0.002115   0.168591 (  0.168741)
each_scan:                      1.077777   0.016114   1.093891 (  1.095130)
-------------------------------------------------------- total: 1.427853sec

                                    user     system      total        real
global_scan:                    0.146631   0.001037   0.147668 (  0.147796)
global_scan_custom_iteration:   0.145015   0.000406   0.145421 (  0.145524)
each_scan:                      1.048631   0.007272   1.055903 (  1.056984)

Proving the each_scan is the slowest.
But global_scan_custom_iteration is almost as fast as global_scan, and the code is much more comfortable to read.

@GustavoCaso
Copy link
Author

I have done more experiments and it looks like we can improve a little by using the basic regex /\"(#{Regexp.union(*@group_by)})\":(\"[a-zA-Z0-9:\."-]*\"|\d*)/ that was giving us some issues with multiple shop_id attributes, but doing the aggregation with the custom iteration.

global_scan_custom_iteration_basic_regex is the winner 🎉 🎉

Rehearsal -----------------------------------------------------------------------------
global_scan:                                0.166552   0.007628   0.174180 (  0.174279)
global_scan_custom_iteration:               0.168898   0.002649   0.171547 (  0.171596)
global_scan_custom_iteration_basic_regex:   0.142025   0.001113   0.143138 (  0.143178)
split_by_job_class:                         0.188039   0.003317   0.191356 (  0.191373)
each_scan:                                  1.065555   0.014591   1.080146 (  1.080628)
-------------------------------------------------------------------- total: 1.760367sec

                                                user     system      total        real
global_scan:                                0.159464   0.001663   0.161127 (  0.161351)
global_scan_custom_iteration:               0.159547   0.001340   0.160887 (  0.160950)
global_scan_custom_iteration_basic_regex:   0.140358   0.000836   0.141194 (  0.141362)
split_by_job_class:                         0.185607   0.002132   0.187739 (  0.187775)
each_scan:                                  1.059577   0.009298   1.068875 (  1.069279)

@GustavoCaso
Copy link
Author

After talking with Moe, he pointed out that some method names are not very descriptive.

Also, now you can invoke the benchmark passing the number of job payloads that you want to process.

ruby benchmark.rb 10000

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment