andrross/flaky-test-finder.rb

## flaky-test-finder.rb
require 'json'
require 'net/http'
require 'optparse'
require 'set'
require 'uri'

# Flaky test finder: crawls a range of gradle-check builds on the OpenSearch
# Jenkins server and prints, for every test that failed in an UNSTABLE build,
# how many times it failed and in which builds (most frequent first).
#
# Usage: ruby flaky-test-finder.rb -s START_BUILD -e END_BUILD

GRADLE_CHECK_JOB_URL = 'https://build.ci.opensearch.org/job/gradle-check'

# 'FAILED' means the test failed, just like the previous run.
# 'REGRESSION' means the test failed, but previously passed.
# See https://javadoc.jenkins.io/plugin/junit/hudson/tasks/junit/CaseResult.Status.html
FAILED_STATUSES = %w[REGRESSION FAILED].freeze

# Fetches +url+ and parses the response body as JSON.
#
# @param url [String] absolute URL to GET
# @return [Hash, nil] the parsed document, or nil (after a warning on stderr)
#   when the server does not answer with a 2xx status or the body is not JSON
def fetch_json(url)
  response = Net::HTTP.get_response(URI.parse(url))
  unless response.is_a?(Net::HTTPSuccess)
    # Requesting a build number that has no live build yields an error page,
    # not JSON; skip it instead of letting JSON.parse abort the whole crawl.
    warn "Skipping #{url}: HTTP #{response.code}"
    return nil
  end

  JSON.parse(response.body)
rescue JSON::ParserError => e
  warn "Skipping #{url}: #{e.message}"
  nil
end

# Extracts the fully qualified names of the failed test cases from a test
# report document.
#
# @param report [Hash, nil] parsed test report ('suites' => [{'cases' => [...]}])
# @return [Array<String>] "className.name" for every case whose status is one
#   of FAILED_STATUSES; empty when the report or any level of it is missing
def failed_test_names(report)
  return [] unless report

  Array(report['suites']).flat_map do |suite|
    Array(suite['cases'])
      .select { |test_case| FAILED_STATUSES.include?(test_case['status']) }
      .map { |test_case| "#{test_case['className']}.#{test_case['name']}" }
  end
end

options = {}
parser = OptionParser.new do |opt|
  # DecimalInteger makes OptionParser reject non-numeric build numbers and
  # hands us Integers, so the range below is a real numeric range.
  opt.on('-s', '--start BUILD_NUMBER', OptionParser::DecimalInteger, 'Require start') { |o| options[:start] = o }
  opt.on('-e', '--end BUILD_NUMBER', OptionParser::DecimalInteger, 'Require end') { |o| options[:end] = o }
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  abort "#{e.message}\n#{parser.help}"
end
# Both bounds are mandatory; without this check a missing one surfaces later
# as an unhelpful "can't iterate from NilClass".
abort parser.help unless options[:start] && options[:end]

puts "Will crawl builds from #{options[:start]} to #{options[:end]}"

# One [test name, build number] pair per failed test case.
failures = (options[:start]..options[:end]).flat_map do |job_number|
  base_job_uri = "#{GRADLE_CHECK_JOB_URL}/#{job_number}"
  build = fetch_json("#{base_job_uri}/api/json")

  # UNSTABLE means the build succeeded but at least one test needed to be
  # retried. Many gradle-check runs fail when running against a PR because the
  # newly introduced code has a problem. The developer then iterates on the PR
  # until all problems are resolved. To filter out this noise we only consider
  # UNSTABLE builds, and ignore failures. It is possible for gradle-check
  # builds run against merged code to fail due to flaky tests and they would be
  # missed here. However, I think that is a small minority of the cases and
  # this approach should do pretty well at identifying flaky tests.
  next [] unless build && build['result'] == 'UNSTABLE'

  report = fetch_json("#{base_job_uri}/testReport/api/json?tree=suites[cases[status,className,name]]")
  failed_test_names(report).map { |name| [name, job_number] }
end

puts '------------------'

# Group the build numbers by test name; the failure count of a test is the
# number of builds recorded for it.
builds_by_test = Hash.new { |hash, name| hash[name] = [] }
failures.each { |name, job_number| builds_by_test[name] << job_number }

# Most frequent failures first; equal counts are ordered by name so the
# output is deterministic.
builds_by_test.sort_by { |name, builds| [-builds.size, name] }.each do |name, builds|
  puts "#{builds.size} #{name} (#{builds.join(',')})"
end
	require 'json'
	require 'net/http'
	require 'optparse'
	require 'set'
	require 'uri'

	options = {}
	OptionParser.new do |opt|
	opt.on('-s', '--start BUILD_NUMBER', 'Require start') { |o| options[:start] = o }
	opt.on('-e', '--end BUILD_NUMBER', 'Require end') { |o| options[:end] = o }
	end.parse!

	puts "Will crawl builds from #{options[:start]} to #{options[:end]}"
	all_failed_tests = []
	(options[:start]..options[:end]).each do |job_number|
	base_job_uri = "https://build.ci.opensearch.org/job/gradle-check/#{job_number}"
	result = JSON.parse(Net::HTTP.get_response(URI.parse(base_job_uri + '/api/json')).body)['result']

	# UNSTABLE means the build succeeded but at least one test needed to be
	# retried. Many gradle-check runs fail when running against a PR because the
	# newly introduced code has a problem. The developer then iterates on the PR
	# until all problems are resolved. To filter out this noise we only consider
	# UNSTABLE builds, and ignore failures. It is possible for gradle-check
	# builds run against merged code to fail due to flaky tests and they would be
	# missed here. However, I think that is a small minority of the cases and
	# this approach should do pretty well at identifying flaky tests.
	if result == 'UNSTABLE'
	uri = URI.parse(base_job_uri + '/testReport/api/json?tree=suites[cases[status,className,name]]')
	json = JSON.parse(Net::HTTP.get_response(uri).body)
	# 'FAILED' means the test failed, just like the previous run.
	# 'REGRESSION' means the test failed, but previously passed.
	# See https://javadoc.jenkins.io/plugin/junit/hudson/tasks/junit/CaseResult.Status.html
	failed_cases = json['suites'].map do |s|
	s['cases'].select do |c|
	c['status'] == 'REGRESSION' || c['status'] == 'FAILED'
	end
	end.flatten
	failed_tests = failed_cases.map { |c| {'name' => "#{c['className']}.#{c['name']}", 'build' => job_number}}
	all_failed_tests.push(failed_tests)
	end
	end

	puts '------------------'

	count = {}
	all_failed_tests.flatten.each do |test|
	unless count.include?(test['name'])
	count[test['name']] = {'count' => 0, 'builds' => []}
	end
	count[test['name']]['count'] += 1
	count[test['name']]['builds'].push(test['build'])
	end
	count.to_a.sort {|a,b| b[1]['count'] <=> a[1]['count'] }.each {|a| puts "#{a[1]['count']} #{a[0]} (#{a[1]['builds'].join(',')})" }