Skip to content

Instantly share code, notes, and snippets.

@istvanfazakas
Last active December 22, 2022 16:04
Show Gist options
  • Save istvanfazakas/770074466d30d1f5ae7167a7c0a78e5d to your computer and use it in GitHub Desktop.
Save istvanfazakas/770074466d30d1f5ae7167a7c0a78e5d to your computer and use it in GitHub Desktop.
Script to collect data and push to BigQuery

Usage of collect_gql_data.rb Script

Description

The collect_gql_data.rb script collects data on the usage of the GraphQL RuboCop cops created by the Goldsmiths team (https://github.com/toptal/rubocop-toptal-graphql/) and pushes it to BigQuery.

Requirements

The following gems are required (the script also calls require 'byebug', so the byebug gem must be installed as well):

  • google-cloud-bigquery (gem install google-cloud-bigquery)
  • em-http-request (gem install em-http-request)

The following ENV variable is required to run the script:

  • TT_WORK_PATH - the path to the folder that contains the Toptal projects; each project is looked up in a subfolder named after it (e.g. <TT_WORK_PATH>/billing)

Usage

  • TT_WORK_PATH=<path-to-toptal-projects>/ ruby ./collect_gql_data.rb "toptal.com:api-project-726361118046.GoldsmithsTeam.gql_standardization_metrics" "billing"
#!/usr/bin/env ruby
require 'yaml'
require 'json'
require 'google/cloud/bigquery'
require 'csv'
require 'byebug'
module DataAnalyzer
# Registry of the projects that can be analyzed.
#
# Each key is the project (repository) name accepted on the command line.
# Each value holds:
#   :path    - project checkout location, built from the TT_WORK_PATH env variable
#   :schemas - schema name => [source directory, spec directory], both relative
#              to :path; these directories are handed to RuboCop as targets.
PROJECTS = {
  'platform' => {
    path: "#{ENV['TT_WORK_PATH']}/platform",
    schemas: {
      'cas' => ['api/lib/graphql_api/cas/', 'spec/api/lib/graphql_api/cas/'],
      'client' => ['api/lib/graphql_api/client', 'spec/api/lib/graphql_api/client'],
      'platform' => ['api/lib/graphql_api/platform', 'spec/api/lib/graphql_api/platform'],
      'public' => ['api/lib/graphql_api/public', 'spec/api/lib/graphql_api/public'],
      # The spec directory used to repeat the source directory, so the staff
      # specs were never inspected.
      'staff' => ['api/lib/graphql_api/staff', 'spec/api/lib/graphql_api/staff'],
      'talent' => ['api/lib/graphql_api/talent', 'spec/api/lib/graphql_api/talent'],
      'talent_public' => ['api/lib/graphql_api/talent_public', 'spec/api/lib/graphql_api/talent_public'],
      'community' => ['engines/community/app/graphql/', 'engines/community/spec/graphql'],
      'screening' => ['engines/screening/app/graphql', 'engines/screening/spec/graphql'],
      'talent_activation' => ['engines/talent_activation/app/graphql/', 'engines/talent_activation/spec/graphql'],
      'talent_profile' => ['engines/talent_profile/app/graphql/', 'engines/talent_profile/spec/graphql'],
      'talent_success' => ['engines/talent_success/app/graphql/', 'engines/talent_success/spec/graphql'],
      'topscreen' => ['engines/topscreen/app/graphql/', 'engines/topscreen/spec/graphql']
    }
  },
  'billing' => {
    path: "#{ENV['TT_WORK_PATH']}/billing",
    schemas: {
      'documents/staff' => ['app/graphql/documents/staff', 'spec/graphql/documents/staff'],
      'documents/talent' => ['app/graphql/documents/talent', 'spec/graphql/documents/talent'],
      'internal' => ['app/graphql/billing/gql/internal', 'spec/graphql/billing/gql/internal'],
      'staff' => ['app/graphql/billing/gql/staff', 'spec/graphql/billing/gql/staff'],
      'talent' => ['app/graphql/billing/gql/talent', 'spec/graphql/billing/gql/talent']
    }
  },
  'rti-platform' => {
    path: "#{ENV['TT_WORK_PATH']}/rti-platform",
    schemas: {
      'p2p/gql/staff' => ['app/graphql/p2p/gql/staff', 'spec/app/graphql/p2p/gql/staff'],
      'p2p/gql/client' => ['app/graphql/p2p/gql/client', 'spec/app/graphql/p2p/gql/client'],
      'p2p/gql/talent' => ['app/graphql/p2p/gql/talent', 'spec/app/graphql/p2p/gql/talent']
    }
  },
  'testing-platform-backend' => {
    path: "#{ENV['TT_WORK_PATH']}/testing-platform-backend",
    schemas: {
      'talent' => ['app/lib/graphql_api/talent', 'spec/lib/graphql_api/talent'],
      'staff' => ['app/lib/graphql_api/staff', 'spec/lib/graphql_api/staff'],
      'public' => ['app/lib/graphql_api/public', 'spec/lib/graphql_api/public']
    }
  },
  'topteam' => {
    path: "#{ENV['TT_WORK_PATH']}/topteam",
    schemas: {
      'topteam' => ['app/graphql', 'spec/graphql']
    }
  },
  'chronicles' => {
    path: "#{ENV['TT_WORK_PATH']}/chronicles",
    schemas: {
      # Was 'app/graphql/stff' (typo), which does not match the spec directory name.
      'staff' => ['app/graphql/staff', 'spec/graphql/staff']
    }
  },
  'top-retro-board-backend' => {
    path: "#{ENV['TT_WORK_PATH']}/top-retro-board-backend",
    schemas: {
      'top-retro-board' => ['app/graphql', 'spec/graphql']
    }
  },
  'video-screening-backend' => {
    path: "#{ENV['TT_WORK_PATH']}/video-screening-backend",
    schemas: {
      'video-screening' => ['app/graphql', 'spec/graphql']
    }
  },
  'top-scheduler' => {
    path: "#{ENV['TT_WORK_PATH']}/top-scheduler",
    schemas: {
      'top-scheduler' => ['app/graphql', 'spec/graphql']
    }
  }
}.freeze
# Pushes the formatted offense rows into a BigQuery table.
#
# The target is given as "[<scope>:]<project>.<dataset>.<table>", e.g.
# "toptal.com:api-project-726361118046.GoldsmithsTeam.gql_standardization_metrics".
class BigQuery # :nodoc:
  # Number of rows sent to BigQuery per insert request.
  INSERT_BATCH_SIZE = 1000

  # @param project_args [String] target in "[<scope>:]<project>.<dataset>.<table>" form
  # @raise [ArgumentError] when project, dataset or table is missing
  def initialize(project_args)
    # rpartition keeps working when there is no "<scope>:" prefix: scope is then ''.
    scope, _separator, tail = project_args.to_s.rpartition(':')
    project_id, dataset_id, table_id = tail.split('.')
    if [project_id, dataset_id, table_id].any? { |part| part.nil? || part.empty? }
      raise ArgumentError,
            "Expected \"[<scope>:]<project>.<dataset>.<table>\", got #{project_args.inspect}"
    end

    @project_id = scope.empty? ? project_id : "#{scope}:#{project_id}"
    @dataset_id = dataset_id
    @table_id = table_id
  end

  # Inserts the rows into the configured table.
  #
  # @param formatted_data [Array<Hash>] rows to insert
  def call(formatted_data)
    insert_data formatted_data
    # export_csv formatted_data
  end

  private

  attr_reader :project_id, :table_id, :dataset_id

  # Writes the rows to "<ARGV[1]>.csv", using the keys of the first row as header.
  def export_csv(formatted_data)
    return if formatted_data.empty?

    keys = formatted_data.first.keys
    CSV.open("#{ARGV[1]}.csv", 'w') do |csv|
      csv << keys
      formatted_data.each { |data| csv << data.values }
    end
  end

  # Inserts the rows in batches; on the first failed batch the per-row errors
  # are printed and the process exits with status 1.
  def insert_data(formatted_data)
    p "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-"
    p "Project ID: #{project_id}"
    p "Table ID: #{table_id}"
    p "Dataset ID: #{dataset_id}"
    p "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-"
    formatted_data.each_slice(INSERT_BATCH_SIZE) do |slice|
      response = table.insert(slice)
      if response.success?
        print '.'
      else
        puts "Failed to insert #{response.error_rows.count} rows."
        # The error lines used to be built but never printed.
        response.insert_errors.each { |err| puts "#{err.index} - #{err.errors}" }
        exit(1)
      end
    end
    p '-=-=-=-=-=-==-=-=-=-=-=-=-=-'
    p 'Data pushed to BigQuery'
    # p JSON.pretty_generate(formatted_data)
  end

  def dataset
    @dataset ||= big_query.dataset(dataset_id)
  end

  def table
    @table ||= dataset.table(table_id)
  end

  def big_query
    @big_query ||= Google::Cloud::Bigquery.new(project_id: project_id)
  end
end
# Counts ToptalGraphql RuboCop offenses per GraphQL schema for the selected
# projects and flattens the result into one row per (project, schema, cop).
class RubocopOffensesCount # :nodoc: rubocop:disable Metrics/ClassLength
  CONFIG_FILE = '.rubocop_graphql.yml'.freeze
  TODO_CONFIG_FILE = '.rubocop_graphql_todo.yml'.freeze
  BACKUP_TODO_CONFIG_FILE = '.backup_rubocop_graphql_todo.yml'.freeze
  CLIENT_TODO_CONFIG_FILE = '.rubocop_graphql_client_todo.yml'.freeze
  BACKUP_CLIENT_TODO_CONFIG_FILE = '.backup_rubocop_graphql_client_todo.yml'.freeze
  # [config file, backup file] pairs that are emptied while RuboCop runs.
  TODO_FILE_BACKUPS = [
    [CLIENT_TODO_CONFIG_FILE, BACKUP_CLIENT_TODO_CONFIG_FILE],
    [TODO_CONFIG_FILE, BACKUP_TODO_CONFIG_FILE]
  ].freeze

  # Validates the requested project names and runs the analysis.
  # Exits the process with a failure status when any name is not a PROJECTS key.
  #
  # @param only [String, Array<String>] project name(s) to analyze
  # @return [Array<Hash>] one row per (project, schema, cop)
  def self.call(only:)
    requested = Array(only)
    # Set difference instead of comparing with an intersection: the latter
    # depends on element order and rejected valid names given in another order.
    if (requested - PROJECTS.keys).empty?
      new(only: requested).call
    else
      warn "Invalid project name. Valid project are: #{PROJECTS.keys}"
      warn 'Please separate the project names only with single comma, and no space between.'
      exit(false)
    end
  end

  def initialize(only:)
    @only = Array(only)
    @cops_by_project = {}
  end

  def call
    format_data(process_schemas)
  end

  private

  attr_reader :gql_cops, :only, :config_file, :cops_by_project

  # @param data [Hash] project => { schema => [{cop:, count:, enabled:}, ...] }
  # @return [Array<Hash>] flat list of rows
  def format_data(data)
    data.flat_map do |project, schemas|
      schemas.flat_map { |schema_name, cops| format_rows(project, schema_name, cops) }
    end
  end

  # Builds one row for every cop configured in the project; cops without
  # reported offenses get a zero count and cop_enabled: false.
  def format_rows(project, schema_name, cops)
    # Use the cop list recorded for this project, not the one left over from
    # whichever project happened to be processed last.
    cops_by_project.fetch(project, []).map do |cop|
      row = {
        github_repo_name: project, gql_schema: schema_name, cop_name: cop,
        cop_enabled: false, number_of_offenses_in_schema: 0, created_at: Time.now
      }
      offended_cop = cops.find { |candidate| candidate[:cop] == cop }
      next row unless offended_cop

      row.merge(cop_enabled: offended_cop[:enabled], number_of_offenses_in_schema: offended_cop[:count].to_i)
    end
  end

  # @return [Hash] project => { schema => offense counts } for the selected projects
  def process_schemas
    PROJECTS.slice(*only).each_with_object({}) do |(project, project_data), hash|
      project_path = project_data[:path]
      next if project_path.nil?

      Dir.chdir(project_path) do
        hash[project] = process_project(project, project_data).compact
      end
    end
  end

  # Loads the project's RuboCop config and collects the counts of every schema
  # while the TODO config files are emptied.
  def process_project(project, project_data)
    project_path = project_data[:path]
    with_todo_files_disabled(project_path) do
      @config_file = YAML.load_file("#{project_path}/#{CONFIG_FILE}")
      cop_names = (config_file.keys - ['require']).select { |cop| cop.include?('ToptalGraphql') }
      cops_by_project[project] = cop_names
      @gql_cops = cop_names.join(',')
      project_data[:schemas].each_with_object({}) do |(schema, schema_paths), counts|
        counts[schema] = get_schema_counts(schema, schema_paths)
      end
    end
  end

  # Replaces the TODO config files with empty ones for the duration of the
  # block and restores them afterwards, even when the block raises.
  def with_todo_files_disabled(project_path)
    TODO_FILE_BACKUPS.each { |config, backup| backup_file(project_path, config, backup) }
    yield
  ensure
    TODO_FILE_BACKUPS.each { |config, backup| restore_file(project_path, config, backup) }
  end

  # Moves the config file to its backup name and leaves an empty file in its place.
  def backup_file(path, config_file_name, backup_file_name)
    config_path = "#{path}/#{config_file_name}"
    return unless File.file?(config_path)

    File.rename(config_path, "#{path}/#{backup_file_name}")
    # File.write closes the handle; File.new(..., 'w') left it open.
    File.write(config_path, '')
  end

  # Puts the backed-up config file back under its original name.
  def restore_file(path, config_file_name, backup_file_name)
    backup_path = "#{path}/#{backup_file_name}"
    return unless File.file?(backup_path)

    config_path = "#{path}/#{config_file_name}"
    File.delete(config_path) if File.file?(config_path)
    File.rename(backup_path, config_path)
  end

  # Runs RuboCop (offenses formatter, ToptalGraphql cops only) on the schema
  # paths and parses its output.
  def get_schema_counts(schema, schema_paths)
    ruby_version = `rbenv local`.strip
    # NOTE(review): rbenv selects its version through RBENV_VERSION; confirm
    # that setting RUBY_VERSION here has the intended effect.
    offense_counts = `RUBY_VERSION=#{ruby_version} bundle exec rubocop #{schema_paths.join(' ')} --format offenses --only #{gql_cops}`.strip # rubocop:disable Layout/LineLength
    # p "RUBY VERSION: #{`rbenv local`}"
    # p "SCHEMA: #{schema}"
    # p "COUNTS: #{offense_counts}"
    # p '-=-=-=-=-=-=-=-=-=-=-=-=-=-=-'
    get_counts(offense_counts, schema, schema_paths)
  end

  # Parses "<count> <cop name>" lines; lines without a second token and the
  # "Total" line are skipped.
  #
  # @return [Array<Hash>] [{cop:, count:, enabled:}, ...]
  def get_counts(offense_counts, schema, schema_paths)
    offense_counts.split("\n").map do |count_data|
      off_count, cop_name = count_data.split(' ')
      next if cop_name.nil? || cop_name == 'Total'

      {cop: cop_name, count: off_count, enabled: cop_enabled_for_schema?(cop_name, schema, schema_paths)}
    end.compact
  end

  # Tells whether the cop applies to the schema according to the loaded config:
  # false when explicitly disabled, true when it has no Include list, otherwise
  # true when any Include entry matches one of the schema paths.
  def cop_enabled_for_schema?(cop, schema, schema_paths)
    config = config_file[cop] || {}
    return false if config['Enabled'] == false

    include_data = config['Include']
    # Also covers an unset 'Enabled' key, which used to fall through and call
    # detect on nil.
    return true if include_data.nil?

    schema_paths.any? { |schema_path| included_path?(include_data, schema, schema_path) }
  end

  # An Include entry matches when it starts with '**' or contains the schema
  # path, either verbatim or with the schema name replaced by '**'.
  def included_path?(include_data, schema, schema_path)
    candidates = [
      schema_path,
      schema_path.gsub(schema, '**'),
      schema_path.gsub("#{schema}/app", '**')
    ]
    include_data.any? do |pattern|
      pattern.start_with?('**') || candidates.any? { |candidate| pattern.include?(candidate) }
    end
  end
end
end
# Entry point.
#   ARGV[0] - BigQuery target, "[<scope>:]<project>.<dataset>.<table>"
#   ARGV[1] - comma-separated project names (keys of DataAnalyzer::PROJECTS)
if ARGV.length < 2
  # Without this guard a missing argument surfaced as NoMethodError on nil.
  abort 'Usage: TT_WORK_PATH=<path-to-toptal-projects> ruby ./collect_gql_data.rb ' \
        '"<scope>:<project>.<dataset>.<table>" "<project>[,<project>...]"'
end
formatted_data = DataAnalyzer::RubocopOffensesCount.call(only: ARGV[1].split(','))
# toptal.com:api-project-726361118046.GoldsmithsTeam.gql_standardization_metrics
DataAnalyzer::BigQuery.new(ARGV[0]).call(formatted_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment