Skip to content

Instantly share code, notes, and snippets.

@connorshea
Last active July 2, 2022 18:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save connorshea/b6fe08a5d1d28bf88f252a91dd184c80 to your computer and use it in GitHub Desktop.
Save connorshea/b6fe08a5d1d28bf88f252a91dd184c80 to your computer and use it in GitHub Desktop.
Proof-of-concept script that scrapes the GDQ schedule page and then looks up each game's vglist and IGDB IDs via the vglist API. These IDs would then be used to pull cover art and other data from IGDB. It doesn't match every game in the schedule, but it matches most of them automatically, which reduces the amount of manual effort involved in setting up the data for a…
require 'json'
# Report how many entries of the scraped schedule (gdq.json) were matched to
# vglist and IGDB IDs, and list the names of the games that were not.
schedule = JSON.parse(File.read('gdq.json'))
puts "Total entries: #{schedule.length}"

# Entries that are actual games (recaps, pre-show, etc. are flagged `non_game`).
games = schedule.reject { |entry| entry['non_game'] }
game_count = games.length
puts "Game-only entries: #{game_count}"

# Percentage of game entries, rounded to two decimal places.
percent_of_games = ->(count) { (count.fdiv(game_count) * 100).round(2) }

vglist_matches = schedule.count { |entry| !entry['vglist_id'].nil? }
puts "Entries with vglist IDs: #{vglist_matches} (#{percent_of_games.call(vglist_matches)}%)"

igdb_matches = schedule.count { |entry| !entry['igdb_id'].nil? }
puts "Entries with IGDB IDs: #{igdb_matches} (#{percent_of_games.call(igdb_matches)}%)"

# Print a blank line, a heading, and then one bullet per name.
print_names = ->(heading, names) do
  puts
  puts heading
  names.each { |name| puts "- #{name}" }
end

missing_vglist = games.select { |entry| entry['vglist_id'].nil? }.map { |entry| entry['name'] }
missing_igdb = games.select { |entry| entry['igdb_id'].nil? }.map { |entry| entry['name'] }

print_names.call('Games without vglist IDs:', missing_vglist)
print_names.call('Games without IGDB IDs:', missing_igdb)
# Names that lack an IGDB ID but are not in the "no vglist ID" list.
print_names.call('Games with vglist IDs and no IGDB ID:', missing_igdb - missing_vglist)
require 'bundler/inline'
gemfile do
source 'https://rubygems.org'
gem 'nokogiri'
gem 'graphql-client', '~> 0.18.0'
gem 'debug'
end
require 'json'
require 'open-uri'
require 'net/http'
require 'nokogiri'
require "graphql/client"
require "graphql/client/http"
require 'debug'
# For comparing using Levenshtein Distance.
# https://stackoverflow.com/questions/16323571/measure-the-distance-between-two-strings-with-ruby
require "rubygems/text"
# Namespace for the GraphQL client used to query the vglist API.
module VGListGraphQL
  # HTTP transport pointed at the vglist GraphQL endpoint.
  HTTP = GraphQL::Client::HTTP.new("https://vglist.co/graphql") do
    # Headers sent with each request. The credentials are read from the
    # VGLIST_EMAIL and VGLIST_TOKEN environment variables whenever this
    # method is invoked.
    def headers(context)
      email = ENV['VGLIST_EMAIL']
      token = ENV['VGLIST_TOKEN']

      {
        "User-Agent": "GDQ Schedule Parser",
        "X-User-Email": email,
        "X-User-Token": token,
        "Content-Type": "application/json",
        "Accept": "*/*"
      }
    end
  end

  # Loading the schema performs a network request as soon as this module is
  # evaluated, so the transport above must already be defined.
  Schema = GraphQL::Client.load_schema(HTTP)

  # Client that executes queries against the loaded schema over HTTP.
  Client = GraphQL::Client.new(schema: Schema, execute: HTTP)
end
# Helpers for turning raw strings scraped from the GDQ schedule page into
# structured values, and for matching schedule entries to games on vglist.
class GDQHelper
  # Searches vglist for games by name, returning up to 10 candidates with
  # their vglist ID, name and IGDB ID.
  GAME_SEARCH_QUERY = VGListGraphQL::Client.parse <<~GRAPHQL
    query($name: String!) {
    gameSearch(query: $name, first: 10) {
    nodes {
    id
    name
    igdbId
    }
    }
    }
  GRAPHQL

  # Callable computing the Levenshtein distance between two strings.
  # Built once instead of on every name comparison.
  # https://stackoverflow.com/questions/16323571/measure-the-distance-between-two-strings-with-ruby
  LEVENSHTEIN_DISTANCE = Object.new.extend(Gem::Text).method(:levenshtein_distance)

  # Substitutions applied to both names before the final equality check in
  # `games_have_same_name?`.
  NAME_REPLACEMENTS = { '&' => 'and' }.freeze

  # Given a string like '1:30:00', convert to an integer for seconds.
  # Returns 0 when the string does not have exactly three ':'-separated parts.
  def self.estimate_to_seconds(estimate)
    parts = estimate.split(':').map(&:to_i)
    return 0 if parts.length != 3

    hours, minutes, seconds = parts
    (hours * 60 * 60) + (minutes * 60) + seconds
  end

  # Given a string like 'Foo, Bar', return an array of `['Foo', 'Bar']`.
  def self.parse_runners(runners)
    runners.split(',').map(&:strip)
  end

  # Split a 'Category — Platform' string into a `[category, platform]` pair.
  #
  # The platform is whatever follows the last em dash; everything before it
  # is the category, so 'Any% — Co-op — PC' yields `['Any% — Co-op', 'PC']`.
  # When there is no em dash at all, the single value is returned as both
  # category and platform. Returns nil for nil input or when the string
  # contains nothing to split.
  def self.parse_category_and_platform(string)
    return nil if string.nil?

    *category_parts, platform = string.split('—')
    # `split` returns an empty array for '' (and for a lone separator).
    return nil if platform.nil?

    category = category_parts.empty? ? platform : category_parts.join('—')
    [category.strip, platform.strip]
  end

  # Scrub the name to remove the 'BONUS GAME -' / 'BONUS GAME 1 -' marker.
  # The number is optional so that every name `bonus_game?` recognises
  # (with or without a number) is scrubbed.
  def self.name_scrubber(name)
    name.gsub(/BONUS GAME(?: \d+)? -/i, '').strip
  end

  # Whether this is a bonus game.
  def self.bonus_game?(name)
    name.downcase.start_with?('bonus game')
  end

  # Whether this is a non-game entry in the schedule (recaps, pre-show, finale).
  def self.non_game?(name)
    name.start_with?('Daily Recap') || ['Pre-Show', 'Finale', 'Event Recap'].include?(name)
  end

  # Look the game up on vglist and return `[vglist_id, igdb_id]`.
  # Both values are nil when no unambiguous match was found.
  def self.get_vglist_and_igdb_id(name)
    vglist_game = vglist_game_query(name)
    [vglist_game&.id&.to_i, vglist_game&.igdb_id]
  end

  # Search vglist for `name` and return the single matching game node, or
  # nil when there is no match or the match is ambiguous.
  def self.vglist_game_query(name)
    response = VGListGraphQL::Client.query(GAME_SEARCH_QUERY, variables: { name: name })
    game_nodes = response.data.game_search.nodes

    nodes_with_exact_name = game_nodes.select { |node| node.name == name }
    # "Shadow of the Colossus" is the name of two different games due to the remaster :')
    # So we just return nil if there are multiple games with the same exact name,
    # rather than potentially choosing the wrong one :|
    return nil if nodes_with_exact_name.length > 1
    # If there's just one game that has the exact name we want, return it.
    return nodes_with_exact_name.first if nodes_with_exact_name.length == 1

    # Otherwise fall back to name-closeness, again only accepting a single
    # unambiguous candidate.
    games_with_similar_name = game_nodes.select { |node| games_have_same_name?(name, node.name) }
    games_with_similar_name.length == 1 ? games_with_similar_name.first : nil
  end

  # Whether two game names should be treated as the same game. Names match
  # when, ignoring case, they are equal, are within a Levenshtein distance
  # of 2, or are equal after applying NAME_REPLACEMENTS.
  def self.games_have_same_name?(name1, name2)
    name1 = name1.downcase
    name2 = name2.downcase
    return true if name1 == name2
    return true if LEVENSHTEIN_DISTANCE.call(name1, name2) <= 2

    normalized1, normalized2 = [name1, name2].map do |candidate|
      NAME_REPLACEMENTS.reduce(candidate) { |result, (before, after)| result.gsub(before, after).strip }
    end
    normalized1 == normalized2
  end
end
# Schedule pages that can be parsed: archived snapshots of past events plus
# the live schedule.
SCHEDULE_URLS = {
  agdq2018: 'https://web.archive.org/web/20171202003955/https://gamesdonequick.com/schedule',
  sgdq2018: 'https://web.archive.org/web/20180428144327/https://gamesdonequick.com/schedule',
  agdq2019: 'https://web.archive.org/web/20190104080309/https://gamesdonequick.com/schedule',
  sgdq2019: 'https://web.archive.org/web/20190531022612/https://gamesdonequick.com/schedule',
  agdq2020: 'https://web.archive.org/web/20200121063630/https://gamesdonequick.com/schedule',
  sgdq2020: 'https://web.archive.org/web/20200810014929/https://gamesdonequick.com/schedule',
  agdq2021: 'https://web.archive.org/web/20210107025302/https://gamesdonequick.com/schedule',
  sgdq2021: 'https://web.archive.org/web/20210528184033/https://gamesdonequick.com/schedule',
  agdq2022: 'https://web.archive.org/web/20220106230609/https://gamesdonequick.com/schedule',
  current: 'https://gamesdonequick.com/schedule'
}.freeze
GDQ_SCHEDULE_URL = SCHEDULE_URLS[:current]
# GDQ_SCHEDULE_URL = SCHEDULE_URLS[:sgdq2018]

response = Net::HTTP.get_response(URI.parse(GDQ_SCHEDULE_URL))
# Stop here on an error or redirect response instead of parsing its body,
# which would otherwise produce an empty gdq.json without any warning.
abort "Failed to fetch #{GDQ_SCHEDULE_URL}: HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)

doc = Nokogiri::HTML(response.body)

# Each run is a main row followed by a `.second-row`. The two lists are
# paired up by index below, so this relies on them lining up one-to-one.
rows = doc.css('#runTable tbody tr:not(.second-row)')
second_rows = doc.css('#runTable tbody tr.second-row')

gdq_games_list = rows.each_with_index.map do |row, i|
  raw_name = row.children[3].children[0].to_s
  # `second_row` can be nil, which can happen for older GDQ schedule pages
  # where the Finale has no second row.
  second_row = second_rows[i]

  # Every key is always present (nil when unknown) so that all entries in
  # the JSON output share the same shape.
  game = {
    name: GDQHelper.name_scrubber(raw_name),
    bonus_game: GDQHelper.bonus_game?(raw_name),
    non_game: GDQHelper.non_game?(raw_name),
    category: nil,
    platform: nil,
    vglist_id: nil,
    igdb_id: nil,
    estimate: nil,
    commentator: nil,
    runners: GDQHelper.parse_runners(row.children[5].children[0].to_s)
  }

  if second_row
    game[:category], game[:platform] =
      GDQHelper.parse_category_and_platform(second_row.children[3].children[0].to_s)
    game[:estimate] = GDQHelper.estimate_to_seconds(second_row.children[1].children[2].to_s.strip)
    game[:commentator] = second_row.children[5].children[1].to_s.strip
  end

  # Add vglist_id and igdb_id unless this is a non-game entry, no reason to
  # waste API requests on non-game entries.
  unless game[:non_game]
    game[:vglist_id], game[:igdb_id] = GDQHelper.get_vglist_and_igdb_id(game[:name])
    # Sleep for 1 second after each lookup because we don't want to spam
    # the vglist API. Entries that made no request don't need the pause.
    sleep 1
  end

  game
end

File.write(File.join(File.dirname(__FILE__), 'gdq.json'), JSON.pretty_generate(gdq_games_list))
puts 'Written to file.'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment