Last active
January 10, 2019 05:33
-
-
Save connorshea/943ce81e8ff21abe6db7fbeaf38c3b69 to your computer and use it in GitHub Desktop.
Run with `WIKIDATA_USERNAME=username WIKIDATA_PASSWORD=password ruby pcgw_to_wikidata.rb`. You'll need a CSV file named `pcgw_steam_ids.csv`, placed next to the script, that lists every PCGW article together with its Steam App ID.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# gem install sparql
# gem install mediawiki_api-wikidata
# http://www.rubydoc.info/github/ruby-rdf/sparql/frames
require 'csv'
require 'json'
require 'open-uri'

require 'mediawiki_api'
require 'mediawiki_api/wikidata/wikidata_client'
require 'sparql/client'
# Characters that must be backslash-escaped inside a double-quoted SPARQL
# string literal, mapped to their escaped form.
SPARQL_STRING_ESCAPES = {
  '\\' => '\\\\',
  '"' => '\\"',
  "\n" => '\\n',
  "\r" => '\\r'
}.freeze

# Builds the SPARQL query that finds Wikidata items by Steam App ID.
#
# The ID is matched as a string literal against property P1733, and the
# label service is asked for each item's label. At most 10 rows are requested.
#
# @param steam_app_id [#to_s] the Steam App ID to search for
# @return [String] the SPARQL query text
def query(steam_app_id)
  # Escape the value so a stray quote or backslash in the input cannot
  # terminate the string literal and change the shape of the query.
  app_id = steam_app_id.to_s.gsub(/[\\"\n\r]/, SPARQL_STRING_ESCAPES)

  <<~SPARQL
    SELECT ?item ?itemLabel WHERE {
      ?item wdt:P1733 "#{app_id}".
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
    LIMIT 10
  SPARQL
end
# Finds the Wikidata item that carries the given Steam App ID.
#
# Runs the query built by +query+ against the Wikidata SPARQL endpoint and
# accepts the result only when it is unambiguous.
#
# @param app_id [#to_s] the Steam App ID to look up
# @return [Hash{Symbol => String}, nil] a hash with +:url+ (the item value
#   from the result row) and +:title+ (its label), or nil when the query
#   returns zero rows or more than one row
def find_wikidata_item_by_steam_app_id(app_id)
  endpoint = "https://query.wikidata.org/sparql"
  client = SPARQL::Client.new(endpoint, method: :get)
  rows = client.query(query(app_id))

  # Zero rows means no match; several rows means the match is ambiguous.
  # Either way there is nothing safe to act on, so skip it.
  return nil if rows.size != 1

  solution = rows.first.to_h
  { url: solution[:item].to_s, title: solution[:itemLabel].to_s }
end
# Longest PCGW ID that is accepted. Longer IDs are skipped as a hack to get
# around a weird issue where some game titles have really screwy encoding
# problems.
MAX_PCGW_ID_LENGTH = 40

pcgw_steam_ids = []

# Go through the CSV and create a hash for each PCGW item and its Steam App ID.
# The CSV is in a format like this:
#   Half-Life,70
#   Half-Life_2,220
#   Half-Life_2:_Deathmatch,320
#   Half-Life_2:_Episode_One,380
#   Half-Life_2:_Episode_Two,420
#   Half-Life_2:_Lost_Coast,340
#   Half-Life_Deathmatch:_Source,360
#   Half-Life:_Blue_Shift,130
#   Half-Life:_Opposing_Force,50
#   Half-Life:_Source,280
CSV.foreach(
  File.join(__dir__, 'pcgw_steam_ids.csv'),
  skip_blanks: true,
  headers: false,
  encoding: 'ISO-8859-1'
) do |row|
  pcgw_id, steam_app_id = row

  # skip_blanks only drops completely empty lines, so a row can still have a
  # missing field (e.g. ",70" or "Half-Life,"). Such rows cannot be matched.
  next if pcgw_id.to_s.strip.empty? || steam_app_id.to_s.strip.empty?
  next if pcgw_id.length > MAX_PCGW_ID_LENGTH

  pcgw_steam_ids << {
    pcgw_id: pcgw_id,
    steam_app_id: steam_app_id
  }
end
# Authenticate with Wikidata.
# ENV.fetch raises KeyError naming the missing variable, instead of passing
# nil credentials on to log_in.
wikidata_client = MediawikiApi::Wikidata::WikidataClient.new "https://www.wikidata.org/w/api.php"
wikidata_client.log_in ENV.fetch("WIKIDATA_USERNAME"), ENV.fetch("WIKIDATA_PASSWORD")
# For every PCGW item created from the CSV, find the respective Wikidata item
# via the Steam App ID, and add the PCGW ID (property P6337) to it when the
# PCGW title matches the Wikidata label exactly and no P6337 claim exists yet.
pcgw_steam_ids.each do |game|
  # Get the Wikidata item for the current game's Steam App ID.
  wikidata_item = find_wikidata_item_by_steam_app_id(game[:steam_app_id])

  # If no Wikidata item is returned, skip this PCGW item.
  next if wikidata_item.nil?

  # Replace the underscores in the PCGW ID with spaces to get as close as
  # possible to the normal name.
  game[:title] = game[:pcgw_id].tr('_', ' ')

  if game[:title] == wikidata_item[:title]
    wikidata_id = wikidata_item[:url].sub('http://www.wikidata.org/entity/', '')
    puts "Wikidata Item ID: #{wikidata_id}, game[:pcgw_id]: #{game[:pcgw_id]}"

    # Check if the property already exists, and leave the item alone if it does.
    # URI.open (rather than Kernel#open) is required on Ruby 3.0+, and the
    # block form closes the response stream once it has been read.
    claims_url = "https://www.wikidata.org/w/api.php?action=wbgetclaims&entity=#{wikidata_id}&property=P6337&format=json"
    claims = JSON.parse(URI.open(claims_url, &:read))

    # Anything other than an empty claims object (including a response with
    # no "claims" key at all) is treated as "already present".
    if claims["claims"] != {}
      puts "This already has a PCGW ID"
    else
      puts "This doesn't have a PCGW ID yet"
      wikidata_client.create_claim wikidata_id, "value", "P6337", "\"#{game[:pcgw_id]}\""
      puts "Updated #{game[:title]}: #{wikidata_item[:url]}"
    end
  else
    puts "#{game[:title]} does not equal #{wikidata_item[:title]}"
  end

  # Sleep for 1 second to ensure we don't get rate limited. This also runs
  # after the "already has a PCGW ID" path, which has just called the API.
  sleep(1)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment