Skip to content

Instantly share code, notes, and snippets.

@Facenapalm
Created January 8, 2022 03:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Facenapalm/97a1546b4305a6c3ee4a95f08a2fc86f to your computer and use it in GitHub Desktop.
Save Facenapalm/97a1546b4305a6c3ee4a95f08a2fc86f to your computer and use it in GitHub Desktop.
Metacritic video game list grabber for use in Wikipedias
# Copyright (c) 2021 Facenapalm
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import urllib.request
import re
import time
import sys
def download_list(pagenum):
    """Download one Metacritic list page and store it as ``<pagenum>.txt``.

    The request is sent with browser-like headers. A failed attempt is retried
    after a 5-second pause, up to 10 attempts in total. The loop stops at the
    first successful download, so a page is fetched only once when nothing
    goes wrong.

    Args:
        pagenum: Page number; used as the ``page`` query parameter and as the
            base name of the output file.

    Raises:
        Exception: Whatever the last failed attempt raised, once all attempts
            are used up.
    """
    attempts = 10
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }
    # Alternative source (all-time list instead of a single year):
    # url = "https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?page={}".format(pagenum)
    url = "https://www.metacritic.com/browse/games/score/metascore/year/all/filtered?year_selected=2021&page={}".format(pagenum)
    request = urllib.request.Request(url, None, headers)
    for attempt_no in range(attempts):
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode('utf-8')
        except Exception:
            if attempt_no == attempts - 1:
                # Out of retries: propagate the error with its traceback.
                raise
            print("Try to redownload...")
            time.sleep(5)
        else:
            # Success: do not request the same page again.
            break
    with open("{}.txt".format(pagenum), "w", encoding="utf-8") as output:
        output.write(html)
def grab(start, end):
    """Download the list pages numbered ``start`` up to ``end - 1``.

    After each page its number is printed as a progress marker and the
    function pauses before moving on to the next request.
    """
    pause_seconds = 3
    for page in range(start, end):
        download_list(page)
        print(page)
        time.sleep(pause_seconds)
def unify_platform(platform):
    """Return the short label for a platform name.

    ``"PlayStation <suffix>"`` becomes ``"PS<suffix>"``, ``"Xbox One"`` becomes
    ``"XOne"`` and ``"Xbox Series X"`` becomes ``"XSX"``. Any other name is
    returned unchanged.
    """
    playstation_prefix = "PlayStation "
    if platform.startswith(playstation_prefix):
        return "PS" + platform[len(playstation_prefix):]
    xbox_labels = {"Xbox One": "XOne", "Xbox Series X": "XSX"}
    return xbox_labels.get(platform, platform)
def parse(start, end):
    """Extract game entries from downloaded pages into metacritic_games.txt.

    Reads ``<pagenum>.txt`` for every page number from ``start`` up to
    ``end - 1`` and writes one line per game in the form
    ``title|year|platform|score|url``. A game is skipped when either its
    title as found in the page or its normalized title is listed in
    ``existing_games.txt`` (one title per line).

    Args:
        start: First page number to parse.
        end: Page number to stop at (exclusive).
    """
    with open("existing_games.txt", encoding="utf-8") as existing_file:
        existing = {line.strip() for line in existing_file}
    regexp = re.compile(
        r"<div class=\"metascore_w[^>]+>(\d{2})</div>\s+</a>\s+</div>\s+<span class=\"title numbered\">\s+\d+\.\s+</span>\s+<a href=\"([^\"]+)[^>]+><h3>(.*?)</h3></a>\s+<div class=\"clamp-details\">\s+<div class=\"platform\">\s+<span class=\"label\">Platform:</span>\s+<span class=\"data\">\s+(.*?)\s+</span>\s+</div>\s+<span>.*?(\d{4})</span>\s+</div>"
    )
    # 1 - score, 2 - url, 3 - title, 4 - platform, 5 - year
    lines = []
    for pagenum in range(start, end):
        with open("{}.txt".format(pagenum), "r", encoding="utf-8") as page:
            html = page.read()
        for match in regexp.finditer(html):
            name = match.group(3)
            name = name.replace("'", "’")
            name = re.sub(r" [-–] ", "—", name)
            # Lower-case the word but keep the surrounding spaces so the
            # neighbouring words are not glued together.
            name = name.replace(" Of ", " of ")
            if (match.group(3) not in existing) and (name not in existing):
                lines.append("{}|{}|{}|{}|https://www.metacritic.com{}\n".format(name, match.group(5), unify_platform(match.group(4)), match.group(1), match.group(2)))
        # Progress marker: one number per processed page.
        print(pagenum)
    with open("metacritic_games.txt", "w", encoding="utf-8") as output:
        output.write("".join(lines))
def combine():
    """Turn metacritic_games.txt into a wikitext list stored in result.txt.

    Each input line has the form ``title|year|platform|score|url``. Lines that
    share a title are merged into one list item that keeps the year of the
    first occurrence and lists every ``platform: [url score]`` pair. Items are
    sorted case-insensitively and joined with newlines.

    Blank lines are ignored. Fields are split from the right, so a ``|``
    inside a title does not shift the other fields.

    Raises:
        ValueError: If a non-blank line has fewer than five fields.
    """
    data = {}
    with open("metacritic_games.txt", encoding="utf-8") as source:
        for line in source:
            line = line.strip()
            if not line:
                continue
            title, year, platform, score, url = line.rsplit("|", 4)
            info = "{}: [{} {}]".format(platform, url, score)
            if title in data:
                data[title][1].append(info)
            else:
                data[title] = (year, [info])
    result = [
        "* [[{}]] ({}; {})".format(title, year, ", ".join(infos))
        for title, (year, infos) in data.items()
    ]
    result.sort(key=str.lower)
    with open("result.txt", "w", encoding="utf-8") as output:
        output.write("\n".join(result))
def _print_usage():
    """Print the command-line help text."""
    print("Usage:")
    print(" python metacriticgrabber.py [grab|parse|combine] starting_page end_page")
    print("Pages from starting_page up to (but not including) end_page are processed.")
    print("Example 1:")
    print(" python metacriticgrabber.py 0 10")
    print("Example 2 (verbose variation of Example 1):")
    print(" python metacriticgrabber.py grab 0 10")
    print(" python metacriticgrabber.py parse 0 10")
    print(" python metacriticgrabber.py combine")
    print()
    print("Make sure the file existing_games.txt exists and contains a list of video game titles to ignore,")
    print("one title per file line.")
    print()
    print("Scripts works in three stages:")
    print("1. Grabbing: downloads lists of best games from Metacritic website and stores them in %d.txt (0.txt, 1.txt, and so")
    print(" forth) files in html format.")
    print("2. Parsing: extracts game titles from html files and stores them in pipe-separated list metacritic_games.txt in")
    print(" following format: game_title|year|platform|score|relative_url")
    print("3. Combining: processes games from metacritic_games.txt and stores them as Wikitext list at result.txt.")


def main():
    """Run the stage selected on the command line.

    Accepted forms of ``sys.argv[1:]``:

    * ``grab START END`` - download pages only.
    * ``parse START END`` - parse previously downloaded pages only.
    * ``combine`` - build result.txt only; takes no page numbers.
    * ``START END`` - run all three stages in order.

    When the page numbers are missing or are not integers, the help text is
    printed and the function returns without doing any work.
    """
    args = sys.argv[1:]
    command = None
    if args and args[0] in ("grab", "parse", "combine"):
        command, args = args[0], args[1:]

    if command == "combine":
        # Combining reads metacritic_games.txt only, so no page range is needed.
        combine()
        return

    try:
        start, end = int(args[0]), int(args[1])
    except (IndexError, ValueError):
        # Missing or non-numeric page numbers: show help instead of a traceback.
        _print_usage()
        return

    if command == "grab":
        grab(start, end)
    elif command == "parse":
        parse(start, end)
    else:
        print("Grabbing...")
        grab(start, end)
        print("Parsing...")
        parse(start, end)
        print("Combining...")
        combine()


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment