Last active
October 30, 2020 16:25
-
-
Save FreaKzero/70be69c9aa90fdecf6262be4f308a952 to your computer and use it in GitHub Desktop.
Python 3 script to sanitize categories and player strings in Skyscraper cache XMLs - especially needed for OpenRetro data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET | |
import re | |
import os | |
import sys | |
import argparse | |
# The script relies on Python 3 behaviour; refuse to run on anything older.
if sys.version_info[0] < 3:
    print("Must be using Python 3")
    sys.exit(1)

# Short, friendly message when the script is started without any argument.
if len(sys.argv) < 2:
    print("Please provide a path")
    sys.exit(1)

parser = argparse.ArgumentParser()
parser.add_argument("path", help="Path")
# nargs="?" + const=True lets a bare "--dry" act as a switch, while the older
# "--dry <anything>" form keeps working. Place it after the path: argparse
# would otherwise take the path as the value of --dry.
parser.add_argument("--dry", nargs="?", const=True, default=None,
                    help="Dry run without writing")
parser.add_argument("--player", help="DEFAULT or MAXPLAYER")
parser.add_argument("--nocat", help="String for categories which cant be resolved")
args = parser.parse_args()

# True => only print what would be written/removed, never touch any file.
DRY_TEST = bool(args.dry)

# MAXPLAYER => keep only the maximum player number
# Everything else => n-n (example: 1-12)
PLAYERFORMAT = args.player if args.player else "DEFAULT"

# None => use the first grabbed tag as category
# String => use the given string as category
NO_CAT_STRING = args.nocat if args.nocat else None
# Run statistics, filled while cleaning and printed by outputlog():
#   "nocat"   - one formatted report string per game that needed a fallback category
#   "players" - dicts of the form {"players": <string>, "count": <int>}
LOGGER = {section: [] for section in ("nocat", "players")}
# Category priority list. When a game carries several known tags, the one that
# appears earliest here becomes its category: the first item is the most
# important, the last item the least important.
defined = """
    disney educational sports shooter rpg puzzle
    shootemup racing beatemup cards quiz topdown
    strategy platform adventure reaction arcade simulation
    action maze pinball boardgame movie creativity
""".split()
# Alias table: maps user typos and near-synonyms to the canonical category
# name. Grouped by target category for readability; the resulting dict is
# keyed by alias.
rep = {
    alias: canonical
    for canonical, aliases in (
        ("beatemup", ("fighting", "fighter", "fight", "beatempup")),
        ("racing", ("car",)),
        ("adventure", ("actionadventure", "actionadvenure")),
        ("platform", ("jumper", "jumponthings")),
        ("rpg", ("wanderer",)),
        ("puzzle", ("puzzlesolve",)),
        ("cards", ("blackjack",)),
    )
    for alias in aliases
}
def writeFile(file):
    """Back up ``file`` as ``file + ".orig"`` and write the cleaned XML in its place.

    The XML that gets written is the module-level ``root`` element, i.e. the
    tree of the database currently being processed by the main loop.

    When ``DRY_TEST`` is set nothing on disk is touched; the intended write
    is only printed. Failures are reported on stdout rather than raised, so
    one unwritable database does not abort the whole run.
    """
    if DRY_TEST:
        print("[DRY] Write: {}".format(file))
        return

    backup = file + ".orig"
    moved = False
    try:
        # Serialize first: if this fails the original file is still untouched.
        xml_text = ET.tostring(root, encoding='unicode')
        os.rename(file, backup)
        moved = True
        # Explicit UTF-8 so the output does not depend on the locale encoding;
        # the context manager closes the handle even when write() fails.
        with open(file, "w", encoding="utf-8") as f:
            f.write(xml_text)
        print("Writing {}".format(file))
    except Exception as err:  # best-effort tool: report and carry on
        print("Cant write File {} ({})".format(file, err))
        if moved:
            # Put the original back so the database is not left missing or
            # half-written.
            try:
                os.replace(backup, file)
            except OSError as restore_err:
                print("Cant restore {} from {} ({})".format(file, backup, restore_err))
def outputlog(what):
    """Print one section of the run summary collected in ``LOGGER``.

    ``what == 'nocat'`` prints the games for which a fallback category had to
    be used; nothing is printed when there are none. Any other value prints
    the player-string statistics (each distinct string and how often it
    occurred).
    """
    if what == 'nocat':
        fallbacks = LOGGER["nocat"]
        # An empty list must print nothing here. Falling through to the
        # player statistics would print them twice per run.
        if fallbacks:
            print("* Used Fallback Categories for {} Games".format(len(fallbacks)))
            print("\n".join(fallbacks))
        return

    print("* Player sanitization statistics")
    for item in LOGGER['players']:
        print("{}\tx\t{}".format(item["players"], item["count"]))
def findFile(dir_path, search):
    """Return the paths of all files named exactly ``search`` below ``dir_path``.

    The tree is walked recursively with ``os.walk``; each hit is reported as
    ``<containing directory>/<file name>``.
    """
    return [
        folder + '/' + str(name)
        for folder, _subdirs, names in os.walk(dir_path)
        for name in names
        if name == search
    ]
def findDict(lst, key, value):
    """Return the index of the first dict in ``lst`` whose ``key`` equals ``value``.

    Returns -1 when no entry matches.
    """
    matches = (pos for pos, entry in enumerate(lst) if entry[key] == value)
    return next(matches, -1)
def catsort(str):
    """Sort key for a tag: its position in the reversed ``defined`` list.

    More important categories (earlier in ``defined``) get a larger number;
    tags that are not in ``defined`` get -1.
    """
    try:
        return defined[::-1].index(str)
    except ValueError:
        # Not a known category.
        return -1
def replacetag(str):
    """Return the canonical name for a tag listed in ``rep``, else the tag unchanged."""
    return rep.get(str, str)
def _resourcetext(root, resource_id, resource_type):
    """Return the text of the matching ``<resource>`` under ``root``, or "unknown"."""
    node = root.find("./resource[@id='{}'][@type='{}']".format(resource_id, resource_type))
    if node is None or node.text is None:
        return "unknown"
    return node.text


def tagclean(child, root):
    """Reduce the ", "-separated tag list in ``child.text`` to one category.

    Each tag is first normalised through ``replacetag``. If any resulting tag
    is listed in ``defined``, the one with the highest priority is returned.

    Otherwise a fallback is used: ``NO_CAT_STRING`` when it is a string, else
    the first tag in its original order. The fallback is recorded in
    ``LOGGER["nocat"]`` together with the game's title and platform, which are
    looked up in ``root`` via the ``id`` attribute of ``child``.

    Returns the category string to store back into ``child.text``.
    """
    raw = child.text or ""
    # dict.fromkeys removes duplicates but keeps the original tag order; a set
    # would make "the first grabbed tag" depend on hash ordering.
    tags = list(dict.fromkeys(replacetag(tag) for tag in raw.split(', ')))

    known = [tag for tag in tags if tag in defined]
    if known:
        return max(known, key=catsort)

    write = NO_CAT_STRING if isinstance(NO_CAT_STRING, str) else tags[0]

    tpl = "[T]: {}\n[P]: {}\n[O]: {}\n[W]: {} \n"
    resource_id = child.attrib.get("id")
    game = _resourcetext(root, resource_id, "title")
    platform = _resourcetext(root, resource_id, "platform")
    LOGGER["nocat"].append(tpl.format(game, platform, child.text, write))
    return write
def playerclean(child):
    """Normalise the player-count string in ``child.text`` and record it.

    Steps:
      1. Cut a parenthesised remainder ("1-4 (simultaneous)" -> "1-4") and
         remove all spaces.
      2. With ``PLAYERFORMAT == "MAXPLAYER"`` reduce a range "n-m" to "m".
      3. A value without "-" that is longer than two characters is treated as
         unusable and replaced by "1".

    The result is counted in ``LOGGER["players"]`` and returned.

    NOTE(review): the source lost its indentation; step 3 is assumed to apply
    to every PLAYERFORMAT, not only MAXPLAYER - confirm against the original.
    """
    text = child.text or ""
    # Raw strings: "\s" and "\d" in a normal literal are invalid escapes.
    cleaned = re.sub(r"\s*\(.*.?\s*", "", text).replace(" ", "")
    plstr = cleaned

    if PLAYERFORMAT == "MAXPLAYER":
        # \d+ so that ranges with a multi-digit minimum ("10-12") also match.
        match = re.match(r"\d+-(\d{1,3})", plstr)
        if match is not None:
            plstr = match[1]

    if '-' not in cleaned and len(cleaned) > 2:
        plstr = "1"

    found = findDict(LOGGER["players"], "players", plstr)
    if found > -1:
        LOGGER["players"][found]["count"] += 1
    else:
        LOGGER["players"].append({"players": plstr, "count": 1})
    return plstr
# --- Main script: locate databases, handle old backups, clean, report -------
databases = findFile(args.path, "db.xml")
backups = findFile(args.path, "db.xml.orig")

if backups:
    answer = input("Delete current Backups and reclean ? (y/n)")
    # Removing backups is destructive, so only an explicit yes continues;
    # a typo, "N" or just pressing Enter aborts instead of deleting.
    if answer.strip().lower() not in ("y", "yes"):
        sys.exit()
    for backdb in backups:
        print("Removing Backup: {}".format(backdb))
        if DRY_TEST:
            print("[DRY] Remove {}".format(backdb))
        else:
            os.remove(backdb)

if not databases:
    print("No Databases found on given path")
    sys.exit()

for db in databases:
    print("Reading/Cleaning {}".format(db))
    try:
        # writeFile() serializes this module-level `root`.
        root = ET.parse(db).getroot()
    except ET.ParseError as err:
        # One malformed database should not stop the remaining ones.
        print("Skipping {}: not well-formed XML ({})".format(db, err))
        continue
    CURRENTFOLDER = os.path.split(os.path.dirname(db))[1]
    for child in root:
        # .get(): elements without a "type" attribute are simply left alone.
        kind = child.attrib.get("type")
        if kind == "tags":
            child.text = tagclean(child, root)
        elif kind == "players":
            child.text = playerclean(child)
    writeFile(db)

outputlog('nocat')
outputlog('players')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment