Skip to content

Instantly share code, notes, and snippets.

@nicolalamacchia
Last active March 13, 2023 22:20
Show Gist options
  • Save nicolalamacchia/967b10fab53544d1422249f5506bd3ed to your computer and use it in GitHub Desktop.
Save nicolalamacchia/967b10fab53544d1422249f5506bd3ed to your computer and use it in GitHub Desktop.
A script to clean up ROM files from ROM collections
#!/usr/bin/python
# based on https://github.com/grayaii/rom_cleaner
# NOTE: I am in the process of rewriting this to make it more readable,
# testable, optimized. Right now I just added features and tuned
# existing ones mostly on top of the old code.
import re
import os
import argparse
from functools import reduce
from copy import deepcopy
# NOTE: this `WEIGHTS` list expresses preferences for:
# * Italian
# * English
# * European versions
# http://www.theisozone.com/tutorials/other/general/know-your-roms-a-guide-to-identifying-the-symbols/
#
# Every rule is a dict with a "weight" (added to the score of a ROM whose
# filename carries the tag), a human readable "description", and exactly one
# of:
# * "token": a literal tag, brackets included, compared against a whole
#   bracketed tag extracted from the filename;
# * "token_re": a regex source string; it is compiled with `re.I` by
#   `get_compiled_rule` and matched against the text inside the brackets.
# Order matters: lookups scan this list from the top and use the first rule
# that matches.
# NOTE(review): a few rules also carry "keep": True -- confirm that the
# lookup code actually reads this key.
WEIGHTS = [
# --- dump / modification flags ---
{"token": "[a]", "weight": 9, "description": "Alternate"},
{"token": "[b]", "weight": -99, "description": "Bad Dump"},
{"token": "[BF]", "weight": 7, "description": "Bug Fix"},
{"token": "[c]", "weight": 8, "description": "Cracked"},
{"token": "[f]", "weight": 6, "description": "Other Fix"},
{"token": "[h]", "weight": 5, "description": "Hack"},
{"token": "[o]", "weight": 10, "description": "Overdump"},
{"token": "[p]", "weight": 4, "description": "Pirate"},
{"token": "[t]", "weight": 3, "description": "Trained"},
{"token": "[T]", "weight": 2, "description": "Translation"},
{"token": "(Unl)", "weight": 1, "description": "Unlicensed"},
{"token": "(early)", "weight": 2, "description": "Early release"},
# --- version / revision markers (regex rules) ---
{
"token_re": "v\\s?[0-9]+.*",
"weight": 2,
"description": "Specific version",
},
{
"token_re": "Alt [0-9]+",
"weight": 2,
"description": "Alternative version",
},
{"token_re": "Rev [0-9]+", "weight": 2, "description": "Revision"},
{"token_re": "Debug.*", "weight": 1, "description": "Debug version"},
# --- release type and dump quality ---
{"token": "(Kiosk)", "weight": 2, "description": "Kiosk version"},
{"token": "(Demo)", "weight": -99, "description": "Demo"},
{"token": "(Beta)", "weight": -99, "description": "Beta"},
{"token": "[x]", "weight": -99, "description": "Bad Checksum"},
{"token": "[!]", "weight": 100, "description": "Verified Good Dump"},
# --- regions and languages ---
{"token": "(a)", "weight": 55, "description": "Australian"},
{"token": "(Aus)", "weight": 55, "description": "Australian"},
{"token": "(Australia)", "weight": 55, "description": "Australian"},
{"token": "(Brazil)", "weight": 0, "description": "Brazilian"},
{"token": "(C)", "weight": 0, "description": "Chinese"},
{"token": "(China)", "weight": 0, "description": "Chinese"},
{"token": "(E)", "weight": 100, "description": "Europe"},
{"token": "(EU)", "weight": 100, "description": "Europe"},
{"token": "(Europe)", "weight": 100, "description": "Europe"},
{"token": "(F)", "weight": 0, "description": "French"},
{"token": "(Fr)", "weight": 0, "description": "French"},
{"token": "(France)", "weight": 0, "description": "French"},
{"token": "(FN)", "weight": 0, "description": "Finland"},
{"token": "(G)", "weight": 0, "description": "German"},
{"token": "(De)", "weight": 0, "description": "German"},
{"token": "(Germany)", "weight": 0, "description": "German"},
{"token": "(GR)", "weight": 0, "description": "Greece"},
{"token": "(HK)", "weight": 0, "description": "Hong Kong"},
{"token": "(Taiwan)", "weight": 0, "description": "Taiwan"},
# Multi-language tags that include Italian, e.g. "It+En" / "En+It".
{
"token_re": r"It\+[A-Z][a-z]",
"weight": 1,
"description": "Italian",
},
{"token_re": r"[A-Z][a-z]\+It", "weight": 1, "description": "Italian"},
{"token": "(I)", "keep": True, "weight": 90, "description": "Italian"},
{"token": "(It)", "weight": 90, "description": "Italian"},
{"token": "(Italy)", "keep": True, "weight": 90, "description": "Italian"},
{"token": "(J)", "weight": -10, "description": "Japan"},
{"token": "(Ja)", "weight": 0, "description": "Japan"},
{"token": "(Japan)", "weight": -10, "description": "Japan"},
{"token": "(K)", "weight": 0, "description": "Korean"},
{"token": "(Korea)", "weight": 0, "description": "Korean"},
{"token": "(PD)", "weight": 80, "description": "Public Domain"},
{"token": "(S)", "weight": 0, "description": "Spanish"},
{"token": "(Es)", "weight": 0, "description": "Spanish"},
{"token": "(Spain)", "weight": 0, "description": "Spanish"},
{"token": "(Sweden)", "weight": 0, "description": "Sweden"},
{"token": "(SW)", "weight": 0, "description": "Sweden"},
{"token": "(NL)", "weight": 0, "description": "Dutch"},
{"token": "(Nl)", "weight": 0, "description": "Dutch"},
{"token": "(Netherlands)", "weight": 0, "description": "Dutch"},
{"token": "(U)", "weight": 65, "description": "USA"},
{"token": "(USA)", "weight": 65, "description": "USA"},
# Multi-language tags that include English, e.g. "En+Fr" / "Fr+En".
{"token_re": r"En\+[A-Z][a-z]", "weight": 1, "description": "English"},
{"token_re": r"[A-Z][a-z]\+En", "weight": 1, "description": "English"},
{"token": "(En)", "weight": 75, "description": "English"},
{"token": "(UK)", "weight": 70, "description": "England"},
{"token": "(World)", "weight": 100, "description": "International"},
{"token": "(Worlds)", "weight": 100, "description": "International"},
{"token": "(Unk)", "weight": 0, "description": "Unknown Country"},
{"token": "(Proto)", "weight": 10, "description": "Prototype"},
{"token": "(Promo)", "weight": 10, "description": "Promo"},
{"token": "(Sample)", "weight": 0, "description": "Sample"},
# NOTE(review): the next five tags look like two-letter language codes
# (presumably Swedish, Norwegian, Danish, Portuguese, Finnish) -- confirm
# before relying on the "Unknown" descriptions.
{"token": "(Sv)", "weight": 0, "description": "Unknown"},
{"token": "(No)", "weight": 0, "description": "Unknown"},
{"token": "(Da)", "weight": 0, "description": "Unknown"},
{"token": "(Pt)", "weight": 0, "description": "Unknown"},
{"token": "(Fi)", "weight": 0, "description": "Unknown"},
{"token": "(-)", "weight": 0, "description": "Unknown Country"},
# {"token": "(Sachen-USA)", "weight":10, "description": ""},
# {"token": "(Sachen-English)", "weight": 10, "description": ""},
]
# Weight threshold: `RomsManager.clean` only marks a ROM as OK when its
# weight is strictly greater than this value.
LOW_TH = -200
# Action identifiers: stored on the parsed arguments by --rename / --delete
# and used as keys of the `RomsManager` action dispatch table.
RENAME_ACTION = "rename"
DELETE_ACTION = "delete"
def get_compiled_rule(rule):
    """Return a copy of ``rule`` with its ``token_re`` pattern compiled.

    The rule passed in is not modified: the result is built from a deep copy
    in which the ``token_re`` source string is replaced by the corresponding
    case-insensitive compiled pattern.

    Raises:
        KeyError: if ``rule`` has no ``token_re`` key.
    """
    return dict(
        deepcopy(rule),
        token_re=re.compile(rule["token_re"], re.I),
    )
# The regex rules of WEIGHTS (those with a truthy "token_re"), in table order.
re_weights = [rule for rule in WEIGHTS if rule.get("token_re")]
# The same rules, in the same order, with their patterns compiled.
compiled_re_weights = [get_compiled_rule(rule) for rule in re_weights]
# Matches each "(...)" or "[...]" tag of a filename (non-greedy).
tokens_re = re.compile(r"(?:\(.*?\))|(?:\[.*?\])")
class RomsManager:
    """Report, and optionally rename or delete, the non-preferred ROMs.

    Args:
        roms: mapping of stripped filename -> list of ROM objects. Each
            object is read for its ``weight``, ``base_filename``, ``keep``
            and ``rom_full_path`` attributes.
        action: ``RENAME_ACTION``, ``DELETE_ACTION``, or a falsy value for a
            dry run (nothing is touched on disk).
        only_one: when true, ROMs tying with the best weight of a title are
            treated as duplicates instead of being accepted.
    """

    def __init__(self, roms, action, only_one):
        self.action = action
        self.only_one = only_one
        self.roms = roms
        # Dispatch table: action identifier -> callable taking a file path.
        self._actions = {
            DELETE_ACTION: self.delete,
            RENAME_ACTION: self.rename,
        }

    def execute_action(self, action, path):
        """Run the handler registered for ``action`` on ``path``.

        Raises:
            KeyError: if ``action`` is not a known action identifier.
        """
        self._actions[action](path)

    @staticmethod
    def delete(file_path):
        """Remove ``file_path`` from disk, announcing it on stdout."""
        print("\tDeleting: {}".format(file_path))
        os.remove(file_path)

    @staticmethod
    def rename(file_path):
        """Rename ``file_path`` by prefixing its basename with a dot."""
        directory, name = os.path.split(file_path)
        new_path = os.path.join(directory, ".{}".format(name))
        print("\tRenaming: {} to {}".format(file_path, new_path))
        os.rename(file_path, new_path)

    def clean(self):
        """Print a per-title report and apply the action to rejected ROMs.

        Titles with a single ROM are skipped. For the others, ROMs are
        visited from the highest to the lowest weight: the first one whose
        weight is above ``LOW_TH`` is accepted (``:OK:``), and so are later
        ROMs with that same weight unless ``only_one`` is set. Every other
        ROM is rejected (``:KO:``); a rejected ROM flagged ``keep`` is left
        alone, otherwise the configured action (if any) is executed on it.
        """
        total_files = 0
        for stripped_filename, roms in self.roms.items():
            # Count every file, including titles without duplicates, so the
            # "total files" summary is comparable with the number of titles
            # (previously only files of duplicated titles were counted).
            total_files += len(roms)
            if len(roms) <= 1:
                continue
            print(stripped_filename)
            prev_weight = None  # weight of the accepted ROM, once found
            for r in sorted(roms, key=lambda x: x.weight, reverse=True):
                if r.weight > LOW_TH:
                    if prev_weight is None:
                        print("\t:OK:{}:{}".format(r.weight, r.base_filename))
                        prev_weight = r.weight
                        continue
                    if prev_weight == r.weight and not self.only_one:
                        print("\t:OK:{}:{}".format(r.weight, r.base_filename))
                        continue
                print("\t:KO:{}:{}".format(r.weight, r.base_filename))
                if r.keep:
                    print("\t ^ Kept")
                    continue
                if self.action:
                    self.execute_action(self.action, r.rom_full_path)
        print("total unique files: {}".format(len(self.roms)))
        print("total files : {}".format(total_files))
class Rom:
    """A ROM file scored from the bracketed tags found in its filename.

    Attributes:
        rom_full_path: path the object was built from.
        base_filename: basename of ``rom_full_path``.
        stripped_filename: basename with every tag removed; ROMs sharing it
            are considered versions of the same title.
        tokens: list of normalised tags, e.g. ``["(Europe)", "[!]"]``.
        weight: sum of the weights of all tokens (each one also pays
            ``penalty``).
        keep: true when the ROM must not be renamed/deleted even if it is
            rejected.

    Args:
        rom_full_path: path of the ROM file (the file itself is not read).
        check_keep: when true, a tag matching no rule flags the ROM as
            ``keep``.
    """

    penalty = -2  # more tokens => lower score

    def __init__(self, rom_full_path, check_keep):
        # e.g. "super mario (hack) (!).nes"
        self.rom_full_path = rom_full_path
        self.check_keep = check_keep
        self.keep = False
        rom_basename = os.path.basename(rom_full_path)
        self.stripped_filename = self.get_stripped_filename(rom_basename)
        self.base_filename = rom_basename
        # Materialised as a list: a bare generator would already be
        # exhausted once the weight has been computed.
        self.tokens = list(self.subtokenize(rom_basename))
        self.weight = self.calculate_weight()

    @staticmethod
    def subtokenize(string):
        """Yield every tag of ``string``, splitting comma separated ones.

        ``"Game (Europe, Italy) [!]"`` yields ``"(Europe)"``, ``"(Italy)"``
        and ``"[!]"``: each sub-tag is stripped and re-wrapped in the
        brackets of the tag it came from.
        """
        for token_match in tokens_re.findall(string):
            opening_bracket = token_match[0]
            closing_bracket = token_match[-1]
            # Whitespace around the commas is removed by the strip() below.
            for sub_token in token_match[1:-1].split(","):
                yield "{}{}{}".format(
                    opening_bracket, sub_token.strip(), closing_bracket
                )

    @staticmethod
    def get_stripped_filename(rom_basename):
        """Return ``rom_basename`` without tags, keeping its extension."""
        filename, ext = os.path.splitext(tokens_re.sub("", rom_basename))
        return "{}{}".format(filename.strip(), ext)

    @staticmethod
    def _find_rule(token):
        """Return the first rule matching ``token``, or None.

        Literal rules win over regex rules. Among literal rules an exact
        (case-sensitive) match is preferred, so that tags differing only by
        case, such as "[t]" and "[T]", reach their own rule; a
        case-insensitive comparison is used as a fallback.
        """
        for rule in WEIGHTS:
            if rule.get("token") == token:
                return rule
        lowered = token.lower()
        for rule in WEIGHTS:
            if rule.get("token", "").lower() == lowered:
                return rule
        # Regex rules are matched against the text inside the brackets.
        inner = token[1:-1]
        for rule in compiled_re_weights:
            if rule["token_re"].match(inner):
                return rule
        return None

    def get_token_weight(self, token):
        """Return the score contribution of ``token`` (penalty included).

        Side effect: sets ``self.keep`` when the matching rule is flagged
        ``"keep": True``, or when no rule matches and ``check_keep`` is set.
        """
        rule = self._find_rule(token)
        if rule is None:
            if self.check_keep:
                self.keep = True
            return self.penalty
        if rule.get("keep"):
            # Honour rules explicitly marked as "always keep".
            self.keep = True
        return rule["weight"] + self.penalty

    def calculate_weight(self):
        """Return the total weight of ``self.tokens`` (0 when there are none)."""
        return sum(self.get_token_weight(token) for token in self.tokens)
# Parse command line args:
def parseArgs(argv=None):
    """Parse the command line options of the script.

    Args:
        argv: optional list of argument strings to parse instead of the
            process command line; with the default ``None`` argparse reads
            ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``roms_dir`` (str, default "."),
        ``one`` (bool), ``keep_unkn_tags`` (bool) and ``action``
        (``RENAME_ACTION``, ``DELETE_ACTION`` or ``None`` when neither
        --rename nor --delete is given).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--roms-dir",
        "--rom_dir",
        dest="roms_dir",
        help="Location where roms are stored",
        default=".",
    )
    parser.add_argument(
        "--one",
        help=(
            "Keep only one rom per title if"
            " there are more with the same weight"
        ),
        action="store_true",
    )
    parser.add_argument(
        "--keep",
        dest="keep_unkn_tags",
        help="Keep ROMs with unrecognized tags",
        action="store_true",
    )
    # --rename and --delete share one destination and cannot be combined.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--rename",
        help="Rename duplicates adding a leading dot (.)",
        dest="action",
        const=RENAME_ACTION,
        action="store_const",
    )
    group.add_argument(
        "--delete",
        help="WARNING: this will delete the duplicate ROMs!",
        dest="action",
        const=DELETE_ACTION,
        action="store_const",
    )
    return parser.parse_args(argv)
def walk_roms(root_dir, keep_unkn_tags):
    """Lazily yield a ``Rom`` for every file found below ``root_dir``.

    Directories are visited in ``os.walk`` order; inside each directory the
    files are produced in descending name order. ``keep_unkn_tags`` is
    forwarded to every ``Rom`` as its ``check_keep`` argument.
    """
    for current_dir, _subdirs, names in os.walk(root_dir):
        descending = sorted(names, reverse=True)
        yield from (
            Rom(os.path.join(current_dir, name), keep_unkn_tags)
            for name in descending
        )
def make_roms_collection(roms):
    """Group ROMs by their ``stripped_filename``.

    Returns a plain dict mapping each stripped filename to the list of ROMs
    sharing it, e.g. ``{'test': [<Rom ...>, <Rom ...>]}``. Keys appear in
    first-seen order and each list keeps the order of the input iterable.
    """
    grouped = {}
    for entry in roms:
        grouped.setdefault(entry.stripped_filename, []).append(entry)
    return grouped
if __name__ == "__main__":
    # Script entry point: parse the options, group the ROMs found under the
    # chosen directory by title, then report/apply the clean-up.
    cli_args = parseArgs()
    print("> Running on {}".format(cli_args.roms_dir))
    if cli_args.action is None:
        # Neither --rename nor --delete was given: nothing is modified.
        print("> Runnin in dry run mode")
    collection = make_roms_collection(
        walk_roms(cli_args.roms_dir, cli_args.keep_unkn_tags)
    )
    RomsManager(collection, cli_args.action, cli_args.one).clean()
    print("all done!")
@jasonmbrown
Copy link

jasonmbrown commented May 21, 2021

I ran into an issue with this: it refuses to run the --delete command.
No idea what's happening.

Yamagata Digital Museum.chd
        :OK:-5:Yamagata Digital Museum (JP) (Disc 4) (Winter).chd
        :OK:-5:Yamagata Digital Museum (JP) (Disc 3) (Autumn).chd
        :OK:-5:Yamagata Digital Museum (JP) (Disc 2) (Summer).chd
        :OK:-5:Yamagata Digital Museum (JP) (Disc 1) (Spring).chd
XS Junior League Soccer.chd
        :OK:15:XS Junior League Soccer (US).chd
        :KO:2:XS Junior League Soccer (EU).chd
Traceback (most recent call last):
  File "G:\clean_roms.py", line 256, in <module>
    all_roms.clean()
  File "G:\clean_roms.py", line 115, in clean
    self.execute_action(self.action, r.rom_full_path)
  File "G:\clean_roms.py", line 73, in execute_action
    self._actions[action](args, kwargs)
TypeError: delete() takes 1 positional argument but 2 were given

I've tried fixing it myself, but it's a bit too complicated for my Python skills.

@nicolalamacchia
Copy link
Author

@jasonmbrown, should work now, please try again :)

@jasonmbrown
Copy link

@jasonmbrown, should work now, please try again :)

Yes, it's working. Much better than it was! Thank you!!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment