Last active
February 13, 2020 04:33
-
-
Save elundmark/fa3746b8b090289d6f4449b4541999c4 to your computer and use it in GitHub Desktop.
Magic Media Sorter - Sort your media correctly! (Tested in: Python v3.8+ on Linux)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.8 | |
"""Magic Media Sorter - Sort your media correctly!""" | |
import argparse | |
import operator | |
import sys | |
import re | |
# Static application metadata shown in --help and --version output.
_APP = {
    "name": "Magic Media Sorter",
    "desc": "".join([
        "Sorts lines of media or other strings by episode/track number - ",
        "Reads stdin or files, files is sorted seperately while stdin is sorted together. ",
        "You can also import this script and use it like this: ",
        "`mms.main(['--import'], ['part10', 'part1'])`"
    ]),
    "version": "0.6.3",
    "updated": "2020-02-12",
    "author": "elundmark@posteo.se",
    "license": "MIT; https://www.opensource.org/licenses/MIT",
}

# Regular expressions, compiled once at import time instead of on every call.
_PATTERNS = {
    "numbers": re.compile(r"\d+"),
    # anything that isn't alpha-numeric
    "special_chars": re.compile(r"[ /|\t\r\a\f\v\^\[\](){}<>?!&#@£$§\\\"'`:;,._%*+=~\-]+"),
    # runs of plain spaces (newlines are never part of a line here)
    "multiple_spaces": re.compile(r"[ ]+"),
    # resolution / codec tags that must not be mistaken for episode numbers
    "common_number_skips":
        re.compile(r"((?<=[ ])|^)(10bit|240p|360p|480p|720p|1080p|1280p|1440p|2160p|[hx][. ]?26[45])([ ]|$)"),
    # Standard S02E02 format, older format 1x04, (part) 7 of 10 | 5of5
    "ep_format_1": re.compile(r"((?<=[ ])|^)s?(\d+)(?:[ex]| ?of ?)(\d+)([ ]|$)", flags=re.IGNORECASE),
    # Long format Season 1 Episode 4
    "ep_format_2": re.compile(r"((?<=[ ])|^)season ?(\d+) ?episode ?(\d+)([ ]|$)", flags=re.IGNORECASE),
    # ignore "The X" and sort as "X"
    "the_slug": re.compile(r"(^|[ ])the ", flags=re.IGNORECASE),
    # Season - pages - track patterns (eg A1 B1 for LPs, p1 for images, S09)
    # letter(s) followed by digits: keep the word character
    "paged_num_1": re.compile(r"((?<=[ ])|^)(ep|[a-zA-Z])(\d+)([ ]|$)", flags=re.IGNORECASE),
    # digits followed by one letter: keep the word character
    "paged_num_2": re.compile(r"((?<=[ ])|^)(\d+)([a-zA-Z])([ ]|$)", flags=re.IGNORECASE),
    # ordinal suffixes that can be trimmed
    "paged_num_3": re.compile(r"((?<=[ ])|^)(\d+)(?:st|nd|rd|th)([ ]|$)", flags=re.IGNORECASE),
    # free-standing numbers, these get padded with zeroes
    "num_ranges": re.compile(r"((?<=[ ])|^)(\d+)([ ]|$)"),
}

# Substitutions applied to every line, in this exact order, to build its sort key.
_REWRITES = (
    ("special_chars", " "),
    ("common_number_skips", ""),
    ("ep_format_1", r"\2 \3\4"),
    ("ep_format_2", r"\2 \3\4"),
    ("the_slug", r"\1"),
    ("paged_num_1", r"\2 \3\4"),
    ("paged_num_2", r"\2 \3\4"),
    ("paged_num_3", r"\2\3"),
)

# Number words that add to the current sub-thousand group.
_SMALL_NUMBERS = {
    "zero": 0,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    "seven": 7,
    "eight": 8,
    "nine": 9,
    "ten": 10,
    "eleven": 11,
    "twelve": 12,
    "thirteen": 13,
    "fourteen": 14,
    "fifteen": 15,
    "sixteen": 16,
    "seventeen": 17,
    "eighteen": 18,
    "nineteen": 19,
    "twenty": 20,
    "thirty": 30,
    "forty": 40,
    "fifty": 50,
    "sixty": 60,
    "seventy": 70,
    "eighty": 80,
    "ninety": 90,
}

# Number words that multiply the current group and close it.
_MAGNITUDES = {
    "thousand": 1000,
    "million": 1000000,
    "billion": 1000000000,
    "trillion": 1000000000000,
    "quadrillion": 1000000000000000,
    "quintillion": 1000000000000000000,
    "sextillion": 1000000000000000000000,
}


def _build_parser():
    """Create the argparse parser describing every command-line option."""
    parser = argparse.ArgumentParser(
        prog=_APP["name"],
        description=" ".join([
            _APP["name"],
            _APP["version"],
            "(" + _APP["updated"],
            "<" + _APP["author"] + ">)",
            _APP["desc"],
        ]),
        allow_abbrev=False
    )
    parser.add_argument("-V", "--version", action="version", version="%(prog)s " + _APP["version"])
    parser.add_argument(
        "-R", "--raw", action="store_true",
        help="Output calculated strings for debugging (Default: %(default)s)"
    )
    parser.add_argument(
        "-S", "--no-sorting", action="store_true",
        help="Skip sorting, useful when importing from scripts and you want raw strings (Default: %(default)s)"
    )
    parser.add_argument(
        "-i", "--import", action="store_true",
        help="Use when you import from other python scripts (Default: %(default)s)"
    )
    parser.add_argument(
        "-I", "--case", action="store_true",
        help="Case sensitive (Default: %(default)s)"
    )
    parser.add_argument(
        "-b", "--basename", action="store_true",
        help="Only sort by / apply to basename (Default: %(default)s)"
    )
    parser.add_argument(
        "-r", "--reverse", action="store_true",
        help="Reverse the results (Default: %(default)s)"
    )
    parser.add_argument(
        "-H", "--with-filename", action="store_true",
        help="Print the file name for each match to stderr (Default: %(default)s)"
    )
    # treat rest arguments as files to be read
    parser.add_argument("files", nargs="*", type=argparse.FileType("r"))
    return parser


def _text2num(s):
    """Replace every run of English number words in *s* with its digits.

    Words that are not part of a number are left untouched, and leading or
    trailing spaces survive because the split keeps the empty edge tokens.
    Each run collapses to a single number ("twenty one" -> "21"), including
    runs that end on "hundred" or sit at the very end of the string.
    """
    # Summation scheme adapted from https://stackoverflow.com/questions/11980087
    tokens = _PATTERNS["multiple_spaces"].split(str(s))
    out = []
    total = 0        # sum of the magnitude groups already closed (thousands, ...)
    group = 0        # value of the sub-thousand group being built
    in_run = False   # True while consecutive number words are being consumed
    for token in tokens:
        if token in _SMALL_NUMBERS:
            group += _SMALL_NUMBERS[token]
            in_run = True
        elif token == "hundred" and in_run:
            # a bare "hundred" with no number before it stays an ordinary word
            group *= 100
        elif token in _MAGNITUDES:
            total += group * _MAGNITUDES[token]
            group = 0
            in_run = True
        else:
            if in_run:
                # the run ended on the previous token: emit it as one number
                out.append(str(total + group))
                total = group = 0
                in_run = False
            out.append(token)
    if in_run:
        # the line ended while still inside a number run
        out.append(str(total + group))
    return " ".join(out)


def _sort_key_text(line, opts):
    """Normalise one input line into the text used to build its sort key."""
    text = line.rpartition("/")[2] if opts.basename else line
    if not opts.case:
        text = text.lower()
    for name, replacement in _REWRITES:
        text = _PATTERNS[name].sub(replacement, text)
    return _text2num(text)


def _sort_lines(lines, opts):
    """Sort *lines* in place by their normalised, zero-padded keys and return it."""
    parsed = [_sort_key_text(line, opts) for line in lines]
    # Every free-standing number is padded to the width of the longest digit
    # series found on any line, so plain string comparison orders them numerically.
    width = max(
        (len(digits) for text in parsed for digits in _PATTERNS["numbers"].findall(text)),
        default=0,
    )

    def pad(match):
        return match.group(2).zfill(width) + (match.group(3) or "")

    keyed = [
        (original, _PATTERNS["num_ranges"].sub(pad, text))
        for original, text in zip(lines, parsed)
    ]
    if not opts.no_sorting:
        keyed.sort(key=operator.itemgetter(1), reverse=bool(opts.reverse))
    # --raw exposes the computed keys (debugging); otherwise return the originals
    lines[:] = [key if opts.raw else original for original, key in keyed]
    return lines


def main(imported=None, importedList=None):
    """Sort media names by episode/track number.

    Command-line use (no arguments passed): options are parsed from
    ``sys.argv``. Lines come from the given files, each file sorted and
    printed separately, or from stdin, sorted together. Returns ``None``.

    Library use: pass the option strings as *imported* and the strings to
    sort as *importedList*, e.g. ``main(['--import'], ['part10', 'part1'])``.
    Library mode is selected whenever either argument is not ``None``, so an
    empty list never falls back to parsing the host program's ``sys.argv``.

    Args:
        imported: Sequence of command-line style option strings, or None.
        importedList: Strings to sort, or None. A list is sorted in place.

    Returns:
        The sorted list in library mode (the same object when a list was
        passed in); None in command-line mode.
    """
    argparser = _build_parser()
    if imported is not None or importedList is not None:
        opts = argparser.parse_args(list(imported or []))
        if importedList is None:
            items = []
        elif isinstance(importedList, list):
            items = importedList
        else:
            items = list(importedList)
        return _sort_lines(items, opts)

    opts = argparser.parse_args()
    if not opts.files:
        # no file arguments: everything on stdin is sorted as one list
        lines = [line.rstrip("\n") for line in sys.stdin]
        print("\n".join(_sort_lines(lines, opts)))
        return None

    # argparse has already opened every file for reading
    for f in opts.files:
        if opts.with_filename:
            # mimic grep and echo a filename (to stderr)
            print("\n" + f.name + "\n", file=sys.stderr)
        with f:
            # strip only the newline so other trailing whitespace is kept
            lines = [line.rstrip("\n") for line in f]
        print("\n".join(_sort_lines(lines, opts)))
    return None
# Run the command-line interface only when executed as a script, so that
# importing this module for library use has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment