elundmark/mms.py

## mms.py
#!/usr/bin/env python3.8
"""Magic Media Sorter - Sort your media correctly!"""

import argparse
import operator
import sys
import re

def main(imported=None, importedList=None):
    """Sort media-style strings by the episode/track numbers they contain.

    Works in two modes:

    * Command line (``imported`` empty or ``None``): options are read from
      ``sys.argv``. Every file argument is read, sorted on its own and printed
      to stdout; without file arguments all of stdin is sorted as one list.
    * Import (``imported`` is a non-empty option list such as
      ``['--import']``): ``importedList`` is sorted and returned, nothing is
      printed.

    Args:
        imported: Option strings to parse instead of ``sys.argv`` (the same
            flags the command line accepts, e.g. ``['--import', '--reverse']``).
        importedList: Strings to sort in import mode. A ``list`` is reordered
            in place (and holds the computed keys instead when ``--raw`` is
            given), as before; ``None`` is treated as an empty list.

    Returns:
        In import mode the sorted list (the same object as ``importedList``
        when that is a list). In command-line mode ``None``.
    """

    app = {
        "name": "Magic Media Sorter",
        "desc": "".join([
            "Sorts lines of media or other strings by episode/track number - ",
            "Reads stdin or files, files is sorted seperately while stdin is sorted together. ",
            "You can also import this script and use it like this: ",
            "`mms.main(['--import'], ['part10', 'part1'])`"
        ]),
        "version": "0.6.3",
        "updated": "2020-02-12",
        "author": "elundmark@posteo.se",
        "license": "MIT; https://www.opensource.org/licenses/MIT",
    }
    # Handle arguments and options
    argparser = argparse.ArgumentParser(
        prog=app["name"],
        description=" ".join([
            app["name"],
            app["version"],
            "(" + app["updated"],
            "<" + app["author"] + ">)",
            app["desc"],
        ]),
        allow_abbrev=False
    )
    argparser.add_argument("-V", "--version", action="version", version="%(prog)s " + app["version"])
    argparser.add_argument(
        "-R", "--raw", action="store_true",
        help="Output calculated strings for debugging (Default: %(default)s)"
    )
    argparser.add_argument(
        "-S", "--no-sorting", action="store_true",
        help="Skip sorting, useful when importing from scripts and you want raw strings (Default: %(default)s)"
    )
    argparser.add_argument(
        "-i", "--import", action="store_true",
        help="Use when you import from other python scripts (Default: %(default)s)"
    )
    argparser.add_argument(
        "-I", "--case", action="store_true",
        help="Case sensitive (Default: %(default)s)"
    )
    argparser.add_argument(
        "-b", "--basename", action="store_true",
        help="Only sort by / apply to basename (Default: %(default)s)"
    )
    argparser.add_argument(
        "-r", "--reverse", action="store_true",
        help="Reverse the results (Default: %(default)s)"
    )
    argparser.add_argument(
        "-H", "--with-filename", action="store_true",
        help="Print the file name for each match to stderr (Default: %(default)s)"
    )
    # treat rest arguments as files to be read
    argparser.add_argument("files", nargs="*", type=argparse.FileType("r"))

    # A non-empty option list selects import mode. Its options are always the
    # ones parsed, even when the list to sort is empty; previously an empty
    # list made the parser fall back to sys.argv.
    import_mode = bool(imported)
    opts = argparser.parse_args(imported if import_mode else None)

    # caching all regular expressions here
    patterns = {
        "any_spaces": re.compile(r"\s+"),
        "numbers": re.compile(r"\d+"),
        # anything that isn't alpha-numeric
        "special_chars": re.compile(r"[ /|\t\r\a\f\v\^\[\](){}<>?!&#@£$§\\\"'`:;,._%*+=~\-]+"),
        # one or more literal spaces (special_chars has already turned tabs
        # and the other separators it lists into spaces)
        "multiple_spaces": re.compile(r"[ ]+"),
        # these ranges can be ignored
        "common_number_skips":
            re.compile(r"((?<=[ ])|^)(10bit|240p|360p|480p|720p|1080p|1280p|1440p|2160p|[hx][. ]?26[45])([ ]|$)"),
        # Standard S02E02 format, older format 1x04, (part) 7 of 10 | 5of5
        "ep_format_1": re.compile(r"((?<=[ ])|^)s?(\d+)(?:[ex]| ?of ?)(\d+)([ ]|$)", flags=re.IGNORECASE),
        # Long format Season 1 Episode 4
        "ep_format_2": re.compile(r"((?<=[ ])|^)season ?(\d+) ?episode ?(\d+)([ ]|$)", flags=re.IGNORECASE),
        # ignore "The X" and sort as "X"
        "the_slug": re.compile(r"(^|[ ])the ", flags=re.IGNORECASE),
        ## Season - pages - track patterns (eg A1 B1 for LPs, p1 for images, S09 for whole seasons or episodes)
        # keep the word character
        "paged_num_1": re.compile(r"((?<=[ ])|^)(ep|[a-zA-Z])(\d+)([ ]|$)", flags=re.IGNORECASE),
        # keep the word character
        "paged_num_2": re.compile(r"((?<=[ ])|^)(\d+)([a-zA-Z])([ ]|$)", flags=re.IGNORECASE),
        # common numbering chars that can be trimmed
        "paged_num_3": re.compile(r"((?<=[ ])|^)(\d+)(?:st|nd|rd|th)([ ]|$)", flags=re.IGNORECASE),
        # pad these ranges with zeroes
        "num_ranges": re.compile(r"((?<=[ ])|^)(\d+)([ ]|$)"),
    }
    small_numbers = {
        "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
        "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
        "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
        "fourteen": 14, "fifteen": 15, "sixteen": 16, "seventeen": 17,
        "eighteen": 18, "nineteen": 19,
        "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50,
        "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90,
    }
    magnitudes = {
        "thousand": 1000,
        "million": 1000000,
        "billion": 1000000000,
        "trillion": 1000000000000,
        "quadrillion": 1000000000000000,
        "quintillion": 1000000000000000000,
        "sextillion": 1000000000000000000000,
    }

    def text2num(s):
        """Replace each run of English number words in *s* with its digits.

        Words are separated by runs of spaces. A run starts at a word from
        ``small_numbers`` and continues through further number words,
        "hundred" and magnitude words; the whole run becomes one number token
        ("twenty one" -> "21", "one hundred" -> "100"). "hundred" or a
        magnitude word that does not follow a number is kept as plain text.
        Empty tokens from leading/trailing spaces are kept, so those spaces
        survive the round trip.
        """
        # Arithmetic borrowed from https://stackoverflow.com/questions/11980087
        result = []
        total = 0       # completed magnitude groups, e.g. "two thousand" -> 2000
        group = 0       # value of the group currently being read
        in_run = False  # True while consecutive number words are being read

        for word in patterns["multiple_spaces"].split(str(s)):
            if word in small_numbers:
                group += small_numbers[word]
                in_run = True
            elif in_run and word == "hundred":
                group *= 100
            elif in_run and word in magnitudes:
                total += group * magnitudes[word]
                group = 0
            else:
                if in_run:
                    # the run ended on the previous word: emit it as one token
                    result.append(str(total + group))
                    total = group = 0
                    in_run = False
                result.append(word)

        if in_run:
            # a run that reaches the end of the string must be emitted too
            result.append(str(total + group))
        return " ".join(result)

    def make_key(line):
        """Build the normalised (not yet zero-padded) sort key for *line*."""
        key = line.split("/")[-1] if opts.basename else line
        if not opts.case:
            key = key.lower()
        key = patterns["special_chars"].sub(" ", key)
        key = patterns["common_number_skips"].sub(r"", key)
        key = patterns["ep_format_1"].sub(r"\2 \3\4", key)
        key = patterns["ep_format_2"].sub(r"\2 \3\4", key)
        key = patterns["the_slug"].sub(r"\1", key)
        key = patterns["paged_num_1"].sub(r"\2 \3\4", key)
        key = patterns["paged_num_2"].sub(r"\2 \3\4", key)
        key = patterns["paged_num_3"].sub(r"\2\3", key)
        return text2num(key)

    def sorter(a):
        """Return a new list with the items of *a* ordered by their sort keys.

        With ``--raw`` the computed keys are returned instead of the original
        strings; with ``--no-sorting`` the input order is kept.
        """
        keys = [make_key(item) for item in a]

        # To make a plain string sort work, every stand-alone number on every
        # line is left-padded with zeroes to the width of the longest digit
        # run found in any key.
        width = max(
            (len(digits) for key in keys for digits in patterns["numbers"].findall(key)),
            default=0,
        )

        def pad(m):
            # group 2 is the number, group 3 the trailing space (or "" at end)
            return m.group(2).rjust(width, "0") + (m.group(3) or "")

        pairs = [
            (item, patterns["num_ranges"].sub(pad, key))
            for item, key in zip(a, keys)
        ]

        # using operator is apparently faster than using a function
        if not opts.no_sorting:
            pairs.sort(key=operator.itemgetter(1), reverse=bool(opts.reverse))

        # select "parsed string" if --raw (debugging)
        pick = 1 if opts.raw else 0
        return [pair[pick] for pair in pairs]

    if import_mode:
        if importedList is None:
            return []
        result = sorter(importedList)
        if isinstance(importedList, list):
            # callers have always received their own list back, reordered
            importedList[:] = result
            return importedList
        return result

    if not opts.files:
        # no files arguments, read everything from stdin as one list
        lines = [line.rstrip("\n") for line in sys.stdin]
        print("\n".join(sorter(lines)))
        return None

    # each file is sorted and printed on its own
    # remember: argparse has already opened the files
    for f in opts.files:
        if opts.with_filename:
            # mimic grep and echo a filename (to stderr)
            print("\n" + f.name + "\n", file=sys.stderr)
        with f:
            # rstrip only newline
            lines = [line.rstrip("\n") for line in f]
        print("\n".join(sorter(lines)))
    return None

# Run the command-line interface only when this file is executed as a script;
# importing the module does not call main().
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3.8
	"""Magic Media Sorter - Sort your media correctly!"""

	import argparse
	import operator
	import sys
	import re

	def main(imported=None, importedList=None):
	    """Sort media-style strings by the episode/track numbers they contain.

	    Works in two modes:

	    * Command line (``imported`` empty or ``None``): options are read from
	      ``sys.argv``. Every file argument is read, sorted on its own and printed
	      to stdout; without file arguments all of stdin is sorted as one list.
	    * Import (``imported`` is a non-empty option list such as
	      ``['--import']``): ``importedList`` is sorted and returned, nothing is
	      printed.

	    Args:
	        imported: Option strings to parse instead of ``sys.argv`` (the same
	            flags the command line accepts, e.g. ``['--import', '--reverse']``).
	        importedList: Strings to sort in import mode. A ``list`` is reordered
	            in place (and holds the computed keys instead when ``--raw`` is
	            given), as before; ``None`` is treated as an empty list.

	    Returns:
	        In import mode the sorted list (the same object as ``importedList``
	        when that is a list). In command-line mode ``None``.
	    """

	    app = {
	        "name": "Magic Media Sorter",
	        "desc": "".join([
	            "Sorts lines of media or other strings by episode/track number - ",
	            "Reads stdin or files, files is sorted seperately while stdin is sorted together. ",
	            "You can also import this script and use it like this: ",
	            "`mms.main(['--import'], ['part10', 'part1'])`"
	        ]),
	        "version": "0.6.3",
	        "updated": "2020-02-12",
	        "author": "elundmark@posteo.se",
	        "license": "MIT; https://www.opensource.org/licenses/MIT",
	    }
	    # Handle arguments and options
	    argparser = argparse.ArgumentParser(
	        prog=app["name"],
	        description=" ".join([
	            app["name"],
	            app["version"],
	            "(" + app["updated"],
	            "<" + app["author"] + ">)",
	            app["desc"],
	        ]),
	        allow_abbrev=False
	    )
	    argparser.add_argument("-V", "--version", action="version", version="%(prog)s " + app["version"])
	    argparser.add_argument(
	        "-R", "--raw", action="store_true",
	        help="Output calculated strings for debugging (Default: %(default)s)"
	    )
	    argparser.add_argument(
	        "-S", "--no-sorting", action="store_true",
	        help="Skip sorting, useful when importing from scripts and you want raw strings (Default: %(default)s)"
	    )
	    argparser.add_argument(
	        "-i", "--import", action="store_true",
	        help="Use when you import from other python scripts (Default: %(default)s)"
	    )
	    argparser.add_argument(
	        "-I", "--case", action="store_true",
	        help="Case sensitive (Default: %(default)s)"
	    )
	    argparser.add_argument(
	        "-b", "--basename", action="store_true",
	        help="Only sort by / apply to basename (Default: %(default)s)"
	    )
	    argparser.add_argument(
	        "-r", "--reverse", action="store_true",
	        help="Reverse the results (Default: %(default)s)"
	    )
	    argparser.add_argument(
	        "-H", "--with-filename", action="store_true",
	        help="Print the file name for each match to stderr (Default: %(default)s)"
	    )
	    # treat rest arguments as files to be read
	    argparser.add_argument("files", nargs="*", type=argparse.FileType("r"))

	    # A non-empty option list selects import mode. Its options are always the
	    # ones parsed, even when the list to sort is empty; previously an empty
	    # list made the parser fall back to sys.argv.
	    import_mode = bool(imported)
	    opts = argparser.parse_args(imported if import_mode else None)

	    # caching all regular expressions here
	    # (alternations use a plain "|"; an escaped "\|" would match a literal pipe)
	    patterns = {
	        "any_spaces": re.compile(r"\s+"),
	        "numbers": re.compile(r"\d+"),
	        # anything that isn't alpha-numeric
	        "special_chars": re.compile(r"[ /|\t\r\a\f\v\^\[\](){}<>?!&#@£$§\\\"'`:;,._%*+=~\-]+"),
	        # one or more literal spaces (special_chars has already turned tabs
	        # and the other separators it lists into spaces)
	        "multiple_spaces": re.compile(r"[ ]+"),
	        # these ranges can be ignored
	        "common_number_skips":
	            re.compile(r"((?<=[ ])|^)(10bit|240p|360p|480p|720p|1080p|1280p|1440p|2160p|[hx][. ]?26[45])([ ]|$)"),
	        # Standard S02E02 format, older format 1x04, (part) 7 of 10 | 5of5
	        "ep_format_1": re.compile(r"((?<=[ ])|^)s?(\d+)(?:[ex]| ?of ?)(\d+)([ ]|$)", flags=re.IGNORECASE),
	        # Long format Season 1 Episode 4
	        "ep_format_2": re.compile(r"((?<=[ ])|^)season ?(\d+) ?episode ?(\d+)([ ]|$)", flags=re.IGNORECASE),
	        # ignore "The X" and sort as "X"
	        "the_slug": re.compile(r"(^|[ ])the ", flags=re.IGNORECASE),
	        ## Season - pages - track patterns (eg A1 B1 for LPs, p1 for images, S09 for whole seasons or episodes)
	        # keep the word character
	        "paged_num_1": re.compile(r"((?<=[ ])|^)(ep|[a-zA-Z])(\d+)([ ]|$)", flags=re.IGNORECASE),
	        # keep the word character
	        "paged_num_2": re.compile(r"((?<=[ ])|^)(\d+)([a-zA-Z])([ ]|$)", flags=re.IGNORECASE),
	        # common numbering chars that can be trimmed
	        "paged_num_3": re.compile(r"((?<=[ ])|^)(\d+)(?:st|nd|rd|th)([ ]|$)", flags=re.IGNORECASE),
	        # pad these ranges with zeroes
	        "num_ranges": re.compile(r"((?<=[ ])|^)(\d+)([ ]|$)"),
	    }
	    small_numbers = {
	        "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
	        "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
	        "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
	        "fourteen": 14, "fifteen": 15, "sixteen": 16, "seventeen": 17,
	        "eighteen": 18, "nineteen": 19,
	        "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50,
	        "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90,
	    }
	    magnitudes = {
	        "thousand": 1000,
	        "million": 1000000,
	        "billion": 1000000000,
	        "trillion": 1000000000000,
	        "quadrillion": 1000000000000000,
	        "quintillion": 1000000000000000000,
	        "sextillion": 1000000000000000000000,
	    }

	    def text2num(s):
	        """Replace each run of English number words in *s* with its digits.

	        Words are separated by runs of spaces. A run starts at a word from
	        ``small_numbers`` and continues through further number words,
	        "hundred" and magnitude words; the whole run becomes one number token
	        ("twenty one" -> "21", "one hundred" -> "100"). "hundred" or a
	        magnitude word that does not follow a number is kept as plain text.
	        Empty tokens from leading/trailing spaces are kept, so those spaces
	        survive the round trip.
	        """
	        # Arithmetic borrowed from https://stackoverflow.com/questions/11980087
	        result = []
	        total = 0       # completed magnitude groups, e.g. "two thousand" -> 2000
	        group = 0       # value of the group currently being read
	        in_run = False  # True while consecutive number words are being read

	        for word in patterns["multiple_spaces"].split(str(s)):
	            if word in small_numbers:
	                group += small_numbers[word]
	                in_run = True
	            elif in_run and word == "hundred":
	                group *= 100
	            elif in_run and word in magnitudes:
	                total += group * magnitudes[word]
	                group = 0
	            else:
	                if in_run:
	                    # the run ended on the previous word: emit it as one token
	                    result.append(str(total + group))
	                    total = group = 0
	                    in_run = False
	                result.append(word)

	        if in_run:
	            # a run that reaches the end of the string must be emitted too
	            result.append(str(total + group))
	        return " ".join(result)

	    def make_key(line):
	        """Build the normalised (not yet zero-padded) sort key for *line*."""
	        key = line.split("/")[-1] if opts.basename else line
	        if not opts.case:
	            key = key.lower()
	        key = patterns["special_chars"].sub(" ", key)
	        key = patterns["common_number_skips"].sub(r"", key)
	        key = patterns["ep_format_1"].sub(r"\2 \3\4", key)
	        key = patterns["ep_format_2"].sub(r"\2 \3\4", key)
	        key = patterns["the_slug"].sub(r"\1", key)
	        key = patterns["paged_num_1"].sub(r"\2 \3\4", key)
	        key = patterns["paged_num_2"].sub(r"\2 \3\4", key)
	        key = patterns["paged_num_3"].sub(r"\2\3", key)
	        return text2num(key)

	    def sorter(a):
	        """Return a new list with the items of *a* ordered by their sort keys.

	        With ``--raw`` the computed keys are returned instead of the original
	        strings; with ``--no-sorting`` the input order is kept.
	        """
	        keys = [make_key(item) for item in a]

	        # To make a plain string sort work, every stand-alone number on every
	        # line is left-padded with zeroes to the width of the longest digit
	        # run found in any key.
	        width = max(
	            (len(digits) for key in keys for digits in patterns["numbers"].findall(key)),
	            default=0,
	        )

	        def pad(m):
	            # group 2 is the number, group 3 the trailing space (or "" at end)
	            return m.group(2).rjust(width, "0") + (m.group(3) or "")

	        pairs = [
	            (item, patterns["num_ranges"].sub(pad, key))
	            for item, key in zip(a, keys)
	        ]

	        # using operator is apparently faster than using a function
	        if not opts.no_sorting:
	            pairs.sort(key=operator.itemgetter(1), reverse=bool(opts.reverse))

	        # select "parsed string" if --raw (debugging)
	        pick = 1 if opts.raw else 0
	        return [pair[pick] for pair in pairs]

	    if import_mode:
	        if importedList is None:
	            return []
	        result = sorter(importedList)
	        if isinstance(importedList, list):
	            # callers have always received their own list back, reordered
	            importedList[:] = result
	            return importedList
	        return result

	    if not opts.files:
	        # no files arguments, read everything from stdin as one list
	        lines = [line.rstrip("\n") for line in sys.stdin]
	        print("\n".join(sorter(lines)))
	        return None

	    # each file is sorted and printed on its own
	    # remember: argparse has already opened the files
	    for f in opts.files:
	        if opts.with_filename:
	            # mimic grep and echo a filename (to stderr)
	            print("\n" + f.name + "\n", file=sys.stderr)
	        with f:
	            # rstrip only newline
	            lines = [line.rstrip("\n") for line in f]
	        print("\n".join(sorter(lines)))
	    return None

	# Run the command-line interface only when this file is executed as a script;
	# importing the module does not call main().
	if __name__ == "__main__":
	    main()