mkweskin/batch-find-replace.py

## batch-find-replace.py
#!/usr/bin/env python3

"""
Author: Matthew Kweskin, github: @mkweskin

A general utility to read in a delimited translation file with two columns
and use each pair of values to find/replace text in any text file.
"""

import argparse
from os import path
import sys
import re
import codecs

# Abort early on a Python 2 interpreter, with a hint on how to get Python 3.
if sys.version_info[0] < 3:
    raise Exception("This script requires python3. One way to install this is with miniconda3: https://docs.conda.io/en/latest/miniconda.html")

def unescaped_str(arg_str):
    """
    Interpret backslash escape sequences in a command-line argument.

    Used as an argparse ``type`` so that an escape typed on the command line
    (for example a backslash followed by ``t``) becomes the real character
    (a tab). Characters that are not part of an escape sequence, including
    non-ASCII ones, are returned unchanged.

    Malformed escapes (e.g. a trailing backslash) raise UnicodeDecodeError,
    which is a ValueError.
    """
    # Decoding a str directly with 'unicode_escape' converts it to UTF-8 bytes
    # and then reads those bytes as Latin-1, which turns every non-ASCII
    # character into mojibake. Encoding to Latin-1 first keeps one byte per
    # character; anything outside Latin-1 is written as a \uXXXX escape that
    # the decoder turns back into the original character.
    raw = str(arg_str).encode('latin-1', 'backslashreplace')
    return raw.decode('unicode_escape')

def get_args(argv=None):
    """
    Build the command-line parser and parse the arguments.

    argv is an optional list of argument strings; when it is None (the
    default) argparse reads the arguments from sys.argv[1:].

    Returns the argparse namespace with the attributes TRANSLATION_FILE,
    INPUT, output, overwrite, delimiter, quiet, trim and reverse.
    """
    parser = argparse.ArgumentParser(
        description="Text replacement utility that gets values to find/replace from a translation file (tab separated file as default, other delimiters can be specified).",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("TRANSLATION_FILE", help='File with the pairs of values to find/replace. Note: if there are >1 delimiter in a line, the text following the final delimiter character will be used as the replace value.')
    parser.add_argument("INPUT", help='File to perform the find/replace on.')
    parser.add_argument("--output", help='Output file. Use \'STDOUT\' to output to screen.', default='STDOUT')
    parser.add_argument("--overwrite", action="store_true", help='Automatically overwrite the output file if it\'s already present')
    # unescaped_str lets the user type an escape such as \t for the delimiter.
    parser.add_argument("--delimiter", help='Character(s) to separate the fields. Use \'\\t\' for tab (default).', type=unescaped_str, default='\t')
    parser.add_argument("--quiet", action="store_true", help='Disable warning messages')
    parser.add_argument("--trim", action="store_true", help='Trim out extra white space before and after each find/replace pair')
    parser.add_argument("--reverse", action="store_true", help='Reverse columns 1 and 2 in the translation file')
    return parser.parse_args(argv)

def file_checks(args):
    """
    Validate the input, translation and output paths before any work is done.

    args is the parsed argument namespace; its INPUT, TRANSLATION_FILE,
    output and overwrite attributes are read.

    Raises Exception when the input file, the translation file or the
    directory of the output file does not exist. When the output file already
    exists and --overwrite was not given, the user is asked for confirmation
    and the script exits via sys.exit() unless the answer is 'y'.
    """
    if not path.exists(args.INPUT):
        raise Exception("Input file does not exist")

    if not path.exists(args.TRANSLATION_FILE):
        raise Exception("Translation file does not exist")

    # Writing to the screen: there is no output path to validate. Returning
    # here also avoids calling path.exists() on a non-path value, which would
    # be interpreted as a file descriptor number.
    if args.output == 'STDOUT':
        return

    if path.exists(args.output) and not args.overwrite:
        answer = input("Output file exists. (use --overwrite to Automatically overwrite the output file)\nOverwrite [y/n]? ")
        if answer.lower() != 'y':
            print("Exiting script.")
            sys.exit()

    # An empty dirname means the output goes to the current directory.
    outputdir = path.dirname(args.output)
    if outputdir != "" and not path.exists(outputdir):
        raise Exception("Output directory does not exist")

def read_translation(translation_file, quiet, trim, delim, reverse):
    """
    Read a delimited translation file into a {find: replace} dictionary.

    Each line is split at the LAST occurrence of delim; the text before it is
    the find value and the text after it is the replace value.

    translation_file: path of the file to read.
    quiet: when true, no warnings are written to stderr.
    trim: when true, surrounding white space is stripped from both values.
    delim: the delimiter string separating the two columns.
    reverse: when true, the two columns swap roles (second column is the
        find value, first column is the replace value).

    Returns the dictionary of find -> replace values; when a find value is
    defined more than once, the last definition wins.

    A line without the delimiter writes an error to stderr and exits the
    script with status 1.
    """
    translations = {}
    with open(translation_file) as translation:
        for line in translation:
            line = line.rstrip('\r\n')
            fields = line.rsplit(delim, 1)
            if len(fields) != 2:
                sys.stderr.write("ERROR: line is missing the delimiter character: \"" + line + "\"\n")
                # Non-zero status so callers and shells can detect the failure.
                sys.exit(1)

            left, right = fields
            if trim:
                left, right = left.strip(), right.strip()

            # Only the text left of the last delimiter can still contain one.
            # Check before swapping so the warning also fires with reverse.
            extra_delim = delim in left

            if reverse:
                key, val = right, left
            else:
                key, val = left, right

            if not quiet:
                if key in translations:
                    sys.stderr.write("Warning: the text to find \"" + str(key) + "\" is already defined as \"" + str(translations[key]) + "\", overwritting.\n")
                if extra_delim:
                    role = "replacement" if reverse else "search"
                    sys.stderr.write("Warning: the line \"" + line + "\" contains an extra delimiter. All text to the left of the last delimiter will be used as the " + role + " string.\n")

            translations[key] = val
    return translations

def do_find_replace(infile, outfile, re_dict):
    """
    Copy infile to outfile line by line, applying the find/replace pairs.

    infile: iterable of text lines (e.g. an open text file).
    outfile: writable text file, or None to write to sys.stdout.
    re_dict: dictionary mapping each literal text to find to its replacement.

    All find values are matched literally in a single pass per line, so text
    produced by one replacement is never replaced again. The alternatives are
    tried in the dictionary's iteration order, so when one find value is a
    prefix of another the one listed first wins. Empty find values are
    ignored, and an empty dictionary copies the input unchanged.
    """
    # An empty find value would match between every character of the input.
    lookup = {k: v for k, v in re_dict.items() if k != ""}
    sink = sys.stdout if outfile is None else outfile

    if not lookup:
        # With nothing to find, an empty pattern would match everywhere and
        # have no replacement to look up; just pass the text through.
        for line in infile:
            sink.write(line)
        return

    pattern = re.compile("|".join(re.escape(k) for k in lookup))

    def replacement(match):
        # The pattern is built from escaped literals, so the matched text is
        # exactly one of the dictionary keys.
        return lookup[match.group(0)]

    for line in infile:
        sink.write(pattern.sub(replacement, line))

def main():
    """
    Command-line entry point.

    Parses the arguments, validates the paths, loads the translation file and
    writes the translated input either to the screen (output 'STDOUT') or to
    the requested output file.
    """
    args = get_args()
    file_checks(args)

    re_dict = read_translation(args.TRANSLATION_FILE, args.quiet, args.trim, args.delimiter, args.reverse)

    if args.output == "STDOUT":
        with open(args.INPUT) as infile:
            do_find_replace(infile, None, re_dict)
        return

    # Opening the output with 'w' truncates it immediately. When the output is
    # the input file itself, that would erase the text before it is read, so
    # read all lines into memory first in that case.
    if path.exists(args.output) and path.samefile(args.INPUT, args.output):
        with open(args.INPUT) as infile:
            source_lines = infile.readlines()
        with open(args.output, 'w') as outfile:
            do_find_replace(source_lines, outfile, re_dict)
    else:
        with open(args.INPUT) as infile, open(args.output, 'w') as outfile:
            do_find_replace(infile, outfile, re_dict)

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()