Last active
August 15, 2018 09:41
-
-
Save ms8r/b16e770d0998b85913839b53a51b1d90 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""
Script to rename files whose names include a year and month (e.g. reports,
account statements) such that the respective month appears as YYYY-MM and the
files can therefore be sorted correctly.

Original filenames can contain month names spelled out, abbreviated, or as
zero-padded decimals (the first two also from different locales). By default
the new filename will simply show the corresponding YYYY-MM as a prefix,
separated by an underscore. Command line parameters allow changing this to a
postfix or to a substitution of the original year/month substring(s). Run the
script with `-h` to see details.
"""
import re | |
import os | |
import os.path | |
import shutil | |
import sys | |
import fnmatch | |
from datetime import date | |
from locale import getlocale | |
import argparse | |
import logging | |
from babel.dates import get_month_names | |
# Root logger set-up: INFO and above, each record prefixed with a timestamp
# and its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
class DateRenameError(Exception):
    """
    Raised by the renaming helpers when a year/month pattern is invalid or a
    date-based filename cannot be derived.
    """
def gen_find(pattern, top='.', descent=True):
    """
    Lazily yield ``(directory, filename)`` tuples for every file found by
    walking `top` whose name matches the shell wildcard `pattern`.

    When `descent` is false the walk stops after the first directory (i.e.
    `top` itself), so files in subdirectories are not reported.
    """
    for directory, _subdirs, filenames in os.walk(top):
        matching = fnmatch.filter(filenames, pattern)
        for filename in matching:
            yield (directory, filename)
        if not descent:
            # only the top-level directory was requested
            return
def mk_month_map(month_fmt='B', locale='en_US'):
    """
    Build a dict that maps month labels to month numbers.

    `month_fmt` selects how the labels look:
        'B':   full month names for `locale` (e.g. 'January')
        'b':   abbreviated names for `locale` with any trailing '.' removed
               (e.g. 'Okt' for locale 'de_DE', but also 'Juni' for the same
               locale)
        'mmm': like 'b', but strictly the first three letters of the full
               month names (i.e. 'Jun' rather than 'Juni' for 'de_DE')
        'm':   zero-padded decimal number as string ('01' ... '12');
               `locale` is not used for this format

    Any other value of `month_fmt` falls through and yields ``None``.
    """
    if month_fmt == 'm':
        return {'{:02d}'.format(number): number for number in range(1, 13)}

    # per format: the babel name width to request and the clean-up applied
    # to every name before it becomes a key
    recipes = {
        'B': ('wide', lambda label: label),
        'b': ('abbreviated', lambda label: label.rstrip('.')),
        'mmm': ('wide', lambda label: label[:3]),
    }
    if month_fmt not in recipes:
        return None

    width, tidy = recipes[month_fmt]
    names = get_month_names(width, locale=locale)
    return {tidy(label): number for number, label in names.items()}
def cust_month_map(month_fmt, locale):
    """
    Return a dict with month labels as keys and month numbers as values.

    If `month_fmt` splits on whitespace into exactly twelve items, those
    items are taken as custom labels for months 1 to 12 (in that order) and
    `locale` is ignored. Otherwise `month_fmt` and `locale` are handed on to
    `mk_month_map` unchanged and its result is returned.
    """
    labels = month_fmt.split()
    if len(labels) != 12:
        return mk_month_map(month_fmt, locale)
    return {label: number for number, label in enumerate(labels, start=1)}
def std_month_map(name_sample, locale):
    """
    Pick the month map for `locale` whose labels occur in `name_sample`.

    The formats understood by `mk_month_map` are tried in this order: full
    month names ('B'), abbreviated names without '.' ('b'), full names cut to
    three letters ('mmm'), and finally zero-padded decimals ('m'). For each
    one a regex is built with `mk_month_re` and searched for in
    `name_sample`; the map of the first format that matches is returned.
    If no format matches the result is ``None``.
    """
    for fmt in ('B', 'b', 'mmm', 'm'):
        candidate = mk_month_map(fmt, locale)
        pattern = mk_month_re(candidate.keys())
        if re.search(pattern, name_sample) is not None:
            return candidate
    return None
def mk_month_re(month_lbls):
    """
    Return a regex string that matches any of the items in `month_lbls` (an
    iterable of strings). The portion matching the label is captured in a
    group named 'month'.

    Labels are matched literally: each one is passed through `re.escape`, so
    labels that contain regex metacharacters (e.g. a custom label such as
    'Sep.') cannot alter the meaning of the pattern.

    If the first label is shorter than three characters the label must be
    delimited on both sides by a word boundary or '_'; otherwise it may occur
    anywhere. In both cases the pattern has one (possibly empty) capturing
    group before and one after the 'month' group.

    Raises `ValueError` if `month_lbls` is empty.
    """
    labels = list(month_lbls)
    if not labels:
        raise ValueError('at least one month label is required')
    # short labels (e.g. '01') are too ambiguous to match inside longer
    # tokens, so they have to be delimited
    bounds_re = r'(_|\b)' if len(labels[0]) < 3 else r'()'
    alternatives = '|'.join(re.escape(label) for label in labels)
    return bounds_re + r'(?P<month>' + alternatives + ')' + bounds_re
def mk_year_re(name_sample):
    """
    Return a regex string that matches a 4-digit year in `name_sample`; the
    year is captured in a group named 'year'.

    Two candidates are tried in order and the first one found in
    `name_sample` is returned:
      1. last year, the current year, or next year, anywhere in the string
         (no delimiters required);
      2. any four digits delimited on both sides by a word boundary or '_'.
    If neither candidate matches the result is ``None``.
    """
    this_year = date.today().year
    nearby_years = '|'.join(
        '{:04d}'.format(year)
        for year in (this_year - 1, this_year, this_year + 1)
    )
    # (year alternatives, delimiter alternatives) in order of preference
    attempts = (
        (nearby_years, ''),
        (r'\d{4}', r'_|\b'),
    )
    for year_alt, edge in attempts:
        candidate = '(' + edge + ')(?P<year>' + year_alt + ')(' + edge + ')'
        if re.search(candidate, name_sample):
            return candidate
    return None
def ym_span_pattern(date_re_pattern):
    """
    Cut `date_re_pattern` (a regex string) down to the part that starts at
    its first named 'year' or 'month' group and ends with its last one,
    including anything in between (suitable for the drop and sub options).

    `date_re_pattern` must contain named groups for month and year.
    Raises `DateRenameError` if no such span can be found.
    """
    # matches the *source text* of two named groups and whatever separates
    # them; a group's body is assumed not to contain ')'
    span_re = r'\(\?P\<(year|month)\>[^)]+\).*\(\?P\<(year|month)\>[^)]+\)'
    hit = re.search(span_re, date_re_pattern)
    try:
        span = hit[0]
    except TypeError as e:
        # `hit` is None, i.e. the pattern lacks the two named groups
        message = 'invalid `date_re` pattern: {}'.format(date_re_pattern)
        raise DateRenameError(message) from e
    return span
def new_name(filename, date_re, month_map, position, sep, sub_re, drop):
    """
    Given a filename, return the new name with all pre/post-fixes,
    substitutions, and deletions applied.

    Parameters:
        filename:  original name of the file (without directory)
        date_re:   compiled regex with named groups 'month' and 'year'
        month_map: dict mapping the text matched by 'month' to the month
                   number
        position:  'pre', 'post' or 'sub'; only used if `sub_re` is falsy
        sep:       separator between YYYY-MM and the rest of the name for
                   'pre' and 'post'
        sub_re:    compiled regex or None; if given, every match of it in
                   `filename` is replaced by YYYY-MM and `position` and
                   `drop` are ignored
        drop:      for 'pre'/'post': remove the original year/month span
                   from the name before adding the YYYY-MM pre-/postfix

    Raises `DateRenameError` if month/year cannot be extracted, if the month
    label is not a key of `month_map`, or if a requested substitution does
    not change anything. A failed `drop` only logs a warning.
    """
    date_match = date_re.search(filename)
    try:
        month = date_match['month']
        year = date_match['year']
    except (TypeError, IndexError) as e:
        # TypeError: no match at all (None); IndexError: the regex lacks one
        # of the named groups
        raise DateRenameError("could not extract month and/or year from '{}' "
                "with regex {}".format(filename, date_re.pattern)) from e
    try:
        month = '{:02d}'.format(month_map[month])
    except KeyError as e:
        raise DateRenameError("month label '{}' found in '{}' is not a known "
                "month".format(month, filename)) from e
    date_str = year + '-' + month

    if sub_re:
        new_fn, num_subs = sub_re.subn(date_str, filename)
        if num_subs == 0:
            raise DateRenameError("could not replace {} in '{}' with "
                    "'{}'".format(sub_re.pattern, filename, date_str))
    elif position == 'sub':
        new_fn, num_subs = re.subn(ym_span_pattern(date_re.pattern), date_str,
                                   filename)
        if num_subs == 0:
            raise DateRenameError("could not replace year/month span in '{}' "
                    "with parameter 'sub'".format(filename))
    else:
        if drop:
            filename, num_subs = re.subn(ym_span_pattern(date_re.pattern),
                                         '', filename)
            if num_subs == 0:
                logging.warning("could not drop year/month span from '%s'",
                                filename)
        tmpl = {
            'pre': '{year}-{month}{sep}{root}{ext}',
            'post': '{root}{sep}{year}-{month}{ext}'
        }
        # the postfix goes in front of the extension, hence the split
        root, ext = os.path.splitext(filename)
        new_fn = tmpl[position].format(year=year, month=month, sep=sep,
                                       root=root, ext=ext)
    return new_fn
def _infer_date_re(name, month_map):
    """
    Compile a regex that matches month and year in `name`, in the order in
    which they appear there. Returns None if either cannot be found.
    """
    m_re = mk_month_re(month_map.keys())
    y_re = mk_year_re(name)
    if y_re is None:
        return None
    m_match = re.search(m_re, name)
    y_match = re.search(y_re, name)
    if m_match is None or y_match is None:
        return None
    if m_match.start('month') < y_match.start('year'):
        return re.compile(m_re + r'.*' + y_re)
    return re.compile(y_re + r'.*' + m_re)


def main(args):
    """
    Rename all files selected by the parsed command line arguments `args`.

    The month map comes from `--mfmt` if given, otherwise it is derived from
    the first filename in which a month can be recognised. The year/month
    regex is `--custom` if given, otherwise it is derived from the first
    filename in which both month and year are found, and is then applied to
    all remaining files.

    Files for which no new name can be determined are skipped with a warning.
    With `--no_action` old and new paths are only printed. An existing file
    is never overwritten.
    """
    files = gen_find(args.fn_pattern, args.top, args.descent)
    sub_re = re.compile(args.sub) if args.sub else None
    month_map = cust_month_map(args.mfmt, args.locale) if args.mfmt else None
    date_re = re.compile(args.custom) if args.custom else None

    for path, name in files:
        old_path = os.path.join(path, name)

        if not month_map:
            month_map = std_month_map(name, args.locale)
        if not month_map:
            logging.warning("no month found in '%s'; skipping", old_path)
            continue

        if date_re is None:
            date_re = _infer_date_re(name, month_map)
        if date_re is None:
            logging.warning("no year/month found in '%s'; skipping", old_path)
            continue

        try:
            new_fn = new_name(name, date_re, month_map, args.where, args.sep,
                              sub_re, args.drop)
        except DateRenameError as e:
            logging.warning("skipping '%s': %s", old_path, e)
            continue

        new_path = os.path.join(path, new_fn)
        if args.no_action:
            print('{} -> {}'.format(old_path, new_path))
            continue
        if new_path == old_path:
            # nothing to do, the name already has the requested form
            continue
        if os.path.exists(new_path):
            logging.warning("not renaming '%s': '%s' already exists",
                            old_path, new_path)
            continue
        shutil.move(old_path, new_path)
if __name__ == '__main__':
    # Command line interface: build the parser, validate option
    # combinations, then hand the parsed arguments to `main`.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('fn_pattern', help="""pattern for filenames to include
            in renaming""")
    parser.add_argument('--top', default='.', help="""top-level directory in
            which to look for matching filenames; defaults to current
            directory""")
    parser.add_argument('--descent', action='store_true', help="""if
            specified matching files in subdirectories will also be included in
            renaming""")
    parser.add_argument('-n', '--no_action', action='store_true', help="""if
            specified only old and new names of matching files will be printed
            to stdout, without actually renaming the files""")
    # fall back to en_US if the system locale cannot be determined
    default_locale = getlocale()[0]
    if not default_locale:
        default_locale = 'en_US'
    parser.add_argument('-l', '--locale', default=default_locale, help="""code
            indicating locale for month names; defaults to system locale or
            'en_US' if system locale cannot be determined""")
    # restricting the choices lets argparse reject a mistyped value up front
    # instead of failing later while building the new filename
    parser.add_argument('-w', '--where', default='pre',
            choices=('pre', 'post', 'sub'), help="""placement of
            YYYY-MM string: 'pre', 'post', 'sub' (the latter implies `--drop`
            and will place YYYY-MM at the original location that contained
            year, month, and anything in between, unless a pattern to be
            replaced is specified via `--sub` parameter); defaults to 'pre'""")
    parser.add_argument('--sep', default='_', help="""separator between
            YYYY-MM and rest of filename; defaults to '_'""")
    parser.add_argument('-c', '--custom', help="""custom Python regex to match
            year and month in original filename; the regex must include two
            named groups specifying year and month patterns, i.e. (?P<year>...)
            and (?P<month>...); without a custom regex the program will first
            try to match full names, then abbreviated names, then strict
            3-letter abbreviations (i.e. 'Jun' rather than 'Juni' for de_DE
            locale), then zero-padded decimals for the month; the year will be
            assumed to be a 4-digit number including the century and will be
            limited to the current year +/- 1; zero-padded decimals for month
            will match at word boundaries or underscores as delimiters""")
    parser.add_argument('--mfmt', help="""required if `--custom` is specified:
            defines the month format with valid values being 'B' (locale's full
            name), 'b' (locale's abbreviated name without '.'), 'mmm' (first
            three letters of locale's full name), 'm' (zero-padded decimal
            number); if none of that works a white space separated list of
            month strings in correct order can be supplied""")
    parser.add_argument('-d', '--drop', action='store_true', help="""if
            specified substring matching year, month, and anything in between
            in the original filename will be omitted in the renaming""")
    parser.add_argument('-s', '--sub', help="""Python regex indicating what
            substring in the original filename is to be replaced by
            'YYYY-MM'; if both `--drop` and `--sub` are specified `--sub` will
            be applied first (possibly eliminating a subsequent match for
            `--drop`)""")
    args = parser.parse_args()
    # a custom regex gives no hint about the month format, so it must be
    # stated explicitly
    if args.custom and not args.mfmt:
        parser.error('`--custom` option also requires `--mfmt`')
    main(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
import dtren | |
class TestMonthMaps(unittest.TestCase): | |
mm_exp_res = { | |
('B', 'en_US'): ('March', 3), | |
('b', 'de_DE'): ('März', 3), | |
('mmm', 'de_DE'): ('Jun', 6), | |
('m', 'es'): ('01', 1), | |
} | |
def test_mk_month_map(self): | |
for arg in self.mm_exp_res: | |
with self.subTest(arg=arg): | |
mm = dtren.mk_month_map(*arg) | |
self.assertEqual(mm[self.mm_exp_res[arg][0]], | |
self.mm_exp_res[arg][1]) | |
def test_cust_month_map(self): | |
cust_mlbl = 'a b c d ee fff\tg h i\t j k l' | |
mm = dtren.cust_month_map(cust_mlbl, 'en_US') | |
self.assertEqual(mm['j'], 10) | |
for arg in self.mm_exp_res: | |
with self.subTest(arg=arg): | |
mm = dtren.cust_month_map(*arg) | |
self.assertEqual(mm[self.mm_exp_res[arg][0]], | |
self.mm_exp_res[arg][1]) | |
def test_std_month_map(self): | |
std_mm_cases = { | |
('Report_January2012.xls', 'en_US'): ('March', 3), | |
('Report_Jan2012.xls', 'en_US'): ('Mar', 3), | |
('Report_01.2012.xls', 'en_US'): ('03', 3), | |
('Report_2012März.xls', 'de_DE'): ('April', 4), | |
('Report_Juni-2012.xls', 'de_DE'): ('April', 4), | |
('Report_Mär-2012.xls', 'de_DE'): ('Jun', 6), | |
('Report_Aug-2012.xls', 'de_DE'): ('Juni', 6), | |
} | |
for arg, res in std_mm_cases.items(): | |
with self.subTest(arg=arg): | |
mm = dtren.std_month_map(*arg) | |
self.assertEqual(mm[res[0]], res[1]) | |
class TestFileOps(unittest.TestCase):
    """Placeholder test case; it does not define any tests yet."""
# Run all test cases of this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment