Skip to content

Instantly share code, notes, and snippets.

@ms8r
Last active August 15, 2018 09:41
Show Gist options
  • Save ms8r/b16e770d0998b85913839b53a51b1d90 to your computer and use it in GitHub Desktop.
Save ms8r/b16e770d0998b85913839b53a51b1d90 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Script to rename files whose names include a year and month (e.g. reports,
account statements) such that the respective month appears as YYYY-MM and can
therefore be sorted correctly.
Original filenames can contain month names spelled out, abbreviated, or as
zero-padded decimals (the first two also from different locales). By default
the new filename will simply show the corresponding YYYY-MM as prefix,
separated with an underscore. Command line parameters allow to change this to
postfix or substitution of the original year/month substring(s). Run script
with `-h` to see details.
"""
import re
import os
import os.path
import shutil
import sys
import fnmatch
from datetime import date
from locale import getlocale
import argparse
import logging
from babel.dates import get_month_names
# Configure the root logger once at import time: INFO and above, with each
# record shown as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
class DateRenameError(Exception):
    """
    Raised when a year/month pattern is unusable or a filename cannot be
    converted to its new name.
    """
def gen_find(pattern, top='.', descent=True):
    """
    Yield `(directory, filename)` tuples for every file below `top` whose
    name matches the shell wildcard `pattern`.

    With `descent` set to a false value only the files directly inside `top`
    are considered; otherwise all subdirectories are walked as well.
    """
    for dirpath, _subdirs, filenames in os.walk(top):
        yield from ((dirpath, fname)
                    for fname in fnmatch.filter(filenames, pattern))
        if not descent:
            # the first item produced by os.walk is `top` itself
            return
def mk_month_map(month_fmt='B', locale='en_US'):
    """
    Build a dict that maps month labels to month numbers (1 to 12).

    `month_fmt` selects the kind of label used as key:

    'B':   full month names for `locale` (e.g. 'January')
    'b':   abbreviated month names for `locale` with any trailing '.'
           removed (e.g. 'Okt' for locale 'de-DE' but also 'Juni' for the
           same locale)
    'mmm': the first three letters of the full month names (i.e. 'Jun'
           rather than 'Juni' for 'de-DE')
    'm':   zero-padded decimal number as string ('01' ... '12'); `locale`
           is not used for this format

    Any other value of `month_fmt` yields `None`.
    """
    if month_fmt == 'm':
        return {'{:02d}'.format(number): number for number in range(1, 13)}

    # name width to request from babel, plus the label post-processing step
    name_specs = {
        'B': ('wide', lambda label: label),
        'b': ('abbreviated', lambda label: label.rstrip('.')),
        'mmm': ('wide', lambda label: label[:3]),
    }
    if month_fmt not in name_specs:
        return None

    width, tidy = name_specs[month_fmt]
    names_by_number = get_month_names(width, locale=locale)
    return {tidy(label): number for number, label in names_by_number.items()}
def cust_month_map(month_fmt, locale):
    """
    Build a dict that maps month labels to month numbers (1 to 12).

    If `month_fmt` splits on whitespace into exactly twelve items, these are
    taken as custom labels for January through December, in that order, and
    `locale` is ignored. Any other `month_fmt` is handed to `mk_month_map`
    together with `locale`, so the same format codes are accepted.
    """
    labels = month_fmt.split()
    if len(labels) != 12:
        return mk_month_map(month_fmt, locale)
    return {label: number for number, label in enumerate(labels, start=1)}
def std_month_map(name_sample, locale):
    """
    Pick the month map for `locale` whose labels occur in `name_sample`.

    The formats are tried in a fixed order: full month names ('B'),
    abbreviated names without '.' ('b'), full names cut to three letters
    ('mmm'), and finally zero-padded decimals ('m'). The map of the first
    format whose regex (see `mk_month_re`) matches `name_sample` is returned;
    if none matches the result is `None`.
    """
    # generator keeps the maps lazy: later formats are only built if needed
    candidates = (mk_month_map(fmt, locale) for fmt in ('B', 'b', 'mmm', 'm'))
    for month_map in candidates:
        label_re = mk_month_re(month_map.keys())
        if re.search(label_re, name_sample) is not None:
            return month_map
    return None
def mk_month_re(month_lbls):
    """
    Returns a regex string that will match any of the items in `month_lbls`
    (an iterable of label strings). The portion matching the label will be in
    a match group named 'month'.

    Labels are matched literally (regex metacharacters such as '.' are
    escaped). If the length of the first label is less than three, the label
    must be delimited on both sides by a word boundary or an underscore.

    The regex always has three groups (delimiter, month, delimiter). The
    delimiter groups are zero-width and never consume an underscore, so one
    underscore can separate the month from an adjacent year pattern.

    Raises `ValueError` if `month_lbls` is empty.
    """
    month_lbls = list(month_lbls)
    if not month_lbls:
        raise ValueError('`month_lbls` must contain at least one label')
    if len(month_lbls[0]) < 3:
        # Lookarounds instead of a consuming '_': in 'Report_2012_01' the
        # underscore between year and month is needed by both patterns.
        lead, trail = r'((?<=_)|\b)', r'((?=_)|\b)'
    else:
        lead = trail = r'()'
    alternatives = '|'.join(re.escape(lbl) for lbl in month_lbls)
    return lead + r'(?P<month>' + alternatives + ')' + trail
def mk_year_re(name_sample):
    """
    Returns a regex string that will match a 4-digit year in `name_sample`,
    with the year in a match group named 'year', or `None` if no year can be
    found.

    Will first try to match the current year +/- 1 anywhere in the sample
    (also not at word boundaries), then any 4-digit number delimited on both
    sides by a word boundary or an underscore.

    The regex always has three groups (delimiter, year, delimiter). The
    delimiter groups are zero-width and never consume an underscore, so one
    underscore can separate the year from an adjacent month pattern.
    """
    cur_year = date.today().year
    nearby = '|'.join('{:04d}'.format(y)
                      for y in range(cur_year - 1, cur_year + 2))
    candidates = (
        r'()(?P<year>' + nearby + r')()',
        # Lookarounds instead of a consuming '_': in 'Report_01_1999' the
        # underscore between month and year is needed by both patterns.
        r'((?<=_)|\b)(?P<year>\d{4})((?=_)|\b)',
    )
    for y_re in candidates:
        if re.search(y_re, name_sample):
            return y_re
    return None
def ym_span_pattern(date_re_pattern):
    """
    Cut `date_re_pattern` down to the part that starts at its first named
    year/month group and ends at its last one, including whatever lies in
    between (suitable for the drop and sub options).

    `date_re_pattern` is a regex string that must contain named groups for
    month and year respectively.
    Raises `DateRenameError` if no such span can be found.
    """
    span_finder = r'\(\?P\<(year|month)\>[^)]+\).*\(\?P\<(year|month)\>[^)]+\)'
    found = re.search(span_finder, date_re_pattern)
    try:
        # `found` is None when nothing matched; subscripting it then fails
        span = found[0]
    except TypeError as err:
        message = 'invalid `date_re` pattern: {}'.format(date_re_pattern)
        raise DateRenameError(message) from err
    return span
def new_name(filename, date_re, month_map, position, sep, sub_re, drop):
    """
    Given a filename, returns new name with all pre/post-fixes, substitutions,
    and deletions applied.

    Parameters:
    filename:  original name of the file (without directory)
    date_re:   compiled regex with named groups 'year' and 'month'
    month_map: dict mapping the text matched by group 'month' to the month
               number
    position:  'pre' or 'post' to add 'YYYY-MM' in front of the name or
               between root and extension; 'sub' to replace the span from
               year to month (or month to year) in place
    sep:       separator between 'YYYY-MM' and the rest of the name for
               'pre' and 'post'
    sub_re:    optional compiled regex; if given, everything it matches is
               replaced by 'YYYY-MM' and `position`, `sep` and `drop` are
               not used
    drop:      for 'pre' and 'post', remove the original year/month span
               from the name before adding 'YYYY-MM'; if the span cannot be
               found only a warning is logged

    Raises `DateRenameError` if year or month cannot be extracted from
    `filename`, if the matched month is not a key of `month_map`, or if a
    requested substitution does not replace anything.
    """
    date_match = date_re.search(filename)
    month_lbl = year = None
    if date_match is not None:
        # groupdict() tolerates a regex lacking one of the named groups
        named = date_match.groupdict()
        month_lbl, year = named.get('month'), named.get('year')
    if month_lbl is None or year is None:
        raise DateRenameError("could not extract month and/or year from '{}' "
                              "with regex {}".format(filename,
                                                     date_re.pattern))
    try:
        month = '{:02d}'.format(month_map[month_lbl])
    except KeyError as e:
        raise DateRenameError("month '{}' found in '{}' is not a known month "
                              "label".format(month_lbl, filename)) from e
    date_str = year + '-' + month

    if sub_re:
        new_fn, num_subs = sub_re.subn(date_str, filename)
        if num_subs == 0:
            raise DateRenameError("could not replace {} in '{}' with "
                                  "'{}'".format(sub_re.pattern, filename,
                                                date_str))
    elif position == 'sub':
        new_fn, num_subs = re.subn(ym_span_pattern(date_re.pattern), date_str,
                                   filename)
        if num_subs == 0:
            raise DateRenameError("could not replace year/month span in '{}' "
                                  "with '{}' (parameter 'sub')".format(
                                      filename, date_str))
    else:
        if drop:
            filename, num_subs = re.subn(ym_span_pattern(date_re.pattern),
                                         '', filename)
            if num_subs == 0:
                logging.warning("could not drop year/month span from '%s'",
                                filename)
        tmpl = {
            'pre': '{year}-{month}{sep}{root}{ext}',
            'post': '{root}{sep}{year}-{month}{ext}',
        }
        root, ext = os.path.splitext(filename)
        new_fn = tmpl[position].format(year=year, month=month, sep=sep,
                                       root=root, ext=ext)
    return new_fn
def _auto_date_re(name, month_map):
    """
    Derive a compiled year/month regex from the sample filename `name`, with
    month and year in the order in which they occur in `name`. Returns `None`
    if `name` lacks a month label from `month_map` or a recognisable year.
    """
    m_re = mk_month_re(month_map.keys())
    y_re = mk_year_re(name)
    if y_re is None:
        return None
    m_match = re.search(m_re, name)
    y_match = re.search(y_re, name)
    if m_match is None or y_match is None:
        return None
    if m_match.start() < y_match.start():
        return re.compile(m_re + r'.*' + y_re)
    return re.compile(y_re + r'.*' + m_re)


def main(args):
    """
    Rename all files selected by the parsed command line arguments `args`.

    Unless given via `args.mfmt` / `args.custom`, the month map and the
    year/month regex are derived from the first matching file that contains
    a recognisable month and year, and are then reused for all other files.
    Files that cannot be converted are skipped with a warning. With
    `args.no_action` set, old and new names are only printed to stdout.
    """
    files = gen_find(args.fn_pattern, args.top, args.descent)
    sub_re = re.compile(args.sub) if args.sub else None
    month_map = cust_month_map(args.mfmt, args.locale) if args.mfmt else None
    date_re = re.compile(args.custom) if args.custom else None

    for path, name in files:
        if not month_map:
            month_map = std_month_map(name, args.locale)
            if not month_map:
                logging.warning("no month found in '%s', skipping", name)
                continue
        if date_re is None:
            date_re = _auto_date_re(name, month_map)
            if date_re is None:
                logging.warning("no year/month found in '%s', skipping", name)
                continue
        try:
            renamed = new_name(name, date_re, month_map, args.where, args.sep,
                               sub_re, args.drop)
        except DateRenameError as e:
            logging.warning('skipping: %s', e)
            continue
        old_path = os.path.join(path, name)
        new_path = os.path.join(path, renamed)
        if args.no_action:
            print('{} -> {}'.format(old_path, new_path))
        else:
            shutil.move(old_path, new_path)
if __name__ == '__main__':
    # Command line interface; the module docstring doubles as the program
    # description. Continuation lines of the help texts start at column 0 so
    # that the help strings stay exactly as they were.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('fn_pattern', help="""pattern for filenames to include
in renaming""")
    parser.add_argument('--top', default='.', help="""top-level directory in
which to look for matching filenames; defaults to current
directory""")
    parser.add_argument('--descent', action='store_true', help="""if
specified matching files in subdirectories will also be included in
renaming""")
    parser.add_argument('-n', '--no_action', action='store_true', help="""if
specified only old and new names of matching files will be printed
to stdout, without actually renaming the files""")
    # getlocale() yields (None, None) when no locale can be determined
    default_locale = getlocale()[0]
    if not default_locale:
        default_locale = 'en_US'
    parser.add_argument('-l', '--locale', default=default_locale, help="""code
indicating locale for month names; defaults to system locale or
'en-US' if system locale cannot be determined""")
    # `choices` rejects an unsupported placement at parse time instead of
    # letting it fail later while building the new filename
    parser.add_argument('-w', '--where', default='pre',
                        choices=('pre', 'post', 'sub'), help="""placement of
YYYY-MM string: 'pre', 'post', 'sub' (the latter implies `--drop`
and will place YYYY-MM at the original location that contained
year, month, and anything in between (unless a pattern to be
replaced is specified via `--sub` parameter); defaults to 'pre'""")
    parser.add_argument('--sep', default='_', help="""separator between
YYYY-MM and rest of filename; defaults to '_'""")
    parser.add_argument('-c', '--custom', help="""custom Python regex to match
year and month in original filename; the regex must include two
named groups specifying year and month patterns, i.e. (?P<year>...)
and (?P<month>...); without a custom regex the program will first
try to match full names, then abbreviated names, then strict
3-letter abbreviations (i.e. 'Jun' rather than 'Juni' for de_DE
locale), then zero-padded decimals for the month; the year will be
assumed to be a 4-digit number including the century and will be
limited to the current year +/- 1; zero-padded decimals for month
will match at word boundaries or underscores as delimiters""")
    parser.add_argument('--mfmt', help="""required if `--custom` is specified:
defines the month format with valid values being 'B' (locale's full
name), 'b' (locale's abbreviated name without '.'), 'mmm' (first
three letters of locale's full name), 'm' (zero-padded decimal
number); if none of that works a white space separated list of
month strings in correct order can be supplied""")
    parser.add_argument('-d', '--drop', action='store_true', help=""" if
specified substring matching year, month, and anything in between
in the original filename will be ommitted in the renaming""")
    parser.add_argument('-s', '--sub', help="""Python regex indicating what
substring in the original filename is to be replaced by
'YYYY-MM'; if both `--drop` and `--sub` are specified `--sub` will
be applied first (possibly eliminating a subsequent match for
`--drop`)""")
    args = parser.parse_args()
    # a custom regex gives no hint about the month format, so it must be
    # stated explicitly
    if args.custom and not args.mfmt:
        parser.error('`--custom` option also requires `--mfmt`')
    main(args)
import unittest
import dtren
class TestMonthMaps(unittest.TestCase):
    """
    Tests for the functions in `dtren` that build month-label maps.
    """

    # (month_fmt, locale) -> (label expected as key, expected month number)
    mm_exp_res = {
        ('B', 'en_US'): ('March', 3),
        ('b', 'de_DE'): ('März', 3),
        ('mmm', 'de_DE'): ('Jun', 6),
        ('m', 'es'): ('01', 1),
    }

    def test_mk_month_map(self):
        """Each format/locale pair maps its sample label to the right month."""
        for call_args, (label, number) in self.mm_exp_res.items():
            with self.subTest(arg=call_args):
                month_map = dtren.mk_month_map(*call_args)
                self.assertEqual(month_map[label], number)

    def test_cust_month_map(self):
        """Twelve custom labels are numbered in order; codes still work."""
        custom_labels = 'a b c d ee fff\tg h i\t j k l'
        custom_map = dtren.cust_month_map(custom_labels, 'en_US')
        self.assertEqual(custom_map['j'], 10)
        for call_args, (label, number) in self.mm_exp_res.items():
            with self.subTest(arg=call_args):
                month_map = dtren.cust_month_map(*call_args)
                self.assertEqual(month_map[label], number)

    def test_std_month_map(self):
        """The map chosen for a sample filename has the expected labels."""
        # ((sample filename, locale), (label expected as key, month number))
        cases = (
            (('Report_January2012.xls', 'en_US'), ('March', 3)),
            (('Report_Jan2012.xls', 'en_US'), ('Mar', 3)),
            (('Report_01.2012.xls', 'en_US'), ('03', 3)),
            (('Report_2012März.xls', 'de_DE'), ('April', 4)),
            (('Report_Juni-2012.xls', 'de_DE'), ('April', 4)),
            (('Report_Mär-2012.xls', 'de_DE'), ('Jun', 6)),
            (('Report_Aug-2012.xls', 'de_DE'), ('Juni', 6)),
        )
        for call_args, (label, number) in cases:
            with self.subTest(arg=call_args):
                month_map = dtren.std_month_map(*call_args)
                self.assertEqual(month_map[label], number)
class TestFileOps(unittest.TestCase):
    """
    Placeholder for tests of the file operations; no test cases are defined
    yet.
    """
# Run the test cases defined above when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment