# smichr/toy_mailmap.py

## toy_mailmap.py
r"""
Each line in .mailmap is a comment, blank or mapping. A mapping
contains 1 or 2 email addresses and may terminate with a comment
that extends from the first # to the end of the physical line. An
address starts with a left angle bracket and extends through the
first unnested right angle bracket; it need not conform to a
valid email address and may contain space. A name is interpreted
as anything before the first address or between the two addresses
except for leading and trailing space.

Mappings define a pattern that is used to remap an author's name
or address as it appears in git to a canonical form. The pattern
for a single-email entry is the email; the pattern for a
double-email entry is the name and/or email address after the
first email address. Thus, each pattern is either short (an email
address) or long (a name and email address).

Patterns are tested against the pattern defined by the name and
address of a git author. The testing is done without regard to
case. A pattern match indicates that a new name and/or address as
defined in a .mailmap entry should be used. This does not affect
the pattern that is being tested. The name and/or address
existing at the end of matching becomes the canonical name and
address (see FINE PRINT below). Hence, the order of entries having
a given pattern matters. Long patterns are tested first; if there
is no match then short patterns are tested. If there is no match then
the name and address are left unmodified.

LONG PATTERNS

    conditional email change
       <E> B <G> remaps b <g> -> b <E>
    conditional name and email change
       A <E> D <N> remaps d <n> -> A <E>

SHORT PATTERNS

    name change
       A <E> remaps * <e> -> A <e>
    email change
       <E> <D> remaps * <d> -> * <E>
    name and email change
       A <E> <N> remaps * <n> -> A <E>

FINE PRINT (FEATURE)

There is a slight difference in behavior of long and short
patterns: long patterns update BOTH name and address after each
match, supplying the author's name or pattern if the corresponding
value is missing.

    These two .mailmap entries

    E <F> G <H>
    <R> G <h>

    would map G <H> to G <R>: the first match would yield E <F> and
    the second one would change BOTH back to G <R>.

Short pattern matching starts with the name and address of the
author and then updates whatever is supplied by each matching
entry: name and/or address may be updated after each match.

    These .mailmap entries

    A <B>
    D <X> <B>
    C <B>

    would map foo <b> to A <b>, then to D <X>, and finally to C <X>
    so these -- as for all entries that have the same short pattern --
    could be collapsed to C <X> <B>

CLEAN UP POSSIBILITIES

    A <E> X -> A <E>

    A <F>  \
    <E> <F> \-> A <E> <F> \
                           > collapse SHORT entries with identical patterns
    A <E> \               /
    B <E>  \-> B <E>

    A <B> c <d> \
    E <F> C <D>  \
    <H> c <D>     \-> <H> c <D> keep only last long entry

    A <B> C <D> -> A <B> <D> **only if <D> is not ambiguous
      An address is not ambiguous if there is a short entry in .mailmap
      with that address as the key BELOW the occurrence of the last long
      entry...I think. The best way to tell if an address is ambiguous
      is to search all authors when there are no entries in .mailmap.

ERROR/WARNING CONDITION?

    <F> <E>
    <E> <G>
"""
def parse(line):
    """Split one .mailmap (or git author) line into its five fields.

    Everything from the first "#" to the end of the line is a comment.
    What remains must be empty or have the shape

        [name1] <email1> [name2 [<email2>]]

    Returns a pair (n_emails, fields). n_emails is 0 for a blank or
    comment-only line, otherwise the number of addresses found (1 or
    2). fields is the 5-tuple (name1, email1, name2, email2, comment):

    * names are stripped of surrounding space; a missing name or
      address is '';
    * addresses keep their angle brackets, e.g. '<B>';
    * for a one-address line, name2 is whatever text trails the
      address; for a two-address line, text after the second address
      is ignored;
    * comment is '' when the line has no "#", otherwise two spaces
      followed by the text after the "#".

    Raises ValueError when the "<" and ">" counts differ, when there
    are not exactly one or two addresses, or when the brackets are
    nested or out of order. An exception is raised (rather than using
    assert) so the check is still performed under "python -O".
    """
    line = line.strip()
    entry, hash_mark, comment = line.partition("#")
    if hash_mark:
        comment = "  " + comment
    entry = entry.strip()
    if not entry:
        return 0, ('', '', '', '', comment)

    n_emails = entry.count("<")
    if n_emails != entry.count(">") or n_emails not in (1, 2):
        raise ValueError("syntax error: %s" % line)

    def parse_ne(chunk):
        # chunk is the text preceding one ">", i.e. "name <email";
        # it must contain exactly one "<".
        name, bracket, email = chunk.partition("<")
        if not bracket or "<" in email:
            raise ValueError("syntax error: %s" % line)
        return name.strip(), '<%s>' % email

    # Splitting on ">" yields one chunk per address plus the trailing text.
    chunks = entry.split(">")
    n1, e1 = parse_ne(chunks[0])
    if n_emails == 1:
        return 1, (n1, e1, chunks[1].strip(), '', comment)
    n2, e2 = parse_ne(chunks[1])
    return 2, (n1, e1, n2, e2, comment)

def mailmap_pattern(line):
    """Return the lowercased pattern that a .mailmap line matches on.

    A one-address line is keyed by that address alone. Any other line
    is keyed by the second name joined directly to the second address,
    which is the empty string for a blank or comment-only line.
    """
    count, fields = parse(line)
    if count != 1:
        return "".join(fields[2:4]).lower()
    return fields[1].lower()

def author_pattern(line):
    """Return the lowercased long pattern (name joined to address) of
    a git author line.

    The line must contain exactly one address; anything else trips the
    assertion.
    """
    count, fields = parse(line)
    assert count == 1
    name, email = fields[:2]
    return "{}{}".format(name, email).lower()

# Demo input: a small .mailmap (entry order matters) and the git
# authors that are to be remapped.
mailmap = '''
A <B>
D <B> <B>
C <B>
E <F> G <H>
<W> <H>
<R> G <h>
'''.splitlines()

authors = '''
foo <b>
foo <H>
G <H>
'''.strip().splitlines()

from collections import defaultdict

# Group the mailmap entries by pattern, preserving file order within
# each group. A short pattern is a bare address, so it starts with "<".
longs = defaultdict(list)
shorts = defaultdict(list)
for M in mailmap:
    if parse(M)[0] == 0:
        continue  # blank line or comment
    mpat = mailmap_pattern(M)
    bucket = shorts if mpat.startswith('<') else longs
    bucket[mpat].append(M)

for A in authors:
    n_found, dat = parse(A)
    if not n_found:
        continue
    N, E = dat[:2]
    apat = author_pattern(A)
    if apat in longs:
        # Long match: every entry overwrites BOTH values, falling back
        # to the author's own name, so only the last entry matters.
        for li in longs[apat]:
            mdat = parse(li)[1]
            ndef, edef = mdat[0] or N, mdat[1]
    else:
        # Short match: start from the author's values and let each
        # entry override only what it supplies (note the difference
        # from the long case). No match leaves the author unchanged.
        apat = E.lower()
        ndef, edef = N, E
        for li in shorts.get(apat, ()):
            mdat = parse(li)[1]
            ndef, edef = mdat[0] or ndef, mdat[1] or edef
    print('%s --> %s %s' % (A, ndef, edef))

'''
python.exe "f:\toy mailmap.py"
foo <b> --> C <B>
foo <H> --> foo <W>
G <H> --> G <R>
'''
	"""
	Each line in .mailmap is a comment, blank or mapping. A mapping
	contains 1 or 2 email addresses and may terminate with a comment
	that extends from the first # to the end of the physical line. An
	address starts with a left angle bracket and extends through the
	first unnested right angle bracket; it need not conform to a
	valid email address and may contain space. A name is interpreted
	as anything before the first address or between the two addresses
	except for leading and trailing space.

	Mappings define a pattern that is used to remap an author's name
	or address as it appears in git to a canonical form. The pattern
	for a single-email entry is the email; the pattern for a
	double-email entry is the name and/or email address after the
	first email address. Thus, each pattern is either short (an email
	address) or long (a name and email address).

	Patterns are tested against the pattern defined by the name and
	address of a git author. The testing is done without regard to
	case. A pattern match indicates that a new name and/or address as
	defined in a .mailmap entry should be used. This does not affect
	the pattern that is being tested. The name and/or address
	existing at the end of matching becomes the canonical name and
	address (see FINE PRINT below). Hence, the order of entries having
	a given pattern matters. Long patterns are tested first; if there
	is no match then short patterns are tested. If there is no match then
	the name and address are left unmodified.

	LONG PATTERNS

	conditional email change
	<E> B <G> remaps b <g> -> b <E>
	conditional name and email change
	A <E> D <N> remaps d <n> -> A <E>

	SHORT PATTERNS

	name change
	A <E> remaps * <e> -> A <e>
	email change
	<E> <D> remaps * <d> -> * <E>
	name and email change
	A <E> <N> remaps * <n> -> A <E>

	FINE PRINT (FEATURE)

	There is a slight difference in behavior of long and short
	patterns: long patterns update BOTH name and address after each
	match, supplying the author's name or pattern if the corresponding
	value is missing.

	These two .mailmap entries

	E <F> G <H>
	<R> G <h>

	would map G <H> to G <R>: the first match would yield E <F> and
	the second one would change BOTH back to G <R>.

	Short pattern matching starts with the name and address of the
	author and then updates whatever is supplied by each matching
	entry: name and/or address may be updated after each match.

	These .mailmap entries

	A <B>
	D <X> <B>
	C <B>

	would map foo <b> to A <b>, then to D <X>, and finally to C <X>
	so these -- as for all entries that have the same short pattern --
	could be collapsed to C <X> <B>

	CLEAN UP POSSIBILITIES

	A <E> X -> A <E>

	A <F>  \
	<E> <F> \-> A <E> <F> \
	                       > collapse SHORT entries with identical patterns
	A <E> \               /
	B <E>  \-> B <E>

	A <B> c <d> \
	E <F> C <D> \
	<H> c <D> \-> <H> c <D> keep only last long entry

	A <B> C <D> -> A <B> <D> **only if <D> is not ambiguous
	An address is not ambiguous if there is a short entry in .mailmap
	with that address as the key BELOW the occurrence of the last long
	entry...I think. The best way to tell if an address is ambiguous
	is to search all authors when there are no entries in .mailmap.

	ERROR/WARNING CONDITION?

	<F> <E>
	<E> <G>
	"""
	def parse(line):
	    """Split one .mailmap (or git author) line into its five fields.

	    Everything from the first "#" to the end of the line is a comment.
	    What remains must be empty or have the shape

	        [name1] <email1> [name2 [<email2>]]

	    Returns a pair (n_emails, fields). n_emails is 0 for a blank or
	    comment-only line, otherwise the number of addresses found (1 or
	    2). fields is the 5-tuple (name1, email1, name2, email2, comment):

	    * names are stripped of surrounding space; a missing name or
	      address is '';
	    * addresses keep their angle brackets, e.g. '<B>';
	    * for a one-address line, name2 is whatever text trails the
	      address; for a two-address line, text after the second address
	      is ignored;
	    * comment is '' when the line has no "#", otherwise two spaces
	      followed by the text after the "#".

	    Raises ValueError when the "<" and ">" counts differ, when there
	    are not exactly one or two addresses, or when the brackets are
	    nested or out of order. An exception is raised (rather than using
	    assert) so the check is still performed under "python -O".
	    """
	    line = line.strip()
	    entry, hash_mark, comment = line.partition("#")
	    if hash_mark:
	        # Two-space pad, as in the intact definition of parse() earlier
	        # in this file (this copy had its whitespace runs collapsed).
	        comment = "  " + comment
	    entry = entry.strip()
	    if not entry:
	        return 0, ('', '', '', '', comment)

	    n_emails = entry.count("<")
	    if n_emails != entry.count(">") or n_emails not in (1, 2):
	        raise ValueError("syntax error: %s" % line)

	    def parse_ne(chunk):
	        # chunk is the text preceding one ">", i.e. "name <email";
	        # it must contain exactly one "<".
	        name, bracket, email = chunk.partition("<")
	        if not bracket or "<" in email:
	            raise ValueError("syntax error: %s" % line)
	        return name.strip(), '<%s>' % email

	    # Splitting on ">" yields one chunk per address plus the trailing text.
	    chunks = entry.split(">")
	    n1, e1 = parse_ne(chunks[0])
	    if n_emails == 1:
	        return 1, (n1, e1, chunks[1].strip(), '', comment)
	    n2, e2 = parse_ne(chunks[1])
	    return 2, (n1, e1, n2, e2, comment)

	def mailmap_pattern(line):
	    """Return the lowercased pattern that a .mailmap line matches on.

	    A one-address line is keyed by that address alone. Any other line
	    is keyed by the second name joined directly to the second address,
	    which is the empty string for a blank or comment-only line.
	    """
	    n, dat = parse(line)
	    if n == 1:
	        return dat[1].lower()
	    return (dat[2] + dat[3]).lower()

	def author_pattern(line):
	    """Return the lowercased long pattern (name joined to address) of
	    a git author line.

	    The line must contain exactly one address; anything else trips the
	    assertion.
	    """
	    n, dat = parse(line)
	    assert n == 1
	    return (dat[0] + dat[1]).lower()

	# Demo input: a small .mailmap (entry order matters) and the git
	# authors that are to be remapped. The lines inside these literals
	# carry a leading tab; parse() strips each line before reading it,
	# so matching is unaffected.
	mailmap = '''
	A <B>
	D <B> <B>
	C <B>
	E <F> G <H>
	<W> <H>
	<R> G <h>
	'''.splitlines()

	authors = '''
	foo <b>
	foo <H>
	G <H>
	'''.strip().splitlines()

	from collections import defaultdict

	# Group the mailmap entries by pattern, preserving file order within
	# each group. A short pattern is a bare address, so it starts with "<".
	longs = defaultdict(list)
	shorts = defaultdict(list)
	for M in mailmap:
	    if not parse(M)[0]:
	        continue  # blank line or comment
	    mpat = mailmap_pattern(M)
	    if mpat.startswith('<'):
	        shorts[mpat].append(M)
	    else:
	        longs[mpat].append(M)

	for A in authors:
	    n_found, dat = parse(A)
	    if not n_found:
	        continue
	    N, E = dat[:2]
	    apat = author_pattern(A)
	    if apat in longs:
	        # Long match: every entry overwrites BOTH values, falling back
	        # to the author's own name, so only the last entry matters.
	        for li in longs[apat]:
	            _, mdat = parse(li)
	            ndef = mdat[0] or N
	            edef = mdat[1]
	    else:
	        # Short match: start from the author's values and let each
	        # entry override only what it supplies (note the difference
	        # from the long case). No match leaves the author unchanged.
	        apat = E.lower()
	        ndef, edef = N, E
	        for li in shorts.get(apat, ()):
	            _, mdat = parse(li)
	            ndef = mdat[0] or ndef
	            edef = mdat[1] or edef
	    print('%s --> %s %s' % (A, ndef, edef))

	'''
	python.exe "f:\toy mailmap.py"
	foo <b> --> C <B>
	foo <H> --> foo <W>
	G <H> --> G <R>
	'''