# darius/overcooked.py

## overcooked.py
#!/usr/bin/env python
"""
Given Python2 source files, print out all the string literals that look
like they were meant to be raw strings, but are ordinary strings instead.
Exit status: 1 if any warnings, else 0.
"""

import token, tokenize

def main(argv):
    """Check every file named in argv[1:] and report suspicious literals.

    Each file is opened with newline normalization and handed to check().
    A file that cannot be opened, tokenized or parsed is reported on
    stdout and counts as a failure; the remaining files are still
    processed.

    Returns the exit status: 0 if every file was read and produced no
    warnings, else 1.
    """
    import sys

    # 'U' normalizes newlines on Python 2: this avoids spurious warnings
    # about strings with \ right before crlf.  Python 3 text mode already
    # normalizes newlines, and Python 3.11+ rejects 'U' with a ValueError.
    mode = 'U' if sys.version_info[0] < 3 else 'r'
    ok = True
    for filename in argv[1:]:
        try:
            f = open(filename, mode)
        except IOError as e:
            print(e)
            ok = False
            continue
        with f:
            try:
                ok &= check(f)
            except (tokenize.TokenError, SyntaxError) as e:
                print('%s: %s' % (filename, e))
                ok = False
    return 0 if ok else 1

def check(f):
    """Tokenize the open source file f and print one warning for each
    string literal that literal_is_clean() rejects.

    A warning shows f.name, the line the literal starts on (followed by
    '..N' when it ends on a later line N) and the literal's text with its
    continuation lines indented.  Returns True iff no warning was printed.
    """
    all_clean = True
    for tok in tokenize.generate_tokens(f.readline):
        kind, text = tok[0], tok[1]
        if kind != token.STRING or literal_is_clean(text):
            continue
        all_clean = False
        first_row = tok[2][0]
        last_row = tok[3][0]
        if last_row == first_row:
            span = ''
        else:
            span = '..%d' % (last_row,)
        shown = '\n  '.join(text.split('\n'))
        print('%s:%d%s %s' % (f.name, first_row, span, shown))
    return all_clean

def literal_is_clean(tstring):
    """Return true iff the string-literal token tstring deserves no
    warning: either its prefix marks it as raw, or is_clean() accepts
    every escape in its quoted part (with the unicode-only escapes
    allowed when the prefix contains 'u')."""
    raw_prefix, body = parse_prefix(tstring)
    kind = raw_prefix.lower()
    assert kind in ('', 'u', 'b', 'r', 'ur', 'br')
    if 'r' in kind:
        return True
    return is_clean(body, 'u' in kind)

def parse_prefix(s):
    """Split a string literal s, such as 'hello' or ur"howdy", at its
    first quote character.  Returns the pair (prefix, rest), where rest
    starts with the opening quote."""
    cut = 0
    # Indexing past the end raises IndexError when s contains no quote.
    while s[cut] not in ('"', "'"):
        cut += 1
    return s[:cut], s[cut:]

def is_clean(s, is_unicode):
    """Given a non-raw string literal s, return true iff all escape
    sequences have a meaning as defined at
    https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals

    s is the literal's text without its prefix.  is_unicode says whether
    the unicode-only escapes (backslash followed by u, U or N) are
    allowed.  A backslash with nothing after it counts as meaningless, and
    so does a backslash-N that is not followed by a non-empty {name}; the
    name itself is not looked up.
    """
    simple_escapes = "\n\\'" '"abfnrtv'
    octal_digits = '01234567'
    # Walk s by index: re-slicing s for every character is quadratic.
    pos, end = 0, len(s)
    while pos < end:
        if s[pos] != '\\':
            pos += 1
            continue
        code = s[pos + 1:pos + 2]
        if not code:
            # A lone backslash at the very end escapes nothing.  Without
            # this test, '' would pass the membership tests below.
            return False
        if code in simple_escapes:
            pos += 2
        elif code == 'u' and is_unicode:
            if not ok_hex(s[pos + 2:pos + 6], 4): return False
            pos += 6
        elif code == 'U' and is_unicode:
            if not ok_hex(s[pos + 2:pos + 10], 8): return False
            pos += 10
        elif code == 'N' and is_unicode:
            # Require '{', at least one name character, then '}'.
            close = s.find('}', pos + 3)
            if s[pos + 2:pos + 3] != '{' or close <= pos + 3:
                return False
            pos = close + 1
        elif code in octal_digits:
            # One to three octal digits.
            stop = pos + 2
            while stop < min(pos + 4, end) and s[stop] in octal_digits:
                stop += 1
            if 255 < int(s[pos + 1:stop], 8): return False # XXX is this check needed?
            pos = stop
        elif code == 'x':
            if not ok_hex(s[pos + 2:pos + 4], 2): return False
            pos += 4
        else:
            return False
    return True

def ok_hex(digits, n):
    """Return True iff digits consists of exactly n hexadecimal digits
    (upper or lower case)."""
    if len(digits) != n:
        return False
    for ch in digits.lower():
        if ch not in '0123456789abcdef':
            return False
    return True


# Script entry point: run main() on the command-line arguments and use
# its return value as the process exit status.
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))
	#!/usr/bin/env python
	"""
	Given Python2 source files, print out all the string literals that look
	like they were meant to be raw strings, but are ordinary strings instead.
	Exit status: 1 if any warnings, else 0.
	"""

	import token, tokenize

	def main(argv):
	    """Check every file named in argv[1:] and report suspicious literals.

	    Each file is opened with newline normalization and handed to check().
	    A file that cannot be opened, tokenized or parsed is reported on
	    stdout and counts as a failure; the remaining files are still
	    processed.

	    Returns the exit status: 0 if every file was read and produced no
	    warnings, else 1.
	    """
	    import sys

	    # 'U' normalizes newlines on Python 2: this avoids spurious warnings
	    # about strings with \ right before crlf.  Python 3 text mode already
	    # normalizes newlines, and Python 3.11+ rejects 'U' with a ValueError.
	    mode = 'U' if sys.version_info[0] < 3 else 'r'
	    ok = True
	    for filename in argv[1:]:
	        try:
	            f = open(filename, mode)
	        except IOError as e:
	            print(e)
	            ok = False
	            continue
	        with f:
	            try:
	                ok &= check(f)
	            except (tokenize.TokenError, SyntaxError) as e:
	                print('%s: %s' % (filename, e))
	                ok = False
	    return 0 if ok else 1

	def check(f):
	    """Tokenize the open source file f and print one warning for each
	    string literal that literal_is_clean() rejects.

	    A warning shows f.name, the line the literal starts on (followed by
	    '..N' when it ends on a later line N) and the literal's text with its
	    continuation lines indented by two spaces.  Returns True iff no
	    warning was printed.
	    """
	    ok = True
	    for ttype, tstring, (start, _1), (end, _2), _3 in tokenize.generate_tokens(f.readline):
	        if ttype == token.STRING and not literal_is_clean(tstring):
	            ok = False
	            to_end = '' if end == start else '..%d' % (end,)
	            indented_string = tstring.replace('\n', '\n  ')
	            print('%s:%d%s %s' % (f.name, start, to_end, indented_string))
	    return ok

	def literal_is_clean(tstring):
	    """Return true iff the string-literal token tstring deserves no
	    warning: either its prefix marks it as raw, or is_clean() accepts
	    every escape in its quoted part (with the unicode-only escapes
	    allowed when the prefix contains 'u')."""
	    prefix, quoted_part = parse_prefix(tstring)
	    prefix = prefix.lower()
	    assert prefix in ('', 'u', 'b', 'r', 'ur', 'br')
	    return 'r' in prefix or is_clean(quoted_part, 'u' in prefix)

	def parse_prefix(s):
	    """Given a string literal s, such as 'hello' or ur"howdy", return a
	    pair of the prefix before the part in quotes, together with the part
	    in quotes."""
	    prefix = ''
	    # Indexing an exhausted s raises IndexError when there is no quote.
	    while s[0] not in ('"', "'"):
	        prefix, s = prefix + s[0], s[1:]
	    return prefix, s

	def is_clean(s, is_unicode):
	    """Given a non-raw string literal s, return true iff all escape
	    sequences have a meaning as defined at
	    https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals

	    s is the literal's text without its prefix.  is_unicode says whether
	    the unicode-only escapes (backslash followed by u, U or N) are
	    allowed.  A backslash with nothing after it counts as meaningless, and
	    so does a backslash-N that is not followed by a non-empty {name}; the
	    name itself is not looked up.
	    """
	    simple_escapes = "\n\\'" '"abfnrtv'
	    octal_digits = '01234567'
	    # Walk s by index: re-slicing s for every character is quadratic.
	    pos, end = 0, len(s)
	    while pos < end:
	        if s[pos] != '\\':
	            pos += 1
	            continue
	        code = s[pos + 1:pos + 2]
	        if not code:
	            # A lone backslash at the very end escapes nothing.  Without
	            # this test, '' would pass the membership tests below.
	            return False
	        if code in simple_escapes:
	            pos += 2
	        elif code == 'u' and is_unicode:
	            if not ok_hex(s[pos + 2:pos + 6], 4): return False
	            pos += 6
	        elif code == 'U' and is_unicode:
	            if not ok_hex(s[pos + 2:pos + 10], 8): return False
	            pos += 10
	        elif code == 'N' and is_unicode:
	            # Require '{', at least one name character, then '}'.
	            close = s.find('}', pos + 3)
	            if s[pos + 2:pos + 3] != '{' or close <= pos + 3:
	                return False
	            pos = close + 1
	        elif code in octal_digits:
	            # One to three octal digits.
	            stop = pos + 2
	            while stop < min(pos + 4, end) and s[stop] in octal_digits:
	                stop += 1
	            if 255 < int(s[pos + 1:stop], 8): return False # XXX is this check needed?
	            pos = stop
	        elif code == 'x':
	            if not ok_hex(s[pos + 2:pos + 4], 2): return False
	            pos += 4
	        else:
	            return False
	    return True

	def ok_hex(digits, n):
	    """Return True iff digits consists of exactly n hexadecimal digits
	    (upper or lower case)."""
	    return (len(digits) == n
	            and all(d in '0123456789abcdef' for d in digits.lower()))


	# Script entry point: run main() on the command-line arguments and use
	# its return value as the process exit status.
	if __name__ == '__main__':
	    import sys
	    sys.exit(main(sys.argv))