Last active
August 29, 2015 14:00
-
-
Save darius/11351623 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""
Given Python2 source files, print out all the string literals that look
like they were meant to be raw strings, but are ordinary strings instead.
Exit status: 1 if any warnings, else 0.
"""
import token, tokenize | |
def main(argv):
    """Check each file named in argv[1:] and report suspicious literals.

    argv follows the sys.argv convention: argv[0] is ignored and every
    remaining item is a filename.  A file that cannot be opened or
    tokenized is reported on stdout and counts as a failure, but the
    remaining files are still processed.

    Returns the process exit status: 0 if every file was read and found
    clean, else 1.
    """
    import sys  # local import: only needed to choose the open() mode

    # 'U' normalizes newlines on Python 2, which avoids spurious warnings
    # about strings with \ right before crlf.  Python 3 text mode already
    # translates newlines, and Python 3.11+ rejects 'U' with ValueError.
    mode = 'U' if sys.version_info[0] < 3 else 'r'

    ok = True
    for filename in argv[1:]:
        try:
            f = open(filename, mode)
        except IOError as e:
            print(e)
            ok = False
            continue
        with f:
            try:
                ok &= check(f)
            except (tokenize.TokenError, SyntaxError) as e:
                # Both mean the file could not be tokenized to the end.
                print('%s: %s' % (filename, e))
                ok = False
    return 0 if ok else 1
def check(f):
    """Tokenize the open file f and print each suspicious string literal.

    For every STRING token that literal_is_clean() rejects, one report
    is printed: f.name, the starting line (plus '..END' when the literal
    spans several lines), then the literal itself.

    Returns True if no literal was reported, else False.
    """
    all_clean = True
    for tok in tokenize.generate_tokens(f.readline):
        kind, text = tok[0], tok[1]
        if kind != token.STRING or literal_is_clean(text):
            continue
        all_clean = False
        first_line, last_line = tok[2][0], tok[3][0]
        if last_line == first_line:
            span = ''
        else:
            span = '..%d' % (last_line,)
        # Indent the continuation lines of a multi-line literal.
        shown = '\n '.join(text.split('\n'))
        print('%s:%d%s %s' % (f.name, first_line, span, shown))
    return all_clean
def literal_is_clean(tstring):
    """Return true iff the string literal tstring needs no warning.

    tstring is the full source text of the literal, prefix included.
    A literal with an 'r' in its prefix is always accepted; any other
    literal is accepted only when is_clean() approves its quoted part.
    """
    raw_prefix, quoted_part = parse_prefix(tstring)
    flags = raw_prefix.lower()
    assert flags in ('', 'u', 'b', 'r', 'ur', 'br')
    if 'r' in flags:
        return True
    return is_clean(quoted_part, 'u' in flags)
def parse_prefix(s):
    """Split a string literal s, such as 'hello' or ur"howdy", in two.

    Returns the pair (prefix, rest): prefix is everything before the
    first quote character and rest is the remainder, quotes included.
    Raises IndexError if s contains no quote character.
    """
    cut = 0
    # Indexing (not slicing) so that running off the end raises IndexError.
    while s[cut] not in ('"', "'"):
        cut += 1
    return s[:cut], s[cut:]
def is_clean(s, is_unicode):
    """Given a non-raw string literal s, return true iff all escape
    sequences have a meaning as defined at
    https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals

    s is the quoted part of the literal (quotes included, prefix removed).
    is_unicode tells whether the literal had a 'u' prefix; the u, U and N
    escapes are only accepted in that case.  An N escape must be followed
    by a brace-enclosed, non-empty name made of letters, digits, spaces
    and hyphens; the name is not looked up in the Unicode database.
    """
    octal_digits = '01234567'
    name_chars = 'abcdefghijklmnopqrstuvwxyz0123456789 -'
    # Scan by index instead of re-slicing s, which was quadratic.
    i, n = 0, len(s)
    while i < n:
        if s[i] != '\\':
            i += 1
            continue
        # Slice rather than index: a backslash at the very end yields '',
        # which the first membership test below accepts.
        c = s[i + 1:i + 2]
        if c in "\n\\'" '"abfnrtv':
            i += 2
        elif c == 'u' and is_unicode:
            if not ok_hex(s[i + 2:i + 6], 4):
                return False
            i += 6
        elif c == 'U' and is_unicode:
            if not ok_hex(s[i + 2:i + 10], 8):
                return False
            i += 10
        elif c == 'N' and is_unicode:
            close = s.find('}', i + 3)
            if s[i + 2:i + 3] != '{' or close < 0:
                return False
            name = s[i + 3:close].lower()
            if not name or not all(ch in name_chars for ch in name):
                return False
            i = close + 1
        elif c in octal_digits:
            # An octal escape has one to three digits.
            j = i + 2
            while j < min(n, i + 4) and s[j] in octal_digits:
                j += 1
            if 255 < int(s[i + 1:j], 8):
                return False  # XXX is this check needed?
            i = j
        elif c == 'x':
            if not ok_hex(s[i + 2:i + 4], 2):
                return False
            i += 4
        else:
            return False
    return True
def ok_hex(digits, n):
    """Return true iff digits is exactly n hexadecimal digits (any case)."""
    if len(digits) != n:
        return False
    return set(digits.lower()).issubset('0123456789abcdef')
if __name__ == '__main__':
    # Run as a script: main()'s return value becomes the exit status.
    import sys
    raise SystemExit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment