Skip to content

Instantly share code, notes, and snippets.

@darius
Last active August 29, 2015 14:00
Show Gist options
  • Save darius/11351623 to your computer and use it in GitHub Desktop.
Save darius/11351623 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Given Python2 source files, print out all the string literals that look
like they were meant to be raw strings, but are ordinary strings instead.
Exit status: 1 if any warnings, else 0.
"""
import token, tokenize
def main(argv):
    """Check every file named in argv[1:] and report suspect string literals.

    Each file is opened, tokenized and passed to check(). Files that cannot
    be opened or tokenized get a message printed and count as a failure.

    Returns the process exit status: 0 if every file was clean, else 1.
    """
    import sys  # local import: the module only imports sys under __main__

    # On Python 2, 'U' normalizes newlines: this avoids spurious warnings
    # about strings with \ right before crlf.  Python 3 text mode already
    # translates newlines, and the 'U' flag was removed in Python 3.11
    # (open() raises ValueError), so plain 'r' is used there.
    mode = 'U' if sys.version_info[0] < 3 else 'r'

    ok = True
    for filename in argv[1:]:
        try:
            f = open(filename, mode)
        except IOError as e:
            print(e)
            ok = False
            continue
        with f:
            try:
                if not check(f):
                    ok = False
            except (tokenize.TokenError, SyntaxError) as e:
                # The file could not be tokenized; report it and move on
                # to the next file.
                print('%s: %s' % (filename, e))
                ok = False
    return 0 if ok else 1
def check(f):
    """Scan the open file f for string literals that are not clean.

    f must provide readline() and, when a warning is printed, a name
    attribute. Every STRING token rejected by literal_is_clean() is printed
    as 'name:line[..lastline] literal'.

    Returns True if no warning was printed, else False.
    """
    all_clean = True
    for tok in tokenize.generate_tokens(f.readline):
        kind, text, (first_row, _), (last_row, _) = tok[:4]
        if kind != token.STRING or literal_is_clean(text):
            continue
        all_clean = False
        # Only show an end line when the literal spans several lines.
        if last_row == first_row:
            span = ''
        else:
            span = '..%d' % (last_row,)
        shown = text.replace('\n', '\n ')
        print('%s:%d%s %s' % (f.name, first_row, span, shown))
    return all_clean
def literal_is_clean(tstring):
    """Given the source text of a string literal token, return true if it
    is a raw string, or if it is an ordinary string whose escape sequences
    are all accepted by is_clean()."""
    raw_prefix, body = parse_prefix(tstring)
    flags = raw_prefix.lower()
    assert flags in ('', 'u', 'b', 'r', 'ur', 'br')
    if 'r' in flags:
        # Raw strings are what we want to see; nothing further to check.
        return True
    return is_clean(body, 'u' in flags)
def parse_prefix(s):
    """Split a string literal s, such as 'hello' or ur"howdy", at its
    first quote character. Return a pair: the prefix before the quoted
    part, and the quoted part itself (quotes included)."""
    cut = 0
    # Walk forward to the first quote; everything before it is the prefix.
    while s[cut] not in ('"', "'"):
        cut += 1
    return s[:cut], s[cut:]
def is_clean(s, is_unicode):
    """Given a non-raw string literal s (quotes included, prefix removed),
    return true iff all escape sequences have a meaning as defined at
    https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals

    is_unicode says whether the literal is a unicode literal; only then are
    the backslash-u, backslash-U and backslash-N escapes accepted.

    A backslash-N escape must be followed by a non-empty {name}. The name
    itself is not looked up in the Unicode database.
    """
    simple = "\n\\'" '"abfnrtv'   # single-character escapes, incl. line continuation
    octal = '01234567'
    # Scan by index rather than re-slicing s, which would copy the rest of
    # the literal at every step.
    i, n = 0, len(s)
    while i < n:
        if s[i] != '\\':
            i += 1
            continue
        # One-character slice: empty if the backslash is the last character,
        # in which case `in simple` is true and the scan just ends.
        kind = s[i + 1:i + 2]
        if kind in simple:
            i += 2
        elif kind == 'u' and is_unicode:
            if not ok_hex(s[i + 2:i + 6], 4):
                return False
            i += 6
        elif kind == 'U' and is_unicode:
            if not ok_hex(s[i + 2:i + 10], 8):
                return False
            i += 10
        elif kind == 'N' and is_unicode:
            # Require '{', at least one name character, then '}'.
            close = s.find('}', i + 3)
            if s[i + 2:i + 3] != '{' or close <= i + 3:
                return False
            i = close + 1
        elif kind in octal:
            # An octal escape is one to three octal digits.
            end = i + 2
            while end < min(i + 4, n) and s[end] in octal:
                end += 1
            if 255 < int(s[i + 1:end], 8):
                return False  # XXX is this check needed?
            i = end
        elif kind == 'x':
            if not ok_hex(s[i + 2:i + 4], 2):
                return False
            i += 4
        else:
            return False
    return True
def ok_hex(digits, n):
    """Return true iff digits is exactly n characters long and every
    character is a hexadecimal digit (upper or lower case)."""
    if len(digits) != n:
        return False
    return set(digits.lower()) <= set('0123456789abcdef')
if __name__ == '__main__':
    # Script entry point: hand the command line to main() and use its
    # return value as the process exit status.
    import sys
    raise SystemExit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment