
@ambv
Last active April 23, 2018 00:34
Unified diff between Lib/tokenize.py and Lib/lib2to3/pgen2/tokenize.py
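
The diff below shows where lib2to3's pgen2 tokenizer still diverges from the stdlib tokenize module: relative imports, Python 2 long/octal literal suffixes, the `<>` operator and backticks, `ur` string prefixes, ASYNC/AWAIT handling around `async def`, and a legacy `tokeneater` callback path. As a quick illustration (not part of the diff, assuming a CPython install with lib2to3 available), the pgen2 tokenizer still recognizes Python 2-only constructs as single tokens:

import io
from lib2to3.pgen2 import tokenize as pgen2_tokenize

# Python 2-only constructs covered by the regex changes below:
# the `<>` operator, backticks, and an octal literal with an `L` suffix.
source = "if x <> 0777L:\n    y = `x`\n"

for tok in pgen2_tokenize.generate_tokens(io.StringIO(source).readline):
    print(tok[:2])
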
--- Lib/tokenize.py 2018-04-22 17:33:48.000000000 -0700
+++ Lib/lib2to3/pgen2/tokenize.py 2018-04-22 17:32:55.000000000 -0700
@@ -31,14 +31,15 @@
import itertools as _itertools
import re
import sys
-from token import *
+from .token import *
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
-import token
+from . import token
__all__ = token.__all__ + ["tokenize", "detect_encoding",
- "untokenize", "TokenInfo"]
+ "untokenize", "TokenInfo",
+ "generate_tokens"]
del token
EXACT_TOKEN_TYPES = {
@@ -114,10 +115,10 @@
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'
-Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+[lL]?'
Binnumber = r'0[bB](?:_?[01])+'
-Octnumber = r'0[oO](?:_?[0-7])+'
-Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
+Octnumber = r'0[oO]?(?:_?[0-7])+[lL]?'
+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
@@ -134,7 +135,7 @@
# 'rf'). The various permutations will be generated.
_valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
# if we add binary f-strings, add: ['fb', 'fbr']
- result = {''}
+ result = {'', 'ur', 'Ur', 'uR', 'UR'}
for prefix in _valid_string_prefixes:
for t in _itertools.permutations(prefix):
# create a list with upper and lower versions of each
@@ -167,12 +168,13 @@
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+ r"<>",
r"//=?", r"->",
r"[+\-*/%&@|^=<>]=?",
r"~")
Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@`]')
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
@@ -283,7 +285,7 @@
self.encoding = tokval
continue
- if toknum in (NAME, NUMBER):
+ if toknum in (NAME, NUMBER, ASYNC, AWAIT):
tokval += ' '
# Insert a space between two consecutive strings
@@ -455,7 +457,7 @@
raise
-def tokenize(readline):
+def tokenize(readline, tokeneater=None):
"""
The tokenize() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
@@ -473,7 +475,21 @@
The first token sequence will always be an ENCODING token
which tells you which encoding was used to decode the bytes stream.
+
+ The `tokeneater` argument is deprecated and intentionally undocumented.
"""
+ if tokeneater:
+ import warnings
+ warnings.warn("The `tokeneater` argument to tokenize() is deprecated. "
+ "Use `for token in tokenize(readline): tokeneater(*token)` "
+ "instead. Note: readline should return bytes.",
+ PendingDeprecationWarning)
+ try:
+ tokenize_loop(readline, tokeneater)
+ except StopTokenizing:
+ pass
+ return
+
encoding, consumed = detect_encoding(readline)
empty = _itertools.repeat(b"")
rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
@@ -487,6 +503,12 @@
contline = None
indents = [0]
+ # 'stashed' and 'async_*' are used for async/await parsing
+ stashed = None
+ async_def = False
+ async_def_indent = 0
+ async_def_nl = False
+
if encoding is not None:
if encoding == "utf-8-sig":
# BOM will already have been stripped.
@@ -540,13 +562,16 @@
if pos == max:
break
+ if stashed:
+ yield stashed
+ stashed = None
+
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
yield TokenInfo(COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
pos += len(comment_token)
-
yield TokenInfo(NL, line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
@@ -561,8 +586,18 @@
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
+ if async_def and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
+ if async_def and async_def_nl and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -581,13 +616,21 @@
(initial == '.' and token != '.' and token != '...')):
yield TokenInfo(NUMBER, token, spos, epos, line)
elif initial in '\r\n':
+ newline = NEWLINE
if parenlev > 0:
- yield TokenInfo(NL, token, spos, epos, line)
- else:
- yield TokenInfo(NEWLINE, token, spos, epos, line)
+ newline = NL
+ elif async_def:
+ async_def_nl = True
+ if stashed:
+ yield stashed
+ stashed = None
+ yield TokenInfo(newline, token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(COMMENT, token, spos, epos, line)
elif token in triple_quoted:
@@ -596,6 +639,9 @@
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
@@ -631,23 +677,65 @@
contline = line
break
else: # ordinary string
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(STRING, token, spos, epos, line)
-
elif initial.isidentifier(): # ordinary name
- yield TokenInfo(NAME, token, spos, epos, line)
+ if token in ('async', 'await'):
+ if async_def:
+ yield TokenInfo(ASYNC if token == 'async' else AWAIT,
+ token, spos, epos, line)
+ continue
+
+ tok = TokenInfo(NAME, token, spos, epos, line)
+ if token == 'async' and not stashed:
+ stashed = tok
+ continue
+
+ if token == 'def':
+ if (stashed
+ and stashed[0] == NAME
+ and stashed[1] == 'async'):
+
+ async_def = True
+ async_def_indent = indents[-1]
+
+ yield TokenInfo(ASYNC, stashed[1],
+ stashed[2], stashed[3],
+ stashed[4])
+ stashed = None
+
+ if stashed:
+ yield stashed
+ stashed = None
+
+ yield tok
elif initial == '\\': # continued stmt
+ # This yield is new; needed for better idempotency:
+ if stashed:
+ yield stashed
+ stashed = None
+ yield TokenInfo(NL, token, spos, (lnum, pos), line)
continued = 1
else:
if initial in '([{':
parenlev += 1
elif initial in ')]}':
parenlev -= 1
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(OP, token, spos, epos, line)
else:
yield TokenInfo(ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos+1), line)
pos += 1
+ if stashed:
+ yield stashed
+ stashed = None
+
for indent in indents[1:]: # pop remaining indent levels
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
@@ -658,6 +746,22 @@
def generate_tokens(readline):
return _tokenize(readline, None)
+
+# An undocumented, backwards compatible, API for users looping over tokens
+# with a callback.
+def tokenize_loop(readline, tokeneater):
+ for token_info in generate_tokens(readline):
+ tokeneater(*token_info)
+
+
+# An undocumented, backwards compatible token eater.
+def printtoken(type, token, spos, epos, line):
+ srow, scol = spos
+ erow, ecol = epos
+ print("%d,%d-%d,%d:\t%s\t%s" % \
+ (srow, scol, erow, ecol, tok_name[type], repr(token)))
+
+
def main():
import argparse
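
For reference, the tail of the diff restores the pre-generator callback API: tokenize() grows an optional tokeneater argument, tokenize_loop() drives the callback, and printtoken() prints each token. A minimal sketch of that usage (not part of the diff; readline returns str here, as generate_tokens expects, and under the version in this diff a PendingDeprecationWarning would be emitted):

import io
from lib2to3.pgen2 import tokenize as pgen2_tokenize

# Callback style: the callable receives each 5-tuple
# (type, string, start, end, line) in turn, equivalent to iterating
# generate_tokens() and applying it yourself.
pgen2_tokenize.tokenize(io.StringIO("x = 1\n").readline,
                        pgen2_tokenize.printtoken)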