
Blog 2019/9/7

<- previous | index | next ->

A Lisp-like interpreter (in Python), step 1b: a performance tweak

Let's learn how to implement a Lisp or Scheme-like language by writing a series of increasingly capable interpreters.

Step 1b: A performance tweak

Our implementation from step 1 actually has a performance issue on this line of the tokenize() function:

        text = text[len(matched_text):]

Here, we are advancing the tokenizer by slicing off the length of the matched text. However, this creates a copy of the remaining text.

Consider an input file consisting of 1,000 OPARENs. Our lexer would:

  • match a paren,
  • advance by making a copy of the remaining 999 bytes of input,
  • match a paren,
  • advance by making a copy of the remaining 998 bytes of input,
  • match a paren,
  • advance by making a copy of the remaining 997 bytes of input,
  • and so on...

All of this string copying will slow things down when processing large input files.
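
To see the cost of the copying in isolation, here is a minimal timing sketch (not part of lex.py; the two helper functions are made up purely for illustration) that advances through a large string both ways:

import timeit

text = "(" * 100000  # 100,000 OPARENs

def advance_by_slicing(text):
    # what step 1 does: copy the remaining text on every step
    while len(text) > 0:
        text = text[1:]

def advance_by_offset(text):
    # what step 1b will do: keep one string and move an index
    offset = 0
    while (len(text) - offset) > 0:
        offset += 1

print(timeit.timeit(lambda: advance_by_slicing(text), number=1))
print(timeit.timeit(lambda: advance_by_offset(text), number=1))

The exact numbers will vary by machine, but the slicing version is dramatically slower, and the gap widens as the input grows.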

We can solve this performance problem by making a few tweaks to our code.

The key is that the match() method of Python's compiled regex objects accepts an optional position argument. This means that rather than making all of those string copies, we can keep just one string and track our offset into it.
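
For example (an illustrative snippet, separate from lex.py):

import re

regex = re.compile(r"\d+")
text = "(1 42 65536)"

print(regex.match(text))             # None: the string doesn't start with a digit
print(regex.match(text, 1).group())  # '1': matching starts at offset 1
print(regex.match(text, 3).group())  # '42': matching starts at offset 3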

As Armin Ronacher says:

This is immensely useful for building lexers because you can continue to use the special ^ symbol to indicate the beginning of a line or entire string. We just need to increase the index to match further. It also means we do not have to slice up the string ourselves which saves a ton of memory allocations and string copying in the process...

Here's the gist of the changes:

$ diff -urN step1/lex.py step1b/lex.py 
--- step1/lex.py	2019-09-07 20:19:24.000000000 -0500
+++ step1b/lex.py	2019-09-07 20:15:07.000000000 -0500
@@ -11,9 +11,9 @@
             tdefs.append(pair)
     return tdefs
 
-def next_token(tdefs, text):
+def next_token(tdefs, text, offset=0):
     for (token_name, regex) in tdefs:
-        m = regex.match(text)
+        m = regex.match(text, offset)
         if m is None:
             continue
         else:
@@ -33,11 +33,12 @@
 
 def tokenize(tdefs, text):
     tokens = []
-    while len(text) > 0:
-        token = next_token(tdefs, text)
+    offset = 0
+    while (len(text) - offset) > 0:
+        token = next_token(tdefs, text, offset)
         tokens.append(token)
         matched_text = token[1]
-        text = text[len(matched_text):]
+        offset += len(matched_text)
     return tokens
 
 def _test_tokenize():

Modify next_token() to accept an offset

We'll add an offset parameter to next_token(), and pass that offset through to the compiled pattern's match() method. We'll give offset a default value of zero, making it an optional parameter, so existing callers don't need to change.

def next_token(tdefs, text, offset=0):
    for (token_name, regex) in tdefs:
        m = regex.match(text, offset)
        if m is None:
            continue
        else:
            matched_text = m.group()
            token = (token_name, matched_text)
            return token
    raise Exception("Don't know how to lex '%s'" % text[:16])
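
For example, with the token definitions from step 1 (this assumes tokendefs.txt is in the current directory):

>>> import lex
>>> tdefs = lex.load_tokendefs("tokendefs.txt")
>>> lex.next_token(tdefs, "(1 2 3)")
('OPAREN', '(')
>>> lex.next_token(tdefs, "(1 2 3)", 1)
('INT', '1')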

Modify tokenize() to track an offset instead of using string slicing

Track the offset by adding the length of the matched text to it after each token is generated, and update the while predicate to take this into account:

def tokenize(tdefs, text):
    tokens = []
    offset = 0
    while (len(text) - offset) > 0:
        token = next_token(tdefs, text, offset)
        tokens.append(token)
        matched_text = token[1]
        offset += len(matched_text)
    return tokens
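
The observable behavior is unchanged from step 1; only the bookkeeping differs:

>>> import lex
>>> tdefs = lex.load_tokendefs("tokendefs.txt")
>>> lex.tokenize(tdefs, "(42)")
[('OPAREN', '('), ('INT', '42'), ('CPAREN', ')')]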

Re-run the tests

$ python
Python 2.7.16 (default, Mar  4 2019, 09:02:22) 
[GCC 4.2.1 Compatible Apple LLVM 10.0.0 (clang-1000.11.45.5)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import lex
>>> lex._test()
>>> 

All good :)
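
If you want to measure the improvement end-to-end, a rough benchmark sketch might look like this (it assumes the step1/ and step1b/ directories from this series sit side by side, each containing its own lex.py, with a tokendefs.txt in the current directory):

import imp
import time

lex_old = imp.load_source("lex_old", "step1/lex.py")
lex_new = imp.load_source("lex_new", "step1b/lex.py")

text = "(" * 100000  # a large input consisting of nothing but OPARENs

for name, lexer in [("step1", lex_old), ("step1b", lex_new)]:
    tdefs = lexer.load_tokendefs("tokendefs.txt")
    start = time.time()
    lexer.tokenize(tdefs, text)
    print("%s: %.3f seconds" % (name, time.time() - start))

The exact timings depend on your machine, but the step1b version pulls further ahead as the input grows.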

Continued in Step 2.

lex.py

def load_tokendefs(fpath):
    # Read alternating lines of token name and regex pattern from fpath.
    import re
    tdefs = []
    with open(fpath) as f:
        for line in f:
            token_name = line.rstrip('\n')
            line2 = f.next()
            pattern = line2.rstrip('\n')
            regex = re.compile(pattern)
            pair = (token_name, regex)
            tdefs.append(pair)
    return tdefs

def next_token(tdefs, text, offset=0):
    # Return the first (token_name, matched_text) pair whose regex matches
    # text at the given offset.
    for (token_name, regex) in tdefs:
        m = regex.match(text, offset)
        if m is None:
            continue
        else:
            matched_text = m.group()
            token = (token_name, matched_text)
            return token
    raise Exception("Don't know how to lex '%s'" % text[:16])

def _test_next_token():
    tdefs = load_tokendefs("tokendefs.txt")
    token = next_token(tdefs, "42 ")
    assert token == ("INT", "42")
    token = next_token(tdefs, " 17")
    assert token == ("WSPACE", " ")
    token = next_token(tdefs, "(1 2 3)")
    assert token == ("OPAREN", "(")

def tokenize(tdefs, text):
    # Turn text into a list of tokens, advancing an offset instead of
    # slicing the input.
    tokens = []
    offset = 0
    while (len(text) - offset) > 0:
        token = next_token(tdefs, text, offset)
        tokens.append(token)
        matched_text = token[1]
        offset += len(matched_text)
    return tokens

def _test_tokenize():
    tdefs = load_tokendefs("tokendefs.txt")
    tokens = tokenize(tdefs, "(1 42 65536)")
    assert tokens == [
        ("OPAREN", "("),
        ("INT", "1"),
        ("WSPACE", " "),
        ("INT", "42"),
        ("WSPACE", " "),
        ("INT", "65536"),
        ("CPAREN", ")")
    ]

def _test():
    _test_next_token()
    _test_tokenize()

if __name__ == "__main__":
    # Lex a file named on the command line (or stdin) and pretty-print the tokens.
    tdefs = load_tokendefs("tokendefs.txt")
    import sys
    if len(sys.argv) > 1:
        fname = sys.argv[-1]
        text = open(fname).read()
    else:
        text = sys.stdin.read()
    tokens = tokenize(tdefs, text)
    import pprint
    pprint.pprint(tokens)

tokendefs.txt
INT
\d+
OPAREN
\(
CPAREN
\)
WSPACE
\s+