# gatopeich/minimal_javascript_prototype.py

## minimal_javascript_prototype.py
# gatopeich's minimal Javascript interpreter prototype

# Coding this as a prototype for a specific-purpose lightweight Javascript
# engine in C/C++.

# DONE: Tokenizer (except quote parsing)
# DONE: Expression extraction
# DONE: Pretty printer
# next: Interpreter...


Punctuators = (
    # https://www.ecma-international.org/ecma-262/5.1/#sec-7.7
    '{','}','(',')','[',']',
    '.',';',',','<','>','<=',
    '>=','==','!=','===','!==',
    '+','-','*','%','++','--',
    '<<','>>','>>>','&','|','^',
    '!','~','&&','||','?',':',
    '=','+=','-=','*=','%=','<<=',
    '>>=','>>>=','&=','|=','^=',
    '/','/=' ) # := DivPunctuators

# Sort longer first, so that a multi-character punctuator is always tried
# before the shorter ones it contains (the sort key is "<length><punctuator>").
Punctuators = sorted(Punctuators, key=lambda p: str(len(p))+p, reverse=True)

# A line whose first token is one of these continues the previous statement,
# so no ';' is injected in front of it.
_CONTINUATION_STARTS = ('(', '[', '/', '+', '-')


def _split_line(line):
    """Cut one source line into (is_string, text) pieces.

    Quoted string literals come out as pieces with is_string True; the text
    between them comes out with is_string False.  Scanning stops at the first
    '//' found outside a string literal, so the comment is never returned.
    """
    pieces = []
    pos = 0         # current scan position
    code_from = 0   # where the pending stretch of plain code starts
    while pos < len(line):
        char = line[pos]
        if char in ('"', "'"):
            # Find the matching quote, stepping over backslash escapes.
            # An unterminated literal simply runs to the end of the line.
            close = pos + 1
            while close < len(line) and line[close] != char:
                close += 2 if line[close] == '\\' else 1
            close = min(close + 1, len(line))
            pieces.append((False, line[code_from:pos]))
            pieces.append((True, line[pos:close]))
            pos = code_from = close
        elif line.startswith('//', pos):
            break
        else:
            pos += 1
    pieces.append((False, line[code_from:pos]))
    return pieces


def tokenize(txt):
    """Split Javascript source text into a flat list of string tokens.

    For every line of txt:
      1. a '//' comment (outside string literals) is dropped;
      2. quoted string literals are kept whole, quotes included, as one token;
      3. the remaining text is split on whitespace and on the entries of
         Punctuators (longest punctuator first);
      4. a ';' token is injected in front of the line's tokens, unless the
         first token is one of '(', '[', '/', '+', '-', in which case the line
         is taken as a continuation of the previous one.
    Lines that yield no tokens contribute nothing.

    Known gaps: /* */ comments, regular-expression literals and string
    literals spanning several lines are not recognised, and '.' is always a
    punctuator, so "1.5" yields three tokens.
    """
    def divide_in_tokens(word):
        # Recursively split a whitespace-free word around the first (i.e.
        # longest) punctuator it contains.
        if not word:
            return []
        for p in Punctuators:
            if p in word:
                before,p,after = word.partition(p)
                return divide_in_tokens(before) + [p] + divide_in_tokens(after)
        return [word]

    tokens = []
    for line in txt.splitlines():
        as_tokens = []
        for is_string, text in _split_line(line):
            if is_string:
                as_tokens.append(text)
            else:
                for word in text.split():
                    as_tokens.extend(divide_in_tokens(word))
        # Inject initial semicolon where adequate
        if as_tokens and as_tokens[0] not in _CONTINUATION_STARTS:
            tokens.append(';')
        tokens.extend(as_tokens)

    return tokens


# Opening bracket -> matching closing bracket
PARENS = {'{':'}','(':')','[':']'}

def to_expressions(tokens, opening = None):
    """Group a flat token list into a nested list of expressions.

    tokens is consumed from the front (the list is modified in place).
    ';' tokens separate expressions; empty expressions are skipped.  Each
    expression is a list whose items are either token strings or nested
    groups.  A group is produced for every '{', '(' or '[' token and has the
    form [opening_token, expression, expression, ...].

    When opening is given, parsing stops at the matching closing token and
    the returned list starts with the opening token; the tokens after the
    closing one are left in the list.  When opening is None the whole list is
    parsed and a plain list of expressions is returned.

    Raises Exception if the tokens run out before the closing token for
    opening is found.
    """
    expressions = [opening] if opening else []
    closing = PARENS[opening] if opening else None
    current_exp = []
    while tokens:
        token = tokens.pop(0)
        if token == closing:
            if current_exp:
                expressions.append(current_exp)
            return expressions
        elif token == ';':
            if current_exp:
                expressions.append(current_exp)
                current_exp = []
        elif token in PARENS:
            current_exp.append(to_expressions(tokens, token))
        else:
            current_exp.append(token)
    if opening:
        raise Exception('No match for "%s"'%opening)
    # Input ended without a final ';': keep the pending expression instead
    # of silently dropping it.
    if current_exp:
        expressions.append(current_exp)
    return expressions


# Sample Javascript source used as input by the demo code further down
# (it is tokenized, split into expressions and pretty-printed).
# NOTE(review): judging by the name, presumably copied from a Wikipedia
# article -- origin not verifiable from this file.
WikipediaExample1='''
    var x = 0; // A global variable, because it is not in any function

    function f() {
      var z = 'foxes', r = 'birds'; // 2 local variables
      m = 'fish'; // global, because it wasn't declared anywhere before

      function child() {
        var r = 'monkeys'; // This variable is local and does not affect the "birds" r of the parent function.
        z = 'penguins'; // Closure: Child function is able to access the variables of the parent function.
      }

      twenty = 20; // This variable is declared on the next line, but usable anywhere in the function, even before, as here
      var twenty;

      child();
      return x; // We can use x here, because it is global
    }

    f();

    console.log(z); // This line will raise a ReferenceError exception, because the value of z is no longer available
'''

class LineCounter():
    """Numbered line-prefix generator used by pretty_print().

    Converting an instance to str yields the next line number, right-aligned
    in three columns, followed by the current prefix.  The conversion itself
    advances the counter.
    """

    def __init__(self, prefix = ': ', lines = 0):
        # lines:  how many prefixes have been produced so far
        # prefix: text placed right after the line number
        self.lines = lines
        self.prefix = prefix

    def __str__(self):
        # Side effect on purpose: every rendering consumes one line number.
        self.lines = self.lines + 1
        return '%3d%s' % (self.lines, self.prefix)

    def indent(self):
        """Append two spaces to the prefix; returns self."""
        self.prefix = self.prefix + '  '
        return self

    def unindent(self):
        """Remove the last two characters of the prefix; returns self."""
        self.prefix = self.prefix[:-2]
        return self


def pretty_print(expressions, linecount = None):
    """Print nested expressions (as built by to_expressions) to stdout.

    expressions is an iterable of expressions; each expression is a list of
    token strings and nested groups of the form [opening_token, expr, ...].

    Called without linecount (the top-level call), a fresh LineCounter is
    created, every expression starts on a new numbered line and two newlines
    are printed at the end.  Called with a linecount (the recursive calls),
    the first expression continues on the current line and only the following
    ones start a new numbered line.

    '(' groups are printed inline as "( ... ) ".  Any other group prints its
    opening token, puts its content on new lines with the prefix indented,
    and prints the matching closing token on a line of its own.
    """
    top_level = not linecount
    if top_level:
        linecount = LineCounter()
    for index, expression in enumerate(expressions):
        if top_level or index > 0:
            print(end='\n%s' % linecount)
        for item in expression:
            if type(item) is str:
                print(item, end=' ')
                continue
            bracket = item[0]
            if bracket == '(':
                print(end='( ')
                pretty_print(item[1:], linecount)
                print(end=') ')
                continue
            # Block-like group: open, indent the content, close on its own line
            print(bracket, end='\n%s' % linecount.indent())
            pretty_print(item[1:], linecount)
            print('\n%s' % linecount.unindent(), end=PARENS[bracket])
    if top_level:
        print('\n')

# Demo 1: show every top-level expression of the Wikipedia sample as a raw
# nested list, then pretty-print the same expressions.
wikipedia_expressions = to_expressions(tokenize(WikipediaExample1))
for exp in wikipedia_expressions:
    print ('>', exp)

pretty_print(wikipedia_expressions)

# Demo 2: semicolon injection.  The line starting with '+' gets no ';' in
# front of it, so it continues the expression started on the line before.
CODE='''
    // Sample Javascript snippet
    a += 1
    // The line below starts a new expression...
    b = 2
    // Unlike this one which is a continuation...
    + a
    print( a + b )
'''

code_tokens = tokenize(CODE)
print (code_tokens)
# to_expressions() empties the list it is given, hence printing it first
pretty_print(to_expressions(code_tokens))
	# gatopeich's minimal Javascript interpreter prototype

	# Coding this as a prototype for a specific-purpose lightweight Javascript
	# engine in C/C++.

	# DONE: Tokenizer (except quote parsing)
	# DONE: Expression extraction
	# DONE: Pretty printer
	# next: Interpreter...


	Punctuators = (
	    # https://www.ecma-international.org/ecma-262/5.1/#sec-7.7
	    '{','}','(',')','[',']',
	    '.',';',',','<','>','<=',
	    '>=','==','!=','===','!==',
	    '+','-','*','%','++','--',
	    '<<','>>','>>>','&','|','^',
	    '!','~','&&','||','?',':',
	    '=','+=','-=','*=','%=','<<=',
	    '>>=','>>>=','&=','|=','^=',
	    '/','/=' ) # := DivPunctuators

	# Sort longer first, so that a multi-character punctuator is always tried
	# before the shorter ones it contains (the sort key is "<length><punctuator>").
	Punctuators = sorted(Punctuators, key=lambda p: str(len(p))+p, reverse=True)

	# A line whose first token is one of these continues the previous statement,
	# so no ';' is injected in front of it.
	_CONTINUATION_STARTS = ('(', '[', '/', '+', '-')


	def _split_line(line):
	    """Cut one source line into (is_string, text) pieces.

	    Quoted string literals come out as pieces with is_string True; the text
	    between them comes out with is_string False.  Scanning stops at the first
	    '//' found outside a string literal, so the comment is never returned.
	    """
	    pieces = []
	    pos = 0         # current scan position
	    code_from = 0   # where the pending stretch of plain code starts
	    while pos < len(line):
	        char = line[pos]
	        if char in ('"', "'"):
	            # Find the matching quote, stepping over backslash escapes.
	            # An unterminated literal simply runs to the end of the line.
	            close = pos + 1
	            while close < len(line) and line[close] != char:
	                close += 2 if line[close] == '\\' else 1
	            close = min(close + 1, len(line))
	            pieces.append((False, line[code_from:pos]))
	            pieces.append((True, line[pos:close]))
	            pos = code_from = close
	        elif line.startswith('//', pos):
	            break
	        else:
	            pos += 1
	    pieces.append((False, line[code_from:pos]))
	    return pieces


	def tokenize(txt):
	    """Split Javascript source text into a flat list of string tokens.

	    For every line of txt:
	      1. a '//' comment (outside string literals) is dropped;
	      2. quoted string literals are kept whole, quotes included, as one token;
	      3. the remaining text is split on whitespace and on the entries of
	         Punctuators (longest punctuator first);
	      4. a ';' token is injected in front of the line's tokens, unless the
	         first token is one of '(', '[', '/', '+', '-', in which case the line
	         is taken as a continuation of the previous one.
	    Lines that yield no tokens contribute nothing.

	    Known gaps: /* */ comments, regular-expression literals and string
	    literals spanning several lines are not recognised, and '.' is always a
	    punctuator, so "1.5" yields three tokens.
	    """
	    def divide_in_tokens(word):
	        # Recursively split a whitespace-free word around the first (i.e.
	        # longest) punctuator it contains.
	        if not word:
	            return []
	        for p in Punctuators:
	            if p in word:
	                before,p,after = word.partition(p)
	                return divide_in_tokens(before) + [p] + divide_in_tokens(after)
	        return [word]

	    tokens = []
	    for line in txt.splitlines():
	        as_tokens = []
	        for is_string, text in _split_line(line):
	            if is_string:
	                as_tokens.append(text)
	            else:
	                for word in text.split():
	                    as_tokens.extend(divide_in_tokens(word))
	        # Inject initial semicolon where adequate
	        if as_tokens and as_tokens[0] not in _CONTINUATION_STARTS:
	            tokens.append(';')
	        tokens.extend(as_tokens)

	    return tokens


	# Opening bracket -> matching closing bracket
	PARENS = {'{':'}','(':')','[':']'}

	def to_expressions(tokens, opening = None):
	    """Group a flat token list into a nested list of expressions.

	    tokens is consumed from the front (the list is modified in place).
	    ';' tokens separate expressions; empty expressions are skipped.  Each
	    expression is a list whose items are either token strings or nested
	    groups.  A group is produced for every '{', '(' or '[' token and has the
	    form [opening_token, expression, expression, ...].

	    When opening is given, parsing stops at the matching closing token and
	    the returned list starts with the opening token; the tokens after the
	    closing one are left in the list.  When opening is None the whole list is
	    parsed and a plain list of expressions is returned.

	    Raises Exception if the tokens run out before the closing token for
	    opening is found.
	    """
	    expressions = [opening] if opening else []
	    closing = PARENS[opening] if opening else None
	    current_exp = []
	    while tokens:
	        token = tokens.pop(0)
	        if token == closing:
	            if current_exp:
	                expressions.append(current_exp)
	            return expressions
	        elif token == ';':
	            if current_exp:
	                expressions.append(current_exp)
	                current_exp = []
	        elif token in PARENS:
	            current_exp.append(to_expressions(tokens, token))
	        else:
	            current_exp.append(token)
	    if opening:
	        raise Exception('No match for "%s"'%opening)
	    # Input ended without a final ';': keep the pending expression instead
	    # of silently dropping it.
	    if current_exp:
	        expressions.append(current_exp)
	    return expressions


	# Sample Javascript source used as input by the demo code further down
	# (it is tokenized, split into expressions and pretty-printed).
	# NOTE(review): judging by the name, presumably copied from a Wikipedia
	# article -- origin not verifiable from this file.
	WikipediaExample1='''
	var x = 0; // A global variable, because it is not in any function

	function f() {
	var z = 'foxes', r = 'birds'; // 2 local variables
	m = 'fish'; // global, because it wasn't declared anywhere before

	function child() {
	var r = 'monkeys'; // This variable is local and does not affect the "birds" r of the parent function.
	z = 'penguins'; // Closure: Child function is able to access the variables of the parent function.
	}

	twenty = 20; // This variable is declared on the next line, but usable anywhere in the function, even before, as here
	var twenty;

	child();
	return x; // We can use x here, because it is global
	}

	f();

	console.log(z); // This line will raise a ReferenceError exception, because the value of z is no longer available
	'''

	class LineCounter():
	    """Numbered line-prefix generator used by pretty_print().

	    Converting an instance to str yields the next line number, right-aligned
	    in three columns, followed by the current prefix.  The conversion itself
	    advances the counter.
	    """

	    def __init__(self, prefix = ': ', lines = 0):
	        # lines:  how many prefixes have been produced so far
	        # prefix: text placed right after the line number
	        self.lines = lines
	        self.prefix = prefix

	    def __str__(self):
	        # Side effect on purpose: every rendering consumes one line number.
	        self.lines = self.lines + 1
	        return '%3d%s' % (self.lines, self.prefix)

	    def indent(self):
	        """Append two spaces to the prefix; returns self."""
	        # Two spaces, so that unindent() (which removes two characters)
	        # exactly undoes one indent().
	        self.prefix = self.prefix + '  '
	        return self

	    def unindent(self):
	        """Remove the last two characters of the prefix; returns self."""
	        self.prefix = self.prefix[:-2]
	        return self


	def pretty_print(expressions, linecount = None):
	    """Print nested expressions (as built by to_expressions) to stdout.

	    expressions is an iterable of expressions; each expression is a list of
	    token strings and nested groups of the form [opening_token, expr, ...].

	    Called without linecount (the top-level call), a fresh LineCounter is
	    created, every expression starts on a new numbered line and two newlines
	    are printed at the end.  Called with a linecount (the recursive calls),
	    the first expression continues on the current line and only the following
	    ones start a new numbered line.

	    '(' groups are printed inline as "( ... ) ".  Any other group prints its
	    opening token, puts its content on new lines with the prefix indented,
	    and prints the matching closing token on a line of its own.
	    """
	    top_level = not linecount
	    if top_level:
	        linecount = LineCounter()
	    for index, expression in enumerate(expressions):
	        if top_level or index > 0:
	            print(end='\n%s' % linecount)
	        for item in expression:
	            if type(item) is str:
	                print(item, end=' ')
	                continue
	            bracket = item[0]
	            if bracket == '(':
	                print(end='( ')
	                pretty_print(item[1:], linecount)
	                print(end=') ')
	                continue
	            # Block-like group: open, indent the content, close on its own line
	            print(bracket, end='\n%s' % linecount.indent())
	            pretty_print(item[1:], linecount)
	            print('\n%s' % linecount.unindent(), end=PARENS[bracket])
	    if top_level:
	        print('\n')

	# Demo 1: show every top-level expression of the Wikipedia sample as a raw
	# nested list, then pretty-print the same sample.
	for exp in to_expressions(tokenize(WikipediaExample1)):
	    print ('>', exp)

	pretty_print(to_expressions(tokenize(WikipediaExample1)))

	# Demo 2: semicolon injection.  The line starting with '+' gets no ';' in
	# front of it, so it continues the expression started on the line before.
	CODE='''
	    // Sample Javascript snippet
	    a += 1
	    // The line below starts a new expression...
	    b = 2
	    // Unlike this one which is a continuation...
	    + a
	    print( a + b )
	'''

	print (tokenize(CODE))
	pretty_print(to_expressions(tokenize(CODE)))