Skip to content

Instantly share code, notes, and snippets.

@airstrike
Last active January 8, 2019 05:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save airstrike/d4d866f0b816497edecc0530ed3f9f0d to your computer and use it in GitHub Desktop.
#========================================================================
# Description: Tokenise an Excel formula using an implementation of
# E. W. Bachtal's algorithm, found here:
#
# Blog post: https://ewbi.blogs.com/develops/2004/12/excel_formula_p.html
# See also: https://ewbi.blogs.com/develops/popular/excelformulaparsing.html
# Direct link to this port: http://www.ewbi.com/ewbi.develop/samples/jsport_nonEAT.py
#
# Originally written for Python v2.5 (win32)
# Author: Robin Macharg
# Copyright: Algorithm (c) E. W. Bachtal, this implementation (c) R. Macharg
#
# CVS Info:
# $Header: T:\\cvsarchive/Excel\040export\040&\040import\040XML/ExcelXMLTransform/EWBI_Javascript_port/jsport.py,v 1.5 2006/12/07 13:41:08 rmacharg Exp $
#
# Modification History
#
# Date Author Comment
# =======================================================================
# 2019/01/08 - AT - AT = Andy Terra
# Ported to Python 3
# 2006/11/29 - RMM - Made strictly class-based.
# Added parse, render and pretty print methods
# 2006/11 - RMM - RMM = Robin Macharg
# Created
#========================================================================
#========================================================================
# Class: ExcelParserTokens
# Description: Inheritable container for token definitions
#
# Attributes: Self explanatory
#
# Methods: None
#========================================================================
class ExcelParserTokens:
    """Shared vocabulary of token type and subtype strings.

    Classes that need the token constants (f_tokenStack, ExcelParser)
    inherit from this container; it defines no behaviour of its own.
    """

    # --- token types --------------------------------------------------
    TOK_TYPE_NOOP = "noop"
    TOK_TYPE_OPERAND = "operand"
    TOK_TYPE_FUNCTION = "function"
    TOK_TYPE_SUBEXPR = "subexpression"
    TOK_TYPE_ARGUMENT = "argument"
    TOK_TYPE_OP_PRE = "operator-prefix"
    TOK_TYPE_OP_IN = "operator-infix"
    TOK_TYPE_OP_POST = "operator-postfix"
    TOK_TYPE_WSPACE = "white-space"
    TOK_TYPE_UNKNOWN = "unknown"

    # --- token subtypes -----------------------------------------------
    TOK_SUBTYPE_START = "start"
    TOK_SUBTYPE_STOP = "stop"
    TOK_SUBTYPE_TEXT = "text"
    TOK_SUBTYPE_NUMBER = "number"
    TOK_SUBTYPE_LOGICAL = "logical"
    TOK_SUBTYPE_ERROR = "error"
    TOK_SUBTYPE_RANGE = "range"
    TOK_SUBTYPE_MATH = "math"
    TOK_SUBTYPE_CONCAT = "concatenate"
    TOK_SUBTYPE_INTERSECT = "intersect"
    TOK_SUBTYPE_UNION = "union"
#========================================================================
# Class: f_token
# Description: Encapsulate a formula token
#
# Attributes: tvalue -
# ttype - See token definitions, above, for values
# tsubtype - See token definitions, above, for values
#
# Methods: f_token - __init__()
#========================================================================
class f_token:
    """One formula token: a raw text value tagged with a type and subtype.

    ttype and tsubtype take their values from the ExcelParserTokens
    string constants; tsubtype may be the empty string.
    """

    def __init__(self, value, type, subtype):
        # Attribute names keep the legacy "t" prefix used module-wide.
        self.ttype = type        # e.g. "operand", "function", "operator-infix"
        self.tsubtype = subtype  # e.g. "number", "start", "math", or ""
        self.tvalue = value      # the literal text of the token
#========================================================================
# Class: f_tokens
# Description: An ordered list of tokens
# Attributes: items - Ordered list
# index - Current position in the list
#
# Methods: f_tokens - __init__()
# f_token - add() - Add a token to the end of the list
# None - addRef() - Add a token to the end of the list
# None - reset() - reset the index to -1
# Boolean - BOF() - End of list?
# Boolean - EOF() - Beginning of list?
# Boolean - moveNext() - Move the index along one
# f_token/None - current() - Return the current token
# f_token/None - next() - Return the next token (leave the index unchanged)
# f_token/None - previous() - Return the previous token (leave the index unchanged)
#========================================================================
class f_tokens:
    """An ordered token list with a cursor for sequential traversal.

    The cursor (`index`) starts at -1, i.e. before the first element.
    moveNext() advances it; current()/next()/previous() peek relative to
    it without moving it.
    """

    def __init__(self):
        self.items = []
        self.index = -1

    def add(self, value, type, subtype=""):
        """Create an f_token, append it, and return it."""
        token = f_token(value, type, subtype or "")
        self.addRef(token)
        return token

    def addRef(self, token):
        """Append an already-constructed token object."""
        self.items.append(token)

    def reset(self):
        """Move the cursor back before the first token."""
        self.index = -1

    def BOF(self):
        """True when the cursor is at (or before) the first token."""
        return self.index <= 0

    def EOF(self):
        """True when the cursor is at (or past) the last token."""
        return self.index >= (len(self.items) - 1)

    def moveNext(self):
        """Advance the cursor one token; False if already at the end."""
        moved = not self.EOF()
        if moved:
            self.index += 1
        return moved

    def current(self):
        """The token under the cursor, or None before the first move."""
        return None if self.index == -1 else self.items[self.index]

    def next(self):
        """Peek at the following token without moving; None at the end."""
        return None if self.EOF() else self.items[self.index + 1]

    def previous(self):
        """Peek at the preceding token without moving; None at the start."""
        return None if self.index < 1 else self.items[self.index - 1]
#========================================================================
# Class: f_tokenStack
# Inherits: ExcelParserTokens - a list of token values
# Description: A LIFO stack of tokens
#
# Attributes: items - Ordered list
#
# Methods: f_tokenStack - __init__()
# None - push(token) - Push a token onto the stack
# f_token/None - pop() - Pop a token off the stack
# f_token/None - token() - Non-destructively return the top item on the stack
# String - type() - Return the top token's type
# String - subtype() - Return the top token's subtype
# String - value() - Return the top token's value
#========================================================================
class f_tokenStack(ExcelParserTokens):
    """A LIFO stack of the currently-open START tokens.

    getTokens() pushes the START token of each function, subexpression
    and array it opens; pop() closes the most recent one by returning a
    matching STOP token of the same type.
    """

    def __init__(self):
        self.items = []

    def push(self, token):
        """Push a START token onto the stack."""
        self.items.append(token)

    def pop(self):
        """Pop the most recent START token and return a matching STOP token.

        Raises IndexError if the stack is empty (an unbalanced formula).
        """
        token = self.items.pop()
        return f_token("", token.ttype, self.TOK_SUBTYPE_STOP)

    def token(self):
        """Non-destructively return the top token, or None when empty."""
        # Replaces the original "cond and [a] or [b]" JS-style ternary hack.
        return self.items[-1] if self.items else None

    def value(self):
        """The top token's tvalue, or "" when the stack is empty."""
        token = self.token()
        return token.tvalue if token else ""

    def type(self):
        """The top token's ttype, or "" when the stack is empty."""
        token = self.token()
        return token.ttype if token else ""

    def subtype(self):
        """The top token's tsubtype, or "" when the stack is empty."""
        token = self.token()
        return token.tsubtype if token else ""
#========================================================================
# Class: ExcelParser
# Description: Parse an Excel formula into a stream of tokens
# Attributes:
#
# Methods: f_tokens - getTokens(formula) - return a token stream (list)
#========================================================================
class ExcelParser(ExcelParserTokens):
    """Parse an Excel formula string into a stream of f_token objects.

    getTokens() does all the work; parse(), render() and prettyprint()
    are thin convenience wrappers around it.
    """

    def getTokens(self, formula):
        """Tokenise *formula* and return an f_tokens collection.

        Three passes:
          1. a character-by-character scan that accumulates raw tokens,
          2. a pass that drops insignificant white-space tokens and turns
             significant ones into the intersection operator,
          3. a pass that converts unary "-"/"+" to prefix/noop, classifies
             operand and infix-operator subtypes, and strips a leading "@"
             from function names; noop tokens are then filtered out.
        """

        # Helpers closing over `formula` and the scan position `offset`.
        def currentChar():
            return formula[offset]

        def doubleChar():
            return formula[offset:offset+2]

        def nextChar():
            # JavaScript returns an empty string if the index is out of bounds,
            # Python throws an IndexError. We mimic this behaviour here.
            try:
                formula[offset+1]
            except IndexError:
                return ""
            else:
                return formula[offset+1]

        def EOF():
            return offset >= len(formula)

        tokens = f_tokens()          # raw token stream (output of pass 1)
        tokenStack = f_tokenStack()  # open functions/subexpressions/arrays
        offset = 0                   # current scan position in `formula`
        token = ""                   # characters accumulated so far
        inString = False             # inside a double-quoted text literal
        inPath = False               # inside a single-quoted sheet/book path
        inRange = False              # inside a [...] workbook/range section
        inError = False              # inside a #... error literal

        # Strip leading spaces and at most one leading "=" before scanning.
        while (len(formula) > 0):
            if (formula[0] == " "):
                formula = formula[1:]
            else:
                if (formula[0] == "="):
                    formula = formula[1:]
                break;

        # state-dependent character evaluation (order is important)
        while not EOF():

            # double-quoted strings
            # embeds are doubled
            # end marks token
            if inString:
                if currentChar() == "\"":
                    if nextChar() == "\"":
                        # "" inside a string is an escaped quote.
                        token += "\""
                        offset += 1
                    else:
                        inString = False
                        tokens.add(token, self.TOK_TYPE_OPERAND, self.TOK_SUBTYPE_TEXT)
                        token = ""
                else:
                    token += currentChar()
                offset += 1
                continue

            # single-quoted strings (links)
            # embeds are double
            # end does not mark a token
            if inPath:
                if currentChar() == "'":
                    if nextChar() == "'":
                        # '' inside a path is an escaped quote.
                        token += "'"
                        offset += 1
                    else:
                        inPath = False
                else:
                    token += currentChar()
                offset += 1;
                continue;

            # bracketed strings (range offset or linked workbook name)
            # no embeds (changed to "()" by Excel)
            # end does not mark a token
            if inRange:
                if currentChar() == "]":
                    inRange = False
                token += currentChar()
                offset += 1
                continue

            # error values
            # end marks a token, determined from absolute list of values
            if inError:
                token += currentChar()
                offset += 1
                if ",#NULL!,#DIV/0!,#VALUE!,#REF!,#NAME?,#NUM!,#N/A,".find("," + token + ",") != -1:
                    inError = False
                    tokens.add(token, self.TOK_TYPE_OPERAND, self.TOK_SUBTYPE_ERROR)
                    token = ""
                continue;

            # independent character evaluation (order not important)
            #
            # establish state-dependent character evaluations
            if currentChar() == "\"":
                if len(token) > 0:
                    # not expected
                    tokens.add(token, self.TOK_TYPE_UNKNOWN)
                    token = ""
                inString = True
                offset += 1
                continue

            if currentChar() == "'":
                if len(token) > 0:
                    # not expected
                    tokens.add(token, self.TOK_TYPE_UNKNOWN)
                    token = ""
                inPath = True
                offset += 1
                continue

            if (currentChar() == "["):
                inRange = True
                token += currentChar()
                offset += 1
                continue

            if (currentChar() == "#"):
                if (len(token) > 0):
                    # not expected
                    tokens.add(token, self.TOK_TYPE_UNKNOWN)
                    token = ""
                inError = True
                token += currentChar()
                offset += 1
                continue

            # mark start and end of arrays and array rows
            if (currentChar() == "{"):
                if (len(token) > 0):
                    # not expected
                    tokens.add(token, self.TOK_TYPE_UNKNOWN)
                    token = ""
                # An array opens both an ARRAY and its first ARRAYROW.
                tokenStack.push(tokens.add("ARRAY", self.TOK_TYPE_FUNCTION, self.TOK_SUBTYPE_START))
                tokenStack.push(tokens.add("ARRAYROW", self.TOK_TYPE_FUNCTION, self.TOK_SUBTYPE_START))
                offset += 1
                continue

            if (currentChar() == ";"):
                if (len(token) > 0):
                    tokens.add(token, self.TOK_TYPE_OPERAND)
                    token = ""
                # ";" closes the current ARRAYROW and opens the next one.
                tokens.addRef(tokenStack.pop())
                tokens.add(",", self.TOK_TYPE_ARGUMENT)
                tokenStack.push(tokens.add("ARRAYROW", self.TOK_TYPE_FUNCTION, self.TOK_SUBTYPE_START))
                offset += 1
                continue

            if (currentChar() == "}"):
                if (len(token) > 0):
                    tokens.add(token, self.TOK_TYPE_OPERAND)
                    token = ""
                # "}" closes both the open ARRAYROW and the ARRAY.
                tokens.addRef(tokenStack.pop())
                tokens.addRef(tokenStack.pop())
                offset += 1
                continue

            # trim white-space
            if (currentChar() == " "):
                if (len(token) > 0):
                    tokens.add(token, self.TOK_TYPE_OPERAND)
                    token = ""
                # A single WSPACE marker stands in for any run of spaces.
                tokens.add("", self.TOK_TYPE_WSPACE)
                offset += 1
                # NOTE(review): currentChar() is evaluated before the EOF()
                # guard, so a formula that ends in spaces raises IndexError
                # here -- confirm against the upstream JS port.
                while ((currentChar() == " ") and (not EOF())):
                    offset += 1
                continue

            # multi-character comparators
            if (",>=,<=,<>,".find("," + doubleChar() + ",") != -1):
                if (len(token) > 0):
                    tokens.add(token, self.TOK_TYPE_OPERAND)
                    token = ""
                tokens.add(doubleChar(), self.TOK_TYPE_OP_IN, self.TOK_SUBTYPE_LOGICAL)
                offset += 2
                continue

            # standard infix operators
            if ("+-*/^&=><".find(currentChar()) != -1):
                if (len(token) > 0):
                    tokens.add(token, self.TOK_TYPE_OPERAND)
                    token = ""
                tokens.add(currentChar(), self.TOK_TYPE_OP_IN)
                offset += 1
                continue

            # standard postfix operators
            if ("%".find(currentChar()) != -1):
                if (len(token) > 0):
                    tokens.add(token, self.TOK_TYPE_OPERAND)
                    token = ""
                tokens.add(currentChar(), self.TOK_TYPE_OP_POST)
                offset += 1
                continue

            # start subexpression or function
            if (currentChar() == "("):
                if (len(token) > 0):
                    # accumulated text before "(" is a function name
                    tokenStack.push(tokens.add(token, self.TOK_TYPE_FUNCTION, self.TOK_SUBTYPE_START))
                    token = ""
                else:
                    # a bare "(" opens a parenthesised subexpression
                    tokenStack.push(tokens.add("", self.TOK_TYPE_SUBEXPR, self.TOK_SUBTYPE_START))
                offset += 1
                continue

            # function, subexpression, array parameters
            if (currentChar() == ","):
                if (len(token) > 0):
                    tokens.add(token, self.TOK_TYPE_OPERAND)
                    token = ""
                # Inside a function call "," separates arguments; elsewhere
                # it is Excel's union operator.
                if (not (tokenStack.type() == self.TOK_TYPE_FUNCTION)):
                    tokens.add(currentChar(), self.TOK_TYPE_OP_IN, self.TOK_SUBTYPE_UNION)
                else:
                    tokens.add(currentChar(), self.TOK_TYPE_ARGUMENT)
                offset += 1
                continue

            # stop subexpression
            if (currentChar() == ")"):
                if (len(token) > 0):
                    tokens.add(token, self.TOK_TYPE_OPERAND)
                    token = ""
                tokens.addRef(tokenStack.pop())
                offset += 1
                continue

            # token accumulation
            token += currentChar()
            offset += 1

        # dump remaining accumulation
        if (len(token) > 0):
            tokens.add(token, self.TOK_TYPE_OPERAND)

        # move all tokens to a new collection, excluding all unnecessary white-space tokens
        tokens2 = f_tokens()
        while (tokens.moveNext()):
            token = tokens.current();
            if (token.ttype == self.TOK_TYPE_WSPACE):
                # White-space survives (as an intersection operator) only
                # when it sits between two operand-like neighbours.
                if ((tokens.BOF()) or (tokens.EOF())):
                    pass
                elif (not(
                        ((tokens.previous().ttype == self.TOK_TYPE_FUNCTION) and (tokens.previous().tsubtype == self.TOK_SUBTYPE_STOP)) or
                        ((tokens.previous().ttype == self.TOK_TYPE_SUBEXPR) and (tokens.previous().tsubtype == self.TOK_SUBTYPE_STOP)) or
                        (tokens.previous().ttype == self.TOK_TYPE_OPERAND)
                    )
                ):
                    pass
                elif (not(
                        ((tokens.next().ttype == self.TOK_TYPE_FUNCTION) and (tokens.next().tsubtype == self.TOK_SUBTYPE_START)) or
                        ((tokens.next().ttype == self.TOK_TYPE_SUBEXPR) and (tokens.next().tsubtype == self.TOK_SUBTYPE_START)) or
                        (tokens.next().ttype == self.TOK_TYPE_OPERAND)
                    )
                ):
                    pass
                else:
                    tokens2.add(token.tvalue, self.TOK_TYPE_OP_IN, self.TOK_SUBTYPE_INTERSECT)
                continue
            tokens2.addRef(token);

        # switch infix "-" operator to prefix when appropriate, switch infix "+" operator to noop when appropriate, identify operand
        # and infix-operator subtypes, pull "@" from in front of function names
        while (tokens2.moveNext()):
            token = tokens2.current()

            # "-" is infix math only after something that can end an operand;
            # otherwise it is a unary prefix.
            if ((token.ttype == self.TOK_TYPE_OP_IN) and (token.tvalue == "-")):
                if (tokens2.BOF()):
                    token.ttype = self.TOK_TYPE_OP_PRE
                elif (
                    ((tokens2.previous().ttype == self.TOK_TYPE_FUNCTION) and (tokens2.previous().tsubtype == self.TOK_SUBTYPE_STOP)) or
                    ((tokens2.previous().ttype == self.TOK_TYPE_SUBEXPR) and (tokens2.previous().tsubtype == self.TOK_SUBTYPE_STOP)) or
                    (tokens2.previous().ttype == self.TOK_TYPE_OP_POST) or
                    (tokens2.previous().ttype == self.TOK_TYPE_OPERAND)
                ):
                    token.tsubtype = self.TOK_SUBTYPE_MATH;
                else:
                    token.ttype = self.TOK_TYPE_OP_PRE
                continue

            # Same positional test for "+", but a unary "+" is a noop.
            if ((token.ttype == self.TOK_TYPE_OP_IN) and (token.tvalue == "+")):
                if (tokens2.BOF()):
                    token.ttype = self.TOK_TYPE_NOOP
                elif (
                    ((tokens2.previous().ttype == self.TOK_TYPE_FUNCTION) and (tokens2.previous().tsubtype == self.TOK_SUBTYPE_STOP)) or
                    ((tokens2.previous().ttype == self.TOK_TYPE_SUBEXPR) and (tokens2.previous().tsubtype == self.TOK_SUBTYPE_STOP)) or
                    (tokens2.previous().ttype == self.TOK_TYPE_OP_POST) or
                    (tokens2.previous().ttype == self.TOK_TYPE_OPERAND)
                ):
                    token.tsubtype = self.TOK_SUBTYPE_MATH
                else:
                    token.ttype = self.TOK_TYPE_NOOP
                continue

            # Classify any still-untyped infix operator by its character.
            if ((token.ttype == self.TOK_TYPE_OP_IN) and (len(token.tsubtype) == 0)):
                if (("<>=").find(token.tvalue[0:1]) != -1):
                    token.tsubtype = self.TOK_SUBTYPE_LOGICAL
                elif (token.tvalue == "&"):
                    token.tsubtype = self.TOK_SUBTYPE_CONCAT
                else:
                    token.tsubtype = self.TOK_SUBTYPE_MATH
                continue

            # Classify any still-untyped operand: number, logical, else range.
            if ((token.ttype == self.TOK_TYPE_OPERAND) and (len(token.tsubtype) == 0)):
                try:
                    float(token.tvalue)
                except ValueError as e:
                    if ((token.tvalue == 'TRUE') or (token.tvalue == 'FALSE')):
                        token.tsubtype = self.TOK_SUBTYPE_LOGICAL
                    else:
                        token.tsubtype = self.TOK_SUBTYPE_RANGE
                else:
                    token.tsubtype = self.TOK_SUBTYPE_NUMBER
                continue

            # Drop the legacy "@" prefix from function names.
            if (token.ttype == self.TOK_TYPE_FUNCTION):
                if (token.tvalue[0:1] == "@"):
                    token.tvalue = token.tvalue[1:]
                continue

        tokens2.reset();

        # move all tokens to a new collection, excluding all noops
        tokens = f_tokens()
        while (tokens2.moveNext()):
            if (tokens2.current().ttype != self.TOK_TYPE_NOOP):
                tokens.addRef(tokens2.current())

        tokens.reset()
        return tokens

    def parse(self, formula):
        """Tokenise *formula* and store the result on self.tokens."""
        self.tokens = self.getTokens(formula)

    def render(self):
        """Reassemble the parsed token stream back into formula text."""
        output = ""
        if self.tokens:
            for t in self.tokens.items:
                if t.ttype == self.TOK_TYPE_FUNCTION and t.tsubtype == self.TOK_SUBTYPE_START: output += t.tvalue + "("
                elif t.ttype == self.TOK_TYPE_FUNCTION and t.tsubtype == self.TOK_SUBTYPE_STOP: output += ")"
                elif t.ttype == self.TOK_TYPE_SUBEXPR and t.tsubtype == self.TOK_SUBTYPE_START: output += "("
                elif t.ttype == self.TOK_TYPE_SUBEXPR and t.tsubtype == self.TOK_SUBTYPE_STOP: output += ")"
                # TODO: add in RE substitution of " with "" for strings
                elif t.ttype == self.TOK_TYPE_OPERAND and t.tsubtype == self.TOK_SUBTYPE_TEXT: output += "\"" + t.tvalue + "\""
                elif t.ttype == self.TOK_TYPE_OP_IN and t.tsubtype == self.TOK_SUBTYPE_INTERSECT: output += " "
                else: output += t.tvalue
        return output

    def prettyprint(self):
        """Return a nested, indented dump of the parsed token stream."""
        indent = 0
        output = ""
        if self.tokens:
            for t in self.tokens.items:
                # STOP tokens dedent before printing; START tokens indent after.
                if (t.tsubtype == self.TOK_SUBTYPE_STOP):
                    indent -= 1
                output += " "*indent + t.tvalue + " <" + t.ttype +"> <" + t.tsubtype + ">" + "\n"
                if (t.tsubtype == self.TOK_SUBTYPE_START):
                    indent += 1;
        return output
#========================================================================
# Main code:
#
# A simple test-rig. Iterate through a list of test input strings,
# outputting a nested display of the token stream parsed from each one.
#========================================================================
if __name__ == "__main__":
    # Test inputs: each formula is parsed and its token tree printed.
    inputs = [
        # Simple test formulae
        '=1+3+5',
        '=3 * 4 + 5',
        '=50',
        '=1+1',
        '=$A1',
        '=$B$2',
        '=SUM(B5:B15)',
        '=SUM(B5:B15,D5:D15)',
        '=SUM(B5:B15 A7:D7)',
        '=SUM(sheet1!$A$1:$B$2)',
        '=[data.xls]sheet1!$A$1',
        '=SUM((A:A 1:1))',
        '=SUM((A:A,1:1))',
        '=SUM((A:A A1:B1))',
        '=SUM(D9:D11,E9:E11,F9:F11)',
        '=SUM((D9:D11,(E9:E11,F9:F11)))',
        '=IF(P5=1.0,"NA",IF(P5=2.0,"A",IF(P5=3.0,"B",IF(P5=4.0,"C",IF(P5=5.0,"D",IF(P5=6.0,"E",IF(P5=7.0,"F",IF(P5=8.0,"G"))))))))',
        '={SUM(B2:D2*B3:D3)}',
        '=SUM(123 + SUM(456) + (45<6))+456+789',
        '=AVG(((((123 + 4 + AVG(A1:A2))))))',
        # E. W. Bachtal's test formulae
        '=IF("a"={"a","b";"c",#N/A;-1,TRUE}, "yes", "no") & " more ""test"" text"',
        '=+ AName- (-+-+-2^6) = {"A","B"} + @SUM(R1C1) + (@ERROR.TYPE(#VALUE!) = 2)',
        '=IF(R13C3>DATE(2002,1,6),0,IF(ISERROR(R[41]C[2]),0,IF(R13C3>=R[41]C[2],0, IF(AND(R[23]C[11]>=55,R[24]C[11]>=20),R53C3,0))))',
        '=IF(R[39]C[11]>65,R[25]C[42],ROUND((R[11]C[11]*IF(OR(AND(R[39]C[11]>=55, ' +
        'R[40]C[11]>=20),AND(R[40]C[11]>=20,R11C3="YES")),R[44]C[11],R[43]C[11]))+(R[14]C[11] ' +
        '*IF(OR(AND(R[39]C[11]>=55,R[40]C[11]>=20),AND(R[40]C[11]>=20,R11C3="YES")), ' +
        'R[45]C[11],R[43]C[11])),0))',
    ]
    p = ExcelParser()
    for i in inputs:
        print("========================================")
        print("Formula: " + i)
        p.parse(i)
        print("Pretty printed:\n", p.prettyprint())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment