Created
April 20, 2022 16:18
-
-
Save damuellen/ebcfa8c10d885a900a8080a74054136c to your computer and use it in GitHub Desktop.
Excel Formula Parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
public struct ExcelFormula { | |
public var formula: String | |
public init(_ formula: String) { self.formula = formula } | |
public var indented: String { | |
var indentCount = 0 | |
let indented: [String] = tokens.map { token in | |
if token.subtype == .Stop { indentCount -= indentCount > 0 ? 1 : 0 } | |
var indent = "" | |
indent.append("") | |
for _ in 0..<indentCount { indent.append(" ") } | |
indent.append(token.value) | |
if token.subtype == .Start { indentCount += 1 } | |
return indent | |
} | |
return indented.joined(separator: "\n") | |
} | |
public struct Token: CustomStringConvertible { | |
var value: String | |
var type: TokenType | |
var subtype: TokenSubtype? | |
public var description: String { | |
return [value, type.rawValue, subtype?.rawValue ?? ""].joined(separator: " ") | |
} | |
} | |
public enum TokenType: String { | |
case Noop, Operand, Function, Subexpression, Argument, OperatorPrefix, OperatorInfix, | |
OperatorPostfix, Whitespace, Unknown | |
} | |
public enum TokenSubtype: String { | |
case Nothing, Start, Stop, Text, Number, Logical, Error, Range, Math, Concatenation, | |
Intersection, Union | |
} | |
public var tokens: [Token] { | |
// No attempt is made to verify formulas; assumes formulas are derived from Excel, where | |
// they can only exist if valid; stack overflows/underflows sunk as nils without exceptions. | |
let formula = formula.map(Character.init) | |
if formula.count < 2 || formula[0] != "=" { return [] } | |
var tokens1 = [Token]() | |
var stack = [Token]() | |
let QUOTE_DOUBLE: Character = "\"" | |
let QUOTE_SINGLE: Character = "\\" | |
let BRACKET_CLOSE: Character = "]" | |
let BRACKET_OPEN: Character = "[" | |
let BRACE_OPEN: Character = "{" | |
let BRACE_CLOSE: Character = "}" | |
let PAREN_OPEN: Character = "(" | |
let PAREN_CLOSE: Character = ")" | |
let SEMICOLON: Character = ";" | |
let WHITESPACE: Character = " " | |
let COMMA: Character = "," | |
let ERROR_START: Character = "#" | |
let OPERATORS_SN = "+-" | |
let OPERATORS_INFIX = "+-*/^&=><" | |
let OPERATORS_POSTFIX = "%" | |
let ERRORS: [String] = ["#nil!", "#DIV/0!", "#VALUE!", "#REF!", "#NAME?", "#NUM!", "#N/A"] | |
let COMPARATORS_MULTI = [">=", "<=", "<>"] | |
var inString = false | |
var inPath = false | |
var inRange = false | |
var inError = false | |
var index = 1 | |
var tokens1index = -1 | |
var tokens2index = -1 | |
var value = "" | |
while index < formula.count { | |
// state-dependent character evaluation (order is important) | |
// double-quoted strings | |
// embeds are doubled | |
// end marks token | |
if inString { | |
if formula[index] == QUOTE_DOUBLE { | |
if ((index + 2) <= formula.count) && (formula[index + 1] == QUOTE_DOUBLE) { | |
value += String(QUOTE_DOUBLE) | |
index += 1 | |
} else { | |
inString = false | |
tokens1.append(Token(value: value, type: .Operand, subtype: .Text)) | |
value = "" | |
} | |
} else { | |
value += String(formula[index]) | |
} | |
index += 1 | |
continue | |
} | |
// single-quoted strings (links) | |
// embeds are double | |
// end does not mark a token | |
if inPath { | |
if formula[index] == QUOTE_SINGLE { | |
if ((index + 2) <= formula.count) && (formula[index + 1] == QUOTE_SINGLE) { | |
value += String(QUOTE_SINGLE) | |
index += 1 | |
} else { | |
inPath = false | |
} | |
} else { | |
value += String(formula[index]) | |
} | |
index += 1 | |
continue | |
} | |
// bracked strings (R1C1 range index or linked workbook name) | |
// no embeds (changed to "()" by Excel) | |
// end does not mark a token | |
if inRange { | |
if formula[index] == BRACKET_CLOSE { inRange = false } | |
value += String(formula[index]) | |
index += 1 | |
continue | |
} | |
// error values | |
// end marks a token, determined from absolute list of values | |
if inError { | |
value += String(formula[index]) | |
index += 1 | |
if ERRORS.contains(value) { | |
inError = false | |
tokens1.append(Token(value: value, type: .Operand, subtype: .Error)) | |
value = "" | |
} | |
continue | |
} | |
// scientific notation check | |
if OPERATORS_SN.contains(value) { | |
if value.count > 1 { | |
let regex = try! NSRegularExpression(pattern: #"^[1-9]{1}(\.[0-9]+)?E{1}$"#) | |
if regex.firstMatch(in: value, range: NSRange(value)!) != nil { | |
value += String(formula[index]) | |
index += 1 | |
continue | |
} | |
} | |
} | |
// independent character evaluation (order not important) | |
// establish state-dependent character evaluations | |
if formula[index] == QUOTE_DOUBLE { | |
if value.count > 0 { // unexpected | |
tokens1.append(Token(value: value, type: .Unknown, subtype: nil)) | |
value = "" | |
} | |
inString = true | |
index += 1 | |
continue | |
} | |
if formula[index] == QUOTE_SINGLE { | |
if value.count > 0 { // unexpected | |
tokens1.append(Token(value: value, type: .Unknown, subtype: nil)) | |
value = "" | |
} | |
inPath = true | |
index += 1 | |
continue | |
} | |
if formula[index] == BRACKET_OPEN { | |
inRange = true | |
value += String(BRACKET_OPEN) | |
index += 1 | |
continue | |
} | |
if formula[index] == ERROR_START { | |
if value.count > 0 { // unexpected | |
tokens1.append(Token(value: value, type: .Unknown, subtype: nil)) | |
value = "" | |
} | |
inError = true | |
value += String(ERROR_START) | |
index += 1 | |
continue | |
} | |
// mark start and end of arrays and array rows | |
if formula[index] == BRACE_OPEN { | |
if value.count > 0 { // unexpected | |
tokens1.append(Token(value: value, type: .Unknown, subtype: nil)) | |
value = "" | |
} | |
var token = Token(value: "ARRAY", type: .Function, subtype: .Start) | |
stack.append(token) | |
tokens1.append(token) | |
token = Token(value: "SCOPE", type: .Function, subtype: .Start) | |
stack.append(token) | |
tokens1.append(token) | |
index += 1 | |
continue | |
} | |
if formula[index] == SEMICOLON { | |
if value.count > 0 { | |
tokens1.append(Token(value: value, type: .Operand, subtype: nil)) | |
value = "" | |
} | |
tokens1.append(Token(value: "", type: stack.removeLast().type, subtype: .Stop)) | |
tokens1.append(Token(value: ",", type: .Argument, subtype: nil)) | |
let token = Token(value: "ARRAYROW", type: .Function, subtype: .Start) | |
tokens1.append(token) | |
stack.append(token) | |
index += 1 | |
continue | |
} | |
if formula[index] == BRACE_CLOSE { | |
if value.count > 0 { | |
tokens1.append(Token(value: value, type: .Operand, subtype: nil)) | |
value = "" | |
} | |
tokens1.append(Token(value: "", type: stack.removeLast().type, subtype: .Stop)) | |
tokens1.append(Token(value: "", type: stack.removeLast().type, subtype: .Stop)) | |
index += 1 | |
continue | |
} | |
// trim white-space | |
if formula[index] == WHITESPACE { | |
if value.count > 0 { | |
tokens1.append(Token(value: value, type: .Operand, subtype: nil)) | |
value = "" | |
} | |
tokens1.append(Token(value: "", type: .Whitespace, subtype: nil)) | |
index += 1 | |
while (formula[index] == WHITESPACE) && (index < formula.count) { index += 1 } | |
continue | |
} | |
// multi-character comparators | |
if (index + 2) <= formula.count { | |
if COMPARATORS_MULTI.contains(String(formula[index..<index + 2])) { | |
if value.count > 0 { | |
tokens1.append(Token(value: value, type: .Operand, subtype: nil)) | |
value = "" | |
} | |
tokens1.append( | |
Token( | |
value: String(formula[index..<index + 2]), type: .OperatorInfix, subtype: .Logical)) | |
index += 2 | |
continue | |
} | |
} | |
// standard infix operators | |
if OPERATORS_INFIX.contains(formula[index]) { | |
if value.count > 0 { | |
tokens1.append(Token(value: value, type: .Operand)) | |
value = "" | |
} | |
tokens1.append(Token(value: String(formula[index]), type: .OperatorInfix, subtype: nil)) | |
index += 1 | |
continue | |
} | |
// standard postfix operators (only one) | |
if OPERATORS_POSTFIX.contains(formula[index]) { | |
if value.count > 0 { | |
tokens1.append(Token(value: value, type: .Operand)) | |
value = "" | |
} | |
tokens1.append(Token(value: String(formula[index]), type: .OperatorPostfix, subtype: nil)) | |
index += 1 | |
continue | |
} | |
// start subexpression or function | |
if formula[index] == PAREN_OPEN { | |
if value.count > 0 { | |
let token = Token(value: value, type: .Function, subtype: .Start) | |
tokens1.append(token) | |
stack.append(token) | |
value = "" | |
} else { | |
let token = Token(value: "", type: .Subexpression, subtype: .Start) | |
tokens1.append(token) | |
stack.append(token) | |
} | |
index += 1 | |
continue | |
} | |
// function, subexpression, or array parameters, or operand unions | |
if formula[index] == COMMA { | |
if value.count > 0 { | |
tokens1.append(Token(value: value, type: .Operand)) | |
value = "" | |
} | |
if stack.last!.type != .Function { | |
tokens1.append(Token(value: ",", type: .OperatorInfix, subtype: .Union)) | |
} else { | |
tokens1.append(Token(value: ",", type: .Argument)) | |
} | |
index += 1 | |
continue | |
} | |
// stop subexpression | |
if formula[index] == PAREN_CLOSE { | |
if value.count > 0 { | |
tokens1.append(Token(value: value, type: .Operand)) | |
value = "" | |
} | |
tokens1.append(Token(value: "", type: stack.removeLast().type, subtype: .Stop)) | |
index += 1 | |
continue | |
} | |
// token accumulation | |
value += String(formula[index]) | |
index += 1 | |
} | |
// dump remaining accumulation | |
if value.count > 0 { tokens1.append(Token(value: value, type: .Operand)) } | |
// move tokenList to new set, excluding unnecessary white-space tokens and converting necessary ones to intersections | |
var tokens2 = [Token]() | |
while tokens1index < tokens1.count - 1 { | |
tokens1index += 1 | |
guard let token = tokens1index == -1 ? nil : tokens1[tokens1index] else { continue } | |
if token.type != TokenType.Whitespace { | |
tokens2.append(token) | |
continue | |
} | |
if tokens1index <= 0 || (tokens1index >= (tokens1.count - 1)) { continue } | |
guard let previous = tokens1index < 1 ? nil : tokens1[tokens1index - 1] else { continue } | |
if !(((previous.type == .Function) && (previous.subtype == .Stop)) | |
|| ((previous.type == .Subexpression) && (previous.subtype == .Stop)) | |
|| (previous.type == .Operand)) | |
{ | |
continue | |
} | |
guard let next = tokens1index >= (tokens2.count - 1) ? nil : tokens1[tokens1index + 1] else { | |
continue | |
} | |
if !(((next.type == .Function) && (next.subtype == .Start)) | |
|| ((next.type == .Subexpression) && (next.subtype == .Start)) || (next.type == .Operand)) | |
{ | |
continue | |
} | |
tokens2.append(Token(value: "", type: .OperatorInfix, subtype: .Intersection)) | |
} | |
// move tokens to final list, switching infix "-" operators to prefix when appropriate, switching infix "+" operators | |
// to noop when appropriate, identifying operand and infix-operator subtypes, and pulling "@" from function names | |
var tokens = [Token]() | |
while tokens2index < tokens2.count - 1 { | |
tokens2index += 1 | |
guard var token = tokens2index == -1 ? nil : tokens2[tokens2index] else { continue } | |
let previous = tokens2index < 1 ? nil : tokens2[tokens2index - 1] | |
if token.type == .OperatorInfix && token.value == "-" { | |
if tokens2index <= 0 { | |
token.type = .OperatorPrefix | |
} else if ((previous!.type == .Function) && (previous!.subtype == .Stop)) | |
|| ((previous!.type == .Subexpression) && (previous!.subtype == .Stop)) | |
|| (previous!.type == .OperatorPostfix) || (previous!.type == .Operand) | |
{ | |
token.subtype = .Math | |
} else { | |
token.type = .OperatorPrefix | |
} | |
tokens.append(token) | |
continue | |
} | |
if token.type == .OperatorInfix && token.value == "+" { | |
if tokens2index <= 0 { | |
continue | |
} else if ((previous!.type == .Function) && (previous!.subtype == .Stop)) | |
|| ((previous!.type == .Subexpression) && (previous!.subtype == .Stop)) | |
|| (previous!.type == .OperatorPostfix) || (previous!.type == .Operand) | |
{ | |
token.subtype = .Math | |
} else { | |
continue | |
} | |
tokens.append(token) | |
continue | |
} | |
if token.type == .OperatorInfix && token.subtype == .Nothing { | |
if token.value.hasPrefix("<>=") { | |
token.subtype = .Logical | |
} else if token.value == "&" { | |
token.subtype = .Concatenation | |
} else { | |
token.subtype = .Math | |
} | |
tokens.append(token) | |
continue | |
} | |
if token.type == .Operand && token.subtype == .Nothing { | |
let d = Double(token.value) | |
let isNumber = d != nil | |
if !isNumber { | |
if token.value == "TRUE" || token.value == "FALSE" { | |
token.subtype = .Logical | |
} else { | |
token.subtype = .Range | |
} | |
} else { | |
token.subtype = .Number | |
} | |
tokens.append(token) | |
continue | |
} | |
if token.type == .Function { | |
if token.value.count > 0 { | |
if token.value.hasPrefix("@") { token.value = String(token.value.dropFirst()) } | |
} | |
} | |
tokens.append(token) | |
} | |
return tokens | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment