Skip to content

Instantly share code, notes, and snippets.

@damuellen
Created April 20, 2022 16:18
Show Gist options
  • Save damuellen/ebcfa8c10d885a900a8080a74054136c to your computer and use it in GitHub Desktop.
Save damuellen/ebcfa8c10d885a900a8080a74054136c to your computer and use it in GitHub Desktop.
Excel Formula Parsing
import Foundation
public struct ExcelFormula {
public var formula: String
public init(_ formula: String) { self.formula = formula }
public var indented: String {
var indentCount = 0
let indented: [String] = tokens.map { token in
if token.subtype == .Stop { indentCount -= indentCount > 0 ? 1 : 0 }
var indent = ""
indent.append("")
for _ in 0..<indentCount { indent.append(" ") }
indent.append(token.value)
if token.subtype == .Start { indentCount += 1 }
return indent
}
return indented.joined(separator: "\n")
}
public struct Token: CustomStringConvertible {
var value: String
var type: TokenType
var subtype: TokenSubtype?
public var description: String {
return [value, type.rawValue, subtype?.rawValue ?? ""].joined(separator: " ")
}
}
public enum TokenType: String {
case Noop, Operand, Function, Subexpression, Argument, OperatorPrefix, OperatorInfix,
OperatorPostfix, Whitespace, Unknown
}
public enum TokenSubtype: String {
case Nothing, Start, Stop, Text, Number, Logical, Error, Range, Math, Concatenation,
Intersection, Union
}
public var tokens: [Token] {
// No attempt is made to verify formulas; assumes formulas are derived from Excel, where
// they can only exist if valid; stack overflows/underflows sunk as nils without exceptions.
let formula = formula.map(Character.init)
if formula.count < 2 || formula[0] != "=" { return [] }
var tokens1 = [Token]()
var stack = [Token]()
let QUOTE_DOUBLE: Character = "\""
let QUOTE_SINGLE: Character = "\\"
let BRACKET_CLOSE: Character = "]"
let BRACKET_OPEN: Character = "["
let BRACE_OPEN: Character = "{"
let BRACE_CLOSE: Character = "}"
let PAREN_OPEN: Character = "("
let PAREN_CLOSE: Character = ")"
let SEMICOLON: Character = ";"
let WHITESPACE: Character = " "
let COMMA: Character = ","
let ERROR_START: Character = "#"
let OPERATORS_SN = "+-"
let OPERATORS_INFIX = "+-*/^&=><"
let OPERATORS_POSTFIX = "%"
let ERRORS: [String] = ["#nil!", "#DIV/0!", "#VALUE!", "#REF!", "#NAME?", "#NUM!", "#N/A"]
let COMPARATORS_MULTI = [">=", "<=", "<>"]
var inString = false
var inPath = false
var inRange = false
var inError = false
var index = 1
var tokens1index = -1
var tokens2index = -1
var value = ""
while index < formula.count {
// state-dependent character evaluation (order is important)
// double-quoted strings
// embeds are doubled
// end marks token
if inString {
if formula[index] == QUOTE_DOUBLE {
if ((index + 2) <= formula.count) && (formula[index + 1] == QUOTE_DOUBLE) {
value += String(QUOTE_DOUBLE)
index += 1
} else {
inString = false
tokens1.append(Token(value: value, type: .Operand, subtype: .Text))
value = ""
}
} else {
value += String(formula[index])
}
index += 1
continue
}
// single-quoted strings (links)
// embeds are double
// end does not mark a token
if inPath {
if formula[index] == QUOTE_SINGLE {
if ((index + 2) <= formula.count) && (formula[index + 1] == QUOTE_SINGLE) {
value += String(QUOTE_SINGLE)
index += 1
} else {
inPath = false
}
} else {
value += String(formula[index])
}
index += 1
continue
}
// bracked strings (R1C1 range index or linked workbook name)
// no embeds (changed to "()" by Excel)
// end does not mark a token
if inRange {
if formula[index] == BRACKET_CLOSE { inRange = false }
value += String(formula[index])
index += 1
continue
}
// error values
// end marks a token, determined from absolute list of values
if inError {
value += String(formula[index])
index += 1
if ERRORS.contains(value) {
inError = false
tokens1.append(Token(value: value, type: .Operand, subtype: .Error))
value = ""
}
continue
}
// scientific notation check
if OPERATORS_SN.contains(value) {
if value.count > 1 {
let regex = try! NSRegularExpression(pattern: #"^[1-9]{1}(\.[0-9]+)?E{1}$"#)
if regex.firstMatch(in: value, range: NSRange(value)!) != nil {
value += String(formula[index])
index += 1
continue
}
}
}
// independent character evaluation (order not important)
// establish state-dependent character evaluations
if formula[index] == QUOTE_DOUBLE {
if value.count > 0 { // unexpected
tokens1.append(Token(value: value, type: .Unknown, subtype: nil))
value = ""
}
inString = true
index += 1
continue
}
if formula[index] == QUOTE_SINGLE {
if value.count > 0 { // unexpected
tokens1.append(Token(value: value, type: .Unknown, subtype: nil))
value = ""
}
inPath = true
index += 1
continue
}
if formula[index] == BRACKET_OPEN {
inRange = true
value += String(BRACKET_OPEN)
index += 1
continue
}
if formula[index] == ERROR_START {
if value.count > 0 { // unexpected
tokens1.append(Token(value: value, type: .Unknown, subtype: nil))
value = ""
}
inError = true
value += String(ERROR_START)
index += 1
continue
}
// mark start and end of arrays and array rows
if formula[index] == BRACE_OPEN {
if value.count > 0 { // unexpected
tokens1.append(Token(value: value, type: .Unknown, subtype: nil))
value = ""
}
var token = Token(value: "ARRAY", type: .Function, subtype: .Start)
stack.append(token)
tokens1.append(token)
token = Token(value: "SCOPE", type: .Function, subtype: .Start)
stack.append(token)
tokens1.append(token)
index += 1
continue
}
if formula[index] == SEMICOLON {
if value.count > 0 {
tokens1.append(Token(value: value, type: .Operand, subtype: nil))
value = ""
}
tokens1.append(Token(value: "", type: stack.removeLast().type, subtype: .Stop))
tokens1.append(Token(value: ",", type: .Argument, subtype: nil))
let token = Token(value: "ARRAYROW", type: .Function, subtype: .Start)
tokens1.append(token)
stack.append(token)
index += 1
continue
}
if formula[index] == BRACE_CLOSE {
if value.count > 0 {
tokens1.append(Token(value: value, type: .Operand, subtype: nil))
value = ""
}
tokens1.append(Token(value: "", type: stack.removeLast().type, subtype: .Stop))
tokens1.append(Token(value: "", type: stack.removeLast().type, subtype: .Stop))
index += 1
continue
}
// trim white-space
if formula[index] == WHITESPACE {
if value.count > 0 {
tokens1.append(Token(value: value, type: .Operand, subtype: nil))
value = ""
}
tokens1.append(Token(value: "", type: .Whitespace, subtype: nil))
index += 1
while (formula[index] == WHITESPACE) && (index < formula.count) { index += 1 }
continue
}
// multi-character comparators
if (index + 2) <= formula.count {
if COMPARATORS_MULTI.contains(String(formula[index..<index + 2])) {
if value.count > 0 {
tokens1.append(Token(value: value, type: .Operand, subtype: nil))
value = ""
}
tokens1.append(
Token(
value: String(formula[index..<index + 2]), type: .OperatorInfix, subtype: .Logical))
index += 2
continue
}
}
// standard infix operators
if OPERATORS_INFIX.contains(formula[index]) {
if value.count > 0 {
tokens1.append(Token(value: value, type: .Operand))
value = ""
}
tokens1.append(Token(value: String(formula[index]), type: .OperatorInfix, subtype: nil))
index += 1
continue
}
// standard postfix operators (only one)
if OPERATORS_POSTFIX.contains(formula[index]) {
if value.count > 0 {
tokens1.append(Token(value: value, type: .Operand))
value = ""
}
tokens1.append(Token(value: String(formula[index]), type: .OperatorPostfix, subtype: nil))
index += 1
continue
}
// start subexpression or function
if formula[index] == PAREN_OPEN {
if value.count > 0 {
let token = Token(value: value, type: .Function, subtype: .Start)
tokens1.append(token)
stack.append(token)
value = ""
} else {
let token = Token(value: "", type: .Subexpression, subtype: .Start)
tokens1.append(token)
stack.append(token)
}
index += 1
continue
}
// function, subexpression, or array parameters, or operand unions
if formula[index] == COMMA {
if value.count > 0 {
tokens1.append(Token(value: value, type: .Operand))
value = ""
}
if stack.last!.type != .Function {
tokens1.append(Token(value: ",", type: .OperatorInfix, subtype: .Union))
} else {
tokens1.append(Token(value: ",", type: .Argument))
}
index += 1
continue
}
// stop subexpression
if formula[index] == PAREN_CLOSE {
if value.count > 0 {
tokens1.append(Token(value: value, type: .Operand))
value = ""
}
tokens1.append(Token(value: "", type: stack.removeLast().type, subtype: .Stop))
index += 1
continue
}
// token accumulation
value += String(formula[index])
index += 1
}
// dump remaining accumulation
if value.count > 0 { tokens1.append(Token(value: value, type: .Operand)) }
// move tokenList to new set, excluding unnecessary white-space tokens and converting necessary ones to intersections
var tokens2 = [Token]()
while tokens1index < tokens1.count - 1 {
tokens1index += 1
guard let token = tokens1index == -1 ? nil : tokens1[tokens1index] else { continue }
if token.type != TokenType.Whitespace {
tokens2.append(token)
continue
}
if tokens1index <= 0 || (tokens1index >= (tokens1.count - 1)) { continue }
guard let previous = tokens1index < 1 ? nil : tokens1[tokens1index - 1] else { continue }
if !(((previous.type == .Function) && (previous.subtype == .Stop))
|| ((previous.type == .Subexpression) && (previous.subtype == .Stop))
|| (previous.type == .Operand))
{
continue
}
guard let next = tokens1index >= (tokens2.count - 1) ? nil : tokens1[tokens1index + 1] else {
continue
}
if !(((next.type == .Function) && (next.subtype == .Start))
|| ((next.type == .Subexpression) && (next.subtype == .Start)) || (next.type == .Operand))
{
continue
}
tokens2.append(Token(value: "", type: .OperatorInfix, subtype: .Intersection))
}
// move tokens to final list, switching infix "-" operators to prefix when appropriate, switching infix "+" operators
// to noop when appropriate, identifying operand and infix-operator subtypes, and pulling "@" from function names
var tokens = [Token]()
while tokens2index < tokens2.count - 1 {
tokens2index += 1
guard var token = tokens2index == -1 ? nil : tokens2[tokens2index] else { continue }
let previous = tokens2index < 1 ? nil : tokens2[tokens2index - 1]
if token.type == .OperatorInfix && token.value == "-" {
if tokens2index <= 0 {
token.type = .OperatorPrefix
} else if ((previous!.type == .Function) && (previous!.subtype == .Stop))
|| ((previous!.type == .Subexpression) && (previous!.subtype == .Stop))
|| (previous!.type == .OperatorPostfix) || (previous!.type == .Operand)
{
token.subtype = .Math
} else {
token.type = .OperatorPrefix
}
tokens.append(token)
continue
}
if token.type == .OperatorInfix && token.value == "+" {
if tokens2index <= 0 {
continue
} else if ((previous!.type == .Function) && (previous!.subtype == .Stop))
|| ((previous!.type == .Subexpression) && (previous!.subtype == .Stop))
|| (previous!.type == .OperatorPostfix) || (previous!.type == .Operand)
{
token.subtype = .Math
} else {
continue
}
tokens.append(token)
continue
}
if token.type == .OperatorInfix && token.subtype == .Nothing {
if token.value.hasPrefix("<>=") {
token.subtype = .Logical
} else if token.value == "&" {
token.subtype = .Concatenation
} else {
token.subtype = .Math
}
tokens.append(token)
continue
}
if token.type == .Operand && token.subtype == .Nothing {
let d = Double(token.value)
let isNumber = d != nil
if !isNumber {
if token.value == "TRUE" || token.value == "FALSE" {
token.subtype = .Logical
} else {
token.subtype = .Range
}
} else {
token.subtype = .Number
}
tokens.append(token)
continue
}
if token.type == .Function {
if token.value.count > 0 {
if token.value.hasPrefix("@") { token.value = String(token.value.dropFirst()) }
}
}
tokens.append(token)
}
return tokens
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment