Instantly share code, notes, and snippets.

Embed
What would you like to do?
Building a Compiler in Swift with LLVM, Part 1: Introduction and the Lexer
#if os(macOS)
import Darwin
#elseif os(Linux)
import Glibc
#endif
enum BinaryOperator: Character {
case plus = "+"
case minus = "-"
case times = "*"
case divide = "/"
case mod = "%"
case equals = "="
}
enum Token {
case leftParen, rightParen, def, extern, comma, semicolon, `if`, then, `else`
case identifier(String)
case number(Double)
case `operator`(BinaryOperator)
}
extension Character {
var value: Int32 {
return Int32(String(self).unicodeScalars.first!.value)
}
var isSpace: Bool {
return isspace(value) != 0
}
var isAlphanumeric: Bool {
return isalnum(value) != 0 || self == "_"
}
}
class Lexer {
let input: String
var index: String.Index
init(input: String) {
self.input = input
self.index = input.startIndex
}
var currentChar: Character? {
return index < input.endIndex ? input[index] : nil
}
func advanceIndex() {
input.characters.formIndex(after: &index)
}
func readIdentifierOrNumber() -> String {
var str = ""
while let char = currentChar, char.isAlphanumeric || char == "." {
str.characters.append(char)
advanceIndex()
}
return str
}
func advanceToNextToken() -> Token? {
// Skip all spaces until a non-space token
while let char = currentChar, char.isSpace {
advanceIndex()
}
// If we hit the end of the input, then we're done
guard let char = currentChar else {
return nil
}
// Handle single-scalar tokens, like comma,
// leftParen, rightParen, and the operators
let singleTokMapping: [Character: Token] = [
",": .comma, "(": .leftParen, ")": .rightParen,
";": .semicolon, "+": .operator(.plus), "-": .operator(.minus),
"*": .operator(.times), "/": .operator(.divide),
"%": .operator(.mod), "=": .operator(.equals)
]
if let tok = singleTokMapping[char] {
advanceIndex()
return tok
}
// This is where we parse identifiers or numbers
// We're going to use Swift's built-in double parsing
// logic here.
if char.isAlphanumeric {
var str = readIdentifierOrNumber()
if let dblVal = Double(str) {
return .number(dblVal)
}
// Look for known tokens, otherwise fall back to
// the identifier token
switch str {
case "def": return .def
case "extern": return .extern
case "if": return .if
case "then": return .then
case "else": return .else
default: return .identifier(str)
}
}
return nil
}
func lex() -> [Token] {
var toks = [Token]()
while let tok = advanceToNextToken() {
toks.append(tok)
}
return toks
}
}
let toks = Lexer(input: "def foo(n) (n * 100.34);").lex()
print(toks)
@kdawgwilk

This comment has been minimized.

kdawgwilk commented Jan 9, 2017

Thank you so much for taking the time to write the blog post that goes with this. I really became interested in compilers when Swift was released and became open source and this is exactly the type of articles I have been wanting to read. LLVM is fascinating to me and to see it used from Swift makes it feel more approachable than from C/C++. Again thanks and keep it up! Can't wait to read the next part in the series!

@harlanhaskins

This comment has been minimized.

Owner

harlanhaskins commented Jan 12, 2017

@kdawgwilk I'm so glad you found this useful!

@Pre1

This comment has been minimized.

Pre1 commented Feb 24, 2017

I refactored your code into this:

enum BinaryOperator: Character {
	case plus = "+"
	case minus = "-"
	case times = "*"
	case divide = "/"
	case mod = "%"
	case equals = "="
}

enum Single: Character {
	case leftParen = "("
	case rightParen = ")"
	case comma = ","
	case semicolon = ";"
}

enum Keywords: String {
	case def, extern, `if`, then, `else`
}

enum Token {
	case keys(Keywords)
	case identifier(String)
	case number(Double)
	case `operator`(BinaryOperator)
	case sub(Single)
}

and the implementation of advanceToNextToken func will be simpler (I think):

func advanceToNextToken() -> Token? {
	// Skip all spaces until a non-space token
	while let char = currentChar, char.isSpace {
		advanceIndex()
	}
	// If we hit the end of the input, then we're done
	guard let char = currentChar else {
		return nil
	}
	
	// Handle single-scalar tokens, like comma,
	// leftParen, rightParen, and the operators
	if let tokSig = Single(rawValue: char) {
		advanceIndex()
		return .sub(tokSig)
	}
	if let tokOp = BinaryOperator(rawValue: char) {
		advanceIndex()
		return .operator(tokOp)
	}
	
	// This is where we parse identifiers or numbers
	// We're going to use Swift's built-in double parsing
	// logic here.
	if char.isAlphanumeric {
		let str = readIdentifierOrNumber()
		if let dblVal = Double(str) {
			return .number(dblVal)
		}
		
		// Look for known tokens, otherwise fall back to
		// the identifier token
		if let k = Keywords(rawValue: str) {
			return .keys(k)
		}
		
		return .identifier(str)
	}
	return nil
}

what do you think?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment