Skip to content

Instantly share code, notes, and snippets.

@jpbougie
Created May 5, 2010 23:29
Show Gist options
  • Save jpbougie/391594 to your computer and use it in GitHub Desktop.
Save jpbougie/391594 to your computer and use it in GitHub Desktop.
a rough scheme lexer in LLVM IR
; FILE type for i/o using stdio
%struct.FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct.FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct.FILE*, i32 }
@.read_mode = private constant [3 x i8] c"rb\00", align 1
declare i32 @fgetc(%struct.FILE* nocapture) nounwind
declare i32 @feof(%struct.FILE* nocapture) nounwind
declare i32 @fseek(%struct.FILE* nocapture, i64, i32) nounwind
declare i32 @ungetc(i32, %struct.FILE * nocapture) nounwind
declare i8 * @malloc(i64) nounwind
declare void @free(i8* nocapture) nounwind
declare i32 @putchar(i32) nounwind
declare noalias %struct.FILE* @fopen(i8* noalias nocapture, i8* noalias nocapture) nounwind
declare i8 @fclose(%struct.FILE*) nounwind
; stdin
;@stdin = external global %struct.FILE* ; <%struct.FILE**>
%struct.tok = type { i4, [1024 x i8] , i32 } ; type, value, value length
%struct.tok_stream = type { i64, %struct.tok *} ; length, array
; token types
;tok.open_paren = i4 0
;tok.close_paren = i4 1
;tok.integer = i4 2
;tok.string = i4 3
;tok.identifier = i4 4
;tok.quote = i4 5
define i8 @get_next_character(%struct.FILE* %fhandle, i1 %ignore_whitespace) {
entry:
br label %loop
loop:
%0 = tail call i32 @fgetc(%struct.FILE* %fhandle) nounwind ; <i32>
%char = trunc i32 %0 to i8
br i1 %ignore_whitespace, label %check_whitespace, label %result
check_whitespace:
switch i8 %char, label %result [ i8 9, label %loop ; tab
i8 10, label %loop ; \n
i8 13, label %loop ; \r
i8 32, label %loop ]; space
result:
ret i8 %char
}
define void @rewind_character(%struct.FILE* %fhandle, i8 %char) {
%1 = zext i8 %char to i32
tail call i32 @ungetc(i32 %1, %struct.FILE* %fhandle)
ret void
}
define void @ignore_whitespace(%struct.FILE* %fhandle) {
entry:
br label %begin
begin:
%0 = tail call i32 @fgetc(%struct.FILE* %fhandle) nounwind ; <i32>
%1 = trunc i32 %0 to i8
switch i8 %1, label %exit [ i8 9, label %begin ; tab
i8 10, label %begin ; \n
i8 13, label %begin ; \r
i8 32, label %begin ; space
]
exit:
%ret = tail call i32 @fseek(%struct.FILE* %fhandle, i64 -1, i32 1) ; go back one char (SEEK_CUR is 1)
ret void
}
define void @ignore_until_end_of_line(%struct.FILE * %fhandle) {
entry:
br label %begin
begin:
%0 = tail call i32 @fgetc(%struct.FILE* %fhandle) nounwind ; <i32>
%1 = trunc i32 %0 to i8
switch i8 %1, label %begin [
i8 10, label %exit ; \n
i8 -1, label %exit ; end of file
]
exit:
%ret = tail call i32 @fseek(%struct.FILE* %fhandle, i64 -1, i32 1) ; go back one char (SEEK_CUR is 1)
ret void
}
define i1 @is_alpha(i8 %input) {
; upper case
%1 = icmp uge i8 %input, 65
%2 = icmp ule i8 %input, 90
%3 = and i1 %1, %2
;lower case
%4 = icmp uge i8 %input, 97
%5 = icmp ule i8 %input, 122
%6 = and i1 %4, %5
%7 = or i1 %3, %6
ret i1 %7
}
define i1 @is_whitespace(i8 %input) {
switch i8 %input, label %ret_false [ i8 9, label %ret_true ; tab
i8 10, label %ret_true ; \n
i8 13, label %ret_true ; \r
i8 32, label %ret_true ] ; space
ret_true:
ret i1 true
ret_false:
ret i1 false
}
define i1 @is_digit(i8 %input) {
%1 = icmp uge i8 %input, 48
%2 = icmp ule i8 %input, 57
%3 = and i1 %1, %2
ret i1 %3
}
define %struct.tok* @get_next_token(%struct.FILE* %fhandle) {
entry:
;get the size of the structure
%size = getelementptr %struct.tok* null, i32 1
%sz = ptrtoint %struct.tok* %size to i64
%0 = tail call i8* @malloc(i64 %sz); allocate the resulting token
%tok.ptr = bitcast i8* %0 to %struct.tok *
;check for eof
%eofflag = tail call i32 @feof(%struct.FILE* %fhandle)
%isnoteof = icmp eq i32 %eofflag, 0
br i1 %isnoteof, label %not_eof, label %eof
not_eof:
; read a character
%1 = call i8 @get_next_character(%struct.FILE * %fhandle, i1 true)
switch i8 %1, label %other [ i8 -1, label %eof
i8 40, label %open_paren
i8 41, label %close_paren
i8 34, label %double_quote
i8 39, label %single_quote
]
;;;;;; simple tokens ;;;;;;
eof:
%2 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0
store i4 -1, i4 * %2
br label %exit
open_paren:
%3 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0
store i4 0, i4 * %3
br label %exit
close_paren:
%4 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0
store i4 1, i4 * %4
br label %exit
single_quote:
%5 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0
store i4 5, i4 * %5
br label %exit
;;; complex tokens ;;;
double_quote:
%6 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0
store i4 3, i4 * %6
; set the current length to 0
%7 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2
store i32 0, i32 * %7
br label %read_string
read_string:
;read another char
%ch = call i8 @get_next_character(%struct.FILE * %fhandle, i1 false)
%tok.value.length = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2
switch i8 %ch, label %add_to_string [ i8 -1, label %error
i8 34, label %exit
i8 92, label %string_escape
]
add_to_string:
%8 = load i32 * %tok.value.length ; get the current length
%9 = add i32 %8, 1 ; increment the length by one
; store the new character
%10 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %8 ; get the new position for the character
store i8 %ch, i8 * %10
store i32 %9, i32 * %tok.value.length ; store the new length
br label %read_string
string_escape:
; store the \
%11 = load i32 * %tok.value.length ; get the current length
%12 = add i32 %11, 1 ; increment the length by one
; store the new character
%13 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %11
store i8 %ch, i8 * %13
store i32 %12, i32 * %tok.value.length ; store the new length
; get a new character and store it right away, unless it's the end of file
%14 = call i8 @get_next_character(%struct.FILE* %fhandle, i1 false)
%15 = icmp eq i8 %14, -1
br i1 %15, label %error, label %string_add_next
string_add_next:
%16 = add i32 %12, 1
%17 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %12
store i8 %14, i8 * %17
store i32 %16, i32 * %tok.value.length
br label %read_string
other:
;check if it's a number
%18 = icmp uge i8 %1, 48
%19 = icmp ule i8 %1, 57
%20 = and i1 %18, %19
br i1 %20, label %number, label %not_a_number
not_a_number:
;check if it's a valid first letter for an identifier
%21 = call i1 @is_alpha(i8 %1)
; valid initials
; <letter> | ! | $ | % | & | * | / | : | < | = | > | ? | ~ | _ | ^
%22 = icmp eq i8 %1, 33 ; !
%23 = icmp eq i8 %1, 36 ; $
%24 = icmp eq i8 %1, 37 ; %
%25 = icmp eq i8 %1, 38 ; &
%26 = icmp eq i8 %1, 42 ; *
%27 = icmp eq i8 %1, 47 ; /
%28 = icmp eq i8 %1, 58 ; :
%29 = icmp eq i8 %1, 60 ; <
%30 = icmp eq i8 %1, 61 ; =
%31 = icmp eq i8 %1, 62 ; >
%32 = icmp eq i8 %1, 63 ; ?
%33 = icmp eq i8 %1, 126 ; ~
%34 = icmp eq i8 %1, 95 ; _
%35 = icmp eq i8 %1, 94 ; ^
%36 = or i1 %21, %22
%37 = or i1 %36, %23
%38 = or i1 %37, %24
%39 = or i1 %38, %25
%40 = or i1 %39, %26
%41 = or i1 %40, %27
%42 = or i1 %41, %28
%43 = or i1 %42, %29
%44 = or i1 %43, %30
%45 = or i1 %44, %31
%46 = or i1 %45, %32
%47 = or i1 %46, %33
%48 = or i1 %47, %34
%49 = or i1 %48, %35
br i1 %49, label %read_identifier, label %error
read_identifier:
%50 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0
store i4 4, i4 * %50
; set the current length to 1
%51 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2
store i32 1, i32 * %51
; add the first character
%id.array.pos = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 0
store i8 %1, i8 * %id.array.pos
; add next
br label %add_to_identifier
add_to_identifier:
%52 = call i8 @get_next_character(%struct.FILE* %fhandle, i1 false)
;check if it's a valid subsequent
%53 = call i1 @is_alpha(i8 %52)
%54 = call i1 @is_digit(i8 %52)
;special subsequents : + | - | . | @
%55 = icmp eq i8 %52, 43 ; +
%56 = icmp eq i8 %52, 45 ; -
%57 = icmp eq i8 %52, 46 ; .
%58 = icmp eq i8 %52, 64 ; @
%59 = or i1 %53, %54
%60 = or i1 %59, %55
%61 = or i1 %60, %56
%62 = or i1 %61, %57
%63 = or i1 %62, %58
br i1 %63, label %valid_subsequent, label %end_of_identifier
valid_subsequent:
; add the character to the value
; get the current length
%64 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2
%65 = load i32 * %64
%66 = add i32 %65, 1
; store the new character
%67 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %65
store i8 %52, i8 * %67
; store the increased length
store i32 %66, i32 * %64
br label %add_to_identifier
end_of_identifier:
call void @rewind_character(%struct.FILE* %fhandle, i8 %52)
br label %exit
number:
; set the type to integer (2)
%68 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0
store i4 2, i4 * %68
; set the current length to 1
%69 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2
store i32 1, i32 * %69
; add the first number
%70 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 0
store i8 %1, i8 * %70
br label %add_number
add_number:
%71 = call i8 @get_next_character(%struct.FILE* %fhandle, i1 false)
%72 = call i1 @is_digit(i8 %71)
br i1 %72, label %valid_digit, label %end_of_number
valid_digit:
; get the current length
%73 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2
%74 = load i32 * %73
; increment the length
%75 = add i32 %74, 1
; store the new character
%76 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %74
store i8 %71, i8 * %76
; store the new length
store i32 %75, i32 * %73
br label %add_number
end_of_number:
call void @rewind_character(%struct.FILE* %fhandle, i8 %71)
br label %exit
error:
br label %exit
exit:
ret %struct.tok* %tok.ptr
}
; an expression can be a list, an integer, a symbol or a string
; the first element is the type, the second the length (or the number in the case of the integer, and the third is a pointer, either to the list of sub elements or to the string itself)
%struct.expr = type { i4, i64, i8*}
define %struct.expr * @build_ast() {
ret %struct.expr * undef
}
; data will be stored as tagged integers
; 00 = integer, the highest 62 bits contain the number
; 01 = string/symbol, pointer to the string in memory
; 10 = cons-cell/vector
; 11 = function
define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
entry:
%0 = getelementptr inbounds i8** %argv, i64 1
%1 = load i8** %0, align 8
%file = tail call %struct.FILE* @fopen(i8* noalias %1, i8* noalias getelementptr inbounds ([3 x i8]* @.read_mode, i64 0, i64 0)) nounwind
%fileptr = ptrtoint %struct.FILE * %file to i32
%notvalid = icmp eq i32 %fileptr, 0
br i1 %notvalid, label %error, label %loop
loop:
%tok = call %struct.tok* @get_next_token(%struct.FILE* %file)
%type.ptr = getelementptr %struct.tok* %tok, i32 0, i32 0
%type = load i4* %type.ptr
%2 = zext i4 %type to i32
%3 = add i32 %2, 48
tail call i32 @putchar(i32 %3);
%eof = icmp eq i4 %type, -1
%ptr = bitcast %struct.tok* %tok to i8*
tail call void @free(i8* %ptr)
br i1 %eof, label %exit, label %loop
; br label %exit
exit:
tail call i8 @fclose(%struct.FILE * %file)
ret i32 0
error:
ret i32 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment