# R lexer
# based on http://www.jayconrod.com/posts/37/a-simple-interpreter-from-scratch-in-python-part-1
#' Returns a list holding a regex pattern (anchored to the start of the
#' input) and its token tag. This saves typing.
#'
#' @param pattern a regex pattern
#' @param tag the resulting token tag
tk <- function(pattern, tag) {
  return(list(pattern=paste0("^", pattern), tag=tag))
}
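# Quick check of what tk() builds (illustrative) -- the "^" anchor is the
# important part, since every pattern must match at the start of the
# remaining text:
# str(tk("\\+", "PLUS"))
# #> List of 2
# #>  $ pattern: chr "^\\+"
# #>  $ tag    : chr "PLUS"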
#' Returns the token (text and tag) that matched and the remaining
#' untokenised text.
#'
#' @param text the code to be tokenised
#' @param tokens the list of token definitions built with tk()
lex <- function(text, tokens) {
  matches <- lapply(tokens, function(x) {
    m <- regexpr(x$pattern, text, perl=TRUE, ignore.case=TRUE)
    list(text=regmatches(text, m), tag=x$tag)
  })
  # drop patterns that did not match (regmatches returns character(0))
  matches <- matches[sapply(matches, function(x) !identical(character(0), x$text))]
  if (length(matches) == 0) {
    stop("no token matches the input starting at: ", substr(text, 1, 20))
  }
  # keep the match with the longest text (ties go to the earlier token)
  matches <- matches[[which.max(sapply(matches, function(x) nchar(x$text)))]]
  return(list(
    text = matches$text,
    tag = matches$tag,
    remaining = substr(text, nchar(matches$text) + 1, nchar(text))
  ))
}
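# A single lex() step with a tiny inline token table (illustrative; the
# full table is defined below):
# lex("42 + x", list(tk("-?\\d+(\\.\\d*)?", "NUM"), tk("\\+", "PLUS")))
# #> $text "42", $tag "NUM", $remaining " + x"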
#' Appends a list element to an existing list without flattening it.
#'
#' @param lst the element to be appended
#' @param base the existing list to append to
cat_list <- function(lst, base) {
  if (length(base) == 0) {
    return(list(lst))
  } else {
    return(c(base, list(lst)))
  }
}
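# The list() wrapper is the point of this helper: a plain c(base, lst)
# would splice lst's fields (text, tag) into base as separate elements,
# whereas cat_list keeps each token intact. E.g. (illustrative):
# cat_list(list(text="x", tag="ID"), list())
# #> list(list(text = "x", tag = "ID"))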
# ---
tokens <- list(
  # declared reserved key words
  tk("for", "RESERVED"),
  tk("in", "RESERVED"),
  tk("data.frame", "RESERVED"),
  tk("print", "RESERVED"),
  # special functions
  tk("-?\\d+(\\.\\d*)?:-?\\d+(\\.\\d*)?", "SEQ"),
  tk("[ ]", ""),        # ignore single spaces
  tk("\\t", ""),        # ignore single tabs
  tk("#[^\\n]*", ""),   # ignore comments until the end of line
  # indent rules
  tk("\\n[ \\t]+", "INDENT"),
  tk("\\n+", "NEWLINE"),
  tk("-?\\d+(\\.\\d*)?", "NUM"),
  tk("'.*'", "STRING"),
  tk('".*"', "STRING"),
  tk("'''[\\s\\S]*?'''", "STRING"),
  tk('"""[\\s\\S]*?"""', "STRING"),
  tk("[\\w\\.][\\d\\w\\._\\$]*", "ID"),  # * (not +) so single-letter ids match
  # brackets and parens
  tk("\\[", "LBRAC"),
  tk("\\]", "RBRAC"),
  tk("\\(", "LPAREN"),
  tk("\\)", "RPAREN"),
  tk(",", "COMMA"),
  tk("\\.\\.", "DOUBLEDOT"),
  tk(":", "COLON"),
  # operators
  tk("=", "ASSIGN"),
  tk("==", "EQ"),
  tk("!=", "NEQ"),
  tk("\\+", "PLUS"),
  tk("-", "MINUS"),
  tk("\\*", "TIMES"),
  tk("/", "DIVIDE"),
  tk("<", "LT"),
  tk("<=", "LEQ"),
  tk(">", "GT"),
  tk(">=", "GEQ"),
  tk("&", "AND"),
  tk("\\|", "OR")  # escaped: a bare | is regex alternation and matches ""
)
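# Because lex() keeps the longest match, pattern order mostly doesn't
# matter: "==" (2 chars) beats "=" (1 char), and the triple-quoted
# STRING patterns beat '.*' on multi-line strings, since "." does not
# cross newlines in perl mode. E.g. (illustrative):
# lex("== 1", tokens)$tag  #> "EQ"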
text = "#ignore this line
parse this line
and this number 13
for i in 1 to 10
parse indent
code continues with indents
end indents
"
text = '"""
testing asdf
"""'
lex_toks <- list()
while (nchar(text) > 0) {
  token <- lex(text, tokens)
  text <- token$remaining
  # keep only tagged tokens; whitespace and comments carry the tag ""
  if (token$tag != "") {
    lex_toks <- cat_list(list(text=token$text, tag=token$tag), lex_toks)
  }
}
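# For the triple-quoted test input above, the loop yields a single
# STRING token: the whole multi-line literal matches in one lex() step,
# leaving nothing remaining.
# str(lex_toks)
# #> List of 1 -- $text is the full '"""\ntesting asdf\n"""' literal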
# tokenising complete, time to build the parser
read_from <- function(tokens) {
  # TODO: IDs are other variables or functions
}