Created
February 13, 2014 12:46
-
-
Save 8bit-pixies/8974476 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R lexer
# based on http://www.jayconrod.com/posts/37/a-simple-interpreter-from-scratch-in-python-part-1
#' Build a token rule: an anchored regex pattern paired with a token tag.
#' This saves typing when declaring the token table.
#'
#' The pattern is wrapped in a non-capturing group before the `^` anchor is
#' prepended, so that a pattern containing top-level alternation (`a|b`) is
#' anchored as a whole rather than only on its first branch.
#'
#' @param pattern a regex pattern (without a leading `^`)
#' @param token the resulting token tag
#' @return a list with `pattern` (the anchored regex) and `tag` (the token tag)
tk <- function(pattern, token) {
  list(pattern = paste0("^(?:", pattern, ")"), tag = token)
}
#' Match the longest token at the start of `text`.
#'
#' Every pattern in `tokens` is tried against `text` (case-insensitively, using
#' Perl-compatible regexes). The longest non-empty match that starts at the
#' first character wins; ties go to the token listed first.
#'
#' @param text the code to be tokenised (a single string)
#' @param tokens the list of tokens; each element is a list with `pattern`
#'   and `tag`
#' @return a list with `text` (the matched lexeme), `tag` (its token tag) and
#'   `remaining` (the part of `text` after the lexeme)
lex <- function(text, tokens) {
  matches <- lapply(tokens, function(x) {
    m <- regexpr(x$pattern, text, perl = TRUE, ignore.case = TRUE)
    # Only a match at position 1 is a valid lexeme: `remaining` is computed
    # from the lexeme length alone, so a match further in would corrupt it.
    lexeme <- if (isTRUE(as.integer(m) == 1L)) {
      regmatches(text, m)
    } else {
      character(0)
    }
    list(text = lexeme, tag = x$tag)
  })

  # Length of each candidate lexeme; 0 for "no match" and for empty matches.
  match_len <- vapply(
    matches,
    function(x) if (length(x$text) == 0) 0L else nchar(x$text),
    integer(1)
  )

  # An empty lexeme would leave the input unchanged and make any caller that
  # loops until the text is consumed spin forever, so treat it as a failure.
  if (!any(match_len > 0L)) {
    stop(
      "no token matches input starting at: ", substr(text, 1, 20),
      call. = FALSE
    )
  }

  # which.max() returns the first maximum, so earlier tokens win ties.
  best <- matches[[which.max(match_len)]]
  list(
    text = best$text,
    tag = best$tag,
    remaining = substr(text, nchar(best$text) + 1, nchar(text))
  )
}
#' Append one item to a list as a single element.
#'
#' The item is wrapped in `list()` before being combined, so an item that is
#' itself a list is stored whole instead of being spliced into `base`.
#'
#' @param lst the item to append
#' @param base the existing list to append to (may be empty)
#' @return `base` followed by `lst` as one additional element
cat_list <- function(lst, base) {
  wrapped <- list(lst)
  if (length(base) > 0) c(base, wrapped) else wrapped
}
# --- | |
# Token table. Rules are tried in order by lex(); the longest match wins and
# ties go to the rule listed first, so reserved words and NUM must stay ahead
# of ID. Rules with an empty tag ("") are matched but discarded by the driver.
tokens <- list(
  # declared reserved key words
  tk("for", "RESERVED"),
  tk("in", "RESERVED"),
  tk("data\\.frame", "RESERVED"),  # dot escaped: match a literal "."
  tk("print", "RESERVED"),
  # special functions
  tk("-?\\d+(\\.\\d*)?:-?\\d+(\\.\\d*)?", "SEQ"),
  tk("[ ]", ""),      # ignore single spaces
  tk("\\t", ""),      # ignore single tabs
  tk("#[^\\n]*", ""), # ignore comments until the end of line
  # indent rules
  tk("\\n[ \\t]+", "INDENT"),
  tk("\\n+", "NEWLINE"),
  tk("-?\\d+(\\.\\d*)?", "NUM"),
  # Lazy quantifiers: stop at the first closing quote so that two strings on
  # one line are not swallowed as a single token.
  tk("'.*?'", "STRING"),
  tk('".*?"', "STRING"),
  tk("'''[\\s\\S]*?'''", "STRING"),
  tk('"""[\\s\\S]*?"""', "STRING"),
  # `*` rather than `+` so single-character identifiers (e.g. `i`) lex too
  tk("[\\w\\.][\\d\\w\\._\\$]*", "ID"),
  # brackets and parens
  tk("\\[", "LBRAC"),
  tk("\\]", "RBRAC"),
  tk("\\(", "LPAREN"),
  tk("\\)", "RPAREN"),
  tk(",", "COMMA"),
  tk("\\.\\.", "DOUBLEDOT"),
  tk(":", "COLON"),
  # operators
  tk("=", "ASSIGN"),
  tk("==", "EQ"),
  tk("!=", "NEQ"),
  tk("\\+", "PLUS"),
  tk("-", "MINUS"),
  tk("\\*", "TIMES"),
  tk("/", "DIVIDE"),
  tk("<", "LT"),
  tk("<=", "LEQ"),
  tk(">", "GT"),
  tk(">=", "GEQ"),
  tk("&", "AND"),
  # escaped: a bare "|" is regex alternation and matches the empty string
  tk("\\|", "OR")
)
# Sample input covering comments, identifiers, numbers and reserved words.
# It is overwritten by the second sample below before any lexing happens.
# NOTE(review): the "parse indent" lines presumably carried leading
# whitespace to exercise the INDENT rule - confirm against the original.
text <- "#ignore this line
parse this line
and this number 13
for i in 1 to 10
parse indent
code continues with indents
end indents
"
# Sample input covering the triple-quoted STRING rule; this is the text that
# is actually tokenised.
text <- '"""
testing asdf
"""'

# Repeatedly strip the leading token off `text`, keeping every token whose
# tag is non-empty (empty tags mark whitespace and comments).
lex_toks <- list()
while (nchar(text) > 0) {
  token <- lex(text, tokens)
  # A zero-length lexeme leaves `text` unchanged, so the loop would never
  # terminate; fail loudly instead.
  if (!isTRUE(nchar(token$text) > 0)) {
    stop(
      "lexer made no progress at: ", substr(text, 1, 20),
      call. = FALSE
    )
  }
  text <- token$remaining
  if (token$tag != "") {
    lex_toks <- cat_list(list(text = token$text, tag = token$tag), lex_toks)
  }
}
# tokenising complete, time to build the parser | |
#' Placeholder for the parser step; not implemented yet.
#'
#' @param tokens currently unused
#' @return `NULL`
read_from <- function(tokens) {
  # ID are other variables or functions
  NULL
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# parser?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment