8bit-pixies/lexer.r

## lexer.r
# R lexer
# based on http://www.jayconrod.com/posts/37/a-simple-interpreter-from-scratch-in-python-part-1

#' Build a token specification from a regex and a tag.
#'
#' The pattern is anchored with a leading "^" so that it can only match at
#' the very start of the text being scanned.
#'
#' @param pattern a regex pattern (without the leading anchor)
#' @param token the tag attached to text matched by `pattern`
#' @return a list with elements `pattern` (the anchored regex) and `tag`
tk <- function(pattern, token) {
  anchored <- paste0("^", pattern)
  list(pattern = anchored, tag = token)
}

#' Match the start of `text` against every token pattern and return the
#' longest match together with the remaining "un"tokenised text.
#'
#' Matching is case-insensitive and uses Perl-compatible regexes. When several
#' patterns match the same number of characters, the one listed first in
#' `tokens` wins.
#'
#' @param text the code to be tokenised (a single string)
#' @param tokens a list of token specs, each a list with `pattern` and `tag`
#' @return a list with `text` (the matched lexeme), `tag` (its token tag) and
#'   `remaining` (the rest of `text` after the lexeme)
lex <- function(text, tokens) {
  candidates <- lapply(tokens, function(x) {
    m <- regexpr(x$pattern, text, perl = TRUE, ignore.case = TRUE)
    list(text = regmatches(text, m), tag = x$tag)
  })

  # Keep only patterns that actually consumed input. A pattern that did not
  # match yields character(0); a pattern that matched the empty string yields
  # "". Accepting the latter would return `remaining == text`, so a caller
  # looping until the text is exhausted would never terminate.
  consumed <- vapply(
    candidates,
    function(x) length(x$text) == 1L && nchar(x$text) > 0L,
    logical(1)
  )
  candidates <- candidates[consumed]

  if (length(candidates) == 0L) {
    stop(
      "lex: no token pattern matches input starting at: ",
      substr(text, 1L, 20L),
      call. = FALSE
    )
  }

  # which.max() returns the first maximum, which gives earlier patterns
  # priority on ties.
  widths <- vapply(candidates, function(x) nchar(x$text), integer(1))
  best <- candidates[[which.max(widths)]]

  list(
    text = best$text,
    tag = best$tag,
    remaining = substr(text, nchar(best$text) + 1L, nchar(text))
  )
}

#' Append one item to a list of items, keeping the item intact.
#'
#' The item is wrapped in its own list before concatenation so that a list
#' item is added as a single element rather than being spliced in.
#'
#' @param lst the item to append
#' @param base the existing list to append to (may be empty)
#' @return `base` with `lst` added as its last element
cat_list <- function(lst, base) {
  wrapped <- list(lst)
  if (length(base) > 0) {
    c(base, wrapped)
  } else {
    wrapped
  }
}

# ---

# Token table scanned by lex(). lex() keeps the longest match and, when two
# patterns match the same number of characters, the one listed first, so the
# reserved words must stay above the ID rule.
tokens <- list(
  # declared reserved key words
  tk("for", "RESERVED"),
  tk("in", "RESERVED"),
  tk("data\\.frame", "RESERVED"), # dot escaped: only a literal "." matches
  tk("print", "RESERVED"),

  # special functions
  tk("-?\\d+(\\.\\d*)?:-?\\d+(\\.\\d*)?", "SEQ"),

  # tokens tagged "" are dropped by the tokenising loop
  tk("[ ]", ""), # ignore single spaces
  tk("\\t", ""), # ignore single tabs
  tk("#[^\\n]*", ""), # ignore comments until the end of line

  # indent rules
  tk("\\n[ \\t]+", "INDENT"),
  tk("\\n+", "NEWLINE"),

  tk("-?\\d+(\\.\\d*)?", "NUM"),
  # Single-line strings stop at the first closing quote; a greedy ".*" would
  # swallow everything up to the last quote on the line.
  tk("'[^'\\n]*'", "STRING"),
  tk('"[^"\\n]*"', "STRING"),
  tk("'''[\\s\\S]*?'''", "STRING"),
  tk('"""[\\s\\S]*?"""', "STRING"),
  # "*" rather than "+" so that one-character identifiers are accepted
  tk("[\\w\\.][\\d\\w\\._\\$]*", "ID"),

  # brackets and parens
  tk("\\[", "LBRAC"),
  tk("\\]", "RBRAC"),
  tk("\\(", "LPAREN"),
  tk("\\)", "RPAREN"),
  tk(",", "COMMA"),
  tk("\\.\\.", "DOUBLEDOT"),
  tk(":", "COLON"),

  # operators
  tk("=", "ASSIGN"),
  tk("==", "EQ"),
  tk("!=", "NEQ"),

  tk("\\+", "PLUS"),
  tk("-", "MINUS"),
  tk("\\*", "TIMES"),
  tk("/", "DIVIDE"),

  tk("<", "LT"),
  tk("<=", "LEQ"),
  tk(">", "GT"),
  tk(">=", "GEQ"),

  tk("&", "AND"),
  # "|" is regex alternation; unescaped, "^|" matches the empty string
  # instead of a literal pipe character
  tk("\\|", "OR")
)

# Sample input: comments, plain words, a number and an indented block.
# NOTE: this value is replaced by the assignment that follows it.
text <- "#ignore this line
parse this line
and this number 13

for i in 1 to 10
  parse indent
  code continues with indents

end indents
"

# Sample input actually tokenised below: a multi-line triple-quoted string.
text <- '"""
testing asdf
"""'

# Repeatedly take one token off the front of `text` until nothing is left,
# collecting every token whose tag is not the empty string.
lex_toks <- list()
repeat {
  if (!(nchar(text) > 0)) {
    break
  }

  token <- lex(text, tokens)
  text <- token$remaining

  # tokens with an empty tag are skipped
  if (token$tag == "") {
    next
  }
  lex_toks <- cat_list(list(text = token$text, tag = token$tag), lex_toks)
}

# tokenising complete, time to build the parser

#' Parser entry point (not implemented yet).
#'
#' @param tokens the token list; currently unused
#' @return `NULL`, because the body is still a placeholder
read_from <- function(tokens) {
  # ID are other variables or functions
  NULL
}


## parser.R
# parser?
	# R lexer
	# based on http://www.jayconrod.com/posts/37/a-simple-interpreter-from-scratch-in-python-part-1

	#' Build a token specification from a regex and a tag.
	#'
	#' The pattern is anchored with a leading "^" so that it can only match at
	#' the very start of the text being scanned.
	#'
	#' @param pattern a regex pattern (without the leading anchor)
	#' @param token the tag attached to text matched by `pattern`
	#' @return a list with elements `pattern` (the anchored regex) and `tag`
	tk <- function(pattern, token) {
	  anchored <- paste0("^", pattern)
	  list(pattern = anchored, tag = token)
	}

	#' Match the start of `text` against every token pattern and return the
	#' longest match together with the remaining "un"tokenised text.
	#'
	#' Matching is case-insensitive and uses Perl-compatible regexes. When several
	#' patterns match the same number of characters, the one listed first in
	#' `tokens` wins.
	#'
	#' @param text the code to be tokenised (a single string)
	#' @param tokens a list of token specs, each a list with `pattern` and `tag`
	#' @return a list with `text` (the matched lexeme), `tag` (its token tag) and
	#'   `remaining` (the rest of `text` after the lexeme)
	lex <- function(text, tokens) {
	  candidates <- lapply(tokens, function(x) {
	    m <- regexpr(x$pattern, text, perl = TRUE, ignore.case = TRUE)
	    list(text = regmatches(text, m), tag = x$tag)
	  })

	  # Keep only patterns that actually consumed input. A pattern that did not
	  # match yields character(0); a pattern that matched the empty string yields
	  # "". Accepting the latter would return `remaining == text`, so a caller
	  # looping until the text is exhausted would never terminate.
	  consumed <- vapply(
	    candidates,
	    function(x) length(x$text) == 1L && nchar(x$text) > 0L,
	    logical(1)
	  )
	  candidates <- candidates[consumed]

	  if (length(candidates) == 0L) {
	    stop(
	      "lex: no token pattern matches input starting at: ",
	      substr(text, 1L, 20L),
	      call. = FALSE
	    )
	  }

	  # which.max() returns the first maximum, which gives earlier patterns
	  # priority on ties.
	  widths <- vapply(candidates, function(x) nchar(x$text), integer(1))
	  best <- candidates[[which.max(widths)]]

	  list(
	    text = best$text,
	    tag = best$tag,
	    remaining = substr(text, nchar(best$text) + 1L, nchar(text))
	  )
	}

	#' Append one item to a list of items, keeping the item intact.
	#'
	#' The item is wrapped in its own list before concatenation so that a list
	#' item is added as a single element rather than being spliced in.
	#'
	#' @param lst the item to append
	#' @param base the existing list to append to (may be empty)
	#' @return `base` with `lst` added as its last element
	cat_list <- function(lst, base) {
	  wrapped <- list(lst)
	  if (length(base) > 0) {
	    c(base, wrapped)
	  } else {
	    wrapped
	  }
	}

	# ---

	# Token table scanned by lex(). lex() keeps the longest match and, when two
	# patterns match the same number of characters, the one listed first, so the
	# reserved words must stay above the ID rule.
	tokens <- list(
	  # declared reserved key words
	  tk("for", "RESERVED"),
	  tk("in", "RESERVED"),
	  tk("data\\.frame", "RESERVED"), # dot escaped: only a literal "." matches
	  tk("print", "RESERVED"),

	  # special functions
	  # "\\d*" after the decimal point, as in the NUM rule, so that bounds with
	  # any number of fractional digits are accepted
	  tk("-?\\d+(\\.\\d*)?:-?\\d+(\\.\\d*)?", "SEQ"),

	  # tokens tagged "" are dropped by the tokenising loop
	  tk("[ ]", ""), # ignore single spaces
	  tk("\\t", ""), # ignore single tabs
	  tk("#[^\\n]*", ""), # ignore comments until the end of line

	  # indent rules
	  tk("\\n[ \\t]+", "INDENT"),
	  tk("\\n+", "NEWLINE"),

	  tk("-?\\d+(\\.\\d*)?", "NUM"),
	  # Single-line strings stop at the first closing quote; a greedy ".*" would
	  # swallow everything up to the last quote on the line.
	  tk("'[^'\\n]*'", "STRING"),
	  tk('"[^"\\n]*"', "STRING"),
	  tk("'''[\\s\\S]*?'''", "STRING"),
	  tk('"""[\\s\\S]*?"""', "STRING"),
	  # "*" rather than "+" so that one-character identifiers are accepted
	  tk("[\\w\\.][\\d\\w\\._\\$]*", "ID"),

	  # brackets and parens
	  tk("\\[", "LBRAC"),
	  tk("\\]", "RBRAC"),
	  tk("\\(", "LPAREN"),
	  tk("\\)", "RPAREN"),
	  tk(",", "COMMA"),
	  tk("\\.\\.", "DOUBLEDOT"),
	  tk(":", "COLON"),

	  # operators
	  tk("=", "ASSIGN"),
	  tk("==", "EQ"),
	  tk("!=", "NEQ"),

	  tk("\\+", "PLUS"),
	  tk("-", "MINUS"),
	  tk("\\*", "TIMES"),
	  tk("/", "DIVIDE"),

	  tk("<", "LT"),
	  tk("<=", "LEQ"),
	  tk(">", "GT"),
	  tk(">=", "GEQ"),

	  tk("&", "AND"),
	  # The backslash must be doubled: a single "\|" is not a valid escape in
	  # an R string literal, and the regex needs "\|" to mean a literal pipe.
	  tk("\\|", "OR")
	)

	# Sample input: comments, plain words, a number and a for-loop header.
	# NOTE: this value is replaced by the assignment that follows it.
	text <- "#ignore this line
	parse this line
	and this number 13

	for i in 1 to 10
	parse indent
	code continues with indents

	end indents
	"

	# Sample input actually tokenised below: a multi-line triple-quoted string.
	text <- '"""
	testing asdf
	"""'

	# Repeatedly take one token off the front of `text` until nothing is left,
	# collecting every token whose tag is not the empty string.
	lex_toks <- list()
	repeat {
	  if (!(nchar(text) > 0)) {
	    break
	  }

	  token <- lex(text, tokens)
	  text <- token$remaining

	  # tokens with an empty tag are skipped
	  if (token$tag == "") {
	    next
	  }
	  lex_toks <- cat_list(list(text = token$text, tag = token$tag), lex_toks)
	}

	# tokenising complete, time to build the parser

	#' Parser entry point (not implemented yet).
	#'
	#' @param tokens the token list; currently unused
	#' @return `NULL`, because the body is still a placeholder
	read_from <- function(tokens) {
	  # ID are other variables or functions
	  NULL
	}