|
library(treesitter) |
|
library(treesitter.r) |
|
language <- treesitter.r::language() |
|
parser <- parser(language) |
|
|
|
one_string <- function(x, ...) paste(x, ..., collapse = "\n") |
|
split_lines <- function(x) { |
|
if (length(grep("\n", x)) == 0L) { |
|
return(x) |
|
} |
|
x <- gsub("\n$", "\n\n", x) |
|
x[x == ""] <- "\n" |
|
unlist(strsplit(x, "\r?\n")) |
|
} |
|
|
|
# %% |
|
regex_chunk <- "^(#|--)+(\\+|\\s*%%| ----+| @knitr)(.*?)\\s*-*\\s*$" |
|
regex_md <- "^(#|--)+\\s*%%\\s*\\[markdown\\]" |
|
regex_roxygen <- "#'\\s*@" |
|
regex_inline <- "^[{][{](.+)[}][}][ ]*$" |
|
|
|
remove_comments <- function(x, comment) { |
|
stopifnot(length(comment) == 2L) |
|
c1 <- grep(comment[1], x) |
|
c2 <- grep(comment[2], x) |
|
if (length(c1) != length(c2)) { |
|
stop("comments must be put in pairs of start and end delimiters") |
|
} |
|
if (length(c1)) { |
|
x <- x[-unique(unlist(mapply(seq, c1, c2, SIMPLIFY = FALSE)))] |
|
} |
|
return(x) |
|
} |
|
|
|
# determine how many backticks we need to wrap code blocks and inline code |
|
.fmt.rmd <- function(x) { |
|
x <- one_string(x) |
|
l <- attr(gregexpr("`+", x)[[1]], "match.length") |
|
l <- max(l, 0) |
|
if (length(l) > 0) { |
|
i <- strrep("`", l + 1) |
|
b <- strrep("`", max(l + 1, 3)) |
|
} else { |
|
i <- "`" |
|
b <- "```" |
|
} |
|
c(paste0(b, "{r"), "}", b, paste0(i, "r \\1 ", i)) |
|
} |
|
|
|
parse_with_treesitter <- function(x) { |
|
tree <- parser_parse(parser, one_string(x)) |
|
root_node <- tree_root_node(tree) |
|
} |
|
|
|
root_node_to_data <- function(root_node) { |
|
d <- do.call("rbind", lapply(node_children(root_node), function(node) { |
|
data.frame( |
|
line1 = node_start_point(node)$row + 1, |
|
line2 = node_end_point(node)$row + 1, |
|
token = ifelse(node_type(node) == "comment", "COMMENT", "expr"), |
|
text = split_lines(node_text(node)) |
|
) |
|
})) |
|
return(d) |
|
} |
|
|
|
get_roxygen_comment_lines <- function(d) { |
|
d_expr <- d[d$token == "expr", ] |
|
|
|
roxygen_lines <- c() |
|
for (i in seq_len(nrow(d_expr))) { |
|
if (i == 1) { |
|
first_line <- 1 |
|
} else { |
|
first_line <- d_expr$line2[i - 1] + 1 |
|
} |
|
last_line <- d_expr$line1[i] - 1 |
|
|
|
if (first_line >= last_line) { |
|
next |
|
} |
|
lines <- first_line:last_line |
|
|
|
d_comments <- d[d$line1 %in% lines, ] |
|
|
|
# remove any comments beofre last code chunk header `# %%` |
|
is_code_chunk <- grepl(regex_chunk, d_comments$text) | grepl(regex_md, d_comments$text) |
|
if (any(is_code_chunk)) { |
|
keep <- d_comments$line1 > max(d_comments$line1[is_code_chunk]) |
|
d_comments <- d_comments[keep, ] |
|
} |
|
if (nrow(d_comments) == 0) next |
|
|
|
has_roxygen <- any(grepl(regex_roxygen, d_comments$text)) |
|
if (has_roxygen) { |
|
roxygen_lines <- c(roxygen_lines, d_comments$line1) |
|
} |
|
} |
|
|
|
return(roxygen_lines) |
|
} |
|
|
|
# 0L => Uninitialized state |
|
# 1L => Code chunk header (# %%, #+) |
|
# 2L => Code chunk label (starter) (#|) |
|
# 3L => Code chunk label (#|) |
|
# 4L => Code chunk body (starter) |
|
# 5L => Code chunk body |
|
# 6L => Markdown comment (starter) (#') |
|
# 7L => Markdown comment (#') |
|
# 8L => Markdown chunk (# %% [markdown]) |
|
# 9L => Markdown chunk body |
|
line_state_lookup <- c( |
|
"Code chunk header (# %%, #+)", |
|
"Code chunk label (starter) (#|)", |
|
"Code chunk label (#|)", |
|
"Code chunk body (starter)", |
|
"Code chunk body", |
|
"Markdown comment (starter) (#')", |
|
"Markdown comment (#')", |
|
"Markdown chunk (# %% [markdown])", |
|
"Markdown chunk body" |
|
) |
|
cell_type_lookup <- c( |
|
"Code (header)", |
|
"Code (no header)", |
|
"", |
|
"Code (no header)", |
|
"", |
|
"Markdown Comment", |
|
"", |
|
"Markdown Chunk", |
|
"" |
|
) |
|
|
|
get_line_state <- function(d, verbose = FALSE) { |
|
roxygen_lines <- get_roxygen_comment_lines(d) |
|
n_lines <- max(d$line2) |
|
|
|
# 0L => Uninitialized state |
|
# 1L => Code chunk header (# %%, #+) |
|
# 2L => Code chunk label (starter) (#|) |
|
# 3L => Code chunk label (#|) |
|
# 4L => Code chunk body (starter) |
|
# 5L => Code chunk body |
|
# 6L => Markdown comment (starter) (#') |
|
# 7L => Markdown comment (#') |
|
# 8L => Markdown chunk (# %% [markdown]) |
|
# 9L => Markdown chunk body |
|
line_state <- rep(0, n_lines) |
|
curr_chunk_type <- 0L |
|
for (i in seq_len(n_lines)) { |
|
prev_chunk_type <- curr_chunk_type |
|
di <- d[d$line1 <= i & i <= d$line2, ] |
|
|
|
# If was a (starter), downgrade |
|
if (curr_chunk_type %in% c(1L, 2L, 4L)) curr_chunk_type <- 5L |
|
if (curr_chunk_type == 6L) curr_chunk_type <- 7L |
|
if (curr_chunk_type == 8L) curr_chunk_type <- 9L |
|
|
|
# blank-line or expression |
|
if (nrow(di) == 0) { |
|
# do nothing |
|
} else if (any(di$token == "expr")) { |
|
if (prev_chunk_type %in% c(0L, 6L:9L)) { |
|
curr_chunk_type <- 4L |
|
} else { |
|
curr_chunk_type <- 5L |
|
} |
|
} else { # line is comment => check for special symbols |
|
|
|
text <- di$text[1] |
|
is_md_chunk_header <- grepl(regex_md, text) |
|
is_md_comment <- startsWith(text, "#'") |
|
is_chunk_label <- startsWith(text, "#|") |
|
is_chunk_header <- grepl(regex_chunk, text) && !is_md_chunk_header |
|
|
|
if (is_chunk_header) { # `# %% ` |
|
curr_chunk_type <- 1L |
|
} else if (is_md_chunk_header) { # `# %% [markdown]` |
|
curr_chunk_type <- 8L |
|
} else if (is_chunk_label) { # `#| ` |
|
if (prev_chunk_type %in% c(1L, 2L, 3L)) { |
|
curr_chunk_type <- 3L |
|
} else { |
|
curr_chunk_type <- 2L |
|
} |
|
} else if (is_md_comment) { # `#'` |
|
if (i %in% roxygen_lines) { # Is code |
|
if (prev_chunk_type %in% c(1L, 2L, 3L, 4L, 5L)) { |
|
curr_chunk_type <- 5L |
|
} else { |
|
curr_chunk_type <- 4L |
|
} |
|
} else { # Is markdown |
|
if (prev_chunk_type %in% c(0L:5L)) { |
|
curr_chunk_type <- 6L |
|
} else if (prev_chunk_type %in% 6L:7L) { |
|
curr_chunk_type <- 7L |
|
} else if (prev_chunk_type %in% 8L:9L) { |
|
curr_chunk_type <- 9L |
|
} |
|
} |
|
} else { # Regular comment |
|
if (prev_chunk_type %in% c(0L, 6L, 7L)) { |
|
curr_chunk_type <- 4L |
|
} else if (prev_chunk_type %in% 1L:4L) { |
|
curr_chunk_type <- 5L |
|
} else if (prev_chunk_type == 8L) { |
|
curr_chunk_type <- 9L |
|
} |
|
} |
|
} |
|
|
|
line_state[i] <- curr_chunk_type |
|
} |
|
|
|
if (verbose == TRUE) { |
|
cat(sprintf("%02i: %s\n", seq_along(line_state), line_state_lookup[line_state]), sep = "") |
|
} |
|
|
|
return(line_state) |
|
} |
|
|
|
# Transforms code into qmd format |
|
process_chunks <- function(x, chunks, verbose = FALSE) { |
|
processed <- lapply(chunks, function(chunk) { |
|
lines <- x[chunk$start:chunk$end] |
|
if (startsWith(chunk$type, "Code")) { # `# %%` |
|
is_chunk_header <- grepl(regex_chunk, lines[1]) |
|
if (is_chunk_header) { |
|
opts <- gsub(regex_chunk, "\\3", lines[1]) |
|
if (grepl("^\\s*$", opts)) { |
|
lines[1] <- paste0(cell_wrappers[1], cell_wrappers[2]) |
|
} else { |
|
lines[1] <- sprintf("%s %s%s", cell_wrappers[1], opts, cell_wrappers[2]) |
|
} |
|
} else { |
|
lines <- c(paste0(cell_wrappers[1], cell_wrappers[2]), lines) |
|
} |
|
lines <- c(lines, cell_wrappers[3]) # close |
|
} else if (chunk$type == "Markdown Comment") { # `#'` |
|
lines <- gsub("^#'\\s*", "", lines) |
|
|
|
# turn {{expr}} into inline expressions, e.g. `r expr` or \Sexpr{expr} |
|
if (any(i <- grepl(regex_inline, lines))) { |
|
lines[i] <- gsub(regex_inline, cell_wrappers[4], lines[i]) |
|
} |
|
} else if (chunk$type == "Markdown Chunk") { # `# %% [markdown]` |
|
lines <- lines[-1] |
|
lines <- gsub("^#\\s*", "", lines) |
|
|
|
# turn {{expr}} into inline expressions, e.g. `r expr` or \Sexpr{expr} |
|
if (any(i <- grepl(regex_inline, lines))) { |
|
lines[i] <- gsub(regex_inline, cell_wrappers[4], lines[i]) |
|
} |
|
} |
|
|
|
lines |
|
}) |
|
|
|
if (verbose == TRUE) { |
|
cat("\n── Chunked code: ──\n") |
|
temp <- lapply(chunks, function(chunk) { |
|
lines <- x[chunk$start:chunk$end] |
|
cat(sprintf( |
|
"~~~\nCHUNK TYPE: %s\n%s\n", |
|
chunk$type, one_string(lines) |
|
)) |
|
|
|
return(invisible(NULL)) |
|
}) |
|
} |
|
|
|
return(processed) |
|
} |
|
|
|
# %% |
|
spin_treesitter <- function( |
|
hair, text = NULL, |
|
inline = "^[{][{](.+)[}][}][ ]*$", # {{ exp}} |
|
comment = c("^[# ]*/[*]", "^.*[*]/ *$"), |
|
verbose = FALSE) { |
|
x <- if (is.null(text)) { |
|
xfun::read_utf8(hair) |
|
} else { |
|
xfun::split_lines(text) |
|
} |
|
|
|
# remove comments (these are comments like `# /*` `# */`) |
|
x <- remove_comments(x, comment) |
|
|
|
# Determine how to wrap code chunks |
|
cell_wrappers <- .fmt.rmd(x) |
|
|
|
# First, parse code with tree sitter |
|
root_node <- parse_with_treesitter(x) |
|
d <- root_node_to_data(root_node) |
|
|
|
# break up into chunks of lines |
|
line_state <- get_line_state(d, verbose = verbose) |
|
|
|
# Get chunks |
|
starts <- which(line_state %in% c(1L, 2L, 4L, 6L, 8L)) |
|
ends <- c(starts[-1] - 1, length(line_state)) |
|
cell_type <- cell_type_lookup[line_state[starts]] |
|
|
|
chunks <- lapply(seq_along(starts), function(i) { |
|
list(start = starts[i], end = ends[i], type = cell_type[i]) |
|
}) |
|
|
|
# Process code chunk by chunk |
|
processed_chunks <- process_chunks(x, chunks, verbose = verbose) |
|
|
|
unlist(processed_chunks) |
|
} |