spin v2

I love `knitr::spin()` and code cells for my dev experience. However, a few edge cases make the function insufficient:

  1. `#'` roxygen-style documentation would create problems, since it would be interpreted as markdown.
  2. With ark, jupytext-style notebooks will be advantageous, so supporting `# %% [markdown]` cells would be beneficial.
  3. If `# %%` appears at the start of a line inside a string literal, it would cause problems (unlikely, but still).

This function streamlines the process by rewriting the conversion from first principles:

  1. The source code is parsed with tree-sitter (see the gist's revision history for a version that used R's `parse()` function).
  2. The code is then walked line by line, using a state machine to classify each line correctly.

You can see a demo with temp.R, which I've intentionally written to highlight the difficulties that `knitr::spin()` has:

spun <- spin_treesitter(hair = "temp.R")
cat(spun, file = "spin_treesitter.qmd", sep = "\n")
knitr::spin(hair = "temp.R", knit = FALSE)
library(treesitter)
library(treesitter.r)
language <- treesitter.r::language()
parser <- parser(language)
one_string <- function(x, ...) paste(x, ..., collapse = "\n")
split_lines <- function(x) {
  if (length(grep("\n", x)) == 0L) {
    return(x)
  }
  x <- gsub("\n$", "\n\n", x)
  x[x == ""] <- "\n"
  unlist(strsplit(x, "\r?\n"))
}
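# e.g. split_lines(c("a\nb", "")) should give c("a", "b", ""): embedded
# newlines are split apart and blank elements survive as empty lines.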
# %%
regex_chunk <- "^(#|--)+(\\+|\\s*%%| ----+| @knitr)(.*?)\\s*-*\\s*$"
regex_md <- "^(#|--)+\\s*%%\\s*\\[markdown\\]"
regex_roxygen <- "#'\\s*@"
regex_inline <- "^[{][{](.+)[}][}][ ]*$"
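# For reference, the kinds of lines these patterns are meant to catch:
#   regex_chunk:   "# %%", "#+ setup", "# ---- label ----", "# @knitr label"
#   regex_md:      "# %% [markdown]"
#   regex_roxygen: "#' @param x", "#' @export"
#   regex_inline:  "{{ 1 + 1 }}"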
remove_comments <- function(x, comment) {
  stopifnot(length(comment) == 2L)
  c1 <- grep(comment[1], x)
  c2 <- grep(comment[2], x)
  if (length(c1) != length(c2)) {
    stop("comments must be put in pairs of start and end delimiters")
  }
  if (length(c1)) {
    x <- x[-unique(unlist(mapply(seq, c1, c2, SIMPLIFY = FALSE)))]
  }
  return(x)
}
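# e.g. with the default delimiters used in spin_treesitter() below,
#   remove_comments(c("# /*", "scratch code", "# */", "x <- 1"),
#                   comment = c("^[# ]*/[*]", "^.*[*]/ *$"))
# should drop the delimited block and return "x <- 1".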
# determine how many backticks we need to wrap code blocks and inline code
.fmt.rmd <- function(x) {
  x <- one_string(x)
  l <- attr(gregexpr("`+", x)[[1]], "match.length")
  l <- max(l, 0)
  if (length(l) > 0) {
    i <- strrep("`", l + 1)
    b <- strrep("`", max(l + 1, 3))
  } else {
    i <- "`"
    b <- "```"
  }
  c(paste0(b, "{r"), "}", b, paste0(i, "r \\1 ", i))
}
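# For a source file containing no backticks, this should return
#   c("```{r", "}", "```", "`r \\1 `")
# i.e. plain three-backtick fences; longer fences are chosen automatically if
# the source already contains runs of backticks.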
parse_with_treesitter <- function(x) {
  tree <- parser_parse(parser, one_string(x))
  tree_root_node(tree)
}
root_node_to_data <- function(root_node) {
  d <- do.call("rbind", lapply(node_children(root_node), function(node) {
    data.frame(
      line1 = node_start_point(node)$row + 1,
      line2 = node_end_point(node)$row + 1,
      token = ifelse(node_type(node) == "comment", "COMMENT", "expr"),
      text = split_lines(node_text(node))
    )
  }))
  return(d)
}
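# Roughly, for the two-line source "# header\nx <- 1" this should produce
#   line1 line2   token       text
#       1     1 COMMENT "# header"
#       2     2    expr   "x <- 1"
# Multi-line expressions get one row per line of text, all sharing the same
# line1/line2 range.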
get_roxygen_comment_lines <- function(d) {
  d_expr <- d[d$token == "expr", ]
  roxygen_lines <- c()
  for (i in seq_len(nrow(d_expr))) {
    if (i == 1) {
      first_line <- 1
    } else {
      first_line <- d_expr$line2[i - 1] + 1
    }
    last_line <- d_expr$line1[i] - 1
    if (first_line > last_line) {
      next
    }
    lines <- first_line:last_line
    d_comments <- d[d$line1 %in% lines, ]
    # remove any comments before the last code chunk header `# %%`
    is_code_chunk <- grepl(regex_chunk, d_comments$text) | grepl(regex_md, d_comments$text)
    if (any(is_code_chunk)) {
      keep <- d_comments$line1 > max(d_comments$line1[is_code_chunk])
      d_comments <- d_comments[keep, ]
    }
    if (nrow(d_comments) == 0) next
    has_roxygen <- any(grepl(regex_roxygen, d_comments$text))
    if (has_roxygen) {
      roxygen_lines <- c(roxygen_lines, d_comments$line1)
    }
  }
  return(roxygen_lines)
}
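# e.g. for
#   #' Helper
#   #' @export
#   f <- function(x) x
# both `#'` lines sit in the comment gap before the expression and the gap
# contains a roxygen tag, so both line numbers should be returned (and those
# lines later kept as code rather than converted to markdown).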
# 0L => Uninitialized state
# 1L => Code chunk header (# %%, #+)
# 2L => Code chunk label (starter) (#|)
# 3L => Code chunk label (#|)
# 4L => Code chunk body (starter)
# 5L => Code chunk body
# 6L => Markdown comment (starter) (#')
# 7L => Markdown comment (#')
# 8L => Markdown chunk (# %% [markdown])
# 9L => Markdown chunk body
line_state_lookup <- c(
  "Code chunk header (# %%, #+)",
  "Code chunk label (starter) (#|)",
  "Code chunk label (#|)",
  "Code chunk body (starter)",
  "Code chunk body",
  "Markdown comment (starter) (#')",
  "Markdown comment (#')",
  "Markdown chunk (# %% [markdown])",
  "Markdown chunk body"
)
cell_type_lookup <- c(
  "Code (header)",
  "Code (no header)",
  "",
  "Code (no header)",
  "",
  "Markdown Comment",
  "",
  "Markdown Chunk",
  ""
)
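# cell_type_lookup is only ever indexed by the "starter" states (1, 2, 4, 6, 8)
# below; the empty entries correspond to continuation states that never open a
# chunk.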
get_line_state <- function(d, verbose = FALSE) {
  roxygen_lines <- get_roxygen_comment_lines(d)
  n_lines <- max(d$line2)
  # 0L => Uninitialized state
  # 1L => Code chunk header (# %%, #+)
  # 2L => Code chunk label (starter) (#|)
  # 3L => Code chunk label (#|)
  # 4L => Code chunk body (starter)
  # 5L => Code chunk body
  # 6L => Markdown comment (starter) (#')
  # 7L => Markdown comment (#')
  # 8L => Markdown chunk (# %% [markdown])
  # 9L => Markdown chunk body
  line_state <- rep(0, n_lines)
  curr_chunk_type <- 0L
  for (i in seq_len(n_lines)) {
    prev_chunk_type <- curr_chunk_type
    di <- d[d$line1 <= i & i <= d$line2, ]
    # If the previous line was a (starter) state, downgrade to its continuation
    if (curr_chunk_type %in% c(1L, 2L, 4L)) curr_chunk_type <- 5L
    if (curr_chunk_type == 6L) curr_chunk_type <- 7L
    if (curr_chunk_type == 8L) curr_chunk_type <- 9L
    # blank line or expression
    if (nrow(di) == 0) {
      # do nothing
    } else if (any(di$token == "expr")) {
      if (prev_chunk_type %in% c(0L, 6L:9L)) {
        curr_chunk_type <- 4L
      } else {
        curr_chunk_type <- 5L
      }
    } else { # line is a comment => check for special symbols
      text <- di$text[1]
      is_md_chunk_header <- grepl(regex_md, text)
      is_md_comment <- startsWith(text, "#'")
      is_chunk_label <- startsWith(text, "#|")
      is_chunk_header <- grepl(regex_chunk, text) && !is_md_chunk_header
      if (is_chunk_header) { # `# %% `
        curr_chunk_type <- 1L
      } else if (is_md_chunk_header) { # `# %% [markdown]`
        curr_chunk_type <- 8L
      } else if (is_chunk_label) { # `#| `
        if (prev_chunk_type %in% c(1L, 2L, 3L)) {
          curr_chunk_type <- 3L
        } else {
          curr_chunk_type <- 2L
        }
      } else if (is_md_comment) { # `#'`
        if (i %in% roxygen_lines) { # Is code
          if (prev_chunk_type %in% c(1L, 2L, 3L, 4L, 5L)) {
            curr_chunk_type <- 5L
          } else {
            curr_chunk_type <- 4L
          }
        } else { # Is markdown
          if (prev_chunk_type %in% c(0L:5L)) {
            curr_chunk_type <- 6L
          } else if (prev_chunk_type %in% 6L:7L) {
            curr_chunk_type <- 7L
          } else if (prev_chunk_type %in% 8L:9L) {
            curr_chunk_type <- 9L
          }
        }
      } else { # Regular comment
        if (prev_chunk_type %in% c(0L, 6L, 7L)) {
          curr_chunk_type <- 4L
        } else if (prev_chunk_type %in% 1L:4L) {
          curr_chunk_type <- 5L
        } else if (prev_chunk_type == 8L) {
          curr_chunk_type <- 9L
        }
      }
    }
    line_state[i] <- curr_chunk_type
  }
  if (verbose == TRUE) {
    cat(sprintf("%02i: %s\n", seq_along(line_state), line_state_lookup[line_state]), sep = "")
  }
  return(line_state)
}
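# e.g. for the three-line script
#   # %%
#   x <- 1
#   #' Some prose
# the states should be 1 (chunk header), 5 (code body), 6 (markdown comment),
# which later yields one code chunk (lines 1-2) and one markdown chunk (line 3).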
# Transforms code into qmd format
process_chunks <- function(x, chunks, cell_wrappers, verbose = FALSE) {
  processed <- lapply(chunks, function(chunk) {
    lines <- x[chunk$start:chunk$end]
    if (startsWith(chunk$type, "Code")) { # `# %%`
      is_chunk_header <- grepl(regex_chunk, lines[1])
      if (is_chunk_header) {
        opts <- gsub(regex_chunk, "\\3", lines[1])
        if (grepl("^\\s*$", opts)) {
          lines[1] <- paste0(cell_wrappers[1], cell_wrappers[2])
        } else {
          lines[1] <- sprintf("%s %s%s", cell_wrappers[1], opts, cell_wrappers[2])
        }
      } else {
        lines <- c(paste0(cell_wrappers[1], cell_wrappers[2]), lines)
      }
      lines <- c(lines, cell_wrappers[3]) # close the chunk
    } else if (chunk$type == "Markdown Comment") { # `#'`
      lines <- gsub("^#'\\s*", "", lines)
      # turn {{expr}} into inline expressions, e.g. `r expr` or \Sexpr{expr}
      if (any(i <- grepl(regex_inline, lines))) {
        lines[i] <- gsub(regex_inline, cell_wrappers[4], lines[i])
      }
    } else if (chunk$type == "Markdown Chunk") { # `# %% [markdown]`
      lines <- lines[-1]
      lines <- gsub("^#\\s*", "", lines)
      # turn {{expr}} into inline expressions, e.g. `r expr` or \Sexpr{expr}
      if (any(i <- grepl(regex_inline, lines))) {
        lines[i] <- gsub(regex_inline, cell_wrappers[4], lines[i])
      }
    }
    lines
  })
  if (verbose == TRUE) {
    cat("\n── Chunked code: ──\n")
    temp <- lapply(chunks, function(chunk) {
      lines <- x[chunk$start:chunk$end]
      cat(sprintf(
        "~~~\nCHUNK TYPE: %s\n%s\n",
        chunk$type, one_string(lines)
      ))
      return(invisible(NULL))
    })
  }
  return(processed)
}
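# e.g. a "Code (header)" chunk that starts with a bare `# %%` line should come
# out as
#   ```{r}
#   ...chunk body...
#   ```
# and any text after the marker (e.g. `# %% echo = TRUE`) is carried into the
# `{r ...}` header. Markdown Comment chunks get the leading `#' ` stripped and
# `{{ expr }}` rewritten as an inline `r expr` expression.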
# %%
spin_treesitter <- function(
    hair, text = NULL,
    inline = "^[{][{](.+)[}][}][ ]*$", # {{ expr }}
    comment = c("^[# ]*/[*]", "^.*[*]/ *$"),
    verbose = FALSE) {
  x <- if (is.null(text)) {
    xfun::read_utf8(hair)
  } else {
    xfun::split_lines(text)
  }
  # remove commented-out regions (delimited by `# /*` ... `# */`)
  x <- remove_comments(x, comment)
  # Determine how to wrap code chunks
  cell_wrappers <- .fmt.rmd(x)
  # First, parse the code with tree-sitter
  root_node <- parse_with_treesitter(x)
  d <- root_node_to_data(root_node)
  # Classify each line of the file
  line_state <- get_line_state(d, verbose = verbose)
  # Break the file up into chunks of lines
  starts <- which(line_state %in% c(1L, 2L, 4L, 6L, 8L))
  ends <- c(starts[-1] - 1, length(line_state))
  cell_type <- cell_type_lookup[line_state[starts]]
  chunks <- lapply(seq_along(starts), function(i) {
    list(start = starts[i], end = ends[i], type = cell_type[i])
  })
  # Process the code chunk by chunk
  processed_chunks <- process_chunks(x, chunks, cell_wrappers, verbose = verbose)
  unlist(processed_chunks)
}
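Below is `temp.R`, the demo script deliberately written to exercise the edge cases above: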
#' ---
#' title: "A report generated from a pure R script"
#' format: html
#' ---
# %% [markdown]
# # Testing
#
# This is regular text
# {{ 1 + 1 }}
# %%
2 + 2
#' Adding some markdown
#' {{ 1 + 1 }}
#'
#' This code looks good to me - KB
#'
# Sneaky regular comment
#'
# comment that starts with a space
2 + 3
"this is a
# %% multi
line string literal
"
# %% echo = TRUE
2 + 1 # Comment after code
3 + 2
2 + 2
# %%
#' My custom function
#' @param x Number
#'
#' @export
myfunc <- function(x) {
x + 1
}
#| label: 'using myfunc'
#|
#|
myfunc(10)
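For comparison, this is the output of `knitr::spin(hair = "temp.R", knit = FALSE)`, where all three edge cases go wrong: the `# %% [markdown]` cell becomes a bogus chunk header, the string literal is split open at `# %% multi`, and the roxygen block is rendered as markdown: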
---
title: "A report generated from a pure R script"
format: html
---
```{r [markdown]}
# # Testing
#
# This is regular text
# {{ 1 + 1 }}
```
```{r}
2 + 2
```
Adding some markdown
{{ 1 + 1 }}
This code looks good to me - KB
```{r}
# comment that starts with a space
2 + 3
"this is a
```
```{r multi}
line string literal
"
```
```{r echo = TRUE}
2 + 1 # Comment after code
3 + 2
2 + 2
```
```{r}
```
My custom function
@param x Number
@export
```{r}
myfunc <- function(x) {
x + 1
}
```
```{r}
#| label: 'using myfunc'
#|
#|
myfunc(10)
```
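And this is `spin_treesitter.qmd`, the output of `spin_treesitter(hair = "temp.R")`, which handles all three cases correctly: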
---
title: "A report generated from a pure R script"
format: html
---
# Testing
This is regular text
`r 1 + 1 `
```{r}
2 + 2
```
Adding some markdown
`r 1 + 1 `
This code looks good to me - KB
```{r}
# Sneaky regular comment
```
```{r}
# comment that starts with a space
2 + 3
"this is a
# %% multi
line string literal
"
```
```{r echo = TRUE}
2 + 1 # Comment after code
3 + 2
2 + 2
```
```{r}
#' My custom function
#' @param x Number
#'
#' @export
myfunc <- function(x) {
x + 1
}
```
```{r}
#| label: 'using myfunc'
#|
#|
myfunc(10)
```