Skip to content

Instantly share code, notes, and snippets.

@nanxstats
Last active June 15, 2022 21:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nanxstats/de5b08990aad2298a39aca347772304f to your computer and use it in GitHub Desktop.
Save nanxstats/de5b08990aad2298a39aca347772304f to your computer and use it in GitHub Desktop.
Find out if a file is plain text or binary using the zlib algorithm
# <https://github.com/madler/zlib/blob/8678871f18f4dd51101a9db1e37791f975969079/doc/txtvsbin.txt>
#' Classify a file as plain text or binary
#'
#' Implements the byte-classification heuristic used by zlib. Every byte
#' value belongs to one of three lists:
#'
#' * allow list (textual): 9 (TAB), 10 (LF), 13 (CR), and 32 to 255;
#' * gray list (tolerated): 7 (BEL), 8 (BS), 11 (VT), 12 (FF), 26 (SUB),
#'   27 (ESC);
#' * block list (non-textual): 0 to 6, 14 to 25, and 28 to 31.
#'
#' A file is text when it contains at least one allow-list byte and no
#' block-list byte. Gray-list bytes never change the verdict on their own.
#' A file from which no bytes are read is classified as binary.
#'
#' @param path File path.
#' @param n The (maximal) number of bytes to read. Defaults to the size of
#'   the file, i.e. the whole file is inspected.
#'
#' @return Logical. `TRUE` if text, `FALSE` if binary.
#'
#' @examples
#' is_text_file(file.path(R.home("doc"), "COPYING"))
#' is_text_file(file.path(R.home("doc"), "NEWS.pdf"))
is_text_file <- function(path, n = file.info(path)$size) {
  bytecode <- readBin(path, what = "raw", n = n)

  # Boundary case: nothing to inspect, so the file counts as binary.
  if (length(bytecode) == 0L) {
    return(FALSE)
  }

  allow <- as.raw(c(9, 10, 13, 32:255))
  # SUB (26) and ESC (27) are gray-listed, so they must not be blocked;
  # otherwise text containing escape sequences is reported as binary.
  block <- as.raw(c(0:6, 14:25, 28:31))

  any(bytecode %in% allow) && !any(bytecode %in% block)
}
# Vectorized wrapper around `is_text_file()`: only `path` is vectorized, so
# each element of `path` is classified by its own call.
is_text_files <- Vectorize(FUN = is_text_file, vectorize.args = "path")
# List every regular file below the R home directory. The two
# classification columns start as NA and are filled in below.
df <- data.frame(
  file = fs::dir_ls(R.home(), type = "file", recurse = TRUE),
  is_text = NA,
  is_text_100k = NA,
  row.names = NULL,
  stringsAsFactors = FALSE
)

# Read the entire file
system.time(df$is_text <- is_text_files(df$file))

# Read the first 100 KB at maximum
system.time(df$is_text_100k <- is_text_files(df$file, n = 0.1 * 1000^2))

# Compare the verdicts from the full read and from the 100 KB prefix
all.equal(df$is_text, df$is_text_100k)

# Strip the R home prefix so the table shows paths relative to it. The
# prefix is compared literally: pasting the path into a regular expression
# would misbehave if it contained metacharacters such as "(", "+" or "[".
home_prefix <- paste0(fs::path_norm(R.home()), "/")
file_paths <- as.character(df$file)
under_home <- startsWith(file_paths, home_prefix)
file_paths[under_home] <- substring(
  file_paths[under_home],
  nchar(home_prefix) + 1L
)
df$file <- file_paths

# Keep the extensions both as a plain vector (used when looking for
# inconsistencies) and as a factor column (for filtering in the table)
exts <- fs::path_ext(df$file)
df$exts <- as.factor(exts)

DT::datatable(df, filter = list(position = "top", clear = FALSE))
# Find anti-patterns: files whose text/binary verdict disagrees with the
# prevailing verdict among files sharing the same (non-empty) extension
flag <- rep(FALSE, nrow(df))
for (ext in setdiff(unique(exts), "")) {
  rows <- which(exts == ext)
  verdicts <- df$is_text[rows]

  # Nothing to flag when every file with this extension agrees
  if (length(unique(verdicts)) <= 1L) {
    next
  }

  # The least frequent verdict is the outlier; which.min() keeps the first
  # entry of the table when the counts are tied
  counts <- table(verdicts)
  minority <- as.logical(names(counts)[which.min(counts)])
  flag[rows[verdicts == minority]] <- TRUE
}

DT::datatable(df[flag, ], rownames = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment