Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Thanks @schaunwheeler codes to read xlsx files. I had remove all dependence and parse the xml files with text regular expression. It will be much faster to read large xlsx files.
#' Read xlsx files
#'
#' @param file The path to xlsx file
#' @param keep_sheets A vector of sheet name
#' @param header Whether include the head in the sheet
#' @param empty_row Whether to remove the empty rows
#' @export
xlsxToR <- function(file, keep_sheets = NULL, header = TRUE, empty_row = TRUE)
{
suppressWarnings(file.remove(tempdir()))
file.copy(file, tempdir())
new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
new_file_rename <- gsub("xlsx$", "zip", new_file)
file.rename(new_file, new_file_rename)
unzip(new_file_rename, exdir = tempdir())
# Get OS
mac <- readLines(paste0(tempdir(), "/docProps/app.xml"), warn = FALSE)
mac <- grep("Macintosh", mac)
if (length(mac) > 0)
{
os_origin <- "1899-12-30" # documentation says should be "1904-01-01"
} else
{
os_origin <- "1899-12-30"
}
# Get names of sheets
sheet_names_str <- readLines(paste0(tempdir(), "/xl/workbook.xml"), warn = FALSE)[2]
sheet_names_str <- gsub('.*<sheets>(.*)</sheets>.*', '\\1', sheet_names_str)
sheet_names_str <- strsplit(sheet_names_str, '/>')[[1]]
sheet_names <- NULL
sheet_names$name <- gsub('.* name="(.*)"( sheetId.*)', '\\1', sheet_names_str)
sheet_names$sheetId <- gsub('.* sheetId="(\\d+)" +.*', '\\1', sheet_names_str)
sheet_names$id <- gsub('.* r:id="(.*)"$', '\\1', sheet_names_str)
sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)
sheet_names$id <- gsub("\\D", "", sheet_names$id)
if(!is.null(keep_sheets))
{
sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]
}
entries <- readLines(paste0(tempdir(), "/xl/sharedStrings.xml"), warn = FALSE)[2]
entries <- gsub('^<sst .*">(<si>.*)</sst>$', '\\1', entries)
entries <- strsplit(entries, '</si>')[[1]]
entries <- gsub('^.*<t.*>(.+)</t>$', '\\1', entries)
entries[grep('<t/>', entries)] <- NA
names(entries) <- seq_along(entries) - 1
# Get column classes
styles <- readLines(paste0(tempdir(), '/xl/styles.xml'), warn = FALSE)[2]
numFmtId <- gsub('^.*<cellXfs count="\\d+">(.*)</cellXfs>.*$', '\\1', styles)
numFmtId <- strsplit(numFmtId, '<xf')[[1]]
numFmtId <- numFmtId[nchar(numFmtId) > 0]
numFmtId <- as.numeric(gsub('.*numFmtId="(\\d+)".*', '\\1', numFmtId))
cell_style <- as.data.frame(list(id = seq(0, by = 1, along = numFmtId),
numFmtId = numFmtId), stringsAsFactors = FALSE)
# Custom style
numFmt <- gsub('^.*<numFmts count="\\d+">(.*)</numFmts>.*$', '\\1', styles)
if (length(numFmt) > 0)
{
numFmt <- strsplit(numFmt, '/><numFmt')[[1]]
numFmt_cid <- as.numeric(gsub('.*numFmtId="(\\d+)".*', '\\1', numFmt))
cid_type <- rep(NA, length(numFmt_cid))
formatCode <- gsub('.*formatCode="(.*)".*', '\\1', numFmt)
pos <- grep('y|m|d', formatCode)
if (length(pos) > 0)
{
date_format <- formatCode[grep('y|m|d', formatCode)]
pos <- grep('h', date_format)
if (length(pos) > 0)
{
date_format <- date_format[-pos]
}
pos <- cell_style$numFmtId %in% numFmt_cid[formatCode %in% date_format]
cell_style$numFmtId[pos] <- 14
}
}
worksheet_paths <- paste0(tempdir(), "/xl/worksheets/sheet",
sheet_names$id, '.xml')
worksheets <- as.list(NULL)
for (i in seq(along = worksheet_paths))
{
tryCatch({
sheet_data <- readLines(worksheet_paths[i], warn = FALSE)[2]
sheet_data <- gsub('(.*<sheetData>)(.*)(</sheetData>.*)', '\\2', sheet_data)
sheet_data <- strsplit(sheet_data, '</row>')[[1]]
sheet_data <- sheet_data[grep('</c>', sheet_data)]
if (length(sheet_data) == 0)
{
next
}
sheet_data <- strsplit(sheet_data, '</c>')
sheet_data <- unlist(sheet_data)
sheet_data <- gsub('(.*<row.*>)(<c.*)', '\\2', sheet_data)
res <- NULL
res$r <- gsub('.*r="(\\w+\\d+)".*', '\\1', sheet_data)
res$v <- rep(NA, length(sheet_data))
pos <- grep('.*<v>(.*)</v>.*', sheet_data)
res$v[pos] <- gsub('.*<v>(.*)</v>.*', '\\1', sheet_data[pos])
res$s <- rep(NA, length(sheet_data))
pos <- grep('.* s="(\\d+|\\w+)"( |>).*', sheet_data)
res$s[pos] <- gsub('.* s="(\\d+|\\w+)"( |>).*', '\\1', sheet_data[pos])
res$t <- rep(NA, length(sheet_data))
pos <- grep('.* t="(\\d+|\\w+)"( |>).*', sheet_data)
res$t[pos] <- gsub('.* t="(\\d+|\\w+)"( |>).*', '\\1', sheet_data[pos])
res <- as.data.frame(res, stringsAsFactors = FALSE)
res$sheet <- sheet_names[sheet_names$id == i, 'name']
entries_match <- entries[match(res$v, names(entries))]
res$v[res$t == "s" & !is.na(res$t)] <-
entries_match[res$t == "s"& !is.na(res$t)]
res$cols <- match(gsub("\\d", "", res$r), LETTERS)
res$rows <- as.numeric(gsub("\\D", "", res$r))
nrow <- max(res$rows)
ncol <- max(res$cols)
if (header)
{
nrow <- nrow - 1
}
res_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol),
stringsAsFactors = FALSE)
style_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol),
stringsAsFactors = FALSE)
if (header)
{
header_df <- res[res$rows == 1,]
header_name <- paste0('V', seq(ncol))
header_name[header_df$cols] <- header_df$v
names(res_df) <- header_name
res <- res[res$rows != 1,]
res$rows <- res$rows - 1
}
if (nrow(res) > 0)
{
res_df[as.matrix(res[,c('rows', 'cols')])] <- res$v
s <- as.numeric(res$s)
s <- cell_style$numFmtId[match(s, cell_style$id)]
style_df[as.matrix(res[,c('rows', 'cols')])] <- s
}
style_df <- sapply(style_df, function(x)
{
ifelse(length(unique(x[!is.na(x)])) == 1, unique(x[!is.na(x)]), NA)
})
style_col <- rep('character', length(style_df))
style_col[style_df %in% 14:17] <- "date"
style_col[style_df %in% c(18:21, 45:47)] <- "time"
style_col[style_df %in% 22] <- "datetime"
style_col[is.na(style_df) & !sapply(res_df, function(x) any(grepl("\\D", x)))] <- "numeric"
number2date <- function(x)
{
if (class(x) == 'character')
{
pos <- !(is.na(x)) & !(x == 'NA')
} else
{
stop('NOT implemented to check class of column date in xlsxToR')
}
res <- rep(NA, length = length(x))
res[pos] <- as.numeric(x[pos])
res <- as.Date(res, origin = os_origin)
res
}
res_df[] <- lapply(seq_along(res_df), function(j)
{
switch(style_col[j],
character = res_df[,j],
numeric = as.numeric(res_df[,j]),
date = number2date(res_df[,j]),
time = strftime(as.POSIXct(as.numeric(res_df[,j]), origin = os_origin), format = "%H:%M:%S"),
datetime = as.POSIXct(as.numeric(res_df[,j]), origin = os_origin))
})
if (empty_row)
{
pos <- apply(res_df, 1, function(x) sum(is.na(x))) != ncol(res_df)
if (ncol(res_df) == 1)
{
col_name <- names(res_df)
res_df <- data.frame(res_df[pos,])
names(res_df) <- col_name
} else
{
res_df <- res_df[pos,]
}
}
sheet_n <- sheet_names$name[sheet_names$id == i]
worksheets[[sheet_n]] <- res_df
}, warning = function(w)
{
print(style_col)
stop(paste0('Warning messsage for sheet ', sheet_names$name[sheet_names$id == i], '\n', w))
})
}
if(length(worksheets) == 1)
{
worksheets <- worksheets[[1]]
}
worksheets
}
@Superstud

This comment has been minimized.

Copy link

Superstud commented May 6, 2015

Thanks, this is much faster :-) (4x for the 537 kb workbook that I am trying to import). I suggest you replace LETTERS with XL_LETTERS (as defined below) as it otherwise fails for worksheets with more than 26 columns.

XL_LETTERS <- c(LETTERS, paste0(rep(LETTERS, each=length(LETTERS)), rep(LETTERS, times=length(LETTERS))))[1:256]
@giriannamalai

This comment has been minimized.

Copy link

giriannamalai commented Jun 12, 2018

"Error in unzip(new_file_rename, exdir = tempdir()) :
invalid zip name argument
Calls: xlsxToR -> unzip
Execution halted"

Did any one faced this issue?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.