Skip to content

Instantly share code, notes, and snippets.

@byzheng
Forked from schaunwheeler/xlsxToR.r
Last active March 20, 2019 06:54
Show Gist options
  • Save byzheng/6119160 to your computer and use it in GitHub Desktop.
Save byzheng/6119160 to your computer and use it in GitHub Desktop.
Thanks @schaunwheeler codes to read xlsx files. I had remove all dependence and parse the xml files with text regular expression. It will be much faster to read large xlsx files.
#' Read xlsx files
#'
#' @param file The path to xlsx file
#' @param keep_sheets A vector of sheet name
#' @param header Whether include the head in the sheet
#' @param empty_row Whether to remove the empty rows
#' @export
xlsxToR <- function(file, keep_sheets = NULL, header = TRUE, empty_row = TRUE)
{
suppressWarnings(file.remove(tempdir()))
file.copy(file, tempdir())
new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
new_file_rename <- gsub("xlsx$", "zip", new_file)
file.rename(new_file, new_file_rename)
unzip(new_file_rename, exdir = tempdir())
# Get OS
mac <- readLines(paste0(tempdir(), "/docProps/app.xml"), warn = FALSE)
mac <- grep("Macintosh", mac)
if (length(mac) > 0)
{
os_origin <- "1899-12-30" # documentation says should be "1904-01-01"
} else
{
os_origin <- "1899-12-30"
}
# Get names of sheets
sheet_names_str <- readLines(paste0(tempdir(), "/xl/workbook.xml"), warn = FALSE)[2]
sheet_names_str <- gsub('.*<sheets>(.*)</sheets>.*', '\\1', sheet_names_str)
sheet_names_str <- strsplit(sheet_names_str, '/>')[[1]]
sheet_names <- NULL
sheet_names$name <- gsub('.* name="(.*)"( sheetId.*)', '\\1', sheet_names_str)
sheet_names$sheetId <- gsub('.* sheetId="(\\d+)" +.*', '\\1', sheet_names_str)
sheet_names$id <- gsub('.* r:id="(.*)"$', '\\1', sheet_names_str)
sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)
sheet_names$id <- gsub("\\D", "", sheet_names$id)
if(!is.null(keep_sheets))
{
sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]
}
entries <- readLines(paste0(tempdir(), "/xl/sharedStrings.xml"), warn = FALSE)[2]
entries <- gsub('^<sst .*">(<si>.*)</sst>$', '\\1', entries)
entries <- strsplit(entries, '</si>')[[1]]
entries <- gsub('^.*<t.*>(.+)</t>$', '\\1', entries)
entries[grep('<t/>', entries)] <- NA
names(entries) <- seq_along(entries) - 1
# Get column classes
styles <- readLines(paste0(tempdir(), '/xl/styles.xml'), warn = FALSE)[2]
numFmtId <- gsub('^.*<cellXfs count="\\d+">(.*)</cellXfs>.*$', '\\1', styles)
numFmtId <- strsplit(numFmtId, '<xf')[[1]]
numFmtId <- numFmtId[nchar(numFmtId) > 0]
numFmtId <- as.numeric(gsub('.*numFmtId="(\\d+)".*', '\\1', numFmtId))
cell_style <- as.data.frame(list(id = seq(0, by = 1, along = numFmtId),
numFmtId = numFmtId), stringsAsFactors = FALSE)
# Custom style
numFmt <- gsub('^.*<numFmts count="\\d+">(.*)</numFmts>.*$', '\\1', styles)
if (length(numFmt) > 0)
{
numFmt <- strsplit(numFmt, '/><numFmt')[[1]]
numFmt_cid <- as.numeric(gsub('.*numFmtId="(\\d+)".*', '\\1', numFmt))
cid_type <- rep(NA, length(numFmt_cid))
formatCode <- gsub('.*formatCode="(.*)".*', '\\1', numFmt)
pos <- grep('y|m|d', formatCode)
if (length(pos) > 0)
{
date_format <- formatCode[grep('y|m|d', formatCode)]
pos <- grep('h', date_format)
if (length(pos) > 0)
{
date_format <- date_format[-pos]
}
pos <- cell_style$numFmtId %in% numFmt_cid[formatCode %in% date_format]
cell_style$numFmtId[pos] <- 14
}
}
worksheet_paths <- paste0(tempdir(), "/xl/worksheets/sheet",
sheet_names$id, '.xml')
worksheets <- as.list(NULL)
for (i in seq(along = worksheet_paths))
{
tryCatch({
sheet_data <- readLines(worksheet_paths[i], warn = FALSE)[2]
sheet_data <- gsub('(.*<sheetData>)(.*)(</sheetData>.*)', '\\2', sheet_data)
sheet_data <- strsplit(sheet_data, '</row>')[[1]]
sheet_data <- sheet_data[grep('</c>', sheet_data)]
if (length(sheet_data) == 0)
{
next
}
sheet_data <- strsplit(sheet_data, '</c>')
sheet_data <- unlist(sheet_data)
sheet_data <- gsub('(.*<row.*>)(<c.*)', '\\2', sheet_data)
res <- NULL
res$r <- gsub('.*r="(\\w+\\d+)".*', '\\1', sheet_data)
res$v <- rep(NA, length(sheet_data))
pos <- grep('.*<v>(.*)</v>.*', sheet_data)
res$v[pos] <- gsub('.*<v>(.*)</v>.*', '\\1', sheet_data[pos])
res$s <- rep(NA, length(sheet_data))
pos <- grep('.* s="(\\d+|\\w+)"( |>).*', sheet_data)
res$s[pos] <- gsub('.* s="(\\d+|\\w+)"( |>).*', '\\1', sheet_data[pos])
res$t <- rep(NA, length(sheet_data))
pos <- grep('.* t="(\\d+|\\w+)"( |>).*', sheet_data)
res$t[pos] <- gsub('.* t="(\\d+|\\w+)"( |>).*', '\\1', sheet_data[pos])
res <- as.data.frame(res, stringsAsFactors = FALSE)
res$sheet <- sheet_names[sheet_names$id == i, 'name']
entries_match <- entries[match(res$v, names(entries))]
res$v[res$t == "s" & !is.na(res$t)] <-
entries_match[res$t == "s"& !is.na(res$t)]
res$cols <- match(gsub("\\d", "", res$r), LETTERS)
res$rows <- as.numeric(gsub("\\D", "", res$r))
nrow <- max(res$rows)
ncol <- max(res$cols)
if (header)
{
nrow <- nrow - 1
}
res_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol),
stringsAsFactors = FALSE)
style_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol),
stringsAsFactors = FALSE)
if (header)
{
header_df <- res[res$rows == 1,]
header_name <- paste0('V', seq(ncol))
header_name[header_df$cols] <- header_df$v
names(res_df) <- header_name
res <- res[res$rows != 1,]
res$rows <- res$rows - 1
}
if (nrow(res) > 0)
{
res_df[as.matrix(res[,c('rows', 'cols')])] <- res$v
s <- as.numeric(res$s)
s <- cell_style$numFmtId[match(s, cell_style$id)]
style_df[as.matrix(res[,c('rows', 'cols')])] <- s
}
style_df <- sapply(style_df, function(x)
{
ifelse(length(unique(x[!is.na(x)])) == 1, unique(x[!is.na(x)]), NA)
})
style_col <- rep('character', length(style_df))
style_col[style_df %in% 14:17] <- "date"
style_col[style_df %in% c(18:21, 45:47)] <- "time"
style_col[style_df %in% 22] <- "datetime"
style_col[is.na(style_df) & !sapply(res_df, function(x) any(grepl("\\D", x)))] <- "numeric"
number2date <- function(x)
{
if (class(x) == 'character')
{
pos <- !(is.na(x)) & !(x == 'NA')
} else
{
stop('NOT implemented to check class of column date in xlsxToR')
}
res <- rep(NA, length = length(x))
res[pos] <- as.numeric(x[pos])
res <- as.Date(res, origin = os_origin)
res
}
res_df[] <- lapply(seq_along(res_df), function(j)
{
switch(style_col[j],
character = res_df[,j],
numeric = as.numeric(res_df[,j]),
date = number2date(res_df[,j]),
time = strftime(as.POSIXct(as.numeric(res_df[,j]), origin = os_origin), format = "%H:%M:%S"),
datetime = as.POSIXct(as.numeric(res_df[,j]), origin = os_origin))
})
if (empty_row)
{
pos <- apply(res_df, 1, function(x) sum(is.na(x))) != ncol(res_df)
if (ncol(res_df) == 1)
{
col_name <- names(res_df)
res_df <- data.frame(res_df[pos,])
names(res_df) <- col_name
} else
{
res_df <- res_df[pos,]
}
}
sheet_n <- sheet_names$name[sheet_names$id == i]
worksheets[[sheet_n]] <- res_df
}, warning = function(w)
{
print(style_col)
stop(paste0('Warning messsage for sheet ', sheet_names$name[sheet_names$id == i], '\n', w))
})
}
if(length(worksheets) == 1)
{
worksheets <- worksheets[[1]]
}
worksheets
}
@alofting
Copy link

alofting commented May 6, 2015

Thanks, this is much faster :-) (4x for the 537 kb workbook that I am trying to import). I suggest you replace LETTERS with XL_LETTERS (as defined below) as it otherwise fails for worksheets with more than 26 columns.

XL_LETTERS <- c(LETTERS, paste0(rep(LETTERS, each=length(LETTERS)), rep(LETTERS, times=length(LETTERS))))[1:256]

@giriannamalai
Copy link

"Error in unzip(new_file_rename, exdir = tempdir()) :
invalid zip name argument
Calls: xlsxToR -> unzip
Execution halted"

Did any one faced this issue?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment