byzheng/xlsxToR.r

## xlsxToR.r
#' Read xlsx files
#'
#' @param file The path to xlsx file
#' @param keep_sheets A vector of sheet name
#' @param header Whether include the head in the sheet
#' @param empty_row Whether to remove the empty rows
#' @export
xlsxToR <- function(file, keep_sheets = NULL, header = TRUE, empty_row = TRUE)
{
    suppressWarnings(file.remove(tempdir()))
    file.copy(file, tempdir())
    new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
    new_file_rename <- gsub("xlsx$", "zip", new_file)
    file.rename(new_file, new_file_rename)

    unzip(new_file_rename, exdir = tempdir())

    # Get OS
    mac <- readLines(paste0(tempdir(), "/docProps/app.xml"), warn = FALSE)
    mac <- grep("Macintosh", mac)
    if (length(mac) > 0)
    {
        os_origin <- "1899-12-30" # documentation says should be "1904-01-01"
    } else
    {
        os_origin <- "1899-12-30"
    }

    # Get names of sheets
    sheet_names_str <- readLines(paste0(tempdir(), "/xl/workbook.xml"), warn = FALSE)[2]
    sheet_names_str <- gsub('.*<sheets>(.*)</sheets>.*', '\\1', sheet_names_str)
    sheet_names_str <- strsplit(sheet_names_str, '/>')[[1]]
    sheet_names <- NULL
    sheet_names$name <- gsub('.* name="(.*)"( sheetId.*)', '\\1', sheet_names_str)
    sheet_names$sheetId <- gsub('.* sheetId="(\\d+)" +.*', '\\1', sheet_names_str)
    sheet_names$id <- gsub('.* r:id="(.*)"$', '\\1', sheet_names_str)
    sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)
    sheet_names$id <- gsub("\\D", "", sheet_names$id)


    if(!is.null(keep_sheets))
    {
        sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]
    }

    entries <- readLines(paste0(tempdir(), "/xl/sharedStrings.xml"), warn = FALSE)[2]
    entries <- gsub('^<sst .*">(<si>.*)</sst>$', '\\1', entries)
    entries <- strsplit(entries, '</si>')[[1]]
    entries <- gsub('^.*<t.*>(.+)</t>$', '\\1', entries)
    entries[grep('<t/>', entries)] <- NA
    names(entries) <- seq_along(entries) - 1

    # Get column classes
    styles <- readLines(paste0(tempdir(), '/xl/styles.xml'), warn = FALSE)[2]
    numFmtId <- gsub('^.*<cellXfs count="\\d+">(.*)</cellXfs>.*$', '\\1', styles)
    numFmtId <- strsplit(numFmtId, '<xf')[[1]]
    numFmtId <- numFmtId[nchar(numFmtId) > 0]
    numFmtId <- as.numeric(gsub('.*numFmtId="(\\d+)".*', '\\1', numFmtId))
    cell_style <- as.data.frame(list(id = seq(0, by = 1, along = numFmtId),
        numFmtId = numFmtId), stringsAsFactors = FALSE)

    # Custom style
    numFmt <- gsub('^.*<numFmts count="\\d+">(.*)</numFmts>.*$', '\\1', styles)
    if (length(numFmt) > 0)
    {
        numFmt <- strsplit(numFmt, '/><numFmt')[[1]]
        numFmt_cid <- as.numeric(gsub('.*numFmtId="(\\d+)".*', '\\1', numFmt))
        cid_type <- rep(NA, length(numFmt_cid))
        formatCode <- gsub('.*formatCode="(.*)".*', '\\1', numFmt)
        pos <- grep('y|m|d', formatCode)
        if (length(pos) > 0)
        {
            date_format <- formatCode[grep('y|m|d', formatCode)]

            pos <- grep('h', date_format)
            if (length(pos) > 0)
            {
                date_format <- date_format[-pos]
            }
            pos <- cell_style$numFmtId %in% numFmt_cid[formatCode %in% date_format]
            cell_style$numFmtId[pos] <- 14
        }
    }
    worksheet_paths <- paste0(tempdir(), "/xl/worksheets/sheet",
        sheet_names$id, '.xml')
    worksheets <- as.list(NULL)
    for (i in seq(along = worksheet_paths))
    {
        tryCatch({
        sheet_data <- readLines(worksheet_paths[i], warn = FALSE)[2]
        sheet_data <- gsub('(.*<sheetData>)(.*)(</sheetData>.*)', '\\2', sheet_data)
        sheet_data <- strsplit(sheet_data, '</row>')[[1]]
        sheet_data <- sheet_data[grep('</c>', sheet_data)]
        if (length(sheet_data) == 0)
        {
            next
        }
        sheet_data <- strsplit(sheet_data, '</c>')
        sheet_data <- unlist(sheet_data)
        sheet_data <- gsub('(.*<row.*>)(<c.*)', '\\2', sheet_data)
        res <- NULL
        res$r <- gsub('.*r="(\\w+\\d+)".*', '\\1', sheet_data)
        res$v <- rep(NA, length(sheet_data))
        pos <- grep('.*<v>(.*)</v>.*', sheet_data)
        res$v[pos] <- gsub('.*<v>(.*)</v>.*', '\\1', sheet_data[pos])
        res$s <- rep(NA, length(sheet_data))
        pos <- grep('.* s="(\\d+|\\w+)"( |>).*', sheet_data)
        res$s[pos] <- gsub('.* s="(\\d+|\\w+)"( |>).*', '\\1', sheet_data[pos])
        res$t <- rep(NA, length(sheet_data))
        pos <- grep('.* t="(\\d+|\\w+)"( |>).*', sheet_data)
        res$t[pos] <- gsub('.* t="(\\d+|\\w+)"( |>).*', '\\1', sheet_data[pos])
        res <- as.data.frame(res, stringsAsFactors = FALSE)
        res$sheet <- sheet_names[sheet_names$id == i, 'name']
        entries_match <- entries[match(res$v, names(entries))]
        res$v[res$t == "s" & !is.na(res$t)] <-
            entries_match[res$t == "s"& !is.na(res$t)]
        res$cols <- match(gsub("\\d", "", res$r), LETTERS)
        res$rows <- as.numeric(gsub("\\D", "", res$r))

        nrow <- max(res$rows)
        ncol <- max(res$cols)
        if (header)
        {
            nrow <- nrow - 1
        }
        res_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol),
            stringsAsFactors = FALSE)
        style_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol),
            stringsAsFactors = FALSE)
        if (header)
        {
            header_df <- res[res$rows == 1,]
            header_name <- paste0('V', seq(ncol))
            header_name[header_df$cols] <- header_df$v
            names(res_df) <- header_name
            res <- res[res$rows != 1,]
            res$rows <- res$rows - 1
        }
        if (nrow(res) > 0)
        {
            res_df[as.matrix(res[,c('rows', 'cols')])] <- res$v
            s <- as.numeric(res$s)
            s <- cell_style$numFmtId[match(s, cell_style$id)]
            style_df[as.matrix(res[,c('rows', 'cols')])] <- s
        }

        style_df <- sapply(style_df, function(x)
            {
                ifelse(length(unique(x[!is.na(x)])) == 1, unique(x[!is.na(x)]), NA)
            })
        style_col <- rep('character', length(style_df))
        style_col[style_df %in% 14:17] <- "date"
        style_col[style_df %in% c(18:21, 45:47)] <- "time"
        style_col[style_df %in% 22] <- "datetime"
        style_col[is.na(style_df) & !sapply(res_df, function(x) any(grepl("\\D", x)))] <- "numeric"

        number2date <- function(x)
        {
            if (class(x) == 'character')
            {
                pos <- !(is.na(x)) & !(x == 'NA')
            } else
            {
                stop('NOT implemented to check class of column date in xlsxToR')
            }
            res <- rep(NA, length = length(x))
            res[pos] <- as.numeric(x[pos])
            res <- as.Date(res, origin = os_origin)
            res
        }
        res_df[] <- lapply(seq_along(res_df), function(j)
            {
                switch(style_col[j],
                    character = res_df[,j],
                    numeric = as.numeric(res_df[,j]),
                    date = number2date(res_df[,j]),
                    time = strftime(as.POSIXct(as.numeric(res_df[,j]), origin = os_origin), format = "%H:%M:%S"),
                    datetime = as.POSIXct(as.numeric(res_df[,j]), origin = os_origin))
            })
        if (empty_row)
        {
            pos <- apply(res_df, 1, function(x) sum(is.na(x))) != ncol(res_df)
            if (ncol(res_df) == 1)
            {
                col_name <- names(res_df)
                res_df <- data.frame(res_df[pos,])
                names(res_df) <- col_name
            } else
            {
                res_df <- res_df[pos,]
            }
        }
        sheet_n <- sheet_names$name[sheet_names$id == i]
        worksheets[[sheet_n]] <- res_df
        }, warning = function(w)
        {
            print(style_col)
            stop(paste0('Warning messsage for sheet ', sheet_names$name[sheet_names$id == i], '\n', w))
        })
    }

    if(length(worksheets) == 1)
    {
        worksheets <- worksheets[[1]]
    }
    worksheets
}
	#' Read xlsx files
	#'
	#' @param file The path to xlsx file
	#' @param keep_sheets A vector of sheet name
	#' @param header Whether include the head in the sheet
	#' @param empty_row Whether to remove the empty rows
	#' @export
	xlsxToR <- function(file, keep_sheets = NULL, header = TRUE, empty_row = TRUE)
	{
	suppressWarnings(file.remove(tempdir()))
	file.copy(file, tempdir())
	new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
	new_file_rename <- gsub("xlsx$", "zip", new_file)
	file.rename(new_file, new_file_rename)

	unzip(new_file_rename, exdir = tempdir())

	# Get OS
	mac <- readLines(paste0(tempdir(), "/docProps/app.xml"), warn = FALSE)
	mac <- grep("Macintosh", mac)
	if (length(mac) > 0)
	{
	os_origin <- "1899-12-30" # documentation says should be "1904-01-01"
	} else
	{
	os_origin <- "1899-12-30"
	}

	# Get names of sheets
	sheet_names_str <- readLines(paste0(tempdir(), "/xl/workbook.xml"), warn = FALSE)[2]
	sheet_names_str <- gsub('.<sheets>(.)</sheets>.*', '\\1', sheet_names_str)
	sheet_names_str <- strsplit(sheet_names_str, '/>')[[1]]
	sheet_names <- NULL
	sheet_names$name <- gsub('.* name="(.)"( sheetId.)', '\\1', sheet_names_str)
	sheet_names$sheetId <- gsub('.* sheetId="(\\d+)" +.*', '\\1', sheet_names_str)
	sheet_names$id <- gsub('.* r:id="(.*)"$', '\\1', sheet_names_str)
	sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)
	sheet_names$id <- gsub("\\D", "", sheet_names$id)


	if(!is.null(keep_sheets))
	{
	sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]
	}

	entries <- readLines(paste0(tempdir(), "/xl/sharedStrings.xml"), warn = FALSE)[2]
	entries <- gsub('^<sst .">(<si>.)</sst>$', '\\1', entries)
	entries <- strsplit(entries, '</si>')[[1]]
	entries <- gsub('^.<t.>(.+)</t>$', '\\1', entries)
	entries[grep('<t/>', entries)] <- NA
	names(entries) <- seq_along(entries) - 1

	# Get column classes
	styles <- readLines(paste0(tempdir(), '/xl/styles.xml'), warn = FALSE)[2]
	numFmtId <- gsub('^.<cellXfs count="\\d+">(.)</cellXfs>.*$', '\\1', styles)
	numFmtId <- strsplit(numFmtId, '<xf')[[1]]
	numFmtId <- numFmtId[nchar(numFmtId) > 0]
	numFmtId <- as.numeric(gsub('.numFmtId="(\\d+)".', '\\1', numFmtId))
	cell_style <- as.data.frame(list(id = seq(0, by = 1, along = numFmtId),
	numFmtId = numFmtId), stringsAsFactors = FALSE)

	# Custom style
	numFmt <- gsub('^.<numFmts count="\\d+">(.)</numFmts>.*$', '\\1', styles)
	if (length(numFmt) > 0)
	{
	numFmt <- strsplit(numFmt, '/><numFmt')[[1]]
	numFmt_cid <- as.numeric(gsub('.numFmtId="(\\d+)".', '\\1', numFmt))
	cid_type <- rep(NA, length(numFmt_cid))
	formatCode <- gsub('.formatCode="(.)".*', '\\1', numFmt)
	pos <- grep('y\|m\|d', formatCode)
	if (length(pos) > 0)
	{
	date_format <- formatCode[grep('y\|m\|d', formatCode)]

	pos <- grep('h', date_format)
	if (length(pos) > 0)
	{
	date_format <- date_format[-pos]
	}
	pos <- cell_style$numFmtId %in% numFmt_cid[formatCode %in% date_format]
	cell_style$numFmtId[pos] <- 14
	}
	}
	worksheet_paths <- paste0(tempdir(), "/xl/worksheets/sheet",
	sheet_names$id, '.xml')
	worksheets <- as.list(NULL)
	for (i in seq(along = worksheet_paths))
	{
	tryCatch({
	sheet_data <- readLines(worksheet_paths[i], warn = FALSE)[2]
	sheet_data <- gsub('(.<sheetData>)(.)(</sheetData>.*)', '\\2', sheet_data)
	sheet_data <- strsplit(sheet_data, '</row>')[[1]]
	sheet_data <- sheet_data[grep('</c>', sheet_data)]
	if (length(sheet_data) == 0)
	{
	next
	}
	sheet_data <- strsplit(sheet_data, '</c>')
	sheet_data <- unlist(sheet_data)
	sheet_data <- gsub('(.<row.>)(<c.*)', '\\2', sheet_data)
	res <- NULL
	res$r <- gsub('.r="(\\w+\\d+)".', '\\1', sheet_data)
	res$v <- rep(NA, length(sheet_data))
	pos <- grep('.<v>(.)</v>.*', sheet_data)
	res$v[pos] <- gsub('.<v>(.)</v>.*', '\\1', sheet_data[pos])
	res$s <- rep(NA, length(sheet_data))
	pos <- grep('.* s="(\\d+\|\\w+)"( \|>).*', sheet_data)
	res$s[pos] <- gsub('.* s="(\\d+\|\\w+)"( \|>).*', '\\1', sheet_data[pos])
	res$t <- rep(NA, length(sheet_data))
	pos <- grep('.* t="(\\d+\|\\w+)"( \|>).*', sheet_data)
	res$t[pos] <- gsub('.* t="(\\d+\|\\w+)"( \|>).*', '\\1', sheet_data[pos])
	res <- as.data.frame(res, stringsAsFactors = FALSE)
	res$sheet <- sheet_names[sheet_names$id == i, 'name']
	entries_match <- entries[match(res$v, names(entries))]
	res$v[res$t == "s" & !is.na(res$t)] <-
	entries_match[res$t == "s"& !is.na(res$t)]
	res$cols <- match(gsub("\\d", "", res$r), LETTERS)
	res$rows <- as.numeric(gsub("\\D", "", res$r))

	nrow <- max(res$rows)
	ncol <- max(res$cols)
	if (header)
	{
	nrow <- nrow - 1
	}
	res_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol),
	stringsAsFactors = FALSE)
	style_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol),
	stringsAsFactors = FALSE)
	if (header)
	{
	header_df <- res[res$rows == 1,]
	header_name <- paste0('V', seq(ncol))
	header_name[header_df$cols] <- header_df$v
	names(res_df) <- header_name
	res <- res[res$rows != 1,]
	res$rows <- res$rows - 1
	}
	if (nrow(res) > 0)
	{
	res_df[as.matrix(res[,c('rows', 'cols')])] <- res$v
	s <- as.numeric(res$s)
	s <- cell_style$numFmtId[match(s, cell_style$id)]
	style_df[as.matrix(res[,c('rows', 'cols')])] <- s
	}

	style_df <- sapply(style_df, function(x)
	{
	ifelse(length(unique(x[!is.na(x)])) == 1, unique(x[!is.na(x)]), NA)
	})
	style_col <- rep('character', length(style_df))
	style_col[style_df %in% 14:17] <- "date"
	style_col[style_df %in% c(18:21, 45:47)] <- "time"
	style_col[style_df %in% 22] <- "datetime"
	style_col[is.na(style_df) & !sapply(res_df, function(x) any(grepl("\\D", x)))] <- "numeric"

	number2date <- function(x)
	{
	if (class(x) == 'character')
	{
	pos <- !(is.na(x)) & !(x == 'NA')
	} else
	{
	stop('NOT implemented to check class of column date in xlsxToR')
	}
	res <- rep(NA, length = length(x))
	res[pos] <- as.numeric(x[pos])
	res <- as.Date(res, origin = os_origin)
	res
	}
	res_df[] <- lapply(seq_along(res_df), function(j)
	{
	switch(style_col[j],
	character = res_df[,j],
	numeric = as.numeric(res_df[,j]),
	date = number2date(res_df[,j]),
	time = strftime(as.POSIXct(as.numeric(res_df[,j]), origin = os_origin), format = "%H:%M:%S"),
	datetime = as.POSIXct(as.numeric(res_df[,j]), origin = os_origin))
	})
	if (empty_row)
	{
	pos <- apply(res_df, 1, function(x) sum(is.na(x))) != ncol(res_df)
	if (ncol(res_df) == 1)
	{
	col_name <- names(res_df)
	res_df <- data.frame(res_df[pos,])
	names(res_df) <- col_name
	} else
	{
	res_df <- res_df[pos,]
	}
	}
	sheet_n <- sheet_names$name[sheet_names$id == i]
	worksheets[[sheet_n]] <- res_df
	}, warning = function(w)
	{
	print(style_col)
	stop(paste0('Warning messsage for sheet ', sheet_names$name[sheet_names$id == i], '\n', w))
	})
	}

	if(length(worksheets) == 1)
	{
	worksheets <- worksheets[[1]]
	}
	worksheets
	}