richfitz/read_other_peoples_excel.R

## read_other_peoples_excel.R
parse_col <- function(x) {
  d <- strsplit(x, NULL)
  f <- function(x) {
    sum(match(x, LETTERS) * 26^(seq_along(x) - 1L))
  }
  vapply(d, f, numeric(1))
}

inspect_xlsx <- function(path, sheet) {
  tmp <- tempfile()
  file <- sprintf("xl/worksheets/sheet%d.xml", 1L)
  sheet_path <- utils::unzip(path, file, exdir=tmp)
  on.exit(file.remove(sheet_path))

  xml <- xml2::read_xml(sheet_path)
  stopifnot(xml2::xml_name(xml) == "worksheet")

  ## https://github.com/hadley/xml2/issues/28
  ns <- xml2::xml_ns(xml)
  sheetData <- xml2::xml_find_one(xml, "d1:sheetData", ns)

  rows <- xml2::xml_find_all(sheetData, "d1:row", xml2::xml_ns(xml))

  f_row <- function(x) {
    lapply(xml2::xml_find_all(x, "d1:c", ns), f_cell)
  }
  f_cell <- function(y) {
    ## TODO: should check/filter for cells holding "" too?
    ##
    ## The 't' attribute is type.  Possible entries include:
    ##   n: numeric
    ##   s: string
    ##
    ## The "type" contains "v" for a value and "f" for a formula
    ## list(attrs=xml2::xml_attrs(y),
    ##      type=xml2::xml_name(xml2::xml_contents(y)))
    list(ref=xml2::xml_attr(y, "r"),
         s=xml2::xml_attr(y, "s"),
         t=xml2::xml_attr(y, "t"),
         is_formula=is_formula(y))
  }
  is_formula <- function(x) {
    !is.null(tryCatch(xml2::xml_find_one(x, "d1:f", ns),
                      error=function(x) NULL))
  }
  dat <- unlist(lapply(rows, f_row), FALSE)
  dat <- data.frame(ref=vapply(dat, "[[", character(1), "ref"),
                    s=vapply(dat, "[[", character(1), "s"),
                    t=vapply(dat, "[[", character(1), "t"),
                    is_formula=vapply(dat, "[[", logical(1), "is_formula"),
                    stringsAsFactors=FALSE)
  re <- "^([[:alpha:]]+)([0-9]+)$"
  dat$col_alpha <- sub(re, "\\1", dat$ref)
  dat$col <- parse_col(dat$col_alpha)
  dat$row <- as.integer(sub(re, "\\2", dat$ref))
  dat
}

show_cells <- function(x) {
  m <- matrix(".", max(dat$row), max(dat$col))
  m[cbind(dat$row, dat$col)] <- ifelse(dat$is_formula, "=", "o")
  message(paste(apply(m, 1, paste, collapse=""), collapse="\n"))
}

dat <- inspect_xlsx("terrible.xlsx", 1L)
show_cells(dat)

## ........
## ........
## ........
## ........
## ........
## ........
## ........
## .....ooo
## .....===
## .ooo....
## .ooo....
## .ooo....


## terrible.xlsx

      
    Raw
  

              terrible.xlsx
            
          
            View raw
	parse_col <- function(x) {
	d <- strsplit(x, NULL)
	f <- function(x) {
	sum(match(x, LETTERS) * 26^(seq_along(x) - 1L))
	}
	vapply(d, f, numeric(1))
	}

	inspect_xlsx <- function(path, sheet) {
	tmp <- tempfile()
	file <- sprintf("xl/worksheets/sheet%d.xml", 1L)
	sheet_path <- utils::unzip(path, file, exdir=tmp)
	on.exit(file.remove(sheet_path))

	xml <- xml2::read_xml(sheet_path)
	stopifnot(xml2::xml_name(xml) == "worksheet")

	## https://github.com/hadley/xml2/issues/28
	ns <- xml2::xml_ns(xml)
	sheetData <- xml2::xml_find_one(xml, "d1:sheetData", ns)

	rows <- xml2::xml_find_all(sheetData, "d1:row", xml2::xml_ns(xml))

	f_row <- function(x) {
	lapply(xml2::xml_find_all(x, "d1:c", ns), f_cell)
	}
	f_cell <- function(y) {
	## TODO: should check/filter for cells holding "" too?
	##
	## The 't' attribute is type. Possible entries include:
	## n: numeric
	## s: string
	##
	## The "type" contains "v" for a value and "f" for a formula
	## list(attrs=xml2::xml_attrs(y),
	## type=xml2::xml_name(xml2::xml_contents(y)))
	list(ref=xml2::xml_attr(y, "r"),
	s=xml2::xml_attr(y, "s"),
	t=xml2::xml_attr(y, "t"),
	is_formula=is_formula(y))
	}
	is_formula <- function(x) {
	!is.null(tryCatch(xml2::xml_find_one(x, "d1:f", ns),
	error=function(x) NULL))
	}
	dat <- unlist(lapply(rows, f_row), FALSE)
	dat <- data.frame(ref=vapply(dat, "[[", character(1), "ref"),
	s=vapply(dat, "[[", character(1), "s"),
	t=vapply(dat, "[[", character(1), "t"),
	is_formula=vapply(dat, "[[", logical(1), "is_formula"),
	stringsAsFactors=FALSE)
	re <- "^([[:alpha:]]+)([0-9]+)$"
	dat$col_alpha <- sub(re, "\\1", dat$ref)
	dat$col <- parse_col(dat$col_alpha)
	dat$row <- as.integer(sub(re, "\\2", dat$ref))
	dat
	}

	show_cells <- function(x) {
	m <- matrix(".", max(dat$row), max(dat$col))
	m[cbind(dat$row, dat$col)] <- ifelse(dat$is_formula, "=", "o")
	message(paste(apply(m, 1, paste, collapse=""), collapse="\n"))
	}

	dat <- inspect_xlsx("terrible.xlsx", 1L)
	show_cells(dat)

	## ........
	## ........
	## ........
	## ........
	## ........
	## ........
	## ........
	## .....ooo
	## .....===
	## .ooo....
	## .ooo....
	## .ooo....