Skip to content

Instantly share code, notes, and snippets.

@fredbenenson
Last active August 29, 2015 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fredbenenson/2d72f23eb035bcf101c3 to your computer and use it in GitHub Desktop.
Save fredbenenson/2d72f23eb035bcf101c3 to your computer and use it in GitHub Desktop.
Data Frame Indexing Bug
# This is a reduction which seems to indicate an issue when
# adding a column using an arbitrary set of indexes.
# First, let's create a data-frame with some random values:
s <- data.frame(x = runif(10), y = runif(10))
# Now, two randomly generated lists of numbers that we'll use to try to index
# This could be created thusly:
# wrong <- sample(1:nrow(s), nrow(s) * 0.8), etc.
wrong <- c(3, 6, 7, 5, 1, 2, 9, 8)
right <- c(4, 10, 2, 6, 1, 5, 9, 3)
# Indexing the df with the vector without 10 (the length of the data.frame s)
# generates an error when creating a new column:
s$z[wrong] <- 1
> Error in `$<-.data.frame`(`*tmp*`, "z", value = c(1, 1, 1, NA, 1, 1, 1, :
replacement has 9 rows, data has 10
# Whereas using the vector *with* 10 does work when creating a new column:
s$z[right] <- 1
> s
x y z
1 0.28089503 0.60456879 1
2 0.37238938 0.79486150 1
3 0.76933910 0.11040848 1
4 0.90518539 0.09314296 1
5 0.62716400 0.81500552 1
6 0.64327678 0.58516840 1
7 0.05474681 0.23425909 NA
8 0.83767040 0.73871715 NA
9 0.01475207 0.23033121 1
10 0.18314193 0.10045413 1
@certifiedwaif
Copy link

> `[<-.data.frame`
function (x, i, j, value) 
{
    if (!all(names(sys.call()) %in% c("", "value"))) 
        warning("named arguments are discouraged")
    nA <- nargs()
    if (nA == 4L) {
        has.i <- !missing(i)
        has.j <- !missing(j)
    }
    else if (nA == 3L) {
        if (is.atomic(value) && !is.null(names(value))) 
            names(value) <- NULL
        if (missing(i) && missing(j)) {
            i <- j <- NULL
            has.i <- has.j <- FALSE
            if (is.null(value)) 
                return(x[logical()])
        }
        else {
            if (is.numeric(i) && is.matrix(i) && ncol(i) == 2) {
                index <- rep.int(FALSE, prod(dim(x)))
                dim(index) <- dim(x)
                tryCatch(index[i] <- TRUE, error = function(e) stop(conditionMessage(e), 
                  call. = FALSE))
                o <- order(i[, 2], i[, 1])
                N <- length(value)
                if (length(o)%%N != 0L) 
                  warning("number of items to replace is not a multiple of replacement length")
                if (N < length(o)) 
                  value <- rep(value, length.out = length(o))
                value <- value[o]
                i <- index
            }
            if (is.logical(i) && is.matrix(i) && all(dim(i) == 
                dim(x))) {
                nreplace <- sum(i, na.rm = TRUE)
                if (!nreplace) 
                  return(x)
                N <- length(value)
                if (N > 1L && N < nreplace && (nreplace%%N) == 
                  0L) 
                  value <- rep(value, length.out = nreplace)
                if (N > 1L && (length(value) != nreplace)) 
                  stop("'value' is the wrong length")
                n <- 0L
                nv <- nrow(x)
                for (v in seq_len(dim(i)[2L])) {
                  thisvar <- i[, v, drop = TRUE]
                  nv <- sum(thisvar, na.rm = TRUE)
                  if (nv) {
                    if (is.matrix(x[[v]])) 
                      x[[v]][thisvar, ] <- if (N > 1L) 
                        value[n + seq_len(nv)]
                      else value
                    else x[[v]][thisvar] <- if (N > 1L) 
                      value[n + seq_len(nv)]
                    else value
                  }
                  n <- n + nv
                }
                return(x)
            }
            if (is.matrix(i)) 
                stop("unsupported matrix index in replacement")
            j <- i
            i <- NULL
            has.i <- FALSE
            has.j <- TRUE
        }
    }
    else {
        stop("need 0, 1, or 2 subscripts")
    }
    if (has.j && length(j) == 0L) 
        return(x)
    cl <- oldClass(x)
    class(x) <- NULL
    new.cols <- NULL
    nvars <- length(x)
    nrows <- .row_names_info(x, 2L)
    if (has.i && length(i)) {
        rows <- NULL
        if (anyNA(i)) 
            stop("missing values are not allowed in subscripted assignments of data frames")
        if (char.i <- is.character(i)) {
            rows <- attr(x, "row.names")
            ii <- match(i, rows)
            nextra <- sum(new.rows <- is.na(ii))
            if (nextra > 0L) {
                ii[new.rows] <- seq.int(from = nrows + 1L, length.out = nextra)
                new.rows <- i[new.rows]
            }
            i <- ii
        }
        if (all(i >= 0L) && (nn <- max(i)) > nrows) {
            if (is.null(rows)) 
                rows <- attr(x, "row.names")
            if (!char.i) {
                nrr <- (nrows + 1L):nn
                if (inherits(value, "data.frame") && (dim(value)[1L]) >= 
                  length(nrr)) {
                  new.rows <- attr(value, "row.names")[seq_along(nrr)]
                  repl <- duplicated(new.rows) | match(new.rows, 
                    rows, 0L)
                  if (any(repl)) 
                    new.rows[repl] <- nrr[repl]
                }
                else new.rows <- nrr
            }
            x <- xpdrows.data.frame(x, rows, new.rows)
            rows <- attr(x, "row.names")
            nrows <- length(rows)
        }
        iseq <- seq_len(nrows)[i]
        if (anyNA(iseq)) 
            stop("non-existent rows not allowed")
    }
    else iseq <- NULL
    if (has.j) {
        if (anyNA(j)) 
            stop("missing values are not allowed in subscripted assignments of data frames")
        if (is.character(j)) {
            if ("" %in% j) 
                stop("column name \"\" cannot match any column")
            jj <- match(j, names(x))
            nnew <- sum(is.na(jj))
            if (nnew > 0L) {
                n <- is.na(jj)
                jj[n] <- nvars + seq_len(nnew)
                new.cols <- j[n]
            }
            jseq <- jj
        }
        else if (is.logical(j) || min(j) < 0L) 
            jseq <- seq_along(x)[j]
        else {
            jseq <- j
            if (max(jseq) > nvars) {
                new.cols <- paste0("V", seq.int(from = nvars + 
                  1L, to = max(jseq)))
                if (length(new.cols) != sum(jseq > nvars)) 
                  stop("new columns would leave holes after existing columns")
                if (is.list(value) && !is.null(vnm <- names(value))) {
                  p <- length(jseq)
                  if (length(vnm) < p) 
                    vnm <- rep_len(vnm, p)
                  new.cols <- vnm[jseq > nvars]
                }
            }
        }
    }
    else jseq <- seq_along(x)
    if (anyDuplicated(jseq)) 
        stop("duplicate subscripts for columns")
    n <- length(iseq)
    if (n == 0L) 
        n <- nrows
    p <- length(jseq)
    m <- length(value)
    if (!is.list(value)) {
        if (p == 1L) {
            N <- NROW(value)
            if (N > n) 
                stop(sprintf(ngettext(N, "replacement has %d row, data has %d", 
                  "replacement has %d rows, data has %d"), N, 
                  n), domain = NA)
            if (N < n && N > 0L) 
                if (n%%N == 0L && length(dim(value)) <= 1L) 
                  value <- rep(value, length.out = n)
                else stop(sprintf(ngettext(N, "replacement has %d row, data has %d", 
                  "replacement has %d rows, data has %d"), N, 
                  nrows), domain = NA)
            if (!is.null(names(value))) 
                names(value) <- NULL
            value <- list(value)
        }
        else {
            if (m < n * p && (m == 0L || (n * p)%%m)) 
                stop(sprintf(ngettext(m, "replacement has %d item, need %d", 
                  "replacement has %d items, need %d"), m, n * 
                  p), domain = NA)
            value <- matrix(value, n, p)
            value <- split(value, col(value))
        }
        dimv <- c(n, p)
    }
    else {
        value <- unclass(value)
        lens <- vapply(value, NROW, 1L)
        for (k in seq_along(lens)) {
            N <- lens[k]
            if (n != N && length(dim(value[[k]])) == 2L) 
                stop(sprintf(ngettext(N, "replacement element %d is a matrix/data frame of %d row, need %d", 
                  "replacement element %d is a matrix/data frame of %d rows, need %d"), 
                  k, N, n), domain = NA)
            if (N > 0L && N < n && n%%N) 
                stop(sprintf(ngettext(N, "replacement element %d has %d row, need %d", 
                  "replacement element %d has %d rows, need %d"), 
                  k, N, n), domain = NA)
            if (N > 0L && N < n) 
                value[[k]] <- rep(value[[k]], length.out = n)
            if (N > n) {
                warning(sprintf(ngettext(N, "replacement element %d has %d row to replace %d rows", 
                  "replacement element %d has %d rows to replace %d rows"), 
                  k, N, n), domain = NA)
                value[[k]] <- value[[k]][seq_len(n)]
            }
        }
        dimv <- c(n, length(value))
    }
    nrowv <- dimv[1L]
    if (nrowv < n && nrowv > 0L) {
        if (n%%nrowv == 0L) 
            value <- value[rep_len(seq_len(nrowv), n), , drop = FALSE]
        else stop(sprintf(ngettext(nrowv, "%d row in value to replace %d rows", 
            "%d rows in value to replace %d rows"), nrowv, n), 
            domain = NA)
    }
    else if (nrowv > n) 
        warning(sprintf(ngettext(nrowv, "replacement data has %d row to replace %d rows", 
            "replacement data has %d rows to replace %d rows"), 
            nrowv, n), domain = NA)
    ncolv <- dimv[2L]
    jvseq <- seq_len(p)
    if (ncolv < p) 
        jvseq <- rep_len(seq_len(ncolv), p)
    else if (ncolv > p) {
        warning(sprintf(ngettext(ncolv, "provided %d variable to replace %d variables", 
            "provided %d variables to replace %d variables"), 
            ncolv, p), domain = NA)
        new.cols <- new.cols[seq_len(p)]
    }
    if (length(new.cols)) {
        nm <- names(x)
        rows <- .row_names_info(x, 0L)
        a <- attributes(x)
        a["names"] <- NULL
        x <- c(x, vector("list", length(new.cols)))
        attributes(x) <- a
        names(x) <- c(nm, new.cols)
        attr(x, "row.names") <- rows
    }
    if (has.i) 
        for (jjj in seq_len(p)) {
            jj <- jseq[jjj]
            vjj <- value[[jvseq[[jjj]]]]
            if (jj <= nvars) {
                if (length(dim(x[[jj]])) != 2L) 
                  x[[jj]][iseq] <- vjj
                else x[[jj]][iseq, ] <- vjj
            }
            else {
                x[[jj]] <- vjj[FALSE]
                if (length(dim(vjj)) == 2L) {
                  length(x[[j]]) <- nrows * ncol(vjj)
                  dim(x[[j]]) <- c(nrows, ncol(vjj))
                  x[[jj]][iseq, ] <- vjj
                }
                else {
                  length(x[[j]]) <- nrows
                  x[[jj]][iseq] <- vjj
                }
            }
        }
    else if (p > 0L) 
        for (jjj in p:1L) {
            o <- order(jseq)
            jseq <- jseq[o]
            jvseq <- jvseq[o]
            jj <- jseq[jjj]
            v <- value[[jvseq[[jjj]]]]
            if (nrows > 0L && !length(v)) 
                length(v) <- nrows
            x[[jj]] <- v
            if (!is.null(v) && is.atomic(x[[jj]]) && !is.null(names(x[[jj]]))) 
                names(x[[jj]]) <- NULL
        }
    if (length(new.cols) > 0L) {
        new.cols <- names(x)
        if (anyDuplicated(new.cols)) 
            names(x) <- make.unique(new.cols)
    }
    class(x) <- cl
    x
}
<bytecode: 0x1071c6588>
<environment: namespace:base>```

That's a lot more code than I was expecting!

@certifiedwaif
Copy link

Strangely, when I define a function called f with the same content, and then call it with the arguments
f(wrong, z, 1)
the function works as expected, and returns the data frame

       x         y  z

1 0.6707523 0.6788389 1
2 0.7418809 0.4239940 1
3 0.5061432 0.2350612 1
4 0.3961475 0.8396474 NA
5 0.2268333 0.8911401 1
6 0.5864661 0.6671142 1
7 0.2140100 0.7586803 1
8 0.9915062 0.9027944 1
9 0.4577299 0.9190519 1
10 0.3907814 0.6885121 NA

Weird!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment