# BrianWeinstein/RemoveSparseTermsLarge.R

## RemoveSparseTermsLarge.R
# tm::removeSparseTerms attempts to remove sparse terms via slicing a sparse matrix.
# The slicing operation tries to convert the sparse matrix to a dense matrix, but this
# fails if the dense matrix has more than ((2^31) - 1) entries [i.e., if (nrow * ncol) > ((2^31) - 1)]
#
# The error message is
# In nr * nc : NAs produced by integer overflow
#
# Instead of using tm::removeSparseTerms, the following function subsets the sparse matrix directly
# and avoids converting the sparse matrix to a dense one.

library(tm)
library(slam)

# Remove sparse terms from a DocumentTermMatrix / TermDocumentMatrix by
# subsetting the underlying simple triplet matrix directly.
#
# Arguments:
#   x      - a DocumentTermMatrix or TermDocumentMatrix.
#   sparse - numeric strictly between 0 and 1. A term is kept when its number
#            of stored entries is greater than (number of documents) * (1 - sparse).
#
# Returns an object of the same kind as `x` (DTM in -> DTM out, TDM in -> TDM out)
# containing all documents and only the retained terms. The "weighting"
# attribute of `x` is carried over to the result.
#
# NOTE: this file assigns RemoveSparseTermsLarge a second time further down;
# when the file is sourced, the later assignment is the one that stays bound.
RemoveSparseTermsLarge <- function(x, sparse){

  stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
            is.numeric(sparse), sparse > 0, sparse < 1)

  # work on a documents-by-terms orientation, so that terms are the columns (j)
  isTdm <- inherits(x, "TermDocumentMatrix")
  mm <- if (isTdm) {
    t(x)
  } else {
    x
  }

  # count stored entries per term column; tabulate() works on the integer
  # column indices directly, so indices never round-trip through strings
  termCounts <- tabulate(mm$j, nbins = mm$ncol)
  termIndex <- which(termCounts > mm$nrow * (1 - sparse))
  keep <- mm$j %in% termIndex

  # subset the simple triplet matrix; match() renumbers the surviving columns
  # to 1..length(termIndex) in the same ascending order as termIndex, which is
  # also the order used for the term dimnames
  dtm.ns <- simple_triplet_matrix(i = mm$i[keep],
                                  j = match(mm$j[keep], termIndex),
                                  v = mm$v[keep],
                                  nrow = mm$nrow,
                                  ncol = length(termIndex),
                                  dimnames = list(mm$dimnames$Docs, mm$dimnames$Terms[termIndex]))

  # convert back to a DTM/TDM
  result <- if (isTdm) {
    as.TermDocumentMatrix(t(dtm.ns), weighting = weightTf)
  } else {
    as.DocumentTermMatrix(dtm.ns, weighting = weightTf)
  }

  # The values in `v` are copied unchanged, so the result has whatever
  # weighting `x` had. Restore that label instead of leaving the matrix
  # marked as term-frequency weighted.
  weighting <- attr(x, "weighting")
  if (!is.null(weighting)) {
    attr(result, "weighting") <- weighting
  }

  result

}
	# tm::removeSparseTerms attempts to remove sparse terms via slicing a sparse matrix.
	# The slicing operation tries to convert the sparse matrix to a dense matrix, but this
	# fails if the dense matrix has more than ((2^31) - 1) entries [i.e., if (nrow * ncol) > ((2^31) - 1)]
	#
	# The error message is
	# In nr * nc : NAs produced by integer overflow
	#
	# Instead of using tm::removeSparseTerms, the following function subsets the sparse matrix directly
	# and avoids converting the sparse matrix to a dense one.

	library(tm)
	library(slam)

	# Remove sparse terms from a DocumentTermMatrix / TermDocumentMatrix by
	# subsetting the underlying simple triplet matrix directly.
	#
	# Arguments:
	#   x      - a DocumentTermMatrix or TermDocumentMatrix.
	#   sparse - numeric strictly between 0 and 1. A term is kept when its number
	#            of stored entries is greater than (number of documents) * (1 - sparse).
	#
	# Returns an object of the same kind as `x` (DTM in -> DTM out, TDM in -> TDM out)
	# containing all documents and only the retained terms. The "weighting"
	# attribute of `x` is carried over to the result.
	#
	# NOTE: RemoveSparseTermsLarge is also assigned earlier in this file; when the
	# file is sourced, this later assignment is the one that stays bound.
	RemoveSparseTermsLarge <- function(x, sparse){

	  stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
	            is.numeric(sparse), sparse > 0, sparse < 1)

	  # work on a documents-by-terms orientation, so that terms are the columns (j)
	  isTdm <- inherits(x, "TermDocumentMatrix")
	  mm <- if (isTdm) {
	    t(x)
	  } else {
	    x
	  }

	  # count stored entries per term column; tabulate() works on the integer
	  # column indices directly, so indices never round-trip through strings
	  termCounts <- tabulate(mm$j, nbins = mm$ncol)
	  termIndex <- which(termCounts > mm$nrow * (1 - sparse))
	  keep <- mm$j %in% termIndex

	  # subset the simple triplet matrix; match() renumbers the surviving columns
	  # to 1..length(termIndex) in the same ascending order as termIndex, which is
	  # also the order used for the term dimnames
	  dtm.ns <- simple_triplet_matrix(i = mm$i[keep],
	                                  j = match(mm$j[keep], termIndex),
	                                  v = mm$v[keep],
	                                  nrow = mm$nrow,
	                                  ncol = length(termIndex),
	                                  dimnames = list(mm$dimnames$Docs, mm$dimnames$Terms[termIndex]))

	  # convert back to a DTM/TDM
	  result <- if (isTdm) {
	    as.TermDocumentMatrix(t(dtm.ns), weighting = weightTf)
	  } else {
	    as.DocumentTermMatrix(dtm.ns, weighting = weightTf)
	  }

	  # The values in `v` are copied unchanged, so the result has whatever
	  # weighting `x` had. Restore that label instead of leaving the matrix
	  # marked as term-frequency weighted.
	  weighting <- attr(x, "weighting")
	  if (!is.null(weighting)) {
	    attr(result, "weighting") <- weighting
	  }

	  result

	}