## avullo/sample_files_from_directory.R

## sample_files_from_directory.R
## Randomly sample a given percentage of lines from every regular file in a
## directory and write them out to a single output file
##
sampleFiles <- function(dir = './', ofname, perc = 1, append = TRUE, seed = 1234) {
  ## 'dir' is a character vector of length 1 representing the name of the directory

  ## 'ofname' is a character vector of length 1 indicating the name of the output file

  ## 'perc' is a number indicating the fraction (in %) of each input file to
  ## write to the output file; it is passed on to sampleFile() unchanged

  ## 'append' is a logical telling whether to append the sampled fraction to the
  ## output file (TRUE) or to replace its previous content (FALSE)

  ## 'seed' is a number used to set the seed of the random number generator;
  ## the same seed is passed to sampleFile() for every input file

  ## Return: NULL (invisibly)

  # list the candidates before the output file is (re)created, so that a
  # brand new output file located inside 'dir' is never picked up as input
  fnames <- list.files(dir)

  # honour append = FALSE exactly once, up front; forwarding it to every
  # sampleFile() call would make each input file overwrite the previous sample
  if (!append && !file.create(ofname)) {
    stop("cannot create output file '", ofname, "'", call. = FALSE)
  }
  out_path <- normalizePath(ofname, mustWork = FALSE)

  for (fname in fnames) {
    path <- paste(dir, fname, sep = "/")

    # list.files() also returns sub-directories, which cannot be sampled
    if (dir.exists(path)) next

    # never feed the output file back into itself
    if (identical(normalizePath(path, mustWork = FALSE), out_path)) next

    sampleFile(path, ofname, perc = perc, append = TRUE, seed = seed)
  }

  invisible(NULL)
}

## sample_large_file.R
## Randomly sample a certain fraction of the lines of a file, and write them to an output file
##
## Adapted from https://stat.ethz.ch/pipermail/r-help/2007-February/124812.html
##
## NOTE
##   the input is read once, sequentially, through a single connection, so
##   empty lines keep their position and are sampled like any other line
##
sampleFile <- function(ifname, ofname, perc = 1, append = TRUE, seed = 1234) {
  ## 'ifname' is a character vector of length 1 indicating the name of the file

  ## 'ofname' is a character vector of length 1 indicating the name of the output file

  ## 'perc' is a number between 0 and 100 indicating the fraction (in %)
  ## of the input file to write to the output file

  ## 'append' is a logical telling whether to append the sampled fraction to the
  ## output file (TRUE) or to replace its previous content (FALSE)

  ## 'seed' is a number used to set the seed of the random number generator
  ## (set.seed() is called, so the global RNG state is reset)

  ## Return: NULL (invisibly)

  stopifnot(is.numeric(perc), length(perc) == 1, perc >= 0, perc <= 100)

  # numberOfLines() is defined outside this function; it is assumed to return
  # the number of physical lines of 'ifname' -- TODO confirm
  nlines <- numberOfLines(ifname)

  # generate the random row values; sample.int() draws the same rows as
  # sample(1:nlines, ...) but is also safe when nlines is 0
  set.seed(seed)
  sel <- sample.int(nlines, floor(nlines * perc / 100))

  # open the output once: "w" truncates a single time for append = FALSE,
  # afterwards every chunk is written through the same connection
  out <- file(ofname, open = if (append) "a" else "w")
  on.exit(close(out), add = TRUE)

  if (length(sel) == 0) {
    return(invisible(NULL))
  }

  # chunk size is a 9th of the number of lines, but never 0 (small files)
  chunk_size <- max(1, floor(nlines / 9))

  input <- file(ifname, open = "r")
  on.exit(close(input), add = TRUE)

  # walk through the file chunk by chunk, including the last partial chunk
  start <- 0
  while (start < nlines) {
    chunk <- readLines(input, n = chunk_size, warn = FALSE)
    if (length(chunk) == 0) {
      # the file holds fewer lines than numberOfLines() reported
      break
    }

    # positions, relative to this chunk, of the sampled rows it contains;
    # bounded by the lines actually read so that no NA row is ever written
    chunk_index <- sel - start
    chunk_index <- chunk_index[chunk_index > 0 & chunk_index <= length(chunk)]
    if (length(chunk_index) > 0) {
      writeLines(chunk[chunk_index], out)
    }

    start <- start + chunk_size
  }

  invisible(NULL)
}
	## Randomly sample a given percentage of lines from every regular file in a
	## directory and write them out to a single output file
	##
	sampleFiles <- function(dir = './', ofname, perc = 1, append = TRUE, seed = 1234) {
	  ## 'dir' is a character vector of length 1 representing the name of the directory

	  ## 'ofname' is a character vector of length 1 indicating the name of the output file

	  ## 'perc' is a number indicating the fraction (in %) of each input file to
	  ## write to the output file; it is passed on to sampleFile() unchanged

	  ## 'append' is a logical telling whether to append the sampled fraction to the
	  ## output file (TRUE) or to replace its previous content (FALSE)

	  ## 'seed' is a number used to set the seed of the random number generator;
	  ## the same seed is passed to sampleFile() for every input file

	  ## Return: NULL (invisibly)

	  # list the candidates before the output file is (re)created, so that a
	  # brand new output file located inside 'dir' is never picked up as input
	  fnames <- list.files(dir)

	  # honour append = FALSE exactly once, up front; forwarding it to every
	  # sampleFile() call would make each input file overwrite the previous sample
	  if (!append && !file.create(ofname)) {
	    stop("cannot create output file '", ofname, "'", call. = FALSE)
	  }
	  out_path <- normalizePath(ofname, mustWork = FALSE)

	  for (fname in fnames) {
	    path <- paste(dir, fname, sep = "/")

	    # list.files() also returns sub-directories, which cannot be sampled
	    if (dir.exists(path)) next

	    # never feed the output file back into itself
	    if (identical(normalizePath(path, mustWork = FALSE), out_path)) next

	    sampleFile(path, ofname, perc = perc, append = TRUE, seed = seed)
	  }

	  invisible(NULL)
	}
	## Randomly sample a certain fraction of the lines of a file, and write them to an output file
	##
	## Adapted from https://stat.ethz.ch/pipermail/r-help/2007-February/124812.html
	##
	## NOTE
	## the input is read once, sequentially, through a single connection, so
	## empty lines keep their position and are sampled like any other line
	##
	sampleFile <- function(ifname, ofname, perc = 1, append = TRUE, seed = 1234) {
	  ## 'ifname' is a character vector of length 1 indicating the name of the file

	  ## 'ofname' is a character vector of length 1 indicating the name of the output file

	  ## 'perc' is a number between 0 and 100 indicating the fraction (in %)
	  ## of the input file to write to the output file

	  ## 'append' is a logical telling whether to append the sampled fraction to the
	  ## output file (TRUE) or to replace its previous content (FALSE)

	  ## 'seed' is a number used to set the seed of the random number generator
	  ## (set.seed() is called, so the global RNG state is reset)

	  ## Return: NULL (invisibly)

	  stopifnot(is.numeric(perc), length(perc) == 1, perc >= 0, perc <= 100)

	  # numberOfLines() is defined outside this function; it is assumed to return
	  # the number of physical lines of 'ifname' -- TODO confirm
	  nlines <- numberOfLines(ifname)

	  # generate the random row values; sample.int() draws the same rows as
	  # sample(1:nlines, ...) but is also safe when nlines is 0
	  set.seed(seed)
	  sel <- sample.int(nlines, floor(nlines * perc / 100))

	  # open the output once: "w" truncates a single time for append = FALSE,
	  # afterwards every chunk is written through the same connection
	  out <- file(ofname, open = if (append) "a" else "w")
	  on.exit(close(out), add = TRUE)

	  if (length(sel) == 0) {
	    return(invisible(NULL))
	  }

	  # chunk size is a 9th of the number of lines, but never 0 (small files)
	  chunk_size <- max(1, floor(nlines / 9))

	  input <- file(ifname, open = "r")
	  on.exit(close(input), add = TRUE)

	  # walk through the file chunk by chunk, including the last partial chunk
	  start <- 0
	  while (start < nlines) {
	    chunk <- readLines(input, n = chunk_size, warn = FALSE)
	    if (length(chunk) == 0) {
	      # the file holds fewer lines than numberOfLines() reported
	      break
	    }

	    # positions, relative to this chunk, of the sampled rows it contains;
	    # bounded by the lines actually read so that no NA row is ever written
	    chunk_index <- sel - start
	    chunk_index <- chunk_index[chunk_index > 0 & chunk_index <= length(chunk)]
	    if (length(chunk_index) > 0) {
	      writeLines(chunk[chunk_index], out)
	    }

	    start <- start + chunk_size
	  }

	  invisible(NULL)
	}