Skip to content

Instantly share code, notes, and snippets.

@avullo
Last active December 6, 2017 00:57
Show Gist options
  • Save avullo/b92c7f71ff01867e88b8ca7b8e90fb2b to your computer and use it in GitHub Desktop.
Save avullo/b92c7f71ff01867e88b8ca7b8e90fb2b to your computer and use it in GitHub Desktop.
R random sampling from (large) text files
## Randomly sample a given percentage of lines from every regular file in a
## directory and write the sampled lines to a single output file.
##
## Each input file is handed to sampleFile() with the same 'perc' and 'seed'.
## Sub-directories of 'dir' are skipped, and the output file itself is never
## used as an input (relevant when 'ofname' lives inside 'dir').
##
sampleFiles <- function(dir = './', ofname, perc = 1, append = TRUE, seed = 1234) {
  ## 'dir' is a character vector of length 1 representing the name of the directory
  ## 'ofname' is a character vector of length 1 indicating the name of the output file
  ## 'perc' is the percentage of lines to sample from each file; passed
  ##   unchanged to sampleFile()
  ## 'append' is a logical telling whether to append the sampled lines to an
  ##   existing output file (TRUE) or to start from an empty output file (FALSE)
  ## 'seed' is a number used to set the seed of the random number generator;
  ##   the same seed is passed to sampleFile() for every input file
  ## Return: NULL (invisibly)
  paths <- file.path(dir, list.files(dir))

  # list.files() also returns sub-directories, which cannot be sampled
  paths <- paths[!dir.exists(paths)]

  # never feed the output file back into itself
  out_path <- normalizePath(ofname, mustWork = FALSE)
  paths <- paths[normalizePath(paths, mustWork = FALSE) != out_path]

  if (length(paths) == 0L) {
    return(invisible(NULL))
  }

  # When not appending, empty the output once up front and append from then
  # on; otherwise each input file would overwrite the sample of the previous one.
  if (!append) {
    file.create(ofname)
  }

  for (path in paths) {
    sampleFile(path, ofname, perc = perc, append = TRUE, seed = seed)
  }
  invisible(NULL)
}
## Randomly sample a certain fraction of the lines of a file, and write them to an output file
##
## Adapted from https://stat.ethz.ch/pipermail/r-help/2007-February/124812.html
##
## NOTE
## The input is streamed in chunks through a single connection with readLines(),
## so blank lines are counted like any other line and the file is read only once.
## Sampled lines are written chunk by chunk; within a chunk they appear in the
## order in which they were drawn, not in file order.
## Calling this function resets the global random number generator via set.seed().
##
sampleFile <- function(ifname, ofname, perc = 1, append = TRUE, seed = 1234) {
  ## 'ifname' is a character vector of length 1 indicating the name of the file
  ## 'ofname' is a character vector of length 1 indicating the name of the output file
  ## 'perc' is a number between 0 and 100 indicating the fraction (in %)
  ##   of the input file to write to the output file
  ## 'append' is a logical telling whether to append the sampled fraction to the
  ##   output file (TRUE) or to replace its content (FALSE)
  ## 'seed' is a number used to set the seed of the random number generator
  ## Return: NULL (invisibly)
  stopifnot(
    is.numeric(perc), length(perc) == 1L, !is.na(perc),
    perc >= 0, perc <= 100
  )

  # numberOfLines() is defined outside this function; it is assumed to
  # return the number of lines of 'ifname' -- TODO confirm
  nlines <- numberOfLines(ifname)
  if (nlines < 1) {
    return(invisible(NULL))
  }

  # generate the random row numbers
  set.seed(seed)
  sel <- sample.int(nlines, floor(nlines * perc / 100))
  if (length(sel) == 0L) {
    return(invisible(NULL))
  }
  last_needed <- max(sel)

  # chunk size is a 9th of the number of lines, but never less than one line
  # (a zero chunk size would make it impossible to walk through small files)
  chunk_size <- max(1, floor(nlines / 9))

  con <- file(ifname, open = "r")
  on.exit(close(con), add = TRUE)

  offset <- 0          # number of lines consumed before the current chunk
  first_write <- TRUE  # only the very first write may overwrite the output
  repeat {
    chunk <- readLines(con, n = chunk_size, warn = FALSE)
    if (length(chunk) == 0L) {
      break
    }

    # translate the global row numbers into positions inside this chunk and
    # keep those that actually fall into it (the last chunk may be shorter)
    local_index <- sel - offset
    keep <- local_index[local_index > 0 & local_index <= length(chunk)]
    if (length(keep) > 0L) {
      # once something has been written, later chunks must always append,
      # otherwise append = FALSE would leave only the last chunk's lines
      write(chunk[keep], ofname, append = append || !first_write, sep = "\n")
      first_write <- FALSE
    }

    offset <- offset + length(chunk)
    if (offset >= last_needed) {
      # every selected row has been seen; no need to read the rest
      break
    }
  }
  invisible(NULL)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment