@infotroph
Last active August 29, 2015 14:22
Surprisingly large speedup from pasting lines together
# Context: I have untidy CSVs that need some junk lines filtered out before they're even grid-shaped.
# I currently do the filtering with an external sed call,
# but wanted something that would work on any OS.
# In https://gist.github.com/infotroph/dd0faa5fd24bb78b4ff6
# I asked how to do the filtering from within R,
# and settled on readLines -> filter -> send filtered lines back to read.csv.
# This script doesn't filter anything;
# it just tests different ways of passing lines back into read.csv afterwards:
# as a vector of strings, as one single string with embedded newlines, or as a textConnection.
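# For reference, a minimal sketch of the readLines -> filter -> read.csv
# approach settled on in the earlier gist (not used in the benchmark below).
# The "^#" junk-line pattern is a placeholder assumption for illustration,
# not the marker the real files use.
filtered_read = function(path, junk_pattern="^#"){
  x = readLines(path)                          # slurp raw lines
  x = x[!grepl(junk_pattern, x)]               # drop junk lines
  read.csv(text=paste(x, collapse="\n"))       # reassemble and parse
}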
library(microbenchmark)
library(readr)
TMPFILE = "readtime_tmp.csv"
NROWS = 1e3 # Write/read this many rows to/from temp file.
NTIMES = 5 # Run the benchmark this many times.
# N.B. Keep these both very small at first!
# On my machine even 5000 lines make f_strvec() and f_con() take >30 sec per read.
# Write some fake data to temp file.
# Here I'm using 1 column of character data, then 104 columns of doubles.
cat( # headers
  paste(c("id", rep(letters, 4)), collapse=","),
  sep="\n",
  file=TMPFILE)
invisible(replicate( # data
  NROWS,
  cat(
    paste(c(sample(letters, 1), rnorm(26*4)), collapse=","),
    sep="\n",
    file=TMPFILE,
    append=TRUE)))
colClasses = c("character", rep("numeric", 26*4))
readr_classes = paste(c("c", rep("d", 26*4)), collapse="")
# Baseline for comparison: Standard call to read.csv
f_direct = function(){
  return(read.csv(TMPFILE, colClasses=colClasses))
}
# File contents as vector of character strings
f_strvec = function(){
  x = readLines(TMPFILE)
  return(read.csv(text=x, colClasses=colClasses))
}
# File contents glued into one single string before passing
f_onestr = function(){
  x = readLines(TMPFILE)
  return(read.csv(
    text=do.call("paste", c(as.list(x), sep="\n")),
    colClasses=colClasses))
}
# Vector of strings passed as a TextConnection
f_con = function(){
  x = readLines(TMPFILE)
  xc = textConnection(x)
  on.exit(close(xc))
  return(read.csv(xc, colClasses=colClasses))
}
# Hadleyfied readlines, base read.csv
f_readr_lines = function(){
  x = read_lines(TMPFILE)
  return(read.csv(
    text=do.call("paste", c(as.list(x), sep="\n")),
    colClasses=colClasses))
}
# Both the line reading and the CSV parsing Hadleyfied.
# N.B. On the first several tries, this threw
# "C stack usage is too close to the limit",
# but I haven't been able to reproduce that since.
# I wouldn't count on this function working for very large files.
f_readr_both = function(){
  x = read_lines(TMPFILE)
  return(read_csv(
    file=do.call("paste", c(as.list(x), sep="\n")),
    col_types=readr_classes))
}
cat(paste("Reading", NROWS, "lines.\n"))
a=f_direct()
b=f_strvec()
c=f_con()
d=f_onestr()
e=f_readr_lines()
f=f_readr_both()
# read_csv()'s default column names aren't the same as read.csv()'s.
# Let's make them match for comparison.
names(e) = make.names(names(e), unique=TRUE)
names(f) = make.names(names(f), unique=TRUE)
class(e) = "data.frame"
class(f) = "data.frame"
# Did all methods return the same dataframe?
stopifnot(
  isTRUE(all.equal(a, b)),
  isTRUE(all.equal(a, c)),
  isTRUE(all.equal(a, d)),
  isTRUE(all.equal(a, e)),
  isTRUE(all.equal(a, f)))
rm(list=c("a", "b", "c", "d", "e", "f"))
print(microbenchmark(
  a=f_direct(),
  b=f_strvec(),
  c=f_con(),
  d=f_onestr(),
  e=f_readr_lines(),
  f=f_readr_both(),
  times=NTIMES))
@infotroph (Author)
Here's the output with 1e4 rows evaluated 10 times on my 2012 MBP:

> time Rscript readtime.R
Reading 10000 lines.
Unit: milliseconds
 expr         min          lq        mean      median          uq         max    neval
    a    749.2168    750.6204    760.2449    752.4449    759.4231    792.3373       10
    b 141454.0816 141491.8332 141852.5027 141706.8330 142147.1095 142606.7624       10
    c 141413.9322 141945.9920 142105.5877 142116.4060 142433.9226 142583.7149       10
    d   1034.6593   1039.3004   1056.0332   1053.8777   1075.7824   1078.0555       10
    e    721.2237    722.6702    731.6610    727.3633    731.9731    755.1943       10
    f    329.5273    334.7381    345.9729    338.1664    362.8511    380.9288       10
 Rscript readtime.R  3158.11s user 2.74s system 100% cpu 52:40.80 total

Note the spectacularly long read times for expressions b and c, both of which pass a vector of lines rather than one long string with embedded newlines. Surprisingly (to me), the slow operations are CPU-intensive and not memory-bound at all.

For my purposes, pasting all the text into one string is the clear winner, and it can be made even faster than read.csv by switching to the readr functions. But I'd still appreciate insight: why is the vector approach so slow even when passed through textConnection()? Will this be true in general, or is there something pathological about this example?
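A side note for anyone adapting this: the `do.call("paste", c(as.list(x), sep="\n"))` idiom used in the script is, as far as I can tell, equivalent to the simpler `paste(x, collapse="\n")`, which avoids building a one-argument-per-line call. A minimal check of the equivalence:

```r
# The do.call/paste idiom from the script vs. the idiomatic collapse form.
# Both glue a character vector into one newline-separated string.
x = c("id,a,b", "x,1,2", "y,3,4")
via_docall   = do.call("paste", c(as.list(x), sep="\n"))
via_collapse = paste(x, collapse="\n")
stopifnot(identical(via_docall, via_collapse))
```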

@HenrikBengtsson
bpbond commented Jun 9, 2015

Wow, interesting, and good to know.
