
@infotroph
Last active August 29, 2015 14:22
filter short lines while reading CSV
# My input files have short header lines, then CSV data, then short footer lines.
# I'm currently trimming the short lines with an external call to sed,
# but I want a pure-R solution for portability.
# This version works nicely on small examples but gets very slow on large files,
# because append() grows the list, triggering a memory reallocation, for every line.
# Suggestions for speed improvement requested.
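# Hypothetical sketch of the input shape (not actual data):
#   <short metadata line>
#   <short metadata line>
#   col1,col2,col3,...            <- long CSV header row (> 65 characters)
#   1.2,3.4,5.6,...               <- long CSV data rows (> 65 characters)
#   -----BEGIN PGP SIGNATURE----- <- short footer lines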
read.longline = function(file){
    f = file(file, "r")
    lines = list()
    repeat{ # read short headers & discard
        l = readLines(f, n=1)
        if(length(l) == 0){
            break # hit EOF before finding any data rows
        }
        if(nchar(l) > 65){
            # We've found the first data row.
            # Leave it on the stack to process in the next loop.
            pushBack(l, f)
            break
        }
    }
    repeat{ # read long lines, add to CSV, break when short lines start again
        l = readLines(f, n=1)
        if(length(l) > 0 && nchar(l) > 65){
            # Naive implementation!
            # Likely to be VERY slow because we're growing lines every time.
            lines = append(lines, l)
        }else{
            # Either we've hit a short line == beginning of the PGP block,
            # or an empty read == end of the file.
            # Either way we're done.
            break
        }
    }
    close(f)
    # Now stitch lines together into a dataframe
    txtdat = do.call("paste", c(lines, sep="\n"))
    return(read.csv(text=txtdat, stringsAsFactors=FALSE))
}
@infotroph (Author)

Best suggestion so far: scan the file twice, once to find start and end lines and once with read.csv at those offsets:

read.twice = function(file){
    f = file(file, "r")
    skip = NA
    last = NA
    cur = 0
    repeat{
        l = readLines(f, n=1)
        if(length(l) > 0){
            cur = cur + 1
            if(nchar(l) > 65 && is.na(skip)){
                skip = cur - 1
            }
            if(nchar(l) <= 65 && !is.na(skip)){
                last = cur
                break
            }
        }else{
            break
        }
    }
    close(f)
    if(is.na(skip)){ skip = 0 }
    if(is.na(last)){ last = cur + 1 } # no footer found: pretend one sits just past EOF
    # Data rows run from line skip+2 (after the CSV header row) to line last-1.
    return(read.csv(file, skip=skip, nrows=(last - skip - 2)))
}

This is still about 2x slower than read.csv() on a clean version of the same data, so I suspect (without having profiled it) that essentially all of this version's remaining overhead is in file reading.
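(For anyone who wants to reproduce the comparison, a minimal timing sketch; the file names are hypothetical, and clean.csv stands for a copy of the data with the short lines already stripped:)

system.time(a <- read.twice("raw.csv"))   # raw.csv: short headers + CSV + short footers
system.time(b <- read.csv("clean.csv"))   # clean.csv: the same CSV with no extra lines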

@bpbond commented Jun 1, 2015

What about reading the file into a vector of strings, then identifying the first data line? For example (this is off the top of my head, I haven't run it):

fileContents <- readLines(f)
firstDataLine <- min(which(nchar(fileContents) > 65))
return(read.csv(connection(fileContents), skip=firstDataLine-1))

@infotroph (Author)

Is connection() part of a package I don't have, or pseudocode?

@infotroph (Author)

This seems to get me within a few percent of read.csv instead of multiples of it:

read.longlines = function(file){
    f = file(file, "r")
    f_lines = readLines(f)
    wanted = which(nchar(f_lines) > 65)
    firstData = min(wanted)
    lastData = max(wanted)
    return(read.csv(
        text=do.call(
            what="paste", 
            args=c(as.list(f_lines[firstData:lastData]), sep="\n")), 
        stringsAsFactors=FALSE))
}   

It runs in under about a minute on a 150,000-line test file, which isn't blazing but is probably good enough for my current purposes. Thanks!
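(A quick way to sanity-check the filtered read against a hand-cleaned copy of the same data; the file names here are hypothetical:)

a <- read.longlines("raw.csv")
b <- read.csv("clean.csv", stringsAsFactors=FALSE)
identical(a, b)  # should be TRUE if the 65-character cutoff catches exactly the data rows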

@bpbond commented Jun 2, 2015

You're welcome. Sorry, I meant textConnection. Here's some revised code--this time, I actually tested it!

fileContents <- readLines("test.txt")
firstDataLine <- min(which(nchar(fileContents) >= MIN_DATA_LINE_LENGTH))
lastDataLine <- max(which(nchar(fileContents) >= MIN_DATA_LINE_LENGTH))
con <- textConnection(fileContents)
read.table(con, skip=firstDataLine-1, nrows=lastDataLine-firstDataLine,sep=",",header=TRUE)

@infotroph (Author)

It took me a while to convince myself, but it looks as if pasting the lines together first is significantly faster than (at least this variant of) using a textConnection: https://gist.github.com/infotroph/cec9a9fb0158530d817f is a self-contained demo comparing the speed of different reading approaches with no filtering.

On my machine, a few thousand lines take milliseconds with a pasted string and minutes with a vector of lines, textConnection or not. What especially surprises me is that both approaches have a similar memory footprint, and the vector read seems to be entirely CPU-bound. No idea what it spends all those cycles on...

I'm still not sure whether this is general or whether my test case is pathological, but bottom line: if you came here looking to pass a vector of lines through a textConnection into read.csv, try pasting them all together into one string with embedded newlines and passing that to readr::read_csv instead; you may get a speedup.
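A rough sketch of that pattern (the file name is hypothetical, and depending on your readr version you may need to wrap the pasted string in I() to mark it as literal data):

library(readr)
f_lines <- readLines("raw.csv")
keep <- which(nchar(f_lines) > 65)
txt <- paste(f_lines[min(keep):max(keep)], collapse="\n")
dat <- read_csv(txt)  # readr reads a string with embedded newlines as literal data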
