# tonglu/fast_split.R

## fast_split.R
# Demo: per-key aggregation over a large data frame without split().
# Approach: sort the keys, run-length encode the sorted vector, then
# aggregate each contiguous run of equal keys.
v <- sample(1:1000, 50000000, replace = TRUE)
df <- data.frame(id = v)
# Original author's note (complexity claims unverified): split(df, df$id)
# was too slow at this size, whereas sort() followed by rle() is fast.
sid <- sort(df$id)
rlesid <- rle(sid)
tally <- 1 # position in `sid` where the current run starts
values <- numeric(length(rlesid$lengths))
indices <- numeric(length(rlesid$lengths))
for (i in seq_along(indices)) {
  indices[i] <- tally
  tally <- tally + rlesid$lengths[i]
  # Operation for each key: id.
  # Run positions refer to the SORTED vector, so aggregate over `sid`;
  # the rows of `df` are still in their original, unsorted order.
  values[i] <- sum(sid[seq(indices[i], tally - 1)])
}
# Label each result row with the key of its run, not the run's ordinal, so
# the output stays correct even if some key in 1:1000 was never sampled.
df2 <- data.frame(id = rlesid$values, value = values)
head(df2)
	# NOTE(review): this section repeats the script that precedes it; it
	# re-samples the data and overwrites v, df, sid, rlesid, tally, values,
	# indices and df2. Confirm whether the repetition is intentional.
	#
	# Demo: per-key aggregation over a large data frame without split().
	# Approach: sort the keys, run-length encode the sorted vector, then
	# aggregate each contiguous run of equal keys.
	v <- sample(1:1000, 50000000, replace = TRUE)
	df <- data.frame(id = v)
	# Original author's note (complexity claims unverified): split(df, df$id)
	# was too slow at this size, whereas sort() followed by rle() is fast.
	sid <- sort(df$id)
	rlesid <- rle(sid)
	tally <- 1 # position in `sid` where the current run starts
	values <- numeric(length(rlesid$lengths))
	indices <- numeric(length(rlesid$lengths))
	for (i in seq_along(indices)) {
		indices[i] <- tally
		tally <- tally + rlesid$lengths[i]
		# Operation for each key: id.
		# Run positions refer to the SORTED vector, so aggregate over `sid`;
		# the rows of `df` are still in their original, unsorted order.
		values[i] <- sum(sid[seq(indices[i], tally - 1)])
	}
	# Label each result row with the key of its run, not the run's ordinal,
	# so the output stays correct even if some key was never sampled.
	df2 <- data.frame(id = rlesid$values, value = values)
	head(df2)