Skip to content

Instantly share code, notes, and snippets.

@tonglu
Last active August 29, 2015 14:22
Show Gist options
  • Save tonglu/86ec2f8bb20a01dc7555 to your computer and use it in GitHub Desktop.
Save tonglu/86ec2f8bb20a01dc7555 to your computer and use it in GitHub Desktop.
# Group-wise aggregation without split(): sort the keys, run-length encode the
# sorted vector, then process each run of equal keys as one contiguous slice.

# Demo data: 50 million ids drawn with replacement from 1..1000.
v <- sample(1:1000, 50000000, replace = TRUE)
df <- data.frame(id = v)

# The original author reports split(df, df$id) as too slow at this size
# (not verified here); sorting costs at most O(n log n).
sid <- sort(df$id)
rlesid <- rle(sid) # single pass over the sorted keys

# First and last position of every run inside the *sorted* vector `sid`.
run_end <- cumsum(rlesid$lengths)
indices <- run_end - rlesid$lengths + 1

values <- numeric(length(indices))
for (i in seq_along(indices)) {
  # Operation for each key. The slice must come from `sid`: run positions
  # refer to the sorted order, not to the row order of `df`. as.numeric()
  # keeps the sum from overflowing the integer range on long runs.
  values[i] <- sum(as.numeric(sid[indices[i]:run_end[i]]))
}

# Label each result with its actual key, so keys that never occur in the data
# cannot shift the ids.
df2 <- data.frame(id = rlesid$values, value = values)
head(df2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment