Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
An analysis of Crunchbase data on the timing of startup financing rounds.
library(ggplot2)
library(ggthemes)
library(plyr)
library(lubridate)
library(scales)
library(data.table)
# ---- Settings, loading and cleaning -----------------------------------------
# Prefer fixed over scientific notation when printing numbers, and keep strings
# as character vectors when data frames are built.
options(scipen=999)
options(stringsAsFactors = FALSE)

rounds <- read.csv("2014-01-06-crunchbase_monthly_export_rounds.csv")

# exclude non venture rounds (other, private equity, post-ipo);
# "series-c+" is currently left out of the list as well
rounds <- subset(
  rounds,
  funding_round_type %in% c("venture", "angel", "series-a", "series-b")
)
# drop rows whose funded_month is 1960-01
rounds <- subset(rounds, funded_month != "1960-01")

# fix strange date data: rebuild the funding date from funded_month, so every
# funded_at falls on the first day of its month
rounds$funded_at <- ymd(paste(rounds$funded_month, "01", sep="-"))

# bucket the raised amount into left-closed intervals [lo, hi)
# (alternative kept for reference: round(raised_amount_usd / 500000) * 500000)
rounds$round_raised_amount_usd <- cut(
  rounds$raised_amount_usd,
  breaks = c(0, 500000, 1000000, 5000000, 10000000, 20000000, 40000000,
             80000000, Inf),
  right = FALSE
)

# dedup: keep one row per (company, month, round type)
rounds <- data.table(rounds)
setkeyv(rounds, c("company_name", "funded_at", "funding_round_type"))
# Name the key columns explicitly. Recent data.table releases compare ALL
# columns in unique() by default, which would leave rows that differ only in
# some other field (e.g. the amount) un-deduplicated.
rounds <- unique(rounds, by = key(rounds))

# restrict to companies first funded after 2008
# (funded_at is always the first of a month, so the strict comparison below
# keeps companies whose earliest round is dated 2008-02 or later)
companies <- rounds[, list(first_funded_at = min(funded_at)), by = company_name]
rounds <- join(rounds, companies, by = "company_name")
rounds <- subset(rounds, first_funded_at > ymd("2008-01-01"))
# Gap between dates that are `lag` positions apart, in 30-day "months".
#
# fix strange difference in units from diff: the difference is always taken
# in days (units are pinned rather than left for R to choose) and then divided
# by 30 and rounded.
#
# Args:
#   x:   vector of dates / date-times, in the order the gaps should be taken.
#   lag: how many positions apart the two dates of each gap are (default 1).
#
# Returns:
#   A difftime vector of length max(length(x) - lag, 0). Its values are
#   days / 30, rounded; the unit attribute still reads "days".
#
# When x has `lag` or fewer elements there is no gap to compute, so an empty
# difftime is returned. Without this guard the index ranges (1+lag):n and
# 1:(n-lag) count backwards and produce spurious values (or an error for
# zero-length x).
my.diff <- function(x, lag=1) {
  n <- length(x)
  if (n <= lag) {
    return(round(difftime(x[0], x[0], units = "days") / 30))
  }
  later <- x[(lag + 1):n]
  earlier <- x[seq_len(n - lag)]
  round(difftime(later, earlier, units = "days") / 30)
}
# ---- Round sequences: time from each round to the next one ------------------
# Number every company's rounds in row order and record the gap to the
# following round (my.diff: days / 30, rounded). A company's last round has no
# follow-on yet, so its diff is NA.
rounds_index <- rounds[, id := seq_along(funded_at), by = company_name]
rounds_index <- rounds_index[, diff := c(my.diff(funded_at), NA),
                             by = company_name]

# lifetime = gap to the next round where one exists; otherwise the time from
# the round up to the latest funded_at in the data, in the same 30-day units.
rounds_index$lifetime <- rounds_index$diff
rounds_index[is.na(rounds_index$lifetime),]$lifetime <- round(
  as.numeric(
    difftime(
      max(rounds_index$funded_at),
      rounds_index[is.na(rounds_index$lifetime),]$funded_at,
      units = "days"
    ) / 30
  )
)
# rounds_index <- ddply(rounds, .(company_name), transform, index=seq_along(funded_at), diff=c(my.diff(funded_at), NA))

# ---- Aggregate per round type ----
# Round count plus median / mean gap to the next round, smallest median first.
medians <- ddply(
  rounds_index,
  .(funding_round_type),
  summarize,
  rounds = length(id),
  median = median(diff, na.rm = TRUE),
  mean = mean(diff, na.rm = TRUE)
)
medians <- medians[order(medians$median),]

# Rounds that did get a follow-on, counted per (round type, gap).
diff_summary <- rounds_index[
  , list(rounds = length(id)),
  by = c("funding_round_type", "diff")
]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
# Accumulate from the longest gap downwards: cum_rounds is the number of
# follow-ons that happened at this gap or later.
diff_summary <- diff_summary[
  order(funding_round_type, lifetime, decreasing = TRUE),
]
diff_summary <- diff_summary[, cum_rounds := cumsum(rounds),
                             by = funding_round_type]

# All rounds (with or without a follow-on), counted per (round type, lifetime)
# and accumulated the same way: rounds observed for at least this long.
round_lifetimes <- rounds_index[
  , list(total_rounds = length(id)),
  by = c("funding_round_type", "lifetime")
]
round_lifetimes <- round_lifetimes[
  order(funding_round_type, lifetime, decreasing = TRUE),
]
round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds),
                                   by = funding_round_type]

# Put both counts side by side (joined on the shared columns).
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[order(funding_round_type, lifetime),]
diff_summary$percent <- with(diff_summary, rounds / cum_total_rounds)

# Plot 1: raw number of follow-on financings by months since the round.
ggplot(diff_summary,
       aes(x = lifetime, y = rounds, color = funding_round_type)) +
  geom_point() +
  scale_x_continuous(breaks = 0:4 * 12, limits = c(0, 48)) +
  geom_smooth() +
  ggtitle("Financings") +
  ylab("Financings") +
  xlab("Months After Funding") +
  scale_color_discrete(name = "Round")

# Plot 2: cum_rounds / cum_total_rounds by months since the round; this is the
# plot that gets written to disk.
ggplot(diff_summary,
       aes(x = lifetime, y = cum_rounds / cum_total_rounds,
           color = funding_round_type)) +
  geom_line() +
  scale_x_continuous(breaks = 0:4 * 12, limits = c(0, 48)) +
  ggtitle("Likelihood Of Raising A Follow On Round By Time") +
  ylab("Percent of Companies that Raise a Follow On Round") +
  xlab("Months After Funding") +
  scale_y_continuous(labels = percent_format()) +
  scale_color_discrete(name = "Round")
ggsave("follow-on-likelihood-by-time.png")
#### round_raised_amount_usd
# Follow-on timing again, this time split by round type AND raised-amount
# bucket.

# Round count plus median / mean gap per (round type, bucket).
medians <- ddply(
  rounds_index,
  .(funding_round_type, round_raised_amount_usd),
  summarize,
  rounds = length(id),
  median = median(diff, na.rm = TRUE),
  mean = mean(diff, na.rm = TRUE)
)
# Shown, not stored: the table ordered by round type and median gap.
medians[order(medians$funding_round_type, medians$median),]

# Rounds with a follow-on, counted per (round type, bucket, gap) and
# accumulated from the longest gap downwards within each (round type, bucket).
diff_summary <- rounds_index[
  , list(rounds = length(id)),
  by = c("funding_round_type", "round_raised_amount_usd", "diff")
]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
diff_summary <- diff_summary[
  order(funding_round_type, round_raised_amount_usd, lifetime,
        decreasing = TRUE),
]
diff_summary <- diff_summary[
  , cum_rounds := cumsum(rounds),
  by = c("funding_round_type", "round_raised_amount_usd")
]

# All rounds per (round type, bucket, lifetime), accumulated the same way.
round_lifetimes <- rounds_index[
  , list(total_rounds = length(id)),
  by = c("funding_round_type", "round_raised_amount_usd", "lifetime")
]
round_lifetimes <- round_lifetimes[
  order(funding_round_type, round_raised_amount_usd, lifetime,
        decreasing = TRUE),
]
round_lifetimes <- round_lifetimes[
  , cum_total_rounds := cumsum(total_rounds),
  by = c("funding_round_type", "round_raised_amount_usd")
]

# Combine both counts, restore ascending order, drop rows without a bucket.
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[
  order(funding_round_type, round_raised_amount_usd, lifetime),
]
diff_summary$percent <- with(diff_summary, rounds / cum_total_rounds)
diff_summary <- diff_summary[!is.na(diff_summary$round_raised_amount_usd),]

# Angel rounds only: raw counts, then the cumulative ratio, one colour per
# amount bucket.
ggplot(subset(diff_summary, funding_round_type == "angel"),
       aes(x = lifetime, y = rounds,
           color = as.factor(round_raised_amount_usd))) +
  geom_point() +
  xlim(0, 48) +
  geom_smooth() +
  ggtitle("") +
  ylab("Rounds") +
  xlab("Months After Funding")

ggplot(subset(diff_summary, funding_round_type == "angel"),
       aes(x = lifetime, y = cum_rounds / cum_total_rounds,
           color = as.factor(round_raised_amount_usd))) +
  geom_line() +
  xlim(0, 48) +
  ggtitle("") +
  ylab("Likelihood of a Follow On Round") +
  xlab("Months After Funding") +
  scale_y_continuous(labels = percent_format())

# Fix the facet order of the round types and swap the interval labels produced
# by cut() for readable dollar ranges.
diff_summary$funding_round_type <- factor(
  diff_summary$funding_round_type,
  levels = c("angel", "venture", "series-a", "series-b")
)
diff_summary$round_raised_amount_usd <- mapvalues(
  diff_summary$round_raised_amount_usd,
  from = c("[0,5e+05)", "[5e+05,1e+06)", "[1e+06,5e+06)", "[5e+06,1e+07)",
           "[1e+07,2e+07)", "[2e+07,4e+07)", "[4e+07,8e+07)", "[8e+07,Inf)"),
  to = c("$0-.5M", "$.5-1M", "$1-5M", "$5-10M",
         "$10-20M", "$20-40M", "$40-80M", "$80M+")
)

# Faceted by round type; points backed by 20 or fewer rounds and the "$80M+"
# bucket are left out. This is the plot that gets saved.
ggplot(subset(diff_summary,
              cum_total_rounds > 20 & round_raised_amount_usd != "$80M+"),
       aes(x = lifetime, y = cum_rounds / cum_total_rounds,
           color = round_raised_amount_usd)) +
  geom_line() +
  xlim(0, 48) +
  ggtitle("Follow On Likelihood by Round and Amount") +
  ylab("Likelihood of a Follow On Round") +
  xlab("Months After Funding") +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ funding_round_type) +
  scale_colour_few()
ggsave("follow-on-likelihood-by-round-size.png")
#### company_category_code
# Follow-on timing split by round type AND company category.

# Round count plus median / mean gap per (round type, category), ordered by
# round type and then median gap.
medians <- ddply(
  rounds_index,
  .(funding_round_type, company_category_code),
  summarize,
  rounds = length(id),
  median = median(diff, na.rm = TRUE),
  mean = mean(diff, na.rm = TRUE)
)
medians <- medians[order(medians$funding_round_type, medians$median),]

# Rounds with a follow-on, counted per (round type, category, gap) and
# accumulated from the longest gap downwards within each (round type, category).
diff_summary <- rounds_index[
  , list(rounds = length(id)),
  by = c("funding_round_type", "company_category_code", "diff")
]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
diff_summary <- diff_summary[
  order(funding_round_type, company_category_code, lifetime,
        decreasing = TRUE),
]
diff_summary <- diff_summary[
  , cum_rounds := cumsum(rounds),
  by = c("funding_round_type", "company_category_code")
]

# All rounds per (round type, category, lifetime), accumulated the same way.
round_lifetimes <- rounds_index[
  , list(total_rounds = length(id)),
  by = c("funding_round_type", "company_category_code", "lifetime")
]
round_lifetimes <- round_lifetimes[
  order(funding_round_type, company_category_code, lifetime,
        decreasing = TRUE),
]
round_lifetimes <- round_lifetimes[
  , cum_total_rounds := cumsum(total_rounds),
  by = c("funding_round_type", "company_category_code")
]

# Combine both counts and restore ascending order.
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[
  order(funding_round_type, company_category_code, lifetime),
]
diff_summary$percent <- with(diff_summary, rounds / cum_total_rounds)

# Categories ranked by number of distinct companies; the first nine rows are
# the ones that get plotted.
category_counts <- ddply(
  rounds,
  .(company_category_code),
  summarize,
  counts = length(unique(company_name))
)
category_counts <- category_counts[
  order(category_counts$counts, decreasing = TRUE),
]
top_categories <- category_counts[1:9, "company_category_code"]

# Raw follow-on counts, one panel per category.
ggplot(subset(diff_summary, company_category_code %in% top_categories),
       aes(x = lifetime, y = rounds, color = funding_round_type)) +
  geom_point() +
  xlim(0, 48) +
  geom_smooth() +
  ggtitle("") +
  ylab("Rounds") +
  xlab("Months After Funding") +
  facet_wrap(~ company_category_code)

# Cumulative ratio, one panel per category; this is the plot that gets saved.
ggplot(subset(diff_summary, company_category_code %in% top_categories),
       aes(x = lifetime, y = cum_rounds / cum_total_rounds,
           color = funding_round_type)) +
  geom_line() +
  xlim(0, 48) +
  ggtitle("Follow On Likelihood by Category") +
  ylab("Likelihood of a Follow On Round") +
  xlab("Months After Funding") +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ company_category_code)
ggsave("follow-on-likelihood-by-round-category.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment