public
Created

An analysis of crunchbase data for start up financing timing.

  • Download Gist
lifetime.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
library(ggplot2)
library(ggthemes)
library(plyr)
library(lubridate)
library(scales)
library(data.table)
options(scipen=999)
options(stringsAsFactors = FALSE)
 
rounds <- read.csv("2014-01-06-crunchbase_monthly_export_rounds.csv")
# exclude non venture rounds (other, private equity, post-ipo)
rounds <- subset(rounds, funding_round_type %in% c("venture", "angel", "series-a", "series-b"))# , "series-c+"))
rounds <- subset(rounds, funded_month != "1960-01")
# fix strange date data
rounds$funded_at <- ymd(paste(rounds$funded_month, "01", sep="-"))
rounds$round_raised_amount_usd <- cut(rounds$raised_amount_usd, breaks=c(0, 500000, 1000000, 5000000, 10000000, 20000000, 40000000, 80000000, Inf), right=FALSE)# round(rounds $raised_amount_usd / 500000) * 500000
 
# dedup
rounds <- data.table(rounds)
setkeyv(rounds, c("company_name", "funded_at", "funding_round_type"))
rounds <- unique(rounds)
 
# restrict to companies first funded after 2008
companies <- rounds[, list(first_funded_at = min(funded_at)), by = company_name]
rounds <- join(rounds, companies)
rounds <- subset(rounds, first_funded_at > ymd("2008-01-01"))
 
# fix strange difference in units from diff
my.diff <- function(x, lag=1) {
n <- length(x)
round(difftime(x[(1+lag):n], x[1:(n-lag)], units="days") / 30)
}
 
# round sequences
rounds_index <- rounds[, id := seq_along(funded_at), by=company_name]
rounds_index <- rounds_index[, diff := c(my.diff(funded_at), NA), by=company_name]
rounds_index$lifetime <- rounds_index$diff
rounds_index[is.na(rounds_index$lifetime),]$lifetime <- round(as.numeric(difftime(max(rounds_index$funded_at), rounds_index[is.na(rounds_index$lifetime),]$funded_at, units="days") / 30))
 
# rounds_index <- ddply(rounds, .(company_name), transform, index=seq_along(funded_at), diff=c(my.diff(funded_at), NA))
 
 
# aggregate
medians <- ddply(rounds_index, .(funding_round_type), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T))
medians <- medians[order(medians$median),]
 
diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "diff")]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
diff_summary <- diff_summary[order(funding_round_type, lifetime, decreasing=T),]
diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= funding_round_type]
 
round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "lifetime")]
round_lifetimes <- round_lifetimes[order(funding_round_type, lifetime, decreasing=T),]
round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by= funding_round_type]
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[order(funding_round_type, lifetime),]
diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds
 
ggplot(diff_summary, aes(x=lifetime, y=rounds, color=funding_round_type)) + geom_point() + scale_x_continuous(breaks = 0:4 * 12, limits=c(0,48)) + geom_smooth() + ggtitle("Financings") + ylab("Financings") + xlab("Months After Funding") + scale_color_discrete(name = "Round")
ggplot(diff_summary, aes(x=lifetime, y= cum_rounds / cum_total_rounds, color=funding_round_type)) + geom_line() + scale_x_continuous(breaks = 0:4 * 12, limits=c(0,48)) + ggtitle("Likelihood Of Raising A Follow On Round By Time") + ylab("Percent of Companies that Raise a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + scale_color_discrete(name = "Round")
ggsave("follow-on-likelihood-by-time.png")
 
#### round_raised_amount_usd
medians <- ddply(rounds_index, .(funding_round_type, round_raised_amount_usd), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T))
medians[order(medians$funding_round_type, medians$median),]
 
diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "round_raised_amount_usd", "diff")]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
diff_summary <- diff_summary[order(funding_round_type, round_raised_amount_usd, lifetime, decreasing=T),]
diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= c("funding_round_type", "round_raised_amount_usd")]
 
round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "round_raised_amount_usd", "lifetime")]
round_lifetimes <- round_lifetimes[order(funding_round_type, round_raised_amount_usd, lifetime, decreasing=T),]
round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by=c("funding_round_type", "round_raised_amount_usd")]
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[order(funding_round_type, round_raised_amount_usd, lifetime),]
diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds
diff_summary <- diff_summary[!is.na(diff_summary$round_raised_amount_usd),]
 
ggplot(subset(diff_summary, funding_round_type == "angel"), aes(x=lifetime, y=rounds, color= as.factor(round_raised_amount_usd))) + geom_point() + xlim(0,48) + geom_smooth() + ggtitle("") + ylab("Rounds") + xlab("Months After Funding")
ggplot(subset(diff_summary, funding_round_type == "angel"), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color= as.factor(round_raised_amount_usd))) + geom_line() + xlim(0,48) + ggtitle("") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format())
 
diff_summary$funding_round_type <- factor(diff_summary$funding_round_type, levels=c("angel", "venture", "series-a", "series-b"))
diff_summary$round_raised_amount_usd <- mapvalues(diff_summary$round_raised_amount_usd, from = c("[0,5e+05)", "[5e+05,1e+06)", "[1e+06,5e+06)", "[5e+06,1e+07)", "[1e+07,2e+07)", "[2e+07,4e+07)", "[4e+07,8e+07)", "[8e+07,Inf)"), to = c("$0-.5M", "$.5-1M", "$1-5M", "$5-10M", "$10-20M", "$20-40M", "$40-80M", "$80M+"))
 
ggplot(subset(diff_summary, cum_total_rounds > 20 & round_raised_amount_usd != "$80M+"), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color=round_raised_amount_usd)) + geom_line() + xlim(0,48) + ggtitle("Follow On Likelihood by Round and Amount") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + facet_wrap(~ funding_round_type) + scale_colour_few()
ggsave("follow-on-likelihood-by-round-size.png")
 
 
#### company_category_code
medians <- ddply(rounds_index, .(funding_round_type, company_category_code), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T))
medians <- medians[order(medians$funding_round_type, medians$median),]
 
diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "company_category_code", "diff")]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
diff_summary <- diff_summary[order(funding_round_type, company_category_code, lifetime, decreasing=T),]
diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= c("funding_round_type", "company_category_code")]
 
round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "company_category_code", "lifetime")]
round_lifetimes <- round_lifetimes[order(funding_round_type, company_category_code, lifetime, decreasing=T),]
round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by=c("funding_round_type", "company_category_code")]
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[order(funding_round_type, company_category_code, lifetime),]
diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds
 
category_counts <- ddply(rounds, .(company_category_code), summarize, counts=length(unique(company_name)))
category_counts <- category_counts[order(category_counts$counts, decreasing=T),]
 
ggplot(subset(diff_summary, company_category_code %in% category_counts[1:9, "company_category_code"]), aes(x=lifetime, y=rounds, color=funding_round_type)) + geom_point() + xlim(0,48) + geom_smooth() + ggtitle("") + ylab("Rounds") + xlab("Months After Funding") + facet_wrap(~ company_category_code)
ggplot(subset(diff_summary, company_category_code %in% category_counts[1:9, "company_category_code"]), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color= funding_round_type)) + geom_line() + xlim(0,48) + ggtitle("Follow On Likelihood by Category") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + facet_wrap(~ company_category_code)
ggsave("follow-on-likelihood-by-round-category.png")

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.