Skip to content

Instantly share code, notes, and snippets.

@jdavidson
Created January 6, 2014 18:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jdavidson/8287656 to your computer and use it in GitHub Desktop.
Save jdavidson/8287656 to your computer and use it in GitHub Desktop.
# Script to visualize Crunchbase funding rounds data
library(ggplot2)
library(plyr)
library(lubridate)
library(scales)
# Session options: strongly prefer fixed over scientific notation when
# printing numbers, and keep text columns as character vectors.
options(scipen = 999, stringsAsFactors = FALSE)

# Load the Crunchbase funding rounds export from the working directory.
rounds <- read.csv("2013-11-04-crunchbase_monthly_export_rounds.csv")

# Exclude non venture rounds (other, private equity, post-ipo) by keeping
# only the round types listed here.
venture_round_types <- c(
  "venture", "angel", "crowdfunding", "series-a", "series-b", "series-c+"
)
rounds <- rounds[rounds$funding_round_type %in% venture_round_types, ]

# Parse the funding date strings as month-day-year.
rounds$funded_at <- mdy(rounds$funded_at)
# Summarize each company's funding history. One output row per distinct
# combination of company name, category, state, country and region.
#   rounds                   - number of funding rounds in the group
#   first_funding_round_type - round type of the earliest round
#   first_raised_amount_usd  - amount raised in the earliest round
#   first_funded_at          - date of the earliest round
#   total_raised_amount_usd  - sum over all rounds (NA if any amount is NA)
#   days_between_first_and_second_funding
#                            - gap between the two earliest rounds, in days;
#                              NA when the group has a single round
# order(funded_at)[1] and [2] index the earliest and second-earliest round.
companies <- ddply(
  rounds,
  .(company_name, company_category_code, company_state_code,
    company_country_code, company_region),
  summarize,
  rounds = length(funded_at),
  first_funding_round_type = funding_round_type[order(funded_at)[1]],
  first_raised_amount_usd = raised_amount_usd[order(funded_at)[1]],
  first_funded_at = min(funded_at),
  total_raised_amount_usd = sum(raised_amount_usd),
  # Request days explicitly: a bare date subtraction lets difftime choose its
  # own unit, and as.numeric() would then silently return that unit instead.
  days_between_first_and_second_funding = as.numeric(
    difftime(
      funded_at[order(funded_at)[2]],
      funded_at[order(funded_at)[1]],
      units = "days"
    )
  )
)
# Restrict to companies whose first round was funded in 2010 through 2012.
# which() drops rows where the comparison is NA, as subset() would.
funded_in_cohort <- with(
  companies,
  first_funded_at >= ymd("2010-01-01") & first_funded_at < ymd("2013-01-01")
)
companies <- companies[which(funded_in_cohort), ]

# Flag companies that have more than one funding round.
companies$second_round <- companies$rounds > 1

# Keep only companies whose first round has a positive (non-missing) amount.
companies <- companies[which(companies$first_raised_amount_usd > 0), ]
# Bucket each first-round amount to the nearest multiple of $250,000.
bucket_size_usd <- 250000
companies$round_first_raised_amount_usd <-
  bucket_size_usd * round(companies$first_raised_amount_usd / bucket_size_usd)

# Per bucket: number of companies, how many of them had a second round, and
# the resulting follow-on rate. plyr's summarize evaluates its arguments in
# order, so `success` divides the summed `second_round` from the line above.
round_first_raised_amount_usd_counts <- ddply(
  companies,
  .(round_first_raised_amount_usd),
  summarize,
  counts = length(company_name),
  second_round = sum(second_round),
  success = second_round / counts
)
# Cumulative distribution of first-round amounts across all companies.
# ddply returns one row per distinct amount, sorted by amount, so the running
# sum of counts divided by the total is the cumulative share of companies.
first_raised_amount_usd_counts <- ddply(
  companies,
  .(first_raised_amount_usd),
  summarize,
  counts = length(company_name)
)
first_raised_amount_usd_counts$cum <-
  cumsum(first_raised_amount_usd_counts$counts) /
  sum(first_raised_amount_usd_counts$counts)
first_raised_amount_usd_median <- median(companies$first_raised_amount_usd)

# The median marker and its label are single annotations. Mapping the constant
# through aes() would redraw them once for every row of the plot data.
ggplot(
  first_raised_amount_usd_counts,
  aes(x = first_raised_amount_usd, y = cum)
) +
  geom_line() +
  scale_x_continuous(limits = c(1, 10000000), labels = dollar_format()) +
  scale_y_continuous(labels = percent_format()) +
  geom_vline(
    xintercept = first_raised_amount_usd_median,
    linetype = "dashed",
    size = .5
  ) +
  annotate(
    "text",
    x = first_raised_amount_usd_median,
    y = 0,
    label = dollar_format()(first_raised_amount_usd_median),
    vjust = 1.2,
    size = 2
  ) +
  ggtitle("Cumulative Distribution of First Round Funding Amount") +
  ylab("Cumulative Percentage of Companies") +
  xlab("First Round Funding Amount (USD)")

# Saves the most recently created plot.
ggsave("cumulative_distribution_first_raised_amount_usd.png")
# Follow-on rate per $250k bucket, using only buckets with more than 10
# companies. The linear fit is weighted by the number of companies per bucket.
ggplot(
  subset(round_first_raised_amount_usd_counts, counts > 10),
  aes(x = round_first_raised_amount_usd, y = success)
) +
  geom_point() +
  geom_smooth(method = "lm", aes(weight = counts)) +
  scale_x_continuous(limits = c(1, 10000000), labels = dollar_format()) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Percent Follow On Funding by First Round Funding Amount",
    x = "First Round Funding Amount (USD)",
    y = "Percent Follow On Funding"
  )

# Saves the most recently created plot.
ggsave("follow_on_by_first_raised_amount_usd.png")
# Count companies per first-round amount, separately for companies with and
# without a second round.
first_raised_amount_usd_counts <- ddply(
  companies,
  .(second_round, first_raised_amount_usd),
  summarize,
  counts = length(company_name)
)

# Cumulative share of companies within each second_round group.
first_raised_amount_usd_counts <- ddply(
  first_raised_amount_usd_counts,
  .(second_round),
  function(group_rows) {
    group_rows$cum <- cumsum(group_rows$counts) / sum(group_rows$counts)
    group_rows
  }
)

# Median first-round amount for each second_round group.
medians <- ddply(
  companies,
  .(second_round),
  summarize,
  median = median(first_raised_amount_usd)
)

# One cumulative curve per group, with a dashed line and a dollar label at
# each group's median. The zero-amount jitter leaves the labels in place.
ggplot(
  first_raised_amount_usd_counts,
  aes(x = first_raised_amount_usd, y = cum, color = second_round)
) +
  geom_line() +
  geom_vline(
    data = medians,
    aes(xintercept = median, color = second_round),
    linetype = "dashed",
    size = .5
  ) +
  geom_text(
    data = medians,
    aes(x = median, y = 0, label = dollar_format()(median), color = second_round),
    vjust = 1.2,
    size = 2,
    position = position_jitter(width = 0, height = 0)
  ) +
  scale_x_continuous(limits = c(1, 10000000), labels = dollar_format()) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Cumulative Distribution of First Round Funding Amount",
    x = "First Round Funding Amount (USD)",
    y = "Cumulative Percentage of Companies"
  )

# Saves the most recently created plot.
ggsave("cumulative_distribution_second_round_first_raised_amount_usd.png")
# Follow-on rate per category and $250k bucket. plyr's summarize evaluates its
# arguments in order, so `success` uses the summed `second_round`.
round_category_first_raised_amount_usd_counts <- ddply(
  companies,
  .(company_category_code, round_first_raised_amount_usd),
  summarize,
  counts = length(company_name),
  second_round = sum(second_round),
  success = second_round / counts
)

# Per-category totals, sorted so the categories with the most companies
# come first.
category_counts <- ddply(
  companies,
  .(company_category_code),
  summarize,
  counts = length(company_name),
  second_round = sum(second_round),
  success = second_round / counts
)
category_counts <- category_counts[
  order(category_counts$counts, decreasing = TRUE),
]

# Up to nine largest categories. head() returns fewer entries when fewer
# categories exist, whereas indexing rows 1:9 would pad the result with NA.
top_categories <- head(category_counts$company_category_code, 9)

# One panel per top category, using only category/bucket cells with more than
# 5 companies. The linear fit is weighted by the number of companies per cell.
ggplot(
  subset(
    round_category_first_raised_amount_usd_counts,
    counts > 5 & company_category_code %in% top_categories
  ),
  aes(x = round_first_raised_amount_usd, y = success)
) +
  geom_point() +
  scale_x_continuous(limits = c(1, 10000000), labels = dollar_format()) +
  scale_y_continuous(labels = percent_format()) +
  facet_wrap(~ company_category_code) +
  geom_smooth(method = "lm", aes(weight = counts)) +
  ggtitle("Percent Follow On Funding by First Round Funding Amount") +
  ylab("Percent Follow On Funding") +
  xlab("First Round Funding Amount (USD)")

# Saves the most recently created plot.
ggsave("follow_on_by_first_raised_amount_usd_by_category.png")
# Median first-round amount per category and second_round group.
medians <- ddply(
  companies,
  .(company_category_code, second_round),
  summarize,
  median = median(first_raised_amount_usd)
)

# Count companies per first-round amount within each category and
# second_round group, then turn the counts into a cumulative share per group.
first_raised_amount_usd_counts <- ddply(
  companies,
  .(company_category_code, second_round, first_raised_amount_usd),
  summarize,
  counts = length(company_name)
)
first_raised_amount_usd_counts <- ddply(
  first_raised_amount_usd_counts,
  .(company_category_code, second_round),
  transform,
  cum = cumsum(counts) / sum(counts)
)

# Up to nine categories from the top of category_counts, computed once and
# reused for every layer. head() returns fewer entries when fewer categories
# exist, whereas indexing rows 1:9 would pad the result with NA.
top_categories <- head(category_counts$company_category_code, 9)
top_medians <- subset(medians, company_category_code %in% top_categories)

# One panel per top category: a cumulative curve for each second_round group,
# with a dashed line and a dollar label at each group's median.
ggplot(
  subset(
    first_raised_amount_usd_counts,
    company_category_code %in% top_categories
  ),
  aes(x = first_raised_amount_usd, y = cum, color = second_round)
) +
  geom_line() +
  scale_x_continuous(limits = c(1, 10000000), labels = dollar_format()) +
  scale_y_continuous(labels = percent_format()) +
  geom_vline(
    data = top_medians,
    aes(xintercept = median, color = second_round),
    linetype = "dashed",
    size = .5
  ) +
  geom_text(
    data = top_medians,
    aes(x = median, y = 0, label = dollar_format()(median), color = second_round),
    vjust = 1.2,
    size = 2
  ) +
  facet_wrap(~ company_category_code) +
  ggtitle("Cumulative Distribution of First Round Funding Amount") +
  ylab("Cumulative Percentage of Companies") +
  xlab("First Round Funding Amount (USD)")

# Saves the most recently created plot.
ggsave("cumulative_distribution_second_round_first_raised_amount_usd_by_category.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment