# Regional analysis of Crunchbase data
library(ggplot2) | |
library(ggthemes) | |
library(dplyr) | |
library(lubridate) | |
library(scales) | |
library(data.table) | |
library(reshape2) | |
# Session-wide settings: strongly prefer fixed over scientific notation when
# printing numbers, and keep strings as character vectors (not factors) when
# data frames are created.
options(scipen=999, stringsAsFactors=FALSE)
# Lagged difference between timestamps, expressed in 30-day "months".
#
# Works around the inconsistent units that diff() produces on date-times by
# always measuring the gap in days, dividing by 30 and rounding to the
# nearest whole number.
#
# x:   vector of dates / date-times accepted by difftime(), already in the
#      order in which they should be compared.
# lag: positive offset between the two elements being compared.
#
# Returns a difftime of length max(length(x) - lag, 0). The unit label stays
# "days" but the values are month counts.
my.diff <- function(x, lag=1) {
  n <- length(x)
  # seq_len() instead of 1:(n-lag): when n <= lag the colon operator counts
  # backwards (e.g. 1:0) and yields bogus differences instead of none.
  idx <- seq_len(max(n - lag, 0))
  round(difftime(x[idx + lag], x[idx], units="days") / 30)
}
# ---- Load and de-duplicate funding rounds ----
rounds <- data.table(
  read.csv("2014-01-06-crunchbase_monthly_export_rounds.csv")
)

# dedup: keep one row per company / funding date / round type. The key
# columns are handed to unique() explicitly because data.table >= 1.9.8
# de-duplicates on ALL columns by default rather than on the key.
setkeyv(rounds, c("company_name", "funded_at", "funding_round_type"))
rounds <- unique(rounds, by=key(rounds))

# Drop rounds whose month is "1960-01"
# (presumably a placeholder for a missing date -- TODO confirm).
rounds <- subset(rounds, funded_month != "1960-01")

# fix strange date data: rebuild funded_at as the first day of funded_month
rounds$funded_at <- ymd(paste(rounds$funded_month, "01", sep="-"))

# Sort chronologically with data.table itself (stable, like arrange()).
# Sorting a keyed data.table through dplyr can leave the key attribute and
# internal self-reference out of sync with the rows, which the grouped `:=`
# calls below depend on; a native subset drops the key cleanly.
rounds <- rounds[order(funded_at)]

# id:   1-based position of the round within its company, in date order.
# diff: months until the company's NEXT round; NA for its last round.
rounds[, id := seq_along(funded_at), by=company_name]
# Trimming to .N guarantees exactly one value per row even for companies
# with a single round, whatever my.diff() returns for a length-1 input.
rounds[, diff := c(my.diff(funded_at), NA)[seq_len(.N)], by=company_name]
# ---- Clean up rounds ----
# Keep only US companies that have a state code.
rounds <- filter(
  rounds,
  company_country_code == "USA" & company_state_code != ""
)

# Normalise labels: strip the " - Other" suffix from regions, strip
# everything except letters, digits, "/", "'" and spaces from cities, and
# upper-case both.
rounds$company_region <- toupper(gsub(" - Other", "", rounds$company_region))
rounds$company_city <- toupper(
  gsub("[^[:alnum:]///' ]", "", rounds$company_city)
)

# Force a single state code for the three big metro regions.
rounds$company_state_code[rounds$company_region == "SF BAY"] <- "CA"
rounds$company_state_code[rounds$company_region == "NEW YORK"] <- "NY"
rounds$company_state_code[rounds$company_region == "LOS ANGELES"] <- "CA"

# Discard rows without a usable region.
rounds <- filter(rounds, !(company_region %in% c("UNKNOWN", "TBD")))

# geocode: "<REGION> <STATE> <COUNTRY>", space separated.
rounds$geocode <- with(
  rounds,
  paste(company_region, company_state_code, company_country_code, sep=" ")
)
# ---- Attach broad categories ----
# The first column of categories.csv is renamed so that it lines up with
# rounds$company_category_code; broad_category is the label carried over.
categories <- read.csv("categories.csv")
names(categories)[1] <- "company_category_code"

# Join on the category code explicitly instead of relying on a natural join
# over whatever column names the two tables happen to share. Rounds whose
# category code is not listed in categories.csv are dropped (inner join).
# Re-wrapping in data.table() keeps `rounds` a proper data.table.
rounds <- data.table(inner_join(
  rounds,
  select(categories, company_category_code, broad_category),
  by="company_category_code"
))

# Only the two broad categories of interest are analysed further.
rounds <- filter(rounds, broad_category %in% c("enterprise", "consumer"))
# ---- Follow-on rate by region and year ----
# For every region/year: `companies` is the number of funding rounds in the
# cell and `follow_on` the number of those rounds that were followed by a
# later round for the same company (non-missing diff).
# The date filter is strict: funded_at is always the first of a month, so
# rounds from January 2005 itself are excluded.
# `%>%` replaces the `%.%` chaining operator, which dplyr no longer provides.
regional_success <- filter(rounds, funded_at > ymd("2005-01-01")) %>%
  group_by(company_region, year=year(funded_at)) %>%
  summarise(companies=n(), follow_on=sum(!is.na(diff))) %>%
  arrange(desc(companies))

# Dodged bars for the four largest hubs. Years from 2014 on are left out
# (presumably incomplete, given the export is dated 2014-01-06 -- confirm).
rplot <- ggplot(
  filter(
    regional_success,
    company_region %in% c("SF BAY", "NEW YORK", "BOSTON", "LOS ANGELES"),
    year < 2014
  ),
  aes(x=as.factor(year), y=follow_on / companies, fill=company_region)
) +
  geom_bar(stat="identity", position="dodge") +
  ylab("Follow On Rate") +
  xlab("Year") +
  ggtitle("Follow On Rate by Region") +
  theme(
    legend.position=c(.9, .8),
    legend.key=element_rect(fill=alpha("white", .2)),
    legend.background=element_rect(fill=alpha("white", .9))
  )

# 640 x 400 pixels: size is given in inches at 72 dpi.
ggsave("region-follow-on.png", rplot, width=640 / 72, height=400 / 72, dpi=72)
# ---- Follow-on rate: San Francisco proper vs. rest of the SF Bay region ----
# sf is TRUE when the cleaned city name is San Francisco; the extra
# spellings catch typos present in the data.
rounds$sf <- rounds$company_city %in% c(
  "SAN FRANCISCO", "SAN FRANCISO", "SN FRANCISCO", "SAN FRANCSICO"
)

# Same per-year aggregation as the regional analysis, restricted to the
# "SF BAY" region and split by the sf flag. The date filter is strict, so
# rounds dated 2005-01-01 are excluded.
# `%>%` replaces the `%.%` chaining operator, which dplyr no longer provides.
bay_area_success <- filter(
  rounds,
  company_region == "SF BAY",
  funded_at > ymd("2005-01-01")
) %>%
  group_by(sf, year=year(funded_at)) %>%
  summarise(companies=n(), follow_on=sum(!is.na(diff))) %>%
  arrange(year)
bay_area_success$follow_on_rate <-
  bay_area_success$follow_on / bay_area_success$companies

# Labels are given in the order of the logical fill levels:
# FALSE -> "Bay Area", TRUE -> "SF".
sfplot <- ggplot(
  filter(bay_area_success, year < 2014),
  aes(x=as.factor(year), y=follow_on_rate, fill=sf)
) +
  geom_bar(stat="identity", position="dodge") +
  ylab("Follow On Rate") +
  xlab("Year") +
  ggtitle("Follow On Rate in SF Bay Area") +
  scale_fill_discrete(labels=c("Bay Area", "SF")) +
  theme(
    legend.position=c(.9, .8),
    legend.key=element_rect(fill=alpha("white", .2)),
    legend.background=element_rect(fill=alpha("white", .9)),
    legend.title=element_blank()
  )

# 640 x 400 pixels: size is given in inches at 72 dpi.
ggsave("sf-follow-on.png", sfplot, width=640 / 72, height=400 / 72, dpi=72)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment