Skip to content

Instantly share code, notes, and snippets.

@alexklapheke
Last active January 27, 2020 03:20
Show Gist options
  • Save alexklapheke/874157b173c822478ae5cfa0094fae39 to your computer and use it in GitHub Desktop.
Save alexklapheke/874157b173c822478ae5cfa0094fae39 to your computer and use it in GitHub Desktop.
#!/usr/bin/env Rscript
library(ggplot2)
library(dplyr)
library(lubridate)
library(moments)
# Send every plot printed below to one PDF file (7 x 5 inch pages).
pdf("test.pdf", width = 7, height = 5)

# Load the scraped Kickstarter data. Text columns are kept as character
# vectors so the clean-up below behaves the same on R < 4.0, where read.csv()
# converted strings to factors by default.
kickstarter <- read.csv(
  "DSI_kickstarterscrape_dataset.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Fix HTML entity issue: the scrape left "&amp;" in one category name.
# %in% never yields NA, so rows with a missing category are simply left
# alone instead of breaking the subscripted assignment.
is_escaped_film <- kickstarter$category %in% "Film &amp; Video"
kickstarter$category[is_escaped_film] <- "Film & Video"

# Format dates (Fri, 19 Aug 2011 19:28:17 -0000)
# NOTE: %a and %b are matched against the current locale's day/month names,
# so parsing assumes an English LC_TIME setting.
kickstarter$funded.date <- as.Date(
  kickstarter$funded.date,
  format = "%a, %d %b %Y %T %z"
)

# Calculate starting date: funding date minus the duration, rounded to
# whole days.
kickstarter$started.date <-
  kickstarter$funded.date - days(round(kickstarter$duration))
# What is the mean (total) pledge that projects get? (not per backer)
# Your answer may vary by +/- 5%
mean(kickstarter$pledged, na.rm = TRUE)

# Create a histogram that shows the distribution for number of
# backers. What is the skew of the distribution?
# The x axis is log10-scaled for display only; the skewness below is computed
# on the raw backer counts.
ggplot(kickstarter) +
  geom_histogram(aes(backers)) +
  scale_x_continuous(trans = "log10") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution of backers",
    x = "Backers (log10)",
    y = "Number of projects"
  )
skewness(kickstarter$backers, na.rm = TRUE)

# Is the 'duration' variable normally distributed?
ggplot(kickstarter) +
  geom_histogram(aes(duration))

# shapiro.test() accepts at most 5000 non-missing values, so test a random
# subsample. Missing durations are dropped first and the sample size is capped
# at the number of available values, so this also works on small data sets.
duration_known <- kickstarter$duration[!is.na(kickstarter$duration)]
shapiro_n <- min(5000, length(duration_known))
shapiro.test(duration_known[sample.int(length(duration_known), shapiro_n)])

# Pledge amounts per category on a log10 y axis; rows without a pledged
# amount are excluded. The legend title is set via `color`, the aesthetic
# that is actually mapped in the boxplot layer.
ggplot(kickstarter[!is.na(kickstarter$pledged), ]) +
  geom_boxplot(aes(x = category, y = pledged, color = category)) +
  scale_y_continuous(trans = "log10") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution of pledge amounts by category",
    x = "Category",
    y = "Amount (log10 $)",
    color = "Category"
  )
# Presentation graphs

# Median pledged amount per category, drawn as one bar per category.
kickstarter_median <- kickstarter %>%
  group_by(category) %>%
  summarise(median = median(pledged, na.rm = TRUE))

ggplot(kickstarter_median) +
  geom_bar(aes(x = category, y = median, fill = category), stat = "identity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Median pledge amounts by category",
    x = "Category",
    y = "Amount ($)",
    fill = "Category"
  )

# Month number (1-12) in which each project started; also used by the
# success-rate plots further down.
kickstarter$start.month <- month(kickstarter$started.date)

# Median pledged amount per starting month.
kickstarter_month <- kickstarter %>%
  group_by(start.month) %>%
  summarise(median = median(pledged, na.rm = TRUE))

# start.month is numeric, so the axis is continuous; explicit breaks give one
# tick per month instead of fractional defaults such as 2.5 or 7.5.
ggplot(kickstarter_month) +
  geom_bar(aes(x = start.month, y = median), stat = "identity") +
  scale_x_continuous(breaks = seq(12)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Amount pledged by month project started",
    x = "Month",
    y = "Amount pledged ($)"
  )
# Success rates by various metrics

# Every plot in this section leaves out projects whose status is "live"
# (presumably because they have no final outcome yet). The subset is computed
# once; the is.na() guard keeps rows with a missing status from turning into
# all-NA rows, which a bare `status != "live"` index would produce.
is_finished <- !is.na(kickstarter$status) & kickstarter$status != "live"
kickstarter_finished <- kickstarter[is_finished, ]

# position = "fill" stacks the statuses to a height of 1 in each bin/bar, so
# the y axis reads as a share of projects rather than a count.
ggplot(kickstarter_finished) +
  geom_histogram(aes(x = duration, fill = status), position = "fill") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Success rate by duration",
    x = "Duration (days)",
    y = "Success rate",
    fill = "Status"
  )

ggplot(kickstarter_finished) +
  geom_bar(aes(x = start.month, fill = status), stat = "count", position = "fill") +
  scale_x_continuous(breaks = seq(12)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Success rate by month project started",
    x = "Month",
    y = "Success rate",
    fill = "Status"
  )

# Goals are shown on a log10 axis with one tick per power of ten.
ggplot(kickstarter_finished) +
  geom_histogram(aes(x = goal, fill = status), position = "fill", bins = 20) +
  scale_x_continuous(trans = "log10", breaks = 10^seq(0, 7)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Success rate by goal",
    x = "Goal (log10 $)",
    y = "Success rate",
    fill = "Status"
  )

ggplot(kickstarter_finished) +
  geom_bar(aes(x = category, fill = status), stat = "count", position = "fill") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Success rate by category",
    x = "Category",
    y = "Success rate",
    fill = "Status"
  )

# Median funding goal per category among the non-live projects.
kickstarter_median_goal <- kickstarter_finished %>%
  group_by(category) %>%
  summarise(median = median(goal, na.rm = TRUE))

ggplot(kickstarter_median_goal) +
  geom_bar(aes(x = category, y = median, fill = category), stat = "identity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Median goal by category",
    x = "Category",
    y = "Goal ($)",
    fill = "Category"
  )

# Close the PDF device opened at the top of the script so the file is
# finalised explicitly; invisible() suppresses the device name dev.off()
# would otherwise print.
invisible(dev.off())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment