minimaxir/salary_bootstrap.Rmd

## salary_bootstrap.Rmd
---
title: "R Notebook"
output: html_notebook
---

```{r}
library(ggplot2)
library(scales)
# Make ggplot2's minimal theme the default for the plots drawn below.
theme_set(theme_minimal())
```

```{r}
library(tidyverse)

# Raw survey export, read from a local download.
survey_results <- read_csv(
  "/Users/maxwoolf/Downloads/developer_survey_2017/survey_results_public.csv"
)

# Professional developers who answered the tabs/spaces question, with the
# years-coded answer reduced to a number and then to an ordered bucket.
tab_space_survey <- survey_results %>%
  # Drop respondents with no tabs/spaces answer.
  filter(!is.na(TabsSpaces)) %>%
  mutate(
    TabsSpaces = factor(TabsSpaces, levels = c("Spaces", "Tabs", "Both"))
  ) %>%
  # Replace YearsCodedJob with the first run of digits it contains.
  extract(
    YearsCodedJob,
    into = "YearsCodedNumber",
    regex = "(\\d+)",
    convert = TRUE
  ) %>%
  # Answers without any digits (and missing answers) count as 0 years.
  replace_na(list(YearsCodedNumber = 0)) %>%
  mutate(
    # NOTE(review): the first bucket tests `< 5` but is labelled "<= 5 years";
    # confirm this matches the wording of the YearsCodedJob answers.
    YearsCodedGroup = case_when(
      YearsCodedNumber < 5 ~ "<= 5 years",
      YearsCodedNumber <= 10 ~ "6-10",
      YearsCodedNumber <= 15 ~ "11-15",
      TRUE ~ "15+"
    ),
    # Order the bucket levels by the mean number of years inside each bucket.
    YearsCodedGroup = reorder(YearsCodedGroup, YearsCodedNumber, FUN = mean)
  ) %>%
  filter(Professional == "Professional developer")
```

```{r}
library(boot)

# Bootstrap statistic in the form boot::boot() expects: the median of the
# `Salary` column over the resampled rows.
#
# data:    data frame (or tibble) with a `Salary` column.
# indices: row indices of the current bootstrap resample.
# Returns the median of `Salary` at `indices` (NA if any selected value is NA).
salary_median <- function(data, indices) {
  # Index the column directly rather than `data[indices, ]`: that avoids
  # copying every column on each replicate, and also works for a one-column
  # data.frame, which `data[indices, ]` would collapse to a bare vector.
  median(data[["Salary"]][indices])
}

# Median salary with a percentile-bootstrap confidence interval.
#
# data: data frame (or tibble) with a numeric `Salary` column.
# R:    number of bootstrap replicates (default 1000, the value that was
#       previously hard-coded).
# Returns a one-row data.frame with columns `median`, `low_ci` and `high_ci`.
# The two bounds are NA when boot.ci() cannot compute an interval.
boot_median <- function(data, R = 1000) {
  # Calls are namespace-qualified so no local name can shadow boot().
  replicates <- boot::boot(data, salary_median, R = R)
  ci <- boot::boot.ci(replicates, type = "perc")

  # boot.ci() returns NULL when every replicate has the same value. In the
  # `percent` matrix, columns 4 and 5 are the lower and upper endpoints.
  bounds <- if (is.null(ci)) {
    c(NA_real_, NA_real_)
  } else {
    ci$percent[1, 4:5]
  }

  data.frame(
    median = median(data$Salary),
    low_ci = bounds[[1]],
    high_ci = bounds[[2]]
  )
}
```


```{r}
# Seed the random number generator so the bootstrap intervals are the same on
# every knit.
set.seed(2017)

# Respondents who reported a salary.
survey_set <- tab_space_survey %>%
  filter(!is.na(Salary))

# One bootstrap per (TabsSpaces, YearsCodedGroup) cell; each cell yields one
# row holding the median and its interval bounds.
survey_set_2 <- survey_set %>%
  group_by(TabsSpaces, YearsCodedGroup) %>%
  do(boot_median(.)) %>%
  ungroup()

survey_set_2
```


```{r}
 # Median salary per years-coded bucket, one line per tabs/spaces answer, with
 # the bootstrap interval (low_ci to high_ci) drawn as a shaded band.
 survey_set_2 %>%
   ggplot(aes(x = YearsCodedGroup, y = median,
              group = TabsSpaces, color = TabsSpaces)) +
   geom_ribbon(aes(ymin = low_ci, ymax = high_ci, fill = TabsSpaces),
               alpha = 0.25, size = 0) +
   geom_line(size = 1.2, alpha = 0.8) +
   scale_y_continuous(labels = dollar_format()) +
   expand_limits(y = 0) +
   # Colour and fill share one title so ggplot2 merges them into a single
   # legend instead of adding a second one headed "TabsSpaces".
   labs(color = "Uses",
        fill = "Uses",
        x = "Number of years someone coded as part of their job",
        y = "Median annual salary (US Dollars)",
        title = "Salary differences between developers who use tabs and spaces",
        subtitle = paste("From", comma(nrow(survey_set)), "professional developers in the 2017 Developer Survey results, who provided tabs/spaces and salary"))
```

```{r}
# Level order applied to Country after the per-country bootstrap.
countries <- c(
  "United States", "India", "United Kingdom", "Germany", "Canada", "Other"
)

# Professional developers with a reported salary. Countries outside the five
# most frequent are lumped into a single "Other" level.
survey_set <- tab_space_survey %>%
  filter(Professional == "Professional developer", !is.na(Salary)) %>%
  mutate(Country = fct_lump(Country, n = 5))

# One bootstrap per (Country, TabsSpaces) cell.
# NOTE(review): a lumped level that is not listed in `countries` becomes NA in
# the final factor() call; confirm the top five match the list above.
survey_set3 <- survey_set %>%
  group_by(Country, TabsSpaces) %>%
  do(boot_median(.)) %>%
  ungroup() %>%
  mutate(Country = factor(Country, levels = countries))

survey_set3
```

```{r}
# Median salary per tabs/spaces answer as bars, one facet per country, with
# the bootstrap interval drawn as error bars.
ggplot(survey_set3, aes(x = TabsSpaces, y = median, fill = TabsSpaces)) +
  geom_col(alpha = 0.9, show.legend = FALSE) +
  geom_errorbar(aes(ymin = low_ci, ymax = high_ci, width = 0.5)) +
  # Each facet gets its own axis ranges.
  facet_wrap(~ Country, scales = "free") +
  scale_y_continuous(labels = dollar_format(), expand = c(0, 0)) +
  labs(
    x = '"Do you use tabs or spaces?"',
    y = "Median annual salary (US Dollars)",
    title = "Salary differences between developers who use tabs and spaces",
    subtitle = paste("From", comma(nrow(survey_set)), "respondents in the 2017 Developer Survey results")
  ) +
  theme(strip.text.x = element_text(size = 11, family = "Roboto-Bold"))
```
	---
	title: "R Notebook"
	output: html_notebook
	---

	```{r}
	library(ggplot2)
	library(scales)
	# Make ggplot2's minimal theme the default for the plots drawn below.
	theme_set(theme_minimal())
	```

	```{r}
	library(tidyverse)

	# Raw survey export, read from a local download.
	survey_results <- read_csv(
	  "/Users/maxwoolf/Downloads/developer_survey_2017/survey_results_public.csv"
	)

	# Professional developers who answered the tabs/spaces question, with the
	# years-coded answer reduced to a number and then to an ordered bucket.
	tab_space_survey <- survey_results %>%
	  # Drop respondents with no tabs/spaces answer.
	  filter(!is.na(TabsSpaces)) %>%
	  mutate(
	    TabsSpaces = factor(TabsSpaces, levels = c("Spaces", "Tabs", "Both"))
	  ) %>%
	  # Replace YearsCodedJob with the first run of digits it contains.
	  extract(
	    YearsCodedJob,
	    into = "YearsCodedNumber",
	    regex = "(\\d+)",
	    convert = TRUE
	  ) %>%
	  # Answers without any digits (and missing answers) count as 0 years.
	  replace_na(list(YearsCodedNumber = 0)) %>%
	  mutate(
	    # NOTE(review): the first bucket tests `< 5` but is labelled "<= 5 years";
	    # confirm this matches the wording of the YearsCodedJob answers.
	    YearsCodedGroup = case_when(
	      YearsCodedNumber < 5 ~ "<= 5 years",
	      YearsCodedNumber <= 10 ~ "6-10",
	      YearsCodedNumber <= 15 ~ "11-15",
	      TRUE ~ "15+"
	    ),
	    # Order the bucket levels by the mean number of years inside each bucket.
	    YearsCodedGroup = reorder(YearsCodedGroup, YearsCodedNumber, FUN = mean)
	  ) %>%
	  filter(Professional == "Professional developer")
	```

	```{r}
	library(boot)

	# Bootstrap statistic in the form boot::boot() expects: the median of the
	# `Salary` column over the resampled rows.
	#
	# data:    data frame (or tibble) with a `Salary` column.
	# indices: row indices of the current bootstrap resample.
	# Returns the median of `Salary` at `indices` (NA if any selected value is NA).
	salary_median <- function(data, indices) {
	  # Index the column directly rather than `data[indices, ]`: that avoids
	  # copying every column on each replicate, and also works for a one-column
	  # data.frame, which `data[indices, ]` would collapse to a bare vector.
	  median(data[["Salary"]][indices])
	}

	# Median salary with a percentile-bootstrap confidence interval.
	#
	# data: data frame (or tibble) with a numeric `Salary` column.
	# R:    number of bootstrap replicates (default 1000, the value that was
	#       previously hard-coded).
	# Returns a one-row data.frame with columns `median`, `low_ci` and `high_ci`.
	# The two bounds are NA when boot.ci() cannot compute an interval.
	boot_median <- function(data, R = 1000) {
	  # Calls are namespace-qualified so no local name can shadow boot().
	  replicates <- boot::boot(data, salary_median, R = R)
	  ci <- boot::boot.ci(replicates, type = "perc")

	  # boot.ci() returns NULL when every replicate has the same value. In the
	  # `percent` matrix, columns 4 and 5 are the lower and upper endpoints.
	  bounds <- if (is.null(ci)) {
	    c(NA_real_, NA_real_)
	  } else {
	    ci$percent[1, 4:5]
	  }

	  data.frame(
	    median = median(data$Salary),
	    low_ci = bounds[[1]],
	    high_ci = bounds[[2]]
	  )
	}
	```


	```{r}
	# Seed the random number generator so the bootstrap intervals are the same on
	# every knit.
	set.seed(2017)

	# Respondents who reported a salary.
	survey_set <- tab_space_survey %>%
	  filter(!is.na(Salary))

	# One bootstrap per (TabsSpaces, YearsCodedGroup) cell; each cell yields one
	# row holding the median and its interval bounds.
	survey_set_2 <- survey_set %>%
	  group_by(TabsSpaces, YearsCodedGroup) %>%
	  do(boot_median(.)) %>%
	  ungroup()

	survey_set_2
	```


	```{r}
	# Median salary per years-coded bucket, one line per tabs/spaces answer, with
	# the bootstrap interval (low_ci to high_ci) drawn as a shaded band.
	survey_set_2 %>%
	  ggplot(aes(x = YearsCodedGroup, y = median,
	             group = TabsSpaces, color = TabsSpaces)) +
	  geom_ribbon(aes(ymin = low_ci, ymax = high_ci, fill = TabsSpaces),
	              alpha = 0.25, size = 0) +
	  geom_line(size = 1.2, alpha = 0.8) +
	  scale_y_continuous(labels = dollar_format()) +
	  expand_limits(y = 0) +
	  # Colour and fill share one title so ggplot2 merges them into a single
	  # legend instead of adding a second one headed "TabsSpaces".
	  labs(color = "Uses",
	       fill = "Uses",
	       x = "Number of years someone coded as part of their job",
	       y = "Median annual salary (US Dollars)",
	       title = "Salary differences between developers who use tabs and spaces",
	       subtitle = paste("From", comma(nrow(survey_set)), "professional developers in the 2017 Developer Survey results, who provided tabs/spaces and salary"))
	```

	```{r}
	# Level order applied to Country after the per-country bootstrap.
	countries <- c(
	  "United States", "India", "United Kingdom", "Germany", "Canada", "Other"
	)

	# Professional developers with a reported salary. Countries outside the five
	# most frequent are lumped into a single "Other" level.
	survey_set <- tab_space_survey %>%
	  filter(Professional == "Professional developer", !is.na(Salary)) %>%
	  mutate(Country = fct_lump(Country, n = 5))

	# One bootstrap per (Country, TabsSpaces) cell.
	# NOTE(review): a lumped level that is not listed in `countries` becomes NA in
	# the final factor() call; confirm the top five match the list above.
	survey_set3 <- survey_set %>%
	  group_by(Country, TabsSpaces) %>%
	  do(boot_median(.)) %>%
	  ungroup() %>%
	  mutate(Country = factor(Country, levels = countries))

	survey_set3
	```

	```{r}
	# Median salary per tabs/spaces answer as bars, one facet per country, with
	# the bootstrap interval drawn as error bars.
	ggplot(survey_set3, aes(x = TabsSpaces, y = median, fill = TabsSpaces)) +
	  geom_col(alpha = 0.9, show.legend = FALSE) +
	  geom_errorbar(aes(ymin = low_ci, ymax = high_ci, width = 0.5)) +
	  # Each facet gets its own axis ranges.
	  facet_wrap(~ Country, scales = "free") +
	  scale_y_continuous(labels = dollar_format(), expand = c(0, 0)) +
	  labs(
	    x = '"Do you use tabs or spaces?"',
	    y = "Median annual salary (US Dollars)",
	    title = "Salary differences between developers who use tabs and spaces",
	    subtitle = paste("From", comma(nrow(survey_set)), "respondents in the 2017 Developer Survey results")
	  ) +
	  theme(strip.text.x = element_text(size = 11, family = "Roboto-Bold"))
	```