## juliasilge/january_meta.R

## january_meta.R
## this analysis assumes a dataframe `post_views` with columns:
##     PostId
##     CreationDate
##     Tag
##     AnswerCount
##     ViewCount

library(tidyverse)

## Boxplot of views per question (log10 y axis) for each answer count.
## One row per PostId is kept, AnswerCount is converted to a factor, and the
## "Other" level produced by fct_lump() is relabelled "More than 3".
post_views %>%
  distinct(PostId, .keep_all = TRUE) %>%
  mutate(
    AnswerCount = fct_recode(
      fct_lump(as.factor(AnswerCount)),
      `More than 3` = "Other"
    )
  ) %>%
  ggplot(aes(x = AnswerCount, y = ViewCount)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(
    title = "Answers and views on Stack Overflow questions",
    subtitle = "There is an enormous amount of question-to-question variation",
    x = "Answers per question",
    y = "Views per question"
  )

## Linear models of AnswerCount on ViewCount, each fitted to one row per
## PostId. Every fit is followed by its printed summary.

## Answers as a linear function of raw views.
simple_model <- distinct(post_views, PostId, .keep_all = TRUE) %>%
  lm(formula = AnswerCount ~ ViewCount, data = .)

summary(simple_model)

## Same model with the question's creation date as an extra predictor.
model_with_time <- distinct(post_views, PostId, .keep_all = TRUE) %>%
  lm(formula = AnswerCount ~ ViewCount + CreationDate, data = .)

summary(model_with_time)

## Views only, with the intercept removed (`0 +` in the formula).
model_no_intercept <- distinct(post_views, PostId, .keep_all = TRUE) %>%
  lm(formula = AnswerCount ~ 0 + ViewCount, data = .)

summary(model_no_intercept)

## Answers against log10(views): the slope is the change in answers per
## tenfold increase in views.
log_model <- distinct(post_views, PostId, .keep_all = TRUE) %>%
  lm(formula = AnswerCount ~ log10(ViewCount), data = .)

summary(log_model)


library(broom)

## Fit AnswerCount ~ log10(ViewCount) separately for every tag that has more
## than 10,000 rows in post_views. Missing answer counts are treated as 0.
## No distinct(PostId) step is applied here.
## NOTE(review): presumably post_views has one row per question/tag pair, so
## de-duplicating by PostId would drop tags -- confirm.
trained_models <- post_views %>%
  replace_na(list(AnswerCount = 0)) %>%
  add_count(Tag) %>%
  filter(n > 1e4) %>%
  # Name the list-column explicitly; the unnamed `nest(-Tag)` form is
  # deprecated in tidyr >= 1.0.
  nest(data = -Tag) %>%
  mutate(Model = map(data, ~ lm(AnswerCount ~ log10(ViewCount), data = .x)))


## One row per tag holding the coefficient on log10(ViewCount).
## tidyr >= 1.0 no longer accepts an expression inside unnest(), so the tidied
## coefficients are first stored in a list-column and then unnested. Only Tag
## and the tidied columns are kept, so the result has Tag plus the columns
## returned by tidy().
slopes <- trained_models %>%
  mutate(tidied = map(Model, tidy)) %>%
  select(Tag, tidied) %>%
  unnest(tidied) %>%
  filter(term == "log10(ViewCount)")

slopes

library(ggrepel)

## Median of the per-tag slopes; drawn as the dashed reference line and quoted
## in the subtitle.
median_slope <- slopes %>% pull(estimate) %>% median()

## Per-tag slope against the number of rows for that tag (log10 x axis),
## labelled by tag. The inner join keeps only tags that have a fitted slope.
post_views %>%
  count(Tag, sort = TRUE) %>%
  # Explicit key: avoids the implicit natural join (and its message) and
  # keeps the join on Tag even if `slopes` later gains another shared column.
  inner_join(slopes, by = "Tag") %>%
  ggplot(aes(n, estimate, label = Tag)) +
  # NOTE(review): ggplot2 >= 3.4 prefers `linewidth` for line thickness;
  # `size` is kept so older ggplot2 versions behave the same.
  geom_hline(yintercept = median_slope,
             lty = 2, color = "gray70", size = 2, alpha = 0.8) +
  geom_point() +
  geom_text_repel(family = "IBMPlexSans-Medium") +
  scale_x_log10() +
  labs(
    x = "Number of questions",
    y = "Slope (Number of answers per 10x increase in views)",
    title = "Views and answers on Stack Overflow by tag",
    subtitle = paste(
      "The median increase in answers per 10x increase in views for this group of technologies is",
      round(median_slope, 2)
    )
  )
	## this analysis assumes a dataframe `post_views` with columns:
	## PostId
	## CreationDate
	## Tag
	## AnswerCount
	## ViewCount

	library(tidyverse)

	## NOTE(review): the code from here to the end of the file repeats the
	## analysis at the top of this file.
	## Boxplot of views per question (log10 y axis) for each answer count.
	## One row per PostId is kept, AnswerCount is converted to a factor, and the
	## "Other" level produced by fct_lump() is relabelled "More than 3".
	post_views %>%
	  distinct(PostId, .keep_all = TRUE) %>%
	  mutate(
	    AnswerCount = fct_recode(
	      fct_lump(as.factor(AnswerCount)),
	      `More than 3` = "Other"
	    )
	  ) %>%
	  ggplot(aes(x = AnswerCount, y = ViewCount)) +
	  geom_boxplot() +
	  scale_y_log10() +
	  labs(
	    title = "Answers and views on Stack Overflow questions",
	    subtitle = "There is an enormous amount of question-to-question variation",
	    x = "Answers per question",
	    y = "Views per question"
	  )

	## Linear models of AnswerCount on ViewCount, each fitted to one row per
	## PostId. Every fit is followed by its printed summary.

	## Answers as a linear function of raw views.
	simple_model <- distinct(post_views, PostId, .keep_all = TRUE) %>%
	  lm(formula = AnswerCount ~ ViewCount, data = .)

	summary(simple_model)

	## Same model with the question's creation date as an extra predictor.
	model_with_time <- distinct(post_views, PostId, .keep_all = TRUE) %>%
	  lm(formula = AnswerCount ~ ViewCount + CreationDate, data = .)

	summary(model_with_time)

	## Views only, with the intercept removed (`0 +` in the formula).
	model_no_intercept <- distinct(post_views, PostId, .keep_all = TRUE) %>%
	  lm(formula = AnswerCount ~ 0 + ViewCount, data = .)

	summary(model_no_intercept)

	## Answers against log10(views): the slope is the change in answers per
	## tenfold increase in views.
	log_model <- distinct(post_views, PostId, .keep_all = TRUE) %>%
	  lm(formula = AnswerCount ~ log10(ViewCount), data = .)

	summary(log_model)


	library(broom)

	## Fit AnswerCount ~ log10(ViewCount) separately for every tag that has more
	## than 10,000 rows in post_views. Missing answer counts are treated as 0.
	## No distinct(PostId) step is applied here.
	## NOTE(review): presumably post_views has one row per question/tag pair, so
	## de-duplicating by PostId would drop tags -- confirm.
	trained_models <- post_views %>%
	  replace_na(list(AnswerCount = 0)) %>%
	  add_count(Tag) %>%
	  filter(n > 1e4) %>%
	  # Name the list-column explicitly; the unnamed `nest(-Tag)` form is
	  # deprecated in tidyr >= 1.0.
	  nest(data = -Tag) %>%
	  mutate(Model = map(data, ~ lm(AnswerCount ~ log10(ViewCount), data = .x)))


	## One row per tag holding the coefficient on log10(ViewCount).
	## tidyr >= 1.0 no longer accepts an expression inside unnest(), so the tidied
	## coefficients are first stored in a list-column and then unnested. Only Tag
	## and the tidied columns are kept, so the result has Tag plus the columns
	## returned by tidy().
	slopes <- trained_models %>%
	  mutate(tidied = map(Model, tidy)) %>%
	  select(Tag, tidied) %>%
	  unnest(tidied) %>%
	  filter(term == "log10(ViewCount)")

	slopes

	library(ggrepel)

	## Median of the per-tag slopes; drawn as the dashed reference line and quoted
	## in the subtitle.
	median_slope <- slopes %>% pull(estimate) %>% median()

	## Per-tag slope against the number of rows for that tag (log10 x axis),
	## labelled by tag. The inner join keeps only tags that have a fitted slope.
	post_views %>%
	  count(Tag, sort = TRUE) %>%
	  # Explicit key: avoids the implicit natural join (and its message) and
	  # keeps the join on Tag even if `slopes` later gains another shared column.
	  inner_join(slopes, by = "Tag") %>%
	  ggplot(aes(n, estimate, label = Tag)) +
	  # NOTE(review): ggplot2 >= 3.4 prefers `linewidth` for line thickness;
	  # `size` is kept so older ggplot2 versions behave the same.
	  geom_hline(yintercept = median_slope,
	             lty = 2, color = "gray70", size = 2, alpha = 0.8) +
	  geom_point() +
	  geom_text_repel(family = "IBMPlexSans-Medium") +
	  scale_x_log10() +
	  labs(
	    x = "Number of questions",
	    y = "Slope (Number of answers per 10x increase in views)",
	    title = "Views and answers on Stack Overflow by tag",
	    subtitle = paste(
	      "The median increase in answers per 10x increase in views for this group of technologies is",
	      round(median_slope, 2)
	    )
	  )