# FrankRuns/selection-bias-visual.r

## selection-bias-visual.r
# purpose: visualize linear trend for all data and subset of data

# libraries
library(dplyr)
library(ggplot2)

# read data
# NOTE(review): "mycsvfile.csv" is a placeholder path resolved against the
# working directory; point it at the real data file before running.
d <- read.csv("mycsvfile.csv")

# quick look
head(d)

# make it smaller: keep only the three columns used below
d <- d %>% select(traffic_std, stars_std, is_top_ten)

# rows flagged is_top_ten == 1, computed once and reused for the subset
# correlation and for both overlay layers of the plot
top_ten <- d %>% filter(is_top_ten == 1)

# correlations: all rows first, then the top-ten subset only
# use = "pairwise.complete.obs" ignores rows with a missing value instead of
# returning NA, so the printed numbers describe the same rows the fitted
# lines below are drawn from
cor(d$traffic_std, d$stars_std, use = "pairwise.complete.obs")
top_ten %>%
  summarize(
    cor_top_ten = cor(traffic_std, stars_std, use = "pairwise.complete.obs")
  )

# visualize
# base layers: every row with its linear fit; overlay layers: the top-ten
# subset with its own linear fit (the overlays inherit the x/y mapping)
ggplot(d, aes(x = traffic_std, y = stars_std)) +
  geom_point(color = "steel blue", alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ x, color = "blue") +
  geom_point(data = top_ten, color = "pink", alpha = 0.5) +
  geom_smooth(data = top_ten, method = "lm", formula = y ~ x, color = "red") +
  labs(x = "Traffic (standardized)",
       y = "Star Rating (standardized)",
       title = "The Top 10% (red) Ranked Restaurants... \nhave a Different Correlation than all (blue+red) Restaurants") +
  theme_minimal()
	# purpose: visualize linear trend for all data and subset of data
	# NOTE(review): this section repeats the script at the top of this file
	# step for step, so running the file executes the analysis twice; consider
	# keeping only one copy.

	# libraries
	library(dplyr)
	library(ggplot2)

	# read data
	# NOTE(review): "mycsvfile.csv" is a placeholder path resolved against the
	# working directory; point it at the real data file before running.
	d <- read.csv("mycsvfile.csv")

	# quick look
	head(d)

	# make it smaller: keep only the three columns used below
	d <- d %>% select(traffic_std, stars_std, is_top_ten)

	# rows flagged is_top_ten == 1, computed once and reused for the subset
	# correlation and for both overlay layers of the plot
	top_ten <- d %>% filter(is_top_ten == 1)

	# correlations: all rows first, then the top-ten subset only
	# use = "pairwise.complete.obs" ignores rows with a missing value instead of
	# returning NA, so the printed numbers describe the same rows the fitted
	# lines below are drawn from
	cor(d$traffic_std, d$stars_std, use = "pairwise.complete.obs")
	top_ten %>%
	  summarize(
	    cor_top_ten = cor(traffic_std, stars_std, use = "pairwise.complete.obs")
	  )

	# visualize
	# base layers: every row with its linear fit; overlay layers: the top-ten
	# subset with its own linear fit (the overlays inherit the x/y mapping)
	ggplot(d, aes(x = traffic_std, y = stars_std)) +
	  geom_point(color = "steel blue", alpha = 0.5) +
	  geom_smooth(method = "lm", formula = y ~ x, color = "blue") +
	  geom_point(data = top_ten, color = "pink", alpha = 0.5) +
	  geom_smooth(data = top_ten, method = "lm", formula = y ~ x, color = "red") +
	  labs(x = "Traffic (standardized)",
	       y = "Star Rating (standardized)",
	       title = "The Top 10% (red) Ranked Restaurants... \nhave a Different Correlation than all (blue+red) Restaurants") +
	  theme_minimal()