## ensley/mlb_payroll_wins.r

## mlb_payroll_wins.r
library(ggplot2)
library(ggthemes)
library(dplyr)

# Load the standings/payroll table and keep only the seasons (1985 onward)
# and the columns that the rest of the script uses.
df <- read.csv('mlb-standings-and-payroll.csv') %>%
  filter(year >= 1985) %>%
  select(tm, year, w, g, wins_losses, est_payroll) %>%
  as_tibble()  # replaces the deprecated tbl_df(); re-exported by dplyr

# Fix up old team name abbreviations: `tm` is matched against the lookup's
# `historic_team` column, which brings in the `modern_team` abbreviation.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
team.lookups <- read.csv('team-lookups.csv', stringsAsFactors = FALSE)
df <- left_join(df, team.lookups, by = c('tm' = 'historic_team'))

# Add team color data, keyed on the modern abbreviation.
# NOTE(review): presumably this is where `team_color` (used for plotting)
# comes from -- confirm against the columns of team-colors.csv.
team.colors <- read.csv('team-colors.csv', stringsAsFactors = FALSE)
df <- left_join(df, team.colors, by = c('modern_team' = 'tm'))

# Add each team's payroll percentile within its season (0 = lowest payroll
# that year, 1 = highest), rather than a z-score. The grouping is dropped
# afterwards so later steps work on the whole table instead of silently
# running once per year.
df <- df %>%
  dplyr::group_by(year) %>%
  dplyr::mutate(rank = percent_rank(est_payroll)) %>%
  dplyr::ungroup()

# Treat division as a factor so its levels give the list of divisions to plot.
df$division <- as.factor(df$division)
divisions <- levels(df$division)

# League-wide data with the facetting column removed, so that the grey
# reference curve fitted to it is repeated in every team panel. It does not
# depend on the division, so it is built once rather than on every iteration.
df.league <- within(df, modern_team <- NULL)

# Create the plots, one division at a time; each is saved as "<division>.png"
# in the working directory.
for (div in divisions) {
  df.division <- filter(df, division == div)
  p <- ggplot(df.division, aes(x = rank, y = wins_losses, color = team_color)) +
    geom_point(alpha = 0.75, size = 4) +
    # reference lines at a .500 record and at the 50th payroll percentile
    geom_hline(yintercept = 0.5) + geom_vline(xintercept = 0.5) +
    # quadratic fit over all teams (grey), shown in every panel.
    # NOTE(review): on ggplot2 >= 3.4.0 line widths are set with `linewidth`;
    # `size` is kept here for compatibility with older releases.
    stat_smooth(data = df.league, color = 'grey', size = 1,
                method = 'lm', formula = y ~ poly(x, 2), se = FALSE) +
    # quadratic fit over the panel's own points, drawn in the team color
    stat_smooth(size = 2, method = 'lm', formula = y ~ poly(x, 2), se = FALSE) +
    # team_color values are used directly as colors
    scale_color_identity() +
    # x is the within-season payroll percentile (not a z-score), hence the
    # percent labels; `limits` is spelled out instead of the partial `limit`
    scale_x_continuous(name = 'Payroll Percentile\n(within season)',
                       breaks = c(0, 0.5, 1), limits = c(-0.1, 1.1),
                       labels = c('0%', '50%', '100%')) +
    scale_y_continuous(name = 'Win/Loss %', breaks = seq(0.3, 0.7, 0.1),
                       limits = c(0.25, 0.75)) +
    facet_wrap(~modern_team, ncol = 5, scales = 'free_x') +
    theme_fivethirtyeight() +
    ggtitle(div)
  ggsave(filename = paste0(div, ".png"), plot = p, width = 15, height = 4)
}


# Quadratic regression of win percentage on payroll percentile, fitted to
# all rows of df at once.
fit <- lm(wins_losses ~ poly(rank, 2), data = df)

# Compare every row with what the fit predicts for its payroll percentile:
#   expected_winpct / expected_w : predicted win percentage / wins over g games
#   diff_w / diff_winpct         : actual minus predicted
#   posneg                       : sign of diff_winpct, as a factor
df <- dplyr::mutate(
  df,
  expected_winpct = predict(fit, newdata = data.frame(rank = rank)),
  expected_w = expected_winpct * g,
  diff_w = w - expected_w,
  diff_winpct = wins_losses - expected_winpct,
  posneg = as.factor(sign(diff_winpct))
)

# Mean wins above/below expectation per modern team, largest first.
rankings <- arrange(
  summarise(group_by(df, modern_team), avg_diff = mean(diff_w)),
  desc(avg_diff)
)
	library(ggplot2)
	library(ggthemes)
	library(dplyr)

	# NOTE(review): from here on the script repeats the pipeline earlier in this
	# file step for step (it re-reads the CSVs and regenerates the same plots
	# and rankings). Consider deleting one of the two copies.

	# load payroll and win/loss data, keeping seasons from 1985 on
	df <- read.csv('mlb-standings-and-payroll.csv') %>%
	  filter(year >= 1985) %>%
	  select(tm, year, w, g, wins_losses, est_payroll) %>%
	  as_tibble()  # replaces the deprecated tbl_df(); re-exported by dplyr

	# fix up old team name abbreviations (`tm` -> `modern_team` via the lookup's
	# `historic_team` column); FALSE is spelled out because `F` is reassignable
	team.lookups <- read.csv('team-lookups.csv', stringsAsFactors = FALSE)
	df <- left_join(df, team.lookups, by = c('tm' = 'historic_team'))

	# add team color data, keyed on the modern abbreviation
	team.colors <- read.csv('team-colors.csv', stringsAsFactors = FALSE)
	df <- left_join(df, team.colors, by = c('modern_team' = 'tm'))

	# add payroll percentile for each team within its season, rather than
	# z-score; ungroup so later steps do not silently run per year
	df <- df %>%
	  dplyr::group_by(year) %>%
	  dplyr::mutate(rank = percent_rank(est_payroll)) %>%
	  dplyr::ungroup()

	df$division <- as.factor(df$division)
	divisions <- levels(df$division)

	# league-wide data without the facetting column, so the grey reference curve
	# fitted to it appears in every team panel; built once, outside the loop
	df.league <- within(df, modern_team <- NULL)

	# create the plots, one division at a time; each is saved as "<division>.png"
	for (div in divisions) {
	  df.division <- filter(df, division == div)
	  p <- ggplot(df.division, aes(x = rank, y = wins_losses, color = team_color)) +
	    geom_point(alpha = 0.75, size = 4) +
	    # reference lines at a .500 record and at the 50th payroll percentile
	    geom_hline(yintercept = 0.5) + geom_vline(xintercept = 0.5) +
	    # quadratic fit over all teams (grey), shown in every panel
	    stat_smooth(data = df.league, color = 'grey', size = 1,
	                method = 'lm', formula = y ~ poly(x, 2), se = FALSE) +
	    # quadratic fit over the panel's own points, drawn in the team color
	    stat_smooth(size = 2, method = 'lm', formula = y ~ poly(x, 2), se = FALSE) +
	    scale_color_identity() +
	    # x is the within-season payroll percentile (not a z-score), hence the
	    # percent labels; `limits` is spelled out instead of the partial `limit`
	    scale_x_continuous(name = 'Payroll Percentile\n(within season)',
	                       breaks = c(0, 0.5, 1), limits = c(-0.1, 1.1),
	                       labels = c('0%', '50%', '100%')) +
	    scale_y_continuous(name = 'Win/Loss %', breaks = seq(0.3, 0.7, 0.1),
	                       limits = c(0.25, 0.75)) +
	    facet_wrap(~modern_team, ncol = 5, scales = 'free_x') +
	    theme_fivethirtyeight() +
	    ggtitle(div)
	  ggsave(filename = paste0(div, ".png"), plot = p, width = 15, height = 4)
	}


	# quadratic regression of win percentage on payroll percentile
	fit <- lm(wins_losses ~ poly(rank, 2), data = df)

	# actual results versus what the fit predicts for each row's payroll percentile
	df <- dplyr::mutate(df, expected_winpct = predict(fit, newdata = data.frame(rank = rank)))
	df <- dplyr::mutate(df, expected_w = expected_winpct * g)
	df <- dplyr::mutate(df, diff_w = w - expected_w)
	df <- dplyr::mutate(df, diff_winpct = wins_losses - expected_winpct)
	df <- dplyr::mutate(df, posneg = as.factor(sign(diff_winpct)))

	# mean wins above/below expectation per modern team, largest first
	rankings <- df %>%
	  group_by(modern_team) %>%
	  summarise(avg_diff = mean(diff_w)) %>%
	  arrange(desc(avg_diff))