# arvi1000/zach_alt_spelling.R

## zach_alt_spelling.R
library(tidyverse)

# Yearly counts for the two spellings (Zachary / Zachery), 1950 onwards.
# n is summed over every babynames row that shares the same name and year.
z_dat <- babynames::babynames %>%
  filter(year >= 1950, grepl('^zach(a|e)ry$', tolower(name))) %>%
  group_by(name, year) %>%
  summarise(n = sum(n)) %>%
  # summarise() only peels off the last grouping level; drop the leftover
  # grouping by name so later steps start from a plain, ungrouped table
  ungroup()

# When was each spelling's peak year? (Zachary: 1993)
z_dat %>%
  group_by(name) %>%
  # which.max() gives the position of the largest yearly count; name the
  # result so the output column is not labelled with the raw expression
  summarise(peak_year = year[which.max(n)])

# Share of Zachery among (Zachary + Zachery) in a given year (5% in 86)
z_dat %>%
  filter(year == 1986) %>%
  # Select the Zachery count by name, not by row position: n[2] silently
  # depends on row order and is NA when only one spelling exists that year.
  # sum() over the match yields 0 when Zachery is absent.
  with(sum(n[name == 'Zachery']) / sum(n))

# Line chart of yearly counts: one panel per spelling, each panel with its
# own y axis, legend suppressed because the facet strips already label names
ggplot(z_dat, aes(x = year, y = n, color = name)) +
  geom_line() +
  facet_wrap(~ name, scales = "free_y") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "USA baby name frequency by year",
    subtitle = "source: Social Security Administration",
    x = "year",
    y = "occurrences"
  ) +
  theme_light() +
  theme(legend.position = "none")

# Incidence of the less common spelling is pretty well correlated with the
# name's popularity: plot the combined count (all_zach) and Zachery's share
# of it (pct_zachery) as two stacked panels for 1980-2015.
z_dat %>%
  # drop any leftover grouping before reshaping
  ungroup() %>%
  filter(name %in% c('Zachary', 'Zachery')) %>%
  # One row per year with one column per spelling. A spelling missing in a
  # year becomes NA, the same result a full outer merge on year would give.
  pivot_wider(names_from = name, values_from = n) %>%
  mutate(all_zach = Zachary + Zachery,
         pct_zachery = Zachery / all_zach) %>%
  select(year, all_zach, pct_zachery) %>%
  filter(between(year, 1980, 2015)) %>%
  # Back to long form for faceting. tidyr is already attached via tidyverse,
  # so this avoids calling data.table::melt() on a plain data.frame.
  pivot_longer(-year, names_to = 'variable', values_to = 'value') %>%
  ggplot(aes(x = year, y = value, color = variable)) +
  geom_line() +
  facet_wrap(~variable, ncol = 1, scales = 'free_y') +
  labs(y = NULL) +
  theme_light() + theme(legend.position = 'none') +
  scale_color_brewer(palette = 1, type = 'qual')
	library(tidyverse)

	# Yearly counts for the two spellings (Zachary / Zachery), 1950 onwards.
	# n is summed over every babynames row that shares the same name and year.
	z_dat <- babynames::babynames %>%
	  # the alternation must be a plain `|`: `\|` is not a valid escape in an
	  # R string literal and stops the script from parsing
	  filter(year >= 1950, grepl('^zach(a|e)ry$', tolower(name))) %>%
	  group_by(name, year) %>%
	  summarise(n = sum(n)) %>%
	  # summarise() only peels off the last grouping level; drop the leftover
	  # grouping by name so later steps start from a plain, ungrouped table
	  ungroup()

	# When was each spelling's peak year? (Zachary: 1993)
	z_dat %>%
	  group_by(name) %>%
	  # which.max() gives the position of the largest yearly count; name the
	  # result so the output column is not labelled with the raw expression
	  summarise(peak_year = year[which.max(n)])

	# Share of Zachery among (Zachary + Zachery) in a given year (5% in 86)
	z_dat %>%
	  filter(year == 1986) %>%
	  # Select the Zachery count by name, not by row position: n[2] silently
	  # depends on row order and is NA when only one spelling exists that year.
	  # sum() over the match yields 0 when Zachery is absent.
	  with(sum(n[name == 'Zachery']) / sum(n))

	# Line chart of yearly counts: one panel per spelling, each panel with its
	# own y axis, legend suppressed because the facet strips already label names
	ggplot(z_dat, aes(x = year, y = n, color = name)) +
	  geom_line() +
	  facet_wrap(~ name, scales = "free_y") +
	  scale_y_continuous(labels = scales::comma) +
	  labs(
	    title = "USA baby name frequency by year",
	    subtitle = "source: Social Security Administration",
	    x = "year",
	    y = "occurrences"
	  ) +
	  theme_light() +
	  theme(legend.position = "none")

	# Incidence of the less common spelling is pretty well correlated with the
	# name's popularity: plot the combined count (all_zach) and Zachery's share
	# of it (pct_zachery) as two stacked panels for 1980-2015.
	z_dat %>%
	  # drop any leftover grouping before reshaping
	  ungroup() %>%
	  filter(name %in% c('Zachary', 'Zachery')) %>%
	  # One row per year with one column per spelling. A spelling missing in a
	  # year becomes NA, the same result a full outer merge on year would give.
	  pivot_wider(names_from = name, values_from = n) %>%
	  mutate(all_zach = Zachary + Zachery,
	         pct_zachery = Zachery / all_zach) %>%
	  select(year, all_zach, pct_zachery) %>%
	  filter(between(year, 1980, 2015)) %>%
	  # Back to long form for faceting. tidyr is already attached via tidyverse,
	  # so this avoids calling data.table::melt() on a plain data.frame.
	  pivot_longer(-year, names_to = 'variable', values_to = 'value') %>%
	  ggplot(aes(x = year, y = value, color = variable)) +
	  geom_line() +
	  facet_wrap(~variable, ncol = 1, scales = 'free_y') +
	  labs(y = NULL) +
	  theme_light() + theme(legend.position = 'none') +
	  scale_color_brewer(palette = 1, type = 'qual')