nhatley/tidyverse_tools_with_center_survey_data.R

## tidyverse_tools_with_center_survey_data.R
#code from start to finish

##install or update packages if neccessary
#install.packages("tidyverse")
#install.packages("haven")

##load packages in
library(tidyverse) #loads all "core" tidyverse packages like dplyr, tidyr, forcats, and ggplot2
library(haven)

##read dataset in with value labels (as_factor)
Apr17 <- read_sav("Apr17 public.sav", user_na = TRUE) %>% as_factor()

##create trump_approval variable and relevel it
Apr17 <- Apr17 %>%
  mutate(trump_approval = case_when(
    q1 == "Approve" & q1a == "Very strongly" ~ "Strongly approve",
    q1 == "Approve" & q1a != "Very strongly" ~ "Not strongly approve",
    q1 == "Disapprove" & q1a == "Very strongly" ~ "Strongly disapprove",
    q1 == "Disapprove" & q1a != "Very strongly" ~ "Not strongly disapprove",
    q1 == "Don't know/Refused (VOL.)" | q1a == "Don't know/Refused (VOL.)" ~ "Refused"
  ) #this parentheses closes our call to case_when
  %>% #and then sends it to fct_relevel with %>%
    fct_relevel("Strongly approve",
                "Not strongly approve",
                "Not strongly disapprove",
                "Strongly disapprove",
                "Refused"
    ) #this parentheses closes our call to fct_relevel
  ) #this parentheses closes our call to mutate

## collapse education variable into 3 categories
Apr17 <- Apr17 %>%
  mutate(educ_cat = fct_collapse(educ2,
                                 "High school grad or less" = c(
                                   "Less than high school (Grades 1-8 or no formal schooling)",
                                   "High school incomplete (Grades 9-11 or Grade 12 with NO diploma)",
                                   "High school graduate (Grade 12 with diploma or GED certificate)"
                                 ),
                                 "Some college" = c(
                                   "Some college, no degree (includes some community college)",
                                   "Two year associate degree from a college or university"
                                 ),
                                 "College grad+" = c(
                                   "Four year college or university degree/Bachelor's degree (e.g., BS, BA, AB)",
                                   "Some postgraduate or professional schooling, no postgraduate degree",
                                   "Postgraduate or professional degree, including master's, doctorate, medical or law degree"
                                 )
  ) #this parentheses closes our call to fct_collapse
  ) #this parentheses closes our call to mutate

##get trump_approval weighted totals
trump_approval <- Apr17 %>%
  group_by(trump_approval) %>%
  summarise(weighted_n = sum(weight))

##get trump_approval weighted proportions
trump_approval <- Apr17 %>%
  ##group by trump_approval to calculated weighted totals by taking the sum of the weights
  group_by(trump_approval) %>%
  summarise(weighted_n = sum(weight)) %>%
  ##add the weighted_group_size to get the total weighted n and
  ##divide weighted_n by weighted_group_size to get the proportions
  mutate(weighted_group_size = sum(weighted_n),
         weighted_estimate = weighted_n / weighted_group_size
  )


##get trump_approval by education
trump_estimates_educ <- Apr17 %>%
  #group by educ and trump approval to get weighted n's per group
  group_by(educ_cat, trump_approval) %>%
  #calculate the total number of people in each answer and education category using survey weights (weight)
  summarise(weighted_n = sum(weight)) %>%
  #group by education to calculate education category size
  group_by(educ_cat) %>%
  #add columns for total group size and the proportion
  mutate(weighted_group_size = sum(weighted_n),
         weighted_estimate = weighted_n/weighted_group_size)

##select only columns interested in for this analysis
###rename psraid to resp_id
Apr17 <- Apr17 %>%
  select(resp_id = psraid, weight, trump_approval, educ_cat, racethn, gen5)

##create Apr_17 long with gather
Apr17_long <- Apr17 %>%
  #gather educ_cat, racethn, gen5 into two columns:
  ##a key  called "subgroup variable" (educ_cat, racethn, gen5)
  ##and a value called "subgroup"
  gather(key = subgroup_variable, value = subgroup, educ_cat, racethn, gen5)

##get weighted estimates for every subgroup
trump_estimates <- Apr17_long %>%
  #group by subgroup_variable, subgroup, and trump approval to get weighted n of approval/disapproval for all subgroup cats
  group_by(subgroup_variable, subgroup, trump_approval) %>%
  #calculate the total number of people in each answer and education category using survey weights (weight)
  summarise(weighted_n = sum(weight)) %>%
  #group by subgroup only to calculate subgroup category size
  group_by(subgroup) %>%
  #add columns for total group size and the proportion
  mutate(weighted_group_size = sum(weighted_n),
         weighted_estimate = weighted_n/weighted_group_size)

#only want proportions so select out total categories
trump_estimates <- trump_estimates %>%
  select(-weighted_n, -weighted_group_size)

##create plot
trump_estimates %>%
  ##remove "Refused" category for Trump Approval
  filter(trump_approval != "Refused") %>%
  ##remove Refused categories in our subgroup values
  filter(!(subgroup %in% c("Don't know/Refused (VOL.)", "DK/Ref"))) %>%
  ggplot(
    aes(
      x = weighted_estimate,
      y = subgroup
    )
  ) +
  geom_point() +
  scale_x_continuous(limits = c(0, .8),
                     breaks = seq(0, .6, by = .2),
                     labels = scales::percent(seq(0, .6, by = .2), accuracy = 1)
  ) +
  facet_grid(cols = vars(trump_approval),
             rows = vars(subgroup_variable),
             scales = "free_y",
             space = "free"
  ) +
  theme_bw() +
  theme(axis.title.y = element_blank())
	#code from start to finish

	##install or update packages if neccessary
	#install.packages("tidyverse")
	#install.packages("haven")

	##load packages in
	library(tidyverse) #loads all "core" tidyverse packages like dplyr, tidyr, forcats, and ggplot2
	library(haven)

	##read dataset in with value labels (as_factor)
	Apr17 <- read_sav("Apr17 public.sav", user_na = TRUE) %>% as_factor()

	##create trump_approval variable and relevel it
	Apr17 <- Apr17 %>%
	mutate(trump_approval = case_when(
	q1 == "Approve" & q1a == "Very strongly" ~ "Strongly approve",
	q1 == "Approve" & q1a != "Very strongly" ~ "Not strongly approve",
	q1 == "Disapprove" & q1a == "Very strongly" ~ "Strongly disapprove",
	q1 == "Disapprove" & q1a != "Very strongly" ~ "Not strongly disapprove",
	q1 == "Don't know/Refused (VOL.)" \| q1a == "Don't know/Refused (VOL.)" ~ "Refused"
	) #this parentheses closes our call to case_when
	%>% #and then sends it to fct_relevel with %>%
	fct_relevel("Strongly approve",
	"Not strongly approve",
	"Not strongly disapprove",
	"Strongly disapprove",
	"Refused"
	) #this parentheses closes our call to fct_relevel
	) #this parentheses closes our call to mutate

	## collapse education variable into 3 categories
	Apr17 <- Apr17 %>%
	mutate(educ_cat = fct_collapse(educ2,
	"High school grad or less" = c(
	"Less than high school (Grades 1-8 or no formal schooling)",
	"High school incomplete (Grades 9-11 or Grade 12 with NO diploma)",
	"High school graduate (Grade 12 with diploma or GED certificate)"
	),
	"Some college" = c(
	"Some college, no degree (includes some community college)",
	"Two year associate degree from a college or university"
	),
	"College grad+" = c(
	"Four year college or university degree/Bachelor's degree (e.g., BS, BA, AB)",
	"Some postgraduate or professional schooling, no postgraduate degree",
	"Postgraduate or professional degree, including master's, doctorate, medical or law degree"
	)
	) #this parentheses closes our call to fct_collapse
	) #this parentheses closes our call to mutate

	##get trump_approval weighted totals
	trump_approval <- Apr17 %>%
	group_by(trump_approval) %>%
	summarise(weighted_n = sum(weight))

	##get trump_approval weighted proportions
	trump_approval <- Apr17 %>%
	##group by trump_approval to calculated weighted totals by taking the sum of the weights
	group_by(trump_approval) %>%
	summarise(weighted_n = sum(weight)) %>%
	##add the weighted_group_size to get the total weighted n and
	##divide weighted_n by weighted_group_size to get the proportions
	mutate(weighted_group_size = sum(weighted_n),
	weighted_estimate = weighted_n / weighted_group_size
	)


	##get trump_approval by education
	trump_estimates_educ <- Apr17 %>%
	#group by educ and trump approval to get weighted n's per group
	group_by(educ_cat, trump_approval) %>%
	#calculate the total number of people in each answer and education category using survey weights (weight)
	summarise(weighted_n = sum(weight)) %>%
	#group by education to calculate education category size
	group_by(educ_cat) %>%
	#add columns for total group size and the proportion
	mutate(weighted_group_size = sum(weighted_n),
	weighted_estimate = weighted_n/weighted_group_size)

	##select only columns interested in for this analysis
	###rename psraid to resp_id
	Apr17 <- Apr17 %>%
	select(resp_id = psraid, weight, trump_approval, educ_cat, racethn, gen5)

	##create Apr_17 long with gather
	Apr17_long <- Apr17 %>%
	#gather educ_cat, racethn, gen5 into two columns:
	##a key called "subgroup variable" (educ_cat, racethn, gen5)
	##and a value called "subgroup"
	gather(key = subgroup_variable, value = subgroup, educ_cat, racethn, gen5)

	##get weighted estimates for every subgroup
	trump_estimates <- Apr17_long %>%
	#group by subgroup_variable, subgroup, and trump approval to get weighted n of approval/disapproval for all subgroup cats
	group_by(subgroup_variable, subgroup, trump_approval) %>%
	#calculate the total number of people in each answer and education category using survey weights (weight)
	summarise(weighted_n = sum(weight)) %>%
	#group by subgroup only to calculate subgroup category size
	group_by(subgroup) %>%
	#add columns for total group size and the proportion
	mutate(weighted_group_size = sum(weighted_n),
	weighted_estimate = weighted_n/weighted_group_size)

	#only want proportions so select out total categories
	trump_estimates <- trump_estimates %>%
	select(-weighted_n, -weighted_group_size)

	##create plot
	trump_estimates %>%
	##remove "Refused" category for Trump Approval
	filter(trump_approval != "Refused") %>%
	##remove Refused categories in our subgroup values
	filter(!(subgroup %in% c("Don't know/Refused (VOL.)", "DK/Ref"))) %>%
	ggplot(
	aes(
	x = weighted_estimate,
	y = subgroup
	)
	) +
	geom_point() +
	scale_x_continuous(limits = c(0, .8),
	breaks = seq(0, .6, by = .2),
	labels = scales::percent(seq(0, .6, by = .2), accuracy = 1)
	) +
	facet_grid(cols = vars(trump_approval),
	rows = vars(subgroup_variable),
	scales = "free_y",
	space = "free"
	) +
	theme_bw() +
	theme(axis.title.y = element_blank())