# naomispence/deandra

## deandra
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
# Print numbers in fixed notation instead of scientific notation.
options(scipen = 999)

# Load the wave5addhealth data set into the workspace
# (presumably shipped with the lehmansociology package -- confirm).
data(wave5addhealth)

# Keep only respondents whose H5TO6 code is below 97 (97 is presumably a
# skip / not-applicable code -- confirm against the codebook).
# A bare `H5TO6 < 97` mask is NA wherever H5TO6 is missing, and indexing a
# data frame with NA returns a row in which EVERY column is NA. which() keeps
# only the rows where the comparison is TRUE, so no blank records are injected
# into the tables, graphs and tests that follow.
wave5addhealth <- wave5addhealth[which(wave5addhealth$H5TO6 < 97), ]


####FREQUENCY TABLES####
#change variable names and titles

#CATEGORICAL VARIABLE
# Frequency table of H5PE4 while it still holds the raw numeric codes.
frequency(
  wave5addhealth$H5PE4,
  title = "Frequency Distribution of Personality and Avoidance, Wave 5 Add Health"
)

# Turn codes 1-5 into a factor whose labels name the category each code stands for.
wave5addhealth[["H5PE4"]] <- factor(
  wave5addhealth[["H5PE4"]],
  levels = 1:5,
  labels = c(
    "Strongly Agree",
    "Agree",
    "Neither Agree Nor Disagree",
    "Disagree",
    "Strongly Disagree"
  )
)

# Same frequency table again, now on the labelled factor.
frequency(
  wave5addhealth$H5PE4,
  title = "Frequency Distribution of Personality and Avoidance, Wave 5 Add Health"
)

#write interpretations that pick out interesting and important information from your frequency tables
#CATEGORICAL VARIABLE INTERPRETATION: 40.91% of the sample who ever smoked disagree and don't go out of their way to avoid problems. 27.65% neither agree nor disagree on going out of their way to avoid problems. 18.67% agree on going out of their way to avoid problems.
#Remember your sample is no longer people who have tried to smoke, its whoever smoked
#CATEGORICAL VARIABLE
# Frequency table of H5TO6 while it still holds the raw numeric codes.
frequency(
  wave5addhealth$H5TO6,
  title = "Frequency Distribution of People Trying to Quit Smoking, Wave 5 Add Health"
)

# Turn the 0/1 codes into a factor labelled "No" / "Yes".
wave5addhealth[["H5TO6"]] <- factor(
  wave5addhealth[["H5TO6"]],
  levels = 0:1,
  labels = c("No", "Yes")
)

#CATEGORICAL VARIABLE INTERPRETATION: 49.2% of the sample reported yes to trying to quit smoking. 16.2% of the sample reported no to trying to quit smoking.

# Percent bar graph of H5PE4; rows with a missing H5PE4 are left out so the
# bar heights are shares of the valid responses.
ggplot(
  data = wave5addhealth[!is.na(wave5addhealth$H5PE4), ],
  mapping = aes(x = H5PE4)
) +
  geom_bar(
    aes(y = ..count.. / sum(..count..)),
    fill = "lavender",
    color = "pink"
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Bar Graph of People Who Avoid Dealing with Life Problems, Wave 5 Add Health",
    x = "Experiences with Avoidant Behavior",
    y = "Percent"
  ) +
  theme(axis.text.x = element_text(angle = -25))
# Interpretation: It is more common for people to disagree with having an avoidant
# personality when it comes to dealing with life problems. The least common group is
# people who strongly agree that they avoid problems in their lives.

# Most frequent category of H5PE4.
MODE(wave5addhealth$H5PE4)

# Percent bar graph of H5TO6; rows with a missing H5TO6 are left out so the
# bar heights are shares of the valid responses.
ggplot(
  data = wave5addhealth[!is.na(wave5addhealth$H5TO6), ],
  mapping = aes(x = H5TO6)
) +
  geom_bar(
    aes(y = ..count.. / sum(..count..)),
    fill = "purple",
    color = "blue"
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Bar Graph of People Who Have Ever Smoked, Wave 5 Add Health",
    x = "Amount of People Who Have Tried to Quit Smoking",
    y = "Percent"
  ) +
  theme(axis.text.x = element_text(angle = -25))
# Interpretation: It is more common for people to have tried to quit smoking than not.

# Most frequent category of H5TO6.
MODE(wave5addhealth$H5TO6)

## BIVARIATE TABLE (CROSSTAB) FOR 2 CATEGORICAL VARIABLES
# The formula must be written dependent ~ independent. The order matters for reading
# the table correctly: we "percent down, compare across" to see how the dependent
# variable differs between the groups of the independent variable.
# Here H5TO6 is the dependent variable and H5PE4 is the independent variable.

lehmansociology::crosstab(
  H5TO6 ~ H5PE4,
  data = wave5addhealth,
  format = "column_percent",
  title = "People Who've Tried to Quit Smoking by Avoidant Behavior"
)
#Interpretation: People more often than not have tried to quit smoking.


####FREQUENCY TABLES####
#change variable names and titles

#CATEGORICAL VARIABLE
# Frequency table of H5SS0B (still numeric codes at this point in the script).
frequency(
  wave5addhealth$H5SS0B,
  title = "Frequency Distribution of Feeling Depressed the Last 7 Days"
)

# QUANTITATIVE VARIABLE
# Frequency table of H5ID21 with the cumulative percent column switched on.
frequency(
  wave5addhealth$H5ID21,
  title = "Amount of Times Eating At a Fast Food Restaurant",
  cumulative.percent = TRUE
)

#write interpretations that pick out interesting and important information from your frequency tables
#CATEGORICAL VARIABLE INTERPRETATION: 69.03% of people never or rarely got depressed in the last 7 days. 24.14% of people reported getting depressed sometimes within the last 7 days.
#QUANTITATIVE VARIABLE INTERPRETATION: It was reported that 4064 were valid in their responses when asked the question about eating at fast food places. 132 people reported having invalid responses.

# Turn codes 1-4 of H5SS0B into a factor whose labels name each category.
# Every label is written on a single line: a string literal that wraps onto the
# next source line keeps the line break and the indentation inside the label,
# which then shows up in every table and on the graph axis.
wave5addhealth$H5SS0B <- factor(
  wave5addhealth$H5SS0B,
  levels = c(1, 2, 3, 4),
  labels = c(
    "Never or Rarely",
    "Sometimes",
    "A Lot of the Time",
    "Most of the Time or All of the Time"
  )
)

# Frequency table and most frequent category of the labelled variable.
frequency(wave5addhealth$H5SS0B, title = "Frequency Distribution of Feeling Depressed the Last 7 Days")
MODE(wave5addhealth$H5SS0B)

# Percent bar graph of H5SS0B; rows with a missing H5SS0B are dropped first so the
# bar heights are shares of the valid responses.
# The title and x-axis label are spelled out correctly here (no doubled "of",
# "Occurrence" with two r's) because they are printed on the figure.
ggplot(data = subset(wave5addhealth, !is.na(H5SS0B)), aes(x = H5SS0B)) +
  geom_bar(color = "purple", fill = "blue", aes(y = ((..count..) / sum(..count..)))) +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Bar Graph of Feeling Depressed the Last 7 Days, Wave 5 Add Health") +
  labs(y = "Percent", x = "Occurrence") +
  theme(axis.text.x = element_text(angle = -25))

# Frequency table of H5ID21 with the cumulative percent column switched on.
frequency(
  wave5addhealth$H5ID21,
  title = "Number of Times Eating at a Fast Food Restaurant",
  cumulative.percent = TRUE
)

#QUANTITATIVE VARIABLE INTERPRETATION:

# Histogram of H5ID21 with one bin per unit; the y axis is each bin's count
# as a percent of all counted observations.
ggplot(wave5addhealth, aes(x = H5ID21)) +
  geom_histogram(
    aes(y = ..count.. / sum(..count..) * 100),
    binwidth = 1,
    fill = "green",
    color = "blue"
  ) +
  labs(
    title = "Figure 1.Number of Times Eating at a Fast Food Restaurant",
    x = "Number of Times Eating Fast Food",
    y = "Percent"
  )

# Note: for a quantitative variable, report the mode, median, mean, standard deviation
# and range. summary() supplies most of these; MODE() and sd() supply the other two.
MODE(wave5addhealth$H5ID21)
# summary() has no na.rm argument (it always reports the NA count separately), so none
# is passed; sd() does need na.rm = TRUE or it returns NA when values are missing.
summary(wave5addhealth$H5ID21)
sd(wave5addhealth$H5ID21, na.rm = TRUE)

#QUANTITATIVE VARIABLE INTERPRETATION: The minimum and the 1st quartile were both 0.00. The median was 1.00, the mean was 1.89, the 3rd quartile was 3.00, and the maximum was 15.00.

# Bar graph of the MEAN of H5ID21 within each H5SS0B category (rows with a missing
# H5SS0B are dropped first). The bar heights are averages, not percentages.
# `fun = mean` is used because ggplot2 deprecated the older `fun.y` argument of
# stat_summary() in version 3.3.0.
ggplot(data = subset(wave5addhealth, !is.na(H5SS0B))) +
  stat_summary(aes(x = H5SS0B, y = H5ID21), fun = mean, geom = "bar") +
  ylab("Number of Times Eating Fast Food") +
  xlab("Average Time of Feeling Depressed the Last 7 Days") +
  ggtitle("Bar Graph of Average Time of Feeling Depressed the Last 7 Days/Number of Times Eating Fast Food")

#Interpretation: The bar graph shows the average number of times eating fast food for each depression category, so the bar heights are means, not percentages. The highest average, about 2.4 times, appears among people who reported feeling depressed a lot of the time during the last 7 days. This suggests that people experiencing depression throughout the week may be relying on fast food to make them feel better in some way, although the graph alone cannot show why. People who were never or rarely depressed during the last 7 days still ate fast food about 1.56 times on average.

####ANOVA for a quantitative dependent variable (DV) and categorical independent variable (IV)
#run an analysis of variance (ANOVA); only change variable names and put in this order DV ~ IV
#IMPORTANT: USE THE COPY OF YOUR CATEGORICAL VARIABLE THAT YOU USED FOR THE BAR GRAPH (variable name ends in cat)
# One-way ANOVA: does the mean of H5ID21 (DV) differ across the H5PE4 categories (IV)?
# The formula uses bare column names because `data = wave5addhealth` already tells
# aov() where to find them; writing wave5addhealth$... inside the formula bypasses
# `data` and puts the long prefixed names into the summary and Tukey output.
data.aov1 <- aov(H5ID21 ~ H5PE4, data = wave5addhealth)
summary(data.aov1)
# Tukey HSD pairwise comparisons between the IV categories. Only interpret these if the
# F test above is statistically significant AND the IV has more than 2 categories.
TukeyHSD(data.aov1)

##INTERPRETATION OF ANOVA: There is not evidence that statistically the 2 variables are significantly associated to one another.

####ANOVA for a quantitative dependent variable (DV) and categorical independent variable (IV)
#run an analysis of variance (ANOVA); only change variable names and put in this order DV ~ IV
#IMPORTANT: USE THE COPY OF YOUR CATEGORICAL VARIABLE THAT YOU USED FOR THE BAR GRAPH (variable name ends in cat)
# One-way ANOVA: does the mean of H5ID21 (DV) differ across the H5SS0B categories (IV)?
# The formula uses bare column names because `data = wave5addhealth` already tells
# aov() where to find them; writing wave5addhealth$... inside the formula bypasses
# `data` and puts the long prefixed names into the summary and Tukey output.
data.aov2 <- aov(H5ID21 ~ H5SS0B, data = wave5addhealth)
summary(data.aov2)
# Tukey HSD pairwise comparisons between the IV categories. Only interpret these if the
# F test above is statistically significant AND the IV has more than 2 categories.
TukeyHSD(data.aov2)

##INTERPRETATION OF ANOVA: There is not evidence that statistically the 2 variables are significantly associated to one another.

####CHI SQUARE for categorical independent variable and categorical dependent variable
#only change the variable names
# Chi-square test of association between H5PE4 and H5TO6.
# The outer parentheses store the result and display it in one step.
(data.chisq1 <- chisq.test(wave5addhealth$H5PE4, wave5addhealth$H5TO6))

##INTERPRETATION OF CHI SQUARE: There is not evidence that statistically the 2 variables are significantly associated.

####CHI SQUARE for categorical independent variable and categorical dependent variable
#only change the variable names
# Chi-square test of association between H5SS0B and H5TO6.
# The outer parentheses store the result and display it in one step.
(data.chisq2 <- chisq.test(wave5addhealth$H5SS0B, wave5addhealth$H5TO6))

##INTERPRETATION OF CHI SQUARE: There is not evidence that statistically the 2 variables are significantly associated.
	# NOTE: a second, tab-indented, line-for-line copy of the whole analysis above used
	# to follow here. It re-attached the same packages, reloaded wave5addhealth and
	# repeated every frequency table, graph, crosstab, ANOVA and chi-square test,
	# ending with the same objects (wave5addhealth, data.aov1, data.aov2, data.chisq1,
	# data.chisq2) as the first pass. It only doubled the run time and the printed
	# output, so it was removed; the analysis above is the single source of truth.