#naomispence/LR5: Making and Comparing Confidence Intervals

## LR5: Making and Comparing Confidence Intervals
#Lab Report 5: Making and Comparing Confidence Intervals
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
#Load the gss123 dataset into the workspace
#(presumably it ships with the lehmansociology package loaded above - confirm)
data(gss123)
#A large scipen value makes R prefer fixed notation over scientific
#notation when it prints numbers
options(scipen = 999)

#For this LR we will be looking at the
#relationship between a dichotomous nominal independent variable
#and an interval ratio dependent variable
#First, we have to look at our dichotomous variable and see
#what the two categories are.

#WHICH DICHOTOMOUS VARIABLE ARE YOU USING?

#REPLACE SEX WITH THE NAME OF YOUR DICHOTOMOUS VARIABLE
#Frequency table of the dichotomous variable using its category labels
frequency(gss123$sex)
#Same table using the numeric codes, so you can see which category is 1
frequency(as.numeric(gss123$sex))
#The code below shows you how to formally dichotomize a variable.
#The new variable is TRUE for cases whose numeric code is 1 and FALSE
#for every other code. R treats TRUE as 1 and FALSE as 0 in calculations,
#which is important for doing statistical analyses.
#Note that we name the new dichotomous variable whatever the
#category is that we have coded as 1

gss123$male <- as.numeric(gss123$sex) == 1
frequency(gss123$male)
#The mean of a TRUE/FALSE variable is the proportion of TRUE cases.
#na.rm = TRUE keeps any missing values from turning the result into NA.
mean(gss123$male, na.rm = TRUE)

#Look at the two categories in your frequency table
#and decide what you will call each of your mini
#datasets. Choose names that make sense.

#Below, REPLACE men and women WITH THE NAMES OF YOUR TWO
#MINI DATASETS. REPLACE male WITH THE NAME OF THE
#TRUE/FALSE VARIABLE YOU CREATED ABOVE.
#THE FIRST MINI DATASET KEEPS THE CASES WHERE THAT VARIABLE
#IS TRUE AND THE SECOND KEEPS THE CASES WHERE IT IS FALSE
#(THE ! MEANS "NOT"). CASES WITH A MISSING VALUE ARE
#LEFT OUT OF BOTH MINI DATASETS.

men <- dplyr::filter(gss123, male)
women <- dplyr::filter(gss123, !male)

#Follow the code below for the rest of your
#LR but replace men and
#women with the names of your two mini datasets
#and replace conrinc with the name of your
#interval ratio variable.
#Make sure to change titles and labels, too!
#Fill in your answers below

#WHICH INTERVAL-RATIO VARIABLE ARE YOU USING?

#WHAT IS YOUR RESEARCH QUESTION?

#Summary statistics and a histogram for your dependent variable
#for your first mini dataset

#CHANGE THIS CODE BY REPLACING men WITH THE NAME
#OF ONE OF YOUR TWO MINI DATASETS AND REPLACING
#conrinc WITH YOUR DEPENDENT VARIABLE
#AND BY EDITING THE TITLES AND LABELS FOR GRAPHS
#REMEMBER TO THINK ABOUT BINWIDTH AND WHETHER YOU WANT
#TO CHANGE IT TO SOMETHING BIGGER

#Summary statistics, then the standard deviation with
#missing values removed
summary(men$conrinc, na.rm = TRUE)
sd(men$conrinc, na.rm = TRUE)

#Histogram with the y axis rescaled from counts to percent of cases.
#after_stat(count) replaces the deprecated ..count.. notation.
ggplot(data = men, aes(x = conrinc)) +
  geom_histogram(
    aes(y = after_stat(count / sum(count) * 100)),
    color = "blue", fill = "pink", binwidth = 1
  ) +
  ggtitle("Distribution of Males' Income, GSS") +
  labs(y = "Percent", x = "Income")

#INTERPRET THE SUMMARY STATISTICS AND HISTOGRAM

#Now we are getting summary statistics and a
#histogram for your second mini dataset.

#CHANGE THIS CODE BY REPLACING women WITH THE NAME
#OF YOUR OTHER MINI DATASET AND REPLACING
#conrinc WITH YOUR DEPENDENT VARIABLE
#AND BY EDITING THE TITLES AND LABELS FOR GRAPHS
#REMEMBER TO THINK ABOUT BINWIDTH AND WHETHER YOU WANT
#TO CHANGE IT TO SOMETHING BIGGER

#Summary statistics, then the standard deviation with
#missing values removed
summary(women$conrinc, na.rm = TRUE)
sd(women$conrinc, na.rm = TRUE)

#Histogram with the y axis rescaled from counts to percent of cases.
#after_stat(count) replaces the deprecated ..count.. notation.
ggplot(data = women, aes(x = conrinc)) +
  geom_histogram(
    aes(y = after_stat(count / sum(count) * 100)),
    color = "blue", fill = "pink", binwidth = 1
  ) +
  ggtitle("Distribution of Females' Income, GSS") +
  labs(y = "Percent", x = "Income")

#INTERPRET THE SUMMARY STATISTICS AND HISTOGRAM

#Comparing means in a bar graph
#CHANGE THE INDEPENDENT VARIABLE (replace sex)
#CHANGE THE DEPENDENT VARIABLE (replace conrinc)
#CHANGE THE LABELS
#Bar graph showing the mean of the dependent variable (y) within each
#category of the independent variable (x).
#na.rm = TRUE drops missing values silently instead of with a warning.
ggplot(data = gss123) +
  stat_summary(
    aes(x = sex, y = conrinc),
    fun = mean, geom = "bar", na.rm = TRUE
  ) +
  xlab("Sex") +
  ylab("Mean Income") +
  theme(axis.text.x = element_text(angle = -45))

#ANSWER THIS: Compare the results you got for your two groups.
#Which has a bigger mean? Which has more variability?
#Based on the bar graph of means,
#does it seem like the two categories of your
#dichotomous variable
#differ in the dependent variable?

#USING INFERENTIAL STATISTICS TO COMPARE GROUPS
#Let's compare the confidence intervals for the mean of our
#dependent variable for the two categories of our independent variable.
#REPLACE men AND women WITH THE NAMES OF YOUR MINI DATASETS
#REPLACE conrinc WITH THE NAME OF YOUR DEPENDENT VARIABLE

#Comparing the CIs- 95%
ciMean(men$conrinc, conf = 0.95, na.rm = TRUE)
ciMean(women$conrinc, conf = 0.95, na.rm = TRUE)

#The same two groups again, this time at the 99% confidence level
ciMean(men$conrinc, conf = 0.99, na.rm = TRUE)
ciMean(women$conrinc, conf = 0.99, na.rm = TRUE)

#INTERPRET the confidence intervals and come to a conclusion about whether
#your independent and dependent variables are related.
	#Lab Report 5: Making and Comparing Confidence Intervals
	library(ggplot2)
	library(dplyr)
	library(lsr)
	library(descr)
	library(Hmisc)
	library('lehmansociology')
	#Load the gss123 dataset into the workspace
	#(presumably it ships with the lehmansociology package loaded above - confirm)
	data(gss123)
	#A large scipen value makes R prefer fixed notation over scientific
	#notation when it prints numbers
	options(scipen = 999)

	#For this LR we will be looking at the
	#relationship between a dichotomous nominal independent variable
	#and an interval ratio dependent variable
	#First, we have to look at our dichotomous variable and see
	#what the two categories are.

	#WHICH DICHOTOMOUS VARIABLE ARE YOU USING?

	#REPLACE SEX WITH THE NAME OF YOUR DICHOTOMOUS VARIABLE
	#Frequency table of the dichotomous variable using its category labels
	frequency(gss123$sex)
	#Same table using the numeric codes, so you can see which category is 1
	frequency(as.numeric(gss123$sex))
	#The code below shows you how to formally dichotomize a variable.
	#The new variable is TRUE for cases whose numeric code is 1 and FALSE
	#for every other code. R treats TRUE as 1 and FALSE as 0 in calculations,
	#which is important for doing statistical analyses.
	#Note that we name the new dichotomous variable whatever the
	#category is that we have coded as 1

	gss123$male <- as.numeric(gss123$sex) == 1
	frequency(gss123$male)
	#The mean of a TRUE/FALSE variable is the proportion of TRUE cases.
	#na.rm = TRUE keeps any missing values from turning the result into NA.
	mean(gss123$male, na.rm = TRUE)

	#Look at the two categories in your frequency table
	#and decide what you will call each of your mini
	#datasets. Choose names that make sense.

	#Below, REPLACE men and women WITH THE NAMES OF YOUR TWO
	#MINI DATASETS. REPLACE male WITH THE NAME OF THE
	#TRUE/FALSE VARIABLE YOU CREATED ABOVE.
	#THE FIRST MINI DATASET KEEPS THE CASES WHERE THAT VARIABLE
	#IS TRUE AND THE SECOND KEEPS THE CASES WHERE IT IS FALSE
	#(THE ! MEANS "NOT"). CASES WITH A MISSING VALUE ARE
	#LEFT OUT OF BOTH MINI DATASETS.

	men <- dplyr::filter(gss123, male)
	women <- dplyr::filter(gss123, !male)

	#Follow the code below for the rest of your
	#LR but replace men and
	#women with the names of your two mini datasets
	#and replace conrinc with the name of your
	#interval ratio variable.
	#Make sure to change titles and labels, too!
	#Fill in your answers below

	#WHICH INTERVAL-RATIO VARIABLE ARE YOU USING?

	#WHAT IS YOUR RESEARCH QUESTION?

	#Summary statistics and a histogram for your dependent variable
	#for your first mini dataset

	#CHANGE THIS CODE BY REPLACING men WITH THE NAME
	#OF ONE OF YOUR TWO MINI DATASETS AND REPLACING
	#conrinc WITH YOUR DEPENDENT VARIABLE
	#AND BY EDITING THE TITLES AND LABELS FOR GRAPHS
	#REMEMBER TO THINK ABOUT BINWIDTH AND WHETHER YOU WANT
	#TO CHANGE IT TO SOMETHING BIGGER

	#Summary statistics, then the standard deviation with
	#missing values removed
	summary(men$conrinc, na.rm = TRUE)
	sd(men$conrinc, na.rm = TRUE)

	#Histogram with the y axis rescaled from counts to percent of cases.
	#after_stat(count) replaces the deprecated ..count.. notation.
	ggplot(data = men, aes(x = conrinc)) +
	  geom_histogram(
	    aes(y = after_stat(count / sum(count) * 100)),
	    color = "blue", fill = "pink", binwidth = 1
	  ) +
	  ggtitle("Distribution of Males' Income, GSS") +
	  labs(y = "Percent", x = "Income")

	#INTERPRET THE SUMMARY STATISTICS AND HISTOGRAM

	#Now we are getting summary statistics and a
	#histogram for your second mini dataset.

	#CHANGE THIS CODE BY REPLACING women WITH THE NAME
	#OF YOUR OTHER MINI DATASET AND REPLACING
	#conrinc WITH YOUR DEPENDENT VARIABLE
	#AND BY EDITING THE TITLES AND LABELS FOR GRAPHS
	#REMEMBER TO THINK ABOUT BINWIDTH AND WHETHER YOU WANT
	#TO CHANGE IT TO SOMETHING BIGGER

	#Summary statistics, then the standard deviation with
	#missing values removed
	summary(women$conrinc, na.rm = TRUE)
	sd(women$conrinc, na.rm = TRUE)

	#Histogram with the y axis rescaled from counts to percent of cases.
	#after_stat(count) replaces the deprecated ..count.. notation.
	ggplot(data = women, aes(x = conrinc)) +
	  geom_histogram(
	    aes(y = after_stat(count / sum(count) * 100)),
	    color = "blue", fill = "pink", binwidth = 1
	  ) +
	  ggtitle("Distribution of Females' Income, GSS") +
	  labs(y = "Percent", x = "Income")

	#INTERPRET THE SUMMARY STATISTICS AND HISTOGRAM

	#Comparing means in a bar graph
	#CHANGE THE INDEPENDENT VARIABLE (replace sex)
	#CHANGE THE DEPENDENT VARIABLE (replace conrinc)
	#CHANGE THE LABELS
	#Bar graph showing the mean of the dependent variable (y) within each
	#category of the independent variable (x).
	#na.rm = TRUE drops missing values silently instead of with a warning.
	ggplot(data = gss123) +
	  stat_summary(
	    aes(x = sex, y = conrinc),
	    fun = mean, geom = "bar", na.rm = TRUE
	  ) +
	  xlab("Sex") +
	  ylab("Mean Income") +
	  theme(axis.text.x = element_text(angle = -45))

	#ANSWER THIS: Compare the results you got for your two groups.
	#Which has a bigger mean? Which has more variability?
	#Based on the bar graph of means,
	#does it seem like the two categories of your
	#dichotomous variable
	#differ in the dependent variable?

	#USING INFERENTIAL STATISTICS TO COMPARE GROUPS
	#Let's compare the confidence intervals for the mean of our
	#dependent variable for the two categories of our independent variable.
	#REPLACE men AND women WITH THE NAMES OF YOUR MINI DATASETS
	#REPLACE conrinc WITH THE NAME OF YOUR DEPENDENT VARIABLE

	#Comparing the CIs- 95%
	ciMean(men$conrinc, conf = 0.95, na.rm = TRUE)
	ciMean(women$conrinc, conf = 0.95, na.rm = TRUE)

	#The same two groups again, this time at the 99% confidence level
	ciMean(men$conrinc, conf = 0.99, na.rm = TRUE)
	ciMean(women$conrinc, conf = 0.99, na.rm = TRUE)

	#INTERPRET the confidence intervals and come to a conclusion about whether
	#your independent and dependent variables are related.