# naomispence/Comparing CIs and distributions

## Comparing CIs and distributions
##Comparing CIs and distributions

#Load the libraries and data first
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
# Bring the gss123 dataset into the workspace.
data("gss123")
# A large scipen penalty makes R print numbers in fixed notation instead of
# scientific notation (e.g. 100000 rather than 1e+05).
options(scipen = 999)


#In this lab we will learn how to explore the relationship between a
#dichotomous independent variable and an interval-ratio dependent variable.
#We do this by getting detailed information on each of the two categories
#of the dichotomous variable. The example we will work through looks at the
#relationship between sex and income in constant dollars.

#WHAT WOULD THE RESEARCH QUESTION BE?

# Before dichotomizing, look at the categories of sex and at their underlying
# numeric codes so we know which category is stored as 1.
frequency(gss123$sex)
frequency(as.numeric(gss123$sex))

# Formally dichotomize the variable. The comparison below is TRUE for cases
# whose sex code is 1 and FALSE for all other codes; R treats TRUE/FALSE as
# 1/0 in arithmetic, which is what statistical analyses need.
# By convention we name the new dichotomous variable after whatever category
# is coded as 1.
gss123$male <- as.numeric(gss123$sex) == 1
frequency(gss123$male)
# The mean of a TRUE/FALSE variable is the proportion of TRUE cases
# (it will be NA if any value is missing).
mean(gss123$male)

# Next, we want to create two separate mini datasets, one for each of our
# dichotomous categories.
# The line below "filters out" (pulls out) the males and stores them in a
# temporary mini dataset named male. gss123$male is already TRUE/FALSE, so we
# filter on it directly instead of comparing it to the string "TRUE";
# filter() keeps rows where the condition is TRUE and drops FALSE and NA rows.
male <- dplyr::filter(gss123, male)

# Notice that the lines below use the temporary dataset called male instead of
# gss123. Object names in R are case sensitive (male and Male are different).
# We are getting summary statistics and a histogram for income for MALES only.

summary(male$conrinc, na.rm = TRUE)
sd(male$conrinc, na.rm = TRUE)

# after_stat(count) is the number of cases in each bin; dividing by the total
# count and multiplying by 100 puts percent on the y axis. after_stat()
# replaces the ..count.. notation, which is deprecated in current ggplot2.
ggplot(data = male, aes(x = conrinc)) +
  geom_histogram(
    aes(y = after_stat(count / sum(count) * 100)),
    color = "blue", fill = "pink", binwidth = 10000
  ) +
  ggtitle("Distribution of American Males by Income") +
  labs(y = "Percent", x = "Income")


# Now we pull the females into their own mini dataset and get summary
# statistics and a histogram for income for FEMALES only.
# gss123$male is TRUE/FALSE, so !male selects the rows where it is FALSE
# (no string comparison needed); rows where it is NA are dropped.

female <- dplyr::filter(gss123, !male)
summary(female$conrinc, na.rm = TRUE)
sd(female$conrinc, na.rm = TRUE)

# after_stat(count) is the number of cases in each bin; dividing by the total
# count and multiplying by 100 puts percent on the y axis. after_stat()
# replaces the ..count.. notation, which is deprecated in current ggplot2.
ggplot(data = female, aes(x = conrinc)) +
  geom_histogram(
    aes(y = after_stat(count / sum(count) * 100)),
    color = "blue", fill = "pink", binwidth = 10000
  ) +
  ggtitle("Distribution of American Females by Income") +
  labs(y = "Percent", x = "Income")

# Comparing means in a bar graph: one bar per category of sex, with the bar
# height equal to the mean of conrinc within that category.
ggplot(gss123, aes(x = sex, y = conrinc)) +
  stat_summary(geom = "bar", fun = mean) +
  labs(x = "Sex", y = "Mean Income in Constant Dollars") +
  theme(axis.text.x = element_text(angle = -45))

#Compare the results you got for males and for females. Which has a bigger
#mean? Which has more variability? Does it seem like males and females
#differ in their income?

# So far we have only compared descriptive statistics. To compare
# inferential statistics, we look at the confidence intervals for the mean
# income of males and of females.
# Comparing the CIs - 95%
ciMean(male$conrinc, conf = 0.95, na.rm = TRUE)
ciMean(female$conrinc, conf = 0.95, na.rm = TRUE)
# Comparing the CIs - 99%
ciMean(male$conrinc, conf = 0.99, na.rm = TRUE)
ciMean(female$conrinc, conf = 0.99, na.rm = TRUE)

#Interpret the confidence intervals and come to a conclusion about whether
#sex and income are related.
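	## Comparing CIs and distributions
	# NOTE(review): everything from this line to the end of the file is a
	# tab-indented repeat of the script above; consider removing one copy.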

	#Load the libraries and data first
	library(ggplot2)
	library(dplyr)
	library(lsr)
	library(descr)
	library(Hmisc)
	library('lehmansociology')
	# Bring the gss123 dataset into the workspace.
	data("gss123")
	# A large scipen penalty makes R print numbers in fixed notation instead of
	# scientific notation (e.g. 100000 rather than 1e+05).
	options(scipen = 999)


	#In this lab we will learn how to explore the relationship between a
	#dichotomous independent variable and an interval-ratio dependent variable.
	#We do this by getting detailed information on each of the two categories
	#of the dichotomous variable. The example we will work through looks at the
	#relationship between sex and income in constant dollars.

	#WHAT WOULD THE RESEARCH QUESTION BE?

	# Before dichotomizing, look at the categories of sex and at their underlying
	# numeric codes so we know which category is stored as 1.
	frequency(gss123$sex)
	frequency(as.numeric(gss123$sex))

	# Formally dichotomize the variable. The comparison below is TRUE for cases
	# whose sex code is 1 and FALSE for all other codes; R treats TRUE/FALSE as
	# 1/0 in arithmetic, which is what statistical analyses need.
	# By convention we name the new dichotomous variable after whatever category
	# is coded as 1.
	gss123$male <- as.numeric(gss123$sex) == 1
	frequency(gss123$male)
	# The mean of a TRUE/FALSE variable is the proportion of TRUE cases
	# (it will be NA if any value is missing).
	mean(gss123$male)

	# Next, we want to create two separate mini datasets, one for each of our
	# dichotomous categories.
	# The line below "filters out" (pulls out) the males and stores them in a
	# temporary mini dataset named male. gss123$male is already TRUE/FALSE, so we
	# filter on it directly instead of comparing it to the string "TRUE";
	# filter() keeps rows where the condition is TRUE and drops FALSE and NA rows.
	male <- dplyr::filter(gss123, male)

	# Notice that the lines below use the temporary dataset called male instead of
	# gss123. Object names in R are case sensitive (male and Male are different).
	# We are getting summary statistics and a histogram for income for MALES only.

	summary(male$conrinc, na.rm = TRUE)
	sd(male$conrinc, na.rm = TRUE)

	# after_stat(count) is the number of cases in each bin; dividing by the total
	# count and multiplying by 100 puts percent on the y axis. after_stat()
	# replaces the ..count.. notation, which is deprecated in current ggplot2.
	ggplot(data = male, aes(x = conrinc)) +
	  geom_histogram(
	    aes(y = after_stat(count / sum(count) * 100)),
	    color = "blue", fill = "pink", binwidth = 10000
	  ) +
	  ggtitle("Distribution of American Males by Income") +
	  labs(y = "Percent", x = "Income")


	# Now we pull the females into their own mini dataset and get summary
	# statistics and a histogram for income for FEMALES only.
	# gss123$male is TRUE/FALSE, so !male selects the rows where it is FALSE
	# (no string comparison needed); rows where it is NA are dropped.

	female <- dplyr::filter(gss123, !male)
	summary(female$conrinc, na.rm = TRUE)
	sd(female$conrinc, na.rm = TRUE)

	# after_stat(count) is the number of cases in each bin; dividing by the total
	# count and multiplying by 100 puts percent on the y axis. after_stat()
	# replaces the ..count.. notation, which is deprecated in current ggplot2.
	ggplot(data = female, aes(x = conrinc)) +
	  geom_histogram(
	    aes(y = after_stat(count / sum(count) * 100)),
	    color = "blue", fill = "pink", binwidth = 10000
	  ) +
	  ggtitle("Distribution of American Females by Income") +
	  labs(y = "Percent", x = "Income")

	# Comparing means in a bar graph: one bar per category of sex, with the bar
	# height equal to the mean of conrinc within that category.
	ggplot(gss123, aes(x = sex, y = conrinc)) +
	  stat_summary(geom = "bar", fun = mean) +
	  labs(x = "Sex", y = "Mean Income in Constant Dollars") +
	  theme(axis.text.x = element_text(angle = -45))

	#Compare the results you got for males and for females. Which has a bigger
	#mean? Which has more variability? Does it seem like males and females
	#differ in their income?

	# So far we have only compared descriptive statistics. To compare
	# inferential statistics, we look at the confidence intervals for the mean
	# income of males and of females.
	# Comparing the CIs - 95%
	ciMean(male$conrinc, conf = 0.95, na.rm = TRUE)
	ciMean(female$conrinc, conf = 0.95, na.rm = TRUE)
	# Comparing the CIs - 99%
	ciMean(male$conrinc, conf = 0.99, na.rm = TRUE)
	ciMean(female$conrinc, conf = 0.99, na.rm = TRUE)

	#Interpret the confidence intervals and come to a conclusion about whether
	#sex and income are related.