# naomispence

## graph mode cat var
# Recode the living-arrangement variable H5HR2 into a labelled factor.
# Any code that is not listed in `levels` (note that 4 is not listed) becomes NA.
wave5addhealth$H5HR2cat <- factor(
  wave5addhealth$H5HR2,
  levels = c(1, 2, 3, 5, 6),
  labels = c(
    "Own Place", "Parents' Home", "Another Person's Home", "Homeless", "Other"
  )
)

# Frequency table and mode of the original numeric codes.
# NOTE(review): frequency(..., title = ) and MODE() are not base R; presumably
# they come from the course packages loaded elsewhere -- confirm.
frequency(
  wave5addhealth$H5HR2,
  title = "Frequency Distribution of Living Arrangements, Wave 5 Add Health"
)
MODE(wave5addhealth$H5HR2)

# Bar graph of the share of respondents in each category, excluding rows whose
# recoded value is NA. after_stat() replaces the deprecated `..count..`
# notation, and the chain no longer ends with a dangling `+`, which previously
# made the plot expression run on into whatever statement followed it.
ggplot(data = subset(wave5addhealth, !is.na(H5HR2cat)), aes(x = H5HR2cat)) +
  geom_bar(
    aes(y = after_stat(count / sum(count))),
    color = "blue", fill = "yellow"
  ) +
  scale_y_continuous(labels = scales::percent)

## CSC_factoranalysis
# Keep only the rows that are complete on the indicators selected below.
std_subdata <- na.omit(
  vac_data[, c(
    "SP.DYN.IMRT.IN", "GC.TAX.TOTL.GD.ZS", "MS.MIL.TOTL.TF.ZS",
    "IQ.SCI.OVRL", "icrg_qog", "wbgi_gee", "polity2"
  )]
)

# Add a standardized copy (prefix "std_") of every indicator. With its default
# arguments scale() centers on the mean and divides by the standard deviation;
# it returns a one-column matrix, which is stored as-is in the data frame.
std_subdata[["std_SP.DYN.IMRT.IN"]] <- scale(std_subdata[["SP.DYN.IMRT.IN"]])
std_subdata[["std_GC.TAX.TOTL.GD.ZS"]] <-
  scale(std_subdata[["GC.TAX.TOTL.GD.ZS"]])
std_subdata[["std_MS.MIL.TOTL.TF.ZS"]] <-
  scale(std_subdata[["MS.MIL.TOTL.TF.ZS"]])
std_subdata[["std_IQ.SCI.OVRL"]] <- scale(std_subdata[["IQ.SCI.OVRL"]])
std_subdata[["std_icrg_qog"]] <- scale(std_subdata[["icrg_qog"]])
std_subdata[["std_wbgi_gee"]] <- scale(std_subdata[["wbgi_gee"]])
std_subdata[["std_polity2"]] <- scale(std_subdata[["polity2"]])

## getting_started_w_graphs_and_Descriptives
## The R code in this section must be placed below the lines where you created labels for your categorical variable values.

#### DESCRIPTIVE STATISTICS CODE ####
# Note: for a categorical variable only the mode is appropriate (plus the median if the variable is ordinal).
# You can also read the mode off the frequency table: it is the variable value with the largest percent.
# NOTE(review): MODE() is not base R; presumably it comes from one of the course packages -- confirm.
MODE(wave5addhealth$H5HR2)

# A bar graph is appropriate for a categorical variable.

# Now get a bar graph (no plotting code follows this comment here).

## fall23soc345_Lab4
# Start by loading libraries and setting options.
# The packages are attached first so that their functions are available to the rest of the lab script.
library(aws.s3)
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
# A large scipen penalty makes R print numbers in fixed rather than scientific notation.
options(scipen = 999)

## fall23soc345_Lab3
# Start by loading libraries and setting options.
# The packages are attached first so that their functions are available to the rest of the lab script.
library(aws.s3)
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
# A large scipen penalty makes R print numbers in fixed rather than scientific notation.
options(scipen = 999)

## fall23soc345_Lab2
# Start by loading libraries and setting options.
# The packages are attached first so that their functions are available to the rest of the lab script.
library(aws.s3)
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
# A large scipen penalty makes R print numbers in fixed rather than scientific notation.
options(scipen = 999)

## fall23soc345_Lab1
# Start by loading libraries and setting options.
# The packages are attached first so that their functions are available to the rest of the lab script.
library(aws.s3)
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
# A large scipen penalty makes R print numbers in fixed rather than scientific notation.
options(scipen = 999)

## Final project example code spring 2023
# Load the libraries and data first.
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
# Load the gss123 dataset into the workspace.
# NOTE(review): presumably shipped with one of the packages attached above -- confirm.
data(gss123)
# A large scipen penalty makes R print numbers in fixed rather than scientific notation.
options(scipen = 999)

## bivariate analysis spr23
# Make sure you have this near the top of your script so your p-values aren't
# printed in scientific notation.
options(scipen = 999)


# Run a one-way analysis of variance (ANOVA); the formula is typed DV ~ IV.
# The variables are named bare and looked up in `data`, instead of repeating
# `gss123$` inside the formula, so the term is labelled `race` in the output.
data.aov1 <- aov(physhlth ~ race, data = gss123)
summary(data.aov1)
# Mean of the dependent variable within each category of the independent
# variable, ignoring missing values. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
by(gss123$physhlth, gss123$race, mean, na.rm = TRUE)
# Remove the hashtag on the line below to run Tukey ONLY if the F test is
# statistically significant.
#TukeyHSD(data.aov1)

## ANOVA sp23
# Example: natheal independent, physhlth dependent

# Compare group means in a bar graph.
# Rows with a missing natheal are dropped first; stat_summary() then reduces
# physhlth to its mean within each natheal category and draws it as a bar.
ggplot(
  data = subset(gss123, !is.na(natheal)),
  mapping = aes(x = natheal, y = physhlth)
) +
  stat_summary(fun = mean, geom = "bar") +
  labs(
    x = "Opinions on Health Spending",
    y = "Mean Number of Days of Poor Physical Health"
  )

# Does it look like there are differences in the dependent variable based on
# the categories of the independent variable?
	# Recode the living-arrangement variable H5HR2 into a labelled factor.
	# Any code that is not listed in `levels` (note that 4 is not listed) becomes NA.
	wave5addhealth$H5HR2cat <- factor(
	  wave5addhealth$H5HR2,
	  levels = c(1, 2, 3, 5, 6),
	  labels = c(
	    "Own Place", "Parents' Home", "Another Person's Home", "Homeless", "Other"
	  )
	)

	# Frequency table and mode of the original numeric codes.
	# NOTE(review): frequency(..., title = ) and MODE() are not base R; presumably
	# they come from the course packages loaded elsewhere -- confirm.
	frequency(
	  wave5addhealth$H5HR2,
	  title = "Frequency Distribution of Living Arrangements, Wave 5 Add Health"
	)
	MODE(wave5addhealth$H5HR2)

	# Bar graph of the share of respondents in each category, excluding rows whose
	# recoded value is NA. after_stat() replaces the deprecated `..count..`
	# notation, and the chain no longer ends with a dangling `+`, which previously
	# made the plot expression run on into the statement that follows it.
	ggplot(data = subset(wave5addhealth, !is.na(H5HR2cat)), aes(x = H5HR2cat)) +
	  geom_bar(
	    aes(y = after_stat(count / sum(count))),
	    color = "blue", fill = "yellow"
	  ) +
	  scale_y_continuous(labels = scales::percent)
	std_subdata <- na.omit(
	  vac_data[, c(
	    "SP.DYN.IMRT.IN", "GC.TAX.TOTL.GD.ZS", "MS.MIL.TOTL.TF.ZS",
	    "IQ.SCI.OVRL", "icrg_qog", "wbgi_gee", "polity2"
	  )]
	)

	# Add a standardized copy (prefix "std_") of every indicator. With its default
	# arguments scale() centers on the mean and divides by the standard deviation;
	# it returns a one-column matrix, which is stored as-is in the data frame.
	std_subdata[["std_SP.DYN.IMRT.IN"]] <- scale(std_subdata[["SP.DYN.IMRT.IN"]])
	std_subdata[["std_GC.TAX.TOTL.GD.ZS"]] <-
	  scale(std_subdata[["GC.TAX.TOTL.GD.ZS"]])
	std_subdata[["std_MS.MIL.TOTL.TF.ZS"]] <-
	  scale(std_subdata[["MS.MIL.TOTL.TF.ZS"]])
	std_subdata[["std_IQ.SCI.OVRL"]] <- scale(std_subdata[["IQ.SCI.OVRL"]])
	std_subdata[["std_icrg_qog"]] <- scale(std_subdata[["icrg_qog"]])
	std_subdata[["std_wbgi_gee"]] <- scale(std_subdata[["wbgi_gee"]])
	std_subdata[["std_polity2"]] <- scale(std_subdata[["polity2"]])
	## The R code in this section must be placed below the lines where you created labels for your categorical variable values.

	#### DESCRIPTIVE STATISTICS CODE ####
	# Note: for a categorical variable only the mode is appropriate (plus the median if the variable is ordinal).
	# You can also read the mode off the frequency table: it is the variable value with the largest percent.
	# NOTE(review): MODE() is not base R; presumably it comes from one of the course packages -- confirm.
	MODE(wave5addhealth$H5HR2)

	# A bar graph is appropriate for a categorical variable.

	# Now get a bar graph (no plotting code follows this comment here).
	# Start by loading libraries and setting options.
	# The packages are attached first so that their functions are available to the rest of the script.
	library(aws.s3)
	library(ggplot2)
	library(dplyr)
	library(lsr)
	library(descr)
	library(Hmisc)
	library('lehmansociology')
	# A large scipen penalty makes R print numbers in fixed rather than scientific notation.
	options(scipen = 999)
	# Load the libraries and data first.
	library(ggplot2)
	library(dplyr)
	library(lsr)
	library(descr)
	library(Hmisc)
	library('lehmansociology')
	# Load the gss123 dataset into the workspace.
	# NOTE(review): presumably shipped with one of the packages attached above -- confirm.
	data(gss123)
	# A large scipen penalty makes R print numbers in fixed rather than scientific notation.
	options(scipen = 999)
	# Make sure you have this near the top of your script so your p-values aren't
	# printed in scientific notation.
	options(scipen = 999)


	# Run a one-way analysis of variance (ANOVA); the formula is typed DV ~ IV.
	# The variables are named bare and looked up in `data`, instead of repeating
	# `gss123$` inside the formula, so the term is labelled `race` in the output.
	data.aov1 <- aov(physhlth ~ race, data = gss123)
	summary(data.aov1)
	# Mean of the dependent variable within each category of the independent
	# variable, ignoring missing values. TRUE is spelled out because `T` is an
	# ordinary variable that can be reassigned.
	by(gss123$physhlth, gss123$race, mean, na.rm = TRUE)
	# Remove the hashtag on the line below to run Tukey ONLY if the F test is
	# statistically significant.
	#TukeyHSD(data.aov1)
	# Example: natheal independent, physhlth dependent

	# Compare group means in a bar graph.
	# Rows with a missing natheal are dropped first; stat_summary() then reduces
	# physhlth to its mean within each natheal category and draws it as a bar.
	ggplot(
	  data = subset(gss123, !is.na(natheal)),
	  mapping = aes(x = natheal, y = physhlth)
	) +
	  stat_summary(fun = mean, geom = "bar") +
	  labs(
	    x = "Opinions on Health Spending",
	    y = "Mean Number of Days of Poor Physical Health"
	  )

	# Does it look like there are differences in the dependent variable based on
	# the categories of the independent variable?