Last active
October 3, 2023 16:18
-
-
Save naomispence/050ddfdebc43d3dfd00068e4a3d27c1a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# START BY LOADING LIBRARIES AND OPTIONS
# Attach order matters: a package attached later masks same-named functions
# from packages attached earlier, so keep the calls in this order.
library(aws.s3)           # s3load(), used to pull the data file from S3
library(ggplot2)          # ggplot(), geom_bar(), geom_histogram(), stat_summary()
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library(lehmansociology)  # crosstab(); presumably also frequency() and MODE()

# A large scipen makes R print numbers in fixed rather than scientific notation.
options(scipen = 999)
# LOAD DATA FROM THE LEHMAN SERVER
# AWS credentials are read from the environment instead of being written into
# this script, so the file can be shared without leaking secrets. Set
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before running, for example in
# ~/.Renviron (one NAME=value pair per line), then restart R.
# NOTE(review): any key pair that was ever embedded in a shared copy of this
# script should be treated as compromised and rotated.
local({
  required_vars <- c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
  unset_vars <- required_vars[!nzchar(Sys.getenv(required_vars))]
  if (length(unset_vars) > 0) {
    stop(
      "Missing AWS credentials: set ", paste(unset_vars, collapse = " and "),
      " (for example in ~/.Renviron) before running this script.",
      call. = FALSE
    )
  }
})

# The region is not a secret, so it is still set here.
Sys.setenv("AWS_DEFAULT_REGION" = "us-west-2")

# Download the .rdata file from the bucket and load its objects into this
# session. The rest of the script expects a data frame named wave5addhealth to
# exist after this call (presumably supplied by the file -- verify if it fails).
s3load("addhealthW5.rdata", bucket = "lehmansociologydata")
## UNIVARIATE WORK FOR A CATEGORICAL VARIABLE
## (FROM LAST WEEK, WHEN WE WORKED WITH CATEGORICAL VARIABLES FOR GRAPHS)

# Build a labelled factor from the numeric living-arrangement codes so the
# bar graph shows category names instead of numbers.
# NOTE(review): code 4 is not listed in `levels`, so any respondent coded 4
# becomes NA in H5HR2cat and is left out of the graph -- confirm this is
# intended.
wave5addhealth$H5HR2cat <- factor(
  wave5addhealth$H5HR2,
  levels = c(1, 2, 3, 5, 6),
  labels = c(
    "Own Place", "Parents' Home", "Another Person's Home", "Homeless", "Other"
  )
)

# Frequency table and mode use the original numeric codes.
frequency(
  wave5addhealth$H5HR2,
  title = "Frequency Distribution of Living Arrangements, Wave 5 Add Health"
)
MODE(wave5addhealth$H5HR2)

# Bar graph of the share of respondents in each category. Rows with a missing
# category are dropped first so the shares sum to 100%. after_stat() replaces
# the deprecated ..count.. notation for referring to the computed bar counts.
ggplot(data = subset(wave5addhealth, !is.na(H5HR2cat)), aes(x = H5HR2cat)) +
  geom_bar(
    aes(y = after_stat(count / sum(count))),
    color = "blue", fill = "yellow"
  ) +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Bar Graph of Living Arrangements, Wave 5 Add Health") +
  labs(y = "Percent", x = "Current Residence") +
  theme(axis.text.x = element_text(angle = -25))
## END UNIVARIATE WORK FOR A CATEGORICAL VARIABLE
# UNIVARIATE STATISTICS AND GRAPH FOR A QUANTITATIVE VARIABLE

# Frequency table with a cumulative percent column.
frequency(
  wave5addhealth$H5ID23,
  cumulative.percent = TRUE,
  title = "Distribution of Hours per Week Spent Watching TV, Movies, or Videos, Wave 5 Add Health"
)
# QUANTITATIVE VARIABLE INTERPRETATION: 3.8% of the sample report watching 0
# hours of TV, movies, and videos. About one-quarter (24.7%) watch 4 hours or
# fewer (cum. percent). Ten hours per week was the amount of time reported by
# 11% of the Add Health respondents.

# Histogram with one bar per hour (binwidth = 1). The y value converts each
# bar's count into a percent of all plotted cases. after_stat() replaces the
# deprecated ..count.. notation.
ggplot(data = wave5addhealth, aes(x = H5ID23)) +
  geom_histogram(
    aes(y = after_stat(count / sum(count) * 100)),
    binwidth = 1, color = "blue", fill = "green"
  ) +
  ggtitle("Figure 1. Distribution of Time Spent Watching TV, Movies, or Videos, Add Health Wave 5") +
  labs(y = "Percent", x = "Hours per Week")

# Note: for a quantitative variable, you should get mode, median, mean,
# standard deviation, and range. Most of these come from summary(), but you
# need MODE() and sd() for those two statistics.
MODE(wave5addhealth$H5ID23)
# NOTE(review): na.rm is kept from the original call; for a plain numeric
# column summary() is expected to ignore it and report missing values itself.
summary(wave5addhealth$H5ID23, na.rm = TRUE)
sd(wave5addhealth$H5ID23, na.rm = TRUE)
# QUANTITATIVE VARIABLE INTERPRETATION: The mode, or most commonly reported
# amount of time spent watching TV among Add Health respondents, is 10 hours
# per week. The median value is also 10 hours per week. However, the mean, or
# average, amount of time spent watching TV, videos, or movies is 13.45 hours
# per week with a standard deviation of 14.59 hours per week. The amount of
# time that members of the sample spend watching TV ranges from 0-200 hours
# per week. The histogram shows that the distribution of time spent watching
# TV is right skewed.
## END UNIVARIATE WORK FOR QUANTITATIVE VARIABLE.
## BIVARIATE TABLE (CROSSTAB) FOR 2 CATEGORICAL VARIABLES
# Write the formula as dependent ~ independent: the DEPENDENT variable always
# goes first. Getting this order right is critical for interpreting the table
# correctly. We "percent down, compare across": column percents show how the
# dependent variable is distributed within each group of the independent
# variable, and we compare those percents across the groups.
lehmansociology::crosstab(
  H5HR2 ~ H5OD2A,
  data = wave5addhealth,
  format = "column_percent",
  title = "Living Arrangements by Sex Assigned at Birth"
)
# Interpretation: 85% of males live in their own place, compared to 89% of
# females. A higher percent of those who were assigned the male sex at birth
# (9.1%) live with their parents as adults who are in their 30s or early 40s,
# compared to 6.4% of females.
## END WORK FOR CROSSTAB
## BAR GRAPH FOR QUANTITATIVE DEPENDENT VARIABLE AND CATEGORICAL INDEPENDENT VARIABLE
# Each bar is the mean of H5ID23 (hours per week) within one living-arrangement
# category. Rows with a missing category are dropped so no "NA" bar is drawn.
# stat_summary() takes the summary function through `fun`; the older `fun.y`
# argument is deprecated.
ggplot(data = subset(wave5addhealth, !is.na(H5HR2cat))) +
  stat_summary(aes(x = H5HR2cat, y = H5ID23), fun = mean, geom = "bar") +
  ylab("Average Hours Per Week") +
  xlab("Current Living Arrangements") +
  ggtitle("Bar Graph of Average Time Spent Watching TV/Movies/Videos by Living Arrangements")
# Interpretation: The graph shows that Add Health respondents who live in
# their own home watch about 13 hours of TV, movies, and videos per week. The
# highest average time spent watching TV is among those living in their
# parents' home or another person's home; these groups average about 17.5
# hours per week.
## END WORK FOR BIVARIATE BAR GRAPH
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment