Skip to content

Instantly share code, notes, and snippets.

@naomispence
Last active October 3, 2023 16:18
Show Gist options
  • Save naomispence/050ddfdebc43d3dfd00068e4a3d27c1a to your computer and use it in GitHub Desktop.
Save naomispence/050ddfdebc43d3dfd00068e4a3d27c1a to your computer and use it in GitHub Desktop.
#START BY LOADING LIBRARIES AND OPTIONS
library(aws.s3)
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
options(scipen = 999)
#LOAD DATA FROM THE LEHMAN SERVER
# SECURITY: the AWS access key and secret that were previously hard-coded on
# these lines have been removed. Credentials written into a script (especially
# one shared publicly) can be read by anyone and should be rotated.
# aws.s3 looks up AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY in the
# environment, so define them once outside this file, e.g. in ~/.Renviron:
#   AWS_ACCESS_KEY_ID=...
#   AWS_SECRET_ACCESS_KEY=...
aws_vars_needed <- c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
aws_vars_missing <- aws_vars_needed[!nzchar(Sys.getenv(aws_vars_needed))]
if (length(aws_vars_missing) > 0) {
  # Fail early with a clear message instead of an opaque S3 "access denied".
  stop(
    "Missing AWS credentials: ", paste(aws_vars_missing, collapse = ", "),
    ". Define them in ~/.Renviron (or the session environment) and restart R.",
    call. = FALSE
  )
}
# The region is not a secret, so it is still set here exactly as before.
Sys.setenv("AWS_DEFAULT_REGION" = "us-west-2")
# Load the .rdata file from the course bucket into the session; the rest of
# this script expects it to provide the `wave5addhealth` data frame.
s3load('addhealthW5.rdata', bucket = 'lehmansociologydata')
##LINES 18-30 ARE FROM LAST WEEK WHEN WE WORKED WITH CATEGORICAL VARIABLES FOR GRAPHS.
# Create H5HR2cat: a factor copy of the living-arrangement item H5HR2 with
# readable labels for graphing. The numeric H5HR2 column itself is unchanged.
# NOTE(review): code 4 is not listed in `levels`, so any row with H5HR2 == 4
# becomes NA in H5HR2cat -- confirm against the codebook that this is intended.
wave5addhealth$H5HR2cat <- factor(
  wave5addhealth$H5HR2,
  levels = c(1, 2, 3, 5, 6),
  labels = c("Own Place", "Parents' Home", "Another Person's Home", "Homeless", "Other")
)
# Frequency table and mode are computed on the original numeric codes (H5HR2),
# not on the labelled factor.
frequency(
  wave5addhealth$H5HR2,
  title = "Frequency Distribution of Living Arrangements, Wave 5 Add Health"
)
MODE(wave5addhealth$H5HR2)
# Bar graph of the share of respondents in each living arrangement.
# Rows where H5HR2cat is NA are removed first, so missing values are neither
# drawn as a bar nor counted in the denominator of the proportions.
# after_stat(count) replaces the `..count..` notation, which is deprecated as
# of ggplot2 3.4.0; the computed proportions are the same.
ggplot(data = subset(wave5addhealth, !is.na(H5HR2cat)), aes(x = H5HR2cat)) +
  geom_bar(
    aes(y = after_stat(count / sum(count))),
    color = "blue",
    fill = "yellow"
  ) +
  # Proportions (0-1) are displayed as percentages on the axis.
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Bar Graph of Living Arrangements, Wave 5 Add Health") +
  labs(y = "Percent", x = "Current Residence") +
  # Tilt the category labels so the longer ones do not overlap.
  theme(axis.text.x = element_text(angle = -25))
##END UNIVARIATE WORK FOR A CATEGORICAL VARIABLE
#LINES 34-56: UNIVARIATE STATISTICS AND GRAPH FOR A QUANTITATIVE VARIABLE
# Frequency table for H5ID23 (hours per week spent watching TV, movies, or
# videos), with a cumulative percent column requested.
frequency(wave5addhealth$H5ID23, cumulative.percent = TRUE,
          title = "Distribution of Hours per Week Spent Watching TV, Movies, or Videos, Wave 5 Add Health")
#QUANTITATIVE VARIABLE INTERPRETATION: 3.8% of the sample report watching 0 hours of TV, movies, and videos. About one-quarter
# (24.7%) watch 4 hours or fewer (cum. percent). Ten hours per week was the amount of time reported by 11% of the Add
# Health respondents.
# Histogram with one bar per hour (binwidth = 1); bar heights are rescaled from
# counts to percent of all plotted cases.
# after_stat(count) replaces the `..count..` notation, which is deprecated as
# of ggplot2 3.4.0; the computed percentages are the same.
ggplot(data = wave5addhealth, aes(x = H5ID23)) +
  geom_histogram(
    aes(y = after_stat(count / sum(count) * 100)),
    color = "blue",
    fill = "green",
    binwidth = 1
  ) +
  ggtitle("Figure 1. Distribution of Time Spent Watching TV, Movies, or Videos, Add Health Wave 5") +
  labs(y = "Percent", x = "Hours per Week")
#note: for a quantitative variable, you should get mode, median, mean, standard deviation, and range; you get these results
# mostly from summary but need MODE and sd for those two statistics.
# Univariate statistics for H5ID23 (hours per week of TV/movies/videos).
# MODE() is not a base R function; it is presumably supplied by one of the
# packages loaded at the top of the script (likely lehmansociology) -- confirm.
MODE(wave5addhealth$H5ID23)
# summary() is used here for the median, mean, and range (minimum/maximum).
# NOTE(review): `na.rm` is not a documented argument of base summary(); it
# appears to be ignored here, since summary() reports NAs separately -- confirm.
summary(wave5addhealth$H5ID23, na.rm=TRUE)
# Standard deviation; na.rm=TRUE drops missing values before computing it.
sd(wave5addhealth$H5ID23, na.rm=TRUE)
#QUANTITATIVE VARIABLE INTERPRETATION: The mode, or most commonly reported amount of time spent watching TV
# among Add Health respondents is 10 hours per week. The median value is also 10 hours per week. However, the
# mean, or average, amount of time spent watching TV, videos, or movies is 13.45 hours per week with a standard deviation of
# 14.59 hours per week. The amount of time that members of the sample spend watching TV ranges from 0-200 hours per week.
# The histogram shows that the distribution of time spent watching TV is right skewed.
##END UNIVARIATE WORK FOR QUANTITATIVE VARIABLE.
##LINES 60-70: BIVARIATE TABLE (CROSSTAB) FOR 2 CATEGORICAL VARIABLES
#NOTE THAT YOU NEED TO PUT YOUR DEPENDENT VARIABLE FIRST; dependent ~ independent
#The order that you list variables in a crosstab is critical for ensuring that you're
#correctly interpreting the results. We "percent down, compare across" to see group
#differences in the dependent variable by groups of the independent variable.
# Crosstab of living arrangements (dependent, left of ~) by sex assigned at
# birth (independent, right of ~), reported as column percentages.
lehmansociology::crosstab(
  H5HR2 ~ H5OD2A,
  data = wave5addhealth,
  format = "column_percent",
  title = "Living Arrangements by Sex Assigned at Birth"
)
#Interpretation: 85% of males live in their own place, compared to 89% of females.
#A higher percent of those who were assigned the male sex at birth (9.1%) live with
#their parents as adults who are in their 30s or early 40s, compared to 6.4% of females.
##END WORK FOR CROSSTAB
##LINES 74-77: BAR GRAPH FOR QUANTITATIVE DEPENDENT VARIABLE AND CATEGORICAL INDEPENDENT VARIABLE
# Bar graph of mean weekly viewing hours (H5ID23) within each living
# arrangement (H5HR2cat). Rows with a missing H5HR2cat are dropped so NA does
# not appear as its own bar.
# `fun = mean` replaces `fun.y = mean`, which is deprecated as of ggplot2 3.3.0;
# each bar's height is still the group mean.
ggplot(data = subset(wave5addhealth, !is.na(H5HR2cat))) +
  stat_summary(
    aes(x = H5HR2cat, y = H5ID23),
    fun = mean,
    geom = "bar"
  ) +
  ylab("Average Hours Per Week") +
  xlab("Current Living Arrangements") +
  ggtitle("Bar Graph of Average Time Spent Watching TV/Movies/Videos by Living Arrangements")
#Interpretation: The graph shows that Add Health respondents who live in their own
#home watch about 13 hours of TV, movies, and videos per week. The highest average
#time spent watching TV is among those living in their parents' home or another
#person's home; these groups average about 17.5 hours per week.
##END WORK FOR BIVARIATE BAR GRAPH
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment