# naomispence/SOC345OM01_data_management

## SOC345OM01_data_management
#START BY LOADING LIBRARIES AND OPTIONS
library(aws.s3)
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library('lehmansociology')
options(scipen = 999)

#LOAD DATA FROM THE LEHMAN SERVER
# SECURITY: the AWS key pair used to be written out in this script. Secrets
# must never live in a shared or committed file, so they are now read from the
# environment. Add these two lines (with the real values from your instructor)
# to your personal ~/.Renviron file and restart R:
#   AWS_ACCESS_KEY_ID=...
#   AWS_SECRET_ACCESS_KEY=...
# The key pair that was previously published here should be treated as
# compromised and rotated.
aws_vars <- c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
aws_unset <- aws_vars[!nzchar(Sys.getenv(aws_vars))]
if (length(aws_unset) > 0) {
  stop(
    "Missing AWS credentials: ", paste(aws_unset, collapse = ", "),
    ". Set them in ~/.Renviron and restart R before loading the data.",
    call. = FALSE
  )
}
# The region is not a secret, so it is still set here as before.
Sys.setenv("AWS_DEFAULT_REGION" = "us-west-2")
# s3load() fetches the .rdata file from the bucket and loads its contents into
# the workspace; the data-management lines that follow expect this to create
# the data frame `wave5addhealth`.
s3load('addhealthW5.rdata', bucket = 'lehmansociologydata')

####IMPORTANT: IF YOU HAVE DATA MANAGEMENT FOR YOUR PROJECT VARIABLES, YOU SHOULD PUT IT
####RIGHT BELOW WHERE YOU LOAD THE DATA, AND YOU NEED TO RUN YOUR DATA MANAGEMENT LINES
####EVERY TIME YOU START WORKING IN RSTUDIO WHEN YOU RUN LIBRARIES AND LOAD THE DATA.

#SP: data management for H5SS3A. Code 3 ("not applicable") was given by people
#who do not have a spouse, so they have no spouse to discuss worries with; the
#recode below counts them as "no" (0).
wave5addhealth$H5SS3A <- replace(
  wave5addhealth$H5SS3A,
  which(wave5addhealth$H5SS3A == 3),
  0
)
#check the recoded distribution
frequency(wave5addhealth$H5SS3A)

#TR: reduce the number of categories for the IV (H5LM9)
wave5addhealth$H5LM9 <- local({
  employer <- wave5addhealth$H5LM9

  #work out every group from the original codes before anything is overwritten
  is_government <- employer %in% c(5, 6)
  is_self_or_family <- employer %in% c(7, 8, 9)
  is_legit_skip <- employer %in% 97

  #codes 5 and 6 join code 4, so all gov't employees (local, state, fed)
  #share dummy code 4
  employer[is_government] <- 4
  #codes 7, 8 and 9 (self-employed and family employed) become one group
  #with dummy code 5
  employer[is_self_or_family] <- 5
  #legitimate skip (97) becomes NA to exclude from the sample those who are
  #not employed
  employer[is_legit_skip] <- NA

  employer
})
#check the recoded distribution
frequency(wave5addhealth$H5LM9)

#JC: H5EC2 has a legitimate skip (997) for people who reported $0 income on
#H5EC1, and a "don't know" code (998) that needs to be coded to missing
wave5addhealth$H5EC2 <- local({
  income <- wave5addhealth$H5EC2
  legit_skip_rows <- which(income == 997)
  dont_know_rows <- which(income == 998)

  #legitimate skip becomes 0 so people with no income are their own category
  income[legit_skip_rows] <- 0
  #"don't know" becomes missing to exclude people who don't know their
  #household income
  income[dont_know_rows] <- NA

  income
})
#check the recoded distribution
frequency(wave5addhealth$H5EC2)

#DR: H5TO6 has a legitimate skip (code 97) for people who never smoked; the
#lines below exclude them from the sample.
#Step 1: turn the legitimate-skip code into a real missing value.
wave5addhealth$H5TO6[wave5addhealth$H5TO6 == 97] <- NA
#Step 2: keep only rows where H5TO6 is not missing. Missing values must be
#tested with is.na(); comparing against the text 'NA' only appeared to work
#because a comparison with a missing value is itself missing.
#This removes never smokers from the dataset, together with any row whose
#H5TO6 was already missing.
#NOTE(review): wave5addhealth itself is overwritten, so every analysis run
#after this line in the same session only sees the remaining rows.
wave5addhealth <- subset(wave5addhealth, !is.na(H5TO6))

#KH: H5LM15 has a legitimate skip (code 97) for people who are not employed.
#The lines below treat them as missing values and drop them, so the sample
#will include only people who are employed.
#Step 1: turn the legitimate-skip code into a real missing value.
wave5addhealth$H5LM15[wave5addhealth$H5LM15 == 97] <- NA
#Step 2: keep only rows where H5LM15 is not missing. Missing values must be
#tested with is.na(); comparing against the text 'NA' only appeared to work
#because a comparison with a missing value is itself missing.
#Rows whose H5LM15 was already missing are dropped as well.
#NOTE(review): wave5addhealth itself is overwritten, so every analysis run
#after this line in the same session only sees the remaining rows.
wave5addhealth <- subset(wave5addhealth, !is.na(H5LM15))
	# (Duplicate removed.) This span held a tab-indented, word-for-word repeat of
	# the script above: the same library() calls, the same S3 data load and the
	# same SP/TR/JC/DR/KH data-management lines. Running it downloaded the
	# dataset a second time and redid identical recodes and filters on the fresh
	# copy, so the final wave5addhealth was the same as after the first pass. It
	# also carried a second copy of the hard-coded AWS keys. Keep one copy of
	# each data-management step, directly below where the data is loaded.