Last active
December 16, 2015 20:59
-
-
Save mcorey/5496443 to your computer and use it in GitHub Desktop.
This is the script that produces the results for my final dissertation chapter.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Build 03-11 Respondent Data [12-Feb-2013 Cut to 2000 RND cases] | |
rm(list=ls()) #Clear Workspace | |
#Load Libraries | |
library(utils) | |
library(foreign) | |
library(car) | |
library(TraMineR) | |
library(plyr) | |
library(TraMineRextras) | |
library(parallel) | |
library(cluster) | |
library(stats) | |
library(descr) | |
library(gmodels) | |
library(texreg) | |
#Set WD and load data | |
#MAC | |
setwd("/Users/michaelcorey/Dropbox/School Work") | |
# *nix | |
# setwd("/home/michael/Dropbox/School Work/") | |
atus.resp.raw<-read.dta("ATUS data/atusresp_0311/atus_0311_resp.dta", | |
convert.factors=TRUE, warn.missing.labels = FALSE) | |
atus.summ<-read.dta("ATUS data/atusresp_0311/atus_0311_summ.dta", | |
convert.factors=TRUE, warn.missing.labels = FALSE) | |
ls() #check what is loaded. | |
atus.summ.m <- atus.summ[, c("tucaseid", "teage", "ptdtrace", "peeduca", | |
"pehspnon", "tesex", "t050101", "t050102", | |
"t050103")] | |
atus.resp <- merge(atus.resp.raw, atus.summ.m, by="tucaseid", all.x=F, all.y=T) | |
#Get codebook for new file | |
colnames(atus.resp) #show column names | |
var.labels <- attr(atus.resp.raw, "var.labels") #Make variable label object | |
var.lbl.summ <- attr(atus.summ.m, "var.labels") | |
var.labels <- c(var.labels, 1:8) #need to add dummy labels until I can pass | |
#the partial list of var.names from summ.m | |
var.labels | |
codebook.resp <- data.frame(var.name=names(atus.resp), var.labels) #Make codebook | |
codebook.resp #Print codebook | |
#Look around the data | |
#Look at line numbers | |
table(atus.resp$tulineno) #Looks good, all line 1 (ATUS respondents) | |
#Build smaller dataset | |
resp.0311 <- atus.resp[, c("tucaseid")] #Make dataframe | |
resp.0311 <- as.data.frame(resp.0311) | |
#HH Characteristics | |
resp.0311$id <- atus.resp$tucaseid #Assign ID | |
resp.0311$sex <- atus.resp$tesex #Resp Sex | |
resp.0311$kid.lt18.own <- atus.resp$trohhchild #Own HH kid <18 | |
resp.0311$nonhh.kid.lt18. <- atus.resp$trnhhchild #Own Non-HH kid <18 | |
resp.0311$kid.lt18.hh <- atus.resp$trhhchild #Any HH child under 18 | |
resp.0311$kid.yngst <- atus.resp$tryhhchild #Youngest Child in HH | |
resp.0311$hh.size <- atus.resp$trnumhou #HH Size | |
resp.0311$kid.lt13.wake.ts <- atus.resp$tucc2 #1st kid <13 awake [ts] | |
resp.0311$kid.lt13.sleep.ts <- atus.resp$tucc4 #last kid <13 asleep [ts] | |
#Respondent Characteristics | |
resp.0311$hourly <- atus.resp$teernhry #Hourly Flag | |
resp.0311$year <- atus.resp$tuyear #Year of Intv | |
resp.0311$day <- atus.resp$tudiaryday #Week of Intv | |
resp.0311$holiday <- atus.resp$trholiday #Is it a holiday? | |
resp.0311$earn.weekly <- atus.resp$trernwa #Weekly Earnings | |
resp.0311$industry <- atus.resp$trdtind1 #Indstry Recode (main) | |
resp.0311$occup <- atus.resp$trdtocc1 #Occup Recode (main) | |
resp.0311$work.7d <- atus.resp$tufwk #Work last 7 days | |
resp.0311$month <- atus.resp$tumonth #Month of Intv | |
resp.0311$job.7d <- atus.resp$tuabsot #Job last 7 days | |
resp.0311$h35.usual <- atus.resp$tehrftpt #35 hrs usually | |
resp.0311$whrs.usual <- atus.resp$tehruslt #Usual work hours (tot) | |
resp.0311$age <- atus.resp$teage #Age of respondent | |
resp.0311$race <- atus.resp$ptdtrace #Race of respondent | |
resp.0311$hisp <- atus.resp$pehspnon #Hispanic | |
resp.0311$lf.edit <- atus.resp$telfs #Edited labor force | |
resp.0311$workj1 <- atus.summ$t050101 #Time in 1st Job on Day | |
resp.0311$workj2 <- atus.summ$t050102 #Time in 2nd Job on Day | |
resp.0311$workOth <- atus.summ$t050103 #Time in 3rd Job on Day | |
#Fix Respondent Characteristics | |
#Recode day to weekday | |
resp.0311$weekday <- recode(resp.0311$day, | |
' c("Sunday", "Saturday") = "Weekend"; else = "Weekday" ') | |
table(atus.resp$tudiaryday, resp.0311$weekday) | |
#Partner Information | |
resp.0311$ptner.pres <- atus.resp$trsppres #Is spouse/ptnr present | |
resp.0311$ptner.empl <- atus.resp$tespempnot #Sp/Ptnr Employment Status | |
resp.0311$ptner.job7d <- atus.resp$tuspabs #Sp/Ptnr Job Last 7d | |
resp.0311$ptner.35hwk <- atus.resp$tuspusft #Sp/Ptnr Usually 35h+ | |
resp.0311$ptner.whrs <- atus.resp$tespuhrs #Sp/Ptnr Usual Work Hours | |
resp.0311$ptner.ftpt <- atus.resp$trspftpt #Sp/Ptnr Full/Part Time | |
#Save and pass to build activity data | |
save(resp.0311, file="Diss Paper 1 -- weekend work over time/Diss Paper 1 Data/resp0311.Rdata" ) | |
rm(list=ls()) | |
load(file="Diss Paper 1 -- weekend work over time/Diss Paper 1 Data/resp0311.Rdata") | |
#---------------------------------------------------------------------------- | |
#Setup Activity Data | |
act.0311<-read.dta("ATUS data/atusact_0311/atus_0311_act.dta", | |
convert.factors=TRUE, warn.missing.labels = FALSE) | |
save(act.0311, file="Diss Paper 1 -- weekend work over time/Diss Paper 1 Data/act0311-raw.Rdata" ) | |
#OR | |
#Load Pre-Set Activity Data | |
# load(file = "/Users/michaelcorey/Dropbox/School Work/Diss Paper 1 -- weekend work over time/Diss Paper 1 Data/act0311-raw.Rdata") | |
#Convert Start Time Factor into minutes (1-1440) | |
#Use a function from StackOverflow to split times to minutes (from 4am as zero) | |
act.0311$startmin <- sapply(strsplit(as.character(act.0311$tustarttim),":"), | |
function(x) { | |
x <- as.numeric(x) | |
((x[1]-4)*60)+x[2]+(x[3]/60)+1 | |
} ) | |
act.0311$stopmin <- sapply(strsplit(as.character(act.0311$tustoptime),":"), | |
function(x) { | |
x <- as.numeric(x) | |
((x[1]-4)*60)+x[2]+(x[3]/60)+1 | |
} ) | |
#Then adjust after midnight times to be greater than 1200 (e.g.20h*60m) minutes | |
act.0311$startmin[113:120] #look at a negative case | |
#Add 1440 to the cases where the start minute is less than zero | |
act.0311$startmin[which(act.0311$startmin<=0)] <- | |
act.0311$startmin[which(act.0311$startmin<=0)] + 1440 | |
act.0311$startmin[113:120] #check correct conversion | |
act.0311$stopmin[113:120] #look at a negative case | |
#Add 1440 to the cases where the stop minute is less than zero | |
act.0311$stopmin[which(act.0311$stopmin<=0)] <- | |
act.0311$stopmin[which(act.0311$stopmin<=0)] + 1440 | |
act.0311$stopmin[113:120] #check correct conversion | |
#Move Anytime the last activity moves to the next day into being 1440 | |
act.0311$stopmin[which(act.0311$stopmin < act.0311$startmin)] <- 1440 | |
act.0311[1:10,] #Show first ten rows, conversions look OK | |
summary(act.0311$stopmin) #And the summary makes sense | |
#make short id name1 | |
act.0311$id <- act.0311$tucaseid | |
act.0311[1:10, c("id", "tucaseid")] | |
#Make a simple 4-category activity ledger (1=sleep, 2=care, 3=work, 4=other) | |
act.0311$simpleacts <- recode(act.0311$trcodep, "50101:59999='work'; | |
else='other' " ) | |
act.0311$simpleacts <- factor(act.0311$simpleacts, c( "other", "work")) | |
#Care | |
act.0311$careacts <- recode(act.0311$trcodep, "030101:030399='care'; | |
else='other' " ) | |
act.0311$careacts <- factor(act.0311$careacts, c( "other", "care")) | |
#Leisure | |
act.0311$leisureacts <- recode(act.0311$trcodep, "120101:139999='leisure'; | |
else='other' " ) | |
act.0311$leisureacts <- factor(act.0311$leisureacts, c( "other", "leisure")) | |
#Sleep | |
act.0311$sleepacts <- recode(act.0311$trcodep, "010101:101099='sleep'; | |
else='other' " ) | |
act.0311$sleepacts <- factor(act.0311$sleepacts, c( "other", "sleep")) | |
#Check activity codings for overlaps and accuracy | |
act.0311[25:50, c("id", "simpleacts", "careacts", "leisureacts", | |
"sleepacts", "trcodep")] | |
#make backup of full file | |
act.0311.full <- act.0311 | |
save(act.0311, file="act0311.Rdata") | |
act.0311 <- act.0311[,c("id", "startmin", "stopmin", "simpleacts", | |
"careacts", "leisureacts", "sleepacts")] | |
#Reducing the activity file to weekdays and individuals with (own) children | |
#Subset the respondents to only thos with kids and a weekday diary | |
resp.0311.wkdy.kids <- subset(resp.0311, kid.lt18.own=="Yes" & | |
weekday=="Weekday" & | |
workj1>=30 & lf.edit=="Employed - at work") | |
## ADD Working to ^^ & below | |
resp.0311.wkdy.kids[1:10, c("id", "kid.lt18.own", "kid.lt18.hh", | |
"day", "weekday", "workj1") ] | |
#Select only 4000 of cases | |
###Why not do 1000 at a time and then rbind???!?!?!, could even foreach it | |
####And get some multi-core going on??? | |
set.seed(85) | |
resp.0311.wkdy.kids <- resp.0311.wkdy.kids[sample(nrow(resp.0311.wkdy.kids), | |
4000,),] | |
#Subset the activity file to only those ids matching the wkdy.kids resp file | |
act.0311.wkdy.kids <- subset(act.0311, act.0311$id %in% resp.0311.wkdy.kids$id) | |
#Check my work (act.0311.wkdy.kids$id should be identical to that of resp...kids) | |
check <- act.0311.wkdy.kids$id %in% resp.0311.wkdy.kids$id | |
summary(check) #Yep, all match. | |
#Format the activity file as a sequence | |
act.seqf.simple.wkdy.kids <- seqformat(act.0311.wkdy.kids, id="id", | |
from = "SPELL", to="STS", | |
begin = "startmin", end="stopmin", | |
status ="simpleacts", process="FALSE") | |
#Sequence | |
act.seq.simp.wkdy.kids <- seqdef(act.seqf.simple.wkdy.kids, xtstep=120) | |
#Granulate to 15 minute intervals | |
act.seq.swk.gran <- seqgranularity(act.seq.simp.wkdy.kids, | |
tspan = 15, method = "first") | |
#Implement a label for granular y axis | |
time.lab.gran <- c(1:97) | |
time.lab.gran[1]="4am"; time.lab.gran[9]="6am"; time.lab.gran[17]="8am"; | |
time.lab.gran[25]="10am"; time.lab.gran[33]="12pm"; time.lab.gran[41]="2pm"; | |
time.lab.gran[49]="4pm"; time.lab.gran[57]="6pm"; time.lab.gran[65]="8pm"; | |
time.lab.gran[73]="10pm"; time.lab.gran[81]="12am"; time.lab.gran[89]="2am"; | |
time.lab.gran[97]="4am"; | |
#Make sequence distance matrix | |
seqdist.mtx <- seqdist(act.seq.swk.gran, method="DHD", sm="CONSTANT") | |
#Look at sequence data & etc | |
#Hierarchical Aglomative (agnes) cluster ala Laurent and Kan | |
act.seq.simp.clstr <- agnes(seqdist.mtx, diss=T, method="ward", | |
keep.diss = F, keep.data=F) | |
#Why did Lesnard use a -.3 linkage?!?!?!? | |
#Plot the agnes in PDF | |
pdf("Diss Paper 1 -- weekend work over time/agnes3.pdf", width=40, height=15) | |
plot(act.seq.simp.clstr, ask=F ) | |
dev.off() | |
#Breakup clusters and plot: | |
act.seq.swk.gran$clust12 <- cutree(act.seq.simp.clstr, k=7) | |
act.seq.swk.gran$clust12 <- factor(act.seq.swk.gran$clust12, | |
labels=paste("Cluster", 1:7)) | |
resp.0311.wkdy.kids$work.clust <- act.seq.swk.gran$clust12 | |
seqdplot(act.seq.swk.gran, group=resp.0311.wkdy.kids$work.clust, | |
xtlab=time.lab.gran, border=NA) | |
#Try it with 4 clusters... | |
act.seq.swk.gran$clust7 <- cutree(act.seq.simp.clstr, k=9) | |
act.seq.swk.gran$clust7 <- factor(act.seq.swk.gran$clust7, | |
labels=paste("Work Clust", 1:9)) | |
#Plot State Distribution Plots | |
seqdplot(act.seq.swk.gran, group=act.seq.swk.gran$clust7, | |
xtlab=time.lab.gran, border=NA) | |
#Plot Modal States Plots | |
seqplot(act.seq.swk.gran, type="i", group=act.seq.swk.gran$clust7, | |
xtlab=time.lab.gran) | |
#Table to look at cuml and total percentage in each cluster | |
freq(ordered(act.seq.swk.gran$clust7), plot=F) | |
#Table to look at clusters against certain factors | |
CrossTable(act.seq.swk.gran$clust7, resp.0311.wkdy.kids$hh.size, | |
prop.c=F, prop.t=F, prop.chi=F) | |
#Plot representative sequences (why 35 seqs?) | |
seqrplot(act.seq.swk.gran, criterion = "prob", | |
xtlab=time.lab.gran, dist.matrix=seqdist.mtx, | |
withlegend = "right") | |
#Format the care activity file as a sequence | |
act.seqf.care <- seqformat(act.0311.wkdy.kids, id="id", | |
from = "SPELL", to="STS", | |
begin = "startmin", end="stopmin", | |
status ="careacts", process="FALSE") | |
#Sequence | |
act.seq.care <- seqdef(act.seqf.care, xtstep=120) | |
#Granulate to 15 minute intervals | |
act.seq.care.gran <- seqgranularity(act.seq.care, | |
tspan = 15, method = "first") | |
#Make sequence distance matrix | |
seqdist.mtx.care <- seqdist(act.seq.care.gran, method="DHD", sm="CONSTANT") | |
#Look at sequence data for care, etc | |
#Hierarchical Aglomative (agnes) cluster ala Laurent and Kan | |
act.seq.care.clstr <- agnes(seqdist.mtx.care, diss=T, method="ward", | |
keep.diss = F, keep.data=F) | |
#Why did Lesnard use the beta, and/or a -.3 linkage?!?!?!? | |
#Plot the Care agnes in PDF | |
pdf("Diss Paper 1 -- weekend work over time/CareAgnes.pdf", width=40, height=15) | |
plot(act.seq.care.clstr, ask=F ) | |
dev.off() | |
#Make useable clusters | |
act.seq.care.clstr$care.clust <- cutree(act.seq.care.clstr, k=7) | |
resp.0311.wkdy.kids$care.clust <- factor(act.seq.care.clstr$care.clust, | |
labels=paste("Care Clust", 1:7)) | |
#Plot State Distribution Plots for Care | |
seqdplot(act.seq.care.gran, group=resp.0311.wkdy.kids$care.clust, | |
xtlab=time.lab.gran, border=NA) #NEAT! | |
CrossTable(act.seq.care.gran$care.clust, act.seq.swk.gran$clust12, | |
prop.r=F, prop.t=F, prop.chisq=T) | |
short.clst <- act.seq.swk.gran[c("clust7", "care.clust")] | |
analyze.this <- merge(resp.0311.wkdy.kids, short.clst) | |
#Trying out texreg | |
testreg <- lm(whrs.usual ~ sex, data=resp.0311.wkdy.kids) | |
screenreg(testreg) | |
#collapse work clusters | |
resp.0311.wkdy.kids$work.5c <- recode(resp.0311.wkdy.kids$work.clust, | |
"c('Cluster 1','Cluster 4','Cluster 5')='Day'; | |
c('Cluster 3')= 'Night Shift'; | |
'Cluster 2'='AM part-time'; | |
c('Cluster 6')='Eve Shift'; | |
c('Cluster 7')='Long Day'") | |
resp.0311.wkdy.kids$work.5c <-relevel(resp.0311.wkdy.kids$work.5c, ref="Day") | |
#Plot Collapsed Work Clusters | |
seqplot(act.seq.swk.gran, type="d", group=resp.0311.wkdy.kids$work.5c, | |
xtlab=time.lab.gran) | |
#multilevel reg for work clusters? | |
work.mlgt <- multinom(work.5c ~ age + sex + hh.size + kid.yngst, data=resp.0311.wkdy.kids) | |
summary(work.mlgt) | |
exp(coef(work.mlgt)) | |
write.dta(dataframe=resp.0311.wkdy.kids, file="clustForStata.dta") | |
#Try to find a simple kmeans elbow graph | |
# kMax<-20; varexpl<-NULL | |
# for(i in 1:kMax) { | |
# kMeansResults<-kmeans(seqdist.mtx, i) | |
# varexpl <- c(varexpl, kMeansResults$betweenss/kMeansResults$totss) | |
# } | |
# plot(1:kMax, varexpl*100, xlab = "Number of clusters", | |
# ylab = "Percent of variance explained",type="l") | |
#hmm, 3? | |
# act.kmeans <- kmeans(seqdist.mtx, 7) | |
# plot(seqdist.mtx, act.kmeans$cluster) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment