Skip to content

Instantly share code, notes, and snippets.

@mcorey
Last active December 16, 2015 20:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mcorey/5496443 to your computer and use it in GitHub Desktop.
Save mcorey/5496443 to your computer and use it in GitHub Desktop.
This is the script that produces the results for my final dissertation chapter.
#Build 03-11 Respondent Data [12-Feb-2013 Cut to 2000 RND cases]
rm(list=ls()) #Clear Workspace
#Load Libraries
library(utils)
library(foreign)
library(car)
library(TraMineR)
library(plyr)
library(TraMineRextras)
library(parallel)
library(cluster)
library(stats)
library(descr)
library(gmodels)
library(texreg)
#Set WD and load data
#MAC
setwd("/Users/michaelcorey/Dropbox/School Work")
# *nix
# setwd("/home/michael/Dropbox/School Work/")
atus.resp.raw<-read.dta("ATUS data/atusresp_0311/atus_0311_resp.dta",
convert.factors=TRUE, warn.missing.labels = FALSE)
atus.summ<-read.dta("ATUS data/atusresp_0311/atus_0311_summ.dta",
convert.factors=TRUE, warn.missing.labels = FALSE)
ls() #check what is loaded.
atus.summ.m <- atus.summ[, c("tucaseid", "teage", "ptdtrace", "peeduca",
"pehspnon", "tesex", "t050101", "t050102",
"t050103")]
atus.resp <- merge(atus.resp.raw, atus.summ.m, by="tucaseid", all.x=F, all.y=T)
#Get codebook for new file
colnames(atus.resp) #show column names
var.labels <- attr(atus.resp.raw, "var.labels") #Make variable label object
var.lbl.summ <- attr(atus.summ.m, "var.labels")
var.labels <- c(var.labels, 1:8) #need to add dummy labels until I can pass
#the partial list of var.names from summ.m
var.labels
codebook.resp <- data.frame(var.name=names(atus.resp), var.labels) #Make codebook
codebook.resp #Print codebook
#Look around the data
#Look at line numbers
table(atus.resp$tulineno) #Looks good, all line 1 (ATUS respondents)
#Build smaller dataset
resp.0311 <- atus.resp[, c("tucaseid")] #Make dataframe
resp.0311 <- as.data.frame(resp.0311)
#HH Characteristics
resp.0311$id <- atus.resp$tucaseid #Assign ID
resp.0311$sex <- atus.resp$tesex #Resp Sex
resp.0311$kid.lt18.own <- atus.resp$trohhchild #Own HH kid <18
resp.0311$nonhh.kid.lt18. <- atus.resp$trnhhchild #Own Non-HH kid <18
resp.0311$kid.lt18.hh <- atus.resp$trhhchild #Any HH child under 18
resp.0311$kid.yngst <- atus.resp$tryhhchild #Youngest Child in HH
resp.0311$hh.size <- atus.resp$trnumhou #HH Size
resp.0311$kid.lt13.wake.ts <- atus.resp$tucc2 #1st kid <13 awake [ts]
resp.0311$kid.lt13.sleep.ts <- atus.resp$tucc4 #last kid <13 asleep [ts]
#Respondent Characteristics
resp.0311$hourly <- atus.resp$teernhry #Hourly Flag
resp.0311$year <- atus.resp$tuyear #Year of Intv
resp.0311$day <- atus.resp$tudiaryday #Week of Intv
resp.0311$holiday <- atus.resp$trholiday #Is it a holiday?
resp.0311$earn.weekly <- atus.resp$trernwa #Weekly Earnings
resp.0311$industry <- atus.resp$trdtind1 #Indstry Recode (main)
resp.0311$occup <- atus.resp$trdtocc1 #Occup Recode (main)
resp.0311$work.7d <- atus.resp$tufwk #Work last 7 days
resp.0311$month <- atus.resp$tumonth #Month of Intv
resp.0311$job.7d <- atus.resp$tuabsot #Job last 7 days
resp.0311$h35.usual <- atus.resp$tehrftpt #35 hrs usually
resp.0311$whrs.usual <- atus.resp$tehruslt #Usual work hours (tot)
resp.0311$age <- atus.resp$teage #Age of respondent
resp.0311$race <- atus.resp$ptdtrace #Race of respondent
resp.0311$hisp <- atus.resp$pehspnon #Hispanic
resp.0311$lf.edit <- atus.resp$telfs #Edited labor force
resp.0311$workj1 <- atus.summ$t050101 #Time in 1st Job on Day
resp.0311$workj2 <- atus.summ$t050102 #Time in 2nd Job on Day
resp.0311$workOth <- atus.summ$t050103 #Time in 3rd Job on Day
#Fix Respondent Characteristics
#Recode day to weekday
resp.0311$weekday <- recode(resp.0311$day,
' c("Sunday", "Saturday") = "Weekend"; else = "Weekday" ')
table(atus.resp$tudiaryday, resp.0311$weekday)
#Partner Information
resp.0311$ptner.pres <- atus.resp$trsppres #Is spouse/ptnr present
resp.0311$ptner.empl <- atus.resp$tespempnot #Sp/Ptnr Employment Status
resp.0311$ptner.job7d <- atus.resp$tuspabs #Sp/Ptnr Job Last 7d
resp.0311$ptner.35hwk <- atus.resp$tuspusft #Sp/Ptnr Usually 35h+
resp.0311$ptner.whrs <- atus.resp$tespuhrs #Sp/Ptnr Usual Work Hours
resp.0311$ptner.ftpt <- atus.resp$trspftpt #Sp/Ptnr Full/Part Time
#Save and pass to build activity data
save(resp.0311, file="Diss Paper 1 -- weekend work over time/Diss Paper 1 Data/resp0311.Rdata" )
rm(list=ls())
load(file="Diss Paper 1 -- weekend work over time/Diss Paper 1 Data/resp0311.Rdata")
#----------------------------------------------------------------------------
#Setup Activity Data
act.0311<-read.dta("ATUS data/atusact_0311/atus_0311_act.dta",
convert.factors=TRUE, warn.missing.labels = FALSE)
save(act.0311, file="Diss Paper 1 -- weekend work over time/Diss Paper 1 Data/act0311-raw.Rdata" )
#OR
#Load Pre-Set Activity Data
# load(file = "/Users/michaelcorey/Dropbox/School Work/Diss Paper 1 -- weekend work over time/Diss Paper 1 Data/act0311-raw.Rdata")
#Convert Start Time Factor into minutes (1-1440)
#Use a function from StackOverflow to split times to minutes (from 4am as zero)
act.0311$startmin <- sapply(strsplit(as.character(act.0311$tustarttim),":"),
function(x) {
x <- as.numeric(x)
((x[1]-4)*60)+x[2]+(x[3]/60)+1
} )
act.0311$stopmin <- sapply(strsplit(as.character(act.0311$tustoptime),":"),
function(x) {
x <- as.numeric(x)
((x[1]-4)*60)+x[2]+(x[3]/60)+1
} )
#Then adjust after midnight times to be greater than 1200 (e.g.20h*60m) minutes
act.0311$startmin[113:120] #look at a negative case
#Add 1440 to the cases where the start minute is less than zero
act.0311$startmin[which(act.0311$startmin<=0)] <-
act.0311$startmin[which(act.0311$startmin<=0)] + 1440
act.0311$startmin[113:120] #check correct conversion
act.0311$stopmin[113:120] #look at a negative case
#Add 1440 to the cases where the stop minute is less than zero
act.0311$stopmin[which(act.0311$stopmin<=0)] <-
act.0311$stopmin[which(act.0311$stopmin<=0)] + 1440
act.0311$stopmin[113:120] #check correct conversion
#Move Anytime the last activity moves to the next day into being 1440
act.0311$stopmin[which(act.0311$stopmin < act.0311$startmin)] <- 1440
act.0311[1:10,] #Show first ten rows, conversions look OK
summary(act.0311$stopmin) #And the summary makes sense
#make short id name1
act.0311$id <- act.0311$tucaseid
act.0311[1:10, c("id", "tucaseid")]
#Make a simple 4-category activity ledger (1=sleep, 2=care, 3=work, 4=other)
act.0311$simpleacts <- recode(act.0311$trcodep, "50101:59999='work';
else='other' " )
act.0311$simpleacts <- factor(act.0311$simpleacts, c( "other", "work"))
#Care
act.0311$careacts <- recode(act.0311$trcodep, "030101:030399='care';
else='other' " )
act.0311$careacts <- factor(act.0311$careacts, c( "other", "care"))
#Leisure
act.0311$leisureacts <- recode(act.0311$trcodep, "120101:139999='leisure';
else='other' " )
act.0311$leisureacts <- factor(act.0311$leisureacts, c( "other", "leisure"))
#Sleep
act.0311$sleepacts <- recode(act.0311$trcodep, "010101:101099='sleep';
else='other' " )
act.0311$sleepacts <- factor(act.0311$sleepacts, c( "other", "sleep"))
#Check activity codings for overlaps and accuracy
act.0311[25:50, c("id", "simpleacts", "careacts", "leisureacts",
"sleepacts", "trcodep")]
#make backup of full file
act.0311.full <- act.0311
save(act.0311, file="act0311.Rdata")
act.0311 <- act.0311[,c("id", "startmin", "stopmin", "simpleacts",
"careacts", "leisureacts", "sleepacts")]
#Reducing the activity file to weekdays and individuals with (own) children
#Subset the respondents to only thos with kids and a weekday diary
resp.0311.wkdy.kids <- subset(resp.0311, kid.lt18.own=="Yes" &
weekday=="Weekday" &
workj1>=30 & lf.edit=="Employed - at work")
## ADD Working to ^^ & below
resp.0311.wkdy.kids[1:10, c("id", "kid.lt18.own", "kid.lt18.hh",
"day", "weekday", "workj1") ]
#Select only 4000 of cases
###Why not do 1000 at a time and then rbind???!?!?!, could even foreach it
####And get some multi-core going on???
set.seed(85)
resp.0311.wkdy.kids <- resp.0311.wkdy.kids[sample(nrow(resp.0311.wkdy.kids),
4000,),]
#Subset the activity file to only those ids matching the wkdy.kids resp file
act.0311.wkdy.kids <- subset(act.0311, act.0311$id %in% resp.0311.wkdy.kids$id)
#Check my work (act.0311.wkdy.kids$id should be identical to that of resp...kids)
check <- act.0311.wkdy.kids$id %in% resp.0311.wkdy.kids$id
summary(check) #Yep, all match.
#Format the activity file as a sequence
act.seqf.simple.wkdy.kids <- seqformat(act.0311.wkdy.kids, id="id",
from = "SPELL", to="STS",
begin = "startmin", end="stopmin",
status ="simpleacts", process="FALSE")
#Sequence
act.seq.simp.wkdy.kids <- seqdef(act.seqf.simple.wkdy.kids, xtstep=120)
#Granulate to 15 minute intervals
act.seq.swk.gran <- seqgranularity(act.seq.simp.wkdy.kids,
tspan = 15, method = "first")
#Implement a label for granular y axis
time.lab.gran <- c(1:97)
time.lab.gran[1]="4am"; time.lab.gran[9]="6am"; time.lab.gran[17]="8am";
time.lab.gran[25]="10am"; time.lab.gran[33]="12pm"; time.lab.gran[41]="2pm";
time.lab.gran[49]="4pm"; time.lab.gran[57]="6pm"; time.lab.gran[65]="8pm";
time.lab.gran[73]="10pm"; time.lab.gran[81]="12am"; time.lab.gran[89]="2am";
time.lab.gran[97]="4am";
#Make sequence distance matrix
seqdist.mtx <- seqdist(act.seq.swk.gran, method="DHD", sm="CONSTANT")
#Look at sequence data & etc
#Hierarchical Aglomative (agnes) cluster ala Laurent and Kan
act.seq.simp.clstr <- agnes(seqdist.mtx, diss=T, method="ward",
keep.diss = F, keep.data=F)
#Why did Lesnard use a -.3 linkage?!?!?!?
#Plot the agnes in PDF
pdf("Diss Paper 1 -- weekend work over time/agnes3.pdf", width=40, height=15)
plot(act.seq.simp.clstr, ask=F )
dev.off()
#Breakup clusters and plot:
act.seq.swk.gran$clust12 <- cutree(act.seq.simp.clstr, k=7)
act.seq.swk.gran$clust12 <- factor(act.seq.swk.gran$clust12,
labels=paste("Cluster", 1:7))
resp.0311.wkdy.kids$work.clust <- act.seq.swk.gran$clust12
seqdplot(act.seq.swk.gran, group=resp.0311.wkdy.kids$work.clust,
xtlab=time.lab.gran, border=NA)
#Try it with 4 clusters...
act.seq.swk.gran$clust7 <- cutree(act.seq.simp.clstr, k=9)
act.seq.swk.gran$clust7 <- factor(act.seq.swk.gran$clust7,
labels=paste("Work Clust", 1:9))
#Plot State Distribution Plots
seqdplot(act.seq.swk.gran, group=act.seq.swk.gran$clust7,
xtlab=time.lab.gran, border=NA)
#Plot Modal States Plots
seqplot(act.seq.swk.gran, type="i", group=act.seq.swk.gran$clust7,
xtlab=time.lab.gran)
#Table to look at cuml and total percentage in each cluster
freq(ordered(act.seq.swk.gran$clust7), plot=F)
#Table to look at clusters against certain factors
CrossTable(act.seq.swk.gran$clust7, resp.0311.wkdy.kids$hh.size,
prop.c=F, prop.t=F, prop.chi=F)
#Plot representative sequences (why 35 seqs?)
seqrplot(act.seq.swk.gran, criterion = "prob",
xtlab=time.lab.gran, dist.matrix=seqdist.mtx,
withlegend = "right")
#Format the care activity file as a sequence
act.seqf.care <- seqformat(act.0311.wkdy.kids, id="id",
from = "SPELL", to="STS",
begin = "startmin", end="stopmin",
status ="careacts", process="FALSE")
#Sequence
act.seq.care <- seqdef(act.seqf.care, xtstep=120)
#Granulate to 15 minute intervals
act.seq.care.gran <- seqgranularity(act.seq.care,
tspan = 15, method = "first")
#Make sequence distance matrix
seqdist.mtx.care <- seqdist(act.seq.care.gran, method="DHD", sm="CONSTANT")
#Look at sequence data for care, etc
#Hierarchical Aglomative (agnes) cluster ala Laurent and Kan
act.seq.care.clstr <- agnes(seqdist.mtx.care, diss=T, method="ward",
keep.diss = F, keep.data=F)
#Why did Lesnard use the beta, and/or a -.3 linkage?!?!?!?
#Plot the Care agnes in PDF
pdf("Diss Paper 1 -- weekend work over time/CareAgnes.pdf", width=40, height=15)
plot(act.seq.care.clstr, ask=F )
dev.off()
#Make useable clusters
act.seq.care.clstr$care.clust <- cutree(act.seq.care.clstr, k=7)
resp.0311.wkdy.kids$care.clust <- factor(act.seq.care.clstr$care.clust,
labels=paste("Care Clust", 1:7))
#Plot State Distribution Plots for Care
seqdplot(act.seq.care.gran, group=resp.0311.wkdy.kids$care.clust,
xtlab=time.lab.gran, border=NA) #NEAT!
CrossTable(act.seq.care.gran$care.clust, act.seq.swk.gran$clust12,
prop.r=F, prop.t=F, prop.chisq=T)
short.clst <- act.seq.swk.gran[c("clust7", "care.clust")]
analyze.this <- merge(resp.0311.wkdy.kids, short.clst)
#Trying out texreg
testreg <- lm(whrs.usual ~ sex, data=resp.0311.wkdy.kids)
screenreg(testreg)
#collapse work clusters
resp.0311.wkdy.kids$work.5c <- recode(resp.0311.wkdy.kids$work.clust,
"c('Cluster 1','Cluster 4','Cluster 5')='Day';
c('Cluster 3')= 'Night Shift';
'Cluster 2'='AM part-time';
c('Cluster 6')='Eve Shift';
c('Cluster 7')='Long Day'")
resp.0311.wkdy.kids$work.5c <-relevel(resp.0311.wkdy.kids$work.5c, ref="Day")
#Plot Collapsed Work Clusters
seqplot(act.seq.swk.gran, type="d", group=resp.0311.wkdy.kids$work.5c,
xtlab=time.lab.gran)
#multilevel reg for work clusters?
work.mlgt <- multinom(work.5c ~ age + sex + hh.size + kid.yngst, data=resp.0311.wkdy.kids)
summary(work.mlgt)
exp(coef(work.mlgt))
write.dta(dataframe=resp.0311.wkdy.kids, file="clustForStata.dta")
#Try to find a simple kmeans elbow graph
# kMax<-20; varexpl<-NULL
# for(i in 1:kMax) {
# kMeansResults<-kmeans(seqdist.mtx, i)
# varexpl <- c(varexpl, kMeansResults$betweenss/kMeansResults$totss)
# }
# plot(1:kMax, varexpl*100, xlab = "Number of clusters",
# ylab = "Percent of variance explained",type="l")
#hmm, 3?
# act.kmeans <- kmeans(seqdist.mtx, 7)
# plot(seqdist.mtx, act.kmeans$cluster)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment