Skip to content

Instantly share code, notes, and snippets.

@charlezzee
Last active November 16, 2016 22:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save charlezzee/1f2135e4c535240c4a55352a829757f4 to your computer and use it in GitHub Desktop.
Save charlezzee/1f2135e4c535240c4a55352a829757f4 to your computer and use it in GitHub Desktop.
Code QSS 30.05 Charlie
#reading in the packages required to make desired plots
library(dplyr)
library(ggplot2)
library(readr)
library(gtools)
library(maptools)
library(ggmap)
#setting the working directory
#the relative paths used later in this script ('./data/usa_00031.csv' for the
#input data and 'plot.png' for saved figures) are resolved against this folder
#NOTE(review): this is a hard-coded absolute path; the call errors on any
#machine where the directory does not exist - confirm before running elsewhere
setwd('/users/charleslundquist/documents/QSS 30.05- Charlie')
#Save a plot to a PNG file.
#
#Arguments:
#  plot     - object to draw; it is rendered by calling print() on it
#  filename - path of the PNG file to write (default 'plot.png')
#  height   - image height passed to png() (default 500)
#  width    - image width passed to png() (default 1000)
#
#Returns the value of dev.off(), exactly as before.
#
#Every call that keeps the default file name writes to the same file, so pass
#a distinct filename for each figure that should be kept on disk.
printplot <- function(plot, filename='plot.png', height=500, width=1000) {
  png(filename, height=height, width=width)
  #if rendering fails, close the PNG device before re-raising the error so a
  #half-open device is not left as the active graphics device
  tryCatch(print(plot), error=function(e) {
    dev.off()
    stop(e)
  })
  dev.off()
}
#Load the raw extract; the two weight columns (HHWT, PERWT) are parsed as doubles
a <- read_csv(
  "./data/usa_00031.csv",
  col_types = cols(HHWT = col_double(), PERWT = col_double())
)
#Add a labelled Sex factor: SEX code 1 is 'Male', any other code is 'Female'
b <- mutate(
  a,
  Sex = factor(ifelse(SEX == 1, 1, 2), labels = c("Male", "Female"))
)
#Drop Alaska and Hawaii (STATEFIP 2 and 15) for years before 1960
C <- filter(b, !(STATEFIP %in% c(2, 15) & YEAR < 1960))
#Keep only rows with group-quarters code GQ == 1, so only families are included
c <- filter(C, GQ == 1)
#Work-status flag: OCC1950 codes up to 979 count as working for pay,
#anything above 979 as not working/unemployed
d <- mutate(
  c,
  Wrk = ifelse(OCC1950 <= 979, "Working for Pay", "Not Working/Unemployed")
)
#filtering data set to only include Married (head or spouse) Women from 1940 onwards
#RELATE is tested with %in% so that both codes 1 and 2 are really checked;
#an expression of the form `RELATE==1 | 2` is TRUE for every row, because the
#bare 2 is coerced to TRUE, which would switch off the YEAR and RELATE filters
e <- d %>% filter(Sex=='Female', YEAR>=1940, RELATE %in% c(1,2))
#Weighted head-count (sum of PERWT) of the women in e for every
#YEAR x work-status combination; used for the population chart
E <- summarise(group_by(e, YEAR, Wrk), POPULATE = sum(PERWT))
#The same weighted counts under the column name used for the rate calculation
asd <- summarise(group_by(e, YEAR, Wrk), Number5 = sum(PERWT))
#Weighted total of all women in e per year (the denominator of the rate)
asdf <- summarise(group_by(e, YEAR), Number6 = sum(PERWT))
#Attach the yearly total to each YEAR x work-status row
#(no `by` is given, so the join uses the columns the two tables share)
joinem <- asd %>% left_join(asdf)
#pct = share of the year's women that fall in each work-status group, in percent
pctwrk <- mutate(joinem, pct = Number5 / Number6 * 100)
#Open the result in the data viewer for inspection
View(pctwrk)
#Figure 1: stacked bars of the working / not-working shares per year.
#pct is stored in percent, so it is divided by 100 before the percent
#formatter is applied to the y axis.
graph4 <- ggplot(pctwrk, aes(x = YEAR, y = pct / 100, fill = Wrk)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Figure 1: Wives Labor Force Participation",
    x = "Year",
    y = "Percent Working",
    fill = "Work Status"
  ) +
  scale_y_continuous(labels = scales::percent) +
  #one axis break per decade, 1940 through 2000
  scale_x_continuous(breaks = seq(1940, 2000, by = 10)) +
  scale_fill_brewer(palette = "Set2", guide = guide_legend(reverse = TRUE))
#Write the figure to disk
printplot(graph4)
#Figure 2: stacked bars of the weighted number of working and not-working
#women per year. This uses the E table, whose POPULATE column holds the
#weighted counts rather than shares.
graph6 <- ggplot(E, aes(x = YEAR, y = POPULATE, fill = Wrk)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Figure 2: Wives Labor Force Participation,1940-2000",
    x = "Year",
    y = "Population",
    fill = "Work Status"
  ) +
  scale_y_continuous(labels = scales::comma) +
  #one axis break per decade, 1940 through 2000
  scale_x_continuous(breaks = seq(1940, 2000, by = 10)) +
  scale_fill_brewer(palette = "Set2", guide = guide_legend(reverse = TRUE))
#Write the figure to disk
printplot(graph6)
#Keep only rows whose work status and wage income agree: workers with a
#positive INCWAGE, and non-workers with an INCWAGE of exactly 0. Rows that
#contradict themselves are treated as likely mismeasurements and dropped.
c2 <- filter(
  d,
  (Wrk == "Working for Pay" & INCWAGE > 0) |
    (Wrk == "Not Working/Unemployed" & INCWAGE == 0)
)
#inc = wage income scaled by CPI99 (1999 price levels, per the original note).
#INCWAGE values of 999998 and above are missing-value codes and are set to 0.
c3 <- mutate(c2, inc = ifelse(INCWAGE < 999998, INCWAGE * CPI99, 0))
#Husbands: men with SPLOC > 0 (presumably meaning a spouse is present in the
#household - the join further down matches SPLOC against PERNUM).
#Wrk and inc are renamed so they stay distinguishable after joining:
#  hwork - the husband's work status
#  hinc  - the husband's income (the inc column computed above)
husbands <- c3 %>%
  filter(Sex == "Male", SPLOC > 0) %>%
  rename(hwork = Wrk, hinc = inc)
#Wives: women with SPLOC > 0, reduced to the columns needed for the join and
#the charts, with work status and income renamed to wwork and winc
wies <- c3 %>%
  filter(Sex == "Female", SPLOC > 0) %>%
  select(YEAR, SERIAL, PERNUM, wwork = Wrk, SPLOC, winc = inc)
#joining the husbands and wives together to create a new couples variable:
#each wife is matched to the man in the same YEAR and SERIAL whose PERNUM
#equals her SPLOC
#using the wwork and hwork variables, I create a new variable that measures the breadwinner status of the couples
#  1 = neither works, 2 = only the husband works,
#  3 = both have the same status (both work, since code 1 already caught "neither"),
#  4 = only the wife works
#levels=1:4 is given explicitly so the four labels always line up with the
#codes, even if one category happens to be absent from the data
couples <- left_join(wies,husbands,by=c('YEAR','SERIAL','SPLOC'='PERNUM')) %>%
  mutate(breadwin=factor(
    ifelse(wwork=='Not Working/Unemployed' & hwork=='Not Working/Unemployed',1,
    ifelse(wwork=='Not Working/Unemployed' & hwork=='Working for Pay',2,
    ifelse(wwork==hwork,3,4))),
    levels=1:4,
    labels=c('No Bread Winner','Male Bread Winner','Dual Bread Winner','Female Bread Winner')))
#I only want to look at female, male, and dual breadwinners, so I filter out
#the missing (wives without a matched husband) and No Bread Winner categories
#missing values are tested with is.na(); comparing against the string 'NA'
#does not test for missingness
couplescc <- couples %>% filter(!is.na(breadwin), breadwin!='No Bread Winner')
#Household income distribution per YEAR and breadwinner type.
#The couple's income is winc + hinc. Each couple is repeated HHWT times
#(its household weight) before the statistics are taken, so the median and
#the 10th, 25th, 75th and 90th percentiles are weighted:
#  MIN = 10th, LOW = 25th, MED = median, HIGH = 75th, MAX = 90th percentile
ccc <- couplescc %>%
  mutate(couple_income = winc + hinc) %>%
  group_by(YEAR, breadwin) %>%
  summarise(
    MED = median(rep(couple_income, times = HHWT)),
    MIN = quantile(rep(couple_income, times = HHWT), 0.1),
    LOW = quantile(rep(couple_income, times = HHWT), 0.25),
    HIGH = quantile(rep(couple_income, times = HHWT), 0.75),
    MAX = quantile(rep(couple_income, times = HHWT), 0.9)
  )
#Quick look at the first rows
head(ccc)
#Keep only years after 1950 (the charts below are titled 1960-2000)
Coupfinal <- filter(ccc, YEAR > 1950)
#Figure 3: side-by-side bars of the median household income (MED) for each
#breadwinner type, one cluster per year
graphit <- ggplot(Coupfinal, aes(x = YEAR, y = MED, fill = breadwin)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Figure 3: Median Household Income by Household Type, 1960-2000",
    x = "Year",
    y = "Median Income",
    fill = "Bread Winner Status"
  ) +
  theme_bw(base_size = 11) +
  scale_y_continuous(labels = scales::comma) +
  #one axis break per decade, 1960 through 2000
  scale_x_continuous(breaks = seq(1960, 2000, by = 10))
#Write the figure to disk
printplot(graphit)
#I also decide to make a boxplot to measure the household income at the
#10th, 25th, 75th, and 90th percentiles. The center of the box plot will be median income
#The box statistics are supplied directly (stat='identity') from the
#precomputed columns of Coupfinal: MIN/MAX are the whisker ends, LOW/HIGH the
#box edges and MED the centre line. Coupfinal is the percentile table built
#above; no object called Couper is defined anywhere in this script.
graphyay <- ggplot(Coupfinal, aes(x=YEAR,ymin=MIN,lower=LOW,middle=MED,upper=HIGH,ymax=MAX,fill=breadwin)) +
  geom_boxplot(stat='identity',position='dodge') +
  labs(fill='Bread Winner Status',x='Year',y='Income in US Dollars',title='Income Distribution by Household Type, 1960-2000') +
  scale_y_continuous(labels=scales::comma) +
  scale_x_continuous(breaks=c(1960,1970,1980,1990,2000)) +
  theme_bw(base_size=11)
printplot(graphyay)
#Add an Education factor to the women in e, bucketing the EDUC code:
#  EDUC below 6 -> 'Dropout'
#  EDUC 6       -> 'High School'
#  EDUC 7 to 9  -> 'Some College'
#  EDUC 10      -> 'College'
#  EDUC 11 up   -> 'Grad School'
#findInterval() counts how many of the cut points 6, 7, 10, 11 each EDUC value
#has reached, so adding 1 gives the bucket number 1..5
educ <- mutate(
  e,
  Education = factor(
    findInterval(EDUC, c(6, 7, 10, 11)) + 1,
    labels = c("Dropout", "High School", "Some College", "College", "Grad School")
  )
)
#Weighted count (sum of PERWT) of women for every combination of YEAR,
#work status and education bucket
ASD <- summarise(group_by(educ, YEAR, Wrk, Education), Numb = sum(PERWT))
#Weighted count of all women per YEAR and education bucket, regardless of
#work status (the denominator of the participation rate)
ASD2 <- summarise(group_by(educ, YEAR, Education), Num2 = sum(PERWT))
#Open the totals in the data viewer for inspection
View(ASD2)
#Attach the bucket totals to the per-status counts (joined on the columns the
#two tables share) and keep only the 'Working for Pay' rows, since only the
#working share is needed for the participation rate
joineducation <- ASD %>%
  left_join(ASD2) %>%
  filter(Wrk == "Working for Pay")
#wpar = working women as a percentage of all women in the same year and
#education bucket
Workforcepartbyedu <- mutate(joineducation, wpar = Numb / Num2 * 100)
#Figure 4: stacked bars of the weighted number of working and not-working
#women per education bucket, drawn as one panel per year in two columns
#NOTE(review): the y axis has fixed limits of 0 to 70,000,000 while the facets
#ask for scales='free_y'; the fixed limits presumably win in every panel -
#confirm which of the two was intended
graph7 <- ggplot(ASD, aes(x = Education, y = Numb, fill = Wrk)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Figure 4: Number of Working and Not Working Wives by Education,1940-2000",
    x = "Education",
    y = "Population",
    fill = "Working Status"
  ) +
  scale_y_continuous(labels = scales::comma, limits = c(0, 70000000)) +
  theme_bw(base_size = 10) +
  facet_wrap(~YEAR, ncol = 2, scales = "free_y")
#Write the figure to disk
printplot(graph7)
#Then I create a line graph that measures the workforce participation rate for each educational
#category between 1940 and 2000
#wpar is stored in percent, so it is divided by 100 before the percent
#formatter is applied; the y axis is fixed to the full 0-100% range.
#The title is set once through labs(title=...), and the legend is titled
#through the colour aesthetic, which is the one that actually draws a legend.
graph8 <- ggplot(data=Workforcepartbyedu,aes(x=YEAR,y=wpar/100,group=Education,colour=Education)) +
  geom_line() +
  geom_point() +
  labs(x='YEAR',y='Percent of Wives Working for Pay',colour='Education Level',
       title='Figure 6: Wives Labor Force Participation by Education,1940-2000') +
  scale_y_continuous(labels=scales::percent,limits=c(0,1))
printplot(graph8)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment