Last active
November 16, 2016 22:33
-
-
Save charlezzee/1f2135e4c535240c4a55352a829757f4 to your computer and use it in GitHub Desktop.
Code QSS 30.05 Charlie
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#reading in the packages required to make desired plots | |
library(dplyr) | |
library(ggplot2) | |
library(readr) | |
library(gtools) | |
library(maptools) | |
library(ggmap) | |
# Work from the project folder so the relative paths used further down
# ('./data/...', 'plot.png') resolve inside it.
# NOTE(review): absolute, machine-specific path -- this call fails elsewhere.
setwd('/users/charleslundquist/documents/QSS 30.05- Charlie')
# Render a plot object to a PNG file.
#
# plot     : any object whose print() method draws it (e.g. a ggplot).
# filename : output path; defaults to 'plot.png', the name this script has
#            always used. Pass a distinct name per figure to avoid
#            overwriting the previous one.
# height   : image height in pixels (default 500).
# width    : image width in pixels (default 1000).
#
# Returns the file name invisibly.
printplot <- function(plot, filename = 'plot.png', height = 500, width = 1000) {
  png(filename, height = height, width = width)
  # Close the device even when print() fails, so a broken plot does not leave
  # a dangling graphics device that swallows later output.
  on.exit(dev.off(), add = TRUE)
  print(plot)
  invisible(filename)
}
# Read the raw extract. The two weight columns (HHWT, PERWT) are forced to
# double; every other column type is left to readr's guessing.
a <- read_csv(
  './data/usa_00031.csv',
  col_types = cols(
    HHWT = col_double(),
    PERWT = col_double()
  )
)
# Label the numeric SEX code: 1 becomes 'Male', any other value 'Female'.
# The levels are spelled out so the labels stay attached to the right code
# (and factor() does not error) when one of the two values is absent.
b <- a %>%
  mutate(
    Sex = factor(
      ifelse(SEX == 1, 1, 2),
      levels = c(1, 2),
      labels = c('Male', 'Female')
    )
  )

# Drop STATEFIP 2 and 15 (Alaska and Hawaii) for years before 1960; rows from
# those states are kept from 1960 onwards.
C <- b %>%
  filter(!(STATEFIP %in% c(2, 15)) | YEAR >= 1960)

# Keep only rows with GQ == 1 (ordinary households, no group quarters).
c <- C %>%
  filter(GQ == 1)

# Work status from the occupation code: OCC1950 above 979 is treated as not
# working, everything else as working for pay.
d <- c %>%
  mutate(
    Wrk = ifelse(OCC1950 > 979, 'Not Working/Unemployed', 'Working for Pay')
  )
# Keep women from 1940 onwards whose RELATE code is 1 or 2 (household head or
# spouse).
# The previous condition `YEAR >= 1940 & RELATE == 1 | 2` parsed as
# `(YEAR >= 1940 & RELATE == 1) | 2`; the bare 2 is coerced to TRUE, so the
# year and RELATE restrictions were never applied.
e <- d %>%
  filter(Sex == 'Female', YEAR >= 1940, RELATE %in% c(1, 2))
# Weighted head count (sum of PERWT) of the women in `e` for every
# year / work-status combination.
E <- e %>%
  group_by(YEAR, Wrk) %>%
  summarise(POPULATE = sum(PERWT))

# The same totals under the column name used for the rate below; derived from
# E instead of repeating the identical aggregation.
asd <- E %>%
  rename(Number5 = POPULATE)

# Weighted head count per year, regardless of work status.
asdf <- e %>%
  group_by(YEAR) %>%
  summarise(Number6 = sum(PERWT))

# Attach the yearly total to each year / work-status row. YEAR is the only
# shared column; naming it makes the key explicit and silences the
# "Joining, by = ..." message.
joinem <- left_join(asd, asdf, by = 'YEAR')

# Share of each work status within its year, in percent:
# cell total (Number5) divided by the year total (Number6).
pctwrk <- joinem %>%
  mutate(pct = Number5 / Number6 * 100)

# The data viewer only makes sense in an interactive session.
if (interactive()) {
  View(pctwrk)
}
# Figure 1: stacked bars, one per year, split by work status. pct is stored
# in percent, so it is divided by 100 before the percent formatter is applied.
graph4 <- ggplot(pctwrk, aes(x = YEAR, y = pct / 100, fill = Wrk)) +
  geom_bar(stat = 'identity') +
  scale_x_continuous(breaks = seq(1940, 2000, by = 10)) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(
    palette = 'Set2',
    guide = guide_legend(reverse = TRUE)
  ) +
  labs(
    title = 'Figure 1: Wives Labor Force Participation',
    x = 'Year',
    y = 'Percent Working',
    fill = 'Work Status'
  )

printplot(graph4)
# Figure 2: same layout as Figure 1, but drawn from E so the bar height is
# the weighted head count (POPULATE) rather than a share.
graph6 <- ggplot(E, aes(x = YEAR, y = POPULATE, fill = Wrk)) +
  geom_bar(stat = 'identity') +
  scale_x_continuous(breaks = seq(1940, 2000, by = 10)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_brewer(
    palette = 'Set2',
    guide = guide_legend(reverse = TRUE)
  ) +
  labs(
    title = 'Figure 2: Wives Labor Force Participation,1940-2000',
    x = 'Year',
    y = 'Population',
    fill = 'Work Status'
  )

printplot(graph6)
# Keep only rows whose work status and wage income agree: working with a
# positive INCWAGE, or not working with an INCWAGE of exactly 0. Other
# combinations are treated as likely mismeasurements and dropped.
# NOTE(review): the codes 999998 / 999999 are positive, so "working" rows
# carrying them pass this filter and are then set to 0 below -- confirm that
# this is intended.
c2 <- d %>%
  filter(
    (Wrk == 'Working for Pay' & INCWAGE > 0) |
      (Wrk == 'Not Working/Unemployed' & INCWAGE == 0)
  )

# Income in 1999 price levels: INCWAGE scaled by CPI99, with the
# 999998 / 999999 missing codes mapped to 0.
c3 <- c2 %>%
  mutate(
    inc = ifelse(INCWAGE >= 999998, 0, INCWAGE * CPI99)
  )
# Husbands: men with a spouse pointer (SPLOC > 0). All columns are kept;
# work status and income are renamed so they stay distinguishable from the
# wife's after the join.
#   hwork - husband's work status
#   hinc  - husband's income in 1999 price levels
husbands <- c3 %>%
  filter(Sex == 'Male', SPLOC > 0) %>%
  rename(hwork = Wrk, hinc = inc)

# Wives: women with a spouse pointer, reduced to the join keys plus their own
# work status (wwork) and income (winc).
wies <- c3 %>%
  filter(Sex == 'Female', SPLOC > 0) %>%
  select(YEAR, SERIAL, PERNUM, wwork = Wrk, SPLOC, winc = inc)
# Pair every wife with her husband: same YEAR and SERIAL, and the wife's
# SPLOC equal to the husband's PERNUM. Then classify the couple from the two
# work-status columns:
#   1 neither works, 2 only the husband works, 3 both work, 4 only the wife.
# A wife without a matching husband row gets NA.
# The levels are listed explicitly so each label stays tied to its code even
# when a category does not occur in the data (factor() would otherwise fail
# on the label count).
couples <- left_join(
  wies, husbands,
  by = c('YEAR', 'SERIAL', 'SPLOC' = 'PERNUM')
) %>%
  mutate(
    breadwin = factor(
      ifelse(
        wwork == 'Not Working/Unemployed' & hwork == 'Not Working/Unemployed', 1,
        ifelse(
          wwork == 'Not Working/Unemployed' & hwork == 'Working for Pay', 2,
          ifelse(wwork == hwork, 3, 4)
        )
      ),
      levels = 1:4,
      labels = c(
        'No Bread Winner', 'Male Bread Winner',
        'Dual Bread Winner', 'Female Bread Winner'
      )
    )
  )
# Keep only male, female and dual bread-winner couples: drop unclassified
# (NA) rows and the 'No Bread Winner' category.
# Missing values are tested with is.na(); comparing against the string 'NA'
# only removed them as a side effect of filter() discarding NA conditions.
couplescc <- couples %>%
  filter(!is.na(breadwin), breadwin != 'No Bread Winner')
# Household income is the sum of the wife's and the husband's income. Within
# each year / bread-winner group, every household's income is repeated HHWT
# times, and the median plus the 10th, 25th, 75th and 90th percentiles are
# taken over that expanded vector.
ccc <- couplescc %>%
  mutate(hh_income = winc + hinc) %>%
  group_by(YEAR, breadwin) %>%
  summarise(
    MED = median(rep(hh_income, times = HHWT)),
    MIN = quantile(rep(hh_income, times = HHWT), 0.1),
    LOW = quantile(rep(hh_income, times = HHWT), 0.25),
    HIGH = quantile(rep(hh_income, times = HHWT), 0.75),
    MAX = quantile(rep(hh_income, times = HHWT), 0.9)
  )

head(ccc)
# Restrict the summary to years after 1950.
Coupfinal <- ccc %>%
  filter(YEAR > 1950)

# Figure 3: median household income per year, one bar per bread-winner type
# placed side by side.
graphit <- ggplot(Coupfinal, aes(x = YEAR, y = MED, fill = breadwin)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_x_continuous(breaks = seq(1960, 2000, by = 10)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = 'Figure 3: Median Household Income by Household Type, 1960-2000',
    x = 'Year',
    y = 'Median Income',
    fill = 'Bread Winner Status'
  ) +
  theme_bw(base_size = 11)

printplot(graphit)
# Box plot drawn from the precomputed summary statistics: whiskers at the
# 10th and 90th percentiles (MIN, MAX), box edges at the 25th and 75th
# (LOW, HIGH), centre line at the median (MED).
# The data source is Coupfinal; the previously referenced `Couper` is never
# defined in this script, so the plot could not be built.
graphyay <- ggplot(
  Coupfinal,
  aes(
    x = YEAR,
    ymin = MIN, lower = LOW, middle = MED, upper = HIGH, ymax = MAX,
    fill = breadwin
  )
) +
  geom_boxplot(stat = 'identity', position = 'dodge') +
  labs(
    fill = 'Bread Winner Status',
    x = 'Year',
    y = 'Income in US Dollars',
    title = 'Income Distribution by Household Type, 1960-2000'
  ) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = c(1960, 1970, 1980, 1990, 2000)) +
  theme_bw(base_size = 11)

printplot(graphyay)
# Add an Education bracket to the rows of `e`, derived from the EDUC code:
#   EDUC < 6   -> 'Dropout'
#   EDUC 6     -> 'High School'
#   EDUC 7-9   -> 'Some College'
#   EDUC 10    -> 'College'
#   EDUC >= 11 -> 'Grad School'
# cut() with left-closed intervals reproduces the former nested ifelse()
# thresholds exactly, and always yields all five levels in this order -- the
# old factor(..., labels = ) call failed whenever a bracket was empty.
educ <- e %>%
  mutate(
    Education = cut(
      EDUC,
      breaks = c(-Inf, 6, 7, 10, 11, Inf),
      right = FALSE,
      labels = c('Dropout', 'High School', 'Some College', 'College', 'Grad School')
    )
  )
# Weighted head count (sum of PERWT) for every year / work-status /
# education combination.
ASD <- educ %>%
  group_by(YEAR, Wrk, Education) %>%
  summarise(Numb = sum(PERWT))

# Weighted head count for every year / education combination, regardless of
# work status.
ASD2 <- educ %>%
  group_by(YEAR, Education) %>%
  summarise(Num2 = sum(PERWT))

# The data viewer only makes sense in an interactive session.
if (interactive()) {
  View(ASD2)
}

# Attach the bracket total to each cell and keep only the working rows, since
# only the participation rate is needed. YEAR and Education are the shared
# columns; naming them makes the join key explicit.
joineducation <- left_join(ASD, ASD2, by = c('YEAR', 'Education')) %>%
  filter(Wrk == 'Working for Pay')

# Participation rate in percent: working head count in a year / education
# cell divided by the total head count of that cell.
Workforcepartbyedu <- joineducation %>%
  mutate(wpar = Numb / Num2 * 100)
# Figure 4: weighted head count by education bracket, stacked by work status,
# with one panel per year laid out in two columns.
# NOTE(review): the y scale has fixed limits while the facets ask for
# scales = 'free_y'; confirm which of the two is intended.
graph7 <- ggplot(ASD, aes(x = Education, y = Numb, fill = Wrk)) +
  geom_bar(stat = 'identity') +
  facet_wrap(~YEAR, ncol = 2, scales = 'free_y') +
  scale_y_continuous(
    labels = scales::comma,
    limits = c(0, 70000000)
  ) +
  labs(
    title = 'Figure 4: Number of Working and Not Working Wives by Education,1940-2000',
    x = 'Education',
    y = 'Population',
    fill = 'Working Status'
  ) +
  theme_bw(base_size = 10)

printplot(graph7)
# Line chart of the participation rate (wpar, stored in percent) per
# education bracket across the years, one coloured line per bracket.
# The title is set once through labs(); the former `Title =` entry (capital
# T) was not a title label, and the legend label was attached to `group`,
# which has no legend -- it now goes on the colour scale that draws the legend.
graph8 <- ggplot(
  data = Workforcepartbyedu,
  aes(x = YEAR, y = wpar / 100, group = Education, colour = Education)
) +
  geom_line() +
  geom_point() +
  labs(
    title = 'Figure 6: Wives Labor Force Participation by Education,1940-2000',
    x = 'YEAR',
    y = 'Percent of Wives Working for Pay',
    colour = 'Education Level'
  ) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1))

printplot(graph8)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment