charlezzee/char30.05

## char30.05
#reading in the packages required to make desired plots

library(dplyr)
library(ggplot2)
library(readr)
library(gtools)
library(maptools)
library(ggmap)

#setting the working directory
#NOTE(review): this is an absolute, machine-specific path; the relative
#'./data/...' path read later in the script is resolved against it, so the
#script only runs unchanged on a machine that has this directory
setwd('/users/charleslundquist/documents/QSS 30.05- Charlie')

#printplot: render a plot object into a PNG file on disk
#  plot     - the object to draw; it is passed to print()
#  filename - path of the PNG to write (default 'plot.png', as before)
#  height   - image height in pixels (default 500, as before)
#  width    - image width in pixels (default 1000, as before)
#returns the value of dev.off(), exactly like the earlier version
#NOTE: every call that keeps the default filename overwrites the same
#'plot.png'; pass a distinct filename to keep more than one graph
printplot <- function(plot, filename='plot.png', height=500, width=1000) {
  png(filename,height=height,width=width)
  #close the device even when print() fails, so that an error cannot leave a
  #dangling graphics device that swallows all later plots
  tryCatch(print(plot), finally={ closed <- dev.off() })
  closed
}

#reading in the data; HHWT and PERWT are forced to double, the types of all
#other columns are guessed by read_csv
a <- read_csv('./data/usa_00031.csv',col_types=cols(HHWT=col_double(),PERWT=col_double()))

#creating a new Sex variable from the numeric SEX code
#code 1 is labelled Male; code 2 is assumed to be the Female code - confirm
#against the extract's codebook
#the levels are spelled out so the labels always line up with the codes: the
#earlier ifelse() version labelled every code other than 1 as 'Female' and made
#factor() fail whenever only one sex was present in the data
b <- a %>% mutate(Sex=factor(SEX,levels=c(1,2),labels=c('Male','Female')))

#dropping the Alaska and Hawaii records (STATEFIP 2 and 15) from before 1960

C <- filter(b, !(STATEFIP %in% c(2,15) & YEAR<1960))

#keeping only the records with GQ equal to 1, so that group quarters are left out

c <- filter(C, GQ==1)

#adding the work status variable Wrk: OCC1950 codes above 979 count as
#not working, every other code counts as working for pay

d <- mutate(c, Wrk=ifelse(OCC1950<=979, 'Working for Pay', 'Not Working/Unemployed'))

#filtering data set to only include women who are the household head or the
#spouse (RELATE 1 or 2) from 1940 onwards
#RELATE %in% c(1,2) is required here: the earlier `RELATE==1 | 2` was always
#TRUE (a bare 2 counts as TRUE), so neither the YEAR test nor the RELATE test
#removed any rows

e <- d %>% filter(Sex=='Female',YEAR>=1940,RELATE %in% c(1,2))

#grouping married women by year, and their work status.
#weighting the groups by summing the PERWT variable

E <- e %>% group_by(YEAR,Wrk) %>% summarise(POPULATE=sum(PERWT))

#the same weighted totals under the name Number5, used for the percentages below

asd <- e %>% group_by(YEAR,Wrk) %>% summarise(Number5=sum(PERWT))

#grouping the married women simply by year.

asdf <- e %>% group_by(YEAR) %>% summarise(Number6=sum(PERWT))

#joining the married women grouped by year and work status with the women grouped by year
#YEAR is the only column the two tables share, so it is named as the key
#explicitly instead of leaving left_join to guess it

joinem <- left_join(asd,asdf,by='YEAR')

#creating a workforce participation variable
#pct is the share (in percent) of each work status among all married women of
#that year: the weighted count of the status (Number5) divided by the
#weighted count of all married women (Number6)

pctwrk <- joinem %>% mutate(pct=Number5/Number6*100)

#View() opens a data viewer, so it is only called in an interactive session
if (interactive()) {
  View(pctwrk)
}

#stacked bar chart of the share of working and not working married women,
#one bar per census year from 1940 to 2000; pct is divided by 100 so that
#the percent labeller can format it
graph4 <- ggplot(pctwrk, aes(YEAR, pct/100, fill=Wrk)) +
  geom_bar(stat='identity') +
  scale_x_continuous(breaks=seq(1940,2000,by=10)) +
  scale_y_continuous(labels=scales::percent) +
  scale_fill_brewer(guide=guide_legend(reverse=TRUE), palette='Set2') +
  labs(title='Figure 1: Wives Labor Force Participation',
       fill='Work Status',
       y='Percent Working',
       x='Year')
printplot(graph4)

#stacked bar chart of the weighted number of working and not working married women
#this one is drawn from the E data set, whose POPULATE column holds the
#weighted head count per year and work status
graph6 <- ggplot(E, aes(YEAR, POPULATE, fill=Wrk)) +
  geom_bar(stat='identity') +
  scale_x_continuous(breaks=seq(1940,2000,by=10)) +
  scale_y_continuous(labels=scales::comma) +
  scale_fill_brewer(guide=guide_legend(reverse=TRUE), palette='Set2') +
  labs(title='Figure 2: Wives Labor Force Participation,1940-2000',
       fill='Work Status',
       y='Population',
       x='Year')
printplot(graph6)

#dropping records whose work status and wage income disagree: people marked as
#working without a positive wage income, and people marked as not working
#with a non-zero wage income. These are likely to be mismeasurements

c2 <- filter(d, (INCWAGE>0 & Wrk=='Working for Pay') | (INCWAGE==0 & Wrk=='Not Working/Unemployed'))

#building the income variable inc: INCWAGE values of 999998 and above stand for
#missing values and are set to 0; every other value is multiplied by CPI99,
#so that all incomes are expressed in 1999 price levels
c3 <- mutate(c2, inc=ifelse(INCWAGE<999998, INCWAGE*CPI99, 0))

#husbands: the male records of c3 that have a spouse location (SPLOC above 0)
#their work status and income columns are renamed so they stay
#distinguishable after the join with the wives:
#hwork is the husband's work status
#hinc is the husband's income in 1999 price levels
husbands <- c3 %>%
  filter(Sex=='Male', SPLOC>0) %>%
  rename(hwork=Wrk, hinc=inc)

#wives: the female records of c3 that have a spouse location (SPLOC above 0)
#only the columns needed for the join and the graphs are kept, and the work
#status and income columns are renamed to wwork and winc while selecting
wies <- c3 %>%
  filter(Sex=='Female', SPLOC>0) %>%
  select(YEAR,SERIAL,PERNUM,wwork=Wrk,SPLOC,winc=inc)

#joining the husbands and wives together to create a new couples data set:
#a wife is matched to the husband of the same YEAR and SERIAL whose PERNUM
#equals her SPLOC
#using the wwork and hwork variables, I create a new variable that measures the breadwinner status of the couples
#wives without a matching husband row have a missing hwork and therefore get
#a missing (NA) breadwin value
#levels=1:4 pins every code to its label, so factor() no longer fails when
#one of the four categories does not occur in the data

couples <- left_join(wies,husbands,by=c('YEAR','SERIAL','SPLOC'='PERNUM')) %>%
  mutate(breadwin=factor(ifelse(wwork=='Not Working/Unemployed' & hwork=='Not Working/Unemployed',1,
                  ifelse(wwork=='Not Working/Unemployed' & hwork=='Working for Pay',2,
                  ifelse(wwork==hwork,3,4))),
                levels=1:4,
                labels=c('No Bread Winner','Male Bread Winner','Dual Bread Winner','Female Bread Winner')))

#I only want to look at female, male, and dual breadwinners, so I filter out
#the NA and No Bread Winner categories
#missing values are tested with is.na(); comparing against the string 'NA'
#only dropped them as a side effect of filter() discarding NA conditions
couplescc <- couples %>% filter(!is.na(breadwin), breadwin!='No Bread Winner')

#the household income of a couple is the sum of winc and hinc
#each income is repeated HHWT times to weight it by the household weight, and
#then, for every combination of year and breadwinner status, I calculate the
#median household income and the household incomes at the
#10th, 25th, 75th and 90th percentile

ccc <- couplescc %>%
  mutate(couple_income=winc+hinc) %>%
  group_by(YEAR,breadwin) %>%
  summarise(MED=median(rep(couple_income,times=HHWT)),
            MIN=quantile(rep(couple_income,times=HHWT),0.1),
            LOW=quantile(rep(couple_income,times=HHWT),0.25),
            HIGH=quantile(rep(couple_income,times=HHWT),0.75),
            MAX=quantile(rep(couple_income,times=HHWT),0.9))

head(ccc)

#keeping only the years after 1950 (1960 onwards, assuming the data only
#holds census years ten years apart - confirm)
Coupfinal <- filter(ccc, YEAR>1950)

#dodged bar chart of the median household income, with one bar per
#breadwinner type for each census year between 1960 and 2000
graphit <- ggplot(Coupfinal, aes(YEAR, MED, fill=breadwin)) +
  geom_bar(position='dodge', stat='identity') +
  scale_x_continuous(breaks=seq(1960,2000,by=10)) +
  scale_y_continuous(labels=scales::comma) +
  theme_bw(base_size=11) +
  labs(title='Figure 3: Median Household Income by Household Type, 1960-2000',
       y='Median Income',
       x='Year',
       fill='Bread Winner Status')
printplot(graphit)

#I also decide to make a boxplot to measure the household income at the
#10th, 25th, 75th, and 90th percentiles. The center of the box plot will be median income
#the boxes are drawn from the precomputed columns (stat='identity'):
#MIN and MAX are the whisker ends, LOW and HIGH the box edges, MED the middle line
#the data set is Coupfinal, the 1960-2000 summary built above; the name
#'Couper' used here before is not defined anywhere in the script

graphyay <- ggplot(Coupfinal, aes(x=YEAR,ymin=MIN,lower=LOW,middle=MED,upper=HIGH,ymax=MAX,fill=breadwin)) +
  geom_boxplot(stat='identity',position='dodge') +
  labs(fill='Bread Winner Status',x='Year',y='Income in US Dollars',title='Income Distribution by Household Type, 1960-2000') +
  scale_y_continuous(labels=scales::comma) +
  scale_x_continuous(breaks=c(1960,1970,1980,1990,2000)) +
  theme_bw(base_size=11)

printplot(graphyay)

#Now I go back to the e dataset, and add in a new Education variable
#the new variable groups each individual into an educational bracket based
#on the EDUC code: below 6 is Dropout, 6 is High School, 7 to 9 is
#Some College, 10 is College, and 11 or more is Grad School
#levels=1:5 pins every bracket to its label, so factor() no longer fails
#when one of the five brackets does not occur in the data
educ <- e %>% mutate(Education=factor(ifelse(EDUC<6,1,
                                      ifelse(EDUC<7,2,
                                      ifelse(EDUC<10,3,
                                      ifelse(EDUC<11,4,5)))),
                                      levels=1:5,
                                      labels=c('Dropout','High School','Some College','College','Grad School')))

#First, I group the dataset by YEAR, Work, and Educational Status
#and I weight these groups by summing the PERWT variable. Here I have the total number of working
#and not working wives by year, within a given educational bracket
ASD <- educ %>% group_by(YEAR,Wrk,Education) %>% summarise(Numb=sum(PERWT))

#Next, I group the dataset by YEAR and Educational Status, and weight the groups
#with the PERWT variable. Here, I have the total number of wives in a given educational
#category each year
ASD2 <- educ %>% group_by(YEAR,Education) %>% summarise(Num2=sum(PERWT))

#View() opens a data viewer, so it is only called in an interactive session
if (interactive()) {
  View(ASD2)
}

#Join the two data sets together, and only include the working women
#because I want to simply measure the workforce participation rate
#YEAR and Education are the columns the two tables share, so they are named
#as the key explicitly instead of leaving left_join to guess them
joineducation <- left_join(ASD,ASD2,by=c('YEAR','Education')) %>%
  filter(Wrk=='Working for Pay')

#Create a new variable to measure the workforce participation rate (in percent), which is equivalent to
#the number of working women in a given year and educational category, over the total number of women
#in that educational category
Workforcepartbyedu <- joineducation %>%
  mutate(wpar=Numb/Num2*100)

#stacked bar chart of the weighted number of working and not working women
#per educational category, with one panel for each year between 1940 and 2000
graph7 <- ggplot(ASD, aes(Education, Numb, fill=Wrk)) +
  geom_bar(stat='identity') +
  facet_wrap(~YEAR, scales='free_y', ncol=2) +
  scale_y_continuous(limits=c(0,70000000), labels=scales::comma) +
  theme_bw(base_size=10) +
  labs(title='Figure 4: Number of Working and Not Working Wives by Education,1940-2000',
       fill='Working Status',
       y='Population',
       x='Education')
printplot(graph7)

#Then I create a line graph that measures the workforce participation rate for each educational
#category between 1940 and 2000; wpar is divided by 100 so that the percent
#labeller can format it
#the legend belongs to the colour aesthetic, so its title is set through
#colour= (a group= label is never displayed), and the plot title is set once
#through title= (the capitalised Title= used before is not a label ggplot draws)
graph8 <- ggplot(data=Workforcepartbyedu,aes(x=YEAR,y=wpar/100,group=Education,colour=Education)) +
  geom_line() +
  geom_point() +
  labs(x='YEAR',y='Percent of Wives Working for Pay',colour='Education Level',title='Figure 6: Wives Labor Force Participation by Education,1940-2000') +
  scale_y_continuous(labels=scales :: percent,limits=c(0,1))

printplot(graph8)
	#reading in the packages required to make desired plots

	library(dplyr)
	library(ggplot2)
	library(readr)
	library(gtools)
	library(maptools)
	library(ggmap)

	#setting the working directory
	#NOTE(review): this is an absolute, machine-specific path; the relative
	#'./data/...' path read later in the script is resolved against it, so the
	#script only runs unchanged on a machine that has this directory
	setwd('/users/charleslundquist/documents/QSS 30.05- Charlie')

	#printplot: render a plot object into a PNG file on disk
	#  plot     - the object to draw; it is passed to print()
	#  filename - path of the PNG to write (default 'plot.png', as before)
	#  height   - image height in pixels (default 500, as before)
	#  width    - image width in pixels (default 1000, as before)
	#returns the value of dev.off(), exactly like the earlier version
	#NOTE: every call that keeps the default filename overwrites the same
	#'plot.png'; pass a distinct filename to keep more than one graph
	printplot <- function(plot, filename='plot.png', height=500, width=1000) {
	  png(filename,height=height,width=width)
	  #close the device even when print() fails, so that an error cannot leave a
	  #dangling graphics device that swallows all later plots
	  tryCatch(print(plot), finally={ closed <- dev.off() })
	  closed
	}

	#reading in the data; HHWT and PERWT are forced to double, the types of all
	#other columns are guessed by read_csv
	a <- read_csv('./data/usa_00031.csv',col_types=cols(HHWT=col_double(),PERWT=col_double()))

	#creating a new Sex variable from the numeric SEX code
	#code 1 is labelled Male; code 2 is assumed to be the Female code - confirm
	#against the extract's codebook
	#the levels are spelled out so the labels always line up with the codes: the
	#earlier ifelse() version labelled every code other than 1 as 'Female' and made
	#factor() fail whenever only one sex was present in the data
	b <- a %>% mutate(Sex=factor(SEX,levels=c(1,2),labels=c('Male','Female')))

	#dropping the Alaska and Hawaii records (STATEFIP 2 and 15) from before 1960
	#(the condition previously contained an escaped `\|`, which R cannot parse)

	C <- filter(b, !(STATEFIP %in% c(2,15) & YEAR<1960))

	#keeping only the records with GQ equal to 1, so that group quarters are left out

	c <- filter(C, GQ==1)

	#adding the work status variable Wrk: OCC1950 codes above 979 count as
	#not working, every other code counts as working for pay

	d <- mutate(c, Wrk=ifelse(OCC1950<=979, 'Working for Pay', 'Not Working/Unemployed'))

	#filtering data set to only include women who are the household head or the
	#spouse (RELATE 1 or 2) from 1940 onwards
	#RELATE %in% c(1,2) is required here: the earlier `RELATE==1 \| 2` did not
	#parse because of the escaped bar, and even with a plain `|` it was always
	#TRUE (a bare 2 counts as TRUE), so it removed no rows

	e <- d %>% filter(Sex=='Female',YEAR>=1940,RELATE %in% c(1,2))

	#grouping married women by year, and their work status.
	#weighting the groups by summing the PERWT variable

	E <- e %>% group_by(YEAR,Wrk) %>% summarise(POPULATE=sum(PERWT))

	#the same weighted totals under the name Number5, used for the percentages below

	asd <- e %>% group_by(YEAR,Wrk) %>% summarise(Number5=sum(PERWT))

	#grouping the married women simply by year.

	asdf <- e %>% group_by(YEAR) %>% summarise(Number6=sum(PERWT))

	#joining the married women grouped by year and work status with the women grouped by year
	#YEAR is the only column the two tables share, so it is named as the key
	#explicitly instead of leaving left_join to guess it

	joinem <- left_join(asd,asdf,by='YEAR')

	#creating a workforce participation variable
	#pct is the share (in percent) of each work status among all married women of
	#that year: the weighted count of the status (Number5) divided by the
	#weighted count of all married women (Number6)

	pctwrk <- joinem %>% mutate(pct=Number5/Number6*100)

	#View() opens a data viewer, so it is only called in an interactive session
	if (interactive()) {
	  View(pctwrk)
	}

	#stacked bar chart of the share of working and not working married women,
	#one bar per census year from 1940 to 2000; pct is divided by 100 so that
	#the percent labeller can format it
	graph4 <- ggplot(pctwrk, aes(YEAR, pct/100, fill=Wrk)) +
	  geom_bar(stat='identity') +
	  scale_x_continuous(breaks=seq(1940,2000,by=10)) +
	  scale_y_continuous(labels=scales::percent) +
	  scale_fill_brewer(guide=guide_legend(reverse=TRUE), palette='Set2') +
	  labs(title='Figure 1: Wives Labor Force Participation',
	       fill='Work Status',
	       y='Percent Working',
	       x='Year')
	printplot(graph4)

	#stacked bar chart of the weighted number of working and not working married women
	#this one is drawn from the E data set, whose POPULATE column holds the
	#weighted head count per year and work status
	graph6 <- ggplot(E, aes(YEAR, POPULATE, fill=Wrk)) +
	  geom_bar(stat='identity') +
	  scale_x_continuous(breaks=seq(1940,2000,by=10)) +
	  scale_y_continuous(labels=scales::comma) +
	  scale_fill_brewer(guide=guide_legend(reverse=TRUE), palette='Set2') +
	  labs(title='Figure 2: Wives Labor Force Participation,1940-2000',
	       fill='Work Status',
	       y='Population',
	       x='Year')
	printplot(graph6)

	#dropping records whose work status and wage income disagree: people marked as
	#working without a positive wage income, and people marked as not working
	#with a non-zero wage income. These are likely to be mismeasurements
	#(the two cases are combined with a plain `|`; the escaped `\|` that stood
	#here before cannot be parsed by R)

	c2 <- filter(d, (INCWAGE>0 & Wrk=='Working for Pay') | (INCWAGE==0 & Wrk=='Not Working/Unemployed'))

	#building the income variable inc: INCWAGE values of 999998 and above stand for
	#missing values and are set to 0; every other value is multiplied by CPI99,
	#so that all incomes are expressed in 1999 price levels
	c3 <- mutate(c2, inc=ifelse(INCWAGE<999998, INCWAGE*CPI99, 0))

	#husbands: the male records of c3 that have a spouse location (SPLOC above 0)
	#their work status and income columns are renamed so they stay
	#distinguishable after the join with the wives:
	#hwork is the husband's work status
	#hinc is the husband's income in 1999 price levels
	husbands <- c3 %>%
	  filter(Sex=='Male', SPLOC>0) %>%
	  rename(hwork=Wrk, hinc=inc)

	#wives: the female records of c3 that have a spouse location (SPLOC above 0)
	#only the columns needed for the join and the graphs are kept, and the work
	#status and income columns are renamed to wwork and winc while selecting
	wies <- c3 %>%
	  filter(Sex=='Female', SPLOC>0) %>%
	  select(YEAR,SERIAL,PERNUM,wwork=Wrk,SPLOC,winc=inc)

	#joining the husbands and wives together to create a new couples data set:
	#a wife is matched to the husband of the same YEAR and SERIAL whose PERNUM
	#equals her SPLOC
	#using the wwork and hwork variables, I create a new variable that measures the breadwinner status of the couples
	#wives without a matching husband row have a missing hwork and therefore get
	#a missing (NA) breadwin value
	#levels=1:4 pins every code to its label, so factor() no longer fails when
	#one of the four categories does not occur in the data

	couples <- left_join(wies,husbands,by=c('YEAR','SERIAL','SPLOC'='PERNUM')) %>%
	  mutate(breadwin=factor(ifelse(wwork=='Not Working/Unemployed' & hwork=='Not Working/Unemployed',1,
	                  ifelse(wwork=='Not Working/Unemployed' & hwork=='Working for Pay',2,
	                  ifelse(wwork==hwork,3,4))),
	                levels=1:4,
	                labels=c('No Bread Winner','Male Bread Winner','Dual Bread Winner','Female Bread Winner')))

	#I only want to look at female, male, and dual breadwinners, so I filter out
	#the NA and No Bread Winner categories
	#missing values are tested with is.na(); comparing against the string 'NA'
	#only dropped them as a side effect of filter() discarding NA conditions
	couplescc <- couples %>% filter(!is.na(breadwin), breadwin!='No Bread Winner')

	#the household income of a couple is the sum of winc and hinc
	#each income is repeated HHWT times to weight it by the household weight, and
	#then, for every combination of year and breadwinner status, I calculate the
	#median household income and the household incomes at the
	#10th, 25th, 75th and 90th percentile

	ccc <- couplescc %>%
	  mutate(couple_income=winc+hinc) %>%
	  group_by(YEAR,breadwin) %>%
	  summarise(MED=median(rep(couple_income,times=HHWT)),
	            MIN=quantile(rep(couple_income,times=HHWT),0.1),
	            LOW=quantile(rep(couple_income,times=HHWT),0.25),
	            HIGH=quantile(rep(couple_income,times=HHWT),0.75),
	            MAX=quantile(rep(couple_income,times=HHWT),0.9))

	head(ccc)

	#keeping only the years after 1950 (1960 onwards, assuming the data only
	#holds census years ten years apart - confirm)
	Coupfinal <- filter(ccc, YEAR>1950)

	#dodged bar chart of the median household income, with one bar per
	#breadwinner type for each census year between 1960 and 2000
	graphit <- ggplot(Coupfinal, aes(YEAR, MED, fill=breadwin)) +
	  geom_bar(position='dodge', stat='identity') +
	  scale_x_continuous(breaks=seq(1960,2000,by=10)) +
	  scale_y_continuous(labels=scales::comma) +
	  theme_bw(base_size=11) +
	  labs(title='Figure 3: Median Household Income by Household Type, 1960-2000',
	       y='Median Income',
	       x='Year',
	       fill='Bread Winner Status')
	printplot(graphit)

	#I also decide to make a boxplot to measure the household income at the
	#10th, 25th, 75th, and 90th percentiles. The center of the box plot will be median income
	#the boxes are drawn from the precomputed columns (stat='identity'):
	#MIN and MAX are the whisker ends, LOW and HIGH the box edges, MED the middle line
	#the data set is Coupfinal, the 1960-2000 summary built above; the name
	#'Couper' used here before is not defined anywhere in the script

	graphyay <- ggplot(Coupfinal, aes(x=YEAR,ymin=MIN,lower=LOW,middle=MED,upper=HIGH,ymax=MAX,fill=breadwin)) +
	  geom_boxplot(stat='identity',position='dodge') +
	  labs(fill='Bread Winner Status',x='Year',y='Income in US Dollars',title='Income Distribution by Household Type, 1960-2000') +
	  scale_y_continuous(labels=scales::comma) +
	  scale_x_continuous(breaks=c(1960,1970,1980,1990,2000)) +
	  theme_bw(base_size=11)

	printplot(graphyay)

	#Now I go back to the e dataset, and add in a new Education variable
	#the new variable groups each individual into an educational bracket based
	#on the EDUC code: below 6 is Dropout, 6 is High School, 7 to 9 is
	#Some College, 10 is College, and 11 or more is Grad School
	#levels=1:5 pins every bracket to its label, so factor() no longer fails
	#when one of the five brackets does not occur in the data
	educ <- e %>% mutate(Education=factor(ifelse(EDUC<6,1,
	                                      ifelse(EDUC<7,2,
	                                      ifelse(EDUC<10,3,
	                                      ifelse(EDUC<11,4,5)))),
	                                      levels=1:5,
	                                      labels=c('Dropout','High School','Some College','College','Grad School')))

	#First, I group the dataset by YEAR, Work, and Educational Status
	#and I weight these groups by summing the PERWT variable. Here I have the total number of working
	#and not working wives by year, within a given educational bracket
	ASD <- educ %>% group_by(YEAR,Wrk,Education) %>% summarise(Numb=sum(PERWT))

	#Next, I group the dataset by YEAR and Educational Status, and weight the groups
	#with the PERWT variable. Here, I have the total number of wives in a given educational
	#category each year
	ASD2 <- educ %>% group_by(YEAR,Education) %>% summarise(Num2=sum(PERWT))

	#View() opens a data viewer, so it is only called in an interactive session
	if (interactive()) {
	  View(ASD2)
	}

	#Join the two data sets together, and only include the working women
	#because I want to simply measure the workforce participation rate
	#YEAR and Education are the columns the two tables share, so they are named
	#as the key explicitly instead of leaving left_join to guess them
	joineducation <- left_join(ASD,ASD2,by=c('YEAR','Education')) %>%
	  filter(Wrk=='Working for Pay')

	#Create a new variable to measure the workforce participation rate (in percent), which is equivalent to
	#the number of working women in a given year and educational category, over the total number of women
	#in that educational category
	Workforcepartbyedu <- joineducation %>%
	  mutate(wpar=Numb/Num2*100)

	#stacked bar chart of the weighted number of working and not working women
	#per educational category, with one panel for each year between 1940 and 2000
	graph7 <- ggplot(ASD, aes(Education, Numb, fill=Wrk)) +
	  geom_bar(stat='identity') +
	  facet_wrap(~YEAR, scales='free_y', ncol=2) +
	  scale_y_continuous(limits=c(0,70000000), labels=scales::comma) +
	  theme_bw(base_size=10) +
	  labs(title='Figure 4: Number of Working and Not Working Wives by Education,1940-2000',
	       fill='Working Status',
	       y='Population',
	       x='Education')
	printplot(graph7)

	#Then I create a line graph that measures the workforce participation rate for each educational
	#category between 1940 and 2000; wpar is divided by 100 so that the percent
	#labeller can format it
	#the legend belongs to the colour aesthetic, so its title is set through
	#colour= (a group= label is never displayed), and the plot title is set once
	#through title= (the capitalised Title= used before is not a label ggplot draws)
	graph8 <- ggplot(data=Workforcepartbyedu,aes(x=YEAR,y=wpar/100,group=Education,colour=Education)) +
	  geom_line() +
	  geom_point() +
	  labs(x='YEAR',y='Percent of Wives Working for Pay',colour='Education Level',title='Figure 6: Wives Labor Force Participation by Education,1940-2000') +
	  scale_y_continuous(labels=scales :: percent,limits=c(0,1))

	printplot(graph8)