lfortin-117/Lab 7 - LF

## Lab 7 - LF

#Part A For Boxplot and Line
library(readr)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(scales)

setwd('/users/emily/dropbox/hist90_01/fall2016/labs/10_27')

#Reading in the data and filtering out those below earning age, as well as those in general quarters,
#and those not earning any income
ipums <- read_csv('Lab7.csv')
a <- ipums %>% filter(GQ==1 & AGE>=16 & INCWAGE>0 & INCWAGE<999999)

#Recoding for race
b <- a %>% mutate(SingleRace=factor(ifelse(HISPAN>0,1,
                                           ifelse(RACESING==1,2,
                                                  ifelse(RACESING==2,3,
                                                         ifelse(RACESING==3,4,
                                                                ifelse(RACESING==4,5,6)))))))
levels(b$SingleRace) <- c('Hispanic','White','Black','Native American','Asian','Other')
c <- b %>% filter(SingleRace!='Other')

#Recoding for sex
d <- c %>% mutate(Sex=factor(SEX,labels=c('Male','Female')))

#Adjusting for inflation with the CPI99
e <- d %>% mutate(AdjInc=INCWAGE*CPI99)

#Weighting by PERWT in all years other than 1950, in which we use SLWT
ee <- e %>% mutate(Weight=ifelse(YEAR!=1950,PERWT,SLWT))

#Removing the very high income entries from the set
eee <- ee %>% mutate(AdjInc=ifelse(AdjInc>59941.99,59941.99,AdjInc))

#Aggregating the data and splitting up the quartiles whilst summarizing
f <- eee %>% group_by(SingleRace,YEAR,Sex) %>%
  summarise(MED=median(rep(AdjInc,times=Weight)),
            MIN=quantile(rep(AdjInc,times=Weight),0.1),
            LOW=quantile(rep(AdjInc,times=Weight),0.25),
            HIGH=quantile(rep(AdjInc,times=Weight),0.75),
            MAX=quantile(rep(AdjInc,times=Weight),0.9))


#Plotting both the box plot and the column graph
Boxgraph <- ggplot(f, aes(x=YEAR,ymin=MIN,lower=LOW,middle=MED,upper=HIGH,ymax=MAX,fill=Sex)) +
  geom_boxplot(stat='identity',position='dodge') +
  facet_wrap(~SingleRace) +
  labs(y= 'Income, US Dollars', title='Figure 1: Income by Race and Sex for Those With Income, 1940-2000')

Linegraph <- ggplot(f,aes(x=YEAR,y=MED,color=SingleRace)) +
  geom_line() + geom_point() +
  facet_grid(Sex~.) +
  labs(y='Median Income, US Dollars',color='Race/Ethnicity', title='Figure 2: Median Income by Race and Sex For Those With Income, 1940-2000')


#Part B for Column Graph
library(readr)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(scales)


#Reading in the data and filtering out those below earning age, as well as those in general quarters,
#and those not earning any income
ipums <- read_csv('Lab7.csv')
a <- ipums %>% filter(GQ==1 & AGE>=16 & INCWAGE>0 & INCWAGE<999999)

#Recoding for race
b <- a %>% mutate(SingleRace=factor(ifelse(HISPAN>0,1,
                                           ifelse(RACESING==1,2,
                                                  ifelse(RACESING==2,3,
                                                         ifelse(RACESING==3,4,
                                                                ifelse(RACESING==4,5,6)))))))
levels(b$SingleRace) <- c('Hispanic','White','Black','Native American','Asian','Other')
c <- b %>% filter(SingleRace!='Other')

#Recoding for sex
d <- c %>% mutate(Sex=factor(SEX,labels=c('Male','Female')))

#Adjusting for inflation with the CPI99
e <- d %>% mutate(AdjInc=INCWAGE*CPI99)

#Weighting by PERWT in all years other than 1950, in which we use SLWT
ee <- e %>% mutate(Weight=ifelse(YEAR!=1950,PERWT,SLWT))

#Removing the very high income entries from the set
eee <- ee %>% mutate(AdjInc=ifelse(AdjInc>59941.99,59941.99,AdjInc))


#Recode the IncWage variable for the column graph
f <- eee %>% mutate(IncomeF=factor(ifelse(AdjInc<10000, 1,
                                   ifelse(AdjInc<20000, 2,
                                   ifelse(AdjInc<30000, 3,
                                   ifelse(AdjInc<40000, 4,
                                   ifelse(AdjInc<59000, 5, 6)))))))

levels(f$IncomeF) <- c('1-9,999','10,000-19,999','20,000-29,999','30,000-39,999',
                             '40,000-58,999','59,000+')

#Aggregating and weighting
f1 <- f %>% group_by(YEAR, Sex, SingleRace, IncomeF) %>% summarize(Number=sum(Weight))

#Plotting
Columngraph <- ggplot(data=f1, aes(x=YEAR,y=Number,fill=IncomeF)) +
  geom_bar(stat='identity', position='fill') +
  labs(x='Year', y='Percent of Population', fill='Income',title='Figure 3: Income by Sex and Race for Those With Income 1940-2000') +
  scale_y_continuous(labels=scales::percent) +
  scale_x_continuous(breaks=c(1950,1970,1990))+
  scale_fill_brewer(palette = 'Set1') +
  facet_grid(Sex~.~SingleRace) +
  theme_bw() + theme(legend.position='bottom')

	#Part A For Boxplot and Line
	library(readr)
	library(dplyr)
	library(ggplot2)
	library(RColorBrewer)
	library(scales)

	setwd('/users/emily/dropbox/hist90_01/fall2016/labs/10_27')

	#Reading in the data and filtering out those below earning age, as well as those in general quarters,
	#and those not earning any income
	ipums <- read_csv('Lab7.csv')
	a <- ipums %>% filter(GQ==1 & AGE>=16 & INCWAGE>0 & INCWAGE<999999)

	#Recoding for race
	b <- a %>% mutate(SingleRace=factor(ifelse(HISPAN>0,1,
	ifelse(RACESING==1,2,
	ifelse(RACESING==2,3,
	ifelse(RACESING==3,4,
	ifelse(RACESING==4,5,6)))))))
	levels(b$SingleRace) <- c('Hispanic','White','Black','Native American','Asian','Other')
	c <- b %>% filter(SingleRace!='Other')

	#Recoding for sex
	d <- c %>% mutate(Sex=factor(SEX,labels=c('Male','Female')))

	#Adjusting for inflation with the CPI99
	e <- d %>% mutate(AdjInc=INCWAGE*CPI99)

	#Weighting by PERWT in all years other than 1950, in which we use SLWT
	ee <- e %>% mutate(Weight=ifelse(YEAR!=1950,PERWT,SLWT))

	#Removing the very high income entries from the set
	eee <- ee %>% mutate(AdjInc=ifelse(AdjInc>59941.99,59941.99,AdjInc))

	#Aggregating the data and splitting up the quartiles whilst summarizing
	f <- eee %>% group_by(SingleRace,YEAR,Sex) %>%
	summarise(MED=median(rep(AdjInc,times=Weight)),
	MIN=quantile(rep(AdjInc,times=Weight),0.1),
	LOW=quantile(rep(AdjInc,times=Weight),0.25),
	HIGH=quantile(rep(AdjInc,times=Weight),0.75),
	MAX=quantile(rep(AdjInc,times=Weight),0.9))



	#Plotting both the box plot and the column graph
	Boxgraph <- ggplot(f, aes(x=YEAR,ymin=MIN,lower=LOW,middle=MED,upper=HIGH,ymax=MAX,fill=Sex)) +
	geom_boxplot(stat='identity',position='dodge') +
	facet_wrap(~SingleRace) +
	labs(y= 'Income, US Dollars', title='Figure 1: Income by Race and Sex for Those With Income, 1940-2000')

	Linegraph <- ggplot(f,aes(x=YEAR,y=MED,color=SingleRace)) +
	geom_line() + geom_point() +
	facet_grid(Sex~.) +
	labs(y='Median Income, US Dollars',color='Race/Ethnicity', title='Figure 2: Median Income by Race and Sex For Those With Income, 1940-2000')


	#Part B for Column Graph
	library(readr)
	library(dplyr)
	library(ggplot2)
	library(RColorBrewer)
	library(scales)


	#Reading in the data and filtering out those below earning age, as well as those in general quarters,
	#and those not earning any income
	ipums <- read_csv('Lab7.csv')
	a <- ipums %>% filter(GQ==1 & AGE>=16 & INCWAGE>0 & INCWAGE<999999)

	#Recoding for race
	b <- a %>% mutate(SingleRace=factor(ifelse(HISPAN>0,1,
	ifelse(RACESING==1,2,
	ifelse(RACESING==2,3,
	ifelse(RACESING==3,4,
	ifelse(RACESING==4,5,6)))))))
	levels(b$SingleRace) <- c('Hispanic','White','Black','Native American','Asian','Other')
	c <- b %>% filter(SingleRace!='Other')

	#Recoding for sex
	d <- c %>% mutate(Sex=factor(SEX,labels=c('Male','Female')))

	#Adjusting for inflation with the CPI99
	e <- d %>% mutate(AdjInc=INCWAGE*CPI99)

	#Weighting by PERWT in all years other than 1950, in which we use SLWT
	ee <- e %>% mutate(Weight=ifelse(YEAR!=1950,PERWT,SLWT))

	#Removing the very high income entries from the set
	eee <- ee %>% mutate(AdjInc=ifelse(AdjInc>59941.99,59941.99,AdjInc))


	#Recode the IncWage variable for the column graph
	f <- eee %>% mutate(IncomeF=factor(ifelse(AdjInc<10000, 1,
	ifelse(AdjInc<20000, 2,
	ifelse(AdjInc<30000, 3,
	ifelse(AdjInc<40000, 4,
	ifelse(AdjInc<59000, 5, 6)))))))

	levels(f$IncomeF) <- c('1-9,999','10,000-19,999','20,000-29,999','30,000-39,999',
	'40,000-58,999','59,000+')

	#Aggregating and weighting
	f1 <- f %>% group_by(YEAR, Sex, SingleRace, IncomeF) %>% summarize(Number=sum(Weight))

	#Plotting
	Columngraph <- ggplot(data=f1, aes(x=YEAR,y=Number,fill=IncomeF)) +
	geom_bar(stat='identity', position='fill') +
	labs(x='Year', y='Percent of Population', fill='Income',title='Figure 3: Income by Sex and Race for Those With Income 1940-2000') +
	scale_y_continuous(labels=scales::percent) +
	scale_x_continuous(breaks=c(1950,1970,1990))+
	scale_fill_brewer(palette = 'Set1') +
	facet_grid(Sex~.~SingleRace) +
	theme_bw() + theme(legend.position='bottom')