SirSamAlot280/Lab_7_Income_Analysis

## Lab_7_Income_Analysis
#Load packages
library(dplyr)
library(readr)
library(ggplot2)
library(RColorBrewer)
library(scales)

#Read in selected IPUMs data
ipums <- read_csv('./data/10_27.csv')

#Filter for only people not in group quarters, over the age of 16, and removing people
#who don't have an income
a <- ipums %>% filter(GQ==1 & AGE>16 & INCWAGE>0 & INCWAGE<999999)

#Condense race into five racial groups
b <- a %>% mutate(Race=factor(ifelse(HISPAN>0 & HISPAN<=4,1,
                                     ifelse(RACESING==1,2,
                                            ifelse(RACESING==2,3,
                                                   ifelse(RACESING==3,4,
                                                          ifelse(RACESING==4,5,6)))))))
levels(b$Race) <- c('Hispanic','White','Black','Native American','Asian','Other')

#Filter out individuals classified in the "Other" race category
c <- b %>% filter(Race!='Other')

#Calssify who is male and female
d <- c %>% mutate(Sex=factor(SEX,labels=c('Male','Female')))

#Weigh the income for indvidiuals
e <- d %>% mutate(AdjInc=INCWAGE*CPI99)

#Weigh the years accordingly
f <- e %>% mutate(Weight=ifelse(YEAR!=1950,PERWT,SLWT))

#Top code for all years to match the 1940 topcode
g <- f %>% mutate(AdjInc=ifelse(AdjInc>59941.99,59941.99,AdjInc))

#Group by necessary variables and summarise for median income
h <- g %>% group_by(Race,YEAR,Sex) %>%
  summarise(MED=median(rep(AdjInc,times=Weight)),
            MIN=quantile(rep(AdjInc,times=Weight),0.1),
            LOW=quantile(rep(AdjInc,times=Weight),0.25),
            HIGH=quantile(rep(AdjInc,times=Weight),0.75),
            MAX=quantile(rep(AdjInc,times=Weight),0.9))

#Factor out the income levels/breaks
gg <- g %>% mutate(IncGr=factor(ifelse(AdjInc>=1 & AdjInc<10000,1,
                                ifelse(AdjInc>10000 & AdjInc<20000,2,
                                ifelse(AdjInc>=20000 & AdjInc<30000,3,
                                ifelse(AdjInc>=30000 & AdjInc<40000,4,
                                ifelse(AdjInc>=40000 & AdjInc<59000,5,6))))),
labels=c('$1-9,9999','$10,000-19,000','$20,000-29,999','$30,000-39,999','$40,000-58,999','$59,000+')))

#Calculate the percentage for each race in terms of the newly created income breaks
i <- gg %>% group_by(YEAR,IncGr,Sex,Race) %>% summarise(Number=sum(Weight))
j <- i %>% group_by(YEAR,Sex,Race) %>% summarise(Total=sum(Number))
k <- left_join(i,j) %>% mutate(pct=Number/Total*100)

#Create box plot for this data
png('Race_Box_Plot.png',height=500,width=1000)
ggplot(h,aes(x=YEAR,ymin=MIN,lower=LOW,middle=MED,
           upper=HIGH,ymax=MAX,fill=Sex))+
  geom_boxplot(stat='identity',position='dodge')+
  labs(fill='Sex',title='Income by Sex and Race for Those with Income,1940-2000',x='Year',y='Income in U.S. Dollars')+
  facet_wrap(~Race)+
  theme_bw() + theme(legend.position='bottom')+
  scale_y_continuous(labels=scales::comma)+
  scale_fill_brewer(palette='Set1')
dev.off()

#Create line graph with markers for the data
png('Race_AdjInc_Line_Plot.png',height=500,width=1000)
ggplot(h,aes(x=YEAR,y=MED,color=Race)) +
  geom_line() + geom_point() +
  labs(color='Race',
       title='Median Income by Sex and Race for Those with Income,1940-2000',
       x='Year',y='Income in U.S. Dollars')+
  scale_y_continuous(labels=scales::comma)+
  scale_fill_brewer(palette='Set2')+
  facet_grid(Sex~.)
dev.off()

#Create bar graph for this data
png('Race_IncGr_Bar_Graph.png',height=500,width=1000)
ggplot(k, aes(x=YEAR,y=pct/100,fill=IncGr)) +
  geom_bar(stat='identity') +
  labs(fill='Sex',title='Income by Race and Sex for Those with Income,1940-2000',
       x='Year',y='Income in U.S. Dollars') +
  scale_y_continuous(labels=scales::percent) +
  facet_grid(Sex~.~Race) +
  scale_fill_brewer(palette ='Set2')
dev.off()
	#Load packages
	library(dplyr)
	library(readr)
	library(ggplot2)
	library(RColorBrewer)
	library(scales)

	#Read in selected IPUMs data
	ipums <- read_csv('./data/10_27.csv')

	#Filter for only people not in group quarters, over the age of 16, and removing people
	#who don't have an income
	a <- ipums %>% filter(GQ==1 & AGE>16 & INCWAGE>0 & INCWAGE<999999)

	#Condense race into five racial groups
	b <- a %>% mutate(Race=factor(ifelse(HISPAN>0 & HISPAN<=4,1,
	ifelse(RACESING==1,2,
	ifelse(RACESING==2,3,
	ifelse(RACESING==3,4,
	ifelse(RACESING==4,5,6)))))))
	levels(b$Race) <- c('Hispanic','White','Black','Native American','Asian','Other')

	#Filter out individuals classified in the "Other" race category
	c <- b %>% filter(Race!='Other')

	#Calssify who is male and female
	d <- c %>% mutate(Sex=factor(SEX,labels=c('Male','Female')))

	#Weigh the income for indvidiuals
	e <- d %>% mutate(AdjInc=INCWAGE*CPI99)

	#Weigh the years accordingly
	f <- e %>% mutate(Weight=ifelse(YEAR!=1950,PERWT,SLWT))

	#Top code for all years to match the 1940 topcode
	g <- f %>% mutate(AdjInc=ifelse(AdjInc>59941.99,59941.99,AdjInc))

	#Group by necessary variables and summarise for median income
	h <- g %>% group_by(Race,YEAR,Sex) %>%
	summarise(MED=median(rep(AdjInc,times=Weight)),
	MIN=quantile(rep(AdjInc,times=Weight),0.1),
	LOW=quantile(rep(AdjInc,times=Weight),0.25),
	HIGH=quantile(rep(AdjInc,times=Weight),0.75),
	MAX=quantile(rep(AdjInc,times=Weight),0.9))

	#Factor out the income levels/breaks
	gg <- g %>% mutate(IncGr=factor(ifelse(AdjInc>=1 & AdjInc<10000,1,
	ifelse(AdjInc>10000 & AdjInc<20000,2,
	ifelse(AdjInc>=20000 & AdjInc<30000,3,
	ifelse(AdjInc>=30000 & AdjInc<40000,4,
	ifelse(AdjInc>=40000 & AdjInc<59000,5,6))))),
	labels=c('$1-9,9999','$10,000-19,000','$20,000-29,999','$30,000-39,999','$40,000-58,999','$59,000+')))

	#Calculate the percentage for each race in terms of the newly created income breaks
	i <- gg %>% group_by(YEAR,IncGr,Sex,Race) %>% summarise(Number=sum(Weight))
	j <- i %>% group_by(YEAR,Sex,Race) %>% summarise(Total=sum(Number))
	k <- left_join(i,j) %>% mutate(pct=Number/Total*100)

	#Create box plot for this data
	png('Race_Box_Plot.png',height=500,width=1000)
	ggplot(h,aes(x=YEAR,ymin=MIN,lower=LOW,middle=MED,
	upper=HIGH,ymax=MAX,fill=Sex))+
	geom_boxplot(stat='identity',position='dodge')+
	labs(fill='Sex',title='Income by Sex and Race for Those with Income,1940-2000',x='Year',y='Income in U.S. Dollars')+
	facet_wrap(~Race)+
	theme_bw() + theme(legend.position='bottom')+
	scale_y_continuous(labels=scales::comma)+
	scale_fill_brewer(palette='Set1')
	dev.off()

	#Create line graph with markers for the data
	png('Race_AdjInc_Line_Plot.png',height=500,width=1000)
	ggplot(h,aes(x=YEAR,y=MED,color=Race)) +
	geom_line() + geom_point() +
	labs(color='Race',
	title='Median Income by Sex and Race for Those with Income,1940-2000',
	x='Year',y='Income in U.S. Dollars')+
	scale_y_continuous(labels=scales::comma)+
	scale_fill_brewer(palette='Set2')+
	facet_grid(Sex~.)
	dev.off()

	#Create bar graph for this data
	png('Race_IncGr_Bar_Graph.png',height=500,width=1000)
	ggplot(k, aes(x=YEAR,y=pct/100,fill=IncGr)) +
	geom_bar(stat='identity') +
	labs(fill='Sex',title='Income by Race and Sex for Those with Income,1940-2000',
	x='Year',y='Income in U.S. Dollars') +
	scale_y_continuous(labels=scales::percent) +
	facet_grid(Sex~.~Race) +
	scale_fill_brewer(palette ='Set2')
	dev.off()