scfurey/Project Code.R

## Project Code.R
#USA Visualization 1: Animated Map of Those Living in Metro Areas


#load packages
library (dplyr)
library (readr)
library(ggplot2)
library(RColorBrewer)
library(ggmap)
library(maptools)
library(gtools)


#read in blank map data of the USA
mapdata <- read_csv('map.csv')

#create and print blank map of USA
map1 <- ggplot() + theme_nothing(legend=TRUE) +
  geom_polygon(data=mapdata, aes(x=long,y=lat,group=group), fill='white',color='black')
png('map.png',width=1500,height=1000)
print(map1)
dev.off ()

#read in IPUMS data
ipums <- read_csv('usa_00015.csv',col_types = cols(PERWT=col_double()))

#create Metrostatus variable
a <- ipums %>% mutate(Metrostatus=ifelse(METRO %in% c(2,3), 'Metro', 'Non-Metro'))

#group by YEAR, STATEFIP, and Metrostatus
b <- a %>% group_by(YEAR,STATEFIP,Metrostatus) %>% summarise(Number=sum(PERWT))

#group by YEAR and STATEFIP
c <- b %>% group_by(YEAR,STATEFIP) %>% summarise(Total=sum(Number))

#left join these previous two variables
d <- left_join(b,c)

#find percentages
pct <- d %>% mutate(pctmetro=Number/Total*100) %>%filter(Metrostatus=='Metro')

#cut into equal categories with appropriate ranges
cuts <- quantcut(pct$pctmetro,q=seq(0,1,.2))

#assign values and create ranges for the legend
metrocats <- pct %>% mutate(Population=factor(ifelse(pctmetro<20,1,
                                                     ifelse(pctmetro<35,2,
                                                            ifelse(pctmetro<50,3,
                                                                   ifelse(pctmetro<65,4,5))))))
levels(metrocats$Population) <- c('0-19%','20-34%','35-48%',
                                  '49-65%','65%+')

#create newmap variable from original map data
newmap <- mapdata %>% mutate(STATEI=as.integer(STATEFIP))


#left-join metrocats by newmap
metromap <- left_join(metrocats,newmap,by=c('STATEFIP'='STATEI')) %>% arrange(order)


#print separate maps for years 1930-2010
for (year in unique(metromap$YEAR)) {
  map2 <- map1 + scale_fill_brewer(palette='Blues') + theme_bw(base_size = 24) +

    geom_polygon(data=filter(metromap,YEAR==year),aes(x=long,y=lat,group=group,fill=Population),color='black')
  labs(title=paste('Percentage of People Residing in Metro Areas in the United States,1930-2010,',year,sep=' '))
  png(paste('map_',year,'.png',sep=''),width=1500,height=1000)
  print(map2)
  dev.off ()
}

#run this code once
install.packages('devtools')
library(devtools)
devtools::install_github('dgrtwo/gganimate')

#load gganimate
library(gganimate)

#animate the separate maps into a gif
anmap <- map1 + scale_fill_brewer(palette='Blues') +
  geom_polygon(data=metromap,aes(x=long,y=lat,group=group,fill=Population,frame=YEAR),color='black') +
  labs(title='Percentage of People Residing in Metro Areas in the United States,1930-2010, ')

gg_animate(anmap,ani.width=1500,ani.height=1000,'anmap.gif')


#USA Visualization 2: Column Graph of those Living in Cities versus Suburbs


#load packages
library (dplyr)
library (readr)
library(ggplot2)
library(RColorBrewer)

#read in IPUMS data
ipums <- read_csv('usa_00015.csv',col_types = cols(PERWT=col_double()))

#filter those living in metro areas
a <- ipums %>% filter(METRO %in% c(2,3)) %>% filter(!STATEFIP %in% c(2,15))

#distinguish between those living in cities versus suburbs
b <- a %>% mutate(Metro=ifelse(METRO==2, 'City', 'Suburb'))

#select variables I need
c <- b %>% select(YEAR, PERWT, Metro)

#Group twice, creating Number and NumTotal so you can find percentages
f1 <- b %>% group_by(YEAR,Metro) %>% summarise(Number=sum(PERWT))
f2 <- b %>% group_by(YEAR) %>% summarise(NumTotal=sum(PERWT))

#create pctmetro by left joining f1 and f2 and dividing Number/NumTotal
pctmetro <- left_join(f1,f2) %>% mutate(pct=Number/NumTotal*100)

#printplot function to print
printplot <- function(plot) {
  png('USA_Fig2.png',height=500,width=1000)
  print(fig1)
  dev.off()
}

#graph
fig1 <- ggplot(data=pctmetro, aes(x=YEAR, y=pct/100, fill=Metro)) +
  geom_bar(stat='identity') +
  labs(x='Year',y='Percent of Population',fill='Metro',title='American Metro Population Living in Cities versus Suburbs, 1930-2010') +
  scale_y_continuous(labels=scales::percent) +
  scale_x_continuous(breaks=c(1930,1940,1950,1960,1970,1980,1990,2000,2010)) +
  scale_fill_brewer(palette='Set2',guide=guide_legend(reverse=TRUE)) +
  theme_bw()
printplot(fig1)


#USA Visualization 3: Column Graph of Americans Living in Cities versus Suburbs by Race.

#load packages
library (dplyr)
library (readr)
library(ggplot2)
library(RColorBrewer)
library(scales)

#read in IPUMS data
ipums <- read_csv('usa_00015.csv',col_types = cols(PERWT=col_double()))

#exclude Alaska and Hawaii
filtered_ipums <- ipums %>% filter(!STATEFIP %in% c(2,15))

#filter by those residing in metro areas
a <- filtered_ipums %>% filter(METRO %in% c(2,3))

#create metro variable
b <- a %>% mutate(Metro=ifelse(METRO==2, 'City', 'Suburb'))

#create race variable by recoding HISPAN and RACESING
c <- b %>% mutate(Raceth=factor(ifelse(HISPAN>0,'Hispanic',
                                ifelse(RACESING==1, 'White',
                                ifelse(RACESING==2, 'Black', 'Other')))))

#group by Year, Metro, Raceth
f1 <- c %>% group_by(YEAR, Metro, Raceth) %>% summarise(Number=sum(PERWT))

#group by Year, Metro
f2 <- f1 %>% group_by(YEAR,Metro) %>% mutate(Total=sum(Number))

#column graph
fig2 <- ggplot(data=f2,aes(x=Metro,y=Number, fill=Raceth)) +
  geom_bar(stat='identity') +
  labs(x='City/Suburb',y='Population',fill='Metro',title='Number of Americans Living in Cities or Suburbs by Race, 1930-2010') +
  scale_y_continuous(labels=scales::comma) +
  scale_fill_brewer(palette='Set2',guide=guide_legend(reverse=TRUE)) +
  facet_wrap(~YEAR,ncol=2) +
  theme_bw()

#print
png('USA_Fig3.png', height=500, width=1000)
print(fig2)
dev.off()


#Newark Visualization 1: Population Pyramid


#load packages
library (dplyr)
library (readr)
library(ggplot2)
library(RColorBrewer)

#read in data
a <- read_csv('usa_00015.csv')
b <- a %>% filter(CITY==4630)

#Create vector of age category labels
agecats <- '0-9'
for (i in 1:7) {
  agecats <- c(agecats,paste(i, '0-',i,9,sep=''))
}
agecats <- c(agecats, '80+')

#create sex and age variables
c <- b %>% mutate(Sex=factor(SEX,labels=c('Male','Female')))
d <- c %>% mutate (Age=ifelse(AGE>=80,8,floor(AGE/10)))
e <- d %>% mutate(Age=factor(Age,labels=agecats))

#create Raceth variable using HISPAN and RACESING variables from IPUMS
f <- e %>% mutate(Raceth=factor(ifelse(HISPAN>0,'Hispanic',
                                ifelse(RACESING==1, 'White',
                                ifelse(RACESING==2, 'Black', 'Other')))))

#group by city, Age, Sex, Raceth, and Year
g <- f %>% group_by(CITY, Age, Sex, Raceth, YEAR) %>% summarise(Number=sum(PERWT))

#Multiply 'Male' by -1
h <- g %>% mutate(Number=ifelse(Sex=='Male', -1 *Number,Number))

#Graphing (make sure to use proper scale, appropriate title,etc.)
fig1 <- ggplot(data=h,aes(x=Age, y=Number,fill=Sex)) +
  geom_bar(data=h[h$Sex=='Male',], stat='identity') +
  geom_bar(data=h[h$Sex=='Female',], stat='identity') +
  coord_flip() +
  facet_grid(Raceth~.~YEAR) +
  scale_y_continuous(breaks=c(-40000,-20000,0,20000,40000),
                     labels=c('40','20','0','20','40')) +
  labs(y='Population in Thousands',title='Population Pyramid for Newark Residents,1930-2010')+
  scale_fill_brewer(palette='Set1',guide=guide_legend(reverse=TRUE))+
  theme_bw() + theme(legend.position='bottom')

#print
png('Newark_Fig1.png', height=500, width=1000)
print(fig1)
dev.off()

#save as PDF to export more easily to Word
ggsave('Newark_Population_Pyramid.pdf',width=10,height=7.5)


#Newark Visualization 2: Industry Column Graph


#load packages
library (dplyr)
library (readr)
library(ggplot2)
library(RColorBrewer)

#read in data; filter working ages and specify Newark
a <- read_csv('usa_00015.csv') %>%
  filter(AGE>=15 & AGE<=65 & (CITY==4630))

#create industry variable by recoding IND1950
b <- a %>% mutate(Industry=factor(ifelse(IND1950<100 | IND1950>976,1,
                                  ifelse(IND1950<246, 2,
                                  ifelse(IND1950==246 | IND1950==976,4,
                                  ifelse(IND1950>700,7,
                                  ifelse(IND1950>600,5,
                                  ifelse(IND1950>500,6,3))))))))
levels(b$Industry) <- c('none','agricultural/extractive','manufacturing',
                        'construction or general labor', 'trade',
                        'transporation/communication/utilities',
                        'service')

#Check to see what the head (top) looks like
c <- b %>% select(YEAR, PERWT, Industry)
head (c)

#Group by YEAR, City, and Industry
f1 <- c %>% group_by(YEAR, Industry) %>% summarise(Number=sum (PERWT))
head (f1)


#Graph industry
fig1 <- ggplot(data=f1,aes(x=YEAR,y=Number,fill=Industry)) +
  geom_bar(stat='identity',position='fill') +
  labs(x='Year',y='Percent of Population',fill='Industry',title='Newark Industry, 1930-2010') +
  scale_y_continuous(labels=scales::percent) +
  scale_x_continuous(breaks=c(1930,1940,1950,1960,1970,1980,1990,2000,2010)) +
  scale_fill_brewer(palette='Set1') +
  theme_bw() + theme(legend.position='bottom')

#print it
png('Newark_Industry.png',height=500,width=1000)
print(fig1)
dev.off()
	#USA Visualization 1: Animated Map of Those Living in Metro Areas


	#load packages
	library (dplyr)
	library (readr)
	library(ggplot2)
	library(RColorBrewer)
	library(ggmap)
	library(maptools)
	library(gtools)


	#read in blank map data of the USA
	mapdata <- read_csv('map.csv')

	#create and print blank map of USA
	map1 <- ggplot() + theme_nothing(legend=TRUE) +
	geom_polygon(data=mapdata, aes(x=long,y=lat,group=group), fill='white',color='black')
	png('map.png',width=1500,height=1000)
	print(map1)
	dev.off ()

	#read in IPUMS data
	ipums <- read_csv('usa_00015.csv',col_types = cols(PERWT=col_double()))

	#create Metrostatus variable
	a <- ipums %>% mutate(Metrostatus=ifelse(METRO %in% c(2,3), 'Metro', 'Non-Metro'))

	#group by YEAR, STATEFIP, and Metrostatus
	b <- a %>% group_by(YEAR,STATEFIP,Metrostatus) %>% summarise(Number=sum(PERWT))

	#group by YEAR and STATEFIP
	c <- b %>% group_by(YEAR,STATEFIP) %>% summarise(Total=sum(Number))

	#left join these previous two variables
	d <- left_join(b,c)

	#find percentages
	pct <- d %>% mutate(pctmetro=Number/Total*100) %>%filter(Metrostatus=='Metro')

	#cut into equal categories with appropriate ranges
	cuts <- quantcut(pct$pctmetro,q=seq(0,1,.2))

	#assign values and create ranges for the legend
	metrocats <- pct %>% mutate(Population=factor(ifelse(pctmetro<20,1,
	ifelse(pctmetro<35,2,
	ifelse(pctmetro<50,3,
	ifelse(pctmetro<65,4,5))))))
	levels(metrocats$Population) <- c('0-19%','20-34%','35-48%',
	'49-65%','65%+')

	#create newmap variable from original map data
	newmap <- mapdata %>% mutate(STATEI=as.integer(STATEFIP))


	#left-join metrocats by newmap
	metromap <- left_join(metrocats,newmap,by=c('STATEFIP'='STATEI')) %>% arrange(order)


	#print separate maps for years 1930-2010
	for (year in unique(metromap$YEAR)) {
	map2 <- map1 + scale_fill_brewer(palette='Blues') + theme_bw(base_size = 24) +

	geom_polygon(data=filter(metromap,YEAR==year),aes(x=long,y=lat,group=group,fill=Population),color='black')
	labs(title=paste('Percentage of People Residing in Metro Areas in the United States,1930-2010,',year,sep=' '))
	png(paste('map_',year,'.png',sep=''),width=1500,height=1000)
	print(map2)
	dev.off ()
	}

	#run this code once
	install.packages('devtools')
	library(devtools)
	devtools::install_github('dgrtwo/gganimate')

	#load gganimate
	library(gganimate)

	#animate the separate maps into a gif
	anmap <- map1 + scale_fill_brewer(palette='Blues') +
	geom_polygon(data=metromap,aes(x=long,y=lat,group=group,fill=Population,frame=YEAR),color='black') +
	labs(title='Percentage of People Residing in Metro Areas in the United States,1930-2010, ')

	gg_animate(anmap,ani.width=1500,ani.height=1000,'anmap.gif')


	#USA Visualization 2: Column Graph of those Living in Cities versus Suburbs


	#load packages
	library (dplyr)
	library (readr)
	library(ggplot2)
	library(RColorBrewer)

	#read in IPUMS data
	ipums <- read_csv('usa_00015.csv',col_types = cols(PERWT=col_double()))

	#filter those living in metro areas
	a <- ipums %>% filter(METRO %in% c(2,3)) %>% filter(!STATEFIP %in% c(2,15))

	#distinguish between those living in cities versus suburbs
	b <- a %>% mutate(Metro=ifelse(METRO==2, 'City', 'Suburb'))

	#select variables I need
	c <- b %>% select(YEAR, PERWT, Metro)

	#Group twice, creating Number and NumTotal so you can find percentages
	f1 <- b %>% group_by(YEAR,Metro) %>% summarise(Number=sum(PERWT))
	f2 <- b %>% group_by(YEAR) %>% summarise(NumTotal=sum(PERWT))

	#create pctmetro by left joining f1 and f2 and dividing Number/NumTotal
	pctmetro <- left_join(f1,f2) %>% mutate(pct=Number/NumTotal*100)

	#printplot function to print
	printplot <- function(plot) {
	png('USA_Fig2.png',height=500,width=1000)
	print(fig1)
	dev.off()
	}

	#graph
	fig1 <- ggplot(data=pctmetro, aes(x=YEAR, y=pct/100, fill=Metro)) +
	geom_bar(stat='identity') +
	labs(x='Year',y='Percent of Population',fill='Metro',title='American Metro Population Living in Cities versus Suburbs, 1930-2010') +
	scale_y_continuous(labels=scales::percent) +
	scale_x_continuous(breaks=c(1930,1940,1950,1960,1970,1980,1990,2000,2010)) +
	scale_fill_brewer(palette='Set2',guide=guide_legend(reverse=TRUE)) +
	theme_bw()
	printplot(fig1)


	#USA Visualization 3: Column Graph of Americans Living in Cities versus Suburbs by Race.

	#load packages
	library (dplyr)
	library (readr)
	library(ggplot2)
	library(RColorBrewer)
	library(scales)

	#read in IPUMS data
	ipums <- read_csv('usa_00015.csv',col_types = cols(PERWT=col_double()))

	#exclude Alaska and Hawaii
	filtered_ipums <- ipums %>% filter(!STATEFIP %in% c(2,15))

	#filter by those residing in metro areas
	a <- filtered_ipums %>% filter(METRO %in% c(2,3))

	#create metro variable
	b <- a %>% mutate(Metro=ifelse(METRO==2, 'City', 'Suburb'))

	#create race variable by recoding HISPAN and RACESING
	c <- b %>% mutate(Raceth=factor(ifelse(HISPAN>0,'Hispanic',
	ifelse(RACESING==1, 'White',
	ifelse(RACESING==2, 'Black', 'Other')))))

	#group by Year, Metro, Raceth
	f1 <- c %>% group_by(YEAR, Metro, Raceth) %>% summarise(Number=sum(PERWT))

	#group by Year, Metro
	f2 <- f1 %>% group_by(YEAR,Metro) %>% mutate(Total=sum(Number))

	#column graph
	fig2 <- ggplot(data=f2,aes(x=Metro,y=Number, fill=Raceth)) +
	geom_bar(stat='identity') +
	labs(x='City/Suburb',y='Population',fill='Metro',title='Number of Americans Living in Cities or Suburbs by Race, 1930-2010') +
	scale_y_continuous(labels=scales::comma) +
	scale_fill_brewer(palette='Set2',guide=guide_legend(reverse=TRUE)) +
	facet_wrap(~YEAR,ncol=2) +
	theme_bw()

	#print
	png('USA_Fig3.png', height=500, width=1000)
	print(fig2)
	dev.off()


	#Newark Visualization 1: Population Pyramid


	#load packages
	library (dplyr)
	library (readr)
	library(ggplot2)
	library(RColorBrewer)

	#read in data
	a <- read_csv('usa_00015.csv')
	b <- a %>% filter(CITY==4630)

	#Create vector of age category labels
	agecats <- '0-9'
	for (i in 1:7) {
	agecats <- c(agecats,paste(i, '0-',i,9,sep=''))
	}
	agecats <- c(agecats, '80+')

	#create sex and age variables
	c <- b %>% mutate(Sex=factor(SEX,labels=c('Male','Female')))
	d <- c %>% mutate (Age=ifelse(AGE>=80,8,floor(AGE/10)))
	e <- d %>% mutate(Age=factor(Age,labels=agecats))

	#create Raceth variable using HISPAN and RACESING variables from IPUMS
	f <- e %>% mutate(Raceth=factor(ifelse(HISPAN>0,'Hispanic',
	ifelse(RACESING==1, 'White',
	ifelse(RACESING==2, 'Black', 'Other')))))

	#group by city, Age, Sex, Raceth, and Year
	g <- f %>% group_by(CITY, Age, Sex, Raceth, YEAR) %>% summarise(Number=sum(PERWT))

	#Multiply 'Male' by -1
	h <- g %>% mutate(Number=ifelse(Sex=='Male', -1 *Number,Number))

	#Graphing (make sure to use proper scale, appropriate title,etc.)
	fig1 <- ggplot(data=h,aes(x=Age, y=Number,fill=Sex)) +
	geom_bar(data=h[h$Sex=='Male',], stat='identity') +
	geom_bar(data=h[h$Sex=='Female',], stat='identity') +
	coord_flip() +
	facet_grid(Raceth~.~YEAR) +
	scale_y_continuous(breaks=c(-40000,-20000,0,20000,40000),
	labels=c('40','20','0','20','40')) +
	labs(y='Population in Thousands',title='Population Pyramid for Newark Residents,1930-2010')+
	scale_fill_brewer(palette='Set1',guide=guide_legend(reverse=TRUE))+
	theme_bw() + theme(legend.position='bottom')

	#print
	png('Newark_Fig1.png', height=500, width=1000)
	print(fig1)
	dev.off()

	#save as PDF to export more easily to Word
	ggsave('Newark_Population_Pyramid.pdf',width=10,height=7.5)


	#Newark Visualization 2: Industry Column Graph


	#load packages
	library (dplyr)
	library (readr)
	library(ggplot2)
	library(RColorBrewer)

	#read in data; filter working ages and specify Newark
	a <- read_csv('usa_00015.csv') %>%
	filter(AGE>=15 & AGE<=65 & (CITY==4630))

	#create industry variable by recoding IND1950
	b <- a %>% mutate(Industry=factor(ifelse(IND1950<100 \| IND1950>976,1,
	ifelse(IND1950<246, 2,
	ifelse(IND1950==246 \| IND1950==976,4,
	ifelse(IND1950>700,7,
	ifelse(IND1950>600,5,
	ifelse(IND1950>500,6,3))))))))
	levels(b$Industry) <- c('none','agricultural/extractive','manufacturing',
	'construction or general labor', 'trade',
	'transporation/communication/utilities',
	'service')

	#Check to see what the head (top) looks like
	c <- b %>% select(YEAR, PERWT, Industry)
	head (c)

	#Group by YEAR, City, and Industry
	f1 <- c %>% group_by(YEAR, Industry) %>% summarise(Number=sum (PERWT))
	head (f1)


	#Graph industry
	fig1 <- ggplot(data=f1,aes(x=YEAR,y=Number,fill=Industry)) +
	geom_bar(stat='identity',position='fill') +
	labs(x='Year',y='Percent of Population',fill='Industry',title='Newark Industry, 1930-2010') +
	scale_y_continuous(labels=scales::percent) +
	scale_x_continuous(breaks=c(1930,1940,1950,1960,1970,1980,1990,2000,2010)) +
	scale_fill_brewer(palette='Set1') +
	theme_bw() + theme(legend.position='bottom')

	#print it
	png('Newark_Industry.png',height=500,width=1000)
	print(fig1)
	dev.off()