Skip to content

Instantly share code, notes, and snippets.

@TheInstantCrush
Last active November 8, 2016 02:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TheInstantCrush/a9b1e4e77dfda97810c35cea23fdd3a3 to your computer and use it in GitHub Desktop.
Save TheInstantCrush/a9b1e4e77dfda97810c35cea23fdd3a3 to your computer and use it in GitHub Desktop.
#Data visualizations
library(readr)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(scales)
library(ggmap)
library(maptools)
library(gtools)
#Read in data
ipums <- read_csv('data/raw.csv',col_types=cols(HHWT=col_double(),PERWT=col_double()))
#Exclude Alaska/Hawaii before 1960
ipums <- ipums %>% filter(!(YEAR < 1960 & STATEFIP %in% c(2,15)))
ipumsx <- ipums
#Studying only Asians
ipums <- ipums %>% filter(RACESING==4)
#1. Spine graphs showing historical distribution of Asian-Americans by birthplace.
#Select relevant variables
ipums1 <- ipums %>% select(YEAR,PERWT,AGE,BPL)
#Create birthplace categories
ipums1 <- ipums1 %>% mutate(Birthplace = factor(ifelse(BPL==6,1,
ifelse(BPL==15,2,
ifelse(BPL<150,3,
ifelse(BPL==500,4,
ifelse(BPL==501,5,
ifelse(BPL==502,6,
ifelse(BPL==515,7,
ifelse(BPL==518,8,
ifelse(BPL %in% c(510:519),9,
ifelse(BPL==521,10,11)))))))))),
labels=c('California','Hawaii','Other US/Territories','China','Japan','Korea','Philippines',
'Vietnam','Other SE Asia','India','Other')))
#Group by year and birthplace, calculate totals and percentages
a <- ipums1 %>% group_by(YEAR) %>% summarise(Number1=sum(PERWT))
b <- ipums1 %>% group_by(YEAR,Birthplace) %>% summarise(Number2=sum(PERWT))
c <- left_join(b,a)
c <- c %>% mutate(Pct=Number2/Number1*100)
#Graph
graph1 <- ggplot(c, aes(x=YEAR,y=Pct/100,fill=Birthplace)) +
geom_bar(stat='identity', aes(width=rescale(Number1,c(2,10)))) +
labs(x='Year',y='Percent of Population',fill='Birthplace',
title='Asian-American Population by Birthplace, 1940-2000') +
theme_bw() +
scale_x_continuous(breaks=c(1940,1950,1960,1970,1980,1990,2000)) +
scale_y_continuous(labels=scales::percent) +
guides(fill = guide_legend(reverse=TRUE)) +
scale_fill_brewer(palette='Set3')
png('graph1.png',width=1000,height=500)
print(graph1)
dev.off()
#2. Series of historical maps documenting historical Asian-American populations by state.
#Set up US Map
mapdata <- read_csv('data/map.csv')
map1 <- ggplot() + theme_nothing(legend=TRUE) +
geom_polygon(data=mapdata, aes(x=long,y=lat,group=group),fill='white',color='black')
png('map.png',width=1500,height=1000)
print(map1)
dev.off()
#Select relevant variables
ipums2 <- ipums %>% select(YEAR,PERWT,STATEFIP)
#Group by year and state
ipums2 <- ipums2 %>% group_by(YEAR,STATEFIP) %>% summarise(Number=sum(PERWT))
newmap <- mapdata %>% mutate(STATEI=as.integer(STATEFIP))
#Set up population buckets
cuts <- quantcut(ipums2$Number,q=seq(0,1,.2))
cats <- ipums2 %>% mutate(Population=factor(ifelse(Number<2000,1,
ifelse(Number<4000,2,
ifelse(Number<15000,3,
ifelse(Number<60000,4,5))))))
levels(cats$Population) <- c('1-1,999','2,000-3,999','4,000-14,999','15,000-59,999','60,000+')
ipumsmap <- left_join(cats,newmap,by=c('STATEFIP'='STATEI')) %>% arrange(order)
#Map for each year
for (year in unique(ipumsmap$YEAR)) {
map2 <- map1 + scale_fill_brewer(palette='Blues') +
theme_bw(base_size = 24) +
geom_polygon(data=filter(ipumsmap,YEAR==year),aes(x=long,y=lat,group=group,fill=Population),color='black') +
labs(title=paste('Asian American Populations By State,',year,sep=' '))
png(paste('map_',year,'.png',sep=''),width=1500,height=1000)
print(map2)
dev.off()
}
#3. Column graphs showing historical occupational distribution of Asian-Americans, broken down by sex and birthplace.
#3a only Asian-Americans
#Select relevant variables, ages 15-65 only
ipums3 <- ipums %>% select(YEAR,PERWT,AGE,BPL,OCC1950,SEX) %>% filter(AGE>=15 & AGE <=65)
#Set up nationality, occupation, and sex categories
ipums3 <- ipums3 %>% mutate(Birthplace = factor(ifelse(BPL<150,1,2),
labels=c('Native-Born','Foreign-Born')),
Occupation = factor(ifelse(OCC1950 %in% c(980:999), 1,
ifelse(OCC1950 %in% c(100,123,810:840) , 2,
ifelse(OCC1950 %in% c(500:690,910:979), 3,
ifelse(OCC1950 %in% c(200:490), 4,
ifelse(OCC1950 %in% c(700:790), 5,
ifelse(OCC1950 %in% c(000:099), 6,7)))))),
labels=c('none','farmers/farm laborers','craftsmen/operatives/laborers',
'managerial/clerical/sales','service','professional')),
Sex = ifelse(SEX==1,'Male','Female'))
#Group by year, birthplace, occupation, sex
d <- ipums3 %>% group_by(YEAR,Birthplace,Occupation,Sex) %>% summarise(Number=sum(PERWT))
#Graph
graph2 <- ggplot(data=d,aes(x=YEAR,y=Number,fill=Occupation)) +
geom_bar(stat='identity',position='fill') +
labs(x='Year',y='Percent of Population',fill='Occupation',
title='Occupation of Asian-Americans Aged 15-65 by Sex, Birthplace, and Year, 1940-2000') +
scale_y_continuous(labels=scales::percent) +
scale_x_continuous(breaks=c(1940,1950,1960,1970,1980,1990,2000)) +
scale_fill_brewer(palette='Set1') +
facet_grid(Sex~.~Birthplace) +
theme_bw() + theme(legend.position = 'bottom')
png('graph2.png',height=500,width=1000)
print(graph2)
dev.off()
#3b All Americans
#Select relevant variables, ages 15-65 only
ipums3x <- ipumsx %>% select(YEAR,PERWT,AGE,BPL,OCC1950,SEX) %>% filter(AGE>=15 & AGE <=65)
#Set up nationality, occupation, and sex categories
ipums3x <- ipums3x %>% mutate(Birthplace = factor(ifelse(BPL<150,1,2),
labels=c('Native-Born','Foreign-Born')),
Occupation = factor(ifelse(OCC1950 %in% c(980:999), 1,
ifelse(OCC1950 %in% c(100,123,810:840) , 2,
ifelse(OCC1950 %in% c(500:690,910:979), 3,
ifelse(OCC1950 %in% c(200:490), 4,
ifelse(OCC1950 %in% c(700:790), 5,
ifelse(OCC1950 %in% c(000:099), 6,7)))))),
labels=c('none','farmers/farm laborers','craftsmen/operatives/laborers',
'managerial/clerical/sales','service','professional')),
Sex = ifelse(SEX==1,'Male','Female'))
#Group by year, birthplace, occupation, sex
dx <- ipums3x %>% group_by(YEAR,Birthplace,Occupation,Sex) %>% summarise(Number=sum(PERWT))
#Graph
graph2x <- ggplot(data=dx,aes(x=YEAR,y=Number,fill=Occupation)) +
geom_bar(stat='identity',position='fill') +
labs(x='Year',y='Percent of Population',fill='Occupation',
title='Occupation of Americans Aged 15-65 by Sex, Birthplace, and Year, 1940-2000') +
scale_y_continuous(labels=scales::percent) +
scale_x_continuous(breaks=c(1940,1950,1960,1970,1980,1990,2000)) +
scale_fill_brewer(palette='Set1') +
facet_grid(Sex~.~Birthplace) +
theme_bw() + theme(legend.position = 'bottom')
png('graph2x.png',height=500,width=1000)
print(graph2x)
dev.off()
#4. Population pyramids showing the historical distribution of Asian immigrants by age at arrival and sex, and by years in the US and sex.
#Select relevant variables, years, and birthplaces
ipums4 <- ipums %>% select(YEAR,PERWT,AGE,BPL,SEX,YRIMMIG)
ipums4 <- ipums4 %>% filter(YEAR>=1970&BPL>=150)
#Create vector of age category labels
agecats <- '0-9'
for(i in 1:7){
agecats <- c(agecats,paste(i,'0-',i,9,sep=''))
}
agecats <- c(agecats,'80+')
#Label sex, change YRIMMIG to integer
ipums4 <- ipums4 %>% mutate(Sex=factor(SEX,labels=c('Male','Female')),
Yrimmig=as.numeric(YRIMMIG))
#Compute age at arrival Aaa and years in United States YIUS
ipums4 <- ipums4 %>% mutate(Aaa=(AGE-YEAR+Yrimmig),Yius=(YEAR-Yrimmig))
#Clean bad entries: year of immigration before birth year, immigration in 996
ipums4 <- ipums4 %>% filter(Aaa>=0 & Yius >=0)
#Place into age categories
ipums4 <- ipums4 %>% mutate(Acat=factor(ifelse(Aaa>=80,8,floor(Aaa/10)),labels=agecats),
Ycat=factor(ifelse(Yius>=80,8,floor(Yius/10)),labels=agecats))
#Group age at arrival by category, sex, and year; prep for population pyramid
e1 <- ipums4 %>% group_by(Acat,Sex,YEAR) %>% summarise(Number=sum(PERWT))
e1 <- e1 %>% mutate(Number=ifelse(Sex=='Male',-1*Number,Number))
#Group years in the US by category, sex, and year; prep for population pyramid
e2 <- ipums4 %>% group_by(Ycat,Sex,YEAR) %>% summarise(Number=sum(PERWT))
e2 <- e2 %>% mutate(Number=ifelse(Sex=='Male',-1*Number,Number))
#Graph age at arrival
graph3 <- ggplot(data=e1,aes(x=Acat,y=Number,fill=Sex)) +
geom_bar(data=e1[e1$Sex=='Male',],stat='identity') +
geom_bar(data=e1[e1$Sex=='Female',],stat='identity') +
coord_flip() +
facet_grid(~YEAR) +
scale_y_continuous(breaks=c(-1500000,-1000000,-500000,0,500000,1000000,1500000),
labels=c('1.5','1','0.5','0','0.5','1','1.5')) +
labs(y='Population in millions',title='Population Pyramids for Age at Arrival of Asian immigrants') +
scale_fill_brewer(palette='Set1',guide=guide_legend(reverse=TRUE)) +
theme_bw() + theme(legend.position='bottom')
png('graph3.png',height=500,width=1000)
print(graph3)
dev.off()
#Graph years in US
graph4 <- ggplot(data=e2,aes(x=Ycat,y=Number,fill=Sex)) +
geom_bar(data=e2[e2$Sex=='Male',],stat='identity') +
geom_bar(data=e2[e2$Sex=='Female',],stat='identity') +
coord_flip() +
facet_grid(~YEAR) +
scale_y_continuous(breaks=c(-1500000,-1000000,-500000,0,500000,1000000,1500000),
labels=c('1.5','1','0.5','0','0.5','1','1.5')) +
labs(y='Population in millions',title='Population Pyramids for Years in US of Asian immigrants') +
scale_fill_brewer(palette='Set1',guide=guide_legend(reverse=TRUE)) +
theme_bw() + theme(legend.position='bottom')
png('graph4.png',height=500,width=1000)
print(graph4)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment