Kavakian9/FinalProject.R

## FinalProject.R
#Data visualizations

#1.	Animated Population Map of Identifiable Armenians

#load packages
library(readr)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(ggmap)
library(maptools)
library(gtools)

#read in map data
mapdata <- read_csv('map.csv')

#create map outline
map1 <- ggplot() + theme_nothing(legend=TRUE) +
  geom_polygon(data=mapdata, aes(x=long,y=lat,group=group),fill='white',color='black')
png('map.png',width=1500,height=1000)
print(map1)
dev.off()

#read in IPUMS data
data <- read_csv('finalproj.csv',col_types=cols(PERWT=col_double(),SLWT=col_double()))

#filter by birthplace, mother's birthplace, father's birthplace, mother tongue, and ancestry
birthplace <- data %>% filter(BPLD==46540 | MBPLD==46540 | FBPLD==46540 | MTONGUE==28 | MMTONGUE==28 | FMTONGUE==28| LANGUAGE==28 | ANCESTR1==431 | ANCESTR2==431)

#weight by PERWT in every year, except for 1940 when sample line variable is used
birthplace2 <- birthplace %>% mutate(Weight=ifelse(YEAR==1940,SLWT,PERWT))

#group by YEAR and STATEFIP
birthplace3 <- birthplace2 %>% group_by(YEAR,STATEFIP) %>% summarise(Number=sum(Weight))

#quantcut to determine quartiles of data
cuts <- quantcut(birthplace3$Number,q=seq(0,1,.1))

table(cuts)

#creating number categories for population
bplcats <- birthplace3 %>% mutate(Population=factor(ifelse(Number<300,1,
                                                           ifelse(Number<1000,2,
                                                                  ifelse(Number<2500,3,
                                                                         ifelse(Number<5000,4,
                                                                                ifelse(Number<10000,5,6)))))))
levels(bplcats$Population) <- c('1-299','300-999','1000-2,499','2,500-4,999','5,000-9,999','10,000+')

table(bplcats$Population)

newmap <- mapdata %>% mutate(STATEI=as.integer(STATEFIP))

#join bplcats with newmap
amap <- left_join(bplcats,newmap,by=c('STATEFIP'='STATEI'))

amap <- amap %>% arrange(order)

#map it!
map2 <- ggplot() + theme_nothing(legend=TRUE) + scale_fill_brewer(palette='Blues') +
  geom_polygon(data=mapdata, aes(x=long,y=lat,group=group),fill='white',color='black') +
  geom_polygon(data=filter(amap,YEAR==1900),aes(x=long,y=lat,group=group,fill=Population),color='black')
png('map.png',width=1500,height=1000)
print(map2)
dev.off()


#create for loop for year
for (year in unique(amap$YEAR)) {
  map2 <- map1 + scale_fill_brewer(palette='Blues') + theme_bw(base_size = 24) +
    geom_polygon(data=filter(amap,YEAR==year),aes(x=long,y=lat,group=group,fill=Population),color='black') +
    labs(title=paste('Persons identified as Armenian,',year,sep=' '))
  png(paste('map_',year,'.png',sep=''),width=1500,height=1000)
  print(map2)
  dev.off()
}

#animate map!
install.packages('devtools')
library(devtools)
devtools::install_github('dgrtwo/gganimate')


library(gganimate)

anmap <- map2 + scale_fill_brewer(palette='Blues') +
  geom_polygon(data=amap,aes(x=long,y=lat,group=group,fill=Population,frame=YEAR),color='black') +
  labs(title='Persons identified as Armenian,')

gg_animate(anmap,ani.width=1500,ani.height=1000,'anmap.gif')


#2.	Clustered Column Graph of Sex Ratio of Identifiable Armenians and Non-Armenians

#load packages
library(dplyr)
library(readr)
library(ggplot2)
library(RColorBrewer)

#read in IPUMS data and account for parsing errors
ipums <- read_csv('finalproj.csv',col_types=cols(PERWT=col_double()))

#create Worcester city variable
worc <- ipums %>% filter(CITY==7570)

#create character for Race
pop <- worc %>% mutate(Race=ifelse(BPLD==46540 | MBPLD==46540 | FBPLD==46540 | MTONGUE==28 | MMTONGUE==28 | FMTONGUE==28 | LANGUAGE==28 | ANCESTR1==431 | ANCESTR2==431,'Armenian', 'Non-Armenian'))

#write .csv file
write_csv(worc,'worc.csv')

#replace NA with 0 to create character above
worc[is.na(worc)] <- 0

#recode SEX
sex <- pop %>% mutate(Sex=ifelse(SEX==1,'Male','Female'))

#select Year, PERWT, SLWT, Race
new <- sex %>% select(YEAR,PERWT,SLWT, Race,Sex)

#account for sample line variable in 1940
new1 <- new %>% mutate(Weight=ifelse(YEAR==1940,SLWT,PERWT))

#Group by Year and Race
f1 <- new1 %>% group_by(Race,Sex,YEAR) %>% summarise(Number=sum(Weight))

#Export figures to png
png('Graph2.png',height=500,width=1000)

#graphing
ggplot(data=f1,aes(x=YEAR, y=Number,fill=Sex)) +
  geom_bar(stat='identity',position = 'dodge') +
  labs(x='Year', y='Population', fill='',title='Population of Armenians and Non-Armenians in Worcester, MA by Year and Sex, 1900-2000') +
  scale_y_continuous(labels=scales::comma) +
  scale_x_continuous(breaks=c(1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000)) +
  scale_fill_brewer(palette='Set2') +
  facet_wrap(~Race,scales='free_y') +
  theme_bw()

dev.off()


#3.	Stacked Column Graph of Occupation of Identifiable Armenians and Non-Armenians

#load packages
library(dplyr)
library(readr)
library(ggplot2)
library(RColorBrewer)

#read in IPUMS data & omit non-working age people
ipums <- read_csv('finalproj.csv',col_types=cols(PERWT=col_double())) %>% filter(AGE>=15 & AGE<=65)

#create Worcester city variable
worc <- ipums %>% filter(CITY==7570)

#creat a character for identifiable Armenians and non-Armenians
pop1 <- worc %>% mutate(Race=ifelse(BPLD==46540 | MBPLD==46540 | FBPLD==46540 | MTONGUE==28 | MMTONGUE==28 | FMTONGUE==28 | LANGUAGE==28 | ANCESTR1==431 | ANCESTR2==431,'Armenian', 'Non-Armenian'))

#write .csv file
write_csv(worc,'worc.csv')

#replace NA with 0 to create character above
worc[is.na(worc)] <- 0

#create Occupation variable by recoding OCC1950
occ <- pop1 %>% mutate(Occupation=factor(ifelse(OCC1950 %in% c(980:999), 1,
                                           ifelse(OCC1950 %in% c(100) | OCC1950 %in% c(123) | OCC1950 %in% c(810:840) , 2,
                                                  ifelse(OCC1950 %in% c(500:690) |  OCC1950 %in% c(910:979),3,
                                                         ifelse(OCC1950 %in% c(200:490), 4,
                                                                ifelse(OCC1950 %in% c(700:790) ,5,
                                                                       ifelse(OCC1950 %in% c(000:099),6,NA)))))),
                                    labels=c('None','Farmers/Farm Laborers','Craftsmen/Operatives/Laborers',
                                             'Managerial/Clerical/Sales','Service',
                                             'Professional')))


#recode SEX
sex <- occ %>% mutate(Sex=ifelse(SEX==1,'Male','Female'))

#narrow down to variables I need by selecting YEAR, PERWT, SLWT, Race, Occupation, and Sex
select <- sex %>% select(YEAR,PERWT,SLWT,Race,Occupation,Sex)

#account for sample line variable in 1940
select1 <- select %>% mutate(Weight=ifelse(YEAR==1940,SLWT,PERWT))

#Group by Year, Sex, Race, and Occupation
f1 <- select1 %>% group_by(YEAR, Sex, Race,Occupation) %>% summarise(Number=sum(Weight))


#Export figures to png
png('Graph3.png',height=500,width=1000)


#graph it!
ggplot(data=arrange(f1,Occupation),aes(x=YEAR, y=Number,fill=Occupation)) +
  geom_bar(stat='identity',position='fill') +
  labs(x='Year', y='Percent of Population', fill='Occupation',title='Occupation of Armenians and Non-Armenians Aged 15-65 by Year and Sex in Worcester, MA, 1900-2000') +
  scale_y_continuous(labels=scales::percent) +
  scale_x_continuous(breaks=c(1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000)) +
  scale_fill_brewer(palette='Set1') +
  facet_grid(Sex~.~Race) +
  theme_bw()+ theme(legend.position='bottom')

dev.off()
	#Data visualizations

	#1. Animated Population Map of Identifiable Armenians

	#load packages
	library(readr)
	library(dplyr)
	library(ggplot2)
	library(RColorBrewer)
	library(ggmap)
	library(maptools)
	library(gtools)

	#read in map data
	mapdata <- read_csv('map.csv')

	#create map outline
	map1 <- ggplot() + theme_nothing(legend=TRUE) +
	geom_polygon(data=mapdata, aes(x=long,y=lat,group=group),fill='white',color='black')
	png('map.png',width=1500,height=1000)
	print(map1)
	dev.off()

	#read in IPUMS data
	data <- read_csv('finalproj.csv',col_types=cols(PERWT=col_double(),SLWT=col_double()))

	#filter by birthplace, mother's birthplace, father's birthplace, mother tongue, and ancestry
	birthplace <- data %>% filter(BPLD==46540 \| MBPLD==46540 \| FBPLD==46540 \| MTONGUE==28 \| MMTONGUE==28 \| FMTONGUE==28\| LANGUAGE==28 \| ANCESTR1==431 \| ANCESTR2==431)

	#weight by PERWT in every year, except for 1940 when sample line variable is used
	birthplace2 <- birthplace %>% mutate(Weight=ifelse(YEAR==1940,SLWT,PERWT))

	#group by YEAR and STATEFIP
	birthplace3 <- birthplace2 %>% group_by(YEAR,STATEFIP) %>% summarise(Number=sum(Weight))

	#quantcut to determine quartiles of data
	cuts <- quantcut(birthplace3$Number,q=seq(0,1,.1))

	table(cuts)

	#creating number categories for population
	bplcats <- birthplace3 %>% mutate(Population=factor(ifelse(Number<300,1,
	ifelse(Number<1000,2,
	ifelse(Number<2500,3,
	ifelse(Number<5000,4,
	ifelse(Number<10000,5,6)))))))
	levels(bplcats$Population) <- c('1-299','300-999','1000-2,499','2,500-4,999','5,000-9,999','10,000+')

	table(bplcats$Population)

	newmap <- mapdata %>% mutate(STATEI=as.integer(STATEFIP))

	#join bplcats with newmap
	amap <- left_join(bplcats,newmap,by=c('STATEFIP'='STATEI'))

	amap <- amap %>% arrange(order)

	#map it!
	map2 <- ggplot() + theme_nothing(legend=TRUE) + scale_fill_brewer(palette='Blues') +
	geom_polygon(data=mapdata, aes(x=long,y=lat,group=group),fill='white',color='black') +
	geom_polygon(data=filter(amap,YEAR==1900),aes(x=long,y=lat,group=group,fill=Population),color='black')
	png('map.png',width=1500,height=1000)
	print(map2)
	dev.off()


	#create for loop for year
	for (year in unique(amap$YEAR)) {
	map2 <- map1 + scale_fill_brewer(palette='Blues') + theme_bw(base_size = 24) +
	geom_polygon(data=filter(amap,YEAR==year),aes(x=long,y=lat,group=group,fill=Population),color='black') +
	labs(title=paste('Persons identified as Armenian,',year,sep=' '))
	png(paste('map_',year,'.png',sep=''),width=1500,height=1000)
	print(map2)
	dev.off()
	}

	#animate map!
	install.packages('devtools')
	library(devtools)
	devtools::install_github('dgrtwo/gganimate')


	library(gganimate)

	anmap <- map2 + scale_fill_brewer(palette='Blues') +
	geom_polygon(data=amap,aes(x=long,y=lat,group=group,fill=Population,frame=YEAR),color='black') +
	labs(title='Persons identified as Armenian,')

	gg_animate(anmap,ani.width=1500,ani.height=1000,'anmap.gif')



	#2. Clustered Column Graph of Sex Ratio of Identifiable Armenians and Non-Armenians

	#load packages
	library(dplyr)
	library(readr)
	library(ggplot2)
	library(RColorBrewer)

	#read in IPUMS data and account for parsing errors
	ipums <- read_csv('finalproj.csv',col_types=cols(PERWT=col_double()))

	#create Worcester city variable
	worc <- ipums %>% filter(CITY==7570)

	#create character for Race
	pop <- worc %>% mutate(Race=ifelse(BPLD==46540 \| MBPLD==46540 \| FBPLD==46540 \| MTONGUE==28 \| MMTONGUE==28 \| FMTONGUE==28 \| LANGUAGE==28 \| ANCESTR1==431 \| ANCESTR2==431,'Armenian', 'Non-Armenian'))

	#write .csv file
	write_csv(worc,'worc.csv')

	#replace NA with 0 to create character above
	worc[is.na(worc)] <- 0

	#recode SEX
	sex <- pop %>% mutate(Sex=ifelse(SEX==1,'Male','Female'))

	#select Year, PERWT, SLWT, Race
	new <- sex %>% select(YEAR,PERWT,SLWT, Race,Sex)

	#account for sample line variable in 1940
	new1 <- new %>% mutate(Weight=ifelse(YEAR==1940,SLWT,PERWT))

	#Group by Year and Race
	f1 <- new1 %>% group_by(Race,Sex,YEAR) %>% summarise(Number=sum(Weight))

	#Export figures to png
	png('Graph2.png',height=500,width=1000)

	#graphing
	ggplot(data=f1,aes(x=YEAR, y=Number,fill=Sex)) +
	geom_bar(stat='identity',position = 'dodge') +
	labs(x='Year', y='Population', fill='',title='Population of Armenians and Non-Armenians in Worcester, MA by Year and Sex, 1900-2000') +
	scale_y_continuous(labels=scales::comma) +
	scale_x_continuous(breaks=c(1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000)) +
	scale_fill_brewer(palette='Set2') +
	facet_wrap(~Race,scales='free_y') +
	theme_bw()

	dev.off()



	#3. Stacked Column Graph of Occupation of Identifiable Armenians and Non-Armenians

	#load packages
	library(dplyr)
	library(readr)
	library(ggplot2)
	library(RColorBrewer)

	#read in IPUMS data & omit non-working age people
	ipums <- read_csv('finalproj.csv',col_types=cols(PERWT=col_double())) %>% filter(AGE>=15 & AGE<=65)

	#create Worcester city variable
	worc <- ipums %>% filter(CITY==7570)

	#creat a character for identifiable Armenians and non-Armenians
	pop1 <- worc %>% mutate(Race=ifelse(BPLD==46540 \| MBPLD==46540 \| FBPLD==46540 \| MTONGUE==28 \| MMTONGUE==28 \| FMTONGUE==28 \| LANGUAGE==28 \| ANCESTR1==431 \| ANCESTR2==431,'Armenian', 'Non-Armenian'))

	#write .csv file
	write_csv(worc,'worc.csv')

	#replace NA with 0 to create character above
	worc[is.na(worc)] <- 0

	#create Occupation variable by recoding OCC1950
	occ <- pop1 %>% mutate(Occupation=factor(ifelse(OCC1950 %in% c(980:999), 1,
	ifelse(OCC1950 %in% c(100) \| OCC1950 %in% c(123) \| OCC1950 %in% c(810:840) , 2,
	ifelse(OCC1950 %in% c(500:690) \| OCC1950 %in% c(910:979),3,
	ifelse(OCC1950 %in% c(200:490), 4,
	ifelse(OCC1950 %in% c(700:790) ,5,
	ifelse(OCC1950 %in% c(000:099),6,NA)))))),
	labels=c('None','Farmers/Farm Laborers','Craftsmen/Operatives/Laborers',
	'Managerial/Clerical/Sales','Service',
	'Professional')))


	#recode SEX
	sex <- occ %>% mutate(Sex=ifelse(SEX==1,'Male','Female'))

	#narrow down to variables I need by selecting YEAR, PERWT, SLWT, Race, Occupation, and Sex
	select <- sex %>% select(YEAR,PERWT,SLWT,Race,Occupation,Sex)

	#account for sample line variable in 1940
	select1 <- select %>% mutate(Weight=ifelse(YEAR==1940,SLWT,PERWT))

	#Group by Year, Sex, Race, and Occupation
	f1 <- select1 %>% group_by(YEAR, Sex, Race,Occupation) %>% summarise(Number=sum(Weight))


	#Export figures to png
	png('Graph3.png',height=500,width=1000)


	#graph it!
	ggplot(data=arrange(f1,Occupation),aes(x=YEAR, y=Number,fill=Occupation)) +
	geom_bar(stat='identity',position='fill') +
	labs(x='Year', y='Percent of Population', fill='Occupation',title='Occupation of Armenians and Non-Armenians Aged 15-65 by Year and Sex in Worcester, MA, 1900-2000') +
	scale_y_continuous(labels=scales::percent) +
	scale_x_continuous(breaks=c(1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000)) +
	scale_fill_brewer(palette='Set1') +
	facet_grid(Sex~.~Race) +
	theme_bw()+ theme(legend.position='bottom')

	dev.off()