robbrit/Degree Values - Code

## Degree Values - Code
data2006 <- read.csv("census2006.csv", header = TRUE)
data1986 <- read.csv("census1986.csv", header = TRUE)

# Get the CPI for 1986, 2006, and 2010
cpi2006 <- 1.091
cpi1986 <- 0.656
cpi2010 <- 1.165

calc_avg <- function(w1986, w2006, message, png_file){
  # scale by CPIs to give 2010 dollars
  inc2006 <- w2006$empin / cpi2006 * cpi2010
  inc1986 <- w1986$wagesp / cpi1986 * cpi2010

  # get the mean and standard error to create a confidence interval for each
  # income - at this sample size, probably won't have a big interval
  mean2006 <- mean(inc2006)
  med2006 <- median(inc2006)
  se2006 <- sd(inc2006) / sqrt(length(inc2006))
  semd2006 <- 1.253 * se2006 # standard error for median is 1.253 * standard error for mean

  mean1986 <- mean(inc1986)
  med1986 <- median(inc1986)
  se1986 <- sd(inc1986) / sqrt(length(inc1986))
  semd1986 <- 1.253 * se1986

  z <- 1.96  # 95% confidence
  #z <- 1.645 # 90% confidence
  #z <- 1.282 # 80% confidence

  print(message)
  print(sprintf("Confidence for 1986: %f to %f", mean1986 - z * se1986, mean1986 + z * se1986))
  print(sprintf("Confidence for 2006: %f to %f", mean2006 - z * se2006, mean2006 + z * se2006))
  print(sprintf("Standard Deviation for 1986: %f", sd(inc1986)))
  print(sprintf("Standard Deviation for 2006: %f", sd(inc2006)))
  print(sprintf("Median Confidence for 1986: %f to %f", med1986 - z * semd1986, med1986 + z * se1986))
  print(sprintf("Median Confidence for 2006: %f to %f", med2006 - z * semd2006, med2006 + z * se2006))

  # output some histograms
  png(sprintf(png_file, 1986))
  hist(inc1986, breaks = 30, col = "lightblue")
  dev.off()

  png(sprintf(png_file, 2006))
  hist(inc2006, breaks = 30, col = "lightblue")
  dev.off()
}

# filters:
# - people who are employed in a regular job (no self-employment)
#   - this means COW (class of worker) is 4 for 2006, 1 for 1986
#   - employment income number must be available
# - people who are at least 15 years old
#   - 2006: age group is at least 6 (15-17 year olds) and not 88 (unavailable)
#   - 1986: just gives raw age
# - have a university degree
bac2006 <- data2006[which(data2006$cow == 4 & data2006$empin != 9999999 & data2006$empin != 8888888
  & data2006$agegrp >= 6 & data2006$agegrp != 88 & data2006$hdgree >= 9 & data2006$hdgree <= 13),]
bac1986 <- data1986[which(data1986$cowp == 1 & data1986$agep >= 15 & data1986$hlosp == 11),]

groups <- list(
  list("education", 1, 1),
  list("fine arts", 2, 2),
  list("humanities", 3, 3),
  list("social sciences", 4, 4),
  list("commerce/business", 6, 5),
  list("health/food sciences", 7, 6),
  list("engineering", 8, 7),
  list("sciences", 12, 10)
)

for (group in groups){
  group2006 <- bac2006[which(bac2006$mfs == group[3]),]
  group1986 <- bac1986[which(bac1986$dgmfs == group[2]),]

  calc_avg(group1986, group2006, sprintf("Average wages for degree in %s:", group[1]), "wages_uni%d.png")
  print("")
}
	data2006 <- read.csv("census2006.csv", header = TRUE)
	data1986 <- read.csv("census1986.csv", header = TRUE)

	# Get the CPI for 1986, 2006, and 2010
	cpi2006 <- 1.091
	cpi1986 <- 0.656
	cpi2010 <- 1.165

	calc_avg <- function(w1986, w2006, message, png_file){
	# scale by CPIs to give 2010 dollars
	inc2006 <- w2006$empin / cpi2006 * cpi2010
	inc1986 <- w1986$wagesp / cpi1986 * cpi2010

	# get the mean and standard error to create a confidence interval for each
	# income - at this sample size, probably won't have a big interval
	mean2006 <- mean(inc2006)
	med2006 <- median(inc2006)
	se2006 <- sd(inc2006) / sqrt(length(inc2006))
	semd2006 <- 1.253 * se2006 # standard error for median is 1.253 * standard error for mean

	mean1986 <- mean(inc1986)
	med1986 <- median(inc1986)
	se1986 <- sd(inc1986) / sqrt(length(inc1986))
	semd1986 <- 1.253 * se1986

	z <- 1.96 # 95% confidence
	#z <- 1.645 # 90% confidence
	#z <- 1.282 # 80% confidence

	print(message)
	print(sprintf("Confidence for 1986: %f to %f", mean1986 - z * se1986, mean1986 + z * se1986))
	print(sprintf("Confidence for 2006: %f to %f", mean2006 - z * se2006, mean2006 + z * se2006))
	print(sprintf("Standard Deviation for 1986: %f", sd(inc1986)))
	print(sprintf("Standard Deviation for 2006: %f", sd(inc2006)))
	print(sprintf("Median Confidence for 1986: %f to %f", med1986 - z * semd1986, med1986 + z * se1986))
	print(sprintf("Median Confidence for 2006: %f to %f", med2006 - z * semd2006, med2006 + z * se2006))

	# output some histograms
	png(sprintf(png_file, 1986))
	hist(inc1986, breaks = 30, col = "lightblue")
	dev.off()

	png(sprintf(png_file, 2006))
	hist(inc2006, breaks = 30, col = "lightblue")
	dev.off()
	}

	# filters:
	# - people who are employed in a regular job (no self-employment)
	# - this means COW (class of worker) is 4 for 2006, 1 for 1986
	# - employment income number must be available
	# - people who are at least 15 years old
	# - 2006: age group is at least 6 (15-17 year olds) and not 88 (unavailable)
	# - 1986: just gives raw age
	# - have a university degree
	bac2006 <- data2006[which(data2006$cow == 4 & data2006$empin != 9999999 & data2006$empin != 8888888
	& data2006$agegrp >= 6 & data2006$agegrp != 88 & data2006$hdgree >= 9 & data2006$hdgree <= 13),]
	bac1986 <- data1986[which(data1986$cowp == 1 & data1986$agep >= 15 & data1986$hlosp == 11),]

	groups <- list(
	list("education", 1, 1),
	list("fine arts", 2, 2),
	list("humanities", 3, 3),
	list("social sciences", 4, 4),
	list("commerce/business", 6, 5),
	list("health/food sciences", 7, 6),
	list("engineering", 8, 7),
	list("sciences", 12, 10)
	)

	for (group in groups){
	group2006 <- bac2006[which(bac2006$mfs == group[3]),]
	group1986 <- bac1986[which(bac1986$dgmfs == group[2]),]

	calc_avg(group1986, group2006, sprintf("Average wages for degree in %s:", group[1]), "wages_uni%d.png")
	print("")
	}