ajdamico/analyzing big survey data with limited computing resources.R

## analyzing big survey data with limited computing resources.R
########################################
###read large csv into SQL DB
########################################

#set to the number of GB of RAM on computer
gbram <- 0.5

#set to CSV file directory
setwd("C:\\American Community Survey\\2009\\")

#program start
start <- Sys.time()
chunk_size <- gbram * 100000
table_name <- "acs09"

library(RSQLite)

file_list <- c("ss09pusa.csv", "ss09pusb.csv")
file_list
input <- file(file_list[1], "r")

db <- dbConnect(SQLite(), dbname="acs09.db")
header <- readLines(input, n = 1)
fields <- strsplit(header, ",")[[1]]
colTypes <- rep("INTEGER", length(fields))
colDecl <- paste(fields, colTypes)
sql <- sprintf(
	paste("CREATE TABLE",table_name,"(%s)") ,
	paste(colDecl, collapse = ", ") )
dbGetQuery(db, sql)
colClasses <- rep("character", length(fields))
sql.in <- sprintf(
	paste("INSERT INTO",table_name,"VALUES (%s)"),
	paste(rep("?", length(fields)), collapse = ","))

dbBeginTransaction(db)
for (i in 1:length(file_list)){
	input <- file(file_list[i], "r")
	header <- readLines(input, n = 1)
	tryCatch({
	while (TRUE) {
		part <- read.table(input, nrows=chunk_size, sep=",",
			colClasses = colClasses,
			comment.char = "")
		dbGetPreparedQuery(db, sql.in, bind.data = part)
	}
	}, error = function(e) {
	if (grepl("no lines available", conditionMessage(e)))
		TRUE
	else
		stop(conditionMessage(e))
	})
}
dbCommit(db)
dbGetInfo(db)

Sys.time() - start

########################################
###means, distributions, and medians
########################################

dec_places <- "1.00000000"

# simple weighted mean on a linear variable – Age – of all US residents
dbGetQuery( db , paste("SELECT" , dec_places , "* SUM(AGEP*PWGTP) /
		SUM(PWGTP) as wgtage
		from acs09") )

# simple weighted mean on a linear variable – Age – of all US residents, by state
dbGetQuery( db , paste("SELECT ST," , dec_places , "* SUM(AGEP*PWGTP) /
		SUM(PWGTP) as wgtage
		from acs09
		GROUP BY ST") )

# simple weighted mean on a factor variable
# % with Public Health Insurance Coverage – of all US residents
dbGetQuery( db , paste("SELECT PUBCOV," , dec_places , "* sum(PWGTP) /
		(SELECT sum(PWGTP) from acs09) as pctPUBCOV
		FROM acs09
		GROUP BY PUBCOV") )

# % with Public Health Insurance Coverage – of all US residents, by state
try(dbGetQuery( db , "DROP TABLE totals_temp"),silent=T)
dbGetQuery( db , paste( "CREATE TABLE totals_temp AS
		SELECT ST," , dec_places , "* sum(PWGTP) as PWGTP FROM acs09
		GROUP BY ST") )

dbGetQuery( db , paste( "SELECT b.ST, PUBCOV,", dec_places , "* sum(b.PWGTP) /
		(a.PWGTP) as pctPUBCOV
		FROM acs09 b INNER JOIN totals_temp a
		ON a.ST == b.ST
		GROUP BY PUBCOV, b.ST")

#rough median and other quantiles on a linear variable – Income – of adult US residents
s <- dbGetQuery( db , "SELECT SUM(PWGTP) FROM acs09 WHERE AGEP > 17 AND PINCP != ''")

#find record with the desired quantile point
p_w <- s * .5
#p_w <- s * .75

#reorder table by variable of interest
try(dbGetQuery( db , "DROP TABLE ordered_temp"),silent=T)
dbGetQuery( db , "CREATE TABLE ordered_temp AS
		SELECT CAST(PINCP AS INTEGER) AS PINCP , PWGTP
		FROM acs09 WHERE PINCP != '' AND AGEP > 17
		ORDER BY PINCP ASC")

#pull single record at appropriate point in data set containing weighted median
sql.in <- paste( "SELECT PINCP , (SELECT SUM(PWGTP) FROM ordered_temp b
		WHERE b.rowid <= a.rowid) as sum_wgts FROM ordered_temp a
		WHERE sum_wgts >=" , p_w , "AND AGEP > 17 LIMIT 1")

wgtd_median <- dbGetQuery( db ,  sql.in)


########################################
###computing standard errors with sql
########################################

# weighted mean and confidence interval of a linear variable – Age – by state
i <- 1:80
replicate_sums <- paste(", ",dec_places," * SUM(AGEP * pwgtp",i,") / ",
	dec_places , "* SUM(pwgtp",i,") AS PWGTP_",i,sep="",collapse="")
sql.in <- paste("SELECT ST,", dec_places ,"* SUM(AGEP*PWGTP) /" ,
	 dec_places , "* SUM(PWGTP) AS PWGTP_A" , replicate_sums ,
	"FROM acs09 GROUP BY ST")
z <- dbGetQuery( db , sql.in )

for (i in 1:80){
      z[,paste("DIFFSQ",i,sep="")] <- ( z[,"PWGTP_A"] - z[,paste("PWGTP_",i,sep="")])^2
}
z[,"SE"] <- sqrt( rowSums(z[,grepl("DIFFSQ",names(z))]) * 4 / 80 )
z[,"UB"] <- z[,"PWGTP_A"] + 1.645 * z[,"SE"]
z[,"LB"] <- z[,"PWGTP_A"] - 1.645 * z[,"SE"]

#state code, mean, SE, 90% lower bound, 90% upper bound
z[,c("ST","PWGTP_A","SE","LB","UB")]

# weighted mean and confidence interval of a factor variable – % with Public Ins. – by state
i <- 1:80
try(dbGetQuery( db , "DROP TABLE totals_temp"),silent=T)
replicate_sums <- paste(", SUM(pwgtp",i,")  AS PWGTP_",i,sep="",collapse="")
sql.in <- paste("CREATE TABLE totals_temp AS SELECT ST, SUM(PWGTP) AS PWGTP_A",
	replicate_sums ,
	"FROM acs09 GROUP BY ST")
dbGetQuery( db , sql.in )

replicate_sums <- paste("," , dec_places , "*sum(b.PWGTP",i,") / (a.PWGTP_",i,") as pct",i,sep="",collapse="")
sql.in <- paste("SELECT b.ST, PUBCOV," , dec_places ,
	"* SUM(b.PWGTP)/(a.PWGTP_A) as pctA" , replicate_sums,
	"FROM acs09 b INNER JOIN totals_temp a ON a.ST == b.ST",
	"GROUP BY PUBCOV, b.ST")
z <- dbGetQuery( db , sql.in )

for (i in 1:80){
      z[,paste("DIFFSQ",i,sep="")] <- ( z[,"pctA"] - z[,paste("pct",i,sep="")])^2
}
z[,"SE"] <- sqrt( rowSums(z[,grepl("DIFFSQ",names(z))]) * 4 / 80 )
z[,"UB"] <- z[,"pctA"] + 1.645 * z[,"SE"]
z[,"LB"] <- z[,"pctA"] - 1.645 * z[,"SE"]

#state code, public coverage category, percent, SE, 90% lower bound, 90% upper bound
z[,c("ST","PUBCOV","pctA","SE","LB","UB")]
	########################################
	###read large csv into SQL DB
	########################################

	#set to the number of GB of RAM on computer
	gbram <- 0.5

	#set to CSV file directory
	setwd("C:\\American Community Survey\\2009\\")

	#program start
	start <- Sys.time()
	chunk_size <- gbram * 100000
	table_name <- "acs09"

	library(RSQLite)

	file_list <- c("ss09pusa.csv", "ss09pusb.csv")
	file_list
	input <- file(file_list[1], "r")

	db <- dbConnect(SQLite(), dbname="acs09.db")
	header <- readLines(input, n = 1)
	fields <- strsplit(header, ",")[[1]]
	colTypes <- rep("INTEGER", length(fields))
	colDecl <- paste(fields, colTypes)
	sql <- sprintf(
	paste("CREATE TABLE",table_name,"(%s)") ,
	paste(colDecl, collapse = ", ") )
	dbGetQuery(db, sql)
	colClasses <- rep("character", length(fields))
	sql.in <- sprintf(
	paste("INSERT INTO",table_name,"VALUES (%s)"),
	paste(rep("?", length(fields)), collapse = ","))

	dbBeginTransaction(db)
	for (i in 1:length(file_list)){
	input <- file(file_list[i], "r")
	header <- readLines(input, n = 1)
	tryCatch({
	while (TRUE) {
	part <- read.table(input, nrows=chunk_size, sep=",",
	colClasses = colClasses,
	comment.char = "")
	dbGetPreparedQuery(db, sql.in, bind.data = part)
	}
	}, error = function(e) {
	if (grepl("no lines available", conditionMessage(e)))
	TRUE
	else
	stop(conditionMessage(e))
	})
	}
	dbCommit(db)
	dbGetInfo(db)

	Sys.time() - start

	########################################
	###means, distributions, and medians
	########################################

	dec_places <- "1.00000000"

	# simple weighted mean on a linear variable – Age – of all US residents
	dbGetQuery( db , paste("SELECT" , dec_places , "* SUM(AGEP*PWGTP) /
	SUM(PWGTP) as wgtage
	from acs09") )

	# simple weighted mean on a linear variable – Age – of all US residents, by state
	dbGetQuery( db , paste("SELECT ST," , dec_places , "* SUM(AGEP*PWGTP) /
	SUM(PWGTP) as wgtage
	from acs09
	GROUP BY ST") )

	# simple weighted mean on a factor variable
	# % with Public Health Insurance Coverage – of all US residents
	dbGetQuery( db , paste("SELECT PUBCOV," , dec_places , "* sum(PWGTP) /
	(SELECT sum(PWGTP) from acs09) as pctPUBCOV
	FROM acs09
	GROUP BY PUBCOV") )

	# % with Public Health Insurance Coverage – of all US residents, by state
	try(dbGetQuery( db , "DROP TABLE totals_temp"),silent=T)
	dbGetQuery( db , paste( "CREATE TABLE totals_temp AS
	SELECT ST," , dec_places , "* sum(PWGTP) as PWGTP FROM acs09
	GROUP BY ST") )

	dbGetQuery( db , paste( "SELECT b.ST, PUBCOV,", dec_places , "* sum(b.PWGTP) /
	(a.PWGTP) as pctPUBCOV
	FROM acs09 b INNER JOIN totals_temp a
	ON a.ST == b.ST
	GROUP BY PUBCOV, b.ST")

	#rough median and other quantiles on a linear variable – Income – of adult US residents
	s <- dbGetQuery( db , "SELECT SUM(PWGTP) FROM acs09 WHERE AGEP > 17 AND PINCP != ''")

	#find record with the desired quantile point
	p_w <- s * .5
	#p_w <- s * .75

	#reorder table by variable of interest
	try(dbGetQuery( db , "DROP TABLE ordered_temp"),silent=T)
	dbGetQuery( db , "CREATE TABLE ordered_temp AS
	SELECT CAST(PINCP AS INTEGER) AS PINCP , PWGTP
	FROM acs09 WHERE PINCP != '' AND AGEP > 17
	ORDER BY PINCP ASC")

	#pull single record at appropriate point in data set containing weighted median
	sql.in <- paste( "SELECT PINCP , (SELECT SUM(PWGTP) FROM ordered_temp b
	WHERE b.rowid <= a.rowid) as sum_wgts FROM ordered_temp a
	WHERE sum_wgts >=" , p_w , "AND AGEP > 17 LIMIT 1")

	wgtd_median <- dbGetQuery( db , sql.in)


	########################################
	###computing standard errors with sql
	########################################

	# weighted mean and confidence interval of a linear variable – Age – by state
	i <- 1:80
	replicate_sums <- paste(", ",dec_places," * SUM(AGEP * pwgtp",i,") / ",
	dec_places , "* SUM(pwgtp",i,") AS PWGTP_",i,sep="",collapse="")
	sql.in <- paste("SELECT ST,", dec_places ,"* SUM(AGEP*PWGTP) /" ,
	dec_places , "* SUM(PWGTP) AS PWGTP_A" , replicate_sums ,
	"FROM acs09 GROUP BY ST")
	z <- dbGetQuery( db , sql.in )

	for (i in 1:80){
	z[,paste("DIFFSQ",i,sep="")] <- ( z[,"PWGTP_A"] - z[,paste("PWGTP_",i,sep="")])^2
	}
	z[,"SE"] <- sqrt( rowSums(z[,grepl("DIFFSQ",names(z))]) * 4 / 80 )
	z[,"UB"] <- z[,"PWGTP_A"] + 1.645 * z[,"SE"]
	z[,"LB"] <- z[,"PWGTP_A"] - 1.645 * z[,"SE"]

	#state code, mean, SE, 90% lower bound, 90% upper bound
	z[,c("ST","PWGTP_A","SE","LB","UB")]

	# weighted mean and confidence interval of a factor variable – % with Public Ins. – by state
	i <- 1:80
	try(dbGetQuery( db , "DROP TABLE totals_temp"),silent=T)
	replicate_sums <- paste(", SUM(pwgtp",i,") AS PWGTP_",i,sep="",collapse="")
	sql.in <- paste("CREATE TABLE totals_temp AS SELECT ST, SUM(PWGTP) AS PWGTP_A",
	replicate_sums ,
	"FROM acs09 GROUP BY ST")
	dbGetQuery( db , sql.in )

	replicate_sums <- paste("," , dec_places , "*sum(b.PWGTP",i,") / (a.PWGTP_",i,") as pct",i,sep="",collapse="")
	sql.in <- paste("SELECT b.ST, PUBCOV," , dec_places ,
	"* SUM(b.PWGTP)/(a.PWGTP_A) as pctA" , replicate_sums,
	"FROM acs09 b INNER JOIN totals_temp a ON a.ST == b.ST",
	"GROUP BY PUBCOV, b.ST")
	z <- dbGetQuery( db , sql.in )

	for (i in 1:80){
	z[,paste("DIFFSQ",i,sep="")] <- ( z[,"pctA"] - z[,paste("pct",i,sep="")])^2
	}
	z[,"SE"] <- sqrt( rowSums(z[,grepl("DIFFSQ",names(z))]) * 4 / 80 )
	z[,"UB"] <- z[,"pctA"] + 1.645 * z[,"SE"]
	z[,"LB"] <- z[,"pctA"] - 1.645 * z[,"SE"]

	#state code, public coverage category, percent, SE, 90% lower bound, 90% upper bound
	z[,c("ST","PUBCOV","pctA","SE","LB","UB")]