Skip to content

Instantly share code, notes, and snippets.

@ajdamico ajdamico/ncvs_bughunt.R
Last active Feb 24, 2016

Embed
What would you like to do?
library(downloader)
# install.packages( c("MonetDB.R", "MonetDBLite" , "survey" , "SAScii" , "descr" , "downloader" , "digest" , "stringr" , "R.utils" , "RCurl" ) , repos=c("http://dev.monetdb.org/Assets/R/", "http://cran.rstudio.com/"))
library(SAScii) # load the SAScii package (imports ascii data with a SAS script)
library(RCurl) # load RCurl package (downloads https files)
library(stringr) # load stringr package (manipulates character strings easily)
library(downloader) # downloads and then runs the source() function on scripts from github
library(MonetDB.R) # load the MonetDB.R package (connects r to a monet database)
library(MonetDBLite) # load MonetDBLite package (creates database files in R)
library(descr) # load the descr package (converts fixed-width files to delimited files)
library(R.utils) # load the R.utils package (counts the number of lines in a file quickly)
library(foreign) # load foreign package (converts data files into R)
tf <- tempfile()
source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/MonetDB/read.SAScii.monetdb.R" , prompt = FALSE )
dbfolder <- paste0( getwd() , "/MonetDB" )
db <- dbConnect( MonetDBLite() , dbfolder )
datafolder <- paste0( getwd() , "/data" )
dir.create( datafolder )
# initiate a curl handle so the remote server knows it's you.
curl = getCurlHandle()
# set a cookie file on the local disk
curlSetOpt(
cookiejar = 'cookies.txt' ,
followlocation = TRUE ,
autoreferer = TRUE ,
curl = curl
)
dp <- "http://www.icpsr.umich.edu/cgi-bin/bob/terms2?study=36044&ds=1&bundle=ascsas&path=NACJD"
# post your username and password to the umich server
login.page <-
postForm(
"http://www.icpsr.umich.edu/ticketlogin" ,
email = your.username ,
password = your.password ,
path = "NACJD" ,
request_uri = dp ,
style = "POST" ,
curl = curl
)
# consent to terms of use page
terms.of.use.page <-
postForm(
"http://www.icpsr.umich.edu/cgi-bin/terms" ,
agree = 'yes' ,
path = "NACJD" ,
study = "36044" ,
ds = "1" ,
bundle = "ascsas" ,
dups = "yes" ,
style = "POST" ,
curl = curl
)
# download the current sas file onto the local disk
this.sas_ri <- getBinaryURL( dp , curl = curl )
# save the actual downloaded-file to the filepath specified on the local disk
writeBin( this.sas_ri , paste0( datafolder , "/myfile.zip" ) )
# unzip the downloaded file within the local drive
z <- unzip( paste0( datafolder , "/myfile.zip" ) , exdir = datafolder )
# determine the filenames that end with `sas`
sas.import <- z[ grep( "sas$" , tolower( z ) ) ]
# determine the filenames containing the word `data`
data.file <- z[ grep( "data" , tolower( basename( z ) ) ) ]
tablename <- 'x36044_0001'
# read the data file into an r sqlite database
read.SAScii.monetdb(
fn = data.file ,
sas_ri = sas.import ,
tl = TRUE , # convert all column names to lowercase?
tablename = tablename ,
skip.decimal.division = TRUE ,
conn = db
)
# figure out which variables need to be recoded to system missing #
# read the entire sas import script into a character vector
recode.lines <- toupper( readLines( sas.import ) )
# look for the start of the system missing recode block
mvr <- intersect( grep( "RECODE TO SAS SYSMIS" , recode.lines ) , grep( "USER-DEFINED MISSING VALUE" , recode.lines ) )
# if there's just one..
if ( length( mvr ) == 1 ){
# isolate the recode lines
recode.lines <- recode.lines[ mvr:length( recode.lines ) ]
# find all lines that start with an IF statement and end with a semicolon
lines.with.if <- grep( "IF (.*);" , recode.lines )
# confirm all of those lines have a sas missing value (a dot) somewhere in there.
lines.with.dots <- grep( "\\." , recode.lines )
# if the lines don't match up, fail cuz something's wrong. terribly terribly wrong.
if ( length( lines.with.if[ !( lines.with.if %in% lines.with.dots ) ] ) > 0 ) stop( "some recode line is recoding to something other than missing" )
# further limit the recode lines to only lines containing an if block
recodes <- recode.lines[ lines.with.if ]
# break the recode lines up by semicolons, in case there's more than one per line
recodes <- unlist( strsplit( recodes , ";" ) )
# remove the word `IF `
recodes <- gsub( "IF " , "" , recodes )
# remove leading and trailing whitespace
recodes <- str_trim( recodes )
# remove empty strings
recodes <- recodes[ recodes != '' ]
# find which variables need to be replaced by extracting whatever's directly in front of the equals sign
vtr <- str_trim( tolower( gsub( "(.*) THEN( ?)(.*)( ?)=(.*)" , "\\3" , recodes ) ) )
# remove everything after the `THEN` block..
ptm <- gsub( " THEN( ?)(.*)" , "" , recodes )
# ..to create a vector of patterns to match
ptm <- tolower( str_trim( ptm ) )
}
print( ptm )
if ( dbGetQuery( db , paste0( 'select count(*) from ' , tablename ) )[ 1 , 1 ] < 100000 ){
print('before dbreadtable')
# pull the data file into working memory
x <- dbReadTable( db , tablename )
print('after dbreadtable')
# if there are any missing values to recode
if ( length( mvr ) == 1 ){
print('after length(mvr)==1')
# loop through each variable to recode
for ( k in seq_along( vtr ) ){
print('after k in seq_along(vtr)')
# overwrite sas syntax with r syntax in the patterns to match commands.
r.command <- gsub( "=" , "==" , ptm[ k ] )
r.command <- gsub( " or " , "|" , r.command )
r.command <- gsub( " and " , "&" , r.command )
r.command <- gsub( " in \\(" , " %in% c\\(" , r.command )
cat( r.command , '\r\n' )
# wherever the pattern has been matched, overwrite the current variable with a missing
x[ with( x , which( eval( parse( text = r.command ) ) ) ) , vtr[ k ] ] <- NA
# if a column is *only* NAs then delete it
if( all( is.na( x[ , vtr[ k ] ] ) ) ) x[ , vtr[ k ] ] <- NULL
# clear up RAM
gc()
}
print("before remove table")
# remove the current data table from the database
dbRemoveTable( db , tablename )
print("before write table" )
# ..and overwrite it with the data.frame object
# that you've just blessedly cleaned up
dbWriteTable( db , tablename , x )
}
print('before save' )
# save the r data.frame object to the local disk as an `.rda`
save( x , file = gsub( "\\-Data\\.txt$" , "rda" , data.file ) )
# remove the object from working memory
rm( x )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.