To get the data sample, we take the first 25k rows and the last 25k rows from the sample of 59 million rows, using bash:
>> tail -n25000 /2016/mbsf_abcd_summary_res000017155_req008183_2016.dat \
> sample_mbsf_abcd_summary_res000017155_req008183_2016.dat
>> head -n25000 /2016/mbsf_abcd_summary_res000017155_req008183_2016.dat \
>> sample_mbsf_abcd_summary_res000017155_req008183_2016.dat
# line count:
>> wc -l data/sample_mbsf_abcd_summary_res000017155_req008183_2016.dat
50000
This is the code (saved as job.R) used to get the summaries:
# Summarise age, sex and race from a fixed-width sample file.
#
# Field positions are 1-based character offsets, taken from the data
# documentation:
#   AGE_AT_END_REF_YR  NUM   starts at char 98,  len 3
#   SEX_IDENT_CD       CHAR  starts at char 118, len 1
#   BENE_RACE_CD       CHAR  starts at char 119, len 1
fileName <- "data/sample_mbsf_abcd_summary_res000017155_req008183_2016.dat"

# Read every line up front; `finally` guarantees the connection is closed
# even if reading fails part-way through.
conn <- file(fileName, open = "r")
linn <- tryCatch(readLines(conn), finally = close(conn))

# substr() and as.integer() are vectorized, so each field is extracted for
# all lines in a single call instead of growing a vector one row at a time.
# An empty file yields zero-length vectors rather than a spurious NA.
# Values that are not valid integers (blank or non-numeric codes) become NA.
age <- as.integer(substr(linn, 98, 100))
sex <- as.integer(substr(linn, 118, 118))
race <- as.integer(substr(linn, 119, 119))

# Number of records with age 65 or above (NA ages are not counted).
sum(age > 64, na.rm = TRUE)

# Five-number summaries (plus mean and NA count) for each field.
summary(age)
summary(sex)
summary(race)

# Number of records whose field could not be parsed as an integer.
sum(is.na(age))
sum(is.na(sex))
sum(is.na(race))

# Frequency of each observed value (table() drops NA by default).
table(age)
table(sex)
table(race)
This code was executed on RCE/HTCondor.