Skip to content

Instantly share code, notes, and snippets.

@dewittpe
Created September 13, 2015 03:48
Show Gist options
  • Save dewittpe/bcc78fd45b3d4887deb5 to your computer and use it in GitHub Desktop.
Save dewittpe/bcc78fd45b3d4887deb5 to your computer and use it in GitHub Desktop.
Regular expressions for selecting columns to read into R via `readr::read_delim`
# Example: use regular expressions to build the col_types string that tells
# readr::read_delim (here via readr::read_csv) which columns to read
# function select_cols
# Build a compact col_types string for readr by matching regular expressions
# against column names.
# args:
#   clnms   a character vector of column names, in the order they appear in
#           the file
#   rexprs  a character vector of regular expressions to search clnms for.
#           These rexprs select the columns from the .csv
#   types   a character vector, the same length as rexprs, of single-character
#           type codes: "l", "i", "d", "c" for logical, integer, double, and
#           character. See the readr documentation for more detail.
#           types[i] is assigned to every column matched by rexprs[i]
# return:
#   a single character string with one character per element of clnms, to
#   pass to the col_types argument of readr::read_csv(). Columns matched by
#   no regular expression get "_".
# errors:
#   stops if rexprs and types differ in length, or if any element of types is
#   NA or not exactly one character long
select_cols <- function(clnms, rexprs, types) {
  if (length(rexprs) != length(types)) {
    stop("length(rexprs) != length(types)")
  }
  # Every column must contribute exactly one character to the result;
  # otherwise the string no longer lines up with the columns.
  if (anyNA(types) || any(nchar(types) != 1L)) {
    stop("each element of types must be a single character")
  }

  cls <- rep("_", length(clnms))
  # Patterns are applied in order, so when a column name matches more than
  # one pattern the type of the last matching pattern wins.
  for (i in seq_along(rexprs)) {
    cls[grep(rexprs[i], clnms)] <- types[i]
  }
  paste(cls, collapse = "")
}
# Example data
# A small CSV document held in a single string literal, so the example needs
# no external file. The header row names 29 columns: an unnamed first column
# (""), sid, sex, age, group, proc1 through proc12 and procdos1 through
# procdos12. Ten data rows follow; the unnamed first column holds the quoted
# row number.
input_data <-
'"","sid","sex","age","group","proc1","proc2","proc3","proc4","proc5","proc6","proc7","proc8","proc9","proc10","proc11","proc12","procdos1","procdos2","procdos3","procdos4","procdos5","procdos6","procdos7","procdos8","procdos9","procdos10","procdos11","procdos12"
"1",1,"F",8,"C",763.83,441.6,400.08,708.04,138.69,623.44,700.68,663.61,648.04,421.05,913.26,233.6,4,0,5,14,2,33,7,1,36,28,2,3
"2",2,"F",13,"A",829.94,492.19,412.07,984.53,226.43,242.11,100.22,295.44,853.11,469.57,643.12,172.24,9,18,5,14,10,14,8,0,10,7,12,9
"3",3,"M",17,"C",449.29,133.69,458.63,783.58,294.74,423.12,287.71,294.91,776.36,616.12,668.35,517.66,5,7,2,22,30,35,0,7,39,2,15,6
"4",4,"F",5,"C",716.65,976.18,806.22,609.83,531.45,681.06,939.72,450.05,507.45,630.7,943.64,801.42,12,18,7,28,11,0,38,10,9,16,8,8
"5",5,"F",8,"A",103.55,488.57,135.04,864.71,277.67,798.23,933.07,948.2,582.21,747.68,865.43,760.17,3,4,4,4,0,13,5,9,12,9,1,13
"6",6,"F",17,"B",849.62,961.81,773.91,270.52,747.41,607.28,760.68,966.34,583.63,455.47,621.83,835.5,5,13,4,4,1,21,24,13,10,9,2,25
"7",7,"F",18,"B",106.6,898.97,709.54,344.16,107.1,310.33,399.76,765.86,101.24,927.27,839.26,253.14,18,13,0,3,0,1,2,9,25,24,4,5
"8",8,"M",2,"C",286.89,675.97,254.14,845.33,437.94,180.98,563.55,759.91,420.1,966.3,202.35,950.24,35,7,8,8,2,22,4,33,5,25,5,11
"9",9,"F",9,"B",915.93,973.86,334.98,723.88,562.96,177.05,769.57,582.18,650.91,310.17,788.05,364.26,3,6,19,1,9,20,22,18,2,15,0,30
"10",10,"F",10,"C",650.59,656.95,562.97,316.49,101.41,374.69,657.24,102.05,846.04,752.04,661.25,234.16,19,1,5,0,18,0,14,1,15,3,12,30
'
# Step 1: recover the column names by parsing only the header and a single
# data row of the example data.
input_data_clnms <- colnames(
  readr::read_csv(input_data, col_names = TRUE, n_max = 1)
)

# Step 2: translate the regular expressions into a col_types string.
#   integer   -> sid and the day of service (procdos) for procedures 1 to 5
#   double    -> age and procedures (proc) 1 to 5
#   character -> group
# Columns that match none of the patterns are not read.
pick_these_columns <- select_cols(
  clnms  = input_data_clnms,
  rexprs = c("sid|procdos[1-5]$", "age|proc[1-5]$", "group"),
  types  = c("i", "d", "c")
)
pick_these_columns

# Step 3: read the data, keeping only the selected columns.
readr::read_csv(input_data, col_types = pick_these_columns)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment