Created
September 13, 2015 03:48
-
-
Save dewittpe/bcc78fd45b3d4887deb5 to your computer and use it in GitHub Desktop.
Regular expressions for selecting columns to read into R via `readr::read_delim`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example using regular expressions and setting col_types for use with
# readr::read_delim
# function select_cols
#
# Build the compact column-specification string used by readr's col_types
# argument: one character per column, where columns whose names match one of
# the supplied regular expressions get the paired type code and every other
# column gets "_".
#
# args:
#   clnms   a character vector of column names, in file order
#   rexprs  a character vector of regular expressions to search clnms for.
#           These rexprs select the columns from the .csv
#   types   a character vector, the same length as rexprs, of one-character
#           type codes such as "l", "i", "d", "c" for logical, integer,
#           double, and character.  See documentation for readr for more
#           detail
#
# details:
#   The expressions are applied in order, so when a column name matches more
#   than one expression the type paired with the last matching expression
#   wins.  An expression that matches no column name is silently ignored.
#
# return:
#   a character string with one character per element of clnms, to pass to
#   the col_types argument of readr::read_csv()
select_cols <- function(clnms, rexprs, types) {
  if (length(rexprs) != length(types)) {
    stop("length(rexprs) != length(types)", call. = FALSE)
  }
  # Each column contributes exactly one character to the result; a longer (or
  # missing) type code would shift every later column out of position.
  if (anyNA(types) || any(nchar(as.character(types)) != 1L)) {
    stop("each element of types must be a single character", call. = FALSE)
  }

  # Start with every column marked "_" and overwrite the matched positions.
  cls <- rep("_", length(clnms))
  for (i in seq_along(rexprs)) {
    cls[grep(rexprs[i], clnms)] <- types[i]
  }
  paste(cls, collapse = "")
}
# Example data: an inline CSV with a header line and 10 records of 29 fields
# each -- an unnamed first column holding the row number, then sid, sex, age,
# group, proc1..proc12 and procdos1..procdos12.  The literal is single-quoted
# because the CSV itself contains double quotes.
input_data <-
'"","sid","sex","age","group","proc1","proc2","proc3","proc4","proc5","proc6","proc7","proc8","proc9","proc10","proc11","proc12","procdos1","procdos2","procdos3","procdos4","procdos5","procdos6","procdos7","procdos8","procdos9","procdos10","procdos11","procdos12"
"1",1,"F",8,"C",763.83,441.6,400.08,708.04,138.69,623.44,700.68,663.61,648.04,421.05,913.26,233.6,4,0,5,14,2,33,7,1,36,28,2,3
"2",2,"F",13,"A",829.94,492.19,412.07,984.53,226.43,242.11,100.22,295.44,853.11,469.57,643.12,172.24,9,18,5,14,10,14,8,0,10,7,12,9
"3",3,"M",17,"C",449.29,133.69,458.63,783.58,294.74,423.12,287.71,294.91,776.36,616.12,668.35,517.66,5,7,2,22,30,35,0,7,39,2,15,6
"4",4,"F",5,"C",716.65,976.18,806.22,609.83,531.45,681.06,939.72,450.05,507.45,630.7,943.64,801.42,12,18,7,28,11,0,38,10,9,16,8,8
"5",5,"F",8,"A",103.55,488.57,135.04,864.71,277.67,798.23,933.07,948.2,582.21,747.68,865.43,760.17,3,4,4,4,0,13,5,9,12,9,1,13
"6",6,"F",17,"B",849.62,961.81,773.91,270.52,747.41,607.28,760.68,966.34,583.63,455.47,621.83,835.5,5,13,4,4,1,21,24,13,10,9,2,25
"7",7,"F",18,"B",106.6,898.97,709.54,344.16,107.1,310.33,399.76,765.86,101.24,927.27,839.26,253.14,18,13,0,3,0,1,2,9,25,24,4,5
"8",8,"M",2,"C",286.89,675.97,254.14,845.33,437.94,180.98,563.55,759.91,420.1,966.3,202.35,950.24,35,7,8,8,2,22,4,33,5,25,5,11
"9",9,"F",9,"B",915.93,973.86,334.98,723.88,562.96,177.05,769.57,582.18,650.91,310.17,788.05,364.26,3,6,19,1,9,20,22,18,2,15,0,30
"10",10,"F",10,"C",650.59,656.95,562.97,316.49,101.41,374.69,657.24,102.05,846.04,752.04,661.25,234.16,19,1,5,0,18,0,14,1,15,3,12,30
'
# Read in the names of the data set.  Only the header matters here, so a
# single record is parsed and the column names are taken from the result.
input_data_clnms <-
  names(readr::read_csv(input_data, n_max = 1, col_names = TRUE))

# Use the select_cols function to read in only the id, age, group,
# procedures 1 through 5, and the day of service for each of those
# procedures.  The trailing "$" keeps "proc[1-5]" from also matching
# "proc10", "proc11" and "proc12" (likewise for "procdos").
pick_these_columns <-
  select_cols(clnms  = input_data_clnms,
              rexprs = c("sid|procdos[1-5]$", "age|proc[1-5]$", "group"),
              types  = c("i", "d", "c"))
pick_these_columns

# Now read in the data, passing the generated string as col_types so that
# only the selected columns are parsed.
readr::read_csv(input_data, col_types = pick_these_columns)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment