# dewittpe/select_cols.R

## select_cols.R
# Example using regular expressions and setting col_types for use with
# readr::read_delim

# function select_cols
#   Build a compact col_types string that keeps only the columns whose names
#   match the supplied regular expressions.
# args:
#   clnms  a character vector of column names
#   rexprs a character vector of regular expressions to search clnms for.  These
#          rexprs select the columns from the .csv
#   types  a character vector of "l", "i", "d", "c" for logical, integer,
#          double, and character, one entry per element of rexprs.  see
#          documentation for readr for more detail
# return:
#   a character string with one character per element of clnms, to pass to the
#   col_types argument of readr::read_csv().  Columns matched by no rexprs get
#   "_" (readr's compact code for a skipped column); a column matched by
#   several rexprs gets the type of the last rexprs that matches it.
# errors:
#   stops when rexprs and types do not have the same length.

select_cols <- function(clnms, rexprs, types) {
  if (length(rexprs) != length(types)) {
    stop("length(rexprs) != length(types)")
  }

  # Every column starts out skipped; matches below overwrite this default.
  cls <- rep("_", length(clnms))

  # Apply the patterns in order so that later patterns override earlier ones.
  for (i in seq_along(rexprs)) {
    cls[grep(rexprs[i], clnms)] <- types[i]
  }

  paste(cls, collapse = "")
}


# Example data: a small CSV document held in a single string literal.  The
# header row names an unnamed first column, sid, sex, age, group, proc1 through
# proc12, and procdos1 through procdos12; ten records follow.
input_data <-
'"","sid","sex","age","group","proc1","proc2","proc3","proc4","proc5","proc6","proc7","proc8","proc9","proc10","proc11","proc12","procdos1","procdos2","procdos3","procdos4","procdos5","procdos6","procdos7","procdos8","procdos9","procdos10","procdos11","procdos12"
"1",1,"F",8,"C",763.83,441.6,400.08,708.04,138.69,623.44,700.68,663.61,648.04,421.05,913.26,233.6,4,0,5,14,2,33,7,1,36,28,2,3
"2",2,"F",13,"A",829.94,492.19,412.07,984.53,226.43,242.11,100.22,295.44,853.11,469.57,643.12,172.24,9,18,5,14,10,14,8,0,10,7,12,9
"3",3,"M",17,"C",449.29,133.69,458.63,783.58,294.74,423.12,287.71,294.91,776.36,616.12,668.35,517.66,5,7,2,22,30,35,0,7,39,2,15,6
"4",4,"F",5,"C",716.65,976.18,806.22,609.83,531.45,681.06,939.72,450.05,507.45,630.7,943.64,801.42,12,18,7,28,11,0,38,10,9,16,8,8
"5",5,"F",8,"A",103.55,488.57,135.04,864.71,277.67,798.23,933.07,948.2,582.21,747.68,865.43,760.17,3,4,4,4,0,13,5,9,12,9,1,13
"6",6,"F",17,"B",849.62,961.81,773.91,270.52,747.41,607.28,760.68,966.34,583.63,455.47,621.83,835.5,5,13,4,4,1,21,24,13,10,9,2,25
"7",7,"F",18,"B",106.6,898.97,709.54,344.16,107.1,310.33,399.76,765.86,101.24,927.27,839.26,253.14,18,13,0,3,0,1,2,9,25,24,4,5
"8",8,"M",2,"C",286.89,675.97,254.14,845.33,437.94,180.98,563.55,759.91,420.1,966.3,202.35,950.24,35,7,8,8,2,22,4,33,5,25,5,11
"9",9,"F",9,"B",915.93,973.86,334.98,723.88,562.96,177.05,769.57,582.18,650.91,310.17,788.05,364.26,3,6,19,1,9,20,22,18,2,15,0,30
"10",10,"F",10,"C",650.59,656.95,562.97,316.49,101.41,374.69,657.24,102.05,846.04,752.04,661.25,234.16,19,1,5,0,18,0,14,1,15,3,12,30
'
# Grab only the header: parse a single record and keep the column names.
input_data_clnms <- colnames(
  readr::read_csv(input_data, col_names = TRUE, n_max = 1)[1, ]
)


# Build the col_types specification with select_cols so that only the id, age,
# group, procedures 1 through 5, and the day of service for each of those
# procedures are read.

pick_these_columns <- select_cols(
  types  = c("i", "d", "c"),
  rexprs = c("sid|procdos[1-5]$", "age|proc[1-5]$", "group"),
  clnms  = input_data_clnms
)

pick_these_columns

# Finally, read the data using that specification.
readr::read_csv(input_data, col_types = pick_these_columns)
	# Example using regular expressions and setting col_types for use with
	# readr::read_delim

	# function select_cols
	#   Build a compact col_types string that keeps only the columns whose names
	#   match the supplied regular expressions.
	# args:
	#   clnms  a character vector of column names
	#   rexprs a character vector of regular expressions to search clnms for.
	#          These rexprs select the columns from the .csv
	#   types  a character vector of "l", "i", "d", "c" for logical, integer,
	#          double, and character, one entry per element of rexprs.  see
	#          documentation for readr for more detail
	# return:
	#   a character string with one character per element of clnms, to pass to
	#   the col_types argument of readr::read_csv().  Columns matched by no
	#   rexprs get "_" (readr's compact code for a skipped column); a column
	#   matched by several rexprs gets the type of the last rexprs that matches.
	# errors:
	#   stops when rexprs and types do not have the same length.

	select_cols <- function(clnms, rexprs, types) {
	  if (length(rexprs) != length(types)) {
	    stop("length(rexprs) != length(types)")
	  }

	  # Every column starts out skipped; matches below overwrite this default.
	  cls <- rep("_", length(clnms))

	  # Apply the patterns in order so that later patterns override earlier ones.
	  for (i in seq_along(rexprs)) {
	    cls[grep(rexprs[i], clnms)] <- types[i]
	  }

	  paste(cls, collapse = "")
	}


	# Example data: a small CSV document held in a single string literal.  The
	# header row names an unnamed first column, sid, sex, age, group, proc1
	# through proc12, and procdos1 through procdos12; ten records follow.
	# The records and the closing quote start at column 0 on purpose: any
	# indentation inside the literal would become part of the CSV text.
	input_data <-
	'"","sid","sex","age","group","proc1","proc2","proc3","proc4","proc5","proc6","proc7","proc8","proc9","proc10","proc11","proc12","procdos1","procdos2","procdos3","procdos4","procdos5","procdos6","procdos7","procdos8","procdos9","procdos10","procdos11","procdos12"
"1",1,"F",8,"C",763.83,441.6,400.08,708.04,138.69,623.44,700.68,663.61,648.04,421.05,913.26,233.6,4,0,5,14,2,33,7,1,36,28,2,3
"2",2,"F",13,"A",829.94,492.19,412.07,984.53,226.43,242.11,100.22,295.44,853.11,469.57,643.12,172.24,9,18,5,14,10,14,8,0,10,7,12,9
"3",3,"M",17,"C",449.29,133.69,458.63,783.58,294.74,423.12,287.71,294.91,776.36,616.12,668.35,517.66,5,7,2,22,30,35,0,7,39,2,15,6
"4",4,"F",5,"C",716.65,976.18,806.22,609.83,531.45,681.06,939.72,450.05,507.45,630.7,943.64,801.42,12,18,7,28,11,0,38,10,9,16,8,8
"5",5,"F",8,"A",103.55,488.57,135.04,864.71,277.67,798.23,933.07,948.2,582.21,747.68,865.43,760.17,3,4,4,4,0,13,5,9,12,9,1,13
"6",6,"F",17,"B",849.62,961.81,773.91,270.52,747.41,607.28,760.68,966.34,583.63,455.47,621.83,835.5,5,13,4,4,1,21,24,13,10,9,2,25
"7",7,"F",18,"B",106.6,898.97,709.54,344.16,107.1,310.33,399.76,765.86,101.24,927.27,839.26,253.14,18,13,0,3,0,1,2,9,25,24,4,5
"8",8,"M",2,"C",286.89,675.97,254.14,845.33,437.94,180.98,563.55,759.91,420.1,966.3,202.35,950.24,35,7,8,8,2,22,4,33,5,25,5,11
"9",9,"F",9,"B",915.93,973.86,334.98,723.88,562.96,177.05,769.57,582.18,650.91,310.17,788.05,364.26,3,6,19,1,9,20,22,18,2,15,0,30
"10",10,"F",10,"C",650.59,656.95,562.97,316.49,101.41,374.69,657.24,102.05,846.04,752.04,661.25,234.16,19,1,5,0,18,0,14,1,15,3,12,30
'
	# Read in the names of the data set: parse a single record and keep only
	# the resulting column names.
	input_data_clnms <-
	  names(readr::read_csv(input_data, n_max = 1, col_names = TRUE)[1, ])


	# use the select_cols function to read in only the id, age, group,
	# procedures 1 through 5 and the day of service for each procedure.
	#
	# The alternation operator is written as a plain "|".  A backslash before it
	# ("\|") is not a valid escape in an R character constant and stops the
	# script from parsing.

	pick_these_columns <-
	  select_cols(clnms  = input_data_clnms,
	              rexprs = c("sid|procdos[1-5]$", "age|proc[1-5]$", "group"),
	              types  = c("i", "d", "c"))

	pick_these_columns

	# now read in the data
	readr::read_csv(input_data, col_types = pick_these_columns)