erikerhardt/Open zip and read csv workflow.R

## Open zip and read csv workflow.R
# don't print readr::read_csv() column types
options(readr.num_columns = 0)
options(readr.show_col_types = FALSE)
# remove attributes after readr::read_csv()
# attr(df, "spec") <- NULL

# remove $ and , from dollar amounts; (5) is negative 5, so remove ")" and replace "(" with "-"
`Demand Sales`      = gsub("[(]", "-", gsub("[\\$,)]", "", `Demand Sales`      )) %>% as.numeric()


# which to use?
tab_list_to_use <-
  c(
  #  "Tab A"
  #, "Tab B"
    "Tab C"
  , "Tab D"
  #, "Tab E"
  #, "Tab F"
  #, "Tab G"
  #, "Tab H"
  , "Tab I"
  #, "Tab J"
  #, "Tab K1"
  #, "Tab K2"
  #, "Tab L"
  #, "Tab M"
  #, "Tab N"
  )


## ## https://stackoverflow.com/questions/46682818/read-a-csv-file-in-a-zipped-folder-with-r-without-unzipping
## # just a list of files inside master.zip
## master <- as.character(unzip("master.zip", list = TRUE)$Name)
## # load the first file "file1.csv"
## data <- read.csv(unz("master.zip", "file1.csv"), header = TRUE, sep = ",")


## Read files and get site and table names
fn_path  <-
  "../data"

list_files <-
  list.files(
    path = fn_path
  , pattern = "*.zip"
  )

list_files_dates <-
  list_files %>%
  str_split_fixed(
    pattern = fixed("_")
  , n = 5
  ) %>%
  as_tibble() %>%
  select(5) %>%
  unlist() %>%
  str_split(
    pattern = fixed(".zip")
  #, n = 2
  , simplify = TRUE
  ) %>%
  as_tibble() %>%
  select(1) %>%
  unlist()

list_files_all <-
  tibble(
    fn   = list_files
  , date = list_files_dates
  )


# list to hold all data
dat_sheet <- list()

# initialize tables
for (n_tab in tab_list_to_use) {
  dat_sheet[[ n_tab ]] <- list()
}

for (i_row in 1:nrow(list_files_all)) {
  ## i_row = 1

  message("Reading file: ", list_files_all[["fn"]][i_row])

  ## ## https://stackoverflow.com/questions/46682818/read-a-csv-file-in-a-zipped-folder-with-r-without-unzipping
  ## # just a list of files inside master.zip
  ## master <- as.character(unzip("master.zip", list = TRUE)$Name)
  ## # load the first file "file1.csv"
  ## data <- read.csv(unz("master.zip", "file1.csv"), header = TRUE, sep = ",")

  zip_file_names <-
    file.path(
      fn_path
    , list_files_all[["fn"]][i_row]
    ) %>%
    unzip(list = TRUE) %>%
    pull(Name) %>%
    as.character()

  # "Tab A - Summary.csv"
  # "Tab B- Summary Detail.csv"
  # "Tab C- Age and Gender Detail.csv"
  # "Tab D- Assessment Detail.csv"
  # "Tab E - Master HH Detail.csv"
  # "Tab F - Client HH Detail.csv"
  # "Tab G - Master HH Membership.csv"
  # "Tab H - Client HH Membership.csv"
  # "Tab I - Entry Exit Detail.csv"
  # "Tab J - Income Detail.csv"
  # "Tab K1 - NC Benefit Detail.csv"
  # "Tab K2 - Ins Benefit Detail.csv"
  # "Tab L - Disability Detail.csv"
  # "Tab M - Duplicate Clients.csv"
  # "Tab N - Additional Information.csv"

  ind_files <-
    which(
      zip_file_names %>%
      str_split_fixed(
        pattern = fixed("-")
      , n = 2
      #, simplify = TRUE
      ) %>%
      as_tibble() %>%
      select(1) %>%
      unlist() %>%
      str_trim()
        %in%
      tab_list_to_use
    )

  for (i_csv in seq_along(ind_files)) {
    # i_csv = 1

    dat_sheet[[ tab_list_to_use[i_csv] ]][[ list_files_all[["date"]][i_row] ]] <-
      # open zip file
      unz(
        description =
          file.path(
            fn_path
          , list_files_all[["fn"]][i_row]
          )
      , filename =
          zip_file_names[ind_files][i_csv]
      ) %>%
      # read csv sheet
      read.csv(
        header  = TRUE
      , sep     = ","
      ) %>%
      # track which file these data came from
      mutate(
        sheet_date = list_files_all[["date"]][i_row]
      , sheet_name = tab_list_to_use[i_csv]
      )
  }
}


# bind all dates together
for (n_tab in tab_list_to_use) {
  dat_sheet[[ n_tab ]] <-
    bind_rows(
      dat_sheet[[ n_tab ]]
    )
}

sapply(X = dat_sheet, FUN = dim)
	# don't print readr::read_csv() column types
	options(readr.num_columns = 0)
	options(readr.show_col_types = FALSE)
	# remove attributes after readr::read_csv()
	# attr(df, "spec") <- NULL

	# remove $ and , from dollar amounts; (5) is negative 5, so remove ")" and replace "(" with "-"
	`Demand Sales` = gsub("[(]", "-", gsub("[\\$,)]", "", `Demand Sales` )) %>% as.numeric()


	# which to use?
	tab_list_to_use <-
	c(
	# "Tab A"
	#, "Tab B"
	"Tab C"
	, "Tab D"
	#, "Tab E"
	#, "Tab F"
	#, "Tab G"
	#, "Tab H"
	, "Tab I"
	#, "Tab J"
	#, "Tab K1"
	#, "Tab K2"
	#, "Tab L"
	#, "Tab M"
	#, "Tab N"
	)



	## ## https://stackoverflow.com/questions/46682818/read-a-csv-file-in-a-zipped-folder-with-r-without-unzipping
	## # just a list of files inside master.zip
	## master <- as.character(unzip("master.zip", list = TRUE)$Name)
	## # load the first file "file1.csv"
	## data <- read.csv(unz("master.zip", "file1.csv"), header = TRUE, sep = ",")




	## Read files and get site and table names
	fn_path <-
	"../data"

	list_files <-
	list.files(
	path = fn_path
	, pattern = "*.zip"
	)

	list_files_dates <-
	list_files %>%
	str_split_fixed(
	pattern = fixed("_")
	, n = 5
	) %>%
	as_tibble() %>%
	select(5) %>%
	unlist() %>%
	str_split(
	pattern = fixed(".zip")
	#, n = 2
	, simplify = TRUE
	) %>%
	as_tibble() %>%
	select(1) %>%
	unlist()

	list_files_all <-
	tibble(
	fn = list_files
	, date = list_files_dates
	)


	# list to hold all data
	dat_sheet <- list()

	# initialize tables
	for (n_tab in tab_list_to_use) {
	dat_sheet[[ n_tab ]] <- list()
	}

	for (i_row in 1:nrow(list_files_all)) {
	## i_row = 1

	message("Reading file: ", list_files_all[["fn"]][i_row])

	## ## https://stackoverflow.com/questions/46682818/read-a-csv-file-in-a-zipped-folder-with-r-without-unzipping
	## # just a list of files inside master.zip
	## master <- as.character(unzip("master.zip", list = TRUE)$Name)
	## # load the first file "file1.csv"
	## data <- read.csv(unz("master.zip", "file1.csv"), header = TRUE, sep = ",")

	zip_file_names <-
	file.path(
	fn_path
	, list_files_all[["fn"]][i_row]
	) %>%
	unzip(list = TRUE) %>%
	pull(Name) %>%
	as.character()

	# "Tab A - Summary.csv"
	# "Tab B- Summary Detail.csv"
	# "Tab C- Age and Gender Detail.csv"
	# "Tab D- Assessment Detail.csv"
	# "Tab E - Master HH Detail.csv"
	# "Tab F - Client HH Detail.csv"
	# "Tab G - Master HH Membership.csv"
	# "Tab H - Client HH Membership.csv"
	# "Tab I - Entry Exit Detail.csv"
	# "Tab J - Income Detail.csv"
	# "Tab K1 - NC Benefit Detail.csv"
	# "Tab K2 - Ins Benefit Detail.csv"
	# "Tab L - Disability Detail.csv"
	# "Tab M - Duplicate Clients.csv"
	# "Tab N - Additional Information.csv"

	ind_files <-
	which(
	zip_file_names %>%
	str_split_fixed(
	pattern = fixed("-")
	, n = 2
	#, simplify = TRUE
	) %>%
	as_tibble() %>%
	select(1) %>%
	unlist() %>%
	str_trim()
	%in%
	tab_list_to_use
	)

	for (i_csv in seq_along(ind_files)) {
	# i_csv = 1

	dat_sheet[[ tab_list_to_use[i_csv] ]][[ list_files_all[["date"]][i_row] ]] <-
	# open zip file
	unz(
	description =
	file.path(
	fn_path
	, list_files_all[["fn"]][i_row]
	)
	, filename =
	zip_file_names[ind_files][i_csv]
	) %>%
	# read csv sheet
	read.csv(
	header = TRUE
	, sep = ","
	) %>%
	# track which file these data came from
	mutate(
	sheet_date = list_files_all[["date"]][i_row]
	, sheet_name = tab_list_to_use[i_csv]
	)
	}
	}


	# bind all dates together
	for (n_tab in tab_list_to_use) {
	dat_sheet[[ n_tab ]] <-
	bind_rows(
	dat_sheet[[ n_tab ]]
	)
	}

	sapply(X = dat_sheet, FUN = dim)