Read in, munge and save a set of csvs + profiling information
# Get required packages - managed using pacman ---------------------------
# This installs packages if they are missing and otherwise loads them.
if (!require(pacman)) install.packages("pacman"); library(pacman)
p_load("tidyverse")
p_load("fs")
p_load("data.table")
p_load("lubridate")
p_load("purrr")
p_load("furrr")
# Get filenames -----------------------------------------------------------
get_filenames <- function(directory = NULL, sub_path = NULL) {
  folders <- fs::dir_ls(path = directory) %>%
    unname()
  ## Fill a vector with the names of the files.
  ## Moved from a for loop to a vectorised approach (shorter + faster).
  filenames <- purrr::map_chr(folders, ~ fs::path(., sub_path))
  return(filenames)
}
## Modify filenames for your data.
filenames <- get_filenames("raw-data", sub_path = file.path("SheepScab", "Sheep", "I_0forsam.csv"))
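## Example (hypothetical layout, commented out): if runs are stored as
## raw-data/run_1/SheepScab/Sheep/I_0forsam.csv, raw-data/run_2/... and so on,
## `filenames` will contain one full path per run folder, e.g.
## "raw-data/run_1/SheepScab/Sheep/I_0forsam.csv".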
# Read in (single) -----------------------------------------------------------------
## This function summarises a single file
## Currently data munging is implemented in dplyr + base + data.table
## This means that code clarity has been sacrificed for performance.
summarise_file <- function(file) {
  ## Read in the data using data.table
  df <- data.table::fread(file)
  ## Turn into a tibble
  df <- tibble::as_tibble(df)
  ## Time variables
  df$time <- lubridate::as_date(df$time)
  df$month <- lubridate::month(df$time)
  df$year <- lubridate::year(df$time)
  df$month_year <- lubridate::floor_date(df$time, "month")
  df <- dplyr::select(df, iteration, time, month, year, month_year, dplyr::everything())
  ## Summarise across farms
  ## Number infected
  df_mat <- as.matrix(df[, -(1:5)])
  df_mat <- sign(df_mat)
  df$no_inf_farms <- rowSums(df_mat)
  ## Pull variables of interest
  df_sum <- dplyr::select(df, iteration, time, month, year, month_year, no_inf_farms)
  ## Summarise within farms - highly compressed for speed
  ## Convert to data.table, dropping the across-farm summary column so that it
  ## is not melted as if it were a farm
  df_long <- data.table::as.data.table(dplyr::select(df, -no_inf_farms))
  ## Go long from wide
  df_long <- data.table::melt.data.table(df_long,
                                         id.vars = c("iteration", "time", "month",
                                                     "year", "month_year"),
                                         variable.name = "farm", value.name = "infected")
  ## Find all farms and add a new ending iteration with zero infected cases so
  ## that every infection episode has an end point
  all_farms <- df_long[iteration == max(iteration)][, `:=`(iteration = max(iteration) + 1,
                                                           infected = 0, type = "maximum")]
  ## Pull out new infections
  df_time_inf <- df_long[infected > 0 & data.table::shift(infected, 1, type = "lag") == 0][, type := "infected"]
  ## Pull out new recoveries
  df_time_rec <- df_long[infected == 0 & data.table::shift(infected, 1, type = "lag") > 0][, type := "recovered"]
  ## Join data frames together
  df_long <- rbind(df_time_inf, df_time_rec, all_farms)
  ## Find the time between infection and recovery
  df_long <- df_long[order(iteration)][,
    time_infected := data.table::shift(iteration, 1, type = "lead") - iteration,
    by = farm][type == "infected"]
  ## Make farm names human readable
  df_long <- df_long[, farm := data.table::last(data.table::tstrsplit(farm, "/", fixed = TRUE))]
  df_long <- tibble::as_tibble(df_long)
  out <- list(df_sum, df_long)
  return(out)
}
## Force byte-compilation up front (the JIT compiler should do this by default on first use)
summarise_file <- compiler::cmpfun(summarise_file)
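## Example usage on a single file (commented out; assumes `filenames` is non-empty):
## one_file <- summarise_file(filenames[1])
## one_file[[1]] # per-timestep summary with the number of infected farms
## one_file[[2]] # per-farm infection episodes with time spent infected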
## My profiling tests with a 1e6 row data frame give an overall runtime of around 1200ms.
## See img/profile.png for a breakdown (in %) of where the time is spent.
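## A rough sketch of how to reproduce this profiling (commented out; assumes the
## profvis package is available - it is not part of the original analysis):
## p_load("profvis")
## profvis::profvis(summarise_file(filenames[1]))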
# Summarise all files -----------------------------------------------------
## This step can be parallelised over files via the future package.
## If your datasets are large you may run out of RAM and this will crash;
## if that happens, use the sequential plan with single-threaded data.table.
## For small jobs it may also be faster to run sequentially, as setting up
## parallelisation adds some overhead.
## As it stands, a sequential plan with data.table allowed to run multi-threaded
## looks fastest, so that is the default below; swap the commented and
## uncommented lines to parallelise over files instead.
## future::plan(future::multiprocess, workers = future::availableCores() / 2)
## data.table::setDTthreads(threads = 1)
future::plan(future::sequential)
data.table::setDTthreads(threads = future::availableCores() / 2)
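## Optional sanity check (commented out): confirm how many threads data.table will use.
## data.table::getDTthreads()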
## Run summary function
sim_sum <- furrr::future_map(filenames, summarise_file, .progress = TRUE)
## Transpose the list so that each type of summary is grouped together
sim_sum <- purrr::transpose(sim_sum)
## Pull out and bind the number-of-infected-farms summaries
disease_sum <- dplyr::bind_rows(sim_sum[[1]], .id = "id")
## Pull out and bind the per-farm infection summaries
farm_sum <- dplyr::bind_rows(sim_sum[[2]], .id = "id")
# Save results ------------------------------------------------------------
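## Assumes you want the script to create the output directory if it is missing
## (fs::dir_create is a no-op when "data" already exists).
fs::dir_create("data")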
data.table::fwrite(disease_sum, "data/disease_sum.csv")
data.table::fwrite(farm_sum, "data/farm_sum.csv")
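## To read the summaries back in later (a sketch, using the paths written above):
## disease_sum <- data.table::fread("data/disease_sum.csv")
## farm_sum <- data.table::fread("data/farm_sum.csv")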