Combine a directory of HOBO .csv files
# Combine multiple raw HOBO data files into a single .csv file
# Author: Bryan Urban
# Email: burban@fraunhofer.org
# Date: 2015-04-30
## SETUP --------------------
# install these two packages first if you don't have them:
# install.packages("data.table")
# install.packages("lubridate")
library(data.table)
library(lubridate)
# change these to match the folders containing the data
raw_dir <- "D:/data/project/raw"
out_dir <- "D:/data/project"
## LOAD RAW DATA ------------
# get file names:
pattern = ".*csv$" # for identifying files to read
fns <- list.files(raw_dir, pattern=pattern, full.names=TRUE)
# load data into lists
read_and_label <- function(x, ...){
  z <- fread(x, ...)
  # add file name without the extension as id column
  pattern <- "(.*\\/)([^.]+)(\\.csv$)"
  z$ids <- sub(pattern, "\\2", x)
  z
}
# read columns 2 and 3 (timestamp and temperature) into a list of data.tables;
# try() keeps one bad file from stopping the whole run
all_data <-
  lapply(fns, function(x, ...) {try(read_and_label(x, ...))},
         select=2:3, header=FALSE, skip=2
  )
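# Optional sketch (not in the original gist): report which files failed to read
# before the failed attempts are dropped in the next section. try() returns a
# "try-error" object on failure, and all_data is still index-aligned with fns here.
failed <- sapply(all_data, function(x) inherits(x, "try-error"))
if (any(failed)) warning("Could not read: ", paste(basename(fns[failed]), collapse=", "))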
## PROCESS RAW DATA ---------
# drop errors, merge into one large data.table, name columns, parse timestamp
all_data <- all_data[sapply(all_data, is.data.table)]
all_data <- rbindlist(all_data)
setnames(all_data, c("ts", "temp","ids"))
all_data[, ts:=floor_date(mdy_hms(ts), "minute")] # floor ts to nearest minute
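# For reference: mdy_hms() assumes month/day/year ordering, e.g. "04/30/15 13:23:45"
# parses to 2015-04-30 13:23:45 UTC, and floor_date(..., "minute") truncates it to
# 2015-04-30 13:23:00 UTC. If your HOBO export uses a different date ordering,
# swap in the matching lubridate parser (e.g. dmy_hms).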
# drop rows with missing data
all_data <- all_data[complete.cases(all_data),]
## SAVE PROCESSED DATA ------
# write data in one big file
write.csv(all_data, paste(out_dir, "all_data.csv", sep="/"),
          row.names=FALSE)
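# Possible alternative (a sketch, not part of the original script): for very large
# outputs, data.table's fwrite() writes the same .csv considerably faster than write.csv():
# fwrite(all_data, file.path(out_dir, "all_data.csv"))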
@lnfran85

Many thanks for sharing your code! It was very useful to me. I've adapted your code to my purposes, but I'm maintaining your author credits. 👍
