# mbjoseph/usda_dataimport.R

## usda_dataimport.R

# Importing data from USDA into large data file ---------------------------
library(stringr)
library(dplyr)

# Directory that is scanned for input files
path_to_data <- "~/Desktop/ams_cattle_data/"
# Full paths of every entry in that directory whose name matches "cattle"
data_files <- list.files(path_to_data, pattern = "cattle", full.names = TRUE)

# Define helper functions -------------------------------------------------
parse_row <- function(text) {
  # Breaks raw text line(s) into their column entries:
  # - every space is first turned into an underscore
  # - a run of 2+ underscores is then treated as a column separator
  # Returns a single flat character vector, one element per column entry
  # (a single space inside an entry therefore survives as one underscore).
  underscored <- gsub(' ', '_', text)
  fields <- strsplit(underscored, '[_]{2,}')
  unlist(fields)
}


maybe_as.numeric <- function(x) {
  # Attempts a numeric conversion of x (conversion warnings are silenced).
  # If at least one element converts, the numeric vector is returned
  # (elements that did not convert are NA); if every converted element is NA,
  # x is handed back untouched.
  converted <- suppressWarnings(as.numeric(x))
  if (!all(is.na(converted))) {
    return(converted)
  }
  x
}

convert_to_df <- function(file) {
  # Parses one text file into a data.frame.
  #
  # Args:
  #   file: path of the text file, passed to readLines()
  #
  # Returns:
  #   a data.frame with lower-case column names; the first retained line is
  #   used as the header and every other unique line becomes a row. Columns
  #   that maybe_as.numeric() can convert are numeric, the rest character.
  #
  # Stops with an error when the file has no data rows below the header.
  # Relies on parse_row(), maybe_as.numeric() and dplyr; the parsed header
  # must contain a `Location` column.
  x <- readLines(file)
  # keep only lines with at least one letter (drops blank lines as well as
  # lines made purely of digits/punctuation)
  x <- x[grepl("[[:alpha:]]", x)]

  # split each line into its fields, then drop repeated lines.
  # lapply() always yields a list; apply() would collapse the result to a
  # matrix whenever every line has the same number of fields, which breaks
  # the list indexing used below.
  xl <- unique(lapply(x, parse_row))
  if (length(xl) < 2L) {
    stop("no data rows found below the header in: ", file, call. = FALSE)
  }
  header <- xl[[1]]

  # row bind the non-header elements together to make a data.frame
  df <- do.call(rbind.data.frame, xl[-1])

  # produce names for each column based on the header row, but removing
  # comments and pricing point
  cols_to_ignore <- c("Comments", "Pricing_Point")
  names(df) <- header[!(header %in% cols_to_ignore)]

  # convert every entry to character (rbind.data.frame may return factors,
  # which was the default before R 4.0)
  df[] <- lapply(df, as.character)

  # try to convert each column to numeric and return a clean data frame
  df <- df %>%
    lapply(maybe_as.numeric) %>%
    data.frame(stringsAsFactors = FALSE) %>%
    # drop columns whose name starts with "NA" -- presumably columns that got
    # no header name above (TODO confirm against real files)
    select(-starts_with("NA")) %>%
    # parse_row() turned single spaces into underscores; undo that here
    mutate(Location = gsub(pattern = "_", replacement = " ", x = Location))
  names(df) <- tolower(names(df))
  df
}

# smoke test: parse the first file and print the structure of the result
str(convert_to_df(data_files[1]))


# Parse textfiles and bundle output into a list ------------------------
# Note: with a lot of data files, this list is likely to grow quite large.
# Maybe consider saving these out as csv files as they are processed
library(parallel)

# Use every detected core, but fall back to a single core when the count is
# unknown (detectCores() can return NA) or on Windows, where mclapply() does
# not support mc.cores > 1.
n_cores <- detectCores()
if (is.na(n_cores) || .Platform$OS.type == "windows") {
  n_cores <- 1L
}

list_of_dfs <- mclapply(data_files,
                        FUN = convert_to_df,
                        mc.cores = n_cores)
# or in serial (with regular old lapply)
# list_of_dfs <- lapply(data_files, convert_to_df)

# to run on a subset (e.g., the first 100 files):
# list_of_dfs <- mclapply(data_files[1:100],
#                         FUN = convert_to_df,
#                         mc.cores = n_cores)

# mclapply() hands back "try-error" objects for files whose parse failed
# instead of stopping; refuse to merge those rather than silently binding
# error messages into the data.
failed <- vapply(list_of_dfs, inherits, logical(1), what = "try-error")
if (any(failed)) {
  stop("convert_to_df() failed for: ",
       paste(data_files[failed], collapse = ", "),
       call. = FALSE)
}

# merge into a master data frame ------------------------------------------
merged_df <- do.call(rbind, list_of_dfs)

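	# Importing data from USDA into large data file ---------------------------
	# NOTE(review): this tab-indented section redefines the same helpers and
	# reruns the same import as the section above; it looks like an accidental
	# duplicate -- confirm before removing.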
	library(stringr)
	library(dplyr)

	# Directory that is scanned for input files
	path_to_data <- "~/Desktop/ams_cattle_data/"
	# Full paths of every entry in that directory whose name matches "cattle"
	data_files <- list.files(path_to_data, pattern = "cattle", full.names = TRUE)

	# Define helper functions -------------------------------------------------
	parse_row <- function(text) {
	  # Breaks raw text line(s) into their column entries:
	  # - every space is first turned into an underscore
	  # - a run of 2+ underscores is then treated as a column separator
	  # Returns a single flat character vector, one element per column entry
	  # (a single space inside an entry therefore survives as one underscore).
	  underscored <- gsub(' ', '_', text)
	  fields <- strsplit(underscored, '[_]{2,}')
	  unlist(fields)
	}


	maybe_as.numeric <- function(x) {
	  # Attempts a numeric conversion of x (conversion warnings are silenced).
	  # If at least one element converts, the numeric vector is returned
	  # (elements that did not convert are NA); if every converted element is
	  # NA, x is handed back untouched.
	  converted <- suppressWarnings(as.numeric(x))
	  if (!all(is.na(converted))) {
	    return(converted)
	  }
	  x
	}

	convert_to_df <- function(file) {
	  # Parses one text file into a data.frame.
	  #
	  # Args:
	  #   file: path of the text file, passed to readLines()
	  #
	  # Returns:
	  #   a data.frame with lower-case column names; the first retained line is
	  #   used as the header and every other unique line becomes a row. Columns
	  #   that maybe_as.numeric() can convert are numeric, the rest character.
	  #
	  # Stops with an error when the file has no data rows below the header.
	  # Relies on parse_row(), maybe_as.numeric() and dplyr; the parsed header
	  # must contain a `Location` column.
	  x <- readLines(file)
	  # keep only lines with at least one letter (drops blank lines as well as
	  # lines made purely of digits/punctuation)
	  x <- x[grepl("[[:alpha:]]", x)]

	  # split each line into its fields, then drop repeated lines.
	  # lapply() always yields a list; apply() would collapse the result to a
	  # matrix whenever every line has the same number of fields, which breaks
	  # the list indexing used below.
	  xl <- unique(lapply(x, parse_row))
	  if (length(xl) < 2L) {
	    stop("no data rows found below the header in: ", file, call. = FALSE)
	  }
	  header <- xl[[1]]

	  # row bind the non-header elements together to make a data.frame
	  df <- do.call(rbind.data.frame, xl[-1])

	  # produce names for each column based on the header row, but removing
	  # comments and pricing point
	  cols_to_ignore <- c("Comments", "Pricing_Point")
	  names(df) <- header[!(header %in% cols_to_ignore)]

	  # convert every entry to character (rbind.data.frame may return factors,
	  # which was the default before R 4.0)
	  df[] <- lapply(df, as.character)

	  # try to convert each column to numeric and return a clean data frame
	  df <- df %>%
	    lapply(maybe_as.numeric) %>%
	    data.frame(stringsAsFactors = FALSE) %>%
	    # drop columns whose name starts with "NA" -- presumably columns that
	    # got no header name above (TODO confirm against real files)
	    select(-starts_with("NA")) %>%
	    # parse_row() turned single spaces into underscores; undo that here
	    mutate(Location = gsub(pattern = "_", replacement = " ", x = Location))
	  names(df) <- tolower(names(df))
	  df
	}

	# smoke test: parse the first file and print the structure of the result
	str(convert_to_df(data_files[1]))



	# Parse textfiles and bundle output into a list ------------------------
	# Note: with a lot of data files, this list is likely to grow quite large.
	# Maybe consider saving these out as csv files as they are processed
	library(parallel)

	# Use every detected core, but fall back to a single core when the count is
	# unknown (detectCores() can return NA) or on Windows, where mclapply() does
	# not support mc.cores > 1.
	n_cores <- detectCores()
	if (is.na(n_cores) || .Platform$OS.type == "windows") {
	  n_cores <- 1L
	}

	list_of_dfs <- mclapply(data_files,
	                        FUN = convert_to_df,
	                        mc.cores = n_cores)
	# or in serial (with regular old lapply)
	# list_of_dfs <- lapply(data_files, convert_to_df)

	# to run on a subset (e.g., the first 100 files):
	# list_of_dfs <- mclapply(data_files[1:100],
	#                         FUN = convert_to_df,
	#                         mc.cores = n_cores)

	# mclapply() hands back "try-error" objects for files whose parse failed
	# instead of stopping; refuse to merge those rather than silently binding
	# error messages into the data.
	failed <- vapply(list_of_dfs, inherits, logical(1), what = "try-error")
	if (any(failed)) {
	  stop("convert_to_df() failed for: ",
	       paste(data_files[failed], collapse = ", "),
	       call. = FALSE)
	}

	# merge into a master data frame ------------------------------------------
	merged_df <- do.call(rbind, list_of_dfs)