Last active September 30, 2016 17:14
# Importing data from USDA into large data file ---------------------------
# Directory that contains the text files to import.
path_to_data <- "~/Desktop/ams_cattle_data/"

# Full paths of every file in that directory whose name matches "cattle".
data_files <- list.files(
  path = path_to_data,
  pattern = "cattle",
  full.names = TRUE
)
# Define helper functions -------------------------------------------------
parse_row <- function(text) {
  # Splits a line of a fixed-width style text report into its column entries.
  #
  # Args:
  #   text: character string (one line of a report file).
  #
  # Returns:
  #   A character vector with one element per column entry. Every space is
  #   first turned into an underscore, then the string is cut wherever two or
  #   more underscores occur in a row. Single spaces inside an entry therefore
  #   come back as single underscores, and a line that starts with two or more
  #   spaces yields a leading empty string.
  text_vec <- gsub(pattern = ' ', replacement = '_', x = text)
  unlist(strsplit(text_vec, split = '[_]{2,}'))
}
maybe_as.numeric <- function(x) {
  # Converts the input to numeric when at least one element parses as a number.
  #
  # Args:
  #   x: a vector (typically character) to convert.
  #
  # Returns:
  #   as.numeric(x) if any element converts to a non-NA number; otherwise the
  #   original x unchanged. In a partly numeric vector the elements that do
  #   not parse become NA.
  #
  # Coercion warnings are expected for non-numeric text, so they are silenced.
  num_x <- suppressWarnings(as.numeric(x))
  if (all(is.na(num_x))) {
    # nothing converted: keep the input as it was
    x
  } else {
    num_x
  }
}
convert_to_df <- function(file) {
  # Parses one report text file into a data.frame.
  #
  # Args:
  #   file: path to a text file. The first line that contains a letter is
  #         treated as the header row, every later such line as a data row.
  #
  # Returns:
  #   A data.frame with lower-case column names. Columns in which at least one
  #   entry parses as a number are numeric, the rest are character. Columns
  #   whose name starts with 'NA' are dropped, and underscores in the Location
  #   column are turned back into spaces.
  x <- readLines(file)
  x <- x[grepl("[[:alpha:]]", x)] # keep only lines that contain a letter

  if (length(x) < 2) {
    stop("no header plus data rows found in ", file, call. = FALSE)
  }

  # convert to a list of vectors, where each element in list is a row.
  # lapply() always returns a list; apply() over a one-column matrix would
  # collapse to a matrix whenever every line has the same number of fields.
  # NOTE(review): the original pipeline ended in a dangling `%>%` here; any
  # step that followed it is not visible and is not reproduced - confirm.
  xl <- lapply(x, parse_row)

  # row bind the elements in the list together to make a data.frame
  df <- do.call(rbind.data.frame, xl[-1])

  # produce names for each column based on the first header row, but removing
  # comments and pricing point
  cols_to_ignore <- c('Comments', 'Pricing_Point')
  header <- xl[[1]]
  names(df) <- header[!(header %in% cols_to_ignore)]

  # convert every entry to character (they may come back as factors)
  df[] <- lapply(df, as.character)

  # try to convert each column to numeric and return a clean data frame
  df <- df %>%
    lapply(maybe_as.numeric) %>%
    data.frame(stringsAsFactors = FALSE) %>%
    select(-starts_with('NA')) %>%
    mutate(Location = gsub(pattern = '_', replacement = " ", x = Location))
  names(df) <- tolower(names(df))
  df
}
# single test case: show a compact summary of the first parsed file.
# The call is self-contained, so it cannot pipe into the statement below it.
str(convert_to_df(data_files[1]))
# Parse textfiles and bundle output into a list ------------------------
# Note: with a lot of data files, this list is likely to grow quite large.
# Maybe consider saving these out as csv files as they are processed
#
# mclapply() does not accept more than one core on Windows, and detectCores()
# can return NA, so fall back to a single worker in both situations.
n_cores <- if (.Platform$OS.type == "windows") {
  1L
} else {
  max(1L, detectCores(), na.rm = TRUE)
}
list_of_dfs <- mclapply(data_files,
                        FUN = convert_to_df,
                        mc.cores = n_cores)
# or in serial (with regular old lapply)
# list_of_dfs <- lapply(data_files, convert_to_df)
# to run on a subset (e.g., the first 100 files):
# list_of_dfs <- mclapply(data_files[1:100],
#                         FUN = convert_to_df,
#                         mc.cores = n_cores)
# merge into a master data frame ------------------------------------------
# Stack the per-file data frames row-wise; rbind() requires every element of
# list_of_dfs to have the same column names.
merged_df <- do.call(rbind, list_of_dfs)
tshrum commented Aug 29, 2016

Thanks Max! And thanks to your expressive code (and Google), I think I know what is going on throughout. (Not sure why the NA column appears with the location factors, but that seems pretty minor.)

