@mbjoseph
Last active September 30, 2016 17:14
# Importing data from USDA into a large data file --------------------------
library(stringr)
library(dplyr)
path_to_data <- "~/Desktop/ams_cattle_data/"
data_files <- list.files(path = path_to_data, pattern = "cattle",
                         full.names = TRUE)
# Define helper functions -------------------------------------------------
parse_row <- function(text) {
  # Identifies columns in the text files:
  # - replaces spaces with underscores
  # - splits the string on runs of 2+ underscores and returns a vector
  #   where each element is one column entry for that row
  text_vec <- gsub(pattern = ' ', replacement = '_', x = text)
  unlist(strsplit(text_vec, split = '[_]{2,}'))
}
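# For example (a made-up line, not from the actual USDA reports), a call like
# parse_row("Amarillo, TX    Steers    650-700 lbs    145.00-152.00")
# returns c("Amarillo,_TX", "Steers", "650-700_lbs", "145.00-152.00"):
# single spaces inside a field survive as underscores and are cleaned up
# later for the Location column.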
maybe_as.numeric <- function(x) {
  # tries to convert the input to numeric;
  # if that fails for every element (all converted values are NA),
  # return the original input, otherwise return the numeric version
  num_x <- suppressWarnings(as.numeric(x))
  if (all(is.na(num_x))) {
    res <- x
  } else {
    res <- num_x
  }
  res
}
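# For example, maybe_as.numeric(c("145.00", "152.00")) returns c(145, 152),
# while maybe_as.numeric(c("Steers", "Heifers")) comes back unchanged because
# no element can be parsed as a number.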
convert_to_df <- function(file) {
  # takes a text file as input and parses it to generate a data.frame
  x <- readLines(file)
  x <- x[grepl("[[:alpha:]]", x)] # remove blank lines
  # convert to a list of vectors, where each element in the list is a row
  xl <- t(do.call(rbind, list(x))) %>%
    apply(1, parse_row) %>%
    unique()
  # row bind the elements in the list together to make a data.frame
  df <- do.call(rbind.data.frame, xl[-1])
  # name the columns from the first (header) row, dropping the
  # Comments and Pricing_Point columns
  cols_to_ignore <- c('Comments', 'Pricing_Point')
  names(df) <- xl[[1]][!(xl[[1]] %in% cols_to_ignore)]
  # convert every entry to character (by default they are factors)
  df[] <- lapply(df, as.character)
  # try to convert each column to numeric and return a clean data frame
  df <- df %>%
    lapply(maybe_as.numeric) %>%
    data.frame(stringsAsFactors = FALSE) %>%
    select(-starts_with('NA')) %>%
    mutate(Location = gsub(pattern = '_', replacement = " ", x = Location))
  names(df) <- tolower(names(df))
  df
}
# single test case
convert_to_df(data_files[1]) %>%
  str()
# Parse text files and bundle output into a list ---------------------------
# Note: with a lot of data files, this list is likely to grow quite large.
# Maybe consider saving these out as csv files as they are processed
# (a sketch of that approach follows the examples below).
library(parallel)
list_of_dfs <- mclapply(data_files,
                        FUN = convert_to_df,
                        mc.cores = detectCores())
# or in serial (with regular old lapply)
# list_of_dfs <- lapply(data_files, convert_to_df)
# to run on a subset (e.g., the first 100 files):
# list_of_dfs <- mclapply(data_files[1:100],
#                         FUN = convert_to_df,
#                         mc.cores = detectCores())
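# Sketch of the csv-per-file approach mentioned above (not run here; the
# output directory and file naming are placeholders, adjust as needed):
# out_dir <- file.path(path_to_data, "csv")
# dir.create(out_dir, showWarnings = FALSE)
# for (f in data_files) {
#   out_name <- paste0(tools::file_path_sans_ext(basename(f)), ".csv")
#   write.csv(convert_to_df(f), file.path(out_dir, out_name), row.names = FALSE)
# }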
# merge into a master data frame ------------------------------------------
merged_df <- do.call(rbind, list_of_dfs)
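# Note: do.call(rbind, ...) requires every data frame to have identical
# columns; if the report format varies across files, dplyr::bind_rows()
# is a more forgiving alternative (missing columns are filled with NA):
# merged_df <- bind_rows(list_of_dfs)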
tshrum commented Aug 29, 2016

Thanks Max! And thanks to your expressive code (and google), I think I know what is going on throughout. (Not sure why the NA column appears with the location factors, but that seems pretty minor)
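
A guess about that NA column, based only on the code above: when Comments and Pricing_Point are filtered out of the header row but some data rows still carry those fields, the extra columns are left with NA names, which is exactly what the select(-starts_with('NA')) line exists to drop. The padding behaviour in miniature (plain vector, not the actual USDA data):

x <- 1:3
names(x) <- c("a", "b")  # names vector shorter than x: the third name becomes NA
names(x)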
