Last active
September 30, 2016 17:14
-
-
Save mbjoseph/1ef65ae3477f74d991c07c37cf3bf6f4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Importing data from USDA into large data file ---------------------------
library(stringr) | |
library(dplyr) | |
# Directory that holds the raw text files to import
path_to_data <- "~/Desktop/ams_cattle_data/"

# Full paths of every file in that directory whose name matches "cattle"
data_files <- list.files(
  path = path_to_data,
  pattern = "cattle",
  full.names = TRUE
)
# Define helper functions -------------------------------------------------
parse_row <- function(text) {
  # Split one line of a space-aligned text file into its column entries.
  #
  # Every space is first turned into an underscore, so a single space inside
  # an entry survives as one underscore, while a run of two or more spaces
  # becomes a run of underscores that is treated as a column separator.
  #
  # Returns a character vector with one element per column entry. A line that
  # starts with a separator yields an empty string as its first element.
  underscored <- gsub(" ", "_", text)
  pieces <- strsplit(underscored, split = "[_]{2,}")
  unlist(pieces)
}
maybe_as.numeric <- function(x) {
  # Return `x` as numeric when at least one element parses as a number;
  # otherwise hand back `x` untouched.
  #
  # Coercion warnings are suppressed, so in a partly numeric input the
  # elements that do not parse silently become NA.
  converted <- suppressWarnings(as.numeric(x))
  if (any(!is.na(converted))) {
    return(converted)
  }
  x
}
convert_to_df <- function(file) {
  # Parse one space-aligned text file into a data.frame.
  #
  # Args:
  #   file: passed straight to readLines() (typically a file path).
  #
  # Returns:
  #   A data.frame whose lower-cased column names come from the first line of
  #   the file (minus 'Comments' and 'Pricing_Point'). Columns in which at
  #   least one value parses as a number are numeric; the rest are character.
  #
  # Stops with an error when the file has no data rows below the header.
  x <- readLines(file)
  x <- x[grepl("[[:alpha:]]", x)] # drop lines without any letter (blank lines)

  # One character vector of column entries per distinct line. lapply() is used
  # instead of apply() over a one-column matrix: apply() simplifies its result
  # to a matrix whenever every line splits into the same number of entries,
  # which breaks the list-based steps below.
  xl <- unique(lapply(x, parse_row))
  if (length(xl) < 2) {
    stop("no data rows found in ", file, call. = FALSE)
  }
  header <- xl[[1]]

  # row bind the remaining lines together to make a data.frame
  df <- do.call(rbind.data.frame, xl[-1])

  # produce names for each column based on the header row, but removing
  # comments and pricing point
  # NOTE(review): when df ends up with more columns than there are header
  # names, the surplus columns presumably receive NA names and are what the
  # starts_with('NA') filter below removes -- confirm against real files.
  cols_to_ignore <- c('Comments', 'Pricing_Point')
  names(df) <- header[!(header %in% cols_to_ignore)]

  # make sure every entry is character (they may come back as factors)
  df[] <- lapply(df, as.character)

  # try to convert each column to numeric and return a clean data frame
  df <- df %>%
    lapply(maybe_as.numeric) %>%
    data.frame(stringsAsFactors = FALSE) %>%
    select(-starts_with('NA')) %>%
    mutate(Location = gsub(pattern = '_', replacement = " ", x = Location))
  names(df) <- tolower(names(df))
  df
}
# single test case: parse the first file and print the structure of the result
str(convert_to_df(data_files[1]))
# Parse textfiles and bundle output into a list ------------------------
# Note: with a lot of data files, this list is likely to grow quite large.
# Maybe consider saving these out as csv files as they are processed
library(parallel)

# detectCores() may return NA, and mclapply() cannot use more than one core
# on Windows, so fall back to a single core in both cases.
n_cores <- detectCores()
if (is.na(n_cores) || .Platform$OS.type == "windows") {
  n_cores <- 1L
}

list_of_dfs <- mclapply(data_files,
                        FUN = convert_to_df,
                        mc.cores = n_cores)
# or in serial (with regular old lapply)
# list_of_dfs <- lapply(data_files, convert_to_df)
# to run on a subset (e.g., the first 100 files):
# list_of_dfs <- mclapply(data_files[1:100],
#                         FUN = convert_to_df,
#                         mc.cores = n_cores)

# mclapply() stores a worker's error as a "try-error" object in the result
# instead of stopping. Fail loudly here, naming the offending files, so those
# objects are never row-bound into the merged data. list_of_dfs is already
# assigned at this point and can still be inspected after the error.
failed <- vapply(list_of_dfs, inherits, logical(1), what = "try-error")
if (any(failed)) {
  stop("convert_to_df() failed for: ",
       paste(data_files[failed], collapse = ", "),
       call. = FALSE)
}

# merge into a master data frame ------------------------------------------
merged_df <- do.call(rbind, list_of_dfs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks Max! And thanks to your expressive code (and google), I think I know what is going on throughout. (Not sure why the NA column appears with the location factors, but that seems pretty minor)