Skip to content

Instantly share code, notes, and snippets.

@favstats
Created July 12, 2022 15:59
Show Gist options
  • Save favstats/83e52375e9e8196013b644ef326da44e to your computer and use it in GitHub Desktop.
Save favstats/83e52375e9e8196013b644ef326da44e to your computer and use it in GitHub Desktop.
Latent GOLD exports data in an awkward way: commas are kept as decimal marks inside a comma-separated data file, so decimal values cannot be told apart from field separators by a standard parser. This script helps recover the data by extracting it row by row.
#===============================================================================
# 2022-07-12
# Extract data from Latent GOLD within R
# Fabio Votta (@favstats)
#===============================================================================
library(tidyverse)
library(data.table)
setwd(here::here())
## Recover the numeric values from a comma-separated string in which the
## decimal mark is also a comma, e.g. "0,1,96165e-284,3" -> 0, 1.96165e-284, 3.
##
## Heuristic: after splitting on ",", a piece of two or more characters is
## taken to be the fractional part of a decimal number, and the piece directly
## before it (if shorter than two characters) is its integer part. Every other
## one-character piece is a stand-alone value. The result is therefore only
## meaningful when integer parts and stand-alone values are single characters
## and fractional parts have at least two characters.
##
## x: character string holding one field of the export.
##
## Returns a numeric vector with the recovered values in their original order.
## A string without any fractional piece (e.g. "0" or "1,0,3") returns its
## pieces converted to numeric. Multi-character pieces that are not directly
## preceded by a short piece are ignored.
identify_commas <- function(x) {
  # x <- "0,997973"
  pieces <- unlist(strsplit(as.character(x), ",", fixed = TRUE))
  n_chars <- nchar(pieces)
  is_long <- n_chars >= 2

  # Positions of short pieces immediately followed by a long piece: these are
  # the integer parts of decimal numbers.
  int_idx <- which(diff(is_long) == 1)

  # Every output value is anchored at a short piece; keep them in input order.
  out_idx <- sort(union(which(n_chars == 1), int_idx))
  values <- as.numeric(pieces[out_idx])

  # Guard needed: with no decimal numbers, paste0() would still produce "."
  # from the zero-length inputs and as.numeric() would warn.
  if (length(int_idx) > 0) {
    decimals <- paste0(pieces[int_idx], ".", pieces[int_idx + 1])
    values[match(int_idx, out_idx)] <- as.numeric(decimals)
  }

  values
}
# examples <- c("0,8,23379e-295", "4,89544e-13,1,40608e-117",
# "0,00409152", "1,6864e-25", "0,1,96165e-284,3", "0,997973", "0,23")
#
# debugonce(identify_commas)
#
# identify_commas("0")
## Parser: convert every row of the raw table into one row of numeric columns.
##
## x: data frame with an `internal_id` column (used only for the progress
##   display) and one or more columns whose names contain "clu"; the values of
##   those columns are parsed with identify_commas().
## verbose: if TRUE, print the progress percentage and each parsed row.
##
## Returns a data frame with one row per input row. For each row the parsed
## values are named cluster1, cluster2, ... and the last value is named
## "cluster" (presumably the assigned cluster, preceded by the per-cluster
## values -- confirm against the Latent GOLD export).
parse_dat <- function(x, verbose = TRUE) {
  n_rows <- nrow(x)
  x %>%
    # slice(1:19) %>%
    # seq_len() instead of 1:nrow(): 1:0 would be c(1, 0) for an empty table
    split(seq_len(n_rows)) %>%
    map_dfr(~{
      if (verbose) {
        counter <- (.x$internal_id / n_rows * 100) %>%
          round(2) %>%
          format() %>%
          paste0("%")
        print(counter)
      }
      fin <- .x %>%
        select(contains("clu")) %>%
        flatten_chr() %>%
        na.omit() %>%
        as.character() %>%
        discard(~magrittr::equals(.x, "")) %>%
        map(identify_commas) %>%
        unlist() %>%
        tibble() %>%
        # one column of values -> one row with a column per value
        t() %>%
        as_tibble() %>%
        # sprintf() returns character(0) for an empty sequence, so a row with
        # a single value is named just "cluster" (1:(ncol - 1) would give
        # c(1, 0) and a name/column count mismatch)
        set_names(
          c(sprintf("cluster%d", seq_len(max(ncol(.) - 1, 0))), "cluster")
        )
      if (verbose) {
        print(fin)
      }
      fin
    })
}
## Read a .dat export and extract the cluster columns.
##
## Steps:
##   1. Replace every ",clu" with " clu" and write the result to a temporary
##      file, so that fread() (sep = " ") splits the line at those points.
##   2. Read the temporary file, clean the column names, add a running
##      `internal_id` and strip a trailing comma from every value.
##   3. Hand the table to parse_dat().
##
## filepath: path of the exported .dat file.
## verbose: passed on to parse_dat().
##
## Returns the result of parse_dat().
fix_it_all <- function(filepath, verbose = TRUE) {
  # Use a temp file that is removed on exit instead of leaving (and silently
  # overwriting) "fix.txt" in the working directory.
  fixed_path <- tempfile(fileext = ".txt")
  on.exit(unlink(fixed_path), add = TRUE)

  read_lines(filepath) %>%
    str_replace_all(",clu", " clu") %>%
    write_lines(file = fixed_path, sep = "\n")

  raw_txt <- data.table::fread(file = fixed_path, sep = " ") %>%
    janitor::clean_names() %>%
    # seq_len(n()) instead of 1:n(): 1:0 has length 2 and fails on empty input
    mutate(internal_id = seq_len(n())) %>%
    mutate_all(~ifelse(str_ends(.x, ","), str_sub(.x, 1, nchar(.x) - 1), .x))

  parse_dat(raw_txt, verbose)
}
## Caution: this takes a *very* long time
soweit_so_gut <- fix_it_all(filepath = "../../../Downloads/data7.dat")
readr::write_csv(x = soweit_so_gut, file = "soweit_so_gut.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment