# favstats/extract_dat.R

## extract_dat.R
#===============================================================================
# 2022-07-12
# Extract data from Latent GOLD within R
# Fabio Votta (@favstats)
#===============================================================================

library(tidyverse)
library(data.table)

setwd(here::here())


## Recover numeric values from a string in which the comma is used both as
## the decimal mark and as the separator between values, e.g.
## "0,8,23379e-295" -> c(0, 8.23379e-295).
##
## Heuristic (after splitting on every comma):
##   * a piece with two or more characters is taken to be the fractional
##     part of the piece directly before it, if that piece is shorter than
##     two characters ("0" + "997973" -> 0.997973);
##   * every other one-character piece is a stand-alone value;
##   * a long piece that is not directly preceded by a short piece (first
##     piece, or a second long piece in a row) is dropped.
## Limitation: a decimal with a one-digit fraction such as "0,5" cannot be
## told apart from the two values 0 and 5 and is read as the latter.
##
## x:       character string (or vector) of comma-separated pieces.
## returns: unnamed numeric vector of the recovered values, in input order;
##          numeric(0) when nothing is recovered.
identify_commas <- function(x) {

  # x <- "0,997973"

  pieces <- unlist(strsplit(as.character(x), ",", fixed = TRUE))
  piece_len <- nchar(pieces)
  is_long <- piece_len >= 2

  # positions of short pieces that are directly followed by a long piece:
  # these are the integer parts of decimal numbers
  int_pos <- which(diff(is_long) == 1)

  # remaining one-character pieces stand on their own
  single_pos <- setdiff(which(piece_len == 1), int_pos)

  values <- rep(NA_real_, length(pieces))

  # guard: paste0() on zero-length input would yield "." and a coercion warning
  if (length(int_pos) > 0) {
    values[int_pos] <- as.numeric(
      paste0(pieces[int_pos], ".", pieces[int_pos + 1L])
    )
  }

  values[single_pos] <- as.numeric(pieces[single_pos])

  # keep only recovered values, ordered by their position in the input
  values[sort(c(int_pos, single_pos))]

}

# examples <- c("0,8,23379e-295", "4,89544e-13,1,40608e-117",
# "0,00409152", "1,6864e-25", "0,1,96165e-284,3", "0,997973", "0,23")
#
# debugonce(identify_commas)
#
# identify_commas("0")

## Parser: turns every row of `x` into one row of numeric cluster columns.
##
## For each row, all columns whose name contains "clu" are collected, NA and
## empty entries are dropped, each remaining entry is decoded with
## identify_commas(), and the decoded values are laid out as the columns
## cluster1, cluster2, ..., with the last value stored in `cluster`.
##
## x:       data frame with the "clu" columns and an `internal_id` column
##          (only used for the progress output).
## verbose: if TRUE, print the progress in percent and every parsed row.
## returns: tibble with one row per input row that yielded at least one value.
parse_dat <- function(x, verbose = TRUE) {

  n_rows <- nrow(x)

  # nothing to parse; 1:nrow(x) would be c(1, 0) here and break split()
  if (n_rows == 0) {
    return(tibble())
  }

  x %>%
    # slice(1:19) %>%
    split(seq_len(n_rows)) %>%
    map_dfr(function(row) {

      if (verbose) {
        progress <- format(round(row$internal_id / n_rows * 100, 2))
        print(paste0(progress, "%"))
      }

      values <- row %>%
        select(contains("clu")) %>%
        flatten_chr() %>%
        na.omit() %>%
        as.character() %>%
        discard(~magrittr::equals(.x, "")) %>%
        map(identify_commas) %>%
        unlist()

      if (length(values) == 0) {
        # a row without any cluster value contributes no output row
        fin <- tibble()
      } else {
        # sprintf() yields character(0) for a single value, whereas
        # paste0("cluster", 1:0) would produce two bogus names
        col_names <- c(sprintf("cluster%d", seq_len(length(values) - 1L)),
                       "cluster")

        fin <- as_tibble(as.list(set_names(values, col_names)))
      }

      if (verbose) {
        print(fin)
      }

      fin
    })

}

## Reads a .dat file and extracts the cluster values.
##
## Steps: read the file line by line, replace every ",clu" with " clu" so
## that a space separates the fields, re-read the result as a
## space-separated table, clean the column names, add a running
## `internal_id`, strip one trailing comma from every value and hand the
## table to parse_dat().
##
## filepath: path of the .dat file to read.
## verbose:  passed on to parse_dat() (progress and per-row printing).
## returns:  the tibble produced by parse_dat().
fix_it_all <- function(filepath, verbose = TRUE) {

  # the intermediate copy lives in a temporary file that is removed on exit,
  # so no scratch file is left behind (or overwritten) in the working directory
  scratch_file <- tempfile(fileext = ".txt")
  on.exit(unlink(scratch_file), add = TRUE)

  read_lines(filepath) %>%
    str_replace_all(",clu", " clu") %>%
    write_lines(file = scratch_file, sep = "\n")

  raw_txt <- data.table::fread(scratch_file, sep = " ") %>%
    janitor::clean_names() %>%
    # seq_len() instead of 1:n(), which is c(1, 0) for an empty table
    mutate(internal_id = seq_len(n())) %>%
    # drop a single trailing comma from every value
    mutate(across(
      everything(),
      ~ifelse(str_ends(.x, ","), str_sub(.x, 1, nchar(.x) - 1), .x)
    ))

  parse_dat(raw_txt, verbose)

}

## Caution: this takes a *very* long time
# Run the extraction on the input file; the relative path is resolved against
# the current working directory.
soweit_so_gut <- fix_it_all("../../../Downloads/data7.dat")

# Save the parsed result as a CSV file in the working directory.
write_csv(soweit_so_gut, file = "soweit_so_gut.csv")
	#===============================================================================
	# 2022-07-12
	# Extract data from Latent GOLD within R
	# Fabio Votta (@favstats)
	#===============================================================================

	library(tidyverse)
	library(data.table)

	setwd(here::here())



	## Recover numeric values from a string in which the comma is used both as
	## the decimal mark and as the separator between values, e.g.
	## "0,8,23379e-295" -> c(0, 8.23379e-295).
	##
	## Heuristic (after splitting on every comma):
	##   * a piece with two or more characters is taken to be the fractional
	##     part of the piece directly before it, if that piece is shorter than
	##     two characters ("0" + "997973" -> 0.997973);
	##   * every other one-character piece is a stand-alone value;
	##   * a long piece that is not directly preceded by a short piece (first
	##     piece, or a second long piece in a row) is dropped.
	## Limitation: a decimal with a one-digit fraction such as "0,5" cannot be
	## told apart from the two values 0 and 5 and is read as the latter.
	##
	## x:       character string (or vector) of comma-separated pieces.
	## returns: unnamed numeric vector of the recovered values, in input order;
	##          numeric(0) when nothing is recovered.
	identify_commas <- function(x) {

	  # x <- "0,997973"

	  pieces <- unlist(strsplit(as.character(x), ",", fixed = TRUE))
	  piece_len <- nchar(pieces)
	  is_long <- piece_len >= 2

	  # positions of short pieces that are directly followed by a long piece:
	  # these are the integer parts of decimal numbers
	  int_pos <- which(diff(is_long) == 1)

	  # remaining one-character pieces stand on their own
	  single_pos <- setdiff(which(piece_len == 1), int_pos)

	  values <- rep(NA_real_, length(pieces))

	  # guard: paste0() on zero-length input would yield "." and a coercion warning
	  if (length(int_pos) > 0) {
	    values[int_pos] <- as.numeric(
	      paste0(pieces[int_pos], ".", pieces[int_pos + 1L])
	    )
	  }

	  values[single_pos] <- as.numeric(pieces[single_pos])

	  # keep only recovered values, ordered by their position in the input
	  values[sort(c(int_pos, single_pos))]

	}

	# examples <- c("0,8,23379e-295", "4,89544e-13,1,40608e-117",
	# "0,00409152", "1,6864e-25", "0,1,96165e-284,3", "0,997973", "0,23")
	#
	# debugonce(identify_commas)
	#
	# identify_commas("0")

	## Parser: turns every row of `x` into one row of numeric cluster columns.
	##
	## For each row, all columns whose name contains "clu" are collected, NA and
	## empty entries are dropped, each remaining entry is decoded with
	## identify_commas(), and the decoded values are laid out as the columns
	## cluster1, cluster2, ..., with the last value stored in `cluster`.
	##
	## x:       data frame with the "clu" columns and an `internal_id` column
	##          (only used for the progress output).
	## verbose: if TRUE, print the progress in percent and every parsed row.
	## returns: tibble with one row per input row that yielded at least one value.
	parse_dat <- function(x, verbose = TRUE) {

	  n_rows <- nrow(x)

	  # nothing to parse; 1:nrow(x) would be c(1, 0) here and break split()
	  if (n_rows == 0) {
	    return(tibble())
	  }

	  x %>%
	    # slice(1:19) %>%
	    split(seq_len(n_rows)) %>%
	    map_dfr(function(row) {

	      if (verbose) {
	        progress <- format(round(row$internal_id / n_rows * 100, 2))
	        print(paste0(progress, "%"))
	      }

	      values <- row %>%
	        select(contains("clu")) %>%
	        flatten_chr() %>%
	        na.omit() %>%
	        as.character() %>%
	        discard(~magrittr::equals(.x, "")) %>%
	        map(identify_commas) %>%
	        unlist()

	      if (length(values) == 0) {
	        # a row without any cluster value contributes no output row
	        fin <- tibble()
	      } else {
	        # sprintf() yields character(0) for a single value, whereas
	        # paste0("cluster", 1:0) would produce two bogus names
	        col_names <- c(sprintf("cluster%d", seq_len(length(values) - 1L)),
	                       "cluster")

	        fin <- as_tibble(as.list(set_names(values, col_names)))
	      }

	      if (verbose) {
	        print(fin)
	      }

	      fin
	    })

	}

	## Reads a .dat file and extracts the cluster values.
	##
	## Steps: read the file line by line, replace every ",clu" with " clu" so
	## that a space separates the fields, re-read the result as a
	## space-separated table, clean the column names, add a running
	## `internal_id`, strip one trailing comma from every value and hand the
	## table to parse_dat().
	##
	## filepath: path of the .dat file to read.
	## verbose:  passed on to parse_dat() (progress and per-row printing).
	## returns:  the tibble produced by parse_dat().
	fix_it_all <- function(filepath, verbose = TRUE) {

	  # the intermediate copy lives in a temporary file that is removed on exit,
	  # so no scratch file is left behind (or overwritten) in the working directory
	  scratch_file <- tempfile(fileext = ".txt")
	  on.exit(unlink(scratch_file), add = TRUE)

	  read_lines(filepath) %>%
	    str_replace_all(",clu", " clu") %>%
	    write_lines(file = scratch_file, sep = "\n")

	  raw_txt <- data.table::fread(scratch_file, sep = " ") %>%
	    janitor::clean_names() %>%
	    # seq_len() instead of 1:n(), which is c(1, 0) for an empty table
	    mutate(internal_id = seq_len(n())) %>%
	    # drop a single trailing comma from every value
	    mutate(across(
	      everything(),
	      ~ifelse(str_ends(.x, ","), str_sub(.x, 1, nchar(.x) - 1), .x)
	    ))

	  parse_dat(raw_txt, verbose)

	}

	## Caution: this takes a very long time
	# NOTE(review): this repeats the extraction and CSV export that already
	# appear earlier in this file with the same paths - confirm whether the
	# second run is intended.
	soweit_so_gut <- fix_it_all("../../../Downloads/data7.dat")

	# Save the parsed result as a CSV file in the working directory.
	write_csv(soweit_so_gut, file = "soweit_so_gut.csv")