Skip to content

Instantly share code, notes, and snippets.

@technocrat
Created October 14, 2023 06:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save technocrat/9203735a3ad95cf5c49d096fa56c403a to your computer and use it in GitHub Desktop.
Save technocrat/9203735a3ad95cf5c49d096fa56c403a to your computer and use it in GitHub Desktop.
Script for discussion of R programming style focused on the what of each step
# aristotle.R
# author: Richard Careaga
# problem to illustrate analytic programming
# Date: 2023-10-14
# what we have: a text copy of an old fixed-length format tape
# intended to record numeric blocks of data in 20 character increments
# without delimiters; the objective is to divide the text into
# 20 character blocks, find the index of the last "0" in each block
# and record the block sequence number of the block together with the
# numeric characters preceding it back to the next "0" reading right to
# left. example for the first block
# "62034706309085592380" --> 1 8559238
# strategy
# 1. divide the source data into 20-character
# strings
# 2. write functions to parse the strings
# according to the criteria for extracting
# digits that appear between "0" characters
# 3. classify strings into categories and
# obtain location indexes for each category
# 4. subsets strings by category and apply
# the appropriate function
# key step is to parse each string according to
# the index location of the "0" chracters
# contained--example "62034706309085592380"
# has "0" at
# find_zero("62034706309085592380")
# [1] 3 7 10 12 20
# functions
double_zero <- function(x) x[length(x)] - x[length(x)-1] == 1
find_zero <- function(x) as.numeric(gregexpr("0", x)[[1]])
first_zero <- function(x) ifelse(x == 1 & length(x) == 1,TRUE,FALSE)
has_zero <- function(x) grep("0",x[,2])
lacks_zero <- function(x) grep("0",x[,2], invert = TRUE)
multi_zero <- function(x) ifelse(length(x) >= 2,TRUE,FALSE)
parse_multi <- function(x) {
pull_multi = function(x) gsub(".*0(.*)0.*", "\\1", x)
ifelse(pull_multi(x) == "",NA,pull_multi(x)) |>
gsub("^0|0$","",x=_)
}
parse_single <- function(x) gsub("0.*$","",x)
single_zero <- function(x) ifelse(x != 1 & length(x) == 1,TRUE,FALSE)
# data
x <- paste(readLines("https://gist.githubusercontent.com/technocrat/50e92332adacc7f25b10c272f512e0f4/raw/67e1fe4db09cdafecfb7907c863987e1b0444dd1/legacy%2520tape"), collapse = '\n')
# checks: divisible into 80-character blocks,
# only digits 0-9 and show beginning
nchar(x) %% 20
grep("[^0-9]",x)
paste(substr(x,1,160),"...")
# example: first 20 character block
substr(x, 1, 20)
# what we want
# y example: first row of target matrix y
yex <- matrix(c(1,8559238),1,2)
colnames(yex) <- c("id","extract")
yex
# y = f(x): f is to be composed
# preprocessing: divide into blocks of 20
# characters, keeping track of block sequence
char_list <- strsplit(x, "")[[1]]
chunk_size <- 20
num_chunks <- ceiling(length(char_list) / chunk_size)
chunks <- split(char_list, ceiling(seq_along(char_list) / chunk_size))
x <- matrix(sapply(chunks, paste, collapse = ""),ncol = 1)
# add a record identifier
id <- row(x)
x <- cbind(id,x)
colnames(x) <- c("id","extract")
# list of positions of zero in each block of 20
# applied to x, the data
lp <- lapply(x[,2],find_zero)
# index of rows of each type
the_all_zeros <- which(lapply(lp,length) == 20)
the_double_zeros <- which(lapply(lp,double_zero) == TRUE)
the_first_zeros <- which(lapply(lapply(lp,first_zero),isTRUE) == TRUE)
the_lacks_zeros <- which(lapply(lp,sum) == -1)
the_multi_zeros <- which(lapply(lp,length) >= 2)
the_single_zeros <- which(lapply(lp,length) == 1)
the_discards <- c(the_all_zeros,the_double_zeros,the_first_zeros,the_lacks_zeros)
# receiver object
y <- matrix(rep("blank",640))
# there should be no values "blank" remaining after assigning
# values to x[index] where index is the_multi_zeros,
# the_single_zeros, the_discards
# main
y[the_multi_zeros] <- parse_multi(x[the_multi_zeros,2])
y[the_single_zeros] <- parse_single(x[the_single_zeros,2])
y[the_discards] <- NA
cbind(x[,2],y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment