Skip to content

Instantly share code, notes, and snippets.

@HenrikBengtsson
Last active August 29, 2015 14:20
Show Gist options
  • Save HenrikBengtsson/55049bfe12faa708b574 to your computer and use it in GitHub Desktop.
Save HenrikBengtsson/55049bfe12faa708b574 to your computer and use it in GitHub Desktop.
Mixed-ordering of string containing roman numerals
source("mixedsortRoman.R")
# Basics
yi <- c(1, 5, 10, 50, 100)
print(yi)
## [1] 1 5 10 50 100
yr <- as.character(as.roman(yi))
print(yr)
## [1] "I" "V" "X" "L" "C"
yri <- utils:::.roman2numeric(yr)
print(yri)
## [1] 1 5 10 50 100
stopifnot(all(yri == yi))
zi <- sort(yi)
zr <- mixedsortRoman(yr)
print(zr)
## [1] "I" "V" "X" "L" "C"
zri <- utils:::.roman2numeric(zr)
print(zri)
## [1] 1 5 10 50 100
stopifnot(all(zri == zi))
# $Id: mixedsort.R 1774 2014-03-01 20:02:08Z warnes $
mixedsort <- function(x) x[mixedorder(x)]
mixedorder <- function(x)
{
# - Split each each character string into an vector of strings and
# numbers
# - Separately rank numbers and strings
# - Combine orders so that strings follow numbers
if(length(x)<1)
return(NULL)
else if(length(x)==1)
return(1)
if( is.numeric(x) )
return( order(x) )
delim="\\$\\@\\$"
numeric <- function(x)
{
suppressWarnings( as.numeric(x) )
}
nonnumeric <- function(x)
{
suppressWarnings( ifelse(is.na(as.numeric(x)), toupper(x), NA) )
}
x <- as.character(x)
which.nas <- which(is.na(x))
which.blanks <- which(x=="")
if(length(which.blanks) >0)
x[ which.blanks ] <- -Inf
if(length(which.nas) >0)
x[ which.nas ] <- Inf
####
# - Convert each character string into an vector containing single
# character and numeric values.
####
# find and mark numbers in the form of +1.23e+45.67
delimited <- gsub("([+-]{0,1}[0-9]+\\.{0,1}[0-9]*([eE][\\+\\-]{0,1}[0-9]+\\.{0,1}[0-9]*){0,1})",
paste(delim,"\\1",delim,sep=""), x)
# separate out numbers
step1 <- strsplit(delimited, delim)
# remove empty elements
step1 <- lapply( step1, function(x) x[x>""] )
# create numeric version of data
step1.numeric <- lapply( step1, numeric )
# create non-numeric version of data
step1.character <- lapply( step1, nonnumeric )
# now transpose so that 1st vector contains 1st element from each
# original string
maxelem <- max(sapply(step1, length))
step1.numeric.t <- lapply(1:maxelem,
function(i)
sapply(step1.numeric,
function(x)x[i])
)
step1.character.t <- lapply(1:maxelem,
function(i)
sapply(step1.character,
function(x)x[i])
)
# now order them
rank.numeric <- sapply(step1.numeric.t,rank)
rank.character <- sapply(step1.character.t,
function(x) as.numeric(factor(x)))
# and merge
rank.numeric[!is.na(rank.character)] <- 0 # mask off string values
rank.character <- t(
t(rank.character) +
apply(matrix(rank.numeric),2,max,na.rm=TRUE)
)
rank.overall <- ifelse(is.na(rank.character),rank.numeric,rank.character)
order.frame <- as.data.frame(rank.overall)
if(length(which.nas) > 0)
order.frame[which.nas,] <- Inf
retval <- do.call("order",order.frame)
return(retval)
}
mixedsortRoman <- function (x) x[mixedorderRoman(x)]
mixedorderRoman <- function (x)
{
# - Split each each character string into an vector of strings and
# numbers
# - Separately rank numbers and strings
# - Combine orders so that strings follow numbers
if(length(x)<1)
return(NULL)
else if(length(x)==1)
return(1)
if( is.numeric(x) )
return( order(x) )
# NOTE: Note that as.roman(x) is NA for x > 3899
romanC <- as.character( as.roman(1:3899) )
delim="\\$\\@\\$"
numeric <- function(x)
{
suppressWarnings( match(x, romanC) )
}
nonnumeric <- function(x)
{
suppressWarnings( ifelse(is.na(numeric(x)), toupper(x), NA) )
}
x <- as.character(x)
which.nas <- which(is.na(x))
which.blanks <- which(x=="")
if(length(which.blanks) >0)
x[ which.blanks ] <- -Inf
if(length(which.nas) >0)
x[ which.nas ] <- Inf
####
# - Convert each character string into an vector containing single
# character and numeric values.
####
# find and mark numbers in the form of +1.23e+45.67
delimited <- gsub("([IVXCLM]+)",
paste(delim, "\\1", delim, sep=""), x)
# separate out numbers
step1 <- strsplit(delimited, delim)
# remove empty elements
step1 <- lapply( step1, function(x) x[x>""] )
# create numeric version of data
step1.numeric <- lapply( step1, numeric )
# create non-numeric version of data
step1.character <- lapply( step1, nonnumeric )
# now transpose so that 1st vector contains 1st element from each
# original string
maxelem <- max(sapply(step1, length))
step1.numeric.t <- lapply(1:maxelem,
function(i)
sapply(step1.numeric,
function(x)x[i])
)
step1.character.t <- lapply(1:maxelem,
function(i)
sapply(step1.character,
function(x)x[i])
)
# now order them
rank.numeric <- sapply(step1.numeric.t,rank)
rank.character <- sapply(step1.character.t,
function(x) as.numeric(factor(x)))
# and merge
rank.numeric[!is.na(rank.character)] <- 0 # mask off string values
rank.character <- t(
t(rank.character) +
apply(matrix(rank.numeric),2,max,na.rm=TRUE)
)
rank.overall <- ifelse(is.na(rank.character),rank.numeric,rank.character)
order.frame <- as.data.frame(rank.overall)
if(length(which.nas) > 0)
order.frame[which.nas,] <- Inf
retval <- do.call("order",order.frame)
return(retval)
}
##############################################################################
## Based on code snippet by David Winsemius [1].
##
## REFERENCES:
## [1] R-devel thread '[R] Mixed sorting/ordering of strings acknowledging
## roman numerals?', started 2014-08-26
## (https://stat.ethz.ch/pipermail/r-help/2014-August/421141.html)
##############################################################################
@HenrikBengtsson
Copy link
Author

Current difference between mixedorderRoman() and gtools::mixedorder():

[HB-X201]{hb}: diff mixedsortRoman.R mixedsort.R
0a1
> # $Id: mixedsort.R 1774 2014-03-01 20:02:08Z warnes $
1a3
> mixedsort <- function(x) x[mixedorder(x)]
3,6c5,6
< mixedsortRoman <- function (x) x[mixedorderRoman(x)]
<
< mixedorderRoman <- function (x)
< {

---
> mixedorder <- function(x)
>   {
20,21d19
<     # NOTE: Note that as.roman(x) is NA for x > 3899
<     romanC <- as.character( as.roman(1:3899) )
27c25
<         suppressWarnings( match(x, romanC) )

---
>         suppressWarnings( as.numeric(x) )
32c30
<         suppressWarnings( ifelse(is.na(numeric(x)), toupper(x), NA) )

---
>         suppressWarnings( ifelse(is.na(as.numeric(x)), toupper(x), NA) )
52,53c50,51
<     delimited <- gsub("([IVXCLM]+)",
<         paste(delim, "\\1", delim, sep=""), x)

---
>     delimited <- gsub("([+-]{0,1}[0-9]+\\.{0,1}[0-9]*([eE][\\+\\-]{0,1}[0-9]+\
\.{0,1}[0-9]*){0,1})",
>                       paste(delim,"\\1",delim,sep=""), x)
104,105c102
< }
<

---
>   }
108,115d104

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment