Skip to content

Instantly share code, notes, and snippets.

@dill
Last active August 26, 2016 03:42
Show Gist options
  • Save dill/1caaa954b7c93a229f76d62b29dcce67 to your computer and use it in GitHub Desktop.
Save dill/1caaa954b7c93a229f76d62b29dcce67 to your computer and use it in GitHub Desktop.
End Imperialism! (in terms of units of length, at least) 📏 👍
# try to understand measurements from the past...
# possible input types:
# - 5
# - 5'
# - 5'5
# - 5'7 1/2
# - 5'7\"
# - 5'8.5
# - 5'61/2\"
# - 6'0 1/2
# - 5'4 1/2 ?
# - 5'6.25
# - 4'0
# - 5-7.5
# - 5'91/2\" Lt
# - 5'5-7 1/2
# - 5'7 1/2l
# - 5'9'
# - 5'109
# - 6'-
# - 5 '
# - 5'8+
# - 5'+
# - 5' 5\" (2)
# - 5'?
# - 5 ft. 6in.
# - 5ft' 7 in.
# - 5 ft. 8 in
# - 5 ft 4 in.
# - 5 ft. 4 in.
# - 5'4'5
# - 5,' 0
# - 5'8
# - 5'5t 1/4
# - 5''11 1/2
# - 6 ' 11/2
# - 5-8.5
# - 6; 1/2
library(stringr)
library(convertr)
the_empire_strikes_back <- function(x){
# somewhere to store everything
x_new <- matrix(0, nrow=length(x), ncol=2)
# kill the empty quotes
x[x==""] <- NA
x[x==" "] <- NA
# get rid of trailing, preceeding whitespace
x <- trimws(x)
# clip other trailing weirdness if we can
x <- sub("^(.+)\".*$", "\\1", x)
x <- sub("^(.+'\\s+.+\").+$", "\\1", x)
x <- sub("(.*)[\\+\\?\\-]$", "\\1", x)
x <- sub("(.*)[:alpha:]+$", "\\1", x)
# get rid of trailing, preceeding whitespace, again!
x <- trimws(x)
# some people think '' is the same as '
x <- sub("^(\\d+)\\s?''", "\\1'", x)
# some people think . is the same as '
x <- sub("^(\\d+)\\s?\\.", "\\1'", x)
# some people think - is the same as '
x <- sub("^(\\d+)\\s?\\-", "\\1'", x)
# some people think ; is the same as '
x <- sub("^(\\d+)\\s?;", "\\1'", x)
# some people think - is the same as '
x <- sub("(.*)`$", "\\1\"", x)
# extra spaces mid word
x <- sub("\\s\\s+", " ", x)
# single numbers (e.g., "5")
ind <- grepl("^\\d+$", x)
x_new[ind, 1] <- as.numeric(x[ind])
# quote as foot symbol (e.g., "5'")
ind <- grepl("^\\d+'$", x)
x_new[ind, 1] <- as.numeric(sub("'", "", x[ind]))
# quote as foot symbol with space (e.g., "5 '")
ind <- grepl("^\\d+ '$", x)
x_new[ind, 1] <- as.numeric(sub(" '", "", x[ind]))
# feet and inches with ' and no " (e.g., "5'5" or "5'5.25")
ind <- grepl("^\\d+\\s?'\\s?\\d+(.\\d+)?$", x)
x_new[ind, ] <- str_split_fixed(x[ind], "'", 2)
# feet and inches with , (e.g., "5,5")
ind <- grepl("^\\d+,\\s?\\d+(.\\d+)?$", x)
x_new[ind, ] <- str_split_fixed(x[ind], ",", 2)
# just a space and no " or ' etc
ind <- grepl("^\\d+ \\d+$", x)
x_new[ind, ] <- str_split_fixed(x[ind], " ", 2)
# with space and fractional inches
ind <- grepl("^\\d+\\s?'\\s?\\d+ \\d+/\\d+\"?$", x)
ft <- sub("^(\\d+)\\s?'\\s?\\d+ \\d+/\\d+\"?$", "\\1", x[ind])
inch <- sub("^\\d+\\s?'\\s?(\\d+) (\\d+)/(\\d+)\"?$", "\\1+\\2/\\3", x[ind])
inch <- sapply(inch, function(y) eval(parse(text=y)))
if(sum(ind)>0){
x_new[ind, ] <- cbind(ft, inch)
}
# with space and fractional inches, no whole inches
ind <- grepl("^\\d+' \\d+/\\d+$", x)
ft <- sub("^(\\d+)' \\d+/\\d+$", "\\1", x[ind])
inch <- sub("^\\d+' (\\d+)/(\\d+)$", "\\1/\\2", x[ind])
inch <- sapply(inch, function(y) eval(parse(text=y)))
if(sum(ind)>0){
x_new[ind, ] <- cbind(ft, inch)
}
# feet and inches with ' and escaped " (e.g., "5'5\"" or "5'5.25\"")
ind <- grepl("^\\d+'\\s?\\d+(.\\d+)?\"$", x)
spl <- str_split_fixed(x[ind], "'", 2)
if(nrow(spl)!=0){
x_new[ind, ] <- sub("\"\\s+", "", spl)
}
# feet ' and for some reason inches with ' (e.g., "5'4'")
ind <- grepl("^\\d+'\\d+(.\\d+)?'$", x)
spl <- str_split_fixed(x[ind], "'", 2)
if(nrow(spl)!=0){
x_new[ind, ] <- sub("'", "", spl)
}
# no space and fractional inches (?!) ("5'61/2\"", or "5'61/2")
ind <- grepl("^\\d+'\\d+\\d+/\\d+(\")?$", x)
ft <- sub("^(\\d+)'\\d+\\d+/\\d+(\")?$", "\\1", x[ind])
inch <- sub("^\\d+'(\\d+)(\\d+)/(\\d+)(\")?$", "\\1+\\2/\\3", x[ind])
inch <- sapply(inch, function(y) eval(parse(text=y)))
if(sum(ind)>0){
x_new[ind, ] <- cbind(ft, inch)
}
# no space and only fractional inches (?!) ("5'1/2")
# only one way to have this be non-ambiguous (no whole inches)
ind <- grepl("^\\d+'?\\s?\\d/\\d+(\")?$", x)
ft <- sub("^(\\d+)'?\\s?\\d/\\d+(\")?$", "\\1", x[ind])
inch <- sub("^\\d+'?\\s?(\\d)/(\\d+)(\")?$", "\\1/\\2", x[ind])
inch <- sapply(inch, function(y) eval(parse(text=y)))
if(sum(ind)>0){
x_new[ind, ] <- cbind(ft, inch)
}
# feet and inches with ft and in etc
ind <- grepl("^\\d+.?ft.? \\d+.?in.?$", x)
ft <- sub("^(\\d+).?ft.? \\d+.?in.?$", "\\1", x[ind])
inch <- sub("^\\d+.?ft.? (\\d+).?in.?$", "\\1", x[ind])
if(sum(ind)>0){
x_new[ind, ] <- cbind(ft, inch)
}
## horrible corner cases
# something like "5'4 3'4" which we'll interpret as 5' 4 3/4
ind <- grepl("^\\d+'\\s?\\d+ \\d+'\\d+$", x)
ft <- sub("^(\\d+)'\\s?\\d+ \\d+'\\d+$", "\\1", x[ind])
inch <- sub("^\\d+'\\s?(\\d+) (\\d+)'(\\d+)$", "\\1+\\2/\\3", x[ind])
inch <- sapply(inch, function(y) eval(parse(text=y)))
if(sum(ind)>0){
x_new[ind, ] <- cbind(ft, inch)
}
# something like "5'5' 1/2" which we'll interpret as 5' 5 1/2
ind <- grepl("^\\d+'\\d+' \\d+/\\d+$", x)
ft <- sub("^(\\d+)'\\d+' \\d+/\\d+$", "\\1", x[ind])
inch <- sub("^\\d+'(\\d+)' (\\d+)/(\\d+)$", "\\1+\\2/\\3", x[ind])
inch <- sapply(inch, function(y) eval(parse(text=y)))
if(sum(ind)>0){
x_new[ind, ] <- cbind(ft, inch)
}
mode(x_new) <- "numeric"
# which didn't parse
nogo <- apply(x_new, 1, function(x) any(is.na(x))) |
apply(x_new, 1, function(x) all(x==0))
if(sum(nogo)>0){
warning(paste(sum(nogo), "measurements failed to parse."))
x_new[nogo,] <- NA
}
feet <- convert(x_new[, 1], "ft", "m")
inches <- convert(x_new[, 2], "in", "m")
return(feet+inches)
}
testers <- c("5", "5'", "5'5", "5'7 1/2", "5'7\"", "5'8.5", "5'61/2\"", "6'0 1/2", "5'4 1/2 ?", "5'6.25", "4'0", "5-7.5", "5'91/2\" Lt", "5'5-7 1/2", "5'7 1/2l", "5'9'", "5'109", "6'-", "5 '", "5'8+", "5'+", "5' 5\" (2)", " 5'?", "5 ft. 6in.", "5ft' 7 in.", "5 ft. 8 in", "5 ft 4 in.", "5 ft. 4 in.", "5'4'5", "5,' 0", " 5'8", "5'5t 1/4", "5''11 1/2", "6 ' 11/2", "5-8.5", "6; 1/2")
tested <- the_empire_strikes_back(testers)
@dill
Copy link
Author

dill commented Aug 14, 2016

So far I get the following results:

> source("the_empire_strikes_back.R")
> cbind.data.frame(testers, tested)
      testers 1    2
1           5 5    0
2          5' 5    0
3         5'5 5    5
4     5'7 1/2 5  7.5
5        5'7" 5    7
6       5'8.5 5  8.5
7     5'61/2" 5 61/2
8     6'0 1/2 6  0.5
9   5'4 1/2 ? 0    0
10     5'6.25 5 6.25
11        4'0 4    0
12      5-7.5 0    0
13 5'91/2" Lt 0    0
14  5'5-7 1/2 0    0
15   5'7 1/2l 0    0
16       5'9' 0    0
17      5'109 5  109
18        6'- 0    0
19        5 ' 0    0
20       5'8+ 0    0
21        5'+ 0    0
22  5' 5" (2) 0    0
23        5'? 0    0
24 5 ft. 6in. 0    0
25 5ft' 7 in. 0    0

@dill
Copy link
Author

dill commented Aug 14, 2016

From revision 2:

> source("the_empire_strikes_back.R")
Warning message:
In the_empire_strikes_back(testers) : 4 measurements failed to parse.
> cbind.data.frame(testers, tested)
      testers  1      2
1           5  5   0.00
2          5'  5   0.00
3         5'5  5   5.00
4     5'7 1/2  5   7.50
5        5'7"  5   7.00
6       5'8.5  5   8.50
7     5'61/2"  5   6.50
8     6'0 1/2  6   0.50
9   5'4 1/2 ? NA     NA
10     5'6.25  5   6.25
11        4'0  4   0.00
12      5-7.5 NA     NA
13 5'91/2" Lt  5   9.50
14  5'5-7 1/2 NA     NA
15   5'7 1/2l NA     NA
16       5'9'  5   9.00
17      5'109  5 109.00
18        6'-  6   0.00
19        5 '  5   0.00
20       5'8+  5   8.00
21        5'+  5   0.00
22  5' 5" (2)  5   5.00
23        5'?  5   0.00
24 5 ft. 6in.  5   6.00
25 5ft' 7 in.  5   7.00

@dill
Copy link
Author

dill commented Aug 15, 2016

For revision 3:

> source("the_empire_strikes_back.R")
> cbind.data.frame(testers, tested)
       testers  tested
1            5 1.52400
2           5' 1.52400
3          5'5 1.65100
4      5'7 1/2 1.71450
5         5'7" 1.70180
6        5'8.5 1.73990
7      5'61/2" 1.68910
8      6'0 1/2 1.84150
9    5'4 1/2 ? 1.63830
10      5'6.25 1.68275
11         4'0 1.21920
12       5-7.5 1.71450
13  5'91/2" Lt 1.76530
14   5'5-7 1/2      NA
15    5'7 1/2l 1.71450
16        5'9' 1.75260
17       5'109 4.29260
18         6'- 1.82880
19         5 ' 1.52400
20        5'8+ 1.72720
21         5'+ 1.52400
22   5' 5" (2) 1.65100
23         5'? 1.52400
24  5 ft. 6in. 1.67640
25  5ft' 7 in. 1.70180
26  5 ft. 8 in 1.72720
27  5 ft 4 in. 1.62560
28 5 ft. 4 in. 1.62560
29       5'4'5      NA
30       5,' 0      NA
31         5'8 1.72720
32    5'5t 1/4      NA
33   5''11 1/2 1.81610
34    6 ' 11/2      NA
35       5-8.5 1.73990
36      6; 1/2 1.84150

@daijiang
Copy link

I was planned to comment last night that it probably better to clean unimportant characters (e.g. ? + - ) first and then to consider different combinations. Glad to see that you have already done this!

@davharris
Copy link

davharris commented Aug 15, 2016

Revised version:

This misses a few edge cases like (#14, where I guess they reported a range of possible sizes?), but it's close. And short.

library(dplyr)
library(stringr)
library(purrr)
library(tidyr)
library(convertr)


x %>% 
  map(strsplit, "[^[:digit:]/\\.]+") %>% 
  map(1) %>% 
  map(str_replace_all, "1/2", ".5") %>%
  map(str_replace_all, "1/4", ".25") %>%
  map(str_subset, "[:digit:]") %>%
  map(as.numeric) %>%
  map(~c(.x, rep(0, 4 - length(.x)))) %>% 
  map(~data_frame(feet = .x[1], inches = .x[2] + .x[3])) %>%
  bind_rows() %>%
  mutate(inches = ifelse(is.na(inches), 0, inches)) %>%
  mutate(metric = convert(feet, "ft", "m") + convert(inches, "in", "m"))

(Revised again)

@jmcurran
Copy link

This is my attempt. The test case file is from David's examples

#crew = read.csv("crewlist.csv", stringsAsFactors = FALSE)
#nrow(crew)
#names(crew)

library(stringr)

im_your_father_luke = function(h){
  ## there is a lot of code dealing with NAs here because the example data set had them
  ## it would probably be cleaner if we didn't bother

  ## clean out a bunch of initial guff
  h = gsub("[\"\\-]", "", h)
  h = gsub("^ *$", "", h)
  h = gsub("^(5|6)'([1-9]|10|11)*([13]\\/[248]).*$", "\\1'\\2 \\3", h)
  h = gsub("^5'5(7|9)(.*$)", "5'\\1\\2", h)

  ## This is the workhorse regexp  
  pattern = "^ *([1-7]) *(ft[.']?|'{1,2}|,|;|\\.)? *([0-9]{1,2})? *(['t]|in\\.*)?([ ]+(1\\/2|([12])\\/3|(1|3)\\/4|([15])\\/6|([1-7])\\/8|1\\/12)|(\\.?[0-9]{1,3}))?[ +\\?`l (2)]*$"

  ## produces a matrix with 11 columns
  ## column 2 should have the feet, column 4 should have the inches, columm 6 will have the fraction if there is one, column will have the decimal if there is one
  m = str_match(h, pattern)

  ## helper function to evaluate the fractions
  convertFracs = function(x){
    if(is.na(x)){
      return(NA)
    }else{
      return(25.4 * eval(parse(text=x)))
    }
  }


  ## covert the feet, inches and fraction into millimetres
  mm = cbind(as.numeric(m[,2])*12*25.4, as.numeric(m[,4])*25.4 , 
             apply(cbind(sapply(m[,6], convertFracs, USE.NAMES = FALSE), sapply(m[,11], convertFracs, USE.NAMES = FALSE)), 1, function(row){
               if(all(is.na(row))){
                 return(NA)
               }else if(all(!is.na(row))){
                 return(-1)
               }else if(is.na(row[1]) & !is.na(row[2])){
                 return(row[2])
               }else{
                return(row[1]) 
               }
             }))


  ## appropriate columns to get a single figure
  toMM = function(row){
    if(all(is.na(row))){
      return(NA)
    }else if(!is.na(row[1]) & is.na(row[2]) & is.na(row[3])){
      return(row[1])
    }else if(!is.na(row[1]) & !is.na(row[2]) & is.na(row[3])){
      return(row[1] + row[2])
    }else if(!is.na(row[1]) & is.na(row[2]) & !is.na(row[3])){
      return(row[1] + row[3])
    }else{
      return(sum(row))
    }
  }

  return(data.frame(input = h, output = apply(mm, 1, toMM)))
}
raw,clean,ft,in,mm
"""5",5'0,5,0,1524
5',5'0,5,0,1524
5'5,5'5,5,5,1651
5'7 1/2,5'7 1/2,5,7.5,1714.5
5'7,5'7,5,7,1701.8
5'8.5,5'8.5,5,8.5,1739.9
5'612,5'6 1/2,5,6.5,1689.1
6'0 1/2,6'0 1/2,6,0.5,1841.5
5'4 1/2 ?,5'4 1/2 ?,5,4.5,1638.3
5'6.25,5'6.25,5,6.25,1682.75
4'0,4'0,4,0,1219.2
5-7.5,5'7.5,5,7.5,1714.5
"5'91/2\ Lt""",5'9 1/2,5,9.5,1765.3
5'5-7 1/2,5'7 1/2,5,7.5,1714.5
5'7 1/2l,5'7 1/2,5,7.5,1714.5
5'9',5'9,5,9,1752.6
5'109,5'10,5,10,1778
6'-,6'0,6,0,1828.8
5 ',5'0,5,0,1524
5'8+,5'8,5,8,1727.2
5'+,5'0,5,0,1524
"5' 5\ (2)""",5'5,5,5,1651
 5'?,5'0,5,0,1524
5 ft. 6in.,5'6,5,6,1676.4
5ft' 7 in.,5'7,5,7,1701.8
5 ft. 8 in,5'8,5,8,1727.2
5 ft 4 in.,5'4,5,4,1625.6
5 ft. 4 in.,5'4',5,4,1625.6
5'4'5,5'4.5,5,4.5,1638.3
"5,' 0",5'0,5,0,1524
 5'8,5'8,5,8,1727.2
5'5t 1/4,5'5 1/4,5,5.25,1657.35
5''11 1/2,5'11 1/2,5,11.5,1816.1
6 ' 11/2,6'0 1/2,6,0.5,1841.5
5-8.5,5'8.5,5,8.5,1739.9
6; 1/2,6'0 1/2,6,0.5,1841.5

This is how mine works. Fails on 4 cases where I think the answer is either hard to choose or it requires a bit more regexp tinkering

DLMTestCases = read.csv("DLMTest.csv", stringsAsFactors = FALSE)
mine = im_your_father_luke(DLMTestCases$raw)
fails = DLMTestCases[abs(DLMTestCases$mm - mine$output) > 0.001,] 
fails
       raw   clean ft  in.     mm
7    5'612 5'6 1/2  5  6.5 1689.1
17   5'109    5'10  5 10.0 1778.0
29   5'4'5   5'4.5  5  4.5 1638.3

@jmcurran
Copy link

Latest version of my code here:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment