Last active
August 26, 2016 03:42
-
-
Save dill/1caaa954b7c93a229f76d62b29dcce67 to your computer and use it in GitHub Desktop.
End Imperialism! (in terms of units of length, at least) 📏 👍
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# try to understand measurements from the past... | |
# possible input types: | |
# - 5 | |
# - 5' | |
# - 5'5 | |
# - 5'7 1/2 | |
# - 5'7\" | |
# - 5'8.5 | |
# - 5'61/2\" | |
# - 6'0 1/2 | |
# - 5'4 1/2 ? | |
# - 5'6.25 | |
# - 4'0 | |
# - 5-7.5 | |
# - 5'91/2\" Lt | |
# - 5'5-7 1/2 | |
# - 5'7 1/2l | |
# - 5'9' | |
# - 5'109 | |
# - 6'- | |
# - 5 ' | |
# - 5'8+ | |
# - 5'+ | |
# - 5' 5\" (2) | |
# - 5'? | |
# - 5 ft. 6in. | |
# - 5ft' 7 in. | |
# - 5 ft. 8 in | |
# - 5 ft 4 in. | |
# - 5 ft. 4 in. | |
# - 5'4'5 | |
# - 5,' 0 | |
# - 5'8 | |
# - 5'5t 1/4 | |
# - 5''11 1/2 | |
# - 6 ' 11/2 | |
# - 5-8.5 | |
# - 6; 1/2 | |
library(stringr) | |
library(convertr) | |
the_empire_strikes_back <- function(x){ | |
# somewhere to store everything | |
x_new <- matrix(0, nrow=length(x), ncol=2) | |
# kill the empty quotes | |
x[x==""] <- NA | |
x[x==" "] <- NA | |
# get rid of trailing, preceeding whitespace | |
x <- trimws(x) | |
# clip other trailing weirdness if we can | |
x <- sub("^(.+)\".*$", "\\1", x) | |
x <- sub("^(.+'\\s+.+\").+$", "\\1", x) | |
x <- sub("(.*)[\\+\\?\\-]$", "\\1", x) | |
x <- sub("(.*)[:alpha:]+$", "\\1", x) | |
# get rid of trailing, preceeding whitespace, again! | |
x <- trimws(x) | |
# some people think '' is the same as ' | |
x <- sub("^(\\d+)\\s?''", "\\1'", x) | |
# some people think . is the same as ' | |
x <- sub("^(\\d+)\\s?\\.", "\\1'", x) | |
# some people think - is the same as ' | |
x <- sub("^(\\d+)\\s?\\-", "\\1'", x) | |
# some people think ; is the same as ' | |
x <- sub("^(\\d+)\\s?;", "\\1'", x) | |
# some people think - is the same as ' | |
x <- sub("(.*)`$", "\\1\"", x) | |
# extra spaces mid word | |
x <- sub("\\s\\s+", " ", x) | |
# single numbers (e.g., "5") | |
ind <- grepl("^\\d+$", x) | |
x_new[ind, 1] <- as.numeric(x[ind]) | |
# quote as foot symbol (e.g., "5'") | |
ind <- grepl("^\\d+'$", x) | |
x_new[ind, 1] <- as.numeric(sub("'", "", x[ind])) | |
# quote as foot symbol with space (e.g., "5 '") | |
ind <- grepl("^\\d+ '$", x) | |
x_new[ind, 1] <- as.numeric(sub(" '", "", x[ind])) | |
# feet and inches with ' and no " (e.g., "5'5" or "5'5.25") | |
ind <- grepl("^\\d+\\s?'\\s?\\d+(.\\d+)?$", x) | |
x_new[ind, ] <- str_split_fixed(x[ind], "'", 2) | |
# feet and inches with , (e.g., "5,5") | |
ind <- grepl("^\\d+,\\s?\\d+(.\\d+)?$", x) | |
x_new[ind, ] <- str_split_fixed(x[ind], ",", 2) | |
# just a space and no " or ' etc | |
ind <- grepl("^\\d+ \\d+$", x) | |
x_new[ind, ] <- str_split_fixed(x[ind], " ", 2) | |
# with space and fractional inches | |
ind <- grepl("^\\d+\\s?'\\s?\\d+ \\d+/\\d+\"?$", x) | |
ft <- sub("^(\\d+)\\s?'\\s?\\d+ \\d+/\\d+\"?$", "\\1", x[ind]) | |
inch <- sub("^\\d+\\s?'\\s?(\\d+) (\\d+)/(\\d+)\"?$", "\\1+\\2/\\3", x[ind]) | |
inch <- sapply(inch, function(y) eval(parse(text=y))) | |
if(sum(ind)>0){ | |
x_new[ind, ] <- cbind(ft, inch) | |
} | |
# with space and fractional inches, no whole inches | |
ind <- grepl("^\\d+' \\d+/\\d+$", x) | |
ft <- sub("^(\\d+)' \\d+/\\d+$", "\\1", x[ind]) | |
inch <- sub("^\\d+' (\\d+)/(\\d+)$", "\\1/\\2", x[ind]) | |
inch <- sapply(inch, function(y) eval(parse(text=y))) | |
if(sum(ind)>0){ | |
x_new[ind, ] <- cbind(ft, inch) | |
} | |
# feet and inches with ' and escaped " (e.g., "5'5\"" or "5'5.25\"") | |
ind <- grepl("^\\d+'\\s?\\d+(.\\d+)?\"$", x) | |
spl <- str_split_fixed(x[ind], "'", 2) | |
if(nrow(spl)!=0){ | |
x_new[ind, ] <- sub("\"\\s+", "", spl) | |
} | |
# feet ' and for some reason inches with ' (e.g., "5'4'") | |
ind <- grepl("^\\d+'\\d+(.\\d+)?'$", x) | |
spl <- str_split_fixed(x[ind], "'", 2) | |
if(nrow(spl)!=0){ | |
x_new[ind, ] <- sub("'", "", spl) | |
} | |
# no space and fractional inches (?!) ("5'61/2\"", or "5'61/2") | |
ind <- grepl("^\\d+'\\d+\\d+/\\d+(\")?$", x) | |
ft <- sub("^(\\d+)'\\d+\\d+/\\d+(\")?$", "\\1", x[ind]) | |
inch <- sub("^\\d+'(\\d+)(\\d+)/(\\d+)(\")?$", "\\1+\\2/\\3", x[ind]) | |
inch <- sapply(inch, function(y) eval(parse(text=y))) | |
if(sum(ind)>0){ | |
x_new[ind, ] <- cbind(ft, inch) | |
} | |
# no space and only fractional inches (?!) ("5'1/2") | |
# only one way to have this be non-ambiguous (no whole inches) | |
ind <- grepl("^\\d+'?\\s?\\d/\\d+(\")?$", x) | |
ft <- sub("^(\\d+)'?\\s?\\d/\\d+(\")?$", "\\1", x[ind]) | |
inch <- sub("^\\d+'?\\s?(\\d)/(\\d+)(\")?$", "\\1/\\2", x[ind]) | |
inch <- sapply(inch, function(y) eval(parse(text=y))) | |
if(sum(ind)>0){ | |
x_new[ind, ] <- cbind(ft, inch) | |
} | |
# feet and inches with ft and in etc | |
ind <- grepl("^\\d+.?ft.? \\d+.?in.?$", x) | |
ft <- sub("^(\\d+).?ft.? \\d+.?in.?$", "\\1", x[ind]) | |
inch <- sub("^\\d+.?ft.? (\\d+).?in.?$", "\\1", x[ind]) | |
if(sum(ind)>0){ | |
x_new[ind, ] <- cbind(ft, inch) | |
} | |
## horrible corner cases | |
# something like "5'4 3'4" which we'll interpret as 5' 4 3/4 | |
ind <- grepl("^\\d+'\\s?\\d+ \\d+'\\d+$", x) | |
ft <- sub("^(\\d+)'\\s?\\d+ \\d+'\\d+$", "\\1", x[ind]) | |
inch <- sub("^\\d+'\\s?(\\d+) (\\d+)'(\\d+)$", "\\1+\\2/\\3", x[ind]) | |
inch <- sapply(inch, function(y) eval(parse(text=y))) | |
if(sum(ind)>0){ | |
x_new[ind, ] <- cbind(ft, inch) | |
} | |
# something like "5'5' 1/2" which we'll interpret as 5' 5 1/2 | |
ind <- grepl("^\\d+'\\d+' \\d+/\\d+$", x) | |
ft <- sub("^(\\d+)'\\d+' \\d+/\\d+$", "\\1", x[ind]) | |
inch <- sub("^\\d+'(\\d+)' (\\d+)/(\\d+)$", "\\1+\\2/\\3", x[ind]) | |
inch <- sapply(inch, function(y) eval(parse(text=y))) | |
if(sum(ind)>0){ | |
x_new[ind, ] <- cbind(ft, inch) | |
} | |
mode(x_new) <- "numeric" | |
# which didn't parse | |
nogo <- apply(x_new, 1, function(x) any(is.na(x))) | | |
apply(x_new, 1, function(x) all(x==0)) | |
if(sum(nogo)>0){ | |
warning(paste(sum(nogo), "measurements failed to parse.")) | |
x_new[nogo,] <- NA | |
} | |
feet <- convert(x_new[, 1], "ft", "m") | |
inches <- convert(x_new[, 2], "in", "m") | |
return(feet+inches) | |
} | |
testers <- c("5", "5'", "5'5", "5'7 1/2", "5'7\"", "5'8.5", "5'61/2\"", "6'0 1/2", "5'4 1/2 ?", "5'6.25", "4'0", "5-7.5", "5'91/2\" Lt", "5'5-7 1/2", "5'7 1/2l", "5'9'", "5'109", "6'-", "5 '", "5'8+", "5'+", "5' 5\" (2)", " 5'?", "5 ft. 6in.", "5ft' 7 in.", "5 ft. 8 in", "5 ft 4 in.", "5 ft. 4 in.", "5'4'5", "5,' 0", " 5'8", "5'5t 1/4", "5''11 1/2", "6 ' 11/2", "5-8.5", "6; 1/2") | |
tested <- the_empire_strikes_back(testers) |
From revision 2:
> source("the_empire_strikes_back.R")
Warning message:
In the_empire_strikes_back(testers) : 4 measurements failed to parse.
> cbind.data.frame(testers, tested)
testers 1 2
1 5 5 0.00
2 5' 5 0.00
3 5'5 5 5.00
4 5'7 1/2 5 7.50
5 5'7" 5 7.00
6 5'8.5 5 8.50
7 5'61/2" 5 6.50
8 6'0 1/2 6 0.50
9 5'4 1/2 ? NA NA
10 5'6.25 5 6.25
11 4'0 4 0.00
12 5-7.5 NA NA
13 5'91/2" Lt 5 9.50
14 5'5-7 1/2 NA NA
15 5'7 1/2l NA NA
16 5'9' 5 9.00
17 5'109 5 109.00
18 6'- 6 0.00
19 5 ' 5 0.00
20 5'8+ 5 8.00
21 5'+ 5 0.00
22 5' 5" (2) 5 5.00
23 5'? 5 0.00
24 5 ft. 6in. 5 6.00
25 5ft' 7 in. 5 7.00
For revision 3:
> source("the_empire_strikes_back.R")
> cbind.data.frame(testers, tested)
testers tested
1 5 1.52400
2 5' 1.52400
3 5'5 1.65100
4 5'7 1/2 1.71450
5 5'7" 1.70180
6 5'8.5 1.73990
7 5'61/2" 1.68910
8 6'0 1/2 1.84150
9 5'4 1/2 ? 1.63830
10 5'6.25 1.68275
11 4'0 1.21920
12 5-7.5 1.71450
13 5'91/2" Lt 1.76530
14 5'5-7 1/2 NA
15 5'7 1/2l 1.71450
16 5'9' 1.75260
17 5'109 4.29260
18 6'- 1.82880
19 5 ' 1.52400
20 5'8+ 1.72720
21 5'+ 1.52400
22 5' 5" (2) 1.65100
23 5'? 1.52400
24 5 ft. 6in. 1.67640
25 5ft' 7 in. 1.70180
26 5 ft. 8 in 1.72720
27 5 ft 4 in. 1.62560
28 5 ft. 4 in. 1.62560
29 5'4'5 NA
30 5,' 0 NA
31 5'8 1.72720
32 5'5t 1/4 NA
33 5''11 1/2 1.81610
34 6 ' 11/2 NA
35 5-8.5 1.73990
36 6; 1/2 1.84150
I was planned to comment last night that it probably better to clean unimportant characters (e.g. ? + - ) first and then to consider different combinations. Glad to see that you have already done this!
Revised version:
This misses a few edge cases like (#14, where I guess they reported a range of possible sizes?), but it's close. And short.
library(dplyr)
library(stringr)
library(purrr)
library(tidyr)
library(convertr)
x %>%
map(strsplit, "[^[:digit:]/\\.]+") %>%
map(1) %>%
map(str_replace_all, "1/2", ".5") %>%
map(str_replace_all, "1/4", ".25") %>%
map(str_subset, "[:digit:]") %>%
map(as.numeric) %>%
map(~c(.x, rep(0, 4 - length(.x)))) %>%
map(~data_frame(feet = .x[1], inches = .x[2] + .x[3])) %>%
bind_rows() %>%
mutate(inches = ifelse(is.na(inches), 0, inches)) %>%
mutate(metric = convert(feet, "ft", "m") + convert(inches, "in", "m"))
(Revised again)
This is my attempt. The test case file is from David's examples
#crew = read.csv("crewlist.csv", stringsAsFactors = FALSE)
#nrow(crew)
#names(crew)
library(stringr)
im_your_father_luke = function(h){
## there is a lot of code dealing with NAs here because the example data set had them
## it would probably be cleaner if we didn't bother
## clean out a bunch of initial guff
h = gsub("[\"\\-]", "", h)
h = gsub("^ *$", "", h)
h = gsub("^(5|6)'([1-9]|10|11)*([13]\\/[248]).*$", "\\1'\\2 \\3", h)
h = gsub("^5'5(7|9)(.*$)", "5'\\1\\2", h)
## This is the workhorse regexp
pattern = "^ *([1-7]) *(ft[.']?|'{1,2}|,|;|\\.)? *([0-9]{1,2})? *(['t]|in\\.*)?([ ]+(1\\/2|([12])\\/3|(1|3)\\/4|([15])\\/6|([1-7])\\/8|1\\/12)|(\\.?[0-9]{1,3}))?[ +\\?`l (2)]*$"
## produces a matrix with 11 columns
## column 2 should have the feet, column 4 should have the inches, columm 6 will have the fraction if there is one, column will have the decimal if there is one
m = str_match(h, pattern)
## helper function to evaluate the fractions
convertFracs = function(x){
if(is.na(x)){
return(NA)
}else{
return(25.4 * eval(parse(text=x)))
}
}
## covert the feet, inches and fraction into millimetres
mm = cbind(as.numeric(m[,2])*12*25.4, as.numeric(m[,4])*25.4 ,
apply(cbind(sapply(m[,6], convertFracs, USE.NAMES = FALSE), sapply(m[,11], convertFracs, USE.NAMES = FALSE)), 1, function(row){
if(all(is.na(row))){
return(NA)
}else if(all(!is.na(row))){
return(-1)
}else if(is.na(row[1]) & !is.na(row[2])){
return(row[2])
}else{
return(row[1])
}
}))
## appropriate columns to get a single figure
toMM = function(row){
if(all(is.na(row))){
return(NA)
}else if(!is.na(row[1]) & is.na(row[2]) & is.na(row[3])){
return(row[1])
}else if(!is.na(row[1]) & !is.na(row[2]) & is.na(row[3])){
return(row[1] + row[2])
}else if(!is.na(row[1]) & is.na(row[2]) & !is.na(row[3])){
return(row[1] + row[3])
}else{
return(sum(row))
}
}
return(data.frame(input = h, output = apply(mm, 1, toMM)))
}
raw,clean,ft,in,mm
"""5",5'0,5,0,1524
5',5'0,5,0,1524
5'5,5'5,5,5,1651
5'7 1/2,5'7 1/2,5,7.5,1714.5
5'7,5'7,5,7,1701.8
5'8.5,5'8.5,5,8.5,1739.9
5'612,5'6 1/2,5,6.5,1689.1
6'0 1/2,6'0 1/2,6,0.5,1841.5
5'4 1/2 ?,5'4 1/2 ?,5,4.5,1638.3
5'6.25,5'6.25,5,6.25,1682.75
4'0,4'0,4,0,1219.2
5-7.5,5'7.5,5,7.5,1714.5
"5'91/2\ Lt""",5'9 1/2,5,9.5,1765.3
5'5-7 1/2,5'7 1/2,5,7.5,1714.5
5'7 1/2l,5'7 1/2,5,7.5,1714.5
5'9',5'9,5,9,1752.6
5'109,5'10,5,10,1778
6'-,6'0,6,0,1828.8
5 ',5'0,5,0,1524
5'8+,5'8,5,8,1727.2
5'+,5'0,5,0,1524
"5' 5\ (2)""",5'5,5,5,1651
5'?,5'0,5,0,1524
5 ft. 6in.,5'6,5,6,1676.4
5ft' 7 in.,5'7,5,7,1701.8
5 ft. 8 in,5'8,5,8,1727.2
5 ft 4 in.,5'4,5,4,1625.6
5 ft. 4 in.,5'4',5,4,1625.6
5'4'5,5'4.5,5,4.5,1638.3
"5,' 0",5'0,5,0,1524
5'8,5'8,5,8,1727.2
5'5t 1/4,5'5 1/4,5,5.25,1657.35
5''11 1/2,5'11 1/2,5,11.5,1816.1
6 ' 11/2,6'0 1/2,6,0.5,1841.5
5-8.5,5'8.5,5,8.5,1739.9
6; 1/2,6'0 1/2,6,0.5,1841.5
This is how mine works. Fails on 4 cases where I think the answer is either hard to choose or it requires a bit more regexp tinkering
DLMTestCases = read.csv("DLMTest.csv", stringsAsFactors = FALSE)
mine = im_your_father_luke(DLMTestCases$raw)
fails = DLMTestCases[abs(DLMTestCases$mm - mine$output) > 0.001,]
fails
raw clean ft in. mm
7 5'612 5'6 1/2 5 6.5 1689.1
17 5'109 5'10 5 10.0 1778.0
29 5'4'5 5'4.5 5 4.5 1638.3
Latest version of my code here:
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
So far I get the following results: