Last active
April 25, 2022 20:36
-
-
Save franzbischoff/0d8a5a01a0451ddf866dc2f6bfe1d30f to your computer and use it in GitHub Desktop.
Experiment of FLUSS scores
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create the GT and predictions | |
set.seed(2022) | |
ts_length <- 1000 | |
n_points <- 10 | |
truth <- sort(round(runif(n_points, 1, ts_length), 0)) | |
min_diff <- min(abs(diff(truth))) | |
mask <- runif(n_points) > 0.5 | |
reported_equal <- truth + round(runif(n_points, -min_diff / 2, min_diff / 2), 0) | |
reported_half <- reported_equal[mask] | |
reported_half_rand <- sort(round(runif(floor(n_points / 2), 1, ts_length), 0)) | |
reported_dbl <- unique(sort(c(reported_equal, round(runif(n_points, 1, ts_length), 0)))) | |
reported_dbl_rand <- sort(round(runif(floor(n_points * 2), 1, ts_length), 0)) | |
reported_one <- reported_equal[floor(n_points / 2)] | |
reported_all <- seq.int(1, ts_length, 1) | |
score_regimes1 <- function(gtruth, reported, data_size) { | |
n <- length(gtruth) | |
m <- length(reported) | |
minv <- rep(Inf, n) | |
for (j in 1:n) { | |
for (i in 1:m) { | |
if (abs(reported[i] - gtruth[j]) < abs(minv[j])) { | |
minv[j] <- abs(reported[i] - gtruth[j]) | |
} | |
} | |
} | |
o <- max(n, m) | |
h <- min(n, m) | |
score <- sum(minv) / data_size | |
score1 <- sum(minv) / m / data_size | |
score2 <- sum(minv) * m / data_size | |
score_max1 <- sum(minv) / o / data_size | |
score_max2 <- sum(minv) * o / data_size | |
score_min1 <- sum(minv) / h / data_size | |
score_min2 <- sum(minv) * h / data_size | |
return(round(c( | |
score = score, | |
score1 = score1, | |
score2 = score2, | |
score_max1 = score_max1, | |
score_max2 = score_max2, | |
score_min1 = score_min1, | |
score_min2 = score_min2 | |
), 4)) | |
} | |
score_regimes2 <- function(gtruth, reported, data_size) { | |
m <- length(gtruth) | |
n <- length(reported) | |
out <- max(m, n) | |
inn <- min(m, n) | |
if (out == m) { | |
outer <- sort(gtruth) | |
inner <- sort(reported) | |
} else { | |
outer <- sort(reported) | |
inner <- sort(gtruth) | |
} | |
minv <- rep(Inf, out) | |
for (j in seq.int(1, out)) { | |
for (i in seq.int(1, inn)) { | |
if (abs(inner[i] - outer[j]) < abs(minv[j])) { | |
minv[j] <- abs(inner[i] - outer[j]) | |
} | |
} | |
} | |
score <- sum(minv) / data_size | |
score1 <- sum(minv) / n / data_size | |
score2 <- sum(minv) * n / data_size | |
score_max1 <- sum(minv) / out / data_size | |
score_max2 <- sum(minv) * out / data_size | |
score_min1 <- sum(minv) / inn / data_size | |
score_min2 <- sum(minv) * inn / data_size | |
return(round(c( | |
nscore = score, | |
nscore1 = score1, | |
nscore2 = score2, | |
nscore_max1 = score_max1, | |
nscore_max2 = score_max2, | |
nscore_min1 = score_min1, | |
nscore_min2 = score_min2 | |
), 4)) | |
} | |
score_regimes <- score_regimes1 | |
tr <- score_regimes(truth, truth, ts_length) | |
eq <- score_regimes(truth, reported_equal, ts_length) | |
hf <- score_regimes(truth, reported_half, ts_length) | |
hfr <- score_regimes(truth, reported_half_rand, ts_length) | |
db <- score_regimes(truth, reported_dbl, ts_length) | |
dbr <- score_regimes(truth, reported_dbl_rand, ts_length) | |
on <- score_regimes(truth, reported_one, ts_length) | |
al <- score_regimes(truth, reported_all, ts_length) | |
scores <- data.frame(truth = tr, equal = eq, dbl = db, half = hf, one = on, dbl_r = dbr, half_r = hfr, all = al) | |
score_regimes <- score_regimes2 | |
tr <- score_regimes(truth, truth, ts_length) | |
eq <- score_regimes(truth, reported_equal, ts_length) | |
hf <- score_regimes(truth, reported_half, ts_length) | |
hfr <- score_regimes(truth, reported_half_rand, ts_length) | |
db <- score_regimes(truth, reported_dbl, ts_length) | |
dbr <- score_regimes(truth, reported_dbl_rand, ts_length) | |
on <- score_regimes(truth, reported_one, ts_length) | |
al <- score_regimes(truth, reported_all, ts_length) | |
scores2 <- data.frame(truth = tr, equal = eq, dbl = db, half = hf, one = on, dbl_r = dbr, half_r = hfr, all = al) | |
scores <- rbind(scores, scores2) | |
print(scores) | |
# Possible best algorithm: | |
score_regimes3 <- function(gtruth, reported, data_size) { | |
m <- length(gtruth) | |
n <- length(reported) | |
out <- max(m, n) | |
inn <- min(m, n) | |
if (out == m) { | |
outer <- sort(gtruth) | |
inner <- sort(reported) | |
} else { | |
outer <- sort(reported) | |
inner <- sort(gtruth) | |
} | |
minv <- rep(Inf, out) | |
k <- 1 | |
for (j in seq.int(1, out)) { | |
for (i in seq.int(k, inn)) { | |
if (abs(inner[i] - outer[j]) <= minv[j]) { | |
minv[j] <- abs(inner[i] - outer[j]) | |
k <- i # pruning, truth and reported must be sorted | |
} else { | |
break # pruning, truth and reported must be sorted | |
} | |
} | |
} | |
# score <- sum(minv) / data_size | |
# score <- sum(minv) / n / data_size | |
# score <- sum(minv) * n / data_size | |
# score <- sum(minv) / out / data_size | |
# score <- sum(minv) * out / data_size # * | |
score <- sum(minv) / inn / data_size # * | |
# max = sum(abs(seq.int(1, out)-1000))/1000 | |
return(score) | |
} | |
if (FALSE) { | |
library(plotly) | |
set.seed(2022) | |
ts_length <- 2000 | |
n_points <- 100 | |
truth <- sort(round(runif(n_points, 1, ts_length), 0)) | |
score_mean <- NULL | |
score_sd <- NULL | |
x <- c(seq.int(1, floor(n_points * 2)), seq.int(floor(n_points * 2) + 1, ts_length, by = 10)) | |
for (i in x) { | |
result <- NULL | |
cli::cli_inform("Size {i}.") | |
set.seed(2022) | |
for (j in seq.int(1, ceiling(n_points / (log2(i) + 1)))) { | |
# cli::cli_inform("Sample {j}.") | |
guess <- sort(round(runif(i, 1, ts_length), 0)) | |
res <- score_regimes3(truth, guess, ts_length) | |
result <- c(result, res) | |
} | |
score_mean <- c(score_mean, mean(result)) | |
score_sd <- c(score_sd, sd(result)) | |
} | |
plot_ly(x = x, y = score_mean) # / n_points / ts_length | |
} | |
if (FALSE) { | |
library(plotly) | |
p <- subplot( | |
plot_ly() %>% | |
add_segments( | |
x = truth, y = 2, | |
xend = reported_equal, yend = 1, | |
color = I("#dddddd"), showlegend = FALSE | |
) %>% | |
add_markers( | |
x = truth, y = 2, | |
color = I("blue"), | |
name = "truth" | |
) %>% | |
add_markers( | |
x = reported_equal, y = 1, | |
color = I("red"), name = "reported_equal" | |
), | |
plot_ly() %>% | |
add_segments( | |
x = truth, y = 2, | |
xend = reported_equal, yend = 1, | |
color = I("#dddddd"), showlegend = FALSE | |
) %>% | |
add_markers( | |
x = truth, y = 2, | |
color = I("blue"), showlegend = FALSE | |
) %>% | |
add_markers( | |
x = reported_half, y = 1, | |
color = I("green"), name = "reported_half" | |
), | |
plot_ly() %>% | |
add_segments( | |
x = truth, y = 2, | |
xend = reported_equal, yend = 1, | |
color = I("#dddddd"), showlegend = FALSE | |
) %>% | |
add_markers( | |
x = truth, y = 2, | |
color = I("blue"), showlegend = FALSE | |
) %>% | |
add_markers( | |
x = reported_dbl, y = 1, | |
color = I("#dceb08"), name = "reported_dbl" | |
), | |
plot_ly() %>% | |
add_segments( | |
x = truth, y = 2, | |
xend = reported_equal, yend = 1, | |
color = I("#dddddd"), showlegend = FALSE | |
) %>% | |
add_markers( | |
x = truth, y = 2, | |
color = I("blue"), showlegend = FALSE | |
) %>% | |
add_markers( | |
x = reported_one, y = 1, | |
color = I("#e408eb"), name = "reported_one" | |
), | |
shareX = TRUE, nrows = 4 | |
) | |
print(p) | |
} | |
##### Proposed algorithm | |
# 1. Check which is larger, the gtruth or the predicted regimes. | |
# 2. Sort both indexes for allowin pruning. | |
# 3. The larger one is the outer loop. | |
# 4. The smaller one is the inner loop. | |
# 5. For each outer loop index, find the closest inner loop index. | |
# 6. If the distance is smaller than the current minimum, update the minimum. | |
# 7. The score is the sum of the minimums divided by the data_size multiplied by the size of the inner loop. | |
score_regimes_final <- function(gtruth, predicted, data_size) { | |
m <- length(gtruth) | |
n <- length(predicted) | |
out <- max(m, n) | |
inn <- min(m, n) | |
if (out == m) { | |
outer <- sort(gtruth) | |
inner <- sort(predicted) | |
} else { | |
outer <- sort(predicted) | |
inner <- sort(gtruth) | |
} | |
minv <- rep(Inf, out) | |
k <- 1 | |
for (j in seq.int(1, out)) { | |
for (i in seq.int(k, inn)) { | |
if (abs(inner[i] - outer[j]) <= minv[j]) { | |
minv[j] <- abs(inner[i] - outer[j]) | |
k <- i # pruning, truth and predicted must be sorted | |
} else { | |
break # pruning, truth and predicted must be sorted | |
} | |
} | |
} | |
score <- sum(minv) / (inn * data_size) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment