franzbischoff/fluss_score_tests.R

## fluss_score_tests.R
# Create the GT and predictions
set.seed(2022)
ts_length <- 1000
n_points <- 10
truth <- sort(round(runif(n_points, 1, ts_length), 0))
min_diff <- min(abs(diff(truth)))
mask <- runif(n_points) > 0.5
reported_equal <- truth + round(runif(n_points, -min_diff / 2, min_diff / 2), 0)
reported_half <- reported_equal[mask]
reported_half_rand <- sort(round(runif(floor(n_points / 2), 1, ts_length), 0))
reported_dbl <- unique(sort(c(reported_equal, round(runif(n_points, 1, ts_length), 0))))
reported_dbl_rand <- sort(round(runif(floor(n_points * 2), 1, ts_length), 0))
reported_one <- reported_equal[floor(n_points / 2)]
reported_all <- seq.int(1, ts_length, 1)

score_regimes1 <- function(gtruth, reported, data_size) {
  n <- length(gtruth)
  m <- length(reported)
  minv <- rep(Inf, n)
  for (j in 1:n) {
    for (i in 1:m) {
      if (abs(reported[i] - gtruth[j]) < abs(minv[j])) {
        minv[j] <- abs(reported[i] - gtruth[j])
      }
    }
  }

  o <- max(n, m)
  h <- min(n, m)

  score <- sum(minv) / data_size
  score1 <- sum(minv) / m / data_size
  score2 <- sum(minv) * m / data_size
  score_max1 <- sum(minv) / o / data_size
  score_max2 <- sum(minv) * o / data_size
  score_min1 <- sum(minv) / h / data_size
  score_min2 <- sum(minv) * h / data_size
  return(round(c(
    score = score,
    score1 = score1,
    score2 = score2,
    score_max1 = score_max1,
    score_max2 = score_max2,
    score_min1 = score_min1,
    score_min2 = score_min2
  ), 4))
}

score_regimes2 <- function(gtruth, reported, data_size) {
  m <- length(gtruth)
  n <- length(reported)

  out <- max(m, n)
  inn <- min(m, n)

  if (out == m) {
    outer <- sort(gtruth)
    inner <- sort(reported)
  } else {
    outer <- sort(reported)
    inner <- sort(gtruth)
  }

  minv <- rep(Inf, out)

  for (j in seq.int(1, out)) {
    for (i in seq.int(1, inn)) {
      if (abs(inner[i] - outer[j]) < abs(minv[j])) {
        minv[j] <- abs(inner[i] - outer[j])
      }
    }
  }

  score <- sum(minv) / data_size
  score1 <- sum(minv) / n / data_size
  score2 <- sum(minv) * n / data_size
  score_max1 <- sum(minv) / out / data_size
  score_max2 <- sum(minv) * out / data_size
  score_min1 <- sum(minv) / inn / data_size
  score_min2 <- sum(minv) * inn / data_size

  return(round(c(
    nscore = score,
    nscore1 = score1,
    nscore2 = score2,
    nscore_max1 = score_max1,
    nscore_max2 = score_max2,
    nscore_min1 = score_min1,
    nscore_min2 = score_min2
  ), 4))
}

score_regimes <- score_regimes1

tr <- score_regimes(truth, truth, ts_length)
eq <- score_regimes(truth, reported_equal, ts_length)
hf <- score_regimes(truth, reported_half, ts_length)
hfr <- score_regimes(truth, reported_half_rand, ts_length)
db <- score_regimes(truth, reported_dbl, ts_length)
dbr <- score_regimes(truth, reported_dbl_rand, ts_length)
on <- score_regimes(truth, reported_one, ts_length)
al <- score_regimes(truth, reported_all, ts_length)

scores <- data.frame(truth = tr, equal = eq, dbl = db, half = hf, one = on, dbl_r = dbr, half_r = hfr, all = al)

score_regimes <- score_regimes2

tr <- score_regimes(truth, truth, ts_length)
eq <- score_regimes(truth, reported_equal, ts_length)
hf <- score_regimes(truth, reported_half, ts_length)
hfr <- score_regimes(truth, reported_half_rand, ts_length)
db <- score_regimes(truth, reported_dbl, ts_length)
dbr <- score_regimes(truth, reported_dbl_rand, ts_length)
on <- score_regimes(truth, reported_one, ts_length)
al <- score_regimes(truth, reported_all, ts_length)

scores2 <- data.frame(truth = tr, equal = eq, dbl = db, half = hf, one = on, dbl_r = dbr, half_r = hfr, all = al)

scores <- rbind(scores, scores2)

print(scores)

# Possible best algorithm:


score_regimes3 <- function(gtruth, reported, data_size) {
  m <- length(gtruth)
  n <- length(reported)

  out <- max(m, n)
  inn <- min(m, n)

  if (out == m) {
    outer <- sort(gtruth)
    inner <- sort(reported)
  } else {
    outer <- sort(reported)
    inner <- sort(gtruth)
  }

  minv <- rep(Inf, out)

  k <- 1

  for (j in seq.int(1, out)) {
    for (i in seq.int(k, inn)) {
      if (abs(inner[i] - outer[j]) <= minv[j]) {
        minv[j] <- abs(inner[i] - outer[j])
        k <- i # pruning, truth and reported must be sorted
      } else {
        break # pruning, truth and reported must be sorted
      }
    }
  }

  # score <- sum(minv) / data_size
  # score <- sum(minv) / n / data_size
  # score <- sum(minv) * n / data_size
  # score <- sum(minv) / out / data_size
  # score <- sum(minv) * out / data_size # *
  score <- sum(minv) / inn / data_size # *

  # max = sum(abs(seq.int(1, out)-1000))/1000

  return(score)
}

if (FALSE) {
  library(plotly)
  set.seed(2022)
  ts_length <- 2000
  n_points <- 100
  truth <- sort(round(runif(n_points, 1, ts_length), 0))

  score_mean <- NULL
  score_sd <- NULL

  x <- c(seq.int(1, floor(n_points * 2)), seq.int(floor(n_points * 2) + 1, ts_length, by = 10))

  for (i in x) {
    result <- NULL
    cli::cli_inform("Size {i}.")
    set.seed(2022)
    for (j in seq.int(1, ceiling(n_points / (log2(i) + 1)))) {
      # cli::cli_inform("Sample {j}.")
      guess <- sort(round(runif(i, 1, ts_length), 0))
      res <- score_regimes3(truth, guess, ts_length)
      result <- c(result, res)
    }

    score_mean <- c(score_mean, mean(result))
    score_sd <- c(score_sd, sd(result))
  }

  plot_ly(x = x, y = score_mean) #  / n_points / ts_length
}

if (FALSE) {
  library(plotly)
  p <- subplot(
    plot_ly() %>%
      add_segments(
        x = truth, y = 2,
        xend = reported_equal, yend = 1,
        color = I("#dddddd"), showlegend = FALSE
      ) %>%
      add_markers(
        x = truth, y = 2,
        color = I("blue"),
        name = "truth"
      ) %>%
      add_markers(
        x = reported_equal, y = 1,
        color = I("red"), name = "reported_equal"
      ),
    plot_ly() %>%
      add_segments(
        x = truth, y = 2,
        xend = reported_equal, yend = 1,
        color = I("#dddddd"), showlegend = FALSE
      ) %>%
      add_markers(
        x = truth, y = 2,
        color = I("blue"), showlegend = FALSE
      ) %>%
      add_markers(
        x = reported_half, y = 1,
        color = I("green"), name = "reported_half"
      ),
    plot_ly() %>%
      add_segments(
        x = truth, y = 2,
        xend = reported_equal, yend = 1,
        color = I("#dddddd"), showlegend = FALSE
      ) %>%
      add_markers(
        x = truth, y = 2,
        color = I("blue"), showlegend = FALSE
      ) %>%
      add_markers(
        x = reported_dbl, y = 1,
        color = I("#dceb08"), name = "reported_dbl"
      ),
    plot_ly() %>%
      add_segments(
        x = truth, y = 2,
        xend = reported_equal, yend = 1,
        color = I("#dddddd"), showlegend = FALSE
      ) %>%
      add_markers(
        x = truth, y = 2,
        color = I("blue"), showlegend = FALSE
      ) %>%
      add_markers(
        x = reported_one, y = 1,
        color = I("#e408eb"), name = "reported_one"
      ),
    shareX = TRUE, nrows = 4
  )
  print(p)
}


##### Proposed algorithm
# 1. Check which is larger, the gtruth or the predicted regimes.
# 2. Sort both indexes for allowin pruning.
# 3. The larger one is the outer loop.
# 4. The smaller one is the inner loop.
# 5. For each outer loop index, find the closest inner loop index.
# 6. If the distance is smaller than the current minimum, update the minimum.
# 7. The score is the sum of the minimums divided by the data_size multiplied by the size of the inner loop.

score_regimes_final <- function(gtruth, predicted, data_size) {
  m <- length(gtruth)
  n <- length(predicted)

  out <- max(m, n)
  inn <- min(m, n)

  if (out == m) {
    outer <- sort(gtruth)
    inner <- sort(predicted)
  } else {
    outer <- sort(predicted)
    inner <- sort(gtruth)
  }

  minv <- rep(Inf, out)

  k <- 1

  for (j in seq.int(1, out)) {
    for (i in seq.int(k, inn)) {
      if (abs(inner[i] - outer[j]) <= minv[j]) {
        minv[j] <- abs(inner[i] - outer[j])
        k <- i # pruning, truth and predicted must be sorted
      } else {
        break # pruning, truth and predicted must be sorted
      }
    }
  }

  score <- sum(minv) / (inn * data_size)
}
	# Create the GT and predictions
	set.seed(2022)
	ts_length <- 1000
	n_points <- 10
	truth <- sort(round(runif(n_points, 1, ts_length), 0))
	min_diff <- min(abs(diff(truth)))
	mask <- runif(n_points) > 0.5
	reported_equal <- truth + round(runif(n_points, -min_diff / 2, min_diff / 2), 0)
	reported_half <- reported_equal[mask]
	reported_half_rand <- sort(round(runif(floor(n_points / 2), 1, ts_length), 0))
	reported_dbl <- unique(sort(c(reported_equal, round(runif(n_points, 1, ts_length), 0))))
	reported_dbl_rand <- sort(round(runif(floor(n_points * 2), 1, ts_length), 0))
	reported_one <- reported_equal[floor(n_points / 2)]
	reported_all <- seq.int(1, ts_length, 1)

	score_regimes1 <- function(gtruth, reported, data_size) {
	n <- length(gtruth)
	m <- length(reported)
	minv <- rep(Inf, n)
	for (j in 1:n) {
	for (i in 1:m) {
	if (abs(reported[i] - gtruth[j]) < abs(minv[j])) {
	minv[j] <- abs(reported[i] - gtruth[j])
	}
	}
	}

	o <- max(n, m)
	h <- min(n, m)

	score <- sum(minv) / data_size
	score1 <- sum(minv) / m / data_size
	score2 <- sum(minv) * m / data_size
	score_max1 <- sum(minv) / o / data_size
	score_max2 <- sum(minv) * o / data_size
	score_min1 <- sum(minv) / h / data_size
	score_min2 <- sum(minv) * h / data_size
	return(round(c(
	score = score,
	score1 = score1,
	score2 = score2,
	score_max1 = score_max1,
	score_max2 = score_max2,
	score_min1 = score_min1,
	score_min2 = score_min2
	), 4))
	}

	score_regimes2 <- function(gtruth, reported, data_size) {
	m <- length(gtruth)
	n <- length(reported)

	out <- max(m, n)
	inn <- min(m, n)

	if (out == m) {
	outer <- sort(gtruth)
	inner <- sort(reported)
	} else {
	outer <- sort(reported)
	inner <- sort(gtruth)
	}

	minv <- rep(Inf, out)

	for (j in seq.int(1, out)) {
	for (i in seq.int(1, inn)) {
	if (abs(inner[i] - outer[j]) < abs(minv[j])) {
	minv[j] <- abs(inner[i] - outer[j])
	}
	}
	}

	score <- sum(minv) / data_size
	score1 <- sum(minv) / n / data_size
	score2 <- sum(minv) * n / data_size
	score_max1 <- sum(minv) / out / data_size
	score_max2 <- sum(minv) * out / data_size
	score_min1 <- sum(minv) / inn / data_size
	score_min2 <- sum(minv) * inn / data_size

	return(round(c(
	nscore = score,
	nscore1 = score1,
	nscore2 = score2,
	nscore_max1 = score_max1,
	nscore_max2 = score_max2,
	nscore_min1 = score_min1,
	nscore_min2 = score_min2
	), 4))
	}

	score_regimes <- score_regimes1

	tr <- score_regimes(truth, truth, ts_length)
	eq <- score_regimes(truth, reported_equal, ts_length)
	hf <- score_regimes(truth, reported_half, ts_length)
	hfr <- score_regimes(truth, reported_half_rand, ts_length)
	db <- score_regimes(truth, reported_dbl, ts_length)
	dbr <- score_regimes(truth, reported_dbl_rand, ts_length)
	on <- score_regimes(truth, reported_one, ts_length)
	al <- score_regimes(truth, reported_all, ts_length)

	scores <- data.frame(truth = tr, equal = eq, dbl = db, half = hf, one = on, dbl_r = dbr, half_r = hfr, all = al)

	score_regimes <- score_regimes2

	tr <- score_regimes(truth, truth, ts_length)
	eq <- score_regimes(truth, reported_equal, ts_length)
	hf <- score_regimes(truth, reported_half, ts_length)
	hfr <- score_regimes(truth, reported_half_rand, ts_length)
	db <- score_regimes(truth, reported_dbl, ts_length)
	dbr <- score_regimes(truth, reported_dbl_rand, ts_length)
	on <- score_regimes(truth, reported_one, ts_length)
	al <- score_regimes(truth, reported_all, ts_length)

	scores2 <- data.frame(truth = tr, equal = eq, dbl = db, half = hf, one = on, dbl_r = dbr, half_r = hfr, all = al)

	scores <- rbind(scores, scores2)

	print(scores)

	# Possible best algorithm:


	score_regimes3 <- function(gtruth, reported, data_size) {
	m <- length(gtruth)
	n <- length(reported)

	out <- max(m, n)
	inn <- min(m, n)

	if (out == m) {
	outer <- sort(gtruth)
	inner <- sort(reported)
	} else {
	outer <- sort(reported)
	inner <- sort(gtruth)
	}

	minv <- rep(Inf, out)

	k <- 1

	for (j in seq.int(1, out)) {
	for (i in seq.int(k, inn)) {
	if (abs(inner[i] - outer[j]) <= minv[j]) {
	minv[j] <- abs(inner[i] - outer[j])
	k <- i # pruning, truth and reported must be sorted
	} else {
	break # pruning, truth and reported must be sorted
	}
	}
	}

	# score <- sum(minv) / data_size
	# score <- sum(minv) / n / data_size
	# score <- sum(minv) * n / data_size
	# score <- sum(minv) / out / data_size
	# score <- sum(minv) * out / data_size # *
	score <- sum(minv) / inn / data_size # *

	# max = sum(abs(seq.int(1, out)-1000))/1000

	return(score)
	}

	if (FALSE) {
	library(plotly)
	set.seed(2022)
	ts_length <- 2000
	n_points <- 100
	truth <- sort(round(runif(n_points, 1, ts_length), 0))

	score_mean <- NULL
	score_sd <- NULL

	x <- c(seq.int(1, floor(n_points * 2)), seq.int(floor(n_points * 2) + 1, ts_length, by = 10))

	for (i in x) {
	result <- NULL
	cli::cli_inform("Size {i}.")
	set.seed(2022)
	for (j in seq.int(1, ceiling(n_points / (log2(i) + 1)))) {
	# cli::cli_inform("Sample {j}.")
	guess <- sort(round(runif(i, 1, ts_length), 0))
	res <- score_regimes3(truth, guess, ts_length)
	result <- c(result, res)
	}

	score_mean <- c(score_mean, mean(result))
	score_sd <- c(score_sd, sd(result))
	}

	plot_ly(x = x, y = score_mean) # / n_points / ts_length
	}

	if (FALSE) {
	library(plotly)
	p <- subplot(
	plot_ly() %>%
	add_segments(
	x = truth, y = 2,
	xend = reported_equal, yend = 1,
	color = I("#dddddd"), showlegend = FALSE
	) %>%
	add_markers(
	x = truth, y = 2,
	color = I("blue"),
	name = "truth"
	) %>%
	add_markers(
	x = reported_equal, y = 1,
	color = I("red"), name = "reported_equal"
	),
	plot_ly() %>%
	add_segments(
	x = truth, y = 2,
	xend = reported_equal, yend = 1,
	color = I("#dddddd"), showlegend = FALSE
	) %>%
	add_markers(
	x = truth, y = 2,
	color = I("blue"), showlegend = FALSE
	) %>%
	add_markers(
	x = reported_half, y = 1,
	color = I("green"), name = "reported_half"
	),
	plot_ly() %>%
	add_segments(
	x = truth, y = 2,
	xend = reported_equal, yend = 1,
	color = I("#dddddd"), showlegend = FALSE
	) %>%
	add_markers(
	x = truth, y = 2,
	color = I("blue"), showlegend = FALSE
	) %>%
	add_markers(
	x = reported_dbl, y = 1,
	color = I("#dceb08"), name = "reported_dbl"
	),
	plot_ly() %>%
	add_segments(
	x = truth, y = 2,
	xend = reported_equal, yend = 1,
	color = I("#dddddd"), showlegend = FALSE
	) %>%
	add_markers(
	x = truth, y = 2,
	color = I("blue"), showlegend = FALSE
	) %>%
	add_markers(
	x = reported_one, y = 1,
	color = I("#e408eb"), name = "reported_one"
	),
	shareX = TRUE, nrows = 4
	)
	print(p)
	}


	##### Proposed algorithm
	# 1. Check which is larger, the gtruth or the predicted regimes.
	# 2. Sort both indexes for allowin pruning.
	# 3. The larger one is the outer loop.
	# 4. The smaller one is the inner loop.
	# 5. For each outer loop index, find the closest inner loop index.
	# 6. If the distance is smaller than the current minimum, update the minimum.
	# 7. The score is the sum of the minimums divided by the data_size multiplied by the size of the inner loop.

	score_regimes_final <- function(gtruth, predicted, data_size) {
	m <- length(gtruth)
	n <- length(predicted)

	out <- max(m, n)
	inn <- min(m, n)

	if (out == m) {
	outer <- sort(gtruth)
	inner <- sort(predicted)
	} else {
	outer <- sort(predicted)
	inner <- sort(gtruth)
	}

	minv <- rep(Inf, out)

	k <- 1

	for (j in seq.int(1, out)) {
	for (i in seq.int(k, inn)) {
	if (abs(inner[i] - outer[j]) <= minv[j]) {
	minv[j] <- abs(inner[i] - outer[j])
	k <- i # pruning, truth and predicted must be sorted
	} else {
	break # pruning, truth and predicted must be sorted
	}
	}
	}

	score <- sum(minv) / (inn * data_size)
	}