Skip to content

Instantly share code, notes, and snippets.

@dirkschumacher

dirkschumacher/large.R

Last active Jan 6, 2020
Embed
What would you like to do?
``` r
partition <- function(groups_vector, n_shards) {
stopifnot(is.integer(groups_vector))
group_sizes <- sort(table(groups_vector), decreasing = TRUE)
n_groups <- length(group_sizes)
stopifnot(n_groups > n_shards)
assigned_shards <- rep(NA_integer_, length(group_sizes))
assigned_shards[1:n_shards] <- 1:n_shards
shard_sums <- as.vector(group_sizes[1:n_shards]) #drop names to avoid confusion
for (i in (n_shards + 1):n_groups) {
z <- which.min(shard_sums)
shard_sums[z] <- shard_sums[z] + group_sizes[i]
assigned_shards[i] <- z
}
return(list(
shard_sums = shard_sums,
assigned_shards = assigned_shards
))
}
library(ROI.plugin.glpk)
library(rmpk)
n_groups <- 200
groups_vector <- sample.int(n_groups, size = 3000000,
replace = TRUE, prob = runif(n_groups))
group_sizes <- sort(table(groups_vector), decreasing = TRUE)
n_groups <- length(group_sizes)
n_shards <- 10
# the idea is to minimize the maximum shard size - minimum shard size
# this uses GLPK. A solver like cbc or gurobi will perform better
model <- MIPModel(
ROI_solver(
solver = "glpk",
control = list(verbose = TRUE, presolve = TRUE, tm_limit = 60 * 1000) # let it run for max 60 seconds
)
)
x <- model$add_variable(i = 1:n_groups, s = 1:n_shards, type = "integer", lb = 0, ub = 1)
size <- model$add_variable(s = 1:n_shards, lb = 0, ub = sum(group_sizes))
max_size <- model$add_variable(lb = max(group_sizes), ub = sum(group_sizes), type = "integer")
min_size <- model$add_variable(lb = 0, ub = sum(group_sizes), type = "integer")
model$set_objective(max_size - min_size, sense = "min")
model$add_constraint(sum_expr(x[i, j], j = 1:n_shards) == 1, i = 1:n_groups)
model$add_constraint(sum_expr(x[i, j] * as.integer(group_sizes[i]), i = 1:n_groups) == size[j], j = 1:n_shards)
model$add_constraint(size[j] >= min_size, j = 1:n_shards)
model$add_constraint(size[j] <= max_size, j = 1:n_shards)
model$add_constraint(min_size <= max_size) # redundant, but maybe this constraint helps
model$set_bounds(x[1, 1], lb = 1) #WLOG we place the first item in the first bucket
model
#> MIP Model:
#> Variables: 2012
#> Constraints: 231
model$optimize()
#> <SOLVER MSG> ----
#> GLPK Simplex Optimizer, v4.65
#> 231 rows, 2012 columns, 4052 non-zeros
#> Preprocessing...
#> 230 rows, 2002 columns, 4032 non-zeros
#> Scaling...
#> A: min|aij| = 1.000e+00 max|aij| = 3.012e+04 ratio = 3.012e+04
#> GM: min|aij| = 8.649e-01 max|aij| = 1.156e+00 ratio = 1.337e+00
#> EQ: min|aij| = 8.043e-01 max|aij| = 1.000e+00 ratio = 1.243e+00
#> Constructing initial basis...
#> Size of triangular part is 230
#> 0: obj = 3.023300000e+04 inf = 9.447e+04 (1)
#> 58: obj = 1.500000000e+06 inf = 5.684e-13 (0)
#> * 380: obj = 5.820766091e-11 inf = 4.825e-12 (0) 2
#> OPTIMAL LP SOLUTION FOUND
#> GLPK Integer Optimizer, v4.65
#> 231 rows, 2012 columns, 4052 non-zeros
#> 2002 integer variables, 1999 of which are binary
#> Preprocessing...
#> 230 rows, 2002 columns, 4032 non-zeros
#> 1992 integer variables, 1990 of which are binary
#> Scaling...
#> A: min|aij| = 1.000e+00 max|aij| = 3.012e+04 ratio = 3.012e+04
#> GM: min|aij| = 8.649e-01 max|aij| = 1.156e+00 ratio = 1.337e+00
#> EQ: min|aij| = 8.043e-01 max|aij| = 1.000e+00 ratio = 1.243e+00
#> 2N: min|aij| = 5.000e-01 max|aij| = 1.610e+00 ratio = 3.220e+00
#> Constructing initial basis...
#> Size of triangular part is 230
#> Solving LP relaxation...
#> GLPK Simplex Optimizer, v4.65
#> 230 rows, 2002 columns, 4032 non-zeros
#> 380: obj = 3.023300000e+04 inf = 9.186e+04 (1)
#> 490: obj = 1.500000000e+06 inf = 0.000e+00 (0)
#> * 888: obj = -5.820766091e-11 inf = 2.617e-11 (0) 2
#> OPTIMAL LP SOLUTION FOUND
#> Integer optimization begins...
#> Long-step dual simplex will be used
#> + 888: mip = not found yet >= -inf (1; 0)
#> + 3837: >>>>> 9.820000000e+03 >= 0.000000000e+00 100.0% (403; 0)
#> + 6042: mip = 9.820000000e+03 >= 0.000000000e+00 100.0% (803; 8)
#> + 7066: >>>>> 7.345000000e+03 >= 0.000000000e+00 100.0% (993; 8)
#> + 10932: >>>>> 7.344000000e+03 >= 0.000000000e+00 100.0% (1588; 104)
#> + 13211: >>>>> 6.136000000e+03 >= 0.000000000e+00 100.0% (1967; 105)
#> + 14741: >>>>> 5.949000000e+03 >= 0.000000000e+00 100.0% (2221; 213)
#> + 17902: >>>>> 5.289000000e+03 >= 0.000000000e+00 100.0% (2773; 249)
#> + 21338: >>>>> 4.844000000e+03 >= 0.000000000e+00 100.0% (3222; 491)
#> + 23135: >>>>> 4.805000000e+03 >= 0.000000000e+00 100.0% (3492; 529)
#> + 31178: mip = 4.805000000e+03 >= 0.000000000e+00 100.0% (4288; 606)
#> + 33254: >>>>> 3.283000000e+03 >= 0.000000000e+00 100.0% (4540; 610)
#> + 36022: >>>>> 3.075000000e+03 >= 0.000000000e+00 100.0% (3892; 2653)
#> + 39175: >>>>> 2.790000000e+03 >= 0.000000000e+00 100.0% (4137; 2749)
#> + 43730: >>>>> 2.719000000e+03 >= 0.000000000e+00 100.0% (4602; 2955)
#> + 44966: >>>>> 2.614000000e+03 >= 0.000000000e+00 100.0% (4711; 3057)
#> + 50421: >>>>> 2.429000000e+03 >= 0.000000000e+00 100.0% (5428; 3243)
#> + 61983: mip = 2.429000000e+03 >= 0.000000000e+00 100.0% (6653; 3593)
#> + 68666: >>>>> 2.370000000e+03 >= 0.000000000e+00 100.0% (7358; 3637)
#> + 71196: >>>>> 2.285000000e+03 >= 0.000000000e+00 100.0% (7608; 3836)
#> + 86903: mip = 2.285000000e+03 >= 0.000000000e+00 100.0% (8749; 4470)
#> + 96369: >>>>> 2.153000000e+03 >= 0.000000000e+00 100.0% (9358; 4614)
#> +103764: >>>>> 2.010000000e+03 >= 0.000000000e+00 100.0% (9377; 5556)
#> +109622: >>>>> 1.628000000e+03 >= 0.000000000e+00 100.0% (9081; 7026)
#> Time used: 60.0 secs. Memory used: 10.5 Mb.
#> +118563: mip = 1.628000000e+03 >= 0.000000000e+00 100.0% (8484; 9437)
#> TIME LIMIT EXCEEDED; SEARCH TERMINATED
#> <!SOLVER MSG> ----
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
result <- model$get_variable_value(x[i, j]) %>%
filter(value > 0) %>%
group_by(j) %>%
summarise(shard_sums = sum(group_sizes[i]))
result
#> # A tibble: 10 x 2
#> j shard_sums
#> <int> <int>
#> 1 1 300160
#> 2 2 300797
#> 3 3 299700
#> 4 4 299874
#> 5 5 299169
#> 6 6 299950
#> 7 7 299202
#> 8 8 300288
#> 9 9 300678
#> 10 10 300182
max(result$shard_sums) - min(result$shard_sums)
#> [1] 1628
result2 <- partition(groups_vector, n_shards)
max(result2$shard_sums) - min(result2$shard_sums)
#> [1] 418
```
<sup>Created on 2020-01-06 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
``` r
partition <- function(groups_vector, n_shards) {
stopifnot(is.integer(groups_vector))
group_sizes <- sort(table(groups_vector), decreasing = TRUE)
n_groups <- length(group_sizes)
stopifnot(n_groups > n_shards)
assigned_shards <- rep(NA_integer_, length(group_sizes))
assigned_shards[1:n_shards] <- 1:n_shards
shard_sums <- as.vector(group_sizes[1:n_shards]) #drop names to avoid confusion
for (i in (n_shards + 1):n_groups) {
z <- which.min(shard_sums)
shard_sums[z] <- shard_sums[z] + group_sizes[i]
assigned_shards[i] <- z
}
return(list(
shard_sums = shard_sums,
assigned_shards = assigned_shards
))
}
library(ROI.plugin.glpk)
library(rmpk)
n_groups <- 10
groups_vector <- sample.int(n_groups, size = 3000000,
replace = TRUE, prob = runif(n_groups))
group_sizes <- sort(table(groups_vector), decreasing = TRUE)
n_groups <- length(group_sizes)
n_shards <- 4
# the idea is to minimize the maximum shard size - minimum shard size
# this uses GLPK. A solver like cbc or gurobi will perform better
model <- MIPModel(
ROI_solver(
solver = "glpk",
control = list(verbose = TRUE, presolve = TRUE, tm_limit = 60 * 1000) # let it run for max 60 seconds
)
)
x <- model$add_variable(i = 1:n_groups, s = 1:n_shards, type = "integer", lb = 0, ub = 1)
size <- model$add_variable(s = 1:n_shards, lb = 0, ub = sum(group_sizes))
max_size <- model$add_variable(lb = max(group_sizes), ub = sum(group_sizes), type = "integer")
min_size <- model$add_variable(lb = 0, ub = sum(group_sizes), type = "integer")
model$set_objective(max_size - min_size, sense = "min")
model$add_constraint(sum_expr(x[i, j], j = 1:n_shards) == 1, i = 1:n_groups)
model$add_constraint(sum_expr(x[i, j] * as.integer(group_sizes[i]), i = 1:n_groups) == size[j], j = 1:n_shards)
model$add_constraint(size[j] >= min_size, j = 1:n_shards)
model$add_constraint(size[j] <= max_size, j = 1:n_shards)
model$add_constraint(min_size <= max_size) # redundant, but maybe this constraint helps
model$set_bounds(x[1, 1], lb = 1) #WLOG we place the first item in the first bucket
model
#> MIP Model:
#> Variables: 46
#> Constraints: 23
model$optimize()
#> <SOLVER MSG> ----
#> GLPK Simplex Optimizer, v4.65
#> 23 rows, 46 columns, 102 non-zeros
#> Preprocessing...
#> 22 rows, 42 columns, 94 non-zeros
#> Scaling...
#> A: min|aij| = 1.000e+00 max|aij| = 5.214e+05 ratio = 5.214e+05
#> GM: min|aij| = 8.702e-01 max|aij| = 1.149e+00 ratio = 1.320e+00
#> EQ: min|aij| = 8.118e-01 max|aij| = 1.000e+00 ratio = 1.232e+00
#> Constructing initial basis...
#> Size of triangular part is 22
#> 0: obj = 7.045880000e+05 inf = 1.977e+04 (1)
#> 3: obj = 1.500000000e+06 inf = 1.592e-12 (0)
#> * 11: obj = 1.527951099e-10 inf = 3.865e-12 (0)
#> OPTIMAL LP SOLUTION FOUND
#> GLPK Integer Optimizer, v4.65
#> 23 rows, 46 columns, 102 non-zeros
#> 42 integer variables, 39 of which are binary
#> Preprocessing...
#> 22 rows, 42 columns, 94 non-zeros
#> 38 integer variables, 36 of which are binary
#> Scaling...
#> A: min|aij| = 1.000e+00 max|aij| = 5.214e+05 ratio = 5.214e+05
#> GM: min|aij| = 8.702e-01 max|aij| = 1.149e+00 ratio = 1.320e+00
#> EQ: min|aij| = 8.118e-01 max|aij| = 1.000e+00 ratio = 1.232e+00
#> 2N: min|aij| = 5.000e-01 max|aij| = 1.020e+00 ratio = 2.039e+00
#> Constructing initial basis...
#> Size of triangular part is 22
#> Solving LP relaxation...
#> GLPK Simplex Optimizer, v4.65
#> 22 rows, 42 columns, 94 non-zeros
#> 11: obj = 7.045880000e+05 inf = 2.486e+04 (1)
#> 24: obj = 7.500000000e+05 inf = 0.000e+00 (0)
#> * 25: obj = -2.182787284e-11 inf = 0.000e+00 (0)
#> OPTIMAL LP SOLUTION FOUND
#> Integer optimization begins...
#> Long-step dual simplex will be used
#> + 25: mip = not found yet >= -inf (1; 0)
#> + 71: >>>>> 9.974700000e+04 >= 0.000000000e+00 100.0% (12; 0)
#> + 271: >>>>> 8.218800000e+04 >= 0.000000000e+00 100.0% (31; 25)
#> + 634: >>>>> 8.084300000e+04 >= 0.000000000e+00 100.0% (34; 112)
#> + 645: >>>>> 7.816100000e+04 >= 0.000000000e+00 100.0% (33; 113)
#> + 658: >>>>> 6.506500000e+04 >= 0.000000000e+00 100.0% (31; 122)
#> + 1519: mip = 6.506500000e+04 >= tree is empty 0.0% (0; 579)
#> INTEGER OPTIMAL SOLUTION FOUND
#> <!SOLVER MSG> ----
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
result <- model$get_variable_value(x[i, j]) %>%
filter(value > 0) %>%
group_by(j) %>%
summarise(shard_sums = sum(group_sizes[i]))
result
#> # A tibble: 4 x 2
#> j shard_sums
#> <int> <int>
#> 1 1 704849
#> 2 2 769713
#> 3 3 769914
#> 4 4 755524
max(result$shard_sums) - min(result$shard_sums)
#> [1] 65065
result2 <- partition(groups_vector, n_shards)
max(result2$shard_sums) - min(result2$shard_sums)
#> [1] 145426
```
<sup>Created on 2020-01-06 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
``` r
partition <- function(groups_vector, n_shards) {
stopifnot(is.integer(groups_vector))
group_sizes <- sort(table(groups_vector), decreasing = TRUE)
n_groups <- length(group_sizes)
stopifnot(n_groups > n_shards)
assigned_shards <- rep(NA_integer_, length(group_sizes))
assigned_shards[1:n_shards] <- 1:n_shards
shard_sums <- as.vector(group_sizes[1:n_shards]) #drop names to avoid confusion
for (i in (n_shards + 1):n_groups) {
z <- which.min(shard_sums)
shard_sums[z] <- shard_sums[z] + group_sizes[i]
assigned_shards[i] <- z
}
return(list(
shard_sums = shard_sums,
assigned_shards = assigned_shards
))
}
library(ROI.plugin.glpk)
library(rmpk)
n_groups <- 30
groups_vector <- sample.int(n_groups, size = 3000000,
replace = TRUE, prob = runif(n_groups))
group_sizes <- sort(table(groups_vector), decreasing = TRUE)
n_groups <- length(group_sizes)
n_shards <- 4
# the idea is to minimize the maximum shard size - minimum shard size
# this uses GLPK. A solver like cbc or gurobi will perform better
model <- MIPModel(
ROI_solver(
solver = "glpk",
control = list(verbose = TRUE, presolve = TRUE, tm_limit = 60 * 1000) # let it run for max 60 seconds
)
)
x <- model$add_variable(i = 1:n_groups, s = 1:n_shards, type = "integer", lb = 0, ub = 1)
size <- model$add_variable(s = 1:n_shards, lb = 0, ub = sum(group_sizes))
max_size <- model$add_variable(lb = max(group_sizes), ub = sum(group_sizes), type = "integer")
min_size <- model$add_variable(lb = 0, ub = sum(group_sizes), type = "integer")
model$set_objective(max_size - min_size, sense = "min")
model$add_constraint(sum_expr(x[i, j], j = 1:n_shards) == 1, i = 1:n_groups)
model$add_constraint(sum_expr(x[i, j] * as.integer(group_sizes[i]), i = 1:n_groups) == size[j], j = 1:n_shards)
model$add_constraint(size[j] >= min_size, j = 1:n_shards)
model$add_constraint(size[j] <= max_size, j = 1:n_shards)
model$add_constraint(min_size <= max_size) # redundant, but maybe this constraint helps
model$set_bounds(x[1, 1], lb = 1) #WLOG we place the first item in the first bucket
model
#> MIP Model:
#> Variables: 126
#> Constraints: 43
model$optimize()
#> <SOLVER MSG> ----
#> GLPK Simplex Optimizer, v4.65
#> 43 rows, 126 columns, 262 non-zeros
#> Preprocessing...
#> 42 rows, 122 columns, 254 non-zeros
#> Scaling...
#> A: min|aij| = 1.000e+00 max|aij| = 1.682e+05 ratio = 1.682e+05
#> GM: min|aij| = 8.807e-01 max|aij| = 1.135e+00 ratio = 1.289e+00
#> EQ: min|aij| = 8.265e-01 max|aij| = 1.000e+00 ratio = 1.210e+00
#> Constructing initial basis...
#> Size of triangular part is 42
#> 0: obj = 1.719670000e+05 inf = 4.812e+04 (1)
#> 10: obj = 1.500000000e+06 inf = 4.547e-13 (0)
#> * 41: obj = -1.164153218e-10 inf = 8.185e-12 (0)
#> OPTIMAL LP SOLUTION FOUND
#> GLPK Integer Optimizer, v4.65
#> 43 rows, 126 columns, 262 non-zeros
#> 122 integer variables, 119 of which are binary
#> Preprocessing...
#> 42 rows, 122 columns, 254 non-zeros
#> 118 integer variables, 116 of which are binary
#> Scaling...
#> A: min|aij| = 1.000e+00 max|aij| = 1.682e+05 ratio = 1.682e+05
#> GM: min|aij| = 8.807e-01 max|aij| = 1.135e+00 ratio = 1.289e+00
#> EQ: min|aij| = 8.265e-01 max|aij| = 1.000e+00 ratio = 1.210e+00
#> 2N: min|aij| = 5.000e-01 max|aij| = 1.379e+00 ratio = 2.758e+00
#> Constructing initial basis...
#> Size of triangular part is 42
#> Solving LP relaxation...
#> GLPK Simplex Optimizer, v4.65
#> 42 rows, 122 columns, 254 non-zeros
#> 41: obj = 1.719670000e+05 inf = 4.150e+04 (1)
#> 84: obj = 7.500000000e+05 inf = 1.490e-12 (0)
#> * 85: obj = 0.000000000e+00 inf = 3.309e-12 (0)
#> OPTIMAL LP SOLUTION FOUND
#> Integer optimization begins...
#> Long-step dual simplex will be used
#> + 85: mip = not found yet >= -inf (1; 0)
#> + 184: >>>>> 2.992550000e+05 >= 0.000000000e+00 100.0% (47; 0)
#> + 290: >>>>> 5.442800000e+04 >= 0.000000000e+00 100.0% (74; 1)
#> + 429: >>>>> 4.106200000e+04 >= 0.000000000e+00 100.0% (83; 56)
#> + 472: >>>>> 2.203000000e+04 >= 0.000000000e+00 100.0% (90; 63)
#> + 849: >>>>> 1.090200000e+04 >= 0.000000000e+00 100.0% (149; 91)
#> + 904: >>>>> 1.061500000e+04 >= 0.000000000e+00 100.0% (137; 132)
#> + 944: >>>>> 6.573000000e+03 >= 0.000000000e+00 100.0% (144; 133)
#> + 1917: >>>>> 3.642000000e+03 >= 0.000000000e+00 100.0% (353; 229)
#> + 4042: >>>>> 3.157000000e+03 >= 0.000000000e+00 100.0% (744; 469)
#> + 10911: >>>>> 3.130000000e+03 >= 0.000000000e+00 100.0% (1772; 1263)
#> + 19040: >>>>> 2.786000000e+03 >= 0.000000000e+00 100.0% (2942; 2467)
#> + 21741: >>>>> 2.406000000e+03 >= 0.000000000e+00 100.0% (3234; 2932)
#> + 21751: >>>>> 1.028000000e+03 >= 0.000000000e+00 100.0% (3213; 2976)
#> + 43815: >>>>> 8.200000000e+02 >= 0.000000000e+00 100.0% (5930; 6872)
#> + 57324: mip = 8.200000000e+02 >= 0.000000000e+00 100.0% (7625; 9577)
#> + 67680: mip = 8.200000000e+02 >= 0.000000000e+00 100.0% (8619; 12191)
#> + 80564: mip = 8.200000000e+02 >= 0.000000000e+00 100.0% (10290; 14571)
#> + 82467: >>>>> 8.190000000e+02 >= 0.000000000e+00 100.0% (10488; 14922)
#> + 92317: mip = 8.190000000e+02 >= 0.000000000e+00 100.0% (11517; 17044)
#> + 97821: mip = 8.190000000e+02 >= 0.000000000e+00 100.0% (11911; 18573)
#> +103782: mip = 8.190000000e+02 >= 0.000000000e+00 100.0% (12730; 19794)
#> +111662: mip = 8.190000000e+02 >= 0.000000000e+00 100.0% (13657; 21064)
#> +118550: mip = 8.190000000e+02 >= 0.000000000e+00 100.0% (14214; 22540)
#> +124118: mip = 8.190000000e+02 >= 0.000000000e+00 100.0% (14728; 23485)
#> +127861: mip = 8.190000000e+02 >= 0.000000000e+00 100.0% (15077; 24495)
#> Time used: 60.0 secs. Memory used: 14.1 Mb.
#> +129644: mip = 8.190000000e+02 >= 0.000000000e+00 100.0% (15137; 25165)
#> TIME LIMIT EXCEEDED; SEARCH TERMINATED
#> <!SOLVER MSG> ----
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
result <- model$get_variable_value(x[i, j]) %>%
filter(value > 0) %>%
group_by(j) %>%
summarise(shard_sums = sum(group_sizes[i]))
result
#> # A tibble: 4 x 2
#> j shard_sums
#> <int> <int>
#> 1 1 749886
#> 2 2 749506
#> 3 3 750325
#> 4 4 750283
max(result$shard_sums) - min(result$shard_sums)
#> [1] 819
result2 <- partition(groups_vector, n_shards)
max(result2$shard_sums) - min(result2$shard_sums)
#> [1] 6884
```
<sup>Created on 2020-01-06 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.