Daniel Falbel 4/13/2019
In tf.data
In Python, the API for iterating over the elements of a
dataset is the following:
for x in ds_tensors:
print(x)
This works as expected since ds_tensors
is already an iterator
. In a
more complete example the iteration appears like this:
for x, y in train_dataset:
    # Optimize the model: compute the loss and gradients for this batch and
    # apply the gradients to the model's trainable variables.
    loss_value, grads = grad(model, x, y)
    optimizer.apply_gradients(zip(grads, model.trainable_variables),
                              global_step)

    # Track progress
    epoch_loss_avg(loss_value)  # add current batch loss

    # Leave the loop early once the batch loss drops below 1.
    if loss_value < 1:
        break

    # compare predicted label to actual label
    epoch_accuracy(tf.argmax(model(x), axis=1, output_type=tf.int32), y)
Before Eager Execution, the recommended way in python was something like:
iter = dataset.make_initializable_iterator()
el = iter.get_next()
with tf.Session() as sess:
sess.run(iter.initializer)
print(sess.run(el))
print(sess.run(el))
print(sess.run(el))
In R, before eager execution, the API was very similar to Python’s:
library(tensorflow)
library(tfdatasets)
# Disable eager execution so the session-based workflow below is used.
tf$compat$v1$disable_eager_execution()
# Build a shuffled dataset that is grouped into batches of 5.
dataset <- range_dataset(from = 1, to = 100) %>%
dataset_shuffle(20) %>%
dataset_batch(5)
# Create a one-shot iterator and the object representing its next batch.
iter <- make_iterator_one_shot(dataset)
next_batch <- iterator_get_next(iter)
# Evaluate the next batch inside a session.
sess <- tf$compat$v1$Session()
sess$run(next_batch)
With the introduction of eager execution we implemented the
until_out_of_range
function, which works like this:
library(tfdatasets)

dataset <- range_dataset(from = 1, to = 100) %>%
  dataset_shuffle(20) %>%
  dataset_batch(20)

# make_iterator_one_shot is deprecated in 2.0 and
# the dataset is already an iterator.
iter <- make_iterator_one_shot(dataset)

# The iterator has to be created above and advanced by hand inside the block.
until_out_of_range({
  batch <- iterator_get_next(iter)
  str(batch)
})
There are 3 issues with this approach:
- In large code chunks it can be hard to find out what iterator is running.
- We have to define the iterator before, and also get the next element inside the loop. It’s easy to make mistakes.
- Does not allow destructuring assignment, which allows very elegant code.
# Build a shuffled dataset grouped into batches of 20 and an iterator over it.
dataset <- range_dataset(from = 1, to = 100) %>%
dataset_shuffle(20) %>%
dataset_batch(20)
iter <- make_iterator_one_shot(dataset)
# Collect batches into the list `x`, using `i` as the next free position.
x <- list()
i <- 1
until_out_of_range({
batch <- iterator_get_next(iter)
x[[i]] <- batch
i <- i + 1
# NOTE(review): this `break` is not lexically inside an R loop construct;
# presumably the example illustrates how early exit behaves with
# until_out_of_range -- confirm the intended outcome.
break
})
# Show what was collected.
x
for_each_batch
solves 2 issues with until_out_of_range
:
- The first argument is the dataset you are iterating over.
- The iterator is created and advanced inside the function.
#' Evaluate an expression once for every batch of a dataset
#'
#' An iterator is created from `x` and advanced until it returns `NULL`.
#' Before each evaluation the current batch is bound in the environment
#' captured with `expr`: always as `.x`, and additionally under each element
#' name when the batch is a list whose elements are all named. `expr` is
#' evaluated in that same environment, so assignments made inside it remain
#' visible there after the call.
#'
#' Using `break` inside `expr` stops the iteration early. Any other error
#' raised while iterating is re-signalled.
#'
#' @param x An object that `reticulate::as_iterator()` can iterate over.
#' @param expr Code to evaluate for each batch.
#' @return `NULL`, invisibly.
for_each_batch <- function(x, expr) {
  # `expr` is evaluated with eval(), i.e. outside this function's own loop, so
  # a `break` in it raises an error. Capture that error once so the handler
  # below can tell a requested early exit apart from genuine failures.
  break_error <- tryCatch(eval(quote(break)), error = function(e) e)

  it <- reticulate::as_iterator(x)
  nxt <- reticulate::iter_next(it)

  # The quosure's expression and environment do not change between batches.
  en_expr <- rlang::enquo(expr)
  expr_env <- rlang::get_env(en_expr)
  expr_code <- rlang::get_expr(en_expr)

  tryCatch({
    while (!is.null(nxt)) {
      nms <- names(nxt)
      # Elements are only bound by name when every one has a usable name;
      # a missing name shows up as "" (or NA), never as NULL.
      fully_named <- is.list(nxt) && !is.null(nms) &&
        !anyNA(nms) && all(nzchar(nms))

      if (fully_named) {
        bindings <- append(nxt, list(.x = nxt))
      } else {
        bindings <- list(.x = nxt)
      }

      rlang::env_bind(expr_env, !!!bindings)
      eval(expr_code, expr_env)
      nxt <- reticulate::iter_next(it)
    }
  },
  error = function(e) {
    # Swallow only the error produced by `break`; re-signal everything else.
    if (!identical(conditionMessage(e), conditionMessage(break_error))) {
      stop(e)
    }
  })

  invisible(NULL)
}
library(tfdatasets)
# Base dataset: shuffled and grouped into batches of 25.
d <- range_dataset(from = 1, to = 100) %>%
dataset_shuffle(100) %>%
dataset_batch(25)
# d1 zips the base dataset with itself as a named list (elements x and y).
d1 <- tf$data$Dataset$zip(list(x = d, y = d))
# d2 zips the base dataset with itself as an unnamed tuple.
d2 <- tf$data$Dataset$zip(reticulate::tuple(d, d))
# refer to variables by name (if the dataset is named)
for_each_batch(d1, {
print(x + y)
})
## tf.Tensor(
## [ 74 52 110 188 14 46 4 154 146 152 124 178 80 192 130 42 138 150
## 26 102 184 62 196 106 120], shape=(25,), dtype=int64)
## tf.Tensor(
## [144 136 50 70 132 186 54 140 44 92 182 18 76 30 2 12 20 34
## 170 116 6 134 168 16 194], shape=(25,), dtype=int64)
## tf.Tensor(
## [180 28 158 174 118 96 10 198 176 36 108 84 40 86 172 156 66 100
## 82 128 48 60 32 114 126], shape=(25,), dtype=int64)
## tf.Tensor(
## [190 142 164 122 104 8 78 22 38 90 58 112 72 162 88 98 64 24
## 166 56 94 68 160 148], shape=(24,), dtype=int64)
# refer to next batch by .x
for_each_batch(d1, {
print(.x$x + .x$y)
})
## tf.Tensor(
## [ 74 52 110 188 14 46 4 154 146 152 124 178 80 192 130 42 138 150
## 26 102 184 62 196 106 120], shape=(25,), dtype=int64)
## tf.Tensor(
## [144 136 50 70 132 186 54 140 44 92 182 18 76 30 2 12 20 34
## 170 116 6 134 168 16 194], shape=(25,), dtype=int64)
## tf.Tensor(
## [180 28 158 174 118 96 10 198 176 36 108 84 40 86 172 156 66 100
## 82 128 48 60 32 114 126], shape=(25,), dtype=int64)
## tf.Tensor(
## [190 142 164 122 104 8 78 22 38 90 58 112 72 162 88 98 64 24
## 166 56 94 68 160 148], shape=(24,), dtype=int64)
# refer to next batch by .x if unnamed
for_each_batch(d2, {
print(.x[[1]] + .x[[2]])
})
## tf.Tensor(
## [ 74 52 110 188 14 46 4 154 146 152 124 178 80 192 130 42 138 150
## 26 102 184 62 196 106 120], shape=(25,), dtype=int64)
## tf.Tensor(
## [144 136 50 70 132 186 54 140 44 92 182 18 76 30 2 12 20 34
## 170 116 6 134 168 16 194], shape=(25,), dtype=int64)
## tf.Tensor(
## [180 28 158 174 118 96 10 198 176 36 108 84 40 86 172 156 66 100
## 82 128 48 60 32 114 126], shape=(25,), dtype=int64)
## tf.Tensor(
## [190 142 164 122 104 8 78 22 38 90 58 112 72 162 88 98 64 24
## 166 56 94 68 160 148], shape=(24,), dtype=int64)
# assignments made inside the block persist in the calling environment
i <- 1
for_each_batch(d2, {
print(.x[[1]] + .x[[2]])
i <- i + 1
})
## tf.Tensor(
## [ 74 52 110 188 14 46 4 154 146 152 124 178 80 192 130 42 138 150
## 26 102 184 62 196 106 120], shape=(25,), dtype=int64)
## tf.Tensor(
## [144 136 50 70 132 186 54 140 44 92 182 18 76 30 2 12 20 34
## 170 116 6 134 168 16 194], shape=(25,), dtype=int64)
## tf.Tensor(
## [180 28 158 174 118 96 10 198 176 36 108 84 40 86 172 156 66 100
## 82 128 48 60 32 114 126], shape=(25,), dtype=int64)
## tf.Tensor(
## [190 142 164 122 104 8 78 22 38 90 58 112 72 162 88 98 64 24
## 166 56 94 68 160 148], shape=(24,), dtype=int64)
i
## [1] 5
# devtools::install_github("lionel-/flowery")
library(flowery)
library(tfdatasets)
d <- range_dataset(from = 1, to = 100) %>%
dataset_shuffle(100) %>%
dataset_batch(25)
d1 <- tf$data$Dataset$zip(list(x = d, y = d))
d2 <- tf$data$Dataset$zip(reticulate::tuple(d, d))
# Wrap `x` in a flowery generator that yields its elements one at a time.
# The elements come from a reticulate iterator created over `x`; the loop
# ends when reticulate::iter_next() returns NULL.
# NOTE(review): this function has the same name as reticulate::as_iterator(),
# which is presumably why the inner call is namespace-qualified -- confirm.
as_iterator <- function(x) {
generator({
# Create the underlying iterator and fetch the first element.
it <- reticulate::as_iterator(x)
nxt <- reticulate::iter_next(it)
while (!is.null(nxt)) {
# Hand the current element to the consumer, then advance.
yield(nxt)
nxt <- reticulate::iter_next(it)
}
})
}
# use for syntax to iterate over the dataset
iter <- as_iterator(d1)
iterate(for(x in iter) {
print(x$x + x$y)
})
## tf.Tensor(
## [ 90 140 114 138 10 152 130 6 188 82 162 142 146 182 60 124 54 56
## 94 74 4 66 136 134 194], shape=(25,), dtype=int64)
## tf.Tensor(
## [ 72 110 184 148 100 14 178 34 36 160 62 28 164 22 30 20 106 168
## 76 176 172 32 50 42 180], shape=(25,), dtype=int64)
## tf.Tensor(
## [ 58 44 84 46 92 38 96 186 120 154 102 40 70 116 126 12 170 196
## 156 68 2 108 64 88 86], shape=(25,), dtype=int64)
## tf.Tensor(
## [190 158 144 198 166 174 24 80 132 26 48 98 112 118 150 16 8 104
## 192 52 18 128 78 122], shape=(24,), dtype=int64)
## NULL
# get variables by position
iter <- as_iterator(d1)
iterate(for(x in iter) {
print(x[[1]] + x[[2]])
})
## tf.Tensor(
## [ 90 140 114 138 10 152 130 6 188 82 162 142 146 182 60 124 54 56
## 94 74 4 66 136 134 194], shape=(25,), dtype=int64)
## tf.Tensor(
## [ 72 110 184 148 100 14 178 34 36 160 62 28 164 22 30 20 106 168
## 76 176 172 32 50 42 180], shape=(25,), dtype=int64)
## tf.Tensor(
## [ 58 44 84 46 92 38 96 186 120 154 102 40 70 116 126 12 170 196
## 156 68 2 108 64 88 86], shape=(25,), dtype=int64)
## tf.Tensor(
## [190 158 144 198 166 174 24 80 132 26 48 98 112 118 150 16 8 104
## 192 52 18 128 78 122], shape=(24,), dtype=int64)
## NULL
# assignments inside the loop body update variables defined outside it
i <- 1
iter <- as_iterator(d1)
iterate(for(x in iter) {
print(x[[1]] + x[[2]])
i <- i + 1
})
## tf.Tensor(
## [ 90 140 114 138 10 152 130 6 188 82 162 142 146 182 60 124 54 56
## 94 74 4 66 136 134 194], shape=(25,), dtype=int64)
## tf.Tensor(
## [ 72 110 184 148 100 14 178 34 36 160 62 28 164 22 30 20 106 168
## 76 176 172 32 50 42 180], shape=(25,), dtype=int64)
## tf.Tensor(
## [ 58 44 84 46 92 38 96 186 120 154 102 40 70 116 126 12 170 196
## 156 68 2 108 64 88 86], shape=(25,), dtype=int64)
## tf.Tensor(
## [190 158 144 198 166 174 24 80 132 26 48 98 112 118 150 16 8 104
## 192 52 18 128 78 122], shape=(24,), dtype=int64)
## NULL
i
## [1] 5
# The generator can also be called by hand, which gives full control over
# when the next batch is requested and when the loop ends.
iter <- as_iterator(d1)
i <- 1
repeat {
  if (i %% 2 == 0) {}
  next_batch <- iter()
  # Stop after ten batches or as soon as the generator returns NULL.
  finished <- i > 10 || is.null(next_batch)
  if (finished) {
    break
  }
  print(next_batch)
  i <- i + 1
}
## $y
## tf.Tensor(
## [45 70 57 69 5 76 65 3 94 41 81 71 73 91 30 62 27 28 47 37 2 33 68 67
## 97], shape=(25,), dtype=int64)
##
## $x
## tf.Tensor(
## [45 70 57 69 5 76 65 3 94 41 81 71 73 91 30 62 27 28 47 37 2 33 68 67
## 97], shape=(25,), dtype=int64)
##
## $y
## tf.Tensor(
## [36 55 92 74 50 7 89 17 18 80 31 14 82 11 15 10 53 84 38 88 86 16 25 21
## 90], shape=(25,), dtype=int64)
##
## $x
## tf.Tensor(
## [36 55 92 74 50 7 89 17 18 80 31 14 82 11 15 10 53 84 38 88 86 16 25 21
## 90], shape=(25,), dtype=int64)
##
## $y
## tf.Tensor(
## [29 22 42 23 46 19 48 93 60 77 51 20 35 58 63 6 85 98 78 34 1 54 32 44
## 43], shape=(25,), dtype=int64)
##
## $x
## tf.Tensor(
## [29 22 42 23 46 19 48 93 60 77 51 20 35 58 63 6 85 98 78 34 1 54 32 44
## 43], shape=(25,), dtype=int64)
##
## $y
## tf.Tensor([95 79 72 99 83 87 12 40 66 13 24 49 56 59 75 8 4 52 96 26 9 64 39 61], shape=(24,), dtype=int64)
##
## $x
## tf.Tensor([95 79 72 99 83 87 12 40 66 13 24 49 56 59 75 8 4 52 96 26 9 64 39 61], shape=(24,), dtype=int64)