-
-
Save andrewhowdencom/7ae1d52e3130c159a0031aa7f3f05a18 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 86f653952d84b2eedf02e2644d837f4ce6bc7d6f Mon Sep 17 00:00:00 2001 | |
From: Andrew Howden <hello@andrewhowden.com> | |
Date: Wed, 27 Jan 2021 22:54:43 +0100 | |
Subject: [PATCH] Treat "no logs" as an error condition | |
MIME-Version: 1.0 | |
Content-Type: text/plain; charset=UTF-8 | |
Content-Transfer-Encoding: 8bit | |
Currently there appears to be a problem (At least in the sampled | |
cluster) in which: | |
1. The call to the Kubernetes API works without issue, but | |
2. There are no logs returned by Kubernetes associated with a | |
given container | |
This means, practically, that Drone attempts to query logs but does not | |
find any — and assumes the build skips them. It thus skips all retry | |
logic or similar workarounds to address this issue. | |
This commit revises the logic slightly, both in the retry process as | |
well as determining when log querying "failed". | |
== Retries | |
Rather than the initial simple "two" retries, this commit allows drone | |
administrators to set the arbitrary number of retries. | |
This was useful as a debugging context; when set to an arbitrarily high | |
number and ignoring errors, the logs _eventually_ worked — indeed, even | |
showed up repeated within the Drone UI. | |
It remains as a slightly reworked retry implementation. | |
=== Definition of "Failure" | |
Denoting that a stream has "failed" even when it is perfectly valid is | |
tricky. The stream should declare that, for whatever reason, it is | |
unable to query the data. | |
That said, pragmatics denote that the vast majority of builds will have | |
some log output. Furthermore, the "cost" of marking this as a failure is | |
that the kubernetes API will be retried — essentially asking the API | |
"are you truly _sure_ there are no logs?" | |
Given this, it seems a reasonable solution. | |
=== General Design | |
This design was hacked up to solve a specific problem, and this commit | |
written to describe the problem for the maintainers. | |
Figuring out an actual solution is probably going to be harder | |
--- | |
engine/cancellable_copy.go | 40 ++++++++++++++++++++++++++++++++++---- | |
engine/engine_impl.go | 39 ++++++++++++++++++++++++++++++------- | |
2 files changed, 68 insertions(+), 11 deletions(-) | |
diff --git a/engine/cancellable_copy.go b/engine/cancellable_copy.go | |
index 132bbde..35b807a 100644 | |
--- a/engine/cancellable_copy.go | |
+++ b/engine/cancellable_copy.go | |
@@ -1,10 +1,10 @@ | |
package engine | |
import ( | |
+ "bufio" | |
"context" | |
+ "errors" | |
"io" | |
- | |
- "github.com/drone/runner-go/livelog" | |
) | |
// cancellableCopy method copies from source to destination honoring the context. | |
@@ -13,8 +13,40 @@ func cancellableCopy(ctx context.Context, dst io.Writer, src io.ReadCloser) erro | |
ch := make(chan error, 1) | |
go func() { | |
defer close(ch) | |
- err := livelog.Copy(dst, src) | |
- ch <- err | |
+ | |
+ // Copy the live log | |
+ r := bufio.NewReader(src) | |
+ consumed := 0 | |
+ | |
+ for { | |
+ bytes, err := r.ReadBytes('\n') | |
+ consumed += len(bytes) | |
+ | |
+ if _, err := dst.Write(bytes); err != nil { | |
+ ch <- err | |
+ return | |
+ } | |
+ | |
+ // This addresses a condition in which the stream returned is apparently | |
+ // valid, however there is no content within that stream. | |
+ // | |
+ // On balance, this is unlikely for a CI tool — most CI builds produce log output. Given | |
+ // this, builds with zero output are artificially marked as an | |
+ // error. | |
+ if err == io.EOF && consumed == 0 { | |
+ ch <- errors.New("EOF character found but no logs read") | |
+ return | |
+ } | |
+ | |
+ if err != nil { | |
+ if err != io.EOF { | |
+ ch <- err | |
+ return | |
+ } | |
+ return | |
+ } | |
+ } | |
+ | |
}() | |
select { | |
diff --git a/engine/engine_impl.go b/engine/engine_impl.go | |
index 4fdba39..bf1e2f0 100644 | |
--- a/engine/engine_impl.go | |
+++ b/engine/engine_impl.go | |
@@ -8,8 +8,10 @@ import ( | |
"context" | |
"fmt" | |
"io" | |
+ "log" | |
"os" | |
"path/filepath" | |
+ "strconv" | |
"time" | |
"github.com/drone-runners/drone-runner-kube/internal/docker/image" | |
@@ -179,6 +181,7 @@ func (k *Kubernetes) Run(ctx context.Context, specv runtime.Spec, stepv runtime. | |
} | |
err = k.tail(ctx, spec, step, output) | |
+ | |
// this feature flag retries fetching the logs if it fails on | |
// the first attempt. This is meant to help triage the following | |
// issue: | |
@@ -186,17 +189,39 @@ func (k *Kubernetes) Run(ctx context.Context, specv runtime.Spec, stepv runtime. | |
// https://discourse.drone.io/t/kubernetes-runner-intermittently-fails-steps/7372 | |
// | |
// BEGIN: FEATURE FLAG | |
- if err != nil { | |
- if os.Getenv("DRONE_FEATURE_FLAG_RETRY_LOGS") == "true" { | |
- <-time.After(time.Second * 5) | |
- err = k.tail(ctx, spec, step, output) | |
- } | |
+ retries := 0 | |
+ | |
+ if os.Getenv("DRONE_FEATURE_FLAG_RETRY_LOGS") == "true" { | |
+ log.Printf("Discovered feature flag 'retry logs'. Setting retries to 2") | |
+ retries = 2 | |
+ } | |
+ | |
+ // Check to see how many retries should be executed | |
+ if os.Getenv("DRONE_FEATURE_FLAG_RETRY_LOGS_MAX_RETRIES") != "" { | |
+ d, err := strconv.Atoi(os.Getenv("DRONE_FEATURE_FLAG_RETRY_LOGS_MAX_RETRIES")) | |
if err == nil { | |
- <-time.After(time.Second * 5) | |
- err = k.tail(ctx, spec, step, output) | |
+ retries = d | |
+ | |
+ log.Printf("Discovered feature flag 'retry logs with max retries'. Setting retries to %d", d) | |
} | |
} | |
+ | |
+ for err != nil && retries != 0 { | |
+ // Print the error of the last run | |
+ log.Printf("unable to fetch logs: %s", err) | |
+ | |
+ // Wait 5 seconds to give whatever needs it time to rest. | |
+ <-time.After(time.Second * 5) | |
+ | |
+ // Retry the logs | |
+ err = k.tail(ctx, spec, step, output) | |
+ | |
+ // Decrease retries | |
+ retries-- | |
+ } | |
+ | |
// END: FEATURE FLAG | |
+ | |
if err != nil { | |
return nil, err | |
} | |
-- | |
2.29.2 | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 5acf28a0e8c21aaad3fe6eebba650b25c626c909 Mon Sep 17 00:00:00 2001 | |
From: Andrew Howden <hello@andrewhowden.com> | |
Date: Wed, 3 Feb 2021 21:01:38 +0100 | |
Subject: [PATCH] Treat no logs as no logs, and retry them | |
MIME-Version: 1.0 | |
Content-Type: text/plain; charset=UTF-8 | |
Content-Transfer-Encoding: 8bit | |
In the previous commit (86f6539) there was some work completed that | |
treated the "no logs" received by Kubernetes as an error condition, | |
encouraging a retry behaviour that would then later retry the builds. | |
However, this introduced an undesirable side-effect — build steps | |
that actually do not return any output (such as the slack build step) or | |
in the case of an exceedingly long wait, Drone would mark the build as | |
failed. | |
This is undesirable, as there are a number of builds with the "Slack" | |
plugin which does not have any output — it just posts the message on to | |
slack. | |
Given this, this commit reworks the way the no-log detection is | |
implemented. Rather than denoting the no logs as an error, this patch | |
instead modifies the function that does the log copying to return the | |
number of bytes it has copied. | |
This additional interface clearly communicates whether or not logs were | |
_processed_; if they are processed, they must have been received. This | |
allows retrying the logs for a period without denoting them as a | |
failure. | |
== Design Notes | |
=== Why not just create a new failure type such as io.EOF? | |
The author did not think of this until writing this commit. That's a | |
much better idea. | |
--- | |
engine/cancellable_copy.go | 51 +++++++++++++++++++------------------- | |
engine/engine_impl.go | 31 ++++++++++++++++++----- | |
2 files changed, 50 insertions(+), 32 deletions(-) | |
diff --git a/engine/cancellable_copy.go b/engine/cancellable_copy.go | |
index 35b807a..837d585 100644 | |
--- a/engine/cancellable_copy.go | |
+++ b/engine/cancellable_copy.go | |
@@ -3,57 +3,56 @@ package engine | |
import ( | |
"bufio" | |
"context" | |
- "errors" | |
"io" | |
) | |
// cancellableCopy method copies from source to destination honoring the context. | |
// If context.Cancel is called, it will return immediately with context cancelled error. | |
-func cancellableCopy(ctx context.Context, dst io.Writer, src io.ReadCloser) error { | |
- ch := make(chan error, 1) | |
+// | |
+// Returns the total number of bytes copied into the buffer. | |
+func cancellableCopy(ctx context.Context, dst io.Writer, src io.ReadCloser) (written int, err error) { | |
+ eCh := make(chan error, 1) | |
+ | |
go func() { | |
- defer close(ch) | |
+ defer close(eCh) | |
// Copy the live log | |
r := bufio.NewReader(src) | |
- consumed := 0 | |
for { | |
- bytes, err := r.ReadBytes('\n') | |
- consumed += len(bytes) | |
+ // Read the buffer back | |
+ bytes, rErr := r.ReadBytes('\n') | |
- if _, err := dst.Write(bytes); err != nil { | |
- ch <- err | |
- return | |
- } | |
+ // Write the buffer to the destination | |
+ w, wErr := dst.Write(bytes) | |
+ written += w | |
- // This addresses a condition in which the stream returned is apparently | |
- // valid, however there is no content within that stream. | |
- // | |
- // On balance, this is unlikely for a CI tool — most CI build logs. Given | |
- // this, builds with zero output are artificially marked as an | |
- // error. | |
- if err == io.EOF && consumed == 0 { | |
- ch <- errors.New("EOF character found but no logs read") | |
+ // Check if there was a failure to write | |
+ if wErr != nil { | |
+ eCh <- wErr | |
return | |
} | |
- if err != nil { | |
- if err != io.EOF { | |
- ch <- err | |
+ // Check if there was a failure to read | |
+ if rErr != nil { | |
+ | |
+ // If this was _not_ an EOF error, it's remarkable. Return it. | |
+ if rErr != io.EOF { | |
+ eCh <- rErr | |
return | |
} | |
+ | |
+ // If this was an EOF error, it can be ignored. | |
return | |
} | |
} | |
}() | |
- | |
select { | |
case <-ctx.Done(): | |
src.Close() | |
- return ctx.Err() | |
- case err := <-ch: | |
- return err | |
+ return written, ctx.Err() | |
+ case err := <-eCh: | |
+ return written, err | |
} | |
} | |
diff --git a/engine/engine_impl.go b/engine/engine_impl.go | |
index bf1e2f0..40e7e68 100644 | |
--- a/engine/engine_impl.go | |
+++ b/engine/engine_impl.go | |
@@ -17,6 +17,7 @@ import ( | |
"github.com/drone-runners/drone-runner-kube/internal/docker/image" | |
"github.com/drone/runner-go/pipeline/runtime" | |
"github.com/hashicorp/go-multierror" | |
+ "github.com/sirupsen/logrus" | |
v1 "k8s.io/api/core/v1" | |
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |
k8sruntime "k8s.io/apimachinery/pkg/runtime" | |
@@ -180,7 +181,7 @@ func (k *Kubernetes) Run(ctx context.Context, specv runtime.Spec, stepv runtime. | |
return nil, err | |
} | |
- err = k.tail(ctx, spec, step, output) | |
+ consumed, err := k.tail(ctx, spec, step, output) | |
// this feature flag retries fetching the logs if it fails on | |
// the first attempt. This is meant to help triage the following | |
@@ -206,15 +207,33 @@ func (k *Kubernetes) Run(ctx context.Context, specv runtime.Spec, stepv runtime. | |
} | |
} | |
- for err != nil && retries != 0 { | |
+ hasLogs := (err == nil && consumed > 0) | |
+ | |
+ for retries != 0 && !hasLogs { | |
+ // It is impossible to differentiate whether or not a given container has no | |
+ // logs by design, or whether instead there is some race condition in which the | |
+ // response from Kubernetes is returning no logs incorrectly. | |
+ // | |
+ // Given this, no logs consumed is treated as a "probable error", and the retry invoked. | |
+ if err == nil && consumed > 0 { | |
+ hasLogs = true | |
+ | |
+ // Skip this execution to exit from this loop entirely | |
+ continue | |
+ } | |
+ | |
// Print the error of the last run | |
- log.Printf("unable to fetch logs: %s", err) | |
+ logrus.New().WithFields(logrus.Fields{ | |
+ "consumed": consumed, | |
+ "error": err, | |
+ "retries": retries, | |
+ }).Infoln("Attempted to fetch logs but found issues: either an error is present or no logs were consumed.") | |
// Wait 5 seconds to give whatever needs it time to rest. | |
<-time.After(time.Second * 5) | |
// Retry the logs | |
- err = k.tail(ctx, spec, step, output) | |
+ consumed, err = k.tail(ctx, spec, step, output) | |
// Decrease retries | |
retries-- | |
@@ -311,7 +330,7 @@ func (k *Kubernetes) waitForTerminated(ctx context.Context, spec *Spec, step *St | |
return state, nil | |
} | |
-func (k *Kubernetes) tail(ctx context.Context, spec *Spec, step *Step, output io.Writer) error { | |
+func (k *Kubernetes) tail(ctx context.Context, spec *Spec, step *Step, output io.Writer) (int, error) { | |
opts := &v1.PodLogOptions{ | |
Follow: true, | |
Container: step.ID, | |
@@ -326,7 +345,7 @@ func (k *Kubernetes) tail(ctx context.Context, spec *Spec, step *Step, output io | |
readCloser, err := req.Stream() | |
if err != nil { | |
- return err | |
+ return 0, err | |
} | |
defer readCloser.Close() | |
-- | |
2.30.0 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
cc @bradrydzewski; I won't have the opportunity to tidy this up and make it a real patch — but perhaps the explanations above go some ways to nailing this one down.
I'm not sure if it's a thing peculiar to this cluster; there's nothing inherently unusual about them (they're a little older version of k8s, but beyond that).
The code is not pretty 😅 Hopefully it works enough™.