-
-
Save andrewhowdencom/7ae1d52e3130c159a0031aa7f3f05a18 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 86f653952d84b2eedf02e2644d837f4ce6bc7d6f Mon Sep 17 00:00:00 2001 | |
From: Andrew Howden <hello@andrewhowden.com> | |
Date: Wed, 27 Jan 2021 22:54:43 +0100 | |
Subject: [PATCH] Treat "no logs" as an error condition | |
MIME-Version: 1.0 | |
Content-Type: text/plain; charset=UTF-8 | |
Content-Transfer-Encoding: 8bit | |
Currently there appears to be a problem (At least in the sampled | |
cluster) in which: | |
1. The call to the Kubernetes API works without issue, but | |
2. There are no logs returned by Kubernetes associated with a | |
given container | |
This means, practically, that Drone attempts to query logs but does not | |
find any — and assumes the build skips them. It thus skips all retry | |
logic or similar workarounds to address this issue. | |
This commit revises the logic slightly, both in the retry process as | |
well as determining when log querying "failed". | |
== Retries | |
Rather than the initial simple "two" retries, this commit allows drone | |
administrators to set the arbitrary number of retries. | |
This was useful as a debugging context; when set to an arbitrarily high | |
number and ignoring errors, the logs _eventually_ worked — indeed, even | |
showed up repeated within the Drone UI. | |
It remains as a slightly reworked retry implementation. | |
=== Definition of "Failure" | |
Denoting that a stream has "failed" even when it is perfectly valid is | |
tricky. The stream should declare that, for whatever reason, it is | |
unable to query the data. | |
That said, pragmatics denote that the vast majority of builds will have | |
some log output. Furthermore, the "cost" of marking this as a failure is | |
that the kubernetes API will be retried — essentially asking the API | |
"are you truly _sure_ there are no logs?" | |
Given this, it seems a reasonable solution. | |
=== General Design | |
This design was hacked up to solve a specific problem, and this commit | |
written to describe the problem for the maintainers. | |
Figuring out an actual solution is probably going to be harder | |
--- | |
engine/cancellable_copy.go | 40 ++++++++++++++++++++++++++++++++++---- | |
engine/engine_impl.go | 39 ++++++++++++++++++++++++++++++------- | |
2 files changed, 68 insertions(+), 11 deletions(-) | |
diff --git a/engine/cancellable_copy.go b/engine/cancellable_copy.go | |
index 132bbde..35b807a 100644 | |
--- a/engine/cancellable_copy.go | |
+++ b/engine/cancellable_copy.go | |
@@ -1,10 +1,10 @@ | |
package engine | |
import ( | |
+ "bufio" | |
"context" | |
+ "errors" | |
"io" | |
- | |
- "github.com/drone/runner-go/livelog" | |
) | |
// cancellableCopy method copies from source to destination honoring the context. | |
@@ -13,8 +13,40 @@ func cancellableCopy(ctx context.Context, dst io.Writer, src io.ReadCloser) erro | |
ch := make(chan error, 1) | |
go func() { | |
defer close(ch) | |
- err := livelog.Copy(dst, src) | |
- ch <- err | |
+ | |
+ // Copy the live log | |
+ r := bufio.NewReader(src) | |
+ consumed := 0 | |
+ | |
+ for { | |
+ bytes, err := r.ReadBytes('\n') | |
+ consumed += len(bytes) | |
+ | |
+ if _, err := dst.Write(bytes); err != nil { | |
+ ch <- err | |
+ return | |
+ } | |
+ | |
+ // This addresses a condition in which the stream returned is apparently | |
+ // valid, however there is no content within that stream. | |
+ // | |
+ // On balance, this is unlikely for a CI tool — most CI builds produce log output. Given | |
+ // this, builds with zero output are artificially marked as an | |
+ // error. | |
+ if err == io.EOF && consumed == 0 { | |
+ ch <- errors.New("EOF character found but no logs read") | |
+ return | |
+ } | |
+ | |
+ if err != nil { | |
+ if err != io.EOF { | |
+ ch <- err | |
+ return | |
+ } | |
+ return | |
+ } | |
+ } | |
+ | |
}() | |
select { | |
diff --git a/engine/engine_impl.go b/engine/engine_impl.go | |
index 4fdba39..bf1e2f0 100644 | |
--- a/engine/engine_impl.go | |
+++ b/engine/engine_impl.go | |
@@ -8,8 +8,10 @@ import ( | |
"context" | |
"fmt" | |
"io" | |
+ "log" | |
"os" | |
"path/filepath" | |
+ "strconv" | |
"time" | |
"github.com/drone-runners/drone-runner-kube/internal/docker/image" | |
@@ -179,6 +181,7 @@ func (k *Kubernetes) Run(ctx context.Context, specv runtime.Spec, stepv runtime. | |
} | |
err = k.tail(ctx, spec, step, output) | |
+ | |
// this feature flag retries fetching the logs if it fails on | |
// the first attempt. This is meant to help triage the following | |
// issue: | |
@@ -186,17 +189,39 @@ func (k *Kubernetes) Run(ctx context.Context, specv runtime.Spec, stepv runtime. | |
// https://discourse.drone.io/t/kubernetes-runner-intermittently-fails-steps/7372 | |
// | |
// BEGIN: FEATURE FLAG | |
- if err != nil { | |
- if os.Getenv("DRONE_FEATURE_FLAG_RETRY_LOGS") == "true" { | |
- <-time.After(time.Second * 5) | |
- err = k.tail(ctx, spec, step, output) | |
- } | |
+ retries := 0 | |
+ | |
+ if os.Getenv("DRONE_FEATURE_FLAG_RETRY_LOGS") == "true" { | |
+ log.Printf("Discovered feature flag 'retry logs'. Setting retries to 2") | |
+ retries = 2 | |
+ } | |
+ | |
+ // Check to see how many retries should be executed | |
+ if os.Getenv("DRONE_FEATURE_FLAG_RETRY_LOGS_MAX_RETRIES") != "" { | |
+ d, err := strconv.Atoi(os.Getenv("DRONE_FEATURE_FLAG_RETRY_LOGS_MAX_RETRIES")) | |
if err == nil { | |
- <-time.After(time.Second * 5) | |
- err = k.tail(ctx, spec, step, output) | |
+ retries = d | |
+ | |
+ log.Printf("Discovered feature flag 'retry logs with max retries'. Setting retries to %d", d) | |
} | |
} | |
+ | |
+ for err != nil && retries != 0 { | |
+ // Print the error of the last run | |
+ log.Printf("unable to fetch logs: %s", err) | |
+ | |
+ // Wait 5 seconds to give whatever needs it time to rest. | |
+ <-time.After(time.Second * 5) | |
+ | |
+ // Retry the logs | |
+ err = k.tail(ctx, spec, step, output) | |
+ | |
+ // Decrease retries | |
+ retries-- | |
+ } | |
+ | |
// END: FEATURE FLAG | |
+ | |
if err != nil { | |
return nil, err | |
} | |
-- | |
2.29.2 | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 5acf28a0e8c21aaad3fe6eebba650b25c626c909 Mon Sep 17 00:00:00 2001 | |
From: Andrew Howden <hello@andrewhowden.com> | |
Date: Wed, 3 Feb 2021 21:01:38 +0100 | |
Subject: [PATCH] Treat no logs as no logs, and retry them | |
MIME-Version: 1.0 | |
Content-Type: text/plain; charset=UTF-8 | |
Content-Transfer-Encoding: 8bit | |
In the previous commit (86f6539) there was some work completed that | |
treated the "no logs" received by Kubernetes as an error condition, | |
encouraging a retry behaviour that would then later retry the builds. | |
However, this introduced an undesirable side-effect — build steps | |
that actually do not return any output (such as the slack build step) or | |
in the case of an exceedingly long wait, Drone would mark the build as | |
failed. | |
This is undesirable, as there are a number of builds with the "Slack" | |
plugin which does not have any output — it just posts the message on to | |
slack. | |
Given this, this commit reworks the way the no-log detection is | |
implemented. Rather than denoting the no logs as an error, this patch | |
instead modifies the function that does the log copying to return the | |
number of bytes it has copied. | |
This additional interface clearly communicates whether or not logs were | |
_processed_; if they are processed, they must have been received. This | |
allows retrying the logs for a period without denoting them as a | |
failure. | |
== Design Notes | |
=== Why not just create a new failure type such as io.EOF? | |
The author did not think of this until writing this commit. That's a | |
much better idea. | |
--- | |
engine/cancellable_copy.go | 51 +++++++++++++++++++------------------- | |
engine/engine_impl.go | 31 ++++++++++++++++++----- | |
2 files changed, 50 insertions(+), 32 deletions(-) | |
diff --git a/engine/cancellable_copy.go b/engine/cancellable_copy.go | |
index 35b807a..837d585 100644 | |
--- a/engine/cancellable_copy.go | |
+++ b/engine/cancellable_copy.go | |
@@ -3,57 +3,56 @@ package engine | |
import ( | |
"bufio" | |
"context" | |
- "errors" | |
"io" | |
) | |
// cancellableCopy method copies from source to destination honoring the context. | |
// If context.Cancel is called, it will return immediately with context cancelled error. | |
-func cancellableCopy(ctx context.Context, dst io.Writer, src io.ReadCloser) error { | |
- ch := make(chan error, 1) | |
+// | |
+// Returns the total number of bytes copied into the buffer. | |
+func cancellableCopy(ctx context.Context, dst io.Writer, src io.ReadCloser) (written int, err error) { | |
+ eCh := make(chan error, 1) | |
+ | |
go func() { | |
- defer close(ch) | |
+ defer close(eCh) | |
// Copy the live log | |
r := bufio.NewReader(src) | |
- consumed := 0 | |
for { | |
- bytes, err := r.ReadBytes('\n') | |
- consumed += len(bytes) | |
+ // Read the buffer back | |
+ bytes, rErr := r.ReadBytes('\n') | |
- if _, err := dst.Write(bytes); err != nil { | |
- ch <- err | |
- return | |
- } | |
+ // Write the buffer to the destination | |
+ w, wErr := dst.Write(bytes) | |
+ written += w | |
- // This addresses a condition in which the stream returned is apparently | |
- // valid, however there is no content within that stream. | |
- // | |
- // On balance, this is unlikely for a CI tool — most CI build logs. Given | |
- // this, builds with zero output are artificially marked as an | |
- // error. | |
- if err == io.EOF && consumed == 0 { | |
- ch <- errors.New("EOF character found but no logs read") | |
+ // Check if there was a failure to write | |
+ if wErr != nil { | |
+ eCh <- wErr | |
return | |
} | |
- if err != nil { | |
- if err != io.EOF { | |
- ch <- err | |
+ // Check if there was a failure to read | |
+ if rErr != nil { | |
+ | |
+ // If this was _not_ an EOF error, it's remarkable. Return it. | |
+ if rErr != io.EOF { | |
+ eCh <- rErr | |
return | |
} | |
+ | |
+ // If this was an EOF error, it can be ignored. | |
return | |
} | |
} | |
}() | |
- | |
select { | |
case <-ctx.Done(): | |
src.Close() | |
- return ctx.Err() | |
- case err := <-ch: | |
- return err | |
+ return written, ctx.Err() | |
+ case err := <-eCh: | |
+ return written, err | |
} | |
} | |
diff --git a/engine/engine_impl.go b/engine/engine_impl.go | |
index bf1e2f0..40e7e68 100644 | |
--- a/engine/engine_impl.go | |
+++ b/engine/engine_impl.go | |
@@ -17,6 +17,7 @@ import ( | |
"github.com/drone-runners/drone-runner-kube/internal/docker/image" | |
"github.com/drone/runner-go/pipeline/runtime" | |
"github.com/hashicorp/go-multierror" | |
+ "github.com/sirupsen/logrus" | |
v1 "k8s.io/api/core/v1" | |
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | |
k8sruntime "k8s.io/apimachinery/pkg/runtime" | |
@@ -180,7 +181,7 @@ func (k *Kubernetes) Run(ctx context.Context, specv runtime.Spec, stepv runtime. | |
return nil, err | |
} | |
- err = k.tail(ctx, spec, step, output) | |
+ consumed, err := k.tail(ctx, spec, step, output) | |
// this feature flag retries fetching the logs if it fails on | |
// the first attempt. This is meant to help triage the following | |
@@ -206,15 +207,33 @@ func (k *Kubernetes) Run(ctx context.Context, specv runtime.Spec, stepv runtime. | |
} | |
} | |
- for err != nil && retries != 0 { | |
+ hasLogs := (err == nil && consumed > 0) | |
+ | |
+ for retries != 0 && !hasLogs { | |
+ // It is impossible to differentiate whether or not a given container has no | |
+ // logs by design, or whether instead there is some race condition in which the | |
+ // response from Kubernetes is returning no logs incorrectly. | |
+ // | |
+ // Given this, no logs consumed is treated as a "probable error", and the retry invoked. | |
+ if err == nil && consumed > 0 { | |
+ hasLogs = true | |
+ | |
+ // Skip this execution to exit from this loop entirely | |
+ continue | |
+ } | |
+ | |
// Print the error of the last run | |
- log.Printf("unable to fetch logs: %s", err) | |
+ logrus.New().WithFields(logrus.Fields{ | |
+ "consumed": consumed, | |
+ "error": err, | |
+ "retries": retries, | |
+ }).Infoln("Attempted to fetch logs but found issues: either an error is present or no logs were consumed.") | |
// Wait 5 seconds to give whatever needs it time to rest. | |
<-time.After(time.Second * 5) | |
// Retry the logs | |
- err = k.tail(ctx, spec, step, output) | |
+ consumed, err = k.tail(ctx, spec, step, output) | |
// Decrease retries | |
retries-- | |
@@ -311,7 +330,7 @@ func (k *Kubernetes) waitForTerminated(ctx context.Context, spec *Spec, step *St | |
return state, nil | |
} | |
-func (k *Kubernetes) tail(ctx context.Context, spec *Spec, step *Step, output io.Writer) error { | |
+func (k *Kubernetes) tail(ctx context.Context, spec *Spec, step *Step, output io.Writer) (int, error) { | |
opts := &v1.PodLogOptions{ | |
Follow: true, | |
Container: step.ID, | |
@@ -326,7 +345,7 @@ func (k *Kubernetes) tail(ctx context.Context, spec *Spec, step *Step, output io | |
readCloser, err := req.Stream() | |
if err != nil { | |
- return err | |
+ return 0, err | |
} | |
defer readCloser.Close() | |
-- | |
2.30.0 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
cc @bradrydzewski; I won't have the opportunity to tidy this up and make it a real patch — but perhaps the explanations above go some ways to nailing this one down.
I'm not sure if it's a thing peculiar to this cluster; there's nothing inherently unusual about them (they're a little older version of k8s, but beyond that).
The code is not pretty 😅 Hopefully it works enough™.