Skip to content

Instantly share code, notes, and snippets.

@AlanCoding
Last active January 3, 2023 18:55
Show Gist options
  • Save AlanCoding/b91bdcb7107ee56129e3ac10884d208c to your computer and use it in GitHub Desktop.
Save AlanCoding/b91bdcb7107ee56129e3ac10884d208c to your computer and use it in GitHub Desktop.
Patches saved
# Write a single NUL character (U+0000) to stdout, followed by print's default newline.
print("\x00")
diff --git a/awx/main/dispatch/worker/callback.py b/awx/main/dispatch/worker/callback.py
index 0578a4ff97..b030b2d1cc 100644
--- a/awx/main/dispatch/worker/callback.py
+++ b/awx/main/dispatch/worker/callback.py
@@ -270,6 +270,7 @@ class CallbackBrokerWorker(BaseWorker):
except (OperationalError, InterfaceError, InternalError) as exc:
if retries >= self.MAX_RETRIES:
logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.')
+ self.buff = {}
return
delay = 60 * retries
logger.warning(f'Database Error Flushing Job Events, retry #{retries + 1} in {delay} seconds: {str(exc)}')
@@ -279,6 +280,7 @@ class CallbackBrokerWorker(BaseWorker):
except DatabaseError:
logger.exception('Database Error Flushing Job Events')
django_connection.close()
+ self.buff = {}
break
except Exception as exc:
tb = traceback.format_exc()
diff --git a/awx/main/dispatch/worker/callback.py b/awx/main/dispatch/worker/callback.py
index 0578a4ff97..480174de31 100644
--- a/awx/main/dispatch/worker/callback.py
+++ b/awx/main/dispatch/worker/callback.py
@@ -13,6 +13,8 @@ from django.db import DatabaseError, OperationalError, transaction, connection a
from django.db.utils import InterfaceError, InternalError
from django_guid import set_guid
+from psycopg2.errors import UniqueViolation
+
import psutil
import redis
@@ -72,6 +74,7 @@ class CallbackBrokerWorker(BaseWorker):
def __init__(self):
self.buff = {}
+ self.naughty_list = {}
self.redis = redis.Redis.from_url(settings.BROKER_URL)
self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False)
self.queue_pop = 0
@@ -172,19 +175,18 @@ class CallbackBrokerWorker(BaseWorker):
# if an exception occurs, we should re-attempt to save the
# events one-by-one, because something in the list is
# broken/stale
- consecutive_errors = 0
events_saved = 0
metrics_events_batch_save_errors += 1
- for e in events:
+ for e in events.copy():
+ events.remove(e)
try:
e.save()
events_saved += 1
- consecutive_errors = 0
+ except UniqueViolation:
+ pass
except Exception as exc_indv:
- consecutive_errors += 1
+ self.naughty_list.setdefault(cls, []).append(e)
logger.info(f'Database Error Saving individual Job Event, error {str(exc_indv)}')
- if consecutive_errors >= 5:
- raise
metrics_singular_events_saved += events_saved
if events_saved == 0:
raise
diff --git a/awx/main/dispatch/worker/callback.py b/awx/main/dispatch/worker/callback.py
index 0578a4ff97..7a1337ec64 100644
--- a/awx/main/dispatch/worker/callback.py
+++ b/awx/main/dispatch/worker/callback.py
@@ -270,6 +270,7 @@ class CallbackBrokerWorker(BaseWorker):
except (OperationalError, InterfaceError, InternalError) as exc:
if retries >= self.MAX_RETRIES:
logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.')
+ self.buff = {}
return
delay = 60 * retries
logger.warning(f'Database Error Flushing Job Events, retry #{retries + 1} in {delay} seconds: {str(exc)}')

Reversion of the rule of 5

The original thinking was that getting exceptions for multiple consecutive events could indicate a database connectivity problem.

However, such repeated failures are just as likely to be caused by some combination of event content and a bug. Thus, this proposes reverting that logic.

The counter-argument that then comes up is how to handle resiliency needs.

https://github.com/ansible/awx/compare/devel...AlanCoding:awx:raise_revert?expand=1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment