Skip to content

Instantly share code, notes, and snippets.

@kholisrag
Created June 17, 2022 15:36
Show Gist options
  • Save kholisrag/2a783ac769634e489fc222c1a659d900 to your computer and use it in GitHub Desktop.
Save kholisrag/2a783ac769634e489fc222c1a659d900 to your computer and use it in GitHub Desktop.
Failed Alloc Counter Across All Nomad Jobs
package main
import (
"encoding/json"
"fmt"
"github.com/hashicorp/nomad/api"
"github.com/sirupsen/logrus"
)
// main connects to the Nomad agent at http://localhost:4646, lists the jobs
// matching the configured prefix, and reports every task event that carries a
// non-zero "exit_code" detail.
//
// For each job it tallies:
//   - the number of jobs seen,
//   - the number of completed allocations (summed over all task groups),
//   - the number of failing task events, grouped by exit code.
//
// Any API error is fatal: the process logs the error and exits.
func main() {
	nomadCfg := api.DefaultConfig()
	nomadCfg.Address = "http://localhost:4646"
	nc, err := api.NewClient(nomadCfg)
	if err != nil {
		logrus.WithField("error_message", err).Fatal("can't connect to nomad")
	}

	leaderStatus, err := nc.Status().Leader()
	if err != nil {
		logrus.WithField("error_message", err).Fatal("can't query server status")
	}
	fmt.Println("Nomad Leader : ", leaderStatus)

	listJob, _, err := nc.Jobs().List(&api.QueryOptions{
		Prefix: "<JOB_BUILD>",
		Region: "global",
		Params: map[string]string{
			"status": "dead",
		},
	})
	if err != nil {
		logrus.WithField("error_message", err).Fatal("query job list error")
	}

	var (
		jobCounter     int
		failedCounter  int
		completeAllocs int
	)
	errorCodeCount := map[string]int{}
	failedJobByError := map[string][]string{}

	fmt.Println("##########################################################################################")
	for _, job := range listJob {
		jobCounter++

		// The summary is a pointer on the list stub; skip jobs without one
		// instead of dereferencing nil.
		if job.JobSummary == nil {
			continue
		}

		// Walk the task groups once: accumulate completed allocations and
		// remember whether any group reported a failure.
		hasFailed := false
		for _, summaryStatus := range job.JobSummary.Summary {
			if summaryStatus.Failed != 0 {
				hasFailed = true
			}
			completeAllocs += summaryStatus.Complete
		}
		if !hasFailed {
			continue
		}

		// Jobs().Allocations returns the allocations of the whole job, so it
		// is queried once per job rather than once per failed task group;
		// otherwise every failure would be counted once per failed group.
		jobAllocs, _, err := nc.Jobs().Allocations(job.ID, true, &api.QueryOptions{})
		if err != nil {
			logrus.WithField("error_message", err).Fatalf("query allocs in %s error", job.Name)
		}

		for _, alloc := range jobAllocs {
			allocInfo, _, err := nc.Allocations().Info(alloc.ID, &api.QueryOptions{})
			if err != nil {
				logrus.WithField("error_message", err).Fatalf("query allocs info %s error", alloc.ID)
			}

			for _, taskState := range allocInfo.TaskStates {
				if taskState == nil {
					continue
				}
				for _, event := range taskState.Events {
					// Only events that carry an exit code and whose code is
					// non-zero are treated as failures.
					exitCode, ok := event.Details["exit_code"]
					if !ok || exitCode == "0" {
						continue
					}

					fmt.Printf("Job Name : %s, Alloc ID: %s, Exit Code %s \n", job.Name, alloc.ID, exitCode)
					failedCounter++
					errorCodeCount[exitCode]++
					failedJobByError[exitCode] = append(failedJobByError[exitCode], fmt.Sprintf("%s-%s", job.Name, alloc.ID))
				}
			}
		}
	}
	fmt.Println("##########################################################################################")

	prettyErrorCodeCount, err := json.MarshalIndent(errorCodeCount, "", " ")
	if err != nil {
		logrus.WithField("error_message", err).Fatal("error prettify ds job by exit code count output")
	}
	fmt.Printf("Failed DS Pod Name by Exit Code Count : \n%v\n\n", string(prettyErrorCodeCount))

	prettyFailedJobByError, err := json.MarshalIndent(failedJobByError, "", " ")
	if err != nil {
		logrus.WithField("error_message", err).Fatal("error prettify list of ds job name by exit code output")
	}
	fmt.Printf("List of Failed DS Pod Name by Exit Code : \n%v\n\n", string(prettyFailedJobByError))

	fmt.Printf("Failed Job Total: %v \n", failedCounter)
	fmt.Printf("Completed Allocations: %v \n", completeAllocs)
	fmt.Printf("Job Count: %v \n", jobCounter)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment