Skip to content

Instantly share code, notes, and snippets.

@SamGajdos
Last active November 10, 2022 19:15
Show Gist options
  • Save SamGajdos/4c214fb2d6e67eff01217c6e25cbb27b to your computer and use it in GitHub Desktop.
Save SamGajdos/4c214fb2d6e67eff01217c6e25cbb27b to your computer and use it in GitHub Desktop.
Playbook to execute sanity checks against an OpenShift (OCP) cluster.
---
# check.yml - start one sanity-check command in the background and remember
# its async job so the including play can collect the result later.
#
# Variables expected from the including play:
#   command: shell command whose output is checked
#   check:   checker name passed to check_oc_command.py

- name: Run and check command "{{ command }}"
  # `quote` shell-escapes the checker name and the command label, so quotes
  # inside the command (e.g. a jsonpath expression) cannot break or alter the
  # arguments. The middle argument is the command's captured output.
  ansible.builtin.shell: >-
    ./check_oc_command.py {{ check | quote }} "$({{ command }})" {{ command | quote }}
  args:
    executable: /bin/bash
  # Fire and forget: with `poll: 0` this task returns immediately with only
  # the job metadata (ansible_job_id, ...). The command output is not
  # available here, so no failed_when is evaluated at this point; the job's
  # outcome is read back via its ansible_job_id by the including play.
  async: 500
  poll: 0
  register: script_output

- name: Append to variable
  # Accumulate the async job records of every included run.
  ansible.builtin.set_fact:
    append_array: "{{ append_array | default([]) + [script_output] }}"
#!/usr/bin/env python3
import sys
def eprint(*args, **kwargs):
    """Print to standard error.

    Accepts the same positional and keyword arguments as ``print``; the
    output stream is always ``sys.stderr``.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def checkReadyState(cmd, lines):
    """Check that every data row reports the status ``Ready``.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command. The first line is treated as
            the table header; in every following line the second
            whitespace-separated field must be exactly ``Ready``.

    Returns:
        0 when all rows pass (or there are no rows), 1 otherwise. On failure
        the header and the offending rows are written to stderr.
    """
    # Parse the text that was passed in instead of re-reading sys.argv, so the
    # function does not depend on how the script was invoked.
    rows = lines.splitlines()
    if not rows:
        return 0

    failed_rows = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            # Blank line: nothing to check.
            continue
        # A row too short to carry the status field cannot be confirmed ready.
        if len(words) < 2 or words[1] != "Ready":
            failed_rows.append(row)

    if not failed_rows:
        return 0

    error_message = "FAILED in command: '" + cmd + "'\n" + rows[0] + "\n"
    error_message += "".join(row + "\n" for row in failed_rows)
    eprint(error_message)
    return 1
def checkResources(cmd, lines):
    """Check that no data row reports ``0%`` CPU or memory usage.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command. The first line is treated as
            the table header; in every following line the third and fifth
            whitespace-separated fields are the percentages that are checked.

    Returns:
        0 when all rows pass (or there are no rows), 1 otherwise. On failure
        the header and the offending rows are written to stderr.
    """
    # Parse the text that was passed in instead of re-reading sys.argv.
    rows = lines.splitlines()
    if not rows:
        return 0

    failed_rows = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            # Blank line: nothing to check.
            continue
        # CPU or Memory; a row missing either field is reported as failing
        # instead of crashing with an IndexError.
        if len(words) < 5 or words[2] == "0%" or words[4] == "0%":
            failed_rows.append(row)

    if not failed_rows:
        return 0

    error_message = "FAILED in command: '" + cmd + "'\n" + rows[0] + "\n"
    error_message += "".join(row + "\n" for row in failed_rows)
    eprint(error_message)
    return 1
def checkEtcd(cmd, lines):
    """Check that the etcd status text does not mention ``unhealthy``.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command.

    Returns:
        1 (after writing a failure message to stderr) when the substring
        ``unhealthy`` occurs anywhere in ``lines``, otherwise 0.
    """
    if "unhealthy" in lines:
        eprint("FAILED, in command: \'" + cmd + "\'\n")
        return 1
    # Return an explicit success code instead of falling through with None,
    # matching the int result of the other checkers.
    return 0
def checkRouterRegistryDeployment(cmd, lines):
    """Check that every data row has matching ``x/y`` counts in its second field.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command. The first line is treated as
            the table header; in every following line the second
            whitespace-separated field is split on ``/`` and its first two
            parts must be equal.

    Returns:
        0 when all rows pass (or there are no rows), 1 otherwise. On failure
        the header and the offending rows are written to stderr.
    """
    # Parse the text that was passed in instead of re-reading sys.argv.
    rows = lines.splitlines()
    if not rows:
        return 0

    failed_rows = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            # Blank line: nothing to check.
            continue
        numbers = words[1].split("/") if len(words) > 1 else []
        # A missing or malformed count field is reported as failing instead
        # of crashing with an IndexError.
        if len(numbers) < 2 or numbers[0] != numbers[1]:
            failed_rows.append(row)

    if not failed_rows:
        return 0

    error_message = "FAILED in command: '" + cmd + "'\n" + rows[0] + "\n"
    error_message += "".join(row + "\n" for row in failed_rows)
    eprint(error_message)
    return 1
def checkRouterRegistryPod(cmd, lines):
    """Check that every data row is fully ready or marked ``Completed``.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command. The first line is treated as
            the table header; in every following line the second
            whitespace-separated field is an ``x/y`` count and the third is
            the status.

    Returns:
        0 when every row either has equal counts or the status ``Completed``
        (or there are no rows), 1 otherwise. On failure the header and the
        offending rows are written to stderr.
    """
    # Parse the text that was passed in instead of re-reading sys.argv.
    rows = lines.splitlines()
    if not rows:
        return 0

    failed_rows = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            # Blank line: nothing to check.
            continue
        numbers = words[1].split("/") if len(words) > 1 else []
        all_ready = len(numbers) >= 2 and numbers[0] == numbers[1]
        # The status is only consulted for rows that are not fully ready; a
        # missing status field counts as "not Completed".
        status = words[2] if len(words) > 2 else ""
        if not all_ready and status != "Completed":
            failed_rows.append(row)

    if not failed_rows:
        return 0

    error_message = "FAILED in command: '" + cmd + "'\n" + rows[0] + "\n"
    error_message += "".join(row + "\n" for row in failed_rows)
    eprint(error_message)
    return 1
def checkOperator(cmd, lines):
    """Check that the fifth field of every data row is ``False``.

    The docstring of the original describes this as the cluster operator
    "not degraded" check.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command. The first line is treated as
            the table header; in every following line the fifth
            whitespace-separated field must be exactly ``False``.

    Returns:
        0 when all rows pass (or there are no rows), 1 otherwise. On failure
        the header and the offending rows are written to stderr.
    """
    # Parse the text that was passed in instead of re-reading sys.argv.
    rows = lines.splitlines()
    if not rows:
        return 0

    failed_rows = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            # Blank line: nothing to check.
            continue
        # A row too short to carry the checked field is reported as failing
        # instead of crashing with an IndexError.
        if len(words) < 5 or words[4] != "False":
            failed_rows.append(row)

    if not failed_rows:
        return 0

    error_message = "FAILED in command: '" + cmd + "'\n" + rows[0] + "\n"
    error_message += "".join(row + "\n" for row in failed_rows)
    eprint(error_message)
    return 1
def checkDeploymentStateFul(cmd, lines):
    """Check that every data row has matching ``x/y`` counts in its third field.

    Used for Deployment and StatefulSet listings.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command. The first line is treated as
            the table header; in every following line the third
            whitespace-separated field is split on ``/`` and its first two
            parts must be equal.

    Returns:
        0 when all rows pass (or there are no rows), 1 otherwise. On failure
        the header and the offending rows are written to stderr.
    """
    # Parse the text that was passed in instead of re-reading sys.argv.
    rows = lines.splitlines()
    if not rows:
        return 0

    failed_rows = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            # Blank line: nothing to check.
            continue
        numbers = words[2].split("/") if len(words) > 2 else []
        # A missing or malformed count field is reported as failing instead
        # of crashing with an IndexError.
        if len(numbers) < 2 or numbers[0] != numbers[1]:
            failed_rows.append(row)

    if not failed_rows:
        return 0

    error_message = "FAILED in command: '" + cmd + "'\n" + rows[0] + "\n"
    error_message += "".join(row + "\n" for row in failed_rows)
    eprint(error_message)
    return 1
def checkPod(cmd, lines):
    """Check that every data row is fully ready or marked ``Completed``.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command. The first line is treated as
            the table header; in every following line the third
            whitespace-separated field is an ``x/y`` count and the fourth is
            the status.

    Returns:
        0 when every row either has equal counts or the status ``Completed``
        (or there are no rows), 1 otherwise. On failure the header and the
        offending rows are written to stderr.
    """
    # Parse the text that was passed in instead of re-reading sys.argv.
    rows = lines.splitlines()
    if not rows:
        return 0

    failed_rows = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            # Blank line: nothing to check.
            continue
        numbers = words[2].split("/") if len(words) > 2 else []
        all_ready = len(numbers) >= 2 and numbers[0] == numbers[1]
        # The status is only consulted for rows that are not fully ready; a
        # missing status field counts as "not Completed".
        status = words[3] if len(words) > 3 else ""
        if not all_ready and status != "Completed":
            failed_rows.append(row)

    if not failed_rows:
        return 0

    error_message = "FAILED in command: '" + cmd + "'\n" + rows[0] + "\n"
    error_message += "".join(row + "\n" for row in failed_rows)
    eprint(error_message)
    return 1
def checkRepSetDaeSetRepCtrlr(cmd, lines):
    """Check that the third and fifth fields of every data row are equal.

    Used for ReplicaSet, DaemonSet and ReplicationController listings.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command. The first line is treated as
            the table header; in every following line the third and fifth
            whitespace-separated fields are compared as strings.

    Returns:
        0 when all rows pass (or there are no rows), 1 otherwise. On failure
        the header and the offending rows are written to stderr.
    """
    # Parse the text that was passed in instead of re-reading sys.argv.
    rows = lines.splitlines()
    if not rows:
        return 0

    failed_rows = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            # Blank line: nothing to check.
            continue
        # A row too short to carry both fields is reported as failing
        # instead of crashing with an IndexError.
        if len(words) < 5 or words[2] != words[4]:
            failed_rows.append(row)

    if not failed_rows:
        return 0

    error_message = "FAILED in command: '" + cmd + "'\n" + rows[0] + "\n"
    error_message += "".join(row + "\n" for row in failed_rows)
    eprint(error_message)
    return 1
def checkReadyz(cmd, lines):
    """Check that the output contains the text ``readyz check passed``.

    Args:
        cmd: Command that produced the output; used only in the error message.
        lines: Raw text output of the command.

    Returns:
        0 when the substring ``readyz check passed`` occurs in ``lines``,
        otherwise 1 (after writing a failure message to stderr).
    """
    if "readyz check passed" not in lines:
        eprint("FAILED, in command: \'" + cmd + "\'\n")
        return 1
    # Return an explicit success code instead of falling through with None,
    # matching the int result of the other checkers.
    return 0
def main():
    """Command-line entry point: echo the command output and run one check.

    Expected arguments:
        sys.argv[1]: name of the check to run (see ``checkers`` below)
        sys.argv[2]: captured output of the command
        sys.argv[3]: the command itself, used in failure messages

    Returns:
        The result of the selected checker.

    Raises:
        SystemExit: with status 1 when arguments are missing or the check
            name is unknown.
    """
    # Check name -> checker function. Several names share one checker.
    checkers = {
        "overview": checkReadyState,
        "resources": checkResources,
        "etcd": checkEtcd,
        "router_pod": checkRouterRegistryPod,
        "router_deployment": checkRouterRegistryDeployment,
        "registry_pod": checkRouterRegistryPod,
        "registry_deployment": checkRouterRegistryDeployment,
        "clusteroperators": checkOperator,
        "deployment": checkDeploymentStateFul,
        "replicaset": checkRepSetDaeSetRepCtrlr,
        "pods": checkPod,
        "statefulset": checkDeploymentStateFul,
        "daemonset": checkRepSetDaeSetRepCtrlr,
        "replicationcontroller": checkRepSetDaeSetRepCtrlr,
        "readyz": checkReadyz,
    }

    # Fail with a readable message instead of an IndexError traceback.
    if len(sys.argv) < 4:
        eprint("FAILED, expected arguments: <check> <command output> <command>")
        sys.exit(1)
    check, output, cmd = sys.argv[1:4]

    # Prints the output of command
    print(output)
    print("\n")

    checker = checkers.get(check)
    if checker is None:
        eprint("FAILED, unknown resource from argument.")
        # sys.exit instead of the site-provided exit(), which is meant for
        # interactive sessions and is not always available.
        sys.exit(1)
    return checker(cmd, output)
# Run the check when executed as a script and use its result as the process
# exit status. sys.exit is used instead of the site-provided exit(), which is
# intended for interactive sessions and is not always available.
if __name__ == "__main__":
    sys.exit(main())
---
# Sanity-check playbook: logs in to the cluster, starts every command from
# command_list as a background job (via check.yml) and then waits for all of
# the jobs to finish.
#
# Required variables (not defined in this file): cluster_user,
# cluster_password, cluster_api.
- hosts: localhost
  vars:
    # List of all commands to check.
    # `check` selects the checker name handed to check.yml for that command.
    command_list:
      - command: oc get nodes -o wide
        check: "overview"
      - command: oc adm top nodes
        check: "resources"
      - command: kubectl get --raw='/readyz?verbose'
        check: "readyz"
      - command: oc get etcd -o=jsonpath='{range .items[0].status.conditions[?(@.type=="EtcdMembersAvailable")]}{.message}{"\n"}'
        check: "etcd"
      - command: oc get pod --namespace openshift-ingress
        check: "router_pod"
      - command: oc get deployment --namespace openshift-ingress
        check: "router_deployment"
      - command: oc get pod -n openshift-image-registry
        check: "registry_pod"
      - command: oc get deployment -n openshift-image-registry
        check: "registry_deployment"
      - command: oc -n default get clusteroperators
        check: "clusteroperators"
      - command: oc get deployment --all-namespaces
        check: "deployment"
      - command: oc get replicaset --all-namespaces | egrep -v ' 0 .* 0 '
        check: "replicaset"
      - command: oc get pods --all-namespaces
        check: "pods"
      - command: oc get statefulset --all-namespaces
        check: "statefulset"
      - command: oc get daemonset --all-namespaces
        check: "daemonset"
      - command: oc get replicationcontroller --all-namespaces
        check: "replicationcontroller"

  tasks:
    - name: Login into cluster
      ansible.builtin.shell: |
        set -o pipefail && \
        oc login --insecure-skip-tls-verify --username "{{ cluster_user }}" --password "{{ cluster_password }}" "{{ cluster_api }}"
      args:
        executable: /bin/bash
      register: cluster_login
      failed_when: '"Login successful." not in cluster_login.stdout'
      # The rendered command line contains the cluster password; keep it out
      # of the play output and logs.
      no_log: true

    - name: Created an array
      # Start from an empty list; check.yml appends one job record per command.
      ansible.builtin.set_fact:
        append_array: []

    - name: Start every check from command_list
      ansible.builtin.include_tasks: check.yml
      vars:
        command: '{{item.command}}'
        check: '{{item.check}}'
      loop: '{{command_list}}'

    - name: Async status
      # Poll each background job by its id until it reports finished.
      # NOTE(review): only `retries: 10` is set and no `delay`, so the total
      # wait depends on Ansible's default delay - confirm it is long enough
      # for the `async: 500` jobs started in check.yml.
      ansible.builtin.async_status:
        jid: "{{ async_result_item }}"
      loop: "{{ append_array|map(attribute='ansible_job_id')|list }}"
      loop_control:
        loop_var: "async_result_item"
      register: result
      until: result.finished
      retries: 10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment