Last active
November 10, 2022 19:15
-
-
Save SamGajdos/4c214fb2d6e67eff01217c6e25cbb27b to your computer and use it in GitHub Desktop.
Playbook to execute a sanity check against an OCP cluster.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Start one sanity-check command in the background and remember its job handle.
# Expects two variables from the including task:
#   command - shell command whose stdout is inspected
#   check   - name of the check that check_oc_command.py applies to that output
- name: Run and check command "{{command}}"
  # The command is expanded unquoted inside $(...) on purpose so that pipelines
  # in the command keep working; the check name and the command label are
  # shell-quoted so embedded quotes cannot break the argument list.
  # NOTE(review): the script path is relative to the working directory of the
  # task - confirm the playbook is always started from the script's directory.
  ansible.builtin.shell: >-
    ./check_oc_command.py {{ check | quote }} "$({{ command }})" {{ command | quote }}
  async: 500
  poll: 0
  args:
    executable: /bin/bash
  register: script_output
  # No failed_when here: with poll: 0 the registered value is only the job
  # handle, so the former test "'FAILED' in script_output" looked for a
  # dictionary key, was always false, and hid genuine launch failures.
  # The outcome of the check is available from async_status on ansible_job_id.

- name: Append to variable
  set_fact:
    # Collect every job handle so the caller can poll them all afterwards.
    append_array: "{{ append_array | default([]) + [script_output] }}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
def eprint(*args, **kwargs):
    """Print a message to standard error; arguments are forwarded to print()."""
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def checkReadyState(cmd, lines):
    """Check that every row of the output reports the state "Ready".

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string. The first line is treated
            as the table header, every following line as one row whose
            second whitespace-separated column is the state.

    Returns:
        0 when all rows are "Ready" (or there are no rows), 1 otherwise.
        On failure the header and the offending rows are written to stderr.
    """
    # Use the output handed in by the caller instead of re-reading sys.argv,
    # so the function behaves the same for any caller.
    rows = lines.splitlines()
    failed = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            continue  # blank line, nothing to check
        # A row too short to carry a state column cannot be verified, so it
        # is reported instead of crashing with an IndexError.
        if len(words) < 2 or words[1] != "Ready":
            failed.append(row)
    if not failed:
        return 0
    eprint("\n".join(["FAILED in command: '" + cmd + "'", rows[0]] + failed) + "\n")
    return 1
def checkResources(cmd, lines):
    """Check that no row reports "0%" in its usage percentage columns.

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string. The first line is treated
            as the table header; in every following row the third and fifth
            whitespace-separated columns are inspected (presumably CPU% and
            MEMORY% of `oc adm top nodes` - confirm against the real output).

    Returns:
        0 when no row shows "0%" in either column (or there are no rows),
        1 otherwise. On failure the header and the offending rows are
        written to stderr.
    """
    # Use the output handed in by the caller instead of re-reading sys.argv.
    rows = lines.splitlines()
    failed = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            continue  # blank line, nothing to check
        # Rows missing the percentage columns are reported rather than
        # raising an IndexError.
        # CPU or Memory
        if len(words) < 5 or words[2] == "0%" or words[4] == "0%":
            failed.append(row)
    if not failed:
        return 0
    eprint("\n".join(["FAILED in command: '" + cmd + "'", rows[0]] + failed) + "\n")
    return 1
def checkEtcd(cmd, lines):
    """Check that the etcd status message does not mention "unhealthy".

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string.

    Returns:
        1 when the text contains "unhealthy" (a message is written to
        stderr), 0 otherwise.
    """
    if "unhealthy" in lines:
        eprint("FAILED, in command: '" + cmd + "'\n")
        return 1
    # Return an explicit status so callers always receive an int, matching
    # the other check functions.
    return 0
def checkRouterRegistryDeployment(cmd, lines):
    """Check that every row's "x/y" column has equal numbers on both sides.

    Used for the ingress router and image registry deployment listings.

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string. The first line is treated
            as the table header; in every following row the second
            whitespace-separated column is expected to look like "x/y".

    Returns:
        0 when x equals y in every row (or there are no rows), 1 otherwise.
        On failure the header and the offending rows are written to stderr.
    """
    # Use the output handed in by the caller instead of re-reading sys.argv.
    rows = lines.splitlines()
    failed = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            continue  # blank line, nothing to check
        if len(words) < 2:
            failed.append(row)  # no ratio column to verify
            continue
        ready, slash, wanted = words[1].partition("/")
        # A column without "/" cannot be verified and is reported instead of
        # raising an IndexError.
        if not slash or ready != wanted:
            failed.append(row)
    if not failed:
        return 0
    eprint("\n".join(["FAILED in command: '" + cmd + "'", rows[0]] + failed) + "\n")
    return 1
def checkRouterRegistryPod(cmd, lines):
    """Check that every row is fully ready or marked "Completed".

    Used for the ingress router and image registry pod listings.

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string. The first line is treated
            as the table header; in every following row the second
            whitespace-separated column is expected to look like "x/y" and
            the third column is the status.

    Returns:
        0 when every row has x equal to y or the status "Completed" (or
        there are no rows), 1 otherwise. On failure the header and the
        offending rows are written to stderr.
    """
    # Use the output handed in by the caller instead of re-reading sys.argv.
    rows = lines.splitlines()
    failed = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            continue  # blank line, nothing to check
        if len(words) >= 2:
            ready, slash, wanted = words[1].partition("/")
            if slash and ready == wanted:
                continue
        # Not fully ready (or not verifiable): only acceptable when the row
        # is explicitly marked as completed.
        if len(words) >= 3 and words[2] == "Completed":
            continue
        failed.append(row)
    if not failed:
        return 0
    eprint("\n".join(["FAILED in command: '" + cmd + "'", rows[0]] + failed) + "\n")
    return 1
def checkOperator(cmd, lines):
    """Check that no cluster operator row is reported as degraded.

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string. The first line is treated
            as the table header; in every following row the fifth
            whitespace-separated column must be "False" (presumably the
            DEGRADED column - confirm against the real output).

    Returns:
        0 when the fifth column is "False" in every row (or there are no
        rows), 1 otherwise. On failure the header and the offending rows
        are written to stderr.
    """
    # Use the output handed in by the caller instead of re-reading sys.argv.
    rows = lines.splitlines()
    failed = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            continue  # blank line, nothing to check
        # Rows missing the inspected column are reported rather than
        # raising an IndexError.
        if len(words) < 5 or words[4] != "False":
            failed.append(row)
    if not failed:
        return 0
    eprint("\n".join(["FAILED in command: '" + cmd + "'", rows[0]] + failed) + "\n")
    return 1
def checkDeploymentStateFul(cmd, lines):
    """Check that every Deployment/StatefulSet row is fully ready.

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string. The first line is treated
            as the table header; in every following row the third
            whitespace-separated column is expected to look like "x/y".

    Returns:
        0 when x equals y in every row (or there are no rows), 1 otherwise.
        On failure the header and the offending rows are written to stderr.
    """
    # Use the output handed in by the caller instead of re-reading sys.argv.
    rows = lines.splitlines()
    failed = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            continue  # blank line, nothing to check
        if len(words) < 3:
            failed.append(row)  # no ratio column to verify
            continue
        ready, slash, wanted = words[2].partition("/")
        # A column without "/" cannot be verified and is reported instead of
        # raising an IndexError.
        if not slash or ready != wanted:
            failed.append(row)
    if not failed:
        return 0
    eprint("\n".join(["FAILED in command: '" + cmd + "'", rows[0]] + failed) + "\n")
    return 1
def checkPod(cmd, lines):
    """Check that every pod row is fully ready or marked "Completed".

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string. The first line is treated
            as the table header; in every following row the third
            whitespace-separated column is expected to look like "x/y" and
            the fourth column is the status.

    Returns:
        0 when every row has x equal to y or the status "Completed" (or
        there are no rows), 1 otherwise. On failure the header and the
        offending rows are written to stderr.
    """
    # Use the output handed in by the caller instead of re-reading sys.argv.
    rows = lines.splitlines()
    failed = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            continue  # blank line, nothing to check
        if len(words) >= 3:
            ready, slash, wanted = words[2].partition("/")
            if slash and ready == wanted:
                continue
        # Not fully ready (or not verifiable): only acceptable when the row
        # is explicitly marked as completed.
        if len(words) >= 4 and words[3] == "Completed":
            continue
        failed.append(row)
    if not failed:
        return 0
    eprint("\n".join(["FAILED in command: '" + cmd + "'", rows[0]] + failed) + "\n")
    return 1
def checkRepSetDaeSetRepCtrlr(cmd, lines):
    """Check ReplicaSet, DaemonSet and ReplicationController listings.

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string. The first line is treated
            as the table header; in every following row the third and fifth
            whitespace-separated columns must be equal (presumably DESIRED
            and READY - confirm against the real output).

    Returns:
        0 when both columns match in every row (or there are no rows),
        1 otherwise. On failure the header and the offending rows are
        written to stderr.
    """
    # Use the output handed in by the caller instead of re-reading sys.argv.
    rows = lines.splitlines()
    failed = []
    for row in rows[1:]:
        words = row.split()
        if not words:
            continue  # blank line, nothing to check
        # Rows missing the compared columns are reported rather than
        # raising an IndexError.
        if len(words) < 5 or words[2] != words[4]:
            failed.append(row)
    if not failed:
        return 0
    eprint("\n".join(["FAILED in command: '" + cmd + "'", rows[0]] + failed) + "\n")
    return 1
def checkReadyz(cmd, lines):
    """Check that the readyz output contains "readyz check passed".

    Args:
        cmd: The command that produced the output; only used in the error
            message.
        lines: Raw command output as one string.

    Returns:
        0 when the text contains "readyz check passed", 1 otherwise (a
        message is written to stderr).
    """
    if "readyz check passed" not in lines:
        eprint("FAILED, in command: '" + cmd + "'\n")
        return 1
    # Return an explicit status so callers always receive an int, matching
    # the other check functions.
    return 0
def main():
    """Echo the command output and run the check selected on the command line.

    Expected arguments: sys.argv[1] is the check name, sys.argv[2] the
    output of the command, sys.argv[3] the command itself (used in error
    messages).

    Returns:
        The exit status of the selected check, or 1 when the arguments are
        missing or the check name is unknown.
    """
    if len(sys.argv) < 4:
        # Fail with a readable message instead of an IndexError traceback.
        eprint("FAILED, expected arguments: <check> <command output> <command>")
        return 1
    check, output, cmd = sys.argv[1], sys.argv[2], sys.argv[3]

    # Prints the output of command
    print(output)
    print("\n")

    # Check name -> function that validates the output of that command.
    checks = {
        "overview": checkReadyState,
        "resources": checkResources,
        "etcd": checkEtcd,
        "router_pod": checkRouterRegistryPod,
        "router_deployment": checkRouterRegistryDeployment,
        "registry_pod": checkRouterRegistryPod,
        "registry_deployment": checkRouterRegistryDeployment,
        "clusteroperators": checkOperator,
        "deployment": checkDeploymentStateFul,
        "replicaset": checkRepSetDaeSetRepCtrlr,
        "pods": checkPod,
        "statefulset": checkDeploymentStateFul,
        "daemonset": checkRepSetDaeSetRepCtrlr,
        "replicationcontroller": checkRepSetDaeSetRepCtrlr,
        "readyz": checkReadyz,
    }
    handler = checks.get(check)
    if handler is None:
        eprint("FAILED, unknown resource from argument.")
        # Return the status so the caller decides how to exit, rather than
        # calling the interactive exit() helper from inside the function.
        return 1
    return handler(cmd, output)
if __name__ == "__main__":
    # sys.exit() is always available, whereas the bare exit() helper is
    # installed by the site module and intended for interactive use.
    sys.exit(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Sanity check of an OCP cluster: log in, start every check command as a
# background job via check.yml, then wait for all jobs to finish.
- hosts: localhost
  vars:
    # List of all commands to check
    command_list:
      - command: oc get nodes -o wide
        check: "overview"
      - command: oc adm top nodes
        check: "resources"
      - command: kubectl get --raw='/readyz?verbose'
        check: "readyz"
      - command: oc get etcd -o=jsonpath='{range .items[0].status.conditions[?(@.type=="EtcdMembersAvailable")]}{.message}{"\n"}'
        check: "etcd"
      - command: oc get pod --namespace openshift-ingress
        check: "router_pod"
      - command: oc get deployment --namespace openshift-ingress
        check: "router_deployment"
      - command: oc get pod -n openshift-image-registry
        check: "registry_pod"
      - command: oc get deployment -n openshift-image-registry
        check: "registry_deployment"
      - command: oc -n default get clusteroperators
        check: "clusteroperators"
      - command: oc get deployment --all-namespaces
        check: "deployment"
      - command: oc get replicaset --all-namespaces | egrep -v ' 0 .* 0 '
        check: "replicaset"
      - command: oc get pods --all-namespaces
        check: "pods"
      - command: oc get statefulset --all-namespaces
        check: "statefulset"
      - command: oc get daemonset --all-namespaces
        check: "daemonset"
      - command: oc get replicationcontroller --all-namespaces
        check: "replicationcontroller"
  tasks:
    - name: Login into cluster
      # NOTE(review): TLS verification is disabled for the login - confirm
      # this is acceptable for the target cluster.
      shell: |
        set -o pipefail && \
        oc login --insecure-skip-tls-verify --username "{{ cluster_user }}" --password "{{ cluster_password }}" "{{ cluster_api }}"
      args:
        executable: /bin/bash
      register: cluster_login
      failed_when: '"Login successful." not in cluster_login.stdout'
      # The command line carries the cluster password; keep it out of the
      # task output and logs.
      no_log: true

    - name: Created an array
      set_fact:
        append_array: []

    # Starts one background job per entry of command_list; check.yml appends
    # each job handle to append_array.
    - include_tasks: check.yml
      vars:
        command: '{{item.command}}'
        check: '{{item.check}}'
      loop: '{{command_list}}'

    - name: Async status
      async_status:
        jid: "{{ async_result_item }}"
      loop: "{{ append_array|map(attribute='ansible_job_id')|list }}"
      loop_control:
        loop_var: "async_result_item"
      register: result
      until: result.finished
      # check.yml lets each job run for up to 500 seconds (async: 500), so
      # poll for the same duration: 100 retries x 5 seconds.
      retries: 100
      delay: 5
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment