# mdrakiburrahman/sqlinstance-ha.yaml.tmpl

## sqlinstance-ha.yaml.tmpl
# ==========================================================================
# SQLINSTANCE-HA
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# WORKFLOW DURATION: 18m
# --------------------------------------------------------------------------
# CHAOS DESCRIPTION
# --------------------------------------------------------------------------
# In this chaos simulation we target a 2 or 3 replica MIAA:
#
#    1. Setup a User DB
#
#    2. Setup 10+ pods that continuously attempt to write transactions to the
#       primary endpoint in an infinite loop
#
#    3. Kill replicas in an orchestrated manner to try to force a failover
#       onto (promote) a lagging replica, then repeatedly kill the previous
#       2 primary replicas
#
#    4. Kill random pods, and sqlserver and orchestrator containers
#
#    5. Degrade storage, burn CPU/Mem and partially corrupt networking
#
# --------------------------------------------------------------------------
# VAR SUBSTITUTIONS
# --------------------------------------------------------------------------
# - CHAOS_INSTANCE_NAME - Name of SQL Instance
# - CHAOS_PLUGIN_NAMESPACE - Arc Data Namespace
# - CHAOS_INSTANCE_USERNAME - Username of instance admin
# - CHAOS_INSTANCE_PASSWORD - Password of instance admin
# - CHAOS_PLUGIN - Sonobuoy Plugin & file; e.g. '{{ CHAOS_PLUGIN }}.yaml.tmpl'
# ==========================================================================

# Chaos Mesh Workflow resource; both metadata fields are filled in by template
# substitution before the manifest is applied.
apiVersion: chaos-mesh.org/v1alpha1
kind: Workflow
metadata:
  namespace: {{ CHAOS_PLUGIN_NAMESPACE }}
  name: {{ CHAOS_PLUGIN }}
spec:
  # Name of the template the workflow starts from (defined first in the list).
  entry: serial-root
  templates:
    # ==================== entry point ====================
    # Root of the workflow: a Serial template over three children, with the
    # whole run capped at 18 minutes.
    - name: serial-root
      templateType: Serial
      deadline: 18m
      children:
        - setup-user-db     # Task, 180s deadline: (re)create UserDB + UserTable
        - fill-user-db      # Parallel, 2m deadline: inserters + network faults
        - parallel-root     # Parallel: the main chaos phase

      # Note on 'fill-user-db' below: although its deadline is 2 minutes, the
      # task pod definitions stay up until the 'Workflow' Custom Resource is
      # explicitly deleted. Chaos Mesh has a design gap that works in our
      # favor to keep stressing transactions: basically, they don't expect you
      # to have infinite loops in your tasks, and bank on the fact that pods
      # exit naturally.
      #
    # Starts the write load: ten parallel copies of the infinite insert task,
    # plus four network-fault schedules between database pods.
    - name: fill-user-db
      templateType: Parallel
      deadline: 2m
      children:
        # Ten inserters, all instances of the same Task template.
        - repeat-perform-transactions
        - repeat-perform-transactions
        - repeat-perform-transactions
        - repeat-perform-transactions
        - repeat-perform-transactions
        - repeat-perform-transactions
        - repeat-perform-transactions
        - repeat-perform-transactions
        - repeat-perform-transactions
        - repeat-perform-transactions
        # Each of these is scheduled '@every 60s', so they kick in at the 1m
        # mark; the aim is to block transactions being acked by replicas.
        - partial-database-to-database-corrupt
        - partial-database-to-database-duplicate
        - partial-database-to-database-delay
        - partial-database-to-database-loss

    # Main chaos phase: compute, storage, network and resource-stress branches
    # all run side by side. It sets no deadline of its own.
    - name: parallel-root
      templateType: Parallel
      children:
        # One extra inserter: in case Chaos Mesh fixes the task pod cleanup
        # for 'fill-user-db', we want this single inserter to keep running.
        - repeat-perform-transactions
        - parallel-compute-killer
        - parallel-storage-killer
        - parallel-network-killer
        - burn-cpu-mem
    # ================== compute chaos ==================
    # //////////////////// KILL PODS ////////////////////
    # ===================================================
    # ==================== parallels ====================
    # Compute chaos: all pod/container killers run in parallel for at most
    # 13 minutes.
    - name: parallel-compute-killer
      templateType: Parallel
      deadline: 13m
      children:
        # Should exit first (5m deadline): try and get a low sequence
        # instance back.
        - repeat-kill-2-container
        # Should exit after that (10m deadline): get two instances that have
        # similar seq.
        - orchestrated-0-1-kills
        # The remaining four exit at the end.
        - repeat-kill-ha-orchestrator-pod
        - repeat-kill-ha-supervisor-container
        - repeat-kill-random-database-pod
        - repeat-kill-controldb-pod

    # Final stage of 'orchestrated-0-1-kills': loop the forced-failover T-SQL
    # while repeatedly killing the 'arc-sqlmi' container of replicas 0 and 1.
    # No deadline is set on this template itself.
    - name: repeat-kill-0-1-container
      templateType: Parallel
      children:
        - repeat-force-failover-tsql
        - repeat-kill-0-container
        - repeat-kill-1-container
    # ==================== orchestrated ====================
    # Scripted kill sequence against replicas 0 and 1, capped at 10 minutes:
    # forced failovers interleaved with single container kills and 60s pauses,
    # ending in the repeating kill loop.
    - name: orchestrated-0-1-kills
      templateType: Serial
      deadline: 10m
      children:
        - force-failover-tsql
        # For Network changes to take action immediately
        - kill-ha-pod
        - kill-0-container
        - force-failover-tsql
        - sleep-for-paxos
        - kill-1-container
        - force-failover-tsql
        - sleep-for-paxos
        - repeat-kill-0-1-container
    # ==================== repeat kills ====================
    # Every 10 seconds, kill the 'arc-sqlmi' container in the pod named
    # '<instance>-0'.
    - name: repeat-kill-0-container
      templateType: Schedule
      schedule:
        type: PodChaos
        schedule: "@every 10s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        podChaos:
          action: container-kill
          mode: one
          containerNames:
            - arc-sqlmi
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-0

    # Every 10 seconds, kill the 'arc-sqlmi' container in the pod named
    # '<instance>-1'.
    - name: repeat-kill-1-container
      templateType: Schedule
      schedule:
        type: PodChaos
        schedule: "@every 10s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        podChaos:
          action: container-kill
          mode: one
          containerNames:
            - arc-sqlmi
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-1

    # Every 10 seconds, kill the 'arc-sqlmi' container in the pod named
    # '<instance>-2'. Unlike its 0/1 siblings, this one carries its own
    # 5 minute deadline.
    - name: repeat-kill-2-container
      templateType: Schedule
      deadline: 5m
      schedule:
        type: PodChaos
        schedule: "@every 10s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        podChaos:
          action: container-kill
          mode: one
          containerNames:
            - arc-sqlmi
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-2

    # Every 7 minutes, pod-kill one pod labelled as the 'orchestrator'
    # component of this instance.
    - name: repeat-kill-ha-orchestrator-pod
      templateType: Schedule
      schedule:
        type: PodChaos
        schedule: "@every 7m"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        podChaos:
          action: pod-kill
          mode: one
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: orchestrator

    # Every 400 seconds, kill the 'arc-ha-supervisor' container in one pod
    # labelled as a 'database' component of this instance.
    - name: repeat-kill-ha-supervisor-container
      templateType: Schedule
      schedule:
        type: PodChaos
        schedule: "@every 400s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        podChaos:
          action: container-kill
          mode: one
          containerNames:
            - arc-ha-supervisor
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: database

    # Every 500 seconds, pod-kill one pod labelled as a 'database' component
    # of this instance.
    - name: repeat-kill-random-database-pod
      templateType: Schedule
      schedule:
        type: PodChaos
        schedule: "@every 500s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        podChaos:
          action: pod-kill
          mode: one
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: database

    # Every 7 minutes, pod-kill one pod labelled 'app: controldb' in the
    # namespace. The selector is not scoped to the SQL instance.
    - name: repeat-kill-controldb-pod
      templateType: Schedule
      schedule:
        type: PodChaos
        schedule: "@every 7m"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        podChaos:
          action: pod-kill
          mode: one
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app: controldb
    # ==================== single kills ====================
    # One-shot: pod-kill every pod labelled as the 'orchestrator' component of
    # this instance. The step is bounded at 1 minute.
    - name: kill-ha-pod
      templateType: PodChaos
      deadline: 1m
      podChaos:
        action: pod-kill
        mode: all
        selector:
          namespaces:
            - {{ CHAOS_PLUGIN_NAMESPACE }}
          labelSelectors:
            app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
            app.kubernetes.io/component: orchestrator

    # One-shot: kill the 'arc-sqlmi' container in the pod named
    # '<instance>-0'. The step is bounded at 1 minute.
    - name: kill-0-container
      templateType: PodChaos
      deadline: 1m
      podChaos:
        action: container-kill
        mode: all
        containerNames:
          - arc-sqlmi
        selector:
          namespaces:
            - {{ CHAOS_PLUGIN_NAMESPACE }}
          labelSelectors:
            statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-0

    # One-shot: kill the 'arc-sqlmi' container in the pod named
    # '<instance>-1'. The step is bounded at 1 minute.
    - name: kill-1-container
      templateType: PodChaos
      deadline: 1m
      podChaos:
        action: container-kill
        mode: all
        containerNames:
          - arc-sqlmi
        selector:
          namespaces:
            - {{ CHAOS_PLUGIN_NAMESPACE }}
          labelSelectors:
            statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-1
    # ==================== sleep ====================
    # Pause step used between kills in 'orchestrated-0-1-kills': runs
    # 'sleep 60' in a busybox container, with the task itself bounded at 60s.
    - name: sleep-for-paxos
      templateType: Task
      deadline: 60s
      task:
        container:
          # Fixed typo: was 'main-contaienr'.
          name: main-container
          image: busybox
          command:
            - sh
            - -c
            - sleep 60
    # ====================== t-sql ======================
    # One-shot forced failover attempt, bounded at 60s: connects to each of
    # replicas 0..2 in turn and asks it to switch its availability group role
    # to SECONDARY. 'set +e' keeps the loop going when a replica is
    # unreachable or rejects the statement.
    - name: force-failover-tsql
      templateType: Task
      deadline: 60s
      task:
        container:
          name: tsql
          image: mcr.microsoft.com/mssql-tools
          command: ["/bin/sh", "-c"]
          args:
            # Literal block scalar so substituted values cannot be
            # misread as YAML (' #', ': '). Username and password are
            # single-quoted so shell metacharacters in them ($, &, ;, spaces)
            # reach sqlcmd verbatim. A value containing a single quote is
            # still unsupported.
            - |
              set +e;
              echo "Testing all 3 replicas for failover:";
              for replica in $(seq 0 2); do
                echo \"================================= {{ CHAOS_INSTANCE_NAME }}-$replica ====================================\";
                /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-$replica.{{ CHAOS_INSTANCE_NAME }}-svc,1433 -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY);";
                echo \"=============================================================================\";
              done;

      # Chaos Mesh doesn't allow wrapping Schedule CRD on top of task CRDs
      # (yet), so unfortunately we have to copy-paste the simple loop above.
      #
    # Endless variant of the forced-failover loop: cycles over replicas 0..2
    # forever, asking each to switch its availability group role to SECONDARY.
    # It has no deadline of its own and never exits by itself.
    - name: repeat-force-failover-tsql
      templateType: Task
      task:
        container:
          name: tsql
          image: mcr.microsoft.com/mssql-tools
          command: ["/bin/sh", "-c"]
          args:
            # Literal block scalar so substituted values cannot be
            # misread as YAML (' #', ': '). Username and password are
            # single-quoted so shell metacharacters in them ($, &, ;, spaces)
            # reach sqlcmd verbatim. A value containing a single quote is
            # still unsupported.
            - |
              set +e;
              echo "Testing all 3 replicas for failover:";
              while true; do
                for replica in $(seq 0 2); do
                  echo \"================================= {{ CHAOS_INSTANCE_NAME }}-$replica ====================================\";
                  /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-$replica.{{ CHAOS_INSTANCE_NAME }}-svc,1433 -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY);";
                  echo \"=============================================================================\";
                done;
              done;

    # First step of the workflow, bounded at 180s: against the '-p-svc'
    # endpoint, drop and recreate database [UserDB], then drop and recreate
    # table [dbo].[UserTable] (id int, name varchar(50)). 'set +e' means a
    # failing statement does not stop the remaining ones.
    - name: setup-user-db
      templateType: Task
      deadline: 180s
      task:
        container:
          name: tsql
          image: mcr.microsoft.com/mssql-tools
          command: ["/bin/sh", "-c"]
          args:
            # Literal block scalar so substituted values cannot be
            # misread as YAML (' #', ': '). Username and password are
            # single-quoted so shell metacharacters in them ($, &, ;, spaces)
            # reach sqlcmd verbatim. A value containing a single quote is
            # still unsupported.
            - |
              set +e;
              echo "Dropping database, if exists:";
              /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "master" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "DROP DATABASE IF EXISTS [UserDB];";
              echo "Setting up database:";
              /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "master" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "CREATE DATABASE [UserDB];";
              echo "Dropping table:";
              /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "DROP TABLE IF EXISTS [dbo].[UserTable];";
              echo "Creating table:";
              /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "CREATE TABLE [dbo].[UserTable] (id int, name varchar(50));";
              echo "Done!";

    # Infinite inserter: in a loop, inserts (i, '<plugin name>') into
    # [dbo].[UserTable] through the '-p-svc' endpoint, incrementing i on
    # every attempt whether or not the insert succeeded ('set +e'). Each
    # sqlcmd call uses a 1 second query timeout (-t) and a 1 second login
    # timeout (-l). The task has no deadline and never exits by itself.
    - name: repeat-perform-transactions
      templateType: Task
      task:
        container:
          name: tsql
          image: mcr.microsoft.com/mssql-tools
          command: ["/bin/sh", "-c"]
          args:
            # Literal block scalar so substituted values cannot be
            # misread as YAML (' #', ': '). Username and password are
            # single-quoted so shell metacharacters in them ($, &, ;, spaces)
            # reach sqlcmd verbatim. A value containing a single quote is
            # still unsupported.
            - |
              set +e;
              echo "Running infinite insertion loop";
              i=0;
              while true; do
                echo "Attempting - $i";
                /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "INSERT INTO [dbo].[UserTable] (id, name) VALUES ($i, '{{ CHAOS_PLUGIN }}');" -t 1 -l 1;
                i=$((i+1));
              done

    # ================ storage parallel =================
    # //////////// DESTROY CRITICAL STORAGE /////////////
    # ===================================================
    # Storage chaos: write-latency and read-latency injection run side by
    # side.
    - name: parallel-storage-killer
      templateType: Parallel
      children:
        - storage-delay-write     # WRITE latency, scheduled every 120s
        - storage-delay-read      # READ latency, scheduled every 80s
    # ================== storage chaos ==================
    # ////////////       /var/opt/mssql      ////////////
    # ===================================================
    # ====================== write ======================
    # Every 120 seconds, inject 150ms latency into 15% of WRITE calls under
    # /var/opt/mssql on all 'database' pods of this instance.
    # NOTE(review): no 'duration' is set here, unlike the NetworkChaos
    # schedules in this file - confirm the fault is meant to stay injected
    # between runs.
    - name: storage-delay-write
      templateType: Schedule
      schedule:
        type: IOChaos
        schedule: "@every 120s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        ioChaos:
          action: latency
          delay: 150ms
          percent: 15
          methods:
            - WRITE
          volumePath: /var/opt/mssql
          path: /var/opt/mssql/**/*
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: database
    # ====================== read ======================
    # Every 80 seconds, inject 150ms latency into 15% of READ calls under
    # /var/opt/mssql on all 'database' pods of this instance.
    # NOTE(review): no 'duration' is set here, unlike the NetworkChaos
    # schedules in this file - confirm the fault is meant to stay injected
    # between runs.
    - name: storage-delay-read
      templateType: Schedule
      schedule:
        type: IOChaos
        schedule: "@every 80s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        ioChaos:
          action: latency
          delay: 150ms
          percent: 15
          methods:
            - READ
          volumePath: /var/opt/mssql
          path: /var/opt/mssql/**/*
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: database
    # ================== comms parallel =================
    # ///////////  DESTROY COMMS CHANNELS   /////////////
    # ===================================================
    # Network chaos: all six network-fault schedules run side by side.
    - name: parallel-network-killer
      templateType: Parallel
      children:
        # Orchestrator -> external targets, and orchestrator <-> database.
        - partial-k8s-block-ha
        - partial-database-to-ha-block
        # Database -> database; the same four are also started by
        # 'fill-user-db'.
        - partial-database-to-database-corrupt
        - partial-database-to-database-duplicate
        - partial-database-to-database-delay
        - partial-database-to-database-loss
    # ====================== k8s ======================
    # Every 60 seconds, for 55 seconds, corrupt 40% of packets (correlation
    # 20) going from the orchestrator pods to the listed external targets.
    # Despite the 'block' in the name, the action is 'corrupt', not a
    # partition.
    - name: partial-k8s-block-ha
      templateType: Schedule
      schedule:
        type: NetworkChaos
        schedule: "@every 60s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        networkChaos:
          action: corrupt
          corrupt:
            corrupt: "40"
            correlation: "20"
          duration: 55s
          direction: to
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: orchestrator
          # NOTE(review): K3s documents 10.42.0.0/16 (pods) and 10.43.0.0/16
          # (services) as its defaults - confirm the /24 below is intended.
          externalTargets:
            - 10.0.0.0/16                 # AKS default
            - 100.64.0.0/16               # Our Kubeadm setup
            - 10.42.0.0/24                # K3s default
            - 172.30.0.0/16               # OpenShift default
            - kubernetes.default.svc
    # ====================== database-to-ha ======================
    # Every 60 seconds, for 55 seconds, corrupt 45% of packets (correlation
    # 40) in both directions between the orchestrator pods and the database
    # pods of this instance. Despite the 'block' in the name, the action is
    # 'corrupt'.
    - name: partial-database-to-ha-block
      templateType: Schedule
      schedule:
        type: NetworkChaos
        schedule: "@every 60s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        networkChaos:
          action: corrupt
          corrupt:
            corrupt: "45"
            correlation: "40"
          duration: 55s
          direction: both
          # Source side: orchestrator pods.
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: orchestrator
          # Peer side: database pods.
          target:
            mode: all
            selector:
              namespaces:
                - {{ CHAOS_PLUGIN_NAMESPACE }}
              labelSelectors:
                app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
                app.kubernetes.io/component: database
    # ====================== database-to-database ======================
    # Every 60 seconds, for 15 seconds, corrupt 40% of packets (correlation
    # 25) sent from this instance's database pods to its database pods.
    - name: partial-database-to-database-corrupt
      templateType: Schedule
      schedule:
        type: NetworkChaos
        schedule: "@every 60s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        networkChaos:
          action: corrupt
          corrupt:
            corrupt: "40"
            correlation: "25"
          duration: 15s
          direction: to
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: database
          target:
            mode: all
            selector:
              namespaces:
                - {{ CHAOS_PLUGIN_NAMESPACE }}
              labelSelectors:
                app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
                app.kubernetes.io/component: database

    # Every 60 seconds, for 10 seconds, duplicate 40% of packets (correlation
    # 25) sent from this instance's database pods to its database pods.
    - name: partial-database-to-database-duplicate
      templateType: Schedule
      schedule:
        type: NetworkChaos
        schedule: "@every 60s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        networkChaos:
          action: duplicate
          duplicate:
            duplicate: '40'
            correlation: '25'
          duration: 10s
          direction: to
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: database
          target:
            mode: all
            selector:
              namespaces:
                - {{ CHAOS_PLUGIN_NAMESPACE }}
              labelSelectors:
                app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
                app.kubernetes.io/component: database

    # Every 60 seconds, for 5 seconds, delay packets sent from this
    # instance's database pods to its database pods: 90ms latency, 90ms
    # jitter, correlation 25.
    - name: partial-database-to-database-delay
      templateType: Schedule
      schedule:
        type: NetworkChaos
        schedule: "@every 60s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        networkChaos:
          action: delay
          delay:
            latency: '90ms'
            jitter: '90ms'
            correlation: '25'
          duration: 5s
          direction: to
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: database
          target:
            mode: all
            selector:
              namespaces:
                - {{ CHAOS_PLUGIN_NAMESPACE }}
              labelSelectors:
                app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
                app.kubernetes.io/component: database

    # Every 60 seconds, for 24 seconds, drop 25% of packets (correlation 25)
    # sent from this instance's database pods to its database pods.
    - name: partial-database-to-database-loss
      templateType: Schedule
      schedule:
        type: NetworkChaos
        schedule: "@every 60s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        networkChaos:
          action: loss
          loss:
            loss: '25'
            correlation: '25'
          duration: 24s
          direction: to
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/component: database
          target:
            mode: all
            selector:
              namespaces:
                - {{ CHAOS_PLUGIN_NAMESPACE }}
              labelSelectors:
                app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
                app.kubernetes.io/component: database
    # ==================== utilities ====================
    # ////////////           MISC            ////////////
    # ===================================================
    # Every 120 seconds, stress all pods that are part of this SQL Managed
    # Instance: 4 CPU workers at load 100, and 2 memory workers of 256Mi.
    # NOTE(review): no 'duration' is set here, unlike the NetworkChaos
    # schedules in this file - confirm the stress is meant to stay applied
    # between runs.
    - name: burn-cpu-mem
      templateType: Schedule
      schedule:
        type: StressChaos
        schedule: "@every 120s"
        concurrencyPolicy: Forbid
        startingDeadlineSeconds: null
        historyLimit: 1
        stressChaos:
          stressors:
            cpu:
              workers: 4
              load: 100
            memory:
              workers: 2
              size: 256Mi
          mode: all
          selector:
            namespaces:
              - {{ CHAOS_PLUGIN_NAMESPACE }}
            labelSelectors:
              app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
              app.kubernetes.io/part-of: SqlManagedInstance
	# ==========================================================================
	# SQLINSTANCE-HA
	# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	# WORKFLOW DURATION: 18m
	# --------------------------------------------------------------------------
	# CHAOS DESCRIPTION
	# --------------------------------------------------------------------------
	# In this chaos simulation we target a 2 or 3 replica MIAA:
	#
	# 1. Setup a User DB
	#
	# 2. Setup 10+ pods that continuously attempt to write transactions to the
	# primary endpoint in an infinite loop
	#
	# 3. Kill Replicas in an orchestrated manner to try force, and then promote
	# a lagging replica, repeat kill the previous 2 primary replicas
	#
	# 4. Kill random pods, and sqlserver and orchestrator containers
	#
	# 5. Degrade storage, burn CPU/Mem and partially corrupt networking
	#
	# --------------------------------------------------------------------------
	# VAR SUBSTITUTIONS
	# --------------------------------------------------------------------------
	# - CHAOS_INSTANCE_NAME - Name of SQL Instance
	# - CHAOS_PLUGIN_NAMESPACE - Arc Data Namespace
	# - CHAOS_INSTANCE_USERNAME - Username of instance admin
	# - CHAOS_INSTANCE_PASSWORD - Password of instance admin
	# - CHAOS_PLUGIN - Sonobuoy Plugin & file; e.g. '{{ CHAOS_PLUGIN }}.yaml.tmpl'
	# ==========================================================================

	apiVersion: chaos-mesh.org/v1alpha1
	kind: Workflow
	metadata:
	name: {{ CHAOS_PLUGIN }}
	namespace: {{ CHAOS_PLUGIN_NAMESPACE }}
	spec:
	entry: serial-root
	templates:
	# ==================== entry point ====================
	- name: serial-root
	templateType: Serial
	children:
	- setup-user-db
	- fill-user-db
	- parallel-root
	deadline: 18m

	# Although the deadline is 2 minutes, the task pod definitions stay up
	# until the 'Workflow' Custom Resource is explicitly deleted. Chaos Mesh
	# has a design gap that works in our favor to keep stressing transactions,
	# basically they don't expect you to have infinite loops in your tasks,
	# and bank on the fact that pods exit naturally.
	#
	- name: fill-user-db
	deadline: 2m
	templateType: Parallel
	children:
	- repeat-perform-transactions
	- repeat-perform-transactions
	- repeat-perform-transactions
	- repeat-perform-transactions
	- repeat-perform-transactions
	- repeat-perform-transactions
	- repeat-perform-transactions
	- repeat-perform-transactions
	- repeat-perform-transactions
	- repeat-perform-transactions
	# These kick in at the 1m mark, let's try to block transactions being
	# acked by replicas
	- partial-database-to-database-corrupt
	- partial-database-to-database-duplicate
	- partial-database-to-database-delay
	- partial-database-to-database-loss

	- name: parallel-root
	templateType: Parallel
	children:
	- repeat-perform-transactions # In case Chaos Mesh fixes the task pod cleanup above, we want this single inserter to keep running
	- parallel-compute-killer
	- parallel-storage-killer
	- parallel-network-killer
	- burn-cpu-mem
	# ================== compute chaos ==================
	# //////////////////// KILL PODS ////////////////////
	# ===================================================
	# ==================== parallels ====================
	- name: parallel-compute-killer
	templateType: Parallel
	children:
	- repeat-kill-2-container # Should exit first, try and get a low sequence instance back
	- orchestrated-0-1-kills # Should exit after, get two instances that have similar seq
	- repeat-kill-ha-orchestrator-pod # Exits at the end
	- repeat-kill-ha-supervisor-container # Exits at the end
	- repeat-kill-random-database-pod # Exits at the end
	- repeat-kill-controldb-pod # Exits at the end
	deadline: 13m

	- name: repeat-kill-0-1-container
	templateType: Parallel
	children:
	- repeat-force-failover-tsql
	- repeat-kill-0-container
	- repeat-kill-1-container
	# ==================== orchestrated ====================
	- name: orchestrated-0-1-kills
	templateType: Serial
	deadline: 10m
	children:
	- force-failover-tsql
	- kill-ha-pod # For Network changes to take action immediately
	- kill-0-container
	- force-failover-tsql
	- sleep-for-paxos
	- kill-1-container
	- force-failover-tsql
	- sleep-for-paxos
	- repeat-kill-0-1-container
	# ==================== repeat kills ====================
	- name: repeat-kill-0-container
	templateType: Schedule
	schedule:
	schedule: '@every 10s'
	startingDeadlineSeconds: null
	concurrencyPolicy: Forbid
	historyLimit: 1
	type: PodChaos
	podChaos:
	containerNames: ['arc-sqlmi']
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-0
	mode: one
	action: container-kill

	- name: repeat-kill-1-container
	templateType: Schedule
	schedule:
	schedule: '@every 10s'
	startingDeadlineSeconds: null
	concurrencyPolicy: Forbid
	historyLimit: 1
	type: PodChaos
	podChaos:
	containerNames: ['arc-sqlmi']
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-1
	mode: one
	action: container-kill

	- name: repeat-kill-2-container
	templateType: Schedule
	deadline: 5m
	schedule:
	schedule: '@every 10s'
	startingDeadlineSeconds: null
	concurrencyPolicy: Forbid
	historyLimit: 1
	type: PodChaos
	podChaos:
	containerNames: ['arc-sqlmi']
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-2
	mode: one
	action: container-kill

	- name: repeat-kill-ha-orchestrator-pod
	templateType: Schedule
	schedule:
	schedule: '@every 7m'
	startingDeadlineSeconds: null
	concurrencyPolicy: Forbid
	historyLimit: 1
	type: PodChaos
	podChaos:
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	app.kubernetes.io/component: orchestrator
	app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	mode: one
	action: pod-kill

	- name: repeat-kill-ha-supervisor-container
	templateType: Schedule
	schedule:
	schedule: '@every 400s'
	startingDeadlineSeconds: null
	concurrencyPolicy: Forbid
	historyLimit: 1
	type: PodChaos
	podChaos:
	containerNames: ['arc-ha-supervisor']
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	app.kubernetes.io/component: database
	app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	mode: one
	action: container-kill

	- name: repeat-kill-random-database-pod
	templateType: Schedule
	schedule:
	schedule: '@every 500s'
	startingDeadlineSeconds: null
	concurrencyPolicy: Forbid
	historyLimit: 1
	type: PodChaos
	podChaos:
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	app.kubernetes.io/component: database
	app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	mode: one
	action: pod-kill

	- name: repeat-kill-controldb-pod
	templateType: Schedule
	schedule:
	schedule: '@every 7m'
	startingDeadlineSeconds: null
	concurrencyPolicy: Forbid
	historyLimit: 1
	type: PodChaos
	podChaos:
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	app: controldb
	mode: one
	action: pod-kill
	# ==================== single kills ====================
	- name: kill-ha-pod
	deadline: 1m
	templateType: PodChaos
	podChaos:
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	app.kubernetes.io/component: orchestrator
	app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	mode: all
	action: pod-kill

	- name: kill-0-container
	deadline: 1m
	templateType: PodChaos
	podChaos:
	containerNames: ['arc-sqlmi']
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-0
	mode: all
	action: container-kill

	- name: kill-1-container
	templateType: PodChaos
	deadline: 1m
	podChaos:
	containerNames: ['arc-sqlmi']
	selector:
	namespaces:
	- {{ CHAOS_PLUGIN_NAMESPACE }}
	labelSelectors:
	statefulset.kubernetes.io/pod-name: {{ CHAOS_INSTANCE_NAME }}-1
	mode: all
	action: container-kill
	# ==================== sleep ====================
	- name: sleep-for-paxos
	templateType: Task
	deadline: 60s
	task:
	container:
	name: main-contaienr
	image: busybox
	command:
	- sh
	- -c
	- sleep 60
	# ====================== t-sql ======================
	- name: force-failover-tsql
	  # Single pass over replica ordinals 0..2: connects to
	  # <instance>-<n>.<instance>-svc on port 1433 with sqlcmd and issues
	  # ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY) on each one.
	  # `set +e` lets the loop continue when a sqlcmd call fails.
	  templateType: Task
	  deadline: 60s
	  task:
	    container:
	      name: tsql
	      image: mcr.microsoft.com/mssql-tools
	      command:
	        - /bin/sh
	        - -c
	      args:
	        # Folded scalar: the lines below are joined with single spaces into
	        # one shell command string, so every statement ends with `;`.
	        - >-
	          set +e;
	          echo "Testing all 3 replicas for failover:";
	          for replica in $(seq 0 2); do
	          echo \"================================= {{ CHAOS_INSTANCE_NAME }}-$replica ====================================\";
	          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-$replica.{{ CHAOS_INSTANCE_NAME }}-svc,1433 -U {{ CHAOS_INSTANCE_USERNAME }} -P {{ CHAOS_INSTANCE_PASSWORD }} -Q "ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY);";
	          echo \"=============================================================================\";
	          done;

	# Chaos Mesh doesn't allow wrapping Schedule CRD on top of task CRDs
	# (yet), so unfortunately we have to copy-paste the simple loop above.
	#
	- name: repeat-force-failover-tsql
	  # Endless variant of the failover loop: `while true` keeps cycling over
	  # replica ordinals 0..2 and issues
	  # ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY) on each one.
	  # No deadline is set on this node and the script never exits by itself.
	  # `set +e` lets the loop continue when a sqlcmd call fails.
	  templateType: Task
	  task:
	    container:
	      name: tsql
	      image: mcr.microsoft.com/mssql-tools
	      command:
	        - /bin/sh
	        - -c
	      args:
	        # Folded scalar: the lines below are joined with single spaces into
	        # one shell command string, so every statement ends with `;`.
	        - >-
	          set +e;
	          echo "Testing all 3 replicas for failover:";
	          while true; do
	          for replica in $(seq 0 2); do
	          echo \"================================= {{ CHAOS_INSTANCE_NAME }}-$replica ====================================\";
	          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-$replica.{{ CHAOS_INSTANCE_NAME }}-svc,1433 -U {{ CHAOS_INSTANCE_USERNAME }} -P {{ CHAOS_INSTANCE_PASSWORD }} -Q "ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY);";
	          echo \"=============================================================================\";
	          done;
	          done;

	- name: setup-user-db
	  # Prepares the test database through the <instance>-p-svc endpoint:
	  # drops and recreates [UserDB], then drops and recreates
	  # [dbo].[UserTable] (id int, name varchar(50)) inside it.
	  # `set +e` lets later statements run even if an earlier sqlcmd fails.
	  templateType: Task
	  deadline: 180s
	  task:
	    container:
	      name: tsql
	      image: mcr.microsoft.com/mssql-tools
	      command:
	        - /bin/sh
	        - -c
	      args:
	        # Folded scalar: the lines below are joined with single spaces into
	        # one shell command string, so every statement ends with `;`.
	        - >-
	          set +e;
	          echo "Dropping database, if exists:";
	          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "master" -U {{ CHAOS_INSTANCE_USERNAME }} -P {{ CHAOS_INSTANCE_PASSWORD }} -Q "DROP DATABASE IF EXISTS [UserDB];";
	          echo "Setting up database:";
	          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "master" -U {{ CHAOS_INSTANCE_USERNAME }} -P {{ CHAOS_INSTANCE_PASSWORD }} -Q "CREATE DATABASE [UserDB];";
	          echo "Dropping table:";
	          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U {{ CHAOS_INSTANCE_USERNAME }} -P {{ CHAOS_INSTANCE_PASSWORD }} -Q "DROP TABLE IF EXISTS [dbo].[UserTable];";
	          echo "Creating table:";
	          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U {{ CHAOS_INSTANCE_USERNAME }} -P {{ CHAOS_INSTANCE_PASSWORD }} -Q "CREATE TABLE [dbo].[UserTable] (id int, name varchar(50));";
	          echo "Done!";

	- name: repeat-perform-transactions
	  # Endless writer: inserts one row per iteration into
	  # [UserDB].[dbo].[UserTable] through the <instance>-p-svc endpoint,
	  # using an incrementing counter as the id. Each sqlcmd call passes
	  # `-t 1 -l 1` (sqlcmd's query and login timeouts, in seconds).
	  # No deadline is set on this node and the script never exits by itself.
	  templateType: Task
	  task:
	    container:
	      name: tsql
	      image: mcr.microsoft.com/mssql-tools
	      command:
	        - /bin/sh
	        - -c
	      args:
	        # Folded scalar: the lines below are joined with single spaces into
	        # one shell command string, so every statement ends with `;`.
	        - >-
	          set +e;
	          echo "Running infinite insertion loop";
	          i=0;
	          while true; do
	          echo "Attempting - $i";
	          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U {{ CHAOS_INSTANCE_USERNAME }} -P {{ CHAOS_INSTANCE_PASSWORD }} -Q "INSERT INTO [dbo].[UserTable] (id, name) VALUES ($i, '{{ CHAOS_PLUGIN }}');" -t 1 -l 1;
	          i=$((i+1));
	          done

	# ================ storage parallel =================
	# //////////// DESTROY CRITICAL STORAGE /////////////
	# ===================================================
	- name: parallel-storage-killer
	  # Fan-out node: runs the write-latency and read-latency storage
	  # schedules side by side.
	  templateType: Parallel
	  children: [storage-delay-write, storage-delay-read]
	# ================== storage chaos ==================
	# //////////// /var/opt/mssql/*/ ////////////
	# ===================================================
	# ====================== write ======================
	- name: storage-delay-write
	  # Every 120s, schedules an IOChaos on all database pods of the target
	  # instance: 150ms of added latency on 15 percent of WRITE calls matching
	  # /var/opt/mssql/*/ on the /var/opt/mssql volume.
	  # NOTE(review): no `duration` is set on the IOChaos itself - confirm how
	  # long each injected fault is expected to last.
	  templateType: Schedule
	  schedule:
	    type: IOChaos
	    schedule: "@every 120s"
	    concurrencyPolicy: Forbid
	    historyLimit: 1
	    startingDeadlineSeconds: null
	    ioChaos:
	      action: latency
	      delay: 150ms
	      percent: 15
	      methods: [WRITE]
	      volumePath: /var/opt/mssql
	      path: /var/opt/mssql/*/
	      mode: all
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/component: database
	# ====================== read ======================
	- name: storage-delay-read
	  # Every 80s, schedules an IOChaos on all database pods of the target
	  # instance: 150ms of added latency on 15 percent of READ calls matching
	  # /var/opt/mssql/*/ on the /var/opt/mssql volume.
	  # NOTE(review): no `duration` is set on the IOChaos itself - confirm how
	  # long each injected fault is expected to last.
	  templateType: Schedule
	  schedule:
	    type: IOChaos
	    schedule: "@every 80s"
	    concurrencyPolicy: Forbid
	    historyLimit: 1
	    startingDeadlineSeconds: null
	    ioChaos:
	      action: latency
	      delay: 150ms
	      percent: 15
	      methods: [READ]
	      volumePath: /var/opt/mssql
	      path: /var/opt/mssql/*/
	      mode: all
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/component: database
	# ================== comms parallel =================
	# /////////// DESTROY COMMS CHANNELS /////////////
	# ===================================================
	- name: parallel-network-killer
	  # Fan-out node: runs all six network-fault schedules side by side
	  # (two orchestrator-side, four database-to-database).
	  templateType: Parallel
	  children:
	    # orchestrator-side faults
	    - partial-k8s-block-ha
	    - partial-database-to-ha-block
	    # database-to-database faults
	    - partial-database-to-database-corrupt
	    - partial-database-to-database-duplicate
	    - partial-database-to-database-delay
	    - partial-database-to-database-loss
	# ====================== k8s ======================
	- name: partial-k8s-block-ha
	  # Every 60s, for 55s: corrupts 40 percent of packets (correlation 20)
	  # sent from the instance's orchestrator pods to the external targets
	  # listed below. Despite "block" in the name, the action is `corrupt`.
	  templateType: Schedule
	  schedule:
	    type: NetworkChaos
	    schedule: "@every 60s"
	    concurrencyPolicy: Forbid
	    historyLimit: 1
	    startingDeadlineSeconds: null
	    networkChaos:
	      action: corrupt
	      corrupt:
	        corrupt: "40"
	        correlation: "20"
	      duration: 55s
	      direction: to
	      mode: all
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/component: orchestrator
	      # NOTE(review): the /24 entry is narrower than the neighbouring /16
	      # ranges - confirm it covers the intended addresses.
	      externalTargets:
	        - 10.0.0.0/16 # AKS default
	        - 100.64.0.0/16 # Our Kubeadm setup
	        - 10.42.0.0/24 # K3s default
	        - 172.30.0.0/16 # OpenShift default
	        - kubernetes.default.svc
	# ====================== database-to-ha ======================
	- name: partial-database-to-ha-block
	  # Every 60s, for 55s: corrupts 45 percent of packets (correlation 40)
	  # in both directions between the instance's orchestrator pods (source
	  # selector) and its database pods (target selector). Despite "block" in
	  # the name, the action is `corrupt`.
	  templateType: Schedule
	  schedule:
	    type: NetworkChaos
	    schedule: "@every 60s"
	    concurrencyPolicy: Forbid
	    historyLimit: 1
	    startingDeadlineSeconds: null
	    networkChaos:
	      action: corrupt
	      corrupt:
	        corrupt: "45"
	        correlation: "40"
	      duration: 55s
	      direction: both
	      mode: all
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/component: orchestrator
	      target:
	        mode: all
	        selector:
	          namespaces:
	            - {{ CHAOS_PLUGIN_NAMESPACE }}
	          labelSelectors:
	            app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	            app.kubernetes.io/component: database
	# ====================== database-to-database ======================
	- name: partial-database-to-database-corrupt
	  # Every 60s, for 15s: corrupts 40 percent of packets (correlation 25)
	  # sent between the instance's database pods. Source and target selectors
	  # match the same set of pods.
	  templateType: Schedule
	  schedule:
	    type: NetworkChaos
	    schedule: "@every 60s"
	    concurrencyPolicy: Forbid
	    historyLimit: 1
	    startingDeadlineSeconds: null
	    networkChaos:
	      action: corrupt
	      corrupt:
	        corrupt: "40"
	        correlation: "25"
	      duration: 15s
	      direction: to
	      mode: all
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/component: database
	      target:
	        mode: all
	        selector:
	          namespaces:
	            - {{ CHAOS_PLUGIN_NAMESPACE }}
	          labelSelectors:
	            app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	            app.kubernetes.io/component: database

	- name: partial-database-to-database-duplicate
	  # Every 60s, for 10s: duplicates 40 percent of packets (correlation 25)
	  # sent between the instance's database pods. Source and target selectors
	  # match the same set of pods.
	  templateType: Schedule
	  schedule:
	    type: NetworkChaos
	    schedule: "@every 60s"
	    concurrencyPolicy: Forbid
	    historyLimit: 1
	    startingDeadlineSeconds: null
	    networkChaos:
	      action: duplicate
	      duplicate:
	        duplicate: '40'
	        correlation: '25'
	      duration: 10s
	      direction: to
	      mode: all
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/component: database
	      target:
	        mode: all
	        selector:
	          namespaces:
	            - {{ CHAOS_PLUGIN_NAMESPACE }}
	          labelSelectors:
	            app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	            app.kubernetes.io/component: database

	- name: partial-database-to-database-delay
	  # Every 60s, for 5s: delays packets sent between the instance's database
	  # pods by 90ms with 90ms of jitter (correlation 25). Source and target
	  # selectors match the same set of pods.
	  templateType: Schedule
	  schedule:
	    type: NetworkChaos
	    schedule: "@every 60s"
	    concurrencyPolicy: Forbid
	    historyLimit: 1
	    startingDeadlineSeconds: null
	    networkChaos:
	      action: delay
	      delay:
	        latency: '90ms'
	        jitter: '90ms'
	        correlation: '25'
	      duration: 5s
	      direction: to
	      mode: all
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/component: database
	      target:
	        mode: all
	        selector:
	          namespaces:
	            - {{ CHAOS_PLUGIN_NAMESPACE }}
	          labelSelectors:
	            app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	            app.kubernetes.io/component: database

	- name: partial-database-to-database-loss
	  # Every 60s, for 24s: drops 25 percent of packets (correlation 25) sent
	  # between the instance's database pods. Source and target selectors
	  # match the same set of pods.
	  templateType: Schedule
	  schedule:
	    type: NetworkChaos
	    schedule: "@every 60s"
	    concurrencyPolicy: Forbid
	    historyLimit: 1
	    startingDeadlineSeconds: null
	    networkChaos:
	      action: loss
	      loss:
	        loss: '25'
	        correlation: '25'
	      duration: 24s
	      direction: to
	      mode: all
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/component: database
	      target:
	        mode: all
	        selector:
	          namespaces:
	            - {{ CHAOS_PLUGIN_NAMESPACE }}
	          labelSelectors:
	            app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	            app.kubernetes.io/component: database
	# ==================== utilities ====================
	# //////////// MISC ////////////
	# ===================================================
	- name: burn-cpu-mem
	  # Every 120s, schedules a StressChaos on all pods labelled as part of
	  # SqlManagedInstance for the target instance: 2 memory workers sized
	  # 256Mi plus 4 CPU workers at load 100.
	  templateType: Schedule
	  schedule:
	    type: StressChaos
	    schedule: "@every 120s"
	    historyLimit: 1
	    concurrencyPolicy: Forbid
	    startingDeadlineSeconds: null
	    stressChaos:
	      selector:
	        namespaces:
	          - {{ CHAOS_PLUGIN_NAMESPACE }}
	        labelSelectors:
	          app.kubernetes.io/instance: {{ CHAOS_INSTANCE_NAME }}
	          app.kubernetes.io/part-of: SqlManagedInstance
	      mode: all
	      stressors:
	        memory:
	          size: 256Mi
	          workers: 2
	        cpu:
	          workers: 4
	          load: 100