@rasheedamir
Last active March 10, 2020
troubleshooting collection
FILE # 1

#!groovy

String GIT_VERSION

node {

  def buildEnv
  def devAddress

  stage ('Checkout') {
    deleteDir()
    checkout scm
    GIT_VERSION = sh (
      script: 'git describe --tags',
      returnStdout: true
    ).trim()
  }

  stage ('Build Custom Environment') {
    buildEnv = docker.build("build_env:${GIT_VERSION}", 'custom-build-env')
  }

  buildEnv.inside {

    stage ('Build') {
      sh 'sbt compile'
      sh 'sbt sampleClient/universal:stage'
    }

    stage ('Test') {
      parallel (
        'Test Server' : {
          sh 'sbt server/test'
        },
        'Test Sample Client' : {
          sh 'sbt sampleClient/test'
        }
      )
    }

    stage ('Prepare Docker Image') {
      sh 'sbt server/docker:stage'
    }
  }

  stage ('Build and Push Docker Image') {
    withCredentials([[$class: "UsernamePasswordMultiBinding", usernameVariable: 'DOCKERHUB_USER', passwordVariable: 'DOCKERHUB_PASS', credentialsId: 'Docker Hub']]) {
      sh 'docker login --username $DOCKERHUB_USER --password $DOCKERHUB_PASS'
    }
    def serverImage = docker.build("sambott/grpc-test:${GIT_VERSION}", 'server/target/docker/stage')
    serverImage.push()
    sh 'docker logout'
  }

  stage ('Deploy to DEV') {
    devAddress = deployContainer("sambott/grpc-test:${GIT_VERSION}", 'DEV')
  }

  stage ('Verify Deployment') {
    buildEnv.inside {
      sh "sample-client/target/universal/stage/bin/demo-client ${devAddress}"
    }
  }
}

stage ('Deploy to LIVE') {
  timeout(time:2, unit:'DAYS') {
    input message:'Approve deployment to LIVE?'
  }
  node {
    deployContainer("sambott/grpc-test:${GIT_VERSION}", 'LIVE')
  }
}

def deployContainer(image, env) {
  docker.image('lachlanevenson/k8s-kubectl:v1.5.2').inside {
    withCredentials([[$class: "FileBinding", credentialsId: 'KubeConfig', variable: 'KUBE_CONFIG']]) {
      def kubectl = "kubectl  --kubeconfig=\$KUBE_CONFIG --context=${env}"
      sh "${kubectl} set image deployment/grpc-demo grpc-demo=${image}"
      sh "${kubectl} rollout status deployment/grpc-demo"
      return sh (
        script: "${kubectl} get service/grpc-demo -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'",
        returnStdout: true
      ).trim()
    }
  }
}
FILE # 2

node {
    stage('checkout') {
        checkout scm
    }

    docker.image('openjdk:8').inside('-u root -e MAVEN_OPTS="-Duser.home=./"') {
        stage('check java') {
            sh "java -version"
        }

        stage('clean') {
            sh "chmod +x mvnw"
            sh "./mvnw clean"
        }

        stage('install tools') {
            sh "./mvnw com.github.eirslett:frontend-maven-plugin:install-node-and-yarn -DnodeVersion=v6.11.1 -DyarnVersion=v0.27.5"
        }

        stage('yarn install') {
            sh "./mvnw com.github.eirslett:frontend-maven-plugin:yarn"
        }

        stage('backend tests') {
            try {
                sh "./mvnw test"
            } catch(err) {
                throw err
            } finally {
                junit '**/target/surefire-reports/TEST-*.xml'
            }
        }

        stage('frontend tests') {
            try {
                sh "./mvnw com.github.eirslett:frontend-maven-plugin:yarn -Dfrontend.yarn.arguments=test"
            } catch(err) {
                throw err
            } finally {
                junit '**/target/test-results/karma/TESTS-*.xml'
            }
        }

        stage('packaging') {
            sh "./mvnw package -Pprod -DskipTests"
            archiveArtifacts artifacts: '**/target/*.war', fingerprint: true
        }

        stage('quality analysis') {
            withSonarQubeEnv('Sonar') {
                sh "./mvnw sonar:sonar"
            }
        }
    }

    def dockerImage
    stage('build docker') {
        sh "cp -R src/main/docker target/"
        sh "cp target/*.war target/docker/"
        dockerImage = docker.build('moviemanager', 'target/docker')
    }

    stage('publish docker') {
        docker.withRegistry('https://registry.hub.docker.com', 'docker-login') {
            dockerImage.push 'latest'
        }
    }
}

stage 'Trigger e2e tests'

waitUntil {
	try {
		def chartRepoType = git_branch == "master" ? 'dev' : 'pr'
		build job: 'workflow-chart-e2e', parameters: [
			[$class: 'StringParameterValue', name: 'WORKFLOW_CLI_SHA', value: git_commit],
			[$class: 'StringParameterValue', name: 'ACTUAL_COMMIT', value: git_commit],
			[$class: 'StringParameterValue', name: 'COMPONENT_REPO', value: 'workflow-cli'],
			[$class: 'StringParameterValue', name: 'CHART_REPO_TYPE', value: chartRepoType],
			[$class: 'StringParameterValue', name: 'UPSTREAM_SLACK_CHANNEL', value: '#controller']]
		true
	} catch(error) {
		if (git_branch == "master") {
			throw error
		}

		node(linux) {
			withCredentials([[$class: 'StringBinding', credentialsId: '8a727911-596f-4057-97c2-b9e23de5268d', variable: 'SLACKEMAIL']]) {
				mail body: """<!DOCTYPE html>
<html>
<head>
<meta content='text/html; charset=UTF-8' http-equiv='Content-Type' />
</head>
<body>
<div>Author: ${env.CHANGE_AUTHOR}<br/>
Branch: ${env.BRANCH_NAME}<br/>
Commit: ${env.CHANGE_TITLE}<br/>
<p><a href="${env.BUILD_URL}console">Click here</a> to view logs.</p>
<p><a href="${env.BUILD_URL}input/">Click here</a> to restart e2e.</p>
</div>
</body>
</html>
""", from: 'jenkins@ci.deis.io', subject: 'Workflow CLI E2E Test Failure', to: env.SLACKEMAIL, mimeType: 'text/html'
			}
			input "Retry the e2e tests?"
		}
		false
	}
}
podTemplate(label: 'pod-hugo-app', containers: [
    containerTemplate(name: 'hugo', image: 'smesch/hugo', ttyEnabled: true, command: 'cat'),
    containerTemplate(name: 'html-proofer', image: 'smesch/html-proofer', ttyEnabled: true, command: 'cat'),
    containerTemplate(name: 'kubectl', image: 'smesch/kubectl', ttyEnabled: true, command: 'cat'),
    containerTemplate(name: 'docker', image: 'docker', ttyEnabled: true, command: 'cat',
        envVars: [containerEnvVar(key: 'DOCKER_CONFIG', value: '/tmp/')])],
    // volumes is a podTemplate-level parameter, so the kube-config secret used by kubectl is mounted here
    volumes: [secretVolume(secretName: 'kube-config', mountPath: '/root/.kube'),
              secretVolume(secretName: 'docker-config', mountPath: '/tmp'),
              hostPathVolume(hostPath: '/var/run/docker.sock', mountPath: '/var/run/docker.sock')
  ]) {

    node('pod-hugo-app') {

        def DOCKER_HUB_ACCOUNT = 'smesch'
        def DOCKER_IMAGE_NAME = 'hugo-app-jenkins'
        def K8S_DEPLOYMENT_NAME = 'hugo-app'

        stage('Clone Hugo App Repository') {
            checkout scm
 
            container('hugo') {
                stage('Build Hugo Site') {
                    sh ("hugo --uglyURLs")
                }
            }
    
            container('html-proofer') {
                stage('Validate HTML') {
                    sh ("htmlproofer public --internal-domains ${env.JOB_NAME} --external_only --only-4xx")
                }
            }

            container('docker') {
                stage('Docker Build & Push Current & Latest Versions') {
                    sh ("docker build -t ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:${env.BUILD_NUMBER} .")
                    sh ("docker push ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:${env.BUILD_NUMBER}")
                    sh ("docker tag ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:${env.BUILD_NUMBER} ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:latest")
                    sh ("docker push ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:latest")
                }
            }

            container('kubectl') {
                stage('Deploy New Build To Kubernetes') {
                    sh ("kubectl set image deployment/${K8S_DEPLOYMENT_NAME} ${K8S_DEPLOYMENT_NAME}=${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:${env.BUILD_NUMBER}")
                }
            }

        }        
    }
}
rasheedamir commented Apr 23, 2018

PROBLEM

In the k8s docs it's recommended not to run more than 110 pods per node, but the OpenShift docs say it can be 250; do you know why?

SOLUTION

from OS team:

openshift tests and validates in different environments. openshift has a test team that validates the environments, and we don’t always make community recommendations

remember, openshift is a kube distro that is opinionated and we control not just the host but the container stack and the kernel and all those other things, so we try to publish guidance related to a much smaller possible set of configurations

a recommendation for kube has to take potentially thousands of possible combos into account
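
For reference, the per-node pod cap is a kubelet setting. On OpenShift 3.x it is typically raised via kubeletArguments in the node config; a minimal sketch (the file path and defaults depend on your OpenShift version):

# /etc/origin/node/node-config.yaml
kubeletArguments:
  max-pods:
  - "250"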

rasheedamir commented Apr 27, 2018

PROBLEM

how do kubectl drain and cordon work?

SOLUTION

The given node will be marked unschedulable to prevent new pods from arriving. 'drain' evicts the pods if the APIServer supports eviction (http://kubernetes.io/docs/admin/disruptions/). Otherwise, it will use normal DELETE to delete the pods. The 'drain' evicts or deletes all pods except mirror pods (which cannot be deleted through the API server).

If there are DaemonSet-managed pods, drain will not proceed without --ignore-daemonsets, and regardless it will not delete any DaemonSet-managed pods, because those pods would be immediately replaced by the DaemonSet controller, which ignores unschedulable markings.

If there are any pods that are neither mirror pods nor managed by ReplicationController, ReplicaSet, DaemonSet, StatefulSet or Job, then drain will not delete any pods unless you use --force.

'drain' waits for graceful termination. You should not operate on the machine until the command completes.

When you are ready to put the node back into service, use kubectl uncordon, which will make the node schedulable again.
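
A minimal example of the usual sequence (the node name is hypothetical; exact drain flags vary a little between kubectl versions):

kubectl cordon node-1
kubectl drain node-1 --ignore-daemonsets --delete-local-data
# ... perform maintenance, then ...
kubectl uncordon node-1

Note that drain cordons the node itself, so the explicit cordon is only needed if you want to stop new pods landing on the node before you are ready to evict the existing ones.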

rasheedamir commented Apr 27, 2018

PROBLEM

why write a custom k8s controller?

SOLUTION

Writing a custom controller: Extending the functionality of your cluster

Much of the functionality in a Kubernetes cluster is managed by a reconciliation pattern within "controllers". The node, service, or deployment controllers (just to name a few) watch for changes to objects, then act on those changes to drive your cluster to a desired state. This same pattern can be used to implement custom logic, which can be used to extend the functionality of your cluster without ever needing to modify Kubernetes itself. This talk will cover how to implement your own custom controller, from contacting the Kubernetes API to using existing libraries to easily watch, react, and update components in your cluster. By building on existing functionality and following a few best practices, you can quickly and easily implement your own custom controller.

MUST WATCH ...

https://resources.coreos.com/youtube-coreos-fest-2017/writing-a-custom-controller-extending-the-functionality-of-your-cluster


https://github.com/kubernetes/community/blob/master/contributors/devel/controllers.md


https://github.com/stakater/what-happens-when-k8s


this is a very good starting point:
https://rancher.com/using-kubernetes-api-go-kubecon-2017-session-recap/

but we won't use their recommendations for dependency management, code skeleton, etc.


this is an excellent read as well
https://medium.com/@cloudark/kubernetes-custom-controllers-b6c7d0668fdf


Steps

  • Created go project
  • Added client-go package dependencies to it
  • Created a client to talk to the Kubernetes API
  • Defined an Informer that watches node object changes and executes a callback function when they happen
  • Implemented the actual logic in the callback definition
  • Tested the code by running the binary outside of the cluster, and then deployed it inside the cluster

sample controller by the Kubernetes project; we must understand it:

https://github.com/kubernetes/sample-controller


good sample controller to follow:
https://github.com/aaronlevy/kube-controller-demo/blob/master/reboot-controller/main.go


@rasheedamir

PROBLEM

what happens when you run kubectl create ...?

SOLUTION

Such an awesome read:
https://github.com/stakater/what-happens-when-k8s

@rasheedamir

PROBLEM

what are k8s controllers?

SOLUTION

A controller is an asynchronous script that works to reconcile the current state of the Kubernetes system to a desired state. Each controller has a small responsibility and is run in parallel by the kube-controller-manager component.

@rasheedamir

PROBLEM

what are k8s informers?

SOLUTION

Informers
As you might have noticed, some controllers like the RBAC authorizer or the Deployment controller need to retrieve cluster state to function. To return to the example of the RBAC authorizer, we know that when a request comes in, the authenticator will save an initial representation of user state for later use. The RBAC authorizer will then use this to retrieve all the roles and role bindings that are associated with the user in etcd. How are controllers supposed to access and modify such resources? It turns out this is a common use case and is solved in Kubernetes with informers.

An informer is a pattern that allows controllers to subscribe to storage events and easily list resources they're interested in. Apart from providing an abstraction which is nice to work with, it also takes care of a lot of the nuts and bolts such as caching (caching is important because it reduces unnecessary kube-apiserver connections, and reduces duplicate serialization costs server- and controller-side). By using this design, it also allows controllers to interact in a threadsafe manner without having to worry about stepping on anybody else's toes.

For more information about how informers work in relation to controllers, check out this blog post (http://borismattijssen.github.io/articles/kubernetes-informers-controllers-reflectors-stores)
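
Under the hood an informer is built on the API server's list+watch endpoints: a reflector performs the list/watch, a local store caches the objects, and your handler functions receive add/update/delete callbacks. Just as a rough illustration (not how you would build a controller), you can observe the same raw event stream an informer consumes with kubectl:

kubectl get --raw '/api/v1/namespaces/default/pods?watch=true'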

@rasheedamir

PROBLEM

what are docker containers?

SOLUTION

best possible explanation

https://jvns.ca/blog/2016/10/10/what-even-is-a-container/

@rasheedamir

PROBLEM

where to find kubelet logs on openshift?

SOLUTION

less /var/log/messages
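
If the node runs the kubelet through systemd (atomic-openshift-node on OCP 3.x, origin-node on Origin), the journal is the other place to look; the exact unit name depends on the installation:

journalctl -u atomic-openshift-node -f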

rasheedamir commented Apr 30, 2018

PROBLEM

If you see this message in kubelet logs

Create more free space in thin pool or use dm.min_free_space option

SOLUTION

This issue is a bit tricky.

Run docker info on the affected nodes to find details about the devicemapper storage driver:

Storage Driver: devicemapper
 Pool Name: docker--vg-docker--pool
 Pool Blocksize: 524.3 kB
 Base Device Size: 10.74 GB
 Backing Filesystem: xfs
 Data file: 
 Metadata file: 
 Data Space Used: 33.74 GB
 Data Space Total: 96.41 GB
 Data Space Available: 62.68 GB
 Metadata Space Used: 9.695 MB
 Metadata Space Total: 109.1 MB
 Metadata Space Available: 99.36 MB
 Thin Pool Minimum Free Space: 9.641 GB
 Udev Sync Supported: true
 Deferred Removal Enabled: true
 Deferred Deletion Enabled: true
 Deferred Deleted Device Count: 0
 Library Version: 1.02.140-RHEL7 (2017-05-03)

Some handy commands to run are pvs, lvs, pvdisplay, lvdisplay

The Docker GC threshold and the kubelet image GC threshold need to be tuned correctly relative to each other, otherwise the node can wedge itself (effectively a self-inflicted denial of service).

As per @derekwaynecarr's suggestion, we need the kubelet to be more aggressive than Docker on this, as Docker will refuse to do anything if the pull (or whatever) would use more space than its 90% threshold, so we'd want the kubelet to begin eviction at 80% or so.

this setting does matter:
fs.Int32Var(&s.ImageGCHighThresholdPercent, "image-gc-high-threshold", s.ImageGCHighThresholdPercent, "The percent of disk usage after which image garbage collection is always run. Default: 85%")

Previously, docker could refuse to start new containers due to reaching dm.min_free_space (default 10%), but the devicemapper thin pool usage did not exceed image-gc-high-threshold (default 90%), so the image reclaim occurred and the node was stuck. This bug fix changes the default image-gc-high-threshold to 85%, which causes image reclaim to occur before the default dm.min_free_space is reached.
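
In practice that means the kubelet's image GC thresholds should sit below Docker's dm.min_free_space limit. The relevant kubelet flags (the values here are illustrative, not taken from the bug report) are:

--image-gc-high-threshold=80 --image-gc-low-threshold=60

On OpenShift 3.x these would normally be set under kubeletArguments in node-config.yaml rather than passed to the kubelet directly.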


good references to read:
https://github.com/openshift/release/issues/40


Force deletion of containers is a bad idea IMO. People use this option to quickly move on without addressing the real problem. The "device is busy" error is a symptom of that underlying problem, and it is the problem that needs to be addressed.

In the dm.min_free_space section, the Docker documentation says:

Whenever a new a thin pool device is created (during docker pull or during container creation), the Engine checks if the minimum free space is available. If sufficient space is unavailable, then device creation fails and any relevant docker operation fails.

To recover from this error, you must create more free space in the thin pool to recover from the error. You can create free space by deleting some images and containers from the thin pool. You can also add more storage to the thin pool.

I think it all depends how the thin pool was created in the first place. The corresponding method should be used accordingly. For instance, if the thin pool was created with the LVM tools (pvcreate on at least 1 PV, then vgcreate, then lvcreate), it should be extended with the LVM tools (if the VG is full, extend it with pvcreate on additional PVs then vgextend to add these PVs to the VG; and then lvextend on the thin pool).
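
A sketch of that LVM route, assuming a spare disk /dev/sdc (hypothetical) and the docker-vg volume group / docker-pool thin pool seen in the docker info output above:

pvcreate /dev/sdc
vgextend docker-vg /dev/sdc
lvextend -l +90%FREE docker-vg/docker-pool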


Docker uses the devicemapper thin pool for image and metadata storage. If this storage space fills up, the Docker daemon cannot create new containers.

Remove non-running containers and unused images from your container instances. You can use the following example commands to manually remove stopped containers and unused images. Deleted containers cannot be inspected later, and deleted images must be pulled again before starting new containers from them.

To remove non-running containers, execute the following command on your container instance:

docker rm $(docker ps -aq)

To remove unused images, execute the following command on your container instance:

docker rmi $(docker images -q)
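
On Docker 1.13 and later, docker system prune is a single (still destructive) alternative that removes stopped containers, unused networks and dangling images in one go; add -a to also remove any image not used by a container:

docker system prune -a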

Remove unused data blocks within containers. You can use the following command to run fstrim on any running container and discard any data blocks that are unused by the container file system.

sudo sh -c "docker ps -q | xargs docker inspect --format='{{ .State.Pid }}' | xargs -IZ fstrim /proc/Z/root/"

@rasheedamir

PROBLEM

how to add labels/annotations to an OpenShift project?

SOLUTION

Well, you can't! Projects are immutable ...

BUT you can add them to the namespace (same name as the project), and then they will show up on the project.

Add an annotation to the namespace:

oc annotate namespace/workshop stakater.com/persist=true

Now describe the project and it will show that annotation:

oc describe project/workshop

To remove the annotation, run:

oc annotate namespace workshop stakater.com/persist-
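
Labels work the same way, via the namespace (the key/value below is just an example):

oc label namespace/workshop stakater.com/team=ops
oc label namespace/workshop stakater.com/team-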


rasheedamir commented May 2, 2018

PROBLEM

how to print lvdisplay from all nodes using ansible?

SOLUTION

ansible -i hosts -u ansible -b -m shell nodes -a 'lvdisplay -C /dev/docker-vg/docker-pool|grep pool'
