@rasheedamir
Last active March 10, 2020 09:21
troubleshooting collection
FILE # 1

#!groovy

String GIT_VERSION

node {

  def buildEnv
  def devAddress

  stage ('Checkout') {
    deleteDir()
    checkout scm
    GIT_VERSION = sh (
      script: 'git describe --tags',
      returnStdout: true
    ).trim()
  }

  stage ('Build Custom Environment') {
    buildEnv = docker.build("build_env:${GIT_VERSION}", 'custom-build-env')
  }

  buildEnv.inside {

    stage ('Build') {
      sh 'sbt compile'
      sh 'sbt sampleClient/universal:stage'
    }

    stage ('Test') {
      parallel (
        'Test Server' : {
          sh 'sbt server/test'
        },
        'Test Sample Client' : {
          sh 'sbt sampleClient/test'
        }
      )
    }

    stage ('Prepare Docker Image') {
      sh 'sbt server/docker:stage'
    }
  }

  stage ('Build and Push Docker Image') {
    withCredentials([[$class: "UsernamePasswordMultiBinding", usernameVariable: 'DOCKERHUB_USER', passwordVariable: 'DOCKERHUB_PASS', credentialsId: 'Docker Hub']]) {
      sh 'docker login --username $DOCKERHUB_USER --password $DOCKERHUB_PASS'
    }
    def serverImage = docker.build("sambott/grpc-test:${GIT_VERSION}", 'server/target/docker/stage')
    serverImage.push()
    sh 'docker logout'
  }

  stage ('Deploy to DEV') {
    devAddress = deployContainer("sambott/grpc-test:${GIT_VERSION}", 'DEV')
  }

  stage ('Verify Deployment') {
    buildEnv.inside {
      sh "sample-client/target/universal/stage/bin/demo-client ${devAddress}"
    }
  }
}

stage ('Deploy to LIVE') {
  timeout(time:2, unit:'DAYS') {
    input message:'Approve deployment to LIVE?'
  }
  node {
    deployContainer("sambott/grpc-test:${GIT_VERSION}", 'LIVE')
  }
}

def deployContainer(image, env) {
  docker.image('lachlanevenson/k8s-kubectl:v1.5.2').inside {
    withCredentials([[$class: "FileBinding", credentialsId: 'KubeConfig', variable: 'KUBE_CONFIG']]) {
      def kubectl = "kubectl  --kubeconfig=\$KUBE_CONFIG --context=${env}"
      sh "${kubectl} set image deployment/grpc-demo grpc-demo=${image}"
      sh "${kubectl} rollout status deployment/grpc-demo"
      return sh (
        script: "${kubectl} get service/grpc-demo -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'",
        returnStdout: true
      ).trim()
    }
  }
}
FILE # 2

node {
    stage('checkout') {
        checkout scm
    }

    docker.image('openjdk:8').inside('-u root -e MAVEN_OPTS="-Duser.home=./"') {
        stage('check java') {
            sh "java -version"
        }

        stage('clean') {
            sh "chmod +x mvnw"
            sh "./mvnw clean"
        }

        stage('install tools') {
            sh "./mvnw com.github.eirslett:frontend-maven-plugin:install-node-and-yarn -DnodeVersion=v6.11.1 -DyarnVersion=v0.27.5"
        }

        stage('yarn install') {
            sh "./mvnw com.github.eirslett:frontend-maven-plugin:yarn"
        }

        stage('backend tests') {
            try {
                sh "./mvnw test"
            } catch(err) {
                throw err
            } finally {
                junit '**/target/surefire-reports/TEST-*.xml'
            }
        }

        stage('frontend tests') {
            try {
                sh "./mvnw com.github.eirslett:frontend-maven-plugin:yarn -Dfrontend.yarn.arguments=test"
            } catch(err) {
                throw err
            } finally {
                junit '**/target/test-results/karma/TESTS-*.xml'
            }
        }

        stage('packaging') {
            sh "./mvnw package -Pprod -DskipTests"
            archiveArtifacts artifacts: '**/target/*.war', fingerprint: true
        }

        stage('quality analysis') {
            withSonarQubeEnv('Sonar') {
                sh "./mvnw sonar:sonar"
            }
        }
    }

    def dockerImage
    stage('build docker') {
        sh "cp -R src/main/docker target/"
        sh "cp target/*.war target/docker/"
        dockerImage = docker.build('moviemanager', 'target/docker')
    }

    stage('publish docker') {
        docker.withRegistry('https://registry.hub.docker.com', 'docker-login') {
            dockerImage.push 'latest'
        }
    }
}

stage 'Trigger e2e tests'

waitUntil {
	try {
		def chartRepoType = git_branch == "master" ? 'dev' : 'pr'
		build job: 'workflow-chart-e2e', parameters: [
			[$class: 'StringParameterValue', name: 'WORKFLOW_CLI_SHA', value: git_commit],
			[$class: 'StringParameterValue', name: 'ACTUAL_COMMIT', value: git_commit],
			[$class: 'StringParameterValue', name: 'COMPONENT_REPO', value: 'workflow-cli'],
			[$class: 'StringParameterValue', name: 'CHART_REPO_TYPE', value: chartRepoType],
			[$class: 'StringParameterValue', name: 'UPSTREAM_SLACK_CHANNEL', value: '#controller']]
		true
	} catch(error) {
		if (git_branch == "master") {
			throw error
		}

		node(linux) {
			withCredentials([[$class: 'StringBinding', credentialsId: '8a727911-596f-4057-97c2-b9e23de5268d', variable: 'SLACKEMAIL']]) {
				mail body: """<!DOCTYPE html>
<html>
<head>
<meta content='text/html; charset=UTF-8' http-equiv='Content-Type' />
</head>
<body>
<div>Author: ${env.CHANGE_AUTHOR}<br/>
Branch: ${env.BRANCH_NAME}<br/>
Commit: ${env.CHANGE_TITLE}<br/>
<a href="${env.BUILD_URL}console">Click here</a> to view logs.</p>
<a href="${env.BUILD_URL}input/">Click here</a> to restart e2e.</p>
</div>
</html>
""", from: 'jenkins@ci.deis.io', subject: 'Workflow CLI E2E Test Failure', to: env.SLACKEMAIL, mimeType: 'text/html'
			}
			input "Retry the e2e tests?"
		}
		false
	}
}
podTemplate(label: 'pod-hugo-app', containers: [
    containerTemplate(name: 'hugo', image: 'smesch/hugo', ttyEnabled: true, command: 'cat'),
    containerTemplate(name: 'html-proofer', image: 'smesch/html-proofer', ttyEnabled: true, command: 'cat'),
    containerTemplate(name: 'kubectl', image: 'smesch/kubectl', ttyEnabled: true, command: 'cat'),
    containerTemplate(name: 'docker', image: 'docker', ttyEnabled: true, command: 'cat',
        envVars: [containerEnvVar(key: 'DOCKER_CONFIG', value: '/tmp/')])],
    // volumes are a podTemplate-level setting, not a containerTemplate one
    volumes: [secretVolume(secretName: 'kube-config', mountPath: '/root/.kube'),
              secretVolume(secretName: 'docker-config', mountPath: '/tmp'),
              hostPathVolume(hostPath: '/var/run/docker.sock', mountPath: '/var/run/docker.sock')
  ]) {

    node('pod-hugo-app') {

        def DOCKER_HUB_ACCOUNT = 'smesch'
        def DOCKER_IMAGE_NAME = 'hugo-app-jenkins'
        def K8S_DEPLOYMENT_NAME = 'hugo-app'

        stage('Clone Hugo App Repository') {
            checkout scm
 
            container('hugo') {
                stage('Build Hugo Site') {
                    sh ("hugo --uglyURLs")
                }
            }
    
            container('html-proofer') {
                stage('Validate HTML') {
                    sh ("htmlproofer public --internal-domains ${env.JOB_NAME} --external_only --only-4xx")
                }
            }

            container('docker') {
                stage('Docker Build & Push Current & Latest Versions') {
                    sh ("docker build -t ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:${env.BUILD_NUMBER} .")
                    sh ("docker push ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:${env.BUILD_NUMBER}")
                    sh ("docker tag ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:${env.BUILD_NUMBER} ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:latest")
                    sh ("docker push ${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:latest")
                }
            }

            container('kubectl') {
                stage('Deploy New Build To Kubernetes') {
                    sh ("kubectl set image deployment/${K8S_DEPLOYMENT_NAME} ${K8S_DEPLOYMENT_NAME}=${DOCKER_HUB_ACCOUNT}/${DOCKER_IMAGE_NAME}:${env.BUILD_NUMBER}")
                }
            }

        }        
    }
}
@hazim1093

Problem

Request header timeouts, bad gateway, or other timeout errors on services behind keycloak-proxy

Solution

Whenever you face timeouts or bad gateway errors on a service behind keycloak-proxy, check the proxy timeout settings first.

e.g. in our case, Kibana's call to Elasticsearch caused an error in Kibana: the response took a long time to return, and keycloak-proxy threw a timeout error after 1 second.

You can update the values for keycloak-proxy by specifying the following flags:

--upstream-response-header-timeout=30s
--upstream-timeout=30s
--upstream-keepalive-timeout=30s
--server-read-timeout=30s
--server-write-timeout=30s
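
As a sketch, these flags go straight onto the keycloak-proxy command line; the listen address and upstream URL below are made-up placeholders, not values from our setup:

# sketch: keycloak-proxy with relaxed upstream/server timeouts (placeholder upstream)
keycloak-proxy \
  --listen=0.0.0.0:3000 \
  --upstream-url=http://kibana:5601 \
  --upstream-response-header-timeout=30s \
  --upstream-timeout=30s \
  --upstream-keepalive-timeout=30s \
  --server-read-timeout=30s \
  --server-write-timeout=30s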

@rasheedamir

PROBLEM

Wondering how this module can be added to the nginx ingress controller?
https://github.com/curityio/nginx_phantom_token_module

SOLUTION

We are not adding more auth modules to the nginx image. You can add support for this by building a custom nginx image and then building the ingress controller image on top of it.

Basically you just need to edit this file: https://github.com/kubernetes/ingress-nginx/blob/master/images/nginx/build.sh

and call make from that directory.
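
A rough sketch of that flow, assuming a local checkout of ingress-nginx (the exact build steps inside build.sh are the part you have to adapt):

git clone https://github.com/kubernetes/ingress-nginx
cd ingress-nginx/images/nginx
# edit build.sh to download and compile https://github.com/curityio/nginx_phantom_token_module
make
# then rebuild the ingress controller image on top of your custom nginx base image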

@rasheedamir

PROBLEM

how to update kubelet settings of an OpenShift node?

SOLUTION

Jump onto a node: ssh <node-name>

sudo su (to become root)

cd /etc/origin/node/

cp node-config.yaml node-config-12-april.yaml

vi node-config.yaml

then update kubelet arguments
kube-reserved:
  - "cpu=0.5,memory=5Gi"
system-reserved:
  - "cpu=0.5,memory=5Gi"

:wq!

systemctl restart atomic-openshift-node

@rasheedamir

rasheedamir commented Apr 16, 2018

PROBLEM

How to handle provisioning and lifecycle of a production-ready Kubernetes cluster?

SOLUTION

We should consider it in three layers:

  1. Infrastructure Layer
  2. Kubernetes / OpenShift Layer
  3. Application Layer

We chose a set of common requirements across all these layers to help guide our choice of tools and minimise cognitive load. We wanted tools that could:

  • keep all the “state” for a given layer in version control (ie a git repo), such that we could compare the state of the system to the version controlled state and tell if anything was missing or wrong, and if something broke, roll back to a previously known-good state.
  • share as much configuration between environments as possible. Ideally the tools would support some concept of parameterisable “modules” which could be “instantiated” for each cluster, and allow for the differences between clusters to be minimally expressed and easily auditable.
  • alert us if the state of a cluster diverged from the configuration.

[Diagram: provisioning and lifecycle of a production-ready Kubernetes cluster]

https://www.weave.works/blog/provisioning-lifecycle-production-ready-kubernetes-cluster/

@rasheedamir

rasheedamir commented Apr 16, 2018

PROBLEM

what is gitops?

SOLUTION

What exactly is GitOps? By using Git as our source of truth, we can operate almost everything. For example, version control, history, peer review, and rollback happen through Git without needing to poke around with tools like kubectl.

  • Our provisioning of AWS resources and deployment of k8s is declarative
  • Our entire system state is under version control and described in a single Git repository
  • Operational changes are made by pull request (plus build & release pipelines)
  • Diff tools detect any divergence and notify us via Slack alerts; and sync tools enable convergence
  • Rollback and audit logs are also provided via Git

https://www.weave.works/blog/gitops-operations-by-pull-request

Clear visibility of the state of a cluster is key for maintaining operational systems. Developers can be confident in their changes by observing a predictable series of deployment events.

@rasheedamir

rasheedamir commented Apr 16, 2018

PROBLEM

where to find kubernetes logos, icons, digital assets?

SOLUTION

Unfortunately there is no single source of truth yet, but it's a work in progress.

Here are some collections that make a good start:

https://github.com/kubernetes/website/tree/master/docs/tutorials/kubernetes-basics/public/images
https://docs.google.com/presentation/d/1xftbl5mBh9tvJ9-Z0KgHlyh3yyALJIfpXcHrV7MpzZ8/edit#slide=id.g146b7a7a40_15_0 [ THIS IS BEST ]

the ticket has the info: kubernetes/website#747

@rasheedamir

PROBLEM

What is Flux?

SOLUTION

Automated git->cluster synchronisation

Flux's main feature is the automated synchronisation between a version control repository and a cluster. If you make any changes to your repository, those changes are automatically deployed to your cluster.

This is a simple, but dramatic improvement on current state of the art.

  • All configuration is stored within version control and is inherently up to date. At any point anyone could completely recreate the cluster in exactly the same state.
  • Changes to the cluster are immediately visible to all interested parties.
  • During a postmortem, the git log provides the perfect history for an audit.
  • End to end, code to production pipelines become not only possible, but easy.

https://github.com/weaveworks/flux/blob/master/site/introduction.md

@rasheedamir

PROBLEM

any great list of icons?

SOLUTION

https://material.io/icons/

@rasheedamir

PROBLEM

what are deployment strategies?

SOLUTION

VERY WELL EXPLAINED!

https://container-solutions.com/deployment-strategies/

We are going to talk about the following strategies:

  • recreate: version A is terminated then version B is rolled out
  • ramped (also known as rolling-update or incremental): version B is slowly rolled out and replacing version A
  • blue/green: version B is released alongside version A, then the traffic is switched to version B
  • canary: version B is released to a subset of users, then proceed to a full rollout
  • a/b testing: version B is released to a subset of users under specific condition
  • shadow: version B receives real world traffic alongside version A and doesn’t impact the response
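
As a quick illustration, a ramped (rolling) rollout and rollback on Kubernetes looks roughly like this; the deployment and image names are hypothetical:

kubectl set image deployment/my-app my-app=myrepo/my-app:v2   # version B gradually replaces version A
kubectl rollout status deployment/my-app                      # wait for the rollout to finish
kubectl rollout undo deployment/my-app                        # roll back to version A if B misbehaves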

@rasheedamir

PROBLEM

did a rolling update with kops of a cluster with k8s 1.7.8; and none of the nodes came back healthy!

SOLUTION

??? YET TO FIGURE OUT ... 

@rasheedamir

PROBLEM

any best practices for Jenkins pipelines?

SOLUTION

https://github.com/jenkinsci/pipeline-examples/blob/master/docs/BEST_PRACTICES.md

@rasheedamir

PROBLEM

Does there exist any solution, e.g. we would like to set up alerts if there is a pod in the cluster which doesn't have any limits set? I mean we want to allow developers to manage their pipelines in whatever way they like, but we still want to get a notification if there is a pod which doesn't have any limits set ...

so that we can chase that team and ask them to update their deployments

SOLUTION

I guess we could add a validation step in a release, or in a PR on an environment, to reject PRs which are missing, say, resource limits

validating that all charts in an env have limits sounds like a nice option


I'm not aware of anything, but it might be nice to check in CI whether limits are missing and fail the CI checks. Not sure if we can add a policy to cover that in SonarQube, for example

or maybe a custom controller that analyses pods in a preview env


if folks were using GitOps on an environment, it could validate the charts in the CI build

via either a helm pre-install hook https://github.com/kubernetes/helm/blob/master/docs/charts_hooks.md#the-available-hooks or by generating templates of the charts and validating the output YAML for all Deployments, asserting that they have limits etc.


wonder if we could extend helm lint to add more validations? https://github.com/kubernetes/helm/blob/master/docs/helm/helm_lint.md


if anyone can kubectl apply at any time then a controller is your only hope. A GitOps validation check on PRs would be cooler though & give better feedback & could automate fixing bad PRs over time by adding better default limits automatically
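
A minimal sketch of such a check, assuming kubectl and jq are available: it lists pods whose containers have no resource limits set, and could be wired into a CI step or a cron-driven alert:

kubectl get pods --all-namespaces -o json \
  | jq -r '.items[] | select(any(.spec.containers[]; .resources.limits == null)) | .metadata.namespace + "/" + .metadata.name'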

@rasheedamir

rasheedamir commented Apr 18, 2018

PROBLEM

ExternalDNS logs say

time="2018-04-18T05:41:08Z" level=error msg="NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors" 
time="2018-04-18T05:44:08Z" level=error msg="NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors" 

SOLUTION

We had a bad node in the cluster; by just kicking it out, the pod got rescheduled onto another node and things started working.
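
For reference, "kicking out" the bad node amounts to something like the following; the node name is a placeholder:

kubectl drain ip-10-0-1-23.ec2.internal --ignore-daemonsets --force   # evict pods so they get rescheduled elsewhere
kubectl delete node ip-10-0-1-23.ec2.internal                         # remove the bad node from the cluster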

@rasheedamir

rasheedamir commented Apr 18, 2018

PROBLEM

com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException: User: arn:aws:sts::847616676486:assumed-role/nodes.tools178.k8s.com/i-03868a9e822ec1300 is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam::847616375496:role/nodes.tools178.k8s.com with an explicit deny (Service: AWSSecurityTokenService; Status Code: 403; Error Code: AccessDenied; Request ID: 3c71c867-42df-11e8-9659-a1f7fbcbfd1a)

SOLUTION

The issue was that the withAWS block isn't needed if one is uploading to an S3 bucket in the same AWS account!

@rasheedamir

PROBLEM

what is AssumeRole in AWS?

SOLUTION

Returns a set of temporary security credentials (consisting of an access key ID, a secret access key, and a security token) that you can use to access AWS resources that you might not normally have access to.
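
With the AWS CLI it looks like this (the account ID, role name, and session name are placeholders); the response contains the temporary AccessKeyId, SecretAccessKey, and SessionToken described above:

aws sts assume-role \
  --role-arn arn:aws:iam::123456789012:role/example-role \
  --role-session-name demo-session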

@rasheedamir

PROBLEM

what does Linux observability look like?

SOLUTION

[Screenshots: Linux observability tooling overview diagrams, captured 2018-04-20]

@rasheedamir

rasheedamir commented Apr 22, 2018

PROBLEM

why use ansible & terraform?

SOLUTION

Terraform is a great tool for building infrastructure in the cloud. Ansible is a beautifully simple agentless (and serverless) configuration management tool. A common use case is to build servers with Terraform, and have Ansible configure them. Unfortunately Terraform lacks a provisioning plugin for Ansible - but fear not, they can be used together fairly trivially by using the local-exec provisioner of Terraform.

Ansible and Terraform are two very similar applications with some key differences. One of those key differences involves two very common DevOps concepts: configuration management and orchestration. These terms generally describe types of tools. Ansible is primarily a configuration management tool, commonly abbreviated as “CM”, and Terraform is an orchestration tool. Now, understand that there are overlaps and these terms are not necessarily mutually exclusive. It helps to use tools that match their strengths, so let’s talk a little about what each type of tool is optimized for.

I personally prefer to use Terraform for orchestration and Ansible for configuration management.

@rasheedamir

rasheedamir commented Apr 23, 2018

PROBLEM

how to run a sysdig container on a CoreOS machine?

nothing better than sysdig exists for understanding what is going on at the lowest possible level

SOLUTION

We need to run a sysdig docker container on the node hosting the container whose network traffic we want to log, and that's it!

docker run -i -t --name sysdig --privileged -v /var/run/docker.sock:/host/var/run/docker.sock -v /dev:/host/dev -v /proc:/host/proc:ro -v /boot:/host/boot:ro -v /lib/modules:/host/lib/modules:ro -v /usr:/host/usr:ro sysdig/sysdig

how to read the http traffic of a pod?

Before running the sysdig container, find the target container's name, then watch the magic happen.

Once inside the sysdig container, run the following:

sysdig -pc -c httplog container.name=k8s_kubernetes-dashboard-proxy_kubernetes-dashboard-proxy-c77b7cc85-htz2t_apps_a17d3a62-46e9-11e8-890a-0af9a1b637a6_0
sysdig -pc -c httplog container.name=k8s_kubernetes-dashboard_kubernetes-dashboard-56bfb447b9-lcn65_apps_82a15826-46eb-11e8-890a-0af9a1b637a6_0

echo_fds is better if you want to see everything...

sysdig -A -pc -c echo_fds fd.port=80 container.name=k8s_kubernetes-dashboard-proxy_kubernetes-dashboard-proxy-6fcf5858df-ksmtg_apps_c6b1797f-46f4-11e8-890a-0af9a1b637a6_0 > kube-proxy.log

To view the created file, run:

less -r kube-proxy.log 

@rasheedamir

rasheedamir commented Apr 23, 2018

PROBLEM

the k8s docs recommend not running more than 110 pods per node, but the OpenShift docs say it can be 250; do you know why?

SOLUTION

from the OpenShift team:

OpenShift tests and validates in different environments. OpenShift has a test team that validates those environments, and we don't always make community recommendations.

Remember, OpenShift is an opinionated kube distro, and we control not just the host but the container stack, the kernel, and all those other things, so we try to publish guidance for a much smaller set of possible configurations.

A recommendation for kube has to take potentially thousands of possible combos into account.

@rasheedamir

rasheedamir commented Apr 27, 2018

PROBLEM

how do kubectl drain and cordon work?

SOLUTION

The given node will be marked unschedulable to prevent new pods from arriving. 'drain' evicts the pods if the APIServer supports eviction (http://kubernetes.io/docs/admin/disruptions/). Otherwise, it will use normal DELETE to delete the pods. 'drain' evicts or deletes all pods except mirror pods (which cannot be deleted through the API server).

If there are DaemonSet-managed pods, drain will not proceed without --ignore-daemonsets, and regardless it will not delete any DaemonSet-managed pods, because those pods would be immediately replaced by the DaemonSet controller, which ignores unschedulable markings.

If there are any pods that are neither mirror pods nor managed by ReplicationController, ReplicaSet, DaemonSet, StatefulSet or Job, then drain will not delete any pods unless you use --force.

'drain' waits for graceful termination. You should not operate on the machine until the command completes.

When you are ready to put the node back into service, use kubectl uncordon, which will make the node schedulable again.
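
A typical maintenance sequence, with a placeholder node name:

kubectl cordon node-1                       # mark the node unschedulable; running pods stay put
kubectl drain node-1 --ignore-daemonsets    # evict/delete pods so they reschedule elsewhere
kubectl uncordon node-1                     # after maintenance, make the node schedulable again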

@rasheedamir

rasheedamir commented Apr 27, 2018

PROBLEM

why write a custom k8s controller?

SOLUTION

Writing a custom controller: Extending the functionality of your cluster

Much of the functionality in a Kubernetes cluster is managed by a reconciliation pattern within "controllers". The node, service, or deployment controllers (just to name a few) watch for changes to objects, then act on those changes to drive your cluster to a desired state. This same pattern can be used to implement custom logic, which can be used to extend the functionality of your cluster without ever needing to modify Kubernetes itself. This talk will cover how to implement your own custom controller, from contacting the Kubernetes API to using existing libraries to easily watch, react, and update components in your cluster. By building on existing functionality and following a few best practices, you can quickly and easily implement your own custom controller.

MUST WATCH ...

https://resources.coreos.com/youtube-coreos-fest-2017/writing-a-custom-controller-extending-the-functionality-of-your-cluster


https://github.com/kubernetes/community/blob/master/contributors/devel/controllers.md


https://github.com/stakater/what-happens-when-k8s


This is a very good starting point:
https://rancher.com/using-kubernetes-api-go-kubecon-2017-session-recap/

but we won't use their recommendations for dependency management, code skeleton, etc.


this is an excellent read as well
https://medium.com/@cloudark/kubernetes-custom-controllers-b6c7d0668fdf


Steps

  • Created a Go project
  • Added client-go package dependencies to it
  • Created a client to talk to the Kubernetes API
  • Defined an Informer that watches node object changes and executes a callback function when they happen
  • Implemented the actual logic in the callback definition
  • Tested the code by running the binary outside of the cluster, and then deployed it inside the cluster

The sample controller by the Kubernetes project; we must understand it:

https://github.com/kubernetes/sample-controller


A good sample controller to follow:
https://github.com/aaronlevy/kube-controller-demo/blob/master/reboot-controller/main.go


@rasheedamir

PROBLEM

what happens when you run kubectl create ...?

SOLUTION

Such an awesome read:
https://github.com/stakater/what-happens-when-k8s

@rasheedamir

PROBLEM

what are k8s controllers?

SOLUTION

A controller is an asynchronous script that works to reconcile the current state of the Kubernetes system to a desired state. Each controller has a small responsibility and is run in parallel by the kube-controller-manager component.

@rasheedamir

PROBLEM

what are k8s informers?

SOLUTION

Informers
As you might have noticed, some controllers like the RBAC authorizer or the Deployment controller need to retrieve cluster state to function. To return to the example of the RBAC authorizer, we know that when a request comes in, the authenticator will save an initial representation of user state for later use. The RBAC authorizer will then use this to retrieve all the roles and role bindings that are associated with the user in etcd. How are controllers supposed to access and modify such resources? It turns out this is a common use case and is solved in Kubernetes with informers.

An informer is a pattern that allows controllers to subscribe to storage events and easily list resources they're interested in. Apart from providing an abstraction which is nice to work with, it also takes care of a lot of the nuts and bolts such as caching (caching is important because it reduces unnecessary kube-apiserver connections, and reduces duplicate serialization costs server- and controller-side). By using this design, it also allows controllers to interact in a threadsafe manner without having to worry about stepping on anybody else's toes.

For more information about how informers work in relation to controllers, check out this blog post (http://borismattijssen.github.io/articles/kubernetes-informers-controllers-reflectors-stores)

@rasheedamir

PROBLEM

what are docker containers?

SOLUTION

best possible explanation

https://jvns.ca/blog/2016/10/10/what-even-is-a-container/

@rasheedamir

PROBLEM

where to find kubelet logs on openshift?

SOLUTION

less /var/log/messages
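
If the node logs to journald, the same kubelet output can usually also be tailed via the atomic-openshift-node unit mentioned elsewhere in these notes (assuming that is the node service name on your install):

journalctl -u atomic-openshift-node -f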

@rasheedamir

rasheedamir commented Apr 30, 2018

PROBLEM

If you see this message in kubelet logs

Create more free space in thin pool or use dm.min_free_space option

SOLUTION

This issue is a bit tricky.

Run docker info

on the nodes to find details about the devicemapper storage driver:

Storage Driver: devicemapper
 Pool Name: docker--vg-docker--pool
 Pool Blocksize: 524.3 kB
 Base Device Size: 10.74 GB
 Backing Filesystem: xfs
 Data file: 
 Metadata file: 
 Data Space Used: 33.74 GB
 Data Space Total: 96.41 GB
 Data Space Available: 62.68 GB
 Metadata Space Used: 9.695 MB
 Metadata Space Total: 109.1 MB
 Metadata Space Available: 99.36 MB
 Thin Pool Minimum Free Space: 9.641 GB
 Udev Sync Supported: true
 Deferred Removal Enabled: true
 Deferred Deletion Enabled: true
 Deferred Deleted Device Count: 0
 Library Version: 1.02.140-RHEL7 (2017-05-03)

Some handy commands to run are pvs, lvs, pvdisplay, lvdisplay

The Docker GC threshold and the kubelet image GC threshold need to be tuned correctly relative to each other, otherwise the node can end up stuck (effectively a denial of service).

As per @derekwaynecarr's suggestion, we need the kubelet to be more aggressive than Docker here: Docker will refuse to do anything if the pull (or whatever) would use more space than its 90% threshold allows, so we'd want the kubelet to begin eviction at 80% or so.

this setting does matter:
fs.Int32Var(&s.ImageGCHighThresholdPercent, "image-gc-high-threshold", s.ImageGCHighThresholdPercent, "The percent of disk usage after which image garbage collection is always run. Default: 85%")

Previously, docker could refuse to start new containers due to reaching dm.min_free_space (default 10%), but the devicemapper thin pool usage did not exceed image-gc-high-threshold (default 90%), so the image reclaim occurred and the node was stuck. This bug fix changes the default image-gc-high-threshold to 85%, which causes image reclaim to occur before the default dm.min_free_space is reached.
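
As a sketch, the kubelet tuning described above would look something like this; the values are illustrative, not a recommendation:

--image-gc-high-threshold=80   # start image GC before docker's dm.min_free_space (default 10% free) kicks in
--image-gc-low-threshold=60    # image GC tries to free space down to this disk usage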


good references to read on
https://github.com/openshift/release/issues/40


Force deletion of containers is a bad idea IMO. People use this option to quickly move on without addressing the real problem. It is that underlying problem which keeps the device busy, and it needs to be addressed.

In the dm.min_free_space section the page says:

Whenever a new thin pool device is created (during docker pull or during container creation), the Engine checks if the minimum free space is available. If sufficient space is unavailable, then device creation fails and any relevant docker operation fails.

To recover from this error, you must create more free space in the thin pool. You can create free space by deleting some images and containers from the thin pool. You can also add more storage to the thin pool.

I think it all depends how the thin pool was created in the first place. The corresponding method should be used accordingly. For instance, if the thin pool was created with the LVM tools (pvcreate on at least 1 PV, then vgcreate, then lvcreate), it should be extended with the LVM tools (if the VG is full, extend it with pvcreate on additional PVs then vgextend to add these PVs to the VG; and then lvextend on the thin pool).


Docker uses the thin pool for image and metadata storage. If this storage space fills up, the Docker daemon cannot create new containers.

Remove non-running containers and unused images from your container instances. You can use the following example commands to manually remove stopped containers and unused images. Deleted containers cannot be inspected later, and deleted images must be pulled again before starting new containers from them.

To remove non-running containers, execute the following command on your container instance:

docker rm $(docker ps -aq)

To remove unused images, execute the following command on your container instance:

docker rmi $(docker images -q)

Remove unused data blocks within containers. You can use the following command to run fstrim on any running container and discard any data blocks that are unused by the container file system.

sudo sh -c "docker ps -q | xargs docker inspect --format='{{ .State.Pid }}' | xargs -IZ fstrim /proc/Z/root/"

@rasheedamir

PROBLEM

how to add labels/annotations to an openshift project?

SOLUTION

Well you can't! Projects are immutable ...

BUT you can add them to the namespace (same name as the project), and then they will show up on the project.

Add an annotation on a namespace:

oc annotate namespace/workshop stakater.com/persist=true

Now describe the project and it will show that annotation:

oc describe project/workshop

To remove the annotation, run:

oc annotate namespace workshop stakater.com/persist-

@rasheedamir

rasheedamir commented May 2, 2018

PROBLEM

how to print lvdisplay from all nodes using ansible?

SOLUTION

ansible -i hosts -u ansible -b -m shell nodes -a 'lvdisplay -C /dev/docker-vg/docker-pool|grep pool'
