issacg/README.md

## README.md

      
    Raw
  

              README.md
            
          
    run:ai bootstrap for GKE

A few notes:
Run this as a user with admin privileges on both GKE (gcloud) and the cluster (kubectl)
I use cert-manager to let Let's Encrypt manage the HTTPS key pair

Note which Let's Encrypt issuer is in use - staging or production (letsencrypt.yaml line 90)
I use a small placeholder HTTPS service (the apple service - letsencrypt.yaml line 48)
to make cert-manager request the certificate and store it in the Kubernetes
secret before installing run:ai.  You can remove it if it is not needed
I set the webroot of the kubernetes ingress to redirect to another website (letsencrypt.yaml line 115)

Look for < > strings that should be replaced in install.sh and letsencrypt.yaml

install.sh: line 4 - replace `<gke-cluster-region>` with the cluster region
install.sh: line 32 - replace `<runai-realm>` with your runai realm (see https://docs.run.ai/admin/runai-setup/authentication/researcher-authentication/#administration-user-interface-setup)
letsencrypt.yaml: lines 10 + 26 - replace `<email>` with an email that will be registered with LetsEncrypt
letsencrypt.yaml: lines 95 + 98 + 122 + 124 - replace `<cluster-hostname>` with the hostname you want.
letsencrypt.yaml: line 115 - replace `<your website>` with the redirect target (or remove if you prefer to return 404 errors instead of redirecting non-existent paths)


## install.sh
#!/bin/sh
set -e  # stop at the first failing command instead of continuing on a half-configured cluster
# Anthos Identity Service - update the cluster name (runai-eu-mvp) and region!
gcloud container clusters update runai-eu-mvp --enable-identity-service --region=<gke-cluster-region>

# namespaces (create-or-update, so re-running the script does not fail on an existing namespace)
kubectl create namespace runai --dry-run=client -o yaml | kubectl apply -f -

# helm chart repositories
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm repo add jetstack https://charts.jetstack.io
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm repo update

# prometheus (kube-prometheus-stack with its bundled grafana disabled)
helm upgrade --install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace --set grafana.enabled=false

# nginx (--wait: letsencrypt.yaml below creates Ingress objects, so the controller must be up first)
helm upgrade --install nginx-ingress ingress-nginx/ingress-nginx --namespace nginx-ingress --create-namespace --wait

# cert-manager (--wait: its CRDs and webhook must be ready before the Issuers in letsencrypt.yaml are applied)
helm upgrade --install cert-manager jetstack/cert-manager --namespace cert-manager --create-namespace --set installCRDs=true --wait
kubectl apply -f letsencrypt.yaml

# gpu-operator (no --wait here: the ResourceQuota for its namespace is only applied on the next line)
helm upgrade --install gpu-operator nvidia/gpu-operator --namespace gpu-operator --create-namespace --set mig.strategy=mixed
kubectl apply -f resourcequota.yaml

# runai-oidc: dump the default ClientConfig, merge an OIDC authentication entry into .spec, re-apply it
kubectl get clientconfig default -n kube-public -o yaml > login-config.yaml
yq -i e ".spec +={\"authentication\":[{\"name\":\"oidc\",\"oidc\":{\"clientID\":\"runai\",\"issuerURI\":\"https://app.run.ai/auth/realms/<runai-realm>\",\"kubectlRedirectURI\":\"http://localhost:8000/callback\",\"userClaim\":\"sub\",\"userPrefix\":\"-\"}}]}" login-config.yaml
kubectl apply -f login-config.yaml
rm login-config.yaml

### Show public IP (-w keeps watching the service; interrupt once the external IP is listed)
kubectl --namespace nginx-ingress get services -o wide -w nginx-ingress-ingress-nginx-controller


## letsencrypt.yaml
# Update hostname below - make sure it resolves!

kind: Issuer  # ACME issuer in the runai namespace that talks to the Let's Encrypt staging endpoint
apiVersion: cert-manager.io/v1
metadata:
  namespace: runai
  name: letsencrypt-staging
spec:
  acme:
    email: <email>  # placeholder - replace with the address to register
    solvers:
      - http01:  # HTTP-01 challenge solved through the nginx ingress class
          ingress:
            ingressClassName: nginx
    privateKeySecretRef:
      name: letsencrypt-issuer-staging-account-key
    server: 'https://acme-staging-v02.api.letsencrypt.org/directory'
---
kind: Issuer  # ACME issuer in the runai namespace that talks to the Let's Encrypt production endpoint
apiVersion: cert-manager.io/v1
metadata:
  namespace: runai
  name: letsencrypt
spec:
  acme:
    email: <email>  # placeholder - replace with the address to register
    solvers:
      - http01:  # HTTP-01 challenge solved through the nginx ingress class
          ingress:
            ingressClassName: nginx
    privateKeySecretRef:
      name: letsencrypt-issuer-account-key
    server: 'https://acme-v02.api.letsencrypt.org/directory'

---
# Empty TLS secret that will hold the key pair (required by runai); the Ingresses in this file reference it.
kind: Secret
apiVersion: v1
type: kubernetes.io/tls
metadata:
  namespace: runai
  name: runai-cluster-domain-tls-secret
stringData:
  tls.crt: ''
  tls.key: ''

---
# Dummy pod/service/ingress to latch on to letsencrypt so cert-manager creates/maintains the certificate...
apiVersion: v1
kind: Pod
metadata:
  namespace: runai
  name: apple-app
  labels:
    app: apple  # matched by the apple-service selector
spec:
  containers:
    - image: hashicorp/http-echo
      name: apple-app
      args:
        - '-text=apple'
      resources:
        limits:
          cpu: 500m
          memory: 128Mi
        requests:
          cpu: 25m
          memory: 32Mi

---

apiVersion: v1
kind: Service
metadata:
  namespace: runai
  name: apple-service
spec:
  ports:
    - port: 5678  # port the Ingress backends in this file point at
  selector:
    app: apple  # selects the apple-app pod by its label
---

kind: Ingress
apiVersion: networking.k8s.io/v1
metadata:
  name: apple-ingress
  namespace: runai
  annotations:
    cert-manager.io/issuer: letsencrypt-staging  # use "letsencrypt" for the production issuer
spec:
  tls:
    - secretName: runai-cluster-domain-tls-secret
      hosts:
        - <cluster-hostname>
  ingressClassName: nginx
  rules:
    - host: <cluster-hostname>
      http:
        paths:
          - pathType: Prefix
            path: /apple
            backend:
              service:
                port:
                  number: 5678
                name: apple-service

---
# Redirect / traffic (anything not served by a more specific Ingress path) to another website
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  annotations:
    nginx.ingress.kubernetes.io/temporal-redirect: "https://<your website>/"
  name: ingress-redirect
  namespace: runai  # same namespace as the TLS secret and the backend Service referenced below
spec:
  tls:
  - secretName: runai-cluster-domain-tls-secret
    hosts:
      - <cluster-hostname>
  rules:
  - host: <cluster-hostname>
    http:
      paths:
      - path: /  # was /apple (Exact), which duplicated the apple-ingress rule instead of covering the web root
        pathType: Prefix
        backend:  # a backend is required by the Ingress schema; presumably unused while the redirect annotation is set - verify
          service:
            name: apple-service
            port:
              number: 5678
  ingressClassName: nginx


## resourcequota.yaml
# ResourceQuota in the gpu-operator namespace, scoped to the two system-critical priority classes.
# NOTE(review): no spec.hard limits are set; presumably the object only has to exist so that pods
# with these priority classes are admitted in this namespace on GKE - confirm.
kind: ResourceQuota
apiVersion: v1
metadata:
  namespace: gpu-operator
  name: gcp-critical-pods
spec:
  scopeSelector:
    matchExpressions:
      - scopeName: PriorityClass
        operator: In
        values: [system-node-critical, system-cluster-critical]
	#!/bin/sh
	set -e  # stop at the first failing command instead of continuing on a half-configured cluster
	# Anthos Identity Service - update the cluster name (runai-eu-mvp) and region!
	gcloud container clusters update runai-eu-mvp --enable-identity-service --region=<gke-cluster-region>

	# namespaces (create-or-update, so re-running the script does not fail on an existing namespace)
	kubectl create namespace runai --dry-run=client -o yaml | kubectl apply -f -

	# helm chart repositories
	helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
	helm repo add jetstack https://charts.jetstack.io
	helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
	helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
	helm repo update

	# prometheus (kube-prometheus-stack with its bundled grafana disabled)
	helm upgrade --install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace --set grafana.enabled=false

	# nginx (--wait: letsencrypt.yaml below creates Ingress objects, so the controller must be up first)
	helm upgrade --install nginx-ingress ingress-nginx/ingress-nginx --namespace nginx-ingress --create-namespace --wait

	# cert-manager (--wait: its CRDs and webhook must be ready before the Issuers in letsencrypt.yaml are applied)
	helm upgrade --install cert-manager jetstack/cert-manager --namespace cert-manager --create-namespace --set installCRDs=true --wait
	kubectl apply -f letsencrypt.yaml

	# gpu-operator (no --wait here: the ResourceQuota for its namespace is only applied on the next line)
	helm upgrade --install gpu-operator nvidia/gpu-operator --namespace gpu-operator --create-namespace --set mig.strategy=mixed
	kubectl apply -f resourcequota.yaml

	# runai-oidc: dump the default ClientConfig, merge an OIDC authentication entry into .spec, re-apply it
	kubectl get clientconfig default -n kube-public -o yaml > login-config.yaml
	yq -i e ".spec +={\"authentication\":[{\"name\":\"oidc\",\"oidc\":{\"clientID\":\"runai\",\"issuerURI\":\"https://app.run.ai/auth/realms/<runai-realm>\",\"kubectlRedirectURI\":\"http://localhost:8000/callback\",\"userClaim\":\"sub\",\"userPrefix\":\"-\"}}]}" login-config.yaml
	kubectl apply -f login-config.yaml
	rm login-config.yaml

	### Show public IP (-w keeps watching the service; interrupt once the external IP is listed)
	kubectl --namespace nginx-ingress get services -o wide -w nginx-ingress-ingress-nginx-controller
	# Update hostname below - make sure it resolves!

	apiVersion: cert-manager.io/v1
	kind: Issuer  # ACME issuer in the runai namespace that talks to the Let's Encrypt staging endpoint
	metadata:
	  name: letsencrypt-staging
	  namespace: runai
	spec:
	  acme:
	    email: <email>  # placeholder - replace with the address to register
	    server: https://acme-staging-v02.api.letsencrypt.org/directory
	    privateKeySecretRef:
	      name: letsencrypt-issuer-staging-account-key
	    solvers:
	      - http01:  # HTTP-01 challenge solved through the nginx ingress class
	          ingress:
	            ingressClassName: nginx
	---
	apiVersion: cert-manager.io/v1
	kind: Issuer  # ACME issuer in the runai namespace that talks to the Let's Encrypt production endpoint
	metadata:
	  name: letsencrypt
	  namespace: runai
	spec:
	  acme:
	    email: <email>  # placeholder - replace with the address to register
	    server: https://acme-v02.api.letsencrypt.org/directory
	    privateKeySecretRef:
	      name: letsencrypt-issuer-account-key
	    solvers:
	      - http01:  # HTTP-01 challenge solved through the nginx ingress class
	          ingress:
	            ingressClassName: nginx

	---
	# Empty TLS secret that will hold the key pair (required by runai); the Ingresses in this file reference it.
	apiVersion: v1
	kind: Secret
	metadata:
	  name: runai-cluster-domain-tls-secret
	  namespace: runai
	type: kubernetes.io/tls
	stringData:
	  tls.key: ""
	  tls.crt: ""

	---
	# Dummy pod/service/ingress to latch on to letsencrypt so cert-manager creates/maintains the certificate...
	kind: Pod
	apiVersion: v1
	metadata:
	  name: apple-app
	  namespace: runai
	  labels:
	    app: apple  # matched by the apple-service selector
	spec:
	  containers:
	    - name: apple-app
	      image: hashicorp/http-echo
	      args:
	        - "-text=apple"
	      resources:
	        requests:
	          memory: "32Mi"
	          cpu: "25m"
	        limits:
	          memory: "128Mi"
	          cpu: "500m"

	---

	kind: Service
	apiVersion: v1
	metadata:
	  name: apple-service
	  namespace: runai
	spec:
	  selector:
	    app: apple  # selects the apple-app pod by its label
	  ports:
	    - port: 5678  # port the Ingress backends in this file point at
	---

	apiVersion: networking.k8s.io/v1
	kind: Ingress
	metadata:
	  namespace: runai
	  name: apple-ingress
	  annotations:
	    cert-manager.io/issuer: letsencrypt-staging  # use "letsencrypt" for the production issuer
	spec:
	  tls:
	    - secretName: runai-cluster-domain-tls-secret
	      hosts:
	        - <cluster-hostname>
	  ingressClassName: nginx
	  rules:
	    - host: <cluster-hostname>
	      http:
	        paths:
	          - path: /apple
	            pathType: Prefix
	            backend:
	              service:
	                name: apple-service
	                port:
	                  number: 5678

	---
	# Redirect / traffic (anything not served by a more specific Ingress path) to another website
	apiVersion: networking.k8s.io/v1
	kind: Ingress
	metadata:
	  annotations:
	    nginx.ingress.kubernetes.io/temporal-redirect: "https://<your website>/"
	  name: ingress-redirect
	  namespace: runai  # same namespace as the TLS secret and the backend Service referenced below
	spec:
	  tls:
	    - secretName: runai-cluster-domain-tls-secret
	      hosts:
	        - <cluster-hostname>
	  rules:
	    - host: <cluster-hostname>
	      http:
	        paths:
	          - path: /  # was /apple (Exact), which duplicated the apple-ingress rule instead of covering the web root
	            pathType: Prefix
	            backend:  # a backend is required by the Ingress schema; presumably unused while the redirect annotation is set - verify
	              service:
	                name: apple-service
	                port:
	                  number: 5678
	  ingressClassName: nginx
	---
	# ResourceQuota in the gpu-operator namespace, scoped to the two system-critical priority classes.
	# NOTE(review): no spec.hard limits are set; presumably the object only has to exist so that pods
	# with these priority classes are admitted in this namespace on GKE - confirm.
	apiVersion: v1
	kind: ResourceQuota
	metadata:
	  name: gcp-critical-pods
	  namespace: gpu-operator
	spec:
	  scopeSelector:
	    matchExpressions:
	      - operator: In
	        scopeName: PriorityClass
	        values:
	          - system-node-critical
	          - system-cluster-critical