Skip to content

Instantly share code, notes, and snippets.

@zeppelinen
Last active February 3, 2019 22:25
Show Gist options
  • Save zeppelinen/c2dd13fda931733e27b87020304eb1b9 to your computer and use it in GitHub Desktop.
Save zeppelinen/c2dd13fda931733e27b87020304eb1b9 to your computer and use it in GitHub Desktop.
Setup kubeflow on AWS. Copy-pastable
# macOS-specific prerequisites.
# Install every CLI dependency in one brew invocation; the third-party
# taps (weaveworks for eksctl, ksonnet for ks) are resolved inline.
brew install awscli kubectl weaveworks/tap/eksctl ksonnet/tap/ks docker helm
# configure connection to docker daemon
# export DOCKER_HOST=ssh://user@docker-machine.com
# Export AWS credentials for this shell session — alternatively put them
# into ~/.aws/credentials and skip the two key exports below.
export AWS_DEFAULT_REGION=us-east-1
export AWS_ACCESS_KEY_ID=KEY_ID
export AWS_SECRET_ACCESS_KEY=KEY
# Create the EKS Kubernetes cluster (long-form flags for readability):
#   --nodes           number of worker nodes
#   --region          AWS region
#   --name            cluster name
#   --node-type       EC2 instance type
#   --ssh-public-key  optional public ssh key to install on the cluster nodes
eksctl create cluster --nodes 3 --region us-east-1 --name kubeflow-1 \
  --node-type m5.large --ssh-public-key ~/.ssh/id_rsa_4096.pub --timeout=90m
# verify the worker nodes came up
kubectl describe nodes
# persistent volumes for Jupyter: check whether a storage class already exists
kubectl get storageclass
# If no default storage class exists, create one backed by gp2 EBS volumes.
# NOTE(fix): the original heredoc placed the keys belonging to `metadata:`
# at column 0, which is invalid YAML (kubectl rejects the manifest).
# Indentation is corrected here.
cat <<EOF | kubectl create -f -
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: gp2
  annotations:
    storageclass.beta.kubernetes.io/is-default-class: "true"
provisioner: kubernetes.io/aws-ebs
parameters:
  type: gp2
reclaimPolicy: Delete
mountOptions:
  - debug
EOF
# confirm the storage class was created
kubectl get storageclass
# kubeflow-specific installation steps
export NAMESPACE=kubeflow
kubectl create namespace "${NAMESPACE}"
# Download and run the kubeflow deploy script. KUBEFLOW_DEPLOY=false makes it
# only generate the ksonnet app; we apply it explicitly below.
export KUBEFLOW_VERSION=0.2.5
export KUBEFLOW_DEPLOY=false
# -fsSL: fail on HTTP errors instead of piping an error page into bash
curl -fsSL https://raw.githubusercontent.com/kubeflow/kubeflow/v${KUBEFLOW_VERSION}/scripts/deploy.sh | bash
# launch installation (the deploy script created kubeflow_ks_app/)
cd kubeflow_ks_app/ || exit 1
ks env set default --namespace "${NAMESPACE}"
ks apply default
# check kubeflow deployment status
kubectl get pod -n "${NAMESPACE}"
# at this point the installation should be finished
### Jupyter specific steps
# Resolve the AWS account id. `sts get-caller-identity` returns it directly
# and works for IAM users, assumed roles and root credentials alike, unlike
# grepping the Arn out of `aws iam get-user` (which fails for role creds).
export ACCOUNTID=$(aws sts get-caller-identity --query Account --output text)
# log docker in to ECR and create the notebook image repository
aws ecr get-login --no-include-email --region us-east-1 > docker_login.sh
bash docker_login.sh
aws ecr create-repository --repository-name tensorflow-notebook-cpu --region us-east-1
# fetch the model training code (-fsSL: fail loudly on HTTP errors)
curl -fsSL -o train.py https://raw.githubusercontent.com/kubeflow/examples/master/github_issue_summarization/notebooks/train.py
curl -fsSL -o seq2seq_utils.py https://raw.githubusercontent.com/kubeflow/examples/master/github_issue_summarization/notebooks/seq2seq_utils.py
# Build, tag and push the Jupyter notebook docker image to the registry.
docker build -t "$ACCOUNTID.dkr.ecr.us-east-1.amazonaws.com/tensorflow-notebook-cpu:latest" . -f-<<EOF
FROM gcr.io/kubeflow-images-public/tensorflow-1.9.0-notebook-cpu
# -y is required: docker build has no tty, so pip's interactive
# "Proceed (y/n)?" confirmation would abort the build
RUN pip uninstall -y msgpack thinc
RUN pip install thinc
RUN pip install ktext annoy sklearn h5py nltk pydot matplotlib
COPY train.py /workdir/train.py
COPY seq2seq_utils.py /workdir/seq2seq_utils.py
EOF
docker push "$ACCOUNTID.dkr.ecr.us-east-1.amazonaws.com/tensorflow-notebook-cpu:latest"
# Connect to the Jupyter Notebook hub (tf-hub-lb service) via port-forward
kubectl port-forward svc/tf-hub-lb -n ${NAMESPACE} 8080:80
# open http://localhost:8080 in a browser (the forwarded port above)
# configure spawner options
# Image: image we created previously - AWS_ID.dkr.ecr.us-east-1.amazonaws.com/tensorflow-notebook-cpu:latest
# CPU, Memory - up to you
# Then click "spawn"
# Check that we have jupyter-"something" pods running
kubectl get pod -n ${NAMESPACE} | grep jupyter
# launch jupyter web terminal and clone example kubeflow repo:
# git clone https://github.com/kubeflow/examples
#
# in jupyter console browse to the Training ipynb, adapt training parameters and launch it
# set DATA_DIR to /home/jovyan/github-issues-data
# set training_data_size to the number of rows you want to process
# to launch, select Cell -> Run All
### Serve the trained model
# Build a microservice around the trained model and the
# IssueSummarization.py script; Dockerfiles and deps come from github.
git clone https://github.com/kubeflow/examples serve/
# run the seldon python wrapper (generates a build/ directory)
cd serve/github_issue_summarization/notebooks
docker run -v "$(pwd):/my_model" seldonio/core-python-wrapper:0.7 /my_model IssueSummarization 0.1 gcr.io --base-image=python:3.6 --image-name=gcr-repository-name/issue-summarization
# copy the trained model artifacts out of the notebook pod on the cluster
cd build
sudo chown "$(id -u)" .
PODNAME=$(kubectl get pods --namespace="${NAMESPACE}" --selector="app=jupyterhub" --output=template --template="{{with index .items 0}}{{.metadata.name}}{{end}}")
for artifact in seq2seq_model_tutorial.h5 body_pp.dpkl title_pp.dpkl; do
  kubectl --namespace="${NAMESPACE}" cp "${PODNAME}:/home/jovyan/examples/github_issue_summarization/notebooks/${artifact}" .
done
# There's an issue with ip settings of jupyter notebook scripts in some environments.
# See https://github.com/codenvy/codenvy/issues/2427 for details.
# Take start-notebook.sh from the kubeflow repo and put it in the build directory.
# (path fixed: the repo directory is tensorflow-notebook-image, not tsorflow-...)
cp PATH_TO/kubeflow_repo/components/tensorflow-notebook-image/start-notebook.sh ./
# then edit this script and add the argument --ip=0.0.0.0 to the jupyter command.
# To ship the script inside the serve-model image, open the Dockerfile and add:
#   COPY start-notebook.sh /usr/local/bin/start-notebook.sh
# just before the `WORKDIR /microservice` line.
# (NOTE(fix): those two lines were bare, uncommented text in the original and
# would have been executed — and failed — when running this file as a script.)
# also copy again the scripts and data from the notebooks directory (the one inside build dir)
cp notebooks/IssueSummarization.py ./
cp notebooks/Training.ipynb ./
# then correct the seldon requirements to make them compatible with the current pandas version:
# edit seldon_requirements.txt and change the numpy line to just `numpy`, without any version pin
# build and push the service image to our registry
aws ecr create-repository --repository-name github-issue-summarization --region us-east-1
docker build --force-rm=true -t "$ACCOUNTID.dkr.ecr.us-east-1.amazonaws.com/github-issue-summarization:latest" .
docker push "$ACCOUNTID.dkr.ecr.us-east-1.amazonaws.com/github-issue-summarization:latest"
# serve the model
# go back to the kubeflow_ks_app directory first
ks generate seldon seldon --name=seldon
ks apply default -c seldon
# ensure the seldon cluster manager is up and running
kubectl get pods -n "${NAMESPACE}" | grep seldon-cluster-manager
# Deploy the trained image as a seldon deployment.
# NOTE(fix): the image was pushed with tag :latest above, so reference
# :latest here — the original referenced :0.1, a tag that was never pushed,
# which would leave the pods in ImagePullBackOff.
ks generate seldon-serve-simple issue-summarization-model-serving \
  --name=issue-summarization \
  --image=$ACCOUNTID.dkr.ecr.us-east-1.amazonaws.com/github-issue-summarization:latest \
  --replicas=3
ks apply default -c issue-summarization-model-serving
# check that we can access the model API (ambassador gateway on localhost:8081)
kubectl port-forward svc/ambassador -n "${NAMESPACE}" 8081:80
# try to predict a summary for an issue body
curl -X POST -H 'Content-Type: application/json' -d '{"data":{"ndarray":[[".pyenv/versions/2.7.13/envs/ENV2/lib/python2.7/site-packages/keystoneauth1/adapter.py:136: UserWarning: Using keystoneclient sessions has been deprecated. Please update your software to use keystoneauth1. warnings.warn(Using keystoneclient sessions has been deprecated. Determining IP Address to use with a ping test. Checking ... IP to be used is: INFO: Connecting to Instance at IP: Warning: Identity file gvonlasz not accessible: No such file or directory."]]}}' http://localhost:8081/seldon/issue-summarization/api/v0.1/predictions
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment