Dockerfile for running pyspark and python3
FROM openjdk:8
# python:3 doesn't include Java, so this starts from openjdk instead.
# ==> openjdk:8 ships Java 1.8 and is a Debian-based image.
# Far easier to start from openjdk:8 than from python:3 and apt-get install default-jdk.
WORKDIR /usr/src/app
# first, get these jars into ./lib so the COPY below picks them up (fetch commands after the listing)
# ~/thisGist/lib: tony$ ls -al
# total 23656
# drwxr-xr-x 6 tony staff 204 Jul 26 12:36 .
# drwxr-xr-x 26 tony staff 884 Jul 29 11:25 ..
# -rw-r--r-- 1 tony staff 11948376 Jul 26 12:36 aws-java-sdk-1.7.4.jar
# -rw-r--r--@ 1 tony staff 126287 Feb 22 14:51 hadoop-aws-2.7.3.jar
# -rw-r--r--@ 1 tony staff 26012 Feb 22 14:53 hadoop-client-2.7.3.jar
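# if you don't already have the jars locally, one way to fetch them is from Maven
# Central (URLs assume the standard repo layout for these versions):
#   wget -P ./lib https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar
#   wget -P ./lib https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar
#   wget -P ./lib https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.jar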
COPY ./lib/*.jar ./lib/
# library-specific requirements for the python environment
COPY requirements.txt .
# /root is ~ inside the container
# copy the s3 keys into the default aws paths so botocore/boto loads them automatically
COPY ./conf/aws/credentials /root/.aws/credentials
COPY ./conf/aws/config /root/.aws/config
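# for reference, botocore expects the standard AWS INI layout in those files
# (values below are placeholders, not real keys; the region is just an example):
#   conf/aws/credentials:
#     [default]
#     aws_access_key_id = <your key id>
#     aws_secret_access_key = <your secret key>
#   conf/aws/config:
#     [default]
#     region = us-east-1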
# debian -- make sure the bullseye repo is in sources.list before apt-get update
# note -- the three jar files get copied straight into pyspark's jars directory so you don't have to deal with classpath settings
RUN echo "deb http://deb.debian.org/debian bullseye main" >> /etc/apt/sources.list &&\
    apt-get update &&\
    apt-get install -y python3.7 python3-pip libsnappy-dev curl vim wget &&\
    pip3 install --upgrade pip &&\
    pip3 install -r requirements.txt &&\
    pip3 install awscli python-snappy pyspark botocore ipython &&\
    cp /usr/src/app/lib/* /usr/local/lib/python3.7/dist-packages/pyspark/jars/
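# optional sanity check (left commented out): confirm pyspark imports and the aws jars landed
# RUN python3 -c "import pyspark; print(pyspark.__version__)" &&\
#     ls /usr/local/lib/python3.7/dist-packages/pyspark/jars/ | grep aws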
## configure or preload modules if necessary
# RUN python3 -m spacy download en
# RUN python3 -W ignore -m nltk.downloader vader_lexicon
# point pyspark at python3 inside the container so the `pyspark` shell runs on python3
ENV PYSPARK_PYTHON=/usr/bin/python3
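# if you'd rather the `pyspark` shell drop you into ipython, pyspark's standard
# env var for that is (optional):
# ENV PYSPARK_DRIVER_PYTHON=ipython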
# copy source files into the docker image (uncomment when src/ exists)
#COPY src/ ./src
## --- END DOCKERFILE -----
## --- RUNNING DOCKER -----
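# --- build the image first (the tag name here is just an example)
# docker build -t pyspark-py3 .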
# --- run the image and pass it a python file from src
# docker run {docker image name} /usr/local/bin/spark-submit ./src/{pythonFileNameFromSrcDirectory}
# -- or run it interactively and work inside the container
# docker run -it --entrypoint=/bin/bash {docker image name}
# -- run `pyspark` inside the container once it launches (not python3, not ipython, but pyspark)
# then paste in the python code below
#
# ---- PYTHON CODE ---------
# from pyspark.sql import SparkSession
# import botocore.session
#
# session = botocore.session.get_session()
# credentials = session.get_credentials()
#
# spark = (
#     SparkSession
#     .builder
#     .config('fs.s3a.access.key', credentials.access_key)
#     .config('fs.s3a.secret.key', credentials.secret_key)
#     .appName("cluster")
#     .getOrCreate()
# )
#
# df = spark.read.option("header", "true").csv('s3a://bucket/tfraser/household.csv')
# df.printSchema()
# ----------
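# the same s3a credentials can also be supplied at submit time rather than in code,
# using standard spark-submit --conf flags (the script path below is a placeholder):
# spark-submit --conf spark.hadoop.fs.s3a.access.key=... --conf spark.hadoop.fs.s3a.secret.key=... ./src/yourScript.py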
# my specific requirements.txt
# Cython
# cachetools>=2.0.0
# cytoolz>=0.8.0
# fastparquet>=0.1.4
# #gensim>=3.4.0
# gensim>=3.7.3
# nltk>=3.2.5
# numpy>=1.15.4,<2.0.0
# pandas>=0.22.0
# python-levenshtein>=0.12.0
# requests>=2.10.0
# s3fs>=0.1.4
# scikit-learn>=0.17.0
# scipy>=1.1.0
# spacy>=2.1.3
# tqdm>=4.11.1
# unidecode>=0.04.19
# pycorenlp