Dockerfile for running pyspark and python3
FROM openjdk:8
# python:3 doesn't include Java, so this starts from openjdk instead.
# ==> openjdk:8 ships Java 1.8 and is a Debian image.
# Far easier to start from openjdk:8 than to apt-get install a JDK on a python base image.
WORKDIR /usr/src/app

# First, get these jars into the docker build context:
# ~/thisGist/lib: tony$ ls -al
# total 23656
# drwxr-xr-x   6 tony  staff       204 Jul 26 12:36 .
# drwxr-xr-x  26 tony  staff       884 Jul 29 11:25 ..
# -rw-r--r--   1 tony  staff  11948376 Jul 26 12:36 aws-java-sdk-1.7.4.jar
# -rw-r--r--@  1 tony  staff    126287 Feb 22 14:51 hadoop-aws-2.7.3.jar
# -rw-r--r--@  1 tony  staff     26012 Feb 22 14:53 hadoop-client-2.7.3.jar
COPY ./lib/*.jar ./lib/
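# If you don't already have these jars locally, one way to fetch them is from
# Maven Central (standard Maven Central layout; verify the versions you need):
#   curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar
#   curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar
#   curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.jar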
# library-specific requirements for the python config
COPY requirements.txt .

# /root is ~ inside the container
# Copy the s3 keys directly into the standard paths so boto can load them.
COPY ./conf/aws/credentials /root/.aws/credentials
COPY ./conf/aws/config /root/.aws/config
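# The two files above follow the standard AWS CLI layout; a minimal sketch
# (values here are placeholders -- substitute your own keys and region):
#   conf/aws/credentials:
#     [default]
#     aws_access_key_id = YOUR_ACCESS_KEY_ID
#     aws_secret_access_key = YOUR_SECRET_ACCESS_KEY
#   conf/aws/config:
#     [default]
#     region = us-east-1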
# debian -- make sure the bullseye repo is in the apt sources before apt-get update.
# note -- copy the three jar files directly into the spark lib directory so you don't have to deal with paths.
RUN echo deb http://deb.debian.org/debian bullseye main >> /etc/apt/sources.list &&\
    apt-get update &&\
    apt-get install python3.7 python3-pip libsnappy-dev curl vim wget -y &&\
    pip3 install --upgrade pip &&\
    pip3 install -r requirements.txt &&\
    pip3 install awscli python-snappy pyspark botocore ipython &&\
    cp /usr/src/app/lib/* /usr/local/lib/python3.7/dist-packages/pyspark/jars/.
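# Optional sanity check that the jars landed where pyspark will look (this
# assumes the same dist-packages path used in the cp above):
# RUN ls /usr/local/lib/python3.7/dist-packages/pyspark/jars/ | grep -E 'aws|hadoop'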
## configure or preload modules if necessary
# RUN python3 -m spacy download en
# RUN python3 -W ignore -m nltk.downloader vader_lexicon

# make sure pyspark uses python3 inside the container
ENV PYSPARK_PYTHON=/usr/bin/python3
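# If you'd rather land in an ipython shell when you run `pyspark`, you can also
# set the driver python (optional; PYSPARK_DRIVER_PYTHON is a standard Spark
# environment variable):
# ENV PYSPARK_DRIVER_PYTHON=ipython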
# cp source files into the docker image
# COPY src/ ./src

## --- END DOCKERFILE -----

## --- RUNNING DOCKER -----
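# --- build the image first (the tag name here is just an example)
# docker build -t pyspark-python3 .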
# --- run the image && pass in a python file name from src
# docker run {docker image name} /usr/local/bin/spark-submit ./src/{pythonFileNameFromSrcDirectory}
# -- or run interactively by launching a shell in the container
# docker run -it --entrypoint=/bin/bash {docker image name}
# -- then run `pyspark` inside the container (not python3, not ipython, but pyspark)
#    and paste the python code below into the shell
#
# ---- PYTHON CODE ---------
# from pyspark.sql import SparkSession
# import botocore.session
#
# session = botocore.session.get_session()
# credentials = session.get_credentials()
# spark = (
#     SparkSession
#     .builder
#     .config('fs.s3a.access.key', credentials.access_key)
#     .config('fs.s3a.secret.key', credentials.secret_key)
#     .appName("cluster")
#     .getOrCreate()
# )
# df = spark.read.option("header", "true").csv('s3a://bucket/tfraser/household.csv')
# df.printSchema()
# ----------
# my specific requirements.txt
# Cython
# cachetools>=2.0.0
# cytoolz>=0.8.0
# fastparquet>=0.1.4
# #gensim>=3.4.0
# gensim>=3.7.3
# nltk>=3.2.5
# numpy>=1.15.4,<2.0.0
# pandas>=0.22.0
# python-levenshtein>=0.12.0
# requests>=2.10.0
# s3fs>=0.1.4
# scikit-learn>=0.17.0
# scipy>=1.1.0
# spacy>=2.1.3
# tqdm>=4.11.1
# unidecode>=0.04.19
# pycorenlp