Skip to content

Instantly share code, notes, and snippets.

@zkan
Forked from jitsejan/Dockerfile
Created February 5, 2022 10:03
Show Gist options
  • Save zkan/eb14b8d5772156bbfe0cd58c2f79e1d0 to your computer and use it in GitHub Desktop.
PySpark, Docker and S3
from pyspark import SparkContext, SparkConf, SQLContext

# Spark configuration enabling the s3a filesystem to read from S3 with
# signature-v4 authentication.
# NOTE(review): `profile_info` must already be defined (see the ConfigParser
# snippet that reads /home/jovyan/.aws/credentials) before this runs.
conf = (
    SparkConf()
    # Use path-style S3 URLs (bucket in the path, not the hostname).
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.access.key", profile_info.get("aws_access_key_id"))
    .set("spark.hadoop.fs.s3a.secret.key", profile_info.get("aws_secret_access_key"))
    # Region-specific endpoint, e.g. s3-eu-west-2.amazonaws.com.
    .set("spark.hadoop.fs.s3a.endpoint", f"s3-{profile_info.get('region')}.amazonaws.com")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Hadoop parses boolean config values from strings; pass lowercase
    # "true" rather than Python True, which stringifies to "True" and can
    # be rejected by case-sensitive parsers.
    .set("com.amazonaws.services.s3.enableV4", "true")
    .set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
)
from configparser import ConfigParser

# Load the AWS credentials file mounted into the container (read-only)
# and select the [prod] profile; `profile_info` is consumed by the
# SparkConf setup to authenticate against S3.
config_object = ConfigParser()
config_object.read("/home/jovyan/.aws/credentials")
profile_info = config_object["prod"]
[prod]
aws_access_key_id = xxxxxxyyyyyyy
aws_secret_access_key = zzzzzzzzyyyyyyy
region = eu-west-2
# docker-compose.yml -- single-service stack running the PySpark notebook.
# NOTE(review): YAML nesting restored; the original paste had lost all
# indentation, which makes the file invalid.
version: '3'
services:
  jitsejan-pyspark:
    # Root + GRANT_SUDO lets the jovyan user sudo inside the container,
    # as supported by the jupyter docker-stacks base image.
    user: root
    privileged: true
    image: jitsejan/pyspark-notebook
    restart: always
    volumes:
      - ./notebooks:/opt/notebooks
      - ./data:/opt/data
      # Mount host AWS credentials read-only so Spark can reach S3.
      - $HOME/.aws/credentials:/home/jovyan/.aws/credentials:ro
    environment:
      - GRANT_SUDO=yes
    ports:
      # Must match c.NotebookApp.port in jupyter_notebook_config.py.
      - "8488:8488"
# PySpark notebook image with S3 support, NLTK data and notebook extensions.
FROM jupyter/pyspark-notebook

USER root

# Add essential packages; clean the apt lists in the same layer so the
# cache does not bloat the image.
RUN apt-get update \
    && apt-get install -y build-essential curl git gnupg2 nano apt-transport-https software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Set locale to en_US.UTF-8.
RUN apt-get update \
    && apt-get install -y locales \
    && echo "en_US.UTF-8 UTF-8" > /etc/locale.gen \
    && locale-gen \
    && rm -rf /var/lib/apt/lists/*

# Add config to Jupyter notebook.
COPY jupyter/jupyter_notebook_config.py /home/jovyan/.jupyter/
# NOTE(review): 777 is overly permissive; chown -R to the notebook user
# would be tighter, but kept for compatibility with the original image.
RUN chmod -R 777 /home/jovyan/

# Spark S3 libraries: hadoop-aws 2.7.3 must be paired with
# aws-java-sdk 1.7.4 (its compile-time dependency).
RUN wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -P $SPARK_HOME/jars/ \
    && wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar -P $SPARK_HOME/jars/

USER $NB_USER

# Install Python requirements; skip pip's download cache to keep the layer small.
COPY requirements.txt /home/jovyan/
RUN pip install --no-cache-dir -r /home/jovyan/requirements.txt

# Pre-download the popular NLTK corpora so notebooks work without network access.
RUN python -c "import nltk; nltk.download('popular')"

# Custom styling.
RUN mkdir -p /home/jovyan/.jupyter/custom
COPY custom/custom.css /home/jovyan/.jupyter/custom/

# Notebook extensions.
RUN jupyter contrib nbextension install --user \
    && jupyter nbextensions_configurator enable --user

# Run the notebook.
CMD ["/opt/conda/bin/jupyter", "lab", "--allow-root"]
# jupyter_notebook_config.py -- server settings for the containerized notebook.
c = get_config()
# Display every expression result in a cell, not only the last one.
c.InteractiveShell.ast_node_interactivity = "all"
# NOTE(review): '*' allows any origin and listens on all interfaces --
# acceptable for a local Docker setup, unsafe if the port is exposed publicly.
c.NotebookApp.allow_origin = '*'
c.NotebookApp.ip = '*'
# Serve notebooks from the volume mounted at /opt/notebooks (see compose file).
c.NotebookApp.notebook_dir = '/opt/notebooks/'
c.NotebookApp.open_browser = False
# Placeholder password hash -- replace with the output of
# `jupyter notebook password` before real use.
c.NotebookApp.password = u'sha1:a123:345345'
# Must match the port published by docker-compose ("8488:8488").
c.NotebookApp.port = 8488
# Reuse an existing SparkContext or create one from `conf`.
# BUG FIX: the original `SparkContext(conf=conf).getOrCreate()` invokes the
# constructor first, which raises "ValueError: Cannot run multiple
# SparkContexts at once" whenever a context already exists; the classmethod
# form below is the intended idiom.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

# Read the parquet dataset directly from S3 through the s3a filesystem.
df = sqlContext.read.parquet("s3a://datalake/warehouse/platform/company_list/")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment