# srcd/engine
FROM ubuntu:16.04
ENV ENGINE_VERSION 0.1.7
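# System packages: Python 3 toolchain, image/font libraries for matplotlib and cairocffi,
# and compression libraries (lz4, lzo, snappy) needed by the Python packages installed below.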
RUN apt-get -y update && apt-get -y install wget vim git locales net-tools build-essential pkg-config cython3 python python3-dev python3-pip \
libjpeg-turbo8-dev libpng-dev libfreetype6-dev libxft-dev libyaml-dev \
libprotobuf-dev libsnappy-dev zlib1g-dev python3-cffi \
curl libcairo2 gdb python3-dbg graphviz \
lzop liblz4-dev pbzip2 liblzo2-dev \
&& apt-get clean \
&& locale-gen en_GB.UTF-8 && update-locale LANG=en_GB.UTF-8
RUN pip3 install --upgrade pip
WORKDIR /builds
# Install and run borges
RUN wget https://github.com/src-d/borges/releases/download/v0.8.3/borges_v0.8.3_linux_amd64.tar.gz && \
tar -xvf borges_v0.8.3_linux_amd64.tar.gz && \
cp borges_linux_amd64/borges /usr/local/bin
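# Fetch a sample repository list and pack the repositories into siva files the engine can read.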
RUN wget https://gist.githubusercontent.com/eiso/160b28cb38fae87ad54cb4c8e812a265/raw/11f499c07cee464685796ea1c54da8dcdda22ef1/repos.txt
RUN borges pack --file=repos.txt --to=/builds/sample-repos
# Install Java and Spark
RUN apt-get -y install openjdk-8-jre
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
RUN wget -O spark-2.2.0-bin-hadoop2.7.tgz "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/spark-2.2.0/spark-2.2.0-bin-hadoop2.7.tgz"
RUN tar -xvzf spark-2.2.0-bin-hadoop2.7.tgz
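# Put Spark on the PATH and run PySpark workers under Python 3.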
ENV PATH=${PATH}:/builds/spark-2.2.0-bin-hadoop2.7/bin
ENV SPARK_HOME /builds/spark-2.2.0-bin-hadoop2.7
ENV PYSPARK_PYTHON python3
# Git clone and install the engine
RUN git clone https://github.com/src-d/engine.git
WORKDIR /builds/engine
RUN git checkout tags/v$ENGINE_VERSION
RUN ./sbt publishLocal
WORKDIR /builds
RUN pip3 install sourced-engine==$ENGINE_VERSION
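# Spark config keys and the default Babelfish (bblfshd) endpoint for UAST extraction, written to
# spark-defaults.conf below; the `bblfshd` hostname must resolve at runtime (e.g. a linked container).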
ENV SPARK_BBLFSH_HOST spark.tech.sourced.bblfsh.grpc.host
ENV SPARK_BBLFSH_PORT spark.tech.sourced.bblfsh.grpc.port
ENV BBLFSH_HOST bblfshd
ENV BBLFSH_PORT 9432
RUN printf "$SPARK_BBLFSH_HOST $BBLFSH_HOST\n$SPARK_BBLFSH_PORT $BBLFSH_PORT" > /builds/spark-2.2.0-bin-hadoop2.7/conf/spark-defaults.conf
# Adding Jupyter Notebook support
RUN pip3 install jupyter
RUN pip3 install jupyter-spark \
&& jupyter serverextension enable --py jupyter_spark \
&& jupyter nbextension install --py jupyter_spark \
&& jupyter nbextension enable --py jupyter_spark \
&& jupyter nbextension enable --py widgetsnbextension
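# Have pyspark pull in the engine JAR published to the local ivy repository by `sbt publishLocal` above.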
ENV PYSPARK_SUBMIT_ARGS --packages tech.sourced:engine:$ENGINE_VERSION pyspark-shell
# Adding data science standard packages
RUN pip3 install PyYAML \
PyGitHub \
requests \
numpy \
scipy \
pandas \
spacy \
matplotlib \
cairocffi \
networkx \
pydot \
lz4 \
python-lzo \
protobuf \
parquet
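# Import once at build time: verifies the headless Agg backend works and pre-builds the matplotlib font cache.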
RUN python3 -c "import matplotlib; matplotlib.use('Agg'); import matplotlib.pyplot"
# Wrapper scripts in /builds (prepended to PATH below), since shell aliases don't work with `docker exec`.
# $SPARK_HOME and $ENGINE_VERSION expand at container runtime from the ENV values above; pyspark and
# spark-shell live in the Spark distribution, not /usr/local/bin.
RUN echo '#! /bin/sh' > /builds/pyspark && \
echo 'exec "$SPARK_HOME/bin/pyspark" --packages "tech.sourced:engine:${ENGINE_VERSION}" "$@"' >> /builds/pyspark && \
chmod u+x /builds/pyspark
RUN echo '#! /bin/sh' > /builds/spark-shell && \
echo 'exec "$SPARK_HOME/bin/spark-shell" --packages "tech.sourced:engine:${ENGINE_VERSION}" "$@"' >> /builds/spark-shell && \
chmod u+x /builds/spark-shell
RUN echo '#! /bin/sh' > /builds/jupyter && \
echo 'exec /usr/local/bin/jupyter notebook --ip=0.0.0.0 --allow-root "$@"' >> /builds/jupyter && \
chmod u+x /builds/jupyter
ENV PATH=/builds:${PATH}
WORKDIR /sourced
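# At runtime: seed /sourced (typically a bind-mounted volume) with the sample siva files and example
# notebooks, pack any user-supplied /sourced/repos.txt, then idle so the container stays up for `docker exec`.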
CMD mv /builds/repos.txt /sourced/sample-repos.txt && \
mv /builds/sample-repos /sourced/ && \
cp /builds/engine/_examples/notebooks/* /sourced/ && \
if [ -f /sourced/repos.txt ]; then borges pack --file=/sourced/repos.txt --to=/sourced/user-repos; fi && \
sleep infinity
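# Example usage (a sketch, not part of the original gist: the image tag, container names, and the
# bind-mount path are assumptions; bblfshd must be reachable under the `bblfshd` hostname set above):
#   docker build -t srcd/engine-jupyter .
#   docker run -d --name bblfshd --privileged bblfsh/bblfshd
#   docker run -d --name engine --link bblfshd -p 8888:8888 -v "$(pwd)":/sourced srcd/engine-jupyter
#   docker exec -it engine jupyter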