#-----------------------------------------------------------------------------------------------
#copy emr conf
#-----------------------------------------------------------------------------------------------
emr_ip=10.135.241.137
sudo rm -rf /etc/yum.repos.d/emr-*.repo
sudo rm -rf /var/aws/emr/repoPublicKey.txt
sudo mkdir -p /var/aws/emr/
sudo chmod +r -R /var/aws/
sudo rm -rf /etc/spark/
sudo rm -rf /etc/hadoop/
sudo scp -i xxx.pem hadoop@${emr_ip}:/etc/yum.repos.d/emr-*.repo /etc/yum.repos.d/
sudo scp -i xxx.pem hadoop@${emr_ip}:/var/aws/emr/repoPublicKey.txt /var/aws/emr/
sudo scp -rp -i xxx.pem hadoop@${emr_ip}:/etc/spark/ /etc/spark/
sudo scp -rp -i xxx.pem hadoop@${emr_ip}:/etc/hadoop/ /etc/hadoop/
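#optional sanity check (not in the original notes): confirm the copied client config points
#at the EMR master before going further
grep -A1 fs.defaultFS /etc/hadoop/conf/core-site.xml
grep -A1 yarn.resourcemanager.hostname /etc/hadoop/conf/yarn-site.xml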
#-----------------------------------------------------------------------------------------------
#install emr binaries
#-----------------------------------------------------------------------------------------------
sudo mkdir -p /var/aws/emr/
sudo mkdir -p /etc/hadoop/conf
sudo mkdir -p /etc/spark/conf
sudo mkdir -p /var/log/spark/user/
sudo mkdir -p /mnt/s3
sudo mkdir -p /mnt/var/lib/hadoop/tmp
sudo chmod 777 -R /var/log/spark/
sudo chmod 777 -R /mnt/s3
sudo chmod 777 -R /mnt/var/lib/hadoop/tmp
sudo yum install -y hadoop-client
sudo yum install -y hadoop-hdfs
sudo yum install -y hadoop-lzo
sudo yum install -y aws-java-sdk
sudo yum install -y aws-sagemaker-spark-sdk
sudo yum install -y emr-ddb
sudo yum install -y emr-ddb-hadoop
sudo yum install -y emr-ddb-hive
sudo yum install -y emr-goodies
sudo yum install -y emr-goodies-hadoop
sudo yum install -y emr-goodies-hive
sudo yum install -y emr-goodies-parquet
sudo yum install -y emr-goodies-spark
sudo yum install -y emr-s3-select
sudo yum install -y emrfs
sudo yum install -y livy
sudo yum install -y spark-core
sudo yum install -y spark-datanucleus
sudo yum install -y spark-external
sudo yum install -y spark-history-server
sudo yum install -y spark-python
sudo yum install -y git
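#quick check (not in the original notes): the EMR packages should put the hadoop and spark
#clients on the PATH, matching the cluster versions
hadoop version
spark-submit --version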
#-----------------------------------------------------------------------------------------------
#install anaconda and create conda env
#-----------------------------------------------------------------------------------------------
wget https://repo.anaconda.com/archive/Anaconda3-2019.10-Linux-x86_64.sh
chmod +x Anaconda3-2019.10-Linux-x86_64.sh
sudo ./Anaconda3-2019.10-Linux-x86_64.sh -b -p /anaconda3
rm -rf Anaconda3-2019.10-Linux-x86_64.sh
sudo /anaconda3/bin/conda install -y -c conda-forge jupyterhub
sudo /anaconda3/bin/pip install jupyter
sudo /anaconda3/bin/pip install jupyterlab
sudo /anaconda3/bin/pip install jupyterhub
sudo /anaconda3/bin/pip install findspark
sudo /anaconda3/bin/pip install pandas
sudo /anaconda3/bin/pip install pyarrow
sudo /anaconda3/bin/pip install pandas-profiling
sudo /anaconda3/bin/pip install s3contents
sudo /anaconda3/bin/pip install hybridcontents
sudo /anaconda3/bin/pip install jupyter-server-proxy
sudo /anaconda3/bin/pip install ipykernel
sudo /anaconda3/bin/jupyter serverextension enable --sys-prefix jupyter_server_proxy
sudo /anaconda3/bin/conda init
sudo /anaconda3/bin/conda create -y -n xxxx python=3.6
sudo /anaconda3/bin/conda create -y -n yyyy python=3.6
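#sanity check (not in the original notes): list the environments that were just created
/anaconda3/bin/conda env list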
#-----------------------------------------------------------------------------------------------
#start jupyter hub and create kernels
#-----------------------------------------------------------------------------------------------
sudo su -
conda activate xxxx
pip install ipykernel
python -m ipykernel install --name xxxx --display-name "xxxx"
conda deactivate
conda activate yyyy
pip install ipykernel
python -m ipykernel install --name yyyy --display-name "yyyy"
sudo chmod 777 -R /anaconda3
sudo adduser jupyter
sudo passwd jupyter
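#the section title mentions starting jupyterhub, but the original notes do not show the
#launch command; a minimal sketch, assuming the default PAM authenticator and port 8000
#(run it after the notebook/S3 config in the next section is in place):
sudo /anaconda3/bin/jupyterhub --ip 0.0.0.0 --port 8000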
#-----------------------------------------------------------------------------------------------
#configure jupyter notebook-s3 storage
#-----------------------------------------------------------------------------------------------
sudo su - jupyter
#create file: /home/jupyter/.jupyter/jupyter_notebook_config.py
from s3contents import S3ContentsManager
from hybridcontents import HybridContentsManager
#from IPython.html.services.contents.filemanager import FileContentsManager
from notebook.services.contents.filemanager import FileContentsManager

c = get_config()
c.NotebookApp.contents_manager_class = HybridContentsManager
c.HybridContentsManager.manager_classes = {
    # Associate the root directory with an S3ContentsManager.
    # This manager will receive all requests that don't fall under any of the
    # other managers.
    "": S3ContentsManager,
    # Associate /xx_local_directory with a FileContentsManager.
    "xx_local_directory": FileContentsManager,
}
c.HybridContentsManager.manager_kwargs = {
    # Args for the root S3ContentsManager.
    "": {
        "bucket": "xxxxx",
        "prefix": "jupyter_notebooks",
        "sse": "AES256",
        "signature_version": "s3v4",
    },
    # Args for the FileContentsManager mapped to /xx_local_directory.
    "xx_local_directory": {
        "root_dir": "/home/jupyter",
    },
}
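#back in the shell, an optional check (a sketch; "xxxxx" is the same bucket placeholder as
#above): confirm the instance credentials can reach the notebook bucket before starting the server
aws s3 ls s3://xxxxx/jupyter_notebooks/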
#-----------------------------------------------------------------------------------------------
#setup gpu and add kernel
#-----------------------------------------------------------------------------------------------
sudo wget http://us.download.nvidia.com/tesla/440.64.00/NVIDIA-Linux-x86_64-440.64.00.run -O /root/nvidia_install.run
sudo yum groupinstall -y "Development Tools"
sudo yum install -y kernel-tools kernel-tools-devel kernel kernel-devel
sudo yum install -y kernel-devel-$(uname -r)
sudo chmod +x /root/nvidia_install.run
sudo /root/nvidia_install.run
sudo /anaconda3/bin/conda create -y -n gpu python=3.7
sudo su -
conda activate gpu
conda install -c conda-forge pytorch cudatoolkit=10.1
pip install ipykernel
python -m ipykernel install --name gpu --display-name "gpu"
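#verification (a sketch, not part of the original notes): confirm the driver loaded and the
#gpu env can see CUDA
nvidia-smi
/anaconda3/envs/gpu/bin/python -c "import torch; print(torch.cuda.is_available())"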
#-----------------------------------------------------------------------------------------------
#/etc/hadoop/conf/yarn-site.xml (add the following properties inside the existing <configuration> element)
#-----------------------------------------------------------------------------------------------
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
<name>yarn.scheduler.fair.user-as-default-queue</name>
<value>false</value>
</property>
<property>
<name>yarn.scheduler.fair.preemption</name>
<value>true</value>
</property>
<property>
<name>yarn.scheduler.fair.preemption.cluster-utilization-threshold</name>
<value>0.3f</value>
</property>
<property>
<name>yarn.scheduler.fair.waitTimeBeforeNextStarvationCheck</name>
<value>3000</value>
</property>
<property>
<name>yarn.scheduler.fair.waitTimeBeforeKill</name>
<value>3000</value>
</property>
#-----------------------------------------------------------------------------------------------
#/etc/hadoop/conf/fair-scheduler.xml
#-----------------------------------------------------------------------------------------------
<?xml version="1.0"?>
<allocations>
<queue name="root">
<minSharePreemptionTimeout>3</minSharePreemptionTimeout>
<fairSharePreemptionTimeout>3</fairSharePreemptionTimeout>
<allowPreemptionFrom>true</allowPreemptionFrom>
<minResources>100000 mb, 0 vcores</minResources>
<schedulingPolicy>fair</schedulingPolicy>
</queue>
<defaultFairSharePreemptionTimeout>3</defaultFairSharePreemptionTimeout>
<defaultMinSharePreemptionTimeout>3</defaultMinSharePreemptionTimeout>
<defaultFairSharePreemptionThreshold>0.8f</defaultFairSharePreemptionThreshold>
</allocations>
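#note (assumption, not in the original notes): fair-scheduler.xml is picked up from the
#Hadoop conf dir by default (yarn.scheduler.fair.allocation.file), and the scheduler /
#preemption settings only take effect on the node running the YARN ResourceManager, which
#must be restarted after the change; the service manager depends on the EMR release
sudo systemctl restart hadoop-yarn-resourcemanager                                 #EMR 5.30+ / 6.x
sudo stop hadoop-yarn-resourcemanager && sudo start hadoop-yarn-resourcemanager    #older EMR releases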
#-----------------------------------------------------------------------------------------------
#/etc/spark/conf/spark-defaults.conf (worker instance type = r5.2xlarge (64 GB / 8 cores))
#-----------------------------------------------------------------------------------------------
spark.executor.memory 12000M
spark.executor.cores 2
spark.yarn.executor.memoryOverheadFactor 0.1875
spark.driver.memory 8g
spark.driver.maxResultSize 16g
spark.driver.memoryOverhead 4g
spark.dynamicAllocation.executorIdleTimeout 30s
spark.dynamicAllocation.cachedExecutorIdleTimeout 30s
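#smoke test (a sketch; the examples jar path is an assumption about the EMR layout): submit
#SparkPi to YARN to confirm the edge node can reach the cluster with these defaults
spark-submit --master yarn --deploy-mode client \
  --class org.apache.spark.examples.SparkPi \
  /usr/lib/spark/examples/jars/spark-examples.jar 10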
#-----------------------------------------------------------------------------------------------
#git setup
#-----------------------------------------------------------------------------------------------
git config --global credential.helper store
git clone https://github.com/xxxx
#username and password (or token) typed at the clone prompt; with credential.helper store
#they are saved in plain text in ~/.git-credentials
guilherme.braccialli@xxxxxx.com
xxxxxxxx