- sudo yum install java-1.8.0-openjdk
- java -version
- sudo /usr/sbin/alternatives --config java
- Choose >> /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.191.b12-1.el7_6.x86_64
- nano ~/.bashrc
- Append the following line and save:
- export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.191.b12-1.el7_6.x86_64/jre"
- source ~/.bashrc
- echo $JAVA_HOME
- Create a folder (named `spark` so it matches the SPARK_HOME path below)
- mkdir spark
- Download PySpark (use wget, curl or aria2 to download the file)
- Extract files
- tar -zxvf spark-2.3.3-bin-hadoop2.7.tgz
- Append the following line to the ~/.bashrc file:
- export SPARK_HOME="/home/[yourusername]/spark/spark-2.3.3-bin-hadoop2.7"
- export PATH="$SPARK_HOME/bin:$PATH"
- source ~/.bashrc
- Check PySpark and test in shell
- pyspark
- sc
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName('pyspark')
sc = SparkContext(conf=conf)
and test
sc
- nano ~/.bashrc
- Append the following lines:
export PYSPARK_SUBMIT_ARGS="pyspark-shell"
export PYSPARK_DRIVER_PYTHON=ipython
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
- source ~/.bashrc
- Run Jupyter Notebook with the following command:
pyspark