For most of 2019 and early 2020, this is most likely caused by not having the right Java version.
As of August 2019, the major big data frameworks are written for, and compatible with, Java 8.
Check your Java version:
$ java -version
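If a job has to enforce this at startup, a JVM-side guard is a quick extra check. A minimal sketch (the system property is standard; the object name and message wording are illustrative):

object JavaVersionCheck extends App {
  // Java 8 reports its version as "1.8.0_xxx"
  val version = System.getProperty("java.version")
  require(version.startsWith("1.8"), s"Expected Java 8, but running on Java $version")
  println(s"OK: running on Java $version")
}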
package org.anish.spark.gcissue

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.util.Random

/**
 * Code written to reproduce a GC OutOfMemory issue with Spark and Parquet. More details in my blog at <anish749.github.io>
 */
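The gist is truncated here, so as a rough sketch of the kind of job the comment describes (an assumed shape, not the original code): generating many rows of large random strings and writing them as Parquet is enough to stress the column writers, which is where GC pressure tends to show up when executor memory is tight.

object GcIssueSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GcIssueSketch").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Large random payloads are cheap to generate but expensive to buffer
    // in Parquet's column writers before a row group is flushed.
    val df: DataFrame = sc
      .parallelize(1 to 1000000)
      .map(i => (i, Random.nextString(1000)))
      .toDF("id", "payload")

    df.write.parquet("/tmp/gc_issue_sketch")
  }
}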
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsScalaMapConverter}
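This gist's doc comment is truncated in the source, so judging only from the imports, it works with Parquet file metadata through Hadoop's FileSystem API. A minimal sketch in that direction (the directory path and the footer-counting logic are assumptions, not the gist's contents) reads the footer of each Parquet file under a directory and reports its row-group count:

object ParquetFooterSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val dir = new Path("/tmp/parquet_data") // assumed input directory
    val fs = FileSystem.get(new URI(dir.toString), conf)

    fs.listStatus(dir)
      .filter(_.getPath.getName.endsWith(".parquet"))
      .foreach { status =>
        // Footers carry the schema and the row-group (block) metadata.
        val footer = ParquetFileReader.readFooter(conf, status.getPath)
        val rowGroups = footer.getBlocks.asScala.size
        println(s"${status.getPath.getName}: $rowGroups row group(s)")
      }
  }
}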
package org.anish.spark.mostcommonvalue

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

import scalaz.Scalaz._

/**
 * Spark User Defined Aggregate Function to calculate the most frequent value in a column. This is similar to
 * calculating the statistical mode of the column.
 */
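A minimal sketch of such a UDAF, assuming a String input column and using scalaz's |+| to merge the count maps (the body below is a plausible reconstruction, not the gist verbatim):

class MostCommonValue extends UserDefinedAggregateFunction {
  // One String column in; the running state is a value -> count map.
  def inputSchema: StructType = StructType(StructField("value", StringType) :: Nil)
  def bufferSchema: StructType =
    StructType(StructField("counts", MapType(StringType, LongType)) :: Nil)
  def dataType: DataType = StringType
  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit =
    buffer(0) = Map.empty[String, Long]

  def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    if (!input.isNullAt(0)) {
      // |+| from scalaz merges the maps, summing counts for shared keys.
      buffer(0) = buffer.getAs[Map[String, Long]](0) |+| Map(input.getString(0) -> 1L)
    }

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    buffer1(0) = buffer1.getAs[Map[String, Long]](0) |+| buffer2.getAs[Map[String, Long]](0)

  def evaluate(buffer: Row): Any =
    buffer.getAs[Map[String, Long]](0).maxBy(_._2)._1 // key with the highest count
}

Used inline, e.g. df.groupBy(col("k")).agg(new MostCommonValue().apply(col("v"))), it returns the modal value per group.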
find . -type f -name '*.java' -exec sed -i '' 's/find/replace/g' {} +

# Find files (-type f)
# with the name *.java
# and execute sed -i (in place)
# '' means no backup files are created
# 's/find/replace/g' is the sed command that does the actual replacement
# {} + means the output of find is concatenated and fed to sed
package org.anish.akka

import akka.actor.{Actor, ActorLogging, ActorSystem, Props}
import akka.http.scaladsl.Http
import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._
import akka.http.scaladsl.model.StatusCodes
import akka.http.scaladsl.server.Directives._
import akka.pattern.ask
import akka.stream.ActorMaterializer
import akka.util.Timeout
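The gist is cut off after its imports; continuing from them, a minimal sketch of the kind of server they set up (route, port, and names are illustrative, assuming Akka HTTP 10.1.x with the classic ActorSystem):

object MinimalHttpServer extends App {
  implicit val system: ActorSystem = ActorSystem("minimal-http-server")
  implicit val materializer: ActorMaterializer = ActorMaterializer()

  // A single health-check route; real handlers would `ask` an Actor
  // and marshal the reply with spray-json, per the imports above.
  val route =
    path("health") {
      get {
        complete(StatusCodes.OK)
      }
    }

  Http().bindAndHandle(route, "localhost", 8080)
  println("Server online at http://localhost:8080/health")
}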
package org.anish.spark.skew

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{Dataset, SaveMode}

/**
 * A few utility functions for extending Spark Datasets to explore partitions.
 * Created by anish on 22/10/17.
 */
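The utilities themselves are truncated; one plausible sketch of such a helper (the name countPerPartition and the implicit-class shape are assumptions) counts records per partition so skew is visible at a glance:

object SkewSketch {
  implicit class PartitionOps[T](ds: Dataset[T]) {
    // Size of each partition; a few huge values among many small ones = skew.
    def countPerPartition: Array[Long] =
      ds.rdd
        .mapPartitions(it => Iterator.single(it.size.toLong))
        .collect()
  }
}

With the implicit in scope, println(ds.countPerPartition.sorted.mkString(", ")) makes a lopsided partitioning obvious immediately.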
import com.twitter.algebird.{Aggregator, BF, BFHash, Hash128}

import scala.collection.mutable

// An Algebird immutable Bloom Filter Aggregator backed by a private mutable.BitSet.
final case class FBloomFilterAggregator[A](numHashes: Int, width: Int)(
  implicit hash: Hash128[A]
) extends Aggregator[A, mutable.BitSet, BF[A]] {
  private val hasher = BFHash(numHashes, width)(hash)

  def prepare(value: A): mutable.BitSet = {
    val hashes = hasher(value)
    val b = new mutable.BitSet()
    hashes.foreach(b += _) // set one bit per hash position for this value
    b
  }
package qscio

import java.util.UUID

import com.spotify.scio.values.SCollection
import com.spotify.scio.{ScioContext, ScioMetrics, ScioResult}
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.beam.runners.dataflow.DataflowRunner
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions
import org.apache.beam.runners.direct.DirectRunner
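Again truncated at the imports. A minimal sketch of a Scio pipeline behind them, assuming a pre-0.8 Scio where sc.close() returns a ScioResult; the job content and metric names are illustrative, and the Dataflow imports suggest the real gist switches between DirectRunner and DataflowRunner via pipeline options:

object ScioSketch {
  def main(args: Array[String]): Unit = {
    val sc = ScioContext() // falls back to the DirectRunner when nothing is configured
    val processed = ScioMetrics.counter("qscio", "processedLines")

    val lineLengths: SCollection[Int] =
      sc.parallelize(Seq("a b c", "d e"))
        .map { line => processed.inc(); line.split(" ").length }

    lineLengths.debug() // prints elements as they flow through

    val result: ScioResult = sc.close()
  }
}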
Set PySpark to use Python 3 on AWS EMR:
$ sudo sed -i -e '$a\export PYSPARK_PYTHON=/usr/bin/python3' /etc/spark/conf/spark-env.sh
Install boto3 if needed:
$ sudo python3 -m pip install boto3