Skip to content

Instantly share code, notes, and snippets.

Created June 24, 2015 20:48
Show Gist options
  • Save anonymous/578385766261d6fa7196 to your computer and use it in GitHub Desktop.
Save anonymous/578385766261d6fa7196 to your computer and use it in GitHub Desktop.
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import com.typesafe.config.Config
import spark.jobserver.NamedRddSupport
import spark.jobserver.SparkHiveJob
import spark.jobserver.SparkJob
import spark.jobserver.SparkJobValid
import spark.jobserver.SparkJobValidation
/**
 * Example Spark Job using spark-jobserver.
 *
 * Shares an RDD / DataFrame via [[SparkHiveJob]], leaving it cached in memory
 * so that subsequent jobs submitted to the same long-running context can
 * query it without re-reading from Hive.
 *
 * @author John Muller
 */
object ExampleNamedDF extends SparkHiveJob {
  /**
   * Entry point invoked by spark-jobserver when this job is submitted.
   *
   * @param sqlContext the shared HiveContext for the job-server context
   * @param jobConfig  per-job configuration (unused here)
   * @return nothing meaningful; the side effect is the cached table
   */
  override def runJob(sqlContext: HiveContext, jobConfig: Config): Any = {
    // Select some stuff from Hive.
    val merckDataFrame = sqlContext.sql("SELECT a, b, c FROM TableA")
    // FIX: the DataFrame must be registered under the name "CACHED_DATA"
    // before cacheTable can resolve it. The original code called
    // cacheTable("CACHED_DATA") without ever registering that table,
    // which fails with "Table not found: CACHED_DATA".
    merckDataFrame.registerTempTable("CACHED_DATA")
    // Leave the DataFrame cached in memory for later jobs.
    sqlContext.cacheTable("CACHED_DATA")
    // The next SparkHiveJob in the same context can access the in-memory
    // DataFrame with:
    //   sqlContext.sql("SELECT * FROM CACHED_DATA")
  }

  // Dummy validation for this job: always reports the job as valid.
  override def validate(sqlContext: HiveContext, config: Config): SparkJobValidation = SparkJobValid
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment