Last active
January 30, 2020 06:50
-
-
Save Detoo/2e311198f7a217e969385143b92794ee to your computer and use it in GitHub Desktop.
Minimum-Viable Spark Notebook using Jupyter+Almond
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Minimum-Viable Spark Notebook"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extra Repositories\n",
    "Add extra repositories here.\n",
    "\n",
    "For this demo we add the repo for play-geojson."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import coursier.maven.MavenRepository\n",
    "\n",
    "interp.repositories() ++= Seq(\n",
    "  MavenRepository(\"https://dl.bintray.com/jroper/maven\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download and Import Packages\n",
    "Ammonite's magic import allows us to import Ivy dependencies from Maven Central (or from other repos we added above)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:2.4.2`\n",
    "import $ivy.`com.typesafe.play::play-json:2.7.3`\n",
    "import $ivy.`au.id.jazzy::play-geojson:1.6.0`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Start Spark Session"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import org.apache.spark.sql.SparkSession\n",
    "import org.apache.spark.sql.functions._\n",
    "\n",
    "val spark = SparkSession\n",
    "  .builder()\n",
    "  .appName(\"MvSparkNotebook\")\n",
    "  .config(\"spark.master\", \"local\")\n",
    "  .getOrCreate()\n",
    "\n",
    "import spark.implicits._"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load DataSet\n",
    "Synthesize and load a GeoJson feature with two points."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- geometry: struct (nullable = true)\n",
      " |    |-- coordinates: array (nullable = true)\n",
      " |    |    |-- element: array (containsNull = true)\n",
      " |    |    |    |-- element: long (containsNull = true)\n",
      " |    |-- type: string (nullable = true)\n",
      " |-- type: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import au.id.jazzy.play.geojson.{ Feature, MultiPoint, LatLng }\n",
    "import play.api.libs.json.Json\n",
    "\n",
    "val df = spark.read.json(\n",
    "  spark.createDataset(\n",
    "    Json.stringify(Json.toJson(\n",
    "      Feature(\n",
    "        MultiPoint(collection.immutable.Seq(LatLng(0.0, 0.0), LatLng(1.0, 1.0))),\n",
    "        properties = None\n",
    "      )\n",
    "    )) :: Nil\n",
    "  )\n",
    ")\n",
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Query\n",
    "Select all points in the Feature whose latitude is larger than zero."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+\n",
      "|coordinates|\n",
      "+-----------+\n",
      "|     [1, 1]|\n",
      "+-----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select($\"geometry.coordinates\").withColumn(\"coordinates\", explode($\"coordinates\"))\n",
    "  .filter(expr(\"coordinates[0]\") > 0)\n",
    "  .show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cleanup\n",
    "Stop the Spark Session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment