Skip to content

Instantly share code, notes, and snippets.

@vpipkt
Created May 24, 2018 15:31
Show Gist options
  • Save vpipkt/9c5083a56cc4373a5c51aa8d2c585e92 to your computer and use it in GitHub Desktop.
Save vpipkt/9c5083a56cc4373a5c51aa8d2c585e92 to your computer and use it in GitHub Desktop.
GeoJSON schema inference with RasterFrames
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Getting Started\n",
"## Initialize the SparkSession with RasterFrames support"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/plain": [
"Intitializing Scala interpreter ..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Spark Web UI available at http://172.17.0.2:4040\n",
"SparkContext available as 'sc' (version = 2.2.0, master = local[*], app id = local-1527174665310)\n",
"SparkSession available as 'spark'\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import astraea.spark.rasterframes._\n",
"import geotrellis.raster._\n",
"import geotrellis.spark.io.kryo.KryoRegistrator\n",
"import org.apache.spark.serializer.KryoSerializer\n",
"import org.apache.spark.sql._\n",
"import org.apache.spark.sql.functions._\n",
"import astraea.spark.rasterframes.datasource.geotrellis._\n",
"import astraea.spark.rasterframes.datasource.geotiff._\n",
"import astraea.spark.rasterframes.experimental.datasource.geojson._\n",
"implicit val spark = SparkSession.builder()\n",
" .master(\"local\")\n",
" .appName(\"rasterframes-skylon\")\n",
" .config(\"spark.executor.memory\", \"8g\") \n",
" .config(\"spark.executor.cores\", 3) \n",
" .config(\"spark.driver.memory\",\"8g\") \n",
" .config(\"spark.serializer\", classOf[KryoSerializer].getName)\n",
" .config(\"spark.kryoserializer.buffer.max\", \"500m\")\n",
" .config(\"spark.kryo.registrationRequired\", \"false\")\n",
" .config(\"spark.kryo.registrator\", classOf[KryoRegistrator].getName)\n",
" .getOrCreate()\n",
" .withRasterFrames\n",
"spark.sparkContext.setLogLevel(\"ERROR\")\n",
"import spark.implicits._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Play with Sample Data\n",
"\n",
"DC Metro stations (GeoJSON sample data from Mapbox), used below to compare reading GeoJSON with and without schema inference."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2018-05-24 15:17:09-- https://www.mapbox.com/help/data/stations.geojson\n",
"Resolving www.mapbox.com (www.mapbox.com)... 151.101.200.143\n",
"Connecting to www.mapbox.com (www.mapbox.com)|151.101.200.143|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 29920 (29K) [binary/octet-stream]\n",
"Saving to: ‘stations.geojson’\n",
"\n",
"stations.geojson 100%[===================>] 29.22K --.-KB/s in 0.01s \n",
"\n",
"2018-05-24 15:17:09 (2.01 MB/s) - ‘stations.geojson’ saved [29920/29920]\n",
"\n"
]
}
],
"source": [
"!wget https://www.mapbox.com/help/data/stations.geojson"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"examplePath: String = stations.geojson\n"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val examplePath = \"stations.geojson\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"import astraea.spark.rasterframes.experimental.datasource.geojson.DefaultSource._\n"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import astraea.spark.rasterframes.experimental.datasource.geojson.DefaultSource._"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- geometry: geometry (nullable = false)\n",
" |-- properties: map (nullable = true)\n",
" | |-- key: string\n",
" | |-- value: string (valueContainsNull = true)\n",
"\n"
]
}
],
"source": [
"spark.read\n",
" .option(INFER_SCHEMA, false)\n",
" .format(\"geojson\")\n",
" .load(examplePath)\n",
" .printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"vf: org.apache.spark.sql.DataFrame = [geometry: geometry, line: string ... 3 more fields]\n"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val vf = spark.read\n",
" .option(INFER_SCHEMA, true)\n",
" .format(\"geojson\")\n",
" .load(examplePath)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- geometry: geometry (nullable = true)\n",
" |-- line: string (nullable = true)\n",
" |-- marker-color: string (nullable = true)\n",
" |-- marker-symbol: string (nullable = true)\n",
" |-- name: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"vf.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----+------------+-------------+--------------------+\n",
"| geometry| line|marker-color|marker-symbol| name|\n",
"+--------------------+-----+------------+-------------+--------------------+\n",
"|POINT (-77.032552...|green| #008000| rail-metro| Columbia Heights|\n",
"|POINT (-76.868074...| blue| #0000ff| rail-metro| Morgan Boulevard|\n",
"|POINT (-77.120817...| red| #ff0000| rail-metro| Twinbrook|\n",
"|POINT (-76.996001...| blue| #0000ff| rail-metro| Eastern Market|\n",
"|POINT (-77.033634...| blue| #0000ff| rail-metro| McPherson Sq|\n",
"|POINT (-77.129111...| blue| #0000ff| rail-metro| Van Dorn Street|\n",
"|POINT (-76.995936...| red| #ff0000| rail-metro| Rhode Island Ave|\n",
"|POINT (-77.021914...|green| #008000| rail-metro|Mt Vernon Sq - 7t...|\n",
"|POINT (-77.039700...| red| #ff0000| rail-metro| Farragut North|\n",
"|POINT (-77.042916...| red| #ff0000| rail-metro| Forest Glen|\n",
"+--------------------+-----+------------+-------------+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"vf.show(10, true)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "spylon-kernel",
"language": "scala",
"name": "spylon-kernel"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"help_links": [
{
"text": "MetaKernel Magics",
"url": "https://github.com/calysto/metakernel/blob/master/metakernel/magics/README.md"
}
],
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "0.4.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment