metasim/rf-kmeans.ipynb

## rf-kmeans.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Perform K-means Clustering on Multiband MODIS Imagery with RasterFrames\n",
    "\n",
    "First, import the library components and initialize a Spark session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyrasterframes.utils import create_rf_spark_session\n",
    "from pyspark.sql.functions import lit, col, count\n",
    "from pyrasterframes.rf_types import TileExploder, NoDataFilter\n",
    "from pyspark.ml.feature import VectorAssembler\n",
    "from pyspark.ml.clustering import KMeans\n",
    "from pyspark.ml import Pipeline\n",
    "\n",
    "spark = create_rf_spark_session()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, query the public MODIS dataset catalog on AWS for a single granule/scene, capturing the results in a \"catalog\" DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat = spark.read.format('aws-pds-modis-catalog').load() \\\n",
    "        .filter(\n",
    "            (col('granule_id') == 'h11v04') &\n",
    "            (col('acquisition_date') > lit('2018-02-19')) &\n",
    "            (col('acquisition_date') < lit('2018-02-22'))\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the four bands (B01 through B04) from the catalog."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputCols = ['B01', 'B02', 'B03', 'B04']\n",
    "\n",
    "df = spark.read.raster(cat, inputCols, lazy_tiles=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compose the Spark ML pipeline for KMeans clustering and create a model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "exploder = TileExploder()\n",
    "filter = NoDataFilter().setInputCols(inputCols)\n",
    "assembler = VectorAssembler().setInputCols(inputCols).setOutputCol(\"features\")\n",
    "kmeans = KMeans().setK(5)\n",
    "pipeline = Pipeline().setStages([exploder, filter, assembler, kmeans])\n",
    "model = pipeline.fit(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply the model to the cell values and count the number of cells assigned to each cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+-----------------+\n",
      "|prediction|count(prediction)|\n",
      "+----------+-----------------+\n",
      "|         0|          4494371|\n",
      "|         1|          5297679|\n",
      "|         2|          2520941|\n",
      "|         3|          2743319|\n",
      "|         4|           955523|\n",
      "+----------+-----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "clustered = model.transform(df)\n",
    "clustered.groupBy('prediction').agg(count('prediction')).orderBy('prediction').show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Perform K-means Clustering on Multiband MODIS Imagery with RasterFrames\n",
	"\n",
	"First, import the library components and initialize a Spark session."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"from pyrasterframes.utils import create_rf_spark_session\n",
	"from pyspark.sql.functions import lit, col, count\n",
	"from pyrasterframes.rf_types import TileExploder, NoDataFilter\n",
	"from pyspark.ml.feature import VectorAssembler\n",
	"from pyspark.ml.clustering import KMeans\n",
	"from pyspark.ml import Pipeline\n",
	"\n",
	"spark = create_rf_spark_session()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Next, query the public MODIS dataset catalog on AWS for a single granule/scene, capturing the results in a \"catalog\" DataFrame."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"cat = spark.read.format('aws-pds-modis-catalog').load() \\\n",
	" .filter(\n",
	" (col('granule_id') == 'h11v04') &\n",
	" (col('acquisition_date') > lit('2018-02-19')) &\n",
	" (col('acquisition_date') < lit('2018-02-22'))\n",
	" )"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Read the four bands (B01 through B04) from the catalog."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"inputCols = ['B01', 'B02', 'B03', 'B04']\n",
	"\n",
	"df = spark.read.raster(cat, inputCols, lazy_tiles=False)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Compose the Spark ML pipeline for KMeans clustering and create a model."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"exploder = TileExploder()\n",
	"filter = NoDataFilter().setInputCols(inputCols)\n",
	"assembler = VectorAssembler().setInputCols(inputCols).setOutputCol(\"features\")\n",
	"kmeans = KMeans().setK(5)\n",
	"pipeline = Pipeline().setStages([exploder, filter, assembler, kmeans])\n",
	"model = pipeline.fit(df)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Apply the model to the cell values and count the number of cells assigned to each cluster."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"+----------+-----------------+\n",
	"|prediction|count(prediction)|\n",
	"+----------+-----------------+\n",
	"|         0|          4494371|\n",
	"|         1|          5297679|\n",
	"|         2|          2520941|\n",
	"|         3|          2743319|\n",
	"|         4|           955523|\n",
	"+----------+-----------------+\n",
	"\n"
	]
	}
	],
	"source": [
	"clustered = model.transform(df)\n",
	"clustered.groupBy('prediction').agg(count('prediction')).orderBy('prediction').show()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}