Skip to content

Instantly share code, notes, and snippets.

@jamesmcclain
Created April 30, 2018 15:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jamesmcclain/d7301ba770477296348765a6d73feabf to your computer and use it in GitHub Desktop.
Save jamesmcclain/d7301ba770477296348765a6d73feabf to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{"nbformat_minor": 2, "cells": [{"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 1, "source": "import gdal\nfrom gdalconst import *\nimport geopyspark as gps\nimport numpy as np\n\nfrom pyspark import SparkContext\nimport matplotlib.pyplot as plt\nfrom shapely.geometry import box\nfrom datetime import datetime, timedelta\n\n%matplotlib inline"}, {"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 2, "source": "worker_count=64\nconf = gps.geopyspark_conf(appName=\"NASANEX INGEST\", master='yarn-client')\nconf.set(\"spark.hadoop.yarn.timeline-service.enabled\", False)\nconf.set('spark.ui.enabled', True)\nconf.set('spark.default.parallelism', worker_count)\nconf.set('spark.executor.memory', '9500M')\nconf.set('spark.executor.cores', 1)\nconf.set('spark.yarn.executor.memoryOverhead', '1500M')\nconf.set('spark.master.memory', '9500M')\nconf.set('spark.yarn.driver.memoryOverhead', '1500M')\nconf.set('spark.dynamicAllocation.enabled', True)\nconf.set('spark.shuffle.service.enabled', True)\nsc = SparkContext(conf=conf)"}, {"cell_type": "markdown", "metadata": {}, "source": "## LOCA ##"}, {"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 3, "source": "area_of_interest = box(-126.0, 23.0, -66.0, 54.0) # Obtained using rasterio (not shown)\n\nbase_max_uri =\"http://nasanex.s3.amazonaws.com/LOCA/CCSM4/16th/rcp85/r6i1p1/tasmax/tasmax_day_CCSM4_rcp85_r6i1p1_{}0101-{}1231.LOCA_2016-04-02.16th.nc\"\nbase_min_uri =\"http://nasanex.s3.amazonaws.com/LOCA/CCSM4/16th/rcp85/r6i1p1/tasmin/tasmin_day_CCSM4_rcp85_r6i1p1_{}0101-{}1231.LOCA_2016-04-02.16th.nc\"\nbase_pr_uri = \"http://nasanex.s3.amazonaws.com/LOCA/CCSM4/16th/rcp85/r6i1p1/pr/pr_day_CCSM4_rcp85_r6i1p1_{}0101-{}1231.LOCA_2016-04-02.16th.nc\""}, {"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 4, "source": "final_year = 2100\nyears = range(2018, final_year+1)"}, {"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 5, "source": "aoi = gps.Extent.from_polygon(area_of_interest)\n\ndef day_to_tile(max_dataset, min_dataset, pr_dataset, year, day):\n dt = datetime(year, 1, 1) + timedelta(days=(day-1))\n p = \"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs \"\n tpe = gps.TemporalProjectedExtent(extent=aoi, instant=dt, proj4=p)\n \n max_band = max_dataset.GetRasterBand(day)\n max_nodata = max_band.GetNoDataValue()\n max_arr = max_band.ReadAsArray()\n \n min_band = min_dataset.GetRasterBand(day)\n min_nodata = min_band.GetNoDataValue()\n min_arr = min_band.ReadAsArray()\n min_arr[min_arr == min_nodata] = max_nodata\n\n pr_band = pr_dataset.GetRasterBand(day)\n pr_nodata = pr_band.GetNoDataValue()\n pr_arr = pr_band.ReadAsArray()\n pr_arr[pr_arr == pr_nodata] = max_nodata\n\n t = (tpe, gps.Tile.from_numpy_array(np.array([max_arr, min_arr, pr_arr]), no_data_value=max_nodata))\n return t\n\ndef year_to_tiles(year):\n max_uri = base_max_uri.format(year, year)\n min_uri = base_min_uri.format(year, year)\n pr_uri = base_pr_uri.format(year, year)\n days = range(1,365+1)\n max_dataset = gdal.Open(max_uri, GA_ReadOnly)\n min_dataset = gdal.Open(min_uri, GA_ReadOnly)\n pr_dataset = gdal.Open(pr_uri, GA_ReadOnly)\n tiles = [day_to_tile(max_dataset, min_dataset, pr_dataset, year, day) for day in days]\n max_dataset = None\n min_dataset = None\n pr_dataset = None\n return tiles\n"}, {"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 6, "source": "tiles = sc.parallelize(years).repartition(82).flatMap(year_to_tiles)"}, {"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 7, "source": "raster_layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPACETIME, tiles)"}, {"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 8, "source": "tiled_raster_layer = raster_layer.tile_to_layout(layout = gps.LocalLayout(4,4), target_crs=4326)"}, {"cell_type": "code", "metadata": {"trusted": true, "collapsed": true}, "outputs": [], "execution_count": 9, "source": "gps.write(\n uri=\"s3://ingested-loca-data/CCSM4\",\n layer_name=\"CCSM4_rcp85_r6i1p1\",\n tiled_raster_layer = tiled_raster_layer,\n time_unit=gps.TimeUnit.YEARS\n)"}, {"cell_type": "code", "metadata": {"trusted": false, "collapsed": true}, "outputs": [], "execution_count": null, "source": ""}], "nbformat": 4, "metadata": {"language_info": {"pygments_lexer": "ipython3", "nbconvert_exporter": "python", "version": "3.4.3", "name": "python", "codemirror_mode": {"name": "ipython", "version": 3}, "mimetype": "text/x-python", "file_extension": ".py"}, "kernelspec": {"language": "python", "display_name": "GeoNotebook + GeoPySpark", "name": "geonotebook3"}}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment