perrygeo/zonal_stats.py

## zonal_stats.py
"""
Zonal Statistics
Vector-Raster Analysis

Copyright 2013 Matthew Perry

Usage:
  zonal_stats.py VECTOR RASTER
  zonal_stats.py -h | --help
  zonal_stats.py --version

Options:
  -h --help     Show this screen.
  --version     Show version.
"""
from osgeo import gdal, ogr
from osgeo.gdalconst import *
import numpy as np
import sys
gdal.PushErrorHandler('CPLQuietErrorHandler')


def bbox_to_pixel_offsets(gt, bbox):
    originX = gt[0]
    originY = gt[3]
    pixel_width = gt[1]
    pixel_height = gt[5]
    x1 = int((bbox[0] - originX) / pixel_width)
    x2 = int((bbox[1] - originX) / pixel_width) + 1

    y1 = int((bbox[3] - originY) / pixel_height)
    y2 = int((bbox[2] - originY) / pixel_height) + 1

    xsize = x2 - x1
    ysize = y2 - y1
    return (x1, y1, xsize, ysize)


def zonal_stats(vector_path, raster_path, nodata_value=None, global_src_extent=False):
    rds = gdal.Open(raster_path, GA_ReadOnly)
    assert(rds)
    rb = rds.GetRasterBand(1)
    rgt = rds.GetGeoTransform()

    if nodata_value:
        nodata_value = float(nodata_value)
        rb.SetNoDataValue(nodata_value)

    vds = ogr.Open(vector_path, GA_ReadOnly)  # TODO maybe open update if we want to write stats
    assert(vds)
    vlyr = vds.GetLayer(0)

    # create an in-memory numpy array of the source raster data
    # covering the whole extent of the vector layer
    if global_src_extent:
        # use global source extent
        # useful only when disk IO or raster scanning inefficiencies are your limiting factor
        # advantage: reads raster data in one pass
        # disadvantage: large vector extents may have big memory requirements
        src_offset = bbox_to_pixel_offsets(rgt, vlyr.GetExtent())
        src_array = rb.ReadAsArray(*src_offset)

        # calculate new geotransform of the layer subset
        new_gt = (
            (rgt[0] + (src_offset[0] * rgt[1])),
            rgt[1],
            0.0,
            (rgt[3] + (src_offset[1] * rgt[5])),
            0.0,
            rgt[5]
        )

    mem_drv = ogr.GetDriverByName('Memory')
    driver = gdal.GetDriverByName('MEM')

    # Loop through vectors
    stats = []
    feat = vlyr.GetNextFeature()
    while feat is not None:

        if not global_src_extent:
            # use local source extent
            # fastest option when you have fast disks and well indexed raster (ie tiled Geotiff)
            # advantage: each feature uses the smallest raster chunk
            # disadvantage: lots of reads on the source raster
            src_offset = bbox_to_pixel_offsets(rgt, feat.geometry().GetEnvelope())
            src_array = rb.ReadAsArray(*src_offset)

            # calculate new geotransform of the feature subset
            new_gt = (
                (rgt[0] + (src_offset[0] * rgt[1])),
                rgt[1],
                0.0,
                (rgt[3] + (src_offset[1] * rgt[5])),
                0.0,
                rgt[5]
            )

        # Create a temporary vector layer in memory
        mem_ds = mem_drv.CreateDataSource('out')
        mem_layer = mem_ds.CreateLayer('poly', None, ogr.wkbPolygon)
        mem_layer.CreateFeature(feat.Clone())

        # Rasterize it
        rvds = driver.Create('', src_offset[2], src_offset[3], 1, gdal.GDT_Byte)
        rvds.SetGeoTransform(new_gt)
        gdal.RasterizeLayer(rvds, [1], mem_layer, burn_values=[1])
        rv_array = rvds.ReadAsArray()

        # Mask the source data array with our current feature
        # we take the logical_not to flip 0<->1 to get the correct mask effect
        # we also mask out nodata values explictly
        masked = np.ma.MaskedArray(
            src_array,
            mask=np.logical_or(
                src_array == nodata_value,
                np.logical_not(rv_array)
            )
        )

        feature_stats = {
            'min': float(masked.min()),
            'mean': float(masked.mean()),
            'max': float(masked.max()),
            'std': float(masked.std()),
            'sum': float(masked.sum()),
            'count': int(masked.count()),
            'fid': int(feat.GetFID())}

        stats.append(feature_stats)

        rvds = None
        mem_ds = None
        feat = vlyr.GetNextFeature()

    vds = None
    rds = None
    return stats


if __name__ == "__main__":
    opts = {'VECTOR': sys.argv[1], 'RASTER': sys.argv[2]}
    stats = zonal_stats(opts['VECTOR'], opts['RASTER'])

    try:
        from pandas import DataFrame
        print DataFrame(stats)
    except ImportError:
        import json
        print json.dumps(stats, indent=2)

## zonal_stats_example.txt
$ time python zonal_stats.py test.shp terrain/slope.tif
    count  fid  max       mean  min        std    sum
0     203    0   96  65.876847    3  17.968489  13373
1     130    1   90  60.100000    3  16.728994   7813
2    1341    2  102  53.211037    2  17.901655  71356
3     130    3   90  60.100000    3  16.728994   7813
4     132    4   64  15.962121    1  15.360519   2107
5     132    5   53  31.515152   17   7.970100   4160
6     131    6   42   9.893130    0   8.168317   1296
7     132    7   64  28.712121    2  14.853594   3790
8     133    8   54  35.548872   11   8.878856   4728
9     131    9   82  52.297710    4  17.349877   6851
10    131   10   11   3.030534    0   1.781752    397
11    134   11   57  10.156716    1  11.960042   1361
12    133   12   45  19.000000    0  13.727750   2527
13    132   13   64  26.507576    1  18.848075   3499
14    132   14   94  52.787879    1  22.297585   6968
15    131   15   84  19.450382    1  15.992944   2548
16    132   16   52  11.583333    0  11.538501   1529
17    132   17  108  53.515152    6  18.198603   7064
18    341   18   76  39.117302    9  11.540482  13339
19    337   19   57  19.988131    4   9.593512   6736
20    336   20   78  48.636905   11  13.357014  16342
21    338   21    3   0.855030    0   0.527067    289
22    337   22   34   5.347181    0   7.069888   1802
23    341   23    0   0.000000    0   0.000000      0
24    341   24   42  16.612903    0   9.041271   5665
25    337   25  128  78.848665    5  18.689028  26572
26    341   26   29   7.973607    1   5.341357   2719
27    339   27   78  35.616519    5  14.455317  12074
28    341   28   65  20.199413    0  16.636394   6888
29    340   29   84  35.855882    1  17.022989  12191
30    338   30   96  61.440828    2  16.703587  20767
31    340   31  101  57.832353    8  18.161971  19663

real    0m1.311s
user    0m0.372s
sys     0m0.752s


#### Starspan equivalent
$ time starspan --vector test.shp --out-prefix testout --out-type table \
--summary-suffix _stats.csv --raster terrain/slope.tif \
--stats avg mode median min max sum stdev nulls && \
cat testout_stats.csv
  1: Extracting from /usr/local/apps/land_owner_tools/lot/fixtures/downloads/terrain/slope.tif
Summary:
  Intersecting features: 32
      Polygons: 32

  Processed pixels: 8379

real    0m1.440s
user    0m0.944s
sys     0m0.296s
	"""
	Zonal Statistics
	Vector-Raster Analysis

	Copyright 2013 Matthew Perry

	Usage:
	zonal_stats.py VECTOR RASTER
	zonal_stats.py -h \| --help
	zonal_stats.py --version

	Options:
	-h --help Show this screen.
	--version Show version.
	"""
	from osgeo import gdal, ogr
	from osgeo.gdalconst import *
	import numpy as np
	import sys
	gdal.PushErrorHandler('CPLQuietErrorHandler')


	def bbox_to_pixel_offsets(gt, bbox):
	originX = gt[0]
	originY = gt[3]
	pixel_width = gt[1]
	pixel_height = gt[5]
	x1 = int((bbox[0] - originX) / pixel_width)
	x2 = int((bbox[1] - originX) / pixel_width) + 1

	y1 = int((bbox[3] - originY) / pixel_height)
	y2 = int((bbox[2] - originY) / pixel_height) + 1

	xsize = x2 - x1
	ysize = y2 - y1
	return (x1, y1, xsize, ysize)


	def zonal_stats(vector_path, raster_path, nodata_value=None, global_src_extent=False):
	rds = gdal.Open(raster_path, GA_ReadOnly)
	assert(rds)
	rb = rds.GetRasterBand(1)
	rgt = rds.GetGeoTransform()

	if nodata_value:
	nodata_value = float(nodata_value)
	rb.SetNoDataValue(nodata_value)

	vds = ogr.Open(vector_path, GA_ReadOnly) # TODO maybe open update if we want to write stats
	assert(vds)
	vlyr = vds.GetLayer(0)

	# create an in-memory numpy array of the source raster data
	# covering the whole extent of the vector layer
	if global_src_extent:
	# use global source extent
	# useful only when disk IO or raster scanning inefficiencies are your limiting factor
	# advantage: reads raster data in one pass
	# disadvantage: large vector extents may have big memory requirements
	src_offset = bbox_to_pixel_offsets(rgt, vlyr.GetExtent())
	src_array = rb.ReadAsArray(*src_offset)

	# calculate new geotransform of the layer subset
	new_gt = (
	(rgt[0] + (src_offset[0] * rgt[1])),
	rgt[1],
	0.0,
	(rgt[3] + (src_offset[1] * rgt[5])),
	0.0,
	rgt[5]
	)

	mem_drv = ogr.GetDriverByName('Memory')
	driver = gdal.GetDriverByName('MEM')

	# Loop through vectors
	stats = []
	feat = vlyr.GetNextFeature()
	while feat is not None:

	if not global_src_extent:
	# use local source extent
	# fastest option when you have fast disks and well indexed raster (ie tiled Geotiff)
	# advantage: each feature uses the smallest raster chunk
	# disadvantage: lots of reads on the source raster
	src_offset = bbox_to_pixel_offsets(rgt, feat.geometry().GetEnvelope())
	src_array = rb.ReadAsArray(*src_offset)

	# calculate new geotransform of the feature subset
	new_gt = (
	(rgt[0] + (src_offset[0] * rgt[1])),
	rgt[1],
	0.0,
	(rgt[3] + (src_offset[1] * rgt[5])),
	0.0,
	rgt[5]
	)

	# Create a temporary vector layer in memory
	mem_ds = mem_drv.CreateDataSource('out')
	mem_layer = mem_ds.CreateLayer('poly', None, ogr.wkbPolygon)
	mem_layer.CreateFeature(feat.Clone())

	# Rasterize it
	rvds = driver.Create('', src_offset[2], src_offset[3], 1, gdal.GDT_Byte)
	rvds.SetGeoTransform(new_gt)
	gdal.RasterizeLayer(rvds, [1], mem_layer, burn_values=[1])
	rv_array = rvds.ReadAsArray()

	# Mask the source data array with our current feature
	# we take the logical_not to flip 0<->1 to get the correct mask effect
	# we also mask out nodata values explictly
	masked = np.ma.MaskedArray(
	src_array,
	mask=np.logical_or(
	src_array == nodata_value,
	np.logical_not(rv_array)
	)
	)

	feature_stats = {
	'min': float(masked.min()),
	'mean': float(masked.mean()),
	'max': float(masked.max()),
	'std': float(masked.std()),
	'sum': float(masked.sum()),
	'count': int(masked.count()),
	'fid': int(feat.GetFID())}

	stats.append(feature_stats)

	rvds = None
	mem_ds = None
	feat = vlyr.GetNextFeature()

	vds = None
	rds = None
	return stats


	if __name__ == "__main__":
	opts = {'VECTOR': sys.argv[1], 'RASTER': sys.argv[2]}
	stats = zonal_stats(opts['VECTOR'], opts['RASTER'])

	try:
	from pandas import DataFrame
	print DataFrame(stats)
	except ImportError:
	import json
	print json.dumps(stats, indent=2)
	$ time python zonal_stats.py test.shp terrain/slope.tif
	count fid max mean min std sum
	0 203 0 96 65.876847 3 17.968489 13373
	1 130 1 90 60.100000 3 16.728994 7813
	2 1341 2 102 53.211037 2 17.901655 71356
	3 130 3 90 60.100000 3 16.728994 7813
	4 132 4 64 15.962121 1 15.360519 2107
	5 132 5 53 31.515152 17 7.970100 4160
	6 131 6 42 9.893130 0 8.168317 1296
	7 132 7 64 28.712121 2 14.853594 3790
	8 133 8 54 35.548872 11 8.878856 4728
	9 131 9 82 52.297710 4 17.349877 6851
	10 131 10 11 3.030534 0 1.781752 397
	11 134 11 57 10.156716 1 11.960042 1361
	12 133 12 45 19.000000 0 13.727750 2527
	13 132 13 64 26.507576 1 18.848075 3499
	14 132 14 94 52.787879 1 22.297585 6968
	15 131 15 84 19.450382 1 15.992944 2548
	16 132 16 52 11.583333 0 11.538501 1529
	17 132 17 108 53.515152 6 18.198603 7064
	18 341 18 76 39.117302 9 11.540482 13339
	19 337 19 57 19.988131 4 9.593512 6736
	20 336 20 78 48.636905 11 13.357014 16342
	21 338 21 3 0.855030 0 0.527067 289
	22 337 22 34 5.347181 0 7.069888 1802
	23 341 23 0 0.000000 0 0.000000 0
	24 341 24 42 16.612903 0 9.041271 5665
	25 337 25 128 78.848665 5 18.689028 26572
	26 341 26 29 7.973607 1 5.341357 2719
	27 339 27 78 35.616519 5 14.455317 12074
	28 341 28 65 20.199413 0 16.636394 6888
	29 340 29 84 35.855882 1 17.022989 12191
	30 338 30 96 61.440828 2 16.703587 20767
	31 340 31 101 57.832353 8 18.161971 19663

	real 0m1.311s
	user 0m0.372s
	sys 0m0.752s


	#### Starspan equivalent
	$ time starspan --vector test.shp --out-prefix testout --out-type table \
	--summary-suffix _stats.csv --raster terrain/slope.tif \
	--stats avg mode median min max sum stdev nulls && \
	cat testout_stats.csv
	1: Extracting from /usr/local/apps/land_owner_tools/lot/fixtures/downloads/terrain/slope.tif
	Summary:
	Intersecting features: 32
	Polygons: 32

	Processed pixels: 8379

	real 0m1.440s
	user 0m0.944s
	sys 0m0.296s