Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python implementation of zonal statistics function. Optimized for dense polygon layers, uses numpy, GDAL and OGR to rival the speed of starspan.
"""
Zonal Statistics
Vector-Raster Analysis
Copyright 2013 Matthew Perry
Usage:
zonal_stats.py VECTOR RASTER
zonal_stats.py -h | --help
zonal_stats.py --version
Options:
-h --help Show this screen.
--version Show version.
"""
from osgeo import gdal, ogr
from osgeo.gdalconst import *
import numpy as np
import sys
gdal.PushErrorHandler('CPLQuietErrorHandler')
def bbox_to_pixel_offsets(gt, bbox):
originX = gt[0]
originY = gt[3]
pixel_width = gt[1]
pixel_height = gt[5]
x1 = int((bbox[0] - originX) / pixel_width)
x2 = int((bbox[1] - originX) / pixel_width) + 1
y1 = int((bbox[3] - originY) / pixel_height)
y2 = int((bbox[2] - originY) / pixel_height) + 1
xsize = x2 - x1
ysize = y2 - y1
return (x1, y1, xsize, ysize)
def zonal_stats(vector_path, raster_path, nodata_value=None, global_src_extent=False):
rds = gdal.Open(raster_path, GA_ReadOnly)
assert(rds)
rb = rds.GetRasterBand(1)
rgt = rds.GetGeoTransform()
if nodata_value:
nodata_value = float(nodata_value)
rb.SetNoDataValue(nodata_value)
vds = ogr.Open(vector_path, GA_ReadOnly) # TODO maybe open update if we want to write stats
assert(vds)
vlyr = vds.GetLayer(0)
# create an in-memory numpy array of the source raster data
# covering the whole extent of the vector layer
if global_src_extent:
# use global source extent
# useful only when disk IO or raster scanning inefficiencies are your limiting factor
# advantage: reads raster data in one pass
# disadvantage: large vector extents may have big memory requirements
src_offset = bbox_to_pixel_offsets(rgt, vlyr.GetExtent())
src_array = rb.ReadAsArray(*src_offset)
# calculate new geotransform of the layer subset
new_gt = (
(rgt[0] + (src_offset[0] * rgt[1])),
rgt[1],
0.0,
(rgt[3] + (src_offset[1] * rgt[5])),
0.0,
rgt[5]
)
mem_drv = ogr.GetDriverByName('Memory')
driver = gdal.GetDriverByName('MEM')
# Loop through vectors
stats = []
feat = vlyr.GetNextFeature()
while feat is not None:
if not global_src_extent:
# use local source extent
# fastest option when you have fast disks and well indexed raster (ie tiled Geotiff)
# advantage: each feature uses the smallest raster chunk
# disadvantage: lots of reads on the source raster
src_offset = bbox_to_pixel_offsets(rgt, feat.geometry().GetEnvelope())
src_array = rb.ReadAsArray(*src_offset)
# calculate new geotransform of the feature subset
new_gt = (
(rgt[0] + (src_offset[0] * rgt[1])),
rgt[1],
0.0,
(rgt[3] + (src_offset[1] * rgt[5])),
0.0,
rgt[5]
)
# Create a temporary vector layer in memory
mem_ds = mem_drv.CreateDataSource('out')
mem_layer = mem_ds.CreateLayer('poly', None, ogr.wkbPolygon)
mem_layer.CreateFeature(feat.Clone())
# Rasterize it
rvds = driver.Create('', src_offset[2], src_offset[3], 1, gdal.GDT_Byte)
rvds.SetGeoTransform(new_gt)
gdal.RasterizeLayer(rvds, [1], mem_layer, burn_values=[1])
rv_array = rvds.ReadAsArray()
# Mask the source data array with our current feature
# we take the logical_not to flip 0<->1 to get the correct mask effect
# we also mask out nodata values explictly
masked = np.ma.MaskedArray(
src_array,
mask=np.logical_or(
src_array == nodata_value,
np.logical_not(rv_array)
)
)
feature_stats = {
'min': float(masked.min()),
'mean': float(masked.mean()),
'max': float(masked.max()),
'std': float(masked.std()),
'sum': float(masked.sum()),
'count': int(masked.count()),
'fid': int(feat.GetFID())}
stats.append(feature_stats)
rvds = None
mem_ds = None
feat = vlyr.GetNextFeature()
vds = None
rds = None
return stats
if __name__ == "__main__":
opts = {'VECTOR': sys.argv[1], 'RASTER': sys.argv[2]}
stats = zonal_stats(opts['VECTOR'], opts['RASTER'])
try:
from pandas import DataFrame
print DataFrame(stats)
except ImportError:
import json
print json.dumps(stats, indent=2)
$ time python zonal_stats.py test.shp terrain/slope.tif
count fid max mean min std sum
0 203 0 96 65.876847 3 17.968489 13373
1 130 1 90 60.100000 3 16.728994 7813
2 1341 2 102 53.211037 2 17.901655 71356
3 130 3 90 60.100000 3 16.728994 7813
4 132 4 64 15.962121 1 15.360519 2107
5 132 5 53 31.515152 17 7.970100 4160
6 131 6 42 9.893130 0 8.168317 1296
7 132 7 64 28.712121 2 14.853594 3790
8 133 8 54 35.548872 11 8.878856 4728
9 131 9 82 52.297710 4 17.349877 6851
10 131 10 11 3.030534 0 1.781752 397
11 134 11 57 10.156716 1 11.960042 1361
12 133 12 45 19.000000 0 13.727750 2527
13 132 13 64 26.507576 1 18.848075 3499
14 132 14 94 52.787879 1 22.297585 6968
15 131 15 84 19.450382 1 15.992944 2548
16 132 16 52 11.583333 0 11.538501 1529
17 132 17 108 53.515152 6 18.198603 7064
18 341 18 76 39.117302 9 11.540482 13339
19 337 19 57 19.988131 4 9.593512 6736
20 336 20 78 48.636905 11 13.357014 16342
21 338 21 3 0.855030 0 0.527067 289
22 337 22 34 5.347181 0 7.069888 1802
23 341 23 0 0.000000 0 0.000000 0
24 341 24 42 16.612903 0 9.041271 5665
25 337 25 128 78.848665 5 18.689028 26572
26 341 26 29 7.973607 1 5.341357 2719
27 339 27 78 35.616519 5 14.455317 12074
28 341 28 65 20.199413 0 16.636394 6888
29 340 29 84 35.855882 1 17.022989 12191
30 338 30 96 61.440828 2 16.703587 20767
31 340 31 101 57.832353 8 18.161971 19663
real 0m1.311s
user 0m0.372s
sys 0m0.752s
#### Starspan equivalent
$ time starspan --vector test.shp --out-prefix testout --out-type table \
--summary-suffix _stats.csv --raster terrain/slope.tif \
--stats avg mode median min max sum stdev nulls && \
cat testout_stats.csv
1: Extracting from /usr/local/apps/land_owner_tools/lot/fixtures/downloads/terrain/slope.tif
Summary:
Intersecting features: 32
Polygons: 32
Processed pixels: 8379
real 0m1.440s
user 0m0.944s
sys 0m0.296s
@dagnachewl

This comment has been minimized.

dagnachewl commented Oct 14, 2013

This is an excellent tool.

Thanks a bunch!

Cheers

@vickyting0910

This comment has been minimized.

vickyting0910 commented Feb 3, 2014

Hello,
I tested your code, based on your txt file, but I got an error message.
MaskError: Mask and data not compatible: data size is 1, mask size is 228744.
Do I make a mistake?

@AsgerPetersen

This comment has been minimized.

AsgerPetersen commented Mar 20, 2014

@perrygeo As the script does not include an explicit license, I would like to ask for permission to fork, make changes to and subsequently use your zonal_stats.pyscript.

@dagnachewl

This comment has been minimized.

dagnachewl commented Mar 20, 2014

vickyting0910, this happens if your zone map has a different projection from the value map.
I added the following lines starting from line 52 in the above code to reproject vector geometry to same projection as raster layer.

# Reproject vector geometry to same projection as raster
sourceSR = vlyr.GetSpatialRef()
targetSR = osr.SpatialReference()
targetSR.ImportFromWkt(rds.GetProjectionRef())
coordTrans = osr.CreateCoordinateTransformation(sourceSR,targetSR)
feat = vlyr.GetNextFeature()
@JohnDGuth

This comment has been minimized.

JohnDGuth commented Mar 2, 2016

This works really well for small and medium sized polygons. However, on larger aeas, I've found that the sum and the mean start giving bogus answers. Is there a way to fix this for larger areas?

@mwooten3

This comment has been minimized.

mwooten3 commented Apr 24, 2017

Thank you so much for this tool, it's saved my life!! However, I just got the following error and am unsure what it means or how to fix:

  File "./zonalStats/zonal_stats_updateShp.py", line 133, in zonal_stats
    np.logical_not(rv_array)
  File "/usr/lib/python2.7/dist-packages/numpy/ma/core.py", line 2710, in __new__
    raise MaskError(msg % (nd, nm))
numpy.ma.core.MaskError: Mask and data not compatible: data size is 1, mask size is 4.

This was not occurring when I used an arcGIS generated polygon however started when I used a polygon (esri shp still) that was created using gdal_polygonize.py.

I checked the projection for both input shp and raster and they are the same.

Thanks again!

@mwooten3

This comment has been minimized.

mwooten3 commented May 10, 2017

@dagnachewl Thanks for uploading this solution. However, I tried adding that block of code to the zonal_stats.py and still get the same error:

numpy.ma.core.MaskError: Mask and data not compatible: data size is 1, mask size is 8.

Any other ideas on how to fix? Thanks

@MLNoon

This comment has been minimized.

MLNoon commented Jun 15, 2017

This looks like an excellent tool, however I am unable to run. I am getting an Assertion Error:

AssertionError Traceback (most recent call last)
in ()
142 if name == "main":
143 opts = {'VECTOR': sys.argv[1], 'RASTER': sys.argv[2]}
--> 144 stats = zonal_stats(opts['VECTOR'], opts['RASTER'])
145
146 try:

in zonal_stats(vector_path, raster_path, nodata_value, global_src_extent)
39 def zonal_stats(vector_path, raster_path, nodata_value=None, global_src_extent=False):
40 rds = gdal.Open(raster_path, GA_ReadOnly)
---> 41 assert(rds)
42 rb = rds.GetRasterBand(1)
43 rgt = rds.GetGeoTransform()

AssertionError:

@SCrommelinck

This comment has been minimized.

SCrommelinck commented May 16, 2018

Any suggestions on how to use a memory vector layer as input for the zonal_stats function?

I have a function that creates the vector layers with:

memoryDriver = ogr.GetDriverByName('MEMORY')
memoryDatasource = memoryDriver.CreateDataSource('memoryData')
memoryLayer = memoryDatasource.CreateLayer('memoryLayer', geom_type = ogr.wkbPolygon)

I would then like to use the output (polygons) of that function in:
zonal_stats(vector, RGB, band=1)

However, the ogr.Open() method in zonal_stats seems unable to open my vector data. I have tried using the memoryLayer and the memoryDatasource as well as their names as input. Any further ideas?

@david-airfire

This comment has been minimized.

david-airfire commented Jun 4, 2018

@vickyting0910, @mwooten3, @dagnachewl - I was getting this error. too (numpy.ma.core.MaskError: Mask and data not compatible: data size is 1, mask size is XXX). The projection was the same, so that wasn't the problem.

My vector map (USA + territories) is a larger extent than my raster (lower 48 only), and this error was occurring on features outside the raster's extent. I added an output field in the feature_stats called "error_flag". It defaults to 0, but I put the offending lines of code inside a try/except and flagged those features with a value of 1. You can see I also added the "NAME" and "UA_ID" fields to the output. Not shown here, I added a print statement to get more info on those features, and I found they were features in Hawaii, American Samoa, etc.

`

    error_flag = 0
    # Mask the source data array with our current feature
    # we take the logical_not to flip 0<->1 to get the correct mask effect
    # we also mask out nodata values explictly
    try: 
        masked = np.ma.MaskedArray(
            src_array,
            mask=np.logical_or(
                src_array == nodata_value,
                np.logical_not(rv_array)
            )
        )
    except Exception as ex:
        print('Error message from mask - {}: {}'.format(
            type(ex).__name__, ex))
        error_flag = 1
        

    feature_stats = {
        'min': float(masked.min()),
        'mean': float(masked.mean()),
        'max': float(masked.max()),
        'std': float(masked.std()),
        'sum': float(masked.sum()),
        'count': int(masked.count()),
        'fid': int(feat.GetFID()),
        'ua_id': str(feat.GetField('UA_ID')),
        'name': str(feat.GetField('NAME')),
        'error_flag': int(error_flag)
        }

`

I think the real solution is to clip my vector map, but at least this makes the code more bulletproof.

@tlepkowski

This comment has been minimized.

tlepkowski commented Oct 8, 2018

I'd like to know the point of the max and min values. Say, for example, I'm running on an elevation band and I want to know the high elevation and the location.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment