Skip to content

Instantly share code, notes, and snippets.

@perrygeo
Last active June 2, 2020 16:21
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save perrygeo/9239b9ab64731cacbb35 to your computer and use it in GitHub Desktop.
Save perrygeo/9239b9ab64731cacbb35 to your computer and use it in GitHub Desktop.
Parsing geotiff headers

Optimized determination of Geotiff bounds

This is functional but is merely a workaround until we get vsis3 - you can try out preliminary vsis3 support with GDAL 2.1 and rasterio 0.32a1

Goal: Based on the GeoTiff and TIFF specs, manually parse out tags to allow for the most IO-efficient reading of georeferencing information.

It works almost the same as rio info --bounds but gives a json array, and it's fast

$ time rio info --bounds R4C1.tif
153.984375 24.2578125 154.072265625 24.345703125

real	0m0.217s
user	0m0.161s
sys	0m0.053s

$ time geotiff_bbox.py R4C1.tif
[153.984375, 24.2578125, 154.072265625, 24.345703125]

real	0m0.020s
user	0m0.012s
sys	0m0.007s

More concretely this allows for calculation of tile coverage

geotiff_bbox.py R4C1.tif | mercantile tiles 13
[7600, 3524, 13]
[7600, 3525, 13]
[7600, 3526, 13]
[7601, 3524, 13]
[7601, 3525, 13]
[7601, 3526, 13]

And because it only reads just enough bytes to determine the bbox, we can use HTTP Range headers via the boto3 library to grab the same info from remote GeoTIFFs on s3:

$ time geotiff_bbox.py s3://mybucket/R4C1.tif
[153.984375, 24.2578125, 154.072265625, 24.345703125]

real	0m2.698s
user	0m0.191s
sys	0m0.045s

Not fast but almost all of that is HTTP latency. Since we're spending most of our time waiting on network, it's trivial to parallelize it. (see example.sh for a demonstration of this with GNU parallel)

*.tif
*.pyc
_*
.*.swp
# For every s3 object in the BUCKET matching the PARTIAL_KEY
# read the geotiff tags and construct bboxes
# then find all the unique tiles at zoom level 13
export TMPFILE="/tmp/bboxes.txt"
export ZOOMS="/tmp/zooms.txt"
aws s3 ls s3://$BUCKET/$PARTIAL_KEY/ \
| grep tif | head -n 20 | rev | cut -d " " -f1 | rev \
| parallel python geotiff_bbox.py s3://$BUCKET/$PARTIAL_KEY/{} | tee -a $TMPFILE
while read line
do
mercantile tiles 13 "$line"
done < $TMPFILE | sort | uniq | tee $ZOOMS
#!/usr/bin/env python
from __future__ import print_function
import struct
import sys
import json
import s3reader
def tags_to_bbox(header_vals, to_wgs84=True):
tiepoint = header_vals["ModelTiePointTag"]
scalex, scaley, _ = header_vals["ModelPixelScaleTag"]
width = header_vals["ImageWidth"]
height = header_vals["ImageLength"]
try:
epsg = header_vals["epsg"]
except KeyError:
epsg = None
if tiepoint[:3] == (0, 0, 0): # corner of UL
ulx, uly = tiepoint[3:5]
elif tiepoint[:3] == (0.5, 0.5, 0): # center of UL
culx, culy = tiepoint[3:5]
ulx = culx - (0.5 * scalex)
uly = culy + (0.5 * scaley)
else:
raise Exception("not tied to UL")
lrx = ulx + (scalex * width)
lry = uly + (-1 * scaley * height)
bounds = [ulx, lry, lrx, uly]
if to_wgs84 and epsg and epsg != 4326:
import rasterio.warp
xs = bounds[::2]
ys = bounds[1::2]
xs, ys = rasterio.warp.transform({'init': 'EPSG:{}'.format(epsg)},
{'init': 'EPSG:4326'},
xs, ys)
result = [0]*len(bounds)
result[::2] = xs
result[1::2] = ys
return result
else:
return bounds
def read_tags(path, tags):
struct_fmt = {
2: ('c', 1), # char
3: ('H', 2), # unsigned short int
4: ('I', 4), # unsigned long
12: ('d', 8), # 8 byte double
}
try:
s3reader.parse_path(path)
# s3 URI, use s3reader
_open = s3reader.open
except ValueError:
# normal file
_open = open
with _open(path, 'rb') as src:
header = src.read(8)
endian_tuple = struct.unpack('2c', header[0:2])
if endian_tuple == ('I', 'I'):
endian = '<' # little
elif endian_tuple == ('M', 'M'):
endian = '>'
else:
raise Exception
meaning_of_life = struct.unpack(endian + 'h', header[2:4])[0]
if meaning_of_life != 42:
raise Exception("Not a GeoTiff")
ifd_offset = struct.unpack(endian + "i", header[4:8])[0]
header_vals = {}
src.seek(ifd_offset)
nde = src.read(2)
n_dir_entries = struct.unpack(endian + 'h', nde)[0]
tags_bytes = src.read(12 * n_dir_entries)
for d in range(n_dir_entries):
start = d * 12
end = (d + 1) * 12
data = tags_bytes[start:end]
tag = struct.unpack(endian + 'H', data[0:2])[0]
tag, typ, count, val_offset = struct.unpack(endian + 'HHII', data)
try:
tagname = tags[tag]
except KeyError:
# special case
tagname = 'GeoTIFF-34735'
if tag != 34735:
continue
fmtchar, bytesper = struct_fmt[typ]
total_bytes = bytesper * count
if total_bytes <= 4:
values = val_offset # not an offset but an actual value
else:
src.seek(val_offset)
values_packed = src.read(total_bytes)
fmt = "{}{}".format(count, fmtchar)
values = struct.unpack(endian + fmt, values_packed)
if fmtchar == 'c':
values = ''.join(values)
if tag == 34735:
# geotiff, lets get the epsg code
keyids = values[0::4]
ttagloc = values[1::4]
keyvals = values[3::4]
try:
if dict(zip(keyids, ttagloc))[3072] == 0:
# it's a value not and offset
epsg = dict(zip(keyids, keyvals))[3072]
header_vals['epsg'] = epsg
except KeyError:
continue
header_vals[tagname] = values
return header_vals
def main(path):
tags = {
256: 'ImageWidth',
257: 'ImageLength',
33922: "ModelTiePointTag",
33550: "ModelPixelScaleTag"}
print(json.dumps(tags_to_bbox(read_tags(path, tags))))
if __name__ == "__main__":
path = sys.argv[1]
main(path)
tifffile
mercantile
shapely
boto3
#!/usr/bin/env python
import boto3
s3 = boto3.resource('s3')
def open(path, mode='rb', *args, **kwargs):
if mode != 'rb':
raise ValueError("s3reader only works in binary read ('rb') mode")
bucket, key = parse_path(path)
return S3Reader(bucket, key)
def parse_path(path):
if not path.startswith("s3://"):
raise ValueError("s3reader.open requires an s3:// URI")
path = path.replace("s3://", "")
parts = path.split("/")
bucket = parts[0]
key = "/".join(parts[1:])
return bucket, key
class S3Reader(object):
def __init__(self, bucket, key):
self.bucket = bucket
self.key = key
self.obj = s3.Object(self.bucket, self.key)
self.size = self.obj.content_length
self.pos = 0 # pointer to starting read position
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
self.pos = 0
def range_string(self, start, stop):
return "bytes={}-{}".format(start, stop)
def read(self, nbytes=None):
if not nbytes:
nbytes = self.size - self.pos
# TODO confirm that start and stop bytes are within 0 to size
rng = self.range_string(self.pos, self.pos + nbytes - 1)
self.pos += nbytes # TODO wait to move pointer until read confirmed
return self.obj.get(Range=rng)['Body'].read()
def seek(self, offset, whence=0):
self.pos = whence + offset
if __name__ == "__main__":
with open("s3://perrygeo-gdal/test1.tif") as test:
aa = test.read(5)
bb = test.read()
assert len(aa + bb) == test.obj.content_length
with open("s3://perrygeo-gdal/test1.tif") as test:
aa = test.read(5)
bb = test.read(5)
test.seek(2)
cc = test.read(8)
assert (aa + bb)[2:] == cc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment