perrygeo/.geotiff_headers.md

## .geotiff_headers.md

      
    Raw
  

              .geotiff_headers.md
            
          
    Optimized determination of Geotiff bounds

This is functional but is merely a workaround until we get vsis3 - you can try out preliminary vsis3 support with GDAL 2.1 and rasterio 0.32a1
Goal: Based on the GeoTiff and TIFF specs, manually parse out tags to allow for the most IO-efficient reading of georeferencing information.
It works almost the same as rio info --bounds but gives a json array, and it's fast
$ time rio info --bounds R4C1.tif
153.984375 24.2578125 154.072265625 24.345703125

real	0m0.217s
user	0m0.161s
sys	0m0.053s

$ time geotiff_bbox.py R4C1.tif
[153.984375, 24.2578125, 154.072265625, 24.345703125]

real	0m0.020s
user	0m0.012s
sys	0m0.007s

More concretely this allows for calculation of tile coverage
geotiff_bbox.py R4C1.tif | mercantile tiles 13
[7600, 3524, 13]
[7600, 3525, 13]
[7600, 3526, 13]
[7601, 3524, 13]
[7601, 3525, 13]
[7601, 3526, 13]

And because it only reads just enough bytes to determine the bbox, we can use HTTP Range headers via the boto3 library to grab the same info from remote GeoTIFFs on s3:
$ time geotiff_bbox.py s3://mybucket/R4C1.tif
[153.984375, 24.2578125, 154.072265625, 24.345703125]

real	0m2.698s
user	0m0.191s
sys	0m0.045s

Not fast but almost all of that is HTTP latency. Since we're spending most of our time waiting on network, it's trivial to parallelize it. (see example.sh for a demonstration of this with GNU parallel)

  
## .gitignore
*.tif
*.pyc
_*
.*.swp

## example.sh
# For every s3 object in the BUCKET matching the PARTIAL_KEY
# read the geotiff tags and construct bboxes
# then find all the unique tiles at zoom level 13

export TMPFILE="/tmp/bboxes.txt"
export ZOOMS="/tmp/zooms.txt"

aws s3 ls s3://$BUCKET/$PARTIAL_KEY/ \
    | grep tif | head -n 20 | rev | cut -d " " -f1 | rev \
    | parallel python geotiff_bbox.py s3://$BUCKET/$PARTIAL_KEY/{} | tee -a $TMPFILE

while read line
do
   mercantile tiles 13 "$line"
done < $TMPFILE | sort | uniq | tee $ZOOMS

## geotiff_bbox.py
#!/usr/bin/env python
from __future__ import print_function
import struct
import sys
import json

import s3reader

def tags_to_bbox(header_vals, to_wgs84=True):
    tiepoint = header_vals["ModelTiePointTag"]
    scalex, scaley, _ = header_vals["ModelPixelScaleTag"]
    width = header_vals["ImageWidth"]
    height = header_vals["ImageLength"]
    try:
        epsg = header_vals["epsg"]
    except KeyError:
        epsg = None

    if tiepoint[:3] == (0, 0, 0):  # corner of UL
        ulx, uly = tiepoint[3:5]
    elif tiepoint[:3] == (0.5, 0.5, 0):  # center of UL
        culx, culy = tiepoint[3:5]
        ulx = culx - (0.5 * scalex)
        uly = culy + (0.5 * scaley)
    else:
        raise Exception("not tied to UL")

    lrx = ulx + (scalex * width)
    lry = uly + (-1 * scaley * height)

    bounds = [ulx, lry, lrx, uly]

    if to_wgs84 and epsg and epsg != 4326:
        import rasterio.warp
        xs = bounds[::2]
        ys = bounds[1::2]
        xs, ys = rasterio.warp.transform({'init': 'EPSG:{}'.format(epsg)},
                                         {'init': 'EPSG:4326'},
                                         xs, ys)
        result = [0]*len(bounds)
        result[::2] = xs
        result[1::2] = ys
        return result
    else:
        return bounds


def read_tags(path, tags):
    struct_fmt = {
        2: ('c', 1),  # char
        3: ('H', 2),  # unsigned short int
        4: ('I', 4),  # unsigned long
        12: ('d', 8),  # 8 byte double
    }

    try:
        s3reader.parse_path(path)
        # s3 URI, use s3reader
        _open = s3reader.open
    except ValueError:
        # normal file
        _open = open

    with _open(path, 'rb') as src:
        header = src.read(8)

        endian_tuple = struct.unpack('2c', header[0:2])

        if endian_tuple == ('I', 'I'):
            endian = '<'  # little
        elif endian_tuple == ('M', 'M'):
            endian = '>'
        else:
            raise Exception

        meaning_of_life = struct.unpack(endian + 'h', header[2:4])[0]
        if meaning_of_life != 42:
            raise Exception("Not a GeoTiff")

        ifd_offset = struct.unpack(endian + "i", header[4:8])[0]

        header_vals = {}

        src.seek(ifd_offset)
        nde = src.read(2)
        n_dir_entries = struct.unpack(endian + 'h', nde)[0]

        tags_bytes = src.read(12 * n_dir_entries)

        for d in range(n_dir_entries):
            start = d * 12
            end = (d + 1) * 12
            data = tags_bytes[start:end]
            tag = struct.unpack(endian + 'H', data[0:2])[0]

            tag, typ, count, val_offset = struct.unpack(endian + 'HHII', data)

            try:
                tagname = tags[tag]
            except KeyError:
                # special case
                tagname = 'GeoTIFF-34735'
                if tag != 34735:
                    continue

            fmtchar, bytesper = struct_fmt[typ]
            total_bytes = bytesper * count
            if total_bytes <= 4:
                values = val_offset  # not an offset but an actual value
            else:
                src.seek(val_offset)
                values_packed = src.read(total_bytes)
                fmt = "{}{}".format(count, fmtchar)
                values = struct.unpack(endian + fmt, values_packed)

            if fmtchar == 'c':
                values = ''.join(values)

            if tag == 34735:
                # geotiff, lets get the epsg code
                keyids = values[0::4]
                ttagloc = values[1::4]
                keyvals = values[3::4]
                try:
                    if dict(zip(keyids, ttagloc))[3072] == 0:
                        # it's a value not and offset
                        epsg = dict(zip(keyids, keyvals))[3072]
                        header_vals['epsg'] = epsg
                except KeyError:
                    continue

            header_vals[tagname] = values

    return header_vals


def main(path):

    tags = {
        256: 'ImageWidth',
        257: 'ImageLength',
        33922: "ModelTiePointTag",
        33550: "ModelPixelScaleTag"}

    print(json.dumps(tags_to_bbox(read_tags(path, tags))))


if __name__ == "__main__":
    path = sys.argv[1]
    main(path)

## requirements.txt
tifffile
mercantile
shapely
boto3


## s3reader.py
#!/usr/bin/env python
import boto3
s3 = boto3.resource('s3')


def open(path, mode='rb', *args, **kwargs):
    if mode != 'rb':
        raise ValueError("s3reader only works in binary read ('rb') mode")
    bucket, key = parse_path(path)
    return S3Reader(bucket, key)


def parse_path(path):
    if not path.startswith("s3://"):
        raise ValueError("s3reader.open requires an s3:// URI")
    path = path.replace("s3://", "")
    parts = path.split("/")
    bucket = parts[0]
    key = "/".join(parts[1:])
    return bucket, key


class S3Reader(object):
    def __init__(self, bucket, key):
        self.bucket = bucket
        self.key = key
        self.obj = s3.Object(self.bucket, self.key)
        self.size = self.obj.content_length
        self.pos = 0  # pointer to starting read position

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.pos = 0

    def range_string(self, start, stop):
        return "bytes={}-{}".format(start, stop)

    def read(self, nbytes=None):
        if not nbytes:
            nbytes = self.size - self.pos
        # TODO confirm that start and stop bytes are within 0 to size
        rng = self.range_string(self.pos, self.pos + nbytes - 1)
        self.pos += nbytes  # TODO wait to move pointer until read confirmed
        return self.obj.get(Range=rng)['Body'].read()

    def seek(self, offset, whence=0):
        self.pos = whence + offset


if __name__ == "__main__":
    with open("s3://perrygeo-gdal/test1.tif") as test:
        aa = test.read(5)
        bb = test.read()
        assert len(aa + bb) == test.obj.content_length

    with open("s3://perrygeo-gdal/test1.tif") as test:
        aa = test.read(5)
        bb = test.read(5)
        test.seek(2)
        cc = test.read(8)
        assert (aa + bb)[2:] == cc
	# For every s3 object in the BUCKET matching the PARTIAL_KEY
	# read the geotiff tags and construct bboxes
	# then find all the unique tiles at zoom level 13

	export TMPFILE="/tmp/bboxes.txt"
	export ZOOMS="/tmp/zooms.txt"

	aws s3 ls s3://$BUCKET/$PARTIAL_KEY/ \
	\| grep tif \| head -n 20 \| rev \| cut -d " " -f1 \| rev \
	\| parallel python geotiff_bbox.py s3://$BUCKET/$PARTIAL_KEY/{} \| tee -a $TMPFILE

	while read line
	do
	mercantile tiles 13 "$line"
	done < $TMPFILE \| sort \| uniq \| tee $ZOOMS
	#!/usr/bin/env python
	from __future__ import print_function
	import struct
	import sys
	import json

	import s3reader

	def tags_to_bbox(header_vals, to_wgs84=True):
	tiepoint = header_vals["ModelTiePointTag"]
	scalex, scaley, _ = header_vals["ModelPixelScaleTag"]
	width = header_vals["ImageWidth"]
	height = header_vals["ImageLength"]
	try:
	epsg = header_vals["epsg"]
	except KeyError:
	epsg = None

	if tiepoint[:3] == (0, 0, 0): # corner of UL
	ulx, uly = tiepoint[3:5]
	elif tiepoint[:3] == (0.5, 0.5, 0): # center of UL
	culx, culy = tiepoint[3:5]
	ulx = culx - (0.5 * scalex)
	uly = culy + (0.5 * scaley)
	else:
	raise Exception("not tied to UL")

	lrx = ulx + (scalex * width)
	lry = uly + (-1 * scaley * height)

	bounds = [ulx, lry, lrx, uly]

	if to_wgs84 and epsg and epsg != 4326:
	import rasterio.warp
	xs = bounds[::2]
	ys = bounds[1::2]
	xs, ys = rasterio.warp.transform({'init': 'EPSG:{}'.format(epsg)},
	{'init': 'EPSG:4326'},
	xs, ys)
	result = [0]*len(bounds)
	result[::2] = xs
	result[1::2] = ys
	return result
	else:
	return bounds



	def read_tags(path, tags):
	struct_fmt = {
	2: ('c', 1), # char
	3: ('H', 2), # unsigned short int
	4: ('I', 4), # unsigned long
	12: ('d', 8), # 8 byte double
	}

	try:
	s3reader.parse_path(path)
	# s3 URI, use s3reader
	_open = s3reader.open
	except ValueError:
	# normal file
	_open = open

	with _open(path, 'rb') as src:
	header = src.read(8)

	endian_tuple = struct.unpack('2c', header[0:2])

	if endian_tuple == ('I', 'I'):
	endian = '<' # little
	elif endian_tuple == ('M', 'M'):
	endian = '>'
	else:
	raise Exception

	meaning_of_life = struct.unpack(endian + 'h', header[2:4])[0]
	if meaning_of_life != 42:
	raise Exception("Not a GeoTiff")

	ifd_offset = struct.unpack(endian + "i", header[4:8])[0]

	header_vals = {}

	src.seek(ifd_offset)
	nde = src.read(2)
	n_dir_entries = struct.unpack(endian + 'h', nde)[0]

	tags_bytes = src.read(12 * n_dir_entries)

	for d in range(n_dir_entries):
	start = d * 12
	end = (d + 1) * 12
	data = tags_bytes[start:end]
	tag = struct.unpack(endian + 'H', data[0:2])[0]

	tag, typ, count, val_offset = struct.unpack(endian + 'HHII', data)

	try:
	tagname = tags[tag]
	except KeyError:
	# special case
	tagname = 'GeoTIFF-34735'
	if tag != 34735:
	continue

	fmtchar, bytesper = struct_fmt[typ]
	total_bytes = bytesper * count
	if total_bytes <= 4:
	values = val_offset # not an offset but an actual value
	else:
	src.seek(val_offset)
	values_packed = src.read(total_bytes)
	fmt = "{}{}".format(count, fmtchar)
	values = struct.unpack(endian + fmt, values_packed)

	if fmtchar == 'c':
	values = ''.join(values)

	if tag == 34735:
	# geotiff, lets get the epsg code
	keyids = values[0::4]
	ttagloc = values[1::4]
	keyvals = values[3::4]
	try:
	if dict(zip(keyids, ttagloc))[3072] == 0:
	# it's a value not and offset
	epsg = dict(zip(keyids, keyvals))[3072]
	header_vals['epsg'] = epsg
	except KeyError:
	continue

	header_vals[tagname] = values

	return header_vals


	def main(path):

	tags = {
	256: 'ImageWidth',
	257: 'ImageLength',
	33922: "ModelTiePointTag",
	33550: "ModelPixelScaleTag"}

	print(json.dumps(tags_to_bbox(read_tags(path, tags))))


	if __name__ == "__main__":
	path = sys.argv[1]
	main(path)
	#!/usr/bin/env python
	import boto3
	s3 = boto3.resource('s3')


	def open(path, mode='rb', args, *kwargs):
	if mode != 'rb':
	raise ValueError("s3reader only works in binary read ('rb') mode")
	bucket, key = parse_path(path)
	return S3Reader(bucket, key)


	def parse_path(path):
	if not path.startswith("s3://"):
	raise ValueError("s3reader.open requires an s3:// URI")
	path = path.replace("s3://", "")
	parts = path.split("/")
	bucket = parts[0]
	key = "/".join(parts[1:])
	return bucket, key


	class S3Reader(object):
	def __init__(self, bucket, key):
	self.bucket = bucket
	self.key = key
	self.obj = s3.Object(self.bucket, self.key)
	self.size = self.obj.content_length
	self.pos = 0 # pointer to starting read position

	def __enter__(self):
	return self

	def __exit__(self, type, value, traceback):
	self.pos = 0

	def range_string(self, start, stop):
	return "bytes={}-{}".format(start, stop)

	def read(self, nbytes=None):
	if not nbytes:
	nbytes = self.size - self.pos
	# TODO confirm that start and stop bytes are within 0 to size
	rng = self.range_string(self.pos, self.pos + nbytes - 1)
	self.pos += nbytes # TODO wait to move pointer until read confirmed
	return self.obj.get(Range=rng)['Body'].read()

	def seek(self, offset, whence=0):
	self.pos = whence + offset


	if __name__ == "__main__":
	with open("s3://perrygeo-gdal/test1.tif") as test:
	aa = test.read(5)
	bb = test.read()
	assert len(aa + bb) == test.obj.content_length

	with open("s3://perrygeo-gdal/test1.tif") as test:
	aa = test.read(5)
	bb = test.read(5)
	test.seek(2)
	cc = test.read(8)
	assert (aa + bb)[2:] == cc