celoyd/pca_multiband.py

## pca_multiband.py
# python pca_multiband.py input.jpeg output.tif
# n-band image -> PCA -> n-band TIFF image
# with lots of hackety assumptions
# (e.g., output is same type as input)

from sys import argv
import rasterio as rio
import numpy as np
from sklearn import decomposition

with rio.open(argv[1]) as src:
    meta = src.meta
    pixels = src.read()
    # pixels[pixels == meta['nodata']] = 0.0
    pixels = pixels.astype(np.float32)
    #pixels = np.nan_to_num(pixels)
    #pixels[np.isinf(pixels)] = 0.0

dtype = meta["dtype"]
count = meta["count"]

def normalize(n):
    m = np.nanmean(n)
    s = np.nanstd(n)
    return (n - m) / s


# Todo: make this cleaner:
pixels = np.dstack([normalize(c).ravel() for c in pixels])[0]

pca = decomposition.PCA(
    n_components=count, whiten=True
)  # The whitening is redundant, right?
pca.fit(pixels)

for band in range(len(pca.components_)):
    weights = ", ".join(["{0:.4g}".format(x) for x in pca.components_[band]])
    bidx = band + 1
    var = pca.explained_variance_ratio_[band]
    print(f"Band {bidx} will hold {var:.4g} of variance with weights:\n{weights}")

out = pca.transform(pixels)

# This is the messy reverse of the messy ravel above:
xyz = np.array([out[:, c].reshape(meta["height"], meta["width"]) for c in range(count)])

if dtype == 'uint8':
    dtype = np.uint16

try:
    typemax = np.finfo(dtype).max
except:
    typemax = np.iinfo(dtype).max

# Scale each band separately to fill out the data type.
# (You either really want this or really don't want this.)
xyz = np.array([((c - np.amin(c)) / (np.amax(c) - np.amin(c))) * typemax for c in xyz])

xyz = xyz.astype(dtype)

meta.update(
    {
        # 'transform': meta['affine'],
        "driver": "GTiff",
        "photometric": "none",
        "dtype": dtype,
    }
)

with rio.open(argv[2], "w", **meta) as dst:
    dst.write(xyz)
	# python pca_multiband.py input.jpeg output.tif
	# n-band image -> PCA -> n-band TIFF image
	# with lots of hackety assumptions
	# (e.g., output is same type as input)

	from sys import argv
	import rasterio as rio
	import numpy as np
	from sklearn import decomposition

	with rio.open(argv[1]) as src:
	meta = src.meta
	pixels = src.read()
	# pixels[pixels == meta['nodata']] = 0.0
	pixels = pixels.astype(np.float32)
	#pixels = np.nan_to_num(pixels)
	#pixels[np.isinf(pixels)] = 0.0

	dtype = meta["dtype"]
	count = meta["count"]

	def normalize(n):
	m = np.nanmean(n)
	s = np.nanstd(n)
	return (n - m) / s


	# Todo: make this cleaner:
	pixels = np.dstack([normalize(c).ravel() for c in pixels])[0]

	pca = decomposition.PCA(
	n_components=count, whiten=True
	) # The whitening is redundant, right?
	pca.fit(pixels)

	for band in range(len(pca.components_)):
	weights = ", ".join(["{0:.4g}".format(x) for x in pca.components_[band]])
	bidx = band + 1
	var = pca.explained_variance_ratio_[band]
	print(f"Band {bidx} will hold {var:.4g} of variance with weights:\n{weights}")

	out = pca.transform(pixels)

	# This is the messy reverse of the messy ravel above:
	xyz = np.array([out[:, c].reshape(meta["height"], meta["width"]) for c in range(count)])

	if dtype == 'uint8':
	dtype = np.uint16

	try:
	typemax = np.finfo(dtype).max
	except:
	typemax = np.iinfo(dtype).max

	# Scale each band separately to fill out the data type.
	# (You either really want this or really don't want this.)
	xyz = np.array([((c - np.amin(c)) / (np.amax(c) - np.amin(c))) * typemax for c in xyz])

	xyz = xyz.astype(dtype)

	meta.update(
	{
	# 'transform': meta['affine'],
	"driver": "GTiff",
	"photometric": "none",
	"dtype": dtype,
	}
	)

	with rio.open(argv[2], "w", **meta) as dst:
	dst.write(xyz)