Skip to content

Instantly share code, notes, and snippets.

@kissmygritts
Created January 26, 2024 01:09
Show Gist options
  • Save kissmygritts/cf6f847e598254541859cab32db9eca9 to your computer and use it in GitHub Desktop.
Save kissmygritts/cf6f847e598254541859cab32db9eca9 to your computer and use it in GitHub Desktop.
from __future__ import annotations

from abc import ABC, abstractmethod, abstractstaticmethod
from dataclasses import dataclass
from enum import Enum

import geopandas as gpd
# The current feature catalog class is doing a lot and feels overloaded.
# It's trying to be a factory/formatter class and a dataset class. I think
# we can split the dataset classes into a few separate concrete implementations
# of a FeatureCatalog abstract base class. This class will have its own
# loader (as an abstract method) that creates an instance of itself.
#
# My current understanding is each FeatureCatalog class uses the same inventory
# file, but parses that inventory file differently for each class. This makes
# a lot of the code more difficult to reason about. For example the LiveDeadCatalog
# needs several months of sentinel data in the FeatureCatalog. The CanopyCoverCatalog
# needs only 2 months of sentinel data.
#
# Then create a FeatureCatalogFactory to do the initial loading of the S3 inventory
# file. Parsing this inventory file into each of the catalogs is delegated to
# the correct FeatureCatalog class. The factory will instantiate the dataset class.
class CatalogType(Enum):
    """Closed set of feature catalog kinds a factory can be asked to build.

    Each member's value is the snake_case string name of the catalog kind.
    """

    CANOPY_COVER = "canopy_cover"
    CHM = "chm"
    LIVE_DEAD = "live_dead"
@dataclass
class FeatureCatalog(ABC):
    """Abstract base for feature catalogs backed by a GeoDataFrame.

    Concrete subclasses supply ``format``, the static constructor-style hook
    that holds the catalog-specific formatting logic. Behaviour shared by all
    catalogs lives here.
    """

    # Tabular feature data held by this catalog.
    gdf: gpd.GeoDataFrame

    # ``abstractstaticmethod`` has been deprecated since Python 3.3; stacking
    # ``staticmethod`` on top of ``abstractmethod`` is the supported spelling.
    @staticmethod
    @abstractmethod
    def format() -> FeatureCatalog:
        """Build the catalog; must be overridden by each concrete subclass."""

    def n_features(self) -> int:
        """Return the number of rows in the underlying ``gdf``."""
        # common feature catalog methods
        return len(self.gdf)
@dataclass
class CanopyCoverFeatureCatalog(FeatureCatalog):
    """Feature catalog variant for canopy cover."""

    # Declared static to match the base-class contract: the method takes no
    # arguments, so without the decorator calling it on an instance fails.
    @staticmethod
    def format() -> FeatureCatalog:
        """Create a canopy cover feature catalog.

        Placeholder: the formatting logic has not been written yet, so this
        currently returns ``None``.
        """
        # add concrete implementation here, i.e. the formatting logic
        # that creates a canopy cover feature catalog
        pass
@dataclass
class CHMFeatureCatalog(FeatureCatalog):
    """Feature catalog variant for CHM."""

    # Declared static to match the base-class contract: the method takes no
    # arguments, so without the decorator calling it on an instance fails.
    @staticmethod
    def format() -> FeatureCatalog:
        """Create a chm feature catalog.

        Placeholder: the formatting logic has not been written yet, so this
        currently returns ``None``.
        """
        # add concrete implementation here, i.e. the formatting logic
        # that creates a chm feature catalog
        pass
class FeatureCatalogFactory:
    """Load an inventory once and hand out catalogs by ``CatalogType``.

    The factory optionally reads a custom inventory file on construction and
    delegates the actual catalog construction to the ``format`` method of the
    class registered for the requested type.
    """

    def __init__(self, inventory_url: str | None = None):
        """Store the inventory location and load it when one is given.

        Args:
            inventory_url: Location of a custom inventory file. ``None`` (the
                default) or an empty string skips loading.
        """
        self._full_inventory = None
        self._inventory_url = inventory_url
        self.load_custom_inventory()

    def __call__(self, catalog_type: CatalogType) -> FeatureCatalog:
        """Shorthand for :meth:`create`, so the factory can be called directly."""
        return self.create(catalog_type)

    def load_custom_inventory(self) -> None:
        """Read the inventory at ``inventory_url`` into memory, if one is set."""
        # some initialization logic if necessary, this can be used
        # in the init or by outside callers
        if self._inventory_url:
            self._full_inventory = gpd.read_file(self._inventory_url)

    def create(self, catalog_type: CatalogType) -> FeatureCatalog:
        """Build the catalog registered for ``catalog_type``.

        Args:
            catalog_type: Which kind of catalog to build.

        Returns:
            Whatever the registered class's ``format`` returns.

        Raises:
            ValueError: If no class is registered for ``catalog_type``.
        """
        # do something with inventory URL if needed
        try:
            catalog_cls = self._catalog_class_map()[catalog_type]
        except KeyError:
            raise ValueError(
                f"No feature catalog is registered for {catalog_type!r}"
            ) from None
        return catalog_cls.format()

    @staticmethod
    def _catalog_class_map() -> dict[CatalogType, type[FeatureCatalog]]:
        """Return the mapping from catalog type to its concrete class."""
        # keyed by enum rather than raw strings for better type safety
        return {
            CatalogType.CANOPY_COVER: CanopyCoverFeatureCatalog,
            CatalogType.CHM: CHMFeatureCatalog,
        }
# usage
# default behavior: no custom inventory file is supplied
catalog_formatter = FeatureCatalogFactory(inventory_url=None)
canopy_cover_catalog: CanopyCoverFeatureCatalog = catalog_formatter.create(
    CatalogType.CANOPY_COVER
)

# use a custom inventory file
catalog_formatter = FeatureCatalogFactory("s3://vp-eng-test-data/inventory/inventory.csv")
chm_catalog: CHMFeatureCatalog = catalog_formatter.create(CatalogType.CHM)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment