knaaptime/correlogram.py

## correlogram.py
# Spatial Correlograms

import geopandas as gpd
import pandas as pd
from esda import G, Geary, Moran
from joblib import Parallel, delayed
from libpysal.cg.kdtree import KDTree
from libpysal.weights import KNN, DistanceBand
from libpysal.weights.util import get_points_array
from tqdm.auto import tqdm


def _get_autocorrelation_stat(inputs):
    """helper function for computing parallel autocorrelation statistics

    Parameters
    ----------
    inputs : tuple
        tuple of (y, tree, W, statistic, STATISTIC, dist, weights_kwargs, stat_kwargs)

    Returns
    -------
    pandas.Series
        a pandas series with the computed autocorrelation statistic and its simulated p-value
    """
    (
        y,  # y variable
        tree,  # kd tree
        W,  # weights class DistanceBand or KNN
        statistic,  # name of the stat ('I', etc) for getattr
        STATISTIC,  # class of statistic (Moran, Geary, etc)
        dist,  # threshold/k parameter for the weights
        weights_kwargs,  # additional args
        stat_kwargs,  # additional args
    ) = inputs

    w = W(tree, dist, silence_warnings=True, **weights_kwargs)
    stat = getattr(STATISTIC(y, w, **stat_kwargs), statistic)
    p = getattr(STATISTIC(y, w), "p_sim")
    return pd.Series([stat, p], index=[statistic, "pvalue"], name=dist)


def correlogram(
    gdf: gpd.GeoDataFrame,
    variable: str,
    distances: list,
    distance_type: str = "band",
    statistic: str = "I",
    weights_kwargs: dict = None,
    stat_kwargs: dict = None,
    n_jobs: int = -1,
    backend: str = "loky",
):
    """Generate a spatial correlogram

    Parameters
    ----------
    gdf : gpd.GeoDataFrame
        geodataframe holding spatial and attribute data
    variable: str
        column on the geodataframe used to compute autocorrelation statistic
    distances : list
        list of distances to compute the autocorrelation statistic
    distance_type : str, optional
        which concept of distance to increment. Options are {`band`, `knn`}.
        by default 'band' (for `libpysal.weights.DistanceBand` weights)
    statistic : str, by default 'I'
        which spatial autocorrelation statistic to compute. Options in {`I`, `G`, `C`}
    weights_kwargs : dict
        additional keyword arguments passed to the libpysal.weights.W class
    stat_kwargs : dict
        additional keyword arguments passed to the `esda` autocorrelation statistic class.
        For example for faster results with no statistical inference, set the nuumber
        of permutations to zero with {permutations: 0}
    n_jobs : int
        number of jobs to pass to joblib. If -1 (default), all cores will be used
    backend : str
        backend parameter passed to joblb

    Returns
    -------
    outputs : pandas.DataFrame
        table of autocorrelation statistics at increasing distance bandwidths
    """
    if stat_kwargs is None:
        stat_kwargs = dict()
    if weights_kwargs is None:
        weights_kwargs = dict()
    if statistic == "I":
        STATISTIC = Moran
    elif statistic == "G":
        STATISTIC = G
    elif statistic == "C":
        STATISTIC = Geary
    else:
        with NotImplementedError as e:
            raise e("Only I, G, and C statistics are currently implemented")

    if distance_type == "band":
        W = DistanceBand
    elif distance_type == "knn":
        if max(distances) > gdf.shape[0] - 1:
            with ValueError as e:
                raise e("max number of neighbors must be less than or equal to n-1")
        W = KNN
    else:
        with NotImplementedError as e:
            raise e("distance_type must be either `band` or `knn` ")

    #  should be able to build the tree once and reuse it?
    #  but in practice, im not seeing any real difference from starting a new W from scratch each time
    pts = get_points_array(gdf[gdf.geometry.name])
    tree = KDTree(pts)
    y = gdf[variable].values

    inputs = [
        tuple(
            [
                y,
                tree,
                W,
                statistic,
                STATISTIC,
                dist,
                weights_kwargs,
                stat_kwargs,
            ]
        )
        for dist in distances
    ]

    outputs = Parallel(n_jobs=n_jobs, backend=backend)(
        delayed(_get_autocorrelation_stat)(i) for i in inputs
    )

    return pd.DataFrame(outputs)


# def _serial_correlogram(
#     gdf: gpd.GeoDataFrame,
#     variable: str,
#     distances: list,
#     distance_type: str = "band",
#     statistic: str = "I",
# ):
#     """_summary_

#     Parameters
#     ----------
#     gdf : gpd.GeoDataFrame
#         geodataframe holding spatial and attribute data
#     variable: str
#         column on the geodataframe used to compute autocorrelation statistic
#     distances : list
#         _description_
#     distance_type : str, optional
#         which concept of distance to increment. Options are {`band`, `knn`}.
#         by default 'band'
#     statistic : str, by default 'I'
#         which spatial autocorrelation statistic to compute. Options in {`I`, `G`}
#     """
#     if statistic == "I":
#         STATISTIC = Moran
#     elif statistic == "G":
#         STATISTIC = G

#     else:
#         with NotImplementedError as e:
#             raise e("Only I and G statistics are currently implemented")

#     if distance_type == "band":
#         W = DistanceBand
#     elif distance_type == "knn":
#         W = KNN
#     else:
#         with NotImplementedError as e:
#             raise e("distance_type must be either `band` or `knn` ")

#     #  should be able to build the tree once and reuse it?
#     pts = get_points_array(gdf[gdf.geometry.name])
#     tree = KDTree(pts)

#     output = dict()
#     ps = dict()

#     for dist in tqdm(distances):
#         w = W(tree, dist, silence_warnings=True)
#         stat = getattr(STATISTIC(gdf[variable].values, w), statistic)
#         output[dist] = stat
#         ps[dist] = getattr(STATISTIC(gdf[variable].values, w), "p_sim")
#     return pd.DataFrame([pd.Series(output), pd.Series(ps)])
	# Spatial Correlograms

	import geopandas as gpd
	import pandas as pd
	from esda import G, Geary, Moran
	from joblib import Parallel, delayed
	from libpysal.cg.kdtree import KDTree
	from libpysal.weights import KNN, DistanceBand
	from libpysal.weights.util import get_points_array
	from tqdm.auto import tqdm


	def _get_autocorrelation_stat(inputs):
	"""helper function for computing parallel autocorrelation statistics

	Parameters
	----------
	inputs : tuple
	tuple of (y, tree, W, statistic, STATISTIC, dist, weights_kwargs, stat_kwargs)

	Returns
	-------
	pandas.Series
	a pandas series with the computed autocorrelation statistic and its simulated p-value
	"""
	(
	y, # y variable
	tree, # kd tree
	W, # weights class DistanceBand or KNN
	statistic, # name of the stat ('I', etc) for getattr
	STATISTIC, # class of statistic (Moran, Geary, etc)
	dist, # threshold/k parameter for the weights
	weights_kwargs, # additional args
	stat_kwargs, # additional args
	) = inputs

	w = W(tree, dist, silence_warnings=True, **weights_kwargs)
	stat = getattr(STATISTIC(y, w, **stat_kwargs), statistic)
	p = getattr(STATISTIC(y, w), "p_sim")
	return pd.Series([stat, p], index=[statistic, "pvalue"], name=dist)


	def correlogram(
	gdf: gpd.GeoDataFrame,
	variable: str,
	distances: list,
	distance_type: str = "band",
	statistic: str = "I",
	weights_kwargs: dict = None,
	stat_kwargs: dict = None,
	n_jobs: int = -1,
	backend: str = "loky",
	):
	"""Generate a spatial correlogram

	Parameters
	----------
	gdf : gpd.GeoDataFrame
	geodataframe holding spatial and attribute data
	variable: str
	column on the geodataframe used to compute autocorrelation statistic
	distances : list
	list of distances to compute the autocorrelation statistic
	distance_type : str, optional
	which concept of distance to increment. Options are {`band`, `knn`}.
	by default 'band' (for `libpysal.weights.DistanceBand` weights)
	statistic : str, by default 'I'
	which spatial autocorrelation statistic to compute. Options in {`I`, `G`, `C`}
	weights_kwargs : dict
	additional keyword arguments passed to the libpysal.weights.W class
	stat_kwargs : dict
	additional keyword arguments passed to the `esda` autocorrelation statistic class.
	For example for faster results with no statistical inference, set the nuumber
	of permutations to zero with {permutations: 0}
	n_jobs : int
	number of jobs to pass to joblib. If -1 (default), all cores will be used
	backend : str
	backend parameter passed to joblb

	Returns
	-------
	outputs : pandas.DataFrame
	table of autocorrelation statistics at increasing distance bandwidths
	"""
	if stat_kwargs is None:
	stat_kwargs = dict()
	if weights_kwargs is None:
	weights_kwargs = dict()
	if statistic == "I":
	STATISTIC = Moran
	elif statistic == "G":
	STATISTIC = G
	elif statistic == "C":
	STATISTIC = Geary
	else:
	with NotImplementedError as e:
	raise e("Only I, G, and C statistics are currently implemented")

	if distance_type == "band":
	W = DistanceBand
	elif distance_type == "knn":
	if max(distances) > gdf.shape[0] - 1:
	with ValueError as e:
	raise e("max number of neighbors must be less than or equal to n-1")
	W = KNN
	else:
	with NotImplementedError as e:
	raise e("distance_type must be either `band` or `knn` ")

	# should be able to build the tree once and reuse it?
	# but in practice, im not seeing any real difference from starting a new W from scratch each time
	pts = get_points_array(gdf[gdf.geometry.name])
	tree = KDTree(pts)
	y = gdf[variable].values

	inputs = [
	tuple(
	[
	y,
	tree,
	W,
	statistic,
	STATISTIC,
	dist,
	weights_kwargs,
	stat_kwargs,
	]
	)
	for dist in distances
	]

	outputs = Parallel(n_jobs=n_jobs, backend=backend)(
	delayed(_get_autocorrelation_stat)(i) for i in inputs
	)

	return pd.DataFrame(outputs)


	# def _serial_correlogram(
	# gdf: gpd.GeoDataFrame,
	# variable: str,
	# distances: list,
	# distance_type: str = "band",
	# statistic: str = "I",
	# ):
	# """_summary_

	# Parameters
	# ----------
	# gdf : gpd.GeoDataFrame
	# geodataframe holding spatial and attribute data
	# variable: str
	# column on the geodataframe used to compute autocorrelation statistic
	# distances : list
	# _description_
	# distance_type : str, optional
	# which concept of distance to increment. Options are {`band`, `knn`}.
	# by default 'band'
	# statistic : str, by default 'I'
	# which spatial autocorrelation statistic to compute. Options in {`I`, `G`}
	# """
	# if statistic == "I":
	# STATISTIC = Moran
	# elif statistic == "G":
	# STATISTIC = G

	# else:
	# with NotImplementedError as e:
	# raise e("Only I and G statistics are currently implemented")

	# if distance_type == "band":
	# W = DistanceBand
	# elif distance_type == "knn":
	# W = KNN
	# else:
	# with NotImplementedError as e:
	# raise e("distance_type must be either `band` or `knn` ")

	# # should be able to build the tree once and reuse it?
	# pts = get_points_array(gdf[gdf.geometry.name])
	# tree = KDTree(pts)

	# output = dict()
	# ps = dict()

	# for dist in tqdm(distances):
	# w = W(tree, dist, silence_warnings=True)
	# stat = getattr(STATISTIC(gdf[variable].values, w), statistic)
	# output[dist] = stat
	# ps[dist] = getattr(STATISTIC(gdf[variable].values, w), "p_sim")
	# return pd.DataFrame([pd.Series(output), pd.Series(ps)])