Skip to content

Instantly share code, notes, and snippets.

@knaaptime
Created July 19, 2023 14:57
Show Gist options
  • Save knaaptime/510fd5f03fed425f3d3d5163b599320d to your computer and use it in GitHub Desktop.
Save knaaptime/510fd5f03fed425f3d3d5163b599320d to your computer and use it in GitHub Desktop.
# Spatial Correlograms
import geopandas as gpd
import pandas as pd
from esda import G, Geary, Moran
from joblib import Parallel, delayed
from libpysal.cg.kdtree import KDTree
from libpysal.weights import KNN, DistanceBand
from libpysal.weights.util import get_points_array
from tqdm.auto import tqdm
def _get_autocorrelation_stat(inputs):
"""helper function for computing parallel autocorrelation statistics
Parameters
----------
inputs : tuple
tuple of (y, tree, W, statistic, STATISTIC, dist, weights_kwargs, stat_kwargs)
Returns
-------
pandas.Series
a pandas series with the computed autocorrelation statistic and its simulated p-value
"""
(
y, # y variable
tree, # kd tree
W, # weights class DistanceBand or KNN
statistic, # name of the stat ('I', etc) for getattr
STATISTIC, # class of statistic (Moran, Geary, etc)
dist, # threshold/k parameter for the weights
weights_kwargs, # additional args
stat_kwargs, # additional args
) = inputs
w = W(tree, dist, silence_warnings=True, **weights_kwargs)
stat = getattr(STATISTIC(y, w, **stat_kwargs), statistic)
p = getattr(STATISTIC(y, w), "p_sim")
return pd.Series([stat, p], index=[statistic, "pvalue"], name=dist)
def correlogram(
gdf: gpd.GeoDataFrame,
variable: str,
distances: list,
distance_type: str = "band",
statistic: str = "I",
weights_kwargs: dict = None,
stat_kwargs: dict = None,
n_jobs: int = -1,
backend: str = "loky",
):
"""Generate a spatial correlogram
Parameters
----------
gdf : gpd.GeoDataFrame
geodataframe holding spatial and attribute data
variable: str
column on the geodataframe used to compute autocorrelation statistic
distances : list
list of distances to compute the autocorrelation statistic
distance_type : str, optional
which concept of distance to increment. Options are {`band`, `knn`}.
by default 'band' (for `libpysal.weights.DistanceBand` weights)
statistic : str, by default 'I'
which spatial autocorrelation statistic to compute. Options in {`I`, `G`, `C`}
weights_kwargs : dict
additional keyword arguments passed to the libpysal.weights.W class
stat_kwargs : dict
additional keyword arguments passed to the `esda` autocorrelation statistic class.
For example for faster results with no statistical inference, set the nuumber
of permutations to zero with {permutations: 0}
n_jobs : int
number of jobs to pass to joblib. If -1 (default), all cores will be used
backend : str
backend parameter passed to joblb
Returns
-------
outputs : pandas.DataFrame
table of autocorrelation statistics at increasing distance bandwidths
"""
if stat_kwargs is None:
stat_kwargs = dict()
if weights_kwargs is None:
weights_kwargs = dict()
if statistic == "I":
STATISTIC = Moran
elif statistic == "G":
STATISTIC = G
elif statistic == "C":
STATISTIC = Geary
else:
with NotImplementedError as e:
raise e("Only I, G, and C statistics are currently implemented")
if distance_type == "band":
W = DistanceBand
elif distance_type == "knn":
if max(distances) > gdf.shape[0] - 1:
with ValueError as e:
raise e("max number of neighbors must be less than or equal to n-1")
W = KNN
else:
with NotImplementedError as e:
raise e("distance_type must be either `band` or `knn` ")
# should be able to build the tree once and reuse it?
# but in practice, im not seeing any real difference from starting a new W from scratch each time
pts = get_points_array(gdf[gdf.geometry.name])
tree = KDTree(pts)
y = gdf[variable].values
inputs = [
tuple(
[
y,
tree,
W,
statistic,
STATISTIC,
dist,
weights_kwargs,
stat_kwargs,
]
)
for dist in distances
]
outputs = Parallel(n_jobs=n_jobs, backend=backend)(
delayed(_get_autocorrelation_stat)(i) for i in inputs
)
return pd.DataFrame(outputs)
# def _serial_correlogram(
# gdf: gpd.GeoDataFrame,
# variable: str,
# distances: list,
# distance_type: str = "band",
# statistic: str = "I",
# ):
# """_summary_
# Parameters
# ----------
# gdf : gpd.GeoDataFrame
# geodataframe holding spatial and attribute data
# variable: str
# column on the geodataframe used to compute autocorrelation statistic
# distances : list
# _description_
# distance_type : str, optional
# which concept of distance to increment. Options are {`band`, `knn`}.
# by default 'band'
# statistic : str, by default 'I'
# which spatial autocorrelation statistic to compute. Options in {`I`, `G`}
# """
# if statistic == "I":
# STATISTIC = Moran
# elif statistic == "G":
# STATISTIC = G
# else:
# with NotImplementedError as e:
# raise e("Only I and G statistics are currently implemented")
# if distance_type == "band":
# W = DistanceBand
# elif distance_type == "knn":
# W = KNN
# else:
# with NotImplementedError as e:
# raise e("distance_type must be either `band` or `knn` ")
# # should be able to build the tree once and reuse it?
# pts = get_points_array(gdf[gdf.geometry.name])
# tree = KDTree(pts)
# output = dict()
# ps = dict()
# for dist in tqdm(distances):
# w = W(tree, dist, silence_warnings=True)
# stat = getattr(STATISTIC(gdf[variable].values, w), statistic)
# output[dist] = stat
# ps[dist] = getattr(STATISTIC(gdf[variable].values, w), "p_sim")
# return pd.DataFrame([pd.Series(output), pd.Series(ps)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment