GlobalForestWatch Output Audits
#%%
gt_dir = "/opt/data/gfw2-data/forest_change/umd_landsat_alerts/prod/analysis"
pro_dir = "/opt/data/gfwpro/gfwpro-raster-data"
import os
gt_rasters = [file for file in os.listdir(gt_dir) if file.endswith(".tif")]
pro_rasters = [file for file in os.listdir(pro_dir) if file.endswith(".tif")]
path = gt_rasters[0]
import xarray
import numpy as np
import multiprocessing as mp
# Verify each GT analysis raster is identical to its GFW Pro counterpart
for path in gt_rasters:
    print(f"Checking: {path}")
    with xarray.open_rasterio(f"{gt_dir}/{path}") as gt_raster:
        with xarray.open_rasterio(f"{pro_dir}/{path}") as pro_raster:
            assert(np.array_equal(gt_raster.data, pro_raster.data))
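#%% optional: a parallel sketch of the same raster comparison
# multiprocessing is imported above but never used; this sketch (assuming a
# fork-based start method, e.g. Linux) shows how the per-tile check could be
# distributed across worker processes.
def compare_tile(path):
    with xarray.open_rasterio(f"{gt_dir}/{path}") as gt_raster:
        with xarray.open_rasterio(f"{pro_dir}/{path}") as pro_raster:
            return path, bool(np.array_equal(gt_raster.data, pro_raster.data))

with mp.Pool() as pool:
    mismatched = [p for p, ok in pool.map(compare_tile, gt_rasters) if not ok]
assert not mismatched, f"rasters differ: {mismatched}"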
#%%
from datetime import date, timedelta
from typing import Tuple
from wri_list_tools.list_utils import read_list_tsv
def decode_alert(v: int) -> Tuple[date, bool]:
    """Decode a GLAD alert pixel value into (alert date, high-confidence flag).

    Values >= 30000 are high-confidence alerts, values in [20000, 30000) are
    low-confidence; the remainder is the number of days since 2014-12-31.
    """
    if v >= 30000:
        confidence = True
        days = v - 30000
    elif v >= 20000:
        confidence = False
        days = v - 20000
    else:
        raise ValueError(v)
    base = date(year=2014, month=12, day=31)
    delta = timedelta(days=days)
    return (base + delta, confidence)
assert(decode_alert(20732) == (date(2017,1,1), False))
assert(decode_alert(30732) == (date(2017,1,1), True))
#%%
from itertools import islice
from numpy import linspace
from typing import List, Tuple
import math
def split_bbox(bbox, x_step, y_step) -> List[Tuple]:
    """Split an (xmin, ymin, xmax, ymax) bbox into a grid of sub-boxes of roughly x_step by y_step."""
    def n_grams(a, n):
        # sliding window of size n, e.g. n_grams([0, 5, 10], 2) -> [(0, 5), (5, 10)]
        z = (islice(a, i, None) for i in range(n))
        return list(zip(*z))

    (xmin, ymin, xmax, ymax) = bbox
    x_tiles = math.ceil((xmax - xmin) / x_step)
    y_tiles = math.ceil((ymax - ymin) / y_step)
    x_breaks = linspace(xmin, xmax, num=x_tiles + 1)
    y_breaks = linspace(ymin, ymax, num=y_tiles + 1)
    return [
        (xmin, ymin, xmax, ymax)
        for (ymin, ymax) in n_grams(y_breaks, 2)
        for (xmin, xmax) in n_grams(x_breaks, 2)
    ]
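# Quick sanity check of split_bbox (illustrative values only): a 10x10 box split
# with 5-unit steps should yield a 2x2 grid of sub-boxes.
assert len(split_bbox((0, 0, 10, 10), x_step=5, y_step=5)) == 4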
#%%
glad_pro = "/opt/data/gfwpro/gfwpro-raster-data/050W_20S_040W_10S.tif"
glad_gt = "/opt/data/gfw2-data/forest_change/umd_landsat_alerts/prod/analysis/050W_20S_040W_10S.tif"
#%%
import rasterio as rio
from shapely.geometry import box
import geopandas as gpd
with rio.open(glad_gt) as ra:
    bounds = ra.bounds
    geom = box(*bounds)
    # split the tile bounds into a 100x100 grid of test locations
    windows = split_bbox(bounds,
                         x_step=(bounds[2] - bounds[0]) / 100,
                         y_step=(bounds[3] - bounds[1]) / 100)

glad_test_df = gpd.GeoDataFrame.from_dict({
    "list_id": [1] * len(windows),
    "location_id": list(range(0, len(windows))),
    "geometry": [box(*w) for w in windows]
})
from wri_list_tools import write_list_tsv
write_list_tsv(glad_test_df, "/tmp/glad-test-location-split-100.tsv")
#%%
from wri_list_tools import read_list_tsv
read_list_tsv("/tmp/glad-test-location-split-100.tsv").to_file("/tmp/glad-test-location-split-100.geojson", driver="GeoJSON")
#%%
import xarray
from xarray.coding.frequencies import month_anchor_check
arr = xarray.open_rasterio(glad_gt)
#%%
import numpy as np
glad_alerts = arr.data[np.where(arr.data > 0)]
# total pixel count vs. number of non-zero (alert) pixels
(arr.data.size, len(glad_alerts))
#%%
day_alerts={}
week_alerts = {}
month_alerts = {}
# Tally alert counts by day, ISO week, and month using the Python decoder
for alert in glad_alerts:
    assert(alert > 0)
    (d, conf) = decode_alert(int(alert))
    month_key = f"{d.year}-{d.month:02}"
    isoc = d.isocalendar()
    week_key = f"{isoc.year}-{isoc.week:02}"
    day_key = d.isoformat()
    day_alerts[day_key] = day_alerts.get(day_key, 0) + 1
    week_alerts[week_key] = week_alerts.get(week_key, 0) + 1
    month_alerts[month_key] = month_alerts.get(month_key, 0) + 1
from pprint import pprint
pprint(month_alerts)
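#%% optional: vectorized cross-check of the monthly tally (sketch)
# The per-pixel loop above is simple but slow; assuming every non-zero value uses
# the 20000/30000 encoding handled by decode_alert, the same monthly counts can
# be computed with numpy/pandas directly.
import pandas as pd
vals = glad_alerts.astype(int)
days = np.where(vals >= 30000, vals - 30000, vals - 20000)
dates = pd.to_datetime("2014-12-31") + pd.to_timedelta(days, unit="D")
pd.Series(dates).dt.to_period("M").value_counts().sort_index()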
#%% load Dashboard job output
from wri_list_tools import Dashboard
dash_test = Dashboard("/tmp/dashboard-glad-test-matched")
dash_test.df
# dash_test['glad_alerts_monthly'].reset_index()['gadm_id'].unique()
#%%
month_key = '2020-07'
gt_ans = (dash_test['glad_alerts_monthly']
          .reset_index()
          .query(f"month=='{month_key}'")
          ['glad_alerts_monthly']
          .sum())
py_ans = month_alerts[month_key]
(py_ans, gt_ans)
#%% GDAL valid alerts
int(0.00856 * 40000 * 40000)
#%% total GLAD Alerts from Python
int(sum(month_alerts.values()))
#%% total GLAD Alerts from GT
dash_test['glad_alerts_daily']['glad_alerts_daily'].sum()
#%% Monthly aggregate from GT
gt_glad_alerts_monthly = dash_test['glad_alerts_monthly'].reset_index()[['month','glad_alerts_monthly']].groupby(['month']).sum().sort_index()
gt_glad_alerts_monthly.sort_index()
#%% monthly DF from Python
import pandas as pd
py_glad_alerts_monthly = pd.DataFrame.from_dict(month_alerts.copy(), orient="index", columns=['glad_alerts_monthly']).rename_axis("month").sort_index()
py_glad_alerts_monthly
#%% compare monthly alerts
gt_glad_alerts_monthly.compare(py_glad_alerts_monthly)
#%% Weekly aggregate from GT
gt_glad_alerts_weekly = dash_test['glad_alerts_weekly'].reset_index()[['week','glad_alerts_weekly']].groupby(['week']).sum().sort_index()
gt_glad_alerts_weekly.sort_index()
#%% weekly DF from Python
import pandas as pd
py_glad_alerts_weekly = pd.DataFrame.from_dict(week_alerts, orient="index", columns=['glad_alerts_weekly']).rename_axis("week").sort_index()
py_glad_alerts_weekly
#%%
joined = py_glad_alerts_weekly.join(gt_glad_alerts_weekly, lsuffix="_py", rsuffix="_gt") #.to_csv("/tmp/weekly_diff.csv")
joined.query("glad_alerts_weekly_py != glad_alerts_weekly_gt")
#%% Daily aggregate from GT
gt_glad_alerts_daily = dash_test['glad_alerts_daily'].reset_index()[['date','glad_alerts_daily']].groupby(['date']).sum().sort_index()
gt_glad_alerts_daily.sort_index()
#%% daily DF from Python
import pandas as pd
py_glad_alerts_daily = pd.DataFrame.from_dict(day_alerts.copy(), orient="index", columns=['glad_alerts_daily']).rename_axis("date").sort_index()
py_glad_alerts_daily
#%%
joined = py_glad_alerts_daily.join(gt_glad_alerts_daily, lsuffix="_py", rsuffix="_gt") #.to_csv("/tmp/daily_diff.csv")
joined.query("glad_alerts_daily_py != glad_alerts_daily_gt")
## Prepare test inputs from GADM context features
#%% What is the Dashboard job being intersected with?
import pandas as pd
from shapely import wkb
import geopandas as gpd
context_df = pd.read_csv("/opt/data/gfwpro/gadm36_adm2_1_1.tsv", delimiter="\t")
context_df['geometry'] = context_df['geom'].map(lambda s: wkb.loads(s, hex=True))
# gpd.GeoDataFrame(context_df.drop('geom', axis=1)).to_file("/tmp/gadm36_adm2_1_1.gpkg", driver="GPKG")
input_df = context_df.query("tile_id=='10S_050W'")[['fid','geometry']]
input_df = input_df.rename(columns={'fid': 'location_id'})
input_df['list_id'] = 1
input_df = input_df.set_index(['location_id', 'list_id'])
write_list_tsv(input_df, '/tmp/gadm36_input_10S_050W.tsv')