Skip to content

Instantly share code, notes, and snippets.

@ckrapu
Created May 17, 2023 17:41
Show Gist options
  • Save ckrapu/a2537104c3e5a03d86cf3dbd65feb20a to your computer and use it in GitHub Desktop.
Save ckrapu/a2537104c3e5a03d86cf3dbd65feb20a to your computer and use it in GitHub Desktop.
Example Snowflake query for DomZip
INSERT INTO USER_SANDBOX.CHRIS_KRAPU.DOMGEO_DAILY_TEST_FOR_SALE (
RDC_VISITOR_ID
, MEMBER_ID
, DOMINANT_GEO
, MEAN_DIST_KM
, DG_VIEWS
, SUM_VIEWS_ACROSS_TOP
, DG_VIEWS_PCT
, TOP_VIEWED_GEOS
, SPATIAL_DISPERSION_INDEX
, BINNED_CONFIDENCE
, VERSION
, DG_CREATION_TIME_MST
, LOOKBACK_DAYS
, LOOKBACK_START_DATE
, LOOKBACK_END_DATE
, RESOLUTION
, CITY
, STATE
, COUNTY
, DMA
)
-- This set of CTEs determines the view-weighted centroid for each GEO
-- as the median of the property longitudes / latitudes within that GEO.
-- Note that if there are multiple rdc_visitor_id for a single member_id,
-- we really just want to take the events per member_id so we make all those
-- rdc_visitor_ids into null values
WITH relevant_fct_events as (
SELECT
IFF(PERSISTED_IDENTITY_ID IS NULL OR PERSISTED_IDENTITY_ID='', POST_PROP_WEB_CLIENT_VISITOR_ID, '')
as POST_PROP_WEB_CLIENT_VISITOR_ID
, PERSISTED_IDENTITY_ID
, HIT_DATETIME
, PERSISTED_PROPERTY_ID
, PERSISTED_CITY
, PERSISTED_STATE
, PERSISTED_ZIP
, 'zip' AS RESOLUTION
,CASE
WHEN resolution = 'zip' THEN LPAD(RIGHT(PERSISTED_ZIP, 5), 5, '0')
WHEN resolution = 'neighborhood' THEN
IFF(
--happens when neighborhood field like 'Scranton_OH' which is redundant with city/state cols
CONTAINS(PERSISTED_NEIGHBORHOOD, '_'),
REPLACE(PERSISTED_NEIGHBORHOOD, ' ', '-'),
TRIM( --Take off leading or trailing underscores
REGEXP_REPLACE( --Replace multiple dashes with single dash
REGEXP_REPLACE( --Get rid of "unknown" to clean up city_state combos
REGEXP_REPLACE( --Replace all whitespace with a single dash
REGEXP_REPLACE( -- form neighborhood + city + state string
SPLIT_PART(PERSISTED_NEIGHBORHOOD, '_', 1)
|| '_'
|| PERSISTED_CITY
|| '_'
|| PERSISTED_STATE,
'\\s+', '-'),
'[\'\.]',''),
'unknown_*',''),
'--+', '-'),
'_')
)
ELSE NULL
END AS GEO
FROM RDC_CORE.FACT.FCT_PRODUCT_EVENT_DETAIL as fct
WHERE TRUE
AND HIT_YEAR IN (2023)
AND HIT_MONTH IN (5)
AND DATE(HIT_DATETIME) BETWEEN '2023-05-03' AND '2023-05-17'
AND lower(PERSISTED_EVENT_NAME) = 'pageview'
AND PERSISTED_ZIP not in ('unknown', '')
AND PERSISTED_STATE not in ('unknown', '')
AND PERSISTED_CITY not in ('unknown', '')
),
RELEVANT_FCT_EVENTS_FILTERED as (
SELECT * FROM relevant_fct_events
),
ZIP_TABLE as (
SELECT
LPAD(TO_VARCHAR(ZIP),5, '0') as ZIP
, CITY
, STATE
, COUNTY
FROM RDC_ANALYTICS.DIM.US_ZIP_MAPPING
),
geo_mean_coords AS (
SELECT
GEO
, MEDIAN(TRY_TO_DOUBLE(prop.PROPERTY_LATITUDE)) as LAT
, MEDIAN(TRY_TO_DOUBLE(prop.PROPERTY_LONGITUDE)) as LON
, COUNT(*) AS NUM_PROPERTIES
, MODE(PERSISTED_CITY) as CITY
, MODE(PERSISTED_STATE) as STATE
, MODE(PERSISTED_ZIP) as ZIP
FROM RELEVANT_FCT_EVENTS_FILTERED as fct
JOIN LEGACY_BRIDGE.BIZ_DATA_HOMES.PROPERTY as prop on fct.PERSISTED_PROPERTY_ID = prop.PROPERTY_ID
GROUP BY GEO
HAVING LON BETWEEN -145.0 AND -40.0
),
coords_with_city_state as (
SELECT
GEO
, LAT
, LON
, NUM_PROPERTIES
, zt.CITY
, zt.STATE
, zt.COUNTY
, gmc.ZIP
, mapping.DMA_CODE
FROM geo_mean_coords gmc
JOIN ZIP_TABLE zt ON zt.ZIP = gmc.ZIP
JOIN RDC_ANALYTICS.DIM.ZIP_DMA_MAP mapping ON gmc.ZIP = mapping.ZIP_CODE
),
-- This query identifies the most viewed GEOs per user as well as
-- a single dominant GEO.
top_k_per_user AS (
SELECT
POST_PROP_WEB_CLIENT_VISITOR_ID AS RDC_VISITOR_ID
, PERSISTED_IDENTITY_ID AS MEMBER_ID
, GEO
, COUNT(*) AS N_VIEWS_IN_GEO
, DATE(MAX(HIT_DATETIME)) AS MOST_RECENT_VIEW_DATE
FROM RELEVANT_FCT_EVENTS_FILTERED
WHERE TRUE
AND (RDC_VISITOR_ID NOT IN ('', 'unknown') OR MEMBER_ID NOT IN ('', 'unknown'))
AND (RDC_VISITOR_ID IS NOT NULL OR MEMBER_ID IS NOT NULL)
GROUP BY RDC_VISITOR_ID, MEMBER_ID, GEO
QUALIFY ROW_NUMBER() over (
PARTITION BY RDC_VISITOR_ID, MEMBER_ID
ORDER BY N_VIEWS_IN_GEO
) BETWEEN 1 AND 20
),-- This query converts the format above with one row per user and geo code into
-- a format with one row per user, and a new object column "TOP_K" that maps the GEO to the number of views.
-- Note that TOP_K needs to preserve the order of the GEOs as ranked by N_VIEW_IN_GEO
agg_geos AS (
SELECT RDC_VISITOR_ID
, MEMBER_ID
, OBJECTAGG(GEO, N_VIEWS_IN_GEO) AS TOP_K
, SUM(N_VIEWS_IN_GEO) AS SUM_VIEWS_ACROSS_TOP
, MAX(N_VIEWS_IN_GEO) AS DG_VIEWS
FROM top_k_per_user
GROUP BY RDC_VISITOR_ID, MEMBER_ID
),
-- This CTE picks off the top ranked GEO from the N per-member GEOs
dominant_geos AS (
SELECT RDC_VISITOR_ID
, MEMBER_ID
, GEO AS DOMINANT_GEO
, N_VIEWS_IN_GEO
FROM top_k_per_user QUALIFY ROW_NUMBER() over (
PARTITION BY RDC_VISITOR_ID, MEMBER_ID
ORDER BY N_VIEWS_IN_GEO DESC
) = 1
),
view_dates AS (
SELECT RDC_VISITOR_ID
, MEMBER_ID
, MAX(MOST_RECENT_VIEW_DATE) AS MOST_RECENT_VIEW_DATE
, SUM(N_VIEWS_IN_GEO) AS SUM_VIEWS_ACROSS_TOP
FROM top_k_per_user
GROUP BY RDC_VISITOR_ID, MEMBER_ID
),
-- This CTE forms a long table with one row per recommendation and is used
-- to compute the spatial dispersion index by first calculating distances
-- between GEO codes and then computing the average distance.
-- TODO: Optimize this query by precomputing the GEO-GEO distance pairs instead of
-- computing them on the fly; many of these pairs are redundant and it may be
-- faster to join than to do the haversine distance calculations.
geo_dists AS (
SELECT
topk.RDC_VISITOR_ID
, topk.MEMBER_ID
, topk.GEO
, topk.N_VIEWS_IN_GEO
, dom.DOMINANT_GEO as DOMINANT_GEO
, gmc_dom.LAT as DOM_LAT
, gmc_dom.LON as DOM_LON
, gmc_relative.LAT as REL_LAT
, gmc_relative.LON as REL_LON
, HAVERSINE(DOM_LAT, DOM_LON, REL_LAT, REL_LON) as DIST_KM
, agg_geos.SUM_VIEWS_ACROSS_TOP
, topk.N_VIEWS_IN_GEO / agg_geos.SUM_VIEWS_ACROSS_TOP as VIEW_WEIGHTS
FROM top_k_per_user AS topk
JOIN dominant_geos AS dom on
topk.RDC_VISITOR_ID = dom.RDC_VISITOR_ID
and topk.MEMBER_ID = dom.MEMBER_ID
JOIN geo_mean_coords AS gmc_dom ON dom.DOMINANT_GEO = gmc_dom.GEO
JOIN geo_mean_coords AS gmc_relative ON topk.GEO = gmc_relative.GEO
JOIN agg_geos on
topk.RDC_VISITOR_ID = agg_geos.RDC_VISITOR_ID
and topk.MEMBER_ID = agg_geos.MEMBER_ID
WHERE topk.GEO != dom.DOMINANT_GEO
)
,member_with_percentile AS (
SELECT
RDC_VISITOR_ID
, MEMBER_ID
, DOMINANT_GEO
, SUM(VIEW_WEIGHTS * DIST_KM) AS MEAN_DIST_KM
FROM geo_dists
GROUP BY RDC_VISITOR_ID, MEMBER_ID, DOMINANT_GEO
)
-- This is done in a separate CTE to avoid unpredictable / undesired behavior
-- when using the PERCENT_RANK function as part of a larger query.
,percentile_ranks as (
SELECT
RDC_VISITOR_ID
, MEMBER_ID
, PERCENT_RANK() OVER (ORDER BY MEAN_DIST_KM) AS SPATIAL_DISPERSION_INDEX
FROM member_with_percentile
),
final AS (
SELECT
mwp.RDC_VISITOR_ID
, mwp.MEMBER_ID
, mwp.DOMINANT_GEO
, mwp.MEAN_DIST_KM
, agg_geos.DG_VIEWS
, agg_geos.SUM_VIEWS_ACROSS_TOP
, agg_geos.DG_VIEWS / agg_geos.SUM_VIEWS_ACROSS_TOP as DG_VIEWS_PCT
, agg_geos.TOP_K as TOP_VIEWED_GEOS
, pr.SPATIAL_DISPERSION_INDEX
, CASE --first three clauses should cover every case, but error clause is included for safety
WHEN (mwp.MEAN_DIST_KM > 160 OR SUM_VIEWS_ACROSS_TOP < 6) THEN 'low'
WHEN (mwp.MEAN_DIST_KM > 40 OR SUM_VIEWS_ACROSS_TOP < 12) THEN 'medium'
WHEN (mwp.MEAN_DIST_KM <= 40 OR SUM_VIEWS_ACROSS_TOP >= 12) THEN 'high'
ELSE 'error'
END AS BINNED_CONFIDENCE
, '1.1.1' as VERSION
, '2023-05-17 00:59:07.817185-06:00' as DG_CREATION_TIME_MST
, 14 as LOOKBACK_DAYS
, '2023-05-03' AS LOOKBACK_START_DATE
, '2023-05-17' AS LOOKBACK_END_DATE
, 'zip' as RESOLUTION
, cwc.CITY
, cwc.STATE
, cwc.COUNTY
, cwc.DMA_CODE as DOMINANT_DMA
FROM member_with_percentile mwp
JOIN agg_geos USING (RDC_VISITOR_ID, MEMBER_ID)
JOIN percentile_ranks pr USING (RDC_VISITOR_ID, MEMBER_ID)
JOIN coords_with_city_state cwc on mwp.DOMINANT_GEO = cwc.GEO
)
SELECT * FROM final
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment