Created
May 17, 2023 17:41
-
-
Save ckrapu/a2537104c3e5a03d86cf3dbd65feb20a to your computer and use it in GitHub Desktop.
Example Snowflake query for DomZip
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INSERT INTO USER_SANDBOX.CHRIS_KRAPU.DOMGEO_DAILY_TEST_FOR_SALE ( | |
RDC_VISITOR_ID | |
, MEMBER_ID | |
, DOMINANT_GEO | |
, MEAN_DIST_KM | |
, DG_VIEWS | |
, SUM_VIEWS_ACROSS_TOP | |
, DG_VIEWS_PCT | |
, TOP_VIEWED_GEOS | |
, SPATIAL_DISPERSION_INDEX | |
, BINNED_CONFIDENCE | |
, VERSION | |
, DG_CREATION_TIME_MST | |
, LOOKBACK_DAYS | |
, LOOKBACK_START_DATE | |
, LOOKBACK_END_DATE | |
, RESOLUTION | |
, CITY | |
, STATE | |
, COUNTY | |
, DMA | |
) | |
-- This set of CTEs determines the view-weighted centroid for each GEO | |
-- as the median of the property longitudes / latitudes within that GEO. | |
-- Note that if there are multiple rdc_visitor_id for a single member_id, | |
-- we really just want to take the events per member_id so we make all those | |
-- rdc_visitor_ids into null values | |
WITH relevant_fct_events as ( | |
SELECT | |
IFF(PERSISTED_IDENTITY_ID IS NULL OR PERSISTED_IDENTITY_ID='', POST_PROP_WEB_CLIENT_VISITOR_ID, '') | |
as POST_PROP_WEB_CLIENT_VISITOR_ID | |
, PERSISTED_IDENTITY_ID | |
, HIT_DATETIME | |
, PERSISTED_PROPERTY_ID | |
, PERSISTED_CITY | |
, PERSISTED_STATE | |
, PERSISTED_ZIP | |
, 'zip' AS RESOLUTION | |
,CASE | |
WHEN resolution = 'zip' THEN LPAD(RIGHT(PERSISTED_ZIP, 5), 5, '0') | |
WHEN resolution = 'neighborhood' THEN | |
IFF( | |
--happens when neighborhood field like 'Scranton_OH' which is redundant with city/state cols | |
CONTAINS(PERSISTED_NEIGHBORHOOD, '_'), | |
REPLACE(PERSISTED_NEIGHBORHOOD, ' ', '-'), | |
TRIM( --Take off leading or trailing underscores | |
REGEXP_REPLACE( --Replace multiple dashes with single dash | |
REGEXP_REPLACE( --Get rid of "unknown" to clean up city_state combos | |
REGEXP_REPLACE( --Replace all whitespace with a single dash | |
REGEXP_REPLACE( -- form neighborhood + city + state string | |
SPLIT_PART(PERSISTED_NEIGHBORHOOD, '_', 1) | |
|| '_' | |
|| PERSISTED_CITY | |
|| '_' | |
|| PERSISTED_STATE, | |
'\\s+', '-'), | |
'[\'\.]',''), | |
'unknown_*',''), | |
'--+', '-'), | |
'_') | |
) | |
ELSE NULL | |
END AS GEO | |
FROM RDC_CORE.FACT.FCT_PRODUCT_EVENT_DETAIL as fct | |
WHERE TRUE | |
AND HIT_YEAR IN (2023) | |
AND HIT_MONTH IN (5) | |
AND DATE(HIT_DATETIME) BETWEEN '2023-05-03' AND '2023-05-17' | |
AND lower(PERSISTED_EVENT_NAME) = 'pageview' | |
AND PERSISTED_ZIP not in ('unknown', '') | |
AND PERSISTED_STATE not in ('unknown', '') | |
AND PERSISTED_CITY not in ('unknown', '') | |
), | |
RELEVANT_FCT_EVENTS_FILTERED as ( | |
SELECT * FROM relevant_fct_events | |
), | |
ZIP_TABLE as ( | |
SELECT | |
LPAD(TO_VARCHAR(ZIP),5, '0') as ZIP | |
, CITY | |
, STATE | |
, COUNTY | |
FROM RDC_ANALYTICS.DIM.US_ZIP_MAPPING | |
), | |
geo_mean_coords AS ( | |
SELECT | |
GEO | |
, MEDIAN(TRY_TO_DOUBLE(prop.PROPERTY_LATITUDE)) as LAT | |
, MEDIAN(TRY_TO_DOUBLE(prop.PROPERTY_LONGITUDE)) as LON | |
, COUNT(*) AS NUM_PROPERTIES | |
, MODE(PERSISTED_CITY) as CITY | |
, MODE(PERSISTED_STATE) as STATE | |
, MODE(PERSISTED_ZIP) as ZIP | |
FROM RELEVANT_FCT_EVENTS_FILTERED as fct | |
JOIN LEGACY_BRIDGE.BIZ_DATA_HOMES.PROPERTY as prop on fct.PERSISTED_PROPERTY_ID = prop.PROPERTY_ID | |
GROUP BY GEO | |
HAVING LON BETWEEN -145.0 AND -40.0 | |
), | |
coords_with_city_state as ( | |
SELECT | |
GEO | |
, LAT | |
, LON | |
, NUM_PROPERTIES | |
, zt.CITY | |
, zt.STATE | |
, zt.COUNTY | |
, gmc.ZIP | |
, mapping.DMA_CODE | |
FROM geo_mean_coords gmc | |
JOIN ZIP_TABLE zt ON zt.ZIP = gmc.ZIP | |
JOIN RDC_ANALYTICS.DIM.ZIP_DMA_MAP mapping ON gmc.ZIP = mapping.ZIP_CODE | |
), | |
-- This query identifies the most viewed GEOs per user as well as | |
-- a single dominant GEO. | |
top_k_per_user AS ( | |
SELECT | |
POST_PROP_WEB_CLIENT_VISITOR_ID AS RDC_VISITOR_ID | |
, PERSISTED_IDENTITY_ID AS MEMBER_ID | |
, GEO | |
, COUNT(*) AS N_VIEWS_IN_GEO | |
, DATE(MAX(HIT_DATETIME)) AS MOST_RECENT_VIEW_DATE | |
FROM RELEVANT_FCT_EVENTS_FILTERED | |
WHERE TRUE | |
AND (RDC_VISITOR_ID NOT IN ('', 'unknown') OR MEMBER_ID NOT IN ('', 'unknown')) | |
AND (RDC_VISITOR_ID IS NOT NULL OR MEMBER_ID IS NOT NULL) | |
GROUP BY RDC_VISITOR_ID, MEMBER_ID, GEO | |
QUALIFY ROW_NUMBER() over ( | |
PARTITION BY RDC_VISITOR_ID, MEMBER_ID | |
ORDER BY N_VIEWS_IN_GEO | |
) BETWEEN 1 AND 20 | |
),-- This query converts the format above with one row per user and geo code into | |
-- a format with one row per user, and a new object column "TOP_K" that maps the GEO to the number of views. | |
-- Note that TOP_K needs to preserve the order of the GEOs as ranked by N_VIEW_IN_GEO | |
agg_geos AS ( | |
SELECT RDC_VISITOR_ID | |
, MEMBER_ID | |
, OBJECTAGG(GEO, N_VIEWS_IN_GEO) AS TOP_K | |
, SUM(N_VIEWS_IN_GEO) AS SUM_VIEWS_ACROSS_TOP | |
, MAX(N_VIEWS_IN_GEO) AS DG_VIEWS | |
FROM top_k_per_user | |
GROUP BY RDC_VISITOR_ID, MEMBER_ID | |
), | |
-- This CTE picks off the top ranked GEO from the N per-member GEOs | |
dominant_geos AS ( | |
SELECT RDC_VISITOR_ID | |
, MEMBER_ID | |
, GEO AS DOMINANT_GEO | |
, N_VIEWS_IN_GEO | |
FROM top_k_per_user QUALIFY ROW_NUMBER() over ( | |
PARTITION BY RDC_VISITOR_ID, MEMBER_ID | |
ORDER BY N_VIEWS_IN_GEO DESC | |
) = 1 | |
), | |
view_dates AS ( | |
SELECT RDC_VISITOR_ID | |
, MEMBER_ID | |
, MAX(MOST_RECENT_VIEW_DATE) AS MOST_RECENT_VIEW_DATE | |
, SUM(N_VIEWS_IN_GEO) AS SUM_VIEWS_ACROSS_TOP | |
FROM top_k_per_user | |
GROUP BY RDC_VISITOR_ID, MEMBER_ID | |
), | |
-- This CTE forms a long table with one row per recommendation and is used | |
-- to compute the spatial dispersion index by first calculating distances | |
-- between GEO codes and then computing the average distance. | |
-- TODO: Optimize this query by precomputing the GEO-GEO distance pairs instead of | |
-- computing them on the fly; many of these pairs are redundant and it may be | |
-- faster to join than to do the haversine distance calculations. | |
geo_dists AS ( | |
SELECT | |
topk.RDC_VISITOR_ID | |
, topk.MEMBER_ID | |
, topk.GEO | |
, topk.N_VIEWS_IN_GEO | |
, dom.DOMINANT_GEO as DOMINANT_GEO | |
, gmc_dom.LAT as DOM_LAT | |
, gmc_dom.LON as DOM_LON | |
, gmc_relative.LAT as REL_LAT | |
, gmc_relative.LON as REL_LON | |
, HAVERSINE(DOM_LAT, DOM_LON, REL_LAT, REL_LON) as DIST_KM | |
, agg_geos.SUM_VIEWS_ACROSS_TOP | |
, topk.N_VIEWS_IN_GEO / agg_geos.SUM_VIEWS_ACROSS_TOP as VIEW_WEIGHTS | |
FROM top_k_per_user AS topk | |
JOIN dominant_geos AS dom on | |
topk.RDC_VISITOR_ID = dom.RDC_VISITOR_ID | |
and topk.MEMBER_ID = dom.MEMBER_ID | |
JOIN geo_mean_coords AS gmc_dom ON dom.DOMINANT_GEO = gmc_dom.GEO | |
JOIN geo_mean_coords AS gmc_relative ON topk.GEO = gmc_relative.GEO | |
JOIN agg_geos on | |
topk.RDC_VISITOR_ID = agg_geos.RDC_VISITOR_ID | |
and topk.MEMBER_ID = agg_geos.MEMBER_ID | |
WHERE topk.GEO != dom.DOMINANT_GEO | |
) | |
,member_with_percentile AS ( | |
SELECT | |
RDC_VISITOR_ID | |
, MEMBER_ID | |
, DOMINANT_GEO | |
, SUM(VIEW_WEIGHTS * DIST_KM) AS MEAN_DIST_KM | |
FROM geo_dists | |
GROUP BY RDC_VISITOR_ID, MEMBER_ID, DOMINANT_GEO | |
) | |
-- This is done in a separate CTE to avoid unpredictable / undesired behavior | |
-- when using the PERCENT_RANK function as part of a larger query. | |
,percentile_ranks as ( | |
SELECT | |
RDC_VISITOR_ID | |
, MEMBER_ID | |
, PERCENT_RANK() OVER (ORDER BY MEAN_DIST_KM) AS SPATIAL_DISPERSION_INDEX | |
FROM member_with_percentile | |
), | |
final AS ( | |
SELECT | |
mwp.RDC_VISITOR_ID | |
, mwp.MEMBER_ID | |
, mwp.DOMINANT_GEO | |
, mwp.MEAN_DIST_KM | |
, agg_geos.DG_VIEWS | |
, agg_geos.SUM_VIEWS_ACROSS_TOP | |
, agg_geos.DG_VIEWS / agg_geos.SUM_VIEWS_ACROSS_TOP as DG_VIEWS_PCT | |
, agg_geos.TOP_K as TOP_VIEWED_GEOS | |
, pr.SPATIAL_DISPERSION_INDEX | |
, CASE --first three clauses should cover every case, but error clause is included for safety | |
WHEN (mwp.MEAN_DIST_KM > 160 OR SUM_VIEWS_ACROSS_TOP < 6) THEN 'low' | |
WHEN (mwp.MEAN_DIST_KM > 40 OR SUM_VIEWS_ACROSS_TOP < 12) THEN 'medium' | |
WHEN (mwp.MEAN_DIST_KM <= 40 OR SUM_VIEWS_ACROSS_TOP >= 12) THEN 'high' | |
ELSE 'error' | |
END AS BINNED_CONFIDENCE | |
, '1.1.1' as VERSION | |
, '2023-05-17 00:59:07.817185-06:00' as DG_CREATION_TIME_MST | |
, 14 as LOOKBACK_DAYS | |
, '2023-05-03' AS LOOKBACK_START_DATE | |
, '2023-05-17' AS LOOKBACK_END_DATE | |
, 'zip' as RESOLUTION | |
, cwc.CITY | |
, cwc.STATE | |
, cwc.COUNTY | |
, cwc.DMA_CODE as DOMINANT_DMA | |
FROM member_with_percentile mwp | |
JOIN agg_geos USING (RDC_VISITOR_ID, MEMBER_ID) | |
JOIN percentile_ranks pr USING (RDC_VISITOR_ID, MEMBER_ID) | |
JOIN coords_with_city_state cwc on mwp.DOMINANT_GEO = cwc.GEO | |
) | |
SELECT * FROM final |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment