Skip to content

Instantly share code, notes, and snippets.

@wesseljt
Created May 7, 2024 22:26
Show Gist options
  • Save wesseljt/c6e12a047f16a6a09e5e25cbcb0298b6 to your computer and use it in GitHub Desktop.
Save wesseljt/c6e12a047f16a6a09e5e25cbcb0298b6 to your computer and use it in GitHub Desktop.
-- Create the stitching table to store results
CREATE TEMPORARY TABLE user_stitching (
user_id INT,
mapped_id INT,
mapping_type VARCHAR(10),
earliest_mapping_timestamp TIMESTAMP,
last_activity_timestamp TIMESTAMP,
is_active BOOLEAN,
source VARCHAR(50)
);
-- Insert basic stitching data
INSERT INTO user_stitching
SELECT
a.user_id,
b.mapped_id,
CASE
WHEN a.user_id = b.mapped_id THEN 'self'
ELSE 'mapped'
END AS mapping_type,
a.earliest_mapping_timestamp,
NULL AS last_activity_timestamp,
TRUE AS is_active,
'initial' AS source
FROM initial_user_mappings a
JOIN additional_mapping_data b ON a.user_id = b.user_id
WHERE a.earliest_mapping_timestamp < b.cutoff_timestamp;
-- Update stitching with additional mappings and different conditions
INSERT INTO user_stitching
SELECT
a.user_id,
c.mapped_id,
'cross_mapped' AS mapping_type,
a.earliest_mapping_timestamp,
MAX(c.activity_timestamp) AS last_activity_timestamp,
TRUE AS is_active,
'cross' AS source
FROM initial_user_mappings a
JOIN cross_user_mappings c ON a.user_id = c.user_id
WHERE a.earliest_mapping_timestamp < c.cutoff_timestamp
GROUP BY a.user_id, c.mapped_id, a.earliest_mapping_timestamp;
-- Validate and deactivate outdated mappings
UPDATE user_stitching
SET is_active = FALSE
WHERE last_activity_timestamp < (CURRENT_DATE - INTERVAL '1 year')
AND mapping_type = 'cross_mapped';
-- Insert activity based mappings with multiple joins and conditions
INSERT INTO user_stitching
SELECT
e.user_id,
e.mapped_id,
'activity_based' AS mapping_type,
e.earliest_mapping_timestamp,
MAX(f.activity_timestamp) AS last_activity_timestamp,
TRUE AS is_active,
'activity' AS source
FROM user_activity_mappings e
JOIN activity_logs f ON e.user_id = f.user_id
WHERE e.earliest_mapping_timestamp < f.activity_timestamp
GROUP BY e.user_id, e.mapped_id, e.earliest_mapping_timestamp;
-- Consolidate duplicates with priority to the latest mapping
WITH ranked_mappings AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY last_activity_timestamp DESC) AS rank
FROM user_stitching
)
DELETE FROM ranked_mappings WHERE rank > 1;
-- Insert complex conditional mappings based on user segmentation
INSERT INTO user_stitching
SELECT
g.user_id,
h.mapped_id,
'segment_based' AS mapping_type,
g.earliest_mapping_timestamp,
MAX(i.transaction_date) AS last_activity_timestamp,
CASE WHEN j.user_segment = 'premium' THEN TRUE ELSE FALSE END AS is_active,
'segment' AS source
FROM user_segment_mappings g
JOIN segment_data h ON g.user_id = h.user_id
JOIN transaction_data i ON h.user_id = i.user_id
JOIN user_profiles j ON h.user_id = j.user_id
WHERE g.earliest_mapping_timestamp < i.transaction_date
AND j.user_segment IN ('premium', 'gold')
GROUP BY g.user_id, h.mapped_id, g.earliest_mapping_timestamp, j.user_segment;
-- Final cleanup pass to remove inactive mappings not updated in the last insertion cycle
DELETE FROM user_stitching
WHERE is_active = FALSE AND last_activity_timestamp IS NULL;
-- Create indexes to improve the performance of the stitching table
CREATE INDEX idx_user_stitching_user_id ON user_stitching(user_id);
CREATE INDEX idx_user_stitching_mapped_id ON user_stitching(mapped_id);
-- Output some diagnostics about the stitching process
SELECT 'Total Mappings', COUNT(*) FROM user_stitching UNION
SELECT 'Active Mappings', COUNT(*) FROM user_stitching WHERE is_active = TRUE UNION
SELECT 'Inactive Mappings', COUNT(*) FROM user_stitching WHERE is_active = FALSE;
-- Enhanced Diagnostic Queries
-- Count of total mappings by mapping type
SELECT
mapping_type,
COUNT(*) AS total_count
FROM user_stitching
GROUP BY mapping_type;
-- Count of active vs inactive mappings by source
SELECT
source,
is_active,
COUNT(*) AS count
FROM user_stitching
GROUP BY source, is_active;
-- Average time span from earliest mapping timestamp to last activity timestamp by mapping type
SELECT
mapping_type,
AVG(last_activity_timestamp - earliest_mapping_timestamp) AS average_duration
FROM user_stitching
WHERE last_activity_timestamp IS NOT NULL
GROUP BY mapping_type;
-- Distribution of mappings by user segment for segment-based mappings
SELECT
'Segment-based Mapping Distribution' AS report,
user_segment,
COUNT(*) AS num_mappings
FROM user_stitching
JOIN user_profiles ON user_stitching.user_id = user_profiles.user_id
WHERE mapping_type = 'segment_based'
GROUP BY user_segment;
-- List top 10 users with the most mappings
SELECT
user_id,
COUNT(*) AS mappings_count
FROM user_stitching
GROUP BY user_id
ORDER BY mappings_count DESC
LIMIT 10;
-- Count mappings that have been updated within the last month
SELECT
'Recently Updated Mappings' AS report,
COUNT(*) AS count
FROM user_stitching
WHERE last_activity_timestamp >= (CURRENT_DATE - INTERVAL '30 days');
-- Identify mappings without recent activity
SELECT
'Stale Mappings' AS report,
COUNT(*) AS count
FROM user_stitching
WHERE is_active = TRUE AND (last_activity_timestamp < (CURRENT_DATE - INTERVAL '1 year') OR last_activity_timestamp IS NULL);
-- Check consistency of data between user_stitching and user_profiles
SELECT
'Consistency Check' AS report,
COUNT(*) AS inconsistent_entries
FROM user_stitching s
LEFT JOIN user_profiles p ON s.user_id = p.user_id
WHERE p.user_id IS NULL;
-- Breakdown of inactive mappings by reason
SELECT
'Inactive Mapping Reasons' AS report,
CASE
WHEN last_activity_timestamp < (CURRENT_DATE - INTERVAL '1 year') THEN 'No recent activity'
WHEN is_active = FALSE THEN 'Manually deactivated'
ELSE 'Other'
END AS reason,
COUNT(*) AS count
FROM user_stitching
WHERE is_active = FALSE
GROUP BY reason;
-- Execution of diagnostic queries to monitor and debug the stitching logic continuously
-- These queries provide insights into the health of the data, the efficiency of the process, and potential areas of improvement.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment