Last active
February 28, 2024 16:27
-
-
Save masoud-saedi/df21e1f95fb900db6a5405e400a80d44 to your computer and use it in GitHub Desktop.
SQL Queries for TravelTide Customer Segmentation Project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Cohort definition: | |
WITH cohort_users AS ( | |
SELECT user_id | |
FROM sessions | |
WHERE session_start > '2023-01-04' | |
GROUP BY user_id | |
HAVING COUNT(session_id) > 7 | |
), | |
-- Using for calculating the distance between two airports in the final query: | |
VincentyDistance AS ( | |
SELECT | |
s.user_id, | |
f.trip_id, | |
-- Constants for WGS-84 ellipsiod parameters | |
6378137.0 AS a, -- semi-major axis in meters | |
6356752.3142 AS b, -- semi-minor axis in meters | |
1/298.257223563 AS f, -- flattening | |
-- Convert degrees to radians for latitudes and longitudes | |
RADIANS(u.home_airport_lat) AS lat1, | |
RADIANS(u.home_airport_lon) AS lon1, | |
RADIANS(f.destination_airport_lat) AS lat2, | |
RADIANS(f.destination_airport_lon) AS lon2 | |
FROM sessions AS s | |
JOIN flights AS f ON s.trip_id = f.trip_id | |
JOIN users AS u ON s.user_id = u.user_id | |
WHERE s.user_id IN (SELECT user_id FROM cohort_users) | |
), | |
-- Using for claculating the distance between two airports in the final query: | |
VincentyInitialComputations AS ( | |
SELECT | |
user_id, | |
trip_id, | |
a, | |
b, | |
f, | |
lat1, | |
lon1, | |
lat2, | |
lon2, | |
-- Compute delta longitude | |
lon2 - lon1 AS L, | |
ATAN((1 - f) * TAN(lat1)) AS U1, | |
ATAN((1 - f) * TAN(lat2)) AS U2 | |
FROM VincentyDistance | |
), | |
-- Using for claculating the distance between two airports in the final query: | |
VincentyIntermediateComputations AS ( | |
SELECT | |
user_id, | |
trip_id, | |
a, | |
b, | |
f, | |
lat1, | |
lon1, | |
lat2, | |
lon2, | |
L, | |
U1, | |
U2, | |
SQRT( | |
(COS(U2)*SIN(L)) * (COS(U2)*SIN(L)) + | |
(COS(U1)*SIN(U2) - SIN(U1)*COS(U2)*COS(L)) * (COS(U1)*SIN(U2) - SIN(U1)*COS(U2)*COS(L)) | |
) AS sinSigma, | |
SIN(U1)*SIN(U2) + COS(U1)*COS(U2)*COS(L) AS cosSigma | |
FROM VincentyInitialComputations | |
), | |
-- Using for claculating the distance between two airports in the final query: | |
VincentyComputations AS ( | |
SELECT | |
user_id, | |
trip_id, | |
a, | |
b, | |
f, | |
lat1, | |
lon1, | |
lat2, | |
lon2, | |
L, | |
U1, | |
U2, | |
sinSigma, | |
cosSigma, | |
COS(U1)*COS(U2)*SIN(L) AS sinAlpha, | |
SQRT(1 - COS(U1)*COS(U2)*SIN(L)*COS(U1)*COS(U2)*SIN(L)) AS cos2Alpha, | |
cosSigma - 2*SIN(U1)*SIN(U2) / (SQRT(1 - COS(U1)*COS(U2)*SIN(L)*COS(U1)*COS(U2)*SIN(L))) AS cos2SigmaM, | |
a*1.0/(1.0-f) AS u_sq | |
FROM VincentyIntermediateComputations | |
), | |
-- Aggregate and Merge Data | |
aggregated_data AS ( | |
SELECT | |
s.user_id, | |
-- total number of user's sessions | |
COUNT(DISTINCT s.session_id) AS session_count, | |
-- average duration of sessions in minute | |
ROUND(AVG(EXTRACT(MINUTE FROM (session_end - session_start))),2) AS avg_session_duration_minute, | |
-- average number of clicks in all browsing sessions | |
ROUND(AVG(page_clicks),2) AS avg_page_clicks, | |
-- total number of booked trips | |
COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END) AS total_trips, | |
-- conversion rate, dividing the number of booked trips (in case of no cancellation) by total number of browising sessions | |
ROUND( | |
CASE WHEN COUNT(DISTINCT s.session_id) > 0 THEN | |
1.0 * COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END) / COUNT(DISTINCT s.session_id) | |
ELSE 0 END | |
,2) AS conversion_rate, | |
-- Cancellation proportion, returns NULL for users who didn't book any trip to not get division by zero error | |
ROUND( | |
1.0 * COUNT(DISTINCT CASE WHEN cancellation THEN s.trip_id END) / | |
NULLIF(COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END), 0) | |
,2) AS cancellation_proportion, | |
-- calculating the booking to departure time gap in seconds and then days by dividing by 86400 | |
ROUND( | |
AVG(EXTRACT(EPOCH FROM (f.departure_time - s.session_end)) / 86400) | |
,2) AS avg_booking_departure_gap_days_flights, | |
-- As some users only booked hotels, I add another calculation considering hotel check_in_time and later in Python will merge these two columns | |
ROUND( | |
AVG(EXTRACT(EPOCH FROM (h.check_in_time - s.session_end)) / 86400) | |
,2) AS avg_booking_departure_gap_days_hotels, | |
-- total number of flights | |
COUNT(DISTINCT CASE WHEN flight_booked THEN s.trip_id END) AS total_flights_booked, | |
/* Weekend trips proportion, to distinguish weekened gateway travelers, | |
when the departure time is on Fridays or Saturdays, and return_time is on Sundays or Mondays | |
and the duration of the trip is less than three days | |
*/ | |
ROUND( | |
CASE WHEN COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END) > 0 THEN | |
1.0 * COUNT(DISTINCT CASE WHEN EXTRACT(DOW FROM departure_time) IN (5,6) | |
AND return_flight_booked IS TRUE | |
AND EXTRACT(DOW FROM return_time) IN (0,1) | |
AND EXTRACT(DAY FROM (return_time - departure_time)) < 3 | |
THEN f.trip_id | |
ELSE NULL END) / COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END) ELSE 0 END | |
,2) AS weekend_trip_proportion, | |
-- Round trips proportion, users who booked two ways flights | |
ROUND( | |
CASE WHEN COUNT(DISTINCT CASE WHEN flight_booked THEN s.trip_id END) > 0 THEN | |
1.0 * COUNT(DISTINCT CASE WHEN return_flight_booked THEN s.trip_id END) / | |
COUNT(DISTINCT CASE WHEN flight_booked THEN s.trip_id END) ELSE 0 END | |
,2) AS round_trips_proportion, | |
-- average flight price | |
ROUND(AVG(base_fare_usd),2) AS avg_flight_price_usd, | |
-- average flight discount amount | |
ROUND(AVG(flight_discount_amount),2) AS avg_flight_discount_amount, | |
-- discounted flights proportion | |
ROUND(SUM(CASE WHEN flight_discount THEN 1 ELSE 0 END) :: NUMERIC / COUNT(*),2) AS discounted_flight_proportion, | |
-- average number of booked flight seats | |
ROUND(AVG(seats),2) AS avg_flight_seats, | |
-- average number of checked bags in flights | |
ROUND(AVG(checked_bags),2) AS avg_checked_bags, | |
-- Vincenty formula for average distance between airports (average distance flown in km) | |
ROUND( | |
AVG( | |
v.a * ATAN2( | |
SQRT((v.cos2Alpha)*(v.sinSigma*v.sinSigma)), | |
v.cosSigma - v.f*v.cos2Alpha*(v.cos2SigmaM) | |
)/1000 | |
)::NUMERIC, 2 | |
) AS avg_distance_flown_km, | |
-- total number of booked hotels | |
COUNT(DISTINCT CASE WHEN hotel_booked THEN s.trip_id END) AS total_hotels_booked, | |
-- average hotel price | |
ROUND(AVG(hotel_per_room_usd),2) AS avg_hotel_price_usd, | |
-- average hotel discount amount | |
ROUND(AVG(hotel_discount_amount),2) AS avg_hotel_discount_amount, | |
-- discounted hotel proportion | |
ROUND(SUM(CASE WHEN hotel_discount THEN 1 ELSE 0 END) :: NUMERIC / COUNT(*),2) AS discounted_hotel_proportion, | |
-- average number of rooms in booked hotels | |
ROUND(AVG(rooms),2) AS avg_hotel_rooms, | |
-- average duration of hotel stays in days | |
ROUND(AVG(EXTRACT(DAY FROM (check_out_time - check_in_time))),2) AS avg_stay_duration_day | |
FROM sessions AS s | |
LEFT JOIN flights AS f ON s.trip_id = f.trip_id | |
LEFT JOIN hotels AS h ON s.trip_id = h.trip_id | |
LEFT JOIN users AS u ON s.user_id = u.user_id | |
LEFT JOIN VincentyComputations AS v ON s.trip_id = v.trip_id | |
WHERE s.user_id IN (SELECT user_id FROM cohort_users) | |
GROUP BY s.user_id | |
) | |
-- Final Selection | |
SELECT | |
-- user demographic information | |
u.user_id, | |
u.sign_up_date, | |
EXTRACT(YEAR FROM AGE(u.birthdate)) AS age, | |
u.gender, | |
u.married, | |
u.has_children, | |
u.home_country, | |
u.home_city, | |
-- browsing sessions info | |
ad.session_count, | |
ad.avg_session_duration_minute, | |
ad.avg_page_clicks, | |
-- booking behaviour | |
ad.total_trips, | |
ad.conversion_rate, | |
ad.weekend_trip_proportion, | |
ad.cancellation_proportion, | |
ad.avg_booking_departure_gap_days_flights, | |
ad.avg_booking_departure_gap_days_hotels, | |
-- booked flights info | |
ad.total_flights_booked, | |
ad.round_trips_proportion, | |
ad.avg_flight_price_usd, | |
ad.avg_flight_discount_amount, | |
ad.discounted_flight_proportion, | |
ad.avg_flight_seats, | |
ad.avg_checked_bags, | |
ad.avg_distance_flown_km, | |
-- booked hotels info | |
ad.total_hotels_booked, | |
ad.avg_hotel_price_usd, | |
ad.avg_hotel_discount_amount, | |
ad.discounted_hotel_proportion, | |
ad.avg_hotel_rooms, | |
ad.avg_stay_duration_day | |
FROM users AS u | |
JOIN aggregated_data AS ad ON u.user_id = ad.user_id; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment