Skip to content

Instantly share code, notes, and snippets.

@masoud-saedi
Last active February 28, 2024 16:27
Show Gist options
  • Save masoud-saedi/df21e1f95fb900db6a5405e400a80d44 to your computer and use it in GitHub Desktop.
Save masoud-saedi/df21e1f95fb900db6a5405e400a80d44 to your computer and use it in GitHub Desktop.
SQL Queries for TravelTide Customer Segmentation Project
-- Cohort definition:
WITH cohort_users AS (
SELECT user_id
FROM sessions
WHERE session_start > '2023-01-04'
GROUP BY user_id
HAVING COUNT(session_id) > 7
),
-- Using for calculating the distance between two airports in the final query:
VincentyDistance AS (
SELECT
s.user_id,
f.trip_id,
-- Constants for WGS-84 ellipsiod parameters
6378137.0 AS a, -- semi-major axis in meters
6356752.3142 AS b, -- semi-minor axis in meters
1/298.257223563 AS f, -- flattening
-- Convert degrees to radians for latitudes and longitudes
RADIANS(u.home_airport_lat) AS lat1,
RADIANS(u.home_airport_lon) AS lon1,
RADIANS(f.destination_airport_lat) AS lat2,
RADIANS(f.destination_airport_lon) AS lon2
FROM sessions AS s
JOIN flights AS f ON s.trip_id = f.trip_id
JOIN users AS u ON s.user_id = u.user_id
WHERE s.user_id IN (SELECT user_id FROM cohort_users)
),
-- Using for claculating the distance between two airports in the final query:
VincentyInitialComputations AS (
SELECT
user_id,
trip_id,
a,
b,
f,
lat1,
lon1,
lat2,
lon2,
-- Compute delta longitude
lon2 - lon1 AS L,
ATAN((1 - f) * TAN(lat1)) AS U1,
ATAN((1 - f) * TAN(lat2)) AS U2
FROM VincentyDistance
),
-- Using for claculating the distance between two airports in the final query:
VincentyIntermediateComputations AS (
SELECT
user_id,
trip_id,
a,
b,
f,
lat1,
lon1,
lat2,
lon2,
L,
U1,
U2,
SQRT(
(COS(U2)*SIN(L)) * (COS(U2)*SIN(L)) +
(COS(U1)*SIN(U2) - SIN(U1)*COS(U2)*COS(L)) * (COS(U1)*SIN(U2) - SIN(U1)*COS(U2)*COS(L))
) AS sinSigma,
SIN(U1)*SIN(U2) + COS(U1)*COS(U2)*COS(L) AS cosSigma
FROM VincentyInitialComputations
),
-- Using for claculating the distance between two airports in the final query:
VincentyComputations AS (
SELECT
user_id,
trip_id,
a,
b,
f,
lat1,
lon1,
lat2,
lon2,
L,
U1,
U2,
sinSigma,
cosSigma,
COS(U1)*COS(U2)*SIN(L) AS sinAlpha,
SQRT(1 - COS(U1)*COS(U2)*SIN(L)*COS(U1)*COS(U2)*SIN(L)) AS cos2Alpha,
cosSigma - 2*SIN(U1)*SIN(U2) / (SQRT(1 - COS(U1)*COS(U2)*SIN(L)*COS(U1)*COS(U2)*SIN(L))) AS cos2SigmaM,
a*1.0/(1.0-f) AS u_sq
FROM VincentyIntermediateComputations
),
-- Aggregate and Merge Data
aggregated_data AS (
SELECT
s.user_id,
-- total number of user's sessions
COUNT(DISTINCT s.session_id) AS session_count,
-- average duration of sessions in minute
ROUND(AVG(EXTRACT(MINUTE FROM (session_end - session_start))),2) AS avg_session_duration_minute,
-- average number of clicks in all browsing sessions
ROUND(AVG(page_clicks),2) AS avg_page_clicks,
-- total number of booked trips
COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END) AS total_trips,
-- conversion rate, dividing the number of booked trips (in case of no cancellation) by total number of browising sessions
ROUND(
CASE WHEN COUNT(DISTINCT s.session_id) > 0 THEN
1.0 * COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END) / COUNT(DISTINCT s.session_id)
ELSE 0 END
,2) AS conversion_rate,
-- Cancellation proportion, returns NULL for users who didn't book any trip to not get division by zero error
ROUND(
1.0 * COUNT(DISTINCT CASE WHEN cancellation THEN s.trip_id END) /
NULLIF(COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END), 0)
,2) AS cancellation_proportion,
-- calculating the booking to departure time gap in seconds and then days by dividing by 86400
ROUND(
AVG(EXTRACT(EPOCH FROM (f.departure_time - s.session_end)) / 86400)
,2) AS avg_booking_departure_gap_days_flights,
-- As some users only booked hotels, I add another calculation considering hotel check_in_time and later in Python will merge these two columns
ROUND(
AVG(EXTRACT(EPOCH FROM (h.check_in_time - s.session_end)) / 86400)
,2) AS avg_booking_departure_gap_days_hotels,
-- total number of flights
COUNT(DISTINCT CASE WHEN flight_booked THEN s.trip_id END) AS total_flights_booked,
/* Weekend trips proportion, to distinguish weekened gateway travelers,
when the departure time is on Fridays or Saturdays, and return_time is on Sundays or Mondays
and the duration of the trip is less than three days
*/
ROUND(
CASE WHEN COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END) > 0 THEN
1.0 * COUNT(DISTINCT CASE WHEN EXTRACT(DOW FROM departure_time) IN (5,6)
AND return_flight_booked IS TRUE
AND EXTRACT(DOW FROM return_time) IN (0,1)
AND EXTRACT(DAY FROM (return_time - departure_time)) < 3
THEN f.trip_id
ELSE NULL END) / COUNT(DISTINCT CASE WHEN NOT cancellation THEN s.trip_id END) ELSE 0 END
,2) AS weekend_trip_proportion,
-- Round trips proportion, users who booked two ways flights
ROUND(
CASE WHEN COUNT(DISTINCT CASE WHEN flight_booked THEN s.trip_id END) > 0 THEN
1.0 * COUNT(DISTINCT CASE WHEN return_flight_booked THEN s.trip_id END) /
COUNT(DISTINCT CASE WHEN flight_booked THEN s.trip_id END) ELSE 0 END
,2) AS round_trips_proportion,
-- average flight price
ROUND(AVG(base_fare_usd),2) AS avg_flight_price_usd,
-- average flight discount amount
ROUND(AVG(flight_discount_amount),2) AS avg_flight_discount_amount,
-- discounted flights proportion
ROUND(SUM(CASE WHEN flight_discount THEN 1 ELSE 0 END) :: NUMERIC / COUNT(*),2) AS discounted_flight_proportion,
-- average number of booked flight seats
ROUND(AVG(seats),2) AS avg_flight_seats,
-- average number of checked bags in flights
ROUND(AVG(checked_bags),2) AS avg_checked_bags,
-- Vincenty formula for average distance between airports (average distance flown in km)
ROUND(
AVG(
v.a * ATAN2(
SQRT((v.cos2Alpha)*(v.sinSigma*v.sinSigma)),
v.cosSigma - v.f*v.cos2Alpha*(v.cos2SigmaM)
)/1000
)::NUMERIC, 2
) AS avg_distance_flown_km,
-- total number of booked hotels
COUNT(DISTINCT CASE WHEN hotel_booked THEN s.trip_id END) AS total_hotels_booked,
-- average hotel price
ROUND(AVG(hotel_per_room_usd),2) AS avg_hotel_price_usd,
-- average hotel discount amount
ROUND(AVG(hotel_discount_amount),2) AS avg_hotel_discount_amount,
-- discounted hotel proportion
ROUND(SUM(CASE WHEN hotel_discount THEN 1 ELSE 0 END) :: NUMERIC / COUNT(*),2) AS discounted_hotel_proportion,
-- average number of rooms in booked hotels
ROUND(AVG(rooms),2) AS avg_hotel_rooms,
-- average duration of hotel stays in days
ROUND(AVG(EXTRACT(DAY FROM (check_out_time - check_in_time))),2) AS avg_stay_duration_day
FROM sessions AS s
LEFT JOIN flights AS f ON s.trip_id = f.trip_id
LEFT JOIN hotels AS h ON s.trip_id = h.trip_id
LEFT JOIN users AS u ON s.user_id = u.user_id
LEFT JOIN VincentyComputations AS v ON s.trip_id = v.trip_id
WHERE s.user_id IN (SELECT user_id FROM cohort_users)
GROUP BY s.user_id
)
-- Final Selection
SELECT
-- user demographic information
u.user_id,
u.sign_up_date,
EXTRACT(YEAR FROM AGE(u.birthdate)) AS age,
u.gender,
u.married,
u.has_children,
u.home_country,
u.home_city,
-- browsing sessions info
ad.session_count,
ad.avg_session_duration_minute,
ad.avg_page_clicks,
-- booking behaviour
ad.total_trips,
ad.conversion_rate,
ad.weekend_trip_proportion,
ad.cancellation_proportion,
ad.avg_booking_departure_gap_days_flights,
ad.avg_booking_departure_gap_days_hotels,
-- booked flights info
ad.total_flights_booked,
ad.round_trips_proportion,
ad.avg_flight_price_usd,
ad.avg_flight_discount_amount,
ad.discounted_flight_proportion,
ad.avg_flight_seats,
ad.avg_checked_bags,
ad.avg_distance_flown_km,
-- booked hotels info
ad.total_hotels_booked,
ad.avg_hotel_price_usd,
ad.avg_hotel_discount_amount,
ad.discounted_hotel_proportion,
ad.avg_hotel_rooms,
ad.avg_stay_duration_day
FROM users AS u
JOIN aggregated_data AS ad ON u.user_id = ad.user_id;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment