Skip to content

Instantly share code, notes, and snippets.

View do-y-lee's full-sized avatar

Do Lee do-y-lee

View GitHub Profile
@do-y-lee
do-y-lee / window_func_demo_tables.sql
Last active July 18, 2020 07:53
orders and product_orders tables for window functions
DROP TABLE IF EXISTS orders;
CREATE TABLE orders (
order_id serial primary key,
order_date timestamp,
user_id integer,
amount numeric,
num_product_items integer
);
@do-y-lee
do-y-lee / window_func_partitionby_orderby.sql
Last active September 22, 2022 06:01
Window functions
SELECT
user_id,
order_id,
product_name,
amount,
-- rank items within the order by amount
ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY amount DESC) AS row_num,
-- rank items within the order by amount and highlight same values with same rank
DENSE_RANK() OVER (PARTITION BY order_id ORDER BY amount DESC) AS dense_rnk,
-- understand the percentile ranking of items in terms of amount within the order
@do-y-lee
do-y-lee / window_func_orderby_only.sql
Last active July 14, 2020 22:07
Using ORDER BY clause
SELECT
order_id,
amount,
num_product_items,
-- ranking by num_product_items
ROW_NUMBER() OVER (ORDER BY num_product_items) AS row_num_by_items,
RANK() OVER (ORDER BY num_product_items) AS rank_by_items,
DENSE_RANK() OVER (ORDER BY num_product_items) AS dense_rank_by_items,
-- ranking by amount
ROW_NUMBER() OVER (ORDER BY amount) AS row_num_by_amount,
@do-y-lee
do-y-lee / window_func_partition_by_only.sql
Last active July 22, 2020 19:00
Using only PARTITION BY
SELECT
user_id,
order_id,
product_name,
amount,
NTILE(3) OVER (PARTITION BY order_id) AS ntile_by_order_id,
FIRST_VALUE(amount) OVER (PARTITION BY order_id) AS first_amount_value_by_order_id,
COUNT(*) OVER (PARTITION BY order_id) AS count_by_order_id,
SUM(amount) OVER (PARTITION BY order_id) AS sum_by_order_id,
ROUND(AVG(amount) OVER (PARTITION BY order_id), 4) AS avg_amount_by_order_id
@do-y-lee
do-y-lee / window_func_over_with_metrics.sql
Last active July 16, 2020 07:37
Building metrics with OVER ()
SELECT
t1.order_id,
t1.order_date,
-- aggregate functions + OVER clause = window functions
SUM(t1.amount) OVER () AS total_amount,
ROUND(AVG(t1.amount) OVER (), 2) AS avg_order_size,
SUM(t1.num_product_items) OVER () AS total_product_items,
COUNT(*) OVER () AS total_count,
-- create ratios and % of total using values from window functions
ROUND(1.0 * t1.amount / SUM(t1.amount) OVER (), 4) AS percent_amount,
@do-y-lee
do-y-lee / window_func_over_clause.sql
Last active July 14, 2020 19:42
Use of OVER () clause with aggregate SQL functions
SELECT
t1.*,
SUM(t1.amount) OVER () AS total_amount,
COUNT(*) OVER () AS total_count,
AVG(t1.amount) OVER () AS total_mean
FROM orders t1;
# define our optimal random forest model
best_rfgs = rfgs.best_estimator_
# create model dict variable for output_model_stats function
model = {"rf_grid_search": best_rfgs}
df_rfgs = output_model_stats(model, X_train, y_train)
df_rfgs = df_rfgs.append(df_rfrs)
df_rfgs
# instantiate base random forest model class
rf = RandomForestClassifier(random_state=0)
# define gridsearch parameters
param_grid = {'criterion': ['gini', 'entropy'],
'n_estimators': [300, 500, 700],
'max_features': ['auto', 'log2'],
'max_depth': [3, 5, 7],
'min_samples_split': [8, 10, 12],
'min_samples_leaf': [8, 10, 12]}
# define our optimal random forest model
best_rfrs = rf_rsearch.best_estimator_
# create model dict variable for output_model_stats function
model = {"rf_randomized_search": best_rfrs}
df_rfrs = output_model_stats(model, X_train, y_train)
df_rfrs = df_rfrs.append(df_iteration_3)
df_rfrs
@do-y-lee
do-y-lee / titanic_randomizedsearchcv_initialize.py
Last active June 23, 2020 19:57
Titanic Randomized Search CV
# param_distributions
rs_grid = {'n_estimators': [100, 300, 500, 800, 1000, 1200, 1400],
'criterion': ['gini', 'entropy'],
'max_features': ['auto', 'log2'],
'max_depth': [3, 5, 7, 9, 15, 25, 30],
'min_samples_split': [2, 3, 4, 5, 7, 10, 15],
'min_samples_leaf': [1, 2, 5, 10, 12, 14]}
# instantiate RF model class for tuning
rf = RandomForestClassifier(random_state=0)