Do Lee do-y-lee

## window_func_demo_tables.sql
DROP TABLE IF EXISTS orders;

CREATE TABLE orders (
  order_id serial primary key,
  order_date timestamp,
  user_id integer,
  amount numeric,
  num_product_items integer
);

## window_func_partitionby_orderby.sql
SELECT
  user_id,
  order_id,
  product_name,
  amount,
  -- rank items within the order by amount
  ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY amount DESC) AS row_num,
  -- rank items within the order by amount and highlight same values with same rank
  DENSE_RANK() OVER (PARTITION BY order_id ORDER BY amount DESC) AS dense_rnk,
  -- understand the percentile ranking of items in terms of amount within the order

## window_func_orderby_only.sql
SELECT
  order_id,
  amount,
  num_product_items,
  -- ranking by num_product_items
  ROW_NUMBER() OVER (ORDER BY num_product_items) AS row_num_by_items,
  RANK() OVER (ORDER BY num_product_items) AS rank_by_items,
  DENSE_RANK() OVER (ORDER BY num_product_items) AS dense_rank_by_items,
  -- ranking by amount
  ROW_NUMBER() OVER (ORDER BY amount) AS row_num_by_amount,

## window_func_partition_by_only.sql
SELECT
  user_id,
  order_id,
  product_name,
  amount,
  NTILE(3) OVER (PARTITION BY order_id) AS ntile_by_order_id,
  FIRST_VALUE(amount) OVER (PARTITION BY order_id) AS first_amount_value_by_order_id,
  COUNT(*) OVER (PARTITION BY order_id) AS count_by_order_id,
  SUM(amount) OVER (PARTITION BY order_id) AS sum_by_order_id,
  ROUND(AVG(amount) OVER (PARTITION BY order_id), 4) AS avg_amount_by_order_id

## window_func_over_with_metrics.sql
SELECT
  t1.order_id,
  t1.order_date,
  -- aggregate functions + OVER clause = window functions
  SUM(t1.amount) OVER () AS total_amount,
  ROUND(AVG(t1.amount) OVER (), 2) AS avg_order_size,
  SUM(t1.num_product_items) OVER () AS total_product_items,
  COUNT(*) OVER () AS total_count,
  -- create ratios and % of total using values from window functions
  ROUND(1.0 * t1.amount / SUM(t1.amount) OVER (), 4) AS percent_amount,

## window_func_over_clause.sql
SELECT
  t1.*,
  SUM(t1.amount) OVER () AS total_amount,
  COUNT(*) OVER () AS total_count,
  AVG(t1.amount) OVER () AS total_mean
FROM orders t1;

## titanic_gridsearchcv_rf_results.py
# define our optimal random forest model
best_rfgs = rfgs.best_estimator_

# create model dict variable for output_model_stats function
model = {"rf_grid_search": best_rfgs}

df_rfgs = output_model_stats(model, X_train, y_train)
df_rfgs = df_rfgs.append(df_rfrs)
df_rfgs

## titanic_gridsearchcv_initialize.py
# instantiate base random forest model class
rf = RandomForestClassifier(random_state=0)

# define gridsearch parameters
param_grid = {'criterion': ['gini', 'entropy'],
              'n_estimators': [300, 500, 700],
              'max_features': ['auto', 'log2'],
              'max_depth': [3, 5, 7],
              'min_samples_split': [8, 10, 12],
              'min_samples_leaf': [8, 10, 12]}

## titanic_randomizedsearchcv_rf_results.py
# define our optimal random forest model
best_rfrs = rf_rsearch.best_estimator_

# create model dict variable for output_model_stats function
model = {"rf_randomized_search": best_rfrs}

df_rfrs = output_model_stats(model, X_train, y_train)
df_rfrs = df_rfrs.append(df_iteration_3)
df_rfrs

## titanic_randomizedsearchcv_initialize.py
# param_distributions
rs_grid = {'n_estimators': [100, 300, 500, 800, 1000, 1200, 1400],
           'criterion': ['gini', 'entropy'],
           'max_features': ['auto', 'log2'],
           'max_depth': [3, 5, 7, 9, 15, 25, 30],
           'min_samples_split': [2, 3, 4, 5, 7, 10, 15],
           'min_samples_leaf': [1, 2, 5, 10, 12, 14]}

# instantiate RF model class for tuning
rf = RandomForestClassifier(random_state=0)
	DROP TABLE IF EXISTS orders;

	CREATE TABLE orders (
	order_id serial primary key,
	order_date timestamp,
	user_id integer,
	amount numeric,
	num_product_items integer
	);
	SELECT
	user_id,
	order_id,
	product_name,
	amount,
	-- rank items within the order by amount
	ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY amount DESC) AS row_num,
	-- rank items within the order by amount and highlight same values with same rank
	DENSE_RANK() OVER (PARTITION BY order_id ORDER BY amount DESC) AS dense_rnk,
	-- understand the percentile ranking of items in terms of amount within the order
	SELECT
	t1.order_id,
	t1.order_date,
	-- aggregate functions + OVER clause = window functions
	SUM(t1.amount) OVER () AS total_amount,
	ROUND(AVG(t1.amount) OVER (), 2) AS avg_order_size,
	SUM(t1.num_product_items) OVER () AS total_product_items,
	COUNT(*) OVER () AS total_count,
	-- create ratios and % of total using values from window functions
	ROUND(1.0 * t1.amount / SUM(t1.amount) OVER (), 4) AS percent_amount,
	SELECT
	t1.*,
	SUM(t1.amount) OVER () AS total_amount,
	COUNT(*) OVER () AS total_count,
	AVG(t1.amount) OVER () AS total_mean
	FROM orders t1;
	# define our optimal random forest model
	best_rfgs = rfgs.best_estimator_

	# create model dict variable for output_model_stats function
	model = {"rf_grid_search": best_rfgs}

	df_rfgs = output_model_stats(model, X_train, y_train)
	df_rfgs = df_rfgs.append(df_rfrs)
	df_rfgs
	# instantiate base random forest model class
	rf = RandomForestClassifier(random_state=0)

	# define gridsearch parameters
	param_grid = {'criterion': ['gini', 'entropy'],
	'n_estimators': [300, 500, 700],
	'max_features': ['auto', 'log2'],
	'max_depth': [3, 5, 7],
	'min_samples_split': [8, 10, 12],
	'min_samples_leaf': [8, 10, 12]}
	# define our optimal random forest model
	best_rfrs = rf_rsearch.best_estimator_

	# create model dict variable for output_model_stats function
	model = {"rf_randomized_search": best_rfrs}

	df_rfrs = output_model_stats(model, X_train, y_train)
	df_rfrs = df_rfrs.append(df_iteration_3)
	df_rfrs
	# param_distributions
	rs_grid = {'n_estimators': [100, 300, 500, 800, 1000, 1200, 1400],
	'criterion': ['gini', 'entropy'],
	'max_features': ['auto', 'log2'],
	'max_depth': [3, 5, 7, 9, 15, 25, 30],
	'min_samples_split': [2, 3, 4, 5, 7, 10, 15],
	'min_samples_leaf': [1, 2, 5, 10, 12, 14]}

	# instantiate RF model class for tuning
	rf = RandomForestClassifier(random_state=0)