Paul Singman peacing

## f_crypto_prices_daily_change_v2.sql
with yest AS (
    SELECT currency, price
    FROM f_crypto_prices
    WHERE datekey = '20220315')
, today AS (
    SELECT currency, price
    FROM f_crypto_prices
    WHERE datekey = '20220316')

SELECT

## f_crypto_prices_daily_change.sql
with yest AS (
    SELECT currency, price
    FROM f_crypto_prices
    WHERE TRUNC(datetime, 'day') = '20220315')
, today AS (
    SELECT currency, price
    FROM f_crypto_prices
    WHERE TRUNC(datetime, 'day') = '20220316')

SELECT

## crypto_alerts.sql
with begin as
  (select currency,
          max(price) as price
   from crypto_prices_delta
   where date_key = {{date_key | sqlsafe}}
     and datetime =
       (select min(datetime)
        from crypto_prices_delta
        where date_key = {{date_key | sqlsafe}})
   group by currency), end as

## hex_fetcher_loop.py
import time

counter = 0
while True:

  ################
  ## code
  ## to fetch
  ## crypto API
  ## data

## crypto_data_fetcher.py
import requests
from datetime import datetime
import pandas as pd

import awswrangler as wr
import boto3

session = boto3.session.Session(aws_access_key_id=key_access_aws, aws_secret_access_key=key_secret_aws)
s3 = session.client('s3')

## pandas_on_spark_monthly_loan_stats.py
# group by month using the loan created date and calculate some stats off the grouped data
def group_functions(x):
    d = {}

    d['loan_app_count'] = x['loan_amount'].count()
    d['loan_funded_count'] = x['funded_at'].count()
    d['conversion_rate'] = max(x['funded_at'].count() /x['created_at'].count(), 0)
    d['time_to_conversion_avg'] = x['time_to_conversion'].mean()

    return pd.Series(d, index=['loan_app_count', 'loan_funded_count',

## pandas_monthly_loan_stats.py
# group by month using the loan created date and calculate some stats off the grouped data
def group_functions(x):
    d = {}

    d['loan_app_count'] = x['loan_amount'].count()
    d['loan_funded_count'] = x['funded_at'].count()
    d['conversion_rate'] = max(x['funded_at'].count() / x['created_at'].count(), 0)
    d['time_to_conversion_avg'] = x['time_to_conversion'].mean()

    return pd.Series(d, index=['loan_app_count','loan_funded_count',

## pandas-on-spark-read-df.py
import pyspark.pandas as ps

# data path in HDFS
loans_filename = '/FileStore/tables/loans.csv'

loans_df = ps.read_csv(
  loans_filename,
  header=None,
  names=['loan_amount', 'address', 'created_at', 'funded_at'],
  infer_datetime_format=True,

## docker-compose.yml
version: "3"
services:
  lakefs-setup:
    image: treeverse/lakefs:latest
    container_name: lakefs-setup
    depends_on:
      - postgres
      - minio-setup
    environment:
      - LAKEFS_AUTH_ENCRYPT_SECRET_KEY=some random secret string

## sessions.sql
create table user_sessions
USING DELTA AS
( select user_id, event_date, event_action,
SUM(is_new_session) OVER (ORDER BY user_id, event_date) AS global_session_id,
SUM(is_new_session) OVER (PARTITION BY user_id ORDER BY event_date) AS user_session_id
FROM
(  select *,
  CASE WHEN unix_timestamp(event_date) - unix_timestamp(last_event) >= (24*60*60) OR
  last_event is NULL THEN 1 ELSE 0 END AS is_new_session
  from (
	with yest AS (
	SELECT currency, price
	FROM f_crypto_prices
	WHERE datekey = '20220315')
	, today AS (
	SELECT currency, price
	FROM f_crypto_prices
	WHERE datekey = '20220316')

	SELECT
	with begin as
	(select currency,
	max(price) as price
	from crypto_prices_delta
	where date_key = {{date_key \| sqlsafe}}
	and datetime =
	(select min(datetime)
	from crypto_prices_delta
	where date_key = {{date_key \| sqlsafe}})
	group by currency), end as
	import time

	counter = 0
	while True:

	################
	## code
	## to fetch
	## crypto API
	## data
	import requests
	from datetime import datetime
	import pandas as pd

	import awswrangler as wr
	import boto3

	session = boto3.session.Session(aws_access_key_id=key_access_aws, aws_secret_access_key=key_secret_aws)
	s3 = session.client('s3')
	# group by month using the loan created date and calculate some stats off the grouped data
	def group_functions(x):
	d = {}

	d['loan_app_count'] = x['loan_amount'].count()
	d['loan_funded_count'] = x['funded_at'].count()
	d['conversion_rate'] = max(x['funded_at'].count() /x['created_at'].count(), 0)
	d['time_to_conversion_avg'] = x['time_to_conversion'].mean()

	return pd.Series(d, index=['loan_app_count', 'loan_funded_count',
	import pyspark.pandas as ps

	# data path in HDFS
	loans_filename = '/FileStore/tables/loans.csv'

	loans_df = ps.read_csv(
	loans_filename,
	header=None,
	names=['loan_amount', 'address', 'created_at', 'funded_at'],
	infer_datetime_format=True,
	version: "3"
	services:
	lakefs-setup:
	image: treeverse/lakefs:latest
	container_name: lakefs-setup
	depends_on:
	- postgres
	- minio-setup
	environment:
	- LAKEFS_AUTH_ENCRYPT_SECRET_KEY=some random secret string
	create table user_sessions
	USING DELTA AS
	( select user_id, event_date, event_action,
	SUM(is_new_session) OVER (ORDER BY user_id, event_date) AS global_session_id,
	SUM(is_new_session) OVER (PARTITION BY user_id ORDER BY event_date) AS user_session_id
	FROM
	( select *,
	CASE WHEN unix_timestamp(event_date) - unix_timestamp(last_event) >= (246060) OR
	last_event is NULL THEN 1 ELSE 0 END AS is_new_session
	from (