Kat Li YiLi225

## running_total.sql
--- 2) Running total/frequency
SELECT
  DAT.NUM_VAR,
  SUM(NUM_VAR) OVER (PARTITION BY JOIN_ID) AS TOTAL_SUM,
  ROUND(CUM_SUM / SUM(NUM_VAR) OVER (PARTITION BY JOIN_ID), 4) AS CUM_FREQ
FROM
(
  SELECT
	 T.*,
	 SUM(NUM_VAR) OVER (ORDER BY NUM_VAR ROWS UNBOUNDED PRECEDING) AS CUM_SUM,

## conditional_where.sql
-- 4) Conditional where clause
SELECT
  DAT.ID_VAR,
  DAT.SEQ_VAR,
  DAT.NUM_VAR,
  DATE_VAR1,
  DATE_VAR2,
  TRUNC(DATE_VAR2) - TRUNC(DATE_VAR1) AS LAG_IN_DATES
FROM
  CURRENT_TABLE      DAT

## lag.sql
--- 5) LAG() or LEAD() function
SELECT
 DAT.ID_VAR,
 DAT.SEQ_VAR,
 DAT.NUM_VAR,
 NUM_VAR - PREV_NUM AS NUM_DIFF
FROM
(
	SELECT
	 T.*,

## getSQL.py
import pandas as pd
def getSQL(sql_query,
           place_holder_str,
           replace_place_holder_with,
           database_con):
    '''
    Args:
        sql_query: sql query file
        place_holder_str: string in the original sql query that is to be replaced
        replace_place_holder_with: real values that should be put in

## SQL_FILE.sql
SELECT
  *
FROM
  CURRENT_TABLE   DAT
WHERE
  ID_VAR IN ('ID_LIST')
ORDER BY ID_VAR, SEQ_VAR

## call_getSQL.py
seq12_df = getSQL('SQL_FILE.sql', 'ID_LIST', "','".join(['19228', '19272']), database_con=conn)

## regularExpression.sql
-- Find and extract numbers between 0 - 9 that consecutively happens 5 times
SELECT
  SUBSTRING(LONG_TEXT, REG_IDX, REG_IDX+5) AS NUMBER_LIST_FOUND
FROM
(
  SELECT
    REGEXP_INSTR(LONG_TEXT, '[0-9]{5}') AS REG_IDX,
    LONG_TEXT
  FROM
    BONUS

## helper.R
#### helper function for trajectory line chart:
#### note: there is global variable in this function!!!
helper_vis_continuous <- function(dat = country_data, country = 'US') {
  ## vertical structure for ggplot
  current_dat = dat %>%
    filter(Country.Region == country) %>%
    reshape2::melt(.) %>%
    set_colnames(c('Country', 'Status', 'Date', 'Total'))

  ## starts with the date when 1st case was confirmed for this country

## wide_to_long.R
#### Select the countries for plotting, and convert wide format to long
current_dat = country_data %>%
  filter(Country.Region %in% selected_countries) %>%
  reshape2::melt(.) %>%
  set_colnames(c('Country', 'Status', 'Date', 'Total'))

## starts with the date when 1st case confirmed
find_case1_onwards <- function(country_name) {
  case1_idx = which(current_dat[current_dat$Country == country_name, 'Total'] > 0)[1]


## check_cols.R
### check whether the column names of 3 datasets match up
columns = sapply(series_data_, colnames)

### !!! The code below certainly works for datasets with small numbers of columns,
### However, what if we have 1000 columns to do pair-wise checking,
### or additional columns being added to the datasource?
all(columns[, 1] == columns[, 2])
all(columns[, 2] == columns[, 3])
all(columns[, 1] == columns[, 3])
	--- 2) Running total/frequency
	SELECT
	DAT.NUM_VAR,
	SUM(NUM_VAR) OVER (PARTITION BY JOIN_ID) AS TOTAL_SUM,
	ROUND(CUM_SUM / SUM(NUM_VAR) OVER (PARTITION BY JOIN_ID), 4) AS CUM_FREQ
	FROM
	(
	SELECT
	T.*,
	SUM(NUM_VAR) OVER (ORDER BY NUM_VAR ROWS UNBOUNDED PRECEDING) AS CUM_SUM,
	-- 4) Conditional where clause
	SELECT
	DAT.ID_VAR,
	DAT.SEQ_VAR,
	DAT.NUM_VAR,
	DATE_VAR1,
	DATE_VAR2,
	TRUNC(DATE_VAR2) - TRUNC(DATE_VAR1) AS LAG_IN_DATES
	FROM
	CURRENT_TABLE DAT
	--- 5) LAG() or LEAD() function
	SELECT
	DAT.ID_VAR,
	DAT.SEQ_VAR,
	DAT.NUM_VAR,
	NUM_VAR - PREV_NUM AS NUM_DIFF
	FROM
	(
	SELECT
	T.*,
	import pandas as pd
	def getSQL(sql_query,
	place_holder_str,
	replace_place_holder_with,
	database_con):
	'''
	Args:
	sql_query: sql query file
	place_holder_str: string in the original sql query that is to be replaced
	replace_place_holder_with: real values that should be put in
	-- Find and extract numbers between 0 - 9 that consecutively happens 5 times
	SELECT
	SUBSTRING(LONG_TEXT, REG_IDX, REG_IDX+5) AS NUMBER_LIST_FOUND
	FROM
	(
	SELECT
	REGEXP_INSTR(LONG_TEXT, '[0-9]{5}') AS REG_IDX,
	LONG_TEXT
	FROM
	BONUS
	#### helper function for trajectory line chart:
	#### note: there is global variable in this function!!!
	helper_vis_continuous <- function(dat = country_data, country = 'US') {
	## vertical structure for ggplot
	current_dat = dat %>%
	filter(Country.Region == country) %>%
	reshape2::melt(.) %>%
	set_colnames(c('Country', 'Status', 'Date', 'Total'))

	## starts with the date when 1st case was confirmed for this country
	#### Select the countries for plotting, and convert wide format to long
	current_dat = country_data %>%
	filter(Country.Region %in% selected_countries) %>%
	reshape2::melt(.) %>%
	set_colnames(c('Country', 'Status', 'Date', 'Total'))

	## starts with the date when 1st case confirmed
	find_case1_onwards <- function(country_name) {
	case1_idx = which(current_dat[current_dat$Country == country_name, 'Total'] > 0)[1]
	### check whether the column names of 3 datasets match up
	columns = sapply(series_data_, colnames)

	### !!! The code below certainly works for datasets with small numbers of columns,
	### However, what if we have 1000 columns to do pair-wise checking,
	### or additional columns being added to the datasource?
	all(columns[, 1] == columns[, 2])
	all(columns[, 2] == columns[, 3])
	all(columns[, 1] == columns[, 3])