Skip to content

Instantly share code, notes, and snippets.

View running_total.sql
--- 2) Running total/frequency
SELECT
DAT.NUM_VAR,
SUM(NUM_VAR) OVER (PARTITION BY JOIN_ID) AS TOTAL_SUM,
ROUND(CUM_SUM / SUM(NUM_VAR) OVER (PARTITION BY JOIN_ID), 4) AS CUM_FREQ
FROM
(
SELECT
T.*,
SUM(NUM_VAR) OVER (ORDER BY NUM_VAR ROWS UNBOUNDED PRECEDING) AS CUM_SUM,
View conditional_where.sql
-- 4) Conditional where clause
SELECT
DAT.ID_VAR,
DAT.SEQ_VAR,
DAT.NUM_VAR,
DATE_VAR1,
DATE_VAR2,
TRUNC(DATE_VAR2) - TRUNC(DATE_VAR1) AS LAG_IN_DATES
FROM
CURRENT_TABLE DAT
View lag.sql
--- 5) LAG() or LEAD() function
SELECT
DAT.ID_VAR,
DAT.SEQ_VAR,
DAT.NUM_VAR,
NUM_VAR - PREV_NUM AS NUM_DIFF
FROM
(
SELECT
T.*,
View getSQL.py
import pandas as pd
def getSQL(sql_query,
place_holder_str,
replace_place_holder_with,
database_con):
'''
Args:
sql_query: sql query file
place_holder_str: string in the original sql query that is to be replaced
replace_place_holder_with: real values that should be put in
View SQL_FILE.sql
SELECT
*
FROM
CURRENT_TABLE DAT
WHERE
ID_VAR IN ('ID_LIST')
ORDER BY ID_VAR, SEQ_VAR
View call_getSQL.py
seq12_df = getSQL('SQL_FILE.sql', 'ID_LIST', "','".join(['19228', '19272']), database_con=conn)
View regularExpression.sql
-- Find and extract numbers between 0 - 9 that consecutively happens 5 times
SELECT
SUBSTRING(LONG_TEXT, REG_IDX, REG_IDX+5) AS NUMBER_LIST_FOUND
FROM
(
SELECT
REGEXP_INSTR(LONG_TEXT, '[0-9]{5}') AS REG_IDX,
LONG_TEXT
FROM
BONUS
@YiLi225
YiLi225 / helper.R
Created Mar 29, 2020
ggplot line chart helper function
View helper.R
#### helper function for trajectory line chart:
#### note: there is global variable in this function!!!
helper_vis_continuous <- function(dat = country_data, country = 'US') {
## vertical structure for ggplot
current_dat = dat %>%
filter(Country.Region == country) %>%
reshape2::melt(.) %>%
set_colnames(c('Country', 'Status', 'Date', 'Total'))
## starts with the date when 1st case was confirmed for this country
@YiLi225
YiLi225 / wide_to_long.R
Created Mar 29, 2020
Create plotting dataset for plotly
View wide_to_long.R
#### Select the countries for plotting, and convert wide format to long
current_dat = country_data %>%
filter(Country.Region %in% selected_countries) %>%
reshape2::melt(.) %>%
set_colnames(c('Country', 'Status', 'Date', 'Total'))
## starts with the date when 1st case confirmed
find_case1_onwards <- function(country_name) {
case1_idx = which(current_dat[current_dat$Country == country_name, 'Total'] > 0)[1]
@YiLi225
YiLi225 / check_cols.R
Last active Mar 29, 2020
Check the columns of the datasets
View check_cols.R
### check whether the column names of 3 datasets match up
columns = sapply(series_data_, colnames)
### !!! The code below certainly works for datasets with small numbers of columns,
### However, what if we have 1000 columns to do pair-wise checking,
### or additional columns being added to the datasource?
all(columns[, 1] == columns[, 2])
all(columns[, 2] == columns[, 3])
all(columns[, 1] == columns[, 3])