Skip to content

Instantly share code, notes, and snippets.

View jason-jz-zhu's full-sized avatar

Jiazhen Zhu jason-jz-zhu

View GitHub Profile
@jason-jz-zhu
jason-jz-zhu / auto_pipeline_extactor.yaml
Created May 20, 2022 17:48
auto_pipeline_extactor.yaml
---
sources:
- name: "author"
type: "orc"
path: "data/author/"
- name: "book"
type: "orc"
path: "data/book/"
- name: "editor"
type: "orc"
@jason-jz-zhu
jason-jz-zhu / auto_pipeline_run.sh
Created May 20, 2022 17:25
auto_pipeline_run.sh
# Invoke workflowRunner.py with the extract, transform and loader YAML paths.
# The trailing backslashes join the four lines into a single command; without
# them the shell runs the script with no arguments and then tries to execute
# each --flag line as a command of its own.
python workflowRunner.py \
  --extracts_yaml="yamls/extractors.yaml" \
  --transforms_yaml="yamls/transforms.yaml" \
  --loaders_yaml="yamls/loaders.yaml"
---
# Output targets. Each list item groups the name/type/mode/path keys for one
# target; the keys must be nested under their "-" entry, otherwise they are
# read as repeated top-level keys and the document no longer parses as a list.
targets:
  - name: "fact_dim_df"
    type: "orc"
    mode: "overwrite"
    path: "data/fact_dim_tbl/"
  - name: "agg_df"
    type: "orc"
    mode: "overwrite"
    path: "data/agg_tbl/"
@jason-jz-zhu
jason-jz-zhu / auto_pipeline_transformer_with.yaml
Created May 20, 2022 17:18
auto pipeline transformer with
---
query: "with step1 as (
select firstname, id from df
), step2 as (
select gender, salary, id from df
), step3 as (
select
s1.id, s1.firstname, s2.gender, s2.salary
from step1 as s1
inner join step2 as s2
@jason-jz-zhu
jason-jz-zhu / auto_pipeline_transformer.yaml
Created May 20, 2022 17:16
auto pipeline transformer
---
steps:
- query: "SELECT
b.id,
b.title,
a.first_name AS author_first_name,
a.last_name AS author_last_name,
e.last_name AS editor,
b.type AS type,
case when t.last_name is null then 0 else t.last_name end AS translator,
@jason-jz-zhu
jason-jz-zhu / loader.yaml
Created May 20, 2022 17:13
auto pipeline loader
---
# Output targets. Each list item groups the name/type/mode/path keys for one
# target; the keys must be nested under their "-" entry, otherwise they are
# read as repeated top-level keys and the document no longer parses as a list.
targets:
  - name: "fact_dim_df"
    type: "orc"
    mode: "overwrite"
    path: "data/fact_dim_tbl/"
  - name: "agg_df"
    type: "orc"
    mode: "overwrite"
    path: "data/agg_tbl/"
@jason-jz-zhu
jason-jz-zhu / prerequest_install.sh
Created May 20, 2022 16:19
auto pipeline prerequisites
# Install the databathing package.
pip install databathing
# Install every requirement listed in requirements.txt.
pip install -r requirements.txt
#!/bin/bash
# -e: exit as soon as a command fails; -x: print each command before running
# it; -o pipefail: a pipeline fails if any of its stages fails.
set -exo pipefail
# Read the PIP_PACKAGES metadata attribute into a read-only variable.
# The `|| true` keeps a failed lookup from aborting the script under `set -e`.
readonly PACKAGES=$(/usr/share/google/get_metadata_value attributes/PIP_PACKAGES || true)
function install_pip() {
if command -v pip >/dev/null; then
echo "pip is already installed."
return 0
# `build` names an action, not a file: declare it phony so that a file or
# directory called "build" can never make the target look up to date.
.PHONY: build

# Recreate ./dist from scratch, zip ./demo_prj into it, and copy the runner,
# the environment file and the pip install script alongside the archive.
# Every recipe line must begin with a tab character.
build:
	rm -rfv ./dist
	mkdir ./dist
	zip -r ./dist/demo_prj.zip ./demo_prj
	cp workflowRunner.py ./dist/workflowRunner.py
	cp environment.yaml ./dist/environment.yaml
	cp gcp_pip_install.sh ./dist/gcp_pip_install.sh
import pandas as pd
import sys
from demo_prj.utils import pipeline
from demo_prj.utils import logger
from demo_prj.utils import spark_util
from demo_prj.utils import demo_util
from demo_prj.config import df_storage
# Module-level logger obtained from the project's logger helper, keyed by
# this module's name.
LOG = logger.get_log(__name__)
# NOTE(review): this rebinds `pipeline` from the imported module to a Pipeline
# instance, so the module itself is no longer reachable by that name below.
pipeline = pipeline.Pipeline()