Robert Chang (robert8138)

  • Airbnb
  • San Francisco
@robert8138
robert8138 / incremental_load.sql
Last active July 11, 2022 12:08
Demonstrate how to do an incremental load
-- Not Recommended Approach: scan the entire table and rebuild every day
INSERT OVERWRITE TABLE dim_total_bookings PARTITION (ds = '{{ ds }}')
SELECT
    dim_market
    , SUM(m_bookings) AS m_bookings
FROM
    fct_bookings
WHERE
    ds <= '{{ ds }}' -- this is expensive, and can quickly run into scale issues
GROUP BY
    dim_market
;
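For contrast, a common incremental alternative reuses yesterday's already-computed summary and scans only the newest partition. A hedged sketch, not the gist's own code; it assumes dim_total_bookings is populated through the prior day, and uses Airflow's built-in yesterday_ds template variable:

-- Incremental sketch: combine yesterday's summary with only today's new data
INSERT OVERWRITE TABLE dim_total_bookings PARTITION (ds = '{{ ds }}')
SELECT
    dim_market
    , SUM(m_bookings) AS m_bookings
FROM (
    -- running totals computed up to yesterday
    SELECT dim_market, m_bookings FROM dim_total_bookings WHERE ds = '{{ yesterday_ds }}'
    UNION ALL
    -- only the newest day's bookings
    SELECT dim_market, m_bookings FROM fct_bookings WHERE ds = '{{ ds }}'
) unioned
GROUP BY
    dim_market
;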
@robert8138
robert8138 / pipeline_example.py
Created June 19, 2017 00:39
Pipeline Example
transforms = []
transforms.append(
    ('select_binary', ColumnSelector(features=binary))
)
transforms.append(
    ('numeric', ExtendedPipeline([
        ('select', ColumnSelector(features=numeric)),
        ('impute', Imputer(missing_values='NaN', strategy='mean', axis=0)),
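ColumnSelector and ExtendedPipeline appear to be custom helpers from the author's codebase. For reference, a minimal sketch of the same select-then-impute idea using stock scikit-learn; the binary/numeric column lists and the final estimator are assumptions:

# Minimal sketch with stock scikit-learn; `binary` and `numeric` are assumed
# to be lists of column names in a pandas DataFrame.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

preprocess = ColumnTransformer([
    ('binary', 'passthrough', binary),                     # keep binary flags as-is
    ('numeric', SimpleImputer(strategy='mean'), numeric),  # mean-impute numeric columns
])

model = Pipeline([
    ('preprocess', preprocess),
    ('clf', LogisticRegression()),  # assumed estimator; swap in your own
])
# model.fit(X_train, y_train)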
@robert8138
robert8138 / airflow_toy_example_dag.py
Last active October 26, 2020 05:49
A toy example DAG
"""
A DAG definition file in Airflow, written in Python.
"""
from datetime import datetime, timedelta
from airflow.models import DAG # Import the DAG class
from airflow.operators.bash_operator import BashOperator
from airflow.operators.sensors import TimeDeltaSensor
default_args = {
    'owner': 'you',
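A hedged sketch of how a complete toy DAG in this Airflow 1.x style might look; the schedule, dates, and task names are illustrative assumptions:

# Hedged sketch of a full toy DAG (Airflow 1.x style, matching the imports above).
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.sensors import TimeDeltaSensor

default_args = {
    'owner': 'you',
    'depends_on_past': False,
    'start_date': datetime(2020, 1, 1),  # assumed start date
}

dag = DAG(
    dag_id='toy_example_dag',
    default_args=default_args,
    schedule_interval='@daily',
)

# Wait a little past the schedule boundary before running the downstream task.
wait = TimeDeltaSensor(
    task_id='wait_a_bit',
    delta=timedelta(minutes=10),
    dag=dag,
)

say_hello = BashOperator(
    task_id='say_hello',
    bash_command='echo "Hello from {{ ds }}"',
    dag=dag,
)

wait >> say_hello  # wait_a_bit runs before say_hello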
@robert8138
robert8138 / view_count_youtube_api.py
Last active June 10, 2020 18:50
Use the YouTube API to get view trends of a popular MOOC course
import os
import pandas as pd
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import matplotlib
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]
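A hedged sketch of how these imports might be used to pull view counts; the OAuth client-secrets path and the video IDs are illustrative assumptions:

# Hedged sketch: authenticate, then fetch view counts for a few course videos.
import google_auth_oauthlib.flow
import googleapiclient.discovery
import pandas as pd

scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    "client_secret.json", scopes)        # assumed local OAuth client file
credentials = flow.run_local_server(port=0)

youtube = googleapiclient.discovery.build(
    "youtube", "v3", credentials=credentials)

# Hypothetical IDs of MOOC lecture videos whose view trends we want.
video_ids = ["VIDEO_ID_1", "VIDEO_ID_2"]
response = youtube.videos().list(
    part="statistics", id=",".join(video_ids)).execute()

views = pd.DataFrame(
    [(item["id"], int(item["statistics"]["viewCount"]))
     for item in response["items"]],
    columns=["video_id", "view_count"],
)
print(views)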
CREATE TABLE IF NOT EXISTS fct_bookings (
    id_listing BIGINT COMMENT 'Unique ID of the listing'
    , id_host BIGINT COMMENT 'Unique ID of the host who owns the listing'
    , m_bookings BIGINT COMMENT 'Denotes 1 if a booking transaction occurred'
)
PARTITIONED BY ( -- this is how we define partition keys
    ds STRING
);
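One reason the ds partition key matters: filtering on it lets the engine read a single day's files instead of scanning the whole table. A small illustrative query (the date is made up):

-- Reads only one partition's files thanks to the ds partition key.
SELECT
    id_listing
    , SUM(m_bookings) AS m_bookings
FROM
    fct_bookings
WHERE
    ds = '2020-01-01'  -- illustrative date
GROUP BY
    id_listing
;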
@robert8138
robert8138 / paremterized_workflow.sql
Last active June 8, 2020 23:44
Parameterize the workflow to incorporate backfill logic
{%- if backfill %}
INSERT OVERWRITE TABLE bookings_summary PARTITION (ds)
{%- else %}
INSERT OVERWRITE TABLE bookings_summary PARTITION (ds = '{{ ds }}')
{%- endif %}
SELECT
    dim_market
    , SUM(m_bookings) AS m_bookings
    {%- if backfill %}
    , ds
-- Define the CREATE TABLE statement here
{%- macro create_table() %}
...
{%- endmacro %}
-- Main ETL logic: insert the results into a STAGING table
{%- macro main() %}
...
{%- endmacro %}
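A hedged sketch of how these pieces might compose into a backfill-friendly job: build everything in a staging table first, then promote it into the production table. The staging table name and final insert are illustrative assumptions:

-- Hedged sketch: render the macros, then promote staging into production.
{{ create_table() }}   -- e.g. CREATE TABLE IF NOT EXISTS bookings_summary_staging ...
{{ main() }}           -- writes one partition (or many, when backfilling) into staging

{%- if backfill %}
INSERT OVERWRITE TABLE bookings_summary PARTITION (ds)
SELECT dim_market, m_bookings, ds FROM bookings_summary_staging;
{%- else %}
INSERT OVERWRITE TABLE bookings_summary PARTITION (ds = '{{ ds }}')
SELECT dim_market, m_bookings FROM bookings_summary_staging WHERE ds = '{{ ds }}';
{%- endif %}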
@robert8138
robert8138 / fct_and_dim_tables_in_action.sql
Last active February 24, 2020 13:31
Fact and Dimension Tables in Action
SELECT
    b.dim_market
    , SUM(a.m_bookings) AS m_bookings
FROM (
    SELECT
        id_listing
        , 1 AS m_bookings
        , m_a -- not used (for illustration only)
        , m_b -- not used (for illustration only)
        , m_c -- not used (for illustration only)
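A hedged sketch of how this fact/dim query likely completes: join the fact-derived rows to a listing dimension to pick up dim_market, then aggregate. The dim_listings table and its columns are assumptions beyond what the preview shows:

-- Hedged sketch of the full fact/dimension join.
SELECT
    b.dim_market
    , SUM(a.m_bookings) AS m_bookings
FROM (
    SELECT
        id_listing
        , 1 AS m_bookings
    FROM
        fct_bookings          -- fact table: one row per booking event
    WHERE
        ds = '{{ ds }}'
) a
JOIN (
    SELECT
        id_listing
        , dim_market          -- dimension table: listing attributes (assumed)
    FROM
        dim_listings
    WHERE
        ds = '{{ ds }}'
) b
ON a.id_listing = b.id_listing
GROUP BY
    b.dim_market
;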
@robert8138
robert8138 / toy_DAG_definition_file.py
Last active December 26, 2019 14:50
A toy example of a DAG definition file in Airflow
"""
A DAG docstring might be a good way to explain at a high level
what problem space the DAG is looking at.
Links to design documents, upstream dependencies, etc.
are highly recommended.
"""
from datetime import datetime, timedelta
from airflow.models import DAG # Import the DAG class
from airflow.operators.sensors import NamedHivePartitionSensor
from airflow.operators.hive_operator import HiveOperator
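A hedged sketch of how the rest of such a DAG file might look, wiring a partition sensor ahead of a Hive job; the table names, schedule, and HQL path are illustrative assumptions:

# Hedged sketch: wait for the upstream partition, then run the Hive query.
default_args = {
    'owner': 'you',
    'start_date': datetime(2020, 1, 1),   # assumed start date
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    dag_id='bookings_summary',
    default_args=default_args,
    schedule_interval='@daily',
)

# Block until the upstream fact table's partition for this ds has landed.
wait_for_fct_bookings = NamedHivePartitionSensor(
    task_id='wait_for_fct_bookings',
    partition_names=['fct_bookings/ds={{ ds }}'],
    dag=dag,
)

# Run the templated Hive query once the partition is available.
build_summary = HiveOperator(
    task_id='build_summary',
    hql='bookings_summary.hql',  # assumed path to the templated query
    dag=dag,
)

wait_for_fct_bookings >> build_summary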
# API endpoints
@webapp.route('/api/all')
def api_all():
    events = Events.query.all()
    return jsonify(json_list=[event.serialize() for event in events])


@webapp.route('/api/<event_type>')
def api_by_event_type(event_type):
    events = Events.query.filter_by(event_type=event_type).all()
    return jsonify(json_list=[event.serialize() for event in events])
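For context, a hedged sketch of the pieces these routes assume: a Flask app named webapp and an Events model with a serialize() helper. The columns and database URI are illustrative assumptions:

# Hedged sketch of the app and model the routes above rely on.
from flask import Flask, jsonify
from flask_sqlalchemy import SQLAlchemy

webapp = Flask(__name__)
webapp.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///events.db'  # assumed URI
db = SQLAlchemy(webapp)


class Events(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    event_type = db.Column(db.String(64))
    created_at = db.Column(db.DateTime)

    def serialize(self):
        # Plain dict so jsonify() can encode the row.
        return {
            'id': self.id,
            'event_type': self.event_type,
            'created_at': self.created_at.isoformat() if self.created_at else None,
        }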