Skip to content

Instantly share code, notes, and snippets.

@octavian-zarzu
Created May 5, 2023 14:41
Show Gist options
  • Save octavian-zarzu/a5bb53b0d21f062e6677de1d900a2f4e to your computer and use it in GitHub Desktop.
Save octavian-zarzu/a5bb53b0d21f062e6677de1d900a2f4e to your computer and use it in GitHub Desktop.
CREATE EXTERNAL TABLE ex_nyc_phil (
raw_data TEXT
)
URL = 's3://<your_bucket>/'
OBJECT_PATTERN = '*.json'
TYPE = (JSON PARSE_AS_TEXT = 'TRUE');
SELECT raw_data FROM ex_nyc_phil;
{
"programs":[
{
"season":"1842-43",
"orchestra":"New York Philharmonic",
"concerts":[
{
"Date":"1843-02-18T05:00:00Z",
"eventType":"Subscription Season",
"Venue":"Apollo Rooms",
"Location":"Manhattan, NY",
"Time":"8:00PM"
}
],
"programID":"5178",
"works":[
{
"workTitle":"I PURITANI",
"composerName":"Bellini, Vincenzo",
"conductorName":"Hill, Ureli Corelli",
"ID":"8838*2",
"soloists":[
{
"soloistName":"Otto, Antoinette",
"soloistRoles":"S",
"soloistInstrument":"Soprano"
}
],
"movement":"Elvira (aria): \"Qui la voce...Vien, diletto\""
}
]
}
]
}
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data,'programs') AS programs_arrays FROM ex_nyc_phil;
SELECT LENGTH(JSON_EXTRACT_ARRAY_RAW(raw_data,'programs')) AS programs_arrays FROM ex_nyc_phil;
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
)
SELECT program
FROM programs
UNNEST(programs_arrays AS program);
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), concerts AS (
SELECT JSON_EXTRACT_ARRAY_RAW(program, 'concerts') as concerts_arrays
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
JSON_EXTRACT(concert, 'Date', 'TEXT')::timestamptz as concert_date,
JSON_EXTRACT(concert, 'eventType', 'TEXT') as concert_event_type,
JSON_EXTRACT(concert, 'Venue', 'TEXT') as concert_venue,
JSON_EXTRACT(concert, 'Location', 'TEXT') as concert_location,
JSON_EXTRACT(concert, 'Time', 'TEXT') as concert_time
FROM concerts
UNNEST(concerts_arrays AS concert);
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), concerts AS (
SELECT JSON_EXTRACT_ARRAY_RAW(program, 'concerts') as concerts_arrays
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
JSON_EXTRACT(concert, 'Time', 'TEXT') as concert_time,
count(*)
FROM concerts
UNNEST(concerts_arrays AS concert)
GROUP BY ALL
ORDER BY count(*) DESC;
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), concerts AS (
SELECT JSON_EXTRACT_ARRAY_RAW(program, 'concerts') as concerts_arrays
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
JSON_EXTRACT(concert, 'Venue', 'TEXT') as concert_venue,
count(*)
FROM concerts
UNNEST(concerts_arrays AS concert)
GROUP BY ALL
ORDER BY count(*) DESC;
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), works AS (
SELECT
JSON_EXTRACT(program, 'season', 'TEXT') AS season,
JSON_EXTRACT_ARRAY_RAW(program, 'works') as works_array
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
season,
JSON_EXTRACT(work, 'composerName', 'TEXT') as composer_name
FROM works
UNNEST(works_array AS work);
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), works AS (
SELECT
JSON_EXTRACT(program, 'season', 'TEXT') AS season,
JSON_EXTRACT_ARRAY_RAW(program, 'works') as works_array
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
season,
ARRAY_DISTINCT(NEST(JSON_EXTRACT(work, 'composerName', 'TEXT'))) as composer_names,
ARRAY_UNIQ(NEST(JSON_EXTRACT(work, 'composerName', 'TEXT'))) as composer_number
FROM works
UNNEST(works_array AS work)
GROUP BY ALL
ORDER BY season;
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), works AS (
SELECT
JSON_EXTRACT(program, 'season', 'TEXT') AS season,
JSON_EXTRACT_ARRAY_RAW(program, 'works') as works_array
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
season,
TRANSFORM(x -> JSON_EXTRACT(x, 'composerName', 'TEXT'), works_array) as composer_name
FROM works;
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), works AS (
SELECT
JSON_EXTRACT(program, 'season', 'TEXT') AS season,
JSON_EXTRACT_ARRAY_RAW(program, 'works') as works_array
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
season,
ARRAY_DISTINCT(FLATTEN(NEST(TRANSFORM(x -> JSON_EXTRACT(x, 'composerName', 'TEXT'), works_array)))) as composer_names,
ARRAY_UNIQ(FLATTEN(NEST(TRANSFORM(x -> JSON_EXTRACT(x, 'composerName', 'TEXT'), works_array)))) as composer_number
FROM works
GROUP BY season
ORDER BY 1 ASC;
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), works AS (
SELECT
JSON_EXTRACT(program, 'season', 'TEXT') AS season,
JSON_EXTRACT_ARRAY_RAW(program, 'works') as works_array
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
season,
TRANSFORM(x -> REGEXP_REPLACE(JSON_EXTRACT(x, 'composerName', 'TEXT'),'([A-Za-z])[a-z]+,\\s(.+)', '\\1***\,\\2'), works_array) as composer_name
FROM works;
CREATE VIEW soloists AS
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), works AS (
SELECT
JSON_EXTRACT(program, 'season', 'TEXT') AS season,
JSON_EXTRACT_ARRAY_RAW(program, 'works') as works_array
FROM programs
UNNEST(programs_arrays AS program)
)
SELECT
season,
JSON_EXTRACT(work, 'workTitle', 'TEXT') AS work_title,
JSON_EXTRACT_ARRAY_RAW(work, 'soloists') AS soloists_array
FROM works
UNNEST (works_array AS work);
SELECT
season,
work_title,
soloists_array,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistName', 'TEXT'), soloists_array) as solist_names,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistRole', 'TEXT'), soloists_array) as soloist_role,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistInstrument', 'TEXT'), soloists_array) as soloist_instruments
FROM soloists;
WITH soloists_data AS (
SELECT
season,
work_title,
soloists_array,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistName', 'TEXT'), soloists_array) as solist_names,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistRole', 'TEXT'), soloists_array) as soloist_role,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistInstrument', 'TEXT'), soloists_array) as soloist_instruments
FROM soloists
)
SELECT *
FROM soloists_data
WHERE CONTAINS(soloist_instruments,'Piano')
WITH soloists_data AS (
SELECT
season,
work_title,
soloists_array,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistName', 'TEXT'), soloists_array) as solist_names,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistRole', 'TEXT'), soloists_array) as soloist_role,
TRANSFORM(x -> JSON_EXTRACT(x, 'soloistInstrument', 'TEXT'), soloists_array) as soloist_instruments
FROM soloists
)
SELECT
solist_names,
soloist_instruments,
ARRAY_SORT(x, y -> y, solist_names, soloist_instruments) soloist_names_sorted_by_instruments
FROM soloists_data;
CREATE TABLE nyc_phil AS
WITH programs AS (
SELECT JSON_EXTRACT_ARRAY_RAW(raw_data, 'programs') AS programs_arrays
FROM ex_nyc_phil
), concerts_works AS (
SELECT
JSON_EXTRACT(program, 'season', 'TEXT') AS season,
JSON_EXTRACT(program, 'orchestra', 'TEXT') AS orchestra,
JSON_EXTRACT_ARRAY_RAW(program, 'concerts') as concerts_array,
JSON_EXTRACT(program, 'programID', 'TEXT') as program_id,
JSON_EXTRACT_ARRAY_RAW(program, 'works') as works_array
FROM programs
UNNEST(programs_arrays AS program)
), concerts_works_soloists AS (
SELECT
season,
orchestra,
JSON_EXTRACT(concert, 'Date', 'TEXT')::timestamptz as concert_date,
JSON_EXTRACT(concert, 'eventType', 'TEXT') as concert_event_type,
JSON_EXTRACT(concert, 'Venue', 'TEXT') as concert_venue,
JSON_EXTRACT(concert, 'Location', 'TEXT') as concert_location,
JSON_EXTRACT(concert, 'Time', 'TEXT') as concert_time,
program_id,
JSON_EXTRACT(work, 'workTitle', 'TEXT') as work_title,
JSON_EXTRACT(work, 'ID', 'TEXT') as work_id,
JSON_EXTRACT(work, 'conductorName', 'TEXT') as conduct_name,
JSON_EXTRACT(work, 'composerName', 'TEXT') as composer_name,
CASE WHEN JSON_EXTRACT_ARRAY_RAW(work, 'soloists') = [] THEN ['No soloists'] ELSE JSON_EXTRACT_ARRAY_RAW(work, 'soloists') END as soloists_array -- replacing empty entries where there are no soloists
FROM concerts_works
UNNEST (concerts_array as concert)
UNNEST (works_array as work)
)
SELECT
season,
orchestra,
concert_date,
concert_event_type,
concert_venue,
concert_location,
concert_time,
program_id,
work_title,
work_id,
conduct_name,
composer_name,
JSON_EXTRACT(soloist, 'soloistName', 'TEXT') as soloist_name,
JSON_EXTRACT(soloist, 'soloistRoles', 'TEXT') as soloist_roles,
JSON_EXTRACT(soloist, 'soloistInstrument', 'TEXT') as soloist_instrument
FROM concerts_works_soloists
UNNEST (soloists_array AS soloist);
SELECT * FROM nyc_phil;
FIREBOLT_ENGINE=<firebolt_engine>
FIREBOLT_DATABASE=<firebolt_db>
FIREBOLT_USER=<firebolt_user>
FIREBOLT_PASSWORD=<firebolt_password>
import os
from dotenv import load_dotenv
from firebolt.client.auth import UsernamePassword
from firebolt.db import connect
import streamlit as st
import altair as alt
import pandas as pd
import matplotlib.pyplot as plt
load_dotenv()
def get_firebolt_connection():
engine_name = os.getenv("FIREBOLT_ENGINE")
database_name = os.getenv("FIREBOLT_DATABASE")
username = os.getenv("FIREBOLT_USER")
password = os.getenv("FIREBOLT_PASSWORD")
# create a connection based on provided credentials
connection = connect(
auth=UsernamePassword(username, password),
engine_name=engine_name,
database=database_name,
)
return connection
connection = get_firebolt_connection()
cursor = connection.cursor()
st.set_page_config(layout="wide")
st.title('NY Philharmonic Performance History')
cursor.execute("""
SELECT
min(concert_date),
max(concert_date)
FROM nyc_phil
""")
(ts_min, ts_max) = cursor.fetchone()
(slider_min, slider_max) = st.slider(
"Concert Date",
min_value = ts_min,
max_value = ts_max,
value = (ts_min, ts_max),
format="DD/MM/YYYY - hh:mm")
cursor.execute("""
SELECT
COUNT(DISTINCT program_id)
FROM nyc_phil
WHERE concert_date > ?
AND concert_date < ?
""", [slider_min, slider_max])
t = cursor.fetchone()
st.write('Total number of distinct programs in the interval: ', int(t[0]))
st.write('Number of programs per season')
cursor.execute("""
SELECT
season,
COUNT(DISTINCT program_id)
FROM nyc_phil
WHERE concert_date > ?
AND concert_date < ?
GROUP BY ALL
ORDER BY season
""", [slider_min, slider_max])
program_by_season = cursor.fetchall()
program_by_season_df = pd.DataFrame(program_by_season, columns=['season','no_of_programs'])
st.bar_chart(
data=program_by_season_df,
x = 'season',
y = 'no_of_programs'
)
SELECT
season,
COUNT(DISTINCT program_id)
FROM nyc_phil
WHERE concert_date > {{stard_concert_date}}::timestamptz
AND concert_date < {{end_concert_date}}::timestamptz
GROUP BY ALL
ORDER BY season;
SELECT
season,
composer_name,
COUNT(DISTINCT work_id)
FROM nyc_phil
WHERE concert_date > {{stard_concert_date}}::timestamptz
AND concert_date < {{end_concert_date}}::timestamptz
GROUP BY ALL
HAVING COUNT(DISTINCT work_id)>25
ORDER BY season;
SELECT
concert_venue,
COUNT(DISTINCT program_id)
FROM nyc_phil
WHERE concert_date > {{stard_concert_date}}::timestamptz
AND concert_date < {{end_concert_date}}::timestamptz
GROUP BY ALL
HAVING COUNT(DISTINCT program_id) > 100;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment