Skip to content

Instantly share code, notes, and snippets.

@qi-qi
Last active March 8, 2019 10:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save qi-qi/c58e48a32521991eabdbbf7c9c1e14ff to your computer and use it in GitHub Desktop.
Save qi-qi/c58e48a32521991eabdbbf7c9c1e14ff to your computer and use it in GitHub Desktop.
athena-join-parquet.sql
CREATE table test.stats_20190210_20190306 WITH (
bucketed_by = ARRAY['event_ts'],
bucket_count = 10,
format = 'orc',
external_location = 's3://acast-data-dev/stats_20190210_20190306/') AS
with t2 as (select request_id, bytes from test.cf_stitch)
SELECT cast(from_iso8601_timestamp(t1.event_date) AS timestamp) AS event_ts,
t1.best_effort_user_id,
t1.ip,
t1.ua_source,
t1.episode_url,
t1.method,
t1.status,
t1.range_req,
t1.content_length,
t2.bytes,
t1.client_type,
t1.is_rt,
transform(t1.ads, ad -> ad.i) AS ad,
t1.request_id AS cdn_request_id
FROM data_raw.batch t1 JOIN t2 ON t1.request_id = t2.request_id
WHERE t1.dt >= '2019-02-09' AND t1.dt <= '2019-03-07'
AND cast(from_iso8601_timestamp(event_date) AS Date) >= Date('2019-02-10')
AND cast(from_iso8601_timestamp(event_date) AS Date) <= Date('2019-03-06')
-- =====================
CREATE table test.stats_20190210_20190306_partition_orc WITH (
partitioned_by = ARRAY['dt'],
bucketed_by = ARRAY['event_ts'],
bucket_count = 1,
format = 'orc',
external_location = 's3://acast-data-dev/stats_20190210_20190306_partition_orc/') AS
select *, cast(event_ts as Date) as dt
from test.stats_20190210_20190306
-- ===================================
CREATE table test.stats_20190225_20190306 WITH (
partitioned_by = ARRAY['event_dt'],
bucketed_by = ARRAY['event_ts'],
bucket_count = 1,
format = 'parquet',
external_location = 's3://acast-data-dev/stats_20190225_20190306/') AS
with t2 as (select request_id, bytes from test.cf_stitch)
SELECT cast(from_iso8601_timestamp(t1.event_date) AS timestamp) AS event_ts,
t1.best_effort_user_id,
t1.ip,
t1.ua_source,
t1.episode_url,
t1.method,
t1.status,
t1.range_req,
t1.content_length,
t2.bytes,
t1.client_type,
t1.is_rt,
transform(t1.ads, ad -> ad.i) AS ad,
t1.request_id AS cdn_request_id,
cast(from_iso8601_timestamp(t1.event_date) AS Date) AS event_dt
FROM data_raw.batch t1
JOIN t2 ON t1.request_id = t2.request_id
WHERE t1.dt >= '2019-02-24'
AND t1.dt <= '2019-03-07'
AND cast(from_iso8601_timestamp(event_date) AS Date) >= Date('2019-02-25')
AND cast(from_iso8601_timestamp(event_date) AS Date) <= Date('2019-03-06');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment