Skip to content

Instantly share code, notes, and snippets.

@sh16ma
Last active February 2, 2022 07:42
Show Gist options
  • Save sh16ma/6fb9d31f8771b2a36698e1b9d2014542 to your computer and use it in GitHub Desktop.
Save sh16ma/6fb9d31f8771b2a36698e1b9d2014542 to your computer and use it in GitHub Desktop.
#🔎 #BigQuery #カラム別要約統計量 #Pandasでええやん #気候データ(サンプル利用)
#standaardSQL
with
encord_MinTemp as (select cast(MinTemp as float64) as MinTemp from `pj.ds.weatherAUS` where MinTemp <> 'NA')
,encord_MaxTemp as (select cast(MaxTemp as float64) as MaxTemp from `pj.ds.weatherAUS` where MaxTemp <> 'NA')
,encord_Rainfall as (select cast(Rainfall as float64) as Rainfall from `pj.ds.weatherAUS` where Rainfall <> 'NA')
,encord_Evaporation as (select cast(Evaporation as float64) as Evaporation from `pj.ds.weatherAUS` where Evaporation <> 'NA')
,encord_Sunshine as (select cast(Sunshine as float64) as Sunshine from `pj.ds.weatherAUS` where Sunshine <> 'NA')
,encord_WindGustSpeed as (select cast(WindGustSpeed as float64) as WindGustSpeed from `pj.ds.weatherAUS` where WindGustSpeed <> 'NA')
,encord_WindSpeed9am as (select cast(WindSpeed9am as float64) as WindSpeed9am from `pj.ds.weatherAUS` where WindSpeed9am <> 'NA')
,encord_WindSpeed3pm as (select cast(WindSpeed3pm as float64) as WindSpeed3pm from `pj.ds.weatherAUS` where WindSpeed3pm <> 'NA')
,encord_Humidity9am as (select cast(Humidity9am as float64) as Humidity9am from `pj.ds.weatherAUS` where Humidity9am <> 'NA')
,encord_Humidity3pm as (select cast(Humidity3pm as float64) as Humidity3pm from `pj.ds.weatherAUS` where Humidity3pm <> 'NA')
,encord_Pressure9am as (select cast(Pressure9am as float64) as Pressure9am from `pj.ds.weatherAUS` where Pressure9am <> 'NA')
,encord_Pressure3pm as (select cast(Pressure3pm as float64) as Pressure3pm from `pj.ds.weatherAUS` where Pressure3pm <> 'NA')
,encord_Cloud9am as (select cast(Cloud9am as float64) as Cloud9am from `pj.ds.weatherAUS` where Cloud9am <> 'NA')
,encord_Cloud3pm as (select cast(Cloud3pm as float64) as Cloud3pm from `pj.ds.weatherAUS` where Cloud3pm <> 'NA')
,encord_Temp9am as (select cast(Temp9am as float64) as Temp9am from `pj.ds.weatherAUS` where Temp9am <> 'NA')
,encord_Temp3pm as (select cast(Temp3pm as float64) as Temp3pm from `pj.ds.weatherAUS` where Temp3pm <> 'NA')
,statics_columns as (
(select 'MinTemp' as key ,count(*) as n ,avg(MinTemp) as mean ,stddev(MinTemp) as std ,min(MinTemp) as min ,(select q from (select percentile_cont(MinTemp, 0.25) over() as q from encord_MinTemp) group by q) as first_quartile ,(select q from (select percentile_cont(MinTemp, 0.5) over() as q from encord_MinTemp) group by q) as median ,(select q from (select percentile_cont(MinTemp, 0.75) over() as q from encord_MinTemp) group by q) as thrd_quartile ,max(MinTemp) as max from encord_MinTemp)
union all (select 'MaxTemp' as key ,count(*) as n ,avg(MaxTemp) as mean ,stddev(MaxTemp) as std ,min(MaxTemp) as min ,(select q from (select percentile_cont(MaxTemp, 0.25) over() as q from encord_MaxTemp) group by q) as first_quartile ,(select q from (select percentile_cont(MaxTemp, 0.5) over() as q from encord_MaxTemp) group by q) as median ,(select q from (select percentile_cont(MaxTemp, 0.75) over() as q from encord_MaxTemp) group by q) as thrd_quartile ,max(MaxTemp) as max from encord_MaxTemp)
union all (select 'Rainfall' as key ,count(*) as n ,avg(Rainfall) as mean ,stddev(Rainfall) as std ,min(Rainfall) as min ,(select q from (select percentile_cont(Rainfall, 0.25) over() as q from encord_Rainfall) group by q) as first_quartile ,(select q from (select percentile_cont(Rainfall, 0.5) over() as q from encord_Rainfall) group by q) as median ,(select q from (select percentile_cont(Rainfall, 0.75) over() as q from encord_Rainfall) group by q) as thrd_quartile ,max(Rainfall) as max from encord_Rainfall)
union all (select 'Evaporation' as key ,count(*) as n ,avg(Evaporation) as mean ,stddev(Evaporation) as std ,min(Evaporation) as min ,(select q from (select percentile_cont(Evaporation, 0.25) over() as q from encord_Evaporation) group by q) as first_quartile ,(select q from (select percentile_cont(Evaporation, 0.5) over() as q from encord_Evaporation) group by q) as median ,(select q from (select percentile_cont(Evaporation, 0.75) over() as q from encord_Evaporation) group by q) as thrd_quartile ,max(Evaporation) as max from encord_Evaporation)
union all (select 'Sunshine' as key ,count(*) as n ,avg(Sunshine) as mean ,stddev(Sunshine) as std ,min(Sunshine) as min ,(select q from (select percentile_cont(Sunshine, 0.25) over() as q from encord_Sunshine) group by q) as first_quartile ,(select q from (select percentile_cont(Sunshine, 0.5) over() as q from encord_Sunshine) group by q) as median ,(select q from (select percentile_cont(Sunshine, 0.75) over() as q from encord_Sunshine) group by q) as thrd_quartile ,max(Sunshine) as max from encord_Sunshine)
union all (select 'WindGustSpeed' as key ,count(*) as n ,avg(WindGustSpeed) as mean ,stddev(WindGustSpeed) as std ,min(WindGustSpeed) as min ,(select q from (select percentile_cont(WindGustSpeed, 0.25) over() as q from encord_WindGustSpeed) group by q) as first_quartile ,(select q from (select percentile_cont(WindGustSpeed, 0.5) over() as q from encord_WindGustSpeed) group by q) as median ,(select q from (select percentile_cont(WindGustSpeed, 0.75) over() as q from encord_WindGustSpeed) group by q) as thrd_quartile ,max(WindGustSpeed) as max from encord_WindGustSpeed)
union all (select 'WindSpeed9am' as key ,count(*) as n ,avg(WindSpeed9am) as mean ,stddev(WindSpeed9am) as std ,min(WindSpeed9am) as min ,(select q from (select percentile_cont(WindSpeed9am, 0.25) over() as q from encord_WindSpeed9am) group by q) as first_quartile ,(select q from (select percentile_cont(WindSpeed9am, 0.5) over() as q from encord_WindSpeed9am) group by q) as median ,(select q from (select percentile_cont(WindSpeed9am, 0.75) over() as q from encord_WindSpeed9am) group by q) as thrd_quartile ,max(WindSpeed9am) as max from encord_WindSpeed9am)
union all (select 'WindSpeed3pm' as key ,count(*) as n ,avg(WindSpeed3pm) as mean ,stddev(WindSpeed3pm) as std ,min(WindSpeed3pm) as min ,(select q from (select percentile_cont(WindSpeed3pm, 0.25) over() as q from encord_WindSpeed3pm) group by q) as first_quartile ,(select q from (select percentile_cont(WindSpeed3pm, 0.5) over() as q from encord_WindSpeed3pm) group by q) as median ,(select q from (select percentile_cont(WindSpeed3pm, 0.75) over() as q from encord_WindSpeed3pm) group by q) as thrd_quartile ,max(WindSpeed3pm) as max from encord_WindSpeed3pm)
union all (select 'Humidity9am' as key ,count(*) as n ,avg(Humidity9am) as mean ,stddev(Humidity9am) as std ,min(Humidity9am) as min ,(select q from (select percentile_cont(Humidity9am, 0.25) over() as q from encord_Humidity9am) group by q) as first_quartile ,(select q from (select percentile_cont(Humidity9am, 0.5) over() as q from encord_Humidity9am) group by q) as median ,(select q from (select percentile_cont(Humidity9am, 0.75) over() as q from encord_Humidity9am) group by q) as thrd_quartile ,max(Humidity9am) as max from encord_Humidity9am)
union all (select 'Humidity3pm' as key ,count(*) as n ,avg(Humidity3pm) as mean ,stddev(Humidity3pm) as std ,min(Humidity3pm) as min ,(select q from (select percentile_cont(Humidity3pm, 0.25) over() as q from encord_Humidity3pm) group by q) as first_quartile ,(select q from (select percentile_cont(Humidity3pm, 0.5) over() as q from encord_Humidity3pm) group by q) as median ,(select q from (select percentile_cont(Humidity3pm, 0.75) over() as q from encord_Humidity3pm) group by q) as thrd_quartile ,max(Humidity3pm) as max from encord_Humidity3pm)
union all (select 'Pressure9am' as key ,count(*) as n ,avg(Pressure9am) as mean ,stddev(Pressure9am) as std ,min(Pressure9am) as min ,(select q from (select percentile_cont(Pressure9am, 0.25) over() as q from encord_Pressure9am) group by q) as first_quartile ,(select q from (select percentile_cont(Pressure9am, 0.5) over() as q from encord_Pressure9am) group by q) as median ,(select q from (select percentile_cont(Pressure9am, 0.75) over() as q from encord_Pressure9am) group by q) as thrd_quartile ,max(Pressure9am) as max from encord_Pressure9am)
union all (select 'Pressure3pm' as key ,count(*) as n ,avg(Pressure3pm) as mean ,stddev(Pressure3pm) as std ,min(Pressure3pm) as min ,(select q from (select percentile_cont(Pressure3pm, 0.25) over() as q from encord_Pressure3pm) group by q) as first_quartile ,(select q from (select percentile_cont(Pressure3pm, 0.5) over() as q from encord_Pressure3pm) group by q) as median ,(select q from (select percentile_cont(Pressure3pm, 0.75) over() as q from encord_Pressure3pm) group by q) as thrd_quartile ,max(Pressure3pm) as max from encord_Pressure3pm)
union all (select 'Cloud9am' as key ,count(*) as n ,avg(Cloud9am) as mean ,stddev(Cloud9am) as std ,min(Cloud9am) as min ,(select q from (select percentile_cont(Cloud9am, 0.25) over() as q from encord_Cloud9am) group by q) as first_quartile ,(select q from (select percentile_cont(Cloud9am, 0.5) over() as q from encord_Cloud9am) group by q) as median ,(select q from (select percentile_cont(Cloud9am, 0.75) over() as q from encord_Cloud9am) group by q) as thrd_quartile ,max(Cloud9am) as max from encord_Cloud9am)
union all (select 'Cloud3pm' as key ,count(*) as n ,avg(Cloud3pm) as mean ,stddev(Cloud3pm) as std ,min(Cloud3pm) as min ,(select q from (select percentile_cont(Cloud3pm, 0.25) over() as q from encord_Cloud3pm) group by q) as first_quartile ,(select q from (select percentile_cont(Cloud3pm, 0.5) over() as q from encord_Cloud3pm) group by q) as median ,(select q from (select percentile_cont(Cloud3pm, 0.75) over() as q from encord_Cloud3pm) group by q) as thrd_quartile ,max(Cloud3pm) as max from encord_Cloud3pm)
union all (select 'Temp9am' as key ,count(*) as n ,avg(Temp9am) as mean ,stddev(Temp9am) as std ,min(Temp9am) as min ,(select q from (select percentile_cont(Temp9am, 0.25) over() as q from encord_Temp9am) group by q) as first_quartile ,(select q from (select percentile_cont(Temp9am, 0.5) over() as q from encord_Temp9am) group by q) as median ,(select q from (select percentile_cont(Temp9am, 0.75) over() as q from encord_Temp9am) group by q) as thrd_quartile ,max(Temp9am) as max from encord_Temp9am)
union all (select 'Temp3pm' as key ,count(*) as n ,avg(Temp3pm) as mean ,stddev(Temp3pm) as std ,min(Temp3pm) as min ,(select q from (select percentile_cont(Temp3pm, 0.25) over() as q from encord_Temp3pm) group by q) as first_quartile ,(select q from (select percentile_cont(Temp3pm, 0.5) over() as q from encord_Temp3pm) group by q) as median ,(select q from (select percentile_cont(Temp3pm, 0.75) over() as q from encord_Temp3pm) group by q) as thrd_quartile ,max(Temp3pm) as max from encord_Temp3pm)
)
select * from statics_columns order by key;
@sh16ma
Copy link
Author

sh16ma commented Nov 9, 2021

img

@sh16ma
Copy link
Author

sh16ma commented Nov 9, 2021

💡 GoogleSheets利用

カラムを「A1」列に転記。
以下の関数を順番にコピペ。

「B1」列にコピペ

="encord_"&A1&" as (select cast("&A1&" as float64) as "&A1&" from `pj.ds.weatherAUS` where "&A1&" <> 'NA')"

「B2」列以降にコピペ

=",encord_"&A2&" as (select cast("&A2&" as float64) as "&A2&" from `pj.ds.weatherAUS` where "&A2&" <> 'NA')"

「C1」列にコピペ

="(select '"&A1&"' as key ,count(*) as n ,avg("&A1&") as mean ,stddev("&A1&") as std ,min("&A1&") as min ,(select q from (select percentile_cont("&A1&", 0.25) over() as q from encord_"&A1&") group by q) as first_quartile ,(select q from (select percentile_cont("&A1&", 0.5) over() as q from encord_"&A1&") group by  q) as median ,(select q from (select percentile_cont("&A1&", 0.75) over() as q from encord_"&A1&") group by q) as thrd_quartile ,max("&A1&") as max from encord_"&A1&")"

「C2」列以降にコピペ

="union all (select '"&A2&"' as key ,count(*) as n ,avg("&A2&") as mean ,stddev("&A2&") as std ,min("&A2&") as min ,(select q from (select percentile_cont("&A2&", 0.25) over() as q from encord_"&A2&") group by q) as first_quartile ,(select q from (select percentile_cont("&A2&", 0.5) over() as q from encord_"&A2&") group by  q) as median ,(select q from (select percentile_cont("&A2&", 0.75) over() as q from encord_"&A2&") group by q) as thrd_quartile ,max("&A2&") as max from encord_"&A2&")"

@sh16ma
Copy link
Author

sh16ma commented Nov 10, 2021

年別×気温区分別:要約統計量

with 
    MinTemp_encd as (
        select
            Date
            , cast(MinTemp as float64) as MinTemp
        from
            `prj-test3.case_study1.weatherAUS`
        where 
            MinTemp <> 'NA'
    )
    , MinTemp_quatile as (
        select
            year
            , max(MinTemp_q1) as MinTemp_q1
            , max(MinTemp_q2) as MinTemp_q2 -- median
            , max(MinTemp_q3) as MinTemp_q3
        from
            (
            select
                format_date('%Y', Date) as year
                , percentile_cont(MinTemp, 0.25)over(partition by format_date('%Y', Date)) as MinTemp_q1
                , percentile_cont(MinTemp, 0.5)over(partition by format_date('%Y', Date)) as MinTemp_q2
                , percentile_cont(MinTemp, 0.75)over(partition by format_date('%Y', Date)) as MinTemp_q3
            from 
                MinTemp_encd
            )
        group by 
        year
    )
    , MinTemp_statistic as (
        select
            year
            , st.MinTemp_mean
            , st.MinTemp_std
            , st.MinTemp_min
            , qt.MinTemp_q1
            , qt.MinTemp_q2
            , qt.MinTemp_q3
            , st.MinTemp_max
        from
            (select
                format_date('%Y', Date) as year 
                , avg(MinTemp) as MinTemp_mean
                , stddev_pop(MinTemp) as MinTemp_std
                , min(MinTemp) as MinTemp_min
                , max(MinTemp) as MinTemp_max
            from 
                MinTemp_encd
            group by 
                year
            ) as st
        join MinTemp_quatile as qt
        using(year)
    )
    , MaxTemp_encd as (
        select
            Date
            , cast(MaxTemp as float64) as MaxTemp
        from
            `prj-test3.case_study1.weatherAUS`
        where 
            MaxTemp <> 'NA'
    )
    , MaxTemp_quatile as (
        select
            year
            , max(MaxTemp_q1) as MaxTemp_q1
            , max(MaxTemp_q2) as MaxTemp_q2 -- median
            , max(MaxTemp_q3) as MaxTemp_q3
        from
            (
            select
                format_date('%Y', Date) as year
                , percentile_cont(MaxTemp, 0.25)over(partition by format_date('%Y', Date)) as MaxTemp_q1
                , percentile_cont(MaxTemp, 0.5)over(partition by format_date('%Y', Date)) as MaxTemp_q2
                , percentile_cont(MaxTemp, 0.75)over(partition by format_date('%Y', Date)) as MaxTemp_q3
            from 
                MaxTemp_encd
            )
        group by 
        year
    )
    , MaxTemp_statistic as (
        select
            year
            , st.MaxTemp_mean
            , st.MaxTemp_std
            , st.MaxTemp_min
            , qt.MaxTemp_q1
            , qt.MaxTemp_q2
            , qt.MaxTemp_q3
            , st.MaxTemp_max
        from
            (select
                format_date('%Y', Date) as year 
                , avg(MaxTemp) as MaxTemp_mean
                , stddev_pop(MaxTemp) as MaxTemp_std
                , min(MaxTemp) as MaxTemp_min
                , max(MaxTemp) as MaxTemp_max
            from 
                MaxTemp_encd
            group by 
                year
            ) as st
        join MaxTemp_quatile as qt
        using(year)
    )
/*****************
 * OUT-PUT
 *****************/
select
    * 
from 
    MinTemp_statistic
join 
    MaxTemp_statistic using(year)
order by 
    year
;

クエリ完了(経過時間: 2.2 秒、処理されたバイト数: 2.7 MB)

img

※ 苦戦したポイント

  • ウィンドウ関数のOVER句でPARTITIONBYを年別に切り分ける点。
  • 親クエリで子クエリ(サブクエリ)の値を取得の際に、集計が伴う点(maxでもminでも可能 ∵同一数字)。
MaxTemp_quatile as (
        select
            year
            , max(MaxTemp_q1) as MaxTemp_q1
            , max(MaxTemp_q2) as MaxTemp_q2 -- median
            , max(MaxTemp_q3) as MaxTemp_q3
        from
            (
            # 年にフォーマット変換して、四分位数を取得
            select
                format_date('%Y', Date) as year
                , percentile_cont(MaxTemp, 0.25)over(partition by format_date('%Y', Date)) as MaxTemp_q1
                , percentile_cont(MaxTemp, 0.5)over(partition by format_date('%Y', Date)) as MaxTemp_q2
                , percentile_cont(MaxTemp, 0.75)over(partition by format_date('%Y', Date)) as MaxTemp_q3
            from 
                MaxTemp_encd
            )
        group by 
                year
)

@sh16ma
Copy link
Author

sh16ma commented Nov 10, 2021

ロケーション別×気温区分別 : 要約統計量

with
    loc_days as (
        select
            Location
            , count(Location) as days
        from 
            `pj.ds.weatherAUS`
        group by 
            Location
    ) 
    , MinTemp_encd as (
        select
            Location
            , Date
            , cast(MinTemp as float64) as MinTemp
        from
            `pj.ds.weatherAUS`
        where 
            MinTemp <> 'NA'
    )
    , MinTemp_quatile as (
        select
            Location
            , max(MinTemp_q1) as MinTemp_q1
            , max(MinTemp_q2) as MinTemp_q2 -- median
            , max(MinTemp_q3) as MinTemp_q3
        from
            (
            select
                Location
                , percentile_cont(MinTemp, 0.25)over(partition by Location) as MinTemp_q1
                , percentile_cont(MinTemp, 0.5)over(partition by Location) as MinTemp_q2
                , percentile_cont(MinTemp, 0.75)over(partition by Location) as MinTemp_q3
            from 
                MinTemp_encd
            )
        group by 
            Location
    )
    , MinTemp_statistic as (
        select
            Location
            , st.MinTemp_mean
            , st.MinTemp_std
            , st.MinTemp_min
            , qt.MinTemp_q1
            , qt.MinTemp_q2
            , qt.MinTemp_q3
            , st.MinTemp_max
        from
            (select
                Location
                , avg(MinTemp) as MinTemp_mean
                , stddev_pop(MinTemp) as MinTemp_std
                , min(MinTemp) as MinTemp_min
                , max(MinTemp) as MinTemp_max
            from 
                MinTemp_encd
            group by 
                Location
            ) as st
        join MinTemp_quatile as qt
        using(Location)
    )
    , MaxTemp_encd as (
        select
            Location
            , Date
            , cast(MaxTemp as float64) as MaxTemp
        from
            `pj.ds.weatherAUS`
        where 
            MaxTemp <> 'NA'
    )
    , MaxTemp_quatile as (
        select
            Location
            , max(MaxTemp_q1) as MaxTemp_q1
            , max(MaxTemp_q2) as MaxTemp_q2 -- median
            , max(MaxTemp_q3) as MaxTemp_q3
        from
            (
            select
                Location
                , percentile_cont(MaxTemp, 0.25)over(partition by Location) as MaxTemp_q1
                , percentile_cont(MaxTemp, 0.5)over(partition by Location) as MaxTemp_q2
                , percentile_cont(MaxTemp, 0.75)over(partition by Location) as MaxTemp_q3
            from 
                MaxTemp_encd
            )
        group by 
            Location
    )
    , MaxTemp_statistic as (
        select
            Location
            , st.MaxTemp_mean
            , st.MaxTemp_std
            , st.MaxTemp_min
            , qt.MaxTemp_q1
            , qt.MaxTemp_q2
            , qt.MaxTemp_q3
            , st.MaxTemp_max
        from
            (select
                Location
                , avg(MaxTemp) as MaxTemp_mean
                , stddev_pop(MaxTemp) as MaxTemp_std
                , min(MaxTemp) as MaxTemp_min
                , max(MaxTemp) as MaxTemp_max
            from 
                MaxTemp_encd
            group by 
                Location
            ) as st
        join MaxTemp_quatile as qt using(Location)
    )
/*****************
 * OUT-PUT
 *****************/
select
    * 
from 
    loc_days
join
    MinTemp_statistic using(Location)
join 
    MaxTemp_statistic using(Location)
order by 
    Location
;

クエリ完了(経過時間: 2.4 秒、処理されたバイト数: 3 MB)
img

※ サンプルデータを利用

@sh16ma
Copy link
Author

sh16ma commented Nov 10, 2021

天候区分別×気温区分別:要約統計量

with 
    MinTemp_encd as (
        select
            Date
            , MinTemp
            , CategoryTemp
        from(
            select
                Date
                , cast(MinTemp as float64) as MinTemp
                , cast(Cloud9am as float64) as Cloud9am
                , cast(Cloud3pm as float64) as Cloud3pm
                , case 
                    when (cast(Cloud9am as float64)+cast(Cloud3pm as float64))/2 <= 1 then 'Sunny+'
                    when (cast(Cloud9am as float64)+cast(Cloud3pm as float64))/2 between 2 and 8 then 'Sunny'
                    when (cast(Cloud9am as float64)+cast(Cloud3pm as float64))/2 >= 9 then 'Cloudy'
                end as CategoryTemp
            from
                `prj-test3.case_study1.weatherAUS`
            where 
                MinTemp <> 'NA'
                and 
                Cloud9am <> 'NA'
                and
                Cloud3pm <> 'NA'
        )
        where 
            CategoryTemp = 'Sunny+'
    )
    , MinTemp_quatile as (
        select
            year
            , max(MinTemp_q1) as MinTemp_q1
            , max(MinTemp_q2) as MinTemp_q2 -- median
            , max(MinTemp_q3) as MinTemp_q3
        from
            (
            select
                format_date('%Y', Date) as year
                , percentile_cont(MinTemp, 0.25)over(partition by format_date('%Y', Date)) as MinTemp_q1
                , percentile_cont(MinTemp, 0.5)over(partition by format_date('%Y', Date)) as MinTemp_q2
                , percentile_cont(MinTemp, 0.75)over(partition by format_date('%Y', Date)) as MinTemp_q3
            from 
                MinTemp_encd
            )
        group by 
        year
    )
    , MinTemp_statistic as (
        select
            year
            , st.MinTemp_mean
            , st.MinTemp_std
            , st.MinTemp_min
            , qt.MinTemp_q1
            , qt.MinTemp_q2
            , qt.MinTemp_q3
            , st.MinTemp_max
        from
            (select
                format_date('%Y', Date) as year 
                , avg(MinTemp) as MinTemp_mean
                , stddev_pop(MinTemp) as MinTemp_std
                , min(MinTemp) as MinTemp_min
                , max(MinTemp) as MinTemp_max
            from 
                MinTemp_encd
            group by 
                year
            ) as st
        join MinTemp_quatile as qt
        using(year)
    )
    , MaxTemp_encd as (
        select
            Date
            , MaxTemp
            , CategoryTemp
        from(
            select
                Date
                , cast(MaxTemp as float64) as MaxTemp
                , cast(Cloud9am as float64) as Cloud9am
                , cast(Cloud3pm as float64) as Cloud3pm
                , case 
                    when (cast(Cloud9am as float64)+cast(Cloud3pm as float64))/2 <= 1 then 'Sunny+'
                    when (cast(Cloud9am as float64)+cast(Cloud3pm as float64))/2 between 2 and 8 then 'Sunny'
                    when (cast(Cloud9am as float64)+cast(Cloud3pm as float64))/2 >= 9 then 'Cloudy'
                end as CategoryTemp
            from
                `prj-test3.case_study1.weatherAUS`
            where 
                MaxTemp <> 'NA'
                and 
                Cloud9am <> 'NA'
                and
                Cloud3pm <> 'NA'
        )
        where 
            CategoryTemp = 'Sunny+'
    )
    , MaxTemp_quatile as (
        select
            year
            , max(MaxTemp_q1) as MaxTemp_q1
            , max(MaxTemp_q2) as MaxTemp_q2 -- median
            , max(MaxTemp_q3) as MaxTemp_q3
        from
            (
            select
                format_date('%Y', Date) as year
                , percentile_cont(MaxTemp, 0.25)over(partition by format_date('%Y', Date)) as MaxTemp_q1
                , percentile_cont(MaxTemp, 0.5)over(partition by format_date('%Y', Date)) as MaxTemp_q2
                , percentile_cont(MaxTemp, 0.75)over(partition by format_date('%Y', Date)) as MaxTemp_q3
            from 
                MaxTemp_encd
            )
        group by 
        year
    )
    , MaxTemp_statistic as (
        select
            year
            , st.MaxTemp_mean
            , st.MaxTemp_std
            , st.MaxTemp_min
            , qt.MaxTemp_q1
            , qt.MaxTemp_q2
            , qt.MaxTemp_q3
            , st.MaxTemp_max
        from
            (select
                format_date('%Y', Date) as year 
                , avg(MaxTemp) as MaxTemp_mean
                , stddev_pop(MaxTemp) as MaxTemp_std
                , min(MaxTemp) as MaxTemp_min
                , max(MaxTemp) as MaxTemp_max
            from 
                MaxTemp_encd
            group by 
                year
            ) as st
        join MaxTemp_quatile as qt
        using(year)
    )

/*****************
 * OUT-PUT
 *****************/
select
    * 
from 
    MinTemp_statistic
join 
    MaxTemp_statistic using(year)
order by 
    year
;

クエリ完了(経過時間: 3.5 秒、処理されたバイト数: 3.6 MB)

img

@sh16ma
Copy link
Author

sh16ma commented Nov 10, 2021

風向き区分別 × 風量:要約統計量

with 
    WindGustDir_encd as (
        select
            WindGustDir
            , cast(WindGustSpeed as float64) as WindGustSpeed
        from
            `pj.ds.weatherAUS`
        where 
            WindGustDir <> 'NA'
            and
            WindGustSpeed <> 'NA'
    )
    , WindGustDir_quatile as (
        select
            WindGustDir
            , max(WindGustSpeed_q1) as WindGustSpeed_q1
            , max(WindGustSpeed_q2) as WindGustSpeed_q2
            , max(WindGustSpeed_q3) as WindGustSpeed_q3
        from
            (
            select
                WindGustDir
                , percentile_cont(WindGustSpeed, 0.25)over(partition by WindGustDir) as WindGustSpeed_q1
                , percentile_cont(WindGustSpeed, 0.5)over(partition by WindGustDir) as WindGustSpeed_q2
                , percentile_cont(WindGustSpeed, 0.75)over(partition by WindGustDir) as WindGustSpeed_q3
            from 
                WindGustDir_encd
            )
        group by 
            WindGustDir
    )
    , WindGustDir_statistic as (
        select
            WindGustDir
            , st.WindGustSpeed_mean
            , st.WindGustSpeed_std
            , st.WindGustSpeed_min
            , qt.WindGustSpeed_q1
            , qt.WindGustSpeed_q2
            , qt.WindGustSpeed_q3
            , st.WindGustSpeed_max
        from
            (select
                WindGustDir
                , avg(WindGustSpeed) as WindGustSpeed_mean
                , stddev_pop(WindGustSpeed) as WindGustSpeed_std
                , min(WindGustSpeed) as WindGustSpeed_min
                , max(WindGustSpeed) as WindGustSpeed_max
            from 
                WindGustDir_encd
            group by 
                WindGustDir
            ) as st
        join WindGustDir_quatile as qt 
        using(WindGustDir)
    )
/*****************
 * OUT-PUT
 *****************/
select * from WindGustDir_statistic order by WindGustDir;

クエリ完了(経過時間: 3.2 秒、処理されたバイト数: 1.1 MB)

img

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment