Skip to content

Instantly share code, notes, and snippets.

@hinkelman
Last active March 26, 2024 23:25

Revisions

  1. hinkelman revised this gist Mar 26, 2024. 1 changed file with 80 additions and 103 deletions.
    183 changes: 80 additions & 103 deletions data-transformation.ss
    Original file line number Diff line number Diff line change
    @@ -1,104 +1,86 @@
    (import (chez-stats)
    (dataframe))
    (import (dataframe))

    (define flights
    (-> (read-delim "nycflights.tsv" #\tab)
    (rowtable->dataframe)))

    ;; using this as a check of missingness
    (define (all-true? . x)
    (for-all (lambda (y) y) x))
    (define flights (tsv->dataframe "nycflights.tsv"))

    (dataframe-dim flights)

    ;; 4.1 Introduction
    ;; 3.1 Introduction

    ;; 4.1.3
    ;; 3.1.3

    (-> flights
    (dataframe-filter*
    (arr_delay dest)
    (and (all-true? arr_delay)
    (string=? dest "IAH")))
    (dataframe-filter* (dest) (string=? dest "IAH"))
    (dataframe-aggregate*
    (year month day)
    (arr_delay (arr_delay) (sum arr_delay)))
    (arr_delay (arr_delay) (inexact (mean arr_delay))))
    (dataframe-display))

    ;; 4.2 Rows
    ;; 3.2 Rows

    ;; 4.2.1
    ;; 3.2.1

    (define delayed-flights
    (-> flights
    (dataframe-filter*
    (dep_delay)
    (and (all-true? dep_delay)
    (and (not (na? dep_delay))
    (> dep_delay 120)))))

    (apply min ($ delayed-flights 'dep_delay))

    (-> flights
    (dataframe-filter*
    (month day)
    (and (all-true? month day)
    (and (not (or (na? month) (na? day)))
    (= month 1)
    (= day 1)))
    (dataframe-display))
    (dataframe-glimpse))

    (define jan-feb-flights
    (-> flights
    (dataframe-filter*
    (month)
    (and (all-true? month)
    (and (not (na? month))
    (or (= month 1)
    (= month 2))))))

    (dataframe-values-unique jan-feb-flights 'month)
    (remove-duplicates ($ jan-feb-flights 'month))

    (-> flights
    (dataframe-filter*
    (month)
    (and (all-true? month)
    (and (not (na? month))
    (member month '(1 2))))
    (dataframe-display))

    (define jan1
    (-> flights
    (dataframe-filter*
    (month day)
    (and (all-true? month)
    (= month 1)
    (= day 1)))))
    (dataframe-glimpse))

    ;; 4.2.3
    ;; 3.2.3

    (-> flights
    (dataframe-filter*
    (year month day dep_time)
    (all-true? year month day dep_time))
    (not (or (na? year) (na? month) (na? day) (na? dep_time))))
    (dataframe-sort*
    (< year) (< month) (< day) (< dep_time))
    (dataframe-display))
    (dataframe-glimpse))

    (-> flights
    (dataframe-filter*
    (dep_delay)
    (all-true? dep_delay))
    (not (na? dep_delay)))
    (dataframe-sort*
    (> dep_delay))
    (dataframe-display))
    (dataframe-glimpse))

    ;; 4.2.4
    ;; 3.2.4

    (-> flights
    (dataframe-unique)
    (dataframe-display))
    (dataframe-glimpse))

    (-> flights
    (dataframe-select 'origin 'dest)
    (dataframe-select* origin dest)
    (dataframe-unique)
    (dataframe-display))
    (dataframe-glimpse))

    ;; .keep_all in dplyr::distinct keeps all columns
    ;; for the first occurrence of the unique combos
    @@ -112,226 +94,221 @@
    (> n))
    (dataframe-display))

    ;; 4.2.5 Exercises
    ;; 3.2.5 Exercises
    ;; not included in blog post (and only did 1-4)

    ;; 1. Single pipeline for each condition

    (-> flights
    (dataframe-filter*
    (arr_delay)
    (and (all-true? arr_delay)
    (and (not (na? arr_delay))
    (>= arr_delay 2)))
    (dataframe-display))
    (dataframe-glimpse))

    (-> flights
    (dataframe-filter*
    (dest)
    (member dest '("IAH" "HOU")))
    (dataframe-display))
    (dataframe-glimpse))

    (-> flights
    (dataframe-filter*
    (carrier)
    (member carrier '("UA" "AA" "DL")))
    (dataframe-display))
    (dataframe-glimpse))

    (-> flights
    (dataframe-filter*
    (month)
    (and (all-true? month)
    (and (not (na? month))
    (member month '(7 8 9))))
    (dataframe-display))
    (dataframe-glimpse)))

    (-> flights
    (dataframe-filter*
    (dep_delay arr_delay)
    (and (all-true? dep_delay arr_delay)
    (and (not (or (na? dep_delay) (na? arr_delay)))
    (<= dep_delay 0)
    (>= arr_delay 120)))
    (dataframe-display))
    (dataframe-glimpse))

    (-> flights
    (dataframe-filter*
    (dep_delay arr_delay)
    (and (all-true? dep_delay arr_delay)
    (and (not (or (na? dep_delay) (na? arr_delay)))
    ;; no flights met this condition
    (>= dep_delay 60)
    (<= arr_delay -30)))
    (<= arr_delay -30))))
    ;; no flights met this condition
    ;; empty dataframes trigger assertion-violation
    ;; in dataframe-display
    (dataframe-display))
    ;; triggers an exception

    ;; 2. Sort for longest departure delays and earliest flights
    (-> flights
    (dataframe-filter* (dep_delay) (all-true? dep_delay))
    (dataframe-filter* (dep_delay) (not (na? dep_delay)))
    (dataframe-sort* (> dep_delay))
    (dataframe-display))
    (dataframe-glimpse))

    (-> flights
    (dataframe-filter* (dep_time) (all-true? dep_time))
    (dataframe-filter* (dep_time) (not (na? dep_time)))
    (dataframe-sort* (< dep_time))
    (dataframe-display))
    (dataframe-glimpse))

    ;; 3. Sort flights to find the fastest flights
    (-> flights
    (dataframe-filter*
    (distance air_time)
    (all-true? distance air_time))
    (not (or (na? distance) (na? air_time))))
    (dataframe-modify*
    (speed (distance air_time) (* (/ distance air_time) 60)))
    (dataframe-sort*
    (> speed))
    (dataframe-display))
    (dataframe-glimpse))

    ;; 4. Was there a flight on every day in 2013?
    (dataframe-values-unique flights 'year)
    (remove-duplicates ($ flights 'year))

    (define flights2
    (-> flights
    (dataframe-filter*
    (month day)
    (all-true? month day))
    (not (or (na? month) (na? day))))
    (dataframe-modify*
    (month-day
    (month day)
    (string-append (number->string month) "-" (number->string day))))))

    (= 365 (length (dataframe-values-unique flights2 'month-day)))

    ;; 4.3 Columns
    (= 365 (length (remove-duplicates ($ flights2 'month-day))))
    ;; 3.3 Columns

    ;; 4.3.1
    ;; 3.3.1

    (-> flights
    (dataframe-filter*
    (dep_delay arr_delay distance air_time)
    (all-true? dep_delay arr_delay distance air_time))
    (not (or (na? dep_delay) (na? arr_delay) (na? distance) (na? air_time))))
    (dataframe-modify*
    (gain (dep_delay arr_delay) (- dep_delay arr_delay))
    (speed (distance air_time) (* (/ distance air_time) 60)))
    (dataframe-display))
    (dataframe-glimpse))

    ;; no equivalent to .before or .after
    ;; using select instead to achieve similar affect as .after example
    (-> flights
    (dataframe-filter*
    (dep_delay arr_delay distance air_time)
    (all-true? dep_delay arr_delay distance air_time))
    (not (or (na? dep_delay) (na? arr_delay) (na? distance) (na? air_time))))
    (dataframe-modify*
    (gain (dep_delay arr_delay) (- dep_delay arr_delay))
    (speed (distance air_time) (* (/ distance air_time) 60)))
    (dataframe-select 'year 'month 'day 'gain 'speed)
    (speed (distance air_time) (inexact (* (/ distance air_time) 60))))
    (dataframe-select* year month day gain speed)
    (dataframe-display))

    ;; 4.3.2
    ;; 3.3.2

    (-> flights
    (dataframe-select 'year 'month 'day)
    (dataframe-select* year month day)
    (dataframe-display))

    ;; none of the other advanced select examples work in dataframe->rowtable

    ;; 4.3.3
    ;; 3.3.3

    (-> flights
    (dataframe-rename '(tailnum tail_num))
    (dataframe-display))
    (dataframe-rename* (tailnum tail_num))
    (dataframe-glimpse))

    ;; 4.3.4
    ;; 3.3.4
    ;; no equivalent

    ;; 4.3.5 exercises
    ;; 3.3.5 exercises
    ;; skipped

    ;; 4.4 pipe
    ;; 3.4 pipe

    (-> flights
    (dataframe-filter*
    (dest distance air_time)
    (and (all-true? dest distance air_time)
    (and (not (or (na? dest) (na? distance) (na? air_time)))
    (string=? dest "IAH")))
    (dataframe-modify*
    (speed
    (distance air_time)
    (exact->inexact (* (/ distance air_time) 60))))
    (dataframe-select
    'year 'month 'day 'dep_time 'carrier 'flight 'speed)
    (inexact (* (/ distance air_time) 60))))
    (dataframe-select*
    year month day dep_time carrier flight speed)
    (dataframe-sort*
    (> speed))
    (dataframe-display))

    (dataframe-display
    (dataframe-sort*
    (dataframe-select
    (dataframe-select*
    (dataframe-modify*
    (dataframe-filter*
    flights
    (dest distance air_time)
    (and (all-true? dest distance air_time)
    (and (not (or (na? dest) (na? distance) (na? air_time)))
    (string=? dest "IAH")))
    (speed
    (distance air_time)
    (exact->inexact (* (/ distance air_time) 60))))
    'year 'month 'day 'dep_time 'carrier 'flight 'speed)
    (inexact (* (/ distance air_time) 60))))
    year month day dep_time carrier flight speed)
    (> speed)))

    (define flights1
    (dataframe-filter*
    flights
    (dest distance air_time)
    (and (all-true? dest distance air_time)
    (and (not (or (na? dest) (na? distance) (na? air_time)))
    (string=? dest "IAH"))))

    (define flights2
    (dataframe-modify*
    flights1
    (speed
    (distance air_time)
    (exact->inexact (* (/ distance air_time) 60)))))
    (inexact (* (/ distance air_time) 60)))))

    (define flights3
    (dataframe-select
    (dataframe-select*
    flights2
    'year 'month 'day 'dep_time 'carrier 'flight 'speed))
    year month day dep_time carrier flight speed))

    (dataframe-display
    (dataframe-sort* flights3 (> speed)))

    ;; 4.5 Groups
    ;; 3.5 Groups

    ;; 4.5.2
    ;; 3.5.2

    ;; no automatic sorting in dataframe-aggregate
    (-> flights
    (dataframe-filter*
    (dep_delay)
    (all-true? dep_delay))
    (dataframe-aggregate*
    (month)
    (avg_delay (dep_delay) (exact->inexact (mean dep_delay))))
    (avg_delay (dep_delay) (inexact (mean dep_delay))))
    (dataframe-sort*
    (< month))
    (dataframe-display))

    (-> flights
    (dataframe-filter*
    (dep_delay)
    (all-true? dep_delay))
    (not (na? dep_delay)))
    (dataframe-aggregate*
    (month)
    (avg_delay (dep_delay) (exact->inexact (mean dep_delay)))
    (avg_delay (dep_delay) (inexact (mean dep_delay)))
    (n (dep_delay) (length dep_delay)))
    (dataframe-sort*
    (< month))
    (dataframe-display))

    ;; 4.5.3 slice
    ;; 3.5.3 slice

    ;; no equivalent

    ;; 4.5.5
    ;; 3.5.5

    (define daily_flights
    (dataframe-aggregate*
  2. hinkelman created this gist Jul 8, 2023.
    340 changes: 340 additions & 0 deletions data-transformation.ss
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,340 @@
    (import (chez-stats)
    (dataframe))

    (define flights
    (-> (read-delim "nycflights.tsv" #\tab)
    (rowtable->dataframe)))

    ;; using this as a check of missingness
    (define (all-true? . x)
    (for-all (lambda (y) y) x))

    (dataframe-dim flights)

    ;; 4.1 Introduction

    ;; 4.1.3

    (-> flights
    (dataframe-filter*
    (arr_delay dest)
    (and (all-true? arr_delay)
    (string=? dest "IAH")))
    (dataframe-aggregate*
    (year month day)
    (arr_delay (arr_delay) (sum arr_delay)))
    (dataframe-display))

    ;; 4.2 Rows

    ;; 4.2.1

    (define delayed-flights
    (-> flights
    (dataframe-filter*
    (dep_delay)
    (and (all-true? dep_delay)
    (> dep_delay 120)))))

    (apply min ($ delayed-flights 'dep_delay))

    (-> flights
    (dataframe-filter*
    (month day)
    (and (all-true? month day)
    (= month 1)
    (= day 1)))
    (dataframe-display))

    (define jan-feb-flights
    (-> flights
    (dataframe-filter*
    (month)
    (and (all-true? month)
    (or (= month 1)
    (= month 2))))))

    (dataframe-values-unique jan-feb-flights 'month)

    (-> flights
    (dataframe-filter*
    (month)
    (and (all-true? month)
    (member month '(1 2))))
    (dataframe-display))

    (define jan1
    (-> flights
    (dataframe-filter*
    (month day)
    (and (all-true? month)
    (= month 1)
    (= day 1)))))

    ;; 4.2.3

    (-> flights
    (dataframe-filter*
    (year month day dep_time)
    (all-true? year month day dep_time))
    (dataframe-sort*
    (< year) (< month) (< day) (< dep_time))
    (dataframe-display))

    (-> flights
    (dataframe-filter*
    (dep_delay)
    (all-true? dep_delay))
    (dataframe-sort*
    (> dep_delay))
    (dataframe-display))

    ;; 4.2.4

    (-> flights
    (dataframe-unique)
    (dataframe-display))

    (-> flights
    (dataframe-select 'origin 'dest)
    (dataframe-unique)
    (dataframe-display))

    ;; .keep_all in dplyr::distinct keeps all columns
    ;; for the first occurrence of the unique combos
    ;; no equivalent in dataframe-unique

    (-> flights
    (dataframe-aggregate*
    (origin dest)
    (n (origin) (length origin)))
    (dataframe-sort*
    (> n))
    (dataframe-display))

    ;; 4.2.5 Exercises
    ;; not included in blog post (and only did 1-4)

    ;; 1. Single pipeline for each condition

    (-> flights
    (dataframe-filter*
    (arr_delay)
    (and (all-true? arr_delay)
    (>= arr_delay 2)))
    (dataframe-display))

    (-> flights
    (dataframe-filter*
    (dest)
    (member dest '("IAH" "HOU")))
    (dataframe-display))

    (-> flights
    (dataframe-filter*
    (carrier)
    (member carrier '("UA" "AA" "DL")))
    (dataframe-display))

    (-> flights
    (dataframe-filter*
    (month)
    (and (all-true? month)
    (member month '(7 8 9))))
    (dataframe-display))

    (-> flights
    (dataframe-filter*
    (dep_delay arr_delay)
    (and (all-true? dep_delay arr_delay)
    (<= dep_delay 0)
    (>= arr_delay 120)))
    (dataframe-display))

    (-> flights
    (dataframe-filter*
    (dep_delay arr_delay)
    (and (all-true? dep_delay arr_delay)
    ;; no flights met this condition
    (>= dep_delay 60)
    (<= arr_delay -30)))
    ;; no flights met this condition
    ;; empty dataframes trigger assertion-violation
    ;; in dataframe-display
    (dataframe-display))

    ;; 2. Sort for longest departure delays and earliest flights
    (-> flights
    (dataframe-filter* (dep_delay) (all-true? dep_delay))
    (dataframe-sort* (> dep_delay))
    (dataframe-display))

    (-> flights
    (dataframe-filter* (dep_time) (all-true? dep_time))
    (dataframe-sort* (< dep_time))
    (dataframe-display))

    ;; 3. Sort flights to find the fastest flights
    (-> flights
    (dataframe-filter*
    (distance air_time)
    (all-true? distance air_time))
    (dataframe-modify*
    (speed (distance air_time) (* (/ distance air_time) 60)))
    (dataframe-sort*
    (> speed))
    (dataframe-display))

    ;; 4. Was there a flight on every day in 2013?
    (dataframe-values-unique flights 'year)

    (define flights2
    (-> flights
    (dataframe-filter*
    (month day)
    (all-true? month day))
    (dataframe-modify*
    (month-day
    (month day)
    (string-append (number->string month) "-" (number->string day))))))

    (= 365 (length (dataframe-values-unique flights2 'month-day)))

    ;; 4.3 Columns

    ;; 4.3.1

    (-> flights
    (dataframe-filter*
    (dep_delay arr_delay distance air_time)
    (all-true? dep_delay arr_delay distance air_time))
    (dataframe-modify*
    (gain (dep_delay arr_delay) (- dep_delay arr_delay))
    (speed (distance air_time) (* (/ distance air_time) 60)))
    (dataframe-display))

    ;; no equivalent to .before or .after
    ;; using select instead to achieve similar affect as .after example
    (-> flights
    (dataframe-filter*
    (dep_delay arr_delay distance air_time)
    (all-true? dep_delay arr_delay distance air_time))
    (dataframe-modify*
    (gain (dep_delay arr_delay) (- dep_delay arr_delay))
    (speed (distance air_time) (* (/ distance air_time) 60)))
    (dataframe-select 'year 'month 'day 'gain 'speed)
    (dataframe-display))

    ;; 4.3.2

    (-> flights
    (dataframe-select 'year 'month 'day)
    (dataframe-display))

    ;; none of the other advanced select examples work in dataframe->rowtable

    ;; 4.3.3

    (-> flights
    (dataframe-rename '(tailnum tail_num))
    (dataframe-display))

    ;; 4.3.4
    ;; no equivalent

    ;; 4.3.5 exercises
    ;; skipped

    ;; 4.4 pipe

    (-> flights
    (dataframe-filter*
    (dest distance air_time)
    (and (all-true? dest distance air_time)
    (string=? dest "IAH")))
    (dataframe-modify*
    (speed
    (distance air_time)
    (exact->inexact (* (/ distance air_time) 60))))
    (dataframe-select
    'year 'month 'day 'dep_time 'carrier 'flight 'speed)
    (dataframe-sort*
    (> speed))
    (dataframe-display))

    (dataframe-display
    (dataframe-sort*
    (dataframe-select
    (dataframe-modify*
    (dataframe-filter*
    flights
    (dest distance air_time)
    (and (all-true? dest distance air_time)
    (string=? dest "IAH")))
    (speed
    (distance air_time)
    (exact->inexact (* (/ distance air_time) 60))))
    'year 'month 'day 'dep_time 'carrier 'flight 'speed)
    (> speed)))

    (define flights1
    (dataframe-filter*
    flights
    (dest distance air_time)
    (and (all-true? dest distance air_time)
    (string=? dest "IAH"))))

    (define flights2
    (dataframe-modify*
    flights1
    (speed
    (distance air_time)
    (exact->inexact (* (/ distance air_time) 60)))))

    (define flights3
    (dataframe-select
    flights2
    'year 'month 'day 'dep_time 'carrier 'flight 'speed))

    (dataframe-display
    (dataframe-sort* flights3 (> speed)))

    ;; 4.5 Groups

    ;; 4.5.2

    ;; no automatic sorting in dataframe-aggregate
    (-> flights
    (dataframe-filter*
    (dep_delay)
    (all-true? dep_delay))
    (dataframe-aggregate*
    (month)
    (avg_delay (dep_delay) (exact->inexact (mean dep_delay))))
    (dataframe-sort*
    (< month))
    (dataframe-display))

    (-> flights
    (dataframe-filter*
    (dep_delay)
    (all-true? dep_delay))
    (dataframe-aggregate*
    (month)
    (avg_delay (dep_delay) (exact->inexact (mean dep_delay)))
    (n (dep_delay) (length dep_delay)))
    (dataframe-sort*
    (< month))
    (dataframe-display))

    ;; 4.5.3 slice

    ;; no equivalent

    ;; 4.5.5

    (define daily_flights
    (dataframe-aggregate*
    flights
    (year month day)
    (n (dep_time) (length dep_time))))