Last active
March 26, 2024 23:25
Code for blog post at https://www.travishinkelman.com/data-transformation-scheme
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(import (chez-stats) | |
(dataframe)) | |
(define flights | |
(-> (read-delim "nycflights.tsv" #\tab) | |
(rowtable->dataframe))) | |
;; using this as a check of missingness | |
(define (all-true? . x) | |
(for-all (lambda (y) y) x)) | |
(dataframe-dim flights) | |
;; 4.1 Introduction | |
;; 4.1.3 | |
(-> flights | |
(dataframe-filter* | |
(arr_delay dest) | |
(and (all-true? arr_delay) | |
(string=? dest "IAH"))) | |
(dataframe-aggregate* | |
(year month day) | |
(arr_delay (arr_delay) (sum arr_delay))) | |
(dataframe-display)) | |
;; 4.2 Rows | |
;; 4.2.1 | |
(define delayed-flights | |
(-> flights | |
(dataframe-filter* | |
(dep_delay) | |
(and (all-true? dep_delay) | |
(> dep_delay 120))))) | |
(apply min ($ delayed-flights 'dep_delay)) | |
(-> flights | |
(dataframe-filter* | |
(month day) | |
(and (all-true? month day) | |
(= month 1) | |
(= day 1))) | |
(dataframe-display)) | |
(define jan-feb-flights | |
(-> flights | |
(dataframe-filter* | |
(month) | |
(and (all-true? month) | |
(or (= month 1) | |
(= month 2)))))) | |
(dataframe-values-unique jan-feb-flights 'month) | |
(-> flights | |
(dataframe-filter* | |
(month) | |
(and (all-true? month) | |
(member month '(1 2)))) | |
(dataframe-display)) | |
(define jan1 | |
(-> flights | |
(dataframe-filter* | |
(month day) | |
(and (all-true? month) | |
(= month 1) | |
(= day 1))))) | |
;; 4.2.3 | |
(-> flights | |
(dataframe-filter* | |
(year month day dep_time) | |
(all-true? year month day dep_time)) | |
(dataframe-sort* | |
(< year) (< month) (< day) (< dep_time)) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* | |
(dep_delay) | |
(all-true? dep_delay)) | |
(dataframe-sort* | |
(> dep_delay)) | |
(dataframe-display)) | |
;; 4.2.4 | |
(-> flights | |
(dataframe-unique) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-select 'origin 'dest) | |
(dataframe-unique) | |
(dataframe-display)) | |
;; .keep_all in dplyr::distinct keeps all columns | |
;; for the first occurrence of the unique combos | |
;; no equivalent in dataframe-unique | |
(-> flights | |
(dataframe-aggregate* | |
(origin dest) | |
(n (origin) (length origin))) | |
(dataframe-sort* | |
(> n)) | |
(dataframe-display)) | |
;; 4.2.5 Exercises | |
;; not included in blog post (and only did 1-4) | |
;; 1. Single pipeline for each condition | |
(-> flights | |
(dataframe-filter* | |
(arr_delay) | |
(and (all-true? arr_delay) | |
(>= arr_delay 2))) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* | |
(dest) | |
(member dest '("IAH" "HOU"))) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* | |
(carrier) | |
(member carrier '("UA" "AA" "DL"))) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* | |
(month) | |
(and (all-true? month) | |
(member month '(7 8 9)))) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* | |
(dep_delay arr_delay) | |
(and (all-true? dep_delay arr_delay) | |
(<= dep_delay 0) | |
(>= arr_delay 120))) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* | |
(dep_delay arr_delay) | |
(and (all-true? dep_delay arr_delay) | |
;; no flights met this condition | |
(>= dep_delay 60) | |
(<= arr_delay -30))) | |
;; no flights met this condition | |
;; empty dataframes trigger assertion-violation | |
;; in dataframe-display | |
(dataframe-display)) | |
;; 2. Sort for longest departure delays and earliest flights | |
(-> flights | |
(dataframe-filter* (dep_delay) (all-true? dep_delay)) | |
(dataframe-sort* (> dep_delay)) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* (dep_time) (all-true? dep_time)) | |
(dataframe-sort* (< dep_time)) | |
(dataframe-display)) | |
;; 3. Sort flights to find the fastest flights | |
(-> flights | |
(dataframe-filter* | |
(distance air_time) | |
(all-true? distance air_time)) | |
(dataframe-modify* | |
(speed (distance air_time) (* (/ distance air_time) 60))) | |
(dataframe-sort* | |
(> speed)) | |
(dataframe-display)) | |
;; 4. Was there a flight on every day in 2013? | |
(dataframe-values-unique flights 'year) | |
(define flights2 | |
(-> flights | |
(dataframe-filter* | |
(month day) | |
(all-true? month day)) | |
(dataframe-modify* | |
(month-day | |
(month day) | |
(string-append (number->string month) "-" (number->string day)))))) | |
(= 365 (length (dataframe-values-unique flights2 'month-day))) | |
;; 4.3 Columns | |
;; 4.3.1 | |
(-> flights | |
(dataframe-filter* | |
(dep_delay arr_delay distance air_time) | |
(all-true? dep_delay arr_delay distance air_time)) | |
(dataframe-modify* | |
(gain (dep_delay arr_delay) (- dep_delay arr_delay)) | |
(speed (distance air_time) (* (/ distance air_time) 60))) | |
(dataframe-display)) | |
;; no equivalent to .before or .after | |
;; using select instead to achieve similar affect as .after example | |
(-> flights | |
(dataframe-filter* | |
(dep_delay arr_delay distance air_time) | |
(all-true? dep_delay arr_delay distance air_time)) | |
(dataframe-modify* | |
(gain (dep_delay arr_delay) (- dep_delay arr_delay)) | |
(speed (distance air_time) (* (/ distance air_time) 60))) | |
(dataframe-select 'year 'month 'day 'gain 'speed) | |
(dataframe-display)) | |
;; 4.3.2 | |
(-> flights | |
(dataframe-select 'year 'month 'day) | |
(dataframe-display)) | |
;; none of the other advanced select examples work in dataframe->rowtable | |
;; 4.3.3 | |
(-> flights | |
(dataframe-rename '(tailnum tail_num)) | |
(dataframe-display)) | |
;; 4.3.4 | |
;; no equivalent | |
;; 4.3.5 exercises | |
;; skipped | |
;; 4.4 pipe | |
(-> flights | |
(dataframe-filter* | |
(dest distance air_time) | |
(and (all-true? dest distance air_time) | |
(string=? dest "IAH"))) | |
(dataframe-modify* | |
(speed | |
(distance air_time) | |
(exact->inexact (* (/ distance air_time) 60)))) | |
(dataframe-select | |
'year 'month 'day 'dep_time 'carrier 'flight 'speed) | |
(dataframe-sort* | |
(> speed)) | |
(dataframe-display)) | |
(dataframe-display | |
(dataframe-sort* | |
(dataframe-select | |
(dataframe-modify* | |
(dataframe-filter* | |
flights | |
(dest distance air_time) | |
(and (all-true? dest distance air_time) | |
(string=? dest "IAH"))) | |
(speed | |
(distance air_time) | |
(exact->inexact (* (/ distance air_time) 60)))) | |
'year 'month 'day 'dep_time 'carrier 'flight 'speed) | |
(> speed))) | |
(define flights1 | |
(dataframe-filter* | |
flights | |
(dest distance air_time) | |
(and (all-true? dest distance air_time) | |
(string=? dest "IAH")))) | |
(define flights2 | |
(dataframe-modify* | |
flights1 | |
(speed | |
(distance air_time) | |
(exact->inexact (* (/ distance air_time) 60))))) | |
(define flights3 | |
(dataframe-select | |
flights2 | |
'year 'month 'day 'dep_time 'carrier 'flight 'speed)) | |
(dataframe-display | |
(dataframe-sort* flights3 (> speed))) | |
;; 4.5 Groups | |
;; 4.5.2 | |
;; no automatic sorting in dataframe-aggregate | |
(-> flights | |
(dataframe-filter* | |
(dep_delay) | |
(all-true? dep_delay)) | |
(dataframe-aggregate* | |
(month) | |
(avg_delay (dep_delay) (exact->inexact (mean dep_delay)))) | |
(dataframe-sort* | |
(< month)) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* | |
(dep_delay) | |
(all-true? dep_delay)) | |
(dataframe-aggregate* | |
(month) | |
(avg_delay (dep_delay) (exact->inexact (mean dep_delay))) | |
(n (dep_delay) (length dep_delay))) | |
(dataframe-sort* | |
(< month)) | |
(dataframe-display)) | |
;; 4.5.3 slice | |
;; no equivalent | |
;; 4.5.5 | |
(define daily_flights | |
(dataframe-aggregate* | |
flights | |
(year month day) | |
(n (dep_time) (length dep_time)))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment