Last active
March 26, 2024 23:25
-
-
Save hinkelman/0945b3c905dcd244809bbed81d2faeb1 to your computer and use it in GitHub Desktop.
Code for blog post at https://www.travishinkelman.com/data-transformation-scheme
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(import (dataframe)) | |
(define flights (tsv->dataframe "nycflights.tsv")) | |
(dataframe-dim flights) | |
;; 3.1 Introduction | |
;; 3.1.3 | |
(-> flights | |
(dataframe-filter* (dest) (string=? dest "IAH")) | |
(dataframe-aggregate* | |
(year month day) | |
(arr_delay (arr_delay) (inexact (mean arr_delay)))) | |
(dataframe-display)) | |
;; 3.2 Rows | |
;; 3.2.1 | |
(define delayed-flights | |
(-> flights | |
(dataframe-filter* | |
(dep_delay) | |
(and (not (na? dep_delay)) | |
(> dep_delay 120))))) | |
(apply min ($ delayed-flights 'dep_delay)) | |
(-> flights | |
(dataframe-filter* | |
(month day) | |
(and (not (or (na? month) (na? day))) | |
(= month 1) | |
(= day 1))) | |
(dataframe-glimpse)) | |
(define jan-feb-flights | |
(-> flights | |
(dataframe-filter* | |
(month) | |
(and (not (na? month)) | |
(or (= month 1) | |
(= month 2)))))) | |
(remove-duplicates ($ jan-feb-flights 'month)) | |
(-> flights | |
(dataframe-filter* | |
(month) | |
(and (not (na? month)) | |
(member month '(1 2)))) | |
(dataframe-glimpse)) | |
;; 3.2.3 | |
(-> flights | |
(dataframe-filter* | |
(year month day dep_time) | |
(not (or (na? year) (na? month) (na? day) (na? dep_time)))) | |
(dataframe-sort* | |
(< year) (< month) (< day) (< dep_time)) | |
(dataframe-glimpse)) | |
(-> flights | |
(dataframe-filter* | |
(dep_delay) | |
(not (na? dep_delay))) | |
(dataframe-sort* | |
(> dep_delay)) | |
(dataframe-glimpse)) | |
;; 3.2.4 | |
(-> flights | |
(dataframe-unique) | |
(dataframe-glimpse)) | |
(-> flights | |
(dataframe-select* origin dest) | |
(dataframe-unique) | |
(dataframe-glimpse)) | |
;; .keep_all in dplyr::distinct keeps all columns | |
;; for the first occurrence of the unique combos | |
;; no equivalent in dataframe-unique | |
(-> flights | |
(dataframe-aggregate* | |
(origin dest) | |
(n (origin) (length origin))) | |
(dataframe-sort* | |
(> n)) | |
(dataframe-display)) | |
;; 3.2.5 Exercises | |
;; not included in blog post (and only did 1-4) | |
;; 1. Single pipeline for each condition | |
(-> flights | |
(dataframe-filter* | |
(arr_delay) | |
(and (not (na? arr_delay)) | |
(>= arr_delay 2))) | |
(dataframe-glimpse)) | |
(-> flights | |
(dataframe-filter* | |
(dest) | |
(member dest '("IAH" "HOU"))) | |
(dataframe-glimpse)) | |
(-> flights | |
(dataframe-filter* | |
(carrier) | |
(member carrier '("UA" "AA" "DL"))) | |
(dataframe-glimpse)) | |
(-> flights | |
(dataframe-filter* | |
(month) | |
(and (not (na? month)) | |
(member month '(7 8 9)))) | |
(dataframe-glimpse))) | |
(-> flights | |
(dataframe-filter* | |
(dep_delay arr_delay) | |
(and (not (or (na? dep_delay) (na? arr_delay))) | |
(<= dep_delay 0) | |
(>= arr_delay 120))) | |
(dataframe-glimpse)) | |
(-> flights | |
(dataframe-filter* | |
(dep_delay arr_delay) | |
(and (not (or (na? dep_delay) (na? arr_delay))) | |
;; no flights met this condition | |
(>= dep_delay 60) | |
(<= arr_delay -30)))) | |
;; no flights met this condition | |
;; triggers an exception | |
;; 2. Sort for longest departure delays and earliest flights | |
(-> flights | |
(dataframe-filter* (dep_delay) (not (na? dep_delay))) | |
(dataframe-sort* (> dep_delay)) | |
(dataframe-glimpse)) | |
(-> flights | |
(dataframe-filter* (dep_time) (not (na? dep_time))) | |
(dataframe-sort* (< dep_time)) | |
(dataframe-glimpse)) | |
;; 3. Sort flights to find the fastest flights | |
(-> flights | |
(dataframe-filter* | |
(distance air_time) | |
(not (or (na? distance) (na? air_time)))) | |
(dataframe-modify* | |
(speed (distance air_time) (* (/ distance air_time) 60))) | |
(dataframe-sort* | |
(> speed)) | |
(dataframe-glimpse)) | |
;; 4. Was there a flight on every day in 2013? | |
(remove-duplicates ($ flights 'year)) | |
(define flights2 | |
(-> flights | |
(dataframe-filter* | |
(month day) | |
(not (or (na? month) (na? day)))) | |
(dataframe-modify* | |
(month-day | |
(month day) | |
(string-append (number->string month) "-" (number->string day)))))) | |
(= 365 (length (remove-duplicates ($ flights2 'month-day)))) | |
;; 3.3 Columns | |
;; 3.3.1 | |
(-> flights | |
(dataframe-filter* | |
(dep_delay arr_delay distance air_time) | |
(not (or (na? dep_delay) (na? arr_delay) (na? distance) (na? air_time)))) | |
(dataframe-modify* | |
(gain (dep_delay arr_delay) (- dep_delay arr_delay)) | |
(speed (distance air_time) (* (/ distance air_time) 60))) | |
(dataframe-glimpse)) | |
;; no equivalent to .before or .after | |
;; using select instead to achieve similar affect as .after example | |
(-> flights | |
(dataframe-filter* | |
(dep_delay arr_delay distance air_time) | |
(not (or (na? dep_delay) (na? arr_delay) (na? distance) (na? air_time)))) | |
(dataframe-modify* | |
(gain (dep_delay arr_delay) (- dep_delay arr_delay)) | |
(speed (distance air_time) (inexact (* (/ distance air_time) 60)))) | |
(dataframe-select* year month day gain speed) | |
(dataframe-display)) | |
;; 3.3.2 | |
(-> flights | |
(dataframe-select* year month day) | |
(dataframe-display)) | |
;; none of the other advanced select examples work in dataframe->rowtable | |
;; 3.3.3 | |
(-> flights | |
(dataframe-rename* (tailnum tail_num)) | |
(dataframe-glimpse)) | |
;; 3.3.4 | |
;; no equivalent | |
;; 3.3.5 exercises | |
;; skipped | |
;; 3.4 pipe | |
(-> flights | |
(dataframe-filter* | |
(dest distance air_time) | |
(and (not (or (na? dest) (na? distance) (na? air_time))) | |
(string=? dest "IAH"))) | |
(dataframe-modify* | |
(speed | |
(distance air_time) | |
(inexact (* (/ distance air_time) 60)))) | |
(dataframe-select* | |
year month day dep_time carrier flight speed) | |
(dataframe-sort* | |
(> speed)) | |
(dataframe-display)) | |
(dataframe-display | |
(dataframe-sort* | |
(dataframe-select* | |
(dataframe-modify* | |
(dataframe-filter* | |
flights | |
(dest distance air_time) | |
(and (not (or (na? dest) (na? distance) (na? air_time))) | |
(string=? dest "IAH"))) | |
(speed | |
(distance air_time) | |
(inexact (* (/ distance air_time) 60)))) | |
year month day dep_time carrier flight speed) | |
(> speed))) | |
(define flights1 | |
(dataframe-filter* | |
flights | |
(dest distance air_time) | |
(and (not (or (na? dest) (na? distance) (na? air_time))) | |
(string=? dest "IAH")))) | |
(define flights2 | |
(dataframe-modify* | |
flights1 | |
(speed | |
(distance air_time) | |
(inexact (* (/ distance air_time) 60))))) | |
(define flights3 | |
(dataframe-select* | |
flights2 | |
year month day dep_time carrier flight speed)) | |
(dataframe-display | |
(dataframe-sort* flights3 (> speed))) | |
;; 3.5 Groups | |
;; 3.5.2 | |
;; no automatic sorting in dataframe-aggregate | |
(-> flights | |
(dataframe-aggregate* | |
(month) | |
(avg_delay (dep_delay) (inexact (mean dep_delay)))) | |
(dataframe-sort* | |
(< month)) | |
(dataframe-display)) | |
(-> flights | |
(dataframe-filter* | |
(dep_delay) | |
(not (na? dep_delay))) | |
(dataframe-aggregate* | |
(month) | |
(avg_delay (dep_delay) (inexact (mean dep_delay))) | |
(n (dep_delay) (length dep_delay))) | |
(dataframe-sort* | |
(< month)) | |
(dataframe-display)) | |
;; 3.5.3 slice | |
;; no equivalent | |
;; 3.5.5 | |
(define daily_flights | |
(dataframe-aggregate* | |
flights | |
(year month day) | |
(n (dep_time) (length dep_time)))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment