Skip to content

Instantly share code, notes, and snippets.

@hinkelman
Last active March 26, 2024 23:25
(import (chez-stats)
(dataframe))
(define flights
(-> (read-delim "nycflights.tsv" #\tab)
(rowtable->dataframe)))
;; using this as a check of missingness
(define (all-true? . x)
(for-all (lambda (y) y) x))
(dataframe-dim flights)
;; 4.1 Introduction
;; 4.1.3
(-> flights
(dataframe-filter*
(arr_delay dest)
(and (all-true? arr_delay)
(string=? dest "IAH")))
(dataframe-aggregate*
(year month day)
(arr_delay (arr_delay) (sum arr_delay)))
(dataframe-display))
;; 4.2 Rows
;; 4.2.1
(define delayed-flights
(-> flights
(dataframe-filter*
(dep_delay)
(and (all-true? dep_delay)
(> dep_delay 120)))))
(apply min ($ delayed-flights 'dep_delay))
(-> flights
(dataframe-filter*
(month day)
(and (all-true? month day)
(= month 1)
(= day 1)))
(dataframe-display))
(define jan-feb-flights
(-> flights
(dataframe-filter*
(month)
(and (all-true? month)
(or (= month 1)
(= month 2))))))
(dataframe-values-unique jan-feb-flights 'month)
(-> flights
(dataframe-filter*
(month)
(and (all-true? month)
(member month '(1 2))))
(dataframe-display))
(define jan1
(-> flights
(dataframe-filter*
(month day)
(and (all-true? month)
(= month 1)
(= day 1)))))
;; 4.2.3
(-> flights
(dataframe-filter*
(year month day dep_time)
(all-true? year month day dep_time))
(dataframe-sort*
(< year) (< month) (< day) (< dep_time))
(dataframe-display))
(-> flights
(dataframe-filter*
(dep_delay)
(all-true? dep_delay))
(dataframe-sort*
(> dep_delay))
(dataframe-display))
;; 4.2.4
(-> flights
(dataframe-unique)
(dataframe-display))
(-> flights
(dataframe-select 'origin 'dest)
(dataframe-unique)
(dataframe-display))
;; .keep_all in dplyr::distinct keeps all columns
;; for the first occurrence of the unique combos
;; no equivalent in dataframe-unique
(-> flights
(dataframe-aggregate*
(origin dest)
(n (origin) (length origin)))
(dataframe-sort*
(> n))
(dataframe-display))
;; 4.2.5 Exercises
;; not included in blog post (and only did 1-4)
;; 1. Single pipeline for each condition
(-> flights
(dataframe-filter*
(arr_delay)
(and (all-true? arr_delay)
(>= arr_delay 2)))
(dataframe-display))
(-> flights
(dataframe-filter*
(dest)
(member dest '("IAH" "HOU")))
(dataframe-display))
(-> flights
(dataframe-filter*
(carrier)
(member carrier '("UA" "AA" "DL")))
(dataframe-display))
(-> flights
(dataframe-filter*
(month)
(and (all-true? month)
(member month '(7 8 9))))
(dataframe-display))
(-> flights
(dataframe-filter*
(dep_delay arr_delay)
(and (all-true? dep_delay arr_delay)
(<= dep_delay 0)
(>= arr_delay 120)))
(dataframe-display))
(-> flights
(dataframe-filter*
(dep_delay arr_delay)
(and (all-true? dep_delay arr_delay)
;; no flights met this condition
(>= dep_delay 60)
(<= arr_delay -30)))
;; no flights met this condition
;; empty dataframes trigger assertion-violation
;; in dataframe-display
(dataframe-display))
;; 2. Sort for longest departure delays and earliest flights
(-> flights
(dataframe-filter* (dep_delay) (all-true? dep_delay))
(dataframe-sort* (> dep_delay))
(dataframe-display))
(-> flights
(dataframe-filter* (dep_time) (all-true? dep_time))
(dataframe-sort* (< dep_time))
(dataframe-display))
;; 3. Sort flights to find the fastest flights
(-> flights
(dataframe-filter*
(distance air_time)
(all-true? distance air_time))
(dataframe-modify*
(speed (distance air_time) (* (/ distance air_time) 60)))
(dataframe-sort*
(> speed))
(dataframe-display))
;; 4. Was there a flight on every day in 2013?
(dataframe-values-unique flights 'year)
(define flights2
(-> flights
(dataframe-filter*
(month day)
(all-true? month day))
(dataframe-modify*
(month-day
(month day)
(string-append (number->string month) "-" (number->string day))))))
(= 365 (length (dataframe-values-unique flights2 'month-day)))
;; 4.3 Columns
;; 4.3.1
(-> flights
(dataframe-filter*
(dep_delay arr_delay distance air_time)
(all-true? dep_delay arr_delay distance air_time))
(dataframe-modify*
(gain (dep_delay arr_delay) (- dep_delay arr_delay))
(speed (distance air_time) (* (/ distance air_time) 60)))
(dataframe-display))
;; no equivalent to .before or .after
;; using select instead to achieve similar affect as .after example
(-> flights
(dataframe-filter*
(dep_delay arr_delay distance air_time)
(all-true? dep_delay arr_delay distance air_time))
(dataframe-modify*
(gain (dep_delay arr_delay) (- dep_delay arr_delay))
(speed (distance air_time) (* (/ distance air_time) 60)))
(dataframe-select 'year 'month 'day 'gain 'speed)
(dataframe-display))
;; 4.3.2
(-> flights
(dataframe-select 'year 'month 'day)
(dataframe-display))
;; none of the other advanced select examples work in dataframe->rowtable
;; 4.3.3
(-> flights
(dataframe-rename '(tailnum tail_num))
(dataframe-display))
;; 4.3.4
;; no equivalent
;; 4.3.5 exercises
;; skipped
;; 4.4 pipe
(-> flights
(dataframe-filter*
(dest distance air_time)
(and (all-true? dest distance air_time)
(string=? dest "IAH")))
(dataframe-modify*
(speed
(distance air_time)
(exact->inexact (* (/ distance air_time) 60))))
(dataframe-select
'year 'month 'day 'dep_time 'carrier 'flight 'speed)
(dataframe-sort*
(> speed))
(dataframe-display))
(dataframe-display
(dataframe-sort*
(dataframe-select
(dataframe-modify*
(dataframe-filter*
flights
(dest distance air_time)
(and (all-true? dest distance air_time)
(string=? dest "IAH")))
(speed
(distance air_time)
(exact->inexact (* (/ distance air_time) 60))))
'year 'month 'day 'dep_time 'carrier 'flight 'speed)
(> speed)))
(define flights1
(dataframe-filter*
flights
(dest distance air_time)
(and (all-true? dest distance air_time)
(string=? dest "IAH"))))
(define flights2
(dataframe-modify*
flights1
(speed
(distance air_time)
(exact->inexact (* (/ distance air_time) 60)))))
(define flights3
(dataframe-select
flights2
'year 'month 'day 'dep_time 'carrier 'flight 'speed))
(dataframe-display
(dataframe-sort* flights3 (> speed)))
;; 4.5 Groups
;; 4.5.2
;; no automatic sorting in dataframe-aggregate
(-> flights
(dataframe-filter*
(dep_delay)
(all-true? dep_delay))
(dataframe-aggregate*
(month)
(avg_delay (dep_delay) (exact->inexact (mean dep_delay))))
(dataframe-sort*
(< month))
(dataframe-display))
(-> flights
(dataframe-filter*
(dep_delay)
(all-true? dep_delay))
(dataframe-aggregate*
(month)
(avg_delay (dep_delay) (exact->inexact (mean dep_delay)))
(n (dep_delay) (length dep_delay)))
(dataframe-sort*
(< month))
(dataframe-display))
;; 4.5.3 slice
;; no equivalent
;; 4.5.5
(define daily_flights
(dataframe-aggregate*
flights
(year month day)
(n (dep_time) (length dep_time))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment