Skip to content

Instantly share code, notes, and snippets.

@hinkelman
Last active March 26, 2024 23:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hinkelman/0945b3c905dcd244809bbed81d2faeb1 to your computer and use it in GitHub Desktop.
Save hinkelman/0945b3c905dcd244809bbed81d2faeb1 to your computer and use it in GitHub Desktop.
(import (dataframe))
(define flights (tsv->dataframe "nycflights.tsv"))
(dataframe-dim flights)
;; 3.1 Introduction
;; 3.1.3
(-> flights
(dataframe-filter* (dest) (string=? dest "IAH"))
(dataframe-aggregate*
(year month day)
(arr_delay (arr_delay) (inexact (mean arr_delay))))
(dataframe-display))
;; 3.2 Rows
;; 3.2.1
(define delayed-flights
(-> flights
(dataframe-filter*
(dep_delay)
(and (not (na? dep_delay))
(> dep_delay 120)))))
(apply min ($ delayed-flights 'dep_delay))
(-> flights
(dataframe-filter*
(month day)
(and (not (or (na? month) (na? day)))
(= month 1)
(= day 1)))
(dataframe-glimpse))
(define jan-feb-flights
(-> flights
(dataframe-filter*
(month)
(and (not (na? month))
(or (= month 1)
(= month 2))))))
(remove-duplicates ($ jan-feb-flights 'month))
(-> flights
(dataframe-filter*
(month)
(and (not (na? month))
(member month '(1 2))))
(dataframe-glimpse))
;; 3.2.3
(-> flights
(dataframe-filter*
(year month day dep_time)
(not (or (na? year) (na? month) (na? day) (na? dep_time))))
(dataframe-sort*
(< year) (< month) (< day) (< dep_time))
(dataframe-glimpse))
(-> flights
(dataframe-filter*
(dep_delay)
(not (na? dep_delay)))
(dataframe-sort*
(> dep_delay))
(dataframe-glimpse))
;; 3.2.4
(-> flights
(dataframe-unique)
(dataframe-glimpse))
(-> flights
(dataframe-select* origin dest)
(dataframe-unique)
(dataframe-glimpse))
;; .keep_all in dplyr::distinct keeps all columns
;; for the first occurrence of the unique combos
;; no equivalent in dataframe-unique
(-> flights
(dataframe-aggregate*
(origin dest)
(n (origin) (length origin)))
(dataframe-sort*
(> n))
(dataframe-display))
;; 3.2.5 Exercises
;; not included in blog post (and only did 1-4)
;; 1. Single pipeline for each condition
(-> flights
(dataframe-filter*
(arr_delay)
(and (not (na? arr_delay))
(>= arr_delay 2)))
(dataframe-glimpse))
(-> flights
(dataframe-filter*
(dest)
(member dest '("IAH" "HOU")))
(dataframe-glimpse))
(-> flights
(dataframe-filter*
(carrier)
(member carrier '("UA" "AA" "DL")))
(dataframe-glimpse))
(-> flights
(dataframe-filter*
(month)
(and (not (na? month))
(member month '(7 8 9))))
(dataframe-glimpse)))
(-> flights
(dataframe-filter*
(dep_delay arr_delay)
(and (not (or (na? dep_delay) (na? arr_delay)))
(<= dep_delay 0)
(>= arr_delay 120)))
(dataframe-glimpse))
(-> flights
(dataframe-filter*
(dep_delay arr_delay)
(and (not (or (na? dep_delay) (na? arr_delay)))
;; no flights met this condition
(>= dep_delay 60)
(<= arr_delay -30))))
;; no flights met this condition
;; triggers an exception
;; 2. Sort for longest departure delays and earliest flights
(-> flights
(dataframe-filter* (dep_delay) (not (na? dep_delay)))
(dataframe-sort* (> dep_delay))
(dataframe-glimpse))
(-> flights
(dataframe-filter* (dep_time) (not (na? dep_time)))
(dataframe-sort* (< dep_time))
(dataframe-glimpse))
;; 3. Sort flights to find the fastest flights
(-> flights
(dataframe-filter*
(distance air_time)
(not (or (na? distance) (na? air_time))))
(dataframe-modify*
(speed (distance air_time) (* (/ distance air_time) 60)))
(dataframe-sort*
(> speed))
(dataframe-glimpse))
;; 4. Was there a flight on every day in 2013?
(remove-duplicates ($ flights 'year))
(define flights2
(-> flights
(dataframe-filter*
(month day)
(not (or (na? month) (na? day))))
(dataframe-modify*
(month-day
(month day)
(string-append (number->string month) "-" (number->string day))))))
(= 365 (length (remove-duplicates ($ flights2 'month-day))))
;; 3.3 Columns
;; 3.3.1
(-> flights
(dataframe-filter*
(dep_delay arr_delay distance air_time)
(not (or (na? dep_delay) (na? arr_delay) (na? distance) (na? air_time))))
(dataframe-modify*
(gain (dep_delay arr_delay) (- dep_delay arr_delay))
(speed (distance air_time) (* (/ distance air_time) 60)))
(dataframe-glimpse))
;; no equivalent to .before or .after
;; using select instead to achieve similar affect as .after example
(-> flights
(dataframe-filter*
(dep_delay arr_delay distance air_time)
(not (or (na? dep_delay) (na? arr_delay) (na? distance) (na? air_time))))
(dataframe-modify*
(gain (dep_delay arr_delay) (- dep_delay arr_delay))
(speed (distance air_time) (inexact (* (/ distance air_time) 60))))
(dataframe-select* year month day gain speed)
(dataframe-display))
;; 3.3.2
(-> flights
(dataframe-select* year month day)
(dataframe-display))
;; none of the other advanced select examples work in dataframe->rowtable
;; 3.3.3
(-> flights
(dataframe-rename* (tailnum tail_num))
(dataframe-glimpse))
;; 3.3.4
;; no equivalent
;; 3.3.5 exercises
;; skipped
;; 3.4 pipe
(-> flights
(dataframe-filter*
(dest distance air_time)
(and (not (or (na? dest) (na? distance) (na? air_time)))
(string=? dest "IAH")))
(dataframe-modify*
(speed
(distance air_time)
(inexact (* (/ distance air_time) 60))))
(dataframe-select*
year month day dep_time carrier flight speed)
(dataframe-sort*
(> speed))
(dataframe-display))
(dataframe-display
(dataframe-sort*
(dataframe-select*
(dataframe-modify*
(dataframe-filter*
flights
(dest distance air_time)
(and (not (or (na? dest) (na? distance) (na? air_time)))
(string=? dest "IAH")))
(speed
(distance air_time)
(inexact (* (/ distance air_time) 60))))
year month day dep_time carrier flight speed)
(> speed)))
(define flights1
(dataframe-filter*
flights
(dest distance air_time)
(and (not (or (na? dest) (na? distance) (na? air_time)))
(string=? dest "IAH"))))
(define flights2
(dataframe-modify*
flights1
(speed
(distance air_time)
(inexact (* (/ distance air_time) 60)))))
(define flights3
(dataframe-select*
flights2
year month day dep_time carrier flight speed))
(dataframe-display
(dataframe-sort* flights3 (> speed)))
;; 3.5 Groups
;; 3.5.2
;; no automatic sorting in dataframe-aggregate
(-> flights
(dataframe-aggregate*
(month)
(avg_delay (dep_delay) (inexact (mean dep_delay))))
(dataframe-sort*
(< month))
(dataframe-display))
(-> flights
(dataframe-filter*
(dep_delay)
(not (na? dep_delay)))
(dataframe-aggregate*
(month)
(avg_delay (dep_delay) (inexact (mean dep_delay)))
(n (dep_delay) (length dep_delay)))
(dataframe-sort*
(< month))
(dataframe-display))
;; 3.5.3 slice
;; no equivalent
;; 3.5.5
(define daily_flights
(dataframe-aggregate*
flights
(year month day)
(n (dep_time) (length dep_time))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment