Skip to content

Instantly share code, notes, and snippets.

@Metaxal
Last active March 19, 2021 14:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Metaxal/ae0a6937d8f388f3f40ec7396041be55 to your computer and use it in GitHub Desktop.
Save Metaxal/ae0a6937d8f388f3f40ec7396041be55 to your computer and use it in GitHub Desktop.
The count-words problem using a discrimination tree with assocs and switching to hash when too many elements
#lang racket/base
(require racket/dict)
;;; License: [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) or
;;; [MIT license](http://opensource.org/licenses/MIT) at your option.
(define N-PAIRS-MAX 20)
(define int-downcase (- (char->integer #\a) (char->integer #\A)))
(define (char-downcase c)
(integer->char (+ (char->integer c) int-downcase)))
(struct node (count dict n-pairs) #:mutable #:authentic)
(define (new-node)
(node 0 '() 0))
(define root (new-node))
(define (process-line! s start end nd)
(cond
[(= start end)
(set-node-count! nd (+ 1 (node-count nd)))]
[else
(define c (string-ref s start))
(cond
[(char=? c #\space)
(set-node-count! nd (+ 1 (node-count nd)))
(process-line! s (+ start 1) end root)]
[else
(define c2 (if (char<=? #\A c #\Z)
(char-downcase c)
c))
(define nd-dict (node-dict nd))
(cond
[(or (null? nd-dict)
(pair? nd-dict))
(define child
(cond
[(assv c2 nd-dict) => cdr]
[else
(define child (new-node))
(define new-dict (cons (cons c2 child) nd-dict))
(define n-pairs (node-n-pairs nd))
(set-node-n-pairs! nd (+ 1 n-pairs))
(set-node-dict! nd
(if (>= n-pairs N-PAIRS-MAX)
(make-hasheqv new-dict) ; too many assoc elements, use a hash instead
new-dict))
child]))
(process-line! s (+ start 1) end child)]
[else ; dict is a hash
(define child (hash-ref! nd-dict c2 new-node))
(process-line! s (+ start 1) end child)])])]))
(define (trie->list root)
(define res '())
(let loop ([node root] [cs '()])
(define n (node-count node))
(when (> n 0)
(set! res (cons (cons (apply string (reverse cs)) n)
res)))
(for ([(char child) (in-dict (node-dict node))])
(loop child (cons char cs))))
res)
(time
(for ([line (in-lines)])
(process-line! line 0 (string-length line) root)))
(define sorted-words (time (sort (trie->list root) > #:key cdr))) ; negligible time
(time
(for ([p (in-list sorted-words)])
(printf "~a ~a\n" (car p) (cdr p))))
(time (printf "#words: ~a\n" (length sorted-words))) ; 0 time
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment