Last active
March 19, 2021 14:14
-
-
Save Metaxal/ae0a6937d8f388f3f40ec7396041be55 to your computer and use it in GitHub Desktop.
The count-words problem using a discrimination tree with assocs and switching to hash when too many elements
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket/base | |
(require racket/dict) | |
;;; License: [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) or | |
;;; [MIT license](http://opensource.org/licenses/MIT) at your option. | |
(define N-PAIRS-MAX 20) | |
(define int-downcase (- (char->integer #\a) (char->integer #\A))) | |
(define (char-downcase c) | |
(integer->char (+ (char->integer c) int-downcase))) | |
(struct node (count dict n-pairs) #:mutable #:authentic) | |
(define (new-node) | |
(node 0 '() 0)) | |
(define root (new-node)) | |
(define (process-line! s start end nd) | |
(cond | |
[(= start end) | |
(set-node-count! nd (+ 1 (node-count nd)))] | |
[else | |
(define c (string-ref s start)) | |
(cond | |
[(char=? c #\space) | |
(set-node-count! nd (+ 1 (node-count nd))) | |
(process-line! s (+ start 1) end root)] | |
[else | |
(define c2 (if (char<=? #\A c #\Z) | |
(char-downcase c) | |
c)) | |
(define nd-dict (node-dict nd)) | |
(cond | |
[(or (null? nd-dict) | |
(pair? nd-dict)) | |
(define child | |
(cond | |
[(assv c2 nd-dict) => cdr] | |
[else | |
(define child (new-node)) | |
(define new-dict (cons (cons c2 child) nd-dict)) | |
(define n-pairs (node-n-pairs nd)) | |
(set-node-n-pairs! nd (+ 1 n-pairs)) | |
(set-node-dict! nd | |
(if (>= n-pairs N-PAIRS-MAX) | |
(make-hasheqv new-dict) ; too many assoc elements, use a hash instead | |
new-dict)) | |
child])) | |
(process-line! s (+ start 1) end child)] | |
[else ; dict is a hash | |
(define child (hash-ref! nd-dict c2 new-node)) | |
(process-line! s (+ start 1) end child)])])])) | |
(define (trie->list root) | |
(define res '()) | |
(let loop ([node root] [cs '()]) | |
(define n (node-count node)) | |
(when (> n 0) | |
(set! res (cons (cons (apply string (reverse cs)) n) | |
res))) | |
(for ([(char child) (in-dict (node-dict node))]) | |
(loop child (cons char cs)))) | |
res) | |
(time | |
(for ([line (in-lines)]) | |
(process-line! line 0 (string-length line) root))) | |
(define sorted-words (time (sort (trie->list root) > #:key cdr))) ; negligible time | |
(time | |
(for ([p (in-list sorted-words)]) | |
(printf "~a ~a\n" (car p) (cdr p)))) | |
(time (printf "#words: ~a\n" (length sorted-words))) ; 0 time |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment