public
Created

TF-IDF in Racket

  • Download Gist
tf-idf.rkt
Racket
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
#lang racket
(require unstable/dict)
(provide main)
 
;; Set[String]
(define stopwords (list->set (file->lines "./stopwords.txt")))
 
;; String -> List[String]
(define (tokenize raw-text) ;; Lowercases and splits on non-letters, non-numbers.
(filter-not (λ (e) (set-member? stopwords e))
(regexp-split #rx"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+" (string-downcase raw-text))))
 
;; Number Hash[Any,Any] -> Number
(define (idf2 n-docs match) (sqr (log (/ n-docs (hash-count match)))))
 
;; Sequence[String] -> Hash[String,Number]
(define (frequencies l) (for/fold ([h (hash)]) ([i l]) (hash-update h i add1 0)))
 
;; Path-String -> Hash[String,Hash[Path-String,Number]]
;; Index for one file. Given an fname, returns a map of token -> map of (fname -> count)
(define (index-one fname)
(for/hash ([(k c) (in-hash (frequencies (tokenize (file->string fname))))])
(values k (hash fname c))))
 
;; Number Hash[String,Number] -> List[Hash[String,Number]]
;; Given the total term count and a map of doc -> count, accumulate tfidf for docs.
(define (accum-tfidf total-doc-count match)
(for/list ([(doc w-count) (in-hash match)])
(hash doc (* w-count (idf2 total-doc-count match)))))
 
;; Hash[String,Hash[String,Number]] Number String -> Hash[String,Number]
;; Returns accumulated tfidf for each doc.
(define (search db total-doc-count raw-text)
(for*/fold ([h (hash)]) ([e (tokenize raw-text)] [r (in-value (hash-ref db e #f))] #:when r)
(apply dict-union #:combine + h (accum-tfidf total-doc-count r))))
 
;; Hash[String,Hash[String,Number]] Number Hash[String,Number] String -> Void
(define (read-and-search db total-doc-count doc-norms raw-text)
(define results (search db total-doc-count raw-text))
(define scores (take-right (sort (for/list ([(k v) results])
(list k (/ v (hash-ref doc-norms k))))
< #:key second)
(min 3 (hash-count results))))
(printf "FOR: ~a matched: ~a\n" raw-text scores))
 
;; List[String] -> Void
(define (main . args)
(define db (apply dict-union (hash) #:combine (λ (a b) (dict-union a b #:combine (λ (a b) a))) (map index-one args)))
(define doc-norms-raw (apply dict-union (hash) #:combine + (append-map (λ (e) (accum-tfidf (length args) e)) (hash-values db))))
(define doc-norms (for/hash ([(k v) doc-norms-raw]) (values k (sqrt v))))
(for ([e (in-lines)]) (read-and-search db (length args) doc-norms e)))

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.