Skip to content

Instantly share code, notes, and snippets.

@samth
Created August 19, 2011 15:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save samth/1157123 to your computer and use it in GitHub Desktop.
Save samth/1157123 to your computer and use it in GitHub Desktop.
TF-IDF in Racket
#lang racket
(require unstable/dict)
(provide main)
;; Set[String]
(define stopwords (list->set (file->lines "./stopwords.txt")))
;; String -> List[String]
(define (tokenize raw-text) ;; Lowercases and splits on non-letters, non-numbers.
(filter-not (λ (e) (set-member? stopwords e))
(regexp-split #rx"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+" (string-downcase raw-text))))
;; Number Hash[Any,Any] -> Number
(define (idf2 n-docs match) (sqr (log (/ n-docs (hash-count match)))))
;; Sequence[String] -> Hash[String,Number]
(define (frequencies l) (for/fold ([h (hash)]) ([i l]) (hash-update h i add1 0)))
;; Path-String -> Hash[String,Hash[Path-String,Number]]
;; Index for one file. Given an fname, returns a map of token -> map of (fname -> count)
(define (index-one fname)
(for/hash ([(k c) (in-hash (frequencies (tokenize (file->string fname))))])
(values k (hash fname c))))
;; Number Hash[String,Number] -> List[Hash[String,Number]]
;; Given the total term count and a map of doc -> count, accumulate tfidf for docs.
(define (accum-tfidf total-doc-count match)
(for/list ([(doc w-count) (in-hash match)])
(hash doc (* w-count (idf2 total-doc-count match)))))
;; Hash[String,Hash[String,Number]] Number String -> Hash[String,Number]
;; Returns accumulated tfidf for each doc.
(define (search db total-doc-count raw-text)
(for*/fold ([h (hash)]) ([e (tokenize raw-text)] [r (in-value (hash-ref db e #f))] #:when r)
(apply dict-union #:combine + h (accum-tfidf total-doc-count r))))
;; Hash[String,Hash[String,Number]] Number Hash[String,Number] String -> Void
(define (read-and-search db total-doc-count doc-norms raw-text)
(define results (search db total-doc-count raw-text))
(define scores (take-right (sort (for/list ([(k v) results])
(list k (/ v (hash-ref doc-norms k))))
< #:key second)
(min 3 (hash-count results))))
(printf "FOR: ~a matched: ~a\n" raw-text scores))
;; List[String] -> Void
(define (main . args)
(define db (apply dict-union (hash) #:combine (λ (a b) (dict-union a b #:combine (λ (a b) a))) (map index-one args)))
(define doc-norms-raw (apply dict-union (hash) #:combine + (append-map (λ (e) (accum-tfidf (length args) e)) (hash-values db))))
(define doc-norms (for/hash ([(k v) doc-norms-raw]) (values k (sqrt v))))
(for ([e (in-lines)]) (read-and-search db (length args) doc-norms e)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment