Last active
August 17, 2017 19:27
-
-
Save pataprogramming/4cfdd72295c5451d0a084c4469d98995 to your computer and use it in GitHub Desktop.
Similar words by frequency - phillydev #daily_programmer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns daily.similar-frequencies | |
(:require [clojure.string :as string] | |
[clojure.java.io :as io])) | |
(def word-file "resources/google-10000-english-no-swears.txt") | |
;; A vector containing all words in the dataset | |
(def words | |
(with-open [rdr (io/reader word-file)] | |
(->> rdr | |
line-seq | |
(mapv string/trim)))) | |
;; A hashmap of letters to the number of times they occur in the dataset | |
(def histogram | |
(->> words | |
(mapcat identity) | |
frequencies)) | |
;; A vector of letters in the dataset, ordered by frequency | |
(def ranks | |
(->> histogram | |
(sort-by val >) | |
(mapv first))) | |
(defn score-by-count [word] | |
"Given word, counts the frequencies of each letter and returns a vector | |
(ordered by the frequencies of each letter. Vectors can be compared | |
element by element to impose an ordering." | |
(let [freqs (frequencies word)] | |
(mapv #(get freqs %) ranks))) | |
;; Comparator combinators! | |
(defn reverse-comparator [c] | |
"Given a comparator c, returns a new comparator that reverses c's ordering." | |
(fn [x y] | |
(* -1 (c x y)))) | |
(defn chain-comparators [& comparators] | |
"Given any number of comparators, returns a new comparator that chains them | |
together; if the result of applying the first comparator to its arguments | |
x and y is 0, it compares using the second comparator, and so on." | |
(fn [x y] | |
(loop [[c & cs] comparators] | |
(if c | |
(let [result (c x y)] | |
(if (zero? result) | |
(recur cs) | |
result)) | |
0)))) | |
(defn key-comparator [key-fn c] | |
"Given a comparator c and a function key-fn, returns a new comparator that | |
applies key-fn to its arguments x and y before using c to perform the | |
comparison." | |
(fn [x y] | |
(c (key-fn x) (key-fn y)))) | |
;; Now, the logic to stitch the weighting functions into comparators for sorting | |
(def order-by-count | |
(chain-comparators | |
;; Word with most instances of most frequent letter in dataset comes first | |
(reverse-comparator (key-comparator score-by-count compare)) | |
;; If tied, word that comes earlier in alphabetical order wins | |
compare)) | |
(defn get-similar-words-by [cmp word] | |
"Given a comparator cmp and a word, returns words from the dataset that | |
are of 'similar' length, sorted using cmp." | |
(->> words | |
(filter #(or (= (count %) (count word)) | |
(= (count %) (inc (count word))))) | |
(sort cmp))) | |
(def get-similar-words-by-count | |
^{:doc "Given a word as an argument, returns words of 'similar' length | |
from the dataset, ordered by the number of times the most | |
frequent letters appear in the word."} | |
(partial get-similar-words-by order-by-count)) | |
;; Given the parameterization, it's easy to swap out the scoring function: | |
(defn score-by-weight [word] | |
"Given word, computes a weight value by scoring each letter and summing the | |
scores. A letter's score is equal to the number of times it appears in | |
the dataset." | |
(->> word | |
(map histogram) | |
(reduce +))) | |
(def order-by-weight | |
(chain-comparators | |
;; Order words by computing their weights | |
(reverse-comparator (key-comparator score-by-weight compare)) | |
;; If tied, word that comes earlier in the alphabetical order wins | |
compare)) | |
(def get-similar-words-by-weight | |
(partial get-similar-words-by order-by-weight)) | |
;; daily.similar-frequencies> (take 10 (get-similar-words-by-count "foo")) | |
;; ("ieee" "ease" "sees" "seen" "else" "seed" "seem" "eyes" "fees" "seek") | |
;; daily.similar-frequencies> (take 10 (get-similar-words-by-weight "foo")) | |
;; ("ieee" "ease" "sees" "seen" "tree" "teen" "else" "reel" "area" "seas") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment