Skip to content

Instantly share code, notes, and snippets.

@ericnormand
Last active August 4, 2019 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericnormand/9e54b331bdcf30e781cd129bde5b9c8b to your computer and use it in GitHub Desktop.
Save ericnormand/9e54b331bdcf30e781cd129bde5b9c8b to your computer and use it in GitHub Desktop.

Soundex

Soundex is an algorithm that attempts to encode two similar-sounding English words to the same value. It does this by dropping vowels and encoding consonants to numbers in a lossy way.

The algorithm is described on Wikipedia as "American Soundex". The challenge this week is to implement Soundex.

(ns soundex.main
(:require [clojure.string :as str]))
(def letters->code {#{\B \F \P \V} 1
#{\C \G \J \K \Q \S \X \Z} 2
#{\D \T} 3
#{\L} 4
#{\M \N} 5
#{\R} 6})
(def letter->code (reduce-kv (fn [m letters code]
(reduce #(assoc %1 %2 code) m letters))
{}
letters->code))
(defn soundex-seq [letters]
(loop [letters letters
prev-code nil]
(if letters
(let [[letter & remaining] letters
code (letter->code letter)]
(if prev-code
(if (= prev-code code)
(recur remaining prev-code)
(if (#{\H \W} letter)
(recur remaining prev-code)
(lazy-seq (cons prev-code (soundex-seq letters)))))
(recur remaining code)))
(when prev-code
[prev-code]))))
(defn soundex [s]
(let [[first-letter & _ :as letters] (str/upper-case s)
result (take 4 (soundex-seq letters))
result (cond-> result (letter->code first-letter) rest)]
(apply str first-letter (take 3 (concat result (repeat 0))))))
And then a simple test file (sadly, merely example-based; I told you I was interested in learning about PBT!):
(ns soundex.main-test
(:require [clojure.test :refer [deftest is]]
[soundex.main :refer [soundex]]))
(deftest soundex-test
(is (= "R163" (soundex "Robert")))
(is (= "R163" (soundex "Rupert")))
(is (= "R150" (soundex "Rubin")))
(is (= "A261" (soundex "Ashcraft")))
(is (= "A261" (soundex "Ashcroft")))
(is (= "T522" (soundex "Tymczak")))
(is (= "P236" (soundex "Pfister")))
(is (= "H555" (soundex "Honeyman"))))
Hello Eric,
Here is my submission for issue 337.
Regards,
Jos
(ns scratchpad.pftv.337
(:require [clojure.test :refer [deftest is testing run-tests]]))
(defn consonant-mapping [ch]
(case (Character/toLowerCase ch)
(\h \w) -1
(\b \f \p \v) 1
(\c \g \j \k \q \s \x \z) 2
(\d \t) 3
(\l) 4
(\m \n) 5
(\r) 6
nil))
(defn consonant? [ch]
(let [c (consonant-mapping ch)]
(and c (> c 0))))
(defn calculate-digits [name]
(second
(reduce
(fn [[prev acc] digit]
(cond
(nil? digit) [nil acc] ;; skip vowels
(= digit -1) [prev acc] ;; skip h/w but remember previous prev
(= digit prev) [prev acc] ;; same consonant
:else [digit (conj acc digit)]))
[nil []]
(map consonant-mapping name))))
;; https://en.wikipedia.org/wiki/Soundex#American_Soundex
(defn soundex [name]
(let [first-letter (first name)
all-digits (calculate-digits name)
;; drop the first digit if first letter is a consonant
digits (if (consonant? first-letter)
(rest all-digits)
all-digits)]
(apply str first-letter
;; right pad with zeroes
(take 3 (concat digits (repeat 0))))))
(deftest test-soundex
(testing "Examples from Wikipedia"
(is (= (soundex "Robert") (soundex "Rupert")))
(is (= "R150" (soundex "Rubin")))
(is (= (soundex "Ashcraft") (soundex "Ashcroft")))
(is (= "T522" (soundex "Tymczak" )))
(is (= "P236" (soundex "Pfister")))
(is (= "H555" (soundex "Honeyman")))))
(defn vc-code [c]
(case c
(\B \F \P \V) 1
(\C \G \J \K \Q \S \X \Z) 2
(\D \T) 3
\L 4
(\M \N) 5
\R 6
(\A \E \I \O \U \Y \H \W) 0))
(defn drop-consec-dups [s]
(map first (partition-by identity s)))
(defn drop-first-dup [a s]
(if (= (vc-code a) (first s)) (rest s) s))
(defn soundex [s]
(let [[f & s] (clojure.string/upper-case s)]
(->> (map vc-code s)
drop-consec-dups
(remove #(= 0 %))
(drop-first-dup f)
(cons f)
((fn [s] (concat s "000")))
(take 4)
(apply str))))
;; Tests
[(= "A226" (soundex "Ashcroft") (soundex "Ashcraft"))
(= "R150" (soundex "Rubin"))
(= "H555" (soundex "Honeyman"))
(= "J500" (soundex "Jackson"))
(= "T522" (soundex "Tymczak"))
(= "P236" (soundex "Pfister"))
(= "R163" (soundex "Robert") (soundex "Rupert"))
(= "G362" (soundex "Gutierrez"))
(= "L000" (soundex "Lee"))
(= "W252" (soundex "Washington"))]
(ns miner.soundex
(:require [clojure.string :as str]))
;; Clojure challenge: soundex
;; https://purelyfunctional.tv/issues/purelyfunctional-tv-newsletter-337-functional-programming-is-deep/
(defn encode [ch]
(case ch
(\b \B \f \F \p \P \v \V) 1
(\c \C \g \G \j \J \k \K \q \Q \s \S \x \X \z \Z) 2
(\d \D \t \T) 3
(\l \L) 4
(\m \M \n \N) 5
(\r \R) 6
(\h \H \w \W) -1
;; AEIOUY
0))
(defn soundex [word]
(when-let [cs (seq word)]
(let [c (first cs)
d (if (pos-int? (encode c)) 1 0)
sdx (into [(str/upper-case c)]
(comp (map encode) (remove neg?) (dedupe)
(remove zero?) (drop d) (take 3))
cs)]
(apply str (into sdx (repeat (- 4 (count sdx)) 0))))))
(defn smoke-test-soundex []
(doseq [[word sdx] {"Soundex", "S532"
"Example", "E251"
"Sownteks", "S532"
"Ekzampul", "E251"
"Euler", "E460"
"Gauss", "G200"
"Hilbert", "H416"
"Knuth", "K530"
"Lloyd", "L300"
"Lukasiewicz", "L222"
"Ellery", "E460"
"Ghosh", "G200"
"Heilbronn", "H416"
"HONEYMAN", "H555"
"Rubin", "R150"
"robert", "R163"
"Rupert", "R163"
"Kant", "K530"
"Ladd", "L300"
"Lissajous", "L222"
"Wheaton", "W350"
"Burroughs", "B620"
"Burrows", "B620"
"O'Hara", "O600"
"Washington", "W252"
"Lee", "L000"
"Gutierrez", "G362"
"Pfister", "P236"
"Jackson", "J250"
"Tymczak", "T522"
"VanDeusen", "V532"
"Ashcroft", "A261"
"Ashcraft", "A261"}]
(assert (= (soundex word) sdx) word))
true)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment