Created
June 25, 2019 05:05
-
-
Save dfuenzalida/aaa76836e445d633e1f6ac46213ef1c3 to your computer and use it in GitHub Desktop.
Anagrams challenge for PF.tv 332
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns unicode-anagrams.core
  (:require [clojure.string :as string])
  (:gen-class))
(defn normalize-string
  "Normalize a string `s` before computing anagram information.

  Upper-cases `s` and strips every whitespace character, so that letter case
  and word spacing do not influence the result.

  - Upper-casing uses `Locale/ROOT`, which makes the result independent of the
    JVM default locale (under a Turkish default locale \"i\" would otherwise
    become \"İ\" instead of \"I\").
  - The `(?U)` flag makes `\\s` match Unicode whitespace such as U+00A0
    NO-BREAK SPACE, not only the ASCII whitespace characters.

  Returns a new string."
  [^String s]
  (-> s
      (.toUpperCase java.util.Locale/ROOT)
      (.replaceAll "(?U)\\s" "")))
;; I was curious how hard it would be to match variants of a letter with its
;; plain version (e.g. match a Ç with the letter C). The following function
;; was good enough for all the *Latin* character variants I could think of.
(defn letter-name
  "Return a vector of up to 4 words taken from the Unicode name of the first
  code point of `c` (a character or a string), causing Latin letter variants
  to be equal to their base letter, i.e.:

    (letter-name \"Ç\") ;; => [\"LATIN\" \"CAPITAL\" \"LETTER\" \"C\"]

  Unassigned code points have no Unicode name; for those a single-element
  vector holding the `U+XXXX` hex notation of the code point is returned
  instead of failing with a NullPointerException."
  [c]
  (let [code-point (.codePointAt (str c) 0)
        ;; `Character/getName` returns nil for unassigned code points, which
        ;; `clojure.string/split` cannot handle, so substitute a stable label.
        full-name  (or (Character/getName code-point)
                       (format "U+%04X" code-point))]
    (vec (take 4 (clojure.string/split full-name #"\s+")))))
(defn latin-index
  "Build the anagram index of `s`: a map from `letter-name` vectors to the
  number of times a character with that name occurs in the normalized string.

  The normalized string is walked by Unicode code point rather than by UTF-16
  code unit, so a character outside the Basic Multilingual Plane is counted
  once under its own name instead of as two unrelated surrogate halves."
  [s]
  (let [^String normalized (normalize-string s)]
    (->> (.codePoints normalized)
         .iterator
         iterator-seq
         ;; Rebuild a one-code-point string so `letter-name` sees the full
         ;; character even when it needs a surrogate pair.
         (map #(letter-name (String. (Character/toChars (int %)))))
         frequencies)))
(defn ascii-index
  "Build a plain character index of `s`: a map from each character of the
  normalized string to the number of times it occurs. No letter-variant
  folding is applied."
  [s]
  (frequencies (normalize-string s)))
(defn anagrams?
  "Return true when `s1` and `s2` produce the same `latin-index`, i.e. when
  both strings have the same frequencies of Latin letter names."
  [s1 s2]
  (= (latin-index s1)
     (latin-index s2)))
(comment
  ;; REPL scratchpad: evaluate the forms below one at a time.

  ;; One name vector per character of the normalized input.
  (mapv letter-name (normalize-string "Édúçâtìön"))

  ;; Accented variants match their base letters
  ;; (expected true when the accented text is in precomposed form).
  (anagrams? "EDUCATION" "Édúçâtìön")

  ;; Case and spacing are ignored; both are expected to be true.
  (anagrams? "Acrid Avid Jam Shred" "Richard David James")
  (anagrams? "Wax The Nip" "Aphex Twin")
  )
;; Word list used by the examples at the end of the file. The list is fetched
;; over the network when this form is first evaluated; `defonce` keeps the
;; value across namespace reloads so it is not fetched again.
(defonce words
  (let [raw-text (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
        lines    (clojure.string/split-lines raw-text)]
    (remove clojure.string/blank? lines)))
(defn anagrams-in
  "Find the groups of mutual anagrams among the strings in `xs`.

  `indexer` maps a string to its anagram key; strings with equal keys end up
  in the same group. It defaults to `latin-index`; pass `ascii-index` for an
  ASCII-only but faster comparison.

  Returns a seq of vectors, each holding two or more elements of `xs` that
  share a key."
  ([xs]
   (anagrams-in xs latin-index))
  ([xs indexer]
   (->> (group-by indexer xs)
        vals
        ;; remove groups of one word, those are not anagrams
        (remove #(= 1 (count %))))))
;; (count (anagrams-in words)) ;; => 1303 | |
;; (last (sort-by count (anagrams-in words))) ;; => ["angel" "angle" "galen" "glean" "lange"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment