dfuenzalida/unicode-anagrams.core.clj

## unicode-anagrams.core.clj
(ns unicode-anagrams.core
  (:gen-class)
  (:require [clojure.string]))

(defn normalize-string
  "Normalize a string `s` before computing anagram information: upper-case it
  and strip every character matched by the regex `\\s`.

  Upper-casing uses `Locale/ROOT` so the result does not depend on the JVM's
  default locale (under a Turkish default locale, for instance, \"i\" would
  otherwise become \"İ\" instead of \"I\")."
  [^String s]
  (-> s
      (.toUpperCase java.util.Locale/ROOT)
      (.replaceAll "\\s" "")))

;; I was curious how hard it would be to match variants of a letter with the
;; base letter (e.g. match a Ç with the letter C); the following function was
;; good enough for all the *latin* char variants I could think of.

(defn letter-name
  "Return a vector of up to 4 words taken from the Unicode name of the first
  code point of `(str c)`, causing latin letter variants to be equal, i.e.:

  (letter-name \"Ç\") ;; => [\"LATIN\" \"CAPITAL\" \"LETTER\" \"C\"]

  `Character/getName` returns nil for unassigned code points; in that case the
  character's own string is returned as a one-element vector instead of
  throwing a NullPointerException.
  "
  [c]
  (let [text       (str c)
        code-point (Character/codePointAt text 0)]
    (if-let [fullname (Character/getName code-point)]
      ;; Only the first 4 words are kept, which drops suffixes such as
      ;; \"WITH CEDILLA\" from the names of accented latin letters.
      (vec (take 4 (clojure.string/split fullname #"\s+")))
      [text])))

(defn latin-index
  "Count how often each `letter-name` key occurs in the normalized form of `s`."
  [s]
  (frequencies (map letter-name (normalize-string s))))

(defn ascii-index
  "Count how often each character occurs in the normalized form of `s`."
  [s]
  (frequencies (normalize-string s)))

(defn anagrams?
  "True when `s1` and `s2` produce equal `latin-index` frequency maps."
  [s1 s2]
  (= (latin-index s1)
     (latin-index s2)))

(comment
  ;; A few tests...
  ;; Scratch expressions meant to be evaluated by hand; the `comment` form
  ;; keeps them from running when the namespace is loaded.

  ;; One name vector per character of the normalized string.
  (mapv letter-name (normalize-string "Édúçâtìön"))
  ;; Accented letters share the key of their base letter.
  (anagrams? "EDUCATION" "Édúçâtìön")
  ;; Case and whitespace are removed by `normalize-string` before comparing.
  (anagrams? "Acrid Avid Jam Shred" "Richard David James")
  (anagrams? "Wax The Nip" "Aphex Twin")
  )

;; Word list fetched over HTTP when this form is first evaluated (`defonce`
;; does not re-evaluate it on reload); blank lines are dropped.
(defonce words
  (remove clojure.string/blank?
          (clojure.string/split-lines
           (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt"))))

(defn anagrams-in
  "Group the words in `xs` by an index and return the groups that do not
  consist of a single word, i.e. the sets of words that are anagrams of each
  other.

  `indexer` maps a word to its grouping key. It defaults to `latin-index`;
  pass `ascii-index` for ascii-only, but faster, grouping."
  ([xs]
   (anagrams-in latin-index xs))
  ([indexer xs]
   (->> (group-by indexer xs)
        vals
        ;; remove groups of one word, those are not anagrams
        (remove #(= 1 (count %))))))

;; (count (anagrams-in words)) ;; => 1303
;; (last (sort-by count (anagrams-in words))) ;; => ["angel" "angle" "galen" "glean" "lange"]
	(ns unicode-anagrams.core
	(:gen-class))

	(defn normalize-string
	  "Normalize a string `s` before computing anagram information: upper-case it
	  and strip every character matched by the regex `\\s`.

	  Upper-casing uses `Locale/ROOT` so the result does not depend on the JVM's
	  default locale (under a Turkish default locale, for instance, \"i\" would
	  otherwise become \"İ\" instead of \"I\")."
	  [^String s]
	  (-> s
	      (.toUpperCase java.util.Locale/ROOT)
	      (.replaceAll "\\s" "")))

	;; I was curious how hard it would be to match variants of a letter with the
	;; base letter (e.g. match a Ç with the letter C); the following function was
	;; good enough for all the latin char variants I could think of.

	(defn letter-name
	  "Return a vector of up to 4 words taken from the Unicode name of the first
	  code point of `(str c)`, causing latin letter variants to be equal, i.e.:

	  (letter-name \"Ç\") ;; => [\"LATIN\" \"CAPITAL\" \"LETTER\" \"C\"]

	  `Character/getName` returns nil for unassigned code points; in that case the
	  character's own string is returned as a one-element vector instead of
	  throwing a NullPointerException.
	  "
	  [c]
	  (let [text       (str c)
	        code-point (Character/codePointAt text 0)]
	    (if-let [fullname (Character/getName code-point)]
	      ;; Only the first 4 words are kept, which drops suffixes such as
	      ;; \"WITH CEDILLA\" from the names of accented latin letters.
	      (vec (take 4 (clojure.string/split fullname #"\s+")))
	      [text])))

	(defn latin-index
	  "Count how often each `letter-name` key occurs in the normalized form of `s`."
	  [s]
	  (frequencies (map letter-name (normalize-string s))))

	(defn ascii-index
	  "Count how often each character occurs in the normalized form of `s`."
	  [s]
	  (frequencies (normalize-string s)))

	(defn anagrams?
	  "True when `s1` and `s2` produce equal `latin-index` frequency maps."
	  [s1 s2]
	  (= (latin-index s1)
	     (latin-index s2)))

	(comment
	;; A few tests...
	;; Scratch expressions meant to be evaluated by hand; the `comment` form
	;; keeps them from running when the namespace is loaded.

	;; One name vector per character of the normalized string.
	(mapv letter-name (normalize-string "Édúçâtìön"))
	;; Accented letters share the key of their base letter.
	(anagrams? "EDUCATION" "Édúçâtìön")
	;; Case and whitespace are removed by `normalize-string` before comparing.
	(anagrams? "Acrid Avid Jam Shred" "Richard David James")
	(anagrams? "Wax The Nip" "Aphex Twin")
	)

	;; Word list fetched over HTTP when this form is first evaluated (`defonce`
	;; does not re-evaluate it on reload); blank lines are dropped.
	(defonce words
	  (remove clojure.string/blank?
	          (clojure.string/split-lines
	           (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt"))))

	(defn anagrams-in
	  "Group the words in `xs` by an index and return the groups that do not
	  consist of a single word, i.e. the sets of words that are anagrams of each
	  other.

	  `indexer` maps a word to its grouping key. It defaults to `latin-index`;
	  pass `ascii-index` for ascii-only, but faster, grouping."
	  ([xs]
	   (anagrams-in latin-index xs))
	  ([indexer xs]
	   (->> (group-by indexer xs)
	        vals
	        ;; remove groups of one word, those are not anagrams
	        (remove #(= 1 (count %))))))

	;; (count (anagrams-in words)) ;; => 1303
	;; (last (sort-by count (anagrams-in words))) ;; => ["angel" "angle" "galen" "glean" "lange"]