Skip to content

Instantly share code, notes, and snippets.

@ericnormand
Last active June 28, 2019 15:15
Embed
What would you like to do?

anagrams

Two words are anagrams of each other if they have the same letters but in different orders.

This week's challenge is a two-parter.

Write a function to determine if two words are anagrams.

Given a dictionary of words (such as this one), find all of the anagrams for a target word.

Bonus points for efficiency.

(defn anagram? [word prospect]
(and (not= word prospect)
(= (sort word) (sort prospect))))
(defn anagrams-for [word prospect-list]
(filter #(anagram? (str/lower-case word)
(str/lower-case %))
prospect-list))
(require '[clojure.java.io :refer [reader]])
;; (def url "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
(def url "unixdict.txt")
(defn find-anagrams [word]
(let [target (sort word)
word-size (count word)
words (reader url)]
(filter (fn [line]
(if (= (count line) word-size)
(= target (sort line))
false))
(line-seq words))))
(println (time (find-anagrams "listen")))
(ns unicode-anagrams.core
(:gen-class))
(defn normalize-string
"Normalize a string `s` before computing anagram information"
[s]
(-> s
.toUpperCase
(.replaceAll "\\s" "")))
;; I was curious about how hard it could be to match some variants of a letter
;; with the neutral version (eg. match a Ç with the letter C) and the following
;; function was good enough for all the *latin* char variants I could think of.
(defn letter-name
"Return a vector of a size up to 4 representing the characters unicode name,
causing latin letter variants to be equal, i.e.:
(letter-name \"Ç\") ;; => [\"LATIN\" \"CAPITAL\" \"LETTER\" \"C\"]
"
[c]
(let [fullname-parts (-> c str
(Character/codePointAt 0)
Character/getName
(clojure.string/split #"\s+"))]
(vec (take 4 fullname-parts))))
(defn latin-index [s]
(->> s normalize-string (map letter-name) frequencies))
(defn ascii-index [s]
(->> s normalize-string frequencies))
(defn anagrams?
"Checks if two given strings have the same frequencies of latin letter names"
[s1 s2]
(let [f1 (latin-index s1)
f2 (latin-index s2)]
(= f1 f2)))
(comment
;; A few tests...
(mapv letter-name (normalize-string "Édúçâtìön"))
(anagrams? "EDUCATION" "Édúçâtìön")
(anagrams? "Acrid Avid Jam Shred" "Richard David James")
(anagrams? "Wax The Nip" "Aphex Twin")
)
(defonce words
(->> (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
clojure.string/split-lines
(remove clojure.string/blank?)))
(defn anagrams-in [xs]
(let [indexer latin-index ;; or `ascii-index` for ascii-only, but faster
groups (group-by indexer xs)]
(->> (vals groups)
(remove #(= 1 (count %)))))) ;; remove groups of one word, those are not anagrams
;; (count (anagrams-in words)) ;; => 1303
;; (last (sort-by count (anagrams-in words))) ;; => ["angel" "angle" "galen" "glean" "lange"]
(defn anagram? [w1 w2]
(= (sort (clojure.string/lower-case w1))
(sort (clojure.string/lower-case w2))))
(defn- ->index [word]
(-> word
clojure.string/lower-case
frequencies))
(def dict (clojure.string/split-lines (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")))
(def dindex (group-by ->index dict))
(defn anagrams-of
([dict word]
(filter #(anagram? word %) dict))
([word]
(get dindex (->index word))))
(ns anagrams.core
(:gen-class)
(:require
[clojure.core.reducers :as r]
[clojure.string :as string]
[clojure.java.io :as io]))
(defn gen-primes
([] (gen-primes (iterate inc (bigint 2))))
([ps]
(cons (first ps)
(lazy-seq
(gen-primes
(remove #(= 0 (mod % (first ps)))
(rest ps)))))))
(def primes (gen-primes))
(def letters
(into #{} (map char) (range (int \a) (inc (int \z)))))
(def char->prime
(zipmap letters primes))
(def product (partial reduce * 1))
(defn word->number
[s]
(->> s
(string/lower-case)
seq
(filter letters)
(map char->prime)
product))
(defn anagram?
[s1 s2]
(= (word->number s1)
(word->number s2)))
(defn load-dict
[]
(with-open [f (io/reader (io/resource "words.txt"))]
(loop [line (.readLine f)
words []]
(if (nil? line)
words
(recur (.readLine f)
(conj words [(word->number line) [line]]))))))
(defonce dict (load-dict))
(defn anagram-search
[w]
(let [needle (word->number w)
anagramable? (fn [n] (and (<= n needle) (= 0 (mod needle n))))
possible-words (filter (fn [[n _]] (anagramable? n)) dict)
next-combos (fn [potentials]
(r/foldcat
(r/mapcat
(fn [[n words]]
(r/foldcat
(->> possible-words
(r/map (fn [[m [w]]] [(* n m) (vec (sort (conj words w)))]))
(r/filter (fn [[n*m _]] (anagramable? n*m))))))
potentials)))]
;; new approach: loop over current list of candidates, store any
;; that are exactly needle, discard any that are greater than
;; needle *or* don't factor it, then form new combinations of the
;; single values with the remaining values, until the remaining
;; values is empty
(loop [candidates possible-words
anagrams #{}]
(prn "candidates" (count candidates))
(if (empty? candidates)
anagrams
(let [{anas :ana possible :can}
(->> candidates
(group-by (fn [[n words]]
(cond
(= n needle) :ana
(anagramable? n) :can
:else nil))))
next-candidates (next-combos possible)]
(recur next-candidates (into anagrams (map second anas))))))))
(defn -main
[& args]
(println "Anagrams of " (first args))
(doseq [words (anagram-search (first args))]
(println words)))
(ns clojure-experiments.purely-functional.puzzles.0322-anagrams
"Two parts:
- Write a function to determine if two words are anagrams.
- Given a dictionary of words (such as this one: http://wiki.puzzlers.org/pub/wordlists/unixdict.txt),
find all of the anagrams for a target word.
Bonus points for efficiency."
(:require [clojure.java.io :as io]))
;;; Save the dictionary for later processing
(def dictionary-file (io/file "src/clojure_experiments/purely-functional/puzzles/0322-dictionary.txt"))
(defn- save-dictionary
"Helper function to save dictionary for fast offline access."
[]
(spit dictionary-file
(slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt" )))
#_(save-dictionary)
(def my-dictionary (->> dictionary-file
slurp
(clojure.string/split-lines)))
#_(take 10 my-dictionary)
(defn anagrams? [w1 w2]
(and (= (count w1) (count w2))
(= (set w1) (set w2))))
(anagrams? "ahoj" "helo")
;; => false
(anagrams? "helo" "ehlo")
;; => true
(anagrams? "helo" "hello")
;; => false
(defn dictionary-anagrams [word dictionary]
(reduce
(fn [anagrams candidate]
(if (anagrams? word candidate)
(conj anagrams candidate)
anagrams))
[]
dictionary))
(comment
(dictionary-anagrams "helo" my-dictionary)
;; => [hole]
(dictionary-anagrams "car" my-dictionary)
;; => ["arc" "car" "rca"]
(time (dictionary-anagrams "tire" my-dictionary))
;; => ["rite" "tier" "tire"]
;; "Elapsed time: 6.253211 msecs"
;; end
)
(ns anagrams.core
(:require [clojure.string :as cstr]))
(defn ->lc
"Slightly faster lower-case function, but only for ASCII text"
[w]
(let [sb (StringBuffer.)]
(run! #(.append sb (char (bit-set (int %) 5))) w)
(.toString sb)))
(defn word-hash [w] (sort w))
(def dict-location "resources/unixdict.txt")
(def lc
->lc ; faster, but assumes ASCII
;#(.toLowerCase %) ; slower but more correct
)
(defn read-dict
"Read in the dictionary from path. Assumes one word per line."
[dict-file-path]
(->> (cstr/split (slurp dict-file-path) #"\n")
(reduce #(assoc %1 (word-hash (lc %2)) (conj (get %1 (word-hash (lc %2)) []) (lc %2))) {})))
;---------------------------
(defn is-anagram?
"Is word 1 an anagram of word 2?"
[w1 w2]
(let [w1' (lc w1)
w2' (lc w2)]
(and (= (word-hash w1') (word-hash w2'))
(not= w1' w2'))))
(defn find-anagrams
"Find anagrams of a word, or empty list if none in the current dictionary"
([w] (find-anagrams w (read-dict dict-location)))
([w dict]
(let [w' (lc w) ]
(filter #(not= w' %) (get dict (word-hash w'))))))
(require '[clojure.string :as string])
(defn example-dict
[]
(-> "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt"
(slurp)
(string/split-lines)))
(defn anagram?
"Same set of characters, but not equal."
[s1 s2]
(and (= (frequencies s1)
(frequencies s2))
(not= s1 s2)))
(defn anagrams1
[s dict]
(filter (partial anagram? s) dict)) ; Can be sped up by memoizing `frequencies`
(defn anagrams2
[s dict]
(->> (get (group-by frequencies dict) ; Group by frequencies
(frequencies s))
(remove #{s})))
(defn- remove-letter
[s letter]
(let [[first-half last-half] (split-with (complement #{letter}) s)]
(concat first-half (rest last-half))))
(defn- anagrams3*
[[f & r] dict]
(if f ; While there are more letters to check
(->> dict
(filter (fn [{:keys [rest]}] (some #{f} rest))) ; Filter on words in dict containing first letter of search-string
(map #(update % :rest remove-letter f)) ; Remove that letter from remaining words in dict
(recur r))
dict))
(defn anagrams3
[s dict]
(->> dict
(filter #(= (count s) (count %))) ; Anagrams have the same length (not necessary, but speeds up the function quite a bit)
(map (fn [word] {:word word :rest word})) ; Preprocess dictionary
(anagrams3* s)
(map :word) ; Get back original word
(remove #{s}))) ; Remove the string being anagram'd
(ns miner.ana
(:require [clojure.java.io :as io]))
;;; Eric Normand's Clojure challenge: anagrams
;;; https://purelyfunctional.tv/?p=31286
(defn digest [w]
(sort (seq w)))
(defn anagram? [a b]
(and (not= a b)
(= (digest a) (digest b))))
(defn anagram-search [word anagram-group]
(let [group (get anagram-group (digest word))]
(not-empty (remove #{word} group))))
;; assume text file with one word per line
(defn load-words [filename]
(with-open [fi (io/reader filename)]
(binding [*in* fi]
(doall (take-while some? (repeatedly read-line))))))
(defn anagram-group [filename]
(group-by digest (load-words filename)))
(def default-dict "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
(def default-anagram-group (delay (anagram-group default-dict)))
(def local-dict "/usr/share/dict/words")
(def local-anagram-group (delay (anagram-group local-dict)))
;; `dict` can be either a file reference or a predigested anagram group (for better performance)
(defn anagrams
([word]
(anagram-search word @default-anagram-group))
([word dict]
(anagram-search word (if (map? dict) dict (anagram-group dict)))))
#_
(anagrams "crate")
;; ("caret" "carte" "cater" "trace")
#_
(apply max-key count (vals @local-anagram-group))
;; ["angor" "argon" "goran" "grano" "groan" "nagor" "orang" "organ" "rogan"]
#_
(pprint (sort-by first (filter #(>= (count %) 5) (vals @default-anagram-group))))
;; (["abel" "able" "bale" "bela" "elba"]
;; ["alger" "glare" "lager" "large" "regal"]
;; ["angel" "angle" "galen" "glean" "lange"]
;; ["caret" "carte" "cater" "crate" "trace"]
;; ["elan" "lane" "lean" "lena" "neal"]
;; ["evil" "levi" "live" "veil" "vile"])
(ns scratch.anagrams
(:require [clojure.string :as string]))
(defn anagrams? [& words]
(reduce = (map sort words)))
(anagrams? "neo" "one")
;; => true
(anagrams? "jamesreeves" "weavejester")
;; => false
(defn load-dictionary [path]
(-> path slurp string/split-lines))
(def unix-dict-path "/home/teodorlu/Downloads/unixdict.txt")
;; alternatively, (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
(def unix-dict (load-dictionary unix-dict-path))
(->> unix-dict
shuffle
(take 5))
;; => ("inception" "mickey" "interest" "analogue" "discernible")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TAKE 1: just filter through the words
(defn find-anagrams [dictionary word]
(let [characters (sort word)]
(filter (fn [dict-word]
(= (sort dict-word) characters))
dictionary)))
(def unix-anagrams (partial #'find-anagrams unix-dict))
(unix-anagrams "neo")
;; => ("one")
(time (unix-anagrams "clojure"))
;; => ()
(unix-anagrams "lisp")
;; => ("lisp" "slip")
(time (unix-anagrams "lisp"))
;; => ("lisp" "slip")
"Elapsed time: 0.123896 msecs"
(time (count (load-dictionary unix-dict-path)))
"Elapsed time: 20.51089 msecs"
"Elapsed time: 28.673577 msecs"
"Elapsed time: 25.327939 msecs"
"Elapsed time: 20.880587 msecs"
"Elapsed time: 24.631787 msecs"
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TAKE 2: Index on sorted words.
(defn load-anagram-index [path]
(->> path
slurp
string/split-lines
(group-by sort)))
(time (load-anagram-index unix-dict-path))
"Elapsed time: 81.89557 msecs"
(def unix-dict-index (load-anagram-index unix-dict-path))
(time (unix-dict-index (sort "lisp")))
;; => ["lisp" "slip"]
"Elapsed time: 0.209954 msecs"
"Elapsed time: 0.149922 msecs"
"Elapsed time: 0.135523 msecs"
(time (unix-dict-index (sort "neo")))
;; => ["one"]
"Elapsed time: 0.047577 msecs"
"Elapsed time: 0.191929 msecs"
"Elapsed time: 0.11953 msecs"
(time (dotimes [i 100]
(unix-dict-index (sort "neo"))))
"Elapsed time: 1.173734 msecs"
(time (dotimes [i (* 10 1000)]
(unix-dict-index (sort "neo"))))
"Elapsed time: 54.678017 msecs"
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; On efficiency
;;
;; There are two variables here:
;;
;; - How many anagrams are you going to look up?
;; - How large is your dictionary?
;;
;; In my case, I couldn't measure a speedup from using the index.
;;
;; Conclusion: plain filter is good for small problems. For larger problems, the
;; dictionary size and the number of words to be checked for whether they are
;; anagrams is relevant.
(defn- to-bag-of-chars
[w]
(frequencies w))
(defn- word-signature
"A fast hash on Strings which is invariant under re-ordering of the characters."
^long [^String s]
(loop [i (int 0)
sig (long 0)]
(if (= i (.length s))
sig
(recur
(unchecked-inc-int i)
(+ sig (.codePointAt s i))))))
(defn anagrams?
[w1 w2]
(and
(and ;; for efficiency - will catch a lot of easy cases
(= (count w1) (count w2))
(= (word-signature w1) (word-signature w2)))
(=
(to-bag-of-chars w1)
(to-bag-of-chars w2))))
(defn anagrams-of
[dictionary-words target-word]
(filter
#(anagrams? target-word %)
dictionary-words))
(comment
(require '[clojure.string :as str])
;; Fetching our dictionary
(def dict
(str/split-lines
(slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")))
;; Let's have a look at a sample
(->> dict shuffle (take 10))
=> ("yacht" "decade" "armchair" "audio" "gender" "carrel" "antiquated" "powers" "metro" "skull")
;; Example of searching for anagrams of one word
(anagrams-of dict "art")
=> ("art" "rat" "tar")
;; What are the words with the most anagrams?
(->> dict
(group-by to-bag-of-chars)
vals
(sort-by count) reverse
(take 10)
vec)
=>
[["elan" "lane" "lean" "lena" "neal"]
["abel" "able" "bale" "bela" "elba"]
["alger" "glare" "lager" "large" "regal"]
["angel" "angle" "galen" "glean" "lange"]
["caret" "carte" "cater" "crate" "trace"]
["evil" "levi" "live" "veil" "vile"]
["beard" "bread" "debar" "debra"]
["esprit" "priest" "sprite" "stripe"]
["keats" "skate" "stake" "steak"]
["diet" "edit" "tide" "tied"]]
*e)
(ns anagram-challenge.core
(:gen-class))
(defn anagram? [one two]
(and (= (count one) (count two))
(= (into #{} (clojure.string/lower-case one))
(into #{} (clojure.string/lower-case two)))))
(defn fish-all-anagrams [hook pool]
(filter #(anagram? hook %) pool))
(defn -main [& args]
(when-not (empty? args)
(with-open [rdr (clojure.java.io/reader "unixdict.txt")]
(let [lines (reduce conj [] (line-seq rdr))]
(println (fish-all-anagrams (first args) lines))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment