Skip to content

Instantly share code, notes, and snippets.

@ericnormand
Last active June 28, 2019 15:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericnormand/4de91f3c5fc4c97f04b7718a5726a76e to your computer and use it in GitHub Desktop.
Save ericnormand/4de91f3c5fc4c97f04b7718a5726a76e to your computer and use it in GitHub Desktop.

anagrams

Two words are anagrams of each other if they have the same letters but in different orders.

This week's challenge is a two-parter.

Write a function to determine if two words are anagrams.

Given a dictionary of words (such as this one), find all of the anagrams for a target word.

Bonus points for efficiency.

(defn anagram? [word prospect]
(and (not= word prospect)
(= (sort word) (sort prospect))))
(defn anagrams-for [word prospect-list]
(filter #(anagram? (str/lower-case word)
(str/lower-case %))
prospect-list))
(require '[clojure.java.io :refer [reader]])
;; (def url "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
(def url "unixdict.txt")
(defn find-anagrams [word]
(let [target (sort word)
word-size (count word)
words (reader url)]
(filter (fn [line]
(if (= (count line) word-size)
(= target (sort line))
false))
(line-seq words))))
(println (time (find-anagrams "listen")))
(ns unicode-anagrams.core
(:gen-class))
(defn normalize-string
"Normalize a string `s` before computing anagram information"
[s]
(-> s
.toUpperCase
(.replaceAll "\\s" "")))
;; I was curious about how hard it could be to match some variants of a letter
;; with the neutral version (eg. match a Ç with the letter C) and the following
;; function was good enough for all the *latin* char variants I could think of.
(defn letter-name
"Return a vector of a size up to 4 representing the characters unicode name,
causing latin letter variants to be equal, i.e.:
(letter-name \"Ç\") ;; => [\"LATIN\" \"CAPITAL\" \"LETTER\" \"C\"]
"
[c]
(let [fullname-parts (-> c str
(Character/codePointAt 0)
Character/getName
(clojure.string/split #"\s+"))]
(vec (take 4 fullname-parts))))
(defn latin-index [s]
(->> s normalize-string (map letter-name) frequencies))
(defn ascii-index [s]
(->> s normalize-string frequencies))
(defn anagrams?
"Checks if two given strings have the same frequencies of latin letter names"
[s1 s2]
(let [f1 (latin-index s1)
f2 (latin-index s2)]
(= f1 f2)))
(comment
;; A few tests...
(mapv letter-name (normalize-string "Édúçâtìön"))
(anagrams? "EDUCATION" "Édúçâtìön")
(anagrams? "Acrid Avid Jam Shred" "Richard David James")
(anagrams? "Wax The Nip" "Aphex Twin")
)
(defonce words
(->> (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
clojure.string/split-lines
(remove clojure.string/blank?)))
(defn anagrams-in [xs]
(let [indexer latin-index ;; or `ascii-index` for ascii-only, but faster
groups (group-by indexer xs)]
(->> (vals groups)
(remove #(= 1 (count %)))))) ;; remove groups of one word, those are not anagrams
;; (count (anagrams-in words)) ;; => 1303
;; (last (sort-by count (anagrams-in words))) ;; => ["angel" "angle" "galen" "glean" "lange"]
(defn anagram? [w1 w2]
(= (sort (clojure.string/lower-case w1))
(sort (clojure.string/lower-case w2))))
(defn- ->index [word]
(-> word
clojure.string/lower-case
frequencies))
(def dict (clojure.string/split-lines (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")))
(def dindex (group-by ->index dict))
(defn anagrams-of
([dict word]
(filter #(anagram? word %) dict))
([word]
(get dindex (->index word))))
(ns anagrams.core
(:gen-class)
(:require
[clojure.core.reducers :as r]
[clojure.string :as string]
[clojure.java.io :as io]))
(defn gen-primes
([] (gen-primes (iterate inc (bigint 2))))
([ps]
(cons (first ps)
(lazy-seq
(gen-primes
(remove #(= 0 (mod % (first ps)))
(rest ps)))))))
(def primes (gen-primes))
(def letters
(into #{} (map char) (range (int \a) (inc (int \z)))))
(def char->prime
(zipmap letters primes))
(def product (partial reduce * 1))
(defn word->number
[s]
(->> s
(string/lower-case)
seq
(filter letters)
(map char->prime)
product))
(defn anagram?
[s1 s2]
(= (word->number s1)
(word->number s2)))
(defn load-dict
[]
(with-open [f (io/reader (io/resource "words.txt"))]
(loop [line (.readLine f)
words []]
(if (nil? line)
words
(recur (.readLine f)
(conj words [(word->number line) [line]]))))))
(defonce dict (load-dict))
(defn anagram-search
[w]
(let [needle (word->number w)
anagramable? (fn [n] (and (<= n needle) (= 0 (mod needle n))))
possible-words (filter (fn [[n _]] (anagramable? n)) dict)
next-combos (fn [potentials]
(r/foldcat
(r/mapcat
(fn [[n words]]
(r/foldcat
(->> possible-words
(r/map (fn [[m [w]]] [(* n m) (vec (sort (conj words w)))]))
(r/filter (fn [[n*m _]] (anagramable? n*m))))))
potentials)))]
;; new approach: loop over current list of candidates, store any
;; that are exactly needle, discard any that are greater than
;; needle *or* don't factor it, then form new combinations of the
;; single values with the remaining values, until the remaining
;; values is empty
(loop [candidates possible-words
anagrams #{}]
(prn "candidates" (count candidates))
(if (empty? candidates)
anagrams
(let [{anas :ana possible :can}
(->> candidates
(group-by (fn [[n words]]
(cond
(= n needle) :ana
(anagramable? n) :can
:else nil))))
next-candidates (next-combos possible)]
(recur next-candidates (into anagrams (map second anas))))))))
(defn -main
[& args]
(println "Anagrams of " (first args))
(doseq [words (anagram-search (first args))]
(println words)))
(ns clojure-experiments.purely-functional.puzzles.0322-anagrams
"Two parts:
- Write a function to determine if two words are anagrams.
- Given a dictionary of words (such as this one: http://wiki.puzzlers.org/pub/wordlists/unixdict.txt),
find all of the anagrams for a target word.
Bonus points for efficiency."
(:require [clojure.java.io :as io]))
;;; Save the dictionary for later processing
(def dictionary-file (io/file "src/clojure_experiments/purely-functional/puzzles/0322-dictionary.txt"))
(defn- save-dictionary
"Helper function to save dictionary for fast offline access."
[]
(spit dictionary-file
(slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt" )))
#_(save-dictionary)
(def my-dictionary (->> dictionary-file
slurp
(clojure.string/split-lines)))
#_(take 10 my-dictionary)
(defn anagrams? [w1 w2]
(and (= (count w1) (count w2))
(= (set w1) (set w2))))
(anagrams? "ahoj" "helo")
;; => false
(anagrams? "helo" "ehlo")
;; => true
(anagrams? "helo" "hello")
;; => false
(defn dictionary-anagrams [word dictionary]
(reduce
(fn [anagrams candidate]
(if (anagrams? word candidate)
(conj anagrams candidate)
anagrams))
[]
dictionary))
(comment
(dictionary-anagrams "helo" my-dictionary)
;; => [hole]
(dictionary-anagrams "car" my-dictionary)
;; => ["arc" "car" "rca"]
(time (dictionary-anagrams "tire" my-dictionary))
;; => ["rite" "tier" "tire"]
;; "Elapsed time: 6.253211 msecs"
;; end
)
(ns anagrams.core
(:require [clojure.string :as cstr]))
(defn ->lc
"Slightly faster lower-case function, but only for ASCII text"
[w]
(let [sb (StringBuffer.)]
(run! #(.append sb (char (bit-set (int %) 5))) w)
(.toString sb)))
(defn word-hash [w] (sort w))
(def dict-location "resources/unixdict.txt")
(def lc
->lc ; faster, but assumes ASCII
;#(.toLowerCase %) ; slower but more correct
)
(defn read-dict
"Read in the dictionary from path. Assumes one word per line."
[dict-file-path]
(->> (cstr/split (slurp dict-file-path) #"\n")
(reduce #(assoc %1 (word-hash (lc %2)) (conj (get %1 (word-hash (lc %2)) []) (lc %2))) {})))
;---------------------------
(defn is-anagram?
"Is word 1 an anagram of word 2?"
[w1 w2]
(let [w1' (lc w1)
w2' (lc w2)]
(and (= (word-hash w1') (word-hash w2'))
(not= w1' w2'))))
(defn find-anagrams
"Find anagrams of a word, or empty list if none in the current dictionary"
([w] (find-anagrams w (read-dict dict-location)))
([w dict]
(let [w' (lc w) ]
(filter #(not= w' %) (get dict (word-hash w'))))))
(require '[clojure.string :as string])
(defn example-dict
[]
(-> "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt"
(slurp)
(string/split-lines)))
(defn anagram?
"Same set of characters, but not equal."
[s1 s2]
(and (= (frequencies s1)
(frequencies s2))
(not= s1 s2)))
(defn anagrams1
[s dict]
(filter (partial anagram? s) dict)) ; Can be sped up by memoizing `frequencies`
(defn anagrams2
[s dict]
(->> (get (group-by frequencies dict) ; Group by frequencies
(frequencies s))
(remove #{s})))
(defn- remove-letter
[s letter]
(let [[first-half last-half] (split-with (complement #{letter}) s)]
(concat first-half (rest last-half))))
(defn- anagrams3*
[[f & r] dict]
(if f ; While there are more letters to check
(->> dict
(filter (fn [{:keys [rest]}] (some #{f} rest))) ; Filter on words in dict containing first letter of search-string
(map #(update % :rest remove-letter f)) ; Remove that letter from remaining words in dict
(recur r))
dict))
(defn anagrams3
[s dict]
(->> dict
(filter #(= (count s) (count %))) ; Anagrams have the same length (not necessary, but speeds up the function quite a bit)
(map (fn [word] {:word word :rest word})) ; Preprocess dictionary
(anagrams3* s)
(map :word) ; Get back original word
(remove #{s}))) ; Remove the string being anagram'd
(ns miner.ana
(:require [clojure.java.io :as io]))
;;; Eric Normand's Clojure challenge: anagrams
;;; https://purelyfunctional.tv/?p=31286
(defn digest [w]
(sort (seq w)))
(defn anagram? [a b]
(and (not= a b)
(= (digest a) (digest b))))
(defn anagram-search [word anagram-group]
(let [group (get anagram-group (digest word))]
(not-empty (remove #{word} group))))
;; assume text file with one word per line
(defn load-words [filename]
(with-open [fi (io/reader filename)]
(binding [*in* fi]
(doall (take-while some? (repeatedly read-line))))))
(defn anagram-group [filename]
(group-by digest (load-words filename)))
(def default-dict "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
(def default-anagram-group (delay (anagram-group default-dict)))
(def local-dict "/usr/share/dict/words")
(def local-anagram-group (delay (anagram-group local-dict)))
;; `dict` can be either a file reference or a predigested anagram group (for better performance)
(defn anagrams
([word]
(anagram-search word @default-anagram-group))
([word dict]
(anagram-search word (if (map? dict) dict (anagram-group dict)))))
#_
(anagrams "crate")
;; ("caret" "carte" "cater" "trace")
#_
(apply max-key count (vals @local-anagram-group))
;; ["angor" "argon" "goran" "grano" "groan" "nagor" "orang" "organ" "rogan"]
#_
(pprint (sort-by first (filter #(>= (count %) 5) (vals @default-anagram-group))))
;; (["abel" "able" "bale" "bela" "elba"]
;; ["alger" "glare" "lager" "large" "regal"]
;; ["angel" "angle" "galen" "glean" "lange"]
;; ["caret" "carte" "cater" "crate" "trace"]
;; ["elan" "lane" "lean" "lena" "neal"]
;; ["evil" "levi" "live" "veil" "vile"])
(ns scratch.anagrams
(:require [clojure.string :as string]))
(defn anagrams? [& words]
(reduce = (map sort words)))
(anagrams? "neo" "one")
;; => true
(anagrams? "jamesreeves" "weavejester")
;; => false
(defn load-dictionary [path]
(-> path slurp string/split-lines))
(def unix-dict-path "/home/teodorlu/Downloads/unixdict.txt")
;; alternatively, (slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")
(def unix-dict (load-dictionary unix-dict-path))
(->> unix-dict
shuffle
(take 5))
;; => ("inception" "mickey" "interest" "analogue" "discernible")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TAKE 1: just filter through the words
(defn find-anagrams [dictionary word]
(let [characters (sort word)]
(filter (fn [dict-word]
(= (sort dict-word) characters))
dictionary)))
(def unix-anagrams (partial #'find-anagrams unix-dict))
(unix-anagrams "neo")
;; => ("one")
(time (unix-anagrams "clojure"))
;; => ()
(unix-anagrams "lisp")
;; => ("lisp" "slip")
(time (unix-anagrams "lisp"))
;; => ("lisp" "slip")
"Elapsed time: 0.123896 msecs"
(time (count (load-dictionary unix-dict-path)))
"Elapsed time: 20.51089 msecs"
"Elapsed time: 28.673577 msecs"
"Elapsed time: 25.327939 msecs"
"Elapsed time: 20.880587 msecs"
"Elapsed time: 24.631787 msecs"
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TAKE 2: Index on sorted words.
(defn load-anagram-index [path]
(->> path
slurp
string/split-lines
(group-by sort)))
(time (load-anagram-index unix-dict-path))
"Elapsed time: 81.89557 msecs"
(def unix-dict-index (load-anagram-index unix-dict-path))
(time (unix-dict-index (sort "lisp")))
;; => ["lisp" "slip"]
"Elapsed time: 0.209954 msecs"
"Elapsed time: 0.149922 msecs"
"Elapsed time: 0.135523 msecs"
(time (unix-dict-index (sort "neo")))
;; => ["one"]
"Elapsed time: 0.047577 msecs"
"Elapsed time: 0.191929 msecs"
"Elapsed time: 0.11953 msecs"
(time (dotimes [i 100]
(unix-dict-index (sort "neo"))))
"Elapsed time: 1.173734 msecs"
(time (dotimes [i (* 10 1000)]
(unix-dict-index (sort "neo"))))
"Elapsed time: 54.678017 msecs"
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; On efficiency
;;
;; There are two variables here:
;;
;; - How many anagrams are you going to look up?
;; - How large is your dictionary?
;;
;; In my case, I couldn't measure a speedup from using the index.
;;
;; Conclusion: plain filter is good for small problems. For larger problems, the
;; dictionary size and the number of words to be checked for whether they are
;; anagrams is relevant.
(defn- to-bag-of-chars
[w]
(frequencies w))
(defn- word-signature
"A fast hash on Strings which is invariant under re-ordering of the characters."
^long [^String s]
(loop [i (int 0)
sig (long 0)]
(if (= i (.length s))
sig
(recur
(unchecked-inc-int i)
(+ sig (.codePointAt s i))))))
(defn anagrams?
[w1 w2]
(and
(and ;; for efficiency - will catch a lot of easy cases
(= (count w1) (count w2))
(= (word-signature w1) (word-signature w2)))
(=
(to-bag-of-chars w1)
(to-bag-of-chars w2))))
(defn anagrams-of
[dictionary-words target-word]
(filter
#(anagrams? target-word %)
dictionary-words))
(comment
(require '[clojure.string :as str])
;; Fetching our dictionary
(def dict
(str/split-lines
(slurp "http://wiki.puzzlers.org/pub/wordlists/unixdict.txt")))
;; Let's have a look at a sample
(->> dict shuffle (take 10))
=> ("yacht" "decade" "armchair" "audio" "gender" "carrel" "antiquated" "powers" "metro" "skull")
;; Example of searching for anagrams of one word
(anagrams-of dict "art")
=> ("art" "rat" "tar")
;; What are the words with the most anagrams?
(->> dict
(group-by to-bag-of-chars)
vals
(sort-by count) reverse
(take 10)
vec)
=>
[["elan" "lane" "lean" "lena" "neal"]
["abel" "able" "bale" "bela" "elba"]
["alger" "glare" "lager" "large" "regal"]
["angel" "angle" "galen" "glean" "lange"]
["caret" "carte" "cater" "crate" "trace"]
["evil" "levi" "live" "veil" "vile"]
["beard" "bread" "debar" "debra"]
["esprit" "priest" "sprite" "stripe"]
["keats" "skate" "stake" "steak"]
["diet" "edit" "tide" "tied"]]
*e)
(ns anagram-challenge.core
(:gen-class))
(defn anagram? [one two]
(and (= (count one) (count two))
(= (into #{} (clojure.string/lower-case one))
(into #{} (clojure.string/lower-case two)))))
(defn fish-all-anagrams [hook pool]
(filter #(anagram? hook %) pool))
(defn -main [& args]
(when-not (empty? args)
(with-open [rdr (clojure.java.io/reader "unixdict.txt")]
(let [lines (reduce conj [] (line-seq rdr))]
(println (fish-all-anagrams (first args) lines))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment