Skip to content

Instantly share code, notes, and snippets.

@ericnormand
Created August 16, 2021 18:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericnormand/b0346aab201486c4d9bcce4debd10cd7 to your computer and use it in GitHub Desktop.
Save ericnormand/b0346aab201486c4d9bcce4debd10cd7 to your computer and use it in GitHub Desktop.
439 PurelyFunctional.tv Newsletter

Syllable splitter

Write a function that splits English words into syllables, according to these rules.

Examples

(sylsplit "hello") ;=> ["hel" "lo"]
(sylsplit "apple") ;=> ["ap" "ple"]
(sylsplit "entrenched") ;=> ["en" "trenched"]

Note that this task is deliberately underspecified. The rules of English syllable splitting can get quite complicated. Take your solution as far as you like.

Thanks to this site for the problem idea, where it is rated Expert in JavaScript. The problem has been modified.

Please submit your solutions as comments on this gist.

To subscribe: https://purelyfunctional.tv/newsletter/

@jonasseglare
Copy link

jonasseglare commented Aug 20, 2021

Optimizing a path through a state graph using dynamic programming might be a possible approach:

(def true-vowels (set "aouei"))
(def all-vowels (conj true-vowels \y))

(defn true-consonant? [x]
  (and (not (all-vowels x))
       (char? x)))

(defn any-consonant? [x]
  (and (not (true-vowels x))
       (char? x)))

(defn allocate-state
  ([model n] (update model :count + n))
  ([model] (allocate-state model 1)))

(def last-state (comp dec :count))

(defn normalize-pred [x]
  (if (or (keyword? x)
          (char? x))
    #{x}
    x))

(defn state-index [model k]
  (get-in model [:keys k]))

(def huge 10000)
(def default-tr [huge false false])

(defn transition [symbol-pred cost split-before? split-after?]
  {:pre [(number? cost) (boolean? split-before?) (boolean? split-after?)]}
  (let [pred (normalize-pred symbol-pred)]
    #(if (pred %)
       [cost split-before? split-after?]
       default-tr)))

(defn merge-transition [a b]
  (cond
    (nil? a) b
    (nil? b) a
    :default #(min-key first (a %) (b %))))

(defn add-transition [model src-dst t]
  (update-in model [:transitions src-dst] merge-transition t))

(defn get-cost [model src dst symbol]
  (if-let [f (get-in model [:transitions [src dst]])]
    (f symbol)
    default-tr))

(defn connect [model src dst split-before? pred-seq split-after? cost]
  (let [pred-seq (vec pred-seq)
        n (count pred-seq)
        m (dec n)
        c (:count model)
        src-dst-pairs (vec (partition 2 1 (into [] cat [[(state-index model src)]
                                                        (range c (+ c m))
                                                        [(state-index model dst)]])))]
    (reduce (fn [model i]
              (add-transition model (nth src-dst-pairs i) (transition (nth pred-seq i) (if (zero? i) cost 0) (and (zero? i) split-before?) (and (= (dec n) i) split-after?))))
            (allocate-state model m)
            (range n))))

(defn connect-sources [model sources dst split-before? pred-seq split-after? cost]
  (reduce #(connect %1 %2 dst split-before? pred-seq split-after? cost)
          model
          sources))

(defn state-key [model k]
  (-> model
      allocate-state
      (assoc-in [:keys k] (:count model))))

(defn connect-alternatives [model src dst split-before? alts split-after? cost]
  (reduce #(connect %1 src dst split-before? %2 split-after? cost)
          model
          alts))

(def consonant-sounds [[any-consonant?] "ch" "sh"])
(def silent-endings ["ed"])

(def model (-> {:count 0 :keys {} :transitions {}}
               (state-key :start)
               (state-key :end)
               (state-key :after-prefix)
               (state-key :vowels)
               (state-key :some-consonants)
               (state-key :final-consonants)
               (state-key :single-consonant)
               (state-key :first-consonant)
               (state-key :silent-ending)
               (state-key :before-silent-ending)
               (connect-alternatives :start :after-prefix false ["re" "pre"] true -100)
               (connect-sources [:start :after-prefix :some-consonants] :vowels false [true-vowels] false 0)
               (connect-alternatives :vowels :single-consonant true consonant-sounds false 0)
               (connect :single-consonant :vowels false [true-vowels] false 0)
               (connect-alternatives :vowels :first-consonant false consonant-sounds true 0)
               (connect-alternatives :first-consonant :some-consonants false consonant-sounds false 0)
               (connect :vowels :vowels false [true-vowels] false 0)
               (connect-sources [:start :after-prefix :vowels :silent-ending :some-consonants :final-consonants] :end false [:terminal] false 0)
               (connect :vowels :final-consonants false [any-consonant?] false 0)
               (connect :vowels :before-silent-ending false [any-consonant?] false -1000)
               (connect :before-silent-ending :before-silent-ending false [any-consonant?] false 0)
               (connect-alternatives :before-silent-ending :silent-ending false silent-endings false 0)
               (connect-sources [:start :after-prefix :some-consonants] :some-consonants false [any-consonant?] false 0)))

(def state-range (-> model :count range))

(def rev-map (into {} (for [[k v] (:keys model)]
                        [v k])))

(defn min-index [f s]
  (->> s
       (map-indexed vector)
       (apply min-key (comp f second))))

(def ^:dynamic debug? false)

(defn sylsplit-sub [at previous remaining]
  (when debug? 
    (println "\n>Symbol:" (first remaining))
    (doseq [[i [cost prev]] (map-indexed vector previous)]
      (println "  " prev "->" i ":" cost)))
  (if (empty? remaining)
    (let [[index [cost pred pos]] (min-index first previous)]
      [pos [index pred]])
    (let [[pos result] (sylsplit-sub (inc at)
                                     (vec (for [next-state state-range]
                                            (first (sort-by first (for [prev-state state-range]
                                                                    (let [[cost b? a?] (get-cost model prev-state next-state (first remaining))]
                                                                      ()
                                                                      [(+ cost (first (nth previous prev-state)))
                                                                       prev-state
                                                                       (into #{} cat [(if b? [at] []) (if a? [(inc at)] [])])]))))))
                                     (rest remaining))]
      (let [[item-cost item-index item-pos] (nth previous (last result))]
        [(into pos item-pos) (conj result item-index)]))))

(defn sylsplit [src]
  (let [[pos states] (->> [(clojure.string/lower-case src) [:terminal]]
                          (into [] cat)
                          (sylsplit-sub 0 (into [[0 nil]] (-> model :count dec (repeat [huge nil])))))
        slice-bounds (partition 2 1 (into [] cat [[0] (sort pos) [(count src)]]))]
    (when debug?
      (println "\n----- SYLSPLIT: " src)
      (println "Slice positions:" pos)
      (println "Assigned states:")
      (doseq [i (-> states reverse rest)]
        (println i (get rev-map i i))))
    (mapv (partial apply subs src) slice-bounds)))


(sylsplit "hello")
;; => ["hel" "lo"]

(sylsplit "apple")
;; => ["ap" "ple"]

(sylsplit "entrenched")
;; => ["en" "trenched"]

(sylsplit "preview")
;; => ["pre" "view"]

(sylsplit "frozen")
;; => ["fro" "zen"]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment