obmarg/tokenizer.clj

## tokenizer.clj
(ns tokenizer.core)

(defn insert-word
  [dict word]
  "Inserts a single word into a dictionary tree"
  (update-in dict (concat (interpose :children word) [:word]) (fn [orig] word)))

(defn build-dict-tree
  [words]
  "Builds a dict-tree from a list of words."
  {:children (reduce insert-word {} words)})

(defn tap [what x] (println what x) x)

(defn tokenize
  [string dict-tree]
  "Tokenizes a string.  Returns a vector of words on success, otherwise nil.

   Currently selects the first possible tokenization it comes across.
   Will prefer short words over long words (though that's easily changable).

   Works best with a limited dictionary, but will fall over if words to be
   tokenized are not in the dictionary. Could be improved with knowledge of the
   frequencies of individual words in the dictionary.
  "
  (loop [[letter & rest :as string] string
         {:keys [children word]}    dict-tree
         potential-words            []]
    ;; We loop through the letters of the string & the dict-tree, finding
    ;; potential words that we could select as tokens.
    (let [potential-words (if word
                            (conj potential-words [word string])
                            potential-words)]
      (if-let [next-tree (get children letter)]
        (recur rest next-tree potential-words)
        ;; If we've run out of tree to recurse, then we have either:
        (if-not letter
          ;; Found the last word in our string
          (when word [word])
          ;; Or need to find which potential word leads to a valid tokenization
          ;; of the rest of the string.
          (->> potential-words
               ;; For now we just take potential-words in default order
               ;; (shortest first) but given word popularity information we
               ;; could try in that order (or any other order).
               (map (fn [[word rest]]
                      (when-let [words (tokenize rest dict-tree)]
                        (cons word words))))
               (filter identity)
               ;; For now we just pick the first valid tokenization of the rest
               ;; of the string, but we _could_ use some smarter logic to find
               ;; the optimum tokenization.
               ;; We could also just return a list of all possible tokenizations
               ;; and let a further level of logic determine the one to pick.
               first))))))
	(ns tokenizer.core)

	(defn insert-word
	[dict word]
	"Inserts a single word into a dictionary tree"
	(update-in dict (concat (interpose :children word) [:word]) (fn [orig] word)))

	(defn build-dict-tree
	[words]
	"Builds a dict-tree from a list of words."
	{:children (reduce insert-word {} words)})

	(defn tap [what x] (println what x) x)

	(defn tokenize
	[string dict-tree]
	"Tokenizes a string. Returns a vector of words on success, otherwise nil.

	Currently selects the first possible tokenization it comes across.
	Will prefer short words over long words (though that's easily changable).

	Works best with a limited dictionary, but will fall over if words to be
	tokenized are not in the dictionary. Could be improved with knowledge of the
	frequencies of individual words in the dictionary.
	"
	(loop [[letter & rest :as string] string
	{:keys [children word]} dict-tree
	potential-words []]
	;; We loop through the letters of the string & the dict-tree, finding
	;; potential words that we could select as tokens.
	(let [potential-words (if word
	(conj potential-words [word string])
	potential-words)]
	(if-let [next-tree (get children letter)]
	(recur rest next-tree potential-words)
	;; If we've run out of tree to recurse, then we have either:
	(if-not letter
	;; Found the last word in our string
	(when word [word])
	;; Or need to find which potential word leads to a valid tokenization
	;; of the rest of the string.
	(->> potential-words
	;; For now we just take potential-words in default order
	;; (shortest first) but given word popularity information we
	;; could try in that order (or any other order).
	(map (fn [[word rest]]
	(when-let [words (tokenize rest dict-tree)]
	(cons word words))))
	(filter identity)
	;; For now we just pick the first valid tokenization of the rest
	;; of the string, but we _could_ use some smarter logic to find
	;; the optimum tokenization.
	;; We could also just return a list of all possible tokenizations
	;; and let a further level of logic determine the one to pick.
	first))))))