Skip to content

Instantly share code, notes, and snippets.

@yubrshen
Created October 15, 2014 08:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yubrshen/63ffda973aff27d39868 to your computer and use it in GitHub Desktop.
Save yubrshen/63ffda973aff27d39868 to your computer and use it in GitHub Desktop.
finding the closest match of words in two phases independent of order
(ns dw-pos-study.pairwise-match
(:use incanter.stats)
(:use utils-yushen.util)
(:use clojure.test))
;; This package tries to find the most likely match between words independent of order in two phrases.
;; Care is taken to be efficient in order to support plausible approximate match of names between users' text nad a database.
(defn switch [[x y]]
[y x])
(defn pairwise-distances-sorted
"Returns sorted the pairwise distance between words of search and candidate in increasing distance order.
The keyword parameter perfect-match is used to control how strict two words are considered to be perfect match.
By default, it's 0, thus identical words are perfect match. Relax the degree for perfect match may reduce the number of pairwise distances, and the sorting them."
[search candidate & {:keys [perfect-match] :or {perfect-match 0 }}]
(->>
(loop [pairs (for [x search y candidate] [x y])
distance-map {}]
(if (empty? pairs)
distance-map
(let [pair (first pairs)]
(cond
(get distance-map pair) (recur (rest pairs) distance-map)
(get distance-map (switch pair)) (recur (rest pairs) (merge distance-map {pair (get distance-map (switch pair))}))
:else (let [distance (apply incanter.stats/levenshtein-distance pair)
relative-distance (/ distance (apply max (map count pair))) ; use relative distance to allow comparable tolerance in determining prefect match across different pair.
; the maximum length of the words in pair is the maximum of the distance possible for the pair.
perfect-match? (<= relative-distance perfect-match) ; try to optimize when there is perfect match found
]
(recur (if perfect-match? (filter (fn [[x y]] (and (not= x (first pair)) (not= y (second pair)))) (rest pairs)) (rest pairs)) ; further reduce the search space of pairwise distances.
(merge distance-map {pair distance})))))))
(into [], ) ; use sequence, as map's order is not satble
(sort-by second, )))
;; Copide from http://programming-puzzler.blogspot.com/2010/07/translating-code-from-python-and-scheme.html thanks to mbAugust 2, 2010 at 10:59 PM
(defn remove-first
[item coll]
(lazy-seq
(when-let [s (seq coll)] ; use when-let, it automatically takes care of returning nil when there are no more items in the sequence.
; caching of the result of the seq call is that you save one level of indirection for any following first, next or rest call.
(let [fst (first s)]
(if (= fst item)
(next s)
(cons fst (remove-first item (rest coll))))))))
(defn matched-closest
"Find the closest matches of words between search and candidate independent of word order, returning the matches, and the residual of not matching.
The match data contains the matched word pair, the levenshtein distance between them, and the maximum of word lengths of the words in the pair,
which serves as the context to tell how significant the distance is.
The keyword parameter perfect-match is used to control how strict two words are considered to be perfect match.
By default, it's 0, thus identical words are perfect match."
[search candidate & {:keys [perfect-match] :or {perfect-match 0}}]
(let [word-distances-sorted (pairwise-distances-sorted search candidate :perfect-match perfect-match)]
(loop [matched []
filtered word-distances-sorted
[not-matched-search not-matched-candidate] [search candidate]]
(if (empty? filtered)
[matched [not-matched-search not-matched-candidate]]
(let [[[a b] d] (first filtered)
matched-updated (conj matched [[a b] d (max (count a) (count b))])
; remove any element with key containing a or b in their respective position, as [a b] is already considered, there is no point to cornsider them anymore.
filtered-updated (filter (fn [[[x y] _]] (and (not= a x) (not= b y)))
filtered)]
(recur matched-updated
filtered-updated
[(remove-first a not-matched-search) (remove-first b not-matched-candidate)]))))))
(is (matched-closest (clojure.string/split "garbage real stuff" #"\s") (clojure.string/split "real stuff" #"\s"))
[[[["stuff" "stuff"] 0 5] [["real" "real"] 0 4]] ['("garbage") '()]])
(is (matched-closest (clojure.string/split "garbage real stuff" #"\s") (clojure.string/split "real stuff nonsense" #"\s"))
[[[["stuff" "stuff"] 0 5] [["real" "real"] 0 4] [["garbage" "nonsense"] 7 8]] ['() '()]])
(is (matched-closest (clojure.string/split "garbage real staff" #"\s") (clojure.string/split "reel stuff nonsense" #"\s") :perfect-match 0.5)
[[[["staff" "stuff"] 1 5] [["real" "reel"] 1 4] [["garbage" "nonsense"] 7 8]] ['() '()]])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment