-
-
Save wilkerlucio/db54dc83a9664124f3febf6356f04509 to your computer and use it in GitHub Desktop.
(ns util.natural-sorting | |
(:refer-clojure :exclude [sort sort-by]) | |
(:require [clojure.string])) | |
(defn parse-int [s] | |
#?(:clj (Long/parseLong s) | |
:cljs (js/parseInt s))) | |
(defn vector-compare [[value1 & rest1] [value2 & rest2]] | |
(let [result (compare value1 value2)] | |
(cond | |
(not (zero? result)) result | |
(nil? value1) 0 | |
:else (recur rest1 rest2)))) | |
(defn prepare-string [s] | |
(let [s (or s "") | |
parts (vec (clojure.string/split s #"\d+")) | |
numbers (->> (re-seq #"\d+" s) | |
(map parse-int) | |
(vec))] | |
(vec (interleave (conj parts "") (conj numbers -1))))) | |
(defn natural-compare [a b] | |
(vector-compare (prepare-string a) | |
(prepare-string b))) | |
(defn sort [coll] (clojure.core/sort natural-compare coll)) | |
(defn sort-by [keyfn coll] | |
(clojure.core/sort-by keyfn natural-compare coll)) |
Issues:
interleave
drops items from the longest argument- Instead of splitting and matching on
\d+
and then combining the result, maybe it makes sense to use(clojure.string/split s #"(?<=[^\d])(?=\d)|(?<=\d)(?=[^\d])")
and then just convert every other item tostr
(odd or even depends on whether the first one can be converted).
@rodsenra Change last part of line 22 to (conj numbers 0)
and (sort ["C" "C4.1a"])
will work.
@p-himik For #1, it's okay that interleave
drops items from longer argument, we'll reach a final ""
or 0
on the shorter one (after making the change I mentioned above) and will be done comparing. For #2, regex101.com shows more than twice as many steps to evaluate your more complicated regex vs. using the simpler \d+
. Some actual performance testing might be warranted?
A version of vector comparison with lazy seqs instead of loop/recur:
(defn- vector-compare [a b]
(or (first (drop-while zero? (map compare (conj a nil) (conj b nil)))) 0))
@wevrem thanks, snippet fixed with (conj numbers 0)
@wilkerlucio In further testing, we should conj -1
, not 0
, to the list of numbers, otherwise (sort ["0" ""])
will fail.
If anyone wants some tests I wrote a series:
(ns tove.natural-sort-test
(:require
[clojure.test :refer [deftest is]]
[tove.natural-sort :as nsort]))
(deftest parse-int
(is (= (nsort/parse-int "1")
1)))
(deftest vector-compare
(is (= (nsort/vector-compare ["1"] ["2"])
-1))
(is (= (nsort/vector-compare ["1"] ["1"])
0))
(is (= (nsort/vector-compare ["2"] ["1"])
1))
(is (= (nsort/vector-compare ["1" "1"] ["2" "2"])
-1))
(is (= (nsort/vector-compare ["1" "1"] ["1" "1"])
0))
(is (= (nsort/vector-compare ["2" "2"] ["1" "1"])
1))
(is (= (nsort/vector-compare ["2" "1"] ["2" "2"])
-1))
(is (= (nsort/vector-compare ["2" "1"] ["1" "2"])
1))
(is (= (nsort/vector-compare [1] [2])
-1))
(is (= (nsort/vector-compare [1] [1])
0))
(is (= (nsort/vector-compare [2] [1])
1))
(is (= (nsort/vector-compare "1" "2")
-1))
(is (= (nsort/vector-compare "1" "1")
0))
(is (= (nsort/vector-compare "2" "1")
1))
(is (= (nsort/vector-compare "10" "11")
-1))
(is (= (nsort/vector-compare "10" "10")
0))
(is (= (nsort/vector-compare "11" "10")
1))
;; Only ever looks at the first element in the sequence and returns if there is a difference.
;; So for a character sequence it only cares about the first character.
(is (= (nsort/vector-compare "9" "10")
8))
(is (= (nsort/vector-compare "90" "10")
8))
(is (= (nsort/vector-compare "95" "15")
8))
(is (= (nsort/vector-compare "90" "19")
8))
;; But if we give it a sequence of actual numbers it will of course compare the entire numbers, in turn.
(is (= (nsort/vector-compare [90] [19])
1))
(is (= (nsort/vector-compare [20] [19])
1))
(is (= (nsort/vector-compare [19] [19])
0))
(is (= (nsort/vector-compare [19 1] [19 1])
0))
(is (= (nsort/vector-compare [19 2] [19 1])
1))
(is (= (nsort/vector-compare [18] [19])
-1)))
(deftest natural-compare
(is (= (nsort/natural-compare "hest1" "hest2")
-1))
(is (= (nsort/natural-compare "hest1" "hest1")
0))
(is (= (nsort/natural-compare "hest1" "hest1")
0))
(is (= (nsort/natural-compare "Märrklobb 1" "Märrklobb 5")
-1))
(is (= (nsort/natural-compare "Märrklobb 1" "Märrklobb 1")
0))
(is (= (nsort/natural-compare "Märrklobb 2" "Märrklobb 1")
1))
;; Since natural-compare splits strings at integers and puts them into the sequence it will
;; compare the words and the numbers separately.
(is (= (nsort/natural-compare "Märrklobb 20" "Märrklobb 1")
1))
(is (= (nsort/natural-compare "Märrklobb 20" "Märrklobb 3") ; So this actually works!
1))
(is (= (nsort/natural-compare "Märrklobb 20" "Märrklobb 30")
-1)))
(deftest sort
(let [märrklobbs (for [i (range 1 13)] (str "Märrklobb " i))]
(is (= (nsort/sort (shuffle märrklobbs))
märrklobbs)))
(is (= (nsort/sort ["0" "" "Gris"])
'("" "0" "Gris"))))
It breaks for (sort ["C" "C4.1a"])