public
Last active

Function for detecting Unicode ligatures in text

  • Download Gist
unicode-partition.clj
Clojure
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
(ns net.abhinavsarkar.unicode-partition
(:import [java.awt.font TextAttribute GlyphVector]
[java.awt Font]
[javax.swing JTextArea]))
 
;; The font, a Swing text area carrying it, and the font render context are
;; created once when this form is loaded and are closed over by
;; `unicode-partition`, so every call lays text out with the same settings.
(let [^java.util.Map text-attrs {TextAttribute/FAMILY "Arial Unicode MS"
                                 TextAttribute/SIZE 25
                                 TextAttribute/LIGATURES TextAttribute/LIGATURES_ON}
      font (Font/getFont text-attrs)
      ta (doto (JTextArea.) (.setFont font))
      frc (.getFontRenderContext (.getFontMetrics ta font))]
  (defn unicode-partition
    "Takes a Unicode string and returns a vector of strings obtained by
    partitioning the input so that all the UTF-16 chars that were laid out as
    part of a single ligature end up in the same partition.

    A char starts a new partition when the glyphs mapped to it have a
    non-zero total advance width; a char with zero width (or with no glyph of
    its own) is appended to the preceding partition. The very first char
    always starts a partition, even when it has zero width.

    Returns an empty vector for a nil or empty string. Concatenating the
    returned strings always yields the original text."
    [^String text]
    (if (empty? text)
      ;; Nothing to lay out; also avoids indexing into empty width data.
      []
      (let [char-count (.length text)
            ^GlyphVector glyph-vector (.layoutGlyphVector
                                        font frc (.toCharArray text)
                                        0 char-count Font/LAYOUT_LEFT_TO_RIGHT)
            glyph-num (.getNumGlyphs glyph-vector)
            ;; getGlyphPositions returns interleaved x,y pairs; keep only x.
            glyph-xs (take-nth 2 (.getGlyphPositions glyph-vector 0 glyph-num nil))
            ;; Width of a glyph = distance to the next glyph's x position; the
            ;; last glyph is measured against the logical width of the run.
            glyph-widths (map -
                              (concat (rest glyph-xs)
                                      [(.. glyph-vector getLogicalBounds getWidth)])
                              glyph-xs)
            ;; For each glyph, the index of the char it was produced from.
            glyph-indices (.getGlyphCharIndices glyph-vector 0 glyph-num nil)
            ;; Per-char width, sized by the text length (not the glyph count)
            ;; so chars that produced no glyph default to 0.0. Widths of
            ;; several glyphs mapped to the same char are summed.
            char-widths (let [^floats widths (float-array char-count)]
                          (doseq [[char-idx width] (map vector glyph-indices glyph-widths)]
                            ;; Ignore indices outside the text, defensively.
                            (when (< -1 char-idx char-count)
                              (let [i (int char-idx)]
                                (aset widths i (float (+ (aget widths i) width))))))
                          (vec widths))]
        (reduce
          (fn [parts idx]
            (let [ch (.charAt text (int idx))]
              (if (or (empty? parts) (not (zero? (nth char-widths idx))))
                ;; Visible char (or first char): open a new partition.
                (conj parts (str ch))
                ;; Zero-width char: glue it onto the previous partition.
                (conj (pop parts) (str (peek parts) ch)))))
          []
          (range char-count))))))

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.