Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Clojure function for detecting Unicode ligatures in text
(ns net.abhinavsarkar.unicode-partition
(:import [java.awt.font TextAttribute GlyphVector]
[java.awt Font]
[javax.swing JTextArea]))
;; The font and its render context are built once and closed over by
;; `unicode-partition`, so every call lays text out with the same settings.
(let [^java.util.Map text-attrs {TextAttribute/FAMILY "Arial Unicode MS"
                                 TextAttribute/SIZE 25
                                 TextAttribute/LIGATURES TextAttribute/LIGATURES_ON}
      font (Font/getFont text-attrs)
      ta (doto (JTextArea.) (.setFont font))
      frc (.getFontRenderContext (.getFontMetrics ta font))]
  (defn unicode-partition
    "Takes a Unicode string and returns a vector of strings, partitioning the
    input so that the chars rendered as a single ligature end up in the same
    partition.

    The text is laid out with a ligature-enabled font. A char whose glyph has
    zero width (or that has no glyph of its own) is treated as a continuation
    of the previous partition; any other char starts a new partition.
    Concatenating the returned partitions yields the input string. An empty
    string returns an empty vector."
    [^String text]
    (let [text-length (.length text)
          ^GlyphVector glyph-vector
          (.layoutGlyphVector
            font, frc, (.toCharArray text),
            0, text-length, Font/LAYOUT_LEFT_TO_RIGHT)
          glyph-num (.getNumGlyphs glyph-vector)
          ;; getGlyphPositions returns x,y pairs; keep only the x coordinates.
          glyph-positions
          (take-nth 2 (.getGlyphPositions glyph-vector 0 glyph-num nil))
          ;; Width of a glyph = next glyph's x minus its own x. The last glyph
          ;; is measured against the logical width of the whole layout.
          glyph-widths
          (map -
               (concat (next glyph-positions)
                       [(.. glyph-vector getLogicalBounds getWidth)])
               glyph-positions)
          glyph-indices
          (seq (.getGlyphCharIndices glyph-vector 0 glyph-num nil))
          glyph-index-width-map (zipmap glyph-indices glyph-widths)
          ;; Widths re-ordered by char index. The array is sized by the text
          ;; length (not the glyph count) so every char index is addressable;
          ;; chars that received no glyph keep the default width 0.
          corrected-glyph-widths
          (vec (reduce
                 (fn [^floats acc [k v]]
                   (aset acc (int k) (float v))
                   acc)
                 (float-array text-length)
                 glyph-index-width-map))]
      ;; idx walks the chars, pidx is the number of partitions emitted so far.
      ;; Seeding with (seq text) makes the empty string terminate immediately.
      (loop [idx 0 pidx 0 char-seq (seq text) acc (transient [])]
        (if (nil? char-seq)
          (persistent! acc)
          (let [ch (str (first char-seq))]
            ;; The very first char always opens a partition, even when its
            ;; glyph is zero-width: there is no previous partition to extend.
            (if (or (zero? pidx)
                    (not (zero? (nth corrected-glyph-widths idx))))
              (recur (inc idx) (inc pidx) (next char-seq)
                     (conj! acc ch))
              (recur (inc idx) pidx (next char-seq)
                     (assoc! acc (dec pidx)
                             (str (nth acc (dec pidx)) ch))))))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment