Skip to content

Instantly share code, notes, and snippets.

@taylorwood
Last active December 15, 2017 17:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save taylorwood/635cacf430db53265280ba98908848f2 to your computer and use it in GitHub Desktop.
Save taylorwood/635cacf430db53265280ba98908848f2 to your computer and use it in GitHub Desktop.
Parse/render Unicode.org Emoji data https://www.unicode.org/reports/tr51/#Data_Files
(ns unicode.emoji
"Parses Unicode.org Emoji specifications."
(:require [clojure.java.io :as io]
[clojure.string :as cs]))
(defn slurp-lines
  "Reads a Unicode emoji data file and returns its contents as a vector of
  lines.

  When resource? is truthy, file is looked up as a classpath resource;
  otherwise it is fetched from https://unicode.org/Public/emoji/5.0/."
  [file & [resource?]]
  (let [source (if resource?
                 (io/resource file)
                 (str "https://unicode.org/Public/emoji/5.0/" file))]
    (cs/split-lines (slurp source))))
(defn codepoint->str
  "Converts a single integer codepoint cp to its string form. Codepoints
  above the Basic Multilingual Plane yield a two-char surrogate pair."
  [cp]
  (let [utf16 (Character/toChars cp)]
    (String. ^chars utf16)))
(defn codepoints->str
  "Returns the string formed by concatenating a sequence of Unicode
  codepoints.

  cps is a seqable of integer codepoints, e.g. [0x1F468 0x200D 0x1F467].
  An empty or nil cps yields the empty string. Throws
  IllegalArgumentException if any value is not a valid Unicode codepoint."
  [cps]
  ;; String's (int[] codePoints, offset, count) constructor consumes
  ;; codepoints directly, so they are passed as-is rather than first being
  ;; expanded into UTF-16 code units.
  (let [^ints arr (int-array cps)]
    (String. arr 0 (alength arr))))
;; REPL examples only: the `comment` form is never evaluated at load time.
(comment
"Combine codepoints to form composite emoji."
(def man 0x1F468)
(codepoint->str man) ;;=> "👨"
(def girl 0x1F467)
(codepoint->str girl) ;;=> "👧"
;; combine man & girl emoji w/zero-width join for father & daughter emoji
(codepoints->str
(interpose 0x200D [man girl])) ;;=> "👨‍👧"
;; my two dads!
(codepoints->str
(interpose 0x200D [man man girl]))) ;;=> "👨‍👨‍👧"
;; REPL examples only: the `comment` form is never evaluated at load time.
;; 9167 is 0x23CF; 65038 and 65039 are 0xFE0E and 0xFE0F, the variation
;; selectors that request text style and emoji style respectively.
(comment
"Three variations on eject."
(codepoint->str 9167) ;;=> "⏏"
(codepoints->str [9167 65038]) ;;=> "⏏︎"
(codepoints->str [9167 65039])) ;;=> "⏏️"
(defn hex->int
  "Parses the hexadecimal string h (no \"0x\" prefix) into an integer."
  [h]
  (let [radix 16]
    (Integer/parseInt h radix)))
(def remove-filler
  "Transducer that drops lines which are blank or which begin with \"#\"
  (comment-only lines)."
  (letfn [(filler? [line]
            (or (cs/blank? line)
                (cs/starts-with? line "#")))]
    (remove filler?)))
(defn parse-data-file
  "Reads the named Emoji data file and returns an eduction that applies f to
  every line that is neither blank nor comment-only."
  [file f]
  (let [lines (slurp-lines file)
        xform (comp remove-filler (map f))]
    (eduction xform lines)))
;; basic emoji
(defn range-str->ints
  "Expands an Emoji codepoint range string into the sequence of codepoints it
  covers. rs is either a single hex value (\"1F600\") or two hex values
  separated by two periods (\"1F600..1F64F\"); the upper bound is inclusive."
  [rs]
  (let [bounds (mapv hex->int (cs/split rs #"\.\."))
        start  (first bounds)
        end    (second bounds)]
    (if (nil? end)
      [start]
      (range start (inc end)))))
(defn parse-data-line
  "Parses one Emoji Data line into a map with keys :emojis (one
  {:codepoint, :emoji} map per codepoint in the line's range), :property and
  :comments.
  Format: \"<codepoint(s)> ; <property> # <comments>\""
  [l]
  (let [fields (rest (re-matches #"(.+?);(.+?)#(.+)" l))
        [hex-cps prop comments] (map cs/trim fields)
        ->entry (fn [cp]
                  {:codepoint cp
                   :emoji     (codepoint->str cp)})]
    {:emojis   (map ->entry (range-str->ints hex-cps))
     :property prop
     :comments comments}))
(def emojis
  "Standalone emojis: the result of applying parse-data-line to every data
  line of emoji-data.txt."
  (parse-data-file "emoji-data.txt" parse-data-line))
;; emoji sequences
(defn parse-sequence-line
  "Parses one Emoji Sequence line into a map with keys :codepoints, :emoji,
  :type, :description and :comments.
  Format: \"code_point(s) ; type_field ; description # comments\""
  [l]
  (let [fields (rest (re-matches #"(.+?);(.+?);(.+?)#(.+)" l))
        [hex-cps typ desc comments] (map cs/trim fields)
        ;; the codepoints of a sequence are space-separated hex values
        cps (->> (cs/split hex-cps #" ")
                 (map hex->int))]
    {:codepoints  cps
     :emoji       (codepoints->str cps)
     :type        typ
     :description desc
     :comments    comments}))
(def emoji-sequences
  "Codepoint sequences of base emoji with modifiers (e.g. skin tone, flags):
  the result of applying parse-sequence-line to every data line of
  emoji-sequences.txt."
  (parse-data-file "emoji-sequences.txt" parse-sequence-line))
;; emoji variation sequences
(defn parse-variation-sequence-line
  "Parses one Emoji Variation Sequence line into a map with keys :codepoints,
  :emoji, :type and :comments."
  [l]
  (let [fields (rest (re-matches #"(.+?);(.+?);\s*#(.+)" l))
        [hex-cps typ comments] (map cs/trim fields)
        ;; the codepoints of a sequence are space-separated hex values
        cps (->> (cs/split hex-cps #" ")
                 (map hex->int))]
    {:codepoints cps
     :emoji      (codepoints->str cps)
     :type       typ
     :comments   comments}))
(def emoji-variation-sequences
  "Codepoint pairs selecting text or emoji style: the result of applying
  parse-variation-sequence-line to every data line of
  emoji-variation-sequences.txt."
  (parse-data-file "emoji-variation-sequences.txt"
                   parse-variation-sequence-line))
;; zero-width joiners
(def emoji-zwj-sequences
  "Special combinations of base emoji that form composite glyphs: the result
  of applying parse-sequence-line to every data line of
  emoji-zwj-sequences.txt."
  (parse-data-file "emoji-zwj-sequences.txt" parse-sequence-line))
;; all together now
(def all-emoji
  "Lazy sequence of every distinct emoji string drawn from the standalone,
  sequence, variation-sequence and ZWJ-sequence data sets."
  (->> [(map :emoji (mapcat :emojis emojis))
        (map :emoji emoji-sequences)
        (map :emoji emoji-variation-sequences)
        (map :emoji emoji-zwj-sequences)]
       (apply concat)
       ;; some emoji appear in multiple sections
       (distinct)))
;; REPL examples only: the `comment` form is never evaluated at load time.
;; NOTE(review): the sample below was restored from a mis-encoded paste; the
;; 8th entry could not be recovered unambiguously (shown here as "🏘️").
;; `shuffle` output differs on every call, so treat it as illustrative.
(comment
(count all-emoji)
;;=> 3368
(take 10 (shuffle all-emoji))
;;=> ("🌤️" "🧖🏿‍♀️" "🚭︎" "🇸🇭" "👓︎" "🇺🇿" "🖊" "🏘️" "⛹🏿‍♂️" "👮🏻‍♂️")
(spit "emojiverse.txt" (cs/join \newline all-emoji)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment