Last active
December 15, 2017 17:02
-
-
Save taylorwood/635cacf430db53265280ba98908848f2 to your computer and use it in GitHub Desktop.
Parse/render Unicode.org Emoji data https://www.unicode.org/reports/tr51/#Data_Files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns unicode.emoji | |
"Parses Unicode.org Emoji specifications." | |
(:require [clojure.java.io :as io] | |
[clojure.string :as cs])) | |
(defn slurp-lines
  "Reads the Unicode emoji data file named `file` and returns its lines.
  By default the file is fetched from the emoji 5.0 directory on
  unicode.org; pass a truthy second argument to read a classpath resource
  of the same name instead."
  [file & [resource?]]
  (let [source (if resource?
                 (io/resource file)
                 (str "https://unicode.org/Public/emoji/5.0/" file))]
    (cs/split-lines (slurp source))))
(defn codepoint->str
  "Converts a single Unicode code point (an integer) into the string that
  encodes it. Code points above U+FFFF yield a two-char surrogate pair."
  [cp]
  (let [utf16-units (Character/toChars cp)]
    (String. utf16-units)))
(defn codepoints->str
  "Returns the string for a codepoint sequence.

  `cps` is any seqable of integer Unicode code points; they are encoded in
  order into one string, so joining emoji with U+200D (zero-width joiner)
  produces a composite emoji. An empty sequence yields \"\"."
  [cps]
  ;; Hand the code points straight to String's (int[], offset, count)
  ;; constructor, which performs the UTF-16 encoding itself. There is no need
  ;; to expand them to chars first and then coerce those chars back to ints.
  (let [^ints code-points (into-array Integer/TYPE cps)]
    (String. code-points 0 (alength code-points))))
(comment
  "Combine codepoints to form composite emoji."
  (def man 0x1F468)
  (codepoint->str man) ;;=> "👨"
  (def girl 0x1F467)
  (codepoint->str girl) ;;=> "👧"
  ;; combine man & girl emoji w/zero-width join for father & daughter emoji
  (codepoints->str
   (interpose 0x200D [man girl])) ;;=> "👨‍👧"
  ;; my two dads!
  (codepoints->str
   (interpose 0x200D [man man girl]))) ;;=> "👨‍👨‍👧"
(comment
  "Three variations on eject."
  ;; 9167 is U+23CF; 65038 / 65039 are the text (U+FE0E) and emoji (U+FE0F)
  ;; variation selectors, which are invisible on their own.
  (codepoint->str 9167) ;;=> "⏏"
  (codepoints->str [9167 65038]) ;;=> "⏏︎"
  (codepoints->str [9167 65039])) ;;=> "⏏️"
(defn hex->int
  "Parses the string `h` as a base-16 integer."
  [h]
  (Integer/parseInt h 16))
(def remove-filler
  "Transducer that drops lines which are blank or which begin with `#`."
  (remove (some-fn cs/blank?
                   (fn [line] (cs/starts-with? line "#")))))
(defn parse-data-file
  "Loads the emoji data file `file` via `slurp-lines`, discards blank and
  comment-only lines, and applies `f` to each remaining line. Returns an
  eduction over the parsed lines."
  [file f]
  (let [lines (slurp-lines file)
        xform (comp remove-filler (map f))]
    (eduction xform lines)))
;; basic emoji | |
(defn range-str->ints
  "Expands an emoji code point range string into the code points it covers.
  `rs` is either a single hex value (\"1F468\") or two hex values separated
  by two periods (\"1F1E6..1F1FF\"); the upper bound is inclusive."
  [rs]
  (let [bounds (map hex->int (cs/split rs #"\.\."))
        start  (first bounds)
        end    (second bounds)]
    (if end
      (range start (inc end))
      [start])))
(defn parse-data-line
  "Parses one line of an emoji data file into a map.
  Expected line format: \"<codepoint(s)> ; <property> # <comments>\".

  Returns {:emojis   seq of {:codepoint int, :emoji string}, one entry per
                     code point in the line's (possibly ranged) first field
           :property the trimmed property field
           :comments the trimmed text after the `#`}."
  [l]
  (let [fields (map cs/trim (rest (re-matches #"(.+?);(.+?)#(.+)" l)))
        [range-str property note] fields
        ->entry (fn [cp]
                  {:codepoint cp
                   :emoji (codepoint->str cp)})]
    {:emojis (map ->entry (range-str->ints range-str))
     :property property
     :comments note}))
(def emojis
  "Standalone emoji: the lines of emoji-data.txt, each parsed by
  `parse-data-line`."
  (parse-data-file "emoji-data.txt"
                   parse-data-line))
;; emoji sequences | |
(defn parse-sequence-line
  "Parses an Emoji Sequence line string, returns a map with emoji.
  Format: \"code_point(s) ; type_field ; description # comments\"

  Returns {:codepoints  seq of ints parsed from the space-separated hex field
           :emoji       the string formed by those code points
           :type        trimmed type field
           :description trimmed description field
           :comments    trimmed text after the comment marker}.

  The description may itself be or end with a literal `#` (for example a
  keycap description such as \"keycap: #\"). A `#` that is followed only by
  whitespace and another `#` is therefore kept in the description instead of
  being taken as the comment marker."
  [l]
  (let [[hex-cps typ desc comments]
        (->> l
             ;; (?!\s*#) rejects a `#` that is directly followed (ignoring
             ;; whitespace) by another `#`, so the reluctant description group
             ;; extends up to the real comment marker.
             (re-matches #"(.+?);(.+?);(.+?)#(?!\s*#)(.+)")
             (rest)
             (map cs/trim))
        cps (map hex->int (cs/split hex-cps #" "))]
    {:codepoints cps
     :emoji (codepoints->str cps)
     :type typ
     :description desc
     :comments comments}))
(def emoji-sequences
  "Code point sequences of base emoji with modifiers (e.g. skin tone, flags):
  the lines of emoji-sequences.txt, each parsed by `parse-sequence-line`."
  (parse-data-file "emoji-sequences.txt"
                   parse-sequence-line))
;; emoji variation sequences | |
(defn parse-variation-sequence-line
  "Parses one line of the emoji variation sequences file into a map.
  The line is matched as \"<hex code points> ; <type> ; # <comments>\".

  Returns {:codepoints seq of ints parsed from the space-separated hex field
           :emoji      the string formed by those code points
           :type       trimmed second field
           :comments   trimmed text after the `#`}."
  [l]
  (let [fields (->> (re-matches #"(.+?);(.+?);\s*#(.+)" l)
                    rest
                    (map cs/trim))
        [hex-field style note] fields
        code-points (map hex->int (cs/split hex-field #" "))]
    {:codepoints code-points
     :emoji (codepoints->str code-points)
     :type style
     :comments note}))
(def emoji-variation-sequences
  "Code point pairs selecting text or emoji presentation: the lines of
  emoji-variation-sequences.txt, each parsed by
  `parse-variation-sequence-line`."
  (parse-data-file "emoji-variation-sequences.txt"
                   parse-variation-sequence-line))
;; zero-width joiners | |
(def emoji-zwj-sequences
  "Combinations of base emoji that form composite glyphs: the lines of
  emoji-zwj-sequences.txt, each parsed by `parse-sequence-line`."
  (parse-data-file "emoji-zwj-sequences.txt"
                   parse-sequence-line))
;; all together now | |
(def all-emoji
  "Lazy seq of every distinct emoji string found across the four parsed data
  sets, in the order: standalone emoji, sequences, variation sequences, ZWJ
  sequences."
  (->> [(map :emoji (mapcat :emojis emojis))
        (map :emoji emoji-sequences)
        (map :emoji emoji-variation-sequences)
        (map :emoji emoji-zwj-sequences)]
       (apply concat)
       ;; some emoji appear in multiple sections
       (distinct)))
(comment
  (count all-emoji)
  ;;=> 3368
  (take 10 (shuffle all-emoji))
  ;;=> ten emoji strings, different on every run, e.g. ("🇸🇭" "🇺🇿" ...)
  ;; write one emoji per line to emojiverse.txt
  (spit "emojiverse.txt" (cs/join \newline all-emoji)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment