Skip to content

Instantly share code, notes, and snippets.

@danfickle
Created August 20, 2019 00:44
Show Gist options
  • Save danfickle/82dc6757244edfb1937722eebbb9e9e2 to your computer and use it in GitHub Desktop.
Save danfickle/82dc6757244edfb1937722eebbb9e9e2 to your computer and use it in GitHub Desktop.
Twemoji parser for Java 8 using trie (replaces emoji with img tags).
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Replaces emoji in text with Twemoji {@code <img>} tags, using a trie of known sequences.
 *
 * <p>The trie is built once, during class initialization, from the
 * {@code <ul class="emoji-list">} element of the Twemoji {@code preview.html} page loaded
 * from the classpath. If that resource is missing or has no emoji list, class
 * initialization fails with an exception describing the problem.
 *
 * <p>All instance fields are final and the shared trie is only written while it is
 * being built in the static initializer.
 *
 * <p>Licensed under Apache or MIT. Initially published on 2019-08-20.
 *
 * @author danfickle
 */
public class TwemojiParser {
    // You will need to download the latest preview.html from the gh-pages branch
    // of the twemoji project. Known to work with version 12.1.2.
    // https://github.com/twitter/twemoji/blob/gh-pages/v/12.1.2/preview.html
    // Path is /v/12.1.2/preview.html
    private static final String PREVIEW_FILE_RESOURCE = "/twemoji/preview.html";

    /** Captures everything between the emoji list's opening tag and the last closing ul tag. */
    private static final Pattern EMOJI_LIST_PATTERN = Pattern.compile(
            Pattern.quote("<ul class=\"emoji-list\">") + "(.*)" + Pattern.quote("</ul>"),
            Pattern.DOTALL);

    /** Captures one list item's entities, without the final semicolon. */
    private static final Pattern EMOJI_ITEM_PATTERN = Pattern.compile(
            Pattern.quote("<li>") + "(.*?)" + Pattern.quote(";</li>"));

    // Must stay below the two patterns: static initializers run in textual order and
    // building the trie uses both of them.
    private static final MatcherNode ROOT = createMatcherWithExceptionHandling();

    /** One trie node; children are keyed by Unicode code point. */
    private static final class MatcherNode {
        // Created lazily so that leaf nodes do not each carry an empty map.
        private Map<Integer, MatcherNode> next;

        // True when the path from the root to this node spells a complete emoji sequence.
        private boolean terminal;

        /** Returns the child for {@code cp}, creating it if it does not exist yet. */
        private MatcherNode put(int cp) {
            if (next == null) {
                next = new HashMap<>();
            }
            return next.computeIfAbsent(cp, unused -> new MatcherNode());
        }

        /** Returns the child for {@code cp}, or {@code null} if there is none. */
        private MatcherNode get(int cp) {
            return next != null ? next.get(cp) : null;
        }
    }

    private final String prefix;
    private final String suffix;
    private final String baseUrl;
    private final String extension;
    private final String size;

    /**
     * Creates a parser with custom markup and asset location.
     *
     * @param prefix markup emitted immediately before each img tag
     * @param suffix markup emitted immediately after each img tag
     * @param baseUrl start of the image URL; the size is appended directly to it
     * @param extension image file extension including the dot, for example {@code .png}
     * @param size asset directory name placed between the base URL and the file name
     */
    public TwemojiParser(String prefix, String suffix, String baseUrl, String extension, String size) {
        this.prefix = prefix;
        this.suffix = suffix;
        this.baseUrl = baseUrl;
        this.extension = extension;
        this.size = size;
    }

    /** Creates a parser that wraps each image in a span and uses the 12.1.2 PNG assets. */
    public TwemojiParser() {
        this("<span class=\"emoji\">", "</span>", "https://twemoji.maxcdn.com/v/12.1.2/", ".png", "72x72");
    }

    /**
     * Reads a classpath resource fully into a string, decoding it as UTF-8.
     *
     * @param resource resource path as accepted by {@link Class#getResourceAsStream(String)}
     * @return the resource text with lines joined by the platform line separator
     * @throws IOException if the resource does not exist or cannot be read
     */
    private static String resourceToString(String resource) throws IOException {
        InputStream is = TwemojiParser.class.getResourceAsStream(resource);
        if (is == null) {
            // getResourceAsStream signals "not found" with null rather than an exception.
            throw new IOException("Classpath resource not found: " + resource);
        }
        // Closing the reader also closes the wrapped stream.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
            return reader.lines().collect(Collectors.joining(System.lineSeparator()));
        }
    }

    /**
     * Adds one emoji sequence to the trie and marks its last node as a complete emoji.
     *
     * @param codePointStrings hexadecimal code points of the sequence, in order
     * @param root trie root to insert into
     * @throws NumberFormatException if an element is not a valid hexadecimal number
     */
    private static void addToMatcher(String[] codePointStrings, MatcherNode root) {
        MatcherNode current = root;
        for (String codePointString : codePointStrings) {
            int cp = Integer.parseUnsignedInt(codePointString, 16);
            current = current.put(cp);
        }
        if (current != root) {
            // An empty sequence must not turn the root itself into a match.
            current.terminal = true;
        }
    }

    /** Builds the trie, converting a load failure into an unchecked exception. */
    private static MatcherNode createMatcherWithExceptionHandling() {
        try {
            return createMatcher();
        } catch (IOException e) {
            throw new IllegalStateException(
                    "Unable to load the emoji list from " + PREVIEW_FILE_RESOURCE, e);
        }
    }

    /**
     * Parses the preview page and builds the trie of emoji code point sequences.
     *
     * @return the root of the populated trie
     * @throws IOException if the page cannot be read or contains no emoji list
     */
    private static MatcherNode createMatcher() throws IOException {
        String previewFile = resourceToString(PREVIEW_FILE_RESOURCE);
        Matcher listMatch = EMOJI_LIST_PATTERN.matcher(previewFile);
        if (!listMatch.find()) {
            throw new IOException("No emoji-list element found in " + PREVIEW_FILE_RESOURCE);
        }
        String emojiListString = listMatch.group(1);

        MatcherNode root = new MatcherNode();
        Matcher itemMatch = EMOJI_ITEM_PATTERN.matcher(emojiListString);
        while (itemMatch.find()) {
            // Each item is a run of hex entities; dropping the "&#x" markers leaves the
            // hex values separated by semicolons.
            String rawCodePointString = itemMatch.group(1).replace("&#x", "");
            String[] codePoints = rawCodePointString.split(Pattern.quote(";"));
            addToMatcher(codePoints, root);
        }
        return root;
    }

    /**
     * Replace input containing emoji characters with img tags for Twemoji.
     * REMEMBER: Escape user input BEFORE passing to this method.
     *
     * <p>At each position the longest sequence present in the emoji list is replaced.
     * Text that merely starts like a longer sequence, such as a digit that is not
     * followed by the rest of its keycap sequence, is copied through unchanged.
     *
     * @param input text that may contain emoji
     * @return the input with every recognised emoji sequence replaced by an img tag
     */
    public String replaceEmoji(String input) {
        StringBuilder sb = new StringBuilder(input.length());
        List<Integer> emojiCodePoints = new ArrayList<>();
        int i = 0;
        while (i < input.length()) {
            int matchEnd = longestMatchEnd(input, i, emojiCodePoints);
            if (matchEnd > i) {
                // TODO: Leave alone variant.
                sb.append(getEmoji(emojiCodePoints));
                i = matchEnd;
            } else {
                // Not an emoji character...
                int cp = input.codePointAt(i);
                sb.appendCodePoint(cp);
                i += Character.charCount(cp);
            }
        }
        return sb.toString();
    }

    /**
     * Finds the longest complete emoji sequence that starts at {@code start}.
     *
     * @param input text being scanned
     * @param start index of the first char to examine
     * @param out cleared, then filled with the code points of the match, if any
     * @return the index just past the match, or {@code start} when nothing matches
     */
    private static int longestMatchEnd(String input, int start, List<Integer> out) {
        out.clear();
        MatcherNode node = ROOT;
        int bestEnd = start;
        int bestCount = 0;
        int pos = start;
        while (pos < input.length()) {
            int cp = input.codePointAt(pos);
            node = node.get(cp);
            if (node == null) {
                break;
            }
            out.add(cp);
            pos += Character.charCount(cp);
            if (node.terminal) {
                bestEnd = pos;
                bestCount = out.size();
            }
        }
        // Drop code points walked past the last complete sequence; the caller
        // rescans them starting from bestEnd.
        out.subList(bestCount, out.size()).clear();
        return bestEnd;
    }

    /**
     * Builds the markup for one emoji.
     *
     * @param codePoints code points of the emoji sequence, in order
     * @return prefix, an img tag whose file name is the hyphen-joined lowercase hex code
     *         points and whose alt text is the emoji itself, then suffix
     */
    private String getEmoji(List<Integer> codePoints) {
        StringBuilder sb = new StringBuilder(100);
        String fileName = codePoints.stream()
                .map(Integer::toHexString)
                .collect(Collectors.joining("-"));
        sb.append(prefix)
                .append("<img src=\"")
                .append(baseUrl)
                .append(size)
                .append('/')
                .append(fileName)
                .append(extension)
                .append("\" alt=\"");
        for (Integer cp : codePoints) {
            sb.appendCodePoint(cp);
        }
        sb.append("\"/>");
        sb.append(suffix);
        return sb.toString();
    }

    // Testing purposes only...
    public static void main(String[] args) {
        String withEmoji1 = "🚊abscdef😃ghi👋👋🏻jklmn👋🏿opqrst👩😃😃😃uvw👩🏻👩🏿xyz🛸";
        String withEmoji2 = "abc 😃 def";
        TwemojiParser parser = new TwemojiParser();
        System.out.println(parser.replaceEmoji(withEmoji1));
        System.out.println("------");
        System.out.println(parser.replaceEmoji(withEmoji2));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment