Created
August 20, 2019 00:44
-
-
Save danfickle/82dc6757244edfb1937722eebbb9e9e2 to your computer and use it in GitHub Desktop.
Twemoji parser for Java 8 using trie (replaces emoji with img tags).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.InputStreamReader; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import java.util.stream.Collectors; | |
/**
 * Replaces emoji sequences in text with Twemoji {@code <img>} tags.
 *
 * <p>The set of known emoji is read once, at class initialisation, from the Twemoji
 * {@code preview.html} page, which must be on the classpath at
 * {@code /twemoji/preview.html}. Every emoji is stored in a trie keyed by Unicode code
 * point, and {@link #replaceEmoji(String)} substitutes the longest complete emoji found
 * at each position of the input.
 *
 * <p>Instances are immutable and the shared trie is never modified after class
 * initialisation, so a parser may be used from several threads.
 *
 * @author danfickle
 * Licensed under Apache or MIT.
 * Initially published on 2019-08-20.
 */
public class TwemojiParser {
    // You will need to download the latest preview.html from the gh-pages branch
    // of the twemoji project. Known to work with version 12.1.2.
    // https://github.com/twitter/twemoji/blob/gh-pages/v/12.1.2/preview.html
    // Path is /v/12.1.2/preview.html
    private static final String PREVIEW_FILE_RESOURCE = "/twemoji/preview.html";

    // Class initialisation is guaranteed by the JVM to complete before any other thread
    // can observe this field, and the trie is read-only from then on.
    private static final MatcherNode _root = createMatcherWithExceptionHandling();

    /** One node of the code point trie; the path from the root spells an emoji sequence. */
    private static class MatcherNode {
        // Children keyed by code point; allocated lazily because most nodes are leaves.
        private Map<Integer, MatcherNode> next;

        // True when the path from the root to this node is a complete emoji, as opposed
        // to merely a prefix of a longer one (e.g. '1' on its own versus '1' + U+20E3).
        private boolean terminal;

        /** Returns the child for {@code cp}, creating it if it does not exist yet. */
        private MatcherNode put(int cp) {
            if (next == null) {
                next = new HashMap<>();
            }
            return next.computeIfAbsent(cp, unused -> new MatcherNode());
        }

        /** Returns the child for {@code cp}, or {@code null} if there is none. */
        private MatcherNode get(int cp) {
            return next != null ? next.get(cp) : null;
        }
    }

    private final String _prefix;
    private final String _suffix;
    private final String _baseUrl;
    private final String _extension;
    private final String _size;

    /**
     * Creates a parser with custom markup and image location.
     *
     * @param prefix    markup emitted immediately before each {@code <img>} tag
     * @param suffix    markup emitted immediately after each {@code <img>} tag
     * @param baseUrl   URL prefix of the image set, including the trailing slash
     * @param extension image file extension including the dot, e.g. {@code ".png"}
     * @param size      size directory below {@code baseUrl}, e.g. {@code "72x72"}
     */
    public TwemojiParser(String prefix, String suffix, String baseUrl, String extension, String size) {
        this._prefix = prefix;
        this._suffix = suffix;
        this._baseUrl = baseUrl;
        this._extension = extension;
        this._size = size;
    }

    /** Creates a parser that wraps 72x72 PNG images from the Twemoji 12.1.2 CDN in a span. */
    public TwemojiParser() {
        this("<span class=\"emoji\">", "</span>", "https://twemoji.maxcdn.com/v/12.1.2/", ".png", "72x72");
    }

    /**
     * Reads a classpath resource fully into a string, decoding it as UTF-8.
     *
     * @throws IOException if the resource does not exist or cannot be read
     */
    private static String resourceToString(String resource) throws IOException {
        try (InputStream is = TwemojiParser.class.getResourceAsStream(resource)) {
            if (is == null) {
                // getResourceAsStream signals "not found" with null; fail with a clear message
                // instead of a NullPointerException from the reader constructor.
                throw new IOException("Classpath resource not found: " + resource);
            }
            // Explicit charset: before Java 18 the default depends on the platform.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {
                return reader.lines().collect(Collectors.joining(System.lineSeparator()));
            }
        }
    }

    /**
     * Inserts one emoji, given as hexadecimal code point strings, into the trie and marks
     * the final node as a complete emoji.
     */
    private static void addToMatcher(String[] codePointStrings, MatcherNode root) {
        if (codePointStrings.length == 0) {
            return;
        }
        MatcherNode current = root;
        for (String hex : codePointStrings) {
            current = current.put(Integer.parseUnsignedInt(hex, 16));
        }
        current.terminal = true;
    }

    /** Builds the trie, converting a load failure into an unchecked exception. */
    private static MatcherNode createMatcherWithExceptionHandling() {
        try {
            return createMatcher();
        } catch (IOException e) {
            // Runs in the static initialiser, so this surfaces as ExceptionInInitializerError
            // with the cause attached.
            throw new IllegalStateException(
                "Unable to load the Twemoji list from " + PREVIEW_FILE_RESOURCE, e);
        }
    }

    /**
     * Parses the emoji list out of the preview page and builds the trie from it.
     *
     * @throws IOException if the page cannot be read or contains no emoji list
     */
    private static MatcherNode createMatcher() throws IOException {
        String previewFile = resourceToString(PREVIEW_FILE_RESOURCE);
        Matcher listMatch = Pattern.compile(
            Pattern.quote("<ul class=\"emoji-list\">") + "(.*)" + Pattern.quote("</ul>"),
            Pattern.DOTALL).matcher(previewFile);
        if (!listMatch.find()) {
            throw new IOException("No <ul class=\"emoji-list\"> found in " + PREVIEW_FILE_RESOURCE);
        }
        String emojiListString = listMatch.group(1);

        // Each item looks like <li>&#x1f468;&#x200d;&#x1f469;</li>.
        Matcher itemMatch = Pattern.compile(
            Pattern.quote("<li>") + "(.*?)" + Pattern.quote(";</li>")).matcher(emojiListString);
        MatcherNode root = new MatcherNode();
        while (itemMatch.find()) {
            String rawCodePointString = itemMatch.group(1).replace("&#x", "");
            if (rawCodePointString.isEmpty()) {
                continue;
            }
            addToMatcher(rawCodePointString.split(Pattern.quote(";")), root);
        }
        return root;
    }

    /**
     * Replace input containing emoji characters with img tags for Twemoji.
     * REMEMBER: Escape user input BEFORE passing to this method.
     *
     * <p>At every position the longest complete emoji from the list is replaced. Text that
     * only matches the beginning of an emoji sequence (a plain digit, or an emoji followed
     * by a dangling zero-width joiner) is copied through unchanged.
     *
     * @param input the text to scan; must not be {@code null}
     * @return the text with every recognised emoji replaced by its image markup
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public String replaceEmoji(String input) {
        if (input == null) {
            throw new NullPointerException("input");
        }
        // TODO: Leave alone variant.
        int length = input.length();
        StringBuilder sb = new StringBuilder(length);
        List<Integer> candidate = new ArrayList<>();
        int pos = 0;
        while (pos < length) {
            // Walk the trie from pos, remembering the last point at which the code points
            // read so far formed a complete emoji.
            candidate.clear();
            MatcherNode node = _root;
            int matchedCodePoints = 0;
            int matchedEnd = pos;
            int scan = pos;
            while (scan < length) {
                int cp = input.codePointAt(scan);
                node = node.get(cp);
                if (node == null) {
                    break;
                }
                candidate.add(cp);
                scan += Character.charCount(cp);
                if (node.terminal) {
                    matchedCodePoints = candidate.size();
                    matchedEnd = scan;
                }
            }

            if (matchedCodePoints > 0) {
                // Emit the longest complete emoji; anything scanned beyond it is re-examined
                // from matchedEnd on the next iteration.
                sb.append(getEmoji(candidate.subList(0, matchedCodePoints)));
                pos = matchedEnd;
            } else {
                // Not the start of a complete emoji: copy a single code point and move on.
                int cp = input.codePointAt(pos);
                sb.appendCodePoint(cp);
                pos += Character.charCount(cp);
            }
        }
        return sb.toString();
    }

    /**
     * Builds the markup for one emoji: prefix, an {@code <img>} whose file name is the
     * lower-case hexadecimal code points joined by '-', the emoji itself as alt text, suffix.
     */
    private String getEmoji(List<Integer> codePoints) {
        StringBuilder sb = new StringBuilder(100);
        sb.append(_prefix)
          .append("<img src=\"")
          .append(_baseUrl)
          .append(_size)
          .append('/');
        String separator = "";
        for (Integer cp : codePoints) {
            sb.append(separator).append(Integer.toHexString(cp));
            separator = "-";
        }
        sb.append(_extension)
          .append("\" alt=\"");
        for (Integer cp : codePoints) {
            sb.appendCodePoint(cp);
        }
        sb.append("\"/>");
        sb.append(_suffix);
        return sb.toString();
    }

    // Testing purposes only...
    public static void main(String[] args) {
        String withEmoji1 = "🚊abscdef😃ghi👋👋🏻jklmn👋🏿opqrst👩😃😃😃uvw👩🏻👩🏿xyz🛸";
        String withEmoji2 = "abc 😃 def";
        TwemojiParser parser = new TwemojiParser();
        System.out.println(parser.replaceEmoji(withEmoji1));
        System.out.println("------");
        System.out.println(parser.replaceEmoji(withEmoji2));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment