Skip to content

Instantly share code, notes, and snippets.

@ybiquitous
Last active August 29, 2015 14:05
Show Gist options
  • Save ybiquitous/c4a510627e93709c138e to your computer and use it in GitHub Desktop.
Save ybiquitous/c4a510627e93709c138e to your computer and use it in GitHub Desktop.
Unicode - 結合文字と補助文字 (Java 1.6+)
<!DOCTYPE html>
<html lang="ja">
<head>
<meta charset="UTF-8" />
<title>Unicode - 結合文字と補助文字</title>
</head>
<body>
<!-- Input form for the demo: submits the single text field "p" to test.jsp via POST. -->
<form action="test.jsp" method="POST">
<!-- Pre-filled sample text. Judging by the page title ("combining characters and
     supplementary characters") it is presumably chosen to contain both kinds;
     verify the exact code points before editing this value. -->
<input type="text" name="p" value="がが丈𠀋" />
<input type="submit" />
<input type="reset" />
</form>
</body>
</html>
<%@page contentType="text/html; charset=UTF-8" %>
<%@page import="java.text.*" %>
<%@page import="java.util.*" %>
<% request.setCharacterEncoding("UTF-8"); %>
<%!
/**
 * Returns {@code text} followed by a parenthesised, space-separated list of
 * its Unicode code points in {@code U+XXXX} notation.
 *
 * <p>Example: {@code "A"} becomes {@code "A (U+0041)"}; an empty string
 * becomes {@code " ()"}.
 *
 * <p>The string is walked code point by code point, so a valid surrogate pair
 * is reported once as its supplementary code point, while an unpaired
 * surrogate is reported as its own value instead of being dropped.
 *
 * @param text the text to annotate; must not be null
 * @return the original text plus the code point listing
 */
String withUnicode(String text) {
    StringBuilder sb = new StringBuilder(text);
    sb.append(" (");
    int i = 0;
    while (i < text.length()) {
        int codePoint = text.codePointAt(i);
        if (i > 0) {
            sb.append(' ');
        }
        // U+ notation uses at least four hex digits; Locale.ROOT keeps the
        // output independent of the server's default locale.
        sb.append(String.format(Locale.ROOT, "U+%04X", codePoint));
        // Advance by one or two code units depending on the code point.
        i += Character.charCount(codePoint);
    }
    sb.append(")");
    return sb.toString();
}
/**
 * Renders every UTF-16 code unit of {@code text} as upper-case hexadecimal
 * (no zero padding), joined by single spaces.
 *
 * @param text the text to dump; must not be null
 * @return the hex listing, or an empty string for empty input
 */
String hexDump(String text) {
    StringBuilder dump = new StringBuilder();
    String separator = "";
    for (char unit : text.toCharArray()) {
        dump.append(separator);
        dump.append(Integer.toHexString(unit).toUpperCase());
        // Every unit after the first is preceded by a space.
        separator = " ";
    }
    return dump.toString();
}
/**
 * Counts the segments that a character-instance {@link BreakIterator} for
 * {@link Locale#JAPAN} finds in {@code text}, i.e. the number of steps from
 * the first boundary until the iterator reports {@link BreakIterator#DONE}.
 *
 * @param text the text to measure; must not be null
 * @return the number of segments between consecutive character boundaries
 */
int countByBreakIterator(String text) {
    BreakIterator boundaries = BreakIterator.getCharacterInstance(Locale.JAPAN);
    boundaries.setText(text);
    // Position on the first boundary; each successful next() then closes one segment.
    boundaries.first();
    int segments = 0;
    while (boundaries.next() != BreakIterator.DONE) {
        segments++;
    }
    return segments;
}
/**
 * Splits {@code text} at the boundaries reported by a character-instance
 * {@link BreakIterator} and concatenates the pieces again in order.
 *
 * <p>The iterator is created for {@link Locale#JAPAN}, the same locale that
 * {@code countByBreakIterator} uses, so both helpers apply the same boundary
 * rules regardless of the JVM's default locale.
 *
 * @param text the text to split and reassemble; must not be null
 * @return the concatenation of all segments found by the iterator
 */
String byBreakIterator(String text) {
    BreakIterator bi = BreakIterator.getCharacterInstance(Locale.JAPAN);
    bi.setText(text);
    StringBuilder sb = new StringBuilder(text.length());
    int start = bi.first();
    for (int end = bi.next(); end != BreakIterator.DONE; end = bi.next()) {
        sb.append(text.substring(start, end));
        // The end of this segment is the start of the next one.
        start = end;
    }
    return sb.toString();
}
%>
<%!
/**
 * Escapes the characters that are significant in HTML text and attribute
 * values (ampersand, less-than, greater-than, double quote, single quote) so
 * that request data echoed into the page is rendered as text, not markup.
 *
 * @param s the raw text; must not be null
 * @return the text with those five characters replaced by character references
 */
String escapeHtml(String s) {
    StringBuilder out = new StringBuilder(s.length() + 16);
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        switch (c) {
            case '&':
                out.append("&amp;");
                break;
            case '<':
                out.append("&lt;");
                break;
            case '>':
                out.append("&gt;");
                break;
            case '"':
                out.append("&quot;");
                break;
            case '\'':
                out.append("&#39;");
                break;
            default:
                out.append(c);
        }
    }
    return out.toString();
}
%>
<!DOCTYPE html>
<html lang="ja">
<head><title>Unicode - 結合文字と補助文字</title></head>
<body>
<%-- "p" is absent when the page is requested without submitting the form; fall back to an empty string instead of failing with a NullPointerException. --%>
<%
String p = request.getParameter("p");
if (p == null) {
    p = "";
}
%>
<h2><%= escapeHtml(withUnicode(p)) %></h2>
<pre>
hex dump = <%= hexDump(p) %>
String.length() = <%= p.length() %>
String.codePointCount(all) = <%= p.codePointCount(0, p.length()) %>
BreakIterator(count) = <%= countByBreakIterator(p) %>
BreakIterator = <%= escapeHtml(withUnicode(byBreakIterator(p))) %>
Normalizer.normalize(NFC) = <%= escapeHtml(withUnicode(Normalizer.normalize(p,Normalizer.Form.NFC))) %>
Normalizer.normalize(NFD) = <%= escapeHtml(withUnicode(Normalizer.normalize(p,Normalizer.Form.NFD))) %>
Normalizer.normalize(NFKC) = <%= escapeHtml(withUnicode(Normalizer.normalize(p,Normalizer.Form.NFKC))) %>
Normalizer.normalize(NFKD) = <%= escapeHtml(withUnicode(Normalizer.normalize(p,Normalizer.Form.NFKD))) %>
</pre>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment