Skip to content

Instantly share code, notes, and snippets.

@junlincao
Last active February 26, 2018 06:57
Show Gist options
  • Save junlincao/74678905a871af2c1ae1982128974595 to your computer and use it in GitHub Desktop.
Save junlincao/74678905a871af2c1ae1982128974595 to your computer and use it in GitHub Desktop.
转换utf-8 3个字节无法表示的字符
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
/**
* 转换utf-8 3个字节无法表示的字符
*
* @author CJL
* @since 2018-02-26
*/
/**
 * Converts characters that need more than three UTF-8 bytes (supplementary
 * code points such as emoji) into a BMP-only escape and back.
 *
 * <p>An escape has the form {@code MARKER + digits + MARKER}, where the digits
 * are the code point written in radix {@link #RADIX}. Text that merely looks
 * like an escape but cannot be decoded is always passed through unchanged.
 *
 * @author CJL
 * @since 2018-02-26
 */
public class EmojiConverter {
    /** Delimiter placed before and after the digits of an escape. */
    private static final char MARKER = '\u0007'; // \u0000
    /** Radix of the stored digits (16 or 32 recommended). */
    private static final int RADIX = 32;
    private static final int BUF_LEN = 1024;
    /** Maximum number of digit chars accepted between two markers. */
    private static final int MAX_ESCAPE_DIGITS = 8;
    /** Opening marker + digits + closing marker. */
    private static final int ESCAPE_WINDOW = MAX_ESCAPE_DIGITS + 2;

    /**
     * Encodes every surrogate pair of {@code str} as an escape.
     *
     * @param str text to encode
     * @return the encoded text
     * @throws IOException never in practice; declared by the stream overload
     */
    public static String encode(String str) throws IOException {
        EmojiConverterHelper.SimpleStringWriter sw = new EmojiConverterHelper.SimpleStringWriter(str.length());
        encode(new StringReader(str), sw);
        return sw.toString();
    }

    /**
     * Turns every decodable escape of {@code str} back into its character.
     *
     * @param str text to decode
     * @return the decoded text
     * @throws IOException never in practice; declared by the stream overload
     */
    public static String decode(String str) throws IOException {
        EmojiConverterHelper.SimpleStringWriter sw = new EmojiConverterHelper.SimpleStringWriter(str.length());
        decode(new StringReader(str), sw);
        return sw.toString();
    }

    /**
     * Removes surrogate chars and decodable escapes from {@code str}.
     *
     * @param str text to filter
     * @return the filtered text
     * @throws IOException never in practice; declared by the stream overload
     */
    public static String filter(String str) throws IOException {
        EmojiConverterHelper.SimpleStringWriter sw = new EmojiConverterHelper.SimpleStringWriter(str.length());
        filter(new StringReader(str), sw);
        return sw.toString();
    }

    /**
     * Copies {@code reader} to {@code writer}, replacing each valid surrogate
     * pair with an escape. Unpaired surrogates are copied unchanged.
     *
     * @param reader source of the text
     * @param writer destination of the encoded text
     * @throws IOException if reading or writing fails
     */
    public static void encode(Reader reader, Writer writer) throws IOException {
        char[] buf = new char[BUF_LEN];
        char[] digitChars = new char[8]; // receives the digits of one code point
        int carried = 0; // chars kept at the head of buf from the previous round
        while (true) {
            int readLen = reader.read(buf, carried, buf.length - carried);
            boolean eof = readLen == -1;
            if (eof && carried == 0) {
                return;
            }
            int endPos = carried + (eof ? 0 : readLen); // exclusive end of valid data
            carried = 0;
            int pos = 0;
            while (pos < endPos) {
                char c = buf[pos];
                if (!Character.isSurrogate(c)) {
                    writer.write(c);
                    pos++;
                    continue;
                }
                if (pos == endPos - 1 && !eof) {
                    // The possible partner has not been read yet: keep this
                    // char at the head of buf and look again after refilling.
                    buf[0] = c;
                    carried = 1;
                    break;
                }
                if (pos < endPos - 1 && Character.isSurrogatePair(c, buf[pos + 1])) {
                    int code = Character.toCodePoint(c, buf[pos + 1]);
                    writer.write(MARKER);
                    int len = EmojiConverterHelper.writeIntChars(code, digitChars, RADIX);
                    writer.write(digitChars, 0, len);
                    writer.write(MARKER);
                    pos += 2;
                } else {
                    // Unpaired surrogate (malformed input): copy it as is.
                    writer.write(c);
                    pos++;
                }
            }
            if (eof) {
                return;
            }
        }
    }

    /**
     * Copies {@code reader} to {@code writer}, replacing each escape whose
     * digits denote a valid Unicode code point with that character. Anything
     * else, including unmatched markers and escapes with unparseable or
     * out-of-range digits, is copied unchanged.
     *
     * @param reader source of the encoded text
     * @param writer destination of the decoded text
     * @throws IOException if reading or writing fails
     */
    public static void decode(Reader reader, Writer writer) throws IOException {
        unescape(reader, writer, false);
    }

    /**
     * Copies {@code reader} to {@code writer} while dropping emoji in both
     * forms: raw surrogate chars and the escapes {@link #decode(Reader, Writer)}
     * would decode.
     *
     * @param reader source of the text
     * @param writer destination of the filtered text
     * @throws IOException if reading or writing fails
     */
    public static void filter(Reader reader, Writer writer) throws IOException {
        unescape(reader, writer, true);
    }

    /**
     * Shared scanning loop of decode and filter.
     *
     * @param strip when {@code true}, surrogate chars and decodable escapes are
     *              dropped; when {@code false}, escapes are replaced by their
     *              character and everything else is copied
     */
    private static void unescape(Reader reader, Writer writer, boolean strip) throws IOException {
        char[] buf = new char[BUF_LEN];
        int carried = 0; // chars kept at the head of buf from the previous round
        while (true) {
            int readLen = reader.read(buf, carried, buf.length - carried);
            boolean eof = readLen == -1;
            if (eof && carried == 0) {
                return;
            }
            int endPos = carried + (eof ? 0 : readLen); // exclusive end of valid data
            carried = 0;
            int pos = 0;
            while (pos < endPos) {
                char c = buf[pos];
                if (strip && Character.isSurrogate(c)) {
                    pos++;
                    continue;
                }
                if (c != MARKER) {
                    writer.write(c);
                    pos++;
                    continue;
                }
                int windowEnd = Math.min(endPos, pos + ESCAPE_WINDOW);
                int closing = nextMarkerPos(buf, pos + 1, windowEnd);
                if (closing == -1 && !eof && endPos - pos < ESCAPE_WINDOW) {
                    // The search window was cut short by the end of the data
                    // read so far (also after a short read at index 0): move
                    // the tail to the head of buf and retry after refilling.
                    carried = endPos - pos;
                    System.arraycopy(buf, pos, buf, 0, carried);
                    break;
                }
                int codePoint = closing == -1 ? -1 : parseCodePoint(buf, pos + 1, closing);
                if (codePoint < 0) {
                    // No usable escape starts here: the marker is plain text.
                    writer.write(c);
                    pos++;
                    continue;
                }
                if (!strip) {
                    writeCodePoint(writer, codePoint);
                }
                pos = closing + 1;
            }
            if (eof) {
                return;
            }
        }
    }

    /**
     * Parses the digits in {@code buf[from, to)}.
     *
     * @return the code point, or -1 if the digits are not a number or the
     *         number is not a valid Unicode code point
     */
    private static int parseCodePoint(char[] buf, int from, int to) {
        try {
            int value = EmojiConverterHelper.parseInt(buf, from, to, RADIX);
            // Character.toChars would throw IllegalArgumentException otherwise.
            return Character.isValidCodePoint(value) ? value : -1;
        } catch (NumberFormatException ignored) {
            // Not an escape; the caller copies the text unchanged.
            return -1;
        }
    }

    /** Writes a valid code point as one or two chars without allocating. */
    private static void writeCodePoint(Writer writer, int codePoint) throws IOException {
        if (Character.isBmpCodePoint(codePoint)) {
            writer.write(codePoint);
        } else {
            writer.write(Character.highSurrogate(codePoint));
            writer.write(Character.lowSurrogate(codePoint));
        }
    }

    /**
     * Returns the index of the first marker in {@code buf[fromPos, endPos)},
     * or -1 if there is none.
     */
    private static int nextMarkerPos(char[] buf, int fromPos, int endPos) {
        for (int i = fromPos; i < endPos; i++) {
            if (buf[i] == MARKER) {
                return i;
            }
        }
        return -1;
    }

    /** Small manual demonstration of the round trip and of malformed input. */
    public static void main(String[] args) throws Exception {
        String str = "你😄\uD83D\uDE03\uD83C\uDDF5\uD83C\uDDFE好";
        System.out.println(str);
        System.out.println(encode(str));
        System.out.println(decode(encode(str)));
        if (!str.equals(decode(encode(str)))) {
            throw new RuntimeException("not equal!");
        }
        String strErr1 = "A\uD83DB";
        System.out.println(encode(strErr1));
        String strErr2 = "你\u00073tg4\u0007\u00073tg32018年2月25日,平昌冬奥会闭幕式在平昌\u0007奥林匹克体育场举行。中国国家主席习近平通过视频致辞,和亿万中国人民一起,向全世界发出北京2022的盛情邀请\u00073sfl\u0007\u00073sfu\u0007好? ?";
        System.out.println(decode(strErr2));
        String strErr3 = "你\u00073tg4\u0007\u00073tg32018年2月25日,平昌冬奥会闭幕式在平昌\u0007奥\u0007林匹克体育场举行。中国国家主席习近平通过视频致辞,和亿万中国人民一起,向全世界发出北京2022的盛情邀请\u00073sfl\u0007\u00073sfu\u0007好? ?";
        System.out.println(decode(strErr3));
        String strErr4 = "你\u00073tg4\u0007\u00073tg32018年2月25日,平昌冬奥会闭幕式在平昌\u0007奥\u0007666\u0007林匹克体育场举行。中国国家主席习近平通过视频致辞,和亿万中国人民一起,向全世界发出北京2022的盛情邀请\u00073sfl\u0007\u00073sfu\u0007好? ?";
        System.out.println(decode(strErr4));
    }
}
import java.io.IOException;
import java.io.Writer;
/**
* 为了减少gc,直接复制修改了Integer类中的部分代码
*
* @author CJL
* @since 2018-02-26
*/
/**
 * Allocation-free variants of a few JDK routines, adapted from
 * {@code Integer} and {@code StringWriter} to reduce garbage.
 *
 * @author CJL
 * @since 2018-02-26
 */
class EmojiConverterHelper {
    /**
     * All possible chars for representing a number as a String
     */
    private final static char[] digits = {
            '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', 'a', 'b',
            'c', 'd', 'e', 'f', 'g', 'h',
            'i', 'j', 'k', 'l', 'm', 'n',
            'o', 'p', 'q', 'r', 's', 't',
            'u', 'v', 'w', 'x', 'y', 'z'
    };

    /**
     * Writes {@code i}, read as an unsigned value, into {@code buf} starting
     * at index 0 using the given power-of-two radix.
     *
     * @param i     value to format
     * @param buf   destination; must be large enough for all digits
     * @param radix one of 2, 4, 8, 16 or 32
     * @return the number of chars written to {@code buf}
     * @throws NumberFormatException if {@code radix} is not supported
     */
    static int writeIntChars(int i, char[] buf, int radix) {
        switch (radix) {
            case 2:
                return toUnsignedString0(i, buf, 1);
            case 4:
                return toUnsignedString0(i, buf, 2);
            case 8:
                return toUnsignedString0(i, buf, 3);
            case 16:
                return toUnsignedString0(i, buf, 4);
            case 32:
                return toUnsignedString0(i, buf, 5);
            default:
                throw new NumberFormatException("Not support radix " + radix);
        }
    }

    /**
     * Adapted from {@code Integer.toUnsignedString0(..)}: emits {@code shift}
     * bits per digit, filling {@code buf} from the last digit backwards.
     *
     * @return the number of chars written
     */
    private static int toUnsignedString0(int val, char[] buf, int shift) {
        // assert shift > 0 && shift <=5 : "Illegal shift value";
        int mag = Integer.SIZE - Integer.numberOfLeadingZeros(val);
        final int chars = Math.max(((mag + (shift - 1)) / shift), 1);
        final int mask = (1 << shift) - 1;
        int charPos = chars;
        do {
            buf[--charPos] = digits[val & mask];
            val >>>= shift;
        } while (val != 0 && charPos > 0);
        return chars;
    }

    /**
     * Adapted from {@code Integer.parseInt(..)}: parses the signed integer
     * spelled by {@code buf[fromPos, toPos)} without creating a String.
     *
     * @param buf     chars holding the number
     * @param fromPos index of the first char (inclusive)
     * @param toPos   index after the last char (exclusive)
     * @param radix   radix between {@code Character.MIN_RADIX} and
     *                {@code Character.MAX_RADIX}
     * @return the parsed value
     * @throws NumberFormatException if {@code buf} is null, the radix is out
     *                               of range, the range is empty or inverted,
     *                               a char is not a digit, or the value does
     *                               not fit in an {@code int}
     */
    static int parseInt(char[] buf, int fromPos, int toPos, int radix) throws NumberFormatException {
        if (buf == null) {
            throw new NumberFormatException("null");
        }
        if (radix < Character.MIN_RADIX) {
            throw new NumberFormatException("radix " + radix +
                    " less than Character.MIN_RADIX");
        }
        if (radix > Character.MAX_RADIX) {
            throw new NumberFormatException("radix " + radix +
                    " greater than Character.MAX_RADIX");
        }
        if (toPos - fromPos <= 0) {
            throw badInput(buf, fromPos, toPos);
        }

        boolean negative = false;
        int pos = fromPos;
        int limit = -Integer.MAX_VALUE;
        char firstChar = buf[fromPos];
        if (firstChar < '0') { // Possible leading "+" or "-"
            if (firstChar == '-') {
                negative = true;
                limit = Integer.MIN_VALUE;
            } else if (firstChar != '+') {
                throw badInput(buf, fromPos, toPos);
            }
            if (toPos - fromPos == 1) { // Cannot have lone "+" or "-"
                throw badInput(buf, fromPos, toPos);
            }
            pos++;
        }

        int result = 0;
        final int multmin = limit / radix;
        while (pos < toPos) {
            // Accumulating negatively avoids surprises near MAX_VALUE
            int digit = Character.digit(buf[pos++], radix);
            if (digit < 0 || result < multmin) {
                throw badInput(buf, fromPos, toPos);
            }
            result *= radix;
            if (result < limit + digit) {
                throw badInput(buf, fromPos, toPos);
            }
            result -= digit;
        }
        return negative ? result : -result;
    }

    /**
     * Builds the exception for unparseable input. The offending text is only
     * quoted when the range lies inside {@code buf}, so that composing the
     * message can never fail with an index exception of its own.
     */
    private static NumberFormatException badInput(char[] buf, int fromPos, int toPos) {
        boolean quotable = fromPos >= 0 && fromPos <= toPos && toPos <= buf.length;
        String text = quotable ? new String(buf, fromPos, toPos - fromPos) : "";
        return new NumberFormatException("For input string:" + text);
    }

    /**
     * Copied from StringWriter, with StringBuffer replaced by StringBuilder.
     */
    public static class SimpleStringWriter extends Writer {
        private final StringBuilder buf;

        /** Creates a writer with the default initial capacity. */
        public SimpleStringWriter() {
            buf = new StringBuilder();
            lock = buf;
        }

        /**
         * Creates a writer with the given initial capacity.
         *
         * @throws IllegalArgumentException if {@code initialSize} is negative
         */
        public SimpleStringWriter(int initialSize) {
            if (initialSize < 0) {
                throw new IllegalArgumentException("Negative buffer size");
            }
            buf = new StringBuilder(initialSize);
            lock = buf;
        }

        /** Appends the low 16 bits of {@code c} as one char. */
        @Override
        public void write(int c) {
            buf.append((char) c);
        }

        /**
         * Appends {@code len} chars of {@code cbuf} starting at {@code off}.
         *
         * @throws IndexOutOfBoundsException if the range is not inside {@code cbuf}
         */
        @Override
        public void write(char[] cbuf, int off, int len) {
            if ((off < 0) || (off > cbuf.length) || (len < 0) ||
                    ((off + len) > cbuf.length) || ((off + len) < 0)) {
                throw new IndexOutOfBoundsException();
            } else if (len == 0) {
                return;
            }
            buf.append(cbuf, off, len);
        }

        /** Appends the whole string. */
        @Override
        public void write(String str) {
            buf.append(str);
        }

        /** Appends {@code len} chars of {@code str} starting at {@code off}. */
        @Override
        public void write(String str, int off, int len) {
            // Appending the range directly avoids a temporary substring.
            buf.append(str, off, off + len);
        }

        /** Appends {@code csq}, or the text "null" if it is {@code null}. */
        @Override
        public SimpleStringWriter append(CharSequence csq) {
            write(String.valueOf(csq));
            return this;
        }

        /**
         * Appends {@code csq[start, end)}; a {@code null} sequence is treated
         * as the text "null".
         */
        @Override
        public SimpleStringWriter append(CharSequence csq, int start, int end) {
            CharSequence cs = (csq == null ? "null" : csq);
            // Appending the range directly avoids a temporary sub-sequence.
            buf.append(cs, start, end);
            return this;
        }

        /** Appends a single char. */
        @Override
        public SimpleStringWriter append(char c) {
            write(c);
            return this;
        }

        /** Returns the text written so far. */
        @Override
        public String toString() {
            return buf.toString();
        }

        /** Returns the underlying builder itself, not a copy. */
        public StringBuilder getBuffer() {
            return buf;
        }

        /** Does nothing. */
        @Override
        public void flush() {
        }

        /** Does nothing. */
        @Override
        public void close() throws IOException {
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment