Created
April 6, 2018 07:27
-
-
Save komiya-atsushi/8ec34b1d08f278ed9ddcba55af544ce3 to your computer and use it in GitHub Desktop.
HTML の meta 要素から文字エンコーディングを抽出するやつ。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package me.k11i.hoge; | |
import java.util.Arrays; | |
import java.util.EnumMap; | |
import java.util.List; | |
import java.util.Map; | |
import me.k11i.hoge.HtmlCharsetDetector.FiniteStateMachine.State; | |
import static me.k11i.hoge.HtmlCharsetDetector.FiniteStateMachine.State.*; | |
/**
 * Extracts the character encoding name declared by an HTML {@code meta} element.
 *
 * <p>Two declaration forms are recognized, matched case-insensitively:
 * <ol>
 *   <li>{@code <meta http-equiv="Content-Type" content="text/html; charset=NAME">}
 *       ({@code http-equiv} must be the first attribute, immediately followed by
 *       {@code content}; whitespace after the {@code ;} is optional)</li>
 *   <li>{@code <meta charset="NAME">} ({@code charset} must be the first attribute)</li>
 * </ol>
 * Attribute values may be delimited by single or double quotes. Whitespace around
 * {@code =} and unquoted attribute values are not supported.
 *
 * <p>The declaration is considered complete as soon as the quote that closes the value
 * is read, so the result does not depend on how the rest of the tag is written
 * ({@code >}, {@code />}, further attributes, ...).
 */
public class HtmlCharsetDetector {

    /**
     * Character-driven state machine for the two supported {@code meta} forms.
     * It holds no per-scan data; the caller keeps the current {@link State} and passes it
     * to {@link #next(State, char)} for every input character.
     */
    static class FiniteStateMachine {
        /** Characters accepted as whitespace between attributes. */
        private static final char[] SPACES = {'\u0020', '\u0009', '\n', '\u000c', '\r'};
        /** Characters accepted as attribute value delimiters. */
        private static final char[] QUOTES = {'\'', '"'};

        /**
         * States of the machine. A regular state is entered when the input character is one
         * of its trigger characters. A capturing state is entered on any character that is
         * NOT one of its listed characters, which then act as terminators of the value.
         */
        enum State {
            _INITIAL(),

            // <meta
            META_BOE('<'),
            META_M('m'),
            META_E('e'),
            META_T('t'),
            META_A('a'),
            SPACES_AFTER_META(SPACES),

            // [case 1] http-equiv="Content-Type"
            HTTP_EQUIV_H('h'),
            HTTP_EQUIV_T1('t'),
            HTTP_EQUIV_T2('t'),
            HTTP_EQUIV_P('p'),
            HTTP_EQUIV_HYPHEN('-'),
            HTTP_EQUIV_E('e'),
            HTTP_EQUIV_Q('q'),
            HTTP_EQUIV_U('u'),
            HTTP_EQUIV_I('i'),
            HTTP_EQUIV_V('v'),
            HTTP_EQUIV_EQ('='),
            HTTP_EQUIV_BEGIN_QUOTE(QUOTES),
            HTTP_EQUIV_CONTENT_TYPE_C('c'),
            HTTP_EQUIV_CONTENT_TYPE_O('o'),
            HTTP_EQUIV_CONTENT_TYPE_N1('n'),
            HTTP_EQUIV_CONTENT_TYPE_T1('t'),
            HTTP_EQUIV_CONTENT_TYPE_E1('e'),
            HTTP_EQUIV_CONTENT_TYPE_N2('n'),
            HTTP_EQUIV_CONTENT_TYPE_T2('t'),
            HTTP_EQUIV_CONTENT_TYPE_HYPHEN('-'),
            HTTP_EQUIV_CONTENT_TYPE_T3('t'),
            HTTP_EQUIV_CONTENT_TYPE_Y('y'),
            HTTP_EQUIV_CONTENT_TYPE_P('p'),
            HTTP_EQUIV_CONTENT_TYPE_E2('e'),
            HTTP_EQUIV_END_QUOTE(QUOTES),
            SPACES_AFTER_HTTP_EQUIV(SPACES),

            // content="text/html; charset=*"
            CONTENT_C('c'),
            CONTENT_O('o'),
            CONTENT_N1('n'),
            CONTENT_T1('t'),
            CONTENT_E('e'),
            CONTENT_N2('n'),
            CONTENT_T2('t'),
            CONTENT_EQ('='),
            CONTENT_BEGIN_QUOTE(QUOTES),
            CONTENT_TEXT_HTML_T1('t'),
            CONTENT_TEXT_HTML_E('e'),
            CONTENT_TEXT_HTML_X('x'),
            CONTENT_TEXT_HTML_T2('t'),
            CONTENT_TEXT_HTML_SL('/'),
            CONTENT_TEXT_HTML_H('h'),
            CONTENT_TEXT_HTML_T3('t'),
            CONTENT_TEXT_HTML_M('m'),
            CONTENT_TEXT_HTML_L('l'),
            CONTENT_TEXT_HTML_SC(';'),
            CONTENT_SPACES_BEFORE_CHARSET(SPACES),
            CONTENT_CHARSET_C('c'),
            CONTENT_CHARSET_H('h'),
            CONTENT_CHARSET_A('a'),
            CONTENT_CHARSET_R('r'),
            CONTENT_CHARSET_S('s'),
            CONTENT_CHARSET_E('e'),
            CONTENT_CHARSET_T('t'),
            CONTENT_CHARSET_EQ('='),
            CONTENT_CHARSET_VALUE(true, QUOTES),
            CONTENT_END_QUOTE(QUOTES),

            // [case 2] charset="*"
            CHARSET_C('c'),
            CHARSET_H('h'),
            CHARSET_A('a'),
            CHARSET_R('r'),
            CHARSET_S('s'),
            CHARSET_E('e'),
            CHARSET_T('t'),
            CHARSET_EQ('='),
            CHARSET_BEGIN_QUOTE(QUOTES),
            CHARSET_VALUE(true, QUOTES),
            CHARSET_END_QUOTE(QUOTES);

            /**
             * Successor candidates for each state, tried in listed order. States without an
             * entry (the two accepting END_QUOTE states) have no successors. The table lives
             * inside the enum so that it is built only after every constant exists.
             */
            private static final Map<State, List<State>> TRANSITIONS = buildTransitions();

            private final boolean capture;
            private final char[] chars;

            /** Creates a regular state entered on any of {@code chars}. */
            State(char... chars) {
                this(false, chars);
            }

            /**
             * @param capture when true the state is entered on any character NOT in
             *                {@code chars} and that character belongs to the charset name
             * @param chars   trigger characters (regular state) or terminators (capturing state)
             */
            State(boolean capture, char... chars) {
                this.capture = capture;
                this.chars = chars;
            }

            /** Returns whether reading {@code ch} (already lower-cased) may enter this state. */
            boolean canTransition(char ch) {
                // A capturing state inverts the test: its chars are terminators, not triggers.
                return capture != contains(ch);
            }

            private boolean contains(char ch) {
                for (char candidate : chars) {
                    if (candidate == ch) {
                        return true;
                    }
                }
                return false;
            }

            /** Returns whether the character that entered this state is part of the value. */
            boolean isCaptureNeeded() {
                return capture;
            }

            /** Returns whether entering this state means the charset value has been closed. */
            boolean completesValue() {
                return this == CONTENT_END_QUOTE || this == CHARSET_END_QUOTE;
            }

            private static Map<State, List<State>> buildTransitions() {
                return new TransitionMapBuilder()
                        .sequence(
                                _INITIAL,
                                META_BOE,
                                META_M, META_E, META_T, META_A,
                                SPACES_AFTER_META)
                        .add(SPACES_AFTER_META,
                                SPACES_AFTER_META,
                                HTTP_EQUIV_H,  // http-equiv=~
                                CHARSET_C)     // charset=~

                        // http-equiv="Content-Type"
                        .sequence(
                                HTTP_EQUIV_H, HTTP_EQUIV_T1, HTTP_EQUIV_T2, HTTP_EQUIV_P,
                                HTTP_EQUIV_HYPHEN,
                                HTTP_EQUIV_E, HTTP_EQUIV_Q, HTTP_EQUIV_U, HTTP_EQUIV_I, HTTP_EQUIV_V,
                                HTTP_EQUIV_EQ, HTTP_EQUIV_BEGIN_QUOTE,
                                HTTP_EQUIV_CONTENT_TYPE_C, HTTP_EQUIV_CONTENT_TYPE_O,
                                HTTP_EQUIV_CONTENT_TYPE_N1, HTTP_EQUIV_CONTENT_TYPE_T1,
                                HTTP_EQUIV_CONTENT_TYPE_E1, HTTP_EQUIV_CONTENT_TYPE_N2,
                                HTTP_EQUIV_CONTENT_TYPE_T2, HTTP_EQUIV_CONTENT_TYPE_HYPHEN,
                                HTTP_EQUIV_CONTENT_TYPE_T3, HTTP_EQUIV_CONTENT_TYPE_Y,
                                HTTP_EQUIV_CONTENT_TYPE_P, HTTP_EQUIV_CONTENT_TYPE_E2,
                                HTTP_EQUIV_END_QUOTE, SPACES_AFTER_HTTP_EQUIV)
                        .add(SPACES_AFTER_HTTP_EQUIV,
                                SPACES_AFTER_HTTP_EQUIV,
                                CONTENT_C)

                        // content="text/html; charset=*"
                        .sequence(
                                CONTENT_C, CONTENT_O, CONTENT_N1, CONTENT_T1, CONTENT_E,
                                CONTENT_N2, CONTENT_T2,
                                CONTENT_EQ, CONTENT_BEGIN_QUOTE,
                                CONTENT_TEXT_HTML_T1, CONTENT_TEXT_HTML_E, CONTENT_TEXT_HTML_X,
                                CONTENT_TEXT_HTML_T2, CONTENT_TEXT_HTML_SL, CONTENT_TEXT_HTML_H,
                                CONTENT_TEXT_HTML_T3, CONTENT_TEXT_HTML_M, CONTENT_TEXT_HTML_L,
                                CONTENT_TEXT_HTML_SC)
                        .add(CONTENT_TEXT_HTML_SC,
                                CONTENT_SPACES_BEFORE_CHARSET,
                                CONTENT_CHARSET_C)
                        .add(CONTENT_SPACES_BEFORE_CHARSET,
                                CONTENT_SPACES_BEFORE_CHARSET,
                                CONTENT_CHARSET_C)
                        .sequence(
                                CONTENT_CHARSET_C, CONTENT_CHARSET_H, CONTENT_CHARSET_A,
                                CONTENT_CHARSET_R, CONTENT_CHARSET_S, CONTENT_CHARSET_E,
                                CONTENT_CHARSET_T,
                                CONTENT_CHARSET_EQ, CONTENT_CHARSET_VALUE)
                        .add(CONTENT_CHARSET_VALUE,
                                CONTENT_CHARSET_VALUE,
                                CONTENT_END_QUOTE)

                        // [case 2] charset="*"
                        .sequence(
                                CHARSET_C, CHARSET_H, CHARSET_A, CHARSET_R, CHARSET_S,
                                CHARSET_E, CHARSET_T,
                                CHARSET_EQ, CHARSET_BEGIN_QUOTE, CHARSET_VALUE)
                        .add(CHARSET_VALUE,
                                CHARSET_VALUE,
                                CHARSET_END_QUOTE)
                        .build();
            }
        }

        /** Fluent helper that assembles the state-to-successors table. */
        private static class TransitionMapBuilder {
            private final Map<State, List<State>> map = new EnumMap<>(State.class);

            /** Registers a linear chain: each state's only successor is the state after it. */
            TransitionMapBuilder sequence(State... states) {
                for (int i = 0; i < states.length - 1; i++) {
                    add(states[i], states[i + 1]);
                }
                return this;
            }

            /**
             * Registers the successors of {@code from}, in the order they are to be tried.
             *
             * @throws IllegalStateException if {@code from} already has successors; replacing
             *         them silently would hide a mistake in the table definition
             */
            TransitionMapBuilder add(State from, State... to) {
                if (map.put(from, Arrays.asList(to)) != null) {
                    throw new IllegalStateException("Transitions already defined for " + from);
                }
                return this;
            }

            Map<State, List<State>> build() {
                return map;
            }
        }

        /**
         * Advances the machine by one character.
         *
         * @param current state the machine is in
         * @param ch      input character; compared case-insensitively
         * @return the first successor of {@code current} that accepts {@code ch}; if there is
         *         none, the character is retried from {@code _INITIAL} so that a {@code <}
         *         which interrupts a partial match still opens a new one; {@code _INITIAL}
         *         when that fails too
         */
        State next(State current, char ch) {
            char lower = Character.toLowerCase(ch);
            State matched = firstMatch(current, lower);
            if (matched == null && current != State._INITIAL) {
                matched = firstMatch(State._INITIAL, lower);
            }
            return matched != null ? matched : State._INITIAL;
        }

        /** Returns the first successor of {@code from} accepting {@code ch}, or null. */
        private static State firstMatch(State from, char ch) {
            List<State> candidates = State.TRANSITIONS.get(from);
            if (candidates == null) {
                return null; // accepting states have no successors
            }
            for (State candidate : candidates) {
                if (candidate.canTransition(ch)) {
                    return candidate;
                }
            }
            return null;
        }
    }

    private static final FiniteStateMachine FSM = new FiniteStateMachine();

    /**
     * Scans raw HTML bytes for the first supported {@code meta} charset declaration.
     *
     * <p>Each non-negative byte is treated as one character; negative bytes (anything
     * outside 7-bit ASCII) are skipped. Scanning stops at the quote that closes the value,
     * so only the first complete declaration is reported.
     *
     * @param bytes HTML document, or a prefix of it
     * @return the declared charset name with surrounding whitespace removed and its original
     *         letter case kept, or an empty string when no complete declaration is found
     *         (including a value whose closing quote is missing)
     * @throws NullPointerException if {@code bytes} is null
     */
    public static String extractCharsetName(byte[] bytes) {
        FiniteStateMachine.State state = FiniteStateMachine.State._INITIAL;
        StringBuilder value = new StringBuilder();

        for (byte b : bytes) {
            if (b < 0) {
                continue;
            }

            char ch = (char) b;
            state = FSM.next(state, ch);

            if (state.isCaptureNeeded()) {
                value.append(ch);
            } else if (state.completesValue()) {
                return value.toString().trim();
            }
        }

        // Input ended before a value was closed; a partial name would be misleading.
        return "";
    }

    /** Prints the charset extracted from one sample of each supported declaration form. */
    public static void main(String[] args) {
        java.nio.charset.Charset ascii = java.nio.charset.StandardCharsets.US_ASCII;
        System.out.println(extractCharsetName(" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /> ".getBytes(ascii)));
        System.out.println(extractCharsetName(" <meta charset=\"utf-8\" /> ".getBytes(ascii)));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment