Skip to content

Instantly share code, notes, and snippets.

@komiya-atsushi
Created April 6, 2018 07:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save komiya-atsushi/8ec34b1d08f278ed9ddcba55af544ce3 to your computer and use it in GitHub Desktop.
Save komiya-atsushi/8ec34b1d08f278ed9ddcba55af544ce3 to your computer and use it in GitHub Desktop.
HTML の meta 要素から文字エンコーディングを抽出するやつ。
package me.k11i.hoge;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import me.k11i.hoge.HtmlCharsetDetector.FiniteStateMachine.State;
import static me.k11i.hoge.HtmlCharsetDetector.FiniteStateMachine.State.*;
/**
 * Extracts the character-encoding name declared by an HTML {@code <meta>} element.
 *
 * <p>Two declaration forms are recognised, case-insensitively and only with quoted
 * attribute values:
 * <ul>
 *   <li>{@code <meta http-equiv="Content-Type" content="text/html; charset=NAME">}</li>
 *   <li>{@code <meta charset="NAME">}</li>
 * </ul>
 * The attributes must appear in exactly this order, right after {@code <meta} and at
 * least one whitespace character; no whitespace is accepted around {@code =}.
 */
public class HtmlCharsetDetector {

    /**
     * Character-driven state machine that recognises the two supported declarations.
     * The machine itself is stateless; callers keep the current {@link State}.
     */
    static class FiniteStateMachine {

        private static final char[] SPACES = {'\u0020', '\u0009', '\n', '\u000c', '\r'};
        private static final char[] QUOTES = {'\'', '"'};

        /**
         * One state per expected input character (or character class).
         *
         * <p>A non-capturing state is entered when the input character is one of its
         * {@code chars}. A capturing state is entered for every character that is
         * <em>not</em> one of its {@code chars}, i.e. it consumes the attribute value
         * up to the closing quote.
         */
        enum State {
            _INITIAL(),

            // <meta
            META_BOE('<'),
            META_M('m'),
            META_E('e'),
            META_T('t'),
            META_A('a'),
            SPACES_AFTER_META(SPACES),

            // [case 1] http-equiv="Content-Type"
            HTTP_EQUIV_H('h'),
            HTTP_EQUIV_T1('t'),
            HTTP_EQUIV_T2('t'),
            HTTP_EQUIV_P('p'),
            HTTP_EQUIV_HYPHEN('-'),
            HTTP_EQUIV_E('e'),
            HTTP_EQUIV_Q('q'),
            HTTP_EQUIV_U('u'),
            HTTP_EQUIV_I('i'),
            HTTP_EQUIV_V('v'),
            HTTP_EQUIV_EQ('='),
            HTTP_EQUIV_BEGIN_QUOTE(QUOTES),
            HTTP_EQUIV_CONTENT_TYPE_C('c'),
            HTTP_EQUIV_CONTENT_TYPE_O('o'),
            HTTP_EQUIV_CONTENT_TYPE_N1('n'),
            HTTP_EQUIV_CONTENT_TYPE_T1('t'),
            HTTP_EQUIV_CONTENT_TYPE_E1('e'),
            HTTP_EQUIV_CONTENT_TYPE_N2('n'),
            HTTP_EQUIV_CONTENT_TYPE_T2('t'),
            HTTP_EQUIV_CONTENT_TYPE_HYPHEN('-'),
            HTTP_EQUIV_CONTENT_TYPE_T3('t'),
            HTTP_EQUIV_CONTENT_TYPE_Y('y'),
            HTTP_EQUIV_CONTENT_TYPE_P('p'),
            HTTP_EQUIV_CONTENT_TYPE_E2('e'),
            HTTP_EQUIV_END_QUOTE(QUOTES),
            SPACES_AFTER_HTTP_EQUIV(SPACES),

            // content="text/html; charset=*"
            CONTENT_C('c'),
            CONTENT_O('o'),
            CONTENT_N1('n'),
            CONTENT_T1('t'),
            CONTENT_E('e'),
            CONTENT_N2('n'),
            CONTENT_T2('t'),
            CONTENT_EQ('='),
            CONTENT_BEGIN_QUOTE(QUOTES),
            CONTENT_TEXT_HTML_T1('t'),
            CONTENT_TEXT_HTML_E('e'),
            CONTENT_TEXT_HTML_X('x'),
            CONTENT_TEXT_HTML_T2('t'),
            CONTENT_TEXT_HTML_SL('/'),
            CONTENT_TEXT_HTML_H('h'),
            CONTENT_TEXT_HTML_T3('t'),
            CONTENT_TEXT_HTML_M('m'),
            CONTENT_TEXT_HTML_L('l'),
            CONTENT_TEXT_HTML_SC(';'),
            CONTENT_SPACES_BEFORE_CHARSET(SPACES),
            CONTENT_CHARSET_C('c'),
            CONTENT_CHARSET_H('h'),
            CONTENT_CHARSET_A('a'),
            CONTENT_CHARSET_R('r'),
            CONTENT_CHARSET_S('s'),
            CONTENT_CHARSET_E('e'),
            CONTENT_CHARSET_T('t'),
            CONTENT_CHARSET_EQ('='),
            CONTENT_CHARSET_VALUE(true, QUOTES),
            CONTENT_END_QUOTE(QUOTES),

            // [case 2] charset="*"
            CHARSET_C('c'),
            CHARSET_H('h'),
            CHARSET_A('a'),
            CHARSET_R('r'),
            CHARSET_S('s'),
            CHARSET_E('e'),
            CHARSET_T('t'),
            CHARSET_EQ('='),
            CHARSET_BEGIN_QUOTE(QUOTES),
            CHARSET_VALUE(true, QUOTES),
            CHARSET_END_QUOTE(QUOTES),

            // Tail of the element: whitespace followed by "/>".
            SPACES_BEFORE_EOE(SPACES),
            META_EOE_SL('/'),
            META_EOE_GT('>'),

            // Matches no character (empty set, non-capturing), so it is never entered.
            _FINAL();

            /**
             * Candidate successors of every state, tried in list order.
             *
             * <p>The table is built here, inside the enum, so it is populated only
             * after all constants exist and the enclosing class's static initialiser
             * does not depend on this enum.
             */
            private static final Map<State, List<State>> TRANSITIONS;

            static {
                TRANSITIONS = new TransitionMapBuilder()
                        .sequence(
                                _INITIAL,
                                META_BOE,
                                META_M, META_E, META_T, META_A,
                                SPACES_AFTER_META)
                        .add(SPACES_AFTER_META,
                                SPACES_AFTER_META,
                                HTTP_EQUIV_H,   // http-equiv=~
                                CHARSET_C)      // charset=~

                        // http-equiv="Content-Type"
                        .sequence(
                                HTTP_EQUIV_H, HTTP_EQUIV_T1, HTTP_EQUIV_T2, HTTP_EQUIV_P,
                                HTTP_EQUIV_HYPHEN,
                                HTTP_EQUIV_E, HTTP_EQUIV_Q, HTTP_EQUIV_U, HTTP_EQUIV_I, HTTP_EQUIV_V,
                                HTTP_EQUIV_EQ, HTTP_EQUIV_BEGIN_QUOTE,
                                HTTP_EQUIV_CONTENT_TYPE_C, HTTP_EQUIV_CONTENT_TYPE_O,
                                HTTP_EQUIV_CONTENT_TYPE_N1, HTTP_EQUIV_CONTENT_TYPE_T1,
                                HTTP_EQUIV_CONTENT_TYPE_E1, HTTP_EQUIV_CONTENT_TYPE_N2,
                                HTTP_EQUIV_CONTENT_TYPE_T2, HTTP_EQUIV_CONTENT_TYPE_HYPHEN,
                                HTTP_EQUIV_CONTENT_TYPE_T3, HTTP_EQUIV_CONTENT_TYPE_Y,
                                HTTP_EQUIV_CONTENT_TYPE_P, HTTP_EQUIV_CONTENT_TYPE_E2,
                                HTTP_EQUIV_END_QUOTE, SPACES_AFTER_HTTP_EQUIV)
                        .add(SPACES_AFTER_HTTP_EQUIV,
                                SPACES_AFTER_HTTP_EQUIV,
                                CONTENT_C)

                        // content="text/html; charset=*"
                        .sequence(
                                CONTENT_C, CONTENT_O, CONTENT_N1, CONTENT_T1,
                                CONTENT_E, CONTENT_N2, CONTENT_T2,
                                CONTENT_EQ, CONTENT_BEGIN_QUOTE,
                                CONTENT_TEXT_HTML_T1, CONTENT_TEXT_HTML_E, CONTENT_TEXT_HTML_X,
                                CONTENT_TEXT_HTML_T2, CONTENT_TEXT_HTML_SL,
                                CONTENT_TEXT_HTML_H, CONTENT_TEXT_HTML_T3, CONTENT_TEXT_HTML_M,
                                CONTENT_TEXT_HTML_L, CONTENT_TEXT_HTML_SC)
                        .add(CONTENT_TEXT_HTML_SC,
                                CONTENT_SPACES_BEFORE_CHARSET,
                                CONTENT_CHARSET_C)
                        .add(CONTENT_SPACES_BEFORE_CHARSET,
                                CONTENT_SPACES_BEFORE_CHARSET,
                                CONTENT_CHARSET_C)
                        .sequence(
                                CONTENT_CHARSET_C, CONTENT_CHARSET_H, CONTENT_CHARSET_A,
                                CONTENT_CHARSET_R, CONTENT_CHARSET_S, CONTENT_CHARSET_E,
                                CONTENT_CHARSET_T,
                                CONTENT_CHARSET_EQ, CONTENT_CHARSET_VALUE)
                        .add(CONTENT_CHARSET_VALUE,
                                CONTENT_CHARSET_VALUE,
                                CONTENT_END_QUOTE)
                        .add(CONTENT_END_QUOTE, SPACES_BEFORE_EOE)

                        // [case 2] charset="*"
                        .sequence(
                                CHARSET_C, CHARSET_H, CHARSET_A, CHARSET_R,
                                CHARSET_S, CHARSET_E, CHARSET_T,
                                CHARSET_EQ, CHARSET_BEGIN_QUOTE, CHARSET_VALUE)
                        .add(CHARSET_VALUE,
                                CHARSET_VALUE,
                                CHARSET_END_QUOTE)
                        .add(CHARSET_END_QUOTE, SPACES_BEFORE_EOE)

                        // />
                        .add(SPACES_BEFORE_EOE,
                                SPACES_BEFORE_EOE,
                                META_EOE_SL)
                        .sequence(META_EOE_SL, META_EOE_GT, _FINAL)
                        .add(_FINAL, _FINAL)
                        .build();
            }

            private final boolean capture;
            private final char[] chars;

            /** Creates a non-capturing state entered on any of {@code chars}. */
            State(char... chars) {
                this(false, chars);
            }

            /**
             * @param capture when {@code true} the state is entered on every character
             *                that is NOT in {@code chars}, and the caller is expected to
             *                record those characters
             * @param chars   the character set the state is defined by
             */
            State(boolean capture, char... chars) {
                this.capture = capture;
                this.chars = chars;
            }

            /**
             * Tells whether this state may be entered on {@code ch}.
             *
             * @param ch input character; callers pass it already lower-cased
             * @return for a capturing state, {@code true} unless {@code ch} is one of
             *         its characters; otherwise {@code true} only if it is one of them
             */
            boolean canTransition(char ch) {
                // A capturing state is delimited by its characters, hence the inversion.
                return capture != contains(ch);
            }

            private boolean contains(char ch) {
                for (char candidate : chars) {
                    if (candidate == ch) {
                        return true;
                    }
                }
                return false;
            }

            /** @return {@code true} if characters consumed in this state must be recorded */
            boolean isCaptureNeeded() {
                return capture;
            }

            /** @return the states reachable from this one, in the order they are tried */
            private List<State> successors() {
                return TRANSITIONS.get(this);
            }
        }

        /** Small fluent helper that assembles the successor table. */
        private static class TransitionMapBuilder {
            private final Map<State, List<State>> map = new EnumMap<>(State.class);

            /** Chains the states: each one gets the following state as its only successor. */
            TransitionMapBuilder sequence(State... states) {
                for (int i = 0; i < states.length - 1; i++) {
                    add(states[i], states[i + 1]);
                }
                return this;
            }

            /** Sets (replacing any previous entry) the successors of {@code from}. */
            TransitionMapBuilder add(State from, State... to) {
                map.put(from, Arrays.asList(to));
                return this;
            }

            Map<State, List<State>> build() {
                return map;
            }
        }

        /**
         * Advances the machine by one character.
         *
         * <p>The character is lower-cased, then the successors of {@code current} are
         * tried in order. If none accepts it, the same character is offered to the
         * successors of {@link State#_INITIAL}, so that a {@code <} that breaks a
         * partial match can still open a new one.
         *
         * @param current the state the machine is in
         * @param ch      the next input character
         * @return the state entered, or {@link State#_INITIAL} if nothing matched
         */
        State next(State current, char ch) {
            char lowered = Character.toLowerCase(ch);

            State matched = firstAccepting(current, lowered);
            if (matched == null && current != State._INITIAL) {
                // Do not lose the character that ended a failed partial match.
                matched = firstAccepting(State._INITIAL, lowered);
            }
            return matched != null ? matched : State._INITIAL;
        }

        /** Returns the first successor of {@code from} accepting {@code ch}, or {@code null}. */
        private static State firstAccepting(State from, char ch) {
            for (State candidate : from.successors()) {
                if (candidate.canTransition(ch)) {
                    return candidate;
                }
            }
            return null;
        }
    }

    private static final FiniteStateMachine FSM = new FiniteStateMachine();

    /**
     * Scans raw HTML bytes for the first supported {@code <meta>} charset declaration.
     *
     * <p>Bytes with the high bit set are skipped, so only ASCII characters drive the
     * matcher. Scanning stops at the closing quote of the first value found; later
     * declarations are ignored. If the input ends inside a value, the characters read
     * so far are returned.
     *
     * @param bytes the document, or a prefix of it
     * @return the declared name with its original letter case, trimmed; an empty
     *         string if no declaration was found
     * @throws NullPointerException if {@code bytes} is {@code null}
     */
    public static String extractCharsetName(byte[] bytes) {
        FiniteStateMachine.State state = FiniteStateMachine.State._INITIAL;
        StringBuilder capturedChars = new StringBuilder();

        for (byte b : bytes) {
            if (b < 0) {
                continue;
            }
            char ch = (char) b;

            boolean wasCapturing = state.isCaptureNeeded();
            state = FSM.next(state, ch);

            if (state.isCaptureNeeded()) {
                capturedChars.append(ch);
            } else if (wasCapturing) {
                // A capturing state can only be left through the closing quote:
                // the value is complete, and reading on would append other values.
                break;
            }
        }

        return capturedChars.toString().trim();
    }

    /** Prints the charset extracted from one sample of each supported declaration. */
    public static void main(String[] args) {
        java.nio.charset.Charset ascii = java.nio.charset.StandardCharsets.US_ASCII;
        System.out.println(extractCharsetName(" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /> ".getBytes(ascii)));
        System.out.println(extractCharsetName(" <meta charset=\"utf-8\" /> ".getBytes(ascii)));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment