Last active
August 29, 2015 14:06
-
-
Save sinkuu/b82c945ab7c533ab6c6e to your computer and use it in GitHub Desktop.
WIP: poor HTML parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import std.algorithm; | |
import std.array; | |
import std.ascii; | |
import std.range; | |
import std.traits; | |
import std.variant; | |
import std.utf; | |
/**
 * Pull-style HTML tokenizer presented as an input range of `ParseEvent`s.
 *
 * Constructing the parser parses the first token right away; every later
 * `popFront` parses the next one. The range becomes empty once the input is
 * exhausted; a token cut short by the end of the input is usually dropped
 * rather than reported.
 *
 * When `R` is a string type, event contents are slices of the input. For any
 * other range they are arrays built with `array`.
 *
 * Params:
 *     R = input range whose element encoding type is a character type
 */
struct HTMLParser(R) if (isInputRange!R)
{
    static assert(isSomeChar!(ElementEncodingType!R));

    /// Array type used for every piece of text carried by an event.
    alias EArr = ElementEncodingType!R[];

    /// Stores `input` and parses the first event.
    this(R input)
    {
        _input = input;
        // initial parsing
        popFront();
    }

    /// True when no further event is available.
    @property
    bool empty() const @safe pure nothrow @nogc
    {
        return _empty;
    }

    /// The event produced by the most recent `popFront`. Must not be called when `empty`.
    @property
    ParseEvent front() const @safe pure nothrow @nogc
    {
        assert(!empty);
        return _current;
    }

    /**
     * Parses the next token from the input and stores it as the current event.
     *
     * Leading white space is skipped first. Anything that does not start with
     * '<' becomes a `Text` event; otherwise the token is an end tag, a
     * comment, a CDATA section, a DOCTYPE (only as the very first event) or a
     * start tag with its attributes. Running out of input at one of the
     * checked points marks the range as empty.
     */
    void popFront()
    {
        static if (isSomeString!R)
        {
            // View over `_input` that advances `_input` itself, one code unit
            // at a time, as it is consumed.
            auto refr = refArrayRange(&_input);
        }

        skipWhiteSpace();
        if (_input.empty) goto Lempty;

        if (_input.front != '<')
        {
            Text t;

            static if (isSomeString!R)
            {
                auto s = _input;
                t.content = s[0 .. refr.until('<').walkLength];

                // Drop trailing white space from the slice.
                // NOTE(review): only the string path strips it; the generic path below does not.
                auto rstripLength = t.content.byCodeUnit.retro.until!(c => !c.isWhite).walkLength;
                t.content = t.content[0 .. t.content.length - rstripLength];
            }
            else
            {
                t.content = refRange(&_input).until('<').array;
            }

            _current = t;
            _empty = false;
            _state = ParserState.tagOrText;
            return;
        }

        // skip '<'
        static if (isSomeString!R)
            _input = _input[1 .. $];
        else
            _input.popFront();

        if (_input.empty) goto Lempty;

        if (_input.skipOver('/'))
        {
            // End tag: "</name ... >"
            if (_input.empty) goto Lempty;

            EndElement end;

            static if (isSomeString!R)
            {
                auto s = _input;
                end.name = s[0 .. refr.until!(c => !c.isAlphaNum).walkLength];
            }
            else
            {
                end.name = refRange(&_input).until!(c => !c.isAlphaNum).array;
            }

            _current = end;
            skipOverClosing();
        }
        else if (_input.skipOver('!'))
        {
            if (_input.skipOver("--")) // comment
            {
                Comment c;

                static if (isSomeString!R)
                {
                    auto s = _input;
                    auto len = refr.findSplitBefore("-->")[0].length;
                    c.content = s[0 .. len];
                }
                else
                {
                    c.content = refRange(&_input).findSplitBefore("-->")[0].array;
                }

                _input.popFrontN(3); // skip "-->"
                _current = c;
            }
            else if (_input.skipOver("[CDATA["))
            {
                RawText t;
                t._isCData = true;

                static if (isSomeString!R)
                {
                    auto s = _input;
                    auto len = refr.findSplitBefore("]]>")[0].length;
                    assert(len < s.length);
                    t.content = s[0 .. len];
                }
                else
                {
                    t.content = refRange(&_input).findSplitBefore("]]>")[0].array;
                }

                _input.popFrontN(3); // skip "]]>"
                _current = t;
            }
            else if (_state == ParserState.start && _input.istartsWith("DOCTYPE "))
            {
                // NOTE(review): the closing '>' is left in the input here, so
                // the event after the DOCTYPE is a Text event starting with
                // '>'. The round-trip unittest in this file relies on that.
                static if (isSomeString!R)
                {
                    auto s = _input;
                    auto type = s[0 .. refr.until('>').walkLength];
                }
                else
                {
                    auto type = refRange(&_input).until('>').array;
                }

                _current = StartDocument(true, type);
                _state = ParserState.tagOrText;
            }
            else
            {
                // Unrecognised "<!...>" declaration: discard it and parse
                // whatever follows. Falling through instead would report the
                // previous event a second time, because `_current` is not
                // assigned on this path.
                skipOverClosing();
                popFront();
                return;
            }
        }
        else
        {
            // Start tag: "<name attr=value ... >"
            StartElement start;

            static if (isSomeString!R)
            {
                auto s = _input;
                start.name = s[0 .. refr.until!(c => !c.isAlphaNum).walkLength];
            }
            else
            {
                start.name = refRange(&_input).until!(c => !c.isAlphaNum).array;
            }

            skipWhiteSpace();
            if (_input.empty) goto Lempty;

            if (!_input.startsWith('>'))
            {
                while (true)
                {
                    static if (isSomeString!R)
                    {
                        auto t = _input;
                        auto name = t[0 .. refr
                            .until!(c => !c.isValidAttrNameChar).walkLength];
                    }
                    else
                    {
                        auto name = refRange(&_input).until!(c => !c.isValidAttrNameChar).array;
                    }

                    // No attribute name here (for example the '/' of "/>"): stop.
                    if (name.empty) break;

                    skipWhiteSpace();
                    if (_input.empty) goto Lempty;

                    if (_input.skipOver('='))
                    {
                        skipWhiteSpace();
                        if (_input.empty) goto Lempty;

                        EArr value;
                        dchar quote = _input.front;

                        if (quote == '"' || quote == '\'') // quoted value
                        {
                            _input.popFront();
                            if (_input.empty) goto Lempty;

                            static if (isSomeString!R)
                            {
                                t = _input;
                                value = t[0 .. refr.until(quote).walkLength];
                            }
                            else
                            {
                                value = refRange(&_input).until(quote).array;
                            }

                            // The closing quote is missing when the input is
                            // truncated; popping an empty range would be an error.
                            if (_input.empty) goto Lempty;
                            _input.popFront(); // skip the closing quote
                            if (_input.empty) goto Lempty;
                        }
                        else // unquoted value
                        {
                            static if (isSomeString!R)
                            {
                                t = _input;
                                value = t[0 .. refr
                                    .until!(c => !c.isValidUnquotedAttrValueChar).walkLength];
                            }
                            else
                            {
                                value = refRange(&_input).until!(c => !c.isValidUnquotedAttrValueChar)
                                    .array;
                            }
                        }

                        start.attributes ~= Attribute(true, name, value);
                    }
                    else
                    {
                        start.attributes ~= Attribute(false, name, null);
                    }

                    skipWhiteSpace();
                    if (_input.empty || _input.startsWith('>')) break;
                }
            }

            _current = start;
            skipOverClosing();
        }

        _empty = false;
        if (_state == ParserState.start) _state = ParserState.tagOrText;
        return;

    Lempty:
        _empty = true;
    }

    /// One parsed token; exactly one of the listed event structs.
    alias ParseEvent = Algebraic!(
        StartDocument,
        StartElement,
        EndElement,
        Text,
        RawText,
        Comment,
    );

    /// Document prologue. `popFront` only produces it for "<!DOCTYPE ...".
    static struct StartDocument
    {
        private bool _isDoctype;
        @property bool isDoctype() const { return _isDoctype; }
        /// Text between "<!" and '>' (for a DOCTYPE this includes the word "DOCTYPE").
        EArr type;
    }

    /// Opening tag with its attributes in source order.
    static struct StartElement
    {
        EArr name;
        Attribute[] attributes;
    }

    /// A single attribute of a start tag.
    static struct Attribute
    {
        // NOTE(review): despite the name, popFront sets this flag to true for
        // attributes written with a value ("name=value") and to false for a
        // bare "name". The unittest in this file depends on that meaning.
        private bool _empty;
        @property bool empty() const { return _empty; }
        EArr name;
        /// Attribute value without surrounding quotes; null for a bare attribute.
        EArr value;
    }

    /// Closing tag.
    static struct EndElement
    {
        EArr name;
    }

    /// Character data found between tags.
    static struct Text
    {
        EArr content;
    }

    /// Unparsed text; `isCData` is set when it came from "<![CDATA[ ... ]]>".
    static struct RawText
    {
        private bool _isCData;
        @property bool isCData() const { return _isCData; }
        EArr content;
    }

    /// Body of a "<!-- ... -->" comment, without the delimiters.
    static struct Comment
    {
        EArr content;
    }

private:
    R _input;
    ParseEvent _current;
    bool _empty = true;

    enum ParserState
    {
        start,
        tagOrText,
        //attribute
    }

    ParserState _state = ParserState.start;

    // skips zero or more space characters
    void skipWhiteSpace()
    {
        static if (isSomeString!R)
        {
            while (_input.length > 0 && _input[0].isWhite) _input = _input[1 .. $];
        }
        else
        {
            while (!_input.empty && _input.front.isWhite) _input.popFront();
        }
    }

    // Advances past the next '>' (inclusive), or to the end of the input when there is none.
    void skipOverClosing()
    {
        static if (isSomeString!R)
        {
            while (_input.length > 0 && _input[0] != '>') _input = _input[1 .. $];
            if (_input.length > 0) _input = _input[1 .. $];
        }
        else
        {
            while (!_input.empty && _input.front != '>') _input.popFront();
            if (!_input.empty) _input.popFront();
        }
    }
}
/// Convenience factory: builds an `HTMLParser` over `html`, inferring the range type.
auto htmlParser(T)(T html)
{
    alias Parser = HTMLParser!T;
    return Parser(html);
}
// Round trip: tokenize a document and rebuild it from the events; the rebuilt
// text must equal the original.
unittest
{
    auto html = `<!DOCTYPE html><html lang="ja" xmlns:og="http://www.example.com/ns#"><body>` ~
        `<h1 data-utf="試験"><![CDATA[It works!]]></h1><p id="msg" class="txt">テスト</p><!--コメント-->` ~
        `</body></html>`;

    auto parser = htmlParser(html);
    alias P = typeof(parser);
    auto rebuilt = appender!(string);

    foreach (event; parser)
    {
        event.tryVisit!(
            (P.StartDocument doc)
            {
                rebuilt ~= "<" ~ (doc.isDoctype ? "!" : "?");
                rebuilt ~= doc.type;
                if (!doc.isDoctype) rebuilt ~= "?";
            },
            (P.StartElement open)
            {
                rebuilt ~= "<" ~ open.name;
                foreach (attr; open.attributes)
                {
                    rebuilt ~= " " ~ attr.name;
                    // `empty` is true for attributes that carry a value (see Attribute).
                    if (attr.empty) rebuilt ~= `="` ~ attr.value ~ `"`;
                }
                rebuilt ~= '>';
            },
            (P.EndElement close)
            {
                rebuilt ~= "</" ~ close.name ~ ">";
            },
            (P.Text text)
            {
                rebuilt ~= text.content;
            },
            (P.RawText raw)
            {
                if (raw.isCData) rebuilt ~= "<![CDATA[";
                rebuilt ~= raw.content;
                if (raw.isCData) rebuilt ~= "]]>";
            },
            (P.Comment comment)
            {
                rebuilt ~= "<!--" ~ comment.content ~ "-->";
            });
    }

    assert(equal(html, rebuilt.data), rebuilt.data);
}
private: | |
/// Case-insensitive `startsWith`: true when `r1` begins with `r2`, comparing
/// element by element with `std.uni.sicmp`.
bool istartsWith(R1, R2)(R1 r1, R2 r2) if (isInputRange!R1 && isInputRange!R2)
{
    import std.uni : sicmp;

    // `sicmp` works on strings, so each code point is wrapped in a
    // one-element buffer before comparing.
    static bool sameIgnoringCase(dchar lhs, dchar rhs)
    {
        dchar[1] left = lhs;
        dchar[1] right = rhs;
        return sicmp(left[], right[]) == 0;
    }

    return startsWith!sameIgnoringCase(r1, r2);
}
unittest
{
    // Letter case is ignored ...
    assert(istartsWith("HELLO", "hEllO"));
    // ... but a different non-letter character is still a mismatch.
    assert(!istartsWith("HELLO", "*EllO"));
}
/// True when `c` may appear in an attribute name: not NUL, not white space,
/// not a control character and none of `"`, `'`, `>`, `/`, `=`.
bool isValidAttrNameChar(dchar c) @safe pure nothrow
{
    import std.uni : isControl, isWhite;

    if (c == 0 || isWhite(c)) return false;
    if (c.among('"', '\'', '>', '/', '=') != 0) return false;
    return !isControl(c);
}
/// True when `c` may appear in an unquoted attribute value: not white space
/// and none of `"`, `'`, `=`, `<`, `>`, or a backtick.
bool isValidUnquotedAttrValueChar(dchar c) @safe pure nothrow
{
    import std.uni : isWhite;

    if (isWhite(c)) return false;
    return c.among('"', '\'', '=', '<', '>', '`') == 0;
}
/**
Wraps a pointer to a dynamic array as a random-access range. Consuming the
range (`popFront` / `popBack`) re-slices the array that `arr` points to, and
elements are accessed by plain indexing (for strings: one code unit each).

`save` and the two-argument slice return ranges backed by a newly allocated
copy of the slice, so advancing them leaves the original array untouched.

Params:
    arr = non-null pointer to the array to iterate

See_Also: std.range.RefRange
*/
auto refArrayRange(T)(T* arr)
    if (isDynamicArray!T)
{
    static struct RefArrayRangeImpl
    {
        private T* _ptr;

        invariant
        {
            assert(_ptr !is null);
        }

        // Puts `slice` into a freshly allocated one-element array and returns
        // a range that refers to that cell.
        private static RefArrayRangeImpl boxed(T slice)
        {
            auto cell = [slice];
            return RefArrayRangeImpl(cell.ptr);
        }

        @property bool empty() const
        {
            return (*_ptr).length == 0;
        }

        @property size_t length() const
        {
            return (*_ptr).length;
        }

        alias opDollar = length;

        @property auto ref front() const
        {
            assert(!empty);
            return (*_ptr)[0];
        }

        @property auto ref back() const
        {
            return (*_ptr)[$ - 1];
        }

        void popFront()
        {
            assert(!empty);
            *_ptr = (*_ptr)[1 .. $];
        }

        void popBack()
        {
            assert(!empty);
            *_ptr = (*_ptr)[0 .. $ - 1];
        }

        auto ref opIndex(size_t i) const
        {
            return (*_ptr)[i];
        }

        // Sub-range over elements [a, b); independent of the referenced array.
        RefArrayRangeImpl opSlice(size_t a, size_t b)
        {
            return boxed((*_ptr)[a .. b]);
        }

        // The referenced array itself.
        auto ref opSlice()
        {
            return *_ptr;
        }

        // Snapshot of the current position; independent of the referenced array.
        @property
        auto save()
        {
            return boxed(*_ptr);
        }
    }

    return RefArrayRangeImpl(arr);
}
unittest
{
    int[3] storage = [1, 2, 3];
    int[] view = storage[];
    int[] original = view;

    auto range = refArrayRange(&view);
    alias Range = typeof(range);

    // Range category checks.
    static assert(isInputRange!Range);
    static assert(isBidirectionalRange!Range);
    static assert(isRandomAccessRange!Range);
    static assert(isForwardRange!Range);
    static assert(hasSlicing!Range);
    static assert(hasLength!Range);

    // Element access, length and slicing.
    assert(range.front == 1);
    assert(range[1] == 2);
    assert(range.length == 3);
    assert(range[0 .. 3][0 .. 2].length == 2);
    assert(range[] is view);

    // Popping advances the referenced array but not a saved copy.
    auto checkpoint = range.save;
    range.popFront();
    assert(equal(view, only(2, 3)));
    assert(checkpoint.length == 3);
    assert(*checkpoint._ptr is original);

    // Assigning the saved copy back restores the full sequence.
    range = checkpoint;
    assert(equal(range, original));
}
pure nothrow @nogc
unittest
{
    // Strings are walked by code unit: three 3-byte UTF-8 characters give 9.
    string narrow = "テスト";
    assert(refArrayRange(&narrow).walkLength == 9);

    // `array` gives back the matching string type for each character width.
    static assert(is(typeof(refArrayRange(&narrow).array) == string));

    wstring wide = "テスト"w;
    static assert(is(typeof(refArrayRange(&wide).array) == wstring));

    dstring utf32 = "テスト"d;
    static assert(is(typeof(refArrayRange(&utf32).array) == dstring));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment