Skip to content

Instantly share code, notes, and snippets.

@sinkuu
Last active August 29, 2015 14:06
Show Gist options
  • Save sinkuu/b82c945ab7c533ab6c6e to your computer and use it in GitHub Desktop.
Save sinkuu/b82c945ab7c533ab6c6e to your computer and use it in GitHub Desktop.
WIP: poor HTML parser
import std.algorithm;
import std.array;
import std.ascii;
import std.range;
import std.traits;
import std.variant;
import std.utf;
/**
 * Minimal, forgiving HTML tokenizer exposed as an input range of `ParseEvent`s.
 *
 * Each `popFront` consumes one token from the underlying character range and
 * stores it as the current event: a doctype, a start tag (with attributes), an
 * end tag, a text run, a CDATA section or a comment.
 *
 * When `R` is a string type, event payloads are slices of the input; for any
 * other input range they are freshly built arrays.
 *
 * Input that ends in the middle of a token ends the range; the partially
 * parsed token is not reported.
 *
 * Params:
 *     R = input range whose element encoding type is a character type
 */
struct HTMLParser(R) if (isInputRange!R)
{
    static assert(isSomeChar!(ElementEncodingType!R));

    /// Array type used for every textual payload carried by an event.
    alias EArr = ElementEncodingType!R[];

    /// Stores `input` and immediately parses the first event.
    this(R input)
    {
        _input = input;
        // initial parsing
        popFront();
    }

    /// `true` once no further event could be parsed.
    @property
    bool empty() const @safe pure nothrow @nogc
    {
        return _empty;
    }

    /// The most recently parsed event. Must not be called when `empty`.
    @property
    ParseEvent front() const @safe pure nothrow @nogc
    {
        assert(!empty);
        return _current;
    }

    /// Parses the next token from the input and makes it the current event.
    void popFront()
    {
        static if (isSomeString!R)
        {
            // Range view over `_input` whose popFront re-slices `_input`
            // itself, so walking it advances the parser by code units.
            auto refr = refArrayRange(&_input);
        }

        skipWhiteSpace();
        if (_input.empty) goto Lempty;

        if (_input.front != '<')
        {
            // Text run: everything up to (not including) the next '<'.
            Text t;
            static if (isSomeString!R)
            {
                auto s = _input;
                t.content = s[0 .. refr.until('<').walkLength];
                // Trailing white space is trimmed from the slice.
                auto rstripLength = t.content.byCodeUnit.retro.until!(c => !c.isWhite).walkLength;
                t.content = t.content[0 .. t.content.length - rstripLength];
            }
            else
            {
                t.content = refRange(&_input).until('<').array;
            }
            _current = t;
            _empty = false;
            _state = ParserState.tagOrText;
            return;
        }

        // skip '<'
        static if (isSomeString!R)
            _input = _input[1 .. $];
        else
            _input.popFront();
        if (_input.empty) goto Lempty;

        if (_input.skipOver('/'))
        {
            // End tag: "</name ... >"
            if (_input.empty) goto Lempty;
            EndElement end;
            static if (isSomeString!R)
            {
                auto s = _input;
                end.name = s[0 .. refr.until!(c => !c.isAlphaNum).walkLength];
            }
            else
            {
                end.name = refRange(&_input).until!(c => !c.isAlphaNum).array;
            }
            _current = end;
            skipOverClosing();
        }
        else if (_input.skipOver('!'))
        {
            if (_input.skipOver("--")) // comment
            {
                Comment c;
                static if (isSomeString!R)
                {
                    auto s = _input;
                    auto len = refr.findSplitBefore("-->")[0].length;
                    c.content = s[0 .. len];
                }
                else
                {
                    c.content = refRange(&_input).findSplitBefore("-->")[0].array;
                }
                _input.popFrontN(3); // skip "-->"
                _current = c;
            }
            else if (_input.skipOver("[CDATA["))
            {
                RawText t;
                t._isCData = true;
                static if (isSomeString!R)
                {
                    auto s = _input;
                    auto len = refr.findSplitBefore("]]>")[0].length;
                    assert(len < s.length);
                    t.content = s[0 .. len];
                }
                else
                {
                    t.content = refRange(&_input).findSplitBefore("]]>")[0].array;
                }
                _input.popFrontN(3); // skip "]]>"
                _current = t;
            }
            else if (_state == ParserState.start && _input.istartsWith("DOCTYPE "))
            {
                // Only recognised before any other event has been produced.
                static if (isSomeString!R)
                {
                    auto s = _input;
                    auto type = s[0 ..
                        refr.until('>').walkLength];
                }
                else
                {
                    auto type = refRange(&_input).until('>').array;
                }
                // NOTE(review): the terminating '>' does not appear to be
                // consumed here, so it looks like the next call reports it as
                // a Text event. The round-trip unittest in this file relies on
                // that, so it is left as is - confirm whether it is intended.
                _current = StartDocument(true, type);
                _state = ParserState.tagOrText;
            }
            else
            {
                // Unrecognised "<!...>" declaration. It produces no event of
                // its own, so discard it and parse whatever follows instead of
                // reporting the previous (stale) event a second time.
                skipOverClosing();
                if (_state == ParserState.start) _state = ParserState.tagOrText;
                popFront();
                return;
            }
        }
        else
        {
            // Start tag: "<name attr attr=value ... >"
            StartElement start;
            static if (isSomeString!R)
            {
                auto s = _input;
                start.name = s[0 ..
                    refr.until!(c => !c.isAlphaNum).walkLength];
            }
            else
            {
                start.name = refRange(&_input).until!(c => !c.isAlphaNum).array;
            }
            skipWhiteSpace();
            if (_input.empty) goto Lempty;

            if (!_input.startsWith('>'))
            {
                while (true)
                {
                    static if (isSomeString!R)
                    {
                        auto t = _input;
                        auto name = t[0 .. refr
                            .until!(c => !c.isValidAttrNameChar).walkLength];
                    }
                    else
                    {
                        auto name = refRange(&_input).until!(c => !c.isValidAttrNameChar).array;
                    }
                    // No attribute name here (e.g. at '/' or '>'): stop.
                    if (name.empty) break;

                    skipWhiteSpace();
                    if (_input.empty) goto Lempty;

                    if (_input.skipOver('='))
                    {
                        skipWhiteSpace();
                        if (_input.empty) goto Lempty;

                        EArr value;
                        dchar quote = _input.front;
                        if (quote == '"' || quote == '\'') // quoted value
                        {
                            _input.popFront();
                            if (_input.empty) goto Lempty;
                            static if (isSomeString!R)
                            {
                                t = _input;
                                value = t[0 .. refr.until(quote).walkLength];
                            }
                            else
                            {
                                value = refRange(&_input).until(quote).array;
                            }
                            // The scan stops in front of the closing quote. If
                            // the input ran out instead (unterminated value)
                            // there is nothing left to pop.
                            if (_input.empty) goto Lempty;
                            _input.popFront(); // skip closing quote
                            if (_input.empty) goto Lempty;
                        }
                        else // unquoted value
                        {
                            static if (isSomeString!R)
                            {
                                t = _input;
                                value = t[0 .. refr
                                    .until!(c => !c.isValidUnquotedAttrValueChar).walkLength];
                            }
                            else
                            {
                                value = refRange(&_input).until!(c => !c.isValidUnquotedAttrValueChar)
                                    .array;
                            }
                        }
                        start.attributes ~= Attribute(true, name, value);
                    }
                    else
                    {
                        start.attributes ~= Attribute(false, name, null);
                    }

                    skipWhiteSpace();
                    if (_input.empty || _input.startsWith('>')) break;
                }
            }
            _current = start;
            skipOverClosing();
        }

        _empty = false;
        if (_state == ParserState.start) _state = ParserState.tagOrText;
        return;

    Lempty:
        // Input exhausted (possibly in the middle of a token): end the range.
        _empty = true;
    }

    /// Tagged union of every event kind the parser can produce.
    alias ParseEvent = Algebraic!(
        StartDocument,
        StartElement,
        EndElement,
        Text,
        RawText,
        Comment,
    );

    /// Document prologue; the parser creates it for a leading `<!DOCTYPE ...`.
    static struct StartDocument
    {
        private bool _isDoctype;
        /// The parser sets this to `true` for a DOCTYPE declaration.
        @property bool isDoctype() const { return _isDoctype; }
        /// Text between "<!" and '>' (e.g. "DOCTYPE html").
        EArr type;
    }

    /// Opening tag together with its attributes in source order.
    static struct StartElement
    {
        EArr name;
        Attribute[] attributes;
    }

    /// A single attribute of a start tag.
    static struct Attribute
    {
        private bool _empty;
        /**
         * Returns the flag stored by the parser: `true` when the attribute
         * name was followed by `=` and a value, `false` for a bare attribute.
         *
         * NOTE(review): the name reads as the opposite of what is stored;
         * the unittest in this file relies on the current meaning.
         */
        @property bool empty() const { return _empty; }
        EArr name;
        /// Attribute value without surrounding quotes; `null` for a bare attribute.
        EArr value;
    }

    /// Closing tag.
    static struct EndElement
    {
        EArr name;
    }

    /// Character data found between tags.
    static struct Text
    {
        EArr content;
    }

    /// Uninterpreted character data; the parser produces it for CDATA sections.
    static struct RawText
    {
        private bool _isCData;
        /// `true` when the content came from a `<![CDATA[ ... ]]>` section.
        @property bool isCData() const { return _isCData; }
        EArr content;
    }

    /// Content found between "<!--" and "-->".
    static struct Comment
    {
        EArr content;
    }

private:
    R _input;               // remaining, not yet consumed input
    ParseEvent _current;    // event returned by `front`
    bool _empty = true;     // cleared by a successful popFront

    enum ParserState
    {
        start,      // nothing parsed yet; a DOCTYPE is still accepted
        tagOrText,
        //attribute
    }

    ParserState _state = ParserState.start;

    // skips zero or more space characters
    void skipWhiteSpace()
    {
        static if (isSomeString!R)
        {
            while (_input.length > 0 && _input[0].isWhite) _input = _input[1 .. $];
        }
        else
        {
            while (!_input.empty && _input.front.isWhite) _input.popFront();
        }
    }

    // skips everything up to and including the next '>' (or to end of input)
    void skipOverClosing()
    {
        static if (isSomeString!R)
        {
            while (_input.length > 0 && _input[0] != '>') _input = _input[1 .. $];
            if (_input.length > 0) _input = _input[1 .. $];
        }
        else
        {
            while (!_input.empty && _input.front != '>') _input.popFront();
            if (!_input.empty) _input.popFront();
        }
    }
}
/// Convenience factory that builds an `HTMLParser` over `html`, inferring the
/// range type from the argument.
auto htmlParser(T)(T html)
{
    alias Parser = HTMLParser!T;
    return Parser(html);
}
// Round-trip check: re-serialising the event stream must reproduce the input.
unittest
{
    auto html = `<!DOCTYPE html><html lang="ja" xmlns:og="http://www.example.com/ns#"><body>` ~
        `<h1 data-utf="試験"><![CDATA[It works!]]></h1><p id="msg" class="txt">テスト</p><!--コメント-->` ~
        `</body></html>`;
    auto parser = htmlParser(html);
    auto rebuilt = appender!(string);

    foreach (event; parser)
    {
        event.tryVisit!(
            (parser.StartDocument doc)
            {
                rebuilt.put(doc.isDoctype ? "<!" : "<?");
                rebuilt.put(doc.type);
                if (!doc.isDoctype) rebuilt.put("?");
            },
            (parser.StartElement elem)
            {
                rebuilt.put("<" ~ elem.name);
                foreach (attr; elem.attributes)
                {
                    rebuilt.put(" " ~ attr.name);
                    // `empty` is the flag the parser sets for attributes that
                    // carry a value.
                    if (attr.empty) rebuilt.put(`="` ~ attr.value ~ `"`);
                }
                rebuilt.put('>');
            },
            (parser.EndElement elem)
            {
                rebuilt.put("</" ~ elem.name ~ ">");
            },
            (parser.Text text)
            {
                rebuilt.put(text.content);
            },
            (parser.RawText raw)
            {
                if (raw.isCData) rebuilt.put("<![CDATA[");
                rebuilt.put(raw.content);
                if (raw.isCData) rebuilt.put("]]>");
            },
            (parser.Comment comment)
            {
                rebuilt.put("<!--" ~ comment.content ~ "-->");
            });
    }

    assert(equal(html, rebuilt.data), rebuilt.data);
}
private:
/// Case-insensitive `startsWith`: reports whether `r1` begins with `r2` when
/// elements are compared with `std.uni.sicmp`.
bool istartsWith(R1, R2)(R1 r1, R2 r2) if (isInputRange!R1 && isInputRange!R2)
{
    import std.uni : sicmp;

    // sicmp works on ranges, so each character is wrapped in a one-element
    // array before comparing.
    static bool sameIgnoringCase(dchar lhs, dchar rhs)
    {
        dchar[1] left = [lhs];
        dchar[1] right = [rhs];
        return sicmp(left[], right[]) == 0;
    }

    return startsWith!sameIgnoringCase(r1, r2);
}
unittest
{
    // Differences in letter case are ignored...
    assert(istartsWith("HELLO", "hEllO"));
    // ...but a different leading character is still a mismatch.
    assert(!istartsWith("HELLO", "*EllO"));
}
/// Returns `true` when `c` may appear in an attribute name: anything except
/// NUL, white space, control characters and the delimiters `"`, `'`, `>`,
/// `/` and `=`.
bool isValidAttrNameChar(dchar c) @safe pure nothrow
{
    import std.uni : isControl, isWhite;

    switch (c)
    {
        case '\0', '"', '\'', '>', '/', '=':
            return false;
        default:
            return !(isWhite(c) || isControl(c));
    }
}
/// Returns `true` when `c` may appear in an unquoted attribute value:
/// anything except white space and the characters `"`, `'`, `=`, `<`, `>`
/// and the backtick.
bool isValidUnquotedAttrValueChar(dchar c) @safe pure nothrow
{
    import std.uni : isWhite;

    switch (c)
    {
        case '"', '\'', '=', '<', '>', '`':
            return false;
        default:
            return !isWhite(c);
    }
}
/**
Wraps a pointer to a dynamic array in a random-access range.

`popFront` and `popBack` re-slice the pointed-to array in place, so consuming
the range consumes the original array variable. Elements are the array's own
elements (for strings: code units). `save` and two-argument slicing return
ranges that point at a separately allocated copy of the slice and therefore
leave the original variable untouched.

See_Also: std.range.RefRange
*/
auto refArrayRange(T)(T* arr)
if (isDynamicArray!T)
{
    static struct RefArrayRangeImpl
    {
        private T* _ptr;

        invariant
        {
            assert(_ptr !is null);
        }

        // Stores `slice` in newly allocated memory and returns a range that
        // points at that copy instead of at the wrapped variable.
        private static RefArrayRangeImpl boxed(T slice)
        {
            import std.conv : emplace;

            auto storage = new void[T.sizeof];
            emplace!T(storage, slice);
            return RefArrayRangeImpl(cast(T*)storage);
        }

        @property bool empty() const
        {
            return (*_ptr).length == 0;
        }

        @property size_t length() const
        {
            return (*_ptr).length;
        }

        alias opDollar = length;

        @property auto ref front() const
        {
            assert(!empty);
            return (*_ptr)[0];
        }

        @property auto ref back() const
        {
            return (*_ptr)[$ - 1];
        }

        void popFront()
        {
            assert(!empty);
            *_ptr = (*_ptr)[1 .. $];
        }

        void popBack()
        {
            assert(!empty);
            *_ptr = (*_ptr)[0 .. $ - 1];
        }

        auto ref opIndex(size_t i) const
        {
            return (*_ptr)[i];
        }

        // Whole-range slice: the wrapped array itself.
        auto ref opSlice()
        {
            return *_ptr;
        }

        // Sub-range slice: an independent range over elements [a, b).
        RefArrayRangeImpl opSlice(size_t a, size_t b)
        {
            return boxed((*_ptr)[a .. b]);
        }

        // Independent copy of the current position.
        @property
        auto save()
        {
            return boxed(*_ptr);
        }
    }

    return RefArrayRangeImpl(arr);
}
unittest
{
    int[3] backing = [1, 2, 3];
    int[] arr = backing[];
    int[] untouched = arr;
    auto wrapped = refArrayRange(&arr);

    // The wrapper must satisfy the whole range hierarchy.
    alias W = typeof(wrapped);
    static assert(isInputRange!W);
    static assert(isForwardRange!W);
    static assert(isBidirectionalRange!W);
    static assert(isRandomAccessRange!W);
    static assert(hasSlicing!W);
    static assert(hasLength!W);

    // Element access and slicing.
    assert(wrapped.front == 1);
    assert(wrapped[1] == 2);
    assert(wrapped.length == 3);
    assert(wrapped[0 .. 3][0 .. 2].length == 2);
    assert(wrapped[] is arr);

    // Popping through the wrapper shrinks `arr`, but not a saved copy.
    auto snapshot = wrapped.save;
    wrapped.popFront();
    assert(equal(arr, only(2, 3)));
    assert(snapshot.length == 3);
    assert(*snapshot._ptr is untouched);

    // Assigning the snapshot back restores the full view.
    wrapped = snapshot;
    assert(equal(wrapped, untouched));
}
// Strings are traversed by code unit, and `.array` keeps the string type.
pure nothrow @nogc
unittest
{
    string narrow = "テスト";
    // Three characters, three UTF-8 code units each.
    assert(refArrayRange(&narrow).walkLength == 9);
    static assert(is(typeof(refArrayRange(&narrow).array) == string));

    wstring wide = "テスト"w;
    static assert(is(typeof(refArrayRange(&wide).array) == wstring));

    dstring full = "テスト"d;
    static assert(is(typeof(refArrayRange(&full).array) == dstring));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment