Skip to content

Instantly share code, notes, and snippets.

@JakobOvrum
Last active August 29, 2015 14:17
Show Gist options
  • Save JakobOvrum/e98ff326ef3f16f33d6d to your computer and use it in GitHub Desktop.
Save JakobOvrum/e98ff326ef3f16f33d6d to your computer and use it in GitHub Desktop.
Print HTML title from untrusted URL
import core.time, std.algorithm, std.range,
std.string, std.net.curl, std.regex;
import std.stdio : stderr, stdout;
/// Format string for the usage message; `%s` receives the program name
/// (`args[0]`) when `main` is invoked without a URL argument.
immutable string usage = `%s <URL to HTML page>`;
/// Process exit statuses returned by `main`.
enum StatusCode : int
{
    /// The title was found and printed.
    success = 0,
    /// No URL argument was supplied on the command line.
    missingURL = 1,
    /// The URL names a scheme other than HTTP.
    unsupportedProtocol = 2,
    /// The transfer raised a `CurlTimeoutException`.
    timeout = 3,
    /// No `<title>` element was found in the buffered part of the document.
    missingTitle = 4,
}
/**
 * Fetches the page at the URL given as the first command-line argument and
 * prints the contents of its `<title>` element to stdout.
 *
 * Only the first `sizeThreshold` bytes of the response body are buffered and
 * searched; the transfer is aborted once that much data has arrived or as
 * soon as the response turns out not to be `text/html`.
 *
 * Params:
 *   args = program name followed by the URL to fetch
 *
 * Returns: a `StatusCode` value, used as the process exit status
 */
int main(string[] args)
{
    // Local import: only needed to scrub invalid UTF-8 from the response.
    import std.encoding : sanitize;

    if(args.length < 2)
    {
        stderr.writefln(usage, args[0]);
        return StatusCode.missingURL;
    }

    auto url = args[1];

    // URI schemes are case-insensitive, so match and compare them that way;
    // otherwise "HTTPS://..." would slip past the check that rejects
    // "https://...".
    static urlPattern = ctRegex!(`^([a-z]+)://`, "i");
    if(auto m = url.matchFirst(urlPattern))
    {
        auto prefix = m.captures.front;
        auto protocol = m.captures.drop(1).front;
        if(icmp(protocol, "http") != 0)
        {
            stderr.writefln("unsupported protocol: %s", protocol);
            return StatusCode.unsupportedProtocol;
        }
        url.skipOver(prefix);
    }

    auto http = HTTP(url);
    http.method = HTTP.Method.get;
    http.operationTimeout = 5.seconds;
    http.maxRedirects = 5;

    string contentType;
    string charset;

    http.onReceiveHeader = (key, value) {
        if(key == "content-type")
        {
            // parse "text/html; charset=xxx"
            auto fields = value.splitter(";").map!(field => field.strip);

            // A redirect chain delivers several Content-Type headers; forget
            // what an earlier response declared so it cannot leak into the
            // final one.
            contentType = null;
            charset = null;

            if(!fields.empty)
            {
                contentType = fields.front.idup;

                // Parameter names are case-insensitive.
                enum charsetKey = "charset=";
                auto charsetParam = fields.drop(1)
                    .find!(field => field.length >= charsetKey.length
                        && icmp(field[0 .. charsetKey.length], charsetKey) == 0);

                if(!charsetParam.empty)
                {
                    auto charsetValue = charsetParam.front[charsetKey.length .. $];
                    // The value may be written as a quoted string.
                    if(charsetValue.length >= 2
                        && charsetValue[0] == '"' && charsetValue[$ - 1] == '"')
                        charsetValue = charsetValue[1 .. $ - 1];
                    charset = charsetValue.idup;
                }
            }
        }
    };

    enum sizeThreshold = 1024 * 8;
    ubyte[sizeThreshold] buffer;
    auto bufferTail = buffer[]; // the still-unfilled part of `buffer`

    // Set whenever onReceive cancels the transfer on purpose, so that the
    // resulting CurlException can be told apart from a genuine failure.
    bool aborted = false;

    http.onReceive = (data) {
        // Media types are case-insensitive.
        if(contentType.length && icmp(contentType, "text/html") != 0)
        {
            aborted = true;
            return HTTP.requestAbort;
        }

        if(bufferTail.length < data.length)
        {
            // Keep only what still fits, then stop the transfer.
            immutable partLength = bufferTail.length;
            bufferTail = data
                .take(partLength)
                .copy(bufferTail);
            aborted = true;
            return HTTP.requestAbort;
        }
        else
        {
            bufferTail = data.copy(bufferTail);
            return data.length;
        }
    };

    try http.perform();
    catch(CurlTimeoutException ex)
    {
        stderr.writeln(ex.msg);
        return StatusCode.timeout;
    }
    catch(CurlException ex)
    {
        // A deliberate abort from onReceive is expected and stays silent.
        // Anything else is a real transfer error: report it instead of
        // swallowing it. There is no dedicated status code for it, so
        // execution continues and the title search below decides the exit
        // status from whatever was buffered.
        if(!aborted)
            stderr.writeln(ex.msg);
    }

    auto document = buffer[0 .. buffer.length - bufferTail.length];

    if(icmp(charset, "utf-8") != 0)
    {
        // TODO: transcode to UTF-8
        stderr.writefln(`WARNING: unrecognized charset "%s"`, charset);
    }

    // The bytes are untrusted and may have been cut off in the middle of a
    // multi-byte sequence; replace anything that is not valid UTF-8 before
    // handing the text to the regex engine.
    string transcoded = sanitize(cast(string)document);

    // HE COMES
    static titlePattern = ctRegex!(`<title>(.*?)</title>`, "si");
    if(auto m = transcoded.matchFirst(titlePattern))
        stdout.writeln(m.captures.drop(1).front);
    else
    {
        stderr.writefln("unable to find title near start of document (within %s bytes)", sizeThreshold);
        return StatusCode.missingTitle;
    }

    return StatusCode.success;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment