Created
July 29, 2012 18:40
-
-
Save erickt/3200980 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mod url; | |
mod url_authority; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//#set_loc(1, "url.rl"); | |
// | |
// URL Parser | |
// Copyright (c) 2010 J.A. Roberts Tunney | |
// MIT License | |
// | |
// Converted to Rust by Erick Tryzelaar | |
// | |
// To compile: | |
// | |
// ragel --host-lang=rust url.rl -o url.rs | |
// ragel --host-lang=rust url_authority.rl -o url_authority.rs | |
// rustc url.rc | |
// ./url | |
// | |
// To show a diagram of your state machine: | |
// | |
// ragel -V -p -o url.dot url.rl | |
// dot -Tpng -o url.png url.dot | |
// chrome url.png | |
// | |
// ragel -V -p -o url_authority.dot url_authority.rl | |
// dot -Tpng -o url_authority.png url_authority.dot | |
// chrome url_authority.png | |
// | |
// Reference: | |
// | |
// - http://tools.ietf.org/html/rfc3986 | |
// | |
use std; | |
import result::{result, ok, err}; | |
import url_authority::{url, parse_authority}; | |
fn dummy() -> url { | |
{ | |
scheme: ~"", user: ~"", pass: ~"", host: ~"", port: 0, | |
params: ~"", path: ~"", query: ~"", fragment: ~"", | |
} | |
} | |
//#set_loc(43, "url.rl"); | |
//#set_loc(48, "url.rs"); | |
// Generated action table (do not edit by hand; regenerate from url.rl).
// `url_parse` enters it at an offset taken from the transition-action table
// or the EOF-action table. The entry at that offset is a count, and the
// `count` entries that follow are the ids of the actions to run, in order.
// Offset 0 holds a count of 0, i.e. "no actions".
fn init__url_actions_0() -> ~[i8] { | |
~[ | |
0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, | |
5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, 2, | |
0, 9, 2, 1, 2, 2, 1, 3, 2, 1, 10, 3, | |
7, 1, 2 | |
] | |
} | |
// Generated table, indexed by state: the index in the transition-key table
// at which that state's keys begin (`url_parse` reads entry `cs`).
fn init__url_key_offsets_0() -> ~[i8] { | |
~[ | |
0, 0, 4, 14, 24, 30, 36, 42, 48, 54, 60, 66, | |
72, 73, 83, 91, 99, 108, 116, 124 | |
] | |
} | |
// Generated table of key bytes that `url_parse` compares against the current
// input byte. Within one state's slice the single keys come first (searched
// one entry at a time), followed by (low, high) pairs describing inclusive
// byte ranges (searched two entries at a time).
fn init__url_trans_keys_0() -> ~[u8] { | |
~[ | |
65, 90, 97, 122, 43, 58, 45, 46, 48, 57, 65, 90, | |
97, 122, 37, 47, 60, 127, 0, 32, 34, 35, 62, 63, | |
48, 57, 65, 70, 97, 102, 48, 57, 65, 70, 97, 102, | |
48, 57, 65, 70, 97, 102, 48, 57, 65, 70, 97, 102, | |
48, 57, 65, 70, 97, 102, 48, 57, 65, 70, 97, 102, | |
48, 57, 65, 70, 97, 102, 48, 57, 65, 70, 97, 102, | |
47, 34, 35, 37, 47, 60, 62, 63, 127, 0, 32, 37, | |
60, 62, 127, 0, 32, 34, 35, 37, 60, 62, 127, 0, | |
32, 34, 35, 34, 35, 37, 60, 62, 63, 127, 0, 32, | |
34, 35, 37, 60, 62, 127, 0, 32, 34, 35, 37, 60, | |
62, 127, 0, 32, 34, 35, 37, 47, 60, 62, 63, 127, | |
0, 32, 0 | |
] | |
} | |
// Generated table, indexed by state: how many single keys that state has in
// the transition-key table. `url_parse` skips the single-key search when the
// entry is 0.
fn init__url_single_lengths_0() -> ~[i8] { | |
~[ | |
0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, | |
1, 8, 4, 4, 7, 6, 6, 8 | |
] | |
} | |
// Generated table, indexed by state: how many (low, high) range pairs that
// state has in the transition-key table. Each pair occupies two key entries,
// which is why `url_parse` shifts this count left by one when sizing the
// search window.
fn init__url_range_lengths_0() -> ~[i8] { | |
~[ | |
0, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
0, 1, 2, 2, 1, 1, 1, 1 | |
] | |
} | |
// Generated table, indexed by state: the base position of that state's
// entries in the indicies table. `url_parse` adds the position of the
// matching key (or the total key count when nothing matched) to this base.
fn init__url_index_offsets_0() -> ~[i8] { | |
~[ | |
0, 0, 3, 10, 18, 22, 26, 30, 34, 38, 42, 46, | |
50, 52, 62, 69, 76, 85, 93, 101 | |
] | |
} | |
// Generated table mapping a (state, matched key) slot to a transition id.
// The id is then used by `url_parse` to index both the transition-target and
// the transition-action tables. For each state the slot after its last key
// is the one used when no key matched.
fn init__url_indicies_0() -> ~[i8] { | |
~[ | |
0, 2, 1, 3, 4, 3, 3, 5, 3, 1, 7, 8, | |
1, 1, 1, 1, 1, 6, 9, 9, 9, 1, 10, 10, | |
10, 1, 11, 11, 11, 1, 12, 12, 12, 1, 13, 13, | |
13, 1, 14, 14, 14, 1, 15, 15, 15, 1, 16, 16, | |
16, 1, 17, 1, 1, 18, 19, 20, 1, 1, 21, 1, | |
1, 12, 23, 1, 1, 1, 1, 1, 22, 25, 1, 1, | |
1, 1, 1, 24, 1, 27, 28, 1, 1, 29, 1, 1, | |
26, 1, 31, 32, 1, 1, 1, 1, 30, 1, 33, 34, | |
1, 1, 1, 1, 16, 1, 35, 7, 36, 1, 1, 37, | |
1, 1, 6, 0 | |
] | |
} | |
// Generated table, indexed by transition id: the state `url_parse` moves to
// when it takes that transition. A target of 0 makes the parser stop (it
// checks `cs == 0` after every transition).
fn init__url_trans_targs_0() -> ~[i8] { | |
~[ | |
2, 0, 2, 2, 3, 2, 13, 6, 12, 5, 15, 7, | |
13, 9, 16, 11, 18, 19, 14, 6, 16, 17, 15, 4, | |
15, 4, 16, 14, 8, 17, 18, 14, 10, 14, 10, 14, | |
16, 17 | |
] | |
} | |
// Generated table, indexed by transition id: the offset into the action
// table of the action list to run for that transition. `url_parse` treats 0
// as "no actions" and skips the dispatch entirely.
fn init__url_trans_actions_0() -> ~[i8] { | |
~[ | |
29, 0, 26, 5, 13, 7, 1, 1, 0, 9, 11, 0, | |
0, 9, 11, 0, 0, 0, 15, 0, 35, 15, 26, 3, | |
5, 0, 5, 17, 0, 17, 1, 23, 1, 19, 0, 0, | |
26, 0 | |
] | |
} | |
// Generated table, indexed by state: the offset into the action table of the
// action list `url_parse` runs when the input ends while in that state.
// An offset of 0 selects the empty list at the start of the action table.
fn init__url_eof_actions_0() -> ~[i8] { | |
~[ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 15, 32, 21, 17, 23, 19, 0 | |
] | |
} | |
// State-machine constants emitted alongside the tables.
// `url_parse` begins in state `url_start`.
const url_start: int = 1; | |
// `url_parse` succeeds only if it finishes in a state >= this value.
const url_first_final: int = 13; | |
// `url_parse` compares `cs == 0` literally instead of naming this constant;
// presumably 0 is the machine's error state -- confirm against the Ragel docs.
const url_error: int = 0; | |
// Not referenced by the code visible in this file.
const url_en_main: int = 1; | |
//#set_loc(44, "url.rl"); | |
// i parse absolute urls and don't suck at it. i'll parse just about | |
// any type of url you can think of and give you a human-friendly data | |
// structure. | |
// | |
// this routine takes no more than a few microseconds, is reentrant, | |
// performs in a predictable manner (for security/soft-realtime,) | |
// doesn't modify your `data` buffer, and under no circumstances will | |
// it panic (i hope!) | |
// Parameters:
//   data - the bytes of the URL to parse; only read, never written.
// Returns:
//   ok(url)                       if the machine finishes in a state that is
//                                 >= url_first_final;
//   err("unexpected eof")         if all input was consumed but the machine
//                                 is not in such a state;
//   err("error in url at pos N")  if the machine stopped before the end of
//                                 the input, N being the stopping index;
//   err("invalid hex")            from the percent-escape actions;
//   or the error returned by parse_authority, passed through unchanged.
//
// The body is a table-driven state machine generated from url.rl. Do not
// edit it by hand; the `//#set_loc` lines map back to the generator input.
fn url_parse(data: ~[u8]) -> result<url, @~str> { | |
// cs: current state. p: index of the input byte being examined.
// pe / eof: end of input (both are data.len() here).
// mark: index where the span currently being captured started.
let mut cs: int; | |
let mut p = 0; | |
let mut pe = data.len(); | |
let mut eof = data.len(); | |
let mut mark = 0; | |
let mut url = dummy(); | |
// this buffer is so we can unescape while we roll | |
let mut buf = vec::to_mut(vec::from_elem(data.len(), 0)); | |
// hex: value of the percent-escape being decoded.
// amt: number of bytes written to buf so far.
let mut hex = 0; | |
let mut amt = 0; | |
//#set_loc(175, "url.rs"); | |
{ | |
cs = url_start; | |
} | |
//#set_loc(180, "url.rs"); | |
{ | |
let _url_actions = init__url_actions_0(); | |
let _url_key_offsets = init__url_key_offsets_0(); | |
let _url_trans_keys = init__url_trans_keys_0(); | |
let _url_single_lengths = init__url_single_lengths_0(); | |
let _url_range_lengths = init__url_range_lengths_0(); | |
let _url_index_offsets = init__url_index_offsets_0(); | |
let _url_indicies = init__url_indicies_0(); | |
let _url_trans_targs = init__url_trans_targs_0(); | |
let _url_trans_actions = init__url_trans_actions_0(); | |
let _url_eof_actions = init__url_eof_actions_0(); | |
let mut _klen: int; | |
let mut _trans = 0; | |
let mut _acts: int; | |
let mut _nacts: int; | |
let mut _keys: int; | |
// _goto_targ emulates gotos between the sections of the machine:
//   0 = entry checks, 1 = pick a transition and run its actions,
//   2 = advance to the next byte, 4 = end-of-input actions, 5 = exit.
// An arm that sets _goto_targ and executes `again` jumps to that section;
// an arm that falls out of the `alt` reaches the `break` and ends the loop.
let mut _goto_targ = 0; | |
loop { | |
alt check _goto_targ { | |
0 { | |
// Empty input goes straight to the end-of-input actions.
if p == pe { | |
_goto_targ = 4; | |
again; | |
} | |
if cs == 0 { | |
_goto_targ = 5; | |
again; | |
} | |
_goto_targ = 1; | |
again; | |
} | |
1 { | |
// Locate the transition for data[p] in state cs. This inner loop runs at
// most once; it exists so `break` can leave the search early.
let mut _break_match = false; | |
loop { | |
_keys = _url_key_offsets[cs] as int; | |
_trans = _url_index_offsets[cs] as int; | |
_klen = _url_single_lengths[cs] as int; | |
// Pass 1: binary search over this state's single keys.
if _klen > 0 { | |
let mut _lower: int = _keys; | |
let mut _mid: int; | |
let mut _upper: int = _keys + _klen - 1; | |
loop { | |
if _upper < _lower { break; } | |
_mid = _lower + ((_upper-_lower) >> 1); | |
if data[p] < _url_trans_keys[_mid] { | |
_upper = _mid - 1; | |
} else if data[p] > _url_trans_keys[_mid] { | |
_lower = _mid + 1; | |
} else { | |
_trans += (_mid - _keys); | |
_break_match = true; | |
break; | |
} | |
} | |
if _break_match { break; } | |
// No single key matched: step past the single keys and their slots.
_keys += _klen; | |
_trans += _klen; | |
} | |
_klen = _url_range_lengths[cs] as int; | |
// Pass 2: binary search over this state's (low, high) range pairs.
if _klen > 0 { | |
let mut _lower = _keys; | |
let mut _mid: int; | |
let mut _upper = _keys + (_klen<<1) - 2; | |
loop { | |
if _upper < _lower { break; } | |
// Masking with int::compl(1) (presumably the bitwise complement of 1)
// clears the low bit so _mid stays on the `low` entry of a pair.
_mid = _lower + (((_upper-_lower) >> 1) & int::compl(1)); | |
if data[p] < _url_trans_keys[_mid] { | |
_upper = _mid - 2; | |
} else if data[p] > _url_trans_keys[_mid+1] { | |
_lower = _mid + 2; | |
} else { | |
_trans += ((_mid - _keys)>>1); | |
_break_match = true; | |
break; | |
} | |
} | |
if _break_match { break; } | |
// No range matched either: _trans now points at the state's last slot.
_trans += _klen; | |
} | |
break; | |
} | |
// Turn the slot into a transition id, then move to its target state.
_trans = _url_indicies[_trans] as int; | |
cs = _url_trans_targs[_trans] as int; | |
if _url_trans_actions[_trans] != 0 { | |
// The action table stores a count followed by that many action ids.
_acts = _url_trans_actions[_trans] as int; | |
_nacts = _url_actions[_acts] as int; | |
_acts += 1; | |
while _nacts > 0 { | |
_nacts -= 1; | |
let __acts = _acts; | |
_acts += 1; | |
alt check _url_actions[__acts] { | |
// Action 0: remember where the current span starts.
0 { | |
//#set_loc(68, "url.rl"); | |
{ mark = p; } | |
} | |
// Action 1: reset the scratch buffer.
1 { | |
//#set_loc(69, "url.rl"); | |
{ amt = 0; } | |
} | |
// Action 2: append the current byte to the scratch buffer unchanged.
2 { | |
//#set_loc(70, "url.rl"); | |
{ buf[amt] = data[p]; amt += 1; } | |
} | |
// Action 3: append the current byte plus 0x20, which maps ASCII 'A'..'Z' to
// 'a'..'z' (the tests show the scheme coming out lowercased).
3 { | |
//#set_loc(71, "url.rl"); | |
{ buf[amt] = data[p] + 0x20; amt += 1; } | |
} | |
// Action 4: first hex digit of a percent-escape; keep it as the high nibble.
4 { | |
//#set_loc(73, "url.rl"); | |
{ | |
hex = alt char::to_digit(data[p] as char, 16) { | |
none { ret err(@~"invalid hex"); } | |
some(hex) { hex * 16 } | |
} | |
} | |
} | |
// Action 5: second hex digit; add it and append the decoded byte to buf.
5 { | |
//#set_loc(80, "url.rl"); | |
{ | |
hex += alt char::to_digit(data[p] as char, 16) { | |
none { ret err(@~"invalid hex"); } | |
some(hex) { hex } | |
}; | |
buf[amt] = hex as u8; | |
amt += 1; | |
} | |
} | |
// Action 6: the scratch buffer holds the scheme.
6 { | |
//#set_loc(89, "url.rl"); | |
{ | |
url.scheme = str::from_bytes(buf.slice(0, amt)); | |
} | |
} | |
// Action 7: hand the raw bytes data[mark..p] to parse_authority, which
// returns the updated url record or an error.
7 { | |
//#set_loc(93, "url.rl"); | |
{ | |
let v = vec::view(data, mark, p); | |
let authority = parse_authority(url, v); | |
if authority.is_err() { | |
ret err(authority.get_err()); | |
} | |
url = result::unwrap(authority); | |
} | |
} | |
// Action 8: the scratch buffer holds the path.
8 { | |
//#set_loc(102, "url.rl"); | |
{ | |
url.path = str::from_bytes(buf.slice(0, amt)); | |
} | |
} | |
// Action 9: the query is taken from the raw input, not from the scratch
// buffer, so percent-escapes in it are left as written.
9 { | |
//#set_loc(106, "url.rl"); | |
{ | |
url.query = str::from_bytes(data.slice(mark, p)); | |
} | |
} | |
//#set_loc(344, "url.rs"); | |
} | |
} | |
} | |
_goto_targ = 2; | |
again; | |
} | |
2 { | |
// Stop if the transition led to state 0; otherwise advance one byte.
if cs == 0 { | |
_goto_targ = 5; | |
again; | |
} | |
p += 1; | |
if p != pe { | |
_goto_targ = 1; | |
again; | |
} | |
_goto_targ = 4; | |
again; | |
} | |
4 { | |
// End of input: run the EOF action list of the state we stopped in.
if p == eof { | |
let mut __acts = _url_eof_actions[cs] as int; | |
let mut __nacts = _url_actions[__acts] as int; | |
__acts += 1; | |
while __nacts > 0 { | |
__nacts -= 1; | |
let ___acts = __acts; | |
__acts += 1; | |
// Same action bodies as in the transition dispatch; only the ids handled
// here differ, and action 10 (fragment) appears only here.
alt check _url_actions[___acts] { | |
0 { | |
//#set_loc(68, "url.rl"); | |
{ mark = p; } | |
} | |
1 { | |
//#set_loc(69, "url.rl"); | |
{ amt = 0; } | |
} | |
7 { | |
//#set_loc(93, "url.rl"); | |
{ | |
let v = vec::view(data, mark, p); | |
let authority = parse_authority(url, v); | |
if authority.is_err() { | |
ret err(authority.get_err()); | |
} | |
url = result::unwrap(authority); | |
} | |
} | |
8 { | |
//#set_loc(102, "url.rl"); | |
{ | |
url.path = str::from_bytes(buf.slice(0, amt)); | |
} | |
} | |
9 { | |
//#set_loc(106, "url.rl"); | |
{ | |
url.query = str::from_bytes(data.slice(mark, p)); | |
} | |
} | |
// Action 10: the scratch buffer holds the fragment.
10 { | |
//#set_loc(110, "url.rl"); | |
{ | |
url.fragment = str::from_bytes(buf.slice(0, amt)); | |
} | |
} | |
//#set_loc(412, "url.rs"); | |
} | |
} | |
} | |
} | |
5 { } | |
} | |
break; | |
} | |
} | |
//#set_loc(146, "url.rl"); | |
// Accept only if the machine finished in a state >= url_first_final.
// Otherwise tell running out of input apart from stopping mid-input.
if cs < url_first_final { | |
if p == pe { | |
err(@~"unexpected eof") | |
} else { | |
err(@#fmt("error in url at pos %u", p)) | |
} | |
} else { | |
ok(url) | |
} | |
} | |
////////////////////////////////////////////////////////////////////// | |
// Test module: a table-driven correctness test for url_parse plus a crude
// timing loop. Only compiled when building tests.
#[cfg(test)] | |
mod tests { | |
import std::time; | |
// Parses every URL in the table and asserts that the result equals the
// expected record. A parse error fails the test with the error message.
// Fields omitted from an expected record take their value from dummy().
#[test] | |
fn test() { | |
let data = [( | |
~"http://user:pass@example.com:80;hello/lol.php?fun#omg", | |
{ | |
scheme: ~"http", | |
user: ~"user", | |
pass: ~"pass", | |
host: ~"example.com", | |
port: 80, | |
params: ~"hello", | |
path: ~"/lol.php", | |
query: ~"fun", | |
fragment: ~"omg", | |
} | |
), ( | |
~"a:b", | |
{ | |
scheme: ~"a", | |
host: ~"b", | |
with dummy() | |
} | |
), ( | |
// Mixed-case scheme is expected back in lowercase.
~"GoPHeR://@example.com@:;/?#", | |
{ | |
scheme: ~"gopher", | |
host: ~"@example.com@", | |
path: ~"/", | |
with dummy() | |
} | |
), ( | |
// Bracketed IPv6 host is expected back without the brackets.
~"ldap://[2001:db8::7]/c=GB?objectClass/?one", | |
{ | |
scheme: ~"ldap", | |
host: ~"2001:db8::7", | |
path: ~"/c=GB", | |
query: ~"objectClass/?one", | |
with dummy() | |
} | |
), ( | |
~"http://user@example.com", | |
{ | |
scheme: ~"http", | |
user: ~"user", | |
host: ~"example.com", | |
with dummy() | |
} | |
), ( | |
// Non-ASCII user and host; note that params is expected to keep "%20"
// escaped.
~"http://品研发和研发管@☃.com:65000;%20", | |
{ | |
scheme: ~"http", | |
user: ~"品研发和研发管", | |
host: ~"☃.com", | |
port: 65000, | |
params: ~"%20", | |
with dummy() | |
} | |
), ( | |
~"https://example.com:80", | |
{ | |
scheme: ~"https", | |
host: ~"example.com", | |
port: 80, | |
with dummy() | |
} | |
), ( | |
~"file:///etc/passwd", | |
{ | |
scheme: ~"file", | |
path: ~"/etc/passwd", | |
with dummy() | |
} | |
), ( | |
~"file:///c:/WINDOWS/clock.avi", | |
{ | |
scheme: ~"file", | |
path: ~"/c:/WINDOWS/clock.avi", /* <-- is this kosher? */ | |
with dummy() | |
} | |
), ( | |
// "%20" in the path is expected to be decoded to a space.
~"file://hostname/path/to/the%20file.txt", | |
{ | |
scheme: ~"file", | |
host: ~"hostname", | |
path: ~"/path/to/the file.txt", | |
with dummy() | |
} | |
), ( | |
~"sip:example.com", | |
{ | |
scheme: ~"sip", | |
host: ~"example.com", | |
with dummy() | |
} | |
), ( | |
~"sip:example.com:5060", | |
{ | |
scheme: ~"sip", | |
host: ~"example.com", | |
port: 5060, | |
with dummy() | |
} | |
), ( | |
~"mailto:ditto@pokémon.com", | |
{ | |
scheme: ~"mailto", | |
user: ~"ditto", | |
host: ~"pokémon.com", | |
with dummy() | |
} | |
), ( | |
~"sip:[dead:beef::666]:5060", | |
{ | |
scheme: ~"sip", | |
host: ~"dead:beef::666", | |
port: 5060, | |
with dummy() | |
} | |
), ( | |
~"tel:+12126660420", | |
{ | |
scheme: ~"tel", | |
host: ~"+12126660420", | |
with dummy() | |
} | |
), ( | |
// "%20" in the user part is expected to be decoded to a space.
~"sip:bob%20barker:priceisright@[dead:beef::666]:5060;isup-oli=00/palfun.html?haha#omg", | |
{ | |
scheme: ~"sip", | |
user: ~"bob barker", | |
pass: ~"priceisright", | |
host: ~"dead:beef::666", | |
port: 5060, | |
params: ~"isup-oli=00", | |
path: ~"/palfun.html", | |
query: ~"haha", | |
fragment: ~"omg", | |
with dummy() | |
} | |
), ( | |
// The query is expected back verbatim: "%68" stays escaped.
~"http://www.google.com/search?%68l=en&safe=off&q=omfg&aq=f&aqi=g2g-s1g1g-s1g5&aql=&oq=&gs_rfai=", | |
{ | |
scheme: ~"http", | |
host: ~"www.google.com", | |
path: ~"/search", | |
query: ~"%68l=en&safe=off&q=omfg&aq=f&aqi=g2g-s1g1g-s1g5&aql=&oq=&gs_rfai=", | |
with dummy() | |
} | |
)]; | |
for data.each |data| { | |
alt data { | |
(s, expected) { | |
alt url_parse(str::bytes(s)) { | |
err(e) { fail *e; } | |
ok(url) { assert expected == url; } | |
} | |
} | |
} | |
} | |
} | |
// Times `rounds` parses of each sample URL and prints the average number of
// nanoseconds per parse. The parse result is discarded, so this checks
// nothing about correctness; it runs as an ordinary #[test].
#[test] | |
fn benchmark() { | |
let rounds = 100000; | |
let urls = [ | |
~"a:a", | |
~"http://google.com/", | |
~"sip:jtunney@lobstertech.com", | |
~"http://user:pass@example.com:80;hello/lol.php?fun#omg", | |
~"file:///etc/passwd", | |
]; | |
for urls.each |url| { | |
let t1 = time::precise_time_ns(); | |
for rounds.times { | |
url_parse(str::bytes(url)); | |
} | |
let t2 = time::precise_time_ns(); | |
io::println(#fmt("BENCH parse %s -> %f ns", | |
url, | |
((t2 - t1) as float) / (rounds as float))); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment