Skip to content

Instantly share code, notes, and snippets.

@SimonSapin
Created December 4, 2013 20:23
Show Gist options
  • Save SimonSapin/7794849 to your computer and use it in GitHub Desktop.
Save SimonSapin/7794849 to your computer and use it in GitHub Desktop.
WIP URL parser for Rust, by @jgraham
use std::str;
use std::iterator::{Iterator};
use std::str::{eq, eq_slice, ByteIterator};
use std::vec;
use std::ops;
use std::char;
/// An IPv6 address stored as eight 16-bit pieces.
struct IPv6Address {
// The eight pieces of the address, addressed by index 0 through 7.
data: [u16, ..8]
}
impl Eq for IPv6Address {
fn eq(&self, other: &IPv6Address) -> bool {
for i in range(0,8) {
let j = i as uint;
if self.data[j] != other.data[j] {
return false;
}
};
true
}
}
impl Clone for IPv6Address {
fn clone(&self) -> IPv6Address {
let mut new = IPv6Address::new();
for i in range(0,8) {
let j = i as uint;
new.data[j] = self.data[j]
};
new
}
}
/// The host component of a parsed URL.
#[deriving(Eq, Clone)]
enum Host {
// No host has been set; this is the value a freshly created URL starts with.
NoHost,
// A host given as text (anything that is not a bracketed IPv6 literal).
DomainHost(~str),
// A host given as a bracketed IPv6 literal, already parsed into pieces.
IPv6Host(IPv6Address),
}
/// The scheme of a URL: one of the schemes the parser treats specially,
/// or any other scheme carried as its (lowercased) name.
#[deriving(Eq, Clone)]
enum Scheme {
EmptyScheme, // Initial value, before any scheme has been parsed
FTPScheme,
FileScheme,
GopherScheme,
HTTPScheme,
HTTPSScheme,
WSScheme,
WSSScheme,
// Any scheme name not listed above, stored as given to `Scheme::from_str`.
OtherScheme(~str)
}
/// The components of a URL as produced by `ParsedURL::parse`.
pub struct ParsedURL {
// Scheme of the URL; `EmptyScheme` until one has been parsed.
scheme: Scheme,
// Data following the scheme for schemes that are not relative.
scheme_data: ~str,
// Percent-encoded username taken from the authority section.
username: ~str,
// Percent-encoded password; `None` when the authority had no ':' separator.
password: Option<~str>,
host: Host,
// Port digits as text; the parser leaves it empty when the port equals the
// scheme's default port.
port: ~str,
// Path segments, one entry per segment.
path: ~[~str],
// Query string without the leading '?'; `None` when the URL has no query.
query: Option<~str>,
// Fragment without the leading '#'; `None` when the URL has no fragment.
fragment: Option<~str>,
// Set when the scheme is one of the relative schemes, or when the URL was
// resolved against a base URL.
relative: bool
}
impl Scheme {
    /// Maps a scheme name to its `Scheme` value.
    ///
    /// The comparison is exact, so the caller is expected to pass an already
    /// lowercased name. Names that are not one of the special schemes are
    /// returned as `OtherScheme` holding a copy of `string`.
    pub fn from_str(string: &str) -> Scheme {
        match string {
            "ftp" => FTPScheme,
            "file" => FileScheme,
            "gopher" => GopherScheme,
            "http" => HTTPScheme,
            "https" => HTTPSScheme,
            // "ws" is the plain WebSocket scheme; mapping it to WSSScheme
            // would silently upgrade it and give it the wrong default port.
            "ws" => WSScheme,
            "wss" => WSSScheme,
            _ => OtherScheme(string.to_owned())
        }
    }

    /// Returns true for the schemes that use the relative (hierarchical)
    /// URL syntax: ftp, file, gopher, http, https, ws and wss.
    pub fn is_relative(&self) -> bool {
        match *self {
            FTPScheme | FileScheme | GopherScheme | HTTPScheme | HTTPSScheme | WSScheme | WSSScheme => true,
            _ => false
        }
    }

    /// Returns the default port of the scheme as decimal text, or `None`
    /// for schemes without one (file, empty and other schemes).
    pub fn default_port(&self) -> Option<~str> {
        match *self {
            FTPScheme => Some(~"21"),
            GopherScheme => Some(~"70"),
            HTTPScheme => Some(~"80"),
            HTTPSScheme => Some(~"443"),
            WSScheme => Some(~"80"),
            WSSScheme => Some(~"443"),
            _ => None
        }
    }
}
/// The states of the state machine driven by `ParsedURL::parse`.
/// The parser starts in `SchemeStartState` unless a state override is given.
#[deriving(Eq)]
enum ParserState {
SchemeStartState,
SchemeState,
SchemeDataState,
NoSchemeState,
RelativeOrAuthorityState,
RelativeState,
RelativeSlashState,
AuthorityFirstSlashState,
AuthoritySecondSlashState,
AuthorityIgnoreSlashesState,
AuthorityState,
FileHostState,
HostState,
// Handled by the parser in the same arm as `HostState`.
HostnameState,
PortState,
RelativePathStartState,
RelativePathState,
QueryState,
FragmentState
}
/// An iterator over an owned string that can also look ahead, jump to an
/// absolute position and step backwards.
pub struct SeekableCharIterator {
// The text being iterated.
priv string: ~str,
// Length of `string` as reported by `len()` when the iterator was created.
priv len: uint,
// Index of the element the next call to `next` will return.
priv pos: uint
}
impl Iterator<char> for SeekableCharIterator {
    /// Returns the element at the current position and advances by one,
    /// or `None` once the end of the string has been reached.
    ///
    /// Each element is a single indexed unit of the string cast to `char`.
    fn next(&mut self) -> Option<char> {
        if self.pos < self.len {
            let current = self.string[self.pos] as char;
            self.pos += 1;
            Some(current)
        } else {
            None
        }
    }

    /// The number of elements left is known exactly.
    fn size_hint(&self) -> (uint, Option<uint>) {
        let remaining = self.len - self.pos;
        (remaining, Some(remaining))
    }
}
impl SeekableCharIterator {
pub fn new(string: ~str) -> SeekableCharIterator {
SeekableCharIterator {
len: string.len(),
string: string,
pos: 0
}
}
pub fn peek(&self, chars: uint) -> Option<char> {
if chars + self.pos - 1> self.len {
None
} else {
Some(self.string[chars + self.pos - 1] as char)
}
}
pub fn seek(&mut self, pos: uint) {
if pos >= self.len {
self.pos = self.len;
} else {
self.pos = pos;
}
}
pub fn rewind(&mut self, n: uint) {
if (self.pos > 0) {
self.pos -= n
}
}
}
/// Error returned by `ParsedURL::parse`.
enum URLParseError {
// The input could not be parsed as a URL.
InvalidURL
}
impl ParsedURL {
    /// Builds a `ParsedURL` from its ten components, stored exactly as given.
    pub fn new(scheme: Scheme,
               scheme_data: ~str,
               username: ~str,
               password: Option<~str>,
               host: Host,
               port: ~str,
               path: ~[~str],
               query: Option<~str>,
               fragment: Option<~str>,
               relative: bool) -> ParsedURL {
        ParsedURL {scheme: scheme,
                   scheme_data: scheme_data,
                   username: username,
                   password: password,
                   host: host,
                   port: port,
                   path: path,
                   query: query,
                   fragment: fragment,
                   relative: relative}
    }

    /// Parses `raw_input` with a character-driven state machine.
    ///
    /// * `base_url` - URL that relative input is resolved against; input
    ///   without a scheme is rejected when it is absent or not relative.
    /// * `encoding` - requested encoding; currently only recorded, the
    ///   query is not yet re-encoded with it.
    /// * `initial_url` - URL to continue filling in. When `None`, a fresh
    ///   URL is created and the input is trimmed first.
    /// * `state_override` - state to start in instead of the scheme start
    ///   state.
    ///
    /// Returns `Ok(Some(url))` when the input was consumed, `Ok(None)` when
    /// a state override ended parsing early, and `Err(InvalidURL)` when the
    /// input is rejected.
    ///
    /// Work in progress: reaching the end of the input is only implemented
    /// for the scheme data state; every other state fails with
    /// "Not implemented" at that point.
    pub fn parse(raw_input: &str, base_url: Option<&ParsedURL>, encoding: Option<&str>, mut initial_url: Option<ParsedURL>, state_override: Option<ParserState>) -> Result<Option<ParsedURL>, URLParseError> {
        let (url_, input_) = match initial_url {
            Some(x) => (x, raw_input),
            None => {
                let mut new_url = ParsedURL::new(EmptyScheme, ~"", ~"", None, NoHost, ~"", ~[], None, None, false);
                //Need to check this is the right chars
                (new_url, raw_input.trim())
            }
        };
        let mut url = url_;
        let mut char_iter = SeekableCharIterator::new(input_.to_owned());
        let mut state = match state_override {
            Some(state) => state,
            None => SchemeStartState
        };
        let mut encoding_override = match encoding {
            Some(x) => x,
            None => "utf-8"
        };
        //There is possibly a better type to use here
        let mut buf = ~"";
        let mut at_flag = false;
        let mut square_paren_flag = false;
        loop {
            let maybe_c = char_iter.next();
            match maybe_c {
                Some(c) => {
                    //Normal character handling
                    match state {
                        SchemeStartState => {
                            match c {
                                'a'..'z' | 'A'..'Z' => {
                                    buf.push_char(char_to_lower(c));
                                    state = SchemeState;
                                },
                                _ => {
                                    match state_override {
                                        Some(_) => {return Err(InvalidURL);}
                                        None => {
                                            // Reprocess this character without a scheme.
                                            char_iter.rewind(1);
                                            state = NoSchemeState;
                                        }
                                    }
                                }
                            }
                        },
                        SchemeState =>
                            match c {
                                'a'..'z' | 'A'..'Z' | '+' | '-' | '.' => {
                                    buf.push_char(char_to_lower(c));
                                },
                                ':' => {
                                    let scheme = Scheme::from_str(buf);
                                    buf = ~"";
                                    if state_override.is_some() {
                                        return Ok(None);
                                    }
                                    if scheme.is_relative() {
                                        url.relative = true;
                                    }
                                    if scheme == FileScheme {
                                        state = RelativeState;
                                    } else if (url.relative) {
                                        if (base_url.is_some() &&
                                            base_url.unwrap().scheme == scheme) {
                                            state = RelativeOrAuthorityState;
                                        } else {
                                            state = AuthorityFirstSlashState;
                                        }
                                    } else {
                                        state = SchemeDataState;
                                    }
                                    url.scheme = scheme;
                                },
                                _ => {
                                    match state_override {
                                        Some(_) => {
                                            //break if we have EOF, but not sure if we can get here with EOF
                                            return Err(InvalidURL);
                                        },
                                        None => {
                                            // Not a scheme after all: start over from the
                                            // first character without one.
                                            buf = ~"";
                                            state = NoSchemeState;
                                            char_iter.seek(0);
                                        }
                                    }
                                }
                            },
                        SchemeDataState => {
                            match c {
                                '?' => {
                                    url.scheme_data = buf;
                                    buf = ~"";
                                    url.query = Some(~"");
                                    state = QueryState;
                                },
                                '#' => {
                                    url.scheme_data = buf;
                                    buf = ~"";
                                    url.fragment = Some(~"");
                                    state = FragmentState;
                                },
                                _ => {
                                    // Tab, LF and CR have to be tested first: they are
                                    // not URL characters, so checking them last made the
                                    // "ignore" branch unreachable.
                                    if c == '\x09' || c == '\x0a' || c == '\x0d' {
                                        //Ignore these characters
                                    } else if c != '%' && !is_url_char(c) {
                                        return Err(InvalidURL);
                                    } else if (c == '%' &&
                                               !(unwrap_bool(is_hex_char, char_iter.peek(1)) &&
                                                 unwrap_bool(is_hex_char, char_iter.peek(2)))) {
                                        return Err(InvalidURL);
                                    } else {
                                        buf.push_str(utf8_percent_encode(c, SimpleEncodeSet));
                                    }
                                }
                            }
                        },
                        NoSchemeState => {
                            if base_url.is_none() || !base_url.unwrap().relative {
                                return Err(InvalidURL);
                            } else {
                                state = RelativeState;
                                char_iter.rewind(1);
                            }
                        },
                        RelativeOrAuthorityState => {
                            let next = char_iter.peek(1);
                            if (c == '/' && next.is_some() && next.unwrap() == '/') {
                                state = AuthorityIgnoreSlashesState;
                                char_iter.next();
                            } else {
                                //XXX non-fatal parse error
                                char_iter.rewind(1);
                                state = RelativeState;
                            }
                        },
                        RelativeState => {
                            // NOTE(review): a "file:" URL reaches this state even
                            // without a base URL, in which case this fails.
                            let base = base_url.expect("In relative state we must have a base url");
                            url.relative = true;
                            if url.scheme != FileScheme && base_url.is_some() {
                                url.scheme = base.scheme.clone();
                            }
                            //Need to deal with EOF also
                            match c {
                                '\\' | '/' => state = RelativeSlashState,
                                '?' => {
                                    url.host = base.host.clone();
                                    url.port = base.port.clone();
                                    url.path = base.path.clone();
                                    url.query = Some(~"");
                                    state = QueryState;
                                },
                                '#' => {
                                    url.host = base.host.clone();
                                    url.port = base.port.clone();
                                    url.path = base.path.clone();
                                    url.query = base.query.clone();
                                    url.fragment = Some(~"");
                                    state = FragmentState;
                                },
                                _ => {
                                    let next = char_iter.peek(1);
                                    let second = char_iter.peek(2);
                                    // Inherit from the base unless this looks like a
                                    // Windows drive letter in a file URL.
                                    if (url.scheme != FileScheme ||
                                        !is_ascii_alpha(c) ||
                                        !(next == Some(':') ||
                                          next == Some('|')) ||
                                        !(second == Some('/') ||
                                          second == Some('\\') ||
                                          second == Some('?') ||
                                          second == Some('#'))) {
                                        url.host = base.host.clone();
                                        url.path = base.path.clone();
                                        url.port = base.port.clone();
                                        // Drop the last segment of the base path;
                                        // pop_opt tolerates an empty path.
                                        url.path.pop_opt();
                                    }
                                    // Hand the current character to the path state;
                                    // without this the parser stayed here and
                                    // repeated the inheritance for every character.
                                    state = RelativePathState;
                                    char_iter.rewind(1);
                                }
                            }
                        },
                        RelativeSlashState => {
                            let base = base_url.expect("In relative slash state we must have a base url");
                            match c {
                                '\\' | '/' => {
                                    if url.scheme == FileScheme {
                                        state = FileHostState;
                                    } else {
                                        state = AuthorityIgnoreSlashesState;
                                    }
                                },
                                _ => {
                                    if url.scheme != FileScheme {
                                        url.host = base.host.clone();
                                        url.port = base.port.clone();
                                    }
                                    state = RelativePathState;
                                    char_iter.rewind(1);
                                }
                            }
                        },
                        AuthorityFirstSlashState => {
                            match c {
                                '/' => state = AuthoritySecondSlashState,
                                _ => {
                                    state = AuthorityIgnoreSlashesState;
                                    char_iter.rewind(1);
                                }
                            }
                        },
                        AuthoritySecondSlashState => {
                            state = AuthorityIgnoreSlashesState;
                            if c != '/' {
                                char_iter.rewind(1);
                            }
                        },
                        AuthorityIgnoreSlashesState => {
                            if c != '/' && c != '\\' {
                                state = AuthorityState;
                                char_iter.rewind(1);
                            }
                        },
                        AuthorityState => {
                            match c {
                                '@' => {
                                    // A second '@' belongs to the credentials and
                                    // is kept in percent-encoded form.
                                    if at_flag {
                                        let mut new_buf = ~"%40";
                                        new_buf.push_str(buf);
                                        buf = new_buf;
                                    }
                                    at_flag = true;
                                    let mut target = ~"";
                                    for cp in buf.iter() {
                                        if (cp == '\x09' ||
                                            cp == '\x0a' ||
                                            cp == '\x0d') {
                                            loop;
                                        }
                                        // The first ':' separates username and password.
                                        if cp == ':' && url.password.is_none() {
                                            url.password = Some(~"");
                                            url.username.push_str(target.clone());
                                            target = ~"";
                                        } else {
                                            target.push_str(utf8_percent_encode(cp, DefaultEncodeSet))
                                        }
                                    }
                                    match url.password {
                                        Some(ref mut x) => {
                                            x.push_str(target);
                                        },
                                        None => url.username.push_str(target)
                                    }
                                    // The credentials are consumed; only what follows
                                    // the '@' may be re-read as the host.
                                    buf = ~"";
                                },
                                '/' | '\\' | '?' | '#' => {
                                    // The iterator already sits one past `c`, so go
                                    // back over `c` as well as the buffered text to
                                    // land on the first host character.
                                    // NOTE(review): buf.len() over-counts if the
                                    // buffer holds non-ASCII characters.
                                    char_iter.rewind(buf.len() + 1);
                                    buf = ~"";
                                    state = HostState;
                                },
                                _ => {
                                    buf.push_char(c);
                                }
                            }
                        },
                        FileHostState => {
                            match c {
                                '/' | '\\' | '?' | '#' => {
                                    // Two characters like "C:" or "C|" are a drive
                                    // letter, not a host.
                                    if (buf.len() == 2 &&
                                        is_ascii_alpha(buf[0] as char) &&
                                        (buf[1] as char == ':' ||
                                         buf[1] as char == '|')) {
                                        state = RelativePathState;
                                    } else if eq_slice(buf, &"") {
                                        state = RelativePathStartState;
                                    } else {
                                        let host = host_parse(buf);
                                        match host {
                                            Some(x) => {
                                                url.host = x;
                                                state = RelativePathStartState;
                                            },
                                            None => {
                                                return Err(InvalidURL);
                                            }
                                        }
                                    }
                                },
                                '\x09' | '\x0a' | '\x0d' => {
                                    //parse error
                                },
                                _ => {
                                    buf.push_char(c);
                                }
                            }
                        },
                        HostState | HostnameState => {
                            // A ':' ends the host unless it is inside an IPv6 "[...]".
                            if !square_paren_flag && c == ':' {
                                let host = host_parse(buf);
                                match host {
                                    // An unparsable host is a failure, as in the
                                    // branch below; Ok(None) is reserved for a
                                    // state override ending early.
                                    None => return Err(InvalidURL),
                                    Some(x) => {
                                        url.host = x;
                                        buf = ~"";
                                        state = PortState;
                                    }
                                }
                            } else {
                                match c {
                                    '/' | '\\' | '?' | '#' => {
                                        char_iter.rewind(1);
                                        let host = host_parse(buf);
                                        match host {
                                            Some(x) => {
                                                url.host = x;
                                                buf = ~"";
                                                state = RelativePathStartState;
                                                if state_override.is_some() {
                                                    return Ok(None);
                                                }
                                            },
                                            None => return Err(InvalidURL)
                                        }
                                    },
                                    '\x09' | '\x0A' | '\x0D' => {
                                        //Do nothing
                                    },
                                    _ => {
                                        if c == '[' {
                                            square_paren_flag = true;
                                        } else if c == ']' {
                                            square_paren_flag = false;
                                        };
                                        buf.push_char(c);
                                    }
                                }
                            }
                        },
                        PortState => {
                            if c.is_digit_radix(10) {
                                buf.push_char(c);
                            } else if (c == '#' || c == '\\' || c == '/' || c == '?' ||
                                       state_override.is_some()) {
                                // Strip leading zeros but keep a lone "0". The length
                                // is tested first so an empty port is never indexed.
                                while buf.len() > 1 && buf[0] as char == '\x30' {
                                    buf = buf.slice(1, buf.len()).to_owned();
                                }
                                // The scheme's default port is stored as empty.
                                match url.scheme.default_port() {
                                    Some(p) => {
                                        if eq_slice(p, buf) {
                                            buf = ~"";
                                        }
                                    },
                                    None => {}
                                }
                                url.port = buf;
                                if state_override.is_some() {
                                    return Ok(None);
                                }
                                buf = ~"";
                                state = RelativePathStartState;
                                char_iter.rewind(1);
                            } else if c == '\x09' || c == '\x0A' || c == '\x0D' {
                                //Do nothing
                            } else {
                                // Any other character makes the port invalid.
                                return Err(InvalidURL);
                            }
                        },
                        RelativePathStartState => {
                            state = RelativePathState;
                            if c != '\\' && c != '/' {
                                char_iter.rewind(1);
                            }
                        },
                        RelativePathState => {
                            // '?' and '#' only end the path when no state override
                            // is active; the inner parentheses make that grouping
                            // explicit ('&&' binds tighter than '||').
                            if (c == '/' || c == '\\' ||
                                (state_override.is_none() &&
                                 (c == '?' || c == '#'))) {
                                // Normalise percent-encoded dot segments.
                                if eq_slice(buf, &"%2e") {
                                    buf = ~".";
                                } else if (eq_slice(buf, &".%2e") ||
                                           eq_slice(buf, &"%2e%2e") ||
                                           eq_slice(buf, &"%2e.")) {
                                    buf = ~"..";
                                }
                                if eq_slice(buf, &"..") {
                                    url.path.pop_opt();
                                    if c != '\\' && c != '/' {
                                        url.path.push(~"");
                                    }
                                } else if !eq_slice(buf, &".") {
                                    // "C|" as the first segment of a file URL is
                                    // rewritten to "C:".
                                    if (url.scheme == FileScheme && url.path.is_empty() &&
                                        buf.len() == 2 && buf[1] as char == '|') {
                                        buf.pop_char();
                                        buf.push_char(':');
                                    }
                                    url.path.push(buf);
                                }
                                buf = ~"";
                                if c == '?' {
                                    state = QueryState;
                                    url.query = Some(~"");
                                } else if c == '#' {
                                    state = FragmentState;
                                    url.fragment = Some(~"");
                                }
                            } else if (c == '\x09' || c == '\x0A' || c == '\x0D') {
                                //Do nothing
                            } else {
                                buf.push_str(utf8_percent_encode(c, DefaultEncodeSet));
                            }
                        },
                        QueryState => {
                            if state_override.is_none() && c == '#' {
                                // Non-relative URLs and WebSocket URLs always use
                                // utf-8 for the query.
                                if !url.relative || url.scheme == WSScheme || url.scheme == WSSScheme {
                                    encoding_override = "utf-8";
                                }
                                if url.query.is_none() {
                                    url.query = Some(~"");
                                }
                                //TODO Now we should encode the buffer
                                // Flush the buffered query for every URL, not only
                                // relative ones, escaping the bytes that may not
                                // appear literally.
                                for byte in buf.byte_iter() {
                                    let char_str = match byte {
                                        0..0x20 | 0x7F..0xFF | 0x22 | 0x23 | 0x3C |
                                        0x3E | 0x60 => {
                                            percent_encode(byte as char)
                                        },
                                        _ => (byte as char).to_str()
                                    };
                                    // Append in place; unwrapping would move the
                                    // query out of the URL.
                                    match url.query {
                                        Some(ref mut q) => q.push_str(char_str),
                                        None => {}
                                    }
                                }
                                // Reset the shared buffer (a `let` here would only
                                // shadow it and leave the old contents behind).
                                buf = ~"";
                                state = FragmentState;
                                url.fragment = Some(~"");
                            } else if (c == '\x09' || c == '\x0A' || c == '\x0D') {
                                //Do nothing
                            } else {
                                buf.push_char(c);
                            }
                        },
                        FragmentState => {
                            match c {
                                '\x09' | '\x0A' | '\x0D' => {},
                                _ => {
                                    let encoded = utf8_percent_encode(c, SimpleEncodeSet);
                                    // Append in place rather than moving the
                                    // fragment out of the URL.
                                    match url.fragment {
                                        Some(ref mut frag) => frag.push_str(encoded),
                                        None => fail!("Fragment cannot be None")
                                    }
                                }
                            }
                        }
                        _ => fail!("Not implemented")
                    }
                },
                None => {
                    //EOF handling
                    match state {
                        SchemeDataState => {
                            url.scheme_data = buf;
                        }
                        _ => {fail!("Not implemented")}
                    }
                    break;
                }
            }
        }
        Ok(Some(url))
    }
}
fn unwrap_bool<T>(f: &fn(T)->bool, x: Option<T>) -> bool {
match x {
None => false,
Some(v) => f(v)
}
}
/// Returns true when `c` is one of the characters accepted literally in
/// scheme data: ASCII letters and digits, a fixed set of ASCII punctuation,
/// and the listed ranges of non-ASCII code points.
fn is_url_char(c: char) -> bool {
    let cp = c as u32;

    // ASCII letters and digits.
    if (cp >= 0x61 && cp <= 0x7A) || (cp >= 0x41 && cp <= 0x5A) || (cp >= 0x30 && cp <= 0x39) {
        return true;
    }

    // Permitted ASCII punctuation.
    match c {
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | '/' |
        ':' | ';' | '=' | '?' | '@' | '_' | '~' => return true,
        _ => {}
    }

    // Ranges inside the Basic Multilingual Plane.
    if (cp >= 0xA0 && cp <= 0xD7FF) ||
       (cp >= 0xE000 && cp <= 0xFDCF) ||
       (cp >= 0xFDF0 && cp <= 0xFFEF) {
        return true;
    }

    // Supplementary planes: each range stops two code points short of the
    // end of its plane, and plane 14 only starts at 0xE1000.
    (cp >= 0x10000 && cp <= 0x1FFFD) ||
    (cp >= 0x20000 && cp <= 0x2FFFD) ||
    (cp >= 0x30000 && cp <= 0x3FFFD) ||
    (cp >= 0x40000 && cp <= 0x4FFFD) ||
    (cp >= 0x50000 && cp <= 0x5FFFD) ||
    (cp >= 0x60000 && cp <= 0x6FFFD) ||
    (cp >= 0x70000 && cp <= 0x7FFFD) ||
    (cp >= 0x80000 && cp <= 0x8FFFD) ||
    (cp >= 0x90000 && cp <= 0x9FFFD) ||
    (cp >= 0xA0000 && cp <= 0xAFFFD) ||
    (cp >= 0xB0000 && cp <= 0xBFFFD) ||
    (cp >= 0xC0000 && cp <= 0xCFFFD) ||
    (cp >= 0xD0000 && cp <= 0xDFFFD) ||
    (cp >= 0xE1000 && cp <= 0xEFFFD) ||
    (cp >= 0xF0000 && cp <= 0xFFFFD) ||
    (cp >= 0x100000 && cp <= 0x10FFFD)
}
/// Returns true when `c` is an ASCII letter, lowercase or uppercase.
fn is_ascii_alpha(c: char) -> bool {
    let lower = c >= 'a' && c <= 'z';
    let upper = c >= 'A' && c <= 'Z';
    lower || upper
}
/// Returns true when `c` is an ASCII hexadecimal digit in either case.
fn is_hex_char(c: char) -> bool {
    if c >= '0' && c <= '9' {
        return true;
    }
    (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
}
/// Converts an ASCII uppercase letter to lowercase; every other character
/// is returned unchanged.
fn char_to_lower(c: char) -> char {
    if c >= 'A' && c <= 'Z' {
        // Upper and lower case ASCII letters differ only in bit 0x20.
        // (Bit 0x40 is already set in 'A'..'Z', so OR-ing it changes nothing.)
        (c as u8 | 0x20u8) as char
    } else {
        c
    }
}
/// Selects which characters `in_encode_set` reports as needing
/// percent-encoding. Each set includes everything in the one listed before it.
#[deriving(Eq)]
enum EncodeSet {
// Characters below 0x20 and above 0x7e.
SimpleEncodeSet,
// Additionally space, '"', '#', '<', '>', '?' and '`'.
DefaultEncodeSet,
// Additionally '\\', '@' and '/'.
PasswordEncodeSet,
// Additionally ':'.
UsernameEncodeSet
}
fn in_encode_set(c: char, set: EncodeSet) -> bool {
if (c < '\x20' || c > '\x7e') {
return true;
} else if (set == SimpleEncodeSet) {
return false;
}
if (c == '\x20' || c == '"' || c == '#' || c == '<' || c == '>' || c == '?' || c == '`') {
return true;
} else if (set == DefaultEncodeSet) {
return false;
}
if (c == '\\' || c == '@' || c == '/') {
return true;
} else if (set == PasswordEncodeSet) {
return false;
}
if (c == ':') {
return true;
} else if (set == UsernameEncodeSet) {
return false;
}
fail!("Unexpected encode set")
}
/// Percent-encodes a single byte value: '%' followed by exactly two
/// uppercase hexadecimal digits (for example 0x0a becomes "%0A").
///
/// `c` is expected to hold a byte value (0 to 0xFF), as passed by the query
/// handling in the parser; only the low eight bits are used.
fn percent_encode(c: char) -> ~str {
    let hex_digits = "0123456789ABCDEF";
    let byte = c as u8;
    let mut rv = ~"%";
    // Emit both nibbles so that small values keep their leading zero.
    rv.push_char(hex_digits[(byte >> 4) as uint] as char);
    rv.push_char(hex_digits[(byte & 0x0F) as uint] as char);
    rv
}
/// Returns `c` unchanged when it is outside `set`; otherwise returns the
/// UTF-8 bytes of `c`, each written as '%' plus two uppercase hexadecimal
/// digits.
fn utf8_percent_encode(c: char, set: EncodeSet) -> ~str {
    let mut rv = ~"";
    if !in_encode_set(c, set) {
        rv.push_char(c);
    } else {
        let hex_digits = "0123456789ABCDEF";
        let mut buf = vec::from_elem(c.len_utf8_bytes(), 0 as u8);
        c.encode_utf8(buf);
        for b in buf.iter() {
            rv.push_char('%');
            // Always two digits: control characters such as 0x09 must become
            // "%09", not "%9".
            rv.push_char(hex_digits[(*b >> 4) as uint] as char);
            rv.push_char(hex_digits[(*b & 0x0F) as uint] as char);
        }
    }
    rv
}
fn host_parse(input: &str) -> Option<Host> {
if input.len() == 0 {
return None
}
if input[0] as char == '[' {
if input[input.len() - 1] as char != ']' {
return None
}
return match ipv6_parse(input.slice(1, input.len() - 1)) {
Some(x) => Some(IPv6Host(x)),
None => None
}
} else {
let decoded = percent_decode(input);
}
None
}
impl IPv6Address {
    /// Creates the all-zero address.
    fn new() -> IPv6Address {
        IPv6Address { data: [0, 0, 0, 0, 0, 0, 0, 0] }
    }

    /// Stores `x` as piece number `i`; `i` must be below 8.
    fn set(&mut self, i: uint, x: u16) {
        let pieces = &mut self.data;
        pieces[i] = x;
    }
}
impl ops::Index<uint, u16> for IPv6Address {
    /// Returns piece number `*i` of the address, enabling `address[i]`.
    fn index(&self, i: &uint) -> u16 {
        let piece = *i;
        self.data[piece]
    }
}
fn ipv6_parse(input: &str) -> Option<IPv6Address> {
let mut address = IPv6Address::new();
let mut piece_pointer = 0 as uint;
let mut compress_pointer = None;
let mut is_ip_v4 = false;
let mut iter = SeekableCharIterator::new(input.to_owned());
let first = input[0] as char;
if first == ':' {
if first != ':' {
return None
} else {
iter.next();
iter.next();
piece_pointer += 1;
compress_pointer = Some(piece_pointer);
}
}
loop {
let maybe_c = iter.next();
if piece_pointer == 8 {
return None;
}
match maybe_c {
Some(c_0) => {
let mut c = c_0;
if c == ':' {
if compress_pointer.is_none() {
return None;
piece_pointer += 1;
compress_pointer = Some(piece_pointer);
loop;
}
}
let mut value = 0 as u16;
let mut length = 0;
while length < 4 {
if c.is_digit_radix(16) {
break;
}
value = value * 0x10 + c.to_digit(16).unwrap() as u16;
length += 1;
let maybe_c = iter.next();
match maybe_c {
Some(x) => {c = x},
None => break
}
}
match c {
'.' => {
if length == 0 {
return None;
}
iter.rewind(length);
is_ip_v4 = true;
break;
},
':' => {},
_ => {
return None;
},
};
address.set(piece_pointer, value);
piece_pointer += 1;
}
None => {}
}
}
if is_ip_v4 {
if piece_pointer > 6 {
return None;
}
let mut dots_seen = 0;
for c in iter {
let mut value = 0;
while c.is_digit_radix(10) {
value = value * 10 + c.to_digit(10).unwrap() as u16;
}
if value > 255 {
return None;
}
if dots_seen < 3 && c != '.' {
return None;
} else if dots_seen == 3 {
return None;
}
let piece = address[piece_pointer];
address.set(piece_pointer, piece * 0x100 + value);
if dots_seen != 1 {
piece_pointer += 1;
}
dots_seen += 1;
}
if dots_seen < 3 {
return None;
}
}
if compress_pointer.is_some() {
let mut swaps = piece_pointer - compress_pointer.unwrap();
piece_pointer = 7;
while piece_pointer != 0 && swaps != 0 {
let swap_pointer = compress_pointer.unwrap() + swaps - 1;
let piece = address[piece_pointer];
let swap_piece = address[swap_pointer];
address.set(piece_pointer, swap_piece);
address.set(swap_pointer, piece);
swaps -= 1;
piece_pointer -= 1;
}
} else {
if piece_pointer != 8 {
return None;
}
}
return Some(address);
}
/// Replaces every "%XX" sequence (two hexadecimal digits, either case) in
/// `input` with the character whose value is 0xXX. A '%' that is not
/// followed by two hexadecimal digits is kept as it is.
///
/// Text between escapes is copied unchanged, so literal multi-byte
/// characters survive.
//XXX decoded values of 0x80 and above are appended as single characters
//rather than being UTF-8 decoded as a byte sequence
fn percent_decode(input: &str) -> ~str {
    let mut decoded = ~"";
    let len = input.len();
    // Start of the literal run that has not been copied yet.
    let mut run_start = 0u;
    let mut pos = 0u;
    while pos < len {
        if input[pos] as char == '%' && pos + 2 < len {
            let high = (input[pos + 1] as char).to_digit(16);
            let low = (input[pos + 2] as char).to_digit(16);
            match (high, low) {
                (Some(hi), Some(lo)) => {
                    // '%' is ASCII, so slicing here never splits a character.
                    decoded.push_str(input.slice(run_start, pos));
                    decoded.push_char(((hi * 16 + lo) as u8) as char);
                    pos += 3;
                    run_start = pos;
                    loop;
                },
                _ => {}
            }
        }
        pos += 1;
    }
    decoded.push_str(input.slice(run_start, len));
    decoded
}
/// Runs the parser once over a sample URL; the result is discarded.
fn main() {
    let sample = "http://example.org:8080/foo?bar#baz";
    ParsedURL::parse(sample, None, None, None, None);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment