Skip to content

Instantly share code, notes, and snippets.

@greggirwin
Last active August 14, 2019 21:29
Show Gist options
  • Save greggirwin/207149d46441cd48a1426e60926a7d25 to your computer and use it in GitHub Desktop.
Save greggirwin/207149d46441cd48a1426e60926a7d25 to your computer and use it in GitHub Desktop.
Red URL Parser (now included in Red: %environment/networking.red)
Red [
title: "RFC3986 URL parser"
file: %url-parser.red
author: "@greggirwin"
date: 03-Oct-2018
notes: {
Reference: https://tools.ietf.org/html/rfc3986#page-16
Most rule names are taken from the RFC, with the goal of
making it easy to compare to the reference. Some rules
are simplified in this version (e.g. IP address literals).
Where pct-encoded rules are listed in the RFC, they are
omitted from parse rules here, as the input is dehexed
before being parsed.
Relative URI path references are not yet supported.
}
]
url-parser: object [
;-- Parse Variables
=scheme: =user-info: =host: =port: =path: =query: =fragment: none
vars: [=scheme =user-info =host =port =path =query =fragment]
;-- General Character Sets
alpha: charset [#"a" - #"z" #"A" - #"Z"]
digit: charset "0123456789"
alpha-num: union alpha digit
;-- URL Character Sets
; The purpose of reserved characters is to provide a set of delimiting
; characters that are distinguishable from other data within a URI.
gen-delims: charset ":/?#[]@"
sub-delims: charset "!$&'()*+,;="
reserved: [gen-delims | sub-delims]
unreserved: compose [alpha | digit | (charset "-._~")]
; Helper func for extending alpha-num
alpha-num+: func [more [string!]][union alpha-num charset more]
scheme-char: alpha-num+ "+-."
;-- URL Grammar
url-rules: [scheme-part hier-part opt query opt fragment]
scheme-part: [copy =scheme [alpha some scheme-char] #":"]
hier-part: ["//" authority path-abempty | path-absolute | path-rootless | path-empty]
; The authority component is preceded by a double slash ("//") and is
; terminated by the next slash ("/"), question mark ("?"), or number
; sign ("#") character, or by the end of the URI.
authority: [opt user-info host opt [":" port]]
; "user:password" format for user-info is deprecated.
user-info: [copy =user-info [any [unreserved | sub-delims | #":"] #"@"]]
; Host is not detailed per the RFC yet. It covers IPv6 addresses, which go in
; square brackets, making them a non-loadable URL in Red. They can also contain
; colons, which makes finding the port marker more involved.
IP-literal: [copy =IP-literal ["[" thru "]"]] ; simplified from [IPv6address | IPvFuture]
host: [
IP-literal (=host: =IP-literal)
| copy =host any [unreserved | sub-delims]
]
port: [copy =port [1 5 digit]]
; path-abempty ; begins with "/" or is empty
; path-absolute ; begins with "/" but not "//"
; path-noscheme ; begins with a non-colon segment
; path-rootless ; begins with a segment
; path-empty ; zero characters
path-abempty: [copy =path any-segments | path-empty] ; (print ["path:" mold =path])
path-absolute: [copy =path [#"/" opt [segment-nz any-segments]]] ; (print ["path-abs:" mold =path])
;!! path-noscheme is only used in relative URIs, which aren't supported here yet.
;path-noscheme: [copy =path [segment-nz-nc any-segments]] ; (print ["path-no-scheme:" mold =path])
path-rootless: [copy =path [segment-nz any-segments]] ; (print ["path-rootless:" mold =path])
path-empty: [none]
any-segments: [any [#"/" segment]]
segment: [any pchar]
segment-nz: [some pchar]
segment-nz-nc: [some [unreserved | sub-delims | #"@"]] ; non-zero-length segment with no colon
pchar: [unreserved | sub-delims | #":" | #"@"] ; path characters
query: ["?" copy =query any [pchar | slash | #"?"]]
fragment: ["#" copy =fragment any [pchar | slash | #"?"]]
;-- Parse Function
parse-url: function [
"Return object with URL components, or cause an error if not a valid URL"
url [url! string!]
/throw-error "Throw an error, instead of returning NONE."
/extern vars =path =host
][
set vars none ; clear object level parse variables
either parse dehex url url-rules [
if empty? =host [=host: none]
=path: either all [=path not empty? =path][
split-path to file! =path
][
[#[none] #[none]]
]
object [
scheme: to word! =scheme
user-info: =user-info
host: =host
port: if =port [to integer! =port]
path: first =path
target: second =path
query: =query
fragment: =fragment
]
][
if throw-error [
make error! rejoin ["URL error: " url]
]
]
]
; Exported function (Rebol compatible name)
set 'decode-url function [
"Decode a URL into an object containing its constituent parts"
url [url! string!]
][
parse-url url
]
]
test-urls: reduce [
foo:// ; no path
object [
scheme: 'foo
user-info: none
host: none
port: none
path: none
target: none
query: none
fragment: none
]
foo:/a/b/c ; path-absolute
object [
scheme: 'foo
user-info: none
host: none
port: none
path: %/a/b/
target: %c
query: none
fragment: none
]
foo://example.com:8042/over/there?name=ferret#nose
object [
scheme: 'foo
user-info: none
host: "example.com"
port: 8042
path: %/over/
target: %there
query: "name=ferret"
fragment: "nose"
]
ftp://ftp.is.co.za/rfc/rfc1808.txt
object [
scheme: 'ftp
user-info: none
host: "ftp.is.co.za"
port: none
path: %/rfc/
target: %rfc1808.txt
query: none
fragment: none
]
http://www.ietf.org/rfc/rfc2396.txt
object [
scheme: 'http
user-info: none
host: "www.ietf.org"
port: none
path: %/rfc/
target: %rfc2396.txt
query: none
fragment: none
]
to url! "ldap://[2001:db8::7]/c=GB?objectClass?one" ; not loadable Red
object [
scheme: 'ldap
user-info: none
host: "[2001:db8::7]"
port: none
path: %/
target: %c=GB
query: "objectClass?one"
fragment: none
]
mailto:John.Doe@example.com
object [
scheme: 'mailto
user-info: none
host: none
port: none
path: %./
target: %"John.Doe@example.com"
query: none
fragment: none
]
news:comp.infosystems.www.servers.unix
object [
scheme: 'news
user-info: none
host: none
port: none
path: %./
target: %comp.infosystems.www.servers.unix
query: none
fragment: none
]
tel:+1-816-555-1212
object [
scheme: 'tel
user-info: none
host: none
port: none
path: %./
target: %+1-816-555-1212
query: none
fragment: none
]
telnet://192.0.2.16:80/
object [
scheme: 'telnet
user-info: none
host: "192.0.2.16"
port: 80
path: %/
target: none
query: none
fragment: none
]
urn:oasis:names:specification:docbook:dtd:xml:4.1.2
object [
scheme: 'urn
user-info: none
host: none
port: none
path: %./
target: %"oasis:names:specification:docbook:dtd:xml:4.1.2"
query: none
fragment: none
]
]
test-url-parser: function [input expected-result][
if expected-result <> res: decode-url input [
print [
"parse-url failed for url:" mold url newline
"Expected:" mold expected-result newline
"Got:" mold res
]
]
]
foreach [url obj] test-urls [test-url-parser url obj]
@greggirwin
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment