Created
March 12, 2018 08:18
gossip sip parser definition
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%{# -*-ragel-*- | |
# | |
## SIP Message Parser Definition | |
# | |
# The parser should be a linearly complex FSM. It should be able to parse a | |
# real-world SIP INVITE in less than 30μs. This parser only trades speed in | |
# favor of friendly data structures, which are big structs linked together. | |
# | |
## Implementation Notes | |
# | |
# SIP uses a plaintext encoding that is hard to parse correctly. | |
# | |
# o Whitespace can be used liberally in a variety of different ways. | |
# | |
# - Via host:port can have whitespace, e.g. "host \t: port" | |
# | |
# o UTF-8 is supported in some places but not others. | |
# | |
# o Headers can span multiple lines. | |
# | |
# o Header values can contain comments, e.g. Message: lol (i'm (hidden)) | |
# | |
# o Header names are case-insensitive and have shorthand notation. | |
# | |
# o There are ~50 standard headers, many of which have custom parsing rules.
# | |
# o URIs can have ;params;like=this | |
# | |
# - Params can belong either to a URI or Addr object, e.g. <sip:uri;param> | |
# cf. <sip:uri>;param | |
# | |
# - Addresses may omit angle brackets, in which case params belong to the | |
# Addr object. | |
# | |
# - URI params ;are=escaped%20like%22this but params belonging to Addr | |
# ;are="escaped like\"this" | |
# | |
# - Backslash escaping is not like C, e.g. \t\n -> tn | |
# | |
# - Address display name can have whitespace without quotes, which is | |
# collapsed. Quoted form is not collapsed. | |
# | |
# o Via and address headers can be repeated in two ways: repeating the | |
# header, using commas within a single header, or both. | |
# | |
# See: http://www.colm.net/files/ragel/ragel-guide-6.9.pdf | |
# See: https://tools.ietf.org/html/rfc2234 | |
# Names the Ragel machine defined by this specification.
machine sip; | |
# Do not advance past the current character, so the next machine sees it again.
action hold { | |
fhold; | |
} | |
# Leave the scanner loop.
action break { | |
fbreak; | |
} | |
# Remember the current position; later actions slice data[mark:p].
action mark { | |
mark = p | |
} | |
# Resume scanning at the position previously saved by 'mark'.
action backtrack { | |
fexec mark; | |
} | |
# Empty the scratch buffer by resetting its length counter.
action start { | |
amt = 0 | |
} | |
# Copy the current character into the scratch buffer.
# NOTE(review): no bounds check on buf here; confirm buf is sized for the input.
action append { | |
buf[amt] = fc | |
amt++ | |
} | |
# Put one space character into the scratch buffer.
action space { | |
buf[amt] = ' ' | |
amt++ | |
} | |
# First hex digit of a %XX escape: keep its value times 16.
action hexHi { | |
hex = unhex(fc) * 16 | |
} | |
# Second hex digit of a %XX escape: add its value and append the resulting byte.
action hexLo { | |
hex += unhex(fc) | |
buf[amt] = hex | |
amt++ | |
} | |
# Store the marked span as the request method.
action Method { | |
msg.Method = string(data[mark:p]) | |
} | |
# Accumulate one decimal digit (0x30 is ASCII '0') into the major version.
action VersionMajor { | |
msg.VersionMajor = msg.VersionMajor * 10 + (fc - 0x30) | |
} | |
# Accumulate one decimal digit into the minor version.
action VersionMinor { | |
msg.VersionMinor = msg.VersionMinor * 10 + (fc - 0x30) | |
} | |
# Hand the marked span to ParseURI; on failure, return the error from the
# enclosing Go function.
action RequestURI { | |
msg.Request, err = ParseURI(data[mark:p]) | |
if err != nil { return nil, err } | |
} | |
# Accumulate one decimal digit into the response status code.
action StatusCode { | |
msg.Status = msg.Status * 10 + (int(fc) - 0x30) | |
} | |
# Take the reason phrase from the scratch buffer filled by 'append'/'hexLo'.
action ReasonPhrase { | |
msg.Phrase = string(buf[0:amt]) | |
} | |
# Allocate the Via that the following actions fill in.
action ViaNew { | |
via = new(Via) | |
} | |
# Finish the current Via: store it through viap, move viap to its Next field,
# and clear the working pointer.
action Via { | |
*viap = via | |
viap = &via.Next | |
via = nil | |
} | |
# The next four actions copy the marked span into the corresponding Via field.
action ViaProtocol { | |
via.Protocol = string(data[mark:p]) | |
} | |
action ViaVersion { | |
via.Version = string(data[mark:p]) | |
} | |
action ViaTransport { | |
via.Transport = string(data[mark:p]) | |
} | |
action ViaHost { | |
via.Host = string(data[mark:p]) | |
} | |
# Accumulate one decimal digit into the port.
# NOTE(review): no range check; presumably Port is uint16 and long digit runs wrap.
action ViaPort { | |
via.Port = via.Port * 10 + (uint16(fc) - 0x30) | |
} | |
# Build a Param from the saved name and the scratch buffer, passing the
# previous via.Param as its third field (presumably the next link, which
# would leave params in reverse order of appearance; confirm against Param).
action ViaParam { | |
via.Param = &Param{name, string(buf[0:amt]), via.Param} | |
} | |
# Used as an error action ($!gxh) on known header names: re-read the current
# character and continue in the xheader machine.
action gxh { | |
fhold; | |
fgoto xheader; | |
} | |
# Save the marked span as the current header/parameter name.
action name { | |
name = string(data[mark:p]) | |
} | |
# Store a generic header value. This runs on the LF of the terminating CRLF,
# so p - 1 is the CR and the slice stops just before it.
# If the value pointer is set, the text is written there; otherwise a new
# XHeader is built from name and the raw slice, with the previous
# msg.XHeader as its third field.
# The doubled braces wrap the Go code in its own block, scoping b.
# NOTE(review): b is a slice of data, not a copy; confirm data outlives msg.
action value {{ | |
b := data[mark:p - 1] | |
if value != nil { | |
*value = string(b) | |
} else { | |
msg.XHeader = &XHeader{name, b, msg.XHeader} | |
} | |
}} | |
# Allocate the Addr that the following actions fill in.
action AddrNew { | |
addr = new(Addr) | |
} | |
# Quoted display name: taken from the scratch buffer.
action AddrQuotedDisplay { | |
addr.Display = string(buf[0:amt]) | |
} | |
# Unquoted display name: the marked span, with the end moved back while
# whitespacec reports true for the preceding byte.
action AddrUnquotedDisplay {{ | |
end := p | |
for end > mark && whitespacec(data[end - 1]) { | |
end-- | |
} | |
addr.Display = string(data[mark:end]) | |
}} | |
# Hand the marked span to ParseURI; on failure, return the error from the
# enclosing Go function.
action AddrUri { | |
addr.Uri, err = ParseURI(data[mark:p]) | |
if err != nil { return nil, err } | |
} | |
# Build a Param from the saved name and the scratch buffer, passing the
# previous addr.Param as its third field.
action AddrParam { | |
addr.Param = &Param{name, string(buf[0:amt]), addr.Param} | |
} | |
# Finish the current Addr: store it through addrp, move addrp to its Next
# field, and clear the working pointer.
action Addr { | |
*addrp = addr | |
addrp = &addr.Next | |
addr = nil | |
} | |
# Store the marked span as the Call-ID.
action CallID { | |
msg.CallID = string(data[mark:p]) | |
} | |
# Accumulate one decimal digit (0x30 is ASCII '0') into clen.
action ContentLength { | |
clen = clen * 10 + (int(fc) - 0x30) | |
} | |
# Store the marked span (the "type/subtype" part) in ctype.
action ContentType { | |
ctype = string(data[mark:p]) | |
} | |
# Accumulate one decimal digit into the CSeq number.
action CSeq { | |
msg.CSeq = msg.CSeq * 10 + (int(fc) - 0x30) | |
} | |
# Store the marked span as the CSeq method.
action CSeqMethod { | |
msg.CSeqMethod = string(data[mark:p]) | |
} | |
# The remaining actions each accumulate one decimal digit into the named field.
action Expires { | |
msg.Expires = msg.Expires * 10 + (int(fc) - 0x30) | |
} | |
action MaxForwards { | |
msg.MaxForwards = msg.MaxForwards * 10 + (int(fc) - 0x30) | |
} | |
action MinExpires { | |
msg.MinExpires = msg.MinExpires * 10 + (int(fc) - 0x30) | |
} | |
# Jump actions: each one transfers control to the named machine instantiation.
action goto_addr { fgoto addr; } | |
action goto_addr_angled { fgoto addr_angled; } | |
action goto_addr_param { fgoto addr_param; } | |
action goto_addr_uri { fgoto addr_uri; } | |
action goto_ctype { fgoto ctype; } | |
action goto_header { fgoto header; } | |
action goto_value { fgoto value; } | |
action goto_via { fgoto via; } | |
action goto_via_param { fgoto via_param; } | |
# Condition used with 'when' in the CRLF and LWS definitions; it evaluates the
# Go helper lookAheadWSP on the input, the current position and the end position.
action lookAheadWSP { lookAheadWSP(data, p, pe) } | |
# Single-character primitives.
SP = " "; | |
HTAB = "\t"; | |
CR = "\r"; | |
LF = "\n"; | |
DQUOTE = "\""; | |
# Line terminator: CR LF, accepted only when lookAheadWSP is false at the CR.
CRLF = ( CR when !lookAheadWSP ) LF; | |
WSP = SP | HTAB; | |
# Linear whitespace: an optional CR LF that is accepted only when lookAheadWSP
# is true at the CR, followed by at least one space or tab.
LWS = ( WSP* ( CR when lookAheadWSP ) LF )? WSP+; | |
# Optional linear whitespace.
SWS = LWS?; | |
# Same shapes as above, but every matched character (including CR and LF) is
# also copied into the scratch buffer.
LWSCRLF_append = ( CR when lookAheadWSP ) @append LF @append; | |
LWS_append = ( WSP* @append LWSCRLF_append )? WSP+ @append; | |
# UTF-8 sequences whose bytes are copied into the scratch buffer. The lead
# byte ranges and continuation counts follow the UTF8-NONASCII rule of
# RFC 3261 section 25.1, including the 5- and 6-byte forms.
UTF8_CONT = 0x80..0xBF @append; | |
UTF8_NONASCII = 0xC0..0xDF @append UTF8_CONT {1} | |
| 0xE0..0xEF @append UTF8_CONT {2} | |
| 0xF0..0xF7 @append UTF8_CONT {3} | |
| 0xF8..0xFb @append UTF8_CONT {4} | |
| 0xFC..0xFD @append UTF8_CONT {5}; | |
UTF8 = 0x21..0x7F @append | UTF8_NONASCII; | |
# The same byte patterns without the append action, for values that are
# sliced out of data with mark instead of being copied.
mUTF8_CONT = 0x80..0xBF; | |
mUTF8_NONASCII = 0xC0..0xDF mUTF8_CONT {1} | |
| 0xE0..0xEF mUTF8_CONT {2} | |
| 0xF0..0xF7 mUTF8_CONT {3} | |
| 0xF8..0xFb mUTF8_CONT {4} | |
| 0xFC..0xFD mUTF8_CONT {5}; | |
mUTF8 = 0x21..0x7F | mUTF8_NONASCII; | |
# https://tools.ietf.org/html/rfc3261#section-25.1 | |
# URI character classes.
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," ; | |
mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" ; | |
unreserved = alnum | mark ; | |
# Characters that may appear in a token.
tokenc = alnum | "-" | "." | "!" | "%" | "*" | "_" | "+" | "`" | |
| "'" | "~" ; | |
separators = "(" | ")" | "<" | ">" | "@" | "," | ";" | ":" | "\\" | |
| "\"" | "/" | "[" | "]" | "?" | "=" | "{" | "}" | SP | |
| HTAB ; | |
# Token characters plus a number of separator characters; used by cid through word.
wordc = alnum | "-" | "." | "!" | "%" | "*" | "_" | "+" | "`" | |
| "'" | "~" | "(" | ")" | "<" | ">" | ":" | "\\" | "\"" | |
| "/" | "[" | "]" | "?" | "{" | "}" ; | |
# Same character set as schemec, which is defined further down.
schmchars = alnum | "+" | "-" | "." ; | |
word = wordc+; | |
# Punctuation with optional surrounding whitespace. RAQUOT only allows
# trailing whitespace, LAQUOT and LDQUOT only leading whitespace, and HCOLON
# allows plain spaces/tabs (no folding) before the colon.
STAR = SWS "*" SWS; | |
SLASH = SWS "/" SWS; | |
EQUAL = SWS "=" SWS; | |
LPAREN = SWS "(" SWS; | |
RPAREN = SWS ")" SWS; | |
RAQUOT = ">" SWS; | |
LAQUOT = SWS "<"; | |
COMMA = SWS "," SWS; | |
SEMI = SWS ";" SWS; | |
COLON = SWS ":" SWS; | |
HCOLON = WSP* ":" SWS; | |
LDQUOT = SWS "\""; | |
RDQUOT = "\"" SWS; | |
# Percent escape: the two hex digits are combined by hexHi/hexLo and the
# resulting byte is appended to the scratch buffer.
escaped = "%" ( xdigit @hexHi ) ( xdigit @hexLo ) ; | |
ipv4c = digit | "." ; | |
ipv6c = xdigit | "." | ":" ; | |
hostc = alnum | "-" | "." ; | |
token = tokenc+; | |
# Token characters plus "[", "]" and ":"; used for unquoted parameter values.
tokenhost = ( tokenc | "[" | "]" | ":" )+; | |
# Reason phrase characters; each is copied into the scratch buffer.
reasonc = UTF8_NONASCII | ( reserved | unreserved | SP | HTAB ) @append; | |
reasonmc = escaped | reasonc; | |
# Call-ID value: a word, optionally followed by "@" and another word.
cid = word ( "@" word )?; | |
# Generic header value; mark is set on entry so the value can be sliced later.
hval = ( mUTF8 | LWS )* >mark; | |
schemec = alnum | "+" | "-" | "."; | |
scheme = alpha schemec*; | |
# A URI is only delimited here (scheme, colon, one or more uric); the actions
# AddrUri and RequestURI pass the bytes to ParseURI.
uric = reserved | unreserved | "%" | "[" | "]"; | |
uri = scheme ":" uric+; | |
# Quoted strings can have just about anything, including backslash escapes, | |
# which aren't quite as fancy as the ones you'd see in programming. | |
qdtextc = 0x21 | 0x23..0x5B | 0x5D..0x7E; | |
qdtext = UTF8_NONASCII | LWS_append | qdtextc @append; | |
# Only the character after the backslash is appended, so "\t" yields "t".
quoted_pair = "\\" ( 0x00..0x09 | 0x0B..0x0C | 0x0E..0x7F ) @append; | |
# The scratch buffer is reset when the quoted content begins.
quoted_content = ( qdtext | quoted_pair )* >start; | |
quoted_string = DQUOTE quoted_content DQUOTE; | |
# One or more tokens, each followed by linear whitespace.
unquoted_string = ( token LWS )+; | |
# Content Type Parsing | |
# | |
# This is easy-peasy. It almost always contains the value "application/sdp". | |
# We're going to ignore the parameters, because this information is actually | |
# stored in Msg by way of type interface and we don't support any types that | |
# take parameters. | |
# A parameter is matched but no action records it.
ctype_param = SEMI token EQUAL ( token | quoted_string ); | |
# "type/subtype"; the span is stored by the ContentType action.
ctype_mime = ( token "/" token ) >mark %ContentType; | |
# Content-Type value machine; returns to the header machine after the CRLF.
ctype := ctype_mime ctype_param* CRLF @goto_header; | |
# Parameter Parsing | |
# | |
# Parameters can be used by vias and addresses, but not URIs. They can look | |
# like=this or like="this". The =value part is optional. | |
# Parameter name: marked on entry, saved into 'name' when the token ends.
param_name = token >mark %name; | |
# Unquoted value: every character is copied into the scratch buffer.
param_content = tokenhost @append; | |
param_value = param_content | quoted_string; | |
# The scratch buffer is reset as the name begins, so a parameter without
# "=value" leaves it empty.
param = param_name >start (EQUAL param_value)?; | |
# Via Parsing | |
# | |
# Vias are used to trace SIP hops. It's similar to an address, but with simpler | |
# syntax. Here's some examples: | |
# | |
# - Via: SIP/2.0/UDP 1.2.3.4:5060;branch=z9hG4bK-d1d81e94a099 | |
# - Via: SIP/2.0/TLS [feed:a::bee] ;branch="z9hG4bK-doge" ;rport=666 | |
# | |
# Parsing these is kind of difficult because infinite whitespace is allowed | |
# between colons, semicolons, commas, and don't forget that lines can | |
# continue. So we're going to break things down into four separate machines | |
# that jump between each other. | |
# sent-protocol: three tokens separated by SLASH, each stored by its own action.
ViaProtocol = token >mark %ViaProtocol; | |
ViaVersion = token >mark %ViaVersion; | |
ViaTransport = token >mark %ViaTransport; | |
ViaSent = ViaProtocol SLASH ViaVersion SLASH ViaTransport; | |
# Host alternatives. For the bracketed IPv6 form the mark is set after "[" and
# the host is stored before "]", so the brackets are not part of via.Host.
ViaHostIPv4 = ipv4c+ >mark %ViaHost; | |
ViaHostIPv6 = "[" ipv6c+ >mark %ViaHost "]"; | |
ViaHostName = hostc+ >mark %ViaHost; | |
ViaHost = ViaHostIPv4 | ViaHostIPv6 | ViaHostName; | |
ViaPort = digit+ @ViaPort; | |
# What may follow a via parameter. In every case the parameter is stored first.
#   CRLF  - finish the Via and return to the header machine.
#   SEMI  - another parameter follows: reset the buffer and re-enter via_param.
#   COMMA - another Via follows: finish this one, allocate the next, enter via.
# "<: any ... @hold" consumes one lookahead character and then un-reads it so
# that the target machine starts on it.
via_param_end = CRLF @ViaParam @Via @goto_header | |
| SEMI <: any @ViaParam @hold @start @goto_via_param | |
| COMMA <: any @ViaParam @Via @ViaNew @hold @goto_via; | |
via_param := param via_param_end; | |
# The same three continuations for a Via with no parameter pending.
via_end = CRLF @Via @goto_header | |
| SEMI <: any @hold @start @goto_via_param | |
| COMMA <: any @Via @ViaNew @hold @goto_via; | |
# Via value machine: sent-protocol, whitespace, host, optional port.
via := ViaSent LWS ViaHost (COLON ViaPort)? via_end; | |
# Address Parsing | |
# | |
# These can come in the following forms, which can be comma-delimited: | |
# | |
# - Unangled: sip:lol.example;param | |
# - Angled: <sip:lol.example;param>;param | |
# - Unquoted: oh my goth <sip:boo@lol[feed:a::bee]:5060> | |
# - Quoted: "oh my \"goth\"" <sip:lol.example> | |
# | |
# In order to tell the unangled and unquoted angled forms apart, we need to | |
# look for ':' or '<' character and then backtrack to the appropriate machine. | |
# | |
# Because Addr and URI can both have parameters, one might wonder what happens
# to them in the unangled form. Are they owned by URI? Or are they owned by
# Addr? The answer is the latter.
# | |
# The URIs themselves are parsed by a separate routine. All we do here is | |
# extract the bytes and pass them along. It would be nice if we could put the | |
# URI parsing in this file, where the URI parsing is invoked by fcall. But | |
# that's not possible, because it appears Ragel Go is broken in that regard. | |
# "<" uri ">": the URI span is marked on entry and handed to AddrUri when it ends.
addr_spec = LAQUOT uri >mark %AddrUri RAQUOT; | |
# Display name, either quoted (collected in the scratch buffer) or unquoted
# (sliced from data with trailing whitespace removed by AddrUnquotedDisplay).
addr_display = quoted_string >start %AddrQuotedDisplay | |
| unquoted_string >mark %AddrUnquotedDisplay; | |
# What may follow an address parameter. The parameter is stored first, then:
#   CRLF  - finish the Addr and return to the header machine.
#   SEMI  - another parameter follows.
#   COMMA - finish the Addr; another address follows.
addr_param_end = CRLF @AddrParam @Addr @goto_header | |
| SEMI <: any @AddrParam @hold @goto_addr_param | |
| COMMA <: any @AddrParam @Addr @hold @goto_addr; | |
addr_param := param addr_param_end; | |
# The same three continuations after the closing ">" of an angled address.
addr_angled_end = CRLF @Addr @goto_header | |
| SEMI <: any @hold @goto_addr_param | |
| COMMA <: any @Addr @hold @goto_addr; | |
# Angled address machine: optional display name, then the bracketed URI.
addr_angled := addr_display? addr_spec addr_angled_end; | |
# What may follow an address written as a bare (unangled) URI:
#   CRLF  - finish the Addr and return to the header machine.
#   SEMI  - a parameter follows; it is handled by addr_param.
#   COMMA - finish the Addr; another address follows.
# The CRLF branch uses the finishing action @Addr, like addr_angled_end and
# addr_param_end. A leaving action (%Addr) would never run here, because
# @goto_header jumps away on the LF before any transition leaves the final
# state, and the address would not be linked into its list.
addr_uri_end = CRLF @Addr @goto_header
| SEMI <: any @hold @goto_addr_param
| COMMA <: any @Addr @hold @goto_addr;
# Bare-URI address machine; the URI span was marked by the 'addr' dispatcher.
# NOTE(review): `uri - ";"` only removes the one-character string ";" from the
# language of uri; it does not stop uric from matching ';'. Confirm the intent.
addr_uri := ( uri - ";" ) %AddrUri addr_uri_end;
# Address dispatcher. It decides which form is present and then restarts the
# matching machine on the same input:
#   - a leading "<" or '"' is un-read (hold) and addr_angled takes over;
#   - tokens followed by "<" are an unquoted display name: rewind to the mark
#     and enter addr_angled;
#   - a scheme followed by ":" is a bare URI: rewind to the mark and enter
#     addr_uri.
# A new Addr is allocated in every case before jumping.
addr := [<\"] @AddrNew @hold @goto_addr_angled | |
| unquoted_string >mark "<" @AddrNew @backtrack @goto_addr_angled | |
| scheme >mark ":" @AddrNew @backtrack @goto_addr_uri; | |
# Address Header Name Definitions | |
# | |
# Each of these headers sets addrp (via lastAddr) so that the 'addr' machine,
# which sheader jumps to, knows which Msg address list to link addresses into.
# Address-valued header names, matched case-insensitively, with compact forms
# "m", "f" and "t". When the name ends, addrp is set from lastAddr applied to
# the corresponding Msg field.
# NOTE(review): lastAddr is defined elsewhere; presumably it returns the tail
# link of the list so repeated headers append. Confirm.
aname = ("Contact"i | "m"i) %{addrp=lastAddr(&msg.Contact)} | |
| ("From"i | "f"i) %{addrp=lastAddr(&msg.From)} | |
| "P-Asserted-Identity"i %{addrp=lastAddr(&msg.PAssertedIdentity)} | |
| "Record-Route"i %{addrp=lastAddr(&msg.RecordRoute)} | |
| "Remote-Party-ID"i %{addrp=lastAddr(&msg.RemotePartyID)} | |
| "Route"i %{addrp=lastAddr(&msg.Route)} | |
| ("To"i | "t"i) %{addrp=lastAddr(&msg.To)} | |
; | |
# String Header Name Definitions | |
# | |
# These headers set the value pointer to tell the 'value' machine where to | |
# store the resulting token string. | |
# String-valued header names, matched case-insensitively. When a name ends,
# 'value' is pointed at the Msg field that the 'value' machine will fill in.
# Each compact form is attached to exactly one header: "u" belongs to
# Allow-Events only, because Allow has no compact form. Listing it on both
# alternatives made the result depend on the order in which the two leaving
# actions ran.
sname = "Accept"i %{value=&msg.Accept}
| ("Accept-Contact"i | "a"i) %{value=&msg.AcceptContact}
| "Accept-Encoding"i %{value=&msg.AcceptEncoding}
| "Accept-Language"i %{value=&msg.AcceptLanguage}
| "Allow"i %{value=&msg.Allow}
| ("Allow-Events"i | "u"i) %{value=&msg.AllowEvents}
| "Alert-Info"i %{value=&msg.AlertInfo}
| "Authentication-Info"i %{value=&msg.AuthenticationInfo}
| "Authorization"i %{value=&msg.Authorization}
| "Content-Disposition"i %{value=&msg.ContentDisposition}
| "Content-Language"i %{value=&msg.ContentLanguage}
| ("Content-Encoding"i | "e"i) %{value=&msg.ContentEncoding}
| "Call-Info"i %{value=&msg.CallInfo}
| "Date"i %{value=&msg.Date}
| "Error-Info"i %{value=&msg.ErrorInfo}
| ("Event"i | "o"i) %{value=&msg.Event}
| "In-Reply-To"i %{value=&msg.InReplyTo}
| "Reply-To"i %{value=&msg.ReplyTo}
| "MIME-Version"i %{value=&msg.MIMEVersion}
| "Organization"i %{value=&msg.Organization}
| "Priority"i %{value=&msg.Priority}
| "Proxy-Authenticate"i %{value=&msg.ProxyAuthenticate}
| "Proxy-Authorization"i %{value=&msg.ProxyAuthorization}
| "Proxy-Require"i %{value=&msg.ProxyRequire}
| ("Refer-To"i | "r"i) %{value=&msg.ReferTo}
| ("Referred-By"i | "b"i) %{value=&msg.ReferredBy}
| "Require"i %{value=&msg.Require}
| "Retry-After"i %{value=&msg.RetryAfter}
| "Server"i %{value=&msg.Server}
| ("Subject"i | "s"i) %{value=&msg.Subject}
| ("Supported"i | "k"i) %{value=&msg.Supported}
| "Timestamp"i %{value=&msg.Timestamp}
| "Unsupported"i %{value=&msg.Unsupported}
| "User-Agent"i %{value=&msg.UserAgent}
| "Warning"i %{value=&msg.Warning}
| "WWW-Authenticate"i %{value=&msg.WWWAuthenticate}
;
# Custom Header Definitions | |
# | |
# These headers do not jump to the 'value' machine, but instead specify | |
# their own special type of parsing. | |
# Headers whose values have their own grammar. Every name carries $!gxh so
# that a character which does not fit the name falls back to the xheader
# machine.
# Only Content-Length has the compact form "l". It used to be listed on
# Expires, Max-Forwards and Min-Expires as well, so a "l: 42" header also
# overwrote those three fields.
# Every numeric field is reset to zero when its first digit is seen, so a
# repeated header replaces the earlier value instead of extending its digits.
# CSeq now follows that rule too.
cheader = ("Call-ID"i | "i"i) $!gxh HCOLON cid >mark %CallID
| ("Content-Length"i | "l"i) $!gxh HCOLON digit+ >{clen=0} @ContentLength
| "CSeq"i $!gxh HCOLON (digit+ >{msg.CSeq=0} @CSeq) LWS token >mark %CSeqMethod
| "Expires"i $!gxh HCOLON digit+ >{msg.Expires=0} @Expires
| "Max-Forwards"i $!gxh HCOLON digit+ >{msg.MaxForwards=0} @MaxForwards
| "Min-Expires"i $!gxh HCOLON digit+ >{msg.MinExpires=0} @MinExpires
;
# Header Parsing | |
# | |
# The header machine parses a single header and then jumps to itself to | |
# loop. When the final CRLF is observed, we then break out of the Ragel | |
# parser and let the Go code handle payload extraction. | |
# | |
# Parsing standard header names is a prefix trie search in generated code. | |
# Lookahead to set the mark on the header name. In order to support | |
# extended headers, we'll use $!gxh to jump to the xheader machine when an | |
# unrecognized character is detected in the header name. | |
# | |
# An independent machine has been created for generic header values, so | |
# that it doesn't need to be duplicated for each leaf in the prefix | |
# trie. When the value machine has finished reading a value, it'll be | |
# parsed and stored based on whether the value/addr pointers are set. | |
# | |
# Header values can span multiple lines. Lookahead is used in the LWS | |
# definition to check for whitespace at the start of the next line upon | |
# encountering a line feed character, in order to determine if a line | |
# continuation is present. | |
# | |
# In order to concatenate across machines, we use lookahead in conjunction
# with the left-guarded concatenation operator. The pattern is written as
# follows: `foo <: any @hold @goto_bar`.
# | |
# Header names are case insensitive. Each recognized header is assigned to | |
# a specific field in the Msg data structure. Extended headers are stored | |
# to a linked list data structure with the casing preserved. This is so | |
# messages can be reproduced with roughly the same appearance. It is the | |
# responsibility of the person using Msg.Headers to do case-insensitive | |
# string comparisons. | |
# Generic value machine: read hval up to the terminating CRLF, store it with
# the 'value' action, then return to the header machine.
value := hval <: CRLF @value @goto_header; | |
# Extension header machine, entered through gxh. The name is saved when the
# token ends (the mark was set by the header machine), the value pointer is
# cleared so the 'value' action creates an XHeader, and the value machine
# takes over on the first character after the colon.
xheader := token %name HCOLON <: any @{value=nil} @hold @goto_value; | |
# Recognized headers. cheader values are parsed inline and must end in CRLF;
# the other alternatives match "name:" and jump to the machine for their
# value type, starting on the first character after the colon.
sheader = cheader <: CRLF @goto_header | |
| aname $!gxh HCOLON <: any @{value=nil} @hold @goto_addr | |
| sname $!gxh HCOLON <: any @hold @goto_value | |
| ("Via"i | "v"i) $!gxh HCOLON <: any @ViaNew @hold @goto_via | |
| ("Content-Type"i | "c"i) $!gxh HCOLON <: any @hold @goto_ctype; | |
# Entry point for each header line. A bare CRLF ends the header section and
# leaves the scanner (fbreak). Otherwise the first name character is marked
# and un-read so that sheader matches the whole name.
header := CRLF @break | |
| tokenc @mark @hold sheader; | |
# Start Line Parsing | |
# | |
# The Request and Response definitions are very straightforward, and the | |
# main machine is the union of the two. Once the line feed character has | |
# been observed, we then jump to the header machine. | |
# SIP Message Parsing | |
# Request method token; stored when the token ends.
Method = token >mark %Method; | |
# "major.minor", each side accumulated digit by digit.
SIPVersionNo = digit+ @VersionMajor "." digit+ @VersionMinor; | |
# Any run of non-space characters; the span is handed to ParseURI.
RequestURI = ^SP+ >mark %RequestURI; | |
# Exactly three digits.
StatusCode = ( digit @StatusCode ) {3}; | |
# Reason phrase collected in the scratch buffer, which is reset on entry.
ReasonPhrase = reasonmc+ >start %ReasonPhrase; | |
SIPVersion = "SIP/" SIPVersionNo; | |
# Both start lines jump to the header machine after their CRLF.
Request = Method SP RequestURI SP SIPVersion CRLF @goto_header; | |
Response = SIPVersion SP StatusCode SP ReasonPhrase CRLF @goto_header; | |
# NOTE(review): Message is only defined here (=), not instantiated (:=);
# presumably the instantiation lives in the file that includes this one.
Message = Request | Response; | |
}%% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment