Skip to content

Instantly share code, notes, and snippets.

@lostinplace
Created October 20, 2016 13:37
Show Gist options
  • Save lostinplace/05a1b09086bd8584f58ad85bb8117e99 to your computer and use it in GitHub Desktop.
Save lostinplace/05a1b09086bd8584f58ad85bb8117e99 to your computer and use it in GitHub Desktop.
package wikiparser
import scala.language.implicitConversions
object Xml {
import fastparse.all._
val digits = "0123456789"
val Digit = P( CharIn(digits) )
val hexDigits = digits + "abcdefABCDEF"
val HexDigit = P( CharIn(hexDigits) )
val HexNum = P( "0x" ~ CharsWhile(hexDigits.contains(_)) )
val WSChars = P( CharsWhile("\u0020\u0009".contains(_)) )
val Newline = P( StringIn("\r\n", "\n") )
val WS = P( NoCut(NoTrace((WSChars).rep)) )
val WL0 = P( NoTrace((WSChars | Newline).rep) )(sourcecode.Name("WL"))
val WL = P( NoCut(WL0) )
val XmlExpr = P( WL ~ Xml.XmlContent ~ (WL ~ Xml.Element).rep )
val XmlPattern = P( WL ~ Xml.ElemPattern )
object Xml{
val BaseChar = P(CharIn(
'\u0041' to '\u005A', '\u0061' to '\u007A', '\u00C0' to '\u00D6', '\u00D8' to '\u00F6',
'\u00F8' to '\u00FF', '\u0100' to '\u0131', '\u0134' to '\u013E', '\u0141' to '\u0148',
'\u014A' to '\u017E', '\u0180' to '\u01C3', '\u01CD' to '\u01F0', '\u01F4' to '\u01F5',
'\u01FA' to '\u0217', '\u0250' to '\u02A8', '\u02BB' to '\u02C1', "\u0386",
'\u0388' to '\u038A', "\u038C", '\u038E' to '\u03A1', '\u03A3' to '\u03CE',
'\u03D0' to '\u03D6', "\u03DA", "\u03DC", "\u03DE", "\u03E0", '\u03E2' to '\u03F3',
'\u0401' to '\u040C', '\u040E' to '\u044F', '\u0451' to '\u045C', '\u045E' to '\u0481',
'\u0490' to '\u04C4', '\u04C7' to '\u04C8', '\u04CB' to '\u04CC', '\u04D0' to '\u04EB',
'\u04EE' to '\u04F5', '\u04F8' to '\u04F9', '\u0531' to '\u0556', "\u0559",
'\u0561' to '\u0586', '\u05D0' to '\u05EA', '\u05F0' to '\u05F2', '\u0621' to '\u063A',
'\u0641' to '\u064A', '\u0671' to '\u06B7', '\u06BA' to '\u06BE', '\u06C0' to '\u06CE',
'\u06D0' to '\u06D3', "\u06D5", '\u06E5' to '\u06E6', '\u0905' to '\u0939', "\u093D",
'\u0958' to '\u0961', '\u0985' to '\u098C', '\u098F' to '\u0990', '\u0993' to '\u09A8',
'\u09AA' to '\u09B0', "\u09B2", '\u09B6' to '\u09B9', '\u09DC' to '\u09DD',
'\u09DF' to '\u09E1', '\u09F0' to '\u09F1', '\u0A05' to '\u0A0A', '\u0A0F' to '\u0A10',
'\u0A13' to '\u0A28', '\u0A2A' to '\u0A30', '\u0A32' to '\u0A33', '\u0A35' to '\u0A36',
'\u0A38' to '\u0A39', '\u0A59' to '\u0A5C', "\u0A5E", '\u0A72' to '\u0A74',
'\u0A85' to '\u0A8B', "\u0A8D", '\u0A8F' to '\u0A91', '\u0A93' to '\u0AA8',
'\u0AAA' to '\u0AB0', '\u0AB2' to '\u0AB3', '\u0AB5' to '\u0AB9', "\u0ABD", "\u0AE0",
'\u0B05' to '\u0B0C', '\u0B0F' to '\u0B10', '\u0B13' to '\u0B28', '\u0B2A' to '\u0B30',
'\u0B32' to '\u0B33', '\u0B36' to '\u0B39', "\u0B3D", '\u0B5C' to '\u0B5D',
'\u0B5F' to '\u0B61', '\u0B85' to '\u0B8A', '\u0B8E' to '\u0B90', '\u0B92' to '\u0B95',
'\u0B99' to '\u0B9A', "\u0B9C", '\u0B9E' to '\u0B9F', '\u0BA3' to '\u0BA4',
'\u0BA8' to '\u0BAA', '\u0BAE' to '\u0BB5', '\u0BB7' to '\u0BB9', '\u0C05' to '\u0C0C',
'\u0C0E' to '\u0C10', '\u0C12' to '\u0C28', '\u0C2A' to '\u0C33', '\u0C35' to '\u0C39',
'\u0C60' to '\u0C61', '\u0C85' to '\u0C8C', '\u0C8E' to '\u0C90', '\u0C92' to '\u0CA8',
'\u0CAA' to '\u0CB3', '\u0CB5' to '\u0CB9', "\u0CDE", '\u0CE0' to '\u0CE1',
'\u0D05' to '\u0D0C', '\u0D0E' to '\u0D10', '\u0D12' to '\u0D28', '\u0D2A' to '\u0D39',
'\u0D60' to '\u0D61', '\u0E01' to '\u0E2E', "\u0E30", '\u0E32' to '\u0E33',
'\u0E40' to '\u0E45', '\u0E81' to '\u0E82', "\u0E84", '\u0E87' to '\u0E88', "\u0E8A",
"\u0E8D", '\u0E94' to '\u0E97', '\u0E99' to '\u0E9F', '\u0EA1' to '\u0EA3', "\u0EA5",
"\u0EA7", '\u0EAA' to '\u0EAB', '\u0EAD' to '\u0EAE', "\u0EB0", '\u0EB2' to '\u0EB3',
"\u0EBD", '\u0EC0' to '\u0EC4', '\u0F40' to '\u0F47', '\u0F49' to '\u0F69',
'\u10A0' to '\u10C5', '\u10D0' to '\u10F6', "\u1100", '\u1102' to '\u1103',
'\u1105' to '\u1107', "\u1109", '\u110B' to '\u110C', '\u110E' to '\u1112',
"\u113C", "\u113E", "\u1140", "\u114C", "\u114E", "\u1150", '\u1154' to '\u1155',
"\u1159", '\u115F' to '\u1161', "\u1163", "\u1165", "\u1167", "\u1169",
'\u116D' to '\u116E', '\u1172' to '\u1173', "\u1175", "\u119E", "\u11A8", "\u11AB",
'\u11AE' to '\u11AF', '\u11B7' to '\u11B8', "\u11BA", '\u11BC' to '\u11C2', "\u11EB",
"\u11F0", "\u11F9", '\u1E00' to '\u1E9B', '\u1EA0' to '\u1EF9', '\u1F00' to '\u1F15',
'\u1F18' to '\u1F1D', '\u1F20' to '\u1F45', '\u1F48' to '\u1F4D', '\u1F50' to '\u1F57',
"\u1F59", "\u1F5B", "\u1F5D", '\u1F5F' to '\u1F7D', '\u1F80' to '\u1FB4',
'\u1FB6' to '\u1FBC', "\u1FBE", '\u1FC2' to '\u1FC4', '\u1FC6' to '\u1FCC',
'\u1FD0' to '\u1FD3', '\u1FD6' to '\u1FDB', '\u1FE0' to '\u1FEC', '\u1FF2' to '\u1FF4',
'\u1FF6' to '\u1FFC', "\u2126", '\u212A' to '\u212B', "\u212E", '\u2180' to '\u2182',
'\u3041' to '\u3094', '\u30A1' to '\u30FA', '\u3105' to '\u312C', '\uAC00' to '\uD7A3'
))
val Ideographic = P( CharIn(
'\u4E00' to '\u9FA5', "\u3007", '\u3021' to '\u3029'
))
val Eq = P (WL.? ~ "=" ~ WL.?)
val TagHeader = P( "<" ~ Name ~ (WL ~ Attribute).rep ~ WL.? )
val Element = P( TagHeader ~ (EmptyElemTagEnd | STagEnd ~ Content ~ ETag ) )
val EmptyElemTagEnd = P( "/>" )
val STagEnd = P( ">" )
val ETag = P( "</" ~ Name ~ WL.? ~ ">" )
val Content = P( (CharData | Content1).rep )
val Content1 = P( XmlContent | Reference )
val XmlContent: P0 = P( Element | CDSect | PI | Comment )
val CDSect = P( CDStart ~ CData ~ CDEnd )
val CDStart = P( "<![CDATA[" )
val CData = P( (!"]]>" ~ Char).rep )
val CDEnd = P( "]]>" )
val Attribute = P( Name ~ Eq ~ AttValue )
val AttValue = P(
"\"" ~ (CharQ | Reference).rep ~ "\"" |
"'" ~ (CharA | Reference).rep ~ "'"
)
val Comment = P( "<!--" ~ ((!"-" ~ Char) | ("-" ~ (!"-" ~ Char))).rep ~ "-->" )
val PI = P( "<?" ~ PITarget ~ (WL ~ (!"?>" ~ Char).rep).? ~ "?>" )
val PITarget = P( !(("X" | "x") ~ ("M" | "m") ~ ("L" | "l")) ~ Name )
val CharRef = P( "&#" ~ CharIn('0' to '9').rep(1) ~ ";" | "&#x" ~ HexNum ~ ";" )
val Reference = P( EntityRef | CharRef )
val EntityRef = P( "&" ~ Name ~ ";" )
val Char = P( AnyChar )
val CharData = P( (!("{" | "]]>" | CharRef) ~ Char1 | "{{").rep(1) )
val Char1 = P( !("<" | "&") ~ Char )
val CharQ = P( !"\"" ~ Char1 )
val CharA = P( !"'" ~ Char1 )
val CharB = P( !"{" ~ Char1 )
val Name = P( XNameStart ~ NameChar.rep )
val XNameStart = P( "_" | BaseChar | Ideographic )
val NameStartChar = P(CharIn(
":", 'A' to 'Z', "_", 'a' to 'z', '\u00C0' to '\u00D6', '\u00D8' to '\u00F6',
'\u00F8' to '\u02FF', '\u0370' to '\u037D', '\u037F' to '\u1FFF', '\u200C' to '\u200D',
'\u2070' to '\u218F', '\u2C00' to '\u2FEF', '\u3001' to '\uD7FF', '\uF900' to '\uFDCF',
'\uFDF0' to '\uFFFD' // ++ [#x10000-#xEFFFF] ???? don't chars max out at \uffff ????
))
val NameChar = P( NameStartChar | CharIn(
"-", ".", '0' to '9', "\u00B7", '\u0300' to '\u036F', '\u203F' to '\u2040'
))
val ElemPattern: P0 = P( TagPHeader ~ (EmptyElemTagPEnd | STagPEnd ~ ContentP ~ ETagP ))
val TagPHeader = P( "<" ~ Name ~ WL.? )
val EmptyElemTagPEnd = P( "/>" )
val STagPEnd = P( ">")
val ETagP = P( "</" ~ Name ~ WL.? ~ ">" )
val ContentP = P( CharData.? ~ (ElemPattern ~ CharData.?).rep )
val ContentP1 = P( ElemPattern | Reference | CDSect | PI | Comment )
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment