RomainTT/csv-schema.lark

## csv-schema.lark
// RULES
// -----

// Top-level rule: a prolog followed by the column rules.
// NOTE(review): no `start` rule is visible here, so presumably the parser is
// built with start="schema" - confirm against the calling code.
schema : prolog body
prolog : versiondecl globaldirectives
versiondecl : ("version 1.0" | "version 1.1" | "version 1.2")
// Every global directive is optional, but they are only accepted in this order.
// "noHeader" and "ignoreColumnNameCase" are mutually exclusive.
globaldirectives : separatordirective? quoteddirective? totalcolumnsdirective? permitemptydirective? (noheaderdirective | ignorecolumnnamecasedirective)?
// Prefix shared by every directive rule.
directiveprefix : "@"
// The separator is either the keyword "tab" / a tab character, or a quoted single character.
separatordirective : directiveprefix "separator" (separatortabexpr | separatorchar)
separatortabexpr : "tab" | "\t"
separatorchar : CHARACTERLITERAL
quoteddirective : directiveprefix "quoted"
totalcolumnsdirective : directiveprefix "totalColumns" POSITIVENONZEROINTEGERLITERAL
permitemptydirective : directiveprefix "permitEmpty"
noheaderdirective : directiveprefix "noHeader"
ignorecolumnnamecasedirective : directiveprefix "ignoreColumnNameCase"
// The body is one or more column definitions, each optionally surrounded by comments.
body : bodypart+
bodypart : comment* columndefinition comment*
comment : singlelinecomment | multilinecomment
// Two slashes followed by any run of non-whitespace characters, tabs and spaces.
singlelinecomment : /\/\/[\S\t ]*/
// Slash-star ... star-slash. The body may span several lines but may not
// contain an asterisk (the character class excludes it).
multilinecomment : /\/\*(?:[^*\r\n]+|(?:\r?\n))*\*\//
// A column definition is "<identifier> : <rule>"; the identifier may be bare or quoted.
columndefinition : (columnidentifier | quotedcolumnidentifier) ":" columnrule
columnidentifier : POSITIVENONZEROINTEGERLITERAL | IDENT
quotedcolumnidentifier : STRINGLITERAL
// Zero or more validation expressions, followed by the (all optional) column directives.
columnrule : columnvalidationexpr* columndirectives
// Column directives are optional but only accepted in this order.
columndirectives : optionaldirective? matchisfalsedirective? ignorecasedirective? warningdirective?
optionaldirective : directiveprefix "optional"
matchisfalsedirective : directiveprefix "matchIsFalse"
ignorecasedirective : directiveprefix "ignoreCase"
// "@warning" is the keyword used by the CSV Schema language; the previous
// spelling "@warningDirective" is still accepted for backward compatibility.
warningdirective : directiveprefix ("warning" | "warningDirective")
// A validation expression is either an "or"/"and" combination or a single operand.
columnvalidationexpr : combinatorialexpr
                     | noncombinatorialexpr
combinatorialexpr : orexpr
                  | andexpr
// Left operand is a noncombinatorialexpr; right operand is any validation expression.
orexpr : noncombinatorialexpr "or" columnvalidationexpr
andexpr : noncombinatorialexpr "and" columnvalidationexpr
noncombinatorialexpr : nonconditionalexpr
                     | conditionalexpr
nonconditionalexpr : singleexpr
                   | externalsingleexpr
                   | parenthesizedexpr
// One built-in check, optionally preceded by an explicit column context ("$column/").
singleexpr : explicitcontextexpr? (isexpr | anyexpr | notexpr | inexpr | startswithexpr | endswithexpr | regexpexpr | rangeexpr | lengthexpr | emptyexpr | notemptyexpr | uniqueexpr | uriexpr | xsddatetimeexpr | xsddatetimewithtimezoneexpr | xsddateexpr | xsdtimeexpr | ukdateexpr | dateexpr | partialukdateexpr | partialdateexpr | uuid4expr | positiveintegerexpr | uppercaseexpr | lowercaseexpr | identicalexpr)
// A column reference followed by "/".
explicitcontextexpr : columnref "/"
// "$" followed by a bare or quoted column identifier.
columnref : "$" (columnidentifier | quotedcolumnidentifier)
// Checks that take exactly one stringprovider argument.
isexpr : "is(" stringprovider ")"
anyexpr : "any(" stringprovider ")"
notexpr : "not(" stringprovider ")"
inexpr : "in(" stringprovider ")"
startswithexpr : "starts(" stringprovider ")"
endswithexpr : "ends(" stringprovider ")"
// regex accepts only a string literal, not a general stringprovider.
regexpexpr : "regex(" STRINGLITERAL ")"
// Two comma-separated bounds; at least one must be a numeric literal, the other may be "*".
rangeexpr : "range(" (numericorany "," NUMERICLITERAL | NUMERICLITERAL "," numericorany) ")"
numericorany : NUMERICLITERAL | WILDCARDLITERAL
// One or two arguments, each a non-negative integer or "*".
lengthexpr : "length(" (positiveintegerorany ",")? positiveintegerorany ")"
positiveintegerorany : POSITIVEINTEGERLITERAL | WILDCARDLITERAL
emptyexpr : "empty"
notemptyexpr : "notEmpty"
// "unique", optionally followed by a parenthesised list of one or more column references.
uniqueexpr : "unique" ("(" columnref ("," columnref)* ")")?
uriexpr : "uri"
// Date/time checks: each keyword may be followed by a parenthesised pair of
// literals of the matching kind.
xsddatetimeexpr : "xdatetime" ("(" XSDDATETIMELITERAL "," XSDDATETIMELITERAL ")")?
xsddatetimewithtimezoneexpr : "xdatetimetz" ("(" XSDDATETIMEWITHTIMEZONELITERAL "," XSDDATETIMEWITHTIMEZONELITERAL ")")?
xsddateexpr : "xdate" ("(" XSDDATELITERAL "," XSDDATELITERAL ")")?
xsdtimeexpr : "xtime" ("(" XSDTIMELITERAL "," XSDTIMELITERAL ")")?
ukdateexpr : "ukdate" ("(" UKDATELITERAL "," UKDATELITERAL ")")?
// date(...) takes three stringproviders and, optionally, two XSD date literals.
dateexpr : "date(" stringprovider "," stringprovider "," stringprovider ("," XSDDATELITERAL "," XSDDATELITERAL)? ")"

partialukdateexpr : "partukdate"
partialdateexpr : "partdate(" stringprovider "," stringprovider "," stringprovider ")"
uuid4expr : "uuid4"
positiveintegerexpr : "positiveinteger"
uppercaseexpr : "uppercase"
lowercaseexpr : "lowercase"
// Must use its own keyword: sharing "positiveinteger" with positiveintegerexpr
// would make the two rules indistinguishable and "identical" unparseable.
identicalexpr : "identical"
// One file-related check, optionally preceded by an explicit column context.
externalsingleexpr : explicitcontextexpr? (fileexistsexpr | integritycheckexpr | checksumexpr | filecountexpr)
// "fileexists", optionally with one parenthesised stringprovider.
fileexistsexpr : "fileexists" ("(" stringprovider ")")?
// Up to two optional stringprovider arguments, then a mandatory double-quoted
// "includefolder" or "excludefolder" keyword (the quotes are part of the literal).
integritycheckexpr : "integritycheck" "(" (stringprovider ",")? (stringprovider ",")? ("\"includefolder\"" | "\"excludefolder\"") ")"
// checksum(file(...), "<string literal>")
checksumexpr : "checksum(" fileexpr "," STRINGLITERAL ")"
// file(...) takes one or two stringprovider arguments.
fileexpr : "file(" (stringprovider ",")? stringprovider ")"
filecountexpr : "filecount(" fileexpr ")"
// Anything that yields a string: a column reference, a literal, or one of the
// string functions below (which nest, since their arguments are stringproviders).
stringprovider : columnref
               | STRINGLITERAL
               | concatexpr
               | noextexpr
               | uridecodeexpr
// concat needs at least two arguments.
concatexpr : "concat(" stringprovider ("," stringprovider)+ ")"
noextexpr : "noext(" stringprovider ")"
// uridecode takes one argument plus an optional second one.
uridecodeexpr : "uridecode(" stringprovider ("," stringprovider)? ")"
// One or more validation expressions wrapped in parentheses.
parenthesizedexpr : "(" columnvalidationexpr+ ")"
conditionalexpr : ifexpr
                | switchexpr
// if(condition, expressions [, expressions]); the condition cannot itself be conditional.
ifexpr : "if(" (combinatorialexpr | nonconditionalexpr) "," columnvalidationexpr+ ("," columnvalidationexpr+)? ")"
// switch(case+ [, expressions]); each case is written as a two-part "if(...)".
switchexpr : "switch(" switchcaseexpr+ ("," columnvalidationexpr+)? ")"
switchcaseexpr : "if(" (combinatorialexpr | nonconditionalexpr) "," columnvalidationexpr+ ")"


// TERMINALS
// ---------

// Date-time without a mandatory timezone: <date> T <time>[<timezone>].
// Both "T" and the historical lowercase "t" separator are accepted.
XSDDATETIMELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT ("T" | "t") XSDTIMELITERAL
// Date-time where the timezone is mandatory.
XSDDATETIMEWITHTIMEZONELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT ("T" | "t") XSDTIMEWITHOUTTIMEZONECOMPONENT XSDTIMEZONECOMPONENT
// The timezone is optional on plain dates and times; if it were required on
// XSDTIMELITERAL, XSDDATETIMELITERAL would be identical to the "with timezone" form.
XSDDATELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT XSDOPTIONALTIMEZONECOMPONENT?
XSDTIMELITERAL : XSDTIMEWITHOUTTIMEZONECOMPONENT XSDOPTIONALTIMEZONECOMPONENT?
// YYYY-MM-DD with the day range limited per month (February allows up to 29).
XSDDATEWITHOUTTIMEZONECOMPONENT : /[0-9]{4}-(((0(1|3|5|7|8)|1(0|2))-(0[1-9]|(1|2)[0-9]|3[0-1]))|((0(4|6|9)|11)-(0[1-9]|(1|2)[0-9]|30))|(02-(0[1-9]|(1|2)[0-9])))/
// hh:mm:ss with an optional three-digit fraction.
XSDTIMEWITHOUTTIMEZONECOMPONENT : /([0-1][0-9]|2[0-4]):(0[0-9]|[1-5][0-9]):(0[0-9]|[1-5][0-9])(\.[0-9]{3})?/
// Signed hh:mm offset, or the UTC designator (upper or lower case).
// The pattern itself never matches empty; optionality is applied where it is referenced.
XSDOPTIONALTIMEZONECOMPONENT : /((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|[Zz])/
XSDTIMEZONECOMPONENT : /((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|[Zz])/
// DD/MM/YYYY with the day range limited per month.
UKDATELITERAL : /(((0[1-9]|(1|2)[0-9]|3[0-1])\/(0(1|3|5|7|8)|1(0|2)))|((0[1-9]|(1|2)[0-9]|30)\/(0(4|6|9)|11))|((0[1-9]|(1|2)[0-9])\/02))\/[0-9]{4}/
// Integer with no leading zero, so the value is at least 1.
POSITIVENONZEROINTEGERLITERAL : /[1-9][0-9]*/
// Any run of digits (zero and leading zeros allowed).
POSITIVEINTEGERLITERAL : /[0-9]+/
// Unsigned integer or decimal.
NUMERICLITERAL : /[0-9]+(\.[0-9]+)?/
// Double-quoted, non-empty, with no way to embed a double quote.
STRINGLITERAL : "\"" /[^"]+/ "\""
// Exactly one character between single quotes (no CR, LF, form feed or quote).
CHARACTERLITERAL : "'" /[^\r\n\f']/ "'"
WILDCARDLITERAL : "*"
// Bare column identifier: letters of either case, digits, "-", "_" and ".".
IDENT : /[A-Za-z0-9\-_\.]+/

// LARK SPECIFIC
// -------------

// Take the whitespace terminals from Lark's `common` grammar...
%import common.NEWLINE
%import common.WS
// ...and have the lexer skip them wherever they appear.
%ignore NEWLINE
%ignore WS
	// RULES
	// -----

	// Entry rule: the prolog, then the column rules.
	// NOTE(review): there is no rule named `start` in view, so the parser is
	// presumably constructed with start="schema" - confirm with the caller.
	schema : prolog body
	prolog : versiondecl globaldirectives
	versiondecl : ("version 1.0" | "version 1.1" | "version 1.2")
	// All global directives are optional; when present they must follow this order,
	// and at most one of "noHeader" / "ignoreColumnNameCase" may be given.
	globaldirectives : separatordirective? quoteddirective? totalcolumnsdirective? permitemptydirective? (noheaderdirective | ignorecolumnnamecasedirective)?
	// Marker that introduces every directive.
	directiveprefix : "@"
	// Separator: the keyword "tab" / a tab character, or a single quoted character.
	separatordirective : directiveprefix "separator" (separatortabexpr | separatorchar)
	separatortabexpr : "tab" | "\t"
	separatorchar : CHARACTERLITERAL
	quoteddirective : directiveprefix "quoted"
	totalcolumnsdirective : directiveprefix "totalColumns" POSITIVENONZEROINTEGERLITERAL
	permitemptydirective : directiveprefix "permitEmpty"
	noheaderdirective : directiveprefix "noHeader"
	ignorecolumnnamecasedirective : directiveprefix "ignoreColumnNameCase"
	// One or more column definitions; comments may precede or follow each one.
	body : bodypart+
	bodypart : comment* columndefinition comment*
	comment : singlelinecomment | multilinecomment
	// Two slashes, then any mix of non-whitespace characters, tabs and spaces.
	singlelinecomment : /\/\/[\S\t ]*/
	// Slash-star ... star-slash, possibly spanning lines; the body cannot
	// contain an asterisk because the character class excludes it.
	multilinecomment : /\/\*(?:[^*\r\n]+|(?:\r?\n))*\*\//
	// "<identifier> : <rule>", where the identifier is bare or double-quoted.
	columndefinition : (columnidentifier | quotedcolumnidentifier) ":" columnrule
	columnidentifier : POSITIVENONZEROINTEGERLITERAL | IDENT
	quotedcolumnidentifier : STRINGLITERAL
	// Any number of validation expressions, then the optional column directives.
	columnrule : columnvalidationexpr* columndirectives
	// Each directive may be omitted, but the order is fixed.
	columndirectives : optionaldirective? matchisfalsedirective? ignorecasedirective? warningdirective?
	optionaldirective : directiveprefix "optional"
	matchisfalsedirective : directiveprefix "matchIsFalse"
	ignorecasedirective : directiveprefix "ignoreCase"
	// The CSV Schema keyword is "@warning"; the older "@warningDirective"
	// spelling remains accepted so existing schemas keep parsing.
	warningdirective : directiveprefix ("warning" | "warningDirective")
	// A validation expression is an "or"/"and" combination or a single operand.
	columnvalidationexpr : combinatorialexpr | noncombinatorialexpr
	combinatorialexpr : orexpr | andexpr
	// The left operand is a noncombinatorialexpr; the right one is any validation expression.
	orexpr : noncombinatorialexpr "or" columnvalidationexpr
	andexpr : noncombinatorialexpr "and" columnvalidationexpr
	noncombinatorialexpr : nonconditionalexpr | conditionalexpr
	nonconditionalexpr : singleexpr | externalsingleexpr | parenthesizedexpr
	// A single built-in check, optionally qualified with "$column/".
	singleexpr : explicitcontextexpr? (isexpr | anyexpr | notexpr | inexpr | startswithexpr | endswithexpr | regexpexpr | rangeexpr | lengthexpr | emptyexpr | notemptyexpr | uniqueexpr | uriexpr | xsddatetimeexpr | xsddatetimewithtimezoneexpr | xsddateexpr | xsdtimeexpr | ukdateexpr | dateexpr | partialukdateexpr | partialdateexpr | uuid4expr | positiveintegerexpr | uppercaseexpr | lowercaseexpr | identicalexpr)
	explicitcontextexpr : columnref "/"
	// "$" plus a bare or quoted column identifier.
	columnref : "$" (columnidentifier | quotedcolumnidentifier)
	// Checks with a single stringprovider argument.
	isexpr : "is(" stringprovider ")"
	anyexpr : "any(" stringprovider ")"
	notexpr : "not(" stringprovider ")"
	inexpr : "in(" stringprovider ")"
	startswithexpr : "starts(" stringprovider ")"
	endswithexpr : "ends(" stringprovider ")"
	// Only a string literal is allowed here, not an arbitrary stringprovider.
	regexpexpr : "regex(" STRINGLITERAL ")"
	// Two bounds separated by a comma; one may be "*", but not both.
	rangeexpr : "range(" (numericorany "," NUMERICLITERAL | NUMERICLITERAL "," numericorany) ")"
	numericorany : NUMERICLITERAL | WILDCARDLITERAL
	// One or two arguments, each a non-negative integer or "*".
	lengthexpr : "length(" (positiveintegerorany ",")? positiveintegerorany ")"
	positiveintegerorany : POSITIVEINTEGERLITERAL | WILDCARDLITERAL
	emptyexpr : "empty"
	notemptyexpr : "notEmpty"
	// Bare "unique", or "unique(" followed by one or more column references.
	uniqueexpr : "unique" ("(" columnref ("," columnref)* ")")?
	uriexpr : "uri"
	// Date/time checks; the optional argument list is a pair of literals of the same kind.
	xsddatetimeexpr : "xdatetime" ("(" XSDDATETIMELITERAL "," XSDDATETIMELITERAL ")")?
	xsddatetimewithtimezoneexpr : "xdatetimetz" ("(" XSDDATETIMEWITHTIMEZONELITERAL "," XSDDATETIMEWITHTIMEZONELITERAL ")")?
	xsddateexpr : "xdate" ("(" XSDDATELITERAL "," XSDDATELITERAL ")")?
	xsdtimeexpr : "xtime" ("(" XSDTIMELITERAL "," XSDTIMELITERAL ")")?
	ukdateexpr : "ukdate" ("(" UKDATELITERAL "," UKDATELITERAL ")")?
	// Three stringproviders, optionally followed by two XSD date literals.
	dateexpr : "date(" stringprovider "," stringprovider "," stringprovider ("," XSDDATELITERAL "," XSDDATELITERAL)? ")"

	partialukdateexpr : "partukdate"
	partialdateexpr : "partdate(" stringprovider "," stringprovider "," stringprovider ")"
	uuid4expr : "uuid4"
	positiveintegerexpr : "positiveinteger"
	uppercaseexpr : "uppercase"
	lowercaseexpr : "lowercase"
	// Needs a keyword of its own: reusing "positiveinteger" would clash with
	// positiveintegerexpr and leave "identical" impossible to parse.
	identicalexpr : "identical"
	// A file-related check, optionally qualified with "$column/".
	externalsingleexpr : explicitcontextexpr? (fileexistsexpr | integritycheckexpr | checksumexpr | filecountexpr)
	fileexistsexpr : "fileexists" ("(" stringprovider ")")?
	// Zero to two stringprovider arguments, then a required double-quoted
	// "includefolder" / "excludefolder" keyword (quotes are part of the literal).
	integritycheckexpr : "integritycheck" "(" (stringprovider ",")? (stringprovider ",")? ("\"includefolder\"" | "\"excludefolder\"") ")"
	checksumexpr : "checksum(" fileexpr "," STRINGLITERAL ")"
	// One or two stringprovider arguments.
	fileexpr : "file(" (stringprovider ",")? stringprovider ")"
	filecountexpr : "filecount(" fileexpr ")"
	// String sources: column reference, literal, or a (nestable) string function.
	stringprovider : columnref | STRINGLITERAL | concatexpr | noextexpr | uridecodeexpr
	// At least two arguments.
	concatexpr : "concat(" stringprovider ("," stringprovider)+ ")"
	noextexpr : "noext(" stringprovider ")"
	// One argument, plus an optional second one.
	uridecodeexpr : "uridecode(" stringprovider ("," stringprovider)? ")"
	parenthesizedexpr : "(" columnvalidationexpr+ ")"
	conditionalexpr : ifexpr | switchexpr
	// if(condition, expressions [, expressions]); the condition may not be a conditional itself.
	ifexpr : "if(" (combinatorialexpr | nonconditionalexpr) "," columnvalidationexpr+ ("," columnvalidationexpr+)? ")"
	// switch(case+ [, expressions]); every case is a two-part "if(...)".
	switchexpr : "switch(" switchcaseexpr+ ("," columnvalidationexpr+)? ")"
	switchcaseexpr : "if(" (combinatorialexpr | nonconditionalexpr) "," columnvalidationexpr+ ")"


	// TERMINALS
	// ---------

	// <date> T <time>[<timezone>]; the separator is accepted in either case.
	XSDDATETIMELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT ("T" | "t") XSDTIMELITERAL
	// Same layout, but the timezone must be present.
	XSDDATETIMEWITHTIMEZONELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT ("T" | "t") XSDTIMEWITHOUTTIMEZONECOMPONENT XSDTIMEZONECOMPONENT
	// Plain dates and times take an optional timezone; a mandatory one on
	// XSDTIMELITERAL would make XSDDATETIMELITERAL equal to the timezone-required form.
	XSDDATELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT XSDOPTIONALTIMEZONECOMPONENT?
	XSDTIMELITERAL : XSDTIMEWITHOUTTIMEZONECOMPONENT XSDOPTIONALTIMEZONECOMPONENT?
	// YYYY-MM-DD; the allowed day range depends on the month (February up to 29).
	XSDDATEWITHOUTTIMEZONECOMPONENT : /[0-9]{4}-(((0(1|3|5|7|8)|1(0|2))-(0[1-9]|(1|2)[0-9]|3[0-1]))|((0(4|6|9)|11)-(0[1-9]|(1|2)[0-9]|30))|(02-(0[1-9]|(1|2)[0-9])))/
	// hh:mm:ss, optionally followed by ".fff".
	XSDTIMEWITHOUTTIMEZONECOMPONENT : /([0-1][0-9]|2[0-4]):(0[0-9]|[1-5][0-9]):(0[0-9]|[1-5][0-9])(\.[0-9]{3})?/
	// Signed hh:mm offset or the UTC designator in either case. The pattern never
	// matches empty text; optionality is expressed at the point of use.
	XSDOPTIONALTIMEZONECOMPONENT : /((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|[Zz])/
	XSDTIMEZONECOMPONENT : /((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|[Zz])/
	// DD/MM/YYYY; the allowed day range depends on the month.
	UKDATELITERAL : /(((0[1-9]|(1|2)[0-9]|3[0-1])\/(0(1|3|5|7|8)|1(0|2)))|((0[1-9]|(1|2)[0-9]|30)\/(0(4|6|9)|11))|((0[1-9]|(1|2)[0-9])\/02))\/[0-9]{4}/
	// Digits with no leading zero, i.e. an integer of at least 1.
	POSITIVENONZEROINTEGERLITERAL : /[1-9][0-9]*/
	// One or more digits; zero and leading zeros are permitted.
	POSITIVEINTEGERLITERAL : /[0-9]+/
	// Unsigned integer or decimal number.
	NUMERICLITERAL : /[0-9]+(\.[0-9]+)?/
	// Non-empty text in double quotes; a double quote cannot be embedded.
	STRINGLITERAL : "\"" /[^"]+/ "\""
	// A single character in single quotes, excluding CR, LF, form feed and the quote itself.
	CHARACTERLITERAL : "'" /[^\r\n\f']/ "'"
	WILDCARDLITERAL : "*"
	// Bare column identifier: upper- or lower-case letters, digits, "-", "_" and ".".
	IDENT : /[A-Za-z0-9\-_\.]+/

	// LARK SPECIFIC
	// -------------

	// Whitespace terminals come from Lark's `common` grammar.
	%import common.NEWLINE
	%import common.WS
	// Both are skipped by the lexer rather than passed to the rules.
	%ignore NEWLINE
	%ignore WS