Skip to content

Instantly share code, notes, and snippets.

@RomainTT
Last active December 31, 2021 11:21
Show Gist options
  • Save RomainTT/93f7190ce925f00d439f1465b944cb0c to your computer and use it in GitHub Desktop.
Save RomainTT/93f7190ce925f00d439f1465b944cb0c to your computer and use it in GitHub Desktop.
Lark file for CSV schema v1.2
// RULES // ----- schema : prolog body
prolog : versiondecl globaldirectives versiondecl : ("version 1.0" | "version 1.1" | "version 1.2")
globaldirectives : separatordirective? quoteddirective? totalcolumnsdirective? permitemptydirective? (noheaderdirective | ignorecolumnnamecasedirective)? directiveprefix : "@"
separatordirective : directiveprefix "separator" (separatortabexpr | separatorchar)
separatortabexpr : "tab" | "\t"
separatorchar : CHARACTERLITERAL
quoteddirective : directiveprefix "quoted"
totalcolumnsdirective : directiveprefix "totalColumns" POSITIVENONZEROINTEGERLITERAL
permitemptydirective : directiveprefix "permitEmpty"
noheaderdirective : directiveprefix "noHeader"
ignorecolumnnamecasedirective : directiveprefix "ignoreColumnNameCase"
body : bodypart+
bodypart : comment* columndefinition comment*
comment : singlelinecomment | multilinecomment
singlelinecomment : /\/\/[\S\t ]*/
multilinecomment : /\\\*(?:[^*\r\n]+|(?:\r?\n))*\\\*/
columndefinition : (columnidentifier | quotedcolumnidentifier) ":" columnrule
columnidentifier : POSITIVENONZEROINTEGERLITERAL | IDENT
quotedcolumnidentifier : STRINGLITERAL
columnrule : columnvalidationexpr* columndirectives
columndirectives : optionaldirective? matchisfalsedirective? ignorecasedirective? warningdirective?
optionaldirective : directiveprefix "optional"
matchisfalsedirective : directiveprefix "matchIsFalse"
ignorecasedirective : directiveprefix "ignoreCase"
warningdirective : directiveprefix "warningDirective"
columnvalidationexpr : combinatorialexpr | noncombinatorialexpr
combinatorialexpr : orexpr | andexpr
orexpr : noncombinatorialexpr "or" columnvalidationexpr
andexpr : noncombinatorialexpr "and" columnvalidationexpr
noncombinatorialexpr : nonconditionalexpr | conditionalexpr
nonconditionalexpr : singleexpr | externalsingleexpr | parenthesizedexpr
singleexpr : explicitcontextexpr? (isexpr | anyexpr | notexpr | inexpr | startswithexpr | endswithexpr | regexpexpr | rangeexpr | lengthexpr | emptyexpr | notemptyexpr | uniqueexpr | uriexpr | xsddatetimeexpr | xsddatetimewithtimezoneexpr | xsddateexpr | xsdtimeexpr | ukdateexpr | dateexpr | partialukdateexpr | partialdateexpr | uuid4expr | positiveintegerexpr | uppercaseexpr | lowercaseexpr | identicalexpr)
explicitcontextexpr : columnref "/"
columnref : "$" (columnidentifier | quotedcolumnidentifier)
isexpr : "is(" stringprovider ")"
anyexpr : "any(" stringprovider ")"
notexpr : "not(" stringprovider ")"
inexpr : "in(" stringprovider ")"
startswithexpr : "starts(" stringprovider ")"
endswithexpr : "ends(" stringprovider ")"
regexpexpr : "regex(" STRINGLITERAL ")"
rangeexpr : "range(" (numericorany "," NUMERICLITERAL | NUMERICLITERAL "," numericorany) ")"
numericorany : NUMERICLITERAL | WILDCARDLITERAL
lengthexpr : "length(" (positiveintegerorany ",")? positiveintegerorany ")"
positiveintegerorany : POSITIVEINTEGERLITERAL | WILDCARDLITERAL
emptyexpr : "empty"
notemptyexpr : "notEmpty"
uniqueexpr : "unique" ("(" columnref ("," columnref)* ")")?
uriexpr : "uri"
xsddatetimeexpr : "xdatetime" ("(" XSDDATETIMELITERAL "," XSDDATETIMELITERAL ")")?
xsddatetimewithtimezoneexpr : "xdatetimetz" ("(" XSDDATETIMEWITHTIMEZONELITERAL "," XSDDATETIMEWITHTIMEZONELITERAL ")")?
xsddateexpr : "xdate" ("(" XSDDATELITERAL "," XSDDATELITERAL ")")?
xsdtimeexpr : "xtime" ("(" XSDTIMELITERAL "," XSDTIMELITERAL ")")?
ukdateexpr : "ukdate" ("(" UKDATELITERAL "," UKDATELITERAL ")")?
dateexpr : "date(" stringprovider "," stringprovider "," stringprovider ("," XSDDATELITERAL "," XSDDATELITERAL)? ")"
partialukdateexpr : "partukdate"
partialdateexpr : "partdate(" stringprovider "," stringprovider "," stringprovider ")"
uuid4expr : "uuid4"
positiveintegerexpr : "positiveinteger"
uppercaseexpr : "uppercase"
lowercaseexpr : "lowercase"
identicalexpr : "positiveinteger"
externalsingleexpr : explicitcontextexpr? (fileexistsexpr | integritycheckexpr | checksumexpr | filecountexpr)
fileexistsexpr : "fileexists" ("(" stringprovider ")")?
integritycheckexpr : "integritycheck" "(" (stringprovider ",")? (stringprovider ",")? ("\"includefolder\"" | "\"excludefolder\"") ")"
checksumexpr : "checksum(" fileexpr "," STRINGLITERAL ")"
fileexpr : "file(" (stringprovider ",")? stringprovider ")"
filecountexpr : "filecount(" fileexpr ")"
stringprovider : columnref | STRINGLITERAL | concatexpr | noextexpr | uridecodeexpr
concatexpr : "concat(" stringprovider ("," stringprovider)+ ")"
noextexpr : "noext(" stringprovider ")"
uridecodeexpr : "uridecode(" stringprovider ("," stringprovider)? ")"
parenthesizedexpr : "(" columnvalidationexpr+ ")"
conditionalexpr : ifexpr | switchexpr
ifexpr : "if(" (combinatorialexpr | nonconditionalexpr) "," columnvalidationexpr+ ("," columnvalidationexpr+)? ")"
switchexpr : "switch(" switchcaseexpr+ ("," columnvalidationexpr+)? ")"
switchcaseexpr : "if(" (combinatorialexpr | nonconditionalexpr) "," columnvalidationexpr+ ")"
// TERMINALS
// ---------
XSDDATETIMELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT "t" XSDTIMELITERAL
XSDDATETIMEWITHTIMEZONELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT "t" XSDTIMEWITHOUTTIMEZONECOMPONENT XSDTIMEZONECOMPONENT
XSDDATELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT XSDOPTIONALTIMEZONECOMPONENT
XSDTIMELITERAL : XSDTIMEWITHOUTTIMEZONECOMPONENT XSDTIMEZONECOMPONENT
XSDDATEWITHOUTTIMEZONECOMPONENT : /[0-9]{4}-(((0(1|3|5|7|8)|1(0|2))-(0[1-9]|(1|2)[0-9]|3[0-1]))|((0(4|6|9)|11)-(0[1-9]|(1|2)[0-9]|30))|(02-(0[1-9]|(1|2)[0-9])))/
XSDTIMEWITHOUTTIMEZONECOMPONENT : /([0-1][0-9]|2[0-4]):(0[0-9]|[1-5][0-9]):(0[0-9]|[1-5][0-9])(\.[0-9]{3})?/
XSDOPTIONALTIMEZONECOMPONENT : /((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|z)/
XSDTIMEZONECOMPONENT : /((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|z)/
UKDATELITERAL : /(((0[1-9]|(1|2)[0-9]|3[0-1])\/(0(1|3|5|7|8)|1(0|2)))|((0[1-9]|(1|2)[0-9]|30)\/(0(4|6|9)|11))|((0[1-9]|(1|2)[0-9])\/02))\/[0-9]{4}/
POSITIVENONZEROINTEGERLITERAL : /[1-9][0-9]*/
POSITIVEINTEGERLITERAL : /[0-9]+/
NUMERICLITERAL : /[0-9]+(\.[0-9]+)?/
STRINGLITERAL : "\"" /[^"]+/ "\""
CHARACTERLITERAL : "'" /[^\r\n\f']/ "'"
WILDCARDLITERAL : "*"
IDENT : /[a-za-z0-9\-_\.]+/
// LARK SPECIFIC
// -------------
%import common.NEWLINE
%import common.WS
%ignore NEWLINE
%ignore WS
@RomainTT
Copy link
Author

RomainTT commented Dec 31, 2021

Original grammar is found here.

Lark is found here

Here are the changes between the original grammar and this Lark file:

  • rules are lower case only
  • terminals are upper case only
  • regular expressions are between /…/
  • Dynamic Earley doesn't allow zero-width regexps, therefore some * have been changed to +
  • Newlines and spaces must be ignored, this is done by %import and %ignore statements
  • A terminal beginning with - is not valid with Lark. These have been removed.
  • Terminals between ?…? are not understood by Lark. In case of regexp, it has been replaced by /…/, otherwise removed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment