Skip to content

Instantly share code, notes, and snippets.

@mattwarren
Last active August 29, 2015 14:00
Show Gist options
  • Save mattwarren/11398785 to your computer and use it in GitHub Desktop.
Save mattwarren/11398785 to your computer and use it in GitHub Desktop.
using System.Globalization;
using Irony.Parsing;
namespace Grammar
{
    /// <summary>
    /// Irony grammar for a subset of the Lucene query syntax: terms, quoted phrases,
    /// field-qualified terms, ranges, required (+) / prohibited (-) terms, parenthesised
    /// groups and the AND / OR operators (with an implied operator between adjacent clauses).
    /// </summary>
    [Language("LuceneGrammar", "1.0", "Lucene Grammar")]
    public class LuceneGrammar : Irony.Parsing.Grammar
    {
        /// <summary>
        /// Builds the terminals, non-terminals and production rules, then registers
        /// operator precedence, punctuation, brace pairs and reserved words.
        /// </summary>
        public LuceneGrammar()
            : base(true) // "true" selects a case-sensitive grammar
        {
            GrammarComments =
                "Implementation of the Lucene Query Syntax\r\n" +
                "See http://lucene.apache.org/core/2_9_4/queryparsersyntax.html \r\n" +
                "and https://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html \r\n" +
                "and http://lucene.apache.org/core/3_4_0/api/all/org/apache/lucene/queryParser/standard/StandardQueryParser.html \r\n" +
                "and http://umbracosearchtools.codeplex.com/SourceControl/changeset/view/15506#48328";

            // ---------------------------------------------------------------
            // Terminals
            // ---------------------------------------------------------------

            // A double-quoted phrase; escape sequences are not interpreted.
            // NOTE(review): the terminal name "StringLiternal" looks like a typo for
            // "StringLiteral". It is kept unchanged because parse-tree consumers may
            // match on it - confirm before renaming.
            var quotedPhrase = new StringLiteral("StringLiternal", "\"", StringOptions.NoEscapes);

            // A bare word: one or more characters that are either outside the excluded
            // set (: " + - whitespace ~ [ ] { } ( ) ^) or one of the backslash-escaped
            // forms \^ \{ \[ \~ \" \:
            var textValue = new RegexBasedTerminal("TextValue", @"([^:""+\-\s~\[\]\{\}\(\)\^]" +
                                                                @"|\\\^|\\{|\\\[|\\~|\\""|\\:)+");

            // A field prefix such as "title:" (or "*:"); the trailing colon is part of the token.
            var fieldName = new RegexBasedTerminal("FieldName", @"([\w][\w\d,_\-\.]*|\*)\s*:");

            // Lucene allows the special characters of the query syntax to be escaped.
            // The current list of special characters is:
            //     + - && || ! ( ) { } [ ] ^ " ~ * ? : \
            // Put a \ in front of the character to escape it, e.g. to search for (1+1):2 use
            //     \(1\+1\)\:2

            // In Lucene "OR" is the default operator when none is written between two clauses.
            var impliedOr = new ImpliedSymbolTerminal("ImpliedOr");

            // ---------------------------------------------------------------
            // Non-terminals
            // ---------------------------------------------------------------
            var query = new NonTerminal("Query");
            var binaryExpression = new NonTerminal("BinaryExpression");
            var binaryOp = new NonTerminal("BinaryOp");
            var clause = new NonTerminal("Clause");
            var subClause = new NonTerminal("SubClause");
            var term = new NonTerminal("Term");
            var qualifiedTerm = new NonTerminal("QualifiedTerm");
            var unqualifiedTerm = new NonTerminal("UnqualifiedTerm");
            var rangeTerm = new NonTerminal("Range");
            var openRange = new NonTerminal("OpenRange");
            var closeRange = new NonTerminal("CloseRange");
            var requiredTerm = new NonTerminal("Required");
            var prohibitedTerm = new NonTerminal("Prohibited");

            // A query is made of terms and operators. There are two kinds of terms:
            //   - a Single Term is one word, such as "test" or "hello";
            //   - a Phrase is a group of words in double quotes, such as "hello dolly".
            // Terms can be combined with Boolean operators to build more complex queries.
            //
            // Reference grammar this implementation was modelled on:
            //   Query           -> Clause (And Clause | Or Clause | NotClause | Clause)*;
            //   NotClause       -> Not Clause;
            //   Clause          -> (SubClause | Term);
            //   SubClause       -> (PLUS Query) | (MINUS Query) | (OPEN_PAREN Query CLOSE_PAREN);
            //   Term            -> Range | QualifiedTerm | UnqualifiedTerm;
            //   QualifiedTerm   -> FIELD_NAME ( Range | TEXT_VALUE | STRING_LITERAL );
            //   Range           -> OPEN_SQUARE UnqualifiedTerm TO UnqualifiedTerm CLOSE_SQUARE;
            //   UnqualifiedTerm -> (STRING_LITERAL | TEXT_VALUE) Fuzzy? Boost? ;

            // ---------------------------------------------------------------
            // Production rules
            // ---------------------------------------------------------------
            Root = query;

            // Every clause, including the first one, is preceded by a binary operator;
            // the implied operator fills that slot when none is written.
            // (Alternative once considered: Clause + MakePlusRule(Query, BinaryExpression).)
            query.Rule = MakeStarRule(query, binaryExpression);
            binaryExpression.Rule = binaryOp + clause;
            binaryOp.Rule = impliedOr | "AND" | "&&" | "OR" | "||";
            clause.Rule = subClause | term;

            requiredTerm.Rule = ToTerm("+", "Plus") + term;
            prohibitedTerm.Rule = ToTerm("-", "Minus") + term;

            // A sub-clause is a +term, a -term, a parenthesised query,
            // or a field name applied to a parenthesised query.
            subClause.Rule = requiredTerm
                             | prohibitedTerm
                             | ("(" + query + ")")
                             | (fieldName + "(" + query + ")");

            // A bare Range is intentionally not an alternative of Term here;
            // ranges are only reachable through QualifiedTerm.
            term.Rule = qualifiedTerm | unqualifiedTerm;
            qualifiedTerm.Rule = fieldName + (rangeTerm | quotedPhrase | textValue);

            // Fuzzy and Boost suffixes (both optional in Lucene) are not implemented yet.
            unqualifiedTerm.Rule = quotedPhrase | textValue;

            openRange.Rule = ToTerm("[") | "{";
            closeRange.Rule = ToTerm("]") | "}";
            rangeTerm.Rule = openRange + unqualifiedTerm + "TO" + unqualifiedTerm + closeRange;

            // ---------------------------------------------------------------
            // Parser configuration
            // ---------------------------------------------------------------

            // Transient nodes are states the parser still goes through but which do not
            // appear in the resulting parse tree. SubClause and QualifiedTerm are
            // deliberately left out: marking QualifiedTerm transient produced parser
            // language errors.
            MarkTransient(clause, term, binaryOp, unqualifiedTerm);
            MarkPunctuation("(", ")");

            // OR is registered at precedence 10; AND and the implied operator both at 20.
            RegisterOperators(10, "OR", "||");
            RegisterOperators(20, "AND", "&&");
            RegisterOperators(20, impliedOr);

            // Registering brace pairs improves error reporting. The range brackets
            // ('[' and '{') are NOT registered because they do not always balance.
            RegisterBracePair("(", ")");

            // ImpliedOr is not a real symbol, so never report it as an expected token...
            AddToNoReportGroup(impliedOr);
            // ...and do not report the parentheses as expected either.
            AddToNoReportGroup("(", ")");

            MarkReservedWords("AND", "OR", "NOT", "TO", "&&", "||", "+", "-");
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment