Created: January 24, 2012 11:46
RavenDB NGram Search
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
using Raven.Abstractions.Indexing;
using Raven.Client.Linq;
using Raven.Database.Indexing;
using Xunit;

namespace Raven.Tests.Bugs
{
    public class NGramSearch : RavenTest
    {
        public class Image
        {
            public string Id { get; set; }
            public string Name { get; set; }
            public ICollection<string> Users { get; set; }
            public ICollection<string> Tags { get; set; }
        }
        public class NGramTokenFilter : TokenFilter
        {
            public static int DEFAULT_MIN_NGRAM_SIZE = 1;
            public static int DEFAULT_MAX_NGRAM_SIZE = 2;

            private int minGram, maxGram;
            private char[] curTermBuffer;
            private int curTermLength;
            private int curGramSize;
            private int curPos;
            private int tokStart;
            private TermAttribute termAtt;
            private OffsetAttribute offsetAtt;

            /// <summary>
            /// Creates an NGramTokenFilter with the given min and max n-gram sizes.
            /// </summary>
            /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
            /// <param name="minGram">the smallest n-gram to generate</param>
            /// <param name="maxGram">the largest n-gram to generate</param>
            public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
                : base(input)
            {
                if (minGram < 1)
                {
                    throw new System.ArgumentException("minGram must be greater than zero");
                }
                if (minGram > maxGram)
                {
                    throw new System.ArgumentException("minGram must not be greater than maxGram");
                }
                this.minGram = minGram;
                this.maxGram = maxGram;
                this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
                this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
            }

            /// <summary>
            /// Creates an NGramTokenFilter with the default min and max n-gram sizes.
            /// </summary>
            /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
            public NGramTokenFilter(TokenStream input)
                : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
            {
            }

            /// <summary>
            /// Advances to the next n-gram of the current input token;
            /// returns false once the underlying stream is exhausted.
            /// </summary>
            public override bool IncrementToken()
            {
                while (true)
                {
                    if (curTermBuffer == null)
                    {
                        if (!input.IncrementToken())
                        {
                            return false;
                        }
                        else
                        {
                            curTermBuffer = (char[])termAtt.TermBuffer().Clone();
                            curTermLength = termAtt.TermLength();
                            curGramSize = minGram;
                            curPos = 0;
                            tokStart = offsetAtt.StartOffset();
                        }
                    }
                    while (curGramSize <= maxGram)
                    {
                        while (curPos + curGramSize <= curTermLength)
                        { // while there is input
                            ClearAttributes();
                            termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize);
                            offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                            curPos++;
                            return true;
                        }
                        curGramSize++; // increase n-gram size
                        curPos = 0;
                    }
                    curTermBuffer = null;
                }
            }

            /// <summary>
            /// Deprecated: will be removed in Lucene 3.0. This method is final, as it should
            /// not be overridden. Delegates to the backwards compatibility layer.
            /// </summary>
            [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
            public override Token Next(Token reusableToken)
            {
                return base.Next(reusableToken);
            }

            /// <summary>
            /// Deprecated: will be removed in Lucene 3.0. This method is final, as it should
            /// not be overridden. Delegates to the backwards compatibility layer.
            /// </summary>
            [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
            public override Token Next()
            {
                return base.Next();
            }

            public override void Reset()
            {
                base.Reset();
                curTermBuffer = null;
            }
        }
        [NotForQuerying]
        public class NGramAnalyzer : Analyzer
        {
            public override TokenStream TokenStream(string fieldName, TextReader reader)
            {
                // Standard tokenization, lower-casing and stop-word removal,
                // then emit every 2-6 character n-gram of each remaining token.
                var tokenizer = new StandardTokenizer(Version.LUCENE_29, reader);
                tokenizer.SetMaxTokenLength(255);
                TokenStream filter = new StandardFilter(tokenizer);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(false, filter, StandardAnalyzer.STOP_WORDS_SET);
                return new NGramTokenFilter(filter, 2, 6);
            }
        }
        [Fact]
        public void Can_search_inner_words()
        {
            using (var store = NewDocumentStore())
            {
                using (var session = store.OpenSession())
                {
                    session.Store(new Image { Id = "1", Name = "Great Photo buddy" });
                    session.Store(new Image { Id = "2", Name = "Nice Photo of the sky" });
                    session.SaveChanges();
                }

                store.DatabaseCommands.PutIndex("test", new IndexDefinition
                {
                    Map = "from doc in docs.Images select new { doc.Name }",
                    Indexes =
                    {
                        { "Name", FieldIndexing.Analyzed }
                    },
                    Analyzers =
                    {
                        { "Name", typeof(NGramAnalyzer).AssemblyQualifiedName }
                    }
                });

                using (var session = store.OpenSession())
                {
                    var images = session.Query<Image>("test")
                        .Customize(x => x.WaitForNonStaleResults())
                        .OrderBy(x => x.Name)
                        .Search(x => x.Name, "phot")
                        .ToList();

                    WaitForUserToContinueTheTest(store);

                    Assert.NotEmpty(images);
                }
            }
        }
    }
}
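
For reference, here is a minimal sketch (not part of the original gist) that prints the terms NGramAnalyzer emits for a sample value, using only the Lucene.Net 2.9 calls the gist itself relies on (TokenStream, AddAttribute, IncrementToken, TermBuffer/TermLength). The NGramAnalyzerDemo class, the field name "Name" and the sample text "Photo" are illustrative assumptions, not part of the original code.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

namespace Raven.Tests.Bugs
{
    public static class NGramAnalyzerDemo
    {
        public static void Main()
        {
            // Hypothetical demo: enumerate the terms produced by the gist's NGramAnalyzer.
            var analyzer = new NGramSearch.NGramAnalyzer();
            TokenStream stream = analyzer.TokenStream("Name", new StringReader("Photo"));
            var term = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

            // Expected terms are every lower-cased 2-6 character n-gram of "photo":
            // "ph", "ho", "ot", "to", "pho", "hot", "oto", "phot", "hoto", "photo".
            while (stream.IncrementToken())
            {
                Console.WriteLine(new string(term.TermBuffer(), 0, term.TermLength()));
            }
        }
    }
}

Because each n-gram is stored as its own term, the test's query for the fragment "phot" matches both documents, and the [NotForQuerying] attribute keeps RavenDB from running the query text through the same n-gram analysis at search time.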