Skip to content

Instantly share code, notes, and snippets.

@jalchr
Forked from mattjohnsonpint/NGramTest.cs
Last active December 15, 2015 21:29
Show Gist options
  • Save jalchr/5326545 to your computer and use it in GitHub Desktop.
Save jalchr/5326545 to your computer and use it in GitHub Desktop.
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Tokenattributes;
using Raven.Abstractions.Indexing;
using Raven.Client;
using Raven.Client.Indexes;
using Raven.Tests.Helpers;
using Xunit;
namespace RavenTests
{
/// <summary>
/// Integration test that indexes a single <see cref="User"/> through
/// <see cref="UsersIndex"/> (whose <c>Name</c> field is analyzed with
/// <c>NGramAnalyzer</c>) and checks that partial-word searches find it.
/// </summary>
public class NGramTest : RavenTestBase
{
    /// <summary>Minimal document type used by the test.</summary>
    public class User
    {
        public string Name { get; set; }
    }

    /// <summary>
    /// Index over <see cref="User"/> documents that maps only <c>Name</c>,
    /// marks it as analyzed, and assigns <c>NGramAnalyzer</c> to it.
    /// </summary>
    public class UsersIndex : AbstractIndexCreationTask<User>
    {
        public UsersIndex()
        {
            Map = users => from user in users
                           select new
                           {
                               user.Name
                           };

            Index(x => x.Name, FieldIndexing.Analyzed);

            // The analyzer is registered by its assembly-qualified type name.
            Analyze(x => x.Name, typeof(NGramAnalyzer).AssemblyQualifiedName);
        }
    }

    /// <summary>
    /// Stores one user named "Matt Johnson", waits for indexing, then searches
    /// for several fragments of the name. Every fragment must return exactly
    /// one result; on failure the assertion message lists the offending fragments.
    /// </summary>
    [Fact]
    public void Test()
    {
        using (var documentStore = NewDocumentStore())
        {
            documentStore.ExecuteIndex(new UsersIndex());

            using (var session = documentStore.OpenSession())
            {
                session.Store(new User { Name = "Matt Johnson" });
                session.SaveChanges();
            }

            WaitForIndexing(documentStore);

            using (var session = documentStore.OpenSession())
            {
                var searchValues = new[] { "ma", "mat", "att", "jo", "joh", "son" };

                // Collect the failing terms so the assertion can report them;
                // Debug.WriteLine alone is not emitted in release builds.
                var failedValues = new System.Collections.Generic.List<string>();

                foreach (var value in searchValues)
                {
                    var results = session.Query<User, UsersIndex>().Search(x => x.Name, value).ToList();
                    var pass = results.Count == 1;
                    Debug.WriteLine("\"{0}\" : {1}", value, pass ? "Pass" : "Fail");

                    if (!pass)
                    {
                        failedValues.Add(value);
                    }
                }

                Assert.True(
                    failedValues.Count == 0,
                    "Expected exactly one result for each search term, but these failed: "
                        + string.Join(", ", failedValues.ToArray()));
            }
        }
    }
}
/// <summary>
/// Analyzer that tokenizes text with <see cref="StandardTokenizer"/>, then applies
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, a <see cref="StopFilter"/>
/// using <see cref="StandardAnalyzer.STOP_WORDS_SET"/>, and finally an
/// <see cref="NGramTokenFilter"/> producing n-grams of 2 to 6 characters.
/// </summary>
/// <remarks>
/// NOTE(review): <c>[NotForQuerying]</c> presumably tells RavenDB not to run this
/// analyzer over query terms - confirm against the RavenDB documentation.
/// </remarks>
[NotForQuerying]
public class NGramAnalyzer : Analyzer
{
    private const int MaxTokenLength = 255;
    private const int MinGramSize = 2;
    private const int MaxGramSize = 6;

    /// <summary>Builds the token stream for a field's text.</summary>
    /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
    /// <param name="reader">Reader supplying the text to tokenize.</param>
    /// <returns>The n-gram token stream sitting on top of the filter chain.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var source = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);
        source.MaxTokenLength = MaxTokenLength;

        var standardized = new StandardFilter(source);
        var lowerCased = new LowerCaseFilter(standardized);

        // First argument is presumably "enablePositionIncrements" - TODO confirm
        // against the Lucene.Net StopFilter constructor.
        var withoutStopWords = new StopFilter(false, lowerCased, StandardAnalyzer.STOP_WORDS_SET);

        return new NGramTokenFilter(withoutStopWords, MinGramSize, MaxGramSize);
    }
}
/// <summary>
/// Token filter that replaces each term from the wrapped stream with its character
/// n-grams. For one term, grams are emitted by increasing size (from the minimum to
/// the maximum), and within one size by increasing start position.
/// </summary>
public sealed class NGramTokenFilter : TokenFilter
{
    /// <summary>Minimum gram size used by the single-argument constructor.</summary>
    public static int DefaultMinNgramSize = 1;

    /// <summary>Maximum gram size used by the single-argument constructor.</summary>
    public static int DefaultMaxNgramSize = 2;

    private readonly int _minGram;
    private readonly int _maxGram;
    private readonly ITermAttribute _termAtt;
    private readonly IOffsetAttribute _offsetAtt;

    // State for the term currently being split; a null buffer means
    // "fetch the next term from the wrapped stream".
    private char[] _curTermBuffer;
    private int _curTermLength;
    private int _curGramSize;
    private int _curPos;
    private int _tokStart;

    /// <summary>
    /// Creates an NGramTokenFilter with the given minimum and maximum gram sizes.
    /// </summary>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="minGram"/> is less than 1, or greater than <paramref name="maxGram"/>.
    /// </exception>
    public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
        : base(input)
    {
        if (minGram < 1)
        {
            throw new ArgumentException("minGram must be greater than zero");
        }

        if (minGram > maxGram)
        {
            throw new ArgumentException("minGram must not be greater than maxGram");
        }

        _minGram = minGram;
        _maxGram = maxGram;
        _termAtt = AddAttribute<ITermAttribute>();
        _offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Creates an NGramTokenFilter using <see cref="DefaultMinNgramSize"/> and
    /// <see cref="DefaultMaxNgramSize"/>.
    /// </summary>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    public NGramTokenFilter(TokenStream input)
        : this(input, DefaultMinNgramSize, DefaultMaxNgramSize)
    {
    }

    /// <summary>
    /// Advances to the next n-gram, writing it into the term and offset attributes.
    /// </summary>
    /// <returns>
    /// <c>true</c> when a gram was produced; <c>false</c> once the wrapped stream
    /// has no more terms.
    /// </returns>
    public override bool IncrementToken()
    {
        for (;;)
        {
            if (_curTermBuffer == null && !CaptureNextTerm())
            {
                return false;
            }

            // Moving on to the next gram size restarts at the beginning of the term.
            for (; _curGramSize <= _maxGram; _curGramSize++, _curPos = 0)
            {
                if (_curPos + _curGramSize > _curTermLength)
                {
                    // No gram of this size fits at the current position.
                    continue;
                }

                ClearAttributes();
                _termAtt.SetTermBuffer(_curTermBuffer, _curPos, _curGramSize);

                var gramStart = _tokStart + _curPos;
                _offsetAtt.SetOffset(gramStart, gramStart + _curGramSize);

                _curPos++;
                return true;
            }

            // Every gram size has been tried for this term; pull the next one.
            _curTermBuffer = null;
        }
    }

    /// <summary>
    /// Resets the base filter and drops the term currently being split.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        _curTermBuffer = null;
    }

    /// <summary>
    /// Pulls the next term from the wrapped stream and snapshots its text, length and
    /// start offset, restarting gram generation at the minimum size and position 0.
    /// </summary>
    /// <returns><c>false</c> when the wrapped stream is exhausted; otherwise <c>true</c>.</returns>
    private bool CaptureNextTerm()
    {
        if (!input.IncrementToken())
        {
            return false;
        }

        // Work on a copy: the term attribute is overwritten for every gram emitted.
        _curTermBuffer = (char[])_termAtt.TermBuffer().Clone();
        _curTermLength = _termAtt.TermLength();
        _curGramSize = _minGram;
        _curPos = 0;
        _tokStart = _offsetAtt.StartOffset;
        return true;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment