Created
November 19, 2012 10:48
-
-
Save jmcd/4110064 to your computer and use it in GitHub Desktop.
Searching for text stored with accents in Lucene.NET
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.IO; | |
using Lucene.Net.Analysis; | |
using Lucene.Net.Analysis.Standard; | |
using Lucene.Net.Documents; | |
using Lucene.Net.Index; | |
using Lucene.Net.QueryParsers; | |
using Lucene.Net.Search; | |
using Lucene.Net.Store; | |
using NUnit.Framework; | |
using Version = Lucene.Net.Util.Version; | |
/// <summary>
/// Analyzer decorator: delegates tokenization to a wrapped analyzer and then passes
/// the resulting token stream through an <see cref="ASCIIFoldingFilter"/>, so that
/// accented text such as "jóhn" is indexed and queried in its ASCII-folded form.
/// </summary>
public class ASCIIFoldingAnalyzer : Analyzer
{
    private readonly Analyzer subAnalyzer;

    /// <summary>
    /// Creates an analyzer that applies ASCII folding on top of <paramref name="subAnalyzer"/>.
    /// </summary>
    /// <param name="subAnalyzer">The analyzer that produces the token stream to be folded.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="subAnalyzer"/> is null.
    /// </exception>
    public ASCIIFoldingAnalyzer(Analyzer subAnalyzer)
    {
        // Fail fast here; otherwise a null argument only surfaces later as a
        // NullReferenceException inside TokenStream, far from the real cause.
        if (subAnalyzer == null)
        {
            throw new ArgumentNullException("subAnalyzer");
        }

        this.subAnalyzer = subAnalyzer;
    }

    /// <summary>
    /// Builds the wrapped analyzer's token stream for the field and wraps it in an
    /// <see cref="ASCIIFoldingFilter"/>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; forwarded to the wrapped analyzer.</param>
    /// <param name="reader">Source of the text to analyze; forwarded to the wrapped analyzer.</param>
    /// <returns>The wrapped analyzer's token stream decorated with ASCII folding.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var innerStream = subAnalyzer.TokenStream(fieldName, reader);
        return new ASCIIFoldingFilter(innerStream);
    }
}
/// <summary>
/// Indexes a single document whose "afield" value is "jóhn" using
/// <see cref="ASCIIFoldingAnalyzer"/>, then verifies that both the accented query
/// ("jóhn") and the unaccented query ("john") find that document and return the
/// stored value with its accent intact.
/// </summary>
[TestFixture]
public class can_search_for_accent_stored_text_with_accented_and_non_accented_query
{
    // Stored "afield" value found by the accented query; null unless exactly one hit.
    private string accentQueryFieldValue;

    // Stored "afield" value found by the unaccented query; null unless exactly one hit.
    private string nonAccentQueryFieldValue;

    /// <summary>
    /// Builds a throw-away on-disk index, runs both queries once, and records the
    /// results for the individual tests. The index directory is released and the
    /// temporary folder removed before this method returns.
    /// </summary>
    [TestFixtureSetUp]
    public void TestFixtureSetUp()
    {
        var path = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
        try
        {
            // Dispose the directory so its file handles are released before cleanup.
            using (var directory = new SimpleFSDirectory(new DirectoryInfo(path)))
            {
                var analyzer = new ASCIIFoldingAnalyzer(new StandardAnalyzer(Version.LUCENE_29));

                var doc = new Document();
                doc.Add(new Field("afield", "jóhn", Field.Store.YES, Field.Index.ANALYZED));

                using (var indexWriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
                {
                    indexWriter.AddDocument(doc);
                }

                using (var indexReader = IndexReader.Open(directory, true))
                using (var indexSearcher = new IndexSearcher(indexReader))
                {
                    // The same folding analyzer is used at query time so that the
                    // accented query text is folded the same way as the indexed text.
                    var queryParser = new QueryParser(Version.LUCENE_29, "afield", analyzer);
                    accentQueryFieldValue = GetFieldValue("jóhn", queryParser, indexSearcher, default(Filter), indexReader);
                    nonAccentQueryFieldValue = GetFieldValue("john", queryParser, indexSearcher, default(Filter), indexReader);
                }
            }
        }
        finally
        {
            DeleteTemporaryIndex(path);
        }
    }

    /// <summary>
    /// Best-effort removal of the temporary index folder created by the fixture setup.
    /// </summary>
    /// <param name="path">Full path of the temporary index folder.</param>
    private static void DeleteTemporaryIndex(string path)
    {
        try
        {
            // Fully qualified: Lucene.Net.Store.Directory is also in scope in this file.
            if (System.IO.Directory.Exists(path))
            {
                System.IO.Directory.Delete(path, true);
            }
        }
        catch (IOException)
        {
            // Deliberately ignored: failing to clean a temp folder must not fail the
            // fixture or mask an exception thrown while building/searching the index.
        }
        catch (UnauthorizedAccessException)
        {
            // Deliberately ignored for the same reason as above.
        }
    }

    /// <summary>
    /// Parses and runs <paramref name="queryText"/> and returns the stored "afield"
    /// value of the matching document.
    /// </summary>
    /// <param name="queryText">Raw query text handed to the query parser.</param>
    /// <param name="queryParser">Parser used to turn the text into a query.</param>
    /// <param name="indexSearcher">Searcher the query is executed against.</param>
    /// <param name="filter">Filter passed through to the search (null in this fixture).</param>
    /// <param name="indexReader">Reader used to load the matching document.</param>
    /// <returns>
    /// The stored "afield" value when the search yields exactly one hit; otherwise null.
    /// </returns>
    private static string GetFieldValue(string queryText, QueryParser queryParser, IndexSearcher indexSearcher, Filter filter, IndexReader indexReader)
    {
        var query = queryParser.Parse(queryText);
        var topDocs = indexSearcher.Search(query, filter, 1000);

        // Anything other than a single hit is treated as "no answer" so the tests fail.
        if (topDocs.TotalHits != 1)
        {
            return null;
        }

        var scoreDoc = topDocs.ScoreDocs[0];
        var document = indexReader.Document(scoreDoc.doc);
        return document.Get("afield");
    }

    /// <summary>The accented query finds the document and returns the accented stored value.</summary>
    [Test]
    public void accent_query_field_value_has_accent()
    {
        Assert.AreEqual("jóhn", accentQueryFieldValue);
    }

    /// <summary>The unaccented query finds the same document and returns the accented stored value.</summary>
    [Test]
    public void non_accent_query_field_value_has_accent()
    {
        Assert.AreEqual("jóhn", nonAccentQueryFieldValue);
    }
}
@jesuslpm any luck? I tried to add this class, but it seems outdated: some of the methods it overrides no longer exist in the parent class.
Sorry, this is very old code - wouldn't know how to update it for modern libraries.
Yes, sorry. I updated it with a custom analyzer and it's working fine. Thanks for the answers!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How would ASCIIFoldingAnalyzer be written for Lucene.Net 4.8?