Skip to content

Instantly share code, notes, and snippets.

@jmcd
Created November 19, 2012 10:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmcd/4110064 to your computer and use it in GitHub Desktop.
Save jmcd/4110064 to your computer and use it in GitHub Desktop.
Searching for text stored with accents in Lucene.NET
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using NUnit.Framework;
using Version = Lucene.Net.Util.Version;
/// <summary>
/// Analyzer decorator that runs the token stream produced by another analyzer
/// through an <see cref="ASCIIFoldingFilter"/>, so that accented text and its
/// unaccented form produce matching terms (e.g. "jóhn" and "john").
/// Use the same instance for both indexing and query parsing.
/// </summary>
public class ASCIIFoldingAnalyzer : Analyzer
{
    private readonly Analyzer subAnalyzer;

    /// <summary>
    /// Creates an analyzer that folds the output of <paramref name="subAnalyzer"/>.
    /// </summary>
    /// <param name="subAnalyzer">The analyzer that performs the actual tokenization.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="subAnalyzer"/> is <c>null</c>.
    /// </exception>
    public ASCIIFoldingAnalyzer(Analyzer subAnalyzer)
    {
        // Fail fast here; otherwise a null only surfaces later as a
        // NullReferenceException in TokenStream, far from the faulty call site.
        if (subAnalyzer == null)
        {
            throw new ArgumentNullException("subAnalyzer");
        }

        this.subAnalyzer = subAnalyzer;
    }

    /// <summary>
    /// Builds the wrapped analyzer's token stream for the field and wraps it
    /// in an <see cref="ASCIIFoldingFilter"/>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; passed through to the wrapped analyzer.</param>
    /// <param name="reader">Source of the text to analyze; passed through to the wrapped analyzer.</param>
    /// <returns>The wrapped analyzer's token stream with ASCII folding applied on top.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var innerStream = subAnalyzer.TokenStream(fieldName, reader);
        return new ASCIIFoldingFilter(innerStream);
    }
}
/// <summary>
/// Indexes a single document whose "afield" value is "jóhn" using
/// <see cref="ASCIIFoldingAnalyzer"/>, then verifies that both the accented
/// query ("jóhn") and the unaccented query ("john") find it and that the
/// stored value returned still carries the accent.
/// </summary>
[TestFixture]
public class can_search_for_accent_stored_text_with_accented_and_non_accented_query
{
    // Stored field values retrieved by each query; null when the query did not
    // produce exactly one hit.
    private string accentQueryFieldValue;
    private string nonAccentQueryFieldValue;

    /// <summary>
    /// Builds a throw-away on-disk index, runs both queries once and captures
    /// their results, then removes the index directory again.
    /// </summary>
    [TestFixtureSetUp]
    public void TestFixtureSetUp()
    {
        // Unique directory per run so concurrent or repeated runs never share an index.
        var path = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
        try
        {
            var directory = new SimpleFSDirectory(new DirectoryInfo(path));

            // The same folding analyzer must be used for indexing and for query
            // parsing so both sides produce the same terms.
            var analyzer = new ASCIIFoldingAnalyzer(new StandardAnalyzer(Version.LUCENE_29));

            var doc = new Document();
            doc.Add(new Field("afield", "jóhn", Field.Store.YES, Field.Index.ANALYZED));

            using (var indexWriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                indexWriter.AddDocument(doc);
            }

            using (var indexReader = IndexReader.Open(directory, true))
            {
                using (var indexSearcher = new IndexSearcher(indexReader))
                {
                    var queryParser = new QueryParser(Version.LUCENE_29, "afield", analyzer);
                    accentQueryFieldValue = GetFieldValue("jóhn", queryParser, indexSearcher, default(Filter), indexReader);
                    nonAccentQueryFieldValue = GetFieldValue("john", queryParser, indexSearcher, default(Filter), indexReader);
                }
            }
        }
        finally
        {
            // Both results are already captured in fields, so the on-disk index
            // is no longer needed once setup finishes (or fails).
            DeleteIndexDirectory(path);
        }
    }

    /// <summary>
    /// Best-effort removal of the temporary index directory so test runs do not
    /// accumulate GUID-named folders under the temp path.
    /// </summary>
    /// <param name="path">Full path of the temporary index directory.</param>
    private static void DeleteIndexDirectory(string path)
    {
        try
        {
            // Fully qualified: a bare "Directory" is ambiguous between System.IO
            // and Lucene.Net.Store, both of which this file imports.
            if (System.IO.Directory.Exists(path))
            {
                System.IO.Directory.Delete(path, true);
            }
        }
        catch (IOException)
        {
            // Cleanup is best effort: a leftover temp directory must neither fail
            // the fixture nor mask an exception thrown by the setup itself.
        }
        catch (UnauthorizedAccessException)
        {
            // Same reasoning as above.
        }
    }

    /// <summary>
    /// Parses <paramref name="queryText"/>, searches the index and returns the
    /// stored "afield" value of the matching document.
    /// </summary>
    /// <param name="queryText">Query text handed to the query parser.</param>
    /// <param name="queryParser">Parser used to turn the text into a query.</param>
    /// <param name="indexSearcher">Searcher that executes the query.</param>
    /// <param name="filter">Filter passed to the search; may be <c>null</c>.</param>
    /// <param name="indexReader">Reader used to load the matching document.</param>
    /// <returns>
    /// The stored "afield" value when the search reports exactly one hit;
    /// otherwise <c>null</c>.
    /// </returns>
    private static string GetFieldValue(string queryText, QueryParser queryParser, IndexSearcher indexSearcher, Filter filter, IndexReader indexReader)
    {
        var query = queryParser.Parse(queryText);
        var topDocs = indexSearcher.Search(query, filter, 1000);

        // Anything other than a single unambiguous hit is reported as "no value".
        if (topDocs.TotalHits != 1)
        {
            return null;
        }

        var scoreDoc = topDocs.ScoreDocs[0];
        var document = indexReader.Document(scoreDoc.doc);
        return document.Get("afield");
    }

    /// <summary>The accented query finds the document and returns the accented stored value.</summary>
    [Test]
    public void accent_query_field_value_has_accent()
    {
        Assert.AreEqual("jóhn", accentQueryFieldValue);
    }

    /// <summary>The unaccented query finds the same document and returns the accented stored value.</summary>
    [Test]
    public void non_accent_query_field_value_has_accent()
    {
        Assert.AreEqual("jóhn", nonAccentQueryFieldValue);
    }
}
@jesuslpm
Copy link

What would ASCIIFoldingAnalyzer look like in Lucene.Net 4.8?

@diegolaz
Copy link

diegolaz commented Mar 2, 2022

@jesuslpm any luck? I tried to add this class, but it seems outdated: it relies on some methods that no longer exist in the parent class.

@jmcd
Copy link
Author

jmcd commented Mar 5, 2022

Sorry, this is very old code - wouldn't know how to update it for modern libraries.

@diegolaz79
Copy link

Yes, sorry — I updated it with a custom analyzer and it's working fine. Thanks for the answers!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment