Skip to content

Instantly share code, notes, and snippets.

@jesuslpm
Created March 5, 2022 19:03
Show Gist options
  • Save jesuslpm/1d61e903b5efc379cbb09461e55d5e65 to your computer and use it in GitHub Desktop.
Save jesuslpm/1d61e903b5efc379cbb09461e55d5e65 to your computer and use it in GitHub Desktop.
An accent- and case-insensitive analyzer for Lucene.NET 4.8
using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Core;
namespace Map.Lucene
{
/// <summary>
/// Analyzer whose token chain is a <see cref="StandardTokenizer"/> followed by
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/> and
/// <see cref="AccentFoldingFilter"/>, applied in that order.
/// </summary>
public class AccentFoldingAnalyzer : Analyzer
{
    // Version handed to every version-aware component built in CreateComponents.
    private readonly LuceneVersion matchVersion;

    /// <summary>
    /// Creates an analyzer that uses <see cref="LuceneVersion.LUCENE_48"/>.
    /// </summary>
    public AccentFoldingAnalyzer()
        : this(LuceneVersion.LUCENE_48)
    {
    }

    /// <summary>
    /// Creates an analyzer that passes the given version to its tokenizer and filters.
    /// </summary>
    /// <param name="luceneVersion">Version passed to the tokenizer and the version-aware filters.</param>
    public AccentFoldingAnalyzer(LuceneVersion luceneVersion)
    {
        matchVersion = luceneVersion;
    }

    /// <summary>
    /// Builds the tokenizer and the filter chain that reads from <paramref name="reader"/>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; not used by this implementation.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>The tokenizer paired with the outermost filter of the chain.</returns>
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        var tokenizer = new StandardTokenizer(matchVersion, reader);

        // Innermost filter runs first: standard -> lower-case -> accent folding.
        TokenStream chain = new AccentFoldingFilter(
            new LowerCaseFilter(
                matchVersion,
                new StandardFilter(matchVersion, tokenizer)));

        return new TokenStreamComponents(tokenizer, chain);
    }
}
}
using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Lucene.Net.Search;
using Lucene.Net.Analysis;
namespace Map.Lucene
{
/// <summary>
/// Token filter that rewrites each token's term text with its diacritical marks
/// removed, using the <c>RemoveDiacritics</c> string extension.
/// </summary>
public sealed class AccentFoldingFilter : TokenFilter
{
    // Term text of the current token; read and rewritten in IncrementToken.
    private readonly ICharTermAttribute termAttribute;

    /// <summary>
    /// Wraps <paramref name="input"/> so that its tokens are accent-folded.
    /// </summary>
    /// <param name="input">The upstream token stream to read tokens from.</param>
    public AccentFoldingFilter(TokenStream input) : base(input)
    {
        // AddAttribute (rather than GetAttribute) is the accessor Lucene recommends:
        // it yields the already-registered attribute when there is one and registers
        // it otherwise, so construction does not depend on what the upstream stream
        // happened to declare.
        termAttribute = AddAttribute<ICharTermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and strips diacritics from its term.
    /// </summary>
    /// <returns>
    /// <c>true</c> if a token was produced; <c>false</c> once the wrapped stream is exhausted.
    /// </returns>
    public override bool IncrementToken()
    {
        if (!m_input.IncrementToken())
        {
            return false;
        }

        // Replace the whole term with its folded form.
        string folded = termAttribute.ToString().RemoveDiacritics();
        termAttribute.SetEmpty().Append(folded);
        return true;
    }
}
}
/// <summary>
/// Returns a copy of <paramref name="text"/> with diacritical marks removed.
/// </summary>
/// <remarks>
/// The text is decomposed to normalization form D, every character whose Unicode
/// category is <see cref="UnicodeCategory.NonSpacingMark"/> is dropped, and the
/// remainder is recomposed to normalization form C.
/// </remarks>
/// <param name="text">The string to process; must not be <c>null</c>.</param>
/// <returns>The form C string that remains after the non-spacing marks are removed.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
public static string RemoveDiacritics(this string text)
{
    // Extension methods can be invoked on a null receiver; fail with a clear
    // argument error instead of a NullReferenceException from Normalize.
    if (text == null)
    {
        throw new System.ArgumentNullException(nameof(text));
    }

    var normalizedString = text.Normalize(NormalizationForm.FormD);

    // The result can never be longer than the decomposed input, so size the
    // builder once up front.
    var stringBuilder = new StringBuilder(normalizedString.Length);
    foreach (var c in normalizedString)
    {
        var unicodeCategory = CharUnicodeInfo.GetUnicodeCategory(c);
        if (unicodeCategory != UnicodeCategory.NonSpacingMark)
        {
            stringBuilder.Append(c);
        }
    }

    return stringBuilder.ToString().Normalize(NormalizationForm.FormC);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment