Skip to content

Instantly share code, notes, and snippets.

@georgiosd
Created October 13, 2013 11:22
Show Gist options
  • Save georgiosd/6961114 to your computer and use it in GitHub Desktop.
Save georgiosd/6961114 to your computer and use it in GitHub Desktop.
Porter stemming analyzer for RavenDB
/// <summary>
/// Analyzer that tokenizes text with a <see cref="StandardTokenizer" /> and then applies, in order,
/// a <see cref="StandardFilter" />, a <see cref="LowerCaseFilter" />, an <see cref="ASCIIFoldingFilter" />,
/// a <see cref="StopFilter" /> and finally a <see cref="PorterStemFilter" />.
/// </summary>
public class PorterAnalyzer : Analyzer
{
    /// <summary>An unmodifiable set containing some common English words that are usually not
    /// useful for searching. This is the same instance as <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c>.
    /// </summary>
    public static readonly ISet<string> StopWordsSet = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    /// <summary>Default maximum allowed token length.</summary>
    public const int DefaultMaxTokenLength = 255;

    private readonly ISet<string> stopSet;
    private readonly Version matchVersion;

    /// <summary>Maximum allowed token length. If a token is seen
    /// that exceeds this length then it is discarded. This
    /// setting only takes effect the next time <see cref="TokenStream" /> or
    /// <see cref="ReusableTokenStream" /> is called, because that is where the value
    /// is copied onto the tokenizer.
    /// </summary>
    public int MaxTokenLength { get; set; }

    /// <summary>Builds an analyzer with the default stop words (<see cref="StopWordsSet" />).</summary>
    /// <param name="matchVersion">Lucene version to match; see <see cref="Version" />.</param>
    public PorterAnalyzer(Version matchVersion)
        : this(matchVersion, StopWordsSet)
    {
    }

    /// <summary>Builds an analyzer with the given stop words.</summary>
    /// <param name="matchVersion">Lucene version to match; see <see cref="Version" />.</param>
    /// <param name="stopSet">Stop words to remove from the token stream. Must not be null.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="stopSet" /> is null.</exception>
    public PorterAnalyzer(Version matchVersion, ISet<string> stopSet)
    {
        // Fail fast here instead of handing a null set to the StopFilter
        // later, when the first token stream is built.
        if (stopSet == null)
        {
            throw new System.ArgumentNullException("stopSet");
        }

        this.stopSet = stopSet;
        this.matchVersion = matchVersion;
        MaxTokenLength = DefaultMaxTokenLength;
    }

    /// <summary>Builds an analyzer with the stop words from the given file.</summary>
    /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)" />
    /// <param name="matchVersion">Lucene version to match; see <see cref="Version" />.</param>
    /// <param name="stopwords">File to read stop words from.</param>
    public PorterAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
        : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
    {
    }

    /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
    /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)" />
    /// <param name="matchVersion">Lucene version to match; see <see cref="Version" />.</param>
    /// <param name="stopwords">Reader to read stop words from.</param>
    public PorterAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
        : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
    {
    }

    /// <summary>
    /// Creates a fresh tokenizer over <paramref name="reader" /> and wraps it in the full filter chain.
    /// Both ends of the chain are returned so the tokenizer can later be re-pointed at a new reader
    /// while the filters built on top of it are kept.
    /// </summary>
    /// <param name="reader">Source text to tokenize.</param>
    /// <returns>The tokenizer together with the outermost filter that wraps it.</returns>
    private SavedStreams CreateTokenStream(System.IO.TextReader reader)
    {
        StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
        tokenizer.MaxTokenLength = MaxTokenLength;

        // Chain order (innermost first): standard -> lower-case -> ASCII folding -> stop words -> Porter stem.
        TokenStream filtered = new StandardFilter(tokenizer);
        filtered = new LowerCaseFilter(filtered);
        filtered = new ASCIIFoldingFilter(filtered);
        // NOTE(review): the literal `true` is presumably StopFilter's enablePositionIncrements flag -
        // confirm against the StopFilter constructor in the referenced Lucene.Net version.
        filtered = new StopFilter(true, filtered, stopSet);
        filtered = new PorterStemFilter(filtered);

        SavedStreams streams = new SavedStreams();
        streams.TokenStream = tokenizer;
        streams.FilteredTokenStream = filtered;
        return streams;
    }

    /// <summary>Constructs a new <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" />,
    /// a <see cref="LowerCaseFilter" />, an <see cref="ASCIIFoldingFilter" />, a <see cref="StopFilter" />
    /// and a <see cref="PorterStemFilter" />.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Source text to tokenize.</param>
    /// <returns>The outermost filter of the newly built chain.</returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        return CreateTokenStream(reader).FilteredTokenStream;
    }

    /// <summary>
    /// Returns the filter chain stored in <c>PreviousTokenStream</c>, re-pointed at
    /// <paramref name="reader" />; if nothing is stored yet, a new chain is built and stored first.
    /// The current <see cref="MaxTokenLength" /> is applied to the tokenizer on every call.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Source text to tokenize.</param>
    /// <returns>The outermost filter of the stored chain.</returns>
    public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        SavedStreams streams = (SavedStreams)PreviousTokenStream;
        if (streams == null)
        {
            streams = CreateTokenStream(reader);
            PreviousTokenStream = streams;
        }
        else
        {
            // Only the tokenizer is reset; the filters stacked on it are reused as they are.
            streams.TokenStream.Reset(reader);
        }

        // Applied on both paths so a changed MaxTokenLength reaches a reused tokenizer.
        streams.TokenStream.MaxTokenLength = MaxTokenLength;
        return streams.FilteredTokenStream;
    }

    /// <summary>Holds the tokenizer and the outermost filter of one analysis chain so both can be reused.</summary>
    private sealed class SavedStreams
    {
        /// <summary>The source tokenizer at the bottom of the chain.</summary>
        internal StandardTokenizer TokenStream;

        /// <summary>The outermost filter, i.e. the stream handed back to callers.</summary>
        internal TokenStream FilteredTokenStream;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment