Skip to content

Instantly share code, notes, and snippets.

@spinscale
Created December 21, 2022 22:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save spinscale/c38e2158267f1e0a272cf5b0b75510ae to your computer and use it in GitHub Desktop.
Save spinscale/c38e2158267f1e0a272cf5b0b75510ae to your computer and use it in GitHub Desktop.
Lucene Suggestions using phonetic algorithms
package de.spinscale.prt;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.phonetic.PhoneticFilterFactory;
import org.apache.lucene.search.suggest.FileDictionary;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.junit.jupiter.api.Test;
import java.io.StringReader;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Demonstrates an {@link AnalyzingSuggester} whose analysis chain applies a phonetic token filter
 * (Cologne phonetic encoder), so that differently spelled inputs can resolve to the same suggestions.
 */
public class PhoneticFstTests {

    /** Maximum number of suggestions requested per lookup. */
    private static final int MAX_RESULTS = 10;

    /**
     * Builds a suggester from a small in-memory dictionary and verifies which entries are returned
     * for exact, misspelled and prefix inputs.
     *
     * @throws Exception if building the analyzer, opening the directory or building the suggester fails
     */
    @Test
    public void testPhoneticSuggester() throws Exception {
        // Kept as a mutable map on purpose.
        // NOTE(review): Lucene analysis factories appear to consume entries from the map passed in,
        // so an immutable Map.of(...) may not work here - confirm before changing.
        Map<String, String> args = new HashMap<>();
        args.put("encoder", "ColognePhonetic");

        // Both the analyzer and the directory are Closeable; try-with-resources releases them even
        // when an assertion fails. The JVM temp dir replaces a hard-coded "/tmp/" so the test also
        // runs on platforms without that path.
        try (CustomAnalyzer analyzer = CustomAnalyzer.builder()
                .withTokenizer("standard")
                .addTokenFilter(PhoneticFilterFactory.class, args)
                .build();
             Directory directory = new NIOFSDirectory(Paths.get(System.getProperty("java.io.tmpdir")))) {

            AnalyzingSuggester suggester = new AnalyzingSuggester(directory, "lucene-tmp", analyzer);

            // One entry per line: term<TAB>weight<TAB>payload. Payloads are optional.
            String input = """
                    spülmaschine\t10\tspülmaschine
                    spüle\t1\tspüle
                    speer\t5\tspeer
                    gabel\t100\tgabel
                    """;
            FileDictionary dictionary = new FileDictionary(new StringReader(input));
            suggester.build(dictionary);

            assertLookup("sp", suggester, "spülmaschine/10", "speer/5", "spüle/1");
            assertLookup("spul", suggester, "spülmaschine/10", "spüle/1");
            assertLookup("spöl", suggester, "spülmaschine/10", "spüle/1");
            assertLookup("spln", suggester, "spülmaschine/10");

            // This is why you shouldn't use this as your default suggester: "spe" and "spee" also
            // return the "spül..." entries (presumably because the phonetic encoding drops the vowels).
            assertLookup("spe", suggester, "spülmaschine/10", "speer/5", "spüle/1");
            assertLookup("spee", suggester, "spülmaschine/10", "speer/5", "spüle/1");

            // Back to normal: a full word narrows the result down to a single entry.
            assertLookup("speer", suggester, "speer/5");
            assertLookup("gabel", suggester, "gabel/100");
            assertLookup("gabbel", suggester, "gabel/100");
            assertLookup("gobel", suggester, "gabel/100");
            assertLookup("ggobel", suggester, "gabel/100");
        }
    }

    /**
     * Looks up {@code input} and asserts that the suggestions, rendered via
     * {@link Lookup.LookupResult#toString()}, equal the expected values in exactly the given order.
     *
     * @param input the text to get suggestions for
     * @param suggester the already built suggester to query
     * @param expectedOutcomesWithScore expected results in order, each formatted as {@code key/weight}
     */
    private static void assertLookup(String input, AnalyzingSuggester suggester, String... expectedOutcomesWithScore) {
        // No context filter (null), onlyMorePopular disabled, at most MAX_RESULTS hits.
        List<Lookup.LookupResult> results = suggester.lookup(input, null, false, MAX_RESULTS);
        assertThat(results).map(Lookup.LookupResult::toString).containsExactly(expectedOutcomesWithScore);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment