Skip to content

Instantly share code, notes, and snippets.

@jettro
Last active November 3, 2019 21:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jettro/34ae01faac6a940e004b7e21aa9dc5c4 to your computer and use it in GitHub Desktop.
Save jettro/34ae01faac6a940e004b7e21aa9dc5c4 to your computer and use it in GitHub Desktop.
Class to play around with Lucene's HyphenationCompoundWordTokenFilter without booting an Elasticsearch cluster. Use it to see what happens to the hyphens and the found tokens with different setups and strings.
# Play with word composition
# Console script: creates a test index with two decompounding analyzers, indexes a few
# Dutch sample sentences and runs analyze/search requests against them.
# Drop the index first so the script can be re-run from scratch.
DELETE compound_word_example
# Create the index. Two custom analyzers are defined, both on the standard tokenizer:
#   hyph_analyzer -> hyphenation_decompounder filter (word list + hyphenation patterns file)
#   dict_analyzer -> dictionary_decompounder filter (word list only)
# The mapping applies them to the fields title_hyph and title_dict respectively.
PUT /compound_word_example
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"hyph_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"hyphenation_decompounder"
]
},
"dict_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"dictionary_decompounder"
]
}
},
"filter": {
"dictionary_decompounder": {
"type": "dictionary_decompounder",
"word_list": [
"dienst",
"vakantie",
"nacht"
]
},
"hyphenation_decompounder": {
"type": "hyphenation_decompounder",
"word_list": [
"dienst",
"vakantie",
"nacht"
],
"hyphenation_patterns_path": "analysis/nl.xml",
"max_subword_size": 22
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"title_hyph": {
"type": "text",
"analyzer": "hyph_analyzer"
},
"title_dict": {
"type": "text",
"analyzer": "dict_analyzer"
}
}
}
}
}
# Bulk-index five sample documents.
# NOTE(review): the documents only contain a "title" field, while the mapping above declares
# title_hyph and title_dict. Neither custom analyzer is attached to "title" in this script -
# confirm whether that is intended (e.g. as a baseline) or whether the field names should match.
POST /compound_word_example/_doc/_bulk
{"index": {"_id": 1}}
{"title":"dit is een vakantie"}
{"index": {"_id": 2}}
{"title":"ik zoek een wisseldienst"}
{"index": {"_id": 3}}
{"title":"ik doe graag een nachtdienst"}
{"index": {"_id": 4}}
{"title":"ik werk graag in de nacht"}
{"index": {"_id": 5}}
{"title":"ik doe de diensten wel"}
# Show the tokens produced for "ochtenddienst" by the analyzer of field title_hyph (hyph_analyzer).
GET compound_word_example/_analyze
{
"field": "title_hyph",
"text": ["ochtenddienst"]
}
# Show the tokens produced for "nachtdienst" by the analyzer of field title_dict (dict_analyzer).
GET compound_word_example/_analyze
{
"field": "title_dict",
"text": ["nachtdienst"]
}
# Search the "title" field for "dienst" (see the review note above about which field is queried).
GET /compound_word_example/_search
{
"query": {
"match": {
"title": "dienst"
}
}
}
# Search the "title" field for "nacht".
GET /compound_word_example/_search
{
"query": {
"match": {
"title": "nacht"
}
}
}
package com.klm.mysearch;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.junit.Ignore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.FileSystemResource;
import org.springframework.util.StringUtils;
import org.xml.sax.InputSource;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
@Ignore
public class TryHyphenation {
public static final String HYPHEN_CONFIG = "/Users/jettrocoenradie/Downloads/offo-hyphenation/hyph/nl.xml";
public static final List<String> DICTIONARY = Arrays.asList(
"zo", "zomer", "vakantie", "huis", "nacht", "personeel", "boek", "koffie", "kop", "kopje");
private static final Logger LOGGER = LoggerFactory.getLogger(TryHyphenation.class);
private HyphenationTree tree;
/**
* Load the tree using the provided path to a Objects For Formatting Objects language config file.
*
* @param hyphenRulesPath Path to the file to load in the tree
*/
public TryHyphenation(String hyphenRulesPath) {
HyphenationTree tree = new HyphenationTree();
try (InputStream hyphenRules = new FileSystemResource(hyphenRulesPath).getInputStream()) {
InputSource source = new InputSource(hyphenRules);
tree.loadPatterns(source);
} catch (IOException e) {
LOGGER.error("Problem while loading the hyphen file", e);
}
this.tree = tree;
}
/**
* Method to run this sample class
*
* @param args Array containing possible arguments, not that we actually use any of them
*/
public static void main(String[] args) {
TryHyphenation hyphenation = new TryHyphenation(HYPHEN_CONFIG);
String sourceString = "Koffiekopje";
System.out.println("*** Find Hyphens:");
List<String> hyphens = hyphenation.hyphenate(sourceString);
String joinedHyphens = StringUtils.arrayToDelimitedString(
hyphens.toArray(), " - ");
System.out.println(joinedHyphens);
System.out.println("\n*** Find Tokens:");
List<String> tokens = hyphenation.findTokens(sourceString);
String joinedTokens = StringUtils.arrayToDelimitedString(tokens.toArray(), ", ");
System.out.println(joinedTokens);
System.out.println();
}
/**
* Returns a list of strings containing the found hyphens from the provided string.
*
* @param sourceString String to construct the hyphens for.
* @return List of strings containing the hyphens.
*/
public List<String> hyphenate(String sourceString) {
Hyphenation hyphenator = this.tree.hyphenate(sourceString, 1, 1);
int[] hyphenationPoints = hyphenator.getHyphenationPoints();
List<String> parts = new ArrayList<>();
for (int i = 1; i < hyphenationPoints.length; i++) {
parts.add(sourceString.substring(hyphenationPoints[i - 1], hyphenationPoints[i]));
}
return parts;
}
/**
* Uses the HyphenTree as created in the constructor to find terms in the dictionary that comply to the found
* hyphens of the provided string.
* @param sourceString String to find hyphens ans matching terms/tokens for
* @return List of found terms
*/
public List<String> findTokens(String sourceString) {
StandardTokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader(sourceString));
CharArraySet charArraySet = new CharArraySet(DICTIONARY, true);
AccessibleHyphenationCompoundWordTokenFilter filter =
new AccessibleHyphenationCompoundWordTokenFilter(tokenizer, tree, charArraySet);
try {
filter.reset();
filter.incrementToken();
filter.close();
} catch (IOException e) {
LOGGER.error("Could not tokenize", e);
}
return filter.getTokens();
}
/**
* Subclass needed to get acces to protected parameters of the parent class
*/
private class AccessibleHyphenationCompoundWordTokenFilter extends HyphenationCompoundWordTokenFilter {
public AccessibleHyphenationCompoundWordTokenFilter(TokenStream input,
HyphenationTree hyphenator,
CharArraySet dictionary) {
super(input, hyphenator, dictionary);
}
public List<String> getTokens() {
return tokens.stream().map(compoundToken -> compoundToken.txt.toString())
.collect(Collectors.toList());
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment