jettro/TryHyphenation.java

## elastic_console.json
# Play with word composition
# Recreate the index from scratch (the DELETE answers with a 404 on the very first run; that is harmless).
DELETE compound_word_example

# Two custom analyzers that decompound with the same word list:
#  - hyph_analyzer: hyphenation patterns (analysis/nl.xml, resolved against the Elasticsearch config directory)
#  - dict_analyzer: plain dictionary matching
# The title field is copied into title_hyph and title_dict, so every indexed title
# is analyzed by both decompounders and can be searched through either of them.
PUT /compound_word_example
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "hyph_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "hyphenation_decompounder"
          ]
        },
        "dict_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "dictionary_decompounder"
          ]
        }
      },
      "filter": {
        "dictionary_decompounder": {
          "type": "dictionary_decompounder",
          "word_list": [
            "dienst",
            "vakantie",
            "nacht"
          ]
        },
        "hyphenation_decompounder": {
          "type": "hyphenation_decompounder",
          "word_list": [
            "dienst",
            "vakantie",
            "nacht"
          ],
          "hyphenation_patterns_path": "analysis/nl.xml",
          "max_subword_size": 22
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "title": {
          "type": "text",
          "copy_to": [
            "title_hyph",
            "title_dict"
          ]
        },
        "title_hyph": {
          "type": "text",
          "analyzer": "hyph_analyzer"
        },
        "title_dict": {
          "type": "text",
          "analyzer": "dict_analyzer"
        }
      }
    }
  }
}

# Sample documents; only title is supplied, copy_to fills the two analyzed fields.
POST /compound_word_example/_doc/_bulk
{"index": {"_id": 1}}
{"title":"dit is een vakantie"}
{"index": {"_id": 2}}
{"title":"ik zoek een wisseldienst"}
{"index": {"_id": 3}}
{"title":"ik doe graag een nachtdienst"}
{"index": {"_id": 4}}
{"title":"ik werk graag in de nacht"}
{"index": {"_id": 5}}
{"title":"ik doe de diensten wel"}

# Inspect the tokens produced by the hyphenation based analyzer.
GET compound_word_example/_analyze
{
  "field": "title_hyph",
  "text": ["ochtenddienst"]
}

# Inspect the tokens produced by the dictionary based analyzer.
GET compound_word_example/_analyze
{
  "field": "title_dict",
  "text": ["nachtdienst"]
}

# Search the decompounded fields; the plain title field is not analyzed by either decompounder.
GET /compound_word_example/_search
{
  "query": {
    "match": {
      "title_hyph": "dienst"
    }
  }
}

GET /compound_word_example/_search
{
  "query": {
    "match": {
      "title_dict": "nacht"
    }
  }
}

## TryHyphenation.java
package com.klm.mysearch;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.junit.Ignore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.FileSystemResource;
import org.springframework.util.StringUtils;
import org.xml.sax.InputSource;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

@Ignore
public class TryHyphenation {
    public static final String HYPHEN_CONFIG = "/Users/jettrocoenradie/Downloads/offo-hyphenation/hyph/nl.xml";
    public static final List<String> DICTIONARY = Arrays.asList(
            "zo", "zomer", "vakantie", "huis", "nacht", "personeel", "boek", "koffie", "kop", "kopje");
    private static final Logger LOGGER = LoggerFactory.getLogger(TryHyphenation.class);
    private HyphenationTree tree;

    /**
     * Load the tree using the provided path to a Objects For Formatting Objects language config file.
     *
     * @param hyphenRulesPath Path to the file to load in the tree
     */
    public TryHyphenation(String hyphenRulesPath) {
        HyphenationTree tree = new HyphenationTree();
        try (InputStream hyphenRules = new FileSystemResource(hyphenRulesPath).getInputStream()) {
            InputSource source = new InputSource(hyphenRules);
            tree.loadPatterns(source);
        } catch (IOException e) {
            LOGGER.error("Problem while loading the hyphen file", e);
        }
        this.tree = tree;
    }

    /**
     * Method to run this sample class
     *
     * @param args Array containing possible arguments, not that we actually use any of them
     */
    public static void main(String[] args) {
        TryHyphenation hyphenation = new TryHyphenation(HYPHEN_CONFIG);

        String sourceString = "Koffiekopje";

        System.out.println("*** Find Hyphens:");
        List<String> hyphens = hyphenation.hyphenate(sourceString);
        String joinedHyphens = StringUtils.arrayToDelimitedString(
                hyphens.toArray(), " - ");
        System.out.println(joinedHyphens);

        System.out.println("\n*** Find Tokens:");
        List<String> tokens = hyphenation.findTokens(sourceString);
        String joinedTokens = StringUtils.arrayToDelimitedString(tokens.toArray(), ", ");
        System.out.println(joinedTokens);

        System.out.println();
    }

    /**
     * Returns a list of strings containing the found hyphens from the provided string.
     *
     * @param sourceString String to construct the hyphens for.
     * @return List of strings containing the hyphens.
     */
    public List<String> hyphenate(String sourceString) {
        Hyphenation hyphenator = this.tree.hyphenate(sourceString, 1, 1);
        int[] hyphenationPoints = hyphenator.getHyphenationPoints();
        List<String> parts = new ArrayList<>();
        for (int i = 1; i < hyphenationPoints.length; i++) {
            parts.add(sourceString.substring(hyphenationPoints[i - 1], hyphenationPoints[i]));
        }
        return parts;
    }

    /**
     * Uses the HyphenTree as created in the constructor to find terms in the dictionary that comply to the found
     * hyphens of the provided string.
     * @param sourceString String to find hyphens ans matching terms/tokens for
     * @return List of found terms
     */
    public List<String> findTokens(String sourceString) {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(sourceString));

        CharArraySet charArraySet = new CharArraySet(DICTIONARY, true);
        AccessibleHyphenationCompoundWordTokenFilter filter =
                new AccessibleHyphenationCompoundWordTokenFilter(tokenizer, tree, charArraySet);
        try {
            filter.reset();
            filter.incrementToken();
            filter.close();
        } catch (IOException e) {
            LOGGER.error("Could not tokenize", e);
        }
        return filter.getTokens();
    }

    /**
     * Subclass needed to get acces to protected parameters of the parent class
     */
    private class AccessibleHyphenationCompoundWordTokenFilter extends HyphenationCompoundWordTokenFilter {

        public AccessibleHyphenationCompoundWordTokenFilter(TokenStream input,
                                                            HyphenationTree hyphenator,
                                                            CharArraySet dictionary) {
            super(input, hyphenator, dictionary);
        }

        public List<String> getTokens() {
            return tokens.stream().map(compoundToken -> compoundToken.txt.toString())
                    .collect(Collectors.toList());
        }

    }
}
	# Play with word composition
	# Recreate the index from scratch (the DELETE answers with a 404 on the very first run; that is harmless).
	DELETE compound_word_example

	# Two custom analyzers that decompound with the same word list:
	#  - hyph_analyzer: hyphenation patterns (analysis/nl.xml, resolved against the Elasticsearch config directory)
	#  - dict_analyzer: plain dictionary matching
	# The title field is copied into title_hyph and title_dict, so every indexed title
	# is analyzed by both decompounders and can be searched through either of them.
	PUT /compound_word_example
	{
	  "settings": {
	    "number_of_shards": 1,
	    "number_of_replicas": 0,
	    "analysis": {
	      "analyzer": {
	        "hyph_analyzer": {
	          "type": "custom",
	          "tokenizer": "standard",
	          "filter": [
	            "hyphenation_decompounder"
	          ]
	        },
	        "dict_analyzer": {
	          "type": "custom",
	          "tokenizer": "standard",
	          "filter": [
	            "dictionary_decompounder"
	          ]
	        }
	      },
	      "filter": {
	        "dictionary_decompounder": {
	          "type": "dictionary_decompounder",
	          "word_list": [
	            "dienst",
	            "vakantie",
	            "nacht"
	          ]
	        },
	        "hyphenation_decompounder": {
	          "type": "hyphenation_decompounder",
	          "word_list": [
	            "dienst",
	            "vakantie",
	            "nacht"
	          ],
	          "hyphenation_patterns_path": "analysis/nl.xml",
	          "max_subword_size": 22
	        }
	      }
	    }
	  },
	  "mappings": {
	    "_doc": {
	      "properties": {
	        "title": {
	          "type": "text",
	          "copy_to": [
	            "title_hyph",
	            "title_dict"
	          ]
	        },
	        "title_hyph": {
	          "type": "text",
	          "analyzer": "hyph_analyzer"
	        },
	        "title_dict": {
	          "type": "text",
	          "analyzer": "dict_analyzer"
	        }
	      }
	    }
	  }
	}

	# Sample documents; only title is supplied, copy_to fills the two analyzed fields.
	POST /compound_word_example/_doc/_bulk
	{"index": {"_id": 1}}
	{"title":"dit is een vakantie"}
	{"index": {"_id": 2}}
	{"title":"ik zoek een wisseldienst"}
	{"index": {"_id": 3}}
	{"title":"ik doe graag een nachtdienst"}
	{"index": {"_id": 4}}
	{"title":"ik werk graag in de nacht"}
	{"index": {"_id": 5}}
	{"title":"ik doe de diensten wel"}

	# Inspect the tokens produced by the hyphenation based analyzer.
	GET compound_word_example/_analyze
	{
	  "field": "title_hyph",
	  "text": ["ochtenddienst"]
	}

	# Inspect the tokens produced by the dictionary based analyzer.
	GET compound_word_example/_analyze
	{
	  "field": "title_dict",
	  "text": ["nachtdienst"]
	}

	# Search the decompounded fields; the plain title field is not analyzed by either decompounder.
	GET /compound_word_example/_search
	{
	  "query": {
	    "match": {
	      "title_hyph": "dienst"
	    }
	  }
	}

	GET /compound_word_example/_search
	{
	  "query": {
	    "match": {
	      "title_dict": "nacht"
	    }
	  }
	}
	package com.klm.mysearch;

	import org.apache.lucene.analysis.CharArraySet;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
	import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
	import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
	import org.apache.lucene.analysis.standard.StandardTokenizer;
	import org.junit.Ignore;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.springframework.core.io.FileSystemResource;
	import org.springframework.util.StringUtils;
	import org.xml.sax.InputSource;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.StringReader;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.List;
	import java.util.stream.Collectors;

	@Ignore
	public class TryHyphenation {
	public static final String HYPHEN_CONFIG = "/Users/jettrocoenradie/Downloads/offo-hyphenation/hyph/nl.xml";
	public static final List<String> DICTIONARY = Arrays.asList(
	"zo", "zomer", "vakantie", "huis", "nacht", "personeel", "boek", "koffie", "kop", "kopje");
	private static final Logger LOGGER = LoggerFactory.getLogger(TryHyphenation.class);
	private HyphenationTree tree;

	/**
	* Load the tree using the provided path to a Objects For Formatting Objects language config file.
	*
	* @param hyphenRulesPath Path to the file to load in the tree
	*/
	public TryHyphenation(String hyphenRulesPath) {
	HyphenationTree tree = new HyphenationTree();
	try (InputStream hyphenRules = new FileSystemResource(hyphenRulesPath).getInputStream()) {
	InputSource source = new InputSource(hyphenRules);
	tree.loadPatterns(source);
	} catch (IOException e) {
	LOGGER.error("Problem while loading the hyphen file", e);
	}
	this.tree = tree;
	}

	/**
	* Method to run this sample class
	*
	* @param args Array containing possible arguments, not that we actually use any of them
	*/
	public static void main(String[] args) {
	TryHyphenation hyphenation = new TryHyphenation(HYPHEN_CONFIG);

	String sourceString = "Koffiekopje";

	System.out.println("*** Find Hyphens:");
	List<String> hyphens = hyphenation.hyphenate(sourceString);
	String joinedHyphens = StringUtils.arrayToDelimitedString(
	hyphens.toArray(), " - ");
	System.out.println(joinedHyphens);

	System.out.println("\n*** Find Tokens:");
	List<String> tokens = hyphenation.findTokens(sourceString);
	String joinedTokens = StringUtils.arrayToDelimitedString(tokens.toArray(), ", ");
	System.out.println(joinedTokens);

	System.out.println();
	}

	/**
	* Returns a list of strings containing the found hyphens from the provided string.
	*
	* @param sourceString String to construct the hyphens for.
	* @return List of strings containing the hyphens.
	*/
	public List<String> hyphenate(String sourceString) {
	Hyphenation hyphenator = this.tree.hyphenate(sourceString, 1, 1);
	int[] hyphenationPoints = hyphenator.getHyphenationPoints();
	List<String> parts = new ArrayList<>();
	for (int i = 1; i < hyphenationPoints.length; i++) {
	parts.add(sourceString.substring(hyphenationPoints[i - 1], hyphenationPoints[i]));
	}
	return parts;
	}

	/**
	* Uses the HyphenTree as created in the constructor to find terms in the dictionary that comply to the found
	* hyphens of the provided string.
	* @param sourceString String to find hyphens ans matching terms/tokens for
	* @return List of found terms
	*/
	public List<String> findTokens(String sourceString) {
	StandardTokenizer tokenizer = new StandardTokenizer();
	tokenizer.setReader(new StringReader(sourceString));

	CharArraySet charArraySet = new CharArraySet(DICTIONARY, true);
	AccessibleHyphenationCompoundWordTokenFilter filter =
	new AccessibleHyphenationCompoundWordTokenFilter(tokenizer, tree, charArraySet);
	try {
	filter.reset();
	filter.incrementToken();
	filter.close();
	} catch (IOException e) {
	LOGGER.error("Could not tokenize", e);
	}
	return filter.getTokens();
	}

	/**
	* Subclass needed to get acces to protected parameters of the parent class
	*/
	private class AccessibleHyphenationCompoundWordTokenFilter extends HyphenationCompoundWordTokenFilter {

	public AccessibleHyphenationCompoundWordTokenFilter(TokenStream input,
	HyphenationTree hyphenator,
	CharArraySet dictionary) {
	super(input, hyphenator, dictionary);
	}

	public List<String> getTokens() {
	return tokens.stream().map(compoundToken -> compoundToken.txt.toString())
	.collect(Collectors.toList());
	}

	}
	}