Skip to content

Instantly share code, notes, and snippets.

@babadofar
Last active September 30, 2015 11:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save babadofar/858df2b321f2209e20af to your computer and use it in GitHub Desktop.
Save babadofar/858df2b321f2209e20af to your computer and use it in GitHub Desktop.
hyphenation decompounder
{
"settings": {
"analysis": {
"analyzer": {
"hyph_decoumpound_list": {
"tokenizer": "standard",
"filter": [
"standard",
"lowercase",
"hyph_decompound_list"
]
},
"dict_decoumpound_list": {
"filter": [
"standard",
"lowercase",
"dict_decompound_list"
],
"tokenizer": "standard"
}
},
"filter": {
"hyph_decompound_list": {
"type": "hyphenation_decompounder",
"word_list_path": "config/lang/words.txt",
"hyphenation_patterns_path": "config/lang/hyphenation/nb.xml"
},
"dict_decompound_list": {
"only_longest_match": false,
"word_list_path": "config/lang/words.txt",
"type": "dictionary_decompounder",
"min_subword_size": 3
}
}
}
}
}
GET /wiki/_analyze?analyzer=dict_decoumpound_list&text=vanskelig forsikringsforvaltning husbanken
Output:
{
"tokens": [
{
"token": "vanskelig",
"start_offset": 0,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "van",
"start_offset": 0,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "vanske",
"start_offset": 0,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "vanskelig",
"start_offset": 0,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "forsikringsforvaltning",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "forsikring",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "sikring",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "kri",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "kring",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "ring",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "forvaltning",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "orv",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "val",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "alt",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "husbanken",
"start_offset": 33,
"end_offset": 42,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "hus",
"start_offset": 33,
"end_offset": 42,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "husbank",
"start_offset": 33,
"end_offset": 42,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "bank",
"start_offset": 33,
"end_offset": 42,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "banke",
"start_offset": 33,
"end_offset": 42,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "anke",
"start_offset": 33,
"end_offset": 42,
"type": "<ALPHANUM>",
"position": 3
}
]
}
GET /wiki/_analyze?analyzer=hyph_decoumpound_list&text=vanskelig forsikringsforvaltning husbanken
Output:
{
"tokens": [
{
"token": "vanskelig",
"start_offset": 0,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "forsikringsforvaltning",
"start_offset": 10,
"end_offset": 32,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "husbanken",
"start_offset": 33,
"end_offset": 42,
"type": "<ALPHANUM>",
"position": 3
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment