Elasticsearch autocomplete analyzer configuration
<?php

/**
 * @file
 * Configure the ElasticSearch index for our content.
 */

use Elastica\Client;

/**
 * Implements hook_install().
 */
function custom_indexer_install() {
  // Depending on when this module is enabled, composer_autoload may not have
  // had a chance to register the Elastica library.
  composer_autoload_init();
  custom_indexer_rebuild_index();
}
/**
 * Delete and re-create the ElasticSearch index.
 *
 * This routine explicitly describes some of the fields we'll be using, as we
 * need non-default mapping logic.
 */
function custom_indexer_rebuild_index() {
  try {
    $client = new Client(array(
      'connections' => variable_get('elastic_search_connections', array()),
    ));
  }
  catch (Exception $e) {
    // @todo error handling
    watchdog('Custom Indexer', 'Failed to create and configure the ElasticSearch index; could not connect to ElasticSearch.');
    return FALSE;
  }
  $index_name = variable_get('custom_indexer_index', '');
  if (!$index_name) {
    // Index not configured.
    // @todo Error handling.
    watchdog('Custom Indexer', 'The ElasticSearch index variable "custom_indexer_index" is not configured. Configure this variable in settings.local.php and reinstall the custom_indexer module.');
    return FALSE;
  }
  $index = $client->getIndex($index_name);
  // Configure custom analyzers: a stemmed 'fulltext' analyzer for regular
  // search, plus autocomplete analyzers for particular fields.
  $settings = array();
  $settings['analysis']['analyzer']['fulltext'] = array(
    'type' => 'custom',
    'tokenizer' => 'standard',
    'filter' => array('standard', 'lowercase', 'stop', 'englishstemmer'),
  );
  // These two autocomplete analyzers do not include a stemmer; this is
  // because the 'autocomplete_search' analyzer should produce only one token,
  // so that it doesn't match dissimilar results with similar contents (i.e.,
  // titles that contain two of the same words in completely different
  // orders).
  //
  // The 'autocomplete' analyzer should be used for indexing; it generates
  // many tokens. For example, the term "The Dark Knight" will be indexed as:
  //   the d
  //   the da
  //   the dar
  //   ...
  //   the dark k
  //   the dark kn
  //   ...
  //   dar
  //   dark
  //   dark k
  //   dark kn
  //   ...
  //   kni
  //   knig
  //   ...
  //   knight
  //
  // The 'autocomplete_search' analyzer should be used for user input; it
  // generates a single token. Each of the following would match a token from
  // "The Dark Knight", indexed with the 'autocomplete' analyzer as described
  // above:
  //   The Dark Knight = the dark knight
  //   dark kn         = dark kn
  //   dark-knight     = dark knight
  //
  // Another good test is this data set: "Bruce Boa", "Bruno Kirby" and
  // "Dylan Bruno". See the following four searches:
  //   "bru"      = all three results
  //   "bruc"     = just "Bruce Boa"
  //   "bruno"    = both Brunos
  //   "bruno ki" = just "Bruno Kirby"
  //
  // (See custom_indexer_analyze_example() at the bottom of this file for a
  // sketch of verifying these token streams with the _analyze API.)
  $settings['analysis']['analyzer']['autocomplete'] = array(
    'type' => 'custom',
    'tokenizer' => 'standard',
    'filter' => array(
      'standard',
      'lowercase',
      // Generate shingles (sequences of consecutive words). This will allow
      // searches to match continuously as the user types multiple words.
      'autocompleteshingle',
      // Generate "NGrams" (partial word tokens) for each word and sequence of
      // words. These will be at most 15 characters long.
      'autocompletengram',
      // Truncate tokens to 15 characters; anything beyond that would come
      // from long words and long shingles, which would just be extra cruft in
      // the index.
      'autocompletetruncate',
      'trim',
      // Regular tokens + shingles + NGrams = unnecessary duplicate tokens.
      'unique',
      'stop',
    ),
  );
  // Generates a single token. This is the partner to the 'autocomplete'
  // analyzer, and given the same input, it should produce a single token that
  // is one member of the set of tokens produced by that analyzer.
  $settings['analysis']['analyzer']['autocomplete_search'] = array(
    'type' => 'custom',
    'tokenizer' => 'keyword',
    'filter' => array(
      'lowercase',
      // Replace non-word characters with spaces; this should match the
      // behavior of the standard tokenizer + shingling in the 'autocomplete'
      // analyzer.
      'autocompletesearchaddspaces',
      // Truncate tokens to 15 characters; this prevents searches from
      // matching, then not matching, then matching again as a user keeps
      // typing. This corresponds with the 'autocompletengram' configuration.
      'autocompletetruncate',
      // Trim whitespace from the user's input.
      'trim',
      // If this token is a stop word, don't generate any tokens.
      'stop',
    ),
  );
  $settings['analysis']['filter']['autocompleteshingle'] = array(
    'type' => 'shingle',
    'max_shingle_size' => 10,
  );
  $settings['analysis']['filter']['autocompletengram'] = array(
    'type' => 'edgeNGram',
    'min_gram' => 3,
    'max_gram' => 15,
  );
  $settings['analysis']['filter']['autocompletesearchaddspaces'] = array(
    'type' => 'pattern_replace',
    'pattern' => '[^\w]+',
    'replacement' => ' ',
  );
  $settings['analysis']['filter']['autocompletetruncate'] = array(
    'type' => 'truncate',
    'length' => 15,
  );
  $settings['analysis']['filter']['englishstemmer'] = array(
    'type' => 'snowball',
    'language' => 'English',
  );
  // Delete and re-create the index with our settings.
  $response = $index->create(array('settings' => $settings), TRUE);
  if (!$response->isOk()) {
    // @todo error handling
    watchdog('Custom Indexer', 'Failed to create the ElasticSearch index @index', array('@index' => $index_name));
    return FALSE;
  }
  /* ElasticSearch property mappings */

  // Program fields.
  $response = $index
    ->getType('program')
    ->setMapping(array(
      // Titles should be analyzed normally for search, analyzed with NGrams
      // for autocomplete, and not analyzed for sorting. (See
      // custom_indexer_autocomplete_query_example() at the bottom of this
      // file for a sketch of querying these sub-fields.)
      'title' => array(
        'type' => 'multi_field',
        'fields' => array(
          'title' => array(
            'type' => 'string',
            'analyzer' => 'fulltext',
          ),
          'autocomplete' => array(
            'type' => 'string',
            'index_analyzer' => 'autocomplete',
            'search_analyzer' => 'autocomplete_search',
            'include_in_all' => FALSE,
          ),
          'sort' => array(
            'type' => 'string',
            'index' => 'not_analyzed',
            'include_in_all' => FALSE,
          ),
        ),
      ),
      // Most IDs should be treated as strings.
      'imdbid' => array('type' => 'string'),
      'program_id' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'provider_id' => array('type' => 'string'),
      'offers' => array(
        'type' => 'object',
        'index' => 'no',
      ),
      // Map the rating sort field.
      'rating' => array('type' => 'integer'),
      // Offer filters are pre-tokenized by the indexer.
      'offer_filters' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'maturity_rating' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'field_episode_title' => array(
        'dynamic' => 'true',
        'properties' => array(
          'en' => array(
            'dynamic' => 'true',
            'properties' => array(
              'value' => array(
                'type' => 'string',
                'analyzer' => 'fulltext',
              ),
            ),
          ),
        ),
      ),
    ));
}
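
/**
 * Illustrative only: spot-check the analyzers via the _analyze API.
 *
 * This helper is a sketch, not part of the original module, and the function
 * name is hypothetical. It assumes Elastica's Index::request() and the
 * ElasticSearch 1.x-era _analyze endpoint, which accepts the analyzer name
 * and the text to analyze as query-string parameters.
 */
function custom_indexer_analyze_example($text = 'The Dark Knight') {
  $client = new Client(array(
    'connections' => variable_get('elastic_search_connections', array()),
  ));
  $index = $client->getIndex(variable_get('custom_indexer_index', ''));
  foreach (array('fulltext', 'autocomplete', 'autocomplete_search') as $analyzer) {
    // GET /{index}/_analyze?analyzer=...&text=...
    $response = $index->request('_analyze', 'GET', array(), array(
      'analyzer' => $analyzer,
      'text' => $text,
    ));
    $data = $response->getData();
    $tokens = array();
    foreach ($data['tokens'] as $token) {
      $tokens[] = $token['token'];
    }
    // For "The Dark Knight", 'autocomplete' should emit many tokens ("the d",
    // "dark k", "knight", ...) while 'autocomplete_search' should emit just
    // one ("the dark knight").
    watchdog('Custom Indexer', '@analyzer: @tokens', array(
      '@analyzer' => $analyzer,
      '@tokens' => implode(', ', $tokens),
    ));
  }
}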
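
/**
 * Illustrative only: run an autocomplete query against the title mapping.
 *
 * Also a sketch rather than part of the original module; the function name is
 * hypothetical. Because the mapping above declares a separate search_analyzer,
 * a plain match query on 'title.autocomplete' analyzes the user's input with
 * 'autocomplete_search' and matches it against the NGram tokens produced at
 * index time, while 'title.sort' provides a not_analyzed value to sort on.
 */
function custom_indexer_autocomplete_query_example($input = 'dark kn') {
  $client = new Client(array(
    'connections' => variable_get('elastic_search_connections', array()),
  ));
  $index = $client->getIndex(variable_get('custom_indexer_index', ''));

  // Match the user's partial input against the autocomplete sub-field.
  $match = new \Elastica\Query\Match();
  $match->setFieldQuery('title.autocomplete', $input);

  $query = new \Elastica\Query($match);
  $query->setSort(array('title.sort' => 'asc'));
  $query->setSize(10);

  $titles = array();
  foreach ($index->getType('program')->search($query)->getResults() as $result) {
    $source = $result->getSource();
    $titles[] = $source['title'];
  }
  // With the "Bruce Boa" / "Bruno Kirby" / "Dylan Bruno" data set described
  // above, an input of 'bruno ki' should return just "Bruno Kirby".
  return $titles;
}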