Skip to content

Instantly share code, notes, and snippets.

@becw
Last active July 8, 2021 09:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save becw/79362c7379687a904ebc to your computer and use it in GitHub Desktop.
Elasticsearch autocomplete analyzer configuration
<?php
/**
* @file
* Configure the ElasticSearch index for our content.
*/
use Elastica\Client;
/**
 * Implements hook_install().
 *
 * Builds (or rebuilds) the ElasticSearch index when the module is installed.
 */
function custom_indexer_install() {
// Depending on when this module is enabled, composer_autoload may not have
// had a chance to register the Elastica library.
composer_autoload_init();
// NOTE(review): the FALSE-on-failure return value is ignored here, so
// installation completes even when the index could not be built — confirm
// that a later cron/drush rebuild covers that case.
custom_indexer_rebuild_index();
}
/**
 * Delete and re-create the ElasticSearch index.
 *
 * This routine explicitly describes some fields we'll be using, as we need
 * non-default mapping logic (a full-text analyzer with stemming, plus a
 * paired set of autocomplete analyzers for index-time and search-time).
 *
 * @return bool
 *   TRUE when the index was created and mapped successfully, FALSE on any
 *   failure (connection, missing configuration, index creation, or mapping).
 */
function custom_indexer_rebuild_index() {
  try {
    $client = new Client(array(
      'connections' => variable_get('elastic_search_connections', array()),
    ));
  }
  catch (Exception $e) {
    // Include the underlying exception message so connection problems are
    // actually diagnosable from the log, and log at ERROR severity rather
    // than the watchdog() default of NOTICE.
    watchdog('Custom Indexer', 'Failed to create and configure the ElasticSearch index; could not connect to ElasticSearch: @message', array('@message' => $e->getMessage()), WATCHDOG_ERROR);
    return FALSE;
  }

  $index_name = variable_get('custom_indexer_index', '');
  if (!$index_name) {
    // Index not configured; there is nothing sensible to build.
    watchdog('Custom Indexer', 'The ElasticSearch index variable "custom_indexer_index" is not configured. Configure this variable in settings.local.php and reinstall the custom_indexer module.', array(), WATCHDOG_ERROR);
    return FALSE;
  }
  // Reuse the name we just fetched and validated instead of reading the
  // variable a second time (the original re-called variable_get() here).
  $index = $client->getIndex($index_name);

  // Configure a custom analyzer for doing autocomplete on particular fields.
  $settings = array();
  $settings['analysis']['analyzer']['fulltext'] = array(
    'type' => 'custom',
    'tokenizer' => 'standard',
    'filter' => array('standard', 'lowercase', 'stop', 'englishstemmer'),
  );

  // These two autocomplete analyzers do not include a stemmer; this is because
  // the 'autocomplete_search' analyzer should produce only one token, so that
  // it doesn't match dissimilar results with similar contents (ie, titles that
  // contain two of the same words in completely different orders).
  //
  // The 'autocomplete' analyzer should be used for indexing; it generates many
  // tokens. For example, the term "The Dark Knight" will be indexed as:
  //   the d
  //   the da
  //   the dar
  //   ...
  //   the dark k
  //   the dark kn
  //   ...
  //   dar
  //   dark
  //   dark k
  //   dark kn
  //   ...
  //   kni
  //   knig
  //   ...
  //   knight
  //
  // The 'autocomplete_search' analyzer should be used for user input; it
  // generates a single token. Each of the following would match a token from
  // "The Dark Knight", indexed with the 'autocomplete' analyzer as described
  // above:
  //   The Dark Knight = the dark knight
  //   dark kn = dark kn
  //   dark-knight = dark knight
  //
  // Another good test is this data set: "Bruce Boa", "Bruno Kirby" and
  // "Dylan Bruno". See the following three searches:
  //   "bru" = all three results
  //   "bruc" = just "Bruce Boa"
  //   "bruno" = both brunos
  //   "bruno ki" = just "Bruno Kirby"
  $settings['analysis']['analyzer']['autocomplete'] = array(
    'type' => 'custom',
    'tokenizer' => 'standard',
    'filter' => array(
      'standard',
      'lowercase',
      // Generate shingles (sequences of consecutive words). This will allow
      // searches to match continuously as the user types multiple words.
      'autocompleteshingle',
      // Generate "NGrams" (partial word tokens) for each word and sequence of
      // words. These will be at most 15 characters long.
      'autocompletengram',
      // Don't index tokens longer than 15 characters; this would include long
      // words and long shingles, which will just be extra cruft in the index.
      'autocompletetruncate',
      'trim',
      // Regular tokens + shingles + NGrams = unnecessary duplicate tokens.
      'unique',
      'stop',
    ),
  );

  // Generates a single token. This is the partner to the 'autocomplete'
  // analyzer, and given the same input, it should produce a single token that
  // is one member of the set of tokens produced by that analyzer.
  $settings['analysis']['analyzer']['autocomplete_search'] = array(
    'type' => 'custom',
    'tokenizer' => 'keyword',
    'filter' => array(
      'lowercase',
      // Replace non-word characters with spaces; this should match the behavior
      // of the standard tokenizer + shingling in the 'autocomplete' analyzer.
      'autocompletesearchaddspaces',
      // Truncate tokens to 15 characters; this prevents searches from matching,
      // then not matching, then matching again as a user keeps typing. This
      // corresponds with the 'autocompletengram' configuration.
      'autocompletetruncate',
      // Trim whitespace from the user's input.
      'trim',
      // If this token is a stop word, don't generate any tokens.
      'stop',
    ),
  );

  // Custom token filters referenced by the analyzers above.
  $settings['analysis']['filter']['autocompleteshingle'] = array(
    'type' => 'shingle',
    'max_shingle_size' => 10,
  );
  $settings['analysis']['filter']['autocompletengram'] = array(
    'type' => 'edgeNGram',
    'min_gram' => 3,
    'max_gram' => 15,
  );
  $settings['analysis']['filter']['autocompletesearchaddspaces'] = array(
    'type' => 'pattern_replace',
    'pattern' => '[^\w]+',
    'replacement' => ' ',
  );
  $settings['analysis']['filter']['autocompletetruncate'] = array(
    'type' => 'truncate',
    'length' => 15,
  );
  $settings['analysis']['filter']['englishstemmer'] = array(
    'type' => 'snowball',
    'language' => 'English',
  );

  // Delete and re-create the index with our settings.
  $response = $index->create(array('settings' => $settings), TRUE);
  if (!$response->isOk()) {
    watchdog('Custom Indexer', 'Failed to create the ElasticSearch index @index', array('@index' => $index_name), WATCHDOG_ERROR);
    return FALSE;
  }

  /* ElasticSearch property mappings */
  // Program fields.
  $response = $index
    ->getType('program')
    ->setMapping(array(
      // Titles should be analyzed normally for search, analyzed with NGrams for
      // autocomplete, and not analyzed for sorting.
      'title' => array(
        'type' => 'multi_field',
        'fields' => array(
          'title' => array(
            'type' => 'string',
            'analyzer' => 'fulltext',
          ),
          'autocomplete' => array(
            'type' => 'string',
            'index_analyzer' => 'autocomplete',
            'search_analyzer' => 'autocomplete_search',
            'include_in_all' => 'false',
          ),
          'sort' => array(
            'type' => 'string',
            'index' => 'not_analyzed',
            'include_in_all' => 'false',
          ),
        ),
      ),
      // Most IDs should be treated as strings.
      'imdbid' => array('type' => 'string'),
      'program_id' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'provider_id' => array('type' => 'string'),
      'offers' => array(
        'type' => 'object',
        'index' => 'no',
      ),
      // Map the rating sort field.
      'rating' => array('type' => 'integer'),
      // Offer filters are pre-tokenized by the indexer.
      'offer_filters' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'maturity_rating' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'field_episode_title' => array(
        'dynamic' => 'true',
        'properties' => array(
          'en' => array(
            'dynamic' => 'true',
            'properties' => array(
              'value' => array(
                'type' => 'string',
                'analyzer' => 'fulltext',
              ),
            ),
          ),
        ),
      ),
    ));
  // Surface mapping failures instead of silently discarding the response
  // (the original assigned $response here but never checked it).
  if (!$response->isOk()) {
    watchdog('Custom Indexer', 'Failed to set the program mapping on the ElasticSearch index @index', array('@index' => $index_name), WATCHDOG_ERROR);
    return FALSE;
  }

  // Explicit success signal; previously the function fell off the end and
  // returned NULL, which callers could not tell apart from failure.
  return TRUE;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment