Elasticsearch autocomplete analyzer configuration
<?php

/**
 * @file
 * Configure the ElasticSearch index for our content.
 */

use Elastica\Client;

/**
 * Implements hook_install().
 */
function custom_indexer_install() {
  // Depending on when this module is enabled, composer_autoload may not have
  // had a chance to register the Elastica library.
  composer_autoload_init();
  custom_indexer_rebuild_index();
}
/**
 * Delete and re-create the ElasticSearch index.
 *
 * This routine explicitly describes some of the fields we'll be using, as we
 * need non-default mapping logic.
 */
function custom_indexer_rebuild_index() {
  try {
    $client = new Client(array(
      'connections' => variable_get('elastic_search_connections', array()),
    ));
  }
  catch (Exception $e) {
    // @todo error handling
    watchdog('Custom Indexer', 'Failed to create and configure the ElasticSearch index; could not connect to ElasticSearch.');
    return FALSE;
  }
  $index_name = variable_get('custom_indexer_index', '');
  if (!$index_name) {
    // Index not configured.
    // @todo Error handling.
    watchdog('Custom Indexer', 'The ElasticSearch index variable "custom_indexer_index" is not configured. Configure this variable in settings.local.php and reinstall the custom_indexer module.');
    return FALSE;
  }
  $index = $client->getIndex($index_name);
  // Configure custom analyzers: a stemmed 'fulltext' analyzer for regular
  // search, plus autocomplete analyzers for particular fields.
  $settings = array();
  $settings['analysis']['analyzer']['fulltext'] = array(
    'type' => 'custom',
    'tokenizer' => 'standard',
    'filter' => array('standard', 'lowercase', 'stop', 'englishstemmer'),
  );
  // These two autocomplete analyzers do not include a stemmer; this is
  // because the 'autocomplete_search' analyzer should produce only one token,
  // so that it doesn't match dissimilar results with similar contents (i.e.,
  // titles that contain two of the same words in completely different
  // orders).
  //
  // The 'autocomplete' analyzer should be used for indexing; it generates
  // many tokens. For example, the term "The Dark Knight" will be indexed as:
  //   the d
  //   the da
  //   the dar
  //   ...
  //   the dark k
  //   the dark kn
  //   ...
  //   dar
  //   dark
  //   dark k
  //   dark kn
  //   ...
  //   kni
  //   knig
  //   ...
  //   knight
  //
  // The 'autocomplete_search' analyzer should be used for user input; it
  // generates a single token. Each of the following would match a token from
  // "The Dark Knight", indexed with the 'autocomplete' analyzer as described
  // above:
  //   The Dark Knight = the dark knight
  //   dark kn         = dark kn
  //   dark-knight     = dark knight
  //
  // Another good test is this data set: "Bruce Boa", "Bruno Kirby" and
  // "Dylan Bruno". See the following four searches:
  //   "bru"      = all three results
  //   "bruc"     = just "Bruce Boa"
  //   "bruno"    = both Brunos
  //   "bruno ki" = just "Bruno Kirby"
  //
  // (See custom_indexer_analyze_example() at the bottom of this file for a
  // sketch of verifying these token streams with the _analyze API.)
  $settings['analysis']['analyzer']['autocomplete'] = array(
    'type' => 'custom',
    'tokenizer' => 'standard',
    'filter' => array(
      'standard',
      'lowercase',
      // Generate shingles (sequences of consecutive words). This will allow
      // searches to match continuously as the user types multiple words.
      'autocompleteshingle',
      // Generate "NGrams" (partial word tokens) for each word and sequence of
      // words. These will be at most 15 characters long.
      'autocompletengram',
      // Truncate tokens to 15 characters; anything beyond that would come
      // from long words and long shingles, which would just be extra cruft in
      // the index.
      'autocompletetruncate',
      'trim',
      // Regular tokens + shingles + NGrams = unnecessary duplicate tokens.
      'unique',
      'stop',
    ),
  );
  // Generates a single token. This is the partner to the 'autocomplete'
  // analyzer, and given the same input, it should produce a single token that
  // is one member of the set of tokens produced by that analyzer.
  $settings['analysis']['analyzer']['autocomplete_search'] = array(
    'type' => 'custom',
    'tokenizer' => 'keyword',
    'filter' => array(
      'lowercase',
      // Replace non-word characters with spaces; this should match the
      // behavior of the standard tokenizer + shingling in the 'autocomplete'
      // analyzer.
      'autocompletesearchaddspaces',
      // Truncate tokens to 15 characters; this prevents searches from
      // matching, then not matching, then matching again as a user keeps
      // typing. This corresponds with the 'autocompletengram' configuration.
      'autocompletetruncate',
      // Trim whitespace from the user's input.
      'trim',
      // If this token is a stop word, don't generate any tokens.
      'stop',
    ),
  );
  $settings['analysis']['filter']['autocompleteshingle'] = array(
    'type' => 'shingle',
    'max_shingle_size' => 10,
  );
  $settings['analysis']['filter']['autocompletengram'] = array(
    'type' => 'edgeNGram',
    'min_gram' => 3,
    'max_gram' => 15,
  );
  $settings['analysis']['filter']['autocompletesearchaddspaces'] = array(
    'type' => 'pattern_replace',
    'pattern' => '[^\w]+',
    'replacement' => ' ',
  );
  $settings['analysis']['filter']['autocompletetruncate'] = array(
    'type' => 'truncate',
    'length' => 15,
  );
  $settings['analysis']['filter']['englishstemmer'] = array(
    'type' => 'snowball',
    'language' => 'English',
  );
  // Delete and re-create the index with our settings.
  $response = $index->create(array('settings' => $settings), TRUE);
  if (!$response->isOk()) {
    // @todo error handling
    watchdog('Custom Indexer', 'Failed to create the ElasticSearch index @index', array('@index' => $index_name));
    return FALSE;
  }
  /* ElasticSearch property mappings */

  // Program fields.
  $response = $index
    ->getType('program')
    ->setMapping(array(
      // Titles should be analyzed normally for search, analyzed with NGrams
      // for autocomplete, and not analyzed for sorting. (See
      // custom_indexer_autocomplete_query_example() at the bottom of this
      // file for a sketch of querying these sub-fields.)
      'title' => array(
        'type' => 'multi_field',
        'fields' => array(
          'title' => array(
            'type' => 'string',
            'analyzer' => 'fulltext',
          ),
          'autocomplete' => array(
            'type' => 'string',
            'index_analyzer' => 'autocomplete',
            'search_analyzer' => 'autocomplete_search',
            'include_in_all' => FALSE,
          ),
          'sort' => array(
            'type' => 'string',
            'index' => 'not_analyzed',
            'include_in_all' => FALSE,
          ),
        ),
      ),
      // Most IDs should be treated as strings.
      'imdbid' => array('type' => 'string'),
      'program_id' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'provider_id' => array('type' => 'string'),
      'offers' => array(
        'type' => 'object',
        'index' => 'no',
      ),
      // Map the rating sort field.
      'rating' => array('type' => 'integer'),
      // Offer filters are pre-tokenized by the indexer.
      'offer_filters' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'maturity_rating' => array(
        'type' => 'string',
        'index' => 'not_analyzed',
      ),
      'field_episode_title' => array(
        'dynamic' => 'true',
        'properties' => array(
          'en' => array(
            'dynamic' => 'true',
            'properties' => array(
              'value' => array(
                'type' => 'string',
                'analyzer' => 'fulltext',
              ),
            ),
          ),
        ),
      ),
    ));
}
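
/**
 * Illustrative only: spot-check the analyzers via the _analyze API.
 *
 * This helper is a sketch, not part of the original module, and the function
 * name is hypothetical. It assumes Elastica's Index::request() and the
 * ElasticSearch 1.x-era _analyze endpoint, which accepts the analyzer name
 * and the text to analyze as query-string parameters.
 */
function custom_indexer_analyze_example($text = 'The Dark Knight') {
  $client = new Client(array(
    'connections' => variable_get('elastic_search_connections', array()),
  ));
  $index = $client->getIndex(variable_get('custom_indexer_index', ''));
  foreach (array('fulltext', 'autocomplete', 'autocomplete_search') as $analyzer) {
    // GET /{index}/_analyze?analyzer=...&text=...
    $response = $index->request('_analyze', 'GET', array(), array(
      'analyzer' => $analyzer,
      'text' => $text,
    ));
    $data = $response->getData();
    $tokens = array();
    foreach ($data['tokens'] as $token) {
      $tokens[] = $token['token'];
    }
    // For "The Dark Knight", 'autocomplete' should emit many tokens ("the d",
    // "dark k", "knight", ...) while 'autocomplete_search' should emit just
    // one ("the dark knight").
    watchdog('Custom Indexer', '@analyzer: @tokens', array(
      '@analyzer' => $analyzer,
      '@tokens' => implode(', ', $tokens),
    ));
  }
}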
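
/**
 * Illustrative only: run an autocomplete query against the title mapping.
 *
 * Also a sketch rather than part of the original module; the function name is
 * hypothetical. Because the mapping above declares a separate search_analyzer,
 * a plain match query on 'title.autocomplete' analyzes the user's input with
 * 'autocomplete_search' and matches it against the NGram tokens produced at
 * index time, while 'title.sort' provides a not_analyzed value to sort on.
 */
function custom_indexer_autocomplete_query_example($input = 'dark kn') {
  $client = new Client(array(
    'connections' => variable_get('elastic_search_connections', array()),
  ));
  $index = $client->getIndex(variable_get('custom_indexer_index', ''));

  // Match the user's partial input against the autocomplete sub-field.
  $match = new \Elastica\Query\Match();
  $match->setFieldQuery('title.autocomplete', $input);

  $query = new \Elastica\Query($match);
  $query->setSort(array('title.sort' => 'asc'));
  $query->setSize(10);

  $titles = array();
  foreach ($index->getType('program')->search($query)->getResults() as $result) {
    $source = $result->getSource();
    $titles[] = $source['title'];
  }
  // With the "Bruce Boa" / "Bruno Kirby" / "Dylan Bruno" data set described
  // above, an input of 'bruno ki' should return just "Bruno Kirby".
  return $titles;
}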