Skip to content

Instantly share code, notes, and snippets.

@heddn
Created October 25, 2012 20:53
Show Gist options
  • Save heddn/3955320 to your computer and use it in GitHub Desktop.
Save heddn/3955320 to your computer and use it in GitHub Desktop.
Index non-drupal content
<?php
/**
* @file
* Example Drush commands.
*/
/**
 * Implements hook_drush_command().
 *
 * Registers the 'example-solr-index' command (alias 'msi'), which is handled
 * by drush_example_index().
 *
 * @return array
 *   An associative array describing your command(s).
 */
function example_drush_command() {
  $items = array();
  $items['example-solr-index'] = array(
    'callback' => 'drush_example_index',
    'description' => dt('Indexes specified content.'),
    'arguments' => array(
      'types' => dt('A space delimited list of entity types to be re-indexed.'),
    ),
    'options' => array(
      'force' => dt('Optional. Re-indexes all content of specified entity type.'),
    ),
    // The callback appends the entity type to a PHP function name
    // (e.g. example_entity_ids_external_content), so the examples have to use
    // the underscore form of the type; a hyphenated name cannot be a function.
    'examples' => array(
      'drush example-solr-index external_content' => dt('Indexes all un-indexed external_content entities.'),
      'drush example-solr-index --force external_content' => dt('Re-indexes all external_content entities.'),
    ),
    'aliases' => array(
      'msi',
    ),
  );
  return $items;
}
/**
 * Implements hook_drush_help().
 *
 * Drush invokes this hook whenever a user runs
 * 'drush help <name-of-your-command>'.
 *
 * @param string $section
 *   The help section being requested, prefixed with 'drush:'.
 *
 * @return string|null
 *   The help text for the 'drush:example-solr-index' section. Nothing is
 *   returned for any other section.
 */
function example_drush_help($section) {
  // Only one command is provided, so a single guard replaces the switch.
  if ($section == 'drush:example-solr-index') {
    return dt("Indexes specified content. If you want to re-index all content, specify --force.");
  }
}
/**
 * Selectively (re)-indexes content in the solr index.
 *
 * Command callback for 'example-solr-index'. Every command argument names an
 * entity type whose content should be (re)-indexed. For each type two
 * functions are looked up by name and must exist:
 *  - example_entity_ids_<type>($force): returns an array with the keys
 *    'delete_ids' and 'create_ids'.
 *  - example_retrieve_documents_<type>($create_ids): returns the documents to
 *    index.
 *
 * Hyphens in an argument are treated as underscores, because the argument
 * becomes part of a PHP function name.
 *
 * With the --force option every document is deleted from the index before it
 * is re-sent, in chunks of 20.
 *
 * @return bool|null
 *   FALSE (the result of drush_set_error()) when no entity type was given,
 *   nothing otherwise.
 *
 * @throws ErrorException
 *   Thrown by _example_function_name() when a required per-type function is
 *   missing.
 */
function drush_example_index() {
  module_load_include('inc', 'apachesolr', 'apachesolr.index');
  module_load_include('inc', 'example', 'example_external_index');

  $args = func_get_args();
  if (count($args) == 0) {
    // Return right away so that no "Done" message follows the error.
    return drush_set_error('EXAMPLE_NO_ENTITY_TYPE', dt('An entity type must be specified'));
  }

  $force = drush_get_option('force', FALSE);
  $env_id = apachesolr_default_environment();

  foreach ($args as $arg) {
    // 'external-content' can never be part of a function name; accept it as
    // an alias of 'external_content'.
    $entity_type = str_replace('-', '_', $arg);

    // Retrieve the entity ids.
    $function_name = _example_function_name('example_entity_ids_', $entity_type);
    $entity_ids = call_user_func($function_name, $force);

    // Delete documents from the index. Documents are indexed under the
    // 18-digit hash that example_document_prepare() derives from the external
    // id, so the same derivation is needed here for the delete to match.
    foreach ($entity_ids['delete_ids'] as $entity_id) {
      $indexed_entity_id = substr(base_convert(sha1($entity_id), 16, 10), 0, 18);
      apachesolr_index_delete_entity_from_index($env_id, $entity_type, $indexed_entity_id);
    }

    // Retrieve the new documents.
    $function_name = _example_function_name('example_retrieve_documents_', $entity_type);
    $documents = call_user_func($function_name, $entity_ids['create_ids']);

    // Prepare the new documents; the result is keyed by hashed entity id.
    $documents = example_document_prepare($documents);

    if ($force) {
      // Force replacement of all index entries: remove each entry, then
      // create the new index entries chunk by chunk. Keys are preserved
      // because they are the ids to delete.
      foreach (array_chunk($documents, 20, TRUE) as $docs) {
        foreach ($docs as $entity_id => $doc) {
          apachesolr_index_delete_entity_from_index($env_id, $entity_type, $entity_id);
        }
        apachesolr_index_send_to_solr($env_id, $docs);
      }
    }
    else {
      apachesolr_index_send_to_solr($env_id, $documents);
    }
  }

  // Pass the user-supplied types as a placeholder instead of concatenating
  // them into the translatable string.
  drush_print(dt('Done (re)-indexing : !types', array('!types' => implode(' ', $args))));
}
/**
 * Builds a function name from two parts and verifies that it can be called.
 *
 * @param string $prefix
 *   The leading part of the function name.
 * @param string $suffix
 *   The trailing part of the function name.
 *
 * @return string
 *   The concatenated, callable function name.
 *
 * @throws ErrorException
 *   When the concatenated name is not callable. E_USER_ERROR is passed as the
 *   exception code.
 */
function _example_function_name($prefix, $suffix) {
  $callback = $prefix . $suffix;
  if (is_callable($callback)) {
    return $callback;
  }
  $message = ucwords($suffix) . ' not implemented correctly. Function missing: ' . $callback;
  throw new ErrorException($message, E_USER_ERROR);
}
<?php
/**
* @file
* External indexing for non-Drupal content.
*/
/**
 * Determines which external_content ids to add to / delete from solr.
 *
 * The current id list is fetched through external_content_api('GetIDs') and
 * compared with the list stored in the 'example_external_content' variable
 * by the previous run. When the lists differ, or when $force is set, the
 * stored list is replaced by the current one.
 *
 * @param bool $force
 *   TRUE to return every current id for (re)-indexing even when the id list
 *   has not changed since the previous run.
 *
 * @return array
 *   An array with two keys:
 *   - delete_ids: previously stored ids that are no longer reported.
 *   - create_ids: all currently reported ids.
 *   Both are empty when nothing changed and $force is FALSE.
 */
function example_entity_ids_external_content($force = FALSE) {
  $entity_ids = array(
    'delete_ids' => array(),
    'create_ids' => array(),
  );

  // Retrieve the external_content entity ids using the SOAP veneer.
  $cid = 'example_documentIds';
  $response = external_content_api('GetIDs', array(
    'localizationCode' => 'en-us',
  ), $cid);

  // Cache the entity ids.
  cache_set($cid, $response, 'cache_external_content');

  // Grab only the entity ids.
  $response_entity_ids = $response->GetIDsResult->string;

  // The ids recorded by the previous run.
  $stored_entity_ids = variable_get('example_external_content', array());

  // Check if there are any new entities to index in solr.
  if ($force || $response_entity_ids != $stored_entity_ids) {
    // Calculate the ids that need to be deleted from the index.
    $entity_ids['delete_ids'] = array_diff($stored_entity_ids, $response_entity_ids);
    // Assign the list of entity ids that need to be created in the index.
    // (The original referenced a bare constant here: the "$" was missing.)
    $entity_ids['create_ids'] = $response_entity_ids;
    // Put away the new list of entity ids.
    variable_set('example_external_content', $response_entity_ids);
  }

  return $entity_ids;
}
/**
 * Fetches external_content documents and extracts their indexable fields.
 *
 * Each returned document object contains:
 *   $document->entity_id (string or long)
 *   $document->entity_type (string)
 *   $document->title (string)
 *   $document->body (string)
 *   $document->path (string)
 * The optional $document->language is not set by this function.
 *
 * The number of documents returned is capped by the
 * 'example_external_content_limit' variable (default 100), which is helpful
 * in dev environments where not every document needs indexing. A limit of
 * zero or less disables the cap.
 *
 * @param array $entity_ids
 *   The external_content entity ids to index in solr.
 *
 * @return array
 *   stdClass array of documents to index.
 */
function example_retrieve_documents_external_content($entity_ids = array()) {
  $documents = array();
  $limit = variable_get('example_external_content_limit', 100);

  foreach ($entity_ids as $entity_id) {
    $cid = $entity_id;
    $response = external_content_api('GetDocument', array(
      'ext_id' => $entity_id,
    ), $cid);
    cache_set($cid, $response, 'cache_external_content');

    $xml_document = new DOMDocument();
    $xml_document->loadXML($response->GetRawDocument->xml);
    $html = external_content_transform($xml_document, 'content');

    // DOMDocument complains about malformed html. Collect those errors
    // internally while parsing, then discard them and restore the previous
    // libxml setting so the buffer does not grow and other code is unaffected.
    $html_document = new DOMDocument();
    $previous_setting = libxml_use_internal_errors(TRUE);
    $html_document->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);

    $xpath = new DOMXPath($html_document);
    // item(0) is NULL when the query matches nothing; fall back to an empty
    // string instead of dereferencing NULL.
    $title_node = $xpath->query('//h1[@id="ContentTitle"]')->item(0);
    $body_node = $xpath->query('//div[contains(@class, "BodySection")]')->item(0);

    $document = new stdClass();
    $document->entity_id = $entity_id;
    $document->entity_type = 'external_content';
    $document->title = $title_node ? $title_node->nodeValue : '';
    $document->body = $body_node ? $body_node->nodeValue : '';
    $document->path = 'external_content/' . $entity_id;
    $documents[] = $document;

    // Stop once exactly $limit documents have been collected. Comparing the
    // zero-based loop index with the limit returned one document too many.
    if (count($documents) == $limit) {
      break;
    }
  }

  return $documents;
}
/**
 * Converts plain document objects into ApacheSolrDocument objects.
 *
 * @param array $documents
 *   The stdClass array of documents to index. Each document provides
 *   entity_id, entity_type, title, body and path, plus an optional language.
 *
 * @return array
 *   ApacheSolrDocument objects keyed by the hashed numeric entity id that is
 *   also stored in each document's 'entity_id' field.
 */
function example_document_prepare($documents = array()) {
  $solr_documents = array();

  // These values are the same for every document, so compute them once.
  $site_url = url(NULL, array(
    'absolute' => TRUE,
  ));
  $site_hash = apachesolr_site_hash();

  foreach ($documents as $document) {
    // Since we can't use drupal's nid, make sure we get a mostly unique
    // numeric entity id. Still a possible chance of clashes since we have to
    // trim the number to 18 positions.
    $entity_id_hash = substr(base_convert(sha1($document->entity_id), 16, 10), 0, 18);

    // The language is optional; !empty() avoids an undefined property notice
    // when the document carries none.
    $language = !empty($document->language) ? $document->language : LANGUAGE_NONE;

    $content = apachesolr_clean_text($document->body);

    $solr_document = new ApacheSolrDocument();
    $solr_document->addField('id', apachesolr_document_id($entity_id_hash, $document->entity_type));
    $solr_document->addField('site', $site_url);
    $solr_document->addField('hash', $site_hash);
    $solr_document->addField('entity_id', $entity_id_hash);
    $solr_document->addField('entity_type', $document->entity_type);
    $solr_document->addField('bundle', $document->entity_type);
    $solr_document->addField('bundle_name', ucwords($document->entity_type));
    $solr_document->addField('path', $document->path);
    $solr_document->addField('url', url($document->path, array(
      'absolute' => TRUE,
    )));
    $solr_document->addField('language', $language);
    $solr_document->addField('label', apachesolr_clean_text($document->title));
    $solr_document->addField('content', $content);
    $solr_document->addField('teaser', truncate_utf8($content, 300, TRUE));

    $solr_documents[$entity_id_hash] = $solr_document;
  }

  return $solr_documents;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment