Created
October 25, 2012 20:53
-
-
Save heddn/3955320 to your computer and use it in GitHub Desktop.
Index non-drupal content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * @file
 * Example Drush commands for indexing non-Drupal content in Apache Solr.
 */
/**
 * Implements hook_drush_command().
 *
 * Declares the 'example-solr-index' command (alias 'msi'), which is handled
 * by drush_example_index().
 *
 * @return array
 *   An associative array of command definitions keyed by command name.
 */
function example_drush_command() {
  // Assemble the single command definition piece by piece.
  $command = array();
  $command['callback'] = 'drush_example_index';
  $command['description'] = dt('Indexes specified content.');
  $command['arguments'] = array(
    'types' => dt('A space delimited list of entity types to be re-indexed.'),
  );
  $command['options'] = array(
    'force' => dt('Optional. Re-indexes all content of specified entity type.'),
  );
  $command['examples'] = array(
    'drush example-solr-index external-content' => dt('Indexes all un-indexed external-content entities.'),
    'drush example-solr-index --force external-content' => dt('Re-indexes all external-content entities.'),
  );
  $command['aliases'] = array('msi');

  return array('example-solr-index' => $command);
}
/**
 * Implements hook_drush_help().
 *
 * Invoked when a drush user runs 'drush help <name-of-your-command>'.
 *
 * @param string $section
 *   The help section being requested, prefixed with 'drush:'.
 *
 * @return string|null
 *   The help text for 'drush:example-solr-index'; nothing for any other
 *   section.
 */
function example_drush_help($section) {
  // Loose comparison on purpose: it mirrors how a switch/case would match.
  if ($section == 'drush:example-solr-index') {
    return dt("Indexes specified content. If you want to re-index all content, specify --force.");
  }
}
/**
 * Drush callback: selectively (re-)indexes content in the Solr index.
 *
 * Every command argument is an entity type. For each type the callbacks
 * example_entity_ids_<type>() and example_retrieve_documents_<type>() are
 * resolved through _example_function_name(); stale ids are removed from the
 * index and the prepared documents are sent to Solr. With --force, each
 * document is deleted from the index before it is re-sent, in chunks of 20.
 *
 * @return bool|null
 *   FALSE (the result of drush_set_error()) when no entity type was given,
 *   otherwise nothing.
 */
function drush_example_index() {
  module_load_include('inc', 'apachesolr', 'apachesolr.index');
  module_load_include('inc', 'example', 'example_external_index');

  $args = func_get_args();
  if (empty($args)) {
    // Bail out here so the "Done" message below is not printed after an
    // error.
    return drush_set_error('EXAMPLE_NO_ENTITY_TYPE', dt('An entity type must be specified'));
  }

  $force = drush_get_option('force', FALSE);
  $env_id = apachesolr_default_environment();

  foreach ($args as $entity_type) {
    // The documented usage passes hyphenated names ('external-content'), but
    // the per-type callbacks and the entity_type stored on the documents use
    // underscores ('external_content').
    $entity_type = str_replace('-', '_', $entity_type);

    // Retrieve the entity ids.
    $function_name = _example_function_name('example_entity_ids_', $entity_type);
    $entity_ids = call_user_func($function_name, $force);

    // Delete stale documents from the index. example_document_prepare()
    // indexes every document under a hashed numeric id, so the same
    // derivation is applied here to address the stored document.
    // NOTE(review): relies on apachesolr_index_delete_entity_from_index()
    // matching on the id the document was indexed with - confirm.
    foreach ($entity_ids['delete_ids'] as $entity_id) {
      $indexed_id = substr(base_convert(sha1($entity_id), 16, 10), 0, 18);
      apachesolr_index_delete_entity_from_index($env_id, $entity_type, $indexed_id);
    }

    // Retrieve the new documents.
    $function_name = _example_function_name('example_retrieve_documents_', $entity_type);
    $documents = call_user_func($function_name, $entity_ids['create_ids']);

    // Prepare the new documents; the result is keyed by the hashed id.
    $documents = example_document_prepare($documents);

    if ($force) {
      // Force replacement of all index entries: drop each entry, then
      // re-create it, working in chunks of 20 (keys preserved).
      foreach (array_chunk($documents, 20, TRUE) as $docs) {
        foreach (array_keys($docs) as $indexed_id) {
          apachesolr_index_delete_entity_from_index($env_id, $entity_type, $indexed_id);
        }
        apachesolr_index_send_to_solr($env_id, $docs);
      }
    }
    else {
      // Create the new index entries.
      apachesolr_index_send_to_solr($env_id, $documents);
    }
  }

  // Use a placeholder instead of concatenating into the translatable string.
  drush_print(dt('Done (re)-indexing : !types', array('!types' => implode(' ', $args))));
}
/**
 * Builds the name of a per-entity-type callback and verifies it is callable.
 *
 * @param string $prefix
 *   The callback prefix, e.g. 'example_entity_ids_'.
 * @param string $suffix
 *   The entity type. Hyphens are converted to underscores, because a hyphen
 *   can never be part of a PHP function name.
 *
 * @return string
 *   The callable function name.
 *
 * @throws ErrorException
 *   When no callable function with the resulting name exists.
 */
function _example_function_name($prefix, $suffix) {
  // 'external-content' (as typed on the command line) must resolve to the
  // '..._external_content' callback.
  $function_name = $prefix . str_replace('-', '_', $suffix);
  if (!is_callable($function_name)) {
    throw new ErrorException(ucwords($suffix) . ' not implemented correctly. Function missing: ' . $function_name,
      E_USER_ERROR);
  }
  return $function_name;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * @file
 * Solr indexing callbacks for external (non-Drupal) content.
 */
/**
 * Works out which external_content ids must be added to or removed from Solr.
 *
 * The current id list is fetched through external_content_api('GetIDs') and
 * compared with the list stored in the 'example_external_content' variable.
 * When the lists differ, or when $force is TRUE, the stored list is replaced
 * by the fetched one.
 *
 * @param bool $force
 *   TRUE to report every fetched id for (re-)creation even when the list is
 *   unchanged.
 *
 * @return array
 *   An array with two keys:
 *   - delete_ids: previously stored ids that are no longer in the response.
 *   - create_ids: ids to (re-)index; empty when nothing changed and $force is
 *     FALSE.
 */
function example_entity_ids_external_content($force = FALSE) {
  $entity_ids = array(
    'delete_ids' => array(),
    'create_ids' => array(),
  );

  // Retrieve the external_content entity ids using the SOAP veneer.
  $cid = 'example_documentIds';
  $response = external_content_api('GetIDs', array(
    'localizationCode' => 'en-us',
  ), $cid);

  // Cache the entity ids.
  cache_set($cid, $response, 'cache_external_content');

  // NOTE(review): a response without an id list is treated as "nothing to
  // do", so a failed call cannot be mistaken for "all content was removed" -
  // confirm this matches what external_content_api() returns on failure.
  if (!isset($response->GetIDsResult->string)) {
    return $entity_ids;
  }

  // Grab only the entity ids. The cast turns a lone scalar id into a
  // one-element list so the comparison and array_diff() below always get
  // arrays.
  $response_entity_ids = (array) $response->GetIDsResult->string;

  // Check if there are any new entities to index in solr.
  $known_entity_ids = variable_get('example_external_content', array());
  if ($force || $response_entity_ids != $known_entity_ids) {
    // Ids that were indexed before but are gone from the response.
    $entity_ids['delete_ids'] = array_diff($known_entity_ids, $response_entity_ids);
    // Ids that need to be created in the index.
    $entity_ids['create_ids'] = $response_entity_ids;
    // Put away the new list of entity ids.
    variable_set('example_external_content', $response_entity_ids);
  }

  return $entity_ids;
}
/**
 * Fetches external_content documents and maps them to indexable objects.
 *
 * Each returned object carries:
 *   $document->entity_id (the id that was passed in)
 *   $document->entity_type (string, always 'external_content')
 *   $document->title (string, or NULL when no title node is found)
 *   $document->body (string, or NULL when no body node is found)
 *   $document->path (string)
 * The optional $document->language is not set by this function.
 *
 * @param array $entity_ids
 *   The external_content entity ids to index in solr.
 *
 * @return array
 *   A list of stdClass documents to index. At most
 *   variable_get('example_external_content_limit', 100) documents are
 *   collected (but always at least one when ids are given).
 */
function example_retrieve_documents_external_content($entity_ids = array()) {
  $documents = array();
  // Helpful in dev environments where we don't really need to index every
  // external_content document. Read once; it cannot change during the loop.
  $limit = variable_get('example_external_content_limit', 100);

  foreach ($entity_ids as $entity_id) {
    $cid = $entity_id;
    $response = external_content_api('GetDocument', array(
      'ext_id' => $entity_id,
    ), $cid);
    cache_set($cid, $response, 'cache_external_content');

    // NOTE(review): the operation is 'GetDocument' but the payload is read
    // from GetRawDocument - confirm against the service response.
    $xml_document = new DOMDocument();
    $xml_document->loadXML($response->GetRawDocument->xml);
    $html = external_content_transform($xml_document, 'content');

    // DOMDocument complains about malformed html. Collect those errors
    // internally for this one call only, discard them so they do not pile up
    // across documents, then restore the caller's libxml setting.
    $html_document = new DOMDocument();
    $previous_setting = libxml_use_internal_errors(TRUE);
    $html_document->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);

    $xpath = new DOMXPath($html_document);
    // item(0) is NULL when nothing matches; guard before reading nodeValue.
    $title_node = $xpath->query('//h1[@id="ContentTitle"]')->item(0);
    $body_node = $xpath->query('//div[contains(@class, "BodySection")]')->item(0);

    $document = new stdClass();
    $document->entity_id = $entity_id;
    $document->entity_type = 'external_content';
    $document->title = $title_node ? $title_node->nodeValue : NULL;
    $document->body = $body_node ? $body_node->nodeValue : NULL;
    $document->path = 'external_content/' . $entity_id;
    $documents[] = $document;

    // Count collected documents rather than comparing the array key, which
    // indexed one document too many and assumed 0-based sequential keys.
    if (count($documents) >= $limit) {
      break;
    }
  }

  return $documents;
}
/**
 * Converts plain document objects into ApacheSolrDocument instances.
 *
 * @param array $documents
 *   The stdClass array of documents to index. Each document provides
 *   entity_id, entity_type, title, body and path, and optionally language.
 *
 * @return array
 *   ApacheSolrDocument objects keyed by the hashed numeric entity id that is
 *   also stored in the document's 'entity_id' field.
 */
function example_document_prepare($documents = array()) {
  $solr_documents = array();

  // These values are identical for every document, so compute them once.
  $site_url = url(NULL, array(
    'absolute' => TRUE,
  ));
  $site_hash = apachesolr_site_hash();

  foreach ($documents as $document) {
    // Since we can't use drupal's nid, make sure we get a mostly unique
    // numeric entity id. Clashes remain possible because the number is
    // trimmed to 18 positions; base_convert() is also documented to lose
    // precision on large numbers. The derivation is kept as-is so ids of
    // already indexed documents stay stable.
    $entity_id_hash = substr(base_convert(sha1($document->entity_id), 16, 10), 0, 18);

    // language is optional on the incoming document; fall back without
    // triggering an undefined-property notice.
    $language = !empty($document->language) ? $document->language : LANGUAGE_NONE;

    $solr_document = new ApacheSolrDocument();
    $solr_document->addField('id', apachesolr_document_id($entity_id_hash, $document->entity_type));
    $solr_document->addField('site', $site_url);
    $solr_document->addField('hash', $site_hash);
    $solr_document->addField('entity_id', $entity_id_hash);
    $solr_document->addField('entity_type', $document->entity_type);
    $solr_document->addField('bundle', $document->entity_type);
    $solr_document->addField('bundle_name', ucwords($document->entity_type));
    $solr_document->addField('path', $document->path);
    $solr_document->addField('url', url($document->path, array(
      'absolute' => TRUE,
    )));
    $solr_document->addField('language', $language);
    $solr_document->addField('label', apachesolr_clean_text($document->title));
    $content = apachesolr_clean_text($document->body);
    $solr_document->addField('content', $content);
    $solr_document->addField('teaser', truncate_utf8($content, 300, TRUE));

    $solr_documents[$entity_id_hash] = $solr_document;
  }

  return $solr_documents;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment