Skip to content

Instantly share code, notes, and snippets.

@elliotttf
Created December 1, 2011 19:25
Show Gist options
  • Save elliotttf/1419169 to your computer and use it in GitHub Desktop.
Save elliotttf/1419169 to your computer and use it in GitHub Desktop.
Example of adding files to the Solr index when they are only linked to from the node body (rather than attached to the node).
<?php
/**
 * Implements hook_apachesolr_document_handlers().
 *
 * Registers example_add_documents() as the document builder, but only for
 * nodes being indexed under the 'example' namespace. For any other
 * type/namespace combination nothing is returned.
 */
function example_apachesolr_document_handlers($type, $namespace) {
  $handles_request = ($type == 'node') && ($namespace == 'example');
  if (!$handles_request) {
    // Not ours: contribute no handlers.
    return NULL;
  }
  return array('example_add_documents');
}
/**
 * Implements hook_apachesolr_update_index().
 *
 * Indexes pending nodes for the 'example' namespace in batches. Work is
 * bounded by three apachesolr_attachments settings: the batch size
 * (cron_try), the maximum number of nodes attempted per run (cron_limit) and
 * a wall-clock budget in seconds (cron_time_limit).
 *
 * The $document, $node and $namespace arguments are part of the hook
 * signature but are not used here.
 *
 * @see apachesolr_attachments_update_index()
 */
function example_apachesolr_update_index($document, $node, $namespace) {
  // example_add_documents() invokes every hook_apachesolr_update_index()
  // implementation, including this one, while this function is already
  // indexing. Without a guard the nested call would start another indexing
  // loop for the same namespace.
  static $running = FALSE;
  if ($running) {
    return;
  }

  // Only run when text extraction is configured: either Solr itself does the
  // extraction, or a path to Tika has been set.
  $can_extract = variable_get('apachesolr_attachments_extract_using', 'tika') == 'solr' || variable_get('apachesolr_attachments_tika_path', '');
  if (!$can_extract) {
    return;
  }

  $running = TRUE;

  // Use the live clock. REQUEST_TIME is fixed for the whole request, so
  // measuring elapsed time against it always yields 0 and the time limit
  // would never trigger.
  $start = time();
  $cron_try = variable_get('apachesolr_attachments_cron_try', 20);
  $cron_limit = variable_get('apachesolr_attachments_cron_limit', 100);
  $cron_time_limit = variable_get('apachesolr_attachments_cron_time_limit', 15);
  $num_tried = 0;
  do {
    $rows = apachesolr_get_nodes_to_index('example', $cron_try);
    // Calls example_apachesolr_document_handlers() and ultimately
    // example_add_documents().
    $success = apachesolr_index_nodes($rows, 'example');
    $num_tried += $cron_try;
  } while ($success && ($num_tried < $cron_limit) && (time() - $start < $cron_time_limit));

  $running = FALSE;
}
/**
 * Builds Solr documents for files that are linked from a node's body.
 *
 * This assumes the files you are indexing are not part of the
 * files table but rather just linked in the node body.
 *
 * @param object $node
 *   The node whose body is scanned for links to .pdf and .txt files. The body
 *   may be a plain string or a Drupal 7 field array keyed by language.
 * @param string $namespace
 *   The indexing namespace; passed through to the hooks invoked per document.
 *
 * @return array
 *   One ApacheSolrDocument per distinct linked file for which text could be
 *   extracted. Empty when the body contains no matching links.
 */
function example_add_documents($node, $namespace) {
  $documents = $matches = array();

  // Find all of the documents. Bail out before touching any Solr helpers when
  // there is nothing to index.
  $body = _example_node_body_text($node);
  if ($body === '' || !preg_match_all('/<a ([^>]+)?href="([^"]+\.(pdf|txt))"/', $body, $matches)) {
    return $documents;
  }

  $hash = apachesolr_site_hash();

  // The same file may be linked more than once in a body; index it once so
  // that two documents with the same id are not produced.
  foreach (array_unique($matches[2]) as $href) {
    // NOTE(review): pathinfo() returns an array; confirm that
    // apachesolr_attachments_get_attachment_text() accepts this shape.
    $file = pathinfo($href);
    $text = apachesolr_attachments_get_attachment_text($file);
    if (!$text) {
      watchdog('example', "No text to index, bro!");
      continue;
    }

    // Create the solr document.
    $document = new ApacheSolrDocument();
    // A single file might be attached to multiple nodes.
    // You might want to use something other than the filename
    // (if it's not unique) to define the document_id, a separate
    // table to track these files could work, or something like UUID.
    $file_id = $file['basename'];
    $document->id = apachesolr_document_id($file_id . '-' . $node->nid, 'file');
    $document->site = url(NULL, array('absolute' => TRUE));
    $document->hash = $hash;
    $document->entity_type = 'file';
    $document->entity_id = $file_id;
    $document->bundle = $node->type;
    $document->bundle_name = node_type_get_name($node);
    $document->label = $file['filename'];
    $document->is_nid = $node->nid;
    $document->url = file_create_url($href);

    // Links in body text are often plain paths or http URLs rather than
    // stream-wrapper URIs; in that case no wrapper instance is returned and
    // the href itself is used as the path.
    $wrapper = file_stream_wrapper_get_instance_by_uri($href);
    if ($wrapper) {
      $document->path = $wrapper->getDirectoryPath() . '/' . file_uri_target($href);
    }
    else {
      $document->path = $href;
    }

    $document->content = $file['filename'] . ' ' . $text;
    $document->ss_name = $node->name;
    // We want the name to also be searchable for keywords.
    $document->tos_name = $node->name;

    // Everything else uses dynamic fields.
    $document->is_uid = $node->uid;
    $document->bs_status = $node->status;
    $document->bs_sticky = $node->sticky;
    $document->bs_promote = $node->promote;
    $document->is_tnid = $node->tnid;
    $document->bs_translate = $node->translate;
    // 'und' is the language-neutral code in Drupal 7.
    $document->ss_language = empty($node->language) ? LANGUAGE_NONE : $node->language;

    // filectime() cannot stat every kind of href (e.g. remote URLs). Fall
    // back to the node's changed time instead of indexing a bogus date.
    $created = is_file($href) ? filectime($href) : FALSE;
    if ($created === FALSE) {
      $created = isset($node->changed) ? $node->changed : time();
    }
    $document->ds_created = apachesolr_date_iso($created);
    $document->ds_changed = $document->ds_created;

    // apachesolr_attachments-specific fields.
    $document->ss_filemime = file_get_mimetype($href);
    $document->ss_file_node_title = apachesolr_clean_text($node->title);
    $document->ss_file_node_url = url('node/' . $node->nid, array('absolute' => TRUE));

    // Add taxonomy to document.
    _example_add_taxonomy_fields($document, $node);

    // Let modules add to the document.
    // NOTE(review): this also invokes example_apachesolr_update_index(),
    // which itself triggers indexing for this namespace; confirm that this
    // cannot recurse.
    foreach (module_implements('apachesolr_update_index') as $module) {
      $function = $module . '_apachesolr_update_index';
      $function($document, $node, $namespace);
    }
    drupal_alter('apachesolr_attachments_index', $document, $node, $file, $namespace);

    $documents[] = $document;
  }

  return $documents;
}

/**
 * Returns the node body as a single string of markup.
 *
 * Accepts either a plain string body or a field array of the form
 * $body[langcode][delta]['value']; values from every language and delta are
 * joined with newlines. Returns '' when there is no usable body.
 */
function _example_node_body_text($node) {
  if (!isset($node->body)) {
    return '';
  }
  if (is_string($node->body)) {
    return $node->body;
  }
  if (!is_array($node->body)) {
    return '';
  }

  $chunks = array();
  foreach ($node->body as $items) {
    if (!is_array($items)) {
      continue;
    }
    foreach ($items as $item) {
      if (is_array($item) && isset($item['value']) && is_string($item['value'])) {
        $chunks[] = $item['value'];
      }
    }
  }
  return implode("\n", $chunks);
}

/**
 * Copies the node's taxonomy term reference fields onto a Solr document.
 *
 * Only fields of type 'taxonomy_term_reference' that are present on the node
 * and have an existing indexing callback are processed.
 */
function _example_add_taxonomy_fields($document, $node) {
  $indexed_fields = apachesolr_entity_fields('node');
  foreach ($indexed_fields as $index_key => $field_info) {
    // Add only taxonomy.
    if ($field_info['field']['type'] != 'taxonomy_term_reference') {
      continue;
    }
    $field_name = $field_info['field']['field_name'];
    // See if the node has fields that can be indexed.
    if (!isset($node->{$field_name})) {
      continue;
    }
    $function = $field_info['indexing_callback'];
    if ($function && function_exists($function)) {
      // NOTE: This function should always return an array. One
      // node field may be indexed to multiple Solr fields.
      $fields = $function($node, $field_name, $index_key, $field_info);
      foreach ($fields as $field) {
        // It's fine to use this method also for single value fields.
        $document->setMultiValue($field['key'], $field['value']);
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment