Skip to content

Instantly share code, notes, and snippets.

@cjming
Last active December 7, 2017 03:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cjming/fa9858d490ed73cc600a033ac69cb128 to your computer and use it in GitHub Desktop.
Save cjming/fa9858d490ed73cc600a033ac69cb128 to your computer and use it in GitHub Desktop.
Taxonomy term shuffles - run hook updates with batch API in D7
/**
 * Implements hook_update_N().
 *
 * Re-assign nodes from one set of terms to new set of taxonomy terms.
 * Redirect old terms pages to new terms pages.
 * Output updated nodes and tids to CSV.
 * Delete old set of taxonomy terms.
 *
 * The work is spread over several batch passes; state is carried between
 * passes in $sandbox.
 *
 * @param array $sandbox
 *   Batch sandbox. Keys written by this function: 'processed', 'total',
 *   'nids', 'tids_old', 'tids_new', 'csv_nids', 'csv_tids', 'nids_delete'
 *   and '#finished'.
 *
 * @return string|null
 *   A translated progress message, or NULL when the CSV variable holds no
 *   usable old/new term id pairs.
 */
function my_module_update_7001(&$sandbox) {
  // Setup the initial processing.
  if (!isset($sandbox['processed'])) {
    // Each stored row is expected to be array(old_tid, new_tid).
    $csv_rows = variable_get('{csv_variable_name}', array());
    // Cut out if there are no update vars.
    if (empty($csv_rows)) {
      return;
    }
    $tids_old = $tids_new = array();
    foreach ($csv_rows as $row) {
      // Skip incomplete rows so the two arrays stay index-aligned and no
      // node is ever re-assigned to an undefined term id.
      if (!isset($row[0], $row[1])) {
        continue;
      }
      $tids_old[] = $row[0];
      $tids_new[] = $row[1];
    }
    // Cut out if there are no update vars.
    if (empty($tids_old) || empty($tids_new)) {
      return;
    }
    $nids_all = array();
    foreach ($tids_old as $tid_old) {
      // Get all the nids that have this term attached.
      $these_nids = taxonomy_select_nodes($tid_old, FALSE, FALSE, array('t.nid' => 'ASC'));
      // Merge array of attached nids to master array of nids.
      if (!empty($these_nids)) {
        $nids_all = array_merge($these_nids, $nids_all);
      }
    }
    // Clean up the nids_all array - sort, remove duplicates, reindex.
    sort($nids_all);
    $nids_unique = array_values(array_unique($nids_all));
    // Save initial sandbox variables.
    $sandbox['processed'] = 0;
    $sandbox['total'] = count($nids_unique);
    $sandbox['csv_nids'] = array();
    $sandbox['csv_tids'] = array();
    $sandbox['nids_delete'] = array();
    $sandbox['nids'] = $nids_unique;
    $sandbox['tids_old'] = $tids_old;
    $sandbox['tids_new'] = $tids_new;
  }

  // Process at most 5 nids per pass, fewer on the final pass.
  $items_per_loop = min(5, $sandbox['total'] - $sandbox['processed']);
  for ($i = 0; $i < $items_per_loop; $i++) {
    // Take the next nid off the queue exactly once.
    $this_nid = array_shift($sandbox['nids']);
    $this_node = node_load($this_nid);
    // Load and wrap the node if it exists.
    if (!empty($this_node)) {
      $node_wrapper = entity_metadata_wrapper('node', $this_node);
      // Get the term object values of this node's field_categories.
      $node_terms = $node_wrapper->field_categories->value();
      // Get the raw tids of this node's field_categories.
      $node_tids_raw = $node_wrapper->field_categories->raw();
      // Start from an empty list for every node so values from the previous
      // iteration can never leak into this one.
      $node_tids = array();
      if (!empty($node_tids_raw)) {
        // Make sure $node_tids is an array.
        $node_tids = is_array($node_tids_raw) ? $node_tids_raw : array($node_tids_raw);
      }
      $node_tids_update = $node_tids;
      // Loop thru the node's field_categories tids.
      foreach ($node_tids as $delta => $tid) {
        // Check if each tid is a valid term object.
        if (!empty($node_terms[$delta])) {
          // Look up the tid among the to-be-deleted terms; the key doubles
          // as the index of its replacement in 'tids_new'.
          $key = array_search($tid, $sandbox['tids_old']);
          if ($key !== FALSE) {
            // Remove the to-be-deleted tid from the node's categories.
            $node_tids_update = array_diff($node_tids_update, array($tid));
            // If the new tid is not in the nodes categories, add it.
            $tid_new = $sandbox['tids_new'][$key];
            if (!in_array($tid_new, $node_tids_update)) {
              $node_tids_update[] = $tid_new;
            }
          }
        }
        else {
          // Remove cruft - orphaned term ids that no longer map to a term
          // object.
          $node_tids_update = array_diff($node_tids_update, array($tid));
        }
      }
      // Index the updated field_categories array numerically.
      $node_tids_update = array_values($node_tids_update);
      // Save the updated field_categories array to the node.
      $node_wrapper->field_categories->set($node_tids_update);
      $node_wrapper->save();
      // Save the nid data for output to CSV.
      $sandbox['csv_nids'][] = array(
        $this_nid,
        drupal_get_path_alias('node/' . $this_nid),
        '"' . implode(" / ", $node_tids) . '"',
        '"' . implode(" / ", $node_tids_update) . '"',
      );
    }
    else {
      // Add node ids to be deleted from taxonomy_index later if the loaded
      // node doesn't exist.
      $sandbox['nids_delete'][] = $this_nid;
      // Save the nid data for output to CSV.
      $sandbox['csv_nids'][] = array(
        $this_nid,
        'not a valid node',
        'N/A',
        'N/A',
      );
    }
    // Rinse. Repeat.
    $sandbox['processed']++;
  }

  // Update batch status.
  $sandbox['#finished'] = ($sandbox['processed'] >= $sandbox['total']) ? 1 : ($sandbox['processed'] / $sandbox['total']);

  // Once all batches are done, handle terms, redirects and CSV output.
  if ($sandbox['#finished'] >= 1) {
    // Add redirect from to-be-deleted term aliases to reassigned term page.
    foreach ($sandbox['tids_old'] as $index => $tid_old) {
      $old_term_path = 'taxonomy/term/' . $tid_old;
      $old_term_alias = drupal_get_path_alias($old_term_path);
      $new_tid_path = 'taxonomy/term/' . $sandbox['tids_new'][$index];
      // Use placeholders: aliases are free text and must not be
      // concatenated into the SQL string.
      $redirects = db_query(
        "SELECT rid, source, redirect FROM {redirect} WHERE source = :source AND redirect = :redirect",
        array(':source' => $old_term_alias, ':redirect' => $new_tid_path)
      )->fetchAll();
      // Delete old term before adding redirects from old term path and alias.
      taxonomy_term_delete($tid_old);
      if (empty($redirects)) {
        // Create a new redirect from to-be-deleted term page alias to
        // reassigned term page if it doesn't already exist.
        $new_redirect_from_alias = new stdClass();
        redirect_object_prepare($new_redirect_from_alias, array('source' => $old_term_alias, 'redirect' => $new_tid_path));
        redirect_save($new_redirect_from_alias);
        // Create a new redirect from to-be-deleted term Drupal path to
        // reassigned term page's drupal path. drupal_get_path_alias() hands
        // back the path itself when no alias exists; in that case the
        // redirect above already covers this source, so do not save a
        // second redirect with an identical source.
        if ($old_term_path !== $old_term_alias) {
          $new_redirect_from_tid = new stdClass();
          redirect_object_prepare($new_redirect_from_tid, array('source' => $old_term_path, 'redirect' => $new_tid_path));
          redirect_save($new_redirect_from_tid);
        }
        // Write tid updates to CSV.
        $tid_new_related_nids = taxonomy_select_nodes($sandbox['tids_new'][$index], FALSE, FALSE, array('t.nid' => 'ASC'));
        $sandbox['csv_tids'][] = array(
          'tid_delete' => $tid_old,
          'tid_delete_path' => 'taxonomy/term/' . $tid_old,
          'tid_delete_alias' => $old_term_alias,
          'status' => 'new',
          'source_drupal_path' => $old_term_path,
          'source_alias' => $old_term_alias,
          'redirect' => $new_tid_path,
          'tid_new_alias' => drupal_get_path_alias($new_tid_path),
          'tid_new_related_nids' => '"' . implode(" | ", $tid_new_related_nids) . '"',
        );
      }
    }
    // Prepare headers for CSV files. Column order matches the row arrays
    // built above.
    $csv_tid_headers = array(
      'tid_delete' => 'tid_delete',
      'tid_delete_path' => 'tid_delete_path',
      'tid_delete_alias' => 'tid_delete_alias',
      'status' => 'status',
      'source_drupal_path' => 'source_drupal_path',
      'source_alias' => 'source_alias',
      'redirect' => 'redirect',
      'tid_new_alias' => 'tid_new_alias',
      'tid_new_related_nids' => 'tid_new_related_nids',
    );
    $csv_nid_headers = array(
      'nid' => 'nid',
      'path' => 'path',
      'tids_previous' => 'tids_previous',
      'tids_updated' => 'tids_updated',
    );
    // Output updated tids/nids to csv file.
    _my_module_create_csv($sandbox['csv_tids'], $csv_tid_headers, 'tids-to-delete');
    _my_module_create_csv($sandbox['csv_nids'], $csv_nid_headers, 'nids-updated');
    // Delete orphaned node ids from the taxonomy_index table.
    if (!empty($sandbox['nids_delete'])) {
      db_delete('taxonomy_index')
        ->condition('nid', $sandbox['nids_delete'], 'IN')
        ->execute();
    }
  }

  // Return progress status. Guard the percentage against an empty node set.
  $pct = ($sandbox['total'] > 0) ? round(($sandbox['processed'] / $sandbox['total']) * 100, 2) : 100;
  $args = array(
    '!done' => $sandbox['processed'],
    '!total' => $sandbox['total'],
    '!pct' => $pct,
  );
  return t('Completed !done/!total (!pct%)', $args);
}
<?php
/**
* @file
* Upload CSVs.
*/
/**
 * Implements hook_menu().
 *
 * Declares a single menu item at the path "my_module_csv" whose page
 * callback is my_module_csv_pagecallback() and which is access-checked with
 * user_access('administer site configuration').
 */
function my_module_csv_menu() {
  $import_page = array(
    'title' => 'Import Data from CSV File',
    'description' => 'Import content from a <abbr title="Comma Separated Values">CSV</abbr> file.',
    'access callback' => 'user_access',
    'access arguments' => array('administer site configuration'),
    'page callback' => 'my_module_csv_pagecallback',
    'type' => MENU_NORMAL_ITEM,
  );

  return array('my_module_csv' => $import_page);
}
/**
 * Page callback function for the menu item.
 *
 * Builds the page from a short introductory paragraph followed by the
 * rendered my_module_csv_form form.
 *
 * @return string
 *   The page markup.
 */
function my_module_csv_pagecallback() {
  $form = drupal_get_form('my_module_csv_form');
  // Close the paragraph so the form markup is not nested inside an open <p>,
  // and run the text through t() like the other user-facing strings here.
  $output = '<p>' . t('This tool will attempt to import CSV data') . '</p>';
  $output .= drupal_render($form);
  return $output;
}
/**
 * Form builder for the CSV upload form.
 *
 * The form has one file element ("csvfile") and a submit button, and runs
 * my_module_csv_validate_fileupload() followed by
 * my_module_csv_form_validate() as its validation handlers.
 *
 * @return array
 *   The form array.
 */
function my_module_csv_form() {
  // Only mention the upload limit when PHP reports a non-zero one.
  $max_size = parse_size(ini_get('upload_max_filesize'));
  $size_notice = '';
  if ($max_size) {
    $size_notice = t('Due to server restrictions, the <strong>maximum upload file size is !max_size</strong>. Files that exceed this size will be disregarded.', array('!max_size' => format_size($max_size)));
  }

  return array(
    '#attributes' => array(
      'enctype' => 'multipart/form-data',
    ),
    'csvfile' => array(
      '#title' => t('CSV File'),
      '#type' => 'file',
      '#description' => $size_notice,
    ),
    'submit' => array(
      '#type' => 'submit',
      '#value' => t('Commence Import'),
    ),
    '#validate' => array(
      'my_module_csv_validate_fileupload',
      'my_module_csv_form_validate',
    ),
  );
}
/**
 * Validate CSV file upload.
 *
 * Saves the uploaded "csvfile" into public://csv (only the .csv extension is
 * accepted) and records where it was stored in
 * $form_state['values']['csvupload'] for the later validate/submit handlers.
 *
 * @param array $form
 *   The form array.
 * @param array $form_state
 *   The form state; 'values' => 'csvupload' is set on success.
 */
function my_module_csv_validate_fileupload(&$form, &$form_state) {
  $validators = array(
    'file_validate_extensions' => array('csv'),
  );
  $csv_dir = 'public://csv';

  // Report an unusable upload directory instead of silently doing nothing.
  if (!file_prepare_directory($csv_dir, FILE_CREATE_DIRECTORY)) {
    form_set_error('csvfile', t('Unable to create or write to the upload directory !dest', array('!dest' => $csv_dir)));
    return;
  }

  $file = file_save_upload('csvfile', $validators, $csv_dir, FILE_EXISTS_REPLACE);
  if ($file) {
    $form_state['values']['csvupload'] = $file->destination;
  }
  else {
    // Flag the actual file element and name the real target directory.
    form_set_error('csvfile', t('Unable to copy upload file to !dest', array('!dest' => $csv_dir)));
  }
}
/**
 * Validate CSV file.
 *
 * When an upload location was recorded in $form_state['values']['csvupload'],
 * checks that the file can be opened and that a first CSV record can be read
 * from it; sets a form error on "csvfile" otherwise.
 *
 * @param array $form
 *   The form array.
 * @param array $form_state
 *   The form state.
 */
function my_module_csv_form_validate(&$form, &$form_state) {
  // Nothing to check when no upload was stored.
  if (!isset($form_state['values']['csvupload'])) {
    return;
  }

  $csv_path = $form_state['values']['csvupload'];
  $handle = fopen($csv_path, 'r');
  if (!$handle) {
    form_set_error('csvfile', t('Unable to read uploaded file !filepath', array('!filepath' => $csv_path)));
    return;
  }

  // Reading one record is enough to prove the file is usable.
  $first_record = fgetcsv($handle, 4096);
  if (!$first_record) {
    form_set_error('csvfile', t('Something went wrong. Could not read CSV file.'));
  }
  fclose($handle);
}
/**
 * Custom submit CSV file.
 *
 * Queues one batch operation that records the uploaded file name, followed
 * by one operation per CSV record, then hands the batch to the Batch API.
 *
 * @param array $form
 *   The form array.
 * @param array $form_state
 *   The form state; 'values' => 'csvupload' holds the stored upload location.
 */
function my_module_csv_form_submit(&$form, &$form_state) {
  $batch = array(
    'title' => t('Importing CSV ...'),
    'operations' => array(),
    'init_message' => t('Commencing'),
    'progress_message' => t('Processed @current out of @total.'),
    'error_message' => t('An error occurred during processing'),
    // Must name the finished callback that actually exists in this file.
    'finished' => 'my_module_csv_import_finished',
  );
  if (isset($form_state['values']['csvupload'])) {
    if ($handle = fopen($form_state['values']['csvupload'], 'r')) {
      $batch['operations'][] = array('_my_module_csv_remember_filename', array($form_state['values']['csvupload']));
      // Compare against FALSE explicitly: that is fgetcsv()'s end-of-file /
      // error signal.
      while (($line = fgetcsv($handle, 4096)) !== FALSE) {
        // fgetcsv() reports a blank line as array(NULL); do not queue it.
        if ($line === array(NULL)) {
          continue;
        }
        // base64_encode is used to ensure we don't overload the batch
        // processor by stuffing complex objects into it.
        $batch['operations'][] = array(
          '_my_module_csv_import_line',
          array(array_map('base64_encode', $line)),
        );
      }
      fclose($handle);
    }
  }
  batch_set($batch);
}
/**
 * Batch API finished callback.
 *
 * When any rows were collected in $results['failed_rows'], writes them to a
 * "failed_rows-<uploaded file name>" CSV under public://outside_csv and
 * shows a message linking to it (or explaining why it could not be written).
 *
 * @param bool $success
 *   Whether the batch completed without errors. Not used here.
 * @param array $results
 *   The results array filled in by the batch operations.
 * @param array $operations
 *   Any operations that remained unprocessed. Not used here.
 *
 * @return string
 *   A translated completion message.
 */
function my_module_csv_import_finished($success, $results, $operations) {
  if (!empty($results['failed_rows'])) {
    // Drupal 7 replacement for file_directory_path() . '/outside_csv/'.
    $dir = 'public://outside_csv';
    $uploaded = isset($results['uploaded_filename']) ? $results['uploaded_filename'] : 'upload.csv';
    $csv_filename = 'failed_rows-' . basename($uploaded);
    $csv_filepath = $dir . '/' . $csv_filename;
    // Build the placeholders up front so every branch below can use them.
    $targs = array(
      // l() escapes the link text itself; the href must be a URL, not a
      // stream URI.
      '!csv_url' => l($csv_filename, file_create_url($csv_filepath)),
      '%csv_filename' => $csv_filename,
      '%csv_filepath' => $csv_filepath,
      '%csv_directory' => $dir,
    );
    // Drupal 7 replacement for file_check_directory().
    if (!file_prepare_directory($dir, FILE_CREATE_DIRECTORY)) {
      drupal_set_message(t('Some rows failed to import, but unable to create directory for error CSV at %csv_directory', $targs), 'error');
    }
    elseif ($handle = fopen($csv_filepath, 'w+')) {
      foreach ($results['failed_rows'] as $failed_row) {
        fputcsv($handle, $failed_row);
      }
      fclose($handle);
      drupal_set_message(t('Some rows failed to import. You may download a CSV of these rows: !csv_url', $targs), 'error');
    }
    else {
      drupal_set_message(t('Some rows failed to import, but unable to write error CSV to %csv_filepath', $targs), 'error');
    }
  }
  return t('The CSV import has completed.');
}
/**
 * Batch operation: stores the uploaded file name in the batch results.
 *
 * @param string $filename
 *   The value to store under $context['results']['uploaded_filename'].
 * @param array $context
 *   The batch context, passed by reference.
 */
function _my_module_csv_remember_filename($filename, &$context) {
  $results = &$context['results'];
  $results['uploaded_filename'] = $filename;
}
/**
 * Common batch processing callback for all operations.
 *
 * Required to load data into a Drupal variable. Each call decodes one CSV
 * record, appends its non-empty fields as a row to
 * $context['results']['variable'], and saves all rows collected so far to
 * the Drupal variable "my_module_<uploaded file name without extension>".
 *
 * The Batch API calls an operation with its queued arguments followed by the
 * batch context, so the signature is ($line, &$context); the context must be
 * taken by reference or nothing written here survives the call.
 *
 * @param array $line
 *   The fields of one CSV record, each base64-encoded.
 * @param array $context
 *   The batch context, passed by reference. Reads
 *   'results' => 'uploaded_filename'; writes 'message' and the 'results'
 *   keys 'rows_imported', 'failed_rows' and 'variable'.
 */
function _my_module_csv_import_line($line, &$context) {
  // Get the filename of the uploaded csv without the extension.
  $csv_filename = pathinfo($context['results']['uploaded_filename'], PATHINFO_FILENAME);
  // Name of the Drupal variable the CSV rows are saved to.
  $csv_variable_name = 'my_module_' . $csv_filename;

  // Count rows, starting the counter on the first record of this import.
  if (!isset($context['results']['rows_imported'])) {
    $context['results']['rows_imported'] = 0;
  }
  $context['results']['rows_imported']++;

  $line = array_map('base64_decode', $line);
  $first_field = isset($line[0]) ? $line[0] : '';

  // Provide some feedback about the row currently being processed.
  $context['message'] = t('Importing %first', array('%first' => $first_field));

  // Capture and report on failed lines.
  if (isset($line[1], $line[2]) && $line[1] == 'ROW' && $line[2] == 'FAILS') {
    $context['results']['failed_rows'][] = $line;
  }

  // Collect the non-empty data fields of this line. NOTE(review): empty()
  // also drops a literal "0" field - confirm that is intended.
  $line_array = array();
  foreach ($line as $field) {
    if (!empty($field)) {
      $line_array[] = $field;
    }
  }

  // Accumulate the rows of this import in the batch results and persist the
  // whole set under one variable name, so earlier rows are not overwritten
  // by later ones.
  $context['results']['variable'][] = $line_array;
  variable_set($csv_variable_name, $context['results']['variable']);

  // Output message during batch process.
  drupal_set_message("Parsed line {$first_field}");
}
/**
 * Creates a csv file.
 *
 * Serialises the header row and data rows as CSV, saves the result to a
 * timestamped file in the temporary directory, and streams that file to the
 * output as an attachment.
 *
 * @param array $data
 *   The array of data to export; each element is one row of fields.
 * @param array $headers
 *   The header row written before the data rows.
 * @param string $title
 *   Prefix used for both the saved file name and the download file name.
 */
function _my_module_create_csv(array $data, array $headers, $title) {
  // Build the CSV in a temporary read/write stream.
  $fh = fopen('php://temp', 'w+');
  fputcsv($fh, $headers);
  foreach ($data as $row) {
    fputcsv($fh, $row);
  }
  rewind($fh);
  $csv = stream_get_contents($fh);
  fclose($fh);
  // Strip every double quote from the generated text. NOTE(review): this
  // also removes the enclosures fputcsv() adds, so a field containing a
  // comma will spill into extra columns - confirm that is acceptable.
  $csv = str_replace('"', '', $csv);

  // Set the file name inside the temporary directory.
  $destination = file_directory_temp() . '/' . $title . '-export-' . time() . '.csv';
  // Save the file; bail out rather than streaming a file that is not there.
  $filename = file_unmanaged_save_data($csv, $destination, FILE_EXISTS_REPLACE);
  if ($filename === FALSE) {
    watchdog('my_module', 'Unable to write CSV export to %destination.', array('%destination' => $destination), WATCHDOG_ERROR);
    return;
  }

  // The downloaded file name.
  $download_name = $title . '-export-' . date('Y-m-d') . '.csv';
  drupal_add_http_header('Content-Type', 'text/csv; charset=utf-8');
  drupal_add_http_header('Content-Disposition', 'attachment; filename="' . $download_name . '"', TRUE);
  readfile($filename);
}
@osopolar
Copy link

osopolar commented Dec 7, 2017

Thanks for sharing this inspiring solution. If there are a lot of nodes, you may want to save only the field value rather than the complete node, as updating hundreds of thousands of nodes can take a while. It would look like:

//Get the id of field field_term.
$info = field_info_field('field_categories');
$field_categories = array($info['id']);
//...
$node_wrapper->field_categories->set($node_tids_update);
field_sql_storage_field_storage_write('node', $node, 'update', $field_categories);

And if that still takes too long, as in my case you really should try to parallelize it with the great tutorial/code from Multi Processing Part 3: Jumping the Drupal Queue, see also: https://github.com/johnennewdeeson/drush-multi-processing

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment