Skip to content

Instantly share code, notes, and snippets.

@paul121
Last active August 17, 2022 23:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paul121/7ae0c2c9ab30aa78ab3688d6c322a7ad to your computer and use it in GitHub Desktop.
Save paul121/7ae0c2c9ab30aa78ab3688d6c322a7ad to your computer and use it in GitHub Desktop.
Script for detecting duplicate assets & merging log records
<?php
use Drupal\asset\Entity\Asset;
use Drupal\log\Entity\Log;
// Debug flag, leave TRUE to not perform any actions.
$debug = TRUE;
#$debug = FALSE;
// Only query assets created by this username.
$user_name = 'SurveyStack';
// Hosts to to not run this script on.
$denylist = [
//'farmos.ddev.site',
];
$host = \Drupal::request()->getHost();
if (in_array($host, $denylist)) {
log_message('Host in denylist.');
return;
}
// Query names that are duplicated across plant assets.
$name_duplicates = \Drupal::entityQueryAggregate('asset')
->condition('uid.entity:user.name', $user_name)
->condition('type', 'plant')
->groupBy('name')
->aggregate('name', 'count')
->conditionAggregate('name', 'count', 1, '>')
->execute();
print(PHP_EOL);
log_message('Querying assets with duplicate names... ');
foreach ($name_duplicates as $name_duplicate) {
// Log the duplicate.
print(PHP_EOL);
$name = $name_duplicate['name'];
$count = $name_duplicate['name_count'];
log_message("Duplicate: '$name' Count: $count");
// Load the assets.
$asset_ids = \Drupal::entityQuery('asset')
->condition('uid.entity:user.name', $user_name)
->condition('type', 'plant')
->condition('name', $name)
->sort('id', 'ASC')
->execute();
$assets = Asset::loadMultiple($asset_ids);
// Determine each asset's location.
// Keep track if we find a location for any assets here.
$location_count = 0;
/** @var \Drupal\farm_location\AssetLocationInterface $asset_location */
$asset_location = \Drupal::service('asset.location');
$asset_locations = [];
foreach ($assets as $asset) {
if ($asset_location->hasLocation($asset)) {
++$location_count;
$location = $asset_location->getLocation($asset);
$location = reset($location);
$asset_locations[$asset->id()] = $location;
}
}
if ($location_count === 0) {
log_message('None of the duplicated assets have a location.', 'warning');
}
elseif ($location_count === 1) {
$label = $location->label();
log_message('Target location: ' . $label);
}
elseif ($location_count > 1) {
log_message('Duplicated assets have multiple locations.', 'warning');
log_message('These assets should be manually reviewed.', 'warning');
}
// Query logs that reference these assets.
$log_asset_ids_result = \Drupal::entityQueryAggregate('log')
->condition('asset.entity:asset.name', $name)
->groupBy('asset.entity:asset.id')
->aggregate('id', 'count')
->execute();
$log_asset_ids = array_column($log_asset_ids_result, 'id');
// Loop through assets and determine operations.
// Keep track if we have found the "canonical" asset to keep.
$found_canonical = FALSE;
log_message('Asset name -- ID -- location -- log count -- operation -- url');
foreach ($assets as $asset) {
// Strings to output.
$asset_id = $asset->id();
$label = $asset->label();
$url = $asset->toUrl()->setAbsolute()->toString();
$current_location = $asset_locations[$asset_id] ? $asset_locations[$asset_id]->label() : ' (none) ';
// Determine the number of logs associated with the asset.
$log_count = 0;
$index = array_search($asset_id, $log_asset_ids);
if (is_numeric($index)) {
$log_count = $log_asset_ids_result[$index]['id_count'];
}
// Determine the operation.
// Delete assets unless they have logs that reference them.
$operation = 'delete';
// If logs reference the asset, keep or merge.
if ($log_count > 0) {
// Keep the first asset that has logs.
if ($found_canonical == FALSE) {
$operation = 'keep';
$found_canonical = $asset_id;
}
// Else merge the asset and update logs.
else {
$operation = 'merge';
}
}
// Finally, if there are multiple locations override the operation
// to keep assets that have logs, they need manual review.
// Assets that do not have any logs can always be deleted.
if ($log_count != 0 && $location_count > 1) {
$operation = 'keep';
}
log_message("$label -- $asset_id -- $current_location -- $log_count -- $operation -- $url");
// Perform the operation if $debug == FALSE.
if (!$debug) {
// Merge the asset.
if ($operation == 'merge' && $found_canonical != FALSE) {
// Load logs that reference this asset.
$log_ids = \Drupal::entityQuery('log')
->condition('asset.entity:asset.id', $asset_id)
->execute();
$logs = Log::loadMultiple($log_ids);
// Remove the asset id from each log and add the canonical asset id.
foreach ($logs as $log) {
/** @var \Drupal\Core\Field\EntityReferenceFieldItemListInterface $asset_field */
$asset_field = $log->get('asset');
$log_assets = $asset_field->getValue();
// Find the existing asset_id.
$index = array_search($asset_id, array_column($log_assets, 'target_id'));
if (is_int($index)) {
// Remove the index.
$asset_field->removeItem($index);
}
// Add the canonical if doesn't already exist.
$index = array_search((int) $found_canonical, array_column($log_assets, 'target_id'));
if ($index === FALSE) {
$asset_field->appendItem($found_canonical);
}
// Finally, save the log.
$log->save();
$log_id = $log->id();
$log_name = $log->label();
log_message("Updated log $log_id '$log_name'", 'operation');
}
// After all logs are merged, delete the asset.
$operation = 'delete';
}
if ($operation == 'delete') {
try {
$asset->delete();
log_message("Deleted asset $asset_id '$label'", 'operation');
} catch (Exception $e) {
log_message("Could not delete asset $asset_id '$label'. Reason: " . $e->getMessage(), 'error');
continue;
}
}
}
}
}
print(PHP_EOL);
/**
* Helper function to print/log messages.
*
* @param $message
* The message to print.
* @param $level
* The message level prefix.
*/
function log_message($message, $level = 'info') {
$prefix = [
'info' => '',
'operation' => ' OP: ',
'warning' => 'WARNING: ',
'error' => 'ERROR: ',
];
print($prefix[$level] . $message . PHP_EOL);
}

Test cases:

(NOTE: I didn't create all the log types specified here, what's most important is if the asset has any logs referencing it. And location requires DONE movement logs.)

// Test cases.
// Name -- location -- logs -- operation

// Typical duplicated plant asset
// corn  5/05/20 - -- (seeding, observation) (keep)
// corn  5/05/20 - field 1 (input) (merge, delete)


// Duplicated plant with empty plant
// beets 6/20/20 - -- (seeding, observation) (keep)
// beets 6/20/20 - field 1 (input, activity) (merge, delte)
// beets 6/20/20 - -- (none) (delete)

// Two unique assets, one empty
// squash 6/10/20 - field 1 (seeding) (keep)
// squash 6/10/20 - field 2 (seeding) (keep)
// squash 6/10/20 - -- (none) (delete)

// Assets without location or logs
// radish 3/15/20 - -- (none) (delete)
// radish 3/18/20 - -- (none) (delete)

// Rare edge case, should be two, both duplicated
// 5  - 2020 carrots - field 1
// 10 - 2020 carrots - --
// 12 - 2020 carrots - field 2
// 15 - 2020 carrots - --

Output for given test cases

$ ddev drush scr asset_dedup.php

Querying assets with duplicate names... 

Duplicate: 'beets 6/20/20' Count: 3
Target location: Field 2
Asset name    -- ID -- location -- log count -- operation -- url
beets 6/20/20 -- 6 --  (none)  -- 2 -- keep -- https://farmos.ddev.site/asset/6
beets 6/20/20 -- 7 -- Field 2 -- 3 -- merge -- https://farmos.ddev.site/asset/7
beets 6/20/20 -- 8 --  (none)  -- 0 -- delete -- https://farmos.ddev.site/asset/8

Duplicate: 'carrots 04/10/20' Count: 2
Target location: Field 1
Asset name    -- ID -- location -- log count -- operation -- url
carrots 04/10/20 -- 13 -- Field 1 -- 1 -- keep -- https://farmos.ddev.site/asset/13
carrots 04/10/20 -- 14 --  (none)  -- 1 -- merge -- https://farmos.ddev.site/asset/14

Duplicate: 'corn 5/05/20' Count: 4
WARNING: Duplicated assets have multiple locations.
WARNING: These assets should be manually reviewed.
Asset name    -- ID -- location -- log count -- operation -- url
corn 5/05/20 -- 1 --  (none)  -- 2 -- keep -- https://farmos.ddev.site/asset/1
corn 5/05/20 -- 5 -- Field 1 -- 2 -- keep -- https://farmos.ddev.site/asset/5
corn 5/05/20 -- 15 -- Field 2 -- 1 -- keep -- https://farmos.ddev.site/asset/15
corn 5/05/20 -- 16 --  (none)  -- 0 -- delete -- https://farmos.ddev.site/asset/16

Duplicate: 'radish 3/15/20' Count: 2
WARNING: None of the duplicated assets have a location.
Asset name    -- ID -- location -- log count -- operation -- url
radish 3/15/20 -- 11 --  (none)  -- 0 -- delete -- https://farmos.ddev.site/asset/11
radish 3/15/20 -- 12 --  (none)  -- 0 -- delete -- https://farmos.ddev.site/asset/12

Duplicate: 'squash 6/10/20' Count: 2
WARNING: Duplicated assets have multiple locations.
WARNING: These assets should be manually reviewed.
Asset name    -- ID -- location -- log count -- operation -- url
squash 6/10/20 -- 9 -- Field 3 -- 2 -- keep -- https://farmos.ddev.site/asset/9
squash 6/10/20 -- 10 -- Field 2 -- 2 -- keep -- https://farmos.ddev.site/asset/10

@paul121
Copy link
Author

paul121 commented Aug 10, 2022

Added a query condition to limit to only assets created by a give user name.

@paul121
Copy link
Author

paul121 commented Aug 10, 2022

Added logic to perform the operation of updating logs & delete assets.

@paul121
Copy link
Author

paul121 commented Aug 10, 2022

Don't add the canonical asset to a log if it already exists

@paul121
Copy link
Author

paul121 commented Aug 10, 2022

If an asset fails ERR then an error is shown:

ERROR: Could not delete asset 7 'beets 6/20/20'. Reason: Cannot delete "asset" of type "plant" with label "beets 6/20/20" and ID "7" because other content is referencing it and the integrity of this entity type is enforced.

@paul121
Copy link
Author

paul121 commented Aug 11, 2022

Add $denylist to exclude script from running on some hosts.

@paul121
Copy link
Author

paul121 commented Aug 17, 2022

Modified logic so duplicated assets with no logs are always deleted

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment