Skip to content

Instantly share code, notes, and snippets.

@mgratch
Last active December 1, 2023 15:54
Show Gist options
  • Save mgratch/ec801047eee7ae3bc06d717200c4196a to your computer and use it in GitHub Desktop.
Save mgratch/ec801047eee7ae3bc06d717200c4196a to your computer and use it in GitHub Desktop.
<?php
/*
Plugin Name: CSV URL Processor
Plugin URI: https://gist.github.com/mgratch/ec801047eee7ae3bc06d717200c4196a
Description: A WP-CLI command to process URLs from a CSV file.
Version: 1.0
Author: Marc Gratch
Author URI: https://marcgratch.com
*/
if ( defined( 'WP_CLI' ) && WP_CLI ) {
	/**
	 * WP-CLI command that rebuilds redirects from a two-column CSV file.
	 *
	 * Column A of the CSV is the source URL and column B its destination. Redirect
	 * chains inside the file (A -> B, B -> C) are collapsed so every source points
	 * at its final destination, each destination is requested over HTTP, and a 301
	 * redirect is created through Red_Item::create() for every destination that
	 * answers with status 200. Rows that cannot be turned into a redirect are
	 * collected and written to a CSV report.
	 *
	 * NOTE(review): Red_Item and the `redirection_items` table are presumably
	 * provided by the Redirection plugin -- confirm it is active before running.
	 */
	class CSV_URL_Processor_Command extends WP_CLI_Command {
		/**
		 * Domain whose absolute URLs are treated as local paths when following chains.
		 */
		private const SITE_DOMAIN = '4cornerresources.com';

		/**
		 * Processes the CSV file and outputs the path to the failed URLs file.
		 *
		 * WARNING: every existing row in the redirection_items table is removed
		 * before the CSV is imported. The input file is validated first so that a
		 * mistyped path cannot wipe the table.
		 *
		 * ## OPTIONS
		 *
		 * <file>
		 * : The path to the CSV file.
		 *
		 * ## EXAMPLES
		 *
		 * wp csv_process file_path.csv
		 *
		 * @when after_wp_load
		 *
		 * @param array $args The positional arguments.
		 * @param array $assoc_args The associative arguments.
		 */
		public function __invoke( $args, $assoc_args ) {
			global $wpdb;

			list( $file_path ) = $args;

			// Abort before the destructive TRUNCATE below if the import cannot succeed anyway.
			// WP_CLI::error() terminates the command by default.
			if ( ! is_file( $file_path ) || ! is_readable( $file_path ) ) {
				WP_CLI::error( "CSV file not found or not readable: $file_path" );
			}

			if ( ! class_exists( 'Red_Item' ) ) {
				WP_CLI::error( 'Red_Item class is not available; cannot create redirects.' );
			}

			// Truncate $wpdb->prefix . 'redirection_items' table.
			$wpdb->query( "TRUNCATE TABLE {$wpdb->prefix}redirection_items" );

			// Flush rewrite rules, the object cache and all transients via WP-CLI.
			WP_CLI::runcommand( 'rewrite flush' );
			WP_CLI::runcommand( 'cache flush' );
			WP_CLI::runcommand( 'transient delete --all' );

			$failed_urls = $this->process_csv_file( $file_path );

			// Generate a timestamped (UTC) filename for failed URLs, relative to the working directory.
			$timestamp        = gmdate( 'Ymd_His' );
			$failed_file_path = 'failed_urls_' . $timestamp . '.csv';
			$failed_handle    = fopen( $failed_file_path, 'w' );

			if ( false === $failed_handle ) {
				WP_CLI::error( "Redirects were processed, but the failed URLs file could not be written: $failed_file_path" );
			}

			foreach ( $failed_urls as $failed_row ) {
				fputcsv( $failed_handle, $failed_row );
			}

			fclose( $failed_handle );

			WP_CLI::success( 'Processed CSV. Failed URLs saved to: ' . $failed_file_path );
		}

		/**
		 * Reads the CSV file, processes URLs, and returns the failed URLs.
		 *
		 * Works in two passes: first every source is mapped to the end of its
		 * redirect chain, then each final destination is requested and, when it
		 * answers with HTTP 200, a 301 redirect is created.
		 *
		 * @param string $file_path The path to the CSV file.
		 *
		 * @return array Rows of array( source, destination, status code, message ) for every failure.
		 */
		private function process_csv_file( $file_path ) {
			$rows                  = $this->read_csv_rows( $file_path );
			$redirects             = array();
			$failed_urls           = array();
			$source_to_destination = array_column( $rows, 1, 0 );

			$progress = \WP_CLI\Utils\make_progress_bar( 'Processing URLs', count( $rows ) );

			foreach ( $rows as $index => $row ) {
				list( $source_url, $destination_url ) = $row;

				WP_CLI::log( "Processing URL $index: $source_url -> $destination_url" );

				// Resolve the final destination URL for the current source URL.
				$final_destination_url = $this->resolve_final_destination( $destination_url, $source_to_destination );

				// The resolver only hands back a URL that is still a source when the chain
				// loops; such a redirect could never terminate, so report it instead.
				if ( $this->url_exists_in_column_a( $final_destination_url, $source_to_destination, self::SITE_DOMAIN ) ) {
					WP_CLI::warning( "Redirect loop detected for: $source_url -> $destination_url" );
					$failed_urls[] = array( $source_url, $destination_url, 'n/a', 'Redirect loop detected in CSV' );
					$progress->tick();
					continue;
				}

				// Store the source URL with its final destination URL (a repeated source keeps the last row).
				$redirects[ $source_url ] = $final_destination_url;
				$progress->tick();
			}

			$progress->finish();

			WP_CLI::log( 'Completed processing of CSV file. Now validating and creating redirects.' );

			// Initialize a new progress bar for redirects validation.
			$progress = \WP_CLI\Utils\make_progress_bar( 'Validating and Creating Redirects', count( $redirects ) );

			foreach ( $redirects as $source => $destination ) {
				// Array keys that look like integers are stored as int; normalise back to string.
				$source = (string) $source;

				WP_CLI::log( "Validating URL: $source -> $destination" );

				$abs_url = $this->to_absolute_url( $destination );

				// NOTE(review): certificate verification is disabled, presumably to allow
				// local or staging hosts with self-signed certificates -- confirm.
				$response = wp_remote_get(
					$abs_url,
					array(
						'timeout'   => 10,
						'sslverify' => false,
					)
				);

				if ( is_wp_error( $response ) ) {
					// Transport failure (DNS, timeout, ...): there is no HTTP status, so keep the
					// code empty and surface the error text instead of an empty message.
					$code    = '';
					$message = $response->get_error_message();
				} else {
					$code    = wp_remote_retrieve_response_code( $response );
					$message = wp_remote_retrieve_response_message( $response );
				}

				if ( 200 === $code ) {
					$item = array(
						'url'         => trim( $source ),
						'action_data' => array( 'url' => trim( $destination ) ),
						'regex'       => false,
						'group_id'    => 1,
						'match_type'  => 'url',
						'action_type' => 'url',
						'action_code' => 301,
					);

					$created = Red_Item::create( $item );

					if ( is_wp_error( $created ) ) {
						WP_CLI::warning( "Failed to create redirect for: $source -> $destination. Error: " . $created->get_error_message() );
						$failed_urls[] = array( $source, $destination, '500', $created->get_error_message() );
					} else {
						WP_CLI::success( "Redirect created for: $source -> $destination" );
					}
				} else {
					WP_CLI::warning( "URL validation failed for: $source -> $abs_url. HTTP Status: $code $message" );
					$failed_urls[] = array( $source, $destination, $code, $message );
				}

				$progress->tick();
			}

			$progress->finish();

			return $failed_urls;
		}

		/**
		 * Reads source/destination pairs from the CSV file, skipping the header row.
		 *
		 * Values are trimmed. Blank lines are ignored, and rows lacking a source or a
		 * destination are skipped with a warning so they cannot break URL resolution.
		 *
		 * @param string $file_path The path to the CSV file.
		 *
		 * @return array List of array( source, destination ) pairs in file order.
		 */
		private function read_csv_rows( $file_path ) {
			$rows   = array();
			$handle = fopen( $file_path, 'r' );

			if ( false === $handle ) {
				WP_CLI::warning( "Could not open CSV file: $file_path" );
				return $rows;
			}

			fgetcsv( $handle ); // Skip header row.

			$row_number = 1;

			while ( false !== ( $data = fgetcsv( $handle ) ) ) {
				++$row_number;

				// fgetcsv() reports a blank line as an array holding a single null.
				if ( array( null ) === $data ) {
					continue;
				}

				$source      = isset( $data[0] ) ? trim( (string) $data[0] ) : '';
				$destination = isset( $data[1] ) ? trim( (string) $data[1] ) : '';

				if ( '' === $source || '' === $destination ) {
					WP_CLI::warning( "Skipping CSV row $row_number: source or destination is missing." );
					continue;
				}

				$rows[] = array( $source, $destination );
			}

			fclose( $handle );

			return $rows;
		}

		/**
		 * Turns a destination into an absolute URL that can be requested.
		 *
		 * Only a leading http:// or https:// scheme marks a URL as absolute, so a
		 * path that merely contains "http" (e.g. /http-guide/) is still site-relative.
		 *
		 * @param string $destination Absolute URL, protocol-relative URL, or site-relative path.
		 *
		 * @return string The absolute URL.
		 */
		private function to_absolute_url( string $destination ): string {
			if ( 1 === preg_match( '#^https?://#i', $destination ) ) {
				return $destination;
			}

			// Protocol-relative URL (//host/path): it already names a host, only the scheme is missing.
			if ( str_starts_with( $destination, '//' ) ) {
				return 'https:' . $destination;
			}

			// Join with exactly one slash, whether or not the path starts with one.
			return rtrim( get_site_url(), '/' ) . '/' . ltrim( $destination, '/' );
		}

		/**
		 * Resolves the final destination URL for a given source URL by following the redirect chain.
		 *
		 * Each source key is followed at most once. When the chain loops, following
		 * stops and the returned URL is still one of the sources; callers can detect
		 * that case with url_exists_in_column_a().
		 *
		 * @param string $destination_url The initial destination URL to resolve.
		 * @param array  $source_to_destination An associative array mapping source URLs to their immediate destinations.
		 * @return string The resolved final destination URL.
		 */
		private function resolve_final_destination( string $destination_url, array $source_to_destination ): string {
			$visited = array();

			// Keep following the destination URL until it's not found as a source URL in the array.
			while ( true ) {
				$key = $this->find_source_key( $destination_url, $source_to_destination, self::SITE_DOMAIN );

				// Stop at the end of the chain, or when a key repeats (loop).
				if ( null === $key || isset( $visited[ $key ] ) ) {
					break;
				}

				$visited[ $key ] = true;
				$destination_url = (string) $source_to_destination[ $key ];
			}

			return $destination_url;
		}

		/**
		 * Checks if the normalized URL exists in the source column, accounting for trailing slashes.
		 *
		 * @param string $url The URL to check for existence as a source URL.
		 * @param array  $source_to_destination An associative array mapping source URLs to their immediate destinations.
		 * @param string $domain The domain within which to normalize URLs.
		 * @return bool True if the URL exists as a source, false otherwise.
		 */
		private function url_exists_in_column_a( string $url, array $source_to_destination, string $domain ): bool {
			return null !== $this->find_source_key( $url, $source_to_destination, $domain );
		}

		/**
		 * Finds the source key that matches a URL, tolerating a trailing-slash difference.
		 *
		 * The normalized URL is tried as-is, then with a trailing slash, then without one.
		 *
		 * @param string $url The URL to look up.
		 * @param array  $source_to_destination An associative array mapping source URLs to their immediate destinations.
		 * @param string $domain The domain within which to normalize URLs.
		 * @return string|null The matching key, or null when the URL is not a source.
		 */
		private function find_source_key( string $url, array $source_to_destination, string $domain ): ?string {
			$normalized_url = $this->normalize_url( $url, $domain );
			$without_slash  = rtrim( $normalized_url, '/' );
			$candidates     = array_unique( array( $normalized_url, $without_slash . '/', $without_slash ) );

			foreach ( $candidates as $candidate ) {
				// Stripping the slash from "/" leaves an empty string, which is never a valid source.
				if ( '' !== $candidate && array_key_exists( $candidate, $source_to_destination ) ) {
					return $candidate;
				}
			}

			return null;
		}

		/**
		 * Normalizes a URL by stripping away the domain and protocol if it belongs to the specified domain.
		 * Returns the URL path if it is a relative URL or belongs to the specified domain.
		 * Returns the original URL if it's absolute and not of the specified domain, or cannot be parsed.
		 *
		 * A host belongs to the domain when it equals it or is a subdomain of it
		 * (case-insensitive); merely containing the domain text is not enough.
		 *
		 * @param string $url The URL to normalize.
		 * @param string $domain The domain within which to normalize URLs.
		 * @return string The normalized URL path or the original URL.
		 */
		private function normalize_url( string $url, string $domain ): string {
			// Parse the URL to get components.
			$parsed_url = wp_parse_url( $url );

			// Leave unparseable input untouched rather than mapping it to "/".
			if ( ! is_array( $parsed_url ) ) {
				return $url;
			}

			$host   = strtolower( $parsed_url['host'] ?? '' );
			$domain = strtolower( $domain );

			$is_local = '' === $host || $host === $domain || str_ends_with( $host, '.' . $domain );

			if ( $is_local ) {
				// Ensure the path is set and starts with exactly one slash.
				$path = $parsed_url['path'] ?? '';
				return '/' . ltrim( $path, '/' );
			}

			// Return the original URL if it's absolute and not of the specified domain.
			return $url;
		}
	}

	WP_CLI::add_command( 'csv_process', 'CSV_URL_Processor_Command' );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment