Skip to content

Instantly share code, notes, and snippets.

@JiveDig
Created April 18, 2024 15:35
Show Gist options
  • Save JiveDig/d8d620c7df6e7863c653eedbf1c4ab2d to your computer and use it in GitHub Desktop.
Save JiveDig/d8d620c7df6e7863c653eedbf1c4ab2d to your computer and use it in GitHub Desktop.
A PHP class for WordPress to import posts (or pages, cpt) as basic HTML, including importing images from content.
<?php
// Prevent direct file access.
defined( 'ABSPATH' ) || die;

/**
 * WP-CLI commands to delete all content (handy for resetting a site before re-running an import).
 *
 * wp post delete $(wp post list --post_type=post --format=ids) --force
 * wp post delete $(wp post list --post_type=page --format=ids) --force
 * wp post delete $(wp post list --post_type=attachment --format=ids) --force
 * wp term delete category $(wp term list category --field=term_id)
 * wp term delete post_tag $(wp term list post_tag --field=term_id)
 */

/**
 * Registers the `maisitemap` WP-CLI command.
 *
 * The command is registered by class name, so no instance needs to be
 * created here. The class has no constructor, which made the former
 * `new Mai_CLI_Sitemap_Importer;` statement a no-op.
 *
 * @since 0.1.0
 *
 * @link https://docs.wpvip.com/how-tos/write-custom-wp-cli-commands/
 * @link https://webdevstudios.com/2019/10/08/making-wp-cli-commands/
 *
 * @return void
 */
add_action( 'cli_init', function() {
	WP_CLI::add_command( 'maisitemap', 'Mai_CLI_Sitemap_Importer' );
});
/**
 * WP-CLI command class that imports the pages listed in an XML sitemap
 * as posts (or pages, or any post type), including the images found in
 * the imported content.
 *
 * The class is registered with WP_CLI::add_command(), so internal helpers
 * added here are private to keep them from being picked up as subcommands.
 *
 * @version 0.1.0
 */
class Mai_CLI_Sitemap_Importer {
	/**
	 * Gets environment.
	 *
	 * Usage: wp maisitemap get_environment
	 *
	 * @return void
	 */
	function get_environment() {
		WP_CLI::log( sprintf( 'Environment: %s', wp_get_environment_type() ) );
	}

	/**
	 * Import pages from sitemap.
	 *
	 * Usage:
	 * wp maisitemap import --sitemap_url=https://example.com/sitemap.xml
	 * wp maisitemap import --sitemap_url=https://www.example.com/sitemap.xml --xpath="(//div[@class='page-content'])[1]" --post_type=page
	 *
	 * @since 0.1.0
	 *
	 * @param array $args       Standard command args (unused).
	 * @param array $assoc_args Keyed args: sitemap_url, xpath, post_type, post_status.
	 *
	 * @return void
	 */
	function import( $args, $assoc_args ) {
		// Parse args.
		$assoc_args = wp_parse_args(
			$assoc_args,
			[
				'sitemap_url' => '',       // Required. The sitemap URL.
				'xpath'       => '//main', // The XPath to the content.
				'post_type'   => 'post',
				'post_status' => 'publish',
			]
		);

		// Bail if no URL.
		if ( ! $assoc_args['sitemap_url'] ) {
			WP_CLI::error( 'Please provide a sitemap URL.' );
			return;
		}

		// Get sitemap data.
		$response = wp_remote_get( $assoc_args['sitemap_url'] );

		// Bail if the request itself failed.
		if ( is_wp_error( $response ) ) {
			WP_CLI::line( $response->get_error_message() );
			return;
		}

		$code = wp_remote_retrieve_response_code( $response );

		// Bail on a non-200 response, reporting the status code and its message.
		if ( 200 !== $code ) {
			WP_CLI::line( trim( $code . ' ' . wp_remote_retrieve_response_message( $response ) ) );
			return;
		}

		// Get body and parse XML.
		$body = wp_remote_retrieve_body( $response );
		$xml  = simplexml_load_string( $body );

		// Bail if no XML. Strict check: an element without children is also falsy,
		// and that case is reported by the "No URLs" check below instead.
		if ( false === $xml ) {
			WP_CLI::error( 'Could not parse XML.' );
			return;
		}

		// Bail if no URLs.
		if ( ! count( $xml->url ) ) {
			WP_CLI::error( 'No URLs found in sitemap.' );
			return;
		}

		// Import each URL. A failure on one URL skips it; it does not abort the run.
		foreach ( $xml->url as $entry ) {
			$url = trim( (string) $entry->loc );

			// Skip if no URL.
			if ( ! $url ) {
				continue;
			}

			$this->import_url( $url, $assoc_args );
		}

		WP_CLI::success( 'Import complete.' );
	}

	/**
	 * Fetches a single URL, extracts its content and inserts it as a post.
	 *
	 * @param string $url        The page URL to import.
	 * @param array  $assoc_args Parsed command args (xpath, post_type, post_status).
	 *
	 * @return void
	 */
	private function import_url( $url, $assoc_args ) {
		// Fetch HTML content.
		$html_response = wp_remote_get( $url );

		// Skip if error.
		if ( is_wp_error( $html_response ) ) {
			WP_CLI::line( $html_response->get_error_message() );
			return;
		}

		// Get HTML.
		$html = wp_remote_retrieve_body( $html_response );

		// Skip empty bodies; DOMDocument::loadHTML() throws on an empty string in PHP 8.
		if ( '' === trim( $html ) ) {
			WP_CLI::line( sprintf( 'Skipped, empty response: %s', $url ) );
			return;
		}

		// Load the document, silencing markup warnings and restoring libxml state afterwards.
		$dom                   = new DOMDocument();
		$libxml_previous_state = libxml_use_internal_errors( true );
		$dom->loadHTML( $html );
		libxml_clear_errors();
		libxml_use_internal_errors( $libxml_previous_state );

		// Setup XPath.
		$xpath = new DOMXPath( $dom );

		// Parse URL. Any component may be missing (e.g. no path on the site root).
		$parse  = wp_parse_url( $url );
		$scheme = ! empty( $parse['scheme'] ) ? $parse['scheme'] : 'https';
		$host   = ! empty( $parse['host'] ) ? $parse['host'] : '';
		$slug   = ! empty( $parse['path'] ) ? trim( $parse['path'], '/' ) : '';

		// Get first h1.
		$nodes = $xpath->query( '(//h1)[1]' );
		$title = ( $nodes && $nodes->length ) ? trim( $nodes->item( 0 )->nodeValue ) : '';

		// If no h1, get title from slug.
		if ( ! $title ) {
			$title = ucwords( str_replace( '-', ' ', $slug ) );
		}

		// Get main content. query() returns false when the XPath expression is invalid.
		$nodes = $xpath->query( $assoc_args['xpath'] );

		// Skip if no nodes.
		if ( ! $nodes || ! $nodes->length ) {
			WP_CLI::line( sprintf( 'Skipped, no content found: %s', $url ) );
			return;
		}

		// Reduce the matched node to basic HTML.
		$content = $this->clean_content( $dom->saveHTML( $nodes->item( 0 ) ) );

		// Insert the post. Content is added later, after we have a post ID for images.
		// Passing true makes failures come back as WP_Error instead of 0.
		$post_id = wp_insert_post(
			[
				'post_title'  => wp_slash( $title ),
				'post_name'   => $slug,
				'post_status' => $assoc_args['post_status'],
				'post_type'   => $assoc_args['post_type'],
			],
			true
		);

		// Skip if the insert failed.
		if ( is_wp_error( $post_id ) ) {
			WP_CLI::line( $post_id->get_error_message() );
			return;
		}

		if ( ! $post_id ) {
			return;
		}

		// Import images and point their src at the local copies.
		$content = $this->import_images( $content, $post_id, $scheme, $host );

		// Update the post content. wp_update_post() unslashes its input, so slash first
		// to keep backslashes (e.g. inside <pre>/<code>) intact.
		wp_update_post(
			[
				'ID'           => $post_id,
				'post_content' => wp_slash( $content ),
			]
		);

		// Log post inserted.
		WP_CLI::line( sprintf( '%s inserted: %s', ucwords( $assoc_args['post_type'] ), get_permalink( $post_id ) ) );
	}

	/**
	 * Reduces an HTML fragment to basic markup: drops scripts and styles,
	 * strips every tag that is not in the allowed list, and removes
	 * id/class/style attributes from the remaining tags.
	 *
	 * @param string $content The raw HTML fragment.
	 *
	 * @return string
	 */
	private function clean_content( $content ) {
		// Remove <script> and <style> tags including their content.
		$content = preg_replace( '/<script\b[^>]*>.*?<\/script>/is', '', $content );
		$content = preg_replace( '/<style\b[^>]*>.*?<\/style>/is', '', $content );

		// Valid tags.
		$allowed = [
			'p',
			'a',
			'ul',
			'ol',
			'li',
			'h1',
			'h2',
			'h3',
			'h4',
			'h5',
			'h6',
			'blockquote',
			'img',
			'figure',
			'figcaption',
			'iframe',
			'video',
			'audio',
			'source',
			'pre',
			'code',
			'br',
			'hr',
			'em',
			'strong',
		];

		// Strip tags.
		$content = strip_tags( (string) $content, $allowed );

		// Only allowed tags are left, so a single pass over every tag is enough.
		$processor = new WP_HTML_Tag_Processor( $content );

		while ( $processor->next_tag() ) {
			$processor->remove_attribute( 'id' );
			$processor->remove_attribute( 'class' );
			$processor->remove_attribute( 'style' );
		}

		return $processor->get_updated_html();
	}

	/**
	 * Imports every <img> in the content into the media library, rewrites
	 * its src to the local attachment URL, and sets the first successfully
	 * imported image as the post's featured image.
	 *
	 * @param string $content The cleaned post content.
	 * @param int    $post_id The post ID the images are attached to.
	 * @param string $scheme  Scheme of the source page, used for relative image URLs.
	 * @param string $host    Host of the source page, used for relative image URLs.
	 *
	 * @return string The content with updated image sources.
	 */
	private function import_images( $content, $post_id, $scheme, $host ) {
		$processor = new WP_HTML_Tag_Processor( $content );

		// The first imported image becomes the featured image.
		$first = true;

		while ( $processor->next_tag( [ 'tag_name' => 'img' ] ) ) {
			$src = $processor->get_attribute( 'src' );

			// Skip if no usable src (missing, empty, or a valueless attribute).
			if ( ! $src || ! is_string( $src ) ) {
				continue;
			}

			$src = $this->get_absolute_url( $src, $scheme, $host );

			// Skip sources that can't be fetched over HTTP (e.g. data: URIs).
			if ( ! $src ) {
				continue;
			}

			// Maybe upload the image.
			$image_id = $this->upload_image( $src, 'ref_url', $src, $post_id );

			// upload_image() may return a WP_Error, which is truthy, so check it explicitly.
			if ( is_wp_error( $image_id ) ) {
				WP_CLI::line( sprintf( 'Image failed: %s (%s)', $src, $image_id->get_error_message() ) );
				continue;
			}

			if ( ! $image_id ) {
				continue;
			}

			// Get image url.
			$image_url = wp_get_attachment_image_url( $image_id, 'large' );

			if ( ! $image_url ) {
				continue;
			}

			// Update the src, and drop srcset/sizes which would still point at the
			// original site and take precedence over src in browsers.
			$processor->set_attribute( 'src', $image_url );
			$processor->remove_attribute( 'srcset' );
			$processor->remove_attribute( 'sizes' );

			// If first image, set as featured image.
			if ( $first ) {
				set_post_thumbnail( $post_id, $image_id );
				$first = false;
			}

			WP_CLI::line( sprintf( 'Image imported: %s', $image_url ) );
		}

		return $processor->get_updated_html();
	}

	/**
	 * Turns an image src into an absolute http(s) URL.
	 *
	 * @param string $src    The src attribute value.
	 * @param string $scheme Scheme of the page the image was found on.
	 * @param string $host   Host of the page the image was found on.
	 *
	 * @return string The absolute URL, or an empty string when it can't be resolved.
	 */
	private function get_absolute_url( $src, $scheme, $host ) {
		$src = trim( $src );

		// Protocol-relative URL: only the scheme is missing.
		if ( 0 === strpos( $src, '//' ) ) {
			return $scheme . ':' . $src;
		}

		$parts = wp_parse_url( $src );

		if ( ! is_array( $parts ) ) {
			return '';
		}

		// Already absolute.
		if ( ! empty( $parts['host'] ) ) {
			return $src;
		}

		// A scheme without a host (data:, blob:, ...) is not downloadable,
		// and without a page host there is nothing to resolve against.
		if ( ! empty( $parts['scheme'] ) || ! $host ) {
			return '';
		}

		// Relative path: resolve against the site root of the source page.
		return $scheme . '://' . trailingslashit( $host ) . ltrim( $src, '/' );
	}

	/**
	 * Downloads a remote file and inserts it into the WP Media Library.
	 *
	 * If an attachment already carries the given reference meta key/value,
	 * that attachment is reused instead of downloading again.
	 *
	 * Documented as private, but left with public visibility so existing
	 * callers keep working.
	 *
	 * @access private
	 *
	 * @see https://developer.wordpress.org/reference/functions/media_handle_sideload/
	 *
	 * @param string $ref_uri   The reference URI of a remote file (stored as meta value).
	 * @param string $ref_key   The reference key of a remote file (stored as meta key).
	 * @param string $image_url HTTP URL address of a remote file.
	 * @param int    $post_id   The post ID the media is associated with.
	 *
	 * @return int|WP_Error The ID of the attachment, 0 when the file could not be
	 *                      downloaded or stored, or a WP_Error when sideloading fails.
	 */
	function upload_image( $ref_uri, $ref_key, $image_url, $post_id ) {
		// Make sure we have the functions we need. require_once is a no-op when already loaded.
		require_once ABSPATH . 'wp-admin/includes/media.php';
		require_once ABSPATH . 'wp-admin/includes/file.php';
		require_once ABSPATH . 'wp-admin/includes/image.php';

		// Check if there is an attachment with the reference meta key and value.
		$existing_ids = get_posts(
			[
				'post_type'      => 'attachment',
				'post_status'    => 'any',
				'meta_key'       => $ref_key,
				'meta_value'     => $ref_uri,
				'meta_compare'   => '=',
				'fields'         => 'ids',
				'posts_per_page' => 1,
			]
		);

		// Bail if the image already exists.
		if ( ! empty( $existing_ids[0] ) ) {
			return $existing_ids[0];
		}

		// Download the image.
		$response = wp_remote_get( $image_url, [ 'timeout' => 30 ] );

		if ( is_wp_error( $response ) || 200 !== wp_remote_retrieve_response_code( $response ) ) {
			return 0;
		}

		$image_contents = wp_remote_retrieve_body( $response );

		// Bail, no image contents.
		if ( ! $image_contents ) {
			return 0;
		}

		// Hashed file name, so odd source names and query strings can't break the upload.
		// NOTE(review): the extension is always .jpg; WordPress' file type check is
		// expected to correct it for other image types - confirm for non-JPEG sources.
		$image_hashed = md5( $image_url ) . '.jpg';

		// Write straight to a local temp file and sideload that, rather than publishing
		// the file in uploads and downloading it again from this site over HTTP.
		$tmp = wp_tempnam( $image_hashed );

		if ( ! $tmp ) {
			return 0;
		}

		if ( false === file_put_contents( $tmp, $image_contents ) ) {
			wp_delete_file( $tmp );
			return 0;
		}

		// Build the file array.
		$file_array = [
			'name'     => $image_hashed,
			'tmp_name' => $tmp,
		];

		// Add the image to the media library.
		$image_id = media_handle_sideload( $file_array, $post_id );

		// Remove the temp file if sideloading left it behind.
		if ( file_exists( $tmp ) ) {
			wp_delete_file( $tmp );
		}

		// Bail if error.
		if ( is_wp_error( $image_id ) ) {
			return $image_id;
		}

		// Set the reference url for possible reference later.
		update_post_meta( $image_id, $ref_key, $ref_uri );

		return $image_id;
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment