andyg2/get_pages.php

## get_pages.php
<?php
// Entry point: the site to export is selected with the ?host= query parameter.
// NOTE(review): the host value is used unvalidated to build outbound request URLs;
// confirm this script is never reachable by untrusted callers, or restrict the allowed hosts.
if (isset($_GET['host'])) {
  define('WEBSITE_URL', 'https://' . $_GET['host']);
} else {
  echo 'usage: get_pages.php?host=www.domain.com';
  exit;
}


// The listing is requested 100 items at a time, so it has to be walked page by page;
// a single request would silently drop everything after the first 100 pages.
$per_page = 100;
$pages_api_url = WEBSITE_URL . '/wp-json/wp/v2/pages/?_fields=author,id,excerpt,status,title,link&per_page=' . $per_page;
$pages_result = [];

// One cURL handle is reused for every listing request
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

for ($page_number = 1; ; $page_number++) {
  curl_setopt($ch, CURLOPT_URL, $pages_api_url . '&page=' . $page_number);
  $pages_response = curl_exec($ch);

  // Transport failure: report it and keep whatever was collected so far
  if (curl_errno($ch)) {
    echo 'cURL Error: ' . curl_error($ch);
    break;
  }

  $pages_data = json_decode($pages_response);

  // Anything other than a non-empty JSON list ends the walk (per the WordPress REST API
  // pagination docs, an out-of-range page number is answered with an error object)
  if (!$pages_data || !is_array($pages_data)) {
    if ($page_number === 1) {
      echo 'No pages found.';
    }
    break;
  }

  foreach ($pages_data as $page) {
    // Only published pages that expose a rendered title are exported
    if (!isset($page->title->rendered) || $page->status != 'publish') {
      continue;
    }

    $title = $page->title->rendered;
    $slug = slugify($title);

    // Full page payload, served from the on-disk cache when a fresh copy exists
    $pages_json = cache_func('get_wp_page', $page);
    if (!isset($pages_json['status'])) {
      continue;
    }

    $page_extracted = [];
    $page_extracted['slug'] = $pages_json['slug'];
    $page_extracted['title'] = $pages_json['title']['rendered'];
    $page_extracted['content'] = $pages_json['content']['rendered'];
    $page_extracted['excerpt'] = $pages_json['excerpt']['rendered'];
    $page_extracted['template'] = $pages_json['template'];
    $page_extracted['meta'] = $pages_json['meta'];

    // Images are stored per page, in a directory named after the page's own slug
    downloadImagesFromCss($page_extracted['content'], './images/' . $page_extracted['slug']);
    downloadImagesFromHTML($page_extracted['content'], './images/' . $page_extracted['slug']);

    // Results are keyed by the slugified title
    $pages_result[$slug] = $page_extracted;
  }

  // A short page is the last one
  if (count($pages_data) < $per_page) {
    break;
  }
}

// Close cURL session for the list of pages
curl_close($ch);

file_put_contents('./' . slugify(WEBSITE_URL) . '-pages.json', json_encode($pages_result));

/**
 * Locate CSS url(...) references in page content and download the files they point to.
 *
 * Each remote file is stored in $output_directory under the base name of its URL; a file
 * that already exists there is not fetched again. References that cannot be fetched over
 * HTTP(S) (empty values, data: URIs, fragment or relative references) are skipped.
 * Download failures are echoed and leave no partial file behind, so a later run retries.
 *
 * @param string $page_content     text to scan for url(...) tokens
 * @param string $output_directory directory to store downloads in (created if missing)
 * @return void
 */
function downloadImagesFromCss($page_content, $output_directory) {

  if (!is_dir($output_directory)) {
    mkdir($output_directory, 0755, true);
  }

  // Capture whatever sits between "url(" and the next ")"
  if (!preg_match_all('/url\((.*?)\)/', (string) $page_content, $matches)) {
    return;
  }

  foreach ($matches[1] as $raw_url) {
    // CSS allows whitespace and either quote style around the URL
    $image_url = trim($raw_url, " \t\r\n'\"");

    // Protocol-relative references get https, the scheme this script uses for the site
    if (strpos($image_url, '//') === 0) {
      $image_url = 'https:' . $image_url;
    }

    // Only absolute http(s) URLs can be downloaded; data: URIs, "#fragment"
    // references and relative paths would only produce junk files
    if (!preg_match('#^https?://#i', $image_url)) {
      continue;
    }

    $image_filename = basename($image_url);
    if ($image_filename === '') {
      continue;
    }

    $output_path = $output_directory . '/' . $image_filename;
    if (file_exists($output_path)) {
      continue;
    }

    // Without a writable target there is nothing cURL could stream into
    $fp = fopen($output_path, 'wb');
    if ($fp === false) {
      echo "Unable to write image: " . $output_path;
      continue;
    }

    $ch_image = curl_init($image_url);
    curl_setopt($ch_image, CURLOPT_FILE, $fp);
    curl_setopt($ch_image, CURLOPT_HEADER, 0);
    curl_setopt($ch_image, CURLOPT_FOLLOWLOCATION, true);
    // Treat HTTP >= 400 as an error instead of saving the error page as an image
    curl_setopt($ch_image, CURLOPT_FAILONERROR, true);
    curl_exec($ch_image);

    $failed = curl_errno($ch_image) !== 0;
    if ($failed) {
      echo "cURL Error: " . curl_error($ch_image);
    }

    curl_close($ch_image);
    fclose($fp);

    // A leftover empty file would make the file_exists() check skip this image forever
    if ($failed) {
      unlink($output_path);
    }
  }
}


/**
 * Locate <img> tags in HTML content and download the files their src attributes point to.
 *
 * Each remote file is stored in $output_directory under the base name of its URL; a file
 * that already exists there is not fetched again. Sources that cannot be fetched over
 * HTTP(S) (empty values, data: URIs, relative paths) are skipped. Download failures are
 * echoed and leave no partial file behind, so a later run retries.
 *
 * @param string $html_content     HTML fragment to scan
 * @param string $output_directory directory to store downloads in (created if missing)
 * @return void
 */
function downloadImagesFromHTML($html_content, $output_directory) {

  if (!is_dir($output_directory)) {
    mkdir($output_directory, 0755, true);
  }

  // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8)
  $html_content = (string) $html_content;
  if (trim($html_content) === '') {
    return;
  }

  // Collect parser complaints about sloppy markup internally instead of emitting warnings
  $previous_setting = libxml_use_internal_errors(true);
  $dom = new DOMDocument();
  // The encoding hint stops libxml from reading UTF-8 attribute values as Latin-1
  $dom->loadHTML('<?xml encoding="UTF-8">' . $html_content);
  libxml_clear_errors();
  libxml_use_internal_errors($previous_setting);

  foreach ($dom->getElementsByTagName('img') as $image_tag) {
    $image_url = trim($image_tag->getAttribute('src'));

    // Protocol-relative references get https, the scheme this script uses for the site
    if (strpos($image_url, '//') === 0) {
      $image_url = 'https:' . $image_url;
    }

    // Only absolute http(s) URLs can be downloaded
    if (!preg_match('#^https?://#i', $image_url)) {
      continue;
    }

    $image_filename = basename($image_url);
    if ($image_filename === '') {
      continue;
    }

    $output_path = $output_directory . '/' . $image_filename;
    if (file_exists($output_path)) {
      continue;
    }

    // Without a writable target there is nothing cURL could stream into
    $fp = fopen($output_path, 'wb');
    if ($fp === false) {
      echo "Unable to write image: " . $output_path;
      continue;
    }

    $ch_image = curl_init($image_url);
    curl_setopt($ch_image, CURLOPT_FILE, $fp);
    curl_setopt($ch_image, CURLOPT_HEADER, 0);
    curl_setopt($ch_image, CURLOPT_FOLLOWLOCATION, true);
    // Treat HTTP >= 400 as an error instead of saving the error page as an image
    curl_setopt($ch_image, CURLOPT_FAILONERROR, true);
    curl_exec($ch_image);

    $failed = curl_errno($ch_image) !== 0;
    if ($failed) {
      echo "cURL Error: " . curl_error($ch_image);
    }

    curl_close($ch_image);
    fclose($fp);

    // A leftover empty file would make the file_exists() check skip this image forever
    if ($failed) {
      unlink($output_path);
    }
  }
}


/**
 * Fetch a single page from the site's REST API.
 *
 * @param object $page listing entry; its id and title->rendered properties are read
 * @return mixed the decoded JSON response (associative arrays), or null when the request failed
 */
function get_wp_page($page) {
  $id = $page->id;
  $label = $page->title->rendered; // only used in the error message

  $handle = curl_init(WEBSITE_URL . '/wp-json/wp/v2/pages/' . $id);
  curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
  $body = curl_exec($handle);

  if (curl_errno($handle)) {
    echo 'cURL Error for page ' . $label . ': ' . curl_error($handle);
    $decoded = null;
  } else {
    $decoded = json_decode($body, true);
  }

  curl_close($handle);
  return $decoded;
}


/**
 * Caches the output of any function for a given amount of time
 *
 * The function is called with $args as its single argument. Its result is stored in a file
 * named after the function and a hash of the call, and reused until the file is older than
 * $seconds. Failed results (null or false) are returned but never stored, so a failed call
 * is retried on the next request instead of being served from the cache.
 *
 * @param string  $func      name of the function to call
 * @param mixed   $args      value passed to the function as its only argument
 * @param integer $seconds   number of seconds to cache the function response, 604800 (one week)
 * @param boolean $json      whether to store the output of the function as JSON, true
 * @param string  $cache_dir relative or absolute path of cached results (with trailing slash), ./cache/
 * @return mixed the function result; JSON-cached values come back decoded as associative arrays
 */
function cache_func($func, $args, $seconds = 604800, $json = true, $cache_dir = './cache/') {
  // create dir if not exists
  if (!is_dir($cache_dir)) {
    mkdir($cache_dir, 0755, true);
  }

  // generate simple hash of function name and arguments
  // file deepcode ignore InsecureHash: Hash only used for filename creation
  $request_hash = md5(json_encode(array($func, $args)));

  // e.g: ./cache/expensive_function-1234567890abcdef1234567890abcdef.json
  $request_file = $cache_dir . $func . '-' . $request_hash . '.' . ($json ? 'json' : 'txt');

  // serve from the cache while the file is fresh and readable
  if (file_exists($request_file) && filemtime($request_file) >= time() - $seconds) {
    $cached = file_get_contents($request_file);
    if ($cached !== false) {
      $result = $json ? json_decode($cached, true) : $cached;
      // a null here is a corrupt file or a previously cached failure: fall through and re-run
      if ($result !== null) {
        return $result;
      }
    }
  }

  $result = $func($args); // calls the expensive_function with arguments

  // only successful results are worth keeping
  if ($result !== null && $result !== false) {
    file_put_contents($request_file, $json ? json_encode($result) : $result);
  }

  return $result;
}


/**
 * Create a slug from a string
 *
 * @param string $string    input string to slugify
 * @param string $delimiter string to separate each word [-]
 * @param string $skip      extra characters to keep in the output (inserted into a regex character class)
 * @param mixed  $replace   single string or array of strings to remove from the input (case-insensitive)
 * @return string a slugified representation of the input string
 *
 * Example
 * slugify("Hello World, It's me!", '-', '', 'world')
 * return: 'hello-its-me'
 */
function slugify($string, $delimiter = '-', $skip = '', $replace = []) {
  $string = (string) $string;

  // Transliteration depends on the active locale, so switch temporarily and restore at the end
  $oldLocale = setlocale(LC_ALL, '0');
  setlocale(LC_ALL, 'en_US.UTF-8');

  $clean = iconv('UTF-8', 'ASCII//TRANSLIT', $string);
  if ($clean === false) {
    // iconv could not convert the input; the filter below still strips unsupported bytes
    $clean = $string;
  }

  if (!empty($replace)) {
    // The slug is lower-cased afterwards, so the removal must not depend on letter case
    $clean = str_ireplace((array) $replace, ' ', $clean);
  }

  // $skip goes BEFORE the trailing hyphen: appended after it, the hyphen would start a
  // character range (space up to the first skip character) and let stray punctuation through
  $regex = '^a-zA-Z0-9\/_|+ ' . $skip . '-';

  $clean = preg_replace("/[" . $regex . "]/", '', $clean);
  $clean = strtolower($clean);
  $clean = preg_replace("/[\/_|+ -]+/", $delimiter, $clean);
  $clean = trim($clean, $delimiter);
  setlocale(LC_ALL, $oldLocale);
  return $clean;
}

/**
 * Debug helper: prints a value with print_r inside a <pre> element
 *
 * @param mixed $a value to dump (string, array, ...)
 * @param mixed $h optional heading; when truthy it is printed in an <h3> above the dump
 * @return void
 */
function pre($a, $h = false) {
  if ($h) {
    echo '<h3>' . $h . '</h3><pre>';
  } else {
    echo '<pre>';
  }

  print_r($a);

  echo '</pre>';
}

/**
 * Debug helper: dumps a value via pre() and then terminates the script
 *
 * @param mixed   $a   value to dump
 * @param mixed   $h   optional heading shown above the dump
 * @param boolean $dbg when truthy, a debug_backtrace() dump is printed before exiting
 * @return void (never returns; the script exits)
 */
function prex($a, $h = false, $dbg = false) {
  pre($a, $h);

  if (!$dbg) {
    exit;
  }

  echo '<pre>';
  print_r(debug_backtrace());
  echo '</pre>';
  exit;
}
	<?php
	// Entry point: the site to export is selected with the ?host= query parameter.
	// NOTE(review): the host value is used unvalidated to build outbound request URLs;
	// confirm this script is never reachable by untrusted callers, or restrict the allowed hosts.
	if (isset($_GET['host'])) {
	  define('WEBSITE_URL', 'https://' . $_GET['host']);
	} else {
	  echo 'usage: get_pages.php?host=www.domain.com';
	  exit;
	}


	// The listing is requested 100 items at a time, so it has to be walked page by page;
	// a single request would silently drop everything after the first 100 pages.
	$per_page = 100;
	$pages_api_url = WEBSITE_URL . '/wp-json/wp/v2/pages/?_fields=author,id,excerpt,status,title,link&per_page=' . $per_page;
	$pages_result = [];

	// One cURL handle is reused for every listing request
	$ch = curl_init();
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

	for ($page_number = 1; ; $page_number++) {
	  curl_setopt($ch, CURLOPT_URL, $pages_api_url . '&page=' . $page_number);
	  $pages_response = curl_exec($ch);

	  // Transport failure: report it and keep whatever was collected so far
	  if (curl_errno($ch)) {
	    echo 'cURL Error: ' . curl_error($ch);
	    break;
	  }

	  $pages_data = json_decode($pages_response);

	  // Anything other than a non-empty JSON list ends the walk (per the WordPress REST API
	  // pagination docs, an out-of-range page number is answered with an error object)
	  if (!$pages_data || !is_array($pages_data)) {
	    if ($page_number === 1) {
	      echo 'No pages found.';
	    }
	    break;
	  }

	  foreach ($pages_data as $page) {
	    // Only published pages that expose a rendered title are exported
	    if (!isset($page->title->rendered) || $page->status != 'publish') {
	      continue;
	    }

	    $title = $page->title->rendered;
	    $slug = slugify($title);

	    // Full page payload, served from the on-disk cache when a fresh copy exists
	    $pages_json = cache_func('get_wp_page', $page);
	    if (!isset($pages_json['status'])) {
	      continue;
	    }

	    $page_extracted = [];
	    $page_extracted['slug'] = $pages_json['slug'];
	    $page_extracted['title'] = $pages_json['title']['rendered'];
	    $page_extracted['content'] = $pages_json['content']['rendered'];
	    $page_extracted['excerpt'] = $pages_json['excerpt']['rendered'];
	    $page_extracted['template'] = $pages_json['template'];
	    $page_extracted['meta'] = $pages_json['meta'];

	    // Images are stored per page, in a directory named after the page's own slug
	    downloadImagesFromCss($page_extracted['content'], './images/' . $page_extracted['slug']);
	    downloadImagesFromHTML($page_extracted['content'], './images/' . $page_extracted['slug']);

	    // Results are keyed by the slugified title
	    $pages_result[$slug] = $page_extracted;
	  }

	  // A short page is the last one
	  if (count($pages_data) < $per_page) {
	    break;
	  }
	}

	// Close cURL session for the list of pages
	curl_close($ch);

	file_put_contents('./' . slugify(WEBSITE_URL) . '-pages.json', json_encode($pages_result));

	/**
	 * Locate CSS url(...) references in page content and download the files they point to.
	 *
	 * Each remote file is stored in $output_directory under the base name of its URL; a file
	 * that already exists there is not fetched again. References that cannot be fetched over
	 * HTTP(S) (empty values, data: URIs, fragment or relative references) are skipped.
	 * Download failures are echoed and leave no partial file behind, so a later run retries.
	 *
	 * @param string $page_content     text to scan for url(...) tokens
	 * @param string $output_directory directory to store downloads in (created if missing)
	 * @return void
	 */
	function downloadImagesFromCss($page_content, $output_directory) {

	  if (!is_dir($output_directory)) {
	    mkdir($output_directory, 0755, true);
	  }

	  // Capture whatever sits between "url(" and the next ")"
	  if (!preg_match_all('/url\((.*?)\)/', (string) $page_content, $matches)) {
	    return;
	  }

	  foreach ($matches[1] as $raw_url) {
	    // CSS allows whitespace and either quote style around the URL
	    $image_url = trim($raw_url, " \t\r\n'\"");

	    // Protocol-relative references get https, the scheme this script uses for the site
	    if (strpos($image_url, '//') === 0) {
	      $image_url = 'https:' . $image_url;
	    }

	    // Only absolute http(s) URLs can be downloaded; data: URIs, "#fragment"
	    // references and relative paths would only produce junk files
	    if (!preg_match('#^https?://#i', $image_url)) {
	      continue;
	    }

	    $image_filename = basename($image_url);
	    if ($image_filename === '') {
	      continue;
	    }

	    $output_path = $output_directory . '/' . $image_filename;
	    if (file_exists($output_path)) {
	      continue;
	    }

	    // Without a writable target there is nothing cURL could stream into
	    $fp = fopen($output_path, 'wb');
	    if ($fp === false) {
	      echo "Unable to write image: " . $output_path;
	      continue;
	    }

	    $ch_image = curl_init($image_url);
	    curl_setopt($ch_image, CURLOPT_FILE, $fp);
	    curl_setopt($ch_image, CURLOPT_HEADER, 0);
	    curl_setopt($ch_image, CURLOPT_FOLLOWLOCATION, true);
	    // Treat HTTP >= 400 as an error instead of saving the error page as an image
	    curl_setopt($ch_image, CURLOPT_FAILONERROR, true);
	    curl_exec($ch_image);

	    $failed = curl_errno($ch_image) !== 0;
	    if ($failed) {
	      echo "cURL Error: " . curl_error($ch_image);
	    }

	    curl_close($ch_image);
	    fclose($fp);

	    // A leftover empty file would make the file_exists() check skip this image forever
	    if ($failed) {
	      unlink($output_path);
	    }
	  }
	}


	/**
	 * Locate <img> tags in HTML content and download the files their src attributes point to.
	 *
	 * Each remote file is stored in $output_directory under the base name of its URL; a file
	 * that already exists there is not fetched again. Sources that cannot be fetched over
	 * HTTP(S) (empty values, data: URIs, relative paths) are skipped. Download failures are
	 * echoed and leave no partial file behind, so a later run retries.
	 *
	 * @param string $html_content     HTML fragment to scan
	 * @param string $output_directory directory to store downloads in (created if missing)
	 * @return void
	 */
	function downloadImagesFromHTML($html_content, $output_directory) {

	  if (!is_dir($output_directory)) {
	    mkdir($output_directory, 0755, true);
	  }

	  // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8)
	  $html_content = (string) $html_content;
	  if (trim($html_content) === '') {
	    return;
	  }

	  // Collect parser complaints about sloppy markup internally instead of emitting warnings
	  $previous_setting = libxml_use_internal_errors(true);
	  $dom = new DOMDocument();
	  // The encoding hint stops libxml from reading UTF-8 attribute values as Latin-1
	  $dom->loadHTML('<?xml encoding="UTF-8">' . $html_content);
	  libxml_clear_errors();
	  libxml_use_internal_errors($previous_setting);

	  foreach ($dom->getElementsByTagName('img') as $image_tag) {
	    $image_url = trim($image_tag->getAttribute('src'));

	    // Protocol-relative references get https, the scheme this script uses for the site
	    if (strpos($image_url, '//') === 0) {
	      $image_url = 'https:' . $image_url;
	    }

	    // Only absolute http(s) URLs can be downloaded
	    if (!preg_match('#^https?://#i', $image_url)) {
	      continue;
	    }

	    $image_filename = basename($image_url);
	    if ($image_filename === '') {
	      continue;
	    }

	    $output_path = $output_directory . '/' . $image_filename;
	    if (file_exists($output_path)) {
	      continue;
	    }

	    // Without a writable target there is nothing cURL could stream into
	    $fp = fopen($output_path, 'wb');
	    if ($fp === false) {
	      echo "Unable to write image: " . $output_path;
	      continue;
	    }

	    $ch_image = curl_init($image_url);
	    curl_setopt($ch_image, CURLOPT_FILE, $fp);
	    curl_setopt($ch_image, CURLOPT_HEADER, 0);
	    curl_setopt($ch_image, CURLOPT_FOLLOWLOCATION, true);
	    // Treat HTTP >= 400 as an error instead of saving the error page as an image
	    curl_setopt($ch_image, CURLOPT_FAILONERROR, true);
	    curl_exec($ch_image);

	    $failed = curl_errno($ch_image) !== 0;
	    if ($failed) {
	      echo "cURL Error: " . curl_error($ch_image);
	    }

	    curl_close($ch_image);
	    fclose($fp);

	    // A leftover empty file would make the file_exists() check skip this image forever
	    if ($failed) {
	      unlink($output_path);
	    }
	  }
	}


	/**
	 * Fetch a single page from the site's REST API.
	 *
	 * @param object $page listing entry; its id and title->rendered properties are read
	 * @return mixed the decoded JSON response (associative arrays), or null when the request failed
	 */
	function get_wp_page($page) {
	  $id = $page->id;
	  $label = $page->title->rendered; // only used in the error message

	  $handle = curl_init(WEBSITE_URL . '/wp-json/wp/v2/pages/' . $id);
	  curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
	  $body = curl_exec($handle);

	  if (curl_errno($handle)) {
	    echo 'cURL Error for page ' . $label . ': ' . curl_error($handle);
	    $decoded = null;
	  } else {
	    $decoded = json_decode($body, true);
	  }

	  curl_close($handle);
	  return $decoded;
	}






	/**
	 * Caches the output of any function for a given amount of time
	 *
	 * The function is called with $args as its single argument. Its result is stored in a file
	 * named after the function and a hash of the call, and reused until the file is older than
	 * $seconds. Failed results (null or false) are returned but never stored, so a failed call
	 * is retried on the next request instead of being served from the cache.
	 *
	 * @param string  $func      name of the function to call
	 * @param mixed   $args      value passed to the function as its only argument
	 * @param integer $seconds   number of seconds to cache the function response, 604800 (one week)
	 * @param boolean $json      whether to store the output of the function as JSON, true
	 * @param string  $cache_dir relative or absolute path of cached results (with trailing slash), ./cache/
	 * @return mixed the function result; JSON-cached values come back decoded as associative arrays
	 */
	function cache_func($func, $args, $seconds = 604800, $json = true, $cache_dir = './cache/') {
	  // create dir if not exists
	  if (!is_dir($cache_dir)) {
	    mkdir($cache_dir, 0755, true);
	  }

	  // generate simple hash of function name and arguments
	  // file deepcode ignore InsecureHash: Hash only used for filename creation
	  $request_hash = md5(json_encode(array($func, $args)));

	  // e.g: ./cache/expensive_function-1234567890abcdef1234567890abcdef.json
	  $request_file = $cache_dir . $func . '-' . $request_hash . '.' . ($json ? 'json' : 'txt');

	  // serve from the cache while the file is fresh and readable
	  if (file_exists($request_file) && filemtime($request_file) >= time() - $seconds) {
	    $cached = file_get_contents($request_file);
	    if ($cached !== false) {
	      $result = $json ? json_decode($cached, true) : $cached;
	      // a null here is a corrupt file or a previously cached failure: fall through and re-run
	      if ($result !== null) {
	        return $result;
	      }
	    }
	  }

	  $result = $func($args); // calls the expensive_function with arguments

	  // only successful results are worth keeping
	  if ($result !== null && $result !== false) {
	    file_put_contents($request_file, $json ? json_encode($result) : $result);
	  }

	  return $result;
	}


	/**
	 * Create a slug from a string
	 *
	 * @param string $string    input string to slugify
	 * @param string $delimiter string to separate each word [-]
	 * @param string $skip      extra characters to keep in the output (inserted into a regex character class)
	 * @param mixed  $replace   single string or array of strings to remove from the input (case-insensitive)
	 * @return string a slugified representation of the input string
	 *
	 * Example
	 * slugify("Hello World, It's me!", '-', '', 'world')
	 * return: 'hello-its-me'
	 */
	function slugify($string, $delimiter = '-', $skip = '', $replace = []) {
	  $string = (string) $string;

	  // Transliteration depends on the active locale, so switch temporarily and restore at the end
	  $oldLocale = setlocale(LC_ALL, '0');
	  setlocale(LC_ALL, 'en_US.UTF-8');

	  $clean = iconv('UTF-8', 'ASCII//TRANSLIT', $string);
	  if ($clean === false) {
	    // iconv could not convert the input; the filter below still strips unsupported bytes
	    $clean = $string;
	  }

	  if (!empty($replace)) {
	    // The slug is lower-cased afterwards, so the removal must not depend on letter case
	    $clean = str_ireplace((array) $replace, ' ', $clean);
	  }

	  // $skip goes BEFORE the trailing hyphen: appended after it, the hyphen would start a
	  // character range (space up to the first skip character) and let stray punctuation through
	  $regex = '^a-zA-Z0-9\/_|+ ' . $skip . '-';

	  $clean = preg_replace("/[" . $regex . "]/", '', $clean);
	  $clean = strtolower($clean);
	  $clean = preg_replace("/[\/_|+ -]+/", $delimiter, $clean);
	  $clean = trim($clean, $delimiter);
	  setlocale(LC_ALL, $oldLocale);
	  return $clean;
	}

	/**
	 * Debug helper: prints a value with print_r inside a <pre> element
	 *
	 * @param mixed $a value to dump (string, array, ...)
	 * @param mixed $h optional heading; when truthy it is printed in an <h3> above the dump
	 * @return void
	 */
	function pre($a, $h = false) {
	  if ($h) {
	    echo '<h3>' . $h . '</h3><pre>';
	  } else {
	    echo '<pre>';
	  }

	  print_r($a);

	  echo '</pre>';
	}

	/**
	 * Debug helper: dumps a value via pre() and then terminates the script
	 *
	 * @param mixed   $a   value to dump
	 * @param mixed   $h   optional heading shown above the dump
	 * @param boolean $dbg when truthy, a debug_backtrace() dump is printed before exiting
	 * @return void (never returns; the script exits)
	 */
	function prex($a, $h = false, $dbg = false) {
	  pre($a, $h);

	  if (!$dbg) {
	    exit;
	  }

	  echo '<pre>';
	  print_r(debug_backtrace());
	  echo '</pre>';
	  exit;
	}