Skip to content

Instantly share code, notes, and snippets.

@adamfranco
Created July 20, 2023 13:20
Show Gist options
  • Save adamfranco/15f0cdf716b43a82bc74eda3d3b40b7a to your computer and use it in GitHub Desktop.
Save adamfranco/15f0cdf716b43a82bc74eda3d3b40b7a to your computer and use it in GitHub Desktop.
Fix image paths in an archive of sites.middlebury.edu/middmag-old/ produced by HTTrack
#!/usr/bin/env php
<?php
######################################################
# Resolve the target file and compute the relative
# prefix that leads from it back to the archive root.
######################################################
# The script takes exactly one argument: the path of an .html file that lives
# somewhere below a sites.middlebury.edu/middmag-old/ directory.
if (!isset($argv[1]) || $argv[1] === '') {
    throw new Exception("Usage: fix-paths.php <path/to/sites.middlebury.edu/middmag-old/.../file.html>");
}
$file = $argv[1];
if (!preg_match('#\.html$#', $file)) {
    throw new Exception("$file must be a .html file.");
}
if (!preg_match('#(.+)sites\.middlebury\.edu/middmag-old/(.+)#', $file, $pathMatches)) {
    throw new Exception("File must be within sites.middlebury.edu/middmag-old/, $file given.");
}

# $pathMatches[2] is the file's path relative to the archive root. One '../'
# per directory level is needed to climb back to that root; a file directly in
# the root needs none.
$directory = dirname($pathMatches[2]);
$path = ($directory == '.') ? [] : explode("/", $directory);
$depth = count($path);
$sitePrefix = str_repeat('../', $depth);
var_dump($directory, $path, $depth, $sitePrefix);

# Keep the pristine copy so the file is only rewritten when something changed.
$html = $origHtml = file_get_contents($file);
if ($html === false) {
    throw new Exception("Unable to read $file.");
}

# Map of regex => name of the handler function that rewrites each match.
$patterns = [];
######################################################
# Replacement patterns and functions.
######################################################
# Images under sites.middlebury.edu/middmag/
$patterns['# (src|href)="((?:\.\./)+)\.\./middmag/(files/(?:[^/"]+/)*([^"/]+))"#'] = 'fix_middmag_files';
/**
 * Rewrite one src/href attribute that points into ../middmag/files/ so that it
 * points at the same files/ path without the extra '../middmag/' hop.
 *
 * @param string $file       Path of the HTML file being processed (not used here).
 * @param string $html       Current HTML of the page.
 * @param string $sitePrefix Relative prefix to the archive root (not used here).
 * @param array  $m          Regex match: [0] whole attribute (with leading space),
 *                           [1] attribute name, [2] leading run of '../',
 *                           [3] path starting at 'files/'.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function fix_middmag_files($file, $html, $sitePrefix, $m) {
    list($original, $attribute, $upLevels, $target) = $m;

    // Same attribute, minus the "../middmag/" segment between prefix and path.
    $rewritten = sprintf(' %s="%s%s"', $attribute, $upLevels, $target);

    // Log the before/after pair for the operator.
    fwrite(STDOUT, sprintf("\n%s\n%s\n%s\n", __FUNCTION__, $original, $rewritten));

    return str_replace($original, $rewritten, $html);
}
# Images under middleburymagazine.com/
$patterns['# (src|href)="((?:\.\./)+)\.\./\.\./middleburymagazine\.com/(files/(?:[^/"]+/)*([^"/]+))"#'] = 'fix_middleburymagazine_com_files';
/**
 * Rewrite one src/href attribute that points into
 * ../../middleburymagazine.com/files/ so that it points at the same files/
 * path without the '../../middleburymagazine.com/' hop.
 *
 * When the referenced file name ends in .html, the target directory on disk is
 * scanned for a different file with the same base name (e.g. a .jpg or .png)
 * and that file is referenced instead.
 *
 * @param string $file       Path of the HTML file being processed; used to
 *                           resolve the target directory on disk.
 * @param string $html       Current HTML of the page.
 * @param string $sitePrefix Relative prefix to the archive root (not used here).
 * @param array  $m          Regex match: [0] whole attribute (with leading space),
 *                           [1] attribute name, [2] leading run of '../',
 *                           [3] path starting at 'files/'.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function fix_middleburymagazine_com_files($file, $html, $sitePrefix, $m) {
    $srcAttra = $m[0];
    $relativePrefix = $m[2];
    $path = dirname($m[3]);
    $fileName = basename($m[3]);

    // Look for a png, jpg or other real file behind a ".html" reference.
    if (preg_match('/\.html$/', $fileName)) {
        // realpath() returns false when the directory does not exist; in that
        // case there is nothing to scan and the name is left as it is.
        $realPath = realpath(dirname($file).'/'.$relativePrefix.$path);
        $entries = ($realPath !== false && is_dir($realPath)) ? scandir($realPath) : false;
        if ($entries !== false) {
            $fileNameBase = pathinfo($fileName, PATHINFO_FILENAME);
            foreach ($entries as $f) {
                // Skip the .html placeholder itself (it sorts before .jpg/.png
                // and would otherwise always win) and anything that is not a
                // regular file, such as "." and "..".
                if ($f === $fileName || !is_file($realPath.'/'.$f)) {
                    continue;
                }
                // Strip the extension and compare.
                if (pathinfo($f, PATHINFO_FILENAME) === $fileNameBase) {
                    $fileName = $f;
                    break;
                }
            }
        }
    }

    $newAttr = ' '.$m[1].'="'.$relativePrefix.$path.'/'.$fileName.'"';
    fwrite(STDOUT, "\n".__FUNCTION__."\n$srcAttra\n$newAttr\n");
    return str_replace($srcAttra, $newAttr, $html);
}
# Links to sites.middlebury.edu/middmag/
$patterns['# (href|src)="https?://(?:sites\.middlebury\.edu/middmag|middmag\.com)/([^"]+)"#'] = 'fix_middmag_links';
/**
 * Rewrite one absolute href/src that points at sites.middlebury.edu/middmag/
 * or middmag.com/ into a relative link below $sitePrefix.
 *
 * Paths that look like directories get "index.html" appended; paths whose last
 * segment has a file extension are kept as they are. Any query string and
 * fragment are re-attached after the path.
 *
 * @param string $file       Path of the HTML file being processed (not used here).
 * @param string $html       Current HTML of the page.
 * @param string $sitePrefix Relative prefix to the archive root.
 * @param array  $m          Regex match: [0] whole attribute (with leading space),
 *                           [1] attribute name, [2] everything after the site root.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function fix_middmag_links($file, $html, $sitePrefix, $m) {
    $srcAttra = $m[0];
    $u = parse_url($m[2]);
    if (empty($u['path'])) {
        // Only a query string and/or fragment was given: target the front page.
        $pagePath = 'index.html';
    } else {
        $pagePath = $u['path'];
    }

    if (preg_match('#/$#', $pagePath)) {
        // Ending with a trailing slash as expected for directories.
        $pagePath = $pagePath.'index.html';
    }
    // Ending with a filename. The file may sit directly in the site root
    // (e.g. "feed.xml"), so a leading directory component is not required.
    elseif (preg_match('#(?:^|/)[^/]+\.\w+$#', $pagePath)) {
        // Nothing to do.
    }
    // Odd special case seen.
    elseif ($pagePath == '2014/02/26/uncle-donnie-t…s-on-the-world') {
        $pagePath = '2014/02/26/uncle-donnie-takes-on-the-world/index.html';
    }
    // Ending with no file extension or trailing slash.
    else {
        $pagePath = $pagePath.'/index.html';
    }

    # Add query string.
    if (!empty($u['query'])) {
        $pagePath .= '?'.$u['query'];
    }
    # Add fragments
    if (!empty($u['fragment'])) {
        $pagePath .= '#'.$u['fragment'];
    }

    $newAttr = ' '.$m[1].'="'.$sitePrefix.$pagePath.'"';
    fwrite(STDOUT, "\n".__FUNCTION__."\n$srcAttra\n$newAttr\n");
    return str_replace($srcAttra, $newAttr, $html);
}
# Rewrite the branding onclick so it targets the archived front page.
$patterns['# onclick="location\.href=\'http://sites\.middlebury\.edu/middmag-old\';"#'] = 'remove_branding_onclick';
/**
 * Replace the branding element's onclick handler, which navigates to the live
 * http://sites.middlebury.edu/middmag-old site, with one that navigates to the
 * archive's own index.html. Despite its name, the handler keeps the attribute
 * and only changes its target.
 *
 * @param string $file       Path of the HTML file being processed (not used here).
 * @param string $html       Current HTML of the page.
 * @param string $sitePrefix Relative prefix to the archive root.
 * @param array  $m          Regex match: [0] whole attribute including its
 *                           leading space.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function remove_branding_onclick($file, $html, $sitePrefix, $m) {
    $srcAttra = $m[0];
    // The match starts with the separating space, so the replacement must keep
    // it; otherwise the attribute is glued onto the preceding one.
    $newAttr = ' onclick="location.href=\''.$sitePrefix.'index.html\';"';
    fwrite(STDOUT, "\n".__FUNCTION__."\n$srcAttra\n$newAttr\n");
    return str_replace($srcAttra, $newAttr, $html);
}
# Fix srcset urls.
$patterns['# srcset="([^"]+)"#'] = 'fix_srcset_urls';
/**
 * Rewrite one srcset attribute so that every absolute
 * sites.middlebury.edu/middmag-old/ URL in it (http or https) becomes a path
 * relative to the archive root.
 *
 * @param string $file       Path of the HTML file being processed (not used here).
 * @param string $html       Current HTML of the page.
 * @param string $sitePrefix Relative prefix to the archive root.
 * @param array  $m          Regex match: [0] whole attribute including its
 *                           leading space, [1] the attribute value.
 * @return string The HTML with every occurrence of the matched attribute replaced.
 */
function fix_srcset_urls($file, $html, $sitePrefix, $m) {
    $srcAttra = $m[0];
    $contents = str_replace(
        [
            'https://sites.middlebury.edu/middmag-old/',
            'http://sites.middlebury.edu/middmag-old/',
        ],
        $sitePrefix,
        $m[1]
    );
    // The match starts with the separating space, so the replacement must keep
    // it; otherwise the attribute is glued onto the preceding one and the
    // pattern no longer matches on a later run.
    $newAttr = ' srcset="'.$contents.'"';
    fwrite(STDOUT, "\n".__FUNCTION__."\n$srcAttra\n$newAttr\n");
    return str_replace($srcAttra, $newAttr, $html);
}
######################################################
# Loop through our patterns and make changes.
######################################################
# Run every registered pattern against the page. Each handler receives one
# match and returns the full HTML with that match rewritten, so the HTML is
# threaded through the handlers one match at a time.
foreach ($patterns as $regex => $callback) {
    $found = preg_match_all($regex, $html, $matches, PREG_SET_ORDER);
    if ($found === false) {
        // A PCRE failure (e.g. backtrack limit) must not be mistaken for "no matches".
        throw new Exception("Pattern $regex failed with PCRE error code ".preg_last_error().".");
    }
    foreach ($matches as $match) {
        $html = $callback($file, $html, $sitePrefix, $match);
    }
}

# Only touch the file on disk when a handler actually changed something.
if ($html !== $origHtml) {
    if (file_put_contents($file, $html) === false) {
        throw new Exception("Unable to write $file.");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment