Last active
June 26, 2025 15:17
-
-
Save dave-mills/d73d254d7a1facc1b51ea6375520c34b to your computer and use it in GitHub Desktop.
A script for one-time reformatting of blog posts that were originally turned into HTML by copy-pasting from MS Word into a WYSIWYG editor, which brought along a bunch of weird styling and formatting oddities.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// One-off clean-up of article HTML. For every Article, `old_content` is read,
// run through the regex passes below (order matters: later passes rely on what
// earlier ones removed) and the result is stored in `content` and saved.
// NOTE(review): Article::all() loads every row into memory at once; confirm the
// table is small enough before running this against production data.
Article::all()
    ->each(function (Article $article) {
        // Pass 1: rename style attributes that mention "width:" to a placeholder
        // attribute so they survive the blanket style removal in pass 2.
        // The whole attribute value is kept, not only the width declaration.
        $article->content = Str::replaceMatches(
            '/style=(\"[^\"]*width:[^\"]*\")/',
            fn($matches) => 'keepwidth_123=' . $matches[1],
            $article->old_content
        );

        // Pass 2: drop every remaining inline style attribute.
        $article->content = Str::replaceMatches(
            '/style=\"[^\"]*\"/',
            '',
            $article->content
        );

        // Pass 3: turn the placeholder attributes back into style attributes.
        $article->content = Str::replaceMatches(
            '/keepwidth_123=(\"[^\"]*width:[^\"]*\")/',
            fn($matches) => 'style=' . $matches[1],
            $article->content,
        );

        // Remove inline class attributes.
        $article->content = Str::replaceMatches(
            '/class=\"[^\"]*\"/',
            '',
            $article->content
        );

        // Remove opening <font ...> tags that carry attributes
        // (closing </font> tags are removed further down).
        $article->content = Str::replaceMatches(
            '/<font [^>]*>/',
            '',
            $article->content
        );

        // Removing attributes leaves whitespace before the closing ">"; collapse it.
        $article->content = Str::replaceMatches(
            '/\s+>/',
            '>',
            $article->content
        );

        // Remove no-break spaces and leftover Word / inline-formatting tags.
        // The no-break space is written as its UTF-8 byte sequence (C2 A0) so the
        // pattern cannot be mistaken for, or silently turned into, an ordinary
        // space, which would delete every space in the article.
        // The span patterns remove every <span ...> and </span>, so no separate
        // span pass is needed afterwards.
        $article->content = Str::replaceMatches(
            [
                '/\xC2\xA0/',
                '/<o:p>/',
                '/<\/o:p>/',
                '/<\/font>/',
                '/<span[^>]*>/',
                '/<\/span>/',
            ],
            '',
            $article->content
        );

        // Flatten all existing line breaks to single spaces; deliberate line
        // breaks are re-inserted by the next three passes.
        $article->content = Str::replaceMatches(
            '/[\n\r]+/',
            ' ',
            $article->content
        );

        // Put each tag that follows a closing </p> on its own line.
        $article->content = Str::replaceMatches(
            '/<\/p>?[\s]*</',
            "</p>\n<",
            $article->content
        );

        // Same after closing header tags (</h1> ... </h9>, digit is preserved).
        $article->content = Str::replaceMatches(
            '/<\/h(\d)>?[\s]*</',
            fn($matches) => '</h' . $matches[1] . ">\n<",
            $article->content
        );

        // Same after <br/> tags. The replacement must be a plain "<br/>": a
        // backslash in a preg replacement string that is not followed by a digit
        // is emitted literally, so "<br\/>" would end up in the saved HTML.
        $article->content = Str::replaceMatches(
            '/<br\/>?[\s]*</',
            "<br/>\n<",
            $article->content
        );

        // Remove empty paragraphs. Only paragraphs preceded by a line break are
        // matched, i.e. those placed on their own line by the passes above.
        $article->content = Str::replaceMatches(
            '/[\r\n]<p>\s*<\/p>/',
            '',
            $article->content
        );

        // Trim whitespace directly before a closing </p>.
        $article->content = Str::replaceMatches(
            '/[\s\n\r]*<\/p>/',
            '</p>',
            $article->content
        );

        // NOTE(review): looks like a manual switch to restore the untouched
        // original instead of the cleaned version; confirm before enabling.
        // $article->content = $article->old_content;
        $article->save();
    });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment