Skip to content

Instantly share code, notes, and snippets.

@jwinett
Created June 22, 2018 23:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwinett/d86c9620ecdbcc810715bc619c287fbb to your computer and use it in GitHub Desktop.
Save jwinett/d86c9620ecdbcc810715bc619c287fbb to your computer and use it in GitHub Desktop.
Parse Newsletter HTML for insertion into database
<?php
/**
 * Fetches a page from www.karenware.com (site-relative path given in the "i"
 * query parameter), extracts the children of the element with id
 * "contentNode", cleans them up and echoes the resulting HTML.
 *
 * Clean-up rules applied to the direct children of #contentNode:
 *  - comment nodes are dropped;
 *  - whitespace-only text nodes are dropped until the first non-blank text
 *    node has been seen;
 *  - <a name="X">...</a> wrappers are unwrapped: their children are emitted
 *    directly and the first element child receives id="X"
 *    (e.g. <a name="0"><h3>T</h3></a> becomes <h3 id="0">T</h3>);
 *  - everything else is copied unchanged.
 *
 * Responds with HTTP 400 for a missing/invalid "i" parameter and HTTP 502
 * when the remote page cannot be fetched or has no #contentNode element.
 */

// The path is appended directly after the host name, so it MUST start with "/".
// Otherwise a value such as "@evil.example/" or ".evil.example/" would change
// the authority part of the URL and make this script fetch from another host.
// Control characters, spaces and backslashes are rejected as well.
$uri = isset($_GET['i']) ? $_GET['i'] : null;
if (!is_string($uri) || preg_match('#^/[^\x00-\x20\x7f\\\\]*$#D', $uri) !== 1) {
    http_response_code(400);
    exit('Invalid "i" parameter: expected a site-relative path starting with "/".');
}

$url = "https://www.karenware.com$uri";
$text = file_get_contents($url);
if ($text === false || $text === '') {
    http_response_code(502);
    exit('Unable to fetch the requested page.');
}

// Collect libxml parse warnings internally instead of silencing them with "@",
// and restore the previous setting afterwards.
// LIBXML_NOENT is simply not passed (the original masked it out of a value
// that never contained it), so entities are not substituted.
$dom = new DOMDocument;
$previousErrorMode = libxml_use_internal_errors(true);
$dom->loadHTML($text, LIBXML_HTML_NODEFDTD | LIBXML_COMPACT);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

$content = $dom->getElementById('contentNode');
if ($content === null) {
    http_response_code(502);
    exit('The requested page has no "contentNode" element.');
}

// Merge adjacent text nodes so the whitespace check below sees whole runs.
$content->normalize();

$new = new DOMDocument;

// True until the first non-blank text node is encountered.
// NOTE(review): only a non-blank *text* node ends this phase; element nodes do
// not, so blank text between leading elements is dropped too - confirm intended.
$skippingBlankText = true;

foreach ($content->childNodes as $node) {
    switch ($node->nodeType) {
        case XML_COMMENT_NODE:
            // Strip comments.
            continue 2;

        case XML_TEXT_NODE:
            if ($skippingBlankText) {
                if (ctype_space($node->nodeValue)) {
                    continue 2;
                }
                $skippingBlankText = false;
            }
            break;

        case XML_ELEMENT_NODE:
            if ($node->tagName === 'a' && $node->hasAttribute('name') && $node->hasChildNodes()) {
                // Convert <a name="0"><h3>Something</h3></a> into <h3 id="0">Something</h3>.
                $anchorName = $node->getAttribute('name');
                $idAssigned = false;
                foreach ($node->childNodes as $child) {
                    $copy = $new->importNode($child, true);
                    // Only elements can carry attributes; the first child may be a
                    // text or comment node, which has no setAttribute() method.
                    if (!$idAssigned && $copy instanceof DOMElement) {
                        $copy->setAttribute('id', $anchorName);
                        $idAssigned = true;
                    }
                    $new->appendChild($copy);
                }
                continue 2;
            }
            break;
    }

    $new->appendChild($new->importNode($node, true));
}

echo $new->saveHTML();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment