engram-design/gist:e4133a41613197dfa494d197059a9dfc

## gistfile1.txt
<?php

// Turn off all error reporting
error_reporting(0);

// Scrape every HTML file in $folder and print the collected article records
// as one JSON array.
//
// NOTE(review): $folder, $issues, $articleTable, $title, $date, $category and
// $type are read below but never assigned in this excerpt - presumably they are
// set up by code outside this view; confirm before running this standalone.
$dir = new DirectoryIterator($folder);

$json = array();

foreach ($dir as $fileinfo) {
    // Only regular files can be scraped; this also skips '.' and '..'.
    if ($fileinfo->isDot() || !$fileinfo->isFile()) {
        continue;
    }

    $filename = $fileinfo->getFilename();

    // The first 7 characters of the file name key into $issues; the rest
    // (after one separator character) is the slug. $slug is computed here but
    // is not part of the record emitted below.
    $issueKey = substr($filename, 0, 7);
    $slug = str_replace('.html', '', substr($filename, 8));

    $issueInfo = isset($issues[$issueKey]) ? $issues[$issueKey] : array();
    $issue = isset($issueInfo['issue']) ? $issueInfo['issue'] : null;
    $volume = isset($issueInfo['vol']) ? $issueInfo['vol'] : null;
    $number = isset($issueInfo['number']) ? $issueInfo['number'] : null;

    // getPathname() works whether or not $folder ends with a slash.
    $html = file_get_contents($fileinfo->getPathname());

    // DOMDocument::loadHTML() cannot parse an empty document, so unreadable
    // or empty files are skipped instead of aborting the whole run.
    if ($html === false || $html === '') {
        continue;
    }

    // Start scraping!
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    $xpath = new DOMXPath($doc);

    // Reset per file so an article without images does not inherit the image
    // or caption found in a previously processed file.
    $contentImage = null;
    $contentImageCaption = null;

    // Pull every image out of the document; the last one seen is kept.
    foreach ($xpath->query('//img') as $node) {

        // Keep only the last path segment of the image URL
        $imgSplit = explode('/', $node->getAttribute('src'));
        $contentImage = $imgSplit[count($imgSplit) - 1];

        // Replace spaces with '-'
        $contentImage = str_replace('%20', '-', $contentImage);
        $contentImage = str_replace('.html', '', $contentImage);

        $parent = $node->parentNode;

        // Sometimes images we want are surrounded by an 'em' tag containing the caption..
        if ($parent->tagName === 'em') {

            // Save the caption for later
            $contentImageCaption = decodeText($parent->textContent);

            // Remove the 'em' and 'img' tags. The 'em' is already detached
            // when an earlier image inside it triggered its removal.
            if ($parent->parentNode !== null) {
                $parent->parentNode->removeChild($parent);
            }
        } else {

            // Remove the 'img' tags
            $parent->removeChild($node);
        }
    }

    // Prefer the first paragraph when it contains an 'em'; otherwise fall
    // back to the page's meta description.
    $description = getValue($xpath->query($articleTable . '//p[1][em]'), 'textContent');

    if (!$description) {
        $description = getValue($xpath->query('/html/head/meta[@name="description"]/@content'));
    }

    $content = getHTML($xpath->query($articleTable . '/tr[2]/td'));

    // Keep only basic inline/paragraph markup
    $content = strip_tags($content, '<br><br/><em><em/><b><b/><p><p/><a><a/><strong><strong/><i><i/>');

    // Remove any empty paragraphs (probably caused by removing img tags)
    $content = preg_replace("#<p>(\s|&nbsp;|</?\s?br\s?/?>)*</?p>#", "", $content);

    // Strip per-line leading/trailing whitespace and all line breaks
    $content = preg_replace('/^\s+|\n|\r|\s+$/m', '', $content);

    //
    // Create some JSON!
    //
    $json[] = array(
        'description' => $description,
        'title' => $title,
        'date' => $date,

        'contentImage' => $contentImage,
        'contentImageCaption' => $contentImageCaption,

        // Commas in the body are backslash-escaped in the output
        'content' => str_replace(',', '\,', $content),

        'category' => $category,
        'type' => $type,

        'issue' => $issue,
        'volume' => $volume,
        'number' => $number,
    );
}

echo json_encode($json);


/**
 * Read one property from the first entry of a node collection.
 *
 * Only the first entry is looked at; its property is passed through
 * decodeText() before being returned.
 *
 * @param mixed  $items     Iterable collection of nodes (the callers pass DOMXPath::query() results).
 * @param string $attribute Name of the property to read from the first node.
 *
 * @return mixed The decoded property value, or null when the collection is empty.
 */
function getValue($items, $attribute = 'value') {
    foreach ($items as $node) {
        $raw = $node->{$attribute};

        return decodeText($raw);
    }

    // Nothing to iterate over
    return null;
}

/**
 * Serialise the first node of a collection to an HTML string.
 *
 * The markup is produced by the node's owner document and then passed
 * through decodeText(). Any further nodes in the collection are ignored.
 *
 * @param mixed $content Iterable collection of DOM nodes (the callers pass DOMXPath::query() results).
 *
 * @return mixed The decoded HTML of the first node, or null when the collection is empty.
 */
function getHTML($content) {
    foreach ($content as $element) {
        $document = $element->ownerDocument;
        $markup = $document->saveHTML($element);

        return decodeText($markup);
    }

    // Nothing to iterate over
    return null;
}

/**
 * Entity-encode a piece of text and flatten its typographic punctuation.
 *
 * The input is treated as UTF-8 and converted with mbstring's
 * 'HTML-ENTITIES' target; the result is handed to convert_smart_quotes().
 *
 * NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of
 * PHP 8.2 - confirm the PHP version this script is expected to run on.
 *
 * @param mixed $text Text to convert.
 *
 * @return mixed The converted text.
 */
function decodeText($text) {
    $encoded = mb_convert_encoding($text, 'HTML-ENTITIES', 'UTF-8');

    return convert_smart_quotes($encoded);
}

/**
 * Replace curly-quote and em-dash HTML entities with plain ASCII characters.
 *
 * Single curly quotes become ', double curly quotes become " and the em dash
 * becomes a hyphen. Anything else is left untouched.
 *
 * @param mixed $string Text containing HTML entities.
 *
 * @return mixed The text with the listed entities replaced.
 */
function convert_smart_quotes($string)
{
    // entity => ASCII replacement
    $map = array(
        '&lsquo;' => "'",
        '&rsquo;' => "'",
        '&ldquo;' => '"',
        '&rdquo;' => '"',
        '&mdash;' => '-',
    );

    return str_replace(array_keys($map), array_values($map), $string);
}
	<?php

	// Turn off all error reporting
	error_reporting(0);

	// Scrape every HTML file in $folder and print the collected article records
	// as one JSON array.
	//
	// NOTE(review): $folder, $issues, $articleTable, $title, $date, $category and
	// $type are read below but never assigned in this excerpt - presumably they are
	// set up by code outside this view; confirm before running this standalone.
	$dir = new DirectoryIterator($folder);

	$json = array();

	foreach ($dir as $fileinfo) {
		// Only regular files can be scraped; this also skips '.' and '..'.
		if ($fileinfo->isDot() || !$fileinfo->isFile()) {
			continue;
		}

		$filename = $fileinfo->getFilename();

		// The first 7 characters of the file name key into $issues; the rest
		// (after one separator character) is the slug. $slug is computed here
		// but is not part of the record emitted below.
		$issueKey = substr($filename, 0, 7);
		$slug = str_replace('.html', '', substr($filename, 8));

		$issueInfo = isset($issues[$issueKey]) ? $issues[$issueKey] : array();
		$issue = isset($issueInfo['issue']) ? $issueInfo['issue'] : null;
		$volume = isset($issueInfo['vol']) ? $issueInfo['vol'] : null;
		$number = isset($issueInfo['number']) ? $issueInfo['number'] : null;

		// getPathname() works whether or not $folder ends with a slash.
		$html = file_get_contents($fileinfo->getPathname());

		// DOMDocument::loadHTML() cannot parse an empty document, so unreadable
		// or empty files are skipped instead of aborting the whole run.
		if ($html === false || $html === '') {
			continue;
		}

		// Start scraping!
		$doc = new DOMDocument();
		$doc->loadHTML($html);
		$xpath = new DOMXPath($doc);

		// Reset per file so an article without images does not inherit the
		// image or caption found in a previously processed file.
		$contentImage = null;
		$contentImageCaption = null;

		// Pull every image out of the document; the last one seen is kept.
		foreach ($xpath->query('//img') as $node) {

			// Keep only the last path segment of the image URL
			$imgSplit = explode('/', $node->getAttribute('src'));
			$contentImage = $imgSplit[count($imgSplit) - 1];

			// Replace spaces with '-'
			$contentImage = str_replace('%20', '-', $contentImage);
			$contentImage = str_replace('.html', '', $contentImage);

			$parent = $node->parentNode;

			// Sometimes images we want are surrounded by an 'em' tag containing the caption..
			if ($parent->tagName === 'em') {

				// Save the caption for later
				$contentImageCaption = decodeText($parent->textContent);

				// Remove the 'em' and 'img' tags. The 'em' is already detached
				// when an earlier image inside it triggered its removal.
				if ($parent->parentNode !== null) {
					$parent->parentNode->removeChild($parent);
				}
			} else {

				// Remove the 'img' tags
				$parent->removeChild($node);
			}
		}

		// Prefer the first paragraph when it contains an 'em'; otherwise fall
		// back to the page's meta description.
		$description = getValue($xpath->query($articleTable . '//p[1][em]'), 'textContent');

		if (!$description) {
			$description = getValue($xpath->query('/html/head/meta[@name="description"]/@content'));
		}

		$content = getHTML($xpath->query($articleTable . '/tr[2]/td'));

		// Keep only basic inline/paragraph markup
		$content = strip_tags($content, '<br><br/><em><em/><b><b/><p><p/><a><a/><strong><strong/><i><i/>');

		// Remove any empty paragraphs (probably caused by removing img tags).
		// The bars must be bare alternation operators: an escaped bar would
		// only match a literal pipe character.
		$content = preg_replace("#<p>(\s|&nbsp;|</?\s?br\s?/?>)*</?p>#", "", $content);

		// Strip per-line leading/trailing whitespace and all line breaks
		$content = preg_replace('/^\s+|\n|\r|\s+$/m', '', $content);

		//
		// Create some JSON!
		//
		$json[] = array(
			'description' => $description,
			'title' => $title,
			'date' => $date,

			'contentImage' => $contentImage,
			'contentImageCaption' => $contentImageCaption,

			// Commas in the body are backslash-escaped in the output
			'content' => str_replace(',', '\,', $content),

			'category' => $category,
			'type' => $type,

			'issue' => $issue,
			'volume' => $volume,
			'number' => $number,
		);
	}

	echo json_encode($json);



	/**
	 * Read one property from the first entry of a node collection.
	 *
	 * Only the first entry is looked at; its property is passed through
	 * decodeText() before being returned.
	 *
	 * @param mixed  $items     Iterable collection of nodes (the callers pass DOMXPath::query() results).
	 * @param string $attribute Name of the property to read from the first node.
	 *
	 * @return mixed The decoded property value, or null when the collection is empty.
	 */
	function getValue($items, $attribute = 'value') {
		foreach ($items as $node) {
			$raw = $node->{$attribute};

			return decodeText($raw);
		}

		// Nothing to iterate over
		return null;
	}

	/**
	 * Serialise the first node of a collection to an HTML string.
	 *
	 * The markup is produced by the node's owner document and then passed
	 * through decodeText(). Any further nodes in the collection are ignored.
	 *
	 * @param mixed $content Iterable collection of DOM nodes (the callers pass DOMXPath::query() results).
	 *
	 * @return mixed The decoded HTML of the first node, or null when the collection is empty.
	 */
	function getHTML($content) {
		foreach ($content as $element) {
			$document = $element->ownerDocument;
			$markup = $document->saveHTML($element);

			return decodeText($markup);
		}

		// Nothing to iterate over
		return null;
	}

	/**
	 * Entity-encode a piece of text and flatten its typographic punctuation.
	 *
	 * The input is treated as UTF-8 and converted with mbstring's
	 * 'HTML-ENTITIES' target; the result is handed to convert_smart_quotes().
	 *
	 * NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of
	 * PHP 8.2 - confirm the PHP version this script is expected to run on.
	 *
	 * @param mixed $text Text to convert.
	 *
	 * @return mixed The converted text.
	 */
	function decodeText($text) {
		$encoded = mb_convert_encoding($text, 'HTML-ENTITIES', 'UTF-8');

		return convert_smart_quotes($encoded);
	}

	/**
	 * Replace curly quotes and em dashes with plain ASCII characters.
	 *
	 * Both the HTML-entity spellings (&lsquo; &rsquo; &ldquo; &rdquo; &mdash;)
	 * and the literal UTF-8 characters are handled. The entity spellings are
	 * the ones that matter for decodeText(), which entity-encodes its input
	 * before calling this function.
	 *
	 * @param mixed $string Text to normalise.
	 *
	 * @return mixed The text with curly quotes replaced by ' or " and em dashes by -.
	 */
	function convert_smart_quotes($string)
	{
		// search => ASCII replacement
		$map = array(
			'&lsquo;' => "'",
			'&rsquo;' => "'",
			'&ldquo;' => '"',
			'&rdquo;' => '"',
			'&mdash;' => '-',
			'‘' => "'",
			'’' => "'",
			'“' => '"',
			'”' => '"',
			'—' => '-',
		);

		return str_replace(array_keys($map), array_values($map), $string);
	}