Created
May 16, 2017 19:09
-
-
Save engram-design/e4133a41613197dfa494d197059a9dfc to your computer and use it in GitHub Desktop.
Parse Feed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Scrape every HTML file found in $folder and print the collected article
// fields as a JSON array (one element per file).
//
// NOTE(review): $folder, $issues, $articleTable, $title, $date, $category and
// $type are read below but never assigned in this part of the file; presumably
// they are defined before this point - confirm.

// Turn off all error reporting
error_reporting(0);

$dir = new DirectoryIterator($folder);
$json = array();

foreach ($dir as $index => $fileinfo) {
    // Only regular files can be scraped: this skips '.', '..' and sub-directories.
    if (!$fileinfo->isFile()) {
        continue;
    }

    $filename = $fileinfo->getFilename();

    // The first 7 characters of the file name are the lookup key into $issues;
    // the slug starts at offset 8 (the character at offset 7 is skipped).
    $issueKey = substr($filename, 0, 7);
    $slug = str_replace('.html', '', substr($filename, 8));

    // Unknown issue keys yield null for all three fields.
    $issueInfo = isset($issues[$issueKey]) ? $issues[$issueKey] : array();
    $issue = isset($issueInfo['issue']) ? $issueInfo['issue'] : null;
    $volume = isset($issueInfo['vol']) ? $issueInfo['vol'] : null;
    $number = isset($issueInfo['number']) ? $issueInfo['number'] : null;

    $html = file_get_contents($folder . $filename);
    if ($html === false || $html === '') {
        // Unreadable or empty file: there is nothing for loadHTML() to parse.
        continue;
    }

    // Start scraping!
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    $xpath = new DOMXPath($doc);

    // Reset per file so an article without an image (or without a caption)
    // does not inherit the values found in the previously processed file.
    $contentImage = null;
    $contentImageCaption = null;

    // Start working on the (atrocious) content
    foreach ($xpath->query('//img') as $key => $node) {
        // Save the image for later: keep only the last path segment of 'src'
        $imgSplit = explode('/', $node->getAttribute('src'));
        $contentImage = $imgSplit[count($imgSplit) - 1];

        // Replace spaces with '-'
        $contentImage = str_replace('%20', '-', $contentImage);
        $contentImage = str_replace('.html', '', $contentImage);

        $parent = $node->parentNode;

        // Sometimes images we want are surrounded by an 'em' tag containing the caption..
        if ($parent->tagName === 'em') {
            // Save the caption for later
            $contentImageCaption = decodeText($parent->textContent);

            // Remove the 'em' and 'img' tags. When one 'em' wraps several
            // images it has already been detached by an earlier iteration,
            // so its own parent is null and there is nothing left to remove.
            if ($parent->parentNode !== null) {
                $parent->parentNode->removeChild($parent);
            }
        } else {
            // Remove the 'img' tags
            $parent->removeChild($node);
        }
    }

    // Prefer the first emphasised paragraph; fall back to the meta description.
    $description = getValue($xpath->query($articleTable . '//p[1][em]'), 'textContent');
    if (!$description) {
        $description = getValue($xpath->query('/html/head/meta[@name="description"]/@content'));
    }

    // getHTML() returns null when the XPath matches nothing; normalise to a string.
    $content = (string) getHTML($xpath->query($articleTable . '/tr[2]/td'));

    // Now remove all images from our content - plus any other nasty tags
    $content = strip_tags($content, '<br><br/><em><em/><b><b/><p><p/><a><a/><strong><strong/><i><i/>');

    // Remove any empty tags (probably caused by removing img tags).
    // getHTML() returns entity-encoded markup, so a non-breaking space arrives
    // here as the '&nbsp;' entity and has to be matched in that form as well.
    $content = preg_replace("#<p>(\s| |&nbsp;|</?\s?br\s?/?>)*</?p>#", "", $content);

    // Trim every line and drop all line breaks
    $content = preg_replace('/^\s+|\n|\r|\s+$/m', '', $content);

    //
    // Create some JSON!
    //
    $json[] = array(
        'description' => $description,
        'title' => $title,
        'date' => $date,
        'contentImage' => $contentImage,
        'contentImageCaption' => $contentImageCaption,
        // Commas are backslash-escaped in the exported content
        'content' => str_replace(',', '\,', $content),
        'category' => $category,
        'type' => $type,
        'issue' => $issue,
        'volume' => $volume,
        'number' => $number,
    );
}

echo json_encode($json);
/**
 * Return one decoded property of the first item in a list.
 *
 * @param iterable $items     Items to inspect (the script passes DOMXPath::query() results).
 * @param string   $attribute Name of the property to read from the first item.
 * @return string|null The property run through decodeText(), or null when $items is empty.
 */
function getValue($items, $attribute = 'value') {
    foreach ($items as $first) {
        // Only the first item matters; everything after it is ignored.
        $raw = $first->{$attribute};

        return decodeText($raw);
    }

    return null;
}
/**
 * Serialise the first node of a list to HTML and decode the result.
 *
 * @param iterable $content Nodes to inspect (the script passes DOMXPath::query() results).
 * @return string|null Markup of the first node run through decodeText(), or null when the list is empty.
 */
function getHTML($content) {
    foreach ($content as $firstNode) {
        // Serialise through the owning document so that only this node is emitted.
        $markup = $firstNode->ownerDocument->saveHTML($firstNode);

        return decodeText($markup);
    }

    return null;
}
/**
 * Convert UTF-8 text to its HTML-entity form, then normalise smart quotes.
 *
 * NOTE(review): the 'HTML-ENTITIES' target of mb_convert_encoding() is
 * deprecated as of PHP 8.2; it is kept here because its output is what the
 * rest of the script is written against.
 *
 * @param string $text UTF-8 encoded text.
 * @return string Entity-encoded text after convert_smart_quotes() has been applied.
 */
function decodeText($text) {
    $entityEncoded = mb_convert_encoding($text, 'HTML-ENTITIES', 'UTF-8');

    return convert_smart_quotes($entityEncoded);
}
/**
 * Replace typographic quotes and em dashes with plain ASCII equivalents.
 *
 * Both the raw UTF-8 characters and their named HTML entities are recognised:
 * decodeText() calls this function on text that has already been converted to
 * HTML entities, where the raw characters no longer occur.
 *
 * @param string $string Text to normalise.
 * @return string Text with curly single/double quotes turned into ' and ",
 *                and em dashes turned into -.
 */
function convert_smart_quotes($string)
{
    $map = array(
        // Raw UTF-8 characters, written as byte escapes so the source file's
        // own encoding cannot alter them.
        "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
        "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
        "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
        "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
        "\xE2\x80\x94" => '-', // U+2014 em dash
        // Named-entity forms of the same characters
        '&lsquo;' => "'",
        '&rsquo;' => "'",
        '&ldquo;' => '"',
        '&rdquo;' => '"',
        '&mdash;' => '-',
    );

    return str_replace(array_keys($map), array_values($map), $string);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment