Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save engram-design/e4133a41613197dfa494d197059a9dfc to your computer and use it in GitHub Desktop.
Save engram-design/e4133a41613197dfa494d197059a9dfc to your computer and use it in GitHub Desktop.
Parse Feed
<?php
// Scrapes every HTML file found in $folder and prints the extracted articles
// as a single JSON array.
//
// NOTE(review): $folder, $issues, $articleTable, $title, $date, $category and
// $type are read below but never assigned in the visible code - presumably they
// are defined in a part of the script that is not shown here; confirm before
// running.

// Turn off all error reporting
error_reporting(0);

$dir = new DirectoryIterator($folder);
$json = array();

foreach ($dir as $fileinfo) {
    // Skip '.' / '..' and anything else that is not a regular file (e.g. sub-folders)
    if ($fileinfo->isDot() || !$fileinfo->isFile()) {
        continue;
    }

    $filename = $fileinfo->getFilename();

    // The first 7 characters of the file name are the key into $issues; the
    // rest of the name from offset 8 onwards (minus '.html') is the slug.
    $issueKey = substr($filename, 0, 7);
    $slug = str_replace('.html', '', substr($filename, 8));

    $issue = $issues[$issueKey]['issue'];
    $volume = $issues[$issueKey]['vol'];
    $number = $issues[$issueKey]['number'];

    // Use the iterator's own path so the read does not depend on $folder
    // ending with a directory separator.
    $html = file_get_contents($fileinfo->getPathname());

    // An unreadable or empty file cannot be parsed (loadHTML() rejects an
    // empty string), so skip it rather than abort the whole export.
    if ($html === false || $html === '') {
        continue;
    }

    // Start scraping!
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    $xpath = new DOMXPath($doc);

    // Reset per file, otherwise a document without an image (or without a
    // caption) would inherit the values found in the previous file.
    $contentImage = null;
    $contentImageCaption = null;

    // Start working on the (atrocious) content
    foreach ($xpath->query('//img') as $node) {
        // Save the image for later: keep only the last path segment of 'src'.
        // Every <img> overwrites the previous one, so the last image wins.
        $imgSplit = explode('/', $node->getAttribute('src'));
        $contentImage = $imgSplit[count($imgSplit) - 1];

        // Replace (URL-encoded) spaces with '-'
        $contentImage = str_replace('%20', '-', $contentImage);
        $contentImage = str_replace('.html', '', $contentImage);

        // Sometimes images we want are surrounded by an 'em' tag containing the caption..
        if ($node->parentNode->tagName === 'em') {
            // Save the caption for later
            $contentImageCaption = decodeText($node->parentNode->textContent);

            // Remove the 'em' and 'img' tags
            $node->parentNode->parentNode->removeChild($node->parentNode);
        } else {
            // Remove the 'img' tags
            $node->parentNode->removeChild($node);
        }
    }

    // Prefer the first paragraph that contains an <em>; fall back to the
    // page's meta description when there is none.
    $description = getValue($xpath->query($articleTable . '//p[1][em]'), 'textContent');

    if (!$description) {
        $description = getValue($xpath->query('/html/head/meta[@name="description"]/@content'));
    }

    $content = getHTML($xpath->query($articleTable . '/tr[2]/td'));

    // Now remove all images from our content - plus any other nasty tags
    $content = strip_tags($content, '<br><br/><em><em/><b><b/><p><p/><a><a/><strong><strong/><i><i/>');

    // Remove any empty tags (probably caused by removing img tags)
    $content = preg_replace("#<p>(\s|&nbsp;|</?\s?br\s?/?>)*</?p>#", "", $content);

    // Strip leading/trailing whitespace on every line and drop the line breaks
    $content = preg_replace('/^\s+|\n|\r|\s+$/m', '', $content);

    //
    // Create some JSON!
    //
    $json[] = array(
        'description' => $description,
        'title' => $title,
        'date' => $date,
        'contentImage' => $contentImage,
        'contentImageCaption' => $contentImageCaption,
        // Commas in the content are backslash-escaped
        'content' => str_replace(',', '\,', $content),
        'category' => $category,
        'type' => $type,
        'issue' => $issue,
        'volume' => $volume,
        'number' => $number,
    );
}

echo json_encode($json);
/**
 * Return one property of the first item in a result set, passed through decodeText().
 *
 * @param mixed  $items     Something iterable - in this file the DOMNodeList returned
 *                          by DOMXPath::query(), which is false when the query fails.
 * @param string $attribute Name of the property to read from the first item
 *                          ('value' by default; 'textContent' is also used by callers).
 * @return string|null The decoded property, or null when there is no first item.
 */
function getValue($items, $attribute = 'value') {
    // A failed query yields false; there is nothing to iterate in that case.
    if ($items === false || $items === null) {
        return null;
    }

    // Only the first item is of interest, hence the return inside the loop.
    foreach ($items as $item) {
        return decodeText($item->{$attribute});
    }

    return null;
}
/**
 * Serialise the first node of a result set to HTML, passed through decodeText().
 *
 * @param mixed $content Something iterable whose items are DOM nodes - in this file
 *                       the DOMNodeList returned by DOMXPath::query(), which is false
 *                       when the query fails.
 * @return string|null The decoded markup of the first node (as produced by its owner
 *                     document's saveHTML()), or null when there is no first node.
 */
function getHTML($content) {
    // A failed query yields false; there is nothing to iterate in that case.
    if ($content === false || $content === null) {
        return null;
    }

    // Only the first node is of interest, hence the return inside the loop.
    foreach ($content as $node) {
        return decodeText($node->ownerDocument->saveHTML($node));
    }

    return null;
}
// Re-encode UTF-8 text so that non-ASCII characters become HTML entities, then
// let convert_smart_quotes() turn the curly-quote and em-dash entities into
// plain ASCII characters.
// NOTE(review): the 'HTML-ENTITIES' target encoding is deprecated as of PHP 8.2;
// a replacement will be needed before it is removed.
function decodeText($text) {
    $withEntities = mb_convert_encoding($text, 'HTML-ENTITIES', 'UTF-8');

    return convert_smart_quotes($withEntities);
}
/**
 * Swap typographic HTML entities for their plain ASCII look-alikes.
 *
 * Left/right single quotes become ', left/right double quotes become ",
 * and an em dash becomes a hyphen. Everything else is returned unchanged.
 *
 * @param string $string Text that may contain the entities listed below.
 * @return string The text with those entities replaced.
 */
function convert_smart_quotes($string)
{
    // entity => ASCII replacement
    $asciiFor = array(
        '&lsquo;' => "'",
        '&rsquo;' => "'",
        '&ldquo;' => '"',
        '&rdquo;' => '"',
        '&mdash;' => '-',
    );

    return str_replace(array_keys($asciiFor), array_values($asciiFor), $string);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment