Skip to content

Instantly share code, notes, and snippets.

@poizan42
Last active December 30, 2015 13:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save poizan42/7836618 to your computer and use it in GitHub Desktop.
Save poizan42/7836618 to your computer and use it in GitHub Desktop.
mx.dk recent news scraper
<?php
/**
 * Scraper for recent news stories on mx.dk (MetroXpress).
 *
 * All methods are static. Network access goes through file_get_contents()
 * with a stream context built by GetContext(); HTML is parsed with
 * DOMDocument and queried with DOMXPath. Every failure is reported by
 * throwing \Exception.
 */
class MetroXpress
{
    /* Value for the User-Agent request header. When null, no User-Agent
       header is added to the stream context. */
    const userAgent = null;

    /* Lower-case Danish month name => month number. */
    private static $monthMap = array(
        'januar' => 1,
        'februar' => 2,
        'marts' => 3,
        'april' => 4,
        'maj' => 5,
        'juni' => 6,
        'juli' => 7,
        'august' => 8,
        'september' => 9,
        'oktober' => 10,
        'november' => 11,
        'december' => 12
    );

    /**
     * Build the HTTP stream context used for all requests.
     *
     * @param string $method HTTP method to use.
     * @return resource Stream context; carries a User-Agent header only when
     *                  self::userAgent is not null.
     */
    private static function GetContext($method = 'GET')
    {
        $httpOpts = array('method' => $method);
        if (self::userAgent !== null) {
            $httpOpts['header'] = 'User-Agent: ' . self::userAgent;
        }
        return stream_context_create(array('http' => $httpOpts));
    }

    /**
     * Build an XPath predicate that matches elements whose space-separated
     * class attribute contains $class as a whole token.
     *
     * $class is inserted verbatim into a double-quoted XPath string, so it
     * must not contain a double quote.
     *
     * @param string $class Class name to match.
     * @return string Predicate including the surrounding square brackets.
     */
    private static function GetClassSelector($class)
    {
        return '[contains(concat(" ", normalize-space(@class), " "), " ' . $class . ' ")]';
    }

    /**
     * Extract the first date/time found in a byline string.
     *
     * Recognised formats (the first one that matches wins):
     *   - "Af Michael Bo Mortensen - 01/01-14 18:54"  (dd/mm-yy h:mm, year is 2000 + yy)
     *   - "Af Bo Poulsen - 06. december 2013 22:28; Opdateret: 06.12.2013 22:47"
     *     (dd. <Danish month name> yyyy h:mm; the month name is matched
     *     case-insensitively)
     *
     * @param string $date Text containing the date.
     * @return \DateTime The parsed time in the Europe/Copenhagen time zone.
     * @throws \Exception When no supported date format is found.
     */
    private static function ParseDate($date)
    {
        if (preg_match('@(\d\d)/(\d\d)-(\d\d)\s+(\d?\d):(\d\d)@', $date, $matches)) {
            list(, $day, $month, $yearShort, $hour, $min) = $matches;
            $year = 2000 + (int) $yearShort;
        } else {
            $monthMatch = '(' . implode('|', array_keys(self::$monthMap)) . ')';
            $dateMatch = '/(\d\d)\.\s+' . $monthMatch . '\s+(\d\d\d\d)\s+(\d?\d):(\d\d)/i';
            if (!preg_match($dateMatch, $date, $matches)) {
                throw new \Exception("Failed parsing date '$date'");
            }
            list(, $day, $monthName, $year, $hour, $min) = $matches;
            // The pattern is case-insensitive, the map keys are lower-case.
            $monthKey = strtolower($monthName);
            if (!isset(self::$monthMap[$monthKey])) {
                throw new \Exception("Failed parsing month '$monthName'");
            }
            $month = self::$monthMap[$monthKey];
        }

        // Zero-padded ISO 8601 is unambiguous for the DateTime parser.
        $iso = sprintf('%04d-%02d-%02dT%02d:%02d:00', $year, $month, $day, $hour, $min);
        return new \DateTime($iso, new \DateTimeZone('Europe/Copenhagen'));
    }

    /**
     * Download a page and return an XPath evaluator over its parsed HTML.
     *
     * libxml parse warnings are collected internally and discarded; the
     * previous libxml error-handling mode is restored afterwards.
     *
     * @param string $url            Address of the page to fetch.
     * @param string $failureMessage Exception message used when the download fails.
     * @return \DOMXPath
     * @throws \Exception When the download fails or the response body is empty.
     */
    private static function LoadXPath($url, $failureMessage)
    {
        $html = file_get_contents($url, false, self::GetContext());
        // An empty body cannot be parsed (loadHTML raises ValueError on PHP 8),
        // so it is reported the same way as a failed download.
        if ($html === false || $html === '') {
            throw new \Exception($failureMessage);
        }

        $dom = new \DOMDocument();
        $previousMode = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousMode);

        return new \DOMXPath($dom);
    }

    /**
     * Run an XPath query and return its first result.
     *
     * @param \DOMXPath     $xPath          Evaluator to query.
     * @param string        $expression     XPath expression.
     * @param string        $missingMessage Exception message used when nothing matches.
     * @param \DOMNode|null $contextNode    Node the expression is relative to, or
     *                                      null to evaluate against the document.
     * @return \DOMNode The first matching node.
     * @throws \Exception When the query fails or matches nothing.
     */
    private static function QueryFirst($xPath, $expression, $missingMessage, $contextNode = null)
    {
        $nodes = ($contextNode === null)
            ? $xPath->query($expression)
            : $xPath->query($expression, $contextNode);
        if ($nodes === false || $nodes->length == 0) {
            throw new \Exception($missingMessage);
        }
        return $nodes->item(0);
    }

    /**
     * Get a single news item from MetroXpress.
     *
     * @param string $section e.g. 'nyheder/kobenhavn', 'sport/sportsnyheder' or 'nyheder/danmark'
     * @param string $id      The id of the news item.
     * @return array Associative array with the keys 'id', 'section', 'title',
     *               'pubDate' ("D, d M Y H:i:s GMT", converted to UTC), 'link'
     *               and 'description'.
     * @throws \Exception When the page cannot be fetched, an expected element
     *                    is missing, or the published date cannot be parsed.
     */
    public static function GetNewsItem($section, $id)
    {
        $link = "http://www.mx.dk/$section/story/$id";
        $xPath = self::LoadXPath($link, 'Receiving news item failed');

        $storyHeadSelector = 'div' . self::GetClassSelector('story_head');
        $storyTitlesSelector = 'div' . self::GetClassSelector('story_titles');
        $storyTitlesNode = self::QueryFirst(
            $xPath,
            "//$storyHeadSelector/$storyTitlesSelector",
            'Missing story_titles div'
        );

        $publishedNode = self::QueryFirst(
            $xPath,
            'div' . self::GetClassSelector('published'),
            'Missing published date div',
            $storyTitlesNode
        );
        $publishedDate = self::ParseDate($publishedNode->nodeValue);
        $publishedDate->setTimezone(new \DateTimeZone('UTC'));
        $pubDate = $publishedDate->format('D, d M Y H:i:s') . ' GMT';

        $titleNode = self::QueryFirst($xPath, 'h1', 'Missing title', $storyTitlesNode);
        $title = trim($titleNode->nodeValue);

        $descriptionNode = self::QueryFirst($xPath, 'h3', 'Missing description', $storyTitlesNode);
        $description = trim($descriptionNode->nodeValue);

        return array(
            'id' => $id,
            'section' => $section,
            'title' => $title,
            'pubDate' => $pubDate,
            'link' => $link,
            'description' => $description
        );
    }

    /**
     * Get section and id of the news entries linked from the "Seneste nyt"
     * teaser on http://www.mx.dk/nyheder/.
     *
     * The number of entries is whatever the page links to (the original
     * author's comment says 9 — not enforced here).
     *
     * @return array List of arrays with the keys 'section' and 'id'.
     * @throws \Exception When the page cannot be fetched, the teaser links are
     *                    missing, or a link does not look like
     *                    /<word>/<word>/story/<digits>.
     */
    public static function GetLatestNewsIds()
    {
        $xPath = self::LoadXPath('http://www.mx.dk/nyheder/', 'Receiving latest news failed');

        $teaserSelector = 'div' . self::GetClassSelector('teaser');
        $galleryBarSelector = 'div' . self::GetClassSelector('gallerybar');
        $latestNewsNodeList = $xPath->query(
            '//' . $teaserSelector . '[.//*[text()="Seneste nyt"]]/'
            . $galleryBarSelector . '//a/@href'
        );
        if ($latestNewsNodeList === false || $latestNewsNodeList->length == 0) {
            throw new \Exception('Missing latest news');
        }

        $newsIds = array();
        foreach ($latestNewsNodeList as $href) {
            if (!preg_match('@/(\w+)/(\w+)/story/(\d+)@', $href->value, $matches)) {
                throw new \Exception("Could not parse url '{$href->value}'");
            }
            $newsIds[] = array(
                'section' => $matches[1] . '/' . $matches[2],
                'id' => $matches[3]
            );
        }
        return $newsIds;
    }

    /**
     * Get the news entries for every id returned by GetLatestNewsIds().
     *
     * A failure while loading one item does not abort the others: that entry
     * is returned as an array with the keys 'id', 'section' and 'error'
     * (the caught \Exception) instead of the normal item array.
     *
     * @return array List of item arrays (see GetNewsItem) or error arrays.
     * @throws \Exception When the list of latest ids itself cannot be retrieved.
     */
    public static function GetLatestNews()
    {
        $newsEntries = array();
        foreach (self::GetLatestNewsIds() as $newsId) {
            try {
                $newsEntries[] = self::GetNewsItem($newsId['section'], $newsId['id']);
            } catch (\Exception $e) {
                $newsEntries[] = array(
                    'id' => $newsId['id'],
                    'section' => $newsId['section'],
                    'error' => $e
                );
            }
        }
        return $newsEntries;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment