Skip to content

Instantly share code, notes, and snippets.

@Bertware
Created March 21, 2018 13:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Bertware/12b22879c89d6aa8ab361d856ce56ebb to your computer and use it in GitHub Desktop.
Save Bertware/12b22879c89d6aa8ab361d856ce56ebb to your computer and use it in GitHub Desktop.
Quick-and-dirty PHP script to scrape information on all available episodes from vrtnu.be. It creates a JSON representation for each page, and a list which puts everything together.
<?php
/**
 * Writes $data to $file, creating any missing parent directories first.
 *
 * Leading slashes are stripped from $file, so the target is always resolved
 * relative to the current working directory.
 *
 * @param string $file Target path; its directories are created when absent.
 * @param mixed  $data Payload handed to file_put_contents() unchanged.
 */
function file_put_contents_rec($file, $data)
{
    $file = ltrim($file, '/');
    // dirname() yields "." for a bare file name, so no mkdir() on an empty
    // path is attempted when $file has no directory component.
    $folder = dirname($file);
    if ($folder !== '' && !is_dir($folder)) {
        // dir doesn't exist, make it (including missing parents)
        mkdir($folder, 0777, true);
    }
    file_put_contents($file, $data);
}
// schedule: https://www.vrt.be/bin/epg/schedule.json
// live: https://services.vrt.be/videoplayer/r/live.json
// jsonp!

// ---------------------------------------------------------------------------
// Pass 1: scrape the A-Z overview page into $data (one entry per glossary
// section, each holding a list of programs) and write it to vrtnu/a-z.json.
// $totalPrograms counts every program that was found.
// ---------------------------------------------------------------------------
$html = file_get_contents("https://www.vrt.be/vrtnu/a-z/");
if ($html === false) {
    // Nothing can be scraped without the overview page; stop here instead of
    // writing empty JSON files further down.
    throw new RuntimeException("Unable to download https://www.vrt.be/vrtnu/a-z/");
}
$tidy = tidy_parse_string($html);
$tidy->cleanRepair();
$xmldoc = new DOMDocument();
// libxml warnings raised while parsing the page are deliberately silenced.
@$xmldoc->loadHTML($tidy->html()->value);
$xPath = new DOMXPath($xmldoc);

$xPath_vrtglossary_group = "/html/body/div[@class='main']/main/div/div[@class='vrtglossary__groups']/div";
$xPath_vrtglossary_header_rel = "./div[@class='vrtglossary__group__title__letter']/h2/text()"; // use text content
$xPath_vrtglossary_item_rel = "./div/ul/li/a";
$xPath_item_imgurl = "./div[@class='tile__image-wrapper']/div/picture/source/@srcset";
$xPath_item_title = "./div[@class='tile__content-wrapper']/h3/text()"; // use text content
$xPath_item_description = "./div[@class='tile__content-wrapper']/div[@class='tile__description']"; // use text content

// Text content of the first node matching $expression below $context, or an
// empty string when nothing matches (avoids dereferencing a missing node).
$firstNodeText = function (DOMXPath $xPath, $expression, DOMNode $context) {
    $nodes = $xPath->query($expression, $context);
    if ($nodes === false || $nodes->length === 0) {
        return '';
    }
    return $nodes->item(0)->textContent;
};

$data = [];
$totalPrograms = 0;
foreach ($xPath->query($xPath_vrtglossary_group) as $sectionElement) {
    $sectionData = [];
    foreach ($xPath->query($xPath_vrtglossary_item_rel, $sectionElement) as $programElement) {
        $href = $programElement->getAttribute('href');

        $itemData = [];
        $itemData['img'] = [];
        // srcset is a comma separated list of "<url> <descriptor>" candidates;
        // the images are stored keyed by descriptor.
        $srcset = $firstNodeText($xPath, $xPath_item_imgurl, $programElement);
        foreach (explode(',', $srcset) as $candidate) {
            $parts = array_values(array_filter(explode(' ', trim($candidate)), 'strlen'));
            if (count($parts) === 0) {
                // empty candidate (missing srcset or stray comma)
                continue;
            }
            $imgUrl = $parts[0];
            if (strpos($imgUrl, '//') === 0) {
                // protocol-relative URL
                $imgUrl = "https:" . $imgUrl;
            } elseif (strpos($imgUrl, '/') === 0) {
                // root-relative URL, resolve against the site
                $imgUrl = "https://www.vrt.be" . $imgUrl;
            }
            // A candidate without descriptor counts as 1x per the HTML srcset rules.
            $descriptor = isset($parts[1]) ? $parts[1] : '1x';
            $itemData['img'][$descriptor] = $imgUrl;
        }
        $itemData['title'] = trim($firstNodeText($xPath, $xPath_item_title, $programElement));
        $itemData['description'] = trim($firstNodeText($xPath, $xPath_item_description, $programElement));
        // The ".relevant/" suffix of the link is swapped for ".json"; the result
        // is used later on as the path of the per-program JSON file.
        $itemData['api']['details'] = str_replace(".relevant/", ".json", $href);
        $itemData['url'] = "https://www.vrt.be" . $href;
        $itemData['relative'] = $href;

        $sectionData[] = $itemData;
        $totalPrograms++;
    }
    // Sections are keyed by the text of their header.
    $data[$firstNodeText($xPath, $xPath_vrtglossary_header_rel, $sectionElement)] = $sectionData;
}
file_put_contents_rec("vrtnu/a-z.json", json_encode($data, JSON_UNESCAPED_SLASHES | JSON_NUMERIC_CHECK | JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE));
// ---------------------------------------------------------------------------
// Pass 2: for every program in $data, collect its episodes, scrape each
// episode page for details and write one JSON file per episode, one per
// program, and finally vrtnu/list.json which holds all episodes per program.
// ---------------------------------------------------------------------------
$programs = [];
$i = 0;
$jsonFlags = JSON_UNESCAPED_SLASHES | JSON_NUMERIC_CHECK | JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE;

// All other episodes (links in the episode list of a program page)
$xPath_episodes = "/html/body/div[@class='main']/main/div/div[@class='episodeslist']/div[@class='list']/div/div[@class='vrtlist__body']/div/ul/li/a";
$xPath_episodeDate = "./div[@class='tile__content-wrapper']/div/div/span[@class='tile__broadcastdate--mobile']";
// Details container of an episode page; the expressions below are relative to it.
$xPath_details = "/html/body/div[@class='main']/main/div/div[@class='content-container']/div[@class='content']";
$xPath_episodeTitle = "./div[@class='content__container']/span[@class='content__title--episode']";
$xPath_episodeShortDesc = "./div[@class='content__container']/span[@class='content__shortdescription']";
$xPath_episodeDesc = "./div[@class='content__description']";
$xPath_available = "./div[@class='content__container']/div[@class='content__metadata']/div[@class='content__availability']/span";
$xPath_episodeNumber = "./div[@class='content__container']/div[@class='content__metadata']/div[@class='content__metadata--main']/div[@class='content__episode']/span";
$xPath_duration = "./div[@class='content__container']/div[@class='content__metadata']/div[@class='content__metadata--main']/time";
$xPath_country = "./div[@class='content__container']/div[@class='content__metadata']/div[@class='content__region']";
$xPath_categories = "./div[@class='content__container']/ul[@class='content__categories']/li/a";

// Downloads $url, repairs the markup with tidy and returns a DOMXPath on the
// parsed document, or null when the page could not be downloaded or parsed.
$fetchXPath = function ($url) {
    $html = file_get_contents($url);
    if ($html === false) {
        return null;
    }
    $tidy = tidy_parse_string($html);
    if ($tidy === false) {
        return null;
    }
    $tidy->cleanRepair();
    $root = $tidy->html();
    if ($root === null || (string) $root->value === '') {
        return null;
    }
    $document = new DOMDocument();
    // libxml warnings raised while parsing the page are deliberately silenced.
    @$document->loadHTML($root->value);
    return new DOMXPath($document);
};

// Text content of the first node matching $expression (relative to $context
// when given), or null when nothing matches.
$textOrNull = function (DOMXPath $xPath, $expression, $context = null) {
    $nodes = $xPath->query($expression, $context);
    if ($nodes === false || $nodes->length === 0) {
        return null;
    }
    return $nodes->item(0)->textContent;
};

foreach ($data as $sectionPrograms) {
    foreach ($sectionPrograms as $program) {
        $rawEpisodes = [];

        $programXPath = $fetchXPath($program['url']);
        if ($programXPath !== null) {
            $episodeElements = $programXPath->query($xPath_episodes) ?: [];
            foreach ($episodeElements as $episodeElement) {
                // Start from an empty array so nothing leaks in from a previous episode.
                $rawEpisode = [];
                $rawEpisode['relative'] = $episodeElement->getAttribute('href');
                $rawEpisode['url'] = "https://www.vrt.be" . $rawEpisode['relative'];
                // replace the trailing slash of an episode url with ".mssecurevideo.json" to get json info. However, this requires an account, and accepting the terms (which we kinda can't)
                // just parse for now and let the end user sign in
                $rawEpisode['stream'] = rtrim($rawEpisode['url'], '/') . '.mssecurevideo.json';
                $rawEpisode['date'] = $textOrNull($programXPath, $xPath_episodeDate, $episodeElement);
                // The id is the last path segment of the episode url.
                $rawEpisode['id'] = basename($rawEpisode['url']);
                $rawEpisodes[] = $rawEpisode;
            }
        }

        // fallback when no list is shown: follow redirects from the program url
        // and treat the url that is finally reached as the only episode
        if (count($rawEpisodes) === 0) {
            $ch = curl_init($program['url']);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_exec($ch);
            $fullUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
            curl_close($ch);

            if (is_string($fullUrl) && $fullUrl !== '') {
                // Take the path from the url itself rather than cutting off a
                // fixed number of characters for the host.
                $path = parse_url($fullUrl, PHP_URL_PATH);
                $rawEpisode = [];
                $rawEpisode['relative'] = is_string($path) ? $path : '';
                $rawEpisode['url'] = $fullUrl;
                // same ".mssecurevideo.json" convention as for listed episodes
                $rawEpisode['stream'] = rtrim($rawEpisode['url'], '/') . '.mssecurevideo.json';
                $rawEpisode['id'] = basename($rawEpisode['url']);
                $rawEpisodes[] = $rawEpisode;
            }
        }

        foreach ($rawEpisodes as $rawEpisode) {
            // Details come from the episode's own page, not from the program page.
            $episodeXPath = $fetchXPath($rawEpisode['url']);
            if ($episodeXPath === null) {
                print "[" . date("H:i:s") . "] could not load " . $rawEpisode['url'] . ", skipping" . PHP_EOL;
                continue;
            }
            // May be null when the details container is absent; the relative
            // queries below then simply match nothing.
            $detailsNode = $episodeXPath->query($xPath_details)->item(0);

            $episodeDetails = $rawEpisode;
            $episodeTitle = $textOrNull($episodeXPath, $xPath_episodeTitle, $detailsNode);
            // Episodes without a title of their own get the program title.
            $episodeDetails['title'] = $episodeTitle !== null ? $episodeTitle : $program['title'];
            $episodeDetails['tagline'] = $textOrNull($episodeXPath, $xPath_episodeShortDesc, $detailsNode);
            $episodeDetails['description'] = trim((string) $textOrNull($episodeXPath, $xPath_episodeDesc, $detailsNode));
            $episodeDetails['available_until'] = trim((string) $textOrNull($episodeXPath, $xPath_available, $detailsNode));
            $episodeDetails['duration'] = trim((string) $textOrNull($episodeXPath, $xPath_duration, $detailsNode));
            $episodeDetails['restrictions'] = trim((string) $textOrNull($episodeXPath, $xPath_country, $detailsNode));
            $episodeDetails['number'] = trim((string) $textOrNull($episodeXPath, $xPath_episodeNumber, $detailsNode));
            $episodeDetails['categories'] = [];
            foreach ($episodeXPath->query($xPath_categories, $detailsNode) ?: [] as $category) {
                $episodeDetails['categories'][] = $category->textContent;
            }

            $programs[$program['title']]['episodes'][] = $episodeDetails;
            file_put_contents_rec(rtrim($episodeDetails['relative'], '/') . '.json', json_encode($episodeDetails, $jsonFlags));
        }

        // Count the program before reporting so the progress ends at 100%.
        $i++;
        print "[" . date("H:i:s") . "] " . $i . "/" . $totalPrograms . " " . round(100 * $i / $totalPrograms) . "% " . $program['title'] . PHP_EOL;
        file_put_contents_rec($program['api']['details'], json_encode($program, $jsonFlags));
    }
}
file_put_contents_rec('vrtnu/list.json', json_encode($programs, $jsonFlags));
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment