Skip to content

Instantly share code, notes, and snippets.

@RdeWilde
Created August 1, 2010 17:31
Show Gist options
  • Save RdeWilde/503555 to your computer and use it in GitHub Desktop.
Save RdeWilde/503555 to your computer and use it in GitHub Desktop.
<?php
/**
 * Downloads a passage page from biblija.net, extracts the paragraph blocks
 * from the text cell(s) and prints every non-empty line as an array of
 * [first token, remainder] via print_r().
 *
 * NOTE(review): the first token is presumably the verse number and the
 * remainder the verse text - confirm against the live markup.
 *
 * Throws Exception when the page cannot be fetched, when the HTML cannot be
 * parsed, or when the XPath query is rejected.
 */

$sUrl = 'http://www.biblija.net/biblija.cgi?m=lc1,1-10&id42=0&l=nl';

// An empty body is treated as a failure too, exactly like the original
// truthiness check did.
$sContent = file_get_contents($sUrl);
if ($sContent === false || $sContent === '') {
    throw new Exception('Tekst kon niet geladen worden');
}

// Normalise to UTF-8. mb_detect_encoding() may return false; never hand that
// to mb_convert_encoding(), fall back to UTF-8 instead.
$sEncoding = mb_detect_encoding($sContent, array('UTF-8', 'ISO-8859-1'), true);
if ($sEncoding === false) {
    $sEncoding = 'UTF-8';
}
$sContent = mb_convert_encoding($sContent, 'UTF-8', $sEncoding);

// DOMDocument::loadHTML() assumes ISO-8859-1 unless told otherwise, so every
// non-ASCII character is turned into a numeric entity first. This replaces the
// deprecated 'html-entities' pseudo-encoding of mb_convert_encoding().
$sContent = mb_encode_numericentity($sContent, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

// Collect libxml parse warnings internally instead of letting them leak into
// the script output; the previous setting is restored afterwards.
$oDOM = new DOMDocument('1.0');
$bPreviousErrorMode = libxml_use_internal_errors(true);
$bLoaded = $oDOM->loadHTML($sContent);
libxml_clear_errors();
libxml_use_internal_errors($bPreviousErrorMode);
if (!$bLoaded) {
    throw new Exception('Kon tekst niet parsen');
}

// `new DOMXPath()` never evaluates to false, so the failure check belongs on
// the query result, which is false for a malformed expression.
$oQry = new DOMXPath($oDOM);
$oTexts = $oQry->query('//td[@class = "text"]');
if ($oTexts === false) {
    throw new Exception('Could not parse XPath query');
}

// Each matched cell is an element containing the content.
foreach ($oTexts as $oText) {
    $oParas = $oQry->query('div', $oText);

    // Loop through all parts found.
    foreach ($oParas as $oPara) {
        // Which sort of paragraph do we have?
        switch ($oPara->getAttribute('class')) {
            case 'm':
            case 'p':
                // \v = vertical whitespace. The `u` modifier is essential:
                // textContent is UTF-8, and in byte mode \v matches 0x85,
                // which would cut multi-byte characters in half.
                $aVerses = preg_split('/\v/u', $oPara->textContent);
                if ($aVerses === false) {
                    break;
                }

                foreach ($aVerses as $sVerse) {
                    if (trim($sVerse) != '') {
                        // \h = horizontal whitespace, limit 2 => [token, rest].
                        // With `u`, a non-breaking space (C2 A0) is matched as
                        // one character. In byte mode only the 0xA0 byte
                        // matched, leaving a stray 0xC2 ("Â" in Latin-1) that
                        // the old code then stripped with a second regex.
                        $aResult = preg_split('/\h+/u', $sVerse, 2);
                        print_r($aResult);
                    }
                }
                break;

            default:
                // Other paragraph types are deliberately ignored.
        }
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment