Skip to content

Instantly share code, notes, and snippets.

@rhulse
Created May 13, 2011 21:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rhulse/971350 to your computer and use it in GitHub Desktop.
Save rhulse/971350 to your computer and use it in GitHub Desktop.
Version 1 of the parser
<?php
// name richtextparser.inc
// This code is licensed under the GPL version 2
// It is an example from the blog series rebuilding radionz.co.nz
// http://richardhulse.blogspot.com/
/* VERSION HISTORY
0.1 first version. Parses only W97 NAT sched
0.2 added formatting options
0.3 tweaked to allow parsing of W97 CFM sched
0.4 branch control to split sched / highlight parsing
0.5 new date regex for highlights
0.6 10 April 2003 RLH
added w2000 parsing of files
0.7 13 April 2003 RLH
separated parser and formatter into their own classes.
Parser now returns the type of formatting to be used in the line.
e.g. NONE, BOLD_ALL.
This could be refined if required.
BOLD ALL just means the line starts in BOLD which is good enough for our purposes
The formatter class decides what to do with the line based on that.
0.8 23 April 2003 RLH
Added section CHART to parse CFM classical chart.
strips the whole table back to TR and TD tags with no attributes.
Mercifully this is well formed in W97.
Each line of the table is extracted.
0.9 29 April 2003 RLH
Added cleaner for press releases.
0.91 26 May 2003 RLH
Changed day selector code
0.92 08 July 2003 RLH
Added musicfeatures
0.93 12 August 2003 RLH
changed table parser to replace entities
added "/ë/" to entity list
0.94 28 November 2003 RLH
Added tighter regex test for dates in Music Features docs
---------------------------------
NB: W97 docs are not well formed:
eg. <b><p></b></p>
(The parser should fix this)
W2000 docs ARE well formed but contain a heap of junk.
*/
// Document types: the $mode values understood by RNZ_Doc_Formatter::Process().
define('WHATSON',       1);
define('HIGHLIGHTS',    2);
define('CHART',         3);
define('PRESS',         4);
define('MUSICFEATURES', 5);
define('PLAIN',         6);

// Tag tokens: what the tag scanner reports for each tag it steps over.
define('PARA_OPEN',     1);
define('PARA_CLOSE',    2);
define('BOLD_OPEN',     3);
define('BOLD_CLOSE',    4);
define('ITAL_OPEN',     5);
define('ITAL_CLOSE',    6);
define('UNKNOWN_OPEN',  10);
define('UNKNOWN_CLOSE', 11);

// Line types: how an extracted paragraph is formatted (stored in $para_type).
define('NONE',      1);
define('BOLD_ALL',  2);
define('BOLD_PART', 3);
define('ITAL_ALL',  4);
define('ITAL_PART', 5);
/*
This class extension arrangement is a
bodge instead of polymorphism
think of it thus:
            Parser_BASE
                 |
        -------------------
        |                 |
   PBI_Parser        TABLE_Parser
        |                 |
        -------------------
                 |
        RNZ_Doc_Formatter
*/
/*
Parser_BASE

Holds the HTML buffer being parsed, the read cursor into it, and the
clean-up pass that reduces Word 97 / Word 2000 HTML to a small set of
attribute-free tags. The table and paragraph parsers build on this.
*/
class Parser_BASE
{
    var $html;          // the HTML being parsed (rewritten in place by CleanUpHTML)
    var $posi;          // read cursor: byte offset into $html
    var $length;        // index of the LAST byte of $html, i.e. strlen() - 1
    var $line;          // the paragraph / line most recently extracted
    var $para_type;     // line type constant for $line (NONE, BOLD_ALL, ...)
    var $bold_on = 0;
    var $tagged_line;

    // PHP 8 no longer treats a method named after the class as the
    // constructor, so delegate to it explicitly. Declared first so that
    // the class-named method below is an ordinary method on every version
    // that understands __construct.
    function __construct()
    {
        $this->Parser_BASE();
    }

    // Reset all parser state. Subclasses call this as parent::Parser_BASE().
    function Parser_BASE()
    {
        $this->html      = "";
        $this->posi      = 0;
        $this->length    = 0;
        $this->line      = "";
        $this->para_type = NONE;
    }

    // Load a new document into the buffer.
    //   $html - raw HTML string to parse
    function SetHTML( $html )
    {
        $this->html = $html;
        // $length is the index of the last byte, hence the -1
        $this->length = strlen( $this->html ) - 1;
    }

    // Normalise the buffer: flatten whitespace, drop comments, turn headings
    // into paragraphs, strip every tag not listed in $tags, remove attributes
    // from the p/b/i tags that remain, and delete empty/redundant tags.
    //   $tags - allow-list in strip_tags() format, e.g. "<p><b><i>"
    // Updates $this->html and $this->length; returns nothing.
    function CleanUpHTML( $tags="<p><b><i>" )
    {
        /*
        The order in which tags are processed is highly critical.
        The order here will remove and clean up most everything
        in W97 and W2000 html.
        */

        // initial pre-processing normalises a few things
        $pre_search = array
        (
            "/[\n\r]/",         // new lines (no "|" in the class: it would
                                // match literal pipe characters in the text)
            "/<\![^>]*>/",      // w2000 comments
            "/\s+/",            // extra whitespace
            "/<h[1-6]>/i",      // convert headings (digits only, so <hr> is
                                // not turned into a stray <p>)
            "/<\/h[1-6]>/i"     // + closing tags
        );
        $pre_replace = array
        (
            " ",
            " ",
            " ",
            "<p>",
            "</p>"
        );
        $this->html = preg_replace( $pre_search, $pre_replace, $this->html );

        // strip all tags that are not those we want,
        // including all the w2000 rubbish
        $this->html = strip_tags( $this->html, $tags );

        // general cleanup
        // the tag clean up is for w2000 - well formed
        // but tags are repeated too much
        $search = array
        (
            "/National Radio Programme Listing/i",          // listing text
            "/Details available on www\.radionz\.co\.nz/i", // dots are literal
            "/<([pbi]) [^>]*>/i",   // remove attributes from tags
            "/>\s+</",              // butt tags
            "/<p>&nbsp;<\/p>/i",    // blank paras
            "/<p> &nbsp;<\/p>/i",   // blank paras
            "/<b>&nbsp;<\/b>/i",    // blank bold
            "/<i>&nbsp;<\/i>/i",    // blank italics
            "/<\/b><b>/i",          // clean up bold tags
            "/<b><\/b>/i",          // empty bold tags
            "/<\/i><i>/i",          // same for italics
            "/<i><\/i>/i",          // empty italic tags
            "/<p><\/p>/i",          // blank paras
            "/[\n\r]/"
        );
        $replace = array
        (
            "",
            "",
            "<\\1>",
            "><",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            ""
        );
        $this->html = preg_replace( $search, $replace, $this->html );

        // index of the last byte, hence the -1
        $this->length = strlen( $this->html ) - 1;
    }
}
/*
TABLE_Parser

Reduces an HTML table held in $this->html to one text line per table row,
each line made of attribute-free <td> cells (the first cell of every row
carries class="pos").
*/
class TABLE_Parser extends Parser_BASE
{
    // Runs the table clean-up on whatever is currently in the buffer.
    function TABLE_Parser()
    {
        $this->CleanTable();
    }

    // Strip the buffer down to bare rows and cells, then flatten the rows
    // into newline-separated lines.
    function CleanTable( )
    {
        // generic clean-up first, keeping only cells, rows and paragraphs
        $this->CleanUpHTML( "<td><tr><p>" );

        // table specific clean-up: pattern => replacement, applied in order
        $tag_rules = array
        (
            "/<td [^>]*>/i" => "<td>",      // remove attributes from tags
            "/<tr [^>]*>/i" => "<tr>",      // remove attributes from tags
            "/<td>/i"       => "<td>",
            "/<tr>/i"       => "<tr>",
            "/<\/td>/i"     => "</td>",
            "/<\/tr>/i"     => "</tr>"
        );
        $cleaned = preg_replace( array_keys( $tag_rules ), array_values( $tag_rules ), $this->html );

        // remove any errant P tags
        $cleaned = strip_tags( $cleaned, "<tr><td>" );

        // mark the first cell of each row, then turn row boundaries into
        // newlines and discard the remaining row tags; str_replace() works
        // through the pairs strictly in the order listed
        $find = array( "<tr><td>", "</tr><tr>", "<tr>", "</tr>" );
        $swap = array( "<tr><td class=\"pos\">", "\n", "", "" );
        $this->html = str_replace( $find, $swap, $cleaned );
    }

    // Returns the cleaned table text, one row per line.
    function GetTable()
    {
        return $this->html;
    }
}
/*
PBI_Parser

Paragraph / Bold / Italic parser. Walks the cleaned HTML buffer one
paragraph at a time, repairing the badly nested bold tags that Word 97
emits (e.g. <b><p>...</b></p>) so that each returned line is usable on
its own.
*/
class PBI_Parser extends TABLE_Parser
{
    var $bold_on;
    var $italics_on;

    // PHP 8 no longer treats a method named after the class as the
    // constructor, so delegate to it explicitly.
    function __construct()
    {
        $this->PBI_Parser();
    }

    // Reset the base parser state and the bold/italic flags.
    function PBI_Parser()
    {
        parent::Parser_BASE();
        // these two vars are used to remember the state across many lines
        // for poorly formed word 97 html
        $this->bold_on    = false;
        $this->italics_on = false;
    }

    // Fetch the next paragraph into $this->line.
    // Sets $this->para_type to BOLD_ALL when the paragraph starts in bold,
    // otherwise NONE, then strips every tag except <em> from the line.
    // Returns true when a paragraph was read, false at end of input.
    function GetParagraph()
    {
        // NextParagraph gets a well formed paragraph
        if( !$this->NextParagraph() ){
            return false;
        }
        $this->para_type = preg_match( "/<p><b>/", $this->line ) ? BOLD_ALL : NONE;
        // keep italics
        $this->line = strip_tags( $this->line, "<em>" );
        return true;
    }

    // Read the next paragraph and TIDY the html so it is well formed
    // (well enough for our purposes). The result is left in $this->line.
    // Returns 1 when a closing </p> was reached, false when the input ran
    // out first; text after the last </p> is therefore discarded.
    function NextParagraph()
    {
        // Bold can span several paragraphs in W97 output, so it is
        // remembered between calls.
        // NOTE(review): being a function static, this is shared by every
        // parser instance in the request, not just this object.
        static $in_bold = false;
        $in_italics = false;
        $in_cdata   = false;

        $this->bold_on    = $in_bold;       // restore state of bold
        $this->italics_on = $in_italics;    // restore state of italics
        $this->line = "";

        while( $this->posi < $this->length ){
            if( $this->html[$this->posi] == "<" ){  // a tag ?
                switch( $this->GetTag() )
                {
                    case PARA_OPEN:
                        $this->line .= "<p>";
                        break;
                    case PARA_CLOSE:
                        if( $in_bold ){     // unclosed bold
                            $this->line .= "</b>";
                        }
                        if( $in_italics ){  // unclosed italics
                            $this->line .= "</em>";
                        }
                        $this->line .= "</p>";
                        return 1;           // line finished
                    case BOLD_OPEN:
                        $in_bold = true;
                        $this->bold_on = true;  // turn bold on
                        break;
                    case BOLD_CLOSE:
                        $in_bold = false;
                        // we turn off the bold ONLY when we are not in the middle of the line
                        if( !$in_cdata ){
                            $this->bold_on = false;
                        }
                        else{
                            $this->line .= "</b>";
                        }
                        break;
                    case ITAL_OPEN:
                        $in_italics = true;
                        break;
                    case ITAL_CLOSE:
                        $in_italics = false;
                        // we turn off the italics ONLY when we are not in the middle of the line
                        if( !$in_cdata ){
                            $this->italics_on = false;
                        }
                        else{
                            $this->line .= "</em>";
                        }
                        break;
                }
            }
            else{   // not a tag
                if( $in_bold ){
                    $this->line .= "<b>";
                }
                if( $in_italics ){
                    $this->line .= "<em>";
                }
                // copy text up to the next tag; the bounds check stops a
                // document that ends in bare text from looping forever
                // ($this->length is the index of the last byte)
                while( $this->posi <= $this->length && $this->html[$this->posi] != "<" ){
                    $this->line .= $this->html[$this->posi++];
                }
                $in_cdata = true;   // mark that we are in Cdata
                                    // this is in case the bold state changes mid para
            }
        }
        return false;
    }

    // Identify the tag starting at $this->posi (which must be on a "<"),
    // move the cursor just past its closing ">", and return the matching
    // token: PARA_/BOLD_/ITAL_ OPEN or CLOSE, else UNKNOWN_OPEN/UNKNOWN_CLOSE.
    // Only the first letter of the tag name is examined.
    function GetTag()
    {
        $closing = ( $this->_CharAt( $this->posi + 1 ) == "/" );
        if( $closing ){
            $this->posi++;  // step onto the "/" so the name is the next byte
        }
        $name = strtolower( $this->_CharAt( $this->posi + 1 ) );
        $this->_SkipPastTagEnd();

        switch( $name )
        {
            case "p":   // paragraph
                return $closing ? PARA_CLOSE : PARA_OPEN;
            case "b":   // bold
                return $closing ? BOLD_CLOSE : BOLD_OPEN;
            case "i":   // italics
                return $closing ? ITAL_CLOSE : ITAL_OPEN;
        }
        return $closing ? UNKNOWN_CLOSE : UNKNOWN_OPEN;
    }

    // Byte of $this->html at $offset, or "" when $offset is out of range.
    function _CharAt( $offset )
    {
        return isset( $this->html[$offset] ) ? $this->html[$offset] : "";
    }

    // Advance the cursor to just after the next ">", or to just past the
    // end of the buffer when the tag is never closed.
    function _SkipPastTagEnd()
    {
        while( $this->posi <= $this->length && $this->html[$this->posi] != ">" ){
            $this->posi++;
        }
        $this->posi++;  // move past >
    }
}
class RNZ_Doc_Formatter extends PBI_Parser
{
    /*
    This works as follows:
    0. It cleans up the HTML:
        a. removes blank lines
        b. removes new lines
        c. removes "National Radio Programme Listing"
        d. removes additional white space
        e. butts tags
    1. It strips all tags except b and p
    2. It then extracts whole paragraphs pushing these onto an array
    3. It remembers the current state of bold formatting
    4. Paragraphs are formatted as <h3> if bold is on and <p> if not.
    5. Day titles are formatted as <h2>.
    6. RR is formatted <span class="rr">RR</span>
    7. Text in ( ) is formatted <em>( )</em>
    8. Emails are turned into mailto: links
    CSS is used to style these as required.
    This comes about because Word produces HTML that is not well formed
    and XHTML MUST be well formed.
    e.g. <B><P ALIGN=JUSTIFY>6.30 Sports Story RR </B>(RNZ)</P>
    Assumptions:
    1. This class assumes that lines that are bold are programmes
       and that non-bold lines are programme info.
    2. There are headings for each day in the format : /day_name \d{1,2} \w+ \d{4}/
    3. That items in brackets are extra info:
       e.g. (RNZ), (BBC), (Ep 2 of 5, BBC).
       and that these are not bold in weight even if they are on a bold line.
    4. The sub-string "RR" is spread through the document to indicate Replay Radio
    Using HTML tidy would produce the same result but with much higher overhead.
    */

    // day selector inserted before every anchored day heading
    var $daysel= "
<div class=\"iedaysel\">
<ul>
<li><a href=\"#sat\">Saturday</a></li>
<li><a href=\"#sun\">Sunday</a></li>
<li><a href=\"#mon\">Monday</a></li>
<li><a href=\"#tue\">Tuesday</a></li>
<li><a href=\"#wed\">Wednesday</a></li>
<li><a href=\"#thu\">Thursday</a></li>
<li><a href=\"#fri\">Friday</a></li>
</ul>
</div>";
    var $xhtml = array();           // output lines, in document order
    var $summary = array();         // summary lines returned by GetSummary()
    var $days_anchored = array();   // day id => 1 once its heading was emitted

    // PHP 8 no longer treats a method named after the class as the
    // constructor, so delegate to it explicitly.
    function __construct()
    {
        $this->RNZ_Doc_Formatter();
    }

    // Reset the parser state and empty the output buffers.
    function RNZ_Doc_Formatter()
    {
        parent::PBI_Parser();
        $this->xhtml = array();
        $this->summary = array();
        $this->days_anchored = array();
    }

    // Convert one Word-generated HTML document and append the result to
    // $this->xhtml (fetch it with GetXHTML()).
    //   $html - the document source
    //   $mode - document type: WHATSON, HIGHLIGHTS, CHART, PRESS,
    //           MUSICFEATURES or PLAIN
    // Returns 0 when $html contains no "<" at all (plain text, nothing is
    // done), otherwise 1.
    function Process( $html='', $mode=WHATSON )
    {
        // if there are no tags then it is plain text
        if ( stristr( $html, "<" ) === FALSE ){
            return(0);
        }
        $this->SetHTML( $html );

        // each type of document gets its own formatting
        switch( $mode )
        {
            case WHATSON:       // WHATS ON DOCUMENT
                $this->CleanUpHTML( "<p><b>" );
                while ( $this->GetParagraph() ) {
                    $this->AddEntities();
                    $this->WO_AddTags();
                    if ( $this->WO_AddAnchors() ){
                        // day heading: the line is already fully marked up
                        $this->xhtml[] = $this->line;
                    }
                    else if ( $this->para_type == BOLD_ALL ){
                        $this->xhtml[] = "<h3>".$this->line."</h3>";
                    }
                    else {
                        $this->xhtml[] = "<p>".$this->line."</p>";
                    }
                }
                break;

            case HIGHLIGHTS:    // HIGHLIGHTS DOCUMENT
                $this->CleanUpHTML( "<p><b><i>" );
                while ( $this->GetParagraph() ) {
                    $this->AddEntities();
                    $this->_PushHeadingOrPara( $this->HI_AddAnchors() );
                }
                break;

            case CHART:         // concert FM classical chart
                $this->_ProcessChart();
                break;

            case PRESS:         // PRESS RELEASES
            case PLAIN:         // PLAIN
                $this->CleanUpHTML( "<p><b><i>" );
                while ( $this->GetParagraph() ) {
                    $this->AddEntities();
                    $this->xhtml[] = "<p>".$this->line."</p>";
                }
                break;

            case MUSICFEATURES: // MUSIC FEATURES
                $this->CleanUpHTML( "<p><b>" );
                while ( $this->GetParagraph() ) {
                    $this->AddEntities();
                    $this->WO_AddTags();
                    $this->_PushHeadingOrPara( $this->Music_AddAnchors() );
                }
                break;
        }
        return(1);
    }

    // Emit $this->line as an <h2> day heading (tags stripped), or else as a
    // paragraph that gets class "hlts" when the line started in bold.
    function _PushHeadingOrPara( $is_day_heading )
    {
        if ( $is_day_heading ){
            $this->line = "<h2>".strip_tags( $this->line )."</h2>";
            $this->xhtml[] = $this->line;
        }
        else if ( $this->para_type == BOLD_ALL ){
            $this->xhtml[] = "<p class=\"hlts\">".$this->line."</p>";
        }
        else {
            $this->xhtml[] = "<p>".$this->line."</p>";
        }
    }

    // Rebuild the chart table: a fixed header row followed by one <tr> per
    // cleaned source row, with entities replaced in every cell.
    function _ProcessChart()
    {
        // reduce the buffer to one line of <td> cells per table row
        $this->CleanTable();
        $this->xhtml[] = "<table class=\"chart\">";
        $this->xhtml[] = "<tr><th>This<br />Week</th><th>Last<br />Week</th><th>Title</th><th>Artist</th><th>CD #</th></tr>";
        foreach( explode( "\n", $this->GetTable() ) as $row ){
            $cells = '';
            foreach( explode( "<td>", $row ) as $field ){
                // AddEntities() works on $this->line, so route each cell through it
                $this->line = strip_tags( $field );
                $this->AddEntities();
                $cells .= "<td>$this->line</td>";
            }
            // only the first cell follows <tr>, so only it gets the class
            $this->xhtml[] = str_replace( "<tr><td>", "<tr><td class=\"pos\">", "<tr>$cells</tr>" );
        }
        $this->xhtml[] = "</table>";
    }

    // looks for the first occurrence of a date formatted thus:
    /*
    saturday    # dayname
    \s          # 1 space (gap)
    \d{1,2}     # 1 or 2 digits (date)
    \s          # 1 space (gap)
    \w{3,9}     # 3 to 9 characters (month)
    \s          # 1 space (gap)
    \d{4}       # 4 digits (year)
    */
    // this will get all of the lines that are like:
    // saturday 14 March 2002
    // but ignore lines that are close.
    //
    // The first heading found for each day is rewritten in $this->line as
    // an <h2>; every day except Saturday is also preceded by a named anchor
    // and the day selector. Returns 1 when the line was rewritten, else 0.
    function WO_AddAnchors( )
    {
        // anchor id => day name, tested in this order
        $days = array
        (
            'sat' => 'saturday',
            'sun' => 'sunday',
            'mon' => 'monday',
            'tue' => 'tuesday',
            'wed' => 'wednesday',
            'thu' => 'thursday',
            'fri' => 'friday'
        );
        foreach( $days as $id => $dayname ){
            if ( !preg_match( '/' . $dayname . ' \d{1,2} \w{3,9} \d{4}/i', $this->line ) ){
                continue;
            }
            // Duplicate ID anchors are NOT allowed in XHTML, so only the
            // first heading per day is converted. Documents are unlikely to
            // contain two but belt and braces required anyway. The flags are
            // kept per formatter so that a second document handled in the
            // same request still gets its headings.
            if ( !empty( $this->days_anchored[$id] ) ){
                return 0;
            }
            $heading = "<h2>".strip_tags( $this->line )."</h2>";
            if ( $id == 'sat' ){
                // NOTE(review): Saturday gets no anchor or day selector here
                // although $daysel links to #sat - presumably the page
                // template supplies them; confirm.
                $this->line = $heading;
            }
            else {
                $this->line = "<a id=\"$id\" name=\"$id\"></a>\n" . $this->daysel . $heading;
            }
            $this->days_anchored[$id] = 1;
            return 1;
        }
        return 0;
    }

    // Returns 1 when $this->line contains a day title such as
    // "Saturday 14 March" (no year required), else 0.
    function HI_AddAnchors( )
    {
        $day_title = '/(?:satur|sun|mon|tues|wednes|thurs|fri)day \d{1,2} \w{3,9}/i';
        return preg_match( $day_title, $this->line ) ? 1 : 0;
    }

    // Returns 1 when $this->line contains a full day title such as
    // "Saturday 14 March 2002" (year required), else 0.
    function Music_AddAnchors( )
    {
        $day_title = '/(?:satur|sun|mon|tues|wednes|thurs|fri)day \d{1,2} \w{3,9} \d{4}/i';
        return preg_match( $day_title, $this->line ) ? 1 : 0;
    }

    // Mark up the schedule conventions found in $this->line.
    function WO_AddTags()
    {
        $search = array (   "/ RR/",                        // replay radio
                            "/\(([\w|\s|\.|\,]+)\)/",       // things in brackets
                            "/(\d{1,2}\.\d{2}) /",          // times with a space after (not in a list)
                            "/((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/" );   // email
        $replace = array (  " <span class=\"rr\">RR</span>",
                            "<em>(\\1)</em>",
                            "<strong>\\1</strong> ",
                            "<a href=\"mailto:\\1\">\\1</a>" );
        $this->line = preg_replace( $search, $replace, $this->line );
    }

    // Replace Word's "smart" punctuation and plain quotes in $this->line
    // with XHTML entities.
    // The smart characters are matched as single Windows-1252 bytes and are
    // written as escapes so this file's own encoding cannot corrupt them
    // (they had been mangled into the accented letters that share those byte
    // values in Mac Roman).
    // NOTE(review): assumes the Word HTML is Windows-1252 encoded - confirm;
    // in UTF-8 input these byte values occur inside multi-byte characters.
    function AddEntities()
    {
        $search = array
        (
            "\x91",         // left single quote
            "'",            // plain apostrophe
            "\x92",         // right single quote
            "\"",           // plain double quote
            "\x85",         // ellipsis
            " \x96 "        // spaced dash
        );
        $replace = array
        (
            "&#8217;",
            "&#8217;",
            "&#8217;",
            "&quot;",
            "&#8230;",
            " &#8212; "
        );
        // literal byte matches: plain string replacement, applied in order
        $this->line = str_replace( $search, $replace, $this->line );
    }

    // function to return a string containing the XHTML with line breaks.
    function GetXHTML()
    {
        $temp = "";
        foreach( $this->xhtml as $line ){
            $temp .= "$line\n";
        }
        return $temp;
    }

    // Returns the summary lines, each wrapped in its own <p> element.
    function GetSummary()
    {
        $temp = "";
        foreach( $this->summary as $line ){
            $temp .= "<p>$line\n</p>";
        }
        return $temp;
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment