rhulse/gist:971350

## gistfile1.php
<?php

// name richtextparser.inc

// This code is licensed under the GPL version 2
// It is an example from the blog series rebuilding radionz.co.nz
//  http://richardhulse.blogspot.com/

/* VERSION HISTORY

0.1 first version. Parses only W97 NAT sched
0.2 added formatting options
0.3 tweaked to allow parsing of W97 CFM sched
0.4 branch control to split sched / highlight parsing
0.5 new date regex for highlights

0.6 10 April 2003 RLH
added w2000 parsing of files

0.7 13 April 2003 RLH
separated parser and formatter into their own classes.
Parser now returns the type of formatting to be used in the line.
	e.g. NONE, BOLD_ALL.
This could be refined if required.
BOLD ALL just means the line starts in BOLD which is good enough for our purposes

The formatter class decides what to do with the line based on that.

0.8 23 April 2003 RLH
Added section CHART to parse CFM classical chart.
strips the whole table back to TR and TD tags with no attributes.
Mercifully this is  well formed in W97.

Each line of the table is extracted.

0.9 29 April 2003 RLH
Added cleaner for press releases.

0.91 26 May 2003 RLH
Changed day selector code

0.92 08 July 2003 RLH
Added musicfeatures

0.93 12 August 2003 RLH
changed table parser to replace entities
added "/ë/" to entity list

0.94 28 November 2003 RLH
Added tighter regex test for dates in Music Features docs


---------------------------------
NB: W97 docs are not well formed:
eg.  <b><p></b></p>
	(The parser should fix this)
	W2000 docs ARE well formed but contain a heap of junk.

*/

// type of document to process

// Document types: the $mode values understood by RNZ_Doc_Formatter::Process()
define('WHATSON',       1);
define('HIGHLIGHTS',    2);
define('CHART',         3);
define('PRESS',         4);
define('MUSICFEATURES', 5);
define('PLAIN',         6);

// Tag kinds: the values returned by PBI_Parser::GetTag()
define('PARA_OPEN',     1);
define('PARA_CLOSE',    2);
define('BOLD_OPEN',     3);
define('BOLD_CLOSE',    4);
define('ITAL_OPEN',     5);
define('ITAL_CLOSE',    6);
define('UNKNOWN_OPEN',  10);
define('UNKNOWN_CLOSE', 11);

// Line classifications: the values stored in $para_type
define('NONE',      1);
define('BOLD_ALL',  2);
define('BOLD_PART', 3);
define('ITAL_ALL',  4);
define('ITAL_PART', 5);

/*

This class extension arrangement is a
bodge instead of polymorphism

think of it thus:


                 Parser_BASE
				      |
			-------------------
			|                 |
		PBI_Parser	    TABLE_Parser
			|                 |
			-------------------
					|
				RNZ_Doc_Formatter


*/

/**
 * Base class for the document parsers.
 *
 * Stores the HTML being worked on together with a cursor into it, and
 * provides the generic clean-up pass that the more specific parsers
 * build upon.
 */
class Parser_BASE
{
	var $html;			// HTML text currently being processed
	var $posi;			// cursor: offset of the next character to read from $html
	var $length;		// offset of the LAST character of $html (strlen - 1)
	var $line;			// the most recently extracted line / paragraph
	var $para_type;		// formatting class of $line (NONE, BOLD_ALL, ...)
	var $bold_on = 0;

	var $tagged_line;

	/**
	 * Class-named (PHP 4 style) constructor: resets the parser state.
	 * Subclasses call this explicitly via parent::Parser_BASE().
	 */
	function Parser_BASE()
	{
		$this->html = "";
		$this->posi = 0;
		$this->length = 0;
		$this->line = "";
		$this->para_type = NONE;
	}

	/**
	 * Unified constructor hook.
	 *
	 * PHP 8 no longer treats a method named after its class as the
	 * constructor, so without this nothing would be initialised by "new".
	 * It runs the class-named method of the most derived class that has
	 * one, which is the method the old constructor lookup would have run.
	 * Declared after Parser_BASE() on purpose so older PHP versions do not
	 * report a redefined constructor.
	 */
	function __construct()
	{
		for( $class = get_class( $this ); $class; $class = get_parent_class( $class ) ){
			if( method_exists( $class, $class ) ){
				$this->$class();
				return;
			}
		}
	}

	/**
	 * Load a new document.
	 *
	 * @param string $html  raw HTML to process
	 */
	function SetHTML ( $html )
	{
		$this->html = $html;
		// a new document is always read from its start
		$this->posi = 0;
		// -1 because $length is the offset of the last character
		$this->length = strlen( $this->html ) -1;
	}

	/**
	 * Normalise $this->html in place and refresh $this->length.
	 *
	 * The order in which the patterns are applied is critical; the order
	 * here removes and cleans up most everything in W97 and W2000 html.
	 *
	 * @param string $tags  tags to keep, in strip_tags() "allowed tags" form
	 */
	function CleanUpHTML( $tags="<p><b><i>" )
	{
		// initial pre-processing normalises a few things
		$pre = array
		(
			"/[\r\n]/"			=> " ",		// new lines (no "|" in the class: pipes in the text must survive)
			"/<\![^>]*>/"		=> " ",		// w2000 comments
			"/\s+/"				=> " ",		// extra whitespace
			"/<h[1-6]>/i"		=> "<p>",	// convert headings (h1-h6 only, so <hr> is left alone)
			"/<\/h[1-6]>/i"		=> "</p>"	// + closing tags
		);

		$this->html = preg_replace( array_keys( $pre ), array_values( $pre ), $this->html );

		// strip all tags that are not those we want
		// including all the w2000 rubbish
		$this->html = strip_tags( $this->html, $tags );

		// general cleanup
		// the tag clean up is for w2000 - well formed
		// but tags are repeated too much
		$post = array
		(
			"/National Radio Programme Listing/i"			=> "",		// listing text
			"/Details available on www\.radionz\.co\.nz/i"	=> "",
			"/<([pbi]) [^>]*>/i"	=> "<\\1>",	// remove attributes from tags
			"/>\s+</"				=> "><",	// butt tags
			"/<p>&nbsp;<\/p>/i"		=> "",		// blank paras
			"/<p> &nbsp;<\/p>/i"	=> "",		// blank paras
			"/<b>&nbsp;<\/b>/i"		=> "",		// blank bold
			"/<i>&nbsp;<\/i>/i"		=> "",		// blank italics
			"/<\/b><b>/i"			=> "",		// clean up bold tags
			"/<b><\/b>/i"			=> "",		// empty bold tags
			"/<\/i><i>/i"			=> "",		// same for italics
			"/<i><\/i>/i"			=> "",		// empty italic tags
			"/<p><\/p>/i"			=> "",		// blank paras
			"/[\r\n]/"				=> ""
		);

		$this->html = preg_replace( array_keys( $post ), array_values( $post ), $this->html );
		// -1 because $length is the offset of the last character
		$this->length = strlen( $this->html ) -1;
	}
}

/**
 * Reduces an HTML table to bare cells, one table row per line of text.
 */
class TABLE_Parser extends Parser_BASE
{

	/**
	 * Class-named constructor: cleans whatever HTML is currently loaded.
	 */
	function TABLE_Parser()
	{
		$this->CleanTable();
	}

	/**
	 * Rewrite $this->html so that only <td> markup is left: rows are
	 * separated by "\n" and the first cell of the first row carries
	 * class="pos".
	 */
	function CleanTable( )
	{
		$this->CleanUpHTML( "<td><tr><p>" );

		// table specific clean up: drop attributes, normalise tag case
		$tag_rules = array
		(
			"/<td [^>]*>/i"	=> "<td>",
			"/<tr [^>]*>/i"	=> "<tr>",
			"/<td>/i"		=> "<td>",
			"/<tr>/i"		=> "<tr>",
			"/<\/td>/i"		=> "</td>",
			"/<\/tr>/i"		=> "</tr>"
		);
		$cleaned = preg_replace( array_keys( $tag_rules ), array_values( $tag_rules ), $this->html );

		// remove any errant P tags
		$cleaned = strip_tags( $cleaned, "<tr><td>" );

		// applied strictly in this order: mark the position cell, turn row
		// boundaries into new lines, then drop what is left of the row tags
		$this->html = str_replace(
			array( "<tr><td>", "</tr><tr>", "<tr>", "</tr>" ),
			array( "<tr><td class=\"pos\">", "\n", "", "" ),
			$cleaned
		);
	}

	/**
	 * @return string  the table text as left by CleanTable()
	 */
	function GetTable()
	{
		return $this->html;
	}
}


/**
 * Paragraph / bold / italic parser.
 *
 * Walks $this->html one tag or text run at a time and hands back one
 * paragraph per call, closing any bold / italic run that is still open
 * when the paragraph ends.
 */
class PBI_Parser extends TABLE_Parser
{
	var $bold_on;
	var $italics_on;

	/**
	 * Class-named constructor: resets the base state and the two flags.
	 */
	function PBI_Parser()
	{
		parent::Parser_BASE();
		// these two vars are used to remember the state across many lines
		// for poorly formed word 97 html
		$this->bold_on = false;
		$this->italics_on = false;
	}

	/**
	 * Fetch the next paragraph into $this->line and classify it.
	 *
	 * $this->para_type becomes BOLD_ALL when the paragraph starts in bold,
	 * NONE otherwise. Every tag except <em> is then removed from the line.
	 *
	 * @return bool  true when a paragraph was read, false at end of input
	 */
	function GetParagraph()
	{
		if( !$this->NextParagraph() ){
			return false;
		}

		$this->para_type = preg_match( "/<p><b>/", $this->line ) ? BOLD_ALL : NONE;

		// keep italics
		$this->line = strip_tags( $this->line, "<em>");
		return true;
	}

	/**
	 * Read up to and including the next </p>, building a tidied copy of
	 * the paragraph in $this->line (well formed enough for our purposes).
	 *
	 * @return mixed  1 when a paragraph was completed, false when the input
	 *                ran out first (any partial text is left in $this->line)
	 */
	function NextParagraph()
	{
		// NOTE(review): only $in_bold is static, so only the bold state is
		// carried from one call to the next. The layout of the original
		// declaration suggests the other two may have been meant to be static
		// as well - confirm before changing. The static is also shared by
		// every parser object, not kept per document.
		static $in_bold = false;
		$in_italics = false;
		$in_cdata = false;

		$this->bold_on = $in_bold;			// restore state of bold
		$this->italics_on = $in_italics;	// restore state of italics
		$this->line = "";

		$size = strlen( $this->html );

		while( $this->posi < $this->length ){

			if( $this->html[$this->posi] == "<" ){  // a tag ?
				switch ( $this->GetTag() )
				{
					case PARA_OPEN:
						$this->line .= "<p>";
						break;

					case PARA_CLOSE:
						if( $in_bold ){ // unclosed bold
							$this->line .= "</b>";
						}
						if( $in_italics ){ // unclosed italics
							$this->line .= "</em>";
						}
						$this->line .= "</p>";
						return 1;

					case BOLD_OPEN:
						$in_bold = true;
						$this->bold_on = true;  // turn bold on
						break;

					case BOLD_CLOSE:
						$in_bold = false;
						if( !$in_cdata ) { // the flag is cleared ONLY when we are not in the middle of the line
							$this->bold_on = false;
						}
						else{
							$this->line .= "</b>";
						}
						break;

					case ITAL_OPEN:
						$in_italics = true;
						break;

					case ITAL_CLOSE:
						$in_italics = false;
						if( !$in_cdata ) { // the flag is cleared ONLY when we are not in the middle of the line
							$this->italics_on = false;
						}
						else{
							$this->line .= "</em>";
						}
						break;
				}
			}
			else{ // not a tag
				if( $in_bold ){
					$this->line .= "<b>";
				}
				if( $in_italics ){
					$this->line .= "<em>";
				}
				// copy the text run; bounded by the end of the string so that
				// trailing text with no following tag cannot loop forever
				while( $this->posi < $size && $this->html[$this->posi] != "<" ){
					$this->line .= $this->html[$this->posi++];
				}
				$in_cdata = true; // mark that we are in Cdata
				// this is in case the bold state changes mid para
			}
		}

		return false;
	}

	/**
	 * Classify the tag that starts at $this->posi and move the cursor to
	 * just past its closing ">" (or to the end of the input when the tag
	 * is never closed).
	 *
	 * Only the first letter of the tag name is examined.
	 *
	 * @return int  one of PARA_OPEN, PARA_CLOSE, BOLD_OPEN, BOLD_CLOSE,
	 *              ITAL_OPEN, ITAL_CLOSE, UNKNOWN_OPEN, UNKNOWN_CLOSE
	 */
	function GetTag()
	{
		$start = $this->posi;
		$closing = ( substr( $this->html, $start + 1, 1 ) == "/" );
		$letter = strtolower( substr( $this->html, $start + ( $closing ? 2 : 1 ), 1 ) );

		// move past the ">"
		$end = strpos( $this->html, ">", $start );
		$this->posi = ( $end === false ) ? strlen( $this->html ) : $end + 1;

		switch( $letter )
		{
			case "p":	// paragraph
				return $closing ? PARA_CLOSE : PARA_OPEN;

			case "b":	// bold
				return $closing ? BOLD_CLOSE : BOLD_OPEN;

			case "i":	// italics
				return $closing ? ITAL_CLOSE : ITAL_OPEN;
		}

		// unknown tag
		return $closing ? UNKNOWN_CLOSE : UNKNOWN_OPEN;
	}
}


class RNZ_Doc_Formatter extends PBI_Parser
{
/*
This works as follows:
0. It cleans up the HTML:
	a. removes blank lines
	b. removes new lines
	c. Removes "National Radio Programme Listing"
	d. removes additional white space
	e. Butts tags
1. It strips all tags except b and p
2. It then extracts whole paragraphs pushing these onto an array
3. It remembers the current state of bold formatting
4. paragraphs are formatted as <h3> if bold is on and <p> if not.
5. Day Titles are formatted as <h2>.
6. RR is formatted <span class="rr">RR</span>
7. Text in ( ) is formatted <em>( )</em>
8. Emails are turned into mailto:

CSS is used to style these as required.

This comes about because Word produces HTML that is not well formed
and XHTML MUST be well formed.
e.g. <B><P ALIGN=JUSTIFY>6.30 Sports Story RR </B>(RNZ)</P>

Assumptions:
1. This class assumes that lines that are bold are programmes
and that non-bold lines are programme info.
2. There are headings for each day in the format : /day_name \d{1,2} \w+ \d{4}/
3. That items in brackets are extra info:
	e.g. (RNZ), (BBC), (Ep 2 of 5, BBC).
	and that these are not bold in weight even if they are on a bold line.
4. The sub-string "RR" is spread through the document to indicate Replay Radio

Using HTML tidy would produce the same result but with much higher overhead.

*/


var $daysel= "
<div class=\"iedaysel\">
<ul>
 <li><a href=\"#sat\">Saturday</a></li>
 <li><a href=\"#sun\">Sunday</a></li>
 <li><a href=\"#mon\">Monday</a></li>
 <li><a href=\"#tue\">Tuesday</a></li>
 <li><a href=\"#wed\">Wednesday</a></li>
 <li><a href=\"#thu\">Thursday</a></li>
 <li><a href=\"#fri\">Friday</a></li>
</ul>
</div>";

	var $xhtml;		// output lines, in document order
	var $summary;	// summary lines returned by GetSummary()

	/**
	 * Class-named constructor: resets the parser and both output lists.
	 */
	function RNZ_Doc_Formatter()
	{
		parent::PBI_Parser();
		$this->xhtml = array();
		$this->summary = array();
	}

	/**
	 * Unified constructor. PHP 8 no longer runs the class-named method on
	 * "new", which would leave $xhtml and $summary unset. Declared after
	 * RNZ_Doc_Formatter() on purpose so older PHP versions do not report a
	 * redefined constructor.
	 */
	function __construct()
	{
		$this->RNZ_Doc_Formatter();
	}


	/**
	 * Convert one document and append the result to $this->xhtml.
	 *
	 * @param string $html  source HTML
	 * @param int    $mode  one of WHATSON, HIGHLIGHTS, CHART, PRESS, PLAIN,
	 *                      MUSICFEATURES; any other value produces no output
	 * @return int  0 when $html contains no "<" at all (plain text, nothing
	 *              is done), 1 otherwise
	 */
	function Process( $html='', $mode=WHATSON )
	{
		// if there are no tags then it is plain text
		if ( stristr( $html, "<" ) === FALSE ){
			return(0);
		}

		$this->SetHTML( $html);

		// each type of document gets its own formatting
		switch( $mode )
		{
			case WHATSON: // WHATS ON DOCUMENT
				$this->CleanUpHTML( "<p><b>" );
				while ( $this->GetParagraph() ) {
					$this->AddEntities();
					$this->WO_AddTags();

					// a day title is saved as built, ignoring the bold state
					if ( $this->WO_AddAnchors() ){
						array_push( $this->xhtml, $this->line );
					}
					else {
						$this->_PushParagraph( "<h3>", "</h3>" );
					}
				}
				break;

			case HIGHLIGHTS: // HIGHLIGHTS DOCUMENT
				$this->CleanUpHTML( "<p><b><i>" );
				while ( $this->GetParagraph() ) {
					$this->AddEntities();

					if ( $this->HI_AddAnchors() ){
						$this->_PushHeading();
					}
					else {
						$this->_PushParagraph( "<p class=\"hlts\">", "</p>" );
					}
				}
				break;

			case CHART: // concert FM classical chart
				$this->_ProcessChart();
				break;

			case PRESS: // PRESS RELEASES
			case PLAIN: // PLAIN
				$this->CleanUpHTML( "<p><b><i>" );
				while ( $this->GetParagraph() ) {
					$this->AddEntities();
					array_push( $this->xhtml, "<p>".$this->line."</p>" );
				}
				break;

			case MUSICFEATURES: // MUSIC FEATURES
				$this->CleanUpHTML( "<p><b>" );
				while ( $this->GetParagraph() ) {
					$this->AddEntities();
					$this->WO_AddTags();

					if ( $this->Music_AddAnchors() ){
						$this->_PushHeading();
					}
					else {
						$this->_PushParagraph( "<p class=\"hlts\">", "</p>" );
					}
				}
				break;
		}

		return(1);
	}

	// helper: save the current line, wrapped in $open/$close when the
	// paragraph started in bold and in a plain <p> otherwise
	function _PushParagraph( $open, $close )
	{
		if( $this->para_type == BOLD_ALL ){
			array_push( $this->xhtml, $open.$this->line.$close );
		}
		else{
			array_push( $this->xhtml, "<p>".$this->line."</p>" );
		}
	}

	// helper: strip the current line to plain text and save it as an <h2>
	function _PushHeading()
	{
		$this->line = "<h2>".strip_tags( $this->line )."</h2>";
		array_push( $this->xhtml, $this->line );
	}

	// helper: CHART mode - emit a table with a fixed header row followed by
	// one row per line of the cleaned table
	function _ProcessChart()
	{
		// run the table clean up on the loaded HTML
		parent::TABLE_Parser();

		array_push( $this->xhtml, "<table class=\"chart\">");
		array_push( $this->xhtml, "<tr><th>This<br />Week</th><th>Last<br />Week</th><th>Title</th><th>Artist</th><th>CD #</th></tr>");

		$lines = explode("\n", $this->GetTable() );

		foreach( $lines as $row ){
			$fields = explode("<td>", $row);

			$newline = '';
			foreach( $fields as $field ){
				// AddEntities() works on $this->line
				$this->line = strip_tags( $field );
				$this->AddEntities();
				$newline .= "<td>$this->line</td>";
			}
			$newline = str_replace("<tr><td>", "<tr><td class=\"pos\">", "<tr>$newline</tr>" );

		 	array_push( $this->xhtml, "$newline" );
		}

		array_push( $this->xhtml, "</table>");
	}

	// looks for the first occurence of a date formatted thus:
	/*
	saturday     	# dayname
	\s				# 1 space		  	(gap)
	\d{1,2}			# 1 or 2 digits  	(date)
	\s				# 1 space		  	(gap)
	\w{3,9}			# 3 to 9 characters (month)
	\s				# 1 space		  	(gap)
	\d{4}			# 4 digits			(year)
	*/

	// this will get all of the lines that are like:
	// saturday 14 March 2002
	// but ignore lines that are close.
	/**
	 * Turn the first title found for each day into an <h2>. Every day
	 * except Saturday also gets an anchor and the day selector in front.
	 *
	 * @return int  1 when $this->line was rewritten as a day title, 0 when
	 *              it is not a day title or that day was already handled
	 */
	function WO_AddAnchors( )
	{
		// marks if item found to avoid duplicate ID anchors which are NOT allowed in XHTML
		// documents are unlikely to contain two but belt and braces required anyway
		static $seen = array();

		// anchor id => day name, tested in this order
		$days = array
		(
			'sat' => 'saturday',
			'sun' => 'sunday',
			'mon' => 'monday',
			'tue' => 'tuesday',
			'wed' => 'wednesday',
			'thu' => 'thursday',
			'fri' => 'friday'
		);

		foreach( $days as $id => $day ){
			if( !preg_match( "/$day \d{1,2} \w{3,9} \d{4}/i", $this->line ) ){
				continue;
			}

			if( !empty( $seen[$id] ) ){ // only do the first one
				return 0;
			}
			$seen[$id] = 1;

			$title = "<h2>".strip_tags( $this->line )."</h2>";
			if( $id == 'sat' ){
				// the Saturday title gets no anchor and no selector
				$this->line = $title;
			}
			else{
				$this->line = "<a id=\"$id\" name=\"$id\"></a>\n" . $this->daysel . $title;
			}
			return 1;
		}

		return 0;
	}

	/**
	 * @return int  1 when $this->line contains "<day name> <date> <month>",
	 *              0 otherwise
	 */
	function HI_AddAnchors( )
	{
		$days = "saturday|sunday|monday|tuesday|wednesday|thursday|friday";

		// days of the week title
		return preg_match( "/(?:$days) \d{1,2} \w{3,9}/i", $this->line ) ? 1 : 0;
	}

	/**
	 * @return int  1 when $this->line contains
	 *              "<day name> <date> <month> <4 digit year>", 0 otherwise
	 */
	function Music_AddAnchors( )
	{
		$days = "saturday|sunday|monday|tuesday|wednesday|thursday|friday";

		// days of the week title
		return preg_match( "/(?:$days) \d{1,2} \w{3,9} \d{4}/i", $this->line ) ? 1 : 0;
	}

	/**
	 * Mark up " RR", bracketed text, times and e-mail addresses in
	 * $this->line.
	 */
	function WO_AddTags()
	{
		$search  = array (	"/ RR/",	// replay radio
							"/\(([\w|\s|\.|\,]+)\)/",	// things in brackets
							"/(\d{1,2}\.\d{2}) /",		// times with a space after (not in a list)
							"/((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/" ); // email

		$replace = array (	" <span class=\"rr\">RR</span>",
							"<em>(\\1)</em>",
							"<strong>\\1</strong> ",
							"<a href=\"mailto:\\1\">\\1</a>" );

		$this->line = preg_replace( $search, $replace, $this->line );
	}

	/**
	 * Replace quote, ellipsis and dash characters in $this->line with
	 * HTML entities.
	 */
	function AddEntities()
	{
		// NOTE(review): the accented characters below are replaced with
		// quote / ellipsis / dash entities, so they look like Windows smart
		// punctuation displayed under a different code page. Confirm the
		// encoding of this file and of the input documents before relying
		// on (or changing) these patterns.
		$search  = array
		(
			"/ë/",
			"/'/",
			"/í/",
			"/\"/",
			"/Ö/",
			"/ ñ /"
		);

		$replace = array
		(
			"&#8217;",
			"&#8217;",
			"&#8217;",
			"&quot;",
			"&#8230;",
			" &#8212; "
		);

		$this->line = preg_replace( $search, $replace, $this->line );
	}

	// function to return a string containing the XHTML with line breaks.
	function GetXHTML()
	{
		$temp = "";

		foreach( $this->xhtml as $line ){
			$temp .= "$line\n";
		}
		return $temp;
	}

	// function to return a string containing each summary line in its own <p>.
	function GetSummary()
	{
		$temp = "";

		foreach( $this->summary as $line ){
			$temp .= "<p>$line\n</p>";
		}
		return $temp;
	}

}
?>
	<?php

	// name richtextparser.inc

	// This code is licensed under the GPL version 2
	// It is an example from the blog series rebuilding radionz.co.nz
	// http://richardhulse.blogspot.com/

	/* VERSION HISTORY

	0.1 first version. Parses only W97 NAT sched
	0.2 added formatting options
	0.3 tweaked to allow parsing of W97 CFM sched
	0.4 branch control to split sched / highlight parsing
	0.5 new date regex for highlights

	0.6 10 April 2003 RLH
	added w2000 parsing of files

	0.7 13 April 2003 RLH
	separated parser and formatter into their own classes.
	Parser now returns the type of formatting to be used in the line.
	e.g. NONE, BOLD_ALL.
	This could be refined if required.
	BOLD ALL just means the line starts in BOLD which is good enough for our purposes

	The formatter class decides what to do with the line based on that.

	0.8 23 April 2003 RLH
	Added section CHART to parse CFM classical chart.
	strips the whole table back to TR and TD tags with no attributes.
	Mercifully this is well formed in W97.

	Each line of the table is extracted.

	0.9 29 April 2003 RLH
	Added cleaner for press releases.

	0.91 26 May 2003 RLH
	Changed day selector code

	0.92 08 July 2003 RLH
	Added musicfeatures

	0.93 12 August 2003 RLH
	changed table parser to replace entities
	added "/ë/" to entity list

	0.94 28 November 2003 RLH
	Added tighter regex test for dates in Music Features docs



	---------------------------------
	NB: W97 docs are not well formed:
	eg. <b><p></b></p>
	(The parser should fix this)
	W2000 docs ARE well formed but contain a heap of junk.

	*/

	// type of document to process

	// Document types: the $mode values understood by RNZ_Doc_Formatter::Process()
	define('WHATSON',       1);
	define('HIGHLIGHTS',    2);
	define('CHART',         3);
	define('PRESS',         4);
	define('MUSICFEATURES', 5);
	define('PLAIN',         6);

	// Tag kinds: the values returned by PBI_Parser::GetTag()
	define('PARA_OPEN',     1);
	define('PARA_CLOSE',    2);
	define('BOLD_OPEN',     3);
	define('BOLD_CLOSE',    4);
	define('ITAL_OPEN',     5);
	define('ITAL_CLOSE',    6);
	define('UNKNOWN_OPEN',  10);
	define('UNKNOWN_CLOSE', 11);

	// Line classifications: the values stored in $para_type
	define('NONE',      1);
	define('BOLD_ALL',  2);
	define('BOLD_PART', 3);
	define('ITAL_ALL',  4);
	define('ITAL_PART', 5);

	/*

	This class extension arrangement is a
	bodge instead of polymorphism

	think of it thus:


	                 Parser_BASE
	                      |
	            -------------------
	            |                 |
	        PBI_Parser       TABLE_Parser
	            |                 |
	            -------------------
	                      |
	              RNZ_Doc_Formatter


	*/

	/**
	 * Base class for the document parsers.
	 *
	 * Stores the HTML being worked on together with a cursor into it, and
	 * provides the generic clean-up pass that the more specific parsers
	 * build upon.
	 */
	class Parser_BASE
	{
		var $html;			// HTML text currently being processed
		var $posi;			// cursor: offset of the next character to read from $html
		var $length;		// offset of the LAST character of $html (strlen - 1)
		var $line;			// the most recently extracted line / paragraph
		var $para_type;		// formatting class of $line (NONE, BOLD_ALL, ...)
		var $bold_on = 0;

		var $tagged_line;

		/**
		 * Class-named (PHP 4 style) constructor: resets the parser state.
		 * Subclasses call this explicitly via parent::Parser_BASE().
		 */
		function Parser_BASE()
		{
			$this->html = "";
			$this->posi = 0;
			$this->length = 0;
			$this->line = "";
			$this->para_type = NONE;
		}

		/**
		 * Unified constructor hook.
		 *
		 * PHP 8 no longer treats a method named after its class as the
		 * constructor, so without this nothing would be initialised by "new".
		 * It runs the class-named method of the most derived class that has
		 * one, which is the method the old constructor lookup would have run.
		 * Declared after Parser_BASE() on purpose so older PHP versions do
		 * not report a redefined constructor.
		 */
		function __construct()
		{
			for( $class = get_class( $this ); $class; $class = get_parent_class( $class ) ){
				if( method_exists( $class, $class ) ){
					$this->$class();
					return;
				}
			}
		}

		/**
		 * Load a new document.
		 *
		 * @param string $html  raw HTML to process
		 */
		function SetHTML ( $html )
		{
			$this->html = $html;
			// a new document is always read from its start
			$this->posi = 0;
			// -1 because $length is the offset of the last character
			$this->length = strlen( $this->html ) -1;
		}

		/**
		 * Normalise $this->html in place and refresh $this->length.
		 *
		 * The order in which the patterns are applied is critical; the order
		 * here removes and cleans up most everything in W97 and W2000 html.
		 *
		 * @param string $tags  tags to keep, in strip_tags() "allowed tags" form
		 */
		function CleanUpHTML( $tags="<p><b><i>" )
		{
			// initial pre-processing normalises a few things
			$pre = array
			(
				"/[\r\n]/"			=> " ",		// new lines (no "|" in the class: pipes in the text must survive)
				"/<\![^>]*>/"		=> " ",		// w2000 comments
				"/\s+/"				=> " ",		// extra whitespace
				"/<h[1-6]>/i"		=> "<p>",	// convert headings (h1-h6 only, so <hr> is left alone)
				"/<\/h[1-6]>/i"		=> "</p>"	// + closing tags
			);

			$this->html = preg_replace( array_keys( $pre ), array_values( $pre ), $this->html );

			// strip all tags that are not those we want
			// including all the w2000 rubbish
			$this->html = strip_tags( $this->html, $tags );

			// general cleanup
			// the tag clean up is for w2000 - well formed
			// but tags are repeated too much
			// (the blank-content patterns test for the &nbsp; entity itself)
			$post = array
			(
				"/National Radio Programme Listing/i"			=> "",		// listing text
				"/Details available on www\.radionz\.co\.nz/i"	=> "",
				"/<([pbi]) [^>]*>/i"	=> "<\\1>",	// remove attributes from tags
				"/>\s+</"				=> "><",	// butt tags
				"/<p>&nbsp;<\/p>/i"		=> "",		// blank paras
				"/<p> &nbsp;<\/p>/i"	=> "",		// blank paras
				"/<b>&nbsp;<\/b>/i"		=> "",		// blank bold
				"/<i>&nbsp;<\/i>/i"		=> "",		// blank italics
				"/<\/b><b>/i"			=> "",		// clean up bold tags
				"/<b><\/b>/i"			=> "",		// empty bold tags
				"/<\/i><i>/i"			=> "",		// same for italics
				"/<i><\/i>/i"			=> "",		// empty italic tags
				"/<p><\/p>/i"			=> "",		// blank paras
				"/[\r\n]/"				=> ""
			);

			$this->html = preg_replace( array_keys( $post ), array_values( $post ), $this->html );
			// -1 because $length is the offset of the last character
			$this->length = strlen( $this->html ) -1;
		}
	}

	/**
	 * Reduces an HTML table to bare cells, one table row per line of text.
	 */
	class TABLE_Parser extends Parser_BASE
	{

		/**
		 * Class-named constructor: cleans whatever HTML is currently loaded.
		 */
		function TABLE_Parser()
		{
			$this->CleanTable();
		}

		/**
		 * Rewrite $this->html so that only <td> markup is left: rows are
		 * separated by "\n" and the first cell of the first row carries
		 * class="pos".
		 */
		function CleanTable( )
		{
			$this->CleanUpHTML( "<td><tr><p>" );

			// table specific clean up: drop attributes, normalise tag case
			$tag_rules = array
			(
				"/<td [^>]*>/i"	=> "<td>",
				"/<tr [^>]*>/i"	=> "<tr>",
				"/<td>/i"		=> "<td>",
				"/<tr>/i"		=> "<tr>",
				"/<\/td>/i"		=> "</td>",
				"/<\/tr>/i"		=> "</tr>"
			);
			$cleaned = preg_replace( array_keys( $tag_rules ), array_values( $tag_rules ), $this->html );

			// remove any errant P tags
			$cleaned = strip_tags( $cleaned, "<tr><td>" );

			// applied strictly in this order: mark the position cell, turn row
			// boundaries into new lines, then drop what is left of the row tags
			$this->html = str_replace(
				array( "<tr><td>", "</tr><tr>", "<tr>", "</tr>" ),
				array( "<tr><td class=\"pos\">", "\n", "", "" ),
				$cleaned
			);
		}

		/**
		 * @return string  the table text as left by CleanTable()
		 */
		function GetTable()
		{
			return $this->html;
		}
	}


	/**
	 * Paragraph / bold / italic parser.
	 *
	 * Walks $this->html one tag or text run at a time and hands back one
	 * paragraph per call, closing any bold / italic run that is still open
	 * when the paragraph ends.
	 */
	class PBI_Parser extends TABLE_Parser
	{
		var $bold_on;
		var $italics_on;

		/**
		 * Class-named constructor: resets the base state and the two flags.
		 */
		function PBI_Parser()
		{
			parent::Parser_BASE();
			// these two vars are used to remember the state across many lines
			// for poorly formed word 97 html
			$this->bold_on = false;
			$this->italics_on = false;
		}

		/**
		 * Fetch the next paragraph into $this->line and classify it.
		 *
		 * $this->para_type becomes BOLD_ALL when the paragraph starts in
		 * bold, NONE otherwise. Every tag except <em> is then removed from
		 * the line.
		 *
		 * @return bool  true when a paragraph was read, false at end of input
		 */
		function GetParagraph()
		{
			if( !$this->NextParagraph() ){
				return false;
			}

			$this->para_type = preg_match( "/<p><b>/", $this->line ) ? BOLD_ALL : NONE;

			// keep italics
			$this->line = strip_tags( $this->line, "<em>");
			return true;
		}

		/**
		 * Read up to and including the next </p>, building a tidied copy of
		 * the paragraph in $this->line (well formed enough for our purposes).
		 *
		 * @return mixed  1 when a paragraph was completed, false when the
		 *                input ran out first (any partial text is left in
		 *                $this->line)
		 */
		function NextParagraph()
		{
			// NOTE(review): only $in_bold is static, so only the bold state
			// is carried from one call to the next. The layout of the
			// original declaration suggests the other two may have been meant
			// to be static as well - confirm before changing. The static is
			// also shared by every parser object, not kept per document.
			static $in_bold = false;
			$in_italics = false;
			$in_cdata = false;

			$this->bold_on = $in_bold;			// restore state of bold
			$this->italics_on = $in_italics;	// restore state of italics
			$this->line = "";

			$size = strlen( $this->html );

			while( $this->posi < $this->length ){

				if( $this->html[$this->posi] == "<" ){ // a tag ?
					switch ( $this->GetTag() )
					{
						case PARA_OPEN:
							$this->line .= "<p>";
							break;

						case PARA_CLOSE:
							if( $in_bold ){ // unclosed bold
								$this->line .= "</b>";
							}
							if( $in_italics ){ // unclosed italics
								$this->line .= "</em>";
							}
							$this->line .= "</p>";
							return 1;

						case BOLD_OPEN:
							$in_bold = true;
							$this->bold_on = true; // turn bold on
							break;

						case BOLD_CLOSE:
							$in_bold = false;
							if( !$in_cdata ) { // the flag is cleared ONLY when we are not in the middle of the line
								$this->bold_on = false;
							}
							else{
								$this->line .= "</b>";
							}
							break;

						case ITAL_OPEN:
							$in_italics = true;
							break;

						case ITAL_CLOSE:
							$in_italics = false;
							if( !$in_cdata ) { // the flag is cleared ONLY when we are not in the middle of the line
								$this->italics_on = false;
							}
							else{
								$this->line .= "</em>";
							}
							break;
					}
				}
				else{ // not a tag
					if( $in_bold ){
						$this->line .= "<b>";
					}
					if( $in_italics ){
						$this->line .= "<em>";
					}
					// copy the text run; bounded by the end of the string so
					// that trailing text with no following tag cannot loop
					// forever
					while( $this->posi < $size && $this->html[$this->posi] != "<" ){
						$this->line .= $this->html[$this->posi++];
					}
					$in_cdata = true; // mark that we are in Cdata
					// this is in case the bold state changes mid para
				}
			}

			return false;
		}

		/**
		 * Classify the tag that starts at $this->posi and move the cursor to
		 * just past its closing ">" (or to the end of the input when the tag
		 * is never closed).
		 *
		 * Only the first letter of the tag name is examined.
		 *
		 * @return int  one of PARA_OPEN, PARA_CLOSE, BOLD_OPEN, BOLD_CLOSE,
		 *              ITAL_OPEN, ITAL_CLOSE, UNKNOWN_OPEN, UNKNOWN_CLOSE
		 */
		function GetTag()
		{
			$start = $this->posi;
			$closing = ( substr( $this->html, $start + 1, 1 ) == "/" );
			$letter = strtolower( substr( $this->html, $start + ( $closing ? 2 : 1 ), 1 ) );

			// move past the ">"
			$end = strpos( $this->html, ">", $start );
			$this->posi = ( $end === false ) ? strlen( $this->html ) : $end + 1;

			switch( $letter )
			{
				case "p":	// paragraph
					return $closing ? PARA_CLOSE : PARA_OPEN;

				case "b":	// bold
					return $closing ? BOLD_CLOSE : BOLD_OPEN;

				case "i":	// italics
					return $closing ? ITAL_CLOSE : ITAL_OPEN;
			}

			// unknown tag
			return $closing ? UNKNOWN_CLOSE : UNKNOWN_OPEN;
		}
	}


	class RNZ_Doc_Formatter extends PBI_Parser
	{
	/*
	This works as follows:
	0. It cleans up the HTML:
	a. removes blank lines
	b. removes new lines
	c. Removes "National Radio Programme Listing"
	d. removes addtional white space
	e. Butts tags
	1. It strips all tags except b and p
	2. It then extracts whole paragraphs pushing these onto an array
	3. It remembers the current state of bold formatting
	4. paragraphs are formated as <h3> if bold is on and <p> if not.
	5. Day Titles are formated as <h2>.
	6. RR is formatted <span class="rr">RR</span>
	7. Text in ( ) are formated <em>( )</em>
	8. Emails are turned into mailto:

	CSS is used to style these as required.

	This comes about because Word produces HTML that is not well formed
	and XHTML MUST be well formed.
	e.g. <B><P ALIGN=JUSTIFY>6.30 Sports Story RR </B>(RNZ)</P>

	Assumptions:
	1. This class assumes that lines are are bold are programmes
	and that non-bold lines are programme info.
	2. There are headings for each day in the format : /day_name \d{1,2} \w+ \d{4}/
	3. That items in brackets are extra info:
	e.g. (RNZ), (BBC), (Ep 2 of 5, BBC).
	and that these are not bold in weight even if they are on a bold line.
	4. The sub-string "RR" is spread through the document to indicate Replay Radio

	Using HTML tidy would produce the same result but with much higher overhead.

	*/


	var $daysel= "
	<div class=\"iedaysel\">
	<ul>
	<li><a href=\"#sat\">Saturday</a></li>
	<li><a href=\"#sun\">Sunday</a></li>
	<li><a href=\"#mon\">Monday</a></li>
	<li><a href=\"#tue\">Tuesday</a></li>
	<li><a href=\"#wed\">Wednesday</a></li>
	<li><a href=\"#thu\">Thursday</a></li>
	<li><a href=\"#fri\">Friday</a></li>
	</ul>
	</div>";

	var $xhtml;

	function RNZ_Doc_Formatter()
	{
	parent::PBI_Parser();
	$this->xhtml = array();
	$this->summary = array();
	}


	/*
	Convert one HTML document into XHTML lines, appended to $this->xhtml
	(fetch the result with GetXHTML()).

	$html  the HTML to convert
	$mode  document type: WHATSON, HIGHLIGHTS, CHART, PRESS, PLAIN or
	       MUSICFEATURES. Any other value produces no output.

	Returns 0, without touching anything, when $html contains no "<" at all
	(it is then taken to be plain text); returns 1 in every other case.
	*/
	function Process( $html='', $mode=WHATSON )
	{
		// no tags at all means plain text: nothing to format
		if ( stristr( $html, "<" ) === FALSE ){
			return(0);
		}

		$this->SetHTML( $html );

		// each type of document gets its own formatting
		switch ( $mode )
		{
			case WHATSON: // WHATS ON DOCUMENT
				$this->CleanUpHTML( "<p><b>" );

				while ( $this->GetParagraph() ) {
					$this->AddEntities();
					$this->WO_AddTags();

					if ( $this->WO_AddAnchors() ) {
						// day heading: WO_AddAnchors() has already marked the line up,
						// so the bold state is ignored
						$this->xhtml[] = $this->line;
					}
					else if ( $this->para_type == BOLD_ALL ) {
						$this->xhtml[] = "<h3>" . $this->line . "</h3>";
					}
					else {
						$this->xhtml[] = "<p>" . $this->line . "</p>";
					}
				}
				break;

			case HIGHLIGHTS: // HIGHLIGHTS DOCUMENT
				$this->CleanUpHTML( "<p><b><i>" );

				while ( $this->GetParagraph() ) {
					$this->AddEntities();

					if ( $this->HI_AddAnchors() ) {
						// day title: drop any inline tags and make it a heading
						$this->line = "<h2>" . strip_tags( $this->line ) . "</h2>";
						$this->xhtml[] = $this->line;
					}
					else if ( $this->para_type == BOLD_ALL ) {
						$this->xhtml[] = "<p class=\"hlts\">" . $this->line . "</p>";
					}
					else {
						$this->xhtml[] = "<p>" . $this->line . "</p>";
					}
				}
				break;

			case CHART: // concert FM classical chart
				// instantiate class
				parent::TABLE_Parser();

				$this->xhtml[] = "<table class=\"chart\">";
				$this->xhtml[] = "<tr><th>This<br />Week</th><th>Last<br />Week</th><th>Title</th><th>Artist</th><th>CD #</th></tr>";

				// one table row per line of GetTable() output
				foreach ( explode( "\n", $this->GetTable() ) as $row ) {
					$this->line = $row;
					$cells = '';

					// AddEntities() works on $this->line, so each cell passes through it
					foreach ( explode( "<td>", $row ) as $cell ) {
						$this->line = strip_tags( $cell );
						$this->AddEntities();
						$cells .= "<td>" . $this->line . "</td>";
					}

					// the first cell of every row is given the "pos" class
					$this->xhtml[] = str_replace( "<tr><td>", "<tr><td class=\"pos\">", "<tr>" . $cells . "</tr>" );
				}

				$this->xhtml[] = "</table>";
				break;

			case PRESS: // PRESS RELEASES
			case PLAIN: // PLAIN
				// both types are formatted the same way: every paragraph becomes a <p>
				$this->CleanUpHTML( "<p><b><i>" );

				while ( $this->GetParagraph() ) {
					$this->AddEntities();
					$this->xhtml[] = "<p>" . $this->line . "</p>";
				}
				break;

			case MUSICFEATURES: // MUSIC FEATURES
				$this->CleanUpHTML( "<p><b>" );

				while ( $this->GetParagraph() ) {
					$this->AddEntities();
					$this->WO_AddTags();

					if ( $this->Music_AddAnchors() ) {
						// day title: drop any inline tags and make it a heading
						$this->line = "<h2>" . strip_tags( $this->line ) . "</h2>";
						$this->xhtml[] = $this->line;
					}
					else if ( $this->para_type == BOLD_ALL ) {
						$this->xhtml[] = "<p class=\"hlts\">" . $this->line . "</p>";
					}
					else {
						$this->xhtml[] = "<p>" . $this->line . "</p>";
					}
				}
				break;
		}

		return(1);
	}

	// looks for the first occurrence of a date formatted thus:
	/*
	saturday # dayname
	\s # 1 space (gap)
	\d{1,2} # 1 or 2 digits (date)
	\s # 1 space (gap)
	\w{3,9} # 3 to 9 characters (month)
	\s # 1 space (gap)
	\d{4} # 4 digits (year)
	*/

	// this will get all of the lines that are like:
	// saturday 14 March 2002
	// but ignore lines that are close.
	// Turns a line containing "<dayname> <d> <month> <yyyy>" into an <h2> day heading.
	// For Sunday to Friday the heading is preceded by a named anchor (id is the
	// three letter day name) and the day selector list.
	// Only the first heading seen for each day is converted.
	//
	// Returns 1 when $this->line was rewritten as a heading, 0 otherwise
	// (including a repeat of a day that has already been handled).
	function WO_AddAnchors( )
	{
		// Days already handled. Duplicate ID anchors are NOT allowed in XHTML;
		// documents are unlikely to contain two but belt and braces required anyway.
		// Being static, the record survives from one call to the next.
		static $seen = array();

		// anchor id => day name, tested in this order
		$days = array(
			'sat' => 'saturday',
			'sun' => 'sunday',
			'mon' => 'monday',
			'tue' => 'tuesday',
			'wed' => 'wednesday',
			'thu' => 'thursday',
			'fri' => 'friday'
		);

		foreach ( $days as $id => $day ) {
			if ( !preg_match( '/' . $day . ' \d{1,2} \w{3,9} \d{4}/i', $this->line ) ) {
				continue;
			}

			// Only the first day name that matches is considered for a line:
			// a repeat is left alone and reported as "not a heading".
			if ( isset( $seen[$id] ) ) {
				return 0;
			}
			$seen[$id] = 1;

			$heading = "<h2>" . strip_tags( $this->line ) . "</h2>";

			if ( $id === 'sat' ) {
				// NOTE(review): Saturday gets no anchor and no day selector although
				// $daysel links to #sat; presumably that target is supplied elsewhere
				// on the page. Confirm before changing.
				$this->line = $heading;
			}
			else {
				$this->line = "<a id=\"" . $id . "\" name=\"" . $id . "\"></a>\n" . $this->daysel . $heading;
			}
			return 1;
		}

		return 0;
	}

	// Tests whether $this->line contains a day title of the form
	// "<dayname> <1-2 digit date> <3-9 character month>" (any letter case, no year
	// required). Used by Process() for highlights documents.
	// Returns 1 on a match and 0 otherwise; $this->line is not modified.
	function HI_AddAnchors( )
	{
		$day_names = array( 'saturday', 'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday' );

		foreach ( $day_names as $day ) {
			if ( preg_match( '/' . $day . ' \d{1,2} \w{3,9}/i', $this->line ) ) {
				return 1; // days of the week title
			}
		}

		return 0;
	}

	// Tests whether $this->line contains a day title of the form
	// "<dayname> <1-2 digit date> <3-9 character month> <4 digit year>" (any letter
	// case). Used by Process() for music features documents.
	// Returns 1 on a match and 0 otherwise; $this->line is not modified.
	function Music_AddAnchors( )
	{
		$day_names = array( 'saturday', 'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday' );

		foreach ( $day_names as $day ) {
			if ( preg_match( '/' . $day . ' \d{1,2} \w{3,9} \d{4}/i', $this->line ) ) {
				return 1; // days of the week title
			}
		}

		return 0;
	}

	// Adds inline markup to $this->line, applying these rewrites in order:
	//   " RR"               -> " <span class="rr">RR</span>"  (replay radio)
	//   "(text)"            -> "<em>(text)</em>"
	//   "h.mm " or "hh.mm " -> "<strong>time</strong> "
	//   email address       -> mailto: link
	function WO_AddTags()
	{
		$search = array ( "/ RR/", // replay radio
			// things in brackets: word characters, whitespace, "." and "," (the class
			// also lets a literal "|" through)
			"/\(([\w\|\s\|\.\|\,]+)\)/",
			"/(\d{1,2}\.\d{2}) /", // times with a space after (not in a list)
			// email: the local part and each domain label are character classes.
			// The previous pattern used groups with escaped pipes, which match a
			// literal "|" and therefore never matched a real address.
			"/([\w\-.]+@(?:[\w\-]+\.)+[a-zA-Z]{2,})/" );

		$replace = array ( " <span class=\"rr\">RR</span>",
			"<em>(\\1)</em>",
			"<strong>\\1</strong> ",
			"<a href=\"mailto:\\1\">\\1</a>" );

		$this->line = preg_replace( $search, $replace, $this->line );
	}

	// Replaces quote, ellipsis and dash characters in $this->line with character
	// references, so the output does not depend on the page's character encoding.
	// Process() calls this before WO_AddTags(), so the quotes in the markup added
	// there are not affected.
	function AddEntities()
	{
		// NOTE(review): the accented characters below look like Word's curly quotes,
		// ellipsis and dash seen through a different character set. Confirm against
		// real input before changing them.
		$search = array
		(
		"/ë/",
		"/'/",
		"/í/",
		"/\"/",
		"/Ö/",
		"/ ñ /"
		);

		// One entry per pattern above. Written as character references: raw
		// characters here (and a bare double quote for the fourth entry) would not
		// even parse.
		$replace = array
		(
		"&#8217;",   // right single quotation mark
		"&#8217;",
		"&#8217;",
		"&quot;",
		"&#8230;",   // horizontal ellipsis
		" &#8212; "  // em dash, keeping the surrounding spaces
		);

		$this->line = preg_replace( $search, $replace, $this->line );
	}

	// Returns everything collected in $this->xhtml as one string, with a line
	// break after every entry. An empty list gives an empty string.
	function GetXHTML()
	{
		if ( empty( $this->xhtml ) ) {
			return "";
		}

		return implode( "\n", $this->xhtml ) . "\n";
	}

	// Returns the entries of $this->summary as one string, each wrapped as
	// "<p>entry\n</p>". An empty list gives an empty string.
	function GetSummary()
	{
		if ( empty( $this->summary ) ) {
			return "";
		}

		// the glue closes one paragraph and opens the next
		return "<p>" . implode( "\n</p><p>", $this->summary ) . "\n</p>";
	}

	}
	?>