Version 1 of the parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// name richtextparser.inc
// This code is licensed under the GPL version 2.
// It is an example from the blog series "rebuilding radionz.co.nz"
// http://richardhulse.blogspot.com/
/* VERSION HISTORY
0.1  first version. Parses only W97 NAT sched
0.2  added formatting options
0.3  tweaked to allow parsing of W97 CFM sched
0.4  branch control to split sched / highlight parsing
0.5  new date regex for highlights
0.6  10 April 2003 RLH
     added w2000 parsing of files
0.7  13 April 2003 RLH
     separated parser and formatter into their own classes.
     Parser now returns the type of formatting to be used in the line,
     e.g. NONE, BOLD_ALL.
     This could be refined if required.
     BOLD_ALL just means the line starts in BOLD, which is good enough for our purposes.
     The formatter class decides what to do with the line based on that.
0.8  23 April 2003 RLH
     Added section CHART to parse CFM classical chart.
     Strips the whole table back to TR and TD tags with no attributes.
     Mercifully this is well formed in W97.
     Each line of the table is extracted.
0.9  29 April 2003 RLH
     Added cleaner for press releases.
0.91 26 May 2003 RLH
     Changed day selector code
0.92 08 July 2003 RLH
     Added musicfeatures
0.93 12 August 2003 RLH
     changed table parser to replace entities
     added "/ë/" to entity list
0.94 28 November 2003 RLH
     Added tighter regex test for dates in Music Features docs
---------------------------------
NB: W97 docs are not well formed:
    e.g. <b><p></b></p>
    (The parser should fix this)
W2000 docs ARE well formed but contain a heap of junk.
*/
// Document types: compared against the $mode argument of RNZ_Doc_Formatter::Process()
define('WHATSON',       1);
define('HIGHLIGHTS',    2);
define('CHART',         3);
define('PRESS',         4);
define('MUSICFEATURES', 5);
define('PLAIN',         6);

// Tag codes used by the tag scanner (PBI_Parser::GetTag / NextParagraph)
define('PARA_OPEN',     1);
define('PARA_CLOSE',    2);
define('BOLD_OPEN',     3);
define('BOLD_CLOSE',    4);
define('ITAL_OPEN',     5);
define('ITAL_CLOSE',    6);
define('UNKNOWN_OPEN',  10);
define('UNKNOWN_CLOSE', 11);

// Line types stored in $para_type after a paragraph has been extracted
define('NONE',      1);
define('BOLD_ALL',  2);
define('BOLD_PART', 3);
define('ITAL_ALL',  4);
define('ITAL_PART', 5);
/*
This class extension arrangement is a bodge instead of polymorphism.
Think of it thus:

            Parser_BASE
                 |
        -------------------
        |                 |
   PBI_Parser       TABLE_Parser
        |                 |
        -------------------
                 |
         RNZ_Doc_Formatter

(PHP has no multiple inheritance, so in the code below the chain is actually
linear: Parser_BASE -> TABLE_Parser -> PBI_Parser -> RNZ_Doc_Formatter.)
*/
/*
Parser_BASE

Holds the HTML being processed, a cursor into it, and the line most recently
extracted. Provides the generic Word-HTML clean up shared by the paragraph
and table parsers.
*/
class Parser_BASE
{
    var $html;          // HTML text being processed
    var $posi;          // cursor: byte offset into $html
    var $length;        // index of the LAST byte of $html (strlen - 1), not the byte count
    var $line;          // line / paragraph currently being built
    var $para_type;     // line type constant (NONE, BOLD_ALL, ...)
    var $bold_on = 0;
    var $tagged_line;

    /*
    Constructor. Old-style (class-named) constructors are no longer treated
    as constructors from PHP 8.0, so the initialisation lives here.
    */
    function __construct()
    {
        $this->html      = "";
        $this->posi      = 0;
        $this->length    = 0;
        $this->line      = "";
        $this->para_type = NONE;
    }

    /*
    Legacy constructor name. Subclasses call this explicitly as
    parent::Parser_BASE(), so it must remain callable. self:: is used (not
    $this->) so that a subclass __construct can never be re-entered from here.
    */
    function Parser_BASE()
    {
        self::__construct();
    }

    /*
    Load a new document. The cursor is rewound so that an instance can be
    reused for a second document.
    */
    function SetHTML ( $html )
    {
        $this->html   = $html;
        $this->posi   = 0;
        // -1 because of 0 offset: $length is the index of the last byte
        $this->length = strlen( $this->html ) - 1;
    }

    /*
    Normalise W97 / W2000 HTML in $this->html down to bare tags.

    $tags : list of tags to keep, in strip_tags() format. Everything else
            is removed.

    The order in which the rules are applied is highly critical; each rule
    array is applied front to back by preg_replace().
    */
    function CleanUpHTML( $tags="<p><b><i>" )
    {
        // --- stage 1: normalise before stripping ---------------------
        $pre_search = array
        (
            "/[\n\r]/",         // new lines (no "|" in the class: pipes in the text are kept)
            "/<\![^>]*>/",      // w2000 comments
            "/\s+/",            // extra whitespace
            "/<h[1-6]>/i",      // convert headings (h1-h6 only, so <hr> is left alone)
            "/<\/h[1-6]>/i"     // + closing tags
        );
        $pre_replace = array
        (
            " ",
            " ",
            " ",
            "<p>",
            "</p>"
        );
        $this->html = preg_replace( $pre_search, $pre_replace, $this->html );

        // --- stage 2: drop every tag we do not want (incl. w2000 rubbish)
        $this->html = strip_tags( $this->html, $tags );

        // --- stage 3: general clean up --------------------------------
        // The tag clean up is for w2000: well formed, but tags are
        // repeated too much.
        //
        // "Blank" means a single space, a literal &nbsp; entity, or a raw
        // non-breaking space byte (UTF-8 or Latin-1). A plain space alone
        // can never reach these rules because the "butt tags" rule before
        // them has already removed whitespace between tags.
        $blank = "(?:&nbsp;|\xC2\xA0|\xA0| )";
        $search = array
        (
            "/National Radio Programme Listing/i",          // listing text
            "/Details available on www\.radionz\.co\.nz/i",
            "/<([pbi]) [^>]*>/i",                           // remove attributes from tags
            "/>\s+</",                                      // butt tags
            "/<p>" . $blank . "<\/p>/i",                    // blank paras
            "/<b>" . $blank . "<\/b>/i",                    // blank bold
            "/<i>" . $blank . "<\/i>/i",                    // blank italics
            "/<\/b><b>/i",                                  // clean up bold tags
            "/<b><\/b>/i",                                  // empty bold tags
            "/<\/i><i>/i",                                  // same for italics
            "/<i><\/i>/i",                                  // empty italic tags
            "/<p><\/p>/i",                                  // empty paras
            "/[\n\r]/"                                      // any remaining new lines
        );
        $replace = array
        (
            "",
            "",
            "<\\1>",
            "><",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            ""
        );
        $this->html = preg_replace( $search, $replace, $this->html );

        // -1 because of 0 offset: $length is the index of the last byte
        $this->length = strlen( $this->html ) - 1;
    }
}
/*
TABLE_Parser

Reduces the HTML held in $this->html to a bare table: one row per line,
cells as attribute-free <td> elements.
*/
class TABLE_Parser extends Parser_BASE
{
    // Runs the table clean up on whatever $this->html currently holds.
    function TABLE_Parser()
    {
        $this->CleanTable();
    }

    /*
    Strip the document back to <tr>/<td>, then flatten the rows:
      - the first cell of each row gets class="pos"
      - adjacent rows are separated by a newline
      - the remaining <tr> / </tr> tags are dropped
    */
    function CleanTable( )
    {
        // generic Word clean up, keeping table cells, rows and paragraphs
        $this->CleanUpHTML( "<td><tr><p>" );

        // table specific: drop attributes and normalise tag case
        $patterns = array
        (
            "/<td [^>]*>/i",
            "/<tr [^>]*>/i",
            "/<td>/i",
            "/<tr>/i",
            "/<\/td>/i",
            "/<\/tr>/i"
        );
        $bare_tags = array
        (
            "<td>",
            "<tr>",
            "<td>",
            "<tr>",
            "</td>",
            "</tr>"
        );
        $table = preg_replace( $patterns, $bare_tags, $this->html );

        // remove any errant P tags
        $table = strip_tags( $table, "<tr><td>" );

        // str_replace() applies array pairs in order, one after the other
        $table = str_replace(
            array( "<tr><td>",               "</tr><tr>", "<tr>", "</tr>" ),
            array( "<tr><td class=\"pos\">", "\n",        "",     ""      ),
            $table
        );

        $this->html = $table;
    }

    // Returns the cleaned table text produced by CleanTable().
    function GetTable()
    {
        return $this->html;
    }
}
/*
PBI_Parser  (Paragraph / Bold / Italic)

Walks $this->html one paragraph at a time, repairing the badly nested
bold / italic tags that Word 97 produces, e.g. <b><p>text</b></p>.
*/
class PBI_Parser extends TABLE_Parser
{
    var $bold_on;
    var $italics_on;

    /*
    Constructor. Deliberately calls the Parser_BASE initialiser rather than
    the TABLE_Parser one, which would run CleanTable().
    */
    function __construct()
    {
        parent::Parser_BASE();
        // these two vars are used to remember the state across many lines
        // for poorly formed word 97 html
        $this->bold_on    = false;
        $this->italics_on = false;
    }

    // Legacy constructor name, kept because subclasses call parent::PBI_Parser().
    function PBI_Parser()
    {
        self::__construct();
    }

    /*
    Fetch the next paragraph into $this->line and classify it.

    Sets $this->para_type to BOLD_ALL when the paragraph starts in bold,
    otherwise NONE, then strips every tag except <em> from the line.

    Returns true when a paragraph was read, false at end of document.
    */
    function GetParagraph()
    {
        // NextParagraph delivers a (sufficiently) well formed paragraph
        if( !$this->NextParagraph() ){
            return false;
        }
        $this->para_type = preg_match( "/<p><b>/", $this->line ) ? BOLD_ALL : NONE;
        // keep italics
        $this->line = strip_tags( $this->line, "<em>" );
        return true;
    }

    /*
    Read the next paragraph and tidy it so that it is well formed - well,
    well enough for our purposes.

    The paragraph is left in $this->line with bold as <b> and italics as
    <em>. Returns 1 when a closing </p> was reached, false when the document
    ran out first; a trailing fragment with no </p> is discarded.

    NOTE: the bold state is a function-static, so an unclosed bold carries
    over into the next paragraph (this is what repairs <b><p>..</b></p>).
    It is also shared by every call in the same request.
    */
    function NextParagraph()
    {
        static $in_bold = false;    // remembered between calls
        $in_italics = false;
        $in_cdata   = false;        // true once text has been seen in this paragraph

        $this->bold_on    = $in_bold;       // restore state of bold
        $this->italics_on = $in_italics;    // restore state of italics
        $this->line       = "";

        $end  = strlen( $this->html );
        // never trust a stale $length beyond the real end of the string
        $last = min( $this->length, $end - 1 );

        while( $this->posi < $last ){
            if( $this->html[$this->posi] != "<" ){
                // --- text ---
                if( $in_bold ){
                    $this->line .= "<b>";
                }
                if( $in_italics ){
                    $this->line .= "<em>";
                }
                // copy up to the next tag, or to the end of the document
                while( $this->posi < $end && $this->html[$this->posi] != "<" ){
                    $this->line .= $this->html[$this->posi];
                    $this->posi++;
                }
                // mark that we are in text, in case the bold state changes mid para
                $in_cdata = true;
            }
            else{
                // --- tag --- (unknown tags are consumed and ignored)
                switch( $this->GetTag() )
                {
                    case PARA_OPEN:
                        $this->line .= "<p>";
                        break;
                    case PARA_CLOSE:
                        if( $in_bold ){         // unclosed bold
                            $this->line .= "</b>";
                        }
                        if( $in_italics ){      // unclosed italics
                            $this->line .= "</em>";
                        }
                        $this->line .= "</p>";
                        return 1;               // line finished
                    case BOLD_OPEN:
                        $in_bold = true;
                        $this->bold_on = true;
                        break;
                    case BOLD_CLOSE:
                        $in_bold = false;
                        // bold is only switched off when we are not in the middle of the line
                        if( $in_cdata ){
                            $this->line .= "</b>";
                        }
                        else{
                            $this->bold_on = false;
                        }
                        break;
                    case ITAL_OPEN:
                        $in_italics = true;
                        break;
                    case ITAL_CLOSE:
                        $in_italics = false;
                        // same rule as for bold
                        if( $in_cdata ){
                            $this->line .= "</em>";
                        }
                        else{
                            $this->italics_on = false;
                        }
                        break;
                }
            }
        }
        return false;
    }

    /*
    Consume the tag that starts at $this->posi (which must be on a "<") and
    leave the cursor just past its closing ">".

    The tag is identified by the first letter of its name only (p, b or i);
    this relies on CleanUpHTML() having already removed every other tag.

    Returns one of PARA_OPEN, PARA_CLOSE, BOLD_OPEN, BOLD_CLOSE, ITAL_OPEN,
    ITAL_CLOSE, UNKNOWN_OPEN, UNKNOWN_CLOSE.
    */
    function GetTag()
    {
        $end = strlen( $this->html );

        $this->posi++;      // step off the "<"
        $closing = ( $this->posi < $end && $this->html[$this->posi] == "/" );
        if( $closing ){
            $this->posi++;  // step off the "/"
        }
        $name = ( $this->posi < $end ) ? strtolower( $this->html[$this->posi] ) : "";

        // skip the rest of the tag; bounded so a truncated tag cannot run off the end
        while( $this->posi < $end && $this->html[$this->posi] != ">" ){
            $this->posi++;
        }
        $this->posi++;      // move past >

        switch( $name )
        {
            case "p":   // paragraph
                return $closing ? PARA_CLOSE : PARA_OPEN;
            case "b":   // bold
                return $closing ? BOLD_CLOSE : BOLD_OPEN;
            case "i":   // italics
                return $closing ? ITAL_CLOSE : ITAL_OPEN;
            default:    // unknown tag
                return $closing ? UNKNOWN_CLOSE : UNKNOWN_OPEN;
        }
    }
}
class RNZ_Doc_Formatter extends PBI_Parser
{
    /*
    This works as follows:
    0. It cleans up the HTML:
        a. removes blank lines
        b. removes new lines
        c. removes "National Radio Programme Listing"
        d. removes additional white space
        e. butts tags
    1. It strips all tags except b and p
    2. It then extracts whole paragraphs, pushing these onto an array
    3. It remembers the current state of bold formatting
    4. Paragraphs are formatted as <h3> if bold is on and <p> if not.
    5. Day titles are formatted as <h2>.
    6. RR is formatted <span class="rr">RR</span>
    7. Text in ( ) is formatted <em>( )</em>
    8. Emails are turned into mailto: links
    CSS is used to style these as required.

    This comes about because Word produces HTML that is not well formed
    and XHTML MUST be well formed.
    e.g. <B><P ALIGN=JUSTIFY>6.30 Sports Story RR </B>(RNZ)</P>

    Assumptions:
    1. This class assumes that lines that are bold are programmes
       and that non-bold lines are programme info.
    2. There are headings for each day in the format: /day_name \d{1,2} \w+ \d{4}/
    3. That items in brackets are extra info:
       e.g. (RNZ), (BBC), (Ep 2 of 5, BBC),
       and that these are not bold in weight even if they are on a bold line.
    4. The sub-string "RR" is spread through the document to indicate Replay Radio

    Using HTML tidy would produce the same result but with much higher overhead.
    */

    // day selector inserted in front of every day heading except Saturday
    var $daysel= "
<div class=\"iedaysel\">
<ul>
<li><a href=\"#sat\">Saturday</a></li>
<li><a href=\"#sun\">Sunday</a></li>
<li><a href=\"#mon\">Monday</a></li>
<li><a href=\"#tue\">Tuesday</a></li>
<li><a href=\"#wed\">Wednesday</a></li>
<li><a href=\"#thu\">Thursday</a></li>
<li><a href=\"#fri\">Friday</a></li>
</ul>
</div>";
    var $xhtml;     // output lines, in document order
    var $summary;   // summary lines returned by GetSummary()

    /*
    Constructor. Old-style (class-named) constructors are no longer treated
    as constructors from PHP 8.0, so the initialisation lives here.
    */
    function __construct()
    {
        parent::PBI_Parser();
        $this->xhtml   = array();
        $this->summary = array();
    }

    // Legacy constructor name, kept so explicit calls keep working.
    function RNZ_Doc_Formatter()
    {
        self::__construct();
    }

    /*
    Convert a Word HTML document into XHTML lines (fetch with GetXHTML()).

    $html : the raw document
    $mode : WHATSON, HIGHLIGHTS, CHART, PRESS, PLAIN or MUSICFEATURES;
            each type of document gets its own formatting.
            Any other value produces no output.

    Returns 0 if $html contains no "<" at all (it is plain text and nothing
    is processed), otherwise 1. Output is appended to $this->xhtml.
    */
    function Process( $html='', $mode=WHATSON )
    {
        // if there are no tags then it is plain text
        if ( strpos( $html, "<" ) === false ){
            return(0);
        }
        $this->SetHTML( $html );

        switch( $mode )
        {
            case WHATSON:
                $this->_ProcessWhatsOn();
                break;
            case HIGHLIGHTS:
                $this->_ProcessHighlights();
                break;
            case CHART:
                $this->_ProcessChart();
                break;
            case PRESS:     // press releases and plain documents
            case PLAIN:     // are formatted identically
                $this->_ProcessParagraphs();
                break;
            case MUSICFEATURES:
                $this->_ProcessMusicFeatures();
                break;
        }
        return(1);
    }

    // WHATS ON: bold lines become <h3>, day titles become anchored <h2>.
    function _ProcessWhatsOn()
    {
        $this->CleanUpHTML( "<p><b>" );
        while ( $this->GetParagraph() ) {
            $this->AddEntities();
            $this->WO_AddTags();
            if ( $this->WO_AddAnchors() ){
                // day title: WO_AddAnchors has already wrapped the line
                array_push( $this->xhtml, $this->line );
            }
            else {
                $this->_PushBodyLine( "<h3>", "</h3>" );
            }
        }
    }

    // HIGHLIGHTS: bold lines become <p class="hlts">, day titles become <h2>.
    function _ProcessHighlights()
    {
        $this->CleanUpHTML( "<p><b><i>" );
        while ( $this->GetParagraph() ) {
            $this->AddEntities();
            if ( $this->HI_AddAnchors() ){
                $this->_PushDayHeading();
            }
            else {
                $this->_PushBodyLine( "<p class=\"hlts\">", "</p>" );
            }
        }
    }

    // MUSIC FEATURES: as highlights, plus the What's On inline tagging.
    function _ProcessMusicFeatures()
    {
        $this->CleanUpHTML( "<p><b>" );
        while ( $this->GetParagraph() ) {
            $this->AddEntities();
            $this->WO_AddTags();
            if ( $this->Music_AddAnchors() ){
                $this->_PushDayHeading();
            }
            else {
                $this->_PushBodyLine( "<p class=\"hlts\">", "</p>" );
            }
        }
    }

    // PRESS / PLAIN: every paragraph becomes a plain <p>.
    function _ProcessParagraphs()
    {
        $this->CleanUpHTML( "<p><b><i>" );
        while ( $this->GetParagraph() ) {
            $this->AddEntities();
            array_push( $this->xhtml, "<p>".$this->line."</p>" );
        }
    }

    // CHART: Concert FM classical chart, rebuilt as a clean table.
    function _ProcessChart()
    {
        // reduce the document to one row per line (see TABLE_Parser)
        $this->CleanTable();

        array_push( $this->xhtml, "<table class=\"chart\">" );
        array_push( $this->xhtml, "<tr><th>This<br />Week</th><th>Last<br />Week</th><th>Title</th><th>Artist</th><th>CD #</th></tr>" );

        foreach( explode( "\n", $this->GetTable() ) as $row ){
            $cells = '';
            foreach( explode( "<td>", $row ) as $field ){
                // AddEntities works on $this->line
                $this->line = strip_tags( $field );
                $this->AddEntities();
                $cells .= "<td>".$this->line."</td>";
            }
            // the first cell of each row is the chart position
            array_push( $this->xhtml, str_replace( "<tr><td>", "<tr><td class=\"pos\">", "<tr>".$cells."</tr>" ) );
        }
        array_push( $this->xhtml, "</table>" );
    }

    // Store the current line as a day title: all tags removed, wrapped in <h2>.
    function _PushDayHeading()
    {
        $this->line = "<h2>".strip_tags( $this->line )."</h2>";
        array_push( $this->xhtml, $this->line );
    }

    // Store the current line: in $open/$close if it started bold, else in <p>.
    function _PushBodyLine( $open, $close )
    {
        if ( $this->para_type == BOLD_ALL ){
            array_push( $this->xhtml, $open.$this->line.$close );
        }
        else {
            array_push( $this->xhtml, "<p>".$this->line."</p>" );
        }
    }

    // looks for the first occurrence of a date formatted thus:
    /*
    saturday    # dayname
    \s          # 1 space (gap)
    \d{1,2}     # 1 or 2 digits (date)
    \s          # 1 space (gap)
    \w{3,9}     # 3 to 9 characters (month)
    \s          # 1 space (gap)
    \d{4}       # 4 digits (year)
    */
    // this will get all of the lines that are like:
    //     saturday 14 March 2002
    // but ignore lines that are close.
    /*
    If the current line is a day title seen for the first time, rewrite it
    as an <h2> (preceded by a named anchor and the day selector for every
    day except Saturday) and return 1. Otherwise leave the line alone and
    return 0.
    */
    function WO_AddAnchors( )
    {
        // marks if item found to avoid duplicate ID anchors, which are NOT allowed in XHTML.
        // documents are unlikely to contain two but belt and braces required anyway.
        // NOTE: static, so the markers persist for the rest of the request.
        static $seen = array();

        $days = array
        (
            "saturday"  => "sat",
            "sunday"    => "sun",
            "monday"    => "mon",
            "tuesday"   => "tue",
            "wednesday" => "wed",
            "thursday"  => "thu",
            "friday"    => "fri"
        );

        foreach( $days as $day => $id ){
            if ( !preg_match( "/".$day." \\d{1,2} \\w{3,9} \\d{4}/i", $this->line ) ){
                continue;
            }
            if ( !empty( $seen[$id] ) ){
                return 0;       // only do the first one
            }
            $heading = "<h2>".strip_tags( $this->line )."</h2>";
            if ( $id != "sat" ){
                // Saturday gets no anchor or selector of its own
                $heading = "<a id=\"".$id."\" name=\"".$id."\"></a>\n" . $this->daysel . $heading;
            }
            $this->line = $heading;
            $seen[$id] = 1;
            return 1;
        }
        return 0;
    }

    // Returns 1 if the line contains "<dayname> <date> <month>", else 0.
    function HI_AddAnchors( )
    {
        $day_title = "/(?:satur|sun|mon|tues|wednes|thurs|fri)day \\d{1,2} \\w{3,9}/i";
        return preg_match( $day_title, $this->line ) ? 1 : 0;
    }

    // Returns 1 if the line contains "<dayname> <date> <month> <year>", else 0.
    function Music_AddAnchors( )
    {
        $day_title = "/(?:satur|sun|mon|tues|wednes|thurs|fri)day \\d{1,2} \\w{3,9} \\d{4}/i";
        return preg_match( $day_title, $this->line ) ? 1 : 0;
    }

    // Mark up Replay Radio flags, bracketed info, times and email addresses in the line.
    function WO_AddTags()
    {
        $search = array
        (
            "/ RR/",                                            // replay radio
            "/\(([\w|\s|\.|\,]+)\)/",                           // things in brackets
            "/(\d{1,2}\.\d{2}) /",                              // times with a space after (not in a list)
            "/((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/"  // email
        );
        $replace = array
        (
            " <span class=\"rr\">RR</span>",
            "<em>(\\1)</em>",
            "<strong>\\1</strong> ",
            "<a href=\"mailto:\\1\">\\1</a>"
        );
        $this->line = preg_replace( $search, $replace, $this->line );
    }

    /*
    Replace quote and punctuation characters in the line with entities.

    The non-ASCII search characters are matched byte for byte, so they only
    work if this file is saved in the same encoding as the incoming
    documents - TODO confirm.
    The replacements are written as entities (numeric, plus &quot;) so that
    the output does not depend on the page encoding.
    */
    function AddEntities()
    {
        $search = array
        (
            "/ë/",
            "/'/",
            "/í/",
            "/\"/",
            "/Ö/",
            "/ ñ /"
        );
        $replace = array
        (
            "&#8217;",      // right single quote
            "&#8217;",
            "&#8217;",
            "&quot;",
            "&#8230;",      // ellipsis
            " &#8212; "     // em dash
        );
        $this->line = preg_replace( $search, $replace, $this->line );
    }

    // function to return a string containing the XHTML with line breaks.
    function GetXHTML()
    {
        $out = "";
        foreach( $this->xhtml as $line ){
            $out .= $line."\n";
        }
        return $out;
    }

    // Returns the summary lines, each wrapped in its own <p>.
    function GetSummary()
    {
        $out = "";
        foreach( $this->summary as $line ){
            $out .= "<p>".$line."\n</p>";
        }
        return $out;
    }
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment