Parse($rtf); * //$reader->root->dump(); // to see what the reader read * $formatter = new RtfHtml(); * echo $formatter->Format($reader->root); */

/**
 * Base class of every node in the parsed RTF tree.
 */
class RtfElement
{
    /**
     * Echoes two spaces per nesting level; used by the dump() methods.
     *
     * @param int $level Nesting depth.
     */
    protected function Indent($level)
    {
        echo str_repeat(" ", max(0, (int) ($level * 2)));
    }
}

/**
 * A brace-delimited RTF group holding an ordered list of child elements.
 */
class RtfGroup extends RtfElement
{
    /** @var RtfGroup|null Enclosing group, or null when there is none. */
    public $parent;

    /** @var array Child elements in document order. */
    public $children;

    public function __construct()
    {
        $this->parent = null;
        $this->children = array();
    }

    /**
     * Returns the word of the leading control word, or null when the group
     * is empty or does not start with a control word.
     *
     * @return string|null
     */
    public function GetType()
    {
        if (count($this->children) === 0) {
            return null;
        }

        $first = $this->children[0];
        if (!($first instanceof RtfControlWord)) {
            return null;
        }

        return $first->word;
    }

    /**
     * Tells whether the group starts with the control symbol "\*".
     *
     * @return bool
     */
    public function IsDestination()
    {
        if (count($this->children) === 0) {
            return false;
        }

        $first = $this->children[0];
        if (!($first instanceof RtfControlSymbol)) {
            return false;
        }

        return $first->symbol == '*';
    }

    /**
     * Tells whether dump() and the HTML formatter skip this group: font
     * table, colour table, stylesheet, info, anything whose type starts
     * with "pict", and "\*" destinations.
     *
     * @return bool
     */
    public function IsIgnorable()
    {
        // Cast so a group without a type (null) is handled as "".
        $type = (string) $this->GetType();

        if (in_array($type, array("fonttbl", "colortbl", "stylesheet", "info"), true)) {
            return true;
        }

        // Skip any pictures:
        if (strncmp($type, "pict", 4) === 0) {
            return true;
        }

        return (bool) $this->IsDestination();
    }

    /**
     * Echoes a debug listing of this group and its non-ignorable children.
     *
     * @param int $level Nesting depth used for indentation.
     */
    public function dump($level = 0)
    {
        echo "\n";
        $this->Indent($level);
        echo "{";
        echo "\n";

        foreach ($this->children as $child) {
            if ($child instanceof RtfGroup && $child->IsIgnorable()) {
                continue;
            }
            $child->dump($level + 2);
        }

        echo "\n";
        $this->Indent($level);
        echo "}";
        echo "\n";
    }
}

/**
 * A control word such as \b or \fs24, with its numeric parameter.
 */
class RtfControlWord extends RtfElement
{
    /** @var string Letters of the control word, without the backslash. */
    public $word;

    /** @var int Numeric parameter; the reader stores 1 when none is given. */
    public $parameter;

    /**
     * Echoes a one-line debug description of this control word.
     *
     * @param int $level Nesting depth used for indentation.
     */
    public function dump($level)
    {
        echo "\n";
        $this->Indent($level);
        echo "WORD {$this->word} ({$this->parameter})";
        echo "\n";
    }
}

/**
 * A control symbol: a backslash followed by one non-letter character.
 */
class RtfControlSymbol extends RtfElement
{
    /** @var string The symbol character. */
    public $symbol;

    /** @var int For \' the decoded two-digit hex value, otherwise 0. */
    public $parameter = 0;

    /**
     * Echoes a one-line debug description of this control symbol.
     *
     * @param int $level Nesting depth used for indentation.
     */
    public function dump($level)
    {
        echo "\n";
        $this->Indent($level);
        echo "SYMBOL {$this->symbol} ({$this->parameter})";
        echo "\n";
    }
}

/**
 * A run of plain text.
 */
class RtfText extends RtfElement
{
    /** @var string The text with \\, \{ and \} already unescaped. */
    public $text;

    /**
     * Echoes a one-line debug description of this text run.
     *
     * @param int $level Nesting depth used for indentation.
     */
    public function dump($level)
    {
        echo "\n";
        $this->Indent($level);
        echo "TEXT {$this->text}";
        echo "\n";
    }
}

/**
 * Parses an RTF string into a tree of RtfGroup / RtfControlWord /
 * RtfControlSymbol / RtfText objects rooted at $root.
 */
class RtfReader
{
    /** @var RtfGroup|null Root group of the last parsed document. */
    public $root = null;

    // Parser state. These used to be created dynamically (and were thus
    // publicly reachable); they stay public so existing access keeps working.

    /** @var string Input being parsed. */
    public $rtf = "";

    /** @var int Index of the next character to read. */
    public $pos = 0;

    /** @var int Length of $rtf in bytes. */
    public $len = 0;

    /** @var string Character read last, or "" once the input is exhausted. */
    public $char = "";

    /** @var RtfGroup|null Group currently being filled. */
    public $group = null;

    /**
     * Reads the next character into $char and advances $pos. Past the end of
     * the input $char becomes "" while $pos still advances, so that callers
     * which step back ($pos--) remain balanced.
     */
    protected function GetChar()
    {
        $this->char = $this->pos < $this->len ? $this->rtf[$this->pos] : "";
        $this->pos++;
    }

    /**
     * Appends an element to the current group. Content found outside any
     * group (for example whitespace after the final "}") is ignored.
     *
     * @param RtfElement $element
     */
    protected function AddChild($element)
    {
        if ($this->group !== null) {
            $this->group->children[] = $element;
        }
    }

    /**
     * Opens a new group and makes it the current one.
     */
    protected function ParseStartGroup()
    {
        $group = new RtfGroup();

        if ($this->root === null) {
            $this->root = $group;
        } elseif ($this->group !== null) {
            $group->parent = $this->group;
            $this->group->children[] = $group;
        }
        // Otherwise the root has already been closed: the group is still
        // parsed so that braces stay balanced, but it is attached nowhere.

        $this->group = $group;
    }

    /**
     * Tells whether the current character is an ASCII letter (A-Z, a-z).
     *
     * @return bool
     */
    protected function is_letter()
    {
        if ($this->char === "") {
            return false;
        }

        $code = ord($this->char);

        return ($code >= 65 && $code <= 90) || ($code >= 97 && $code <= 122);
    }

    /**
     * Tells whether the current character is an ASCII digit (0-9).
     *
     * @return bool
     */
    protected function is_digit()
    {
        if ($this->char === "") {
            return false;
        }

        $code = ord($this->char);

        return $code >= 48 && $code <= 57;
    }

    /**
     * Closes the current group and returns to its parent. An unmatched "}"
     * is ignored.
     */
    protected function ParseEndGroup()
    {
        if ($this->group !== null) {
            $this->group = $this->group->parent;
        }
    }

    /**
     * Parses a control word: letters, then an optional (possibly negative)
     * decimal parameter, then an optional delimiting space.
     */
    protected function ParseControlWord()
    {
        $this->GetChar();

        $word = "";
        while ($this->is_letter()) {
            $word .= $this->char;
            $this->GetChar();
        }

        // Read parameter (if any) consisting of digits.
        // Parameter may be negative.
        $parameter = null;
        $negative = false;
        if ($this->char === '-') {
            $this->GetChar();
            $negative = true;
        }
        while ($this->is_digit()) {
            if ($parameter === null) {
                $parameter = 0;
            }
            $parameter = $parameter * 10 + (int) $this->char;
            $this->GetChar();
        }
        if ($parameter === null) {
            $parameter = 1;
        }
        if ($negative) {
            $parameter = -$parameter;
        }

        if ($word === "u") {
            // \uN is followed by a fallback character, which is skipped.
            if ($this->char === '\\') {
                if ($this->pos < $this->len && $this->rtf[$this->pos] === "'") {
                    // Fallback written as \'hh: also skip the quote and the
                    // two hex digits so they do not end up as text.
                    $this->pos = min($this->pos + 3, $this->len);
                } else {
                    // Some other control follows; it is not a fallback.
                    $this->pos--;
                }
            }
        } elseif ($this->char !== ' ') {
            // A space is a delimiter and is consumed. Anything else belongs
            // to the next item, so put the character back.
            $this->pos--;
        }

        $rtfword = new RtfControlWord();
        $rtfword->word = $word;
        $rtfword->parameter = $parameter;
        $this->AddChild($rtfword);
    }

    /**
     * Parses a control symbol (one character). For \' the two characters
     * that follow are read as a hex code and stored as the parameter.
     */
    protected function ParseControlSymbol()
    {
        // Read symbol (one character only).
        $this->GetChar();
        $symbol = $this->char;

        // Symbols ordinarily have no parameter. However,
        // if this is \', then it is followed by a 2-digit hex-code:
        $parameter = 0;
        if ($symbol === '\'') {
            $this->GetChar();
            $hex = $this->char;
            $this->GetChar();
            $hex .= $this->char;
            $parameter = hexdec($hex);
        }

        $rtfsymbol = new RtfControlSymbol();
        $rtfsymbol->symbol = $symbol;
        $rtfsymbol->parameter = $parameter;
        $this->AddChild($rtfsymbol);
    }

    /**
     * Called right after a backslash was read: decides between a control
     * word and a control symbol by peeking at the next character.
     */
    protected function ParseControl()
    {
        // Look ahead by one character, then step back so the chosen parser
        // reads that character again itself.
        $this->GetChar();
        $this->pos--;

        if ($this->is_letter()) {
            $this->ParseControlWord();
        } else {
            $this->ParseControlSymbol();
        }
    }

    /**
     * Parses plain text starting at the current character, up to the next
     * unescaped backslash or brace (or the end of the input). The sequences
     * \\, \{ and \} inside the run are unescaped.
     */
    protected function ParseText()
    {
        $text = "";

        while (true) {
            if ($this->char === '\\') {
                // Look ahead to see if this is really an escape sequence.
                $this->GetChar();
                if (!in_array($this->char, array('\\', '{', '}'), true)) {
                    // Not an escape. Roll back to the backslash and stop.
                    $this->pos -= 2;
                    break;
                }
                // Escaped character: appended exactly once, below.
            } elseif ($this->char === '{' || $this->char === '}') {
                // Leave the brace for Parse() to handle.
                $this->pos--;
                break;
            }

            $text .= $this->char;

            if ($this->pos >= $this->len) {
                // The last character of the input has been consumed.
                break;
            }
            $this->GetChar();
        }

        $rtftext = new RtfText();
        $rtftext->text = $text;
        $this->AddChild($rtftext);
    }

    /**
     * Parses an RTF document; the resulting tree is available in $root
     * (null when the input contains no group).
     *
     * @param string $rtf RTF source.
     */
    public function Parse($rtf)
    {
        $this->rtf = (string) $rtf;
        $this->pos = 0;
        $this->len = strlen($this->rtf);
        $this->char = "";
        $this->group = null;
        $this->root = null;

        while ($this->pos < $this->len) {
            // Read next character:
            $this->GetChar();

            // Ignore \r and \n
            if ($this->char === "\n" || $this->char === "\r") {
                continue;
            }

            // What type of character is this?
            switch ($this->char) {
                case '{':
                    $this->ParseStartGroup();
                    break;
                case '}':
                    $this->ParseEndGroup();
                    break;
                case '\\':
                    $this->ParseControl();
                    break;
                default:
                    $this->ParseText();
                    break;
            }
        }
    }
}

/**
 * Character formatting in effect at one point of the document.
 */
class RtfState
{
    public $bold;
    public $italic;
    public $underline;
    public $end_underline;
    public $strike;
    public $hidden;
    public $fontsize;

    public function __construct()
    {
        $this->Reset();
    }

    /**
     * Switches every formatting attribute off.
     */
    public function Reset()
    {
        $this->bold = false;
        $this->italic = false;
        $this->underline = false;
        $this->end_underline = false;
        $this->strike = false;
        $this->hidden = false;
        $this->fontsize = 0;
    }
}

/**
 * Renders a tree produced by RtfReader as HTML.
 */
class RtfHtml
{
    /** @var string HTML produced so far. */
    public $output = "";

    /** @var array Stack of RtfState objects, one per open group. */
    public $states = array();

    /** @var RtfState|null State on top of the stack. */
    public $state = null;

    /**
     * Formats a parsed document.
     *
     * @param RtfGroup|null $root Root group as found in RtfReader::$root.
     * @return string HTML; empty when $root is null.
     */
    public function Format($root)
    {
        $this->output = "";

        // Create a stack of states and put an initial standard state on it:
        $this->states = array();
        $this->state = new RtfState();
        array_push($this->states, $this->state);

        if ($root === null) {
            return $this->output;
        }

        $this->FormatGroup($root);

        return $this->output;
    }

    /**
     * Formats one group. The formatting state is copied on entry and
     * restored on exit, so changes made inside the group do not leak out.
     *
     * @param RtfGroup $group
     */
    protected function FormatGroup($group)
    {
        // Can we ignore this group?
        if ($group->IsIgnorable()) {
            return;
        }

        // Push a new state onto the stack:
        $this->state = clone $this->state;
        array_push($this->states, $this->state);

        foreach ($group->children as $child) {
            if ($child instanceof RtfGroup) {
                $this->FormatGroup($child);
            } elseif ($child instanceof RtfControlWord) {
                $this->FormatControlWord($child);
            } elseif ($child instanceof RtfControlSymbol) {
                $this->FormatControlSymbol($child);
            } elseif ($child instanceof RtfText) {
                $this->FormatText($child);
            }
        }

        // Pop state from stack.
        array_pop($this->states);
        $this->state = $this->states[count($this->states) - 1];
    }

    /**
     * Applies a control word: either updates the formatting state or
     * appends the character / paragraph break it stands for.
     *
     * @param RtfControlWord $word
     */
    protected function FormatControlWord($word)
    {
        switch ($word->word) {
            case "plain":
                $this->state->Reset();
                break;
            case "b":
                $this->state->bold = $word->parameter;
                break;
            case "i":
                $this->state->italic = $word->parameter;
                break;
            case "ul":
                $this->state->underline = $word->parameter;
                // A new \ul supersedes an earlier \ulnone.
                $this->state->end_underline = false;
                break;
            case "ulnone":
                $this->state->underline = false;
                $this->state->end_underline = $word->parameter;
                break;
            case "strike":
                $this->state->strike = $word->parameter;
                break;
            case "v":
                $this->state->hidden = $word->parameter;
                break;
            case "fs":
                $this->state->fontsize = ceil(($word->parameter / 24) * 16);
                break;
            case "par":
                // Bare whitespace would be collapsed by HTML, so emit a tag.
                $this->output .= "<p>";
                break;

            // Characters:
            case "lquote":
                $this->output .= "‘";
                break;
            case "rquote":
                $this->output .= "’";
                break;
            case "ldblquote":
                $this->output .= "“";
                break;
            case "rdblquote":
                $this->output .= "”";
                break;
            case "emdash":
                $this->output .= "—";
                break;
            case "endash":
                $this->output .= "–";
                break;
            case "bullet":
                $this->output .= "•";
                break;
            case "u":
                // Placeholder; the code point in the parameter is not rendered.
                $this->output .= "◊";
                break;
        }
    }

    /**
     * Opens a <span> carrying the CSS for the current formatting state.
     */
    protected function BeginState()
    {
        $span = "";

        if ($this->state->bold) {
            $span .= "font-weight:bold;";
        }
        if ($this->state->italic) {
            $span .= "font-style:italic;";
        }

        // One declaration for all decorations, so underline and
        // strike-through do not override each other.
        $decorations = array();
        if ($this->state->underline) {
            $decorations[] = "underline";
        }
        if ($this->state->strike) {
            $decorations[] = "line-through";
        }
        if (count($decorations) > 0) {
            $span .= "text-decoration:" . implode(" ", $decorations) . ";";
        } elseif ($this->state->end_underline) {
            $span .= "text-decoration:none;";
        }

        if ($this->state->hidden) {
            $span .= "display:none;";
        }
        if ($this->state->fontsize != 0) {
            $span .= "font-size: {$this->state->fontsize}px;";
        }

        $this->output .= "<span style=\"{$span}\">";
    }

    /**
     * Closes the <span> opened by BeginState().
     */
    protected function EndState()
    {
        $this->output .= "</span>";
    }

    /**
     * Renders a control symbol: \'hh becomes the character with that code,
     * and an escaped backslash or brace becomes the literal character.
     * Other symbols produce no output.
     *
     * @param RtfControlSymbol $symbol
     */
    protected function FormatControlSymbol($symbol)
    {
        if ($symbol->symbol == '\'') {
            $this->BeginState();
            $this->output .= htmlentities(chr($symbol->parameter), ENT_QUOTES, 'ISO-8859-1');
            $this->EndState();
        } elseif (in_array($symbol->symbol, array('\\', '{', '}'), true)) {
            $this->BeginState();
            $this->output .= $symbol->symbol;
            $this->EndState();
        }
    }

    /**
     * Renders a text run inside a styled <span>, escaping HTML special
     * characters so document text cannot inject markup.
     *
     * @param RtfText $text
     */
    protected function FormatText($text)
    {
        $this->BeginState();
        // A single-byte charset leaves every non-special byte untouched.
        $this->output .= htmlspecialchars($text->text, ENT_QUOTES, 'ISO-8859-1');
        $this->EndState();
    }
}
?>