alecpl
2010-03-05 2d08c50fd78e8ae74f27a2418f7909b18ae2bf42
program/lib/html2text.php
@@ -149,25 +149,18 @@
        '/<script[^>]*>.*?<\/script>/i',         // <script>s -- which strip_tags supposedly has problems with
        '/<style[^>]*>.*?<\/style>/i',           // <style>s -- which strip_tags supposedly has problems with
        //'/<!-- .* -->/',                         // Comments -- which strip_tags might have a problem with
        '/<h[123][^>]*>(.*?)<\/h[123]>/ie',      // H1 - H3
        '/<h[456][^>]*>(.*?)<\/h[456]>/ie',      // H4 - H6
        '/<p[^>]*>/i',                           // <P>
        '/<br[^>]*>/i',                          // <br>
        '/<b[^>]*>(.*?)<\/b>/ie',                // <b>
        '/<strong[^>]*>(.*?)<\/strong>/ie',      // <strong>
        '/<i[^>]*>(.*?)<\/i>/i',                 // <i>
        '/<em[^>]*>(.*?)<\/em>/i',               // <em>
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/<li[^>]*>(.*?)<\/li>/i',               // <li> and </li>
        '/<li[^>]*>/i',                          // <li>
        '/<a [^>]*href=("|\')([^"\']+)\1[^>]*>(.*?)<\/a>/ie',
                                                 // <a href="">
        '/<hr[^>]*>/i',                          // <hr>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
        '/<th[^>]*>(.*?)<\/th>/ie',              // <th> and </th>
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                               // Double quotes
@@ -183,7 +176,6 @@
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&[^&;]+;/i',                           // Unknown/unhandled entities
        '/[ ]{2,}/'                              // Runs of spaces, post-handling
    );
@@ -201,25 +193,18 @@
        '',                                     // <script>s -- which strip_tags supposedly has problems with
        '',                                     // <style>s -- which strip_tags supposedly has problems with
        //'',                                     // Comments -- which strip_tags might have a problem with
        "strtoupper(\"\n\n\\1\n\n\")",          // H1 - H3
        "ucwords(\"\n\n\\1\n\")",             // H4 - H6
        "\n\n",                               // <P>
        "\n",                                   // <br>
        'strtoupper("\\1")',                    // <b>
        'strtoupper("\\1")',                    // <strong>
        '_\\1_',                                // <i>
        '_\\1_',                                // <em>
        "\n\n",                                 // <ul> and </ul>
        "\n\n",                                 // <ol> and </ol>
        "\t* \\1\n",                            // <li> and </li>
        "\n\t* ",                               // <li>
       '$this->_build_link_list("\\2", "\\3")',
                                          // <a href="">
       "\n-------------------------\n",        // <hr>
       "\n\n",                                 // <table> and </table>
        "\n-------------------------\n",        // <hr>
        "\n\n",                                 // <table> and </table>
        "\n",                                   // <tr> and </tr>
        "\t\t\\1\n",                            // <td> and </td>
        "strtoupper(\"\t\t\\1\n\")",            // <th> and </th>
        ' ',                                    // Non-breaking space
        '"',                                    // Double quotes
        "'",                                    // Single quotes
@@ -232,10 +217,25 @@
        '--',
        '-',
        '*',
        '£',
        '£',
        'EUR',                                  // Euro sign. € ?
        '',                                     // Unknown/unhandled entities
        ' '                                     // Runs of spaces, post-handling
    );
    /**
     *  List of preg* regular expression patterns to search for
     *  and replace using callback function.
     *
     *  The first capture group of every pattern is the tag name; the
     *  callback switches on it to decide how the match is rewritten.
     *  The remaining groups carry the element content (and, for links,
     *  the quote character and the href value).
     *
     *  @var array $callback_search
     *  @access public
     */
    var $callback_search = array(
        '/<(h)[123456][^>]*>(.*?)<\/h[123456]>/i', // H1 - H6
        '/<(b)[^>]*>(.*?)<\/b>/i',                 // <b>
        '/<(strong)[^>]*>(.*?)<\/strong>/i',       // <strong>
        '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i',
                                                   // <a href=""> (group 3 = URL, group 4 = link text)
        '/<(th)[^>]*>(.*?)<\/th>/i',               // <th> and </th>
    );
   /**
@@ -247,11 +247,11 @@
    *  @see $pre_replace
    */
    var $pre_search = array(
   "/\n/",
   "/\t/",
   '/ /',
   '/<pre[^>]*>/',
   '/<\/pre>/'
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );
    /**
@@ -262,11 +262,11 @@
     *  @see $pre_search
     */
    var $pre_replace = array(
   '<br>',
   '&nbsp;&nbsp;&nbsp;&nbsp;',
   '&nbsp;',
   '',
   ''
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        ''
    );
    /**
@@ -342,10 +342,10 @@
        if ( !empty($source) ) {
            $this->set_html($source, $from_file);
        }
        $this->set_base_url();
   $this->_do_links = $do_links;
   $this->width = $width;
        $this->_do_links = $do_links;
        $this->width = $width;
    }
    /**
@@ -359,10 +359,10 @@
    function set_html( $source, $from_file = false )
    {
        if ( $from_file && file_exists($source) ) {
       $this->html = file_get_contents($source);
            $this->html = file_get_contents($source);
        }
        else
       $this->html = $source;
            $this->html = $source;
        $this->_converted = false;
    }
@@ -463,11 +463,20 @@
        $text = trim(stripslashes($this->html));
   // Convert <PRE>
        // Convert <PRE>
        $this->_convert_pre($text);
        // Run our defined search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);
        // Replace known html entities
        $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');
        // Run our defined search-and-replace with callback
        $text = preg_replace_callback($this->callback_search, array('html2text', '_preg_callback'), $text);
        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&[^&;]+;/i', '', $text);
        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);
@@ -515,9 +524,9 @@
            $this->_link_count++;
            $this->_link_list .= "[" . $this->_link_count . "] $link\n";
            $additional = ' [' . $this->_link_count . ']';
      } elseif ( substr($link, 0, 11) == 'javascript:' ) {
         // Don't count the link; ignore it
         $additional = '';
   } elseif ( substr($link, 0, 11) == 'javascript:' ) {
      // Don't count the link; ignore it
      $additional = '';
      // what about href="#anchor" ?
        } else {
            $this->_link_count++;
@@ -540,11 +549,47 @@
     */
    function _convert_pre(&$text)
    {
   while(preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches))
   {
       $result = preg_replace($this->pre_search, $this->pre_replace, $matches[1]);
       $text = preg_replace('/<pre[^>]*>.*<\/pre>/ismU', '<div><br>' . $result . '<br></div>', $text, 1);
   }
        while(preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            $result = preg_replace($this->pre_search, $this->pre_replace, $matches[1]);
            $text = preg_replace('/<pre[^>]*>.*<\/pre>/ismU', '<div><br>' . $result . '<br></div>', $text, 1);
        }
    }
    /**
     *  Callback function for preg_replace_callback use.
     *
     *  @param  array PREG matches
     *  @return string
     *  @access private
     */
    function _preg_callback($matches)
    {
        switch($matches[1]) {
        case 'b':
        case 'strong':
            return $this->_strtoupper($matches[2]);
        case 'th':
            return $this->_strtoupper("\t\t". $matches[2] ."\n");
        case 'h':
            return $this->_strtoupper("\n\n". $matches[2] ."\n\n");
        case 'a':
            return $this->_build_link_list($matches[3], $matches[4]);
        }
    }
    /**
     *  Strtoupper multibyte wrapper function
     *
     *  @param  string
     *  @return string
     *  @access private
     */
    function _strtoupper($str)
    {
        if (function_exists('mb_strtoupper'))
            return mb_strtoupper($str);
        else
            return strtoupper($str);
    }
}