alecpl
2011-11-28 ca0cd05973b468c056a682ab1fb9133856a56943
- Fix handling HTML entities when converting HTML to text (#1488212)


3 files modified
88 ■■■■■ changed files
CHANGELOG 1 ●●●● patch | view | raw | blame | history
program/js/app.js 7 ●●●● patch | view | raw | blame | history
program/lib/html2text.php 80 ●●●●● patch | view | raw | blame | history
CHANGELOG
@@ -1,6 +1,7 @@
CHANGELOG Roundcube Webmail
===========================
- Fix handling HTML entities when converting HTML to text (#1488212)
- Fix fit_string_to_size() renders browser and ui unresponsive (#1488207)
- Fix handling of invalid characters in request (#1488124)
- Fix merging some configuration options in update.sh script (#1485864)
program/js/app.js
@@ -5765,10 +5765,13 @@
    });
  };
  this.plain2html = function(plainText, id)
  this.plain2html = function(plain, id)
  {
    var lock = this.set_busy(true, 'converting');
    $('#'+id).val(plainText ? '<pre>'+plainText+'</pre>' : '');
    plain = plain.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    $('#'+id).val(plain ? '<pre>'+plain+'</pre>' : '');
    this.set_busy(false, null, lock);
  };
program/lib/html2text.php
@@ -145,7 +145,6 @@
    var $search = array(
        "/\r/",                                  // Non-legal carriage return
        "/[\n\t]+/",                             // Newlines and tabs
        '/[ ]{2,}/',                             // Runs of spaces, pre-handling
        '/<script[^>]*>.*?<\/script>/i',         // <script>s -- which strip_tags supposedly has problems with
        '/<style[^>]*>.*?<\/style>/i',           // <style>s -- which strip_tags supposedly has problems with
        '/<p[^>]*>/i',                           // <P>
@@ -161,22 +160,6 @@
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                                 // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&(amp|#38);/i',                        // Ampersand: see _converter()
        '/[ ]{2,}/'                              // Runs of spaces, post-handling
    );
    /**
@@ -189,7 +172,6 @@
    var $replace = array(
        '',                                     // Non-legal carriage return
        ' ',                                    // Newlines and tabs
        ' ',                                    // Runs of spaces, pre-handling
        '',                                     // <script>s -- which strip_tags supposedly has problems with
        '',                                     // <style>s -- which strip_tags supposedly has problems with
        "\n\n",                                 // <P>
@@ -205,6 +187,43 @@
        "\n\n",                                 // <table> and </table>
        "\n",                                   // <tr> and </tr>
        "\t\t\\1\n",                            // <td> and </td>
    );
    /**
     *  List of preg* regular expression patterns to search for,
     *  used in conjunction with $ent_replace.
     *
     *  Every pattern except the last matches one or more HTML entities
     *  (named and/or numeric forms of the same character); the last one
     *  collapses runs of two or more spaces. The array is passed together
     *  with $ent_replace to preg_replace(), so the n-th pattern here is
     *  paired with the n-th replacement there: keep both arrays in the
     *  same order and of the same length when editing.
     *
     *  @var array $ent_search
     *  @access public
     *  @see $ent_replace
     */
    var $ent_search = array(
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                                 // Double quotes (straight and curly)
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes (straight and curly)
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash (also matches the minus sign entity)
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&(amp|#38);/i',                        // Ampersand: see _converter()
        '/[ ]{2,}/',                             // Runs of spaces, post-handling
    );
    /**
     *  List of pattern replacements corresponding to patterns searched.
     *
     *  @var array $ent_replace
     *  @access public
     *  @see $ent_search
     */
    var $ent_replace = array(
        ' ',                                    // Non-breaking space
        '"',                                    // Double quotes
        "'",                                    // Single quotes
@@ -219,7 +238,7 @@
        '£',
        'EUR',                                  // Euro sign. € ?
        '|+|amp|+|',                            // Ampersand: see _converter()
        ' '                                     // Runs of spaces, post-handling
        ' ',                                    // Runs of spaces, post-handling
    );
    /**
@@ -492,14 +511,20 @@
        // Convert <PRE>
        $this->_convert_pre($text);
        // Run our defined search-and-replace
        // Run our defined tags search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);
        // Run our defined tags search-and-replace with callback
        $text = preg_replace_callback($this->callback_search, array('html2text', '_preg_callback'), $text);
        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);
        // Run our defined entities/characters search-and-replace
        $text = preg_replace($this->ent_search, $this->ent_replace, $text);
        // Replace known html entities
        $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');
        // Run our defined search-and-replace with callback
        $text = preg_replace_callback($this->callback_search, array('html2text', '_preg_callback'), $text);
        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
@@ -508,15 +533,12 @@
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);
        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);
        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
        // Remove leading empty lines (can be produced by e.g. a P tag at the beginning)
        $text = preg_replace('/^\n+/', '', $text);
        $text = ltrim($text, "\n");
        // Wrap the text to a readable format
        // for PHP versions >= 4.0.2. Default width is 75
@@ -544,9 +566,7 @@
        if ( !$this->_do_links )
            return $display;
        if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' ||
            substr($link, 0, 7) == 'mailto:'
        ) {
        if ( preg_match('!^(https?://|mailto:)!', $link) ) {
            $this->_link_count++;
            $this->_link_list .= '[' . $this->_link_count . "] $link\n";
            $additional = ' [' . $this->_link_count . ']';