From ed1d212ae2daea5e4bd043417610177093e99f19 Mon Sep 17 00:00:00 2001
From: Aleksander Machniak <alec@alec.pl>
Date: Sat, 16 Jan 2016 03:03:51 -0500
Subject: [PATCH] Improved SVG cleanup code
---
program/lib/Roundcube/rcube_html2text.php | 137 ++++++++++++++++++++++++++++-----------------
1 files changed, 84 insertions(+), 53 deletions(-)
diff --git a/program/lib/Roundcube/rcube_html2text.php b/program/lib/Roundcube/rcube_html2text.php
index 0b172eb..d20d7b7 100644
--- a/program/lib/Roundcube/rcube_html2text.php
+++ b/program/lib/Roundcube/rcube_html2text.php
@@ -136,13 +136,16 @@
* @see $replace
*/
protected $search = array(
- "/\r/", // Non-legal carriage return
- "/[\n\t]+/", // Newlines and tabs
+ '/\r/', // Non-legal carriage return
+ '/^.*<body[^>]*>\n*/i', // Anything before <body>
'/<head[^>]*>.*?<\/head>/i', // <head>
- '/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with
- '/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with
- '/<p[^>]*>/i', // <P>
- '/<br[^>]*>/i', // <br>
+ '/<script[^>]*>.*?<\/script>/i', // <script>
+ '/<style[^>]*>.*?<\/style>/i', // <style>
+ '/[\n\t]+/', // Newlines and tabs
+ '/<p[^>]*>/i', // <p>
+ '/<\/p>[\s\n\t]*<div[^>]*>/i', // </p> before <div>
+ '/<br[^>]*>[\s\n\t]*<div[^>]*>/i', // <br> before <div>
+ '/<br[^>]*>\s*/i', // <br>
'/<i[^>]*>(.*?)<\/i>/i', // <i>
'/<em[^>]*>(.*?)<\/em>/i', // <em>
'/(<ul[^>]*>|<\/ul>)/i', // <ul> and </ul>
@@ -164,11 +167,14 @@
*/
protected $replace = array(
'', // Non-legal carriage return
- ' ', // Newlines and tabs
+ '', // Anything before <body>
'', // <head>
- '', // <script>s -- which strip_tags supposedly has problems with
- '', // <style>s -- which strip_tags supposedly has problems with
- "\n\n", // <P>
+ '', // <script>
+ '', // <style>
+ ' ', // Newlines and tabs
+ "\n\n", // <p>
+ "\n<div>", // </p> before <div>
+ '<div>', // <br> before <div>
"\n", // <br>
'_\\1_', // <i>
'_\\1_', // <em>
@@ -216,7 +222,7 @@
* @see $ent_search
*/
protected $ent_replace = array(
- ' ', // Non-breaking space
+ "\xC2\xA0", // Non-breaking space
'"', // Double quotes
"'", // Single quotes
'>',
@@ -423,7 +429,7 @@
// Variables used for building the link list
$this->_link_list = array();
- $text = trim(stripslashes($this->html));
+ $text = $this->html;
// Convert HTML to TXT
$this->_converter($text);
@@ -473,6 +479,9 @@
// Replace known html entities
$text = html_entity_decode($text, ENT_QUOTES, $this->charset);
+ // Replace unicode nbsp with regular spaces
+ $text = preg_replace('/\xC2\xA0/', ' ', $text);
+
// Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
$text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
@@ -506,7 +515,7 @@
* @param string $link URL of the link
* @param string $display Part of the text to associate number with
*/
- protected function _build_link_list( $link, $display )
+ protected function _build_link_list($link, $display)
{
if (!$this->_do_links || empty($link)) {
return $display;
@@ -514,6 +523,11 @@
// Ignored link types
if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
+ return $display;
+ }
+
+ // skip links with href == content (#1490434)
+ if ($link === $display) {
return $display;
}
@@ -571,55 +585,72 @@
*/
protected function _convert_blockquotes(&$text)
{
- if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
- $level = 0;
- $diff = 0;
- foreach ($matches[0] as $m) {
- if ($m[0][0] == '<' && $m[0][1] == '/') {
- $level--;
- if ($level < 0) {
- $level = 0; // malformed HTML: go to next blockquote
- }
- else if ($level > 0) {
- // skip inner blockquote
- }
- else {
- $end = $m[1];
- $len = $end - $taglen - $start;
- // Get blockquote content
- $body = substr($text, $start + $taglen - $diff, $len);
+ $level = 0;
+ $offset = 0;
+ while (($start = strpos($text, '<blockquote', $offset)) !== false) {
+ $offset = $start + 12;
+ do {
+ $end = strpos($text, '</blockquote>', $offset);
+ $next = strpos($text, '<blockquote', $offset);
- // Set text width
- $p_width = $this->width;
- if ($this->width > 0) $this->width -= 2;
- // Convert blockquote content
- $body = trim($body);
- $this->_converter($body);
- // Add citation markers and create PRE block
- $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
- $body = '<pre>' . htmlspecialchars($body) . '</pre>';
- // Re-set text width
- $this->width = $p_width;
- // Replace content
- $text = substr($text, 0, $start - $diff)
- . $body . substr($text, $end + strlen($m[0]) - $diff);
-
- $diff = $len + $taglen + strlen($m[0]) - strlen($body);
- unset($body);
- }
+ // nested <blockquote>, skip
+ if ($next !== false && $next < $end) {
+ $offset = $next + 12;
+ $level++;
}
+ // nested </blockquote> tag
+ if ($end !== false && $level > 0) {
+ $offset = $end + 12;
+ $level--;
+ }
+ // found matching end tag
+ else if ($end !== false && $level == 0) {
+ $taglen = strpos($text, '>', $start) - $start;
+ $startpos = $start + $taglen + 1;
+
+ // get blockquote content
+ $body = trim(substr($text, $startpos, $end - $startpos));
+
+ // adjust text wrapping width
+ $p_width = $this->width;
+ if ($this->width > 0) $this->width -= 2;
+
+ // replace content with inner blockquotes
+ $this->_converter($body);
+
+ // restore text width
+ $this->width = $p_width;
+
+ // Add citation markers and create <pre> block
+ $body = preg_replace_callback('/((?:^|\n)>*)([^\n]*)/', array($this, 'blockquote_citation_callback'), trim($body));
+ $body = '<pre>' . htmlspecialchars($body) . '</pre>';
+
+ $text = substr_replace($text, $body . "\n", $start, $end + 13 - $start);
+ $offset = 0;
+
+ break;
+ }
+ // abort on invalid tag structure (e.g. no closing tag found)
else {
- if ($level == 0) {
- $start = $m[1];
- $taglen = strlen($m[0]);
- }
- $level ++;
+ break;
}
}
+ while ($end || $next);
}
}
/**
+ * Callback function to correctly add citation markers for blockquote contents
+ *
+ * @param array $m PREG matches: [1] is the line start (optional newline plus
+ *                 any '>' markers already present), [2] is the rest of the line
+ *
+ * @return string The line start followed by one more '>' marker and the
+ *                left-trimmed line content
+ */
+ public function blockquote_citation_callback($m)
+ {
+ $line = ltrim($m[2]);
+ // A blank quoted line has no offset 0; isset() avoids the
+ // "Uninitialized string offset" notice/warning in that case.
+ // No separating space is added when the content itself starts with '>'.
+ $space = isset($line[0]) && $line[0] == '>' ? '' : ' ';
+
+ return $m[1] . '>' . $space . $line;
+ }
+
+ /**
* Callback function for preg_replace_callback use.
*
* @param array PREG matches
--
Gitblit v1.9.1