githubFork/roundcubemail.git

Fix handling of long script/style tags when converting to text

Aleksander Machniak

2016-04-13 e2e56261b977a3f70e2495e860e6e2c387305a15

commit \| author \| age
4e17e6	1	<?php
T	2
a95874	3	/**
66afd7	4	+-----------------------------------------------------------------------+
AM	5	\| This file is part of the Roundcube Webmail client \|
	6	\| Copyright (C) 2008-2012, The Roundcube Dev Team \|
	7	\| Copyright (c) 2005-2007, Jon Abernathy <jon@chuggnutt.com> \|
	8	\| \|
	9	\| Licensed under the GNU General Public License version 3 or \|
	10	\| any later version with exceptions for skins & plugins. \|
	11	\| See the README file for a full license statement. \|
	12	\| \|
	13	\| PURPOSE: \|
	14	\| Converts HTML to formatted plain text (based on html2text class) \|
	15	+-----------------------------------------------------------------------+
	16	\| Author: Thomas Bruederli <roundcube@gmail.com> \|
	17	\| Author: Aleksander Machniak <alec@alec.pl> \|
	18	\| Author: Jon Abernathy <jon@chuggnutt.com> \|
	19	+-----------------------------------------------------------------------+
	20	*/
4e17e6	21
T	22	/**
8ac6fd	23	* Takes HTML and converts it to formatted, plain text.
A	24	*
	25	* Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and
	26	* correcting an error in the regexp search array. Fixed 7/30/03.
	27	*
	28	* Updated set_html() function's file reading mechanism, 9/25/03.
	29	*
	30	* Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding
	31	* several more HTML entity codes to the $search and $replace arrays.
	32	* Updated 11/7/03.
	33	*
	34	* Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for
	35	* suggesting the addition of $allowed_tags and its supporting function
	36	* (which I slightly modified). Updated 3/12/04.
	37	*
	38	* Thanks to Justin Dearing for pointing out that a replacement for the
	39	* <TH> tag was missing, and suggesting an appropriate fix.
	40	* Updated 8/25/04.
	41	*
	42	* Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a
	43	* display/formatting bug in the _build_link_list() function: email
	44	* readers would show the left bracket and number ("[1") as part of the
	45	* rendered email address.
	46	* Updated 12/16/04.
	47	*
	48	* Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code
	49	* to handle relative links, which I hadn't considered. I modified his
	50	* code a bit to handle normal HTTP links and MAILTO links. Also for
	51	* suggesting three additional HTML entity codes to search for.
	52	* Updated 03/02/05.
	53	*
	54	* Thanks to Jacob Chandler for pointing out another link condition
	55	* for the _build_link_list() function: "https".
	56	* Updated 04/06/05.
	57	*
	58	* Thanks to Marc Bertrand (http://www.dresdensky.com/) for
	59	* suggesting a revision to the word wrapping functionality; if you
	60	* specify a $width of 0 or less, word wrapping will be ignored.
	61	* Updated 11/02/06.
	62	*
	63	* *** Big housecleaning updates below:
	64	*
	65	* Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for
	66	* suggesting the fix to handle </li> and blank lines (whitespace).
	67	* Christian Basedau (http://www.movetheweb.de/) also suggested the
	68	* blank lines fix.
	69	*
	70	* Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/),
	71	* Christian Basedau, Norbert Laposa (http://ln5.co.uk/),
	72	* Bas van de Weijer, and Marijn van Butselaar
	73	* for pointing out my glaring error in the <th> handling. Marcus also
	74	* supplied a host of fixes.
	75	*
	76	* Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing
	77	* out that extra spaces should be compressed--a problem addressed with
	78	* Marcus Bointon's fixes but that I had not yet incorporated.
	79	*
21d463	80	* Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for
8ac6fd	81	* suggesting a valuable fix with <a> tag handling.
A	82	*
	83	* Thanks to Wojciech Bajon (again!) for suggesting fixes and additions,
	84	* including the <a> tag handling that Daniel Schledermann pointed
	85	* out but that I had not yet incorporated. I haven't (yet)
	86	* incorporated all of Wojciech's changes, though I may at some
	87	* future time.
	88	*
	89	* *** End of the housecleaning updates. Updated 08/08/07.
	90	*/
66afd7	91
/**
 * Converts HTML to formatted plain text
 *
 * Typical use: pass the HTML source to the constructor (or set_html())
 * and read the result with get_text(). The conversion is performed lazily
 * on the first get_text() call and cached until new HTML is loaded.
 *
 * @package    Framework
 * @subpackage Utils
 */
class rcube_html2text
{
    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     */
    protected $text;

    /**
     * Maximum width of the formatted text, in columns.
     *
     * Set this value to 0 (or less) to ignore word wrapping
     * and not constrain text to a fixed-width column.
     * Note: the constructor always overwrites this with its $width argument.
     *
     * @var integer $width
     */
    protected $width = 70;

    /**
     * Target character encoding for output text
     *
     * @var string $charset
     */
    protected $charset = 'UTF-8';

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * Patterns are applied in order, so the position of each entry matters
     * (e.g. <script>/<style> bodies are dropped before newlines are collapsed,
     * and "<li>...</li>" is handled before a bare "<li>").
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        '/\r/',                                  // Non-legal carriage return
        '/^.*<body[^>]*>\n*/is',                 // Anything before <body>
        '/<head[^>]*>.*?<\/head>/is',            // <head>
        '/<script[^>]*>.*?<\/script>/is',        // <script>
        '/<style[^>]*>.*?<\/style>/is',          // <style>
        '/[\n\t]+/',                             // Newlines and tabs
        '/<p[^>]*>/i',                           // <p>
        '/<\/p>[\s\n\t]*<div[^>]*>/i',           // </p> before <div>
        '/<br[^>]*>[\s\n\t]*<div[^>]*>/i',       // <br> before <div>
        '/<br[^>]*>\s*/i',                       // <br>
        '/<i[^>]*>(.*?)<\/i>/i',                 // <i>
        '/<em[^>]*>(.*?)<\/em>/i',               // <em>
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/<li[^>]*>(.*?)<\/li>/i',               // <li> and </li>
        '/<li[^>]*>/i',                          // <li>
        '/<hr[^>]*>/i',                          // <hr>
        '/<div[^>]*>/i',                         // <div>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     * Entry N is the replacement for $search[N].
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                                      // Non-legal carriage return
        '',                                      // Anything before <body>
        '',                                      // <head>
        '',                                      // <script>
        '',                                      // <style>
        ' ',                                     // Newlines and tabs
        "\n\n",                                  // <p>
        "\n<div>",                               // </p> before <div>
        '<div>',                                 // <br> before <div>
        "\n",                                    // <br>
        '_\\1_',                                 // <i>
        '_\\1_',                                 // <em>
        "\n\n",                                  // <ul> and </ul>
        "\n\n",                                  // <ol> and </ol>
        "\t* \\1\n",                             // <li> and </li>
        "\n\t* ",                                // <li>
        "\n-------------------------\n",         // <hr>
        "<div>\n",                               // <div>
        "\n\n",                                  // <table> and </table>
        "\n",                                    // <tr> and </tr>
        "\t\t\\1\n",                             // <td> and </td>
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $ent_replace.
     *
     * @var array $ent_search
     * @see $ent_replace
     */
    protected $ent_search = array(
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                                 // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&(amp|#38);/i',                        // Ampersand: see _converter()
        '/[ ]{2,}/',                             // Runs of spaces, post-handling
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     * Entry N is the replacement for $ent_search[N].
     *
     * @var array $ent_replace
     * @see $ent_search
     */
    protected $ent_replace = array(
        "\xC2\xA0",                              // Non-breaking space (UTF-8 bytes)
        '"',                                     // Double quotes
        "'",                                     // Single quotes
        '>',                                     // Greater-than
        '<',                                     // Less-than
        '(c)',                                   // Copyright
        '(tm)',                                  // Trademark
        '(R)',                                   // Registered
        '--',                                    // mdash
        '-',                                     // ndash
        '*',                                     // Bullet
        '£',                                     // Pound sign
        'EUR',                                   // Euro sign
        '|+|amp|+|',                             // Ampersand placeholder: must match
                                                 // the literal replaced in _converter()
        ' ',                                     // Runs of spaces, post-handling
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * tags_preg_callback() dispatches on.
     *
     * @var array $callback_search
     */
    protected $callback_search = array(
        '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // <a href="">
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',         // h1 - h6
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                         // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',               // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                       // <th> and </th>
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $pre_replace.
     *
     * @var array $pre_search
     * @see $pre_replace
     */
    protected $pre_search = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is turned into <br> / &nbsp; markup so that it survives the
     * newline- and space-collapsing steps of _converter(); the non-breaking
     * spaces are converted back to regular spaces at the end of the conversion.
     *
     * @var array $pre_replace
     * @see $pre_search
     */
    protected $pre_replace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        ''
    );

    /**
     * Temporary buffer with the converted body of the PRE element
     * currently processed by _convert_pre().
     *
     * @var string $pre_content
     * @see _convert_pre(), pre_preg_callback()
     */
    protected $pre_content = '';

    /**
     * Contains a list of HTML tags to allow in the resulting text.
     *
     * @var string $allowed_tags
     * @see set_allowed_tags()
     */
    protected $allowed_tags = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $url
     */
    protected $url;

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $_converted
     * @see $html, $text
     */
    protected $_converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $_link_list
     * @see _build_link_list()
     */
    protected $_link_list = array();

    /**
     * Boolean flag, true if a table of link URLs should be listed after the text.
     *
     * @var boolean $_do_links
     * @see __construct()
     */
    protected $_do_links = true;

    /**
     * Constructor.
     *
     * If the HTML source string (or file) is supplied, the class
     * will instantiate with that source propagated, all that has
     * to be done is to call get_text().
     *
     * @param string  $source    HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @param boolean $do_links  Indicate whether a table of link URLs is desired
     * @param integer $width     Maximum width of the formatted text, 0 for no limit
     * @param string  $charset   Target character encoding of the output text
     */
    public function __construct($source = '', $from_file = false, $do_links = true, $width = 75, $charset = 'UTF-8')
    {
        if (!empty($source)) {
            $this->set_html($source, $from_file);
        }

        $this->set_base_url();

        $this->_do_links = $do_links;
        $this->width     = $width;
        $this->charset   = $charset;
    }

    /**
     * Loads source HTML into memory, either from $source string or a file.
     *
     * When $from_file is set but the file does not exist, $source itself
     * is used as the HTML content.
     *
     * @param string  $source    HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     */
    public function set_html($source, $from_file = false)
    {
        if ($from_file && file_exists($source)) {
            $this->html = file_get_contents($source);
        }
        else {
            $this->html = $source;
        }

        // force re-conversion on the next get_text() call
        $this->_converted = false;
    }

    /**
     * Returns the text, converted from HTML.
     *
     * @return string Plain text
     */
    public function get_text()
    {
        if (!$this->_converted) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the text, converted from HTML.
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument leaves the current setting untouched.
     *
     * @param string $allowed_tags List of tags in strip_tags() format
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (!empty($allowed_tags)) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * With an empty argument the base is derived from the HTTP_HOST
     * server variable (or set to an empty string if that is not available).
     *
     * @param string $url Base URL
     */
    public function set_base_url($url = '')
    {
        if (empty($url)) {
            if (!empty($_SERVER['HTTP_HOST'])) {
                $this->url = 'http://' . $_SERVER['HTTP_HOST'];
            }
            else {
                $this->url = '';
            }
        }
        else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            if (substr($url, -1) == '/') {
                $url = substr($url, 0, -1);
            }

            $this->url = $url;
        }
    }

    /**
     * Workhorse function that does actual conversion (calls _converter() method).
     *
     * Stores the result in $text and appends the collected link list, if any.
     */
    protected function _convert()
    {
        // Variables used for building the link list
        $this->_link_list = array();

        $text = $this->html;

        // Convert HTML to TXT
        $this->_converter($text);

        // Add link list
        if (!empty($this->_link_list)) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->_link_list as $idx => $url) {
                $text .= '[' . ($idx+1) . '] ' . $url . "\n";
            }
        }

        $this->text       = $text;
        $this->_converted = true;
    }

    /**
     * Workhorse function that does actual conversion.
     *
     * First performs custom tag replacement specified by $search and
     * $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     * and newlines to a readable format, and word wraps the text to
     * $width characters.
     *
     * @param string &$text Reference to HTML content string (modified in place)
     */
    protected function _converter(&$text)
    {
        // Convert <BLOCKQUOTE> (before PRE!)
        $this->_convert_blockquotes($text);

        // Convert <PRE>
        $this->_convert_pre($text);

        // Run our defined tags search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);

        // Run our defined tags search-and-replace with callback
        $text = preg_replace_callback($this->callback_search, array($this, 'tags_preg_callback'), $text);

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Run our defined entities/characters search-and-replace
        $text = preg_replace($this->ent_search, $this->ent_replace, $text);

        // Replace known html entities
        $text = html_entity_decode($text, ENT_QUOTES, $this->charset);

        // Replace unicode nbsp to regular spaces
        $text = preg_replace('/\xC2\xA0/', ' ', $text);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        // Wrap the text to a readable format. Default width is 75.
        // If width is 0 or less, don't wrap the text.
        if ($this->width > 0) {
            $text = wordwrap($text, $this->width);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param string $link    URL of the link
     * @param string $display Part of the text to associate number with
     *
     * @return string Link text, followed by " [N]" when the link was registered
     */
    protected function _build_link_list($link, $display)
    {
        if (!$this->_do_links || empty($link)) {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        // skip links with href == content (#1490434)
        if ($link === $display) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // absolute URL (has a scheme)
            $url = $link;
        }
        else {
            // relative URL, resolve against the base URL
            $url = $this->url;
            if (substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= "$link";
        }

        // re-use the index of an URL that is already on the list
        if (($index = array_search($url, $this->_link_list, true)) === false) {
            $index = count($this->_link_list);
            $this->_link_list[] = $url;
        }

        return $display . ' [' . ($index+1) . ']';
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Each PRE element is replaced (one at a time) with a DIV in which
     * whitespace is encoded as <br>/&nbsp; markup.
     *
     * @param string &$text HTML content (modified in place)
     */
    protected function _convert_pre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            $this->pre_content = $matches[1];

            // Run our defined tags search-and-replace with callback
            $this->pre_content = preg_replace_callback($this->callback_search,
                array($this, 'tags_preg_callback'), $this->pre_content);

            // convert the content
            $this->pre_content = sprintf('<div><br>%s<br></div>',
                preg_replace($this->pre_search, $this->pre_replace, $this->pre_content));

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pre_preg_callback'), $text, 1);

            // free memory
            $this->pre_content = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every top-level BLOCKQUOTE element is converted (recursively, so nested
     * quotes get additional citation markers) into a PRE block with each line
     * prefixed by ">". Tag names are matched case-insensitively.
     *
     * @param string &$text HTML content (modified in place)
     */
    protected function _convert_blockquotes(&$text)
    {
        $offset = 0;

        while (($start = stripos($text, '<blockquote', $offset)) !== false) {
            $offset = $start + 12;
            // nesting depth relative to the blockquote opened at $start
            $level  = 0;

            while (true) {
                $end  = stripos($text, '</blockquote>', $offset);
                $next = stripos($text, '<blockquote', $offset);

                // abort on invalid tag structure (no closing tag found);
                // the outer loop continues with the next opening tag
                if ($end === false) {
                    break;
                }

                // nested <blockquote> opens before the next closing tag, skip it
                if ($next !== false && $next < $end) {
                    $offset = $next + 12;
                    $level++;
                    continue;
                }

                // closing tag of a nested blockquote
                if ($level > 0) {
                    $offset = $end + 13;
                    $level--;
                    continue;
                }

                // found matching end tag
                $taglen   = strpos($text, '>', $start) - $start;
                $startpos = $start + $taglen + 1;

                // get blockquote content (empty if the opening tag is malformed)
                $body = trim(substr($text, $startpos, max(0, $end - $startpos)));

                // adjust text wrapping width (make room for the "> " prefix)
                $p_width = $this->width;
                if ($this->width > 0) {
                    $this->width -= 2;
                }

                // replace content with inner blockquotes
                $this->_converter($body);

                // restore text width
                $this->width = $p_width;

                // Add citation markers and create <pre> block
                $body = preg_replace_callback('/((?:^|\n)>*)([^\n]*)/',
                    array($this, 'blockquote_citation_callback'), trim($body));
                $body = '<pre>' . htmlspecialchars($body) . '</pre>';

                $text   = substr_replace($text, $body . "\n", $start, $end + 13 - $start);
                // the text has changed, rescan from the beginning
                $offset = 0;

                break;
            }
        }
    }

    /**
     * Callback function to correctly add citation markers for blockquote contents
     *
     * @param array $m PREG matches: [1] line start with existing markers, [2] line content
     *
     * @return string Line with one more citation marker
     */
    public function blockquote_citation_callback($m)
    {
        $line = ltrim($m[2]);
        // no space between markers of nested quotes; the line may be empty
        $space = isset($line[0]) && $line[0] == '>' ? '' : ' ';

        return $m[1] . '>' . $space . $line;
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured by the $callback_search patterns.
     *
     * @param array $matches PREG matches
     *
     * @return string Replacement text
     */
    public function tags_preg_callback($matches)
    {
        switch (strtolower($matches[1])) {
        case 'b':
        case 'strong':
            return $this->_toupper($matches[3]);
        case 'th':
            return $this->_toupper("\t\t". $matches[3] ."\n");
        case 'h':
            return $this->_toupper("\n\n". $matches[3] ."\n\n");
        case 'a':
            // Remove spaces in URL (#1487805)
            $url = str_replace(' ', '', $matches[3]);
            return $this->_build_link_list($url, $matches[4]);
        }

        // unknown tag name: the match is removed
        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * @param array $matches PREG matches (unused)
     *
     * @return string Converted PRE content prepared by _convert_pre()
     */
    public function pre_preg_callback($matches)
    {
        return $this->pre_content;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $idx => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$idx] = $this->_strtoupper($chunk);
            }
        }

        return implode('', $chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before the case conversion (so that entity names
     * are not upper-cased) and special characters are encoded again after it.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _strtoupper($str)
    {
        $str = html_entity_decode($str, ENT_COMPAT, $this->charset);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, ENT_COMPAT, $this->charset);

        return $str;
    }
}