From 5f8adabb6286fdcb0ff8a0ea5d1d58f40eef51f4 Mon Sep 17 00:00:00 2001
From: Aleksander Machniak <alec@alec.pl>
Date: Mon, 27 Aug 2012 03:28:16 -0400
Subject: [PATCH] Add simple (constructor) tests for Framework classes
---
program/lib/html2text.php | 146 ++++++++++++++++++++++++++++--------------------
1 files changed, 85 insertions(+), 61 deletions(-)
diff --git a/program/lib/html2text.php b/program/lib/html2text.php
index 0171f4b..28c5ae0 100644
--- a/program/lib/html2text.php
+++ b/program/lib/html2text.php
@@ -89,7 +89,7 @@
* out that extra spaces should be compressed--a problem addressed with
* Marcus Bointon's fixes but that I had not yet incorporated.
*
- * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for
+ * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for
* suggesting a valuable fix with <a> tag handling.
*
* Thanks to Wojciech Bajon (again!) for suggesting fixes and additions,
@@ -200,7 +200,7 @@
var $ent_search = array(
'/&(nbsp|#160);/i', // Non-breaking space
'/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
- // Double quotes
+ // Double quotes
'/&(apos|rsquo|lsquo|#8216|#8217);/i', // Single quotes
'/>/i', // Greater-than
'/</i', // Less-than
@@ -249,12 +249,11 @@
* @access public
*/
var $callback_search = array(
- '/<(h)[123456][^>]*>(.*?)<\/h[123456]>/i', // H1 - H3
- '/<(b)[^>]*>(.*?)<\/b>/i', // <b>
- '/<(strong)[^>]*>(.*?)<\/strong>/i', // <strong>
- '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i',
- // <a href="">
- '/<(th)[^>]*>(.*?)<\/th>/i', // <th> and </th>
+ '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // <a href="">; group 3 = URL, group 4 = link text
+ '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6; group 3 = heading content
+ '/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b> only (not <br>, <body>, ...); group 3 = content
+ '/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong>; group 3 = content
+ '/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th>; group 3 = cell content
);
/**
@@ -317,21 +316,11 @@
/**
* Contains URL addresses from links to be rendered in plain text.
*
- * @var string $_link_list
+ * @var array $_link_list Unique URLs in order of first use; a link's number is its index + 1
* @access private
* @see _build_link_list()
*/
- var $_link_list = '';
-
- /**
- * Number of valid links detected in the text, used for plain text
- * display (rendered similar to footnotes).
- *
- * @var integer $_link_count
- * @access private
- * @see _build_link_list()
- */
- var $_link_count = 0;
+ var $_link_list = array();
/**
* Boolean flag, true if a table of link URLs should be listed after the text.
@@ -378,7 +367,7 @@
function set_html( $source, $from_file = false )
{
if ( $from_file && file_exists($source) ) {
- $this->html = file_get_contents($source);
+ $this->html = file_get_contents($source);
}
else
$this->html = $source;
@@ -448,11 +437,11 @@
function set_base_url( $url = '' )
{
if ( empty($url) ) {
- if ( !empty($_SERVER['HTTP_HOST']) ) {
- $this->url = 'http://' . $_SERVER['HTTP_HOST'];
- } else {
- $this->url = '';
- }
+ if ( !empty($_SERVER['HTTP_HOST']) ) {
+ $this->url = 'http://' . $_SERVER['HTTP_HOST'];
+ } else {
+ $this->url = '';
+ }
} else {
// Strip any trailing slashes for consistency (relative
// URLs may already start with a slash like "/file.html")
@@ -472,8 +461,7 @@
function _convert()
{
// Variables used for building the link list
- $this->_link_count = 0;
- $this->_link_list = '';
+ $this->_link_list = array();
$text = trim(stripslashes($this->html));
@@ -481,8 +469,11 @@
$this->_converter($text);
// Add link list
- if ( !empty($this->_link_list) ) {
- $text .= "\n\nLinks:\n------\n" . $this->_link_list;
+ if (!empty($this->_link_list)) {
+ $text .= "\n\nLinks:\n------\n";
+ foreach ($this->_link_list as $idx => $url) {
+ $text .= '[' . ($idx+1) . '] ' . $url . "\n";
+ }
}
$this->text = $text;
@@ -524,7 +515,7 @@
$text = preg_replace($this->ent_search, $this->ent_replace, $text);
// Replace known html entities
- $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');
+ $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
// Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
$text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
@@ -544,7 +535,7 @@
// for PHP versions >= 4.0.2. Default width is 75
// If width is 0 or less, don't wrap the text.
if ( $this->width > 0 ) {
- $text = wordwrap($text, $this->width);
+ $text = wordwrap($text, $this->width);
}
}
@@ -563,28 +554,32 @@
*/
function _build_link_list( $link, $display )
{
- if ( !$this->_do_links )
- return $display;
-
- if ( preg_match('!^(https?://|mailto:)!', $link) ) {
- $this->_link_count++;
- $this->_link_list .= '[' . $this->_link_count . "] $link\n";
- $additional = ' [' . $this->_link_count . ']';
- } elseif ( substr($link, 0, 11) == 'javascript:' ) {
- // Don't count the link; ignore it
- $additional = '';
- // what about href="#anchor" ?
- } else {
- $this->_link_count++;
- $this->_link_list .= '[' . $this->_link_count . '] ' . $this->url;
- if ( substr($link, 0, 1) != '/' ) {
- $this->_link_list .= '/';
- }
- $this->_link_list .= "$link\n";
- $additional = ' [' . $this->_link_count . ']';
+ if (!$this->_do_links || (string) $link === '') { // not empty(): "0" is a valid relative link
+ return $display;
}
- return $display . $additional;
+ // Scripts, e-mail addresses and in-page anchors get no entry in the link list
+ if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
+ return $display;
+ }
+
+ // Links with a scheme (foo:...) or protocol-relative links (//host/...) are listed as-is
+ if (preg_match('!^([a-z][a-z0-9.+-]+:|//)!i', $link)) {
+ $url = $link;
+ } else {
+ $url = $this->url; // relative link: prefix it with the base URL
+ if (substr($link, 0, 1) !== '/') {
+ $url .= '/';
+ }
+ $url .= $link;
+ }
+
+ // Reuse the number of a URL that is already listed (strict string comparison)
+ if (($index = array_search($url, $this->_link_list, true)) === false) {
+ $index = count($this->_link_list);
+ $this->_link_list[] = $url;
+ }
+ return $display . ' [' . ($index+1) . ']';
}
/**
@@ -597,12 +592,20 @@
{
// get the content of PRE element
while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
+ $this->pre_content = $matches[1];
+
+ // Run our defined tags search-and-replace with callback
+ $this->pre_content = preg_replace_callback($this->callback_search,
+ array('html2text', '_preg_callback'), $this->pre_content);
+
// convert the content
$this->pre_content = sprintf('<div><br>%s<br></div>',
- preg_replace($this->pre_search, $this->pre_replace, $matches[1]));
+ preg_replace($this->pre_search, $this->pre_replace, $this->pre_content));
+
// replace the content (use callback because content can contain $0 variable)
- $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
+ $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
array('html2text', '_preg_pre_callback'), $text, 1);
+
// free memory
$this->pre_content = '';
}
@@ -672,14 +675,14 @@
*/
private function _preg_callback($matches)
{
- switch($matches[1]) {
+ switch (strtolower($matches[1])) {
case 'b':
case 'strong':
- return $this->_strtoupper($matches[2]);
+ return $this->_toupper($matches[3]);
case 'th':
- return $this->_strtoupper("\t\t". $matches[2] ."\n");
+ return $this->_toupper("\t\t". $matches[3] ."\n");
case 'h':
- return $this->_strtoupper("\n\n". $matches[2] ."\n\n");
+ return $this->_toupper("\n\n". $matches[3] ."\n\n");
case 'a':
// Remove spaces in URL (#1487805)
$url = str_replace(' ', '', $matches[3]);
@@ -699,10 +702,31 @@
}
/**
- * Strtoupper multibyte wrapper function with HTML entities handling
+ * Upper-cases the text parts of an HTML fragment, leaving the tags untouched.
*
- * @param string $str Text to convert
- * @return string Converted text
+ * @param string $str Text to convert (may contain HTML tags)
+ * @return string Converted text, with every tag copied through unchanged
+ */
+ private function _toupper($str)
+ {
+ // Split on tags; without NO_EMPTY the captured tags always sit at odd indexes
+ $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
+
+ // Convert only the text between tags (even indexes), skipping empty pieces
+ foreach ($chunks as $idx => $chunk) {
+ if ($idx % 2 === 0 && $chunk !== '') {
+ $chunks[$idx] = $this->_strtoupper($chunk);
+ }
+ }
+
+ return implode('', $chunks);
+ }
+
+ /**
+ * Strtoupper multibyte wrapper function with HTML entities handling.
+ *
+ * @param string $str Text to convert
+ * @return string Converted text
*/
private function _strtoupper($str)
{
--
Gitblit v1.9.1