<?php

/*************************************************************************
 *                                                                       *
 * class.html2text.inc                                                   *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 * Converts HTML to formatted plain text                                 *
 *                                                                       *
 * Copyright (c) 2005 Jon Abernathy <jon@chuggnutt.com>                  *
 * All rights reserved.                                                  *
 *                                                                       *
 * This script is free software; you can redistribute it and/or modify   *
 * it under the terms of the GNU General Public License as published by  *
 * the Free Software Foundation; either version 2 of the License, or     *
 * (at your option) any later version.                                   *
 *                                                                       *
 * The GNU General Public License can be found at                        *
 * http://www.gnu.org/copyleft/gpl.html.                                 *
 *                                                                       *
 * This script is distributed in the hope that it will be useful,        *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
 * GNU General Public License for more details.                          *
 *                                                                       *
 * Author(s): Jon Abernathy <jon@chuggnutt.com>                          *
 *                                                                       *
 * Last modified: 04/06/05                                               *
 * Modified: 2004/05/19 (tbr)                                            *
 *                                                                       *
 *************************************************************************/


/**
 * Takes HTML and converts it to formatted, plain text.
 *
 * Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and
 * correcting an error in the regexp search array. Fixed 7/30/03.
 *
 * Updated set_html() function's file reading mechanism, 9/25/03.
 *
 * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding
 * several more HTML entity codes to the $search and $replace arrays.
 * Updated 11/7/03.
 *
 * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for
 * suggesting the addition of $allowed_tags and its supporting function
 * (which I slightly modified). Updated 3/12/04.
 *
 * Thanks to Justin Dearing for pointing out that a replacement for the
 * <TH> tag was missing, and suggesting an appropriate fix.
 * Updated 8/25/04.
 *
 * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a
 * display/formatting bug in the _build_link_list() function: email
 * readers would show the left bracket and number ("[1") as part of the
 * rendered email address.
 * Updated 12/16/04.
 *
 * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code
 * to handle relative links, which I hadn't considered. I modified his
 * code a bit to handle normal HTTP links and MAILTO links. Also for
 * suggesting three additional HTML entity codes to search for.
 * Updated 03/02/05.
 *
 * Thanks to Jacob Chandler for pointing out another link condition
 * for the _build_link_list() function: "https".
 * Updated 04/06/05.
 *
 * Replacements that need PHP logic (links, headings, bold, table headers)
 * are performed with preg_replace_callback(); no pattern uses the PCRE
 * "e" modifier. Requires PHP 5 or later (visibility keywords and
 * __construct()).
 *
 * Members whose names start with an underscore are internal by convention;
 * they are declared public only so that existing callers keep working.
 *
 * @author Jon Abernathy <jon@chuggnutt.com>
 * @version 0.6.1
 */
class html2text
{
    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     * @access public
     */
    public $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     * @access public
     */
    public $text;

    /**
     * Maximum width of the formatted text, in columns.
     *
     * @var integer $width
     * @access public
     */
    public $width = 70;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * Patterns are applied one after another, in array order, each one
     * working on the output of the previous one. The order matters: line
     * breaks are flattened before tags are matched, and <br> is handled
     * before <b>.
     *
     * Tag names that are a prefix of another HTML tag name (p, b, i, li,
     * tr, th) are followed by \b so that e.g. <body>, <img>, <pre>, <link>,
     * <track> and <thead> are not mistaken for them.
     *
     * &lt;, &gt; and &amp; are intentionally NOT listed here; _convert()
     * decodes them after strip_tags() has run.
     *
     * @var array $search
     * @access public
     * @see $replace
     */
    public $search = array(
        "/\r/",                                                // Non-legal carriage return
        "/[\n\t]+/",                                           // Newlines and tabs
        '/<script[^>]*>.*?<\/script>/i',                       // <script>s -- which strip_tags supposedly has problems with
        '/<a [^>]*href=("|\')([^"\']+)\1[^>]*>(.+?)<\/a>/i',   // <a href="">
        '/<h[123][^>]*>(.+?)<\/h[123]>/i',                     // H1 - H3
        '/<h[456][^>]*>(.+?)<\/h[456]>/i',                     // H4 - H6
        '/<p\b[^>]*>/i',                                       // <P>
        '/<br[^>]*>/i',                                        // <br>
        '/<b\b[^>]*>(.+?)<\/b>/i',                             // <b>
        '/<i\b[^>]*>(.+?)<\/i>/i',                             // <i>
        '/(<ul[^>]*>|<\/ul>)/i',                               // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                               // <ol> and </ol>
        '/<li\b[^>]*>/i',                                      // <li>
        '/<hr[^>]*>/i',                                        // <hr>
        '/(<table[^>]*>|<\/table>)/i',                         // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                             // <tr> and </tr>
        '/<td[^>]*>(.+?)<\/td>/i',                             // <td> and </td>
        '/<th\b[^>]*>(.+?)<\/th>/i',                           // <th> and </th>
        '/&nbsp;/i',                                           // Non-breaking space
        '/&quot;/i',                                           // Double quote
        '/&copy;/i',                                           // Copyright
        '/&trade;/i',                                          // Trademark
        '/&#8220;/',                                           // Left double quotation mark
        '/&#8221;/',                                           // Right double quotation mark
        '/&#8211;/',                                           // En dash
        '/&#(8217|39);/',                                      // Apostrophe / right single quote
        '/&#169;/',                                            // Copyright (numeric)
        '/&#8482;/',                                           // Trademark (numeric)
        '/&#151;/',                                            // Em dash (Windows-1252 code)
        '/&#147;/',                                            // Left double quote (Windows-1252 code)
        '/&#148;/',                                            // Right double quote (Windows-1252 code)
        '/&#149;/',                                            // Bullet (Windows-1252 code)
        '/&reg;/i',                                            // Registered
        '/&bull;/i',                                           // Bullet
        // Any other well-formed entity is dropped. Only real entity syntax
        // is matched, so a bare "&" in running text is left alone, and the
        // entities decoded later by _convert() are skipped.
        '/&(?!(?:lt|gt|amp|#60|#62|#38);)(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/i'
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * Each entry is paired with the $search entry at the same position and
     * is one of:
     *  - a string, used as a preg_replace() replacement, or
     *  - array('method' => NAME), in which case the method NAME of this
     *    object is used as the preg_replace_callback() callback and
     *    receives the array of matches.
     *
     * @var array $replace
     * @access public
     * @see $search
     */
    public $replace = array(
        '',                                                    // Non-legal carriage return
        ' ',                                                   // Newlines and tabs
        '',                                                    // <script>s -- which strip_tags supposedly has problems with
        array('method' => '_replace_link'),                    // <a href="">
        array('method' => '_replace_major_heading'),           // H1 - H3
        array('method' => '_replace_minor_heading'),           // H4 - H6
        "\n\n",                                                // <P>
        "\n",                                                  // <br>
        array('method' => '_replace_bold'),                    // <b>
        '_\\1_',                                               // <i>
        "\n\n",                                                // <ul> and </ul>
        "\n\n",                                                // <ol> and </ol>
        "\t*",                                                 // <li>
        "\n-------------------------\n",                       // <hr>
        "\n\n",                                                // <table> and </table>
        "\n",                                                  // <tr> and </tr>
        "\t\t\\1\n",                                           // <td> and </td>
        array('method' => '_replace_table_header'),            // <th> and </th>
        ' ',
        '"',
        '(c)',
        '(tm)',
        '"',
        '"',
        '-',
        "'",
        '(c)',
        '(tm)',
        '--',
        '"',
        '"',
        '*',
        '(R)',
        '*',
        ''
    );

    /**
     * Contains a list of HTML tags to allow in the resulting text.
     *
     * @var string $allowed_tags
     * @access public
     * @see set_allowed_tags()
     */
    public $allowed_tags = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $url
     * @access public
     */
    public $url;

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $_converted
     * @access private
     * @see $html, $text
     */
    public $_converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text,
     * in order of first appearance (index 0 is printed as "[1]").
     *
     * @var array $_link_list
     * @access private
     * @see _build_link_list()
     */
    public $_link_list = array();

    /**
     * Boolean flag, true if a table of link URLs should be listed after the text.
     *
     * @var boolean $_do_links
     * @access private
     * @see __construct()
     */
    public $_do_links = true;

    /**
     * Constructor.
     *
     * If the HTML source string (or file) is supplied, the class
     * will instantiate with that source propagated; all that has
     * to be done is to call get_text().
     *
     * @param string $source HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @param boolean $produce_link_table Indicates whether a table of link URLs is desired
     * @access public
     */
    public function __construct($source = '', $from_file = false, $produce_link_table = true)
    {
        // Compared as a string so that the literal content "0" is not
        // mistaken for "nothing supplied".
        if ((string) $source !== '') {
            $this->set_html($source, $from_file);
        }
        $this->set_base_url();
        $this->_do_links = $produce_link_table;
    }

    /**
     * Legacy (PHP 4 style) constructor name.
     *
     * Kept as an ordinary method so code that calls it explicitly keeps
     * working; it simply re-runs __construct() with the same arguments.
     *
     * @param string $source HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @param boolean $produce_link_table Indicates whether a table of link URLs is desired
     * @access public
     * @return void
     */
    public function html2text($source = '', $from_file = false, $produce_link_table = true)
    {
        $this->__construct($source, $from_file, $produce_link_table);
    }

    /**
     * Loads source HTML into memory, either from $source string or a file.
     *
     * When $from_file is true but $source is not an existing regular file,
     * $source itself is kept as the HTML content. If the file exists but
     * cannot be read, the content becomes an empty string.
     *
     * @param string $source HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @access public
     * @return void
     */
    public function set_html($source, $from_file = false)
    {
        $this->html = $source;

        if ($from_file && is_file($source)) {
            // file_get_contents() copes with zero-length files, which a
            // fixed-length fread() does not.
            $contents = file_get_contents($source);
            $this->html = ($contents === false) ? '' : $contents;
        }

        $this->_converted = false;
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs on the first call after set_html(); later calls
     * return the cached result.
     *
     * @access public
     * @return string
     */
    public function get_text()
    {
        if (!$this->_converted) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the text, converted from HTML.
     *
     * @access public
     * @return void
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Alias to print_text(), operates identically.
     *
     * @access public
     * @return void
     * @see print_text()
     */
    public function p()
    {
        $this->print_text();
    }

    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument leaves the current setting untouched.
     *
     * @param string $allowed_tags Tags to keep, as accepted by strip_tags()
     * @access public
     * @return void
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (!empty($allowed_tags)) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * With no argument the base is derived from the HTTP_HOST server
     * variable; when that is unavailable (e.g. on the command line) the
     * base is empty and relative links are rendered root-relative.
     *
     * @param string $url Base URL; trailing slashes are removed
     * @access public
     * @return void
     */
    public function set_base_url($url = '')
    {
        if (empty($url)) {
            $this->url = isset($_SERVER['HTTP_HOST']) ? 'http://' . $_SERVER['HTTP_HOST'] : '';
        } else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            $this->url = rtrim($url, '/');
        }
    }

    /**
     * Workhorse function that does actual conversion.
     *
     * First performs custom tag replacement specified by $search and
     * $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     * and newlines to a readable format, appends the link table, decodes
     * &lt; / &gt; / &amp;, and word wraps the text to $width characters.
     *
     * @access private
     * @return void
     */
    public function _convert()
    {
        // Every conversion builds its own link table; without this reset
        // links from previously converted documents would be listed again.
        $this->_link_list = array();

        // NOTE(review): stripslashes() is inherited behaviour (presumably a
        // magic-quotes workaround); it also removes backslashes that are
        // genuine content. Confirm with callers before dropping it.
        $text = trim(stripslashes((string) $this->html));

        // Run our defined search-and-replace, pairing patterns and
        // replacements by position exactly as preg_replace() does for arrays.
        $patterns     = array_values((array) $this->search);
        $replacements = array_values((array) $this->replace);

        foreach ($patterns as $position => $pattern) {
            $replacement = isset($replacements[$position]) ? $replacements[$position] : '';

            if (is_array($replacement)) {
                $method = isset($replacement['method']) ? $replacement['method'] : '';
                if ($method === '' || !method_exists($this, $method)) {
                    continue;
                }
                $result = preg_replace_callback($pattern, array($this, $method), $text);
            } else {
                $result = preg_replace($pattern, $replacement, $text);
            }

            // preg_* return null on failure (bad pattern, backtrack limit);
            // keep the text we had rather than wiping the document.
            if ($result !== null) {
                $text = $result;
            }
        }

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // Add link list
        if (count($this->_link_list) > 0) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->_link_list as $id => $link) {
                $text .= '[' . ($id + 1) . '] ' . $link . "\n";
            }
        }

        // Decode the markup-significant entities only now: doing it before
        // strip_tags() would turn escaped text such as "&lt;b&gt;" into a
        // tag and delete it. "&amp;" goes last so "&amp;lt;" ends up as the
        // literal text "&lt;". This also decodes "&amp;" inside listed URLs.
        $text = preg_replace(
            array('/&(lt|#60);/i', '/&(gt|#62);/i', '/&(amp|#38);/i'),
            array('<', '>', '&'),
            $text
        );

        // Wrap the text to a readable format (see $width, 70 by default)
        $text = wordwrap($text, $this->width);

        $this->text = $text;

        $this->_converted = true;
    }

    /**
     * Callback for <a href="">: registers the URL and numbers the link text.
     *
     * @param array $matches Regex matches; [2] is the href, [3] the link text
     * @access private
     * @return string
     */
    public function _replace_link($matches)
    {
        return $this->_build_link_list($matches[2], $matches[3]);
    }

    /**
     * Callback for H1 - H3: upper-cases the heading and sets it off with blank lines.
     *
     * @param array $matches Regex matches; [1] is the heading content
     * @access private
     * @return string
     */
    public function _replace_major_heading($matches)
    {
        return strtoupper("\n\n" . $matches[1] . "\n\n");
    }

    /**
     * Callback for H4 - H6: capitalises each word of the heading.
     *
     * @param array $matches Regex matches; [1] is the heading content
     * @access private
     * @return string
     */
    public function _replace_minor_heading($matches)
    {
        return ucwords("\n\n" . $matches[1] . "\n");
    }

    /**
     * Callback for <b>: upper-cases the emphasised text.
     *
     * @param array $matches Regex matches; [1] is the bold content
     * @access private
     * @return string
     */
    public function _replace_bold($matches)
    {
        return strtoupper($matches[1]);
    }

    /**
     * Callback for <th>: upper-cases the header cell and indents it like a <td>.
     *
     * @param array $matches Regex matches; [1] is the cell content
     * @access private
     * @return string
     */
    public function _replace_table_header($matches)
    {
        return strtoupper("\t\t" . $matches[1] . "\n");
    }

    /**
     * Helper function called on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. A URL that occurs several times is listed once and always
     * gets the same number. Links that start with a URI scheme ("http:",
     * "https:", "mailto:", "ftp:", ...) or with "//" are kept as they are;
     * anything else is treated as relative and prefixed with the base URL.
     *
     * @param string $link URL of the link
     * @param string $display Part of the text to associate number with
     * @access private
     * @return string $display followed by " [n]", or $display unchanged when
     *                the link table is disabled
     */
    public function _build_link_list($link, $display)
    {
        if (!$this->_do_links) {
            return $display;
        }

        if (preg_match('/^(?:[a-z][a-z0-9+.\-]*:|\/\/)/i', $link)) {
            $url = $link;
        } else {
            $url = $this->url;
            // substr() instead of an offset so an empty $link is harmless.
            if (substr($link, 0, 1) !== '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        // Strict search: URLs are compared as exact strings.
        $index = array_search($url, $this->_link_list, true);
        if ($index === false) {
            $index = count($this->_link_list);
            $this->_link_list[$index] = $url;
        }

        return $display . ' [' . ($index + 1) . ']';
    }
}
|

?>