githubFork/roundcubemail.git

Unified attachments filenames handling for message parts without a filename

Aleksander Machniak

2012-12-28 be72fb3597c21ca3aaa058adf41bb72d53d197c7

commit \| author \| age
c321a9	1	<?php
T	2
	3	/*
	4	+-----------------------------------------------------------------------+
	5	\| This file is part of the Roundcube Webmail client \|
	6	\| Copyright (C) 2005-2012, The Roundcube Dev Team \|
	7	\| Copyright (C) 2011-2012, Kolab Systems AG \|
	8	\| Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org> \|
7fe381	9	\| \|
T	10	\| Licensed under the GNU General Public License version 3 or \|
	11	\| any later version with exceptions for skins & plugins. \|
	12	\| See the README file for a full license statement. \|
c321a9	13	\| \|
T	14	\| PURPOSE: \|
	15	\| Provide charset conversion functionality \|
	16	+-----------------------------------------------------------------------+
	17	\| Author: Thomas Bruederli <roundcube@gmail.com> \|
	18	\| Author: Aleksander Machniak <alec@alec.pl> \|
	19	+-----------------------------------------------------------------------+
	20	*/
	21
	22	/**
	23	* Character sets conversion functionality
	24	*
9ab346	25	* @package Framework
AM	26	* @subpackage Core
	27	* @author Thomas Bruederli <roundcube@gmail.com>
	28	* @author Aleksander Machniak <alec@alec.pl>
	29	* @author Edmund Grimley Evans <edmundo@rano.org>
c321a9	30	*/
T	31	class rcube_charset
	32	{
	/**
	 * Charset aliases, some of them taken from the HTML5 spec.
	 *
	 * Keys are alias names reduced to upper-case letters and digits (the form
	 * parse_charset() looks up), values are the charset names used instead.
	 *
	 * @var array
	 */
	public static $aliases = array(
		'USASCII'       => 'WINDOWS-1252',
		'ANSIX31101983' => 'WINDOWS-1252',
		'ANSIX341968'   => 'WINDOWS-1252',
		'UNKNOWN8BIT'   => 'ISO-8859-15',
		'UNKNOWN'       => 'ISO-8859-15',
		'USERDEFINED'   => 'ISO-8859-15',
		'KSC56011987'   => 'EUC-KR',
		'GB2312'        => 'GBK',
		'GB231280'      => 'GBK',
		'UNICODE'       => 'UTF-8',
		'UTF7IMAP'      => 'UTF7-IMAP',
		'TIS620'        => 'WINDOWS-874',
		'ISO88599'      => 'WINDOWS-1254',
		'ISO885911'     => 'WINDOWS-874',
		'MACROMAN'      => 'MACINTOSH',
		'77'            => 'MAC',
		'128'           => 'SHIFT-JIS',
		'129'           => 'CP949',
		'130'           => 'CP1361',
		'134'           => 'GBK',
		'136'           => 'BIG5',
		'161'           => 'WINDOWS-1253',
		'162'           => 'WINDOWS-1254',
		'163'           => 'WINDOWS-1258',
		'177'           => 'WINDOWS-1255',
		'178'           => 'WINDOWS-1256',
		'186'           => 'WINDOWS-1257',
		'204'           => 'WINDOWS-1251',
		'222'           => 'WINDOWS-874',
		'238'           => 'WINDOWS-1250',
		'MS950'         => 'CP950',
		'WINDOWS949'    => 'UHC',
	);
	68
	69
	/**
	 * Error-handler callback that converts a PHP error into an exception.
	 *
	 * @param int    $errno  Level of the error
	 * @param string $errstr Error message
	 *
	 * @throws ErrorException Always; carries the message, code 0 and the
	 *                        error level as severity
	 */
	public static function error_handler($errno, $errstr)
	{
		$exception = new ErrorException($errstr, 0, $errno);

		throw $exception;
	}
	80
	81
	/**
	 * Parse and validate charset name string (see #1485758).
	 * Sometimes charset string is malformed, there are also charset aliases
	 * but we need strict names for charset conversion (specially utf8 class)
	 *
	 * Results are memoized per input string for the lifetime of the request.
	 *
	 * @param string $input Input charset name
	 *
	 * @return string|null The validated charset name, null for "BINARY"
	 */
	public static function parse_charset($input)
	{
		static $charsets = array();

		$key = (string) $input;

		// array_key_exists() (not isset()) so that a cached null is a hit too
		if (array_key_exists($key, $charsets)) {
			return $charsets[$key];
		}

		$charset = preg_replace(array(
			'/^[^0-9A-Z]+/',    // e.g. _ISO-8859-JP$SIO
			'/\$.*$/',          // e.g. _ISO-8859-JP$SIO
			'/UNICODE-1-1-*/',  // RFC1641/1642
			'/^X-/',            // X- prefix (e.g. X-ROMAN8 => ROMAN8)
		), '', strtoupper($key));

		if ($charset == 'BINARY') {
			return $charsets[$key] = null;
		}

		// allow A-Z and 0-9 only
		$str = preg_replace('/[^A-Z0-9]/', '', $charset);

		if (isset(self::$aliases[$str])) {
			$result = self::$aliases[$str];
		}
		// UTF
		else if (preg_match('/U[A-Z][A-Z](7|8|16|32)(BE|LE)*/', $str, $m)) {
			// the byte-order group is optional and absent from $m when unmatched
			$result = 'UTF-' . $m[1] . (isset($m[2]) ? $m[2] : '');
		}
		// ISO-8859
		else if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {
			$iso = 'ISO-8859-' . ($m[1] ? $m[1] : 1);
			// some clients sends windows-1252 text as latin1,
			// it is safe to use windows-1252 for all latin1
			$result = $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;
		}
		// handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE
		else if (preg_match('/(WIN|WINDOWS)([0-9]+)/', $str, $m)) {
			$result = 'WINDOWS-' . $m[2];
		}
		// LATIN
		else if (preg_match('/LATIN(.*)/', $str, $m)) {
			// LATIN<n> / LATIN<script> suffix => ISO-8859 part number
			$latin = array('2' => 2, '3' => 3, '4' => 4, '5' => 9, '6' => 10,
				'7' => 13, '8' => 14, '9' => 15, '10' => 16,
				'ARABIC' => 6, 'CYRILLIC' => 5, 'GREEK' => 7, 'GREEK1' => 7, 'HEBREW' => 8
			);

			// some clients sends windows-1252 text as latin1,
			// it is safe to use windows-1252 for all latin1
			if ($m[1] == 1) {
				$result = 'WINDOWS-1252';
			}
			// if iconv is not supported we need ISO labels, it's also safe for iconv
			else if (!empty($latin[$m[1]])) {
				$result = 'ISO-8859-' . $latin[$m[1]];
			}
			// iconv requires conversion of e.g. LATIN-1 to LATIN1
			else {
				$result = $str;
			}
		}
		else {
			$result = $charset;
		}

		$charsets[$key] = $result;

		return $result;
	}
	161
	162
	/**
	 * Convert a string from one charset to another.
	 * Uses iconv and mbstring functions if possible, then falls back to the
	 * bundled UTF-7 helpers, utf8_encode()/utf8_decode() and the `utf8` class.
	 *
	 * @param string $str  Input string
	 * @param string $from Suspected charset of the input string
	 * @param string $to   Target charset to convert to; defaults to RCUBE_CHARSET
	 *
	 * @return string Converted string; the input string is returned unchanged
	 *                when no conversion is needed or none of the methods succeeded
	 */
	public static function convert($str, $from, $to = null)
	{
		static $iconv_options = null;
		static $mbstring_list = null;
		static $mbstring_sch  = null;
		static $conv          = null;

		$to   = empty($to) ? RCUBE_CHARSET : $to;
		$from = self::parse_charset($from);

		// It is a common case when UTF-16 charset is used with US-ASCII content (#1488654)
		// In that case we can just skip the conversion (use UTF-8)
		if ($from == 'UTF-16' && !preg_match('/[^\x00-\x7F]/', $str)) {
			$from = 'UTF-8';
		}

		if ($from == $to || empty($str) || empty($from)) {
			return $str;
		}

		// probe iconv once per request
		if ($iconv_options === null) {
			if (function_exists('iconv')) {
				// ignore characters not available in output charset
				$iconv_options = '//IGNORE';
				if (iconv('', $iconv_options, '') === false) {
					// iconv implementation does not support options
					$iconv_options = '';
				}
			}
		}

		// convert charset using iconv module
		if ($iconv_options !== null && $from != 'UTF7-IMAP' && $to != 'UTF7-IMAP') {
			// throw an exception if iconv reports an illegal character in input
			// it means that input string has been truncated
			set_error_handler(array('rcube_charset', 'error_handler'), E_NOTICE);
			try {
				$_iconv = iconv($from, $to . $iconv_options, $str);
			} catch (ErrorException $e) {
				$_iconv = false;
			}
			restore_error_handler();

			if ($_iconv !== false) {
				return $_iconv;
			}
		}

		// probe mbstring once per request
		if ($mbstring_list === null) {
			if (extension_loaded('mbstring')) {
				$mbstring_sch  = mb_substitute_character();
				$mbstring_list = mb_list_encodings();
				$mbstring_list = array_map('strtoupper', $mbstring_list);
			}
		}

		// convert charset using mbstring module
		if ($mbstring_list !== null) {
			// names mbstring knows under a different label
			$aliases = array('WINDOWS-1257' => 'ISO-8859-13');

			// it happens that mbstring supports ASCII but not US-ASCII
			if (($from == 'US-ASCII' || $to == 'US-ASCII') && !in_array('US-ASCII', $mbstring_list)) {
				$aliases['US-ASCII'] = 'ASCII';
			}

			$mb_from = isset($aliases[$from]) ? $aliases[$from] : $from;
			$mb_to   = isset($aliases[$to]) ? $aliases[$to] : $to;

			// return if encoding found, string matches encoding and convert succeeded
			if (in_array($mb_from, $mbstring_list) && in_array($mb_to, $mbstring_list)) {
				if (mb_check_encoding($str, $mb_from)) {
					// Do the same as //IGNORE with iconv
					mb_substitute_character('none');
					$out = mb_convert_encoding($str, $mb_to, $mb_from);
					mb_substitute_character($mbstring_sch);

					if ($out !== false) {
						return $out;
					}
				}
			}
		}

		// convert charset using bundled classes/functions
		if ($to == 'UTF-8') {
			if ($from == 'UTF7-IMAP') {
				if ($_str = self::utf7imap_to_utf8($str)) {
					return $_str;
				}
			}
			else if ($from == 'UTF-7') {
				if ($_str = self::utf7_to_utf8($str)) {
					return $_str;
				}
			}
			else if ($from == 'ISO-8859-1' && function_exists('utf8_encode')) {
				return utf8_encode($str);
			}
			else if (class_exists('utf8')) {
				if (!$conv) {
					$conv = new utf8($from);
				}
				else {
					$conv->loadCharset($from);
				}

				if ($_str = $conv->strToUtf8($str)) {
					return $_str;
				}
			}
		}

		// encode string for output
		if ($from == 'UTF-8') {
			// @TODO: we need a function for UTF-7 (RFC2152) conversion
			if ($to == 'UTF7-IMAP' || $to == 'UTF-7') {
				if ($_str = self::utf8_to_utf7imap($str)) {
					return $_str;
				}
			}
			else if ($to == 'ISO-8859-1' && function_exists('utf8_decode')) {
				return utf8_decode($str);
			}
			else if (class_exists('utf8')) {
				// the converter must be loaded with the non-UTF-8 side of the
				// conversion, which is $to here (matches the constructor call)
				if (!$conv) {
					$conv = new utf8($to);
				}
				else {
					$conv->loadCharset($to);
				}

				// NOTE(review): strToUtf8() is also used for this UTF-8 => $to
				// direction; presumably the utf8 class has a dedicated reverse
				// method - confirm against its definition before changing.
				if ($_str = $conv->strToUtf8($str)) {
					return $_str;
				}
			}
		}

		// return original string
		return $str;
	}
	312
	313
	/**
	 * Converts string from standard UTF-7 (RFC 2152) to UTF-8.
	 *
	 * A "+" starts a run of modified BASE64 (UTF-16BE payload). The run ends
	 * at the first non-BASE64 character; only a terminating "-" is absorbed,
	 * any other terminator is kept in the output. "+-" yields a literal "+".
	 *
	 * @param string $str Input string (UTF-7)
	 *
	 * @return string Converted string (UTF-8)
	 */
	public static function utf7_to_utf8($str)
	{
		// standard BASE64 alphabet, used without padding
		$b64chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

		$str = strval($str);
		$len = strlen($str);
		$res = '';
		$i   = 0;

		while ($i < $len) {
			$u7 = $str[$i++];

			if ($u7 != '+') {
				$res .= $u7;
				continue;
			}

			// collect the BASE64 run following the shift character
			$run = $i < $len ? strspn($str, $b64chars, $i) : 0;
			$ch  = $run ? substr($str, $i, $run) : '';
			$i  += $run;

			// an explicit "-" terminator belongs to the shift sequence
			$closed = $i < $len && $str[$i] == '-';
			if ($closed) {
				$i++;
			}

			if ($ch === '') {
				// "+-" encodes "+"; a lone "+" is dropped
				if ($closed) {
					$res .= '+';
				}
				continue;
			}

			$res .= self::utf16_to_utf8(base64_decode($ch));
		}

		return $res;
	}
	372
	373
	/**
	 * Converts string from UTF-16 to UTF-8 (helper for utf-7 to utf-8 conversion)
	 *
	 * The input is read as big-endian 16-bit code units without a BOM.
	 * Surrogate pairs are combined into a single 4-byte UTF-8 sequence;
	 * a trailing odd byte (incomplete code unit) is ignored.
	 *
	 * @param string $str Input string
	 *
	 * @return string The converted string
	 */
	public static function utf16_to_utf8($str)
	{
		$str = strval($str);
		// only complete 16-bit units can be decoded
		$len = strlen($str) & ~1;
		$dec = '';

		for ($i = 0; $i < $len; $i += 2) {
			$c = ord($str[$i]) << 8 | ord($str[$i + 1]);

			// high surrogate followed by a low surrogate => supplementary character
			if ($c >= 0xD800 && $c <= 0xDBFF && $i + 3 < $len) {
				$low = ord($str[$i + 2]) << 8 | ord($str[$i + 3]);
				if ($low >= 0xDC00 && $low <= 0xDFFF) {
					$c  = 0x10000 + (($c - 0xD800) << 10) + ($low - 0xDC00);
					$i += 2;
				}
			}

			if ($c <= 0x007F) {
				$dec .= chr($c);
			}
			else if ($c <= 0x07FF) {
				$dec .= chr(0xC0 | (($c >> 6) & 0x1F));
				$dec .= chr(0x80 | (($c >> 0) & 0x3F));
			}
			else if ($c <= 0xFFFF) {
				// unpaired surrogates also end up here and are encoded as-is
				$dec .= chr(0xE0 | (($c >> 12) & 0x0F));
				$dec .= chr(0x80 | (($c >> 6) & 0x3F));
				$dec .= chr(0x80 | (($c >> 0) & 0x3F));
			}
			else {
				$dec .= chr(0xF0 | (($c >> 18) & 0x07));
				$dec .= chr(0x80 | (($c >> 12) & 0x3F));
				$dec .= chr(0x80 | (($c >> 6) & 0x3F));
				$dec .= chr(0x80 | (($c >> 0) & 0x3F));
			}
		}

		return $dec;
	}
	404
	405
	/**
	 * Convert the data ($str) from RFC 2060's UTF-7 to UTF-8.
	 * If input data is invalid, return an empty string.
	 * RFC 2060 obviously intends the encoding to be unique (see
	 * point 5 in section 5.1.3), so we reject any non-canonical
	 * form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead
	 * of &AMAAwA-).
	 *
	 * Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
	 *
	 * @param string $str Input string (UTF7-IMAP)
	 *
	 * @return string Output string (UTF-8), empty string on invalid input
	 */
	public static function utf7imap_to_utf8($str)
	{
		// value of each ASCII character in the modified BASE64 alphabet, -1 = not part of it
		$Index_64 = array(
			-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
			-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
			-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
			52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
			-1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
			15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
			-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
			41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
		);

		$str   = strval($str);
		$u7len = strlen($str);
		$p     = '';
		$err   = '';

		for ($i=0; $u7len > 0; $i++, $u7len--) {
			$u7 = $str[$i];
			if ($u7 == '&') {
				$i++;
				$u7len--;

				// "&" as the last character: BASE64 section is not terminated
				if (!$u7len) {
					return $err;
				}

				$u7 = $str[$i];

				// "&-" encodes a literal "&"
				if ($u7 == '-') {
					$p .= '&';
					continue;
				}

				// $ch collects the bits of the current 16-bit unit,
				// $k is the shift for the next 6-bit group
				$ch = 0;
				$k  = 10;
				for (; $u7len > 0; $i++, $u7len--) {
					$u7 = $str[$i];

					if ((ord($u7) & 0x80) || ($b = $Index_64[ord($u7)]) == -1) {
						break;
					}

					if ($k > 0) {
						$ch |= $b << $k;
						$k -= 6;
					}
					else {
						// unit complete: emit it as UTF-8
						$ch |= $b >> (-$k);
						if ($ch < 0x80) {
							// Printable US-ASCII
							if (0x20 <= $ch && $ch < 0x7f) {
								return $err;
							}
							$p .= chr($ch);
						}
						else if ($ch < 0x800) {
							$p .= chr(0xc0 | ($ch >> 6));
							$p .= chr(0x80 | ($ch & 0x3f));
						}
						else {
							$p .= chr(0xe0 | ($ch >> 12));
							$p .= chr(0x80 | (($ch >> 6) & 0x3f));
							$p .= chr(0x80 | ($ch & 0x3f));
						}

						// keep the leftover bits of this group for the next unit
						$ch = ($b << (16 + $k)) & 0xffff;
						$k += 10;
					}
				}

				// Non-zero or too many extra bits
				if ($ch || $k < 6) {
					return $err;
				}

				// BASE64 not properly terminated
				if (!$u7len || $u7 != '-') {
					return $err;
				}

				// Adjacent BASE64 sections
				if ($u7len > 2 && $str[$i+1] == '&' && $str[$i+2] != '-') {
					return $err;
				}
			}
			// Not printable US-ASCII
			else if (ord($u7) < 0x20 || ord($u7) >= 0x7f) {
				return $err;
			}
			else {
				$p .= $u7;
			}
		}

		return $p;
	}
	513
	514
	/**
	 * Convert the data ($str) from UTF-8 to RFC 2060's UTF-7.
	 * Unicode characters above U+FFFF are replaced by U+FFFE.
	 * If input data is invalid, return an empty string.
	 *
	 * Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
	 *
	 * @param string $str Input string (UTF-8)
	 *
	 * @return string Output string (UTF7-IMAP)
	 */
	public static function utf8_to_utf7imap($str)
	{
		// modified BASE64 alphabet ("," takes the place of "/")
		$alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,';

		$remaining = strlen($str);
		$pos       = 0;
		$out       = '';
		$in_b64    = false;

		while ($remaining) {
			$lead = ord($str[$pos]);

			// decode the lead byte: payload bits and number of continuation bytes
			if ($lead < 0x80) {
				$code  = $lead;
				$extra = 0;
			}
			else if ($lead < 0xc2 || $lead >= 0xfe) {
				// stray continuation byte, overlong 2-byte lead or invalid lead
				return '';
			}
			else if ($lead < 0xe0) {
				$code  = $lead & 0x1f;
				$extra = 1;
			}
			else if ($lead < 0xf0) {
				$code  = $lead & 0x0f;
				$extra = 2;
			}
			else if ($lead < 0xf8) {
				$code  = $lead & 0x07;
				$extra = 3;
			}
			else if ($lead < 0xfc) {
				$code  = $lead & 0x03;
				$extra = 4;
			}
			else {
				$code  = $lead & 0x01;
				$extra = 5;
			}

			$pos++;
			$remaining--;

			// truncated sequence
			if ($extra > $remaining) {
				return '';
			}

			for ($j = 0; $j < $extra; $j++) {
				$byte = ord($str[$pos + $j]);
				if (($byte & 0xc0) != 0x80) {
					return '';
				}
				$code = ($code << 6) | ($byte & 0x3f);
			}

			// overlong encoding
			if ($extra > 1 && !($code >> ($extra * 5 + 1))) {
				return '';
			}

			$pos       += $extra;
			$remaining -= $extra;

			if ($code >= 0x20 && $code < 0x7f) {
				// printable US-ASCII: close a pending BASE64 section first
				if ($in_b64) {
					if ($shift > 10) {
						$out .= $alphabet[$carry];
					}
					$out   .= '-';
					$in_b64 = false;
				}

				$char = chr($code);
				$out .= $char;
				if ($char == '&') {
					$out .= '-';
				}

				continue;
			}

			// everything else goes into a BASE64 section
			if (!$in_b64) {
				$out   .= '&';
				$in_b64 = true;
				$carry  = 0;
				$shift  = 10;
			}

			if ($code & ~0xffff) {
				$code = 0xfffe;
			}

			// first group completes the bits carried over from the previous unit
			$out .= $alphabet[$carry | ($code >> $shift)];
			for ($shift -= 6; $shift >= 0; $shift -= 6) {
				$out .= $alphabet[($code >> $shift) & 0x3f];
			}

			$carry  = ($code << (-$shift)) & 0x3f;
			$shift += 16;
		}

		// close a BASE64 section left open at the end of input
		if ($in_b64) {
			if ($shift > 10) {
				$out .= $alphabet[$carry];
			}
			$out .= '-';
		}

		return $out;
	}
	644
	645
	/**
	 * A method to guess character set of a string.
	 *
	 * Checks, in order: a byte-order mark, the NUL-byte pattern of the first
	 * four bytes (UTF-16/UTF-32 without BOM), then mb_detect_encoding() when
	 * available, otherwise a UTF-8 validity check of the first 2048 bytes.
	 *
	 * @param string $string   String.
	 * @param string $failover Default result for failover.
	 *
	 * @return string Charset name
	 */
	public static function detect($string, $failover='')
	{
		// byte-order marks (UTF-32LE must be tested before UTF-16LE)
		if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE';  // Big Endian
		if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE';  // Little Endian
		if (substr($string, 0, 2) == "\xFE\xFF")     return 'UTF-16BE';  // Big Endian
		if (substr($string, 0, 2) == "\xFF\xFE")     return 'UTF-16LE';  // Little Endian
		if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8';

		// heuristics: position of NUL bytes within the first four bytes;
		// shorter strings cannot be classified this way
		if (strlen($string) >= 4) {
			$z0 = $string[0] == "\0";
			$z1 = $string[1] == "\0";
			$z2 = $string[2] == "\0";
			$z3 = $string[3] == "\0";

			if ($z0 && $z1 && $z2 && !$z3)  return 'UTF-32BE';
			if (!$z0 && $z1 && $z2 && $z3)  return 'UTF-32LE';
			if ($z0 && !$z1 && $z2 && !$z3) return 'UTF-16BE';
			if (!$z0 && $z1 && !$z2 && $z3) return 'UTF-16LE';
		}

		$result = null;

		if (function_exists('mb_detect_encoding')) {
			// FIXME: the order is important, because sometimes
			// iso string is detected as euc-jp and etc.
			$enc = array(
				'UTF-8', 'SJIS', 'GB2312',
				'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
				'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
				'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
				'WINDOWS-1252', 'WINDOWS-1251', 'EUC-JP', 'EUC-TW', 'KOI8-R', 'BIG5',
				'ISO-2022-KR', 'ISO-2022-JP',
			);

			$result = mb_detect_encoding($string, join(',', $enc));
		}
		else {
			// No match, check for UTF-8
			// from http://w3.org/International/questions/qa-forms-utf-8.html
			if (preg_match('/\A(
				[\x09\x0A\x0D\x20-\x7E]
				| [\xC2-\xDF][\x80-\xBF]
				| \xE0[\xA0-\xBF][\x80-\xBF]
				| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
				| \xED[\x80-\x9F][\x80-\xBF]
				| \xF0[\x90-\xBF][\x80-\xBF]{2}
				| [\xF1-\xF3][\x80-\xBF]{3}
				| \xF4[\x80-\x8F][\x80-\xBF]{2}
				)*\z/xs', substr($string, 0, 2048))
			) {
				return 'UTF-8';
			}
		}

		return $result ? $result : $failover;
	}
	702
	703
	/**
	 * Removes non-unicode characters from input.
	 *
	 * Arrays are processed recursively; values that are not strings (and the
	 * empty string) are returned untouched. Invalid byte sequences are
	 * dropped, not replaced with a substitution character.
	 *
	 * @param mixed $input String or array.
	 *
	 * @return mixed String or array
	 */
	public static function clean($input)
	{
		// handle input of type array
		if (is_array($input)) {
			foreach ($input as $idx => $val) {
				$input[$idx] = self::clean($val);
			}
			return $input;
		}

		if (!is_string($input) || $input == '') {
			return $input;
		}

		// iconv/mbstring are much faster (especially with long strings)
		if (function_exists('mb_convert_encoding')) {
			// Do the same as //IGNORE with iconv: drop invalid sequences
			// instead of replacing them with the substitute character
			$msch = mb_substitute_character();
			mb_substitute_character('none');
			$res = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
			mb_substitute_character($msch);

			if ($res !== false) {
				return $res;
			}
		}

		if (function_exists('iconv')) {
			if (($res = @iconv('UTF-8', 'UTF-8//IGNORE', $input)) !== false) {
				return $res;
			}
		}

		// pure-PHP fallback: collect each multibyte sequence in $seq and
		// keep it only if it matches a well-formed UTF-8 pattern
		$seq    = '';
		$out    = '';
		$regexp = '/^('.
//          '[\x00-\x7F]'.                                  // UTF8-1
			'|[\xC2-\xDF][\x80-\xBF]'.                      // UTF8-2
			'|\xE0[\xA0-\xBF][\x80-\xBF]'.                  // UTF8-3
			'|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'.           // UTF8-3
			'|\xED[\x80-\x9F][\x80-\xBF]'.                  // UTF8-3
			'|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'.           // UTF8-3
			'|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'.       // UTF8-4
			'|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.// UTF8-4
			'|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'.       // UTF8-4
			')$/';

		for ($i = 0, $len = strlen($input); $i < $len; $i++) {
			$chr = $input[$i];
			$ord = ord($chr);

			// 1-byte character
			if ($ord <= 0x7F) {
				if ($seq) {
					$out .= preg_match($regexp, $seq) ? $seq : '';
				}
				$seq = '';
				$out .= $chr;
			// first (or second) byte of multibyte sequence
			}
			else if ($ord >= 0xC0) {
				if (strlen($seq) > 1) {
					$out .= preg_match($regexp, $seq) ? $seq : '';
					$seq = '';
				}
				else if ($seq && ord($seq) < 0xC0) {
					$seq = '';
				}
				$seq .= $chr;
			// next byte of multibyte sequence
			}
			else if ($seq) {
				$seq .= $chr;
			}
		}

		if ($seq) {
			$out .= preg_match($regexp, $seq) ? $seq : '';
		}

		return $out;
	}
	787
	788	}