githubFork/roundcubemail.git

First attempt to search in multiple folders; do it multi-threaded using pth...

Thomas

2013-10-14 566747af00ae413c942a7c6702e24c044af36f17

commit \| author \| age
c321a9	1	<?php
T	2
	3	/*
	4	+-----------------------------------------------------------------------+
	5	\| This file is part of the Roundcube Webmail client \|
	6	\| Copyright (C) 2005-2012, The Roundcube Dev Team \|
	7	\| Copyright (C) 2011-2012, Kolab Systems AG \|
	8	\| Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org> \|
7fe381	9	\| \|
T	10	\| Licensed under the GNU General Public License version 3 or \|
	11	\| any later version with exceptions for skins & plugins. \|
	12	\| See the README file for a full license statement. \|
c321a9	13	\| \|
T	14	\| PURPOSE: \|
	15	\| Provide charset conversion functionality \|
	16	+-----------------------------------------------------------------------+
	17	\| Author: Thomas Bruederli <roundcube@gmail.com> \|
	18	\| Author: Aleksander Machniak <alec@alec.pl> \|
	19	+-----------------------------------------------------------------------+
	20	*/
	21
	22	/**
	23	* Character sets conversion functionality
	24	*
9ab346	25	* @package Framework
AM	26	* @subpackage Core
	27	* @author Thomas Bruederli <roundcube@gmail.com>
	28	* @author Aleksander Machniak <alec@alec.pl>
	29	* @author Edmund Grimley Evans <edmundo@rano.org>
c321a9	30	*/
T	31	class rcube_charset
	32	{
	33	// Aliases: some of them from HTML5 spec.
	34	static public $aliases = array(
	35	'USASCII' => 'WINDOWS-1252',
	36	'ANSIX31101983' => 'WINDOWS-1252',
	37	'ANSIX341968' => 'WINDOWS-1252',
	38	'UNKNOWN8BIT' => 'ISO-8859-15',
	39	'UNKNOWN' => 'ISO-8859-15',
	40	'USERDEFINED' => 'ISO-8859-15',
	41	'KSC56011987' => 'EUC-KR',
1495ac	42	'GB2312' => 'GBK',
A	43	'GB231280' => 'GBK',
	44	'UNICODE' => 'UTF-8',
	45	'UTF7IMAP' => 'UTF7-IMAP',
	46	'TIS620' => 'WINDOWS-874',
	47	'ISO88599' => 'WINDOWS-1254',
	48	'ISO885911' => 'WINDOWS-874',
	49	'MACROMAN' => 'MACINTOSH',
c321a9	50	'77' => 'MAC',
T	51	'128' => 'SHIFT-JIS',
	52	'129' => 'CP949',
	53	'130' => 'CP1361',
	54	'134' => 'GBK',
	55	'136' => 'BIG5',
	56	'161' => 'WINDOWS-1253',
	57	'162' => 'WINDOWS-1254',
	58	'163' => 'WINDOWS-1258',
	59	'177' => 'WINDOWS-1255',
	60	'178' => 'WINDOWS-1256',
	61	'186' => 'WINDOWS-1257',
	62	'204' => 'WINDOWS-1251',
	63	'222' => 'WINDOWS-874',
	64	'238' => 'WINDOWS-1250',
	65	'MS950' => 'CP950',
	66	'WINDOWS949' => 'UHC',
	67	);
	68
	69
	70	/**
	71	* Catch an error and throw an exception.
	72	*
	73	* @param int Level of the error
	74	* @param string Error message
	75	*/
66d215	76	public static function error_handler($errno, $errstr)
c321a9	77	{
T	78	throw new ErrorException($errstr, 0, $errno);
	79	}
	80
	81
	82	/**
	83	* Parse and validate charset name string (see #1485758).
	84	* Sometimes charset string is malformed, there are also charset aliases
	85	* but we need strict names for charset conversion (specially utf8 class)
	86	*
40e1c2	87	* @param string $input Input charset name
c321a9	88	*
T	89	* @return string The validated charset name
	90	*/
f5f9b4	91	public static function parse_charset($input)
c321a9	92	{
T	93	static $charsets = array();
	94	$charset = strtoupper($input);
	95
	96	if (isset($charsets[$input])) {
	97	return $charsets[$input];
	98	}
	99
	100	$charset = preg_replace(array(
	101	'/^[^0-9A-Z]+/', // e.g. _ISO-8859-JP$SIO
	102	'/\$.*$/', // e.g. _ISO-8859-JP$SIO
	103	'/UNICODE-1-1-*/', // RFC1641/1642
	104	'/^X-/', // X- prefix (e.g. X-ROMAN8 => ROMAN8)
	105	), '', $charset);
	106
	107	if ($charset == 'BINARY') {
	108	return $charsets[$input] = null;
	109	}
	110
	111	// allow A-Z and 0-9 only
	112	$str = preg_replace('/[^A-Z0-9]/', '', $charset);
	113
	114	if (isset(self::$aliases[$str])) {
	115	$result = self::$aliases[$str];
	116	}
	117	// UTF
	118	else if (preg_match('/U[A-Z][A-Z](7\|8\|16\|32)(BE\|LE)*/', $str, $m)) {
	119	$result = 'UTF-' . $m[1] . $m[2];
	120	}
	121	// ISO-8859
	122	else if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {
	123	$iso = 'ISO-8859-' . ($m[1] ? $m[1] : 1);
	124	// some clients sends windows-1252 text as latin1,
	125	// it is safe to use windows-1252 for all latin1
	126	$result = $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;
	127	}
	128	// handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE
	129	else if (preg_match('/(WIN\|WINDOWS)([0-9]+)/', $str, $m)) {
	130	$result = 'WINDOWS-' . $m[2];
	131	}
	132	// LATIN
	133	else if (preg_match('/LATIN(.*)/', $str, $m)) {
	134	$aliases = array('2' => 2, '3' => 3, '4' => 4, '5' => 9, '6' => 10,
	135	'7' => 13, '8' => 14, '9' => 15, '10' => 16,
	136	'ARABIC' => 6, 'CYRILLIC' => 5, 'GREEK' => 7, 'GREEK1' => 7, 'HEBREW' => 8
	137	);
	138
	139	// some clients sends windows-1252 text as latin1,
	140	// it is safe to use windows-1252 for all latin1
	141	if ($m[1] == 1) {
	142	$result = 'WINDOWS-1252';
	143	}
	144	// if iconv is not supported we need ISO labels, it's also safe for iconv
	145	else if (!empty($aliases[$m[1]])) {
	146	$result = 'ISO-8859-'.$aliases[$m[1]];
	147	}
	148	// iconv requires convertion of e.g. LATIN-1 to LATIN1
	149	else {
	150	$result = $str;
	151	}
	152	}
	153	else {
	154	$result = $charset;
	155	}
	156
	157	$charsets[$input] = $result;
	158
	159	return $result;
	160	}
	161
	162
	163	/**
	164	* Convert a string from one charset to another.
	165	* Uses mbstring and iconv functions if possible
	166	*
	167	* @param string Input string
	168	* @param string Suspected charset of the input string
a92beb	169	* @param string Target charset to convert to; defaults to RCUBE_CHARSET
c321a9	170	*
T	171	* @return string Converted string
	172	*/
	173	public static function convert($str, $from, $to = null)
	174	{
	175	static $iconv_options = null;
	176	static $mbstring_list = null;
bc1e4f	177	static $mbstring_sch = null;
c321a9	178	static $conv = null;
T	179
a92beb	180	$to = empty($to) ? RCUBE_CHARSET : $to;
f5f9b4	181	$from = self::parse_charset($from);
c321a9	182
764641	183	// It is a common case when UTF-16 charset is used with US-ASCII content (#1488654)
AM	184	// In that case we can just skip the conversion (use UTF-8)
	185	if ($from == 'UTF-16' && !preg_match('/[^\x00-\x7F]/', $str)) {
	186	$from = 'UTF-8';
	187	}
	188
c321a9	189	if ($from == $to \|\| empty($str) \|\| empty($from)) {
T	190	return $str;
	191	}
	192
f5f9b4	193	if ($iconv_options === null) {
A	194	if (function_exists('iconv')) {
c321a9	195	// ignore characters not available in output charset
T	196	$iconv_options = '//IGNORE';
	197	if (iconv('', $iconv_options, '') === false) {
	198	// iconv implementation does not support options
	199	$iconv_options = '';
	200	}
	201	}
172302	202	else {
AM	203	$iconv_options = false;
	204	}
f5f9b4	205	}
c321a9	206
f5f9b4	207	// convert charset using iconv module
172302	208	if ($iconv_options !== false && $from != 'UTF7-IMAP' && $to != 'UTF7-IMAP') {
c321a9	209	// throw an exception if iconv reports an illegal character in input
T	210	// it means that input string has been truncated
	211	set_error_handler(array('rcube_charset', 'error_handler'), E_NOTICE);
	212	try {
	213	$_iconv = iconv($from, $to . $iconv_options, $str);
	214	} catch (ErrorException $e) {
	215	$_iconv = false;
	216	}
	217	restore_error_handler();
	218
	219	if ($_iconv !== false) {
	220	return $_iconv;
	221	}
	222	}
	223
f5f9b4	224	if ($mbstring_list === null) {
A	225	if (extension_loaded('mbstring')) {
bc1e4f	226	$mbstring_sch = mb_substitute_character();
c321a9	227	$mbstring_list = mb_list_encodings();
T	228	$mbstring_list = array_map('strtoupper', $mbstring_list);
	229	}
172302	230	else {
AM	231	$mbstring_list = false;
	232	}
f5f9b4	233	}
A	234
	235	// convert charset using mbstring module
172302	236	if ($mbstring_list !== false) {
f5f9b4	237	$aliases['WINDOWS-1257'] = 'ISO-8859-13';
bc1e4f	238	// it happens that mbstring supports ASCII but not US-ASCII
AM	239	if (($from == 'US-ASCII' \|\| $to == 'US-ASCII') && !in_array('US-ASCII', $mbstring_list)) {
	240	$aliases['US-ASCII'] = 'ASCII';
	241	}
c321a9	242
T	243	$mb_from = $aliases[$from] ? $aliases[$from] : $from;
	244	$mb_to = $aliases[$to] ? $aliases[$to] : $to;
	245
	246	// return if encoding found, string matches encoding and convert succeeded
	247	if (in_array($mb_from, $mbstring_list) && in_array($mb_to, $mbstring_list)) {
bc1e4f	248	if (mb_check_encoding($str, $mb_from)) {
AM	249	// Do the same as //IGNORE with iconv
	250	mb_substitute_character('none');
	251	$out = mb_convert_encoding($str, $mb_to, $mb_from);
	252	mb_substitute_character($mbstring_sch);
	253
	254	if ($out !== false) {
	255	return $out;
	256	}
c321a9	257	}
T	258	}
	259	}
	260
	261	// convert charset using bundled classes/functions
	262	if ($to == 'UTF-8') {
	263	if ($from == 'UTF7-IMAP') {
	264	if ($_str = self::utf7imap_to_utf8($str)) {
	265	return $_str;
	266	}
	267	}
	268	else if ($from == 'UTF-7') {
	269	if ($_str = self::utf7_to_utf8($str)) {
	270	return $_str;
	271	}
	272	}
	273	else if ($from == 'ISO-8859-1' && function_exists('utf8_encode')) {
	274	return utf8_encode($str);
	275	}
	276	else if (class_exists('utf8')) {
	277	if (!$conv) {
	278	$conv = new utf8($from);
	279	}
	280	else {
	281	$conv->loadCharset($from);
	282	}
	283
	284	if ($_str = $conv->strToUtf8($str)) {
	285	return $_str;
	286	}
	287	}
	288	}
	289
	290	// encode string for output
	291	if ($from == 'UTF-8') {
	292	// @TODO: we need a function for UTF-7 (RFC2152) conversion
	293	if ($to == 'UTF7-IMAP' \|\| $to == 'UTF-7') {
20efa5	294	if ($_str = self::utf8_to_utf7imap($str)) {
c321a9	295	return $_str;
T	296	}
	297	}
	298	else if ($to == 'ISO-8859-1' && function_exists('utf8_decode')) {
	299	return utf8_decode($str);
	300	}
	301	else if (class_exists('utf8')) {
	302	if (!$conv) {
	303	$conv = new utf8($to);
	304	}
	305	else {
	306	$conv->loadCharset($from);
	307	}
	308
	309	if ($_str = $conv->strToUtf8($str)) {
	310	return $_str;
	311	}
	312	}
	313	}
	314
	315	// return original string
	316	return $str;
	317	}
	318
	319
	320	/**
	321	* Converts string from standard UTF-7 (RFC 2152) to UTF-8.
	322	*
	323	* @param string Input string (UTF-7)
	324	*
	325	* @return string Converted string (UTF-8)
	326	*/
	327	public static function utf7_to_utf8($str)
	328	{
	329	$Index_64 = array(
	330	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	331	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	332	0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,0,0,
	333	1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,
	334	0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	335	1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
	336	0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	337	1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
	338	);
	339
	340	$u7len = strlen($str);
	341	$str = strval($str);
	342	$res = '';
	343
	344	for ($i=0; $u7len > 0; $i++, $u7len--) {
	345	$u7 = $str[$i];
	346	if ($u7 == '+') {
	347	$i++;
	348	$u7len--;
	349	$ch = '';
	350
	351	for (; $u7len > 0; $i++, $u7len--) {
	352	$u7 = $str[$i];
	353
	354	if (!$Index_64[ord($u7)]) {
	355	break;
	356	}
	357
1495ac	358	$ch .= $u7;
c321a9	359	}
T	360
	361	if ($ch == '') {
	362	if ($u7 == '-') {
	363	$res .= '+';
	364	}
	365
	366	continue;
	367	}
	368
	369	$res .= self::utf16_to_utf8(base64_decode($ch));
	370	}
	371	else {
	372	$res .= $u7;
	373	}
	374	}
	375
	376	return $res;
	377	}
	378
	379
	380	/**
	381	* Converts string from UTF-16 to UTF-8 (helper for utf-7 to utf-8 conversion)
	382	*
	383	* @param string Input string
	384	*
	385	* @return string The converted string
	386	*/
	387	public static function utf16_to_utf8($str)
	388	{
	389	$len = strlen($str);
	390	$dec = '';
	391
	392	for ($i = 0; $i < $len; $i += 2) {
	393	$c = ord($str[$i]) << 8 \| ord($str[$i + 1]);
	394	if ($c >= 0x0001 && $c <= 0x007F) {
	395	$dec .= chr($c);
	396	}
	397	else if ($c > 0x07FF) {
	398	$dec .= chr(0xE0 \| (($c >> 12) & 0x0F));
	399	$dec .= chr(0x80 \| (($c >> 6) & 0x3F));
	400	$dec .= chr(0x80 \| (($c >> 0) & 0x3F));
	401	}
	402	else {
	403	$dec .= chr(0xC0 \| (($c >> 6) & 0x1F));
	404	$dec .= chr(0x80 \| (($c >> 0) & 0x3F));
	405	}
	406	}
	407
	408	return $dec;
	409	}
	410
	411
	412	/**
	413	* Convert the data ($str) from RFC 2060's UTF-7 to UTF-8.
	414	* If input data is invalid, return the original input string.
	415	* RFC 2060 obviously intends the encoding to be unique (see
	416	* point 5 in section 5.1.3), so we reject any non-canonical
	417	* form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead
	418	* of &AMAAwA-).
	419	*
	420	* Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
	421	*
	422	* @param string $str Input string (UTF7-IMAP)
	423	*
	424	* @return string Output string (UTF-8)
	425	*/
	426	public static function utf7imap_to_utf8($str)
	427	{
	428	$Index_64 = array(
	429	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
	430	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
	431	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
	432	52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
	433	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
	434	15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
	435	-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
	436	41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
	437	);
	438
	439	$u7len = strlen($str);
	440	$str = strval($str);
	441	$p = '';
	442	$err = '';
	443
	444	for ($i=0; $u7len > 0; $i++, $u7len--) {
	445	$u7 = $str[$i];
	446	if ($u7 == '&') {
	447	$i++;
	448	$u7len--;
	449	$u7 = $str[$i];
	450
	451	if ($u7len && $u7 == '-') {
	452	$p .= '&';
	453	continue;
	454	}
	455
	456	$ch = 0;
	457	$k = 10;
	458	for (; $u7len > 0; $i++, $u7len--) {
	459	$u7 = $str[$i];
	460
	461	if ((ord($u7) & 0x80) \|\| ($b = $Index_64[ord($u7)]) == -1) {
	462	break;
	463	}
	464
	465	if ($k > 0) {
	466	$ch \|= $b << $k;
	467	$k -= 6;
	468	}
	469	else {
	470	$ch \|= $b >> (-$k);
	471	if ($ch < 0x80) {
	472	// Printable US-ASCII
	473	if (0x20 <= $ch && $ch < 0x7f) {
	474	return $err;
	475	}
	476	$p .= chr($ch);
	477	}
	478	else if ($ch < 0x800) {
	479	$p .= chr(0xc0 \| ($ch >> 6));
	480	$p .= chr(0x80 \| ($ch & 0x3f));
	481	}
	482	else {
	483	$p .= chr(0xe0 \| ($ch >> 12));
	484	$p .= chr(0x80 \| (($ch >> 6) & 0x3f));
	485	$p .= chr(0x80 \| ($ch & 0x3f));
	486	}
	487
	488	$ch = ($b << (16 + $k)) & 0xffff;
	489	$k += 10;
	490	}
	491	}
	492
	493	// Non-zero or too many extra bits
	494	if ($ch \|\| $k < 6) {
	495	return $err;
	496	}
	497
	498	// BASE64 not properly terminated
	499	if (!$u7len \|\| $u7 != '-') {
	500	return $err;
	501	}
	502
	503	// Adjacent BASE64 sections
	504	if ($u7len > 2 && $str[$i+1] == '&' && $str[$i+2] != '-') {
	505	return $err;
	506	}
	507	}
	508	// Not printable US-ASCII
	509	else if (ord($u7) < 0x20 \|\| ord($u7) >= 0x7f) {
	510	return $err;
	511	}
	512	else {
	513	$p .= $u7;
	514	}
	515	}
	516
	517	return $p;
	518	}
	519
	520
	521	/**
	522	* Convert the data ($str) from UTF-8 to RFC 2060's UTF-7.
	523	* Unicode characters above U+FFFF are replaced by U+FFFE.
	524	* If input data is invalid, return an empty string.
	525	*
	526	* Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
	527	*
	528	* @param string $str Input string (UTF-8)
	529	*
	530	* @return string Output string (UTF7-IMAP)
	531	*/
	532	public static function utf8_to_utf7imap($str)
	533	{
	534	$B64Chars = array(
	535	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
	536	'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
	537	'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
	538	't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
	539	'8', '9', '+', ','
	540	);
	541
	542	$u8len = strlen($str);
	543	$base64 = 0;
	544	$i = 0;
	545	$p = '';
	546	$err = '';
	547
	548	while ($u8len) {
	549	$u8 = $str[$i];
	550	$c = ord($u8);
	551
	552	if ($c < 0x80) {
	553	$ch = $c;
	554	$n = 0;
	555	}
	556	else if ($c < 0xc2) {
	557	return $err;
	558	}
	559	else if ($c < 0xe0) {
	560	$ch = $c & 0x1f;
	561	$n = 1;
	562	}
	563	else if ($c < 0xf0) {
	564	$ch = $c & 0x0f;
	565	$n = 2;
	566	}
	567	else if ($c < 0xf8) {
	568	$ch = $c & 0x07;
	569	$n = 3;
	570	}
	571	else if ($c < 0xfc) {
	572	$ch = $c & 0x03;
	573	$n = 4;
	574	}
	575	else if ($c < 0xfe) {
	576	$ch = $c & 0x01;
	577	$n = 5;
	578	}
	579	else {
	580	return $err;
	581	}
	582
	583	$i++;
	584	$u8len--;
	585
	586	if ($n > $u8len) {
	587	return $err;
	588	}
	589
	590	for ($j=0; $j < $n; $j++) {
	591	$o = ord($str[$i+$j]);
	592	if (($o & 0xc0) != 0x80) {
	593	return $err;
	594	}
	595	$ch = ($ch << 6) \| ($o & 0x3f);
	596	}
	597
	598	if ($n > 1 && !($ch >> ($n * 5 + 1))) {
	599	return $err;
	600	}
	601
	602	$i += $n;
	603	$u8len -= $n;
	604
	605	if ($ch < 0x20 \|\| $ch >= 0x7f) {
	606	if (!$base64) {
	607	$p .= '&';
	608	$base64 = 1;
	609	$b = 0;
	610	$k = 10;
	611	}
	612	if ($ch & ~0xffff) {
	613	$ch = 0xfffe;
	614	}
	615
	616	$p .= $B64Chars[($b \| $ch >> $k)];
	617	$k -= 6;
	618	for (; $k >= 0; $k -= 6) {
	619	$p .= $B64Chars[(($ch >> $k) & 0x3f)];
	620	}
	621
	622	$b = ($ch << (-$k)) & 0x3f;
	623	$k += 16;
	624	}
	625	else {
	626	if ($base64) {
	627	if ($k > 10) {
	628	$p .= $B64Chars[$b];
	629	}
	630	$p .= '-';
	631	$base64 = 0;
	632	}
	633
	634	$p .= chr($ch);
	635	if (chr($ch) == '&') {
	636	$p .= '-';
	637	}
	638	}
	639	}
	640
	641	if ($base64) {
	642	if ($k > 10) {
	643	$p .= $B64Chars[$b];
	644	}
	645	$p .= '-';
	646	}
	647
	648	return $p;
	649	}
	650
	651
	652	/**
	653	* A method to guess character set of a string.
	654	*
a5b8ef	655	* @param string $string String
AM	656	* @param string $failover Default result for failover
	657	* @param string $language User language
c321a9	658	*
T	659	* @return string Charset name
	660	*/
a5b8ef	661	public static function detect($string, $failover = null, $language = null)
c321a9	662	{
0679b2	663	if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE'; // Big Endian
AM	664	if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE'; // Little Endian
	665	if (substr($string, 0, 2) == "\xFE\xFF") return 'UTF-16BE'; // Big Endian
	666	if (substr($string, 0, 2) == "\xFF\xFE") return 'UTF-16LE'; // Little Endian
	667	if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8';
	668
	669	// heuristics
	670	if ($string[0] == "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-32BE';
	671	if ($string[0] != "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] == "\0") return 'UTF-32LE';
	672	if ($string[0] == "\0" && $string[1] != "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-16BE';
	673	if ($string[0] != "\0" && $string[1] == "\0" && $string[2] != "\0" && $string[3] == "\0") return 'UTF-16LE';
	674
	675	if (function_exists('mb_detect_encoding')) {
a5b8ef	676	if (empty($language)) {
AM	677	$rcube = rcube::get_instance();
	678	$language = $rcube->get_user_language();
0679b2	679	}
a5b8ef	680
AM	681	// Prioritize charsets according to current language (#1485669)
	682	switch ($language) {
59f031	683	case 'ja_JP':
a5b8ef	684	$prio = array('ISO-2022-JP', 'JIS', 'UTF-8', 'EUC-JP', 'eucJP-win', 'SJIS', 'SJIS-win');
AM	685	break;
	686
59f031	687	case 'zh_CN':
AM	688	case 'zh_TW':
a5b8ef	689	$prio = array('UTF-8', 'BIG-5', 'GB2312', 'EUC-TW');
AM	690	break;
	691
59f031	692	case 'ko_KR':
a5b8ef	693	$prio = array('UTF-8', 'EUC-KR', 'ISO-2022-KR');
AM	694	break;
	695
59f031	696	case 'ru_RU':
a5b8ef	697	$prio = array('UTF-8', 'WINDOWS-1251', 'KOI8-R');
AM	698	break;
	699
59f031	700	case 'tr_TR':
AM	701	$prio = array('UTF-8', 'ISO-8859-9', 'WINDOWS-1254');
	702	break;
	703
a5b8ef	704	default:
AM	705	$prio = array('UTF-8', 'SJIS', 'GB2312',
	706	'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
	707	'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
	708	'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
	709	'WINDOWS-1252', 'WINDOWS-1251', 'EUC-JP', 'EUC-TW', 'KOI8-R', 'BIG-5',
	710	'ISO-2022-KR', 'ISO-2022-JP',
	711	);
	712	}
	713
	714	$encodings = array_unique(array_merge($prio, mb_list_encodings()));
	715
	716	return mb_detect_encoding($string, $encodings);
0679b2	717	}
c321a9	718
a5b8ef	719	// No match, check for UTF-8
AM	720	// from http://w3.org/International/questions/qa-forms-utf-8.html
	721	if (preg_match('/\A(
	722	[\x09\x0A\x0D\x20-\x7E]
	723	\| [\xC2-\xDF][\x80-\xBF]
	724	\| \xE0[\xA0-\xBF][\x80-\xBF]
	725	\| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
	726	\| \xED[\x80-\x9F][\x80-\xBF]
	727	\| \xF0[\x90-\xBF][\x80-\xBF]{2}
	728	\| [\xF1-\xF3][\x80-\xBF]{3}
	729	\| \xF4[\x80-\x8F][\x80-\xBF]{2}
	730	)*\z/xs', substr($string, 0, 2048))
	731	) {
	732	return 'UTF-8';
	733	}
	734
	735	return $failover;
c321a9	736	}
T	737
	738
	739	/**
	740	* Removes non-unicode characters from input.
	741	*
	742	* @param mixed $input String or array.
	743	*
	744	* @return mixed String or array
	745	*/
	746	public static function clean($input)
	747	{
	748	// handle input of type array
	749	if (is_array($input)) {
	750	foreach ($input as $idx => $val) {
	751	$input[$idx] = self::clean($val);
	752	}
	753	return $input;
	754	}
	755
	756	if (!is_string($input) \|\| $input == '') {
	757	return $input;
	758	}
	759
	760	// iconv/mbstring are much faster (especially with long strings)
	761	if (function_exists('mb_convert_encoding')) {
	762	if (($res = mb_convert_encoding($input, 'UTF-8', 'UTF-8')) !== false) {
	763	return $res;
	764	}
	765	}
	766
	767	if (function_exists('iconv')) {
	768	if (($res = @iconv('UTF-8', 'UTF-8//IGNORE', $input)) !== false) {
	769	return $res;
	770	}
	771	}
	772
	773	$seq = '';
	774	$out = '';
	775	$regexp = '/^('.
	776	// '[\x00-\x7F]'. // UTF8-1
	777	'\|[\xC2-\xDF][\x80-\xBF]'. // UTF8-2
	778	'\|\xE0[\xA0-\xBF][\x80-\xBF]'. // UTF8-3
	779	'\|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'. // UTF8-3
	780	'\|\xED[\x80-\x9F][\x80-\xBF]'. // UTF8-3
	781	'\|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'. // UTF8-3
	782	'\|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'. // UTF8-4
	783	'\|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.// UTF8-4
	784	'\|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'. // UTF8-4
	785	')$/';
	786
	787	for ($i = 0, $len = strlen($input); $i < $len; $i++) {
	788	$chr = $input[$i];
	789	$ord = ord($chr);
	790
	791	// 1-byte character
	792	if ($ord <= 0x7F) {
	793	if ($seq) {
	794	$out .= preg_match($regexp, $seq) ? $seq : '';
	795	}
	796	$seq = '';
	797	$out .= $chr;
	798	// first (or second) byte of multibyte sequence
	799	}
	800	else if ($ord >= 0xC0) {
	801	if (strlen($seq) > 1) {
1495ac	802	$out .= preg_match($regexp, $seq) ? $seq : '';
c321a9	803	$seq = '';
T	804	}
	805	else if ($seq && ord($seq) < 0xC0) {
	806	$seq = '';
	807	}
	808	$seq .= $chr;
	809	// next byte of multibyte sequence
	810	}
	811	else if ($seq) {
	812	$seq .= $chr;
	813	}
	814	}
	815
	816	if ($seq) {
	817	$out .= preg_match($regexp, $seq) ? $seq : '';
	818	}
	819
	820	return $out;
	821	}
	822
	823	}