githubFork/roundcubemail.git

Fix multi-threaded autocompletion when number of threads > number of sources

thomascube

2012-05-04 5b04ddd6bc9e0af5f73694371cd3988b1d5be7e8

commit \| author \| age
c321a9	1	<?php
T	2
	3	/*
	4	+-----------------------------------------------------------------------+
	5	\| program/include/rcube_charset.php \|
	6	\| \|
	7	\| This file is part of the Roundcube Webmail client \|
	8	\| Copyright (C) 2005-2012, The Roundcube Dev Team \|
	9	\| Copyright (C) 2011-2012, Kolab Systems AG \|
	10	\| Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org> \|
7fe381	11	\| \|
T	12	\| Licensed under the GNU General Public License version 3 or \|
	13	\| any later version with exceptions for skins & plugins. \|
	14	\| See the README file for a full license statement. \|
c321a9	15	\| \|
T	16	\| PURPOSE: \|
	17	\| Provide charset conversion functionality \|
	18	\| \|
	19	+-----------------------------------------------------------------------+
	20	\| Author: Thomas Bruederli <roundcube@gmail.com> \|
	21	\| Author: Aleksander Machniak <alec@alec.pl> \|
	22	+-----------------------------------------------------------------------+
	23
	24	$Id$
	25
	26	*/
	27
	28	/**
	29	* Character sets conversion functionality
	30	*
	31	* @package Core
	32	* @author Thomas Bruederli <roundcube@gmail.com>
	33	* @author Aleksander Machniak <alec@alec.pl>
	34	* @author Edmund Grimley Evans <edmundo@rano.org>
	35	*/
	36	class rcube_charset
	37	{
	38	// Aliases: some of them from HTML5 spec.
	39	static public $aliases = array(
	40	'USASCII' => 'WINDOWS-1252',
	41	'ANSIX31101983' => 'WINDOWS-1252',
	42	'ANSIX341968' => 'WINDOWS-1252',
	43	'UNKNOWN8BIT' => 'ISO-8859-15',
	44	'UNKNOWN' => 'ISO-8859-15',
	45	'USERDEFINED' => 'ISO-8859-15',
	46	'KSC56011987' => 'EUC-KR',
1495ac	47	'GB2312' => 'GBK',
A	48	'GB231280' => 'GBK',
	49	'UNICODE' => 'UTF-8',
	50	'UTF7IMAP' => 'UTF7-IMAP',
	51	'TIS620' => 'WINDOWS-874',
	52	'ISO88599' => 'WINDOWS-1254',
	53	'ISO885911' => 'WINDOWS-874',
	54	'MACROMAN' => 'MACINTOSH',
c321a9	55	'77' => 'MAC',
T	56	'128' => 'SHIFT-JIS',
	57	'129' => 'CP949',
	58	'130' => 'CP1361',
	59	'134' => 'GBK',
	60	'136' => 'BIG5',
	61	'161' => 'WINDOWS-1253',
	62	'162' => 'WINDOWS-1254',
	63	'163' => 'WINDOWS-1258',
	64	'177' => 'WINDOWS-1255',
	65	'178' => 'WINDOWS-1256',
	66	'186' => 'WINDOWS-1257',
	67	'204' => 'WINDOWS-1251',
	68	'222' => 'WINDOWS-874',
	69	'238' => 'WINDOWS-1250',
	70	'MS950' => 'CP950',
	71	'WINDOWS949' => 'UHC',
	72	);
	73
	74
	75	/**
	76	* Catch an error and throw an exception.
	77	*
	78	* @param int Level of the error
	79	* @param string Error message
	80	*/
66d215	81	public static function error_handler($errno, $errstr)
c321a9	82	{
T	83	throw new ErrorException($errstr, 0, $errno);
	84	}
	85
	86
	87	/**
	88	* Parse and validate charset name string (see #1485758).
	89	* Sometimes charset string is malformed, there are also charset aliases
	90	* but we need strict names for charset conversion (specially utf8 class)
	91	*
	92	* @param string Input charset name
	93	*
	94	* @return string The validated charset name
	95	*/
f5f9b4	96	public static function parse_charset($input)
c321a9	97	{
T	98	static $charsets = array();
	99	$charset = strtoupper($input);
	100
	101	if (isset($charsets[$input])) {
	102	return $charsets[$input];
	103	}
	104
	105	$charset = preg_replace(array(
	106	'/^[^0-9A-Z]+/', // e.g. _ISO-8859-JP$SIO
	107	'/\$.*$/', // e.g. _ISO-8859-JP$SIO
	108	'/UNICODE-1-1-*/', // RFC1641/1642
	109	'/^X-/', // X- prefix (e.g. X-ROMAN8 => ROMAN8)
	110	), '', $charset);
	111
	112	if ($charset == 'BINARY') {
	113	return $charsets[$input] = null;
	114	}
	115
	116	// allow A-Z and 0-9 only
	117	$str = preg_replace('/[^A-Z0-9]/', '', $charset);
	118
	119	if (isset(self::$aliases[$str])) {
	120	$result = self::$aliases[$str];
	121	}
	122	// UTF
	123	else if (preg_match('/U[A-Z][A-Z](7\|8\|16\|32)(BE\|LE)*/', $str, $m)) {
	124	$result = 'UTF-' . $m[1] . $m[2];
	125	}
	126	// ISO-8859
	127	else if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {
	128	$iso = 'ISO-8859-' . ($m[1] ? $m[1] : 1);
	129	// some clients sends windows-1252 text as latin1,
	130	// it is safe to use windows-1252 for all latin1
	131	$result = $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;
	132	}
	133	// handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE
	134	else if (preg_match('/(WIN\|WINDOWS)([0-9]+)/', $str, $m)) {
	135	$result = 'WINDOWS-' . $m[2];
	136	}
	137	// LATIN
	138	else if (preg_match('/LATIN(.*)/', $str, $m)) {
	139	$aliases = array('2' => 2, '3' => 3, '4' => 4, '5' => 9, '6' => 10,
	140	'7' => 13, '8' => 14, '9' => 15, '10' => 16,
	141	'ARABIC' => 6, 'CYRILLIC' => 5, 'GREEK' => 7, 'GREEK1' => 7, 'HEBREW' => 8
	142	);
	143
	144	// some clients sends windows-1252 text as latin1,
	145	// it is safe to use windows-1252 for all latin1
	146	if ($m[1] == 1) {
	147	$result = 'WINDOWS-1252';
	148	}
	149	// if iconv is not supported we need ISO labels, it's also safe for iconv
	150	else if (!empty($aliases[$m[1]])) {
	151	$result = 'ISO-8859-'.$aliases[$m[1]];
	152	}
	153	// iconv requires convertion of e.g. LATIN-1 to LATIN1
	154	else {
	155	$result = $str;
	156	}
	157	}
	158	else {
	159	$result = $charset;
	160	}
	161
	162	$charsets[$input] = $result;
	163
	164	return $result;
	165	}
	166
	167
	168	/**
	169	* Convert a string from one charset to another.
	170	* Uses mbstring and iconv functions if possible
	171	*
	172	* @param string Input string
	173	* @param string Suspected charset of the input string
	174	* @param string Target charset to convert to; defaults to RCMAIL_CHARSET
	175	*
	176	* @return string Converted string
	177	*/
	178	public static function convert($str, $from, $to = null)
	179	{
	180	static $iconv_options = null;
	181	static $mbstring_list = null;
	182	static $conv = null;
	183
f5f9b4	184	$to = empty($to) ? strtoupper(RCMAIL_CHARSET) : self::parse_charset($to);
A	185	$from = self::parse_charset($from);
c321a9	186
T	187	if ($from == $to \|\| empty($str) \|\| empty($from)) {
	188	return $str;
	189	}
	190
f5f9b4	191	if ($iconv_options === null) {
A	192	if (function_exists('iconv')) {
c321a9	193	// ignore characters not available in output charset
T	194	$iconv_options = '//IGNORE';
	195	if (iconv('', $iconv_options, '') === false) {
	196	// iconv implementation does not support options
	197	$iconv_options = '';
	198	}
	199	}
f5f9b4	200	}
c321a9	201
f5f9b4	202	// convert charset using iconv module
A	203	if ($iconv_options !== null && $from != 'UTF7-IMAP' && $to != 'UTF7-IMAP') {
c321a9	204	// throw an exception if iconv reports an illegal character in input
T	205	// it means that input string has been truncated
	206	set_error_handler(array('rcube_charset', 'error_handler'), E_NOTICE);
	207	try {
	208	$_iconv = iconv($from, $to . $iconv_options, $str);
	209	} catch (ErrorException $e) {
	210	$_iconv = false;
	211	}
	212	restore_error_handler();
	213
	214	if ($_iconv !== false) {
	215	return $_iconv;
	216	}
	217	}
	218
f5f9b4	219	if ($mbstring_list === null) {
A	220	if (extension_loaded('mbstring')) {
c321a9	221	$mbstring_list = mb_list_encodings();
T	222	$mbstring_list = array_map('strtoupper', $mbstring_list);
	223	}
f5f9b4	224	}
A	225
	226	// convert charset using mbstring module
	227	if ($mbstring_list !== null) {
	228	$aliases['WINDOWS-1257'] = 'ISO-8859-13';
c321a9	229
T	230	$mb_from = $aliases[$from] ? $aliases[$from] : $from;
	231	$mb_to = $aliases[$to] ? $aliases[$to] : $to;
	232
	233	// return if encoding found, string matches encoding and convert succeeded
	234	if (in_array($mb_from, $mbstring_list) && in_array($mb_to, $mbstring_list)) {
	235	if (mb_check_encoding($str, $mb_from) && ($out = mb_convert_encoding($str, $mb_to, $mb_from))) {
	236	return $out;
	237	}
	238	}
	239	}
	240
	241	// convert charset using bundled classes/functions
	242	if ($to == 'UTF-8') {
	243	if ($from == 'UTF7-IMAP') {
	244	if ($_str = self::utf7imap_to_utf8($str)) {
	245	return $_str;
	246	}
	247	}
	248	else if ($from == 'UTF-7') {
	249	if ($_str = self::utf7_to_utf8($str)) {
	250	return $_str;
	251	}
	252	}
	253	else if ($from == 'ISO-8859-1' && function_exists('utf8_encode')) {
	254	return utf8_encode($str);
	255	}
	256	else if (class_exists('utf8')) {
	257	if (!$conv) {
	258	$conv = new utf8($from);
	259	}
	260	else {
	261	$conv->loadCharset($from);
	262	}
	263
	264	if ($_str = $conv->strToUtf8($str)) {
	265	return $_str;
	266	}
	267	}
	268	}
	269
	270	// encode string for output
	271	if ($from == 'UTF-8') {
	272	// @TODO: we need a function for UTF-7 (RFC2152) conversion
	273	if ($to == 'UTF7-IMAP' \|\| $to == 'UTF-7') {
20efa5	274	if ($_str = self::utf8_to_utf7imap($str)) {
c321a9	275	return $_str;
T	276	}
	277	}
	278	else if ($to == 'ISO-8859-1' && function_exists('utf8_decode')) {
	279	return utf8_decode($str);
	280	}
	281	else if (class_exists('utf8')) {
	282	if (!$conv) {
	283	$conv = new utf8($to);
	284	}
	285	else {
	286	$conv->loadCharset($from);
	287	}
	288
	289	if ($_str = $conv->strToUtf8($str)) {
	290	return $_str;
	291	}
	292	}
	293	}
	294
	295	// return original string
	296	return $str;
	297	}
	298
	299
	300	/**
	301	* Converts string from standard UTF-7 (RFC 2152) to UTF-8.
	302	*
	303	* @param string Input string (UTF-7)
	304	*
	305	* @return string Converted string (UTF-8)
	306	*/
	307	public static function utf7_to_utf8($str)
	308	{
	309	$Index_64 = array(
	310	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	311	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	312	0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,0,0,
	313	1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,
	314	0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	315	1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
	316	0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	317	1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
	318	);
	319
	320	$u7len = strlen($str);
	321	$str = strval($str);
	322	$res = '';
	323
	324	for ($i=0; $u7len > 0; $i++, $u7len--) {
	325	$u7 = $str[$i];
	326	if ($u7 == '+') {
	327	$i++;
	328	$u7len--;
	329	$ch = '';
	330
	331	for (; $u7len > 0; $i++, $u7len--) {
	332	$u7 = $str[$i];
	333
	334	if (!$Index_64[ord($u7)]) {
	335	break;
	336	}
	337
1495ac	338	$ch .= $u7;
c321a9	339	}
T	340
	341	if ($ch == '') {
	342	if ($u7 == '-') {
	343	$res .= '+';
	344	}
	345
	346	continue;
	347	}
	348
	349	$res .= self::utf16_to_utf8(base64_decode($ch));
	350	}
	351	else {
	352	$res .= $u7;
	353	}
	354	}
	355
	356	return $res;
	357	}
	358
	359
	360	/**
	361	* Converts string from UTF-16 to UTF-8 (helper for utf-7 to utf-8 conversion)
	362	*
	363	* @param string Input string
	364	*
	365	* @return string The converted string
	366	*/
	367	public static function utf16_to_utf8($str)
	368	{
	369	$len = strlen($str);
	370	$dec = '';
	371
	372	for ($i = 0; $i < $len; $i += 2) {
	373	$c = ord($str[$i]) << 8 \| ord($str[$i + 1]);
	374	if ($c >= 0x0001 && $c <= 0x007F) {
	375	$dec .= chr($c);
	376	}
	377	else if ($c > 0x07FF) {
	378	$dec .= chr(0xE0 \| (($c >> 12) & 0x0F));
	379	$dec .= chr(0x80 \| (($c >> 6) & 0x3F));
	380	$dec .= chr(0x80 \| (($c >> 0) & 0x3F));
	381	}
	382	else {
	383	$dec .= chr(0xC0 \| (($c >> 6) & 0x1F));
	384	$dec .= chr(0x80 \| (($c >> 0) & 0x3F));
	385	}
	386	}
	387
	388	return $dec;
	389	}
	390
	391
	392	/**
	393	* Convert the data ($str) from RFC 2060's UTF-7 to UTF-8.
	394	* If input data is invalid, return the original input string.
	395	* RFC 2060 obviously intends the encoding to be unique (see
	396	* point 5 in section 5.1.3), so we reject any non-canonical
	397	* form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead
	398	* of &AMAAwA-).
	399	*
	400	* Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
	401	*
	402	* @param string $str Input string (UTF7-IMAP)
	403	*
	404	* @return string Output string (UTF-8)
	405	*/
	406	public static function utf7imap_to_utf8($str)
	407	{
	408	$Index_64 = array(
	409	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
	410	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
	411	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
	412	52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
	413	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
	414	15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
	415	-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
	416	41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
	417	);
	418
	419	$u7len = strlen($str);
	420	$str = strval($str);
	421	$p = '';
	422	$err = '';
	423
	424	for ($i=0; $u7len > 0; $i++, $u7len--) {
	425	$u7 = $str[$i];
	426	if ($u7 == '&') {
	427	$i++;
	428	$u7len--;
	429	$u7 = $str[$i];
	430
	431	if ($u7len && $u7 == '-') {
	432	$p .= '&';
	433	continue;
	434	}
	435
	436	$ch = 0;
	437	$k = 10;
	438	for (; $u7len > 0; $i++, $u7len--) {
	439	$u7 = $str[$i];
	440
	441	if ((ord($u7) & 0x80) \|\| ($b = $Index_64[ord($u7)]) == -1) {
	442	break;
	443	}
	444
	445	if ($k > 0) {
	446	$ch \|= $b << $k;
	447	$k -= 6;
	448	}
	449	else {
	450	$ch \|= $b >> (-$k);
	451	if ($ch < 0x80) {
	452	// Printable US-ASCII
	453	if (0x20 <= $ch && $ch < 0x7f) {
	454	return $err;
	455	}
	456	$p .= chr($ch);
	457	}
	458	else if ($ch < 0x800) {
	459	$p .= chr(0xc0 \| ($ch >> 6));
	460	$p .= chr(0x80 \| ($ch & 0x3f));
	461	}
	462	else {
	463	$p .= chr(0xe0 \| ($ch >> 12));
	464	$p .= chr(0x80 \| (($ch >> 6) & 0x3f));
	465	$p .= chr(0x80 \| ($ch & 0x3f));
	466	}
	467
	468	$ch = ($b << (16 + $k)) & 0xffff;
	469	$k += 10;
	470	}
	471	}
	472
	473	// Non-zero or too many extra bits
	474	if ($ch \|\| $k < 6) {
	475	return $err;
	476	}
	477
	478	// BASE64 not properly terminated
	479	if (!$u7len \|\| $u7 != '-') {
	480	return $err;
	481	}
	482
	483	// Adjacent BASE64 sections
	484	if ($u7len > 2 && $str[$i+1] == '&' && $str[$i+2] != '-') {
	485	return $err;
	486	}
	487	}
	488	// Not printable US-ASCII
	489	else if (ord($u7) < 0x20 \|\| ord($u7) >= 0x7f) {
	490	return $err;
	491	}
	492	else {
	493	$p .= $u7;
	494	}
	495	}
	496
	497	return $p;
	498	}
	499
	500
	501	/**
	502	* Convert the data ($str) from UTF-8 to RFC 2060's UTF-7.
	503	* Unicode characters above U+FFFF are replaced by U+FFFE.
	504	* If input data is invalid, return an empty string.
	505	*
	506	* Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
	507	*
	508	* @param string $str Input string (UTF-8)
	509	*
	510	* @return string Output string (UTF7-IMAP)
	511	*/
	512	public static function utf8_to_utf7imap($str)
	513	{
	514	$B64Chars = array(
	515	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
	516	'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
	517	'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
	518	't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
	519	'8', '9', '+', ','
	520	);
	521
	522	$u8len = strlen($str);
	523	$base64 = 0;
	524	$i = 0;
	525	$p = '';
	526	$err = '';
	527
	528	while ($u8len) {
	529	$u8 = $str[$i];
	530	$c = ord($u8);
	531
	532	if ($c < 0x80) {
	533	$ch = $c;
	534	$n = 0;
	535	}
	536	else if ($c < 0xc2) {
	537	return $err;
	538	}
	539	else if ($c < 0xe0) {
	540	$ch = $c & 0x1f;
	541	$n = 1;
	542	}
	543	else if ($c < 0xf0) {
	544	$ch = $c & 0x0f;
	545	$n = 2;
	546	}
	547	else if ($c < 0xf8) {
	548	$ch = $c & 0x07;
	549	$n = 3;
	550	}
	551	else if ($c < 0xfc) {
	552	$ch = $c & 0x03;
	553	$n = 4;
	554	}
	555	else if ($c < 0xfe) {
	556	$ch = $c & 0x01;
	557	$n = 5;
	558	}
	559	else {
	560	return $err;
	561	}
	562
	563	$i++;
	564	$u8len--;
	565
	566	if ($n > $u8len) {
	567	return $err;
	568	}
	569
	570	for ($j=0; $j < $n; $j++) {
	571	$o = ord($str[$i+$j]);
	572	if (($o & 0xc0) != 0x80) {
	573	return $err;
	574	}
	575	$ch = ($ch << 6) \| ($o & 0x3f);
	576	}
	577
	578	if ($n > 1 && !($ch >> ($n * 5 + 1))) {
	579	return $err;
	580	}
	581
	582	$i += $n;
	583	$u8len -= $n;
	584
	585	if ($ch < 0x20 \|\| $ch >= 0x7f) {
	586	if (!$base64) {
	587	$p .= '&';
	588	$base64 = 1;
	589	$b = 0;
	590	$k = 10;
	591	}
	592	if ($ch & ~0xffff) {
	593	$ch = 0xfffe;
	594	}
	595
	596	$p .= $B64Chars[($b \| $ch >> $k)];
	597	$k -= 6;
	598	for (; $k >= 0; $k -= 6) {
	599	$p .= $B64Chars[(($ch >> $k) & 0x3f)];
	600	}
	601
	602	$b = ($ch << (-$k)) & 0x3f;
	603	$k += 16;
	604	}
	605	else {
	606	if ($base64) {
	607	if ($k > 10) {
	608	$p .= $B64Chars[$b];
	609	}
	610	$p .= '-';
	611	$base64 = 0;
	612	}
	613
	614	$p .= chr($ch);
	615	if (chr($ch) == '&') {
	616	$p .= '-';
	617	}
	618	}
	619	}
	620
	621	if ($base64) {
	622	if ($k > 10) {
	623	$p .= $B64Chars[$b];
	624	}
	625	$p .= '-';
	626	}
	627
	628	return $p;
	629	}
	630
	631
	632	/**
	633	* A method to guess character set of a string.
	634	*
	635	* @param string $string String.
	636	* @param string $failover Default result for failover.
	637	*
	638	* @return string Charset name
	639	*/
	640	public static function detect($string, $failover='')
	641	{
	642	if (!function_exists('mb_detect_encoding')) {
	643	return $failover;
	644	}
	645
	646	// FIXME: the order is important, because sometimes
	647	// iso string is detected as euc-jp and etc.
	648	$enc = array(
	649	'UTF-8', 'SJIS', 'BIG5', 'GB2312',
	650	'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
	651	'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
	652	'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
	653	'WINDOWS-1252', 'WINDOWS-1251', 'EUC-JP', 'EUC-TW', 'KOI8-R',
	654	'ISO-2022-KR', 'ISO-2022-JP'
	655	);
	656
	657	$result = mb_detect_encoding($string, join(',', $enc));
	658
	659	return $result ? $result : $failover;
	660	}
	661
	662
	663	/**
	664	* Removes non-unicode characters from input.
	665	*
	666	* @param mixed $input String or array.
	667	*
	668	* @return mixed String or array
	669	*/
	670	public static function clean($input)
	671	{
	672	// handle input of type array
	673	if (is_array($input)) {
	674	foreach ($input as $idx => $val) {
	675	$input[$idx] = self::clean($val);
	676	}
	677	return $input;
	678	}
	679
	680	if (!is_string($input) \|\| $input == '') {
	681	return $input;
	682	}
	683
	684	// iconv/mbstring are much faster (especially with long strings)
	685	if (function_exists('mb_convert_encoding')) {
	686	if (($res = mb_convert_encoding($input, 'UTF-8', 'UTF-8')) !== false) {
	687	return $res;
	688	}
	689	}
	690
	691	if (function_exists('iconv')) {
	692	if (($res = @iconv('UTF-8', 'UTF-8//IGNORE', $input)) !== false) {
	693	return $res;
	694	}
	695	}
	696
	697	$seq = '';
	698	$out = '';
	699	$regexp = '/^('.
	700	// '[\x00-\x7F]'. // UTF8-1
	701	'\|[\xC2-\xDF][\x80-\xBF]'. // UTF8-2
	702	'\|\xE0[\xA0-\xBF][\x80-\xBF]'. // UTF8-3
	703	'\|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'. // UTF8-3
	704	'\|\xED[\x80-\x9F][\x80-\xBF]'. // UTF8-3
	705	'\|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'. // UTF8-3
	706	'\|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'. // UTF8-4
	707	'\|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.// UTF8-4
	708	'\|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'. // UTF8-4
	709	')$/';
	710
	711	for ($i = 0, $len = strlen($input); $i < $len; $i++) {
	712	$chr = $input[$i];
	713	$ord = ord($chr);
	714
	715	// 1-byte character
	716	if ($ord <= 0x7F) {
	717	if ($seq) {
	718	$out .= preg_match($regexp, $seq) ? $seq : '';
	719	}
	720	$seq = '';
	721	$out .= $chr;
	722	// first (or second) byte of multibyte sequence
	723	}
	724	else if ($ord >= 0xC0) {
	725	if (strlen($seq) > 1) {
1495ac	726	$out .= preg_match($regexp, $seq) ? $seq : '';
c321a9	727	$seq = '';
T	728	}
	729	else if ($seq && ord($seq) < 0xC0) {
	730	$seq = '';
	731	}
	732	$seq .= $chr;
	733	// next byte of multibyte sequence
	734	}
	735	else if ($seq) {
	736	$seq .= $chr;
	737	}
	738	}
	739
	740	if ($seq) {
	741	$out .= preg_match($regexp, $seq) ? $seq : '';
	742	}
	743
	744	return $out;
	745	}
	746
	747	}