githubFork/roundcubemail.git

User configurable setting how to display contact names in list

thomascube

2012-02-11 f9a967763e2a4cc595b44f40ac1ca666b2a02af6

commit \| author \| age
c321a9	1	<?php
T	2
	3	/*
	4	+-----------------------------------------------------------------------+
	5	\| program/include/rcube_charset.php \|
	6	\| \|
	7	\| This file is part of the Roundcube Webmail client \|
	8	\| Copyright (C) 2005-2012, The Roundcube Dev Team \|
	9	\| Copyright (C) 2011-2012, Kolab Systems AG \|
	10	\| Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org> \|
7fe381	11	\| \|
T	12	\| Licensed under the GNU General Public License version 3 or \|
	13	\| any later version with exceptions for skins & plugins. \|
	14	\| See the README file for a full license statement. \|
c321a9	15	\| \|
T	16	\| PURPOSE: \|
	17	\| Provide charset conversion functionality \|
	18	\| \|
	19	+-----------------------------------------------------------------------+
	20	\| Author: Thomas Bruederli <roundcube@gmail.com> \|
	21	\| Author: Aleksander Machniak <alec@alec.pl> \|
	22	+-----------------------------------------------------------------------+
	23
	24	$Id$
	25
	26	*/
	27
	28	/**
	29	* Character sets conversion functionality
	30	*
	31	* @package Core
	32	* @author Thomas Bruederli <roundcube@gmail.com>
	33	* @author Aleksander Machniak <alec@alec.pl>
	34	* @author Edmund Grimley Evans <edmundo@rano.org>
	35	*/
	36	class rcube_charset
	37	{
	38	// Aliases: some of them from HTML5 spec.
	39	static public $aliases = array(
	40	'USASCII' => 'WINDOWS-1252',
	41	'ANSIX31101983' => 'WINDOWS-1252',
	42	'ANSIX341968' => 'WINDOWS-1252',
	43	'UNKNOWN8BIT' => 'ISO-8859-15',
	44	'UNKNOWN' => 'ISO-8859-15',
	45	'USERDEFINED' => 'ISO-8859-15',
	46	'KSC56011987' => 'EUC-KR',
	47	'GB2312' => 'GBK',
	48	'GB231280' => 'GBK',
	49	'UNICODE' => 'UTF-8',
	50	'UTF7IMAP' => 'UTF7-IMAP',
	51	'TIS620' => 'WINDOWS-874',
	52	'ISO88599' => 'WINDOWS-1254',
	53	'ISO885911' => 'WINDOWS-874',
	54	'MACROMAN' => 'MACINTOSH',
	55	'77' => 'MAC',
	56	'128' => 'SHIFT-JIS',
	57	'129' => 'CP949',
	58	'130' => 'CP1361',
	59	'134' => 'GBK',
	60	'136' => 'BIG5',
	61	'161' => 'WINDOWS-1253',
	62	'162' => 'WINDOWS-1254',
	63	'163' => 'WINDOWS-1258',
	64	'177' => 'WINDOWS-1255',
	65	'178' => 'WINDOWS-1256',
	66	'186' => 'WINDOWS-1257',
	67	'204' => 'WINDOWS-1251',
	68	'222' => 'WINDOWS-874',
	69	'238' => 'WINDOWS-1250',
	70	'MS950' => 'CP950',
	71	'WINDOWS949' => 'UHC',
	72	);
	73
	74
	75	/**
	76	* Catch an error and throw an exception.
	77	*
	78	* @param int Level of the error
	79	* @param string Error message
	80	*/
	81	public function error_handler($errno, $errstr)
	82	{
	83	throw new ErrorException($errstr, 0, $errno);
	84	}
	85
	86
	87	/**
	88	* Parse and validate charset name string (see #1485758).
	89	* Sometimes charset string is malformed, there are also charset aliases
	90	* but we need strict names for charset conversion (specially utf8 class)
	91	*
	92	* @param string Input charset name
	93	*
	94	* @return string The validated charset name
	95	*/
	96	public static function parse($input)
	97	{
	98	static $charsets = array();
	99	$charset = strtoupper($input);
	100
	101	if (isset($charsets[$input])) {
	102	return $charsets[$input];
	103	}
	104
	105	$charset = preg_replace(array(
	106	'/^[^0-9A-Z]+/', // e.g. _ISO-8859-JP$SIO
	107	'/\$.*$/', // e.g. _ISO-8859-JP$SIO
	108	'/UNICODE-1-1-*/', // RFC1641/1642
	109	'/^X-/', // X- prefix (e.g. X-ROMAN8 => ROMAN8)
	110	), '', $charset);
	111
	112	if ($charset == 'BINARY') {
	113	return $charsets[$input] = null;
	114	}
	115
	116	// allow A-Z and 0-9 only
	117	$str = preg_replace('/[^A-Z0-9]/', '', $charset);
	118
	119	if (isset(self::$aliases[$str])) {
	120	$result = self::$aliases[$str];
	121	}
	122	// UTF
	123	else if (preg_match('/U[A-Z][A-Z](7\|8\|16\|32)(BE\|LE)*/', $str, $m)) {
	124	$result = 'UTF-' . $m[1] . $m[2];
	125	}
	126	// ISO-8859
	127	else if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {
	128	$iso = 'ISO-8859-' . ($m[1] ? $m[1] : 1);
	129	// some clients sends windows-1252 text as latin1,
	130	// it is safe to use windows-1252 for all latin1
	131	$result = $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;
	132	}
	133	// handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE
	134	else if (preg_match('/(WIN\|WINDOWS)([0-9]+)/', $str, $m)) {
	135	$result = 'WINDOWS-' . $m[2];
	136	}
	137	// LATIN
	138	else if (preg_match('/LATIN(.*)/', $str, $m)) {
	139	$aliases = array('2' => 2, '3' => 3, '4' => 4, '5' => 9, '6' => 10,
	140	'7' => 13, '8' => 14, '9' => 15, '10' => 16,
	141	'ARABIC' => 6, 'CYRILLIC' => 5, 'GREEK' => 7, 'GREEK1' => 7, 'HEBREW' => 8
	142	);
	143
	144	// some clients sends windows-1252 text as latin1,
	145	// it is safe to use windows-1252 for all latin1
	146	if ($m[1] == 1) {
	147	$result = 'WINDOWS-1252';
	148	}
	149	// if iconv is not supported we need ISO labels, it's also safe for iconv
	150	else if (!empty($aliases[$m[1]])) {
	151	$result = 'ISO-8859-'.$aliases[$m[1]];
	152	}
	153	// iconv requires convertion of e.g. LATIN-1 to LATIN1
	154	else {
	155	$result = $str;
	156	}
	157	}
	158	else {
	159	$result = $charset;
	160	}
	161
	162	$charsets[$input] = $result;
	163
	164	return $result;
	165	}
	166
	167
	168	/**
	169	* Convert a string from one charset to another.
	170	* Uses mbstring and iconv functions if possible
	171	*
	172	* @param string Input string
	173	* @param string Suspected charset of the input string
	174	* @param string Target charset to convert to; defaults to RCMAIL_CHARSET
	175	*
	176	* @return string Converted string
	177	*/
	178	public static function convert($str, $from, $to = null)
	179	{
	180	static $iconv_options = null;
	181	static $mbstring_loaded = null;
	182	static $mbstring_list = null;
	183	static $conv = null;
	184
	185	$to = empty($to) ? strtoupper(RCMAIL_CHARSET) : self::parse($to);
	186	$from = self::parse($from);
	187
	188	if ($from == $to \|\| empty($str) \|\| empty($from)) {
	189	return $str;
	190	}
	191
	192	// convert charset using iconv module
	193	if (function_exists('iconv') && $from != 'UTF7-IMAP' && $to != 'UTF7-IMAP') {
	194	if ($iconv_options === null) {
	195	// ignore characters not available in output charset
	196	$iconv_options = '//IGNORE';
	197	if (iconv('', $iconv_options, '') === false) {
	198	// iconv implementation does not support options
	199	$iconv_options = '';
	200	}
	201	}
	202
	203	// throw an exception if iconv reports an illegal character in input
	204	// it means that input string has been truncated
	205	set_error_handler(array('rcube_charset', 'error_handler'), E_NOTICE);
	206	try {
	207	$_iconv = iconv($from, $to . $iconv_options, $str);
	208	} catch (ErrorException $e) {
	209	$_iconv = false;
	210	}
	211	restore_error_handler();
	212
	213	if ($_iconv !== false) {
	214	return $_iconv;
	215	}
	216	}
	217
	218	if ($mbstring_loaded === null) {
	219	$mbstring_loaded = extension_loaded('mbstring');
	220	}
	221
	222	// convert charset using mbstring module
	223	if ($mbstring_loaded) {
	224	$aliases['WINDOWS-1257'] = 'ISO-8859-13';
	225
	226	if ($mbstring_list === null) {
	227	$mbstring_list = mb_list_encodings();
	228	$mbstring_list = array_map('strtoupper', $mbstring_list);
	229	}
	230
	231	$mb_from = $aliases[$from] ? $aliases[$from] : $from;
	232	$mb_to = $aliases[$to] ? $aliases[$to] : $to;
	233
	234	// return if encoding found, string matches encoding and convert succeeded
	235	if (in_array($mb_from, $mbstring_list) && in_array($mb_to, $mbstring_list)) {
	236	if (mb_check_encoding($str, $mb_from) && ($out = mb_convert_encoding($str, $mb_to, $mb_from))) {
	237	return $out;
	238	}
	239	}
	240	}
	241
	242	// convert charset using bundled classes/functions
	243	if ($to == 'UTF-8') {
	244	if ($from == 'UTF7-IMAP') {
	245	if ($_str = self::utf7imap_to_utf8($str)) {
	246	return $_str;
	247	}
	248	}
	249	else if ($from == 'UTF-7') {
	250	if ($_str = self::utf7_to_utf8($str)) {
	251	return $_str;
	252	}
	253	}
	254	else if ($from == 'ISO-8859-1' && function_exists('utf8_encode')) {
	255	return utf8_encode($str);
	256	}
	257	else if (class_exists('utf8')) {
	258	if (!$conv) {
	259	$conv = new utf8($from);
	260	}
	261	else {
	262	$conv->loadCharset($from);
	263	}
	264
	265	if ($_str = $conv->strToUtf8($str)) {
	266	return $_str;
	267	}
	268	}
	269	}
	270
	271	// encode string for output
	272	if ($from == 'UTF-8') {
	273	// @TODO: we need a function for UTF-7 (RFC2152) conversion
	274	if ($to == 'UTF7-IMAP' \|\| $to == 'UTF-7') {
	275	if ($_str = utf8_to_utf7imap($str)) {
	276	return $_str;
	277	}
	278	}
	279	else if ($to == 'ISO-8859-1' && function_exists('utf8_decode')) {
	280	return utf8_decode($str);
	281	}
	282	else if (class_exists('utf8')) {
	283	if (!$conv) {
	284	$conv = new utf8($to);
	285	}
	286	else {
	287	$conv->loadCharset($from);
	288	}
	289
	290	if ($_str = $conv->strToUtf8($str)) {
	291	return $_str;
	292	}
	293	}
	294	}
	295
	296	// return original string
	297	return $str;
	298	}
	299
	300
	301	/**
	302	* Converts string from standard UTF-7 (RFC 2152) to UTF-8.
	303	*
	304	* @param string Input string (UTF-7)
	305	*
	306	* @return string Converted string (UTF-8)
	307	*/
	308	public static function utf7_to_utf8($str)
	309	{
	310	$Index_64 = array(
	311	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	312	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	313	0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,0,0,
	314	1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,
	315	0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	316	1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
	317	0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	318	1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
	319	);
	320
	321	$u7len = strlen($str);
	322	$str = strval($str);
	323	$res = '';
	324
	325	for ($i=0; $u7len > 0; $i++, $u7len--) {
	326	$u7 = $str[$i];
	327	if ($u7 == '+') {
	328	$i++;
	329	$u7len--;
	330	$ch = '';
	331
	332	for (; $u7len > 0; $i++, $u7len--) {
	333	$u7 = $str[$i];
	334
	335	if (!$Index_64[ord($u7)]) {
	336	break;
	337	}
	338
	339	$ch .= $u7;
	340	}
	341
	342	if ($ch == '') {
	343	if ($u7 == '-') {
	344	$res .= '+';
	345	}
	346
	347	continue;
	348	}
	349
	350	$res .= self::utf16_to_utf8(base64_decode($ch));
	351	}
	352	else {
	353	$res .= $u7;
	354	}
	355	}
	356
	357	return $res;
	358	}
	359
	360
	361	/**
	362	* Converts string from UTF-16 to UTF-8 (helper for utf-7 to utf-8 conversion)
	363	*
	364	* @param string Input string
	365	*
	366	* @return string The converted string
	367	*/
	368	public static function utf16_to_utf8($str)
	369	{
	370	$len = strlen($str);
	371	$dec = '';
	372
	373	for ($i = 0; $i < $len; $i += 2) {
	374	$c = ord($str[$i]) << 8 \| ord($str[$i + 1]);
	375	if ($c >= 0x0001 && $c <= 0x007F) {
	376	$dec .= chr($c);
	377	}
	378	else if ($c > 0x07FF) {
	379	$dec .= chr(0xE0 \| (($c >> 12) & 0x0F));
	380	$dec .= chr(0x80 \| (($c >> 6) & 0x3F));
	381	$dec .= chr(0x80 \| (($c >> 0) & 0x3F));
	382	}
	383	else {
	384	$dec .= chr(0xC0 \| (($c >> 6) & 0x1F));
	385	$dec .= chr(0x80 \| (($c >> 0) & 0x3F));
	386	}
	387	}
	388
	389	return $dec;
	390	}
	391
	392
	393	/**
	394	* Convert the data ($str) from RFC 2060's UTF-7 to UTF-8.
	395	* If input data is invalid, return the original input string.
	396	* RFC 2060 obviously intends the encoding to be unique (see
	397	* point 5 in section 5.1.3), so we reject any non-canonical
	398	* form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead
	399	* of &AMAAwA-).
	400	*
	401	* Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
	402	*
	403	* @param string $str Input string (UTF7-IMAP)
	404	*
	405	* @return string Output string (UTF-8)
	406	*/
	407	public static function utf7imap_to_utf8($str)
	408	{
	409	$Index_64 = array(
	410	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
	411	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
	412	-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
	413	52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
	414	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
	415	15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
	416	-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
	417	41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
	418	);
	419
	420	$u7len = strlen($str);
	421	$str = strval($str);
	422	$p = '';
	423	$err = '';
	424
	425	for ($i=0; $u7len > 0; $i++, $u7len--) {
	426	$u7 = $str[$i];
	427	if ($u7 == '&') {
	428	$i++;
	429	$u7len--;
	430	$u7 = $str[$i];
	431
	432	if ($u7len && $u7 == '-') {
	433	$p .= '&';
	434	continue;
	435	}
	436
	437	$ch = 0;
	438	$k = 10;
	439	for (; $u7len > 0; $i++, $u7len--) {
	440	$u7 = $str[$i];
	441
	442	if ((ord($u7) & 0x80) \|\| ($b = $Index_64[ord($u7)]) == -1) {
	443	break;
	444	}
	445
	446	if ($k > 0) {
	447	$ch \|= $b << $k;
	448	$k -= 6;
	449	}
	450	else {
	451	$ch \|= $b >> (-$k);
	452	if ($ch < 0x80) {
	453	// Printable US-ASCII
	454	if (0x20 <= $ch && $ch < 0x7f) {
	455	return $err;
	456	}
	457	$p .= chr($ch);
	458	}
	459	else if ($ch < 0x800) {
	460	$p .= chr(0xc0 \| ($ch >> 6));
	461	$p .= chr(0x80 \| ($ch & 0x3f));
	462	}
	463	else {
	464	$p .= chr(0xe0 \| ($ch >> 12));
	465	$p .= chr(0x80 \| (($ch >> 6) & 0x3f));
	466	$p .= chr(0x80 \| ($ch & 0x3f));
	467	}
	468
	469	$ch = ($b << (16 + $k)) & 0xffff;
	470	$k += 10;
	471	}
	472	}
	473
	474	// Non-zero or too many extra bits
	475	if ($ch \|\| $k < 6) {
	476	return $err;
	477	}
	478
	479	// BASE64 not properly terminated
	480	if (!$u7len \|\| $u7 != '-') {
	481	return $err;
	482	}
	483
	484	// Adjacent BASE64 sections
	485	if ($u7len > 2 && $str[$i+1] == '&' && $str[$i+2] != '-') {
	486	return $err;
	487	}
	488	}
	489	// Not printable US-ASCII
	490	else if (ord($u7) < 0x20 \|\| ord($u7) >= 0x7f) {
	491	return $err;
	492	}
	493	else {
	494	$p .= $u7;
	495	}
	496	}
	497
	498	return $p;
	499	}
	500
	501
	502	/**
	503	* Convert the data ($str) from UTF-8 to RFC 2060's UTF-7.
	504	* Unicode characters above U+FFFF are replaced by U+FFFE.
	505	* If input data is invalid, return an empty string.
	506	*
	507	* Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
	508	*
	509	* @param string $str Input string (UTF-8)
	510	*
	511	* @return string Output string (UTF7-IMAP)
	512	*/
	513	public static function utf8_to_utf7imap($str)
	514	{
	515	$B64Chars = array(
	516	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
	517	'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
	518	'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
	519	't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
	520	'8', '9', '+', ','
	521	);
	522
	523	$u8len = strlen($str);
	524	$base64 = 0;
	525	$i = 0;
	526	$p = '';
	527	$err = '';
	528
	529	while ($u8len) {
	530	$u8 = $str[$i];
	531	$c = ord($u8);
	532
	533	if ($c < 0x80) {
	534	$ch = $c;
	535	$n = 0;
	536	}
	537	else if ($c < 0xc2) {
	538	return $err;
	539	}
	540	else if ($c < 0xe0) {
	541	$ch = $c & 0x1f;
	542	$n = 1;
	543	}
	544	else if ($c < 0xf0) {
	545	$ch = $c & 0x0f;
	546	$n = 2;
	547	}
	548	else if ($c < 0xf8) {
	549	$ch = $c & 0x07;
	550	$n = 3;
	551	}
	552	else if ($c < 0xfc) {
	553	$ch = $c & 0x03;
	554	$n = 4;
	555	}
	556	else if ($c < 0xfe) {
	557	$ch = $c & 0x01;
	558	$n = 5;
	559	}
	560	else {
	561	return $err;
	562	}
	563
	564	$i++;
	565	$u8len--;
	566
	567	if ($n > $u8len) {
	568	return $err;
	569	}
	570
	571	for ($j=0; $j < $n; $j++) {
	572	$o = ord($str[$i+$j]);
	573	if (($o & 0xc0) != 0x80) {
	574	return $err;
	575	}
	576	$ch = ($ch << 6) \| ($o & 0x3f);
	577	}
	578
	579	if ($n > 1 && !($ch >> ($n * 5 + 1))) {
	580	return $err;
	581	}
	582
	583	$i += $n;
	584	$u8len -= $n;
	585
	586	if ($ch < 0x20 \|\| $ch >= 0x7f) {
	587	if (!$base64) {
	588	$p .= '&';
	589	$base64 = 1;
	590	$b = 0;
	591	$k = 10;
	592	}
	593	if ($ch & ~0xffff) {
	594	$ch = 0xfffe;
	595	}
	596
	597	$p .= $B64Chars[($b \| $ch >> $k)];
	598	$k -= 6;
	599	for (; $k >= 0; $k -= 6) {
	600	$p .= $B64Chars[(($ch >> $k) & 0x3f)];
	601	}
	602
	603	$b = ($ch << (-$k)) & 0x3f;
	604	$k += 16;
	605	}
	606	else {
	607	if ($base64) {
	608	if ($k > 10) {
	609	$p .= $B64Chars[$b];
	610	}
	611	$p .= '-';
	612	$base64 = 0;
	613	}
	614
	615	$p .= chr($ch);
	616	if (chr($ch) == '&') {
	617	$p .= '-';
	618	}
	619	}
	620	}
	621
	622	if ($base64) {
	623	if ($k > 10) {
	624	$p .= $B64Chars[$b];
	625	}
	626	$p .= '-';
	627	}
	628
	629	return $p;
	630	}
	631
	632
	633	/**
	634	* A method to guess character set of a string.
	635	*
	636	* @param string $string String.
	637	* @param string $failover Default result for failover.
	638	*
	639	* @return string Charset name
	640	*/
	641	public static function detect($string, $failover='')
	642	{
	643	if (!function_exists('mb_detect_encoding')) {
	644	return $failover;
	645	}
	646
	647	// FIXME: the order is important, because sometimes
	648	// iso string is detected as euc-jp and etc.
	649	$enc = array(
	650	'UTF-8', 'SJIS', 'BIG5', 'GB2312',
	651	'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
	652	'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
	653	'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
	654	'WINDOWS-1252', 'WINDOWS-1251', 'EUC-JP', 'EUC-TW', 'KOI8-R',
	655	'ISO-2022-KR', 'ISO-2022-JP'
	656	);
	657
	658	$result = mb_detect_encoding($string, join(',', $enc));
	659
	660	return $result ? $result : $failover;
	661	}
	662
	663
	664	/**
	665	* Removes non-unicode characters from input.
	666	*
	667	* @param mixed $input String or array.
	668	*
	669	* @return mixed String or array
	670	*/
	671	public static function clean($input)
	672	{
	673	// handle input of type array
	674	if (is_array($input)) {
	675	foreach ($input as $idx => $val) {
	676	$input[$idx] = self::clean($val);
	677	}
	678	return $input;
	679	}
	680
	681	if (!is_string($input) \|\| $input == '') {
	682	return $input;
	683	}
	684
	685	// iconv/mbstring are much faster (especially with long strings)
	686	if (function_exists('mb_convert_encoding')) {
	687	if (($res = mb_convert_encoding($input, 'UTF-8', 'UTF-8')) !== false) {
	688	return $res;
	689	}
	690	}
	691
	692	if (function_exists('iconv')) {
	693	if (($res = @iconv('UTF-8', 'UTF-8//IGNORE', $input)) !== false) {
	694	return $res;
	695	}
	696	}
	697
	698	$seq = '';
	699	$out = '';
	700	$regexp = '/^('.
	701	// '[\x00-\x7F]'. // UTF8-1
	702	'\|[\xC2-\xDF][\x80-\xBF]'. // UTF8-2
	703	'\|\xE0[\xA0-\xBF][\x80-\xBF]'. // UTF8-3
	704	'\|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'. // UTF8-3
	705	'\|\xED[\x80-\x9F][\x80-\xBF]'. // UTF8-3
	706	'\|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'. // UTF8-3
	707	'\|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'. // UTF8-4
	708	'\|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.// UTF8-4
	709	'\|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'. // UTF8-4
	710	')$/';
	711
	712	for ($i = 0, $len = strlen($input); $i < $len; $i++) {
	713	$chr = $input[$i];
	714	$ord = ord($chr);
	715
	716	// 1-byte character
	717	if ($ord <= 0x7F) {
	718	if ($seq) {
	719	$out .= preg_match($regexp, $seq) ? $seq : '';
	720	}
	721	$seq = '';
	722	$out .= $chr;
	723	// first (or second) byte of multibyte sequence
	724	}
	725	else if ($ord >= 0xC0) {
	726	if (strlen($seq) > 1) {
	727	$out .= preg_match($regexp, $seq) ? $seq : '';
	728	$seq = '';
	729	}
	730	else if ($seq && ord($seq) < 0xC0) {
	731	$seq = '';
	732	}
	733	$seq .= $chr;
	734	// next byte of multibyte sequence
	735	}
	736	else if ($seq) {
	737	$seq .= $chr;
	738	}
	739	}
	740
	741	if ($seq) {
	742	$out .= preg_match($regexp, $seq) ? $seq : '';
	743	}
	744
	745	return $out;
	746	}
	747
	748	}