commit | author | age
|
bdc07a
|
1 |
<?php |
83dbb7
|
2 |
/* |
T |
3 |
utf8 1.0 |
|
4 |
Copyright: Left |
|
5 |
--------------------------------------------------------------------------------- |
|
6 |
Version: 1.0 |
|
7 |
Date: 23 November 2004 |
|
8 |
--------------------------------------------------------------------------------- |
|
9 |
Author: Alexander Minkovsky (a_minkovsky@hotmail.com) |
|
10 |
--------------------------------------------------------------------------------- |
|
11 |
License: Choose whichever is most appropriate for you - I don't care. |
|
12 |
--------------------------------------------------------------------------------- |
|
13 |
Description: |
|
14 |
Class provides functionality to convert single byte strings, such as CP1251 |
|
15 |
to UTF-8 multibyte format and vice versa. |
|
16 |
Class loads a concrete charset map, for example CP1251. |
|
17 |
(Refer to ftp://ftp.unicode.org/Public/MAPPINGS/ for map files) |
|
18 |
Directory containing MAP files is predefined as constant. |
|
19 |
Each charset is a key of the global $utf8_maps array pointing to its MAP file. |
|
20 |
--------------------------------------------------------------------------------- |
|
21 |
Example usage: |
|
22 |
Pass the desired charset in the class constructor: |
|
23 |
$utfConverter = new utf8("CP1251"); //defaults to ISO-8859-1. |
|
24 |
or load the charset MAP using loadCharset method like this: |
|
25 |
$utfConverter->loadCharset("CP1252"); |
|
26 |
Then call |
|
27 |
$res = $utfConverter->strToUtf8($str); |
|
28 |
or |
|
29 |
$res = $utfConverter->utf8ToStr($utf); |
|
30 |
to get the needed encoding. |
|
31 |
--------------------------------------------------------------------------------- |
|
32 |
Note: |
|
33 |
Rewrite or Override the onError method if needed. It's the error handler used from everywhere and takes 2 parameters: |
|
34 |
err_code and err_text. By default it reports the error through raise_error(). |
|
35 |
*/ |
|
36 |
|
|
37 |
// Charset maps
// Adapted to fit RoundCube

// Directory that holds the charset map files. This is a relative path, so the
// filesystem functions resolve it against the current working directory.
define("UTF8_MAP_DIR", "program/lib/encoding");

// Supported charsets: charset name => path of the corresponding map file.
// utf8::loadCharset() reads this array through the global scope.
$utf8_maps = array(
  "CP1250" => UTF8_MAP_DIR . "/CP1250.map",
  "CP1251" => UTF8_MAP_DIR . "/CP1251.map",
  "CP1252" => UTF8_MAP_DIR . "/CP1252.map",
  "CP1253" => UTF8_MAP_DIR . "/CP1253.map",
  "CP1254" => UTF8_MAP_DIR . "/CP1254.map",
  "CP1255" => UTF8_MAP_DIR . "/CP1255.map",
  "CP1256" => UTF8_MAP_DIR . "/CP1256.map",
  "CP1257" => UTF8_MAP_DIR . "/CP1257.map",
  "CP1258" => UTF8_MAP_DIR . "/CP1258.map",
  "ISO-8859-1" => UTF8_MAP_DIR . "/ISO-8859-1.map",
  "ISO-8859-2" => UTF8_MAP_DIR . "/ISO-8859-2.map",
  "ISO-8859-3" => UTF8_MAP_DIR . "/ISO-8859-3.map",
  "ISO-8859-4" => UTF8_MAP_DIR . "/ISO-8859-4.map",
  "ISO-8859-5" => UTF8_MAP_DIR . "/ISO-8859-5.map",
  "ISO-8859-6" => UTF8_MAP_DIR . "/ISO-8859-6.map",
  "ISO-8859-7" => UTF8_MAP_DIR . "/ISO-8859-7.map",
  "ISO-8859-8" => UTF8_MAP_DIR . "/ISO-8859-8.map",
  "ISO-8859-9" => UTF8_MAP_DIR . "/ISO-8859-9.map"
);

//Error constants
// Error code handed to utf8::onError() when a charset map file cannot be used.
define("ERR_OPEN_MAP_FILE","ERR_OPEN_MAP_FILE");
|
63 |
|
|
64 |
//Class definition |
|
65 |
// Converts strings between a single-byte charset and UTF-8 using a charset
// map file. The map file for a charset is looked up in the global $utf8_maps
// array (charset name => file path).
class utf8{

  // Name of the charset currently used for conversions
  var $charset = "ISO-8859-1";
  // Parsed maps, per charset: charset name => array(byte value => code point)
  var $ascMap = array();
  // Reverse map of the current charset: code point => byte value
  var $utfMap = array();

  // made PHP5 capable by RoundCube
  // $charset: name of the charset to load (a key of the global $utf8_maps)
  function __construct($charset="ISO-8859-1"){
    $this->loadCharset($charset);
  }

  //Constructor
  // Old-style (class-named) constructor kept for backward compatibility;
  // it simply forwards to __construct().
  function utf8($charset="ISO-8859-1"){
    $this->__construct($charset);
  }

  //Load charset
  // Parses the map file of $charset (once per charset, the result is cached in
  // $this->ascMap) and makes it the current charset. If the charset is unknown
  // or its map file does not exist or cannot be read, onError() is called and
  // the current charset is left unchanged.
  function loadCharset($charset){
    global $utf8_maps;

    // isset() guard: an unknown charset must take the error path without
    // indexing a missing array key
    if (!isset($utf8_maps[$charset]) || !is_file($utf8_maps[$charset]))
    {
      $this->onError(ERR_OPEN_MAP_FILE, "Failed to open map file for $charset");
      return;
    }

    if (empty($this->ascMap[$charset]))
    {
      $content = file_get_contents($utf8_maps[$charset]);
      if ($content === false)
      {
        $this->onError(ERR_OPEN_MAP_FILE, "Failed to open map file for $charset");
        return;
      }

      // Strip "#" comments up to the end of each line
      $content = preg_replace("/#.*$/m","",$content);

      // Process line by line. Blank lines are skipped by the count() check
      // below; they must not be removed by collapsing newlines, because that
      // would glue the neighbouring data lines together and lose both.
      $map = array();
      foreach(explode("\n",$content) as $line){
        // A data line looks like "0xXX<sep>0xYYYY": splitting on "0x" gives
        // an empty prefix, the byte value and the code point
        $parts = explode('0x',$line);
        if(count($parts)==3){
          $asc=hexdec(substr($parts[1],0,2));
          $utf=hexdec(substr($parts[2],0,4));
          $map[$asc]=$utf;
        }
      }
      // Always store an array so that array_flip() below gets valid input
      // even when the file contained no mappings
      $this->ascMap[$charset] = $map;
    }

    $this->charset = $charset;
    $this->utfMap = array_flip($this->ascMap[$charset]);
  }

  //Error handler
  // Called with an error code and an error text; forwards the text to
  // raise_error() (defined outside this file) with code 500.
  function onError($err_code,$err_text){
    raise_error(array('code' => 500,
                      'file' => __FILE__,
                      'message' => $err_text), TRUE, FALSE);
  }

  //Translate string ($str) to UTF-8 from given charset
  // Returns the UTF-8 encoded string. Bytes without an entry in the current
  // charset map are replaced by "?".
  function strToUtf8($str){
    $str = (string)$str;
    if ($str === "") return "";

    // unpack() returns a 1-based array of byte values
    $chars = unpack('C*', $str);
    $cnt = count($chars);
    // Each element is replaced in place by its UTF-8 sequence
    for($i=1;$i<=$cnt;$i++) $this->_charToUtf8($chars[$i]);
    return implode("",$chars);
  }

  //Translate UTF-8 string to single byte string in the given charset
  // Returns the converted string. Code points without an entry in the current
  // charset map, and truncated sequences, are replaced by "?".
  function utf8ToStr($utf){
    $utf = (string)$utf;
    if ($utf === "") return "";

    // unpack() returns a 1-based array of byte values
    $chars = unpack('C*', $utf);
    $cnt = count($chars);
    $res = ""; //No simple way to do it in place... concatenate char by char
    for ($i=1;$i<=$cnt;$i++){
      // _utf8ToChar() advances $i past the continuation bytes it consumes
      $res .= $this->_utf8ToChar($chars, $i);
    }
    return $res;
  }

  //Char to UTF-8 sequence
  // $char is passed by reference: on entry a byte value (integer), on return
  // the UTF-8 encoded string for it, or "?" if the byte has no mapping.
  function _charToUtf8(&$char){
    // Unmapped byte: emit the same placeholder _utf8ToChar() uses instead of
    // silently producing a NUL byte
    if (!isset($this->ascMap[$this->charset][$char])){
      $char = "?";
      return;
    }

    $c = (int)$this->ascMap[$this->charset][$char];
    if ($c < 0x80){
      $char = chr($c);
    }
    else if($c<0x800) // 2 bytes
      $char = (chr(0xC0 | $c>>6) . chr(0x80 | $c & 0x3F));
    else if($c<0x10000) // 3 bytes
      $char = (chr(0xE0 | $c>>12) . chr(0x80 | $c>>6 & 0x3F) . chr(0x80 | $c & 0x3F));
    else if($c<0x200000) // 4 bytes
      $char = (chr(0xF0 | $c>>18) . chr(0x80 | $c>>12 & 0x3F) . chr(0x80 | $c>>6 & 0x3F) . chr(0x80 | $c & 0x3F));
    else // not encodable in 4 bytes
      $char = "?";
  }

  //UTF-8 sequence to single byte character
  // $chars: 1-based array of byte values; $idx: position of the lead byte.
  // $idx is passed by reference and is left on the last byte consumed.
  // Returns the single-byte character, or "?" if the code point is not in the
  // current charset map or the sequence is cut off by the end of the input.
  function _utf8ToChar(&$chars, &$idx){
    $lead = $chars[$idx];

    // The lead byte determines how many continuation bytes follow
    if ($lead >= 240){ // 4 bytes
      $extra = 3;
      $utf = $lead - 240;
    }
    else if ($lead >= 224){ // 3 bytes
      $extra = 2;
      $utf = $lead - 224;
    }
    else if ($lead >= 192){ // 2 bytes
      $extra = 1;
      $utf = $lead - 192;
    }
    else{ // 1 byte
      $extra = 0;
      $utf = $lead;
    }

    for ($n=0;$n<$extra;$n++){
      // Truncated sequence: do not read past the end of the input
      if (!isset($chars[$idx+1]))
        return "?";
      // Each continuation byte contributes its low 6 bits
      $utf = ($utf << 6) + ($chars[++$idx] - 128);
    }

    if(array_key_exists($utf,$this->utfMap))
      return chr($this->utfMap[$utf]);
    else
      return "?";
  }

}
58e360
|
179 |
|
T |
180 |
?> |