commit | author | age
|
bdc07a
|
1 |
<?php |
83dbb7
|
2 |
/* |
T |
3 |
utf8 1.0 |
|
4 |
Copyright: Left |
|
5 |
--------------------------------------------------------------------------------- |
|
6 |
Version: 1.0 |
|
7 |
Date: 23 November 2004 |
|
8 |
--------------------------------------------------------------------------------- |
|
9 |
Author: Alexander Minkovsky (a_minkovsky@hotmail.com) |
|
10 |
--------------------------------------------------------------------------------- |
|
11 |
License: Choose the more appropriated for You - I don't care. |
|
12 |
--------------------------------------------------------------------------------- |
|
13 |
Description: |
|
14 |
Class provides functionality to convert single byte strings, such as CP1251 |
|
15 |
to UTF-8 multibyte format and vice versa. |
|
16 |
Class loads a concrete charset map, for example CP1251. |
|
17 |
(Refer to ftp://ftp.unicode.org/Public/MAPPINGS/ for map files) |
|
18 |
Directory containing MAP files is predefined as constant. |
|
19 |
Each charset is also predefined as constant pointing to the MAP file. |
|
20 |
--------------------------------------------------------------------------------- |
|
21 |
Example usage: |
|
22 |
Pass the desired charset in the class constructor: |
|
23 |
$utfConverter = new utf8(CP1251); //defaults to ISO-8859-1. |
|
24 |
or load the charset MAP using loadCharset method like this: |
|
25 |
$utfConverter->loadCharset(CP1252); |
|
26 |
Then call |
|
27 |
$res = $utfConverter->strToUtf8($str); |
|
28 |
or |
|
29 |
$res = $utfConverter->utf8ToStr($utf); |
|
30 |
to get the needed encoding. |
|
31 |
--------------------------------------------------------------------------------- |
|
32 |
Note: |
|
33 |
Rewrite or Override the onError method if needed. It's the error handler used from everywhere and takes 2 parameters: |
|
34 |
err_code and err_text. By default it just stores the error text in $this->error. |
|
35 |
*/ |
|
36 |
|
|
37 |
// Charset maps |
e019f2
|
38 |
// Adapted to fit Roundcube |
83dbb7
|
39 |
// Directory that utf8::loadCharset() looks in for "<charset>.map" files (the path is built as UTF8_MAP_DIR . '/' . $charset . '.map').
define("UTF8_MAP_DIR", "program/lib/encoding"); |
T |
40 |
|
|
41 |
//Error constants |
ce72e0
|
42 |
// Error code passed as the first argument to utf8::onError() when loadCharset() cannot find the map file.
define("ERR_OPEN_MAP_FILE", "ERR_OPEN_MAP_FILE"); |
83dbb7
|
43 |
|
T |
44 |
//Class definition |
ce72e0
|
45 |
/**
 * Converts between a single-byte charset and UTF-8 using a charset map file.
 *
 * A map file is plain text in which each data line holds two hexadecimal
 * numbers prefixed with "0x": the single-byte code and the Unicode code point.
 * Everything after a "#" is a comment. Parsed maps are cached per charset in
 * $ascMap for the lifetime of the object.
 */
class utf8 {

    /** Name of the currently selected charset (after normalisation/aliasing). */
    var $charset = "ISO-8859-1";

    /** Cache of parsed maps: charset name => array(byte value => code point). */
    var $ascMap = array();

    /** Reverse map (code point => byte value) for the currently selected charset. */
    var $utfMap = array();

    /** Alternative charset names => name of the map file to use. */
    var $aliases = array(
        'KOI8-R' => 'KOI8R'
    );

    /** Text of the last error reported through onError(), or null. */
    var $error = null;

    /**
     * Creates the converter and loads the map for the given charset.
     *
     * @param string $charset Charset name, e.g. "CP1251"
     */
    function __construct($charset="ISO-8859-1") {
        $this->loadCharset($charset);
    }

    /**
     * Selects a charset and loads its map file if it is not cached yet.
     *
     * "WINDOWS-125x" and "CP-..." spellings are normalised to "CP..." and the
     * alias table is applied before the file name is built. On failure
     * onError() is called and the converter has no usable map for the charset
     * (strToUtf8()/utf8ToStr() then return null).
     *
     * @param string $charset Charset name
     * @return void
     */
    function loadCharset($charset) {

        $charset = preg_replace(array('/^WINDOWS-*125([0-8])$/', '/^CP-/'), array('CP125\\1', 'CP'), $charset);

        // The alias table is a property; a bare $aliases would be an undefined
        // local and the alias would never be applied.
        if (isset($this->aliases[$charset]))
            $charset = $this->aliases[$charset];

        $this->charset = $charset;

        if (empty($this->ascMap[$charset])) {
            // The name becomes part of a file path, so refuse anything that
            // could leave the map directory.
            if (strpbrk((string)$charset, "/\\\0") !== false) {
                $this->onError(ERR_OPEN_MAP_FILE, "Failed to open map file for $charset");
                return;
            }

            $file = UTF8_MAP_DIR.'/'.$charset.'.map';

            if (!is_file($file)) {
                $this->onError(ERR_OPEN_MAP_FILE, "Failed to open map file for $charset");
                return;
            }

            $contents = file_get_contents($file);
            if ($contents === false) {
                $this->onError(ERR_OPEN_MAP_FILE, "Failed to open map file for $charset");
                return;
            }

            // Strip comments. Blank and incomplete lines are skipped by the
            // part count below, so lines must NOT be glued together here:
            // removing "\n\n" would merge the data lines around a blank line
            // and both entries would be lost.
            $contents = preg_replace("/#.*$/m", "", $contents);

            foreach (explode("\n", $contents) as $line) {
                $parts = explode('0x', $line);
                if (count($parts) == 3) {
                    $asc = hexdec(substr($parts[1], 0, 2));
                    $utf = hexdec(substr($parts[2], 0, 4));
                    $this->ascMap[$charset][$asc] = $utf;
                }
            }
        }

        // Rebuild the reverse map on every successful selection, not only on
        // first load, so that switching back to a cached charset does not keep
        // the previous charset's reverse map.
        $this->utfMap = empty($this->ascMap[$charset])
            ? array()
            : array_flip($this->ascMap[$charset]);
    }

    /**
     * Error handler; override to change error reporting.
     *
     * @param string $err_code Error constant, e.g. ERR_OPEN_MAP_FILE
     * @param string $err_text Human readable message, stored in $this->error
     * @return null
     */
    function onError($err_code,$err_text){
        $this->error = $err_text;
        return null;
    }

    /**
     * Translates a string from the selected charset to UTF-8.
     *
     * Bytes that have no entry in the map are replaced with "?".
     *
     * @param string $str Single-byte encoded input
     * @return string|null UTF-8 string, or null when no map is loaded
     */
    function strToUtf8($str){
        if (empty($this->ascMap[$this->charset]))
            return null;

        $str = (string)$str;
        $len = strlen($str);
        $res = "";

        for ($i = 0; $i < $len; $i++) {
            $char = ord($str[$i]);
            $this->_charToUtf8($char); // replaces the byte value with its UTF-8 sequence
            $res .= $char;
        }

        return $res;
    }

    /**
     * Translates a UTF-8 string to the selected single-byte charset.
     *
     * Code points missing from the map and malformed sequences become "?".
     *
     * @param string $utf UTF-8 encoded input
     * @return string|null Single-byte string, or null when no map is loaded
     */
    function utf8ToStr($utf){
        if (empty($this->ascMap[$this->charset]))
            return null;

        $utf = (string)$utf;
        if ($utf === "")
            return "";

        $chars = unpack('C*', $utf); // 1-based array of byte values
        $cnt = count($chars);
        $res = "";

        // _utf8ToChar() advances $i past the continuation bytes it consumes.
        for ($i = 1; $i <= $cnt; $i++)
            $res .= $this->_utf8ToChar($chars, $i);

        return $res;
    }

    /**
     * Replaces a byte value (in place) with the UTF-8 sequence of its code point.
     *
     * @param int|string $char In: byte value 0-255. Out: UTF-8 byte string,
     *                         or "?" when the byte is not present in the map.
     * @return void
     */
    function _charToUtf8(&$char){
        if (!isset($this->ascMap[$this->charset][$char])) {
            // Undefined in this charset: use the same placeholder as the
            // reverse direction instead of emitting a NUL byte.
            $char = "?";
            return;
        }

        $c = (int)$this->ascMap[$this->charset][$char];

        if ($c < 0x80) {
            $char = chr($c);
        }
        else if ($c < 0x800) { // 2 bytes
            $char = chr(0xC0 | ($c >> 6)) . chr(0x80 | ($c & 0x3F));
        }
        else if ($c < 0x10000) { // 3 bytes
            $char = chr(0xE0 | ($c >> 12)) . chr(0x80 | (($c >> 6) & 0x3F)) . chr(0x80 | ($c & 0x3F));
        }
        else if ($c < 0x200000) { // 4 bytes
            $char = chr(0xF0 | ($c >> 18)) . chr(0x80 | (($c >> 12) & 0x3F)) . chr(0x80 | (($c >> 6) & 0x3F)) . chr(0x80 | ($c & 0x3F));
        }
    }

    /**
     * Decodes the UTF-8 sequence starting at $chars[$idx] into one byte of the
     * selected charset.
     *
     * @param array $chars Byte values of the UTF-8 string (as from unpack('C*'))
     * @param int   $idx   In: index of the lead byte. Out: index of the last
     *                     byte consumed by this call.
     * @return string Single character, or "?" for unmappable/malformed input
     */
    function _utf8ToChar(&$chars, &$idx){
        $lead = $chars[$idx];

        if ($lead < 0x80) { // 1 byte
            $utf = $lead;
        }
        else {
            if ($lead >= 0xC0 && $lead <= 0xDF) {      // 2 bytes
                $extra = 1;
                $utf = $lead & 0x1F;
            }
            else if ($lead >= 0xE0 && $lead <= 0xEF) { // 3 bytes
                $extra = 2;
                $utf = $lead & 0x0F;
            }
            else if ($lead >= 0xF0 && $lead <= 0xF7) { // 4 bytes
                $extra = 3;
                $utf = $lead & 0x07;
            }
            else {
                // Stray continuation byte (0x80-0xBF) or invalid lead (0xF8-0xFF).
                return "?";
            }

            for ($n = 0; $n < $extra; $n++) {
                $next = $idx + 1;
                // Truncated or broken sequence: stop without consuming the
                // offending byte so that it is decoded on its own.
                if (!isset($chars[$next]) || ($chars[$next] & 0xC0) != 0x80)
                    return "?";
                $utf = ($utf << 6) | ($chars[$next] & 0x3F);
                $idx = $next;
            }
        }

        if (isset($this->utfMap[$utf]))
            return chr($this->utfMap[$utf]);
        else
            return "?";
    }

}
58e360
|
170 |
|
6710a6
|
171 |
?> |