commit | author | age
|
b4edf7
|
1 |
<?php |
A |
2 |
|
|
3 |
/* |
|
4 |
+-----------------------------------------------------------------------+ |
|
5 |
| program/include/rcube_spellchecker.php | |
|
6 |
| | |
|
7 |
| This file is part of the Roundcube Webmail client | |
|
8 |
| Copyright (C) 2011, Kolab Systems AG | |
|
9 |
| Copyright (C) 2008-2011, The Roundcube Dev Team | |
|
10 |
| Licensed under the GNU GPL | |
|
11 |
| | |
|
12 |
| PURPOSE: | |
|
13 |
| Spellchecking using different backends | |
|
14 |
| | |
|
15 |
+-----------------------------------------------------------------------+ |
|
16 |
| Author: Aleksander Machniak <machniak@kolabsys.com> | |
|
17 |
| Author: Thomas Bruederli <roundcube@gmail.com> | |
|
18 |
+-----------------------------------------------------------------------+ |
|
19 |
|
|
20 |
$Id$ |
|
21 |
|
|
22 |
*/ |
|
23 |
|
|
24 |
|
|
25 |
/** |
|
26 |
* Helper class for spellchecking with Googiespell and PSpell support. |
|
27 |
* |
|
28 |
* @package Core |
|
29 |
*/ |
|
30 |
class rcube_spellchecker
{
    /** @var array Result of the last check(); one entry per misspelled word */
    private $matches = array();
    /** @var string Engine name read from the 'spellcheck_engine' config option */
    private $engine;
    /** @var string Language code passed to the backend */
    private $lang;
    /** @var object Application instance returned by rcmail::get_instance() */
    private $rc;
    /** @var string Last error message (see error()) */
    private $error;
    /** @var string Plain-text content handed to the last check() call */
    private $content;
    /** @var mixed Pspell dictionary link, created lazily by _pspell_init() */
    private $plink;
    /** @var string Pattern used to split text into words for the pspell engine */
    private $separator = '/[ !"#$%&()*+\\,\/\n:;<=>?@\[\]^_{|}-]+|\.[^\w]/';

    // default settings
    const GOOGLE_HOST = 'ssl://www.google.com';
    const GOOGLE_PORT = 443;
    const MAX_SUGGESTIONS = 10;


    /**
     * Constructor
     *
     * Reads the engine name from the application config (default 'googie')
     * and raises a fatal error when 'pspell' is configured but the PHP
     * extension is not loaded.
     *
     * @param string $lang Language code (falls back to 'en' when empty)
     */
    public function __construct($lang = 'en')
    {
        $this->rc     = rcmail::get_instance();
        $this->engine = $this->rc->config->get('spellcheck_engine', 'googie');
        $this->lang   = $lang ? $lang : 'en';

        if ($this->engine == 'pspell' && !extension_loaded('pspell')) {
            raise_error(array(
                'code' => 500, 'type' => 'php',
                'file' => __FILE__, 'line' => __LINE__,
                'message' => "Pspell extension not available"), true, true);
        }
    }


    /**
     * Set content and check spelling
     *
     * The (converted) text is remembered so that get(), get_words() and
     * get_xml() can refer to it afterwards.
     *
     * @param string $text    Text content for spellchecking
     * @param bool   $is_html Enables HTML-to-Text conversion
     *
     * @return bool True when no misspelling found, otherwise false
     */
    public function check($text, $is_html = false)
    {
        // convert to plain text
        $this->content = $is_html ? $this->html2text($text) : $text;

        if ($this->engine == 'pspell') {
            $this->matches = $this->_pspell_check($this->content);
        }
        else {
            $this->matches = $this->_googie_check($this->content);
        }

        return $this->found() == 0;
    }


    /**
     * Number of misspellings found (after check)
     *
     * @return int Number of misspellings
     */
    public function found()
    {
        return count($this->matches);
    }


    /**
     * Returns suggestions for the specified word
     *
     * @param string $word The word
     *
     * @return array Suggestions list (at most MAX_SUGGESTIONS entries)
     */
    public function get_suggestions($word)
    {
        if ($this->engine == 'pspell') {
            return $this->_pspell_suggestions($word);
        }

        return $this->_googie_suggestions($word);
    }


    /**
     * Returns misspelled words
     *
     * @param string $text    The content for spellchecking. If empty, the
     *                        result of the last check() call is used.
     * @param bool   $is_html Enables HTML-to-Text conversion of $text
     *
     * @return array List of misspelled words
     */
    public function get_words($text = null, $is_html = false)
    {
        if ($this->engine == 'pspell') {
            return $this->_pspell_words($text, $is_html);
        }

        return $this->_googie_words($text, $is_html);
    }


    /**
     * Returns checking result in XML (Googiespell) format
     *
     * Every match becomes a <c> element carrying the offset (o) and length (l)
     * of the word, with the tab-separated suggestions as element content.
     *
     * @return string XML content
     */
    public function get_xml()
    {
        // send output
        $out = '<?xml version="1.0" encoding="'.RCMAIL_CHARSET.'"?><spellresult charschecked="'.mb_strlen((string) $this->content).'">';

        foreach ($this->matches as $item) {
            $out .= '<c o="'.$item[1].'" l="'.$item[2].'">';
            $out .= is_array($item[4]) ? implode("\t", $item[4]) : $item[4];
            $out .= '</c>';
        }

        $out .= '</spellresult>';

        return $out;
    }


    /**
     * Returns checking result (misspelled words with suggestions)
     *
     * @return array Spellchecking result. An array indexed by word, each value
     *               being the tab-separated list of suggestions.
     */
    public function get()
    {
        $result = array();

        foreach ($this->matches as $item) {
            if ($this->engine == 'pspell') {
                // pspell matches carry the word itself
                $word = $item[0];
            }
            else {
                // googie matches only carry offset and length into the content
                $word = mb_substr($this->content, $item[1], $item[2], RCMAIL_CHARSET);
            }

            $result[$word] = is_array($item[4]) ? implode("\t", $item[4]) : $item[4];
        }

        // the collected map is the result (previously an undefined variable was returned)
        return $result;
    }


    /**
     * Returns error message
     *
     * @return string Error message
     */
    public function error()
    {
        return $this->error;
    }


    /**
     * Checks the text using pspell
     *
     * @param string $text Text content for spellchecking
     *
     * @return array List of matches, each one being
     *               array(word, offset, length, null, suggestions)
     */
    private function _pspell_check($text)
    {
        // init spellchecker
        $this->_pspell_init();

        if (!$this->plink) {
            return array();
        }

        $diff    = 0;
        $matches = array();

        foreach ($this->_pspell_tokenize($text) as $w) {
            $word = trim($w[0]);
            // preg_split reports byte offsets; subtract the byte/character
            // length difference accumulated over the preceding words
            $pos  = $w[1] - $diff;
            $len  = mb_strlen($word);

            if ($this->_pspell_misspelled($word)) {
                $matches[] = array($word, $pos, $len, null, $this->_pspell_suggestions($word));
            }

            $diff += (strlen($word) - $len);
        }

        return $matches;
    }


    /**
     * Returns the misspelled words
     *
     * @param string $text    Text to check; when empty the words of the last
     *                        check() result are returned
     * @param bool   $is_html Enables HTML-to-Text conversion of $text
     *
     * @return array List of misspelled words
     */
    private function _pspell_words($text = null, $is_html = false)
    {
        $result = array();

        if ($text) {
            // init spellchecker
            $this->_pspell_init();

            if (!$this->plink) {
                return array();
            }

            // With PSpell we don't need to get suggestions to return misspelled words
            if ($is_html) {
                $text = $this->html2text($text);
            }

            foreach ($this->_pspell_tokenize($text) as $w) {
                $word = trim($w[0]);
                if ($this->_pspell_misspelled($word)) {
                    $result[] = $word;
                }
            }

            return $result;
        }

        foreach ($this->matches as $m) {
            $result[] = $m[0];
        }

        return $result;
    }


    /**
     * Returns suggestions for misspelled word
     *
     * @param string $word The word
     *
     * @return array Suggestions, truncated to MAX_SUGGESTIONS entries; empty
     *               when the dictionary is unavailable or pspell returns no list
     */
    private function _pspell_suggestions($word)
    {
        // init spellchecker
        $this->_pspell_init();

        if (!$this->plink) {
            return array();
        }

        $suggestions = pspell_suggest($this->plink, $word);

        // validate before counting: a non-array must not reach sizeof()
        if (!is_array($suggestions)) {
            return array();
        }

        if (sizeof($suggestions) > self::MAX_SUGGESTIONS) {
            $suggestions = array_slice($suggestions, 0, self::MAX_SUGGESTIONS);
        }

        return $suggestions;
    }


    /**
     * Initializes PSpell dictionary
     *
     * The dictionary link is created once and reused; on failure the error
     * message is set and the link stays empty.
     */
    private function _pspell_init()
    {
        if (!$this->plink) {
            // empty strings select the default spelling and jargon
            $this->plink = pspell_new($this->lang, '', '', RCMAIL_CHARSET, PSPELL_FAST);
        }

        if (!$this->plink) {
            $this->error = "Unable to load Pspell engine for selected language";
        }
    }


    /**
     * Splits text into words using the separator pattern
     *
     * @param string $text Plain text
     *
     * @return array List of array(word, byte offset) pairs
     */
    private function _pspell_tokenize($text)
    {
        // -1 means "no limit"
        $tokens = preg_split($this->separator, $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE);

        return is_array($tokens) ? $tokens : array();
    }


    /**
     * Tells whether pspell considers the word misspelled
     *
     * Empty tokens and tokens made only of digits and dots are never reported.
     * Requires an initialized dictionary link.
     *
     * @param string $word The word
     *
     * @return bool True when the word is rejected by the dictionary
     */
    private function _pspell_misspelled($word)
    {
        return $word && preg_match('/[^0-9\.]/', $word) && !pspell_check($this->plink, $word);
    }


    /**
     * Checks the text using a Googiespell-compatible HTTP service
     *
     * The service address is taken from the 'spellcheck_uri' config option
     * (the language code is appended to it); without it the Google defaults
     * are used.
     *
     * @param string $text Text content for spellchecking
     *
     * @return array List of matches as produced by preg_match_all(): index 1
     *               is the offset, 2 the length, 3 the "s" attribute and 4 the
     *               tab-separated suggestions
     */
    private function _googie_check($text)
    {
        // spell check uri is configured
        $url = $this->rc->config->get('spellcheck_uri');

        if ($url) {
            $a_uri  = parse_url($url);
            $scheme = isset($a_uri['scheme']) ? $a_uri['scheme'] : '';
            $ssl    = ($scheme == 'https' || $scheme == 'ssl');
            $port   = !empty($a_uri['port']) ? $a_uri['port'] : ($ssl ? 443 : 80);
            $host   = ($ssl ? 'ssl://' : '') . (isset($a_uri['host']) ? $a_uri['host'] : '');
            $path   = (isset($a_uri['path']) ? $a_uri['path'] : '')
                . (!empty($a_uri['query']) ? '?'.$a_uri['query'] : '')
                . $this->lang;
        }
        else {
            $host = self::GOOGLE_HOST;
            $port = self::GOOGLE_PORT;
            $path = '/tbproxy/spell?lang=' . $this->lang;
        }

        // Google has some problem with spaces, use \n instead
        $text = str_replace(' ', "\n", $text);

        // the text becomes XML character data, so markup characters must be
        // escaped or the request is not well-formed ('&' goes first)
        $text = str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);

        $text = '<?xml version="1.0" encoding="utf-8" ?>'
            .'<spellrequest textalreadyclipped="0" ignoredups="0" ignoredigits="1" ignoreallcaps="1">'
            .'<text>' . $text . '</text>'
            .'</spellrequest>';

        $store = '';
        if ($fp = fsockopen($host, $port, $errno, $errstr, 30)) {
            $out = "POST $path HTTP/1.0\r\n";
            $out .= "Host: " . str_replace('ssl://', '', $host) . "\r\n";
            $out .= "Content-Length: " . strlen($text) . "\r\n";
            $out .= "Content-Type: application/x-www-form-urlencoded\r\n";
            $out .= "Connection: Close\r\n\r\n";
            $out .= $text;
            fwrite($fp, $out);

            while (!feof($fp)) {
                $line = fgets($fp, 128);
                // stop on read failure instead of looping until EOF is flagged
                if ($line === false) {
                    break;
                }
                $store .= $line;
            }
            fclose($fp);
        }

        if (!$store) {
            $this->error = "Empty result from spelling engine";
        }

        preg_match_all('/<c o="([^"]*)" l="([^"]*)" s="([^"]*)">([^<]*)<\/c>/', $store, $matches, PREG_SET_ORDER);

        return $matches;
    }


    /**
     * Returns the misspelled words (googie engine)
     *
     * @param string $text    Text to check; when empty the last check() result
     *                        and content are used
     * @param bool   $is_html Enables HTML-to-Text conversion of $text
     *
     * @return array List of misspelled words
     */
    private function _googie_words($text = null, $is_html = false)
    {
        if ($text) {
            if ($is_html) {
                $text = $this->html2text($text);
            }

            $matches = $this->_googie_check($text);
        }
        else {
            $matches = $this->matches;
            $text    = $this->content;
        }

        $result = array();

        foreach ($matches as $m) {
            $result[] = mb_substr($text, $m[1], $m[2], RCMAIL_CHARSET);
        }

        return $result;
    }


    /**
     * Returns suggestions for the word (googie engine)
     *
     * Only the first match is considered. With an empty $word the first match
     * of the last check() result is used.
     *
     * @param string $word The word
     *
     * @return array Suggestions, truncated to MAX_SUGGESTIONS entries
     */
    private function _googie_suggestions($word)
    {
        if ($word) {
            $matches = $this->_googie_check($word);
        }
        else {
            $matches = $this->matches;
        }

        if (!empty($matches[0][4])) {
            $suggestions = explode("\t", $matches[0][4]);
            if (sizeof($suggestions) > self::MAX_SUGGESTIONS) {
                $suggestions = array_slice($suggestions, 0, self::MAX_SUGGESTIONS);
            }

            return $suggestions;
        }

        return array();
    }


    /**
     * Converts HTML content to plain text using the html2text class
     *
     * @param string $text HTML content
     *
     * @return string Plain text
     */
    private function html2text($text)
    {
        $h2t = new html2text($text, false, true, 0);
        return $h2t->get_text();
    }
}