From e8b82c2e7b0ae2e5d45ecb600813b8990568feb9 Mon Sep 17 00:00:00 2001
From: Thomas Bruederli <bruederli@kolabsys.com>
Date: Sun, 28 Dec 2014 10:22:08 -0500
Subject: [PATCH] Fix rcube_utils::normalize_string() to support unicode characters + add argument for minimum token length
---
program/lib/Roundcube/rcube_utils.php | 22 +++++++++++++++-------
1 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php
index 2e4aa32..f4c0e90 100644
--- a/program/lib/Roundcube/rcube_utils.php
+++ b/program/lib/Roundcube/rcube_utils.php
@@ -912,14 +912,21 @@
* Split the given string into word tokens
*
* @param string Input to tokenize
+ * @param integer Minimum length of a single token
* @return array List of tokens
*/
- public static function tokenize_string($str)
+ public static function tokenize_string($str, $minlen = 2)
{
- return explode(" ", preg_replace(
- array('/[\s;\/+-]+/i', '/(\d)[-.\s]+(\d)/', '/\s\w{1,3}\s/u'),
- array(' ', '\\1\\2', ' '),
- $str));
+ $expr = array('/[\s;\/+-]+/ui', '/(\d)[-.\s]+(\d)/u');
+ $repl = array(' ', '\\1\\2');
+
+ if ($minlen > 1) {
+ $minlen--;
+ $expr[] = "/(^|\s+)\w{1,$minlen}(\s+|$)/u";
+ $repl[] = ' ';
+ }
+
+ return array_filter(explode(" ", preg_replace($expr, $repl, $str)));
}
/**
@@ -928,10 +935,11 @@
*
* @param string Input string (UTF-8)
* @param boolean True to return list of words as array
+ * @param integer Minimum length of tokens
*
* @return mixed Normalized string or a list of normalized tokens
*/
- public static function normalize_string($str, $as_array = false)
+ public static function normalize_string($str, $as_array = false, $minlen = 2)
{
// replace 4-byte unicode characters with '?' character,
// these are not supported in default utf-8 charset on mysql,
@@ -943,7 +951,7 @@
. ')/', '?', $str);
// split by words
- $arr = self::tokenize_string($str);
+ $arr = self::tokenize_string($str, $minlen);
// detect character set
if (utf8_encode(utf8_decode($str)) == $str) {
--
Gitblit v1.9.1