Fix rcube_utils::normalize_string() to support unicode characters + add argument for minimum token length
| | |
| | | * Split the given string into word tokens |
| | | * |
| | | * @param string Input to tokenize |
| | | * @param integer Minimum length of a single token |
| | | * @return array List of tokens |
| | | */ |
| | | public static function tokenize_string($str) |
| | | public static function tokenize_string($str, $minlen = 2) |
| | | { |
| | | return explode(" ", preg_replace( |
| | | array('/[\s;\/+-]+/i', '/(\d)[-.\s]+(\d)/', '/\s\w{1,3}\s/u'), |
| | | array(' ', '\\1\\2', ' '), |
| | | $str)); |
| | | $expr = array('/[\s;\/+-]+/ui', '/(\d)[-.\s]+(\d)/u'); |
| | | $repl = array(' ', '\\1\\2'); |
| | | |
| | | if ($minlen > 1) { |
| | | $minlen--; |
| | | $expr[] = "/(^|\s+)\w{1,$minlen}(\s+|$)/u"; |
| | | $repl[] = ' '; |
| | | } |
| | | |
| | | return array_filter(explode(" ", preg_replace($expr, $repl, $str))); |
| | | } |
| | | |
| | | /** |
| | |
| | | * |
| | | * @param string Input string (UTF-8) |
| | | * @param boolean True to return list of words as array |
| | | * @param integer Minimum length of tokens |
| | | * |
| | | * @return mixed Normalized string or a list of normalized tokens |
| | | */ |
| | | public static function normalize_string($str, $as_array = false) |
| | | public static function normalize_string($str, $as_array = false, $minlen = 2) |
| | | { |
| | | // replace 4-byte unicode characters with '?' character, |
| | | // these are not supported in default utf-8 charset on mysql, |
| | |
| | | . ')/', '?', $str); |
| | | |
| | | // split by words |
| | | $arr = self::tokenize_string($str); |
| | | $arr = self::tokenize_string($str, $minlen); |
| | | |
| | | // detect character set |
| | | if (utf8_encode(utf8_decode($str)) == $str) { |
| | |
| | | } |
| | | |
| | | /** |
| | | * rcube_utils::tokenize_string() |
| | | */ |
| | | function test_tokenize_string() |
| | | { |
| | | $test = array( |
| | | '' => array(), |
| | | 'abc d' => array('abc'), |
| | | 'abc de' => array('abc','de'), |
| | | 'äàé;êöü-xyz' => array('äàé','êöü','xyz'), |
| | | '日期格式' => array('日期格式'), |
| | | ); |
| | | |
| | | foreach ($test as $input => $output) { |
| | | $result = rcube_utils::tokenize_string($input); |
| | | $this->assertSame($output, $result); |
| | | } |
| | | } |
| | | |
| | | /** |
| | | * rcube_utils::normalize_string() |
| | | */ |
| | | function test_normalize_string() |
| | |
| | | 'abc def' => 'abc def', |
| | | 'ÇçäâàåæéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ' => 'ccaaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy', |
| | | 'ąáâäćçčéęëěíîłľĺńňóôöŕřśšşťţůúűüźžżýĄŚŻŹĆ' => 'aaaaccceeeeiilllnnooorrsssttuuuuzzzyaszzc', |
| | | 'ß' => 'ss', |
| | | 'ae' => 'a', |
| | | 'oe' => 'o', |
| | | 'ue' => 'u', |
| | | 'ß' => '', |
| | | 'ßs' => 'sss', |
| | | 'Xae' => 'xa', |
| | | 'Xoe' => 'xo', |
| | | 'Xue' => 'xu', |
| | | '项目' => '项目', |
| | | '日' => '', // FIXME: this should not be stripped although minlen = 2 |
| | | ); |
| | | |
| | | foreach ($test as $input => $output) { |
| | | $result = rcube_utils::normalize_string($input); |
| | | $this->assertSame($output, $result); |
| | | $this->assertSame($output, $result, "Error normalizing '$input'"); |
| | | } |
| | | } |
| | | |