From ed1d212ae2daea5e4bd043417610177093e99f19 Mon Sep 17 00:00:00 2001
From: Aleksander Machniak <alec@alec.pl>
Date: Sat, 16 Jan 2016 03:03:51 -0500
Subject: [PATCH] Improved SVG cleanup code
---
program/lib/Roundcube/rcube_utils.php | 610 ++++++++++++++++++++++++++++++++++++++++---------------
1 files changed, 441 insertions(+), 169 deletions(-)
diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php
index fabe0f0..06f4314 100644
--- a/program/lib/Roundcube/rcube_utils.php
+++ b/program/lib/Roundcube/rcube_utils.php
@@ -1,6 +1,6 @@
<?php
-/*
+/**
+-----------------------------------------------------------------------+
| This file is part of the Roundcube Webmail client |
| Copyright (C) 2008-2012, The Roundcube Dev Team |
@@ -103,13 +103,14 @@
}
foreach ($domain_array as $part) {
- if (!preg_match('/^(([A-Za-z0-9][A-Za-z0-9-]{0,61}[A-Za-z0-9])|([A-Za-z0-9]))$/', $part)) {
+ if (!preg_match('/^((xn--)?([A-Za-z0-9][A-Za-z0-9-]{0,61}[A-Za-z0-9])|([A-Za-z0-9]))$/', $part)) {
return false;
}
}
// last domain part
- if (preg_match('/[^a-zA-Z]/', array_pop($domain_array))) {
+ $last_part = array_pop($domain_array);
+ if (strpos($last_part, 'xn--') !== 0 && preg_match('/[^a-zA-Z]/', $last_part)) {
return false;
}
@@ -117,17 +118,6 @@
if (!$dns_check || !$rcube->config->get('email_dns_check')) {
return true;
- }
-
- if (strtoupper(substr(PHP_OS, 0, 3)) == 'WIN' && version_compare(PHP_VERSION, '5.3.0', '<')) {
- $lookup = array();
- @exec("nslookup -type=MX " . escapeshellarg($domain_part) . " 2>&1", $lookup);
- foreach ($lookup as $line) {
- if (strpos($line, 'MX preference')) {
- return true;
- }
- }
- return false;
}
// find MX record(s)
@@ -144,7 +134,6 @@
return false;
}
-
/**
* Validates IPv4 or IPv6 address
*
@@ -154,41 +143,8 @@
*/
public static function check_ip($ip)
{
- // IPv6, but there's no build-in IPv6 support
- if (strpos($ip, ':') !== false && !defined('AF_INET6')) {
- $parts = explode(':', $ip);
- $count = count($parts);
-
- if ($count > 8 || $count < 2) {
- return false;
- }
-
- foreach ($parts as $idx => $part) {
- $length = strlen($part);
- if (!$length) {
- // there can be only one ::
- if ($found_empty) {
- return false;
- }
- $found_empty = true;
- }
- // last part can be an IPv4 address
- else if ($idx == $count - 1) {
- if (!preg_match('/^[0-9a-f]{1,4}$/i', $part)) {
- return @inet_pton($part) !== false;
- }
- }
- else if (!preg_match('/^[0-9a-f]{1,4}$/i', $part)) {
- return false;
- }
- }
-
- return true;
- }
-
- return @inet_pton($ip) !== false;
+ return filter_var($ip, FILTER_VALIDATE_IP) !== false;
}
-
/**
* Check whether the HTTP referer matches the current request
@@ -197,27 +153,27 @@
*/
public static function check_referer()
{
- $uri = parse_url($_SERVER['REQUEST_URI']);
+ $uri = parse_url($_SERVER['REQUEST_URI']);
$referer = parse_url(self::request_header('Referer'));
+
return $referer['host'] == self::request_header('Host') && $referer['path'] == $uri['path'];
}
-
/**
* Replacing specials characters to a specific encoding type
*
- * @param string Input string
- * @param string Encoding type: text|html|xml|js|url
- * @param string Replace mode for tags: show|replace|remove
- * @param boolean Convert newlines
+ * @param string Input string
+ * @param string Encoding type: text|html|xml|js|url
+ * @param string Replace mode for tags: show|remove|strict
+ * @param boolean Convert newlines
*
- * @return string The quoted string
+ * @return string The quoted string
*/
public static function rep_specialchars_output($str, $enctype = '', $mode = '', $newlines = true)
{
static $html_encode_arr = false;
- static $js_rep_table = false;
- static $xml_rep_table = false;
+ static $js_rep_table = false;
+ static $xml_rep_table = false;
if (!is_string($str)) {
$str = strval($str);
@@ -232,8 +188,11 @@
$encode_arr = $html_encode_arr;
- // don't replace quotes and html tags
- if ($mode == 'show' || $mode == '') {
+ if ($mode == 'remove') {
+ $str = strip_tags($str);
+ }
+ else if ($mode != 'strict') {
+ // don't replace quotes and html tags
$ltpos = strpos($str, '<');
if ($ltpos !== false && strpos($str, '>', $ltpos) !== false) {
unset($encode_arr['"']);
@@ -241,9 +200,6 @@
unset($encode_arr['>']);
unset($encode_arr['&']);
}
- }
- else if ($mode == 'remove') {
- $str = strip_tags($str);
}
$out = strtr($str, $encode_arr);
@@ -266,8 +222,8 @@
$js_rep_table["'"] = "\\'";
$js_rep_table["\\"] = "\\\\";
// Unicode line and paragraph separators (#1486310)
- $js_rep_table[chr(hexdec(E2)).chr(hexdec(80)).chr(hexdec(A8))] = '
';
- $js_rep_table[chr(hexdec(E2)).chr(hexdec(80)).chr(hexdec(A9))] = '
';
+ $js_rep_table[chr(hexdec('E2')).chr(hexdec('80')).chr(hexdec('A8'))] = '
';
+ $js_rep_table[chr(hexdec('E2')).chr(hexdec('80')).chr(hexdec('A9'))] = '
';
}
// encode for javascript use
@@ -277,7 +233,7 @@
// encode for plaintext
if ($enctype == 'text') {
- return str_replace("\r\n", "\n", $mode=='remove' ? strip_tags($str) : $str);
+ return str_replace("\r\n", "\n", $mode == 'remove' ? strip_tags($str) : $str);
}
if ($enctype == 'url') {
@@ -293,21 +249,20 @@
return $str;
}
-
/**
* Read input value and convert it for internal use
* Performs stripslashes() and charset conversion if necessary
*
- * @param string Field name to read
- * @param int Source to get value from (GPC)
- * @param boolean Allow HTML tags in field value
- * @param string Charset to convert into
+ * @param string Field name to read
+ * @param int Source to get value from (GPC)
+ * @param boolean Allow HTML tags in field value
+ * @param string Charset to convert into
*
- * @return string Field value or NULL if not available
+ * @return string Field value or NULL if not available
*/
- public static function get_input_value($fname, $source, $allow_html=FALSE, $charset=NULL)
+ public static function get_input_value($fname, $source, $allow_html = false, $charset = null)
{
- $value = NULL;
+ $value = null;
if ($source == self::INPUT_GET) {
if (isset($_GET[$fname])) {
@@ -334,18 +289,17 @@
return self::parse_input_value($value, $allow_html, $charset);
}
-
/**
* Parse/validate input value. See self::get_input_value()
* Performs stripslashes() and charset conversion if necessary
*
- * @param string Input value
- * @param boolean Allow HTML tags in field value
- * @param string Charset to convert into
+ * @param string Input value
+ * @param boolean Allow HTML tags in field value
+ * @param string Charset to convert into
*
- * @return string Parsed value
+ * @return string Parsed value
*/
- public static function parse_input_value($value, $allow_html=FALSE, $charset=NULL)
+ public static function parse_input_value($value, $allow_html = false, $charset = null)
{
global $OUTPUT;
@@ -360,12 +314,8 @@
return $value;
}
- // strip single quotes if magic_quotes_sybase is enabled
- if (ini_get('magic_quotes_sybase')) {
- $value = str_replace("''", "'", $value);
- }
// strip slashes if magic_quotes enabled
- else if (get_magic_quotes_gpc() || get_magic_quotes_runtime()) {
+ if (get_magic_quotes_gpc() || get_magic_quotes_runtime()) {
$value = stripslashes($value);
}
@@ -389,31 +339,30 @@
return $value;
}
-
/**
* Convert array of request parameters (prefixed with _)
* to a regular array with non-prefixed keys.
*
- * @param int $mode Source to get value from (GPC)
- * @param string $ignore PCRE expression to skip parameters by name
+ * @param int $mode Source to get value from (GPC)
+ * @param string $ignore PCRE expression to skip parameters by name
+ * @param boolean $allow_html Allow HTML tags in field value
*
* @return array Hash array with all request parameters
*/
- public static function request2param($mode = null, $ignore = 'task|action')
+ public static function request2param($mode = null, $ignore = 'task|action', $allow_html = false)
{
$out = array();
$src = $mode == self::INPUT_GET ? $_GET : ($mode == self::INPUT_POST ? $_POST : $_REQUEST);
- foreach ($src as $key => $value) {
+ foreach (array_keys($src) as $key) {
$fname = $key[0] == '_' ? substr($key, 1) : $key;
if ($ignore && !preg_match('/^(' . $ignore . ')$/', $fname)) {
- $out[$fname] = self::get_input_value($key, $mode);
+ $out[$fname] = self::get_input_value($key, $mode, $allow_html);
}
}
return $out;
}
-
/**
* Convert the given string into a valid HTML identifier
@@ -429,7 +378,6 @@
}
}
-
/**
* Replace all css definitions with #container [def]
* and remove css-inlined scripting
@@ -439,50 +387,57 @@
*
* @return string Modified CSS source
*/
- public static function mod_css_styles($source, $container_id, $allow_remote=false)
+ public static function mod_css_styles($source, $container_id, $allow_remote = false)
{
- $last_pos = 0;
+ $last_pos = 0;
$replacements = new rcube_string_replacer;
// ignore the whole block if evil styles are detected
$source = self::xss_entity_decode($source);
$stripped = preg_replace('/[^a-z\(:;]/i', '', $source);
$evilexpr = 'expression|behavior|javascript:|import[^a]' . (!$allow_remote ? '|url\(' : '');
+
if (preg_match("/$evilexpr/i", $stripped)) {
return '/* evil! */';
}
+ $strict_url_regexp = '!url\s*\([ "\'](https?:)//[a-z0-9/._+-]+["\' ]\)!Uims';
+
// cut out all contents between { and }
while (($pos = strpos($source, '{', $last_pos)) && ($pos2 = strpos($source, '}', $pos))) {
- $styles = substr($source, $pos+1, $pos2-($pos+1));
+ $nested = strpos($source, '{', $pos+1);
+ if ($nested && $nested < $pos2) // when dealing with nested blocks (e.g. @media), take the inner one
+ $pos = $nested;
+ $length = $pos2 - $pos - 1;
+ $styles = substr($source, $pos+1, $length);
// check every line of a style block...
if ($allow_remote) {
$a_styles = preg_split('/;[\r\n]*/', $styles, -1, PREG_SPLIT_NO_EMPTY);
+
foreach ($a_styles as $line) {
$stripped = preg_replace('/[^a-z\(:;]/i', '', $line);
// ... and only allow strict url() values
- $regexp = '!url\s*\([ "\'](https?:)//[a-z0-9/._+-]+["\' ]\)!Uims';
- if (stripos($stripped, 'url(') && !preg_match($regexp, $line)) {
+ if (stripos($stripped, 'url(') && !preg_match($strict_url_regexp, $line)) {
$a_styles = array('/* evil! */');
break;
}
}
+
$styles = join(";\n", $a_styles);
}
- $key = $replacements->add($styles);
- $source = substr($source, 0, $pos+1)
- . $replacements->get_replacement($key)
- . substr($source, $pos2, strlen($source)-$pos2);
- $last_pos = $pos+2;
+ $key = $replacements->add($styles);
+ $repl = $replacements->get_replacement($key);
+ $source = substr_replace($source, $repl, $pos+1, $length);
+ $last_pos = $pos2 - ($length - strlen($repl));
}
// remove html comments and add #container to each tag selector.
// also replace body definition because we also stripped off the <body> tag
- $styles = preg_replace(
+ $source = preg_replace(
array(
- '/(^\s*<!--)|(-->\s*$)/',
+ '/(^\s*<\!--)|(-->\s*$)/m',
'/(^\s*|,\s*|\}\s*)([a-z0-9\._#\*][a-z0-9\.\-_]*)/im',
'/'.preg_quote($container_id, '/').'\s+body/i',
),
@@ -494,35 +449,40 @@
$source);
// put block contents back in
- $styles = $replacements->resolve($styles);
+ $source = $replacements->resolve($source);
- return $styles;
+ return $source;
}
-
/**
* Generate CSS classes from mimetype and filename extension
*
- * @param string $mimetype Mimetype
- * @param string $filename Filename
+ * @param string $mimetype Mimetype
+ * @param string $filename Filename
*
* @return string CSS classes separated by space
*/
public static function file2class($mimetype, $filename)
{
+ $mimetype = strtolower($mimetype);
+ $filename = strtolower($filename);
+
list($primary, $secondary) = explode('/', $mimetype);
- $classes = array($primary ? $primary : 'unknown');
+ $classes = array($primary ?: 'unknown');
+
if ($secondary) {
$classes[] = $secondary;
}
- if (preg_match('/\.([a-z0-9]+)$/i', $filename, $m)) {
- $classes[] = $m[1];
+
+ if (preg_match('/\.([a-z0-9]+)$/', $filename, $m)) {
+ if (!in_array($m[1], $classes)) {
+ $classes[] = $m[1];
+ }
}
- return strtolower(join(" ", $classes));
+ return join(" ", $classes);
}
-
/**
* Decode escaped entities used by known XSS exploits.
@@ -542,7 +502,6 @@
return $out;
}
-
/**
* preg_replace_callback callback for xss_entity_decode
*
@@ -554,7 +513,6 @@
{
return chr(hexdec($matches[1]));
}
-
/**
* Check if we can process not exceeding memory_limit
@@ -571,7 +529,6 @@
return $mem_limit > 0 && $memory + $need > $mem_limit ? false : true;
}
-
/**
* Check if working in SSL mode
*
@@ -582,24 +539,24 @@
*/
public static function https_check($port=null, $use_https=true)
{
- global $RCMAIL;
-
if (!empty($_SERVER['HTTPS']) && strtolower($_SERVER['HTTPS']) != 'off') {
return true;
}
- if (!empty($_SERVER['HTTP_X_FORWARDED_PROTO']) && strtolower($_SERVER['HTTP_X_FORWARDED_PROTO']) == 'https') {
+ if (!empty($_SERVER['HTTP_X_FORWARDED_PROTO'])
+ && strtolower($_SERVER['HTTP_X_FORWARDED_PROTO']) == 'https'
+ && in_array($_SERVER['REMOTE_ADDR'], rcube::get_instance()->config->get('proxy_whitelist', array()))
+ ) {
return true;
}
if ($port && $_SERVER['SERVER_PORT'] == $port) {
return true;
}
- if ($use_https && isset($RCMAIL) && $RCMAIL->config->get('use_https')) {
+ if ($use_https && rcube::get_instance()->config->get('use_https')) {
return true;
}
return false;
}
-
/**
* Replaces hostname variables.
@@ -611,6 +568,10 @@
*/
public static function parse_host($name, $host = '')
{
+ if (!is_string($name)) {
+ return $name;
+ }
+
// %n - host
$n = preg_replace('/:\d+$/', '', $_SERVER['SERVER_NAME']);
// %t - host name without first part, e.g. %n=mail.domain.tld, %t=domain.tld
@@ -618,10 +579,11 @@
// %d - domain name without first part
$d = preg_replace('/^[^\.]+\./', '', $_SERVER['HTTP_HOST']);
// %h - IMAP host
- $h = $_SESSION['storage_host'] ? $_SESSION['storage_host'] : $host;
+ $h = $_SESSION['storage_host'] ?: $host;
// %z - IMAP domain without first part, e.g. %h=imap.domain.tld, %z=domain.tld
$z = preg_replace('/^[^\.]+\./', '', $h);
- // %s - domain name after the '@' from e-mail address provided at login screen. Returns FALSE if an invalid email is provided
+ // %s - domain name after the '@' from e-mail address provided at login screen.
+ // Returns FALSE if an invalid email is provided
if (strpos($name, '%s') !== false) {
$user_email = self::get_input_value('_user', self::INPUT_POST);
$user_email = self::idn_convert($user_email, true);
@@ -631,10 +593,8 @@
}
}
- $name = str_replace(array('%n', '%t', '%d', '%h', '%z', '%s'), array($n, $t, $d, $h, $z, $s[2]), $name);
- return $name;
+ return str_replace(array('%n', '%t', '%d', '%h', '%z', '%s'), array($n, $t, $d, $h, $z, $s[2]), $name);
}
-
/**
* Returns remote IP address and forwarded addresses if found
@@ -649,6 +609,7 @@
if (!empty($_SERVER['HTTP_X_REAL_IP'])) {
$remote_ip[] = 'X-Real-IP: ' . $_SERVER['HTTP_X_REAL_IP'];
}
+
// append the X-Forwarded-For header, if set
if (!empty($_SERVER['HTTP_X_FORWARDED_FOR'])) {
$remote_ip[] = 'X-Forwarded-For: ' . $_SERVER['HTTP_X_FORWARDED_FOR'];
@@ -661,13 +622,44 @@
return $address;
}
+ /**
+ * Returns the real remote IP address
+ *
+ * @return string Remote IP address
+ */
+ public static function remote_addr()
+ {
+ // Check if any of the headers are set first to improve performance
+ if (!empty($_SERVER['HTTP_X_FORWARDED_FOR']) || !empty($_SERVER['HTTP_X_REAL_IP'])) {
+ $proxy_whitelist = rcube::get_instance()->config->get('proxy_whitelist', array());
+ if (in_array($_SERVER['REMOTE_ADDR'], $proxy_whitelist)) {
+ if (!empty($_SERVER['HTTP_X_FORWARDED_FOR'])) {
+ foreach(array_reverse(explode(',', $_SERVER['HTTP_X_FORWARDED_FOR'])) as $forwarded_ip) {
+ if (!in_array($forwarded_ip, $proxy_whitelist)) {
+ return $forwarded_ip;
+ }
+ }
+ }
+
+ if (!empty($_SERVER['HTTP_X_REAL_IP'])) {
+ return $_SERVER['HTTP_X_REAL_IP'];
+ }
+ }
+ }
+
+ if (!empty($_SERVER['REMOTE_ADDR'])) {
+ return $_SERVER['REMOTE_ADDR'];
+ }
+
+ return '';
+ }
/**
* Read a specific HTTP request header.
*
- * @param string $name Header name
+ * @param string $name Header name
*
- * @return mixed Header value or null if not available
+ * @return mixed Header value or null if not available
*/
public static function request_header($name)
{
@@ -711,29 +703,102 @@
return $result;
}
-
/**
* Improved equivalent to strtotime()
*
- * @param string $date Date string
+ * @param string $date Date string
+ * @param DateTimeZone $timezone Timezone to use for DateTime object
*
* @return int Unix timestamp
*/
- public static function strtotime($date)
+ public static function strtotime($date, $timezone = null)
{
- // check for MS Outlook vCard date format YYYYMMDD
- if (preg_match('/^([12][90]\d\d)([01]\d)(\d\d)$/', trim($date), $matches)) {
- return mktime(0,0,0, intval($matches[2]), intval($matches[3]), intval($matches[1]));
+ $date = self::clean_datestr($date);
+ $tzname = $timezone ? ' ' . $timezone->getName() : '';
+
+ // unix timestamp
+ if (is_numeric($date)) {
+ return (int) $date;
}
- else if (is_numeric($date)) {
+
+ // if date parsing fails, we have a date in non-rfc format.
+ // remove token from the end and try again
+ while ((($ts = @strtotime($date . $tzname)) === false) || ($ts < 0)) {
+ $d = explode(' ', $date);
+ array_pop($d);
+ if (!$d) {
+ break;
+ }
+ $date = implode(' ', $d);
+ }
+
+ return (int) $ts;
+ }
+
+ /**
+ * Date parsing function that turns the given value into a DateTime object
+ *
+ * @param string $date Date string
+ * @param DateTimeZone $timezone Timezone to use for DateTime object
+ *
+ * @return DateTime instance or false on failure
+ */
+ public static function anytodatetime($date, $timezone = null)
+ {
+ if ($date instanceof DateTime) {
return $date;
+ }
+
+ $dt = false;
+ $date = self::clean_datestr($date);
+
+ // try to parse string with DateTime first
+ if (!empty($date)) {
+ try {
+ $dt = $timezone ? new DateTime($date, $timezone) : new DateTime($date);
+ }
+ catch (Exception $e) {
+ // ignore
+ }
+ }
+
+ // try our advanced strtotime() method
+ if (!$dt && ($timestamp = self::strtotime($date, $timezone))) {
+ try {
+ $dt = new DateTime("@".$timestamp);
+ if ($timezone) {
+ $dt->setTimezone($timezone);
+ }
+ }
+ catch (Exception $e) {
+ // ignore
+ }
+ }
+
+ return $dt;
+ }
+
+ /**
+ * Clean up date string for strtotime() input
+ *
+ * @param string $date Date string
+ *
+ * @return string Date string
+ */
+ public static function clean_datestr($date)
+ {
+ $date = trim($date);
+
+ // check for MS Outlook vCard date format YYYYMMDD
+ if (preg_match('/^([12][90]\d\d)([01]\d)([0123]\d)$/', $date, $m)) {
+ return sprintf('%04d-%02d-%02d 00:00:00', intval($m[1]), intval($m[2]), intval($m[3]));
}
// Clean malformed data
$date = preg_replace(
array(
'/GMT\s*([+-][0-9]+)/', // support non-standard "GMTXXXX" literal
- '/[^a-z0-9\x20\x09:+-]/i', // remove any invalid characters
+ '/[^a-z0-9\x20\x09:+-\/]/i', // remove any invalid characters
'/\s*(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s*/i', // remove weekday names
),
array(
@@ -744,20 +809,20 @@
$date = trim($date);
- // if date parsing fails, we have a date in non-rfc format.
- // remove token from the end and try again
- while ((($ts = @strtotime($date)) === false) || ($ts < 0)) {
- $d = explode(' ', $date);
- array_pop($d);
- if (!$d) {
- break;
- }
- $date = implode(' ', $d);
+ // try to fix dd/mm vs. mm/dd discrepancy, we can't do more here
+ if (preg_match('/^(\d{1,2})[.\/-](\d{1,2})[.\/-](\d{4})$/', $date, $m)) {
+ $mdy = $m[2] > 12 && $m[1] <= 12;
+ $day = $mdy ? $m[2] : $m[1];
+ $month = $mdy ? $m[1] : $m[2];
+ $date = sprintf('%04d-%02d-%02d 00:00:00', intval($m[3]), $month, $day);
+ }
+ // I've found that YYYY.MM.DD is recognized wrong, so here's a fix
+ else if (preg_match('/^(\d{4})\.(\d{1,2})\.(\d{1,2})$/', $date)) {
+ $date = str_replace('.', '-', $date) . ' 00:00:00';
}
- return $ts;
+ return $date;
}
-
/*
* Idn_to_ascii wrapper.
@@ -768,7 +833,6 @@
return self::idn_convert($str, true);
}
-
/*
* Idn_to_ascii wrapper.
* Intl/Idn modules version of this function doesn't work with e-mail address
@@ -778,8 +842,7 @@
return self::idn_convert($str, false);
}
-
- public static function idn_convert($input, $is_utf=false)
+ public static function idn_convert($input, $is_utf = false)
{
if ($at = strpos($input, '@')) {
$user = substr($input, 0, $at);
@@ -802,41 +865,105 @@
* Split the given string into word tokens
*
* @param string Input to tokenize
+ * @param integer Minimum length of a single token
* @return array List of tokens
*/
- public static function tokenize_string($str)
+ public static function tokenize_string($str, $minlen = 2)
{
- return explode(" ", preg_replace(
- array('/[\s;\/+-]+/i', '/(\d)[-.\s]+(\d)/', '/\s\w{1,3}\s/u'),
- array(' ', '\\1\\2', ' '),
- $str));
+ $expr = array('/[\s;,"\'\/+-]+/ui', '/(\d)[-.\s]+(\d)/u');
+ $repl = array(' ', '\\1\\2');
+
+ if ($minlen > 1) {
+ $minlen--;
+ $expr[] = "/(^|\s+)\w{1,$minlen}(\s+|$)/u";
+ $repl[] = ' ';
+ }
+
+ return array_filter(explode(" ", preg_replace($expr, $repl, $str)));
}
/**
* Normalize the given string for fulltext search.
- * Currently only optimized for Latin-1 characters; to be extended
+ * Currently only optimized for ISO-8859-1 and ISO-8859-2 characters; to be extended
*
* @param string Input string (UTF-8)
* @param boolean True to return list of words as array
- * @return mixed Normalized string or a list of normalized tokens
+ * @param integer Minimum length of tokens
+ *
+ * @return mixed Normalized string or a list of normalized tokens
*/
- public static function normalize_string($str, $as_array = false)
+ public static function normalize_string($str, $as_array = false, $minlen = 2)
{
+ // replace 4-byte unicode characters with '?' character,
+ // these are not supported in default utf-8 charset on mysql,
+ // the chance we'd need them in searching is very low
+ $str = preg_replace('/('
+ . '\xF0[\x90-\xBF][\x80-\xBF]{2}'
+ . '|[\xF1-\xF3][\x80-\xBF]{3}'
+ . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'
+ . ')/', '?', $str);
+
// split by words
- $arr = self::tokenize_string($str);
+ $arr = self::tokenize_string($str, $minlen);
+
+ // detect character set
+ if (utf8_encode(utf8_decode($str)) == $str) {
+ // ISO-8859-1 (or ASCII)
+ preg_match_all('/./u', 'äâàåáãæçéêëèïîìíñöôòøõóüûùúýÿ', $keys);
+ preg_match_all('/./', 'aaaaaaaceeeeiiiinoooooouuuuyy', $values);
+
+ $mapping = array_combine($keys[0], $values[0]);
+ $mapping = array_merge($mapping, array('ß' => 'ss', 'ae' => 'a', 'oe' => 'o', 'ue' => 'u'));
+ }
+ else if (rcube_charset::convert(rcube_charset::convert($str, 'UTF-8', 'ISO-8859-2'), 'ISO-8859-2', 'UTF-8') == $str) {
+ // ISO-8859-2
+ preg_match_all('/./u', 'ąáâäćçčéęëěíîłľĺńňóôöŕřśšşťţůúűüźžżý', $keys);
+ preg_match_all('/./', 'aaaaccceeeeiilllnnooorrsssttuuuuzzzy', $values);
+
+ $mapping = array_combine($keys[0], $values[0]);
+ $mapping = array_merge($mapping, array('ß' => 'ss', 'ae' => 'a', 'oe' => 'o', 'ue' => 'u'));
+ }
foreach ($arr as $i => $part) {
- if (utf8_encode(utf8_decode($part)) == $part) { // is latin-1 ?
- $arr[$i] = utf8_encode(strtr(strtolower(strtr(utf8_decode($part),
- 'ÇçäâàåéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ',
- 'ccaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy')),
- array('ß' => 'ss', 'ae' => 'a', 'oe' => 'o', 'ue' => 'u')));
+ $part = mb_strtolower($part);
+
+ if (!empty($mapping)) {
+ $part = strtr($part, $mapping);
}
- else
- $arr[$i] = mb_strtolower($part);
+
+ $arr[$i] = $part;
}
return $as_array ? $arr : join(" ", $arr);
+ }
+
+ /**
+ * Compare two strings for matching words (order not relevant)
+ *
+ * @param string Haystack
+ * @param string Needle
+ *
+ * @return boolean True if match, False otherwise
+ */
+ public static function words_match($haystack, $needle)
+ {
+ $a_needle = self::tokenize_string($needle, 1);
+ $_haystack = join(" ", self::tokenize_string($haystack, 1));
+ $valid = strlen($_haystack) > 0;
+ $hits = 0;
+
+ foreach ($a_needle as $w) {
+ if ($valid) {
+ if (stripos($_haystack, $w) !== false) {
+ $hits++;
+ }
+ }
+ else if (stripos($haystack, $w) !== false) {
+ $hits++;
+ }
+ }
+
+ return $hits >= count($a_needle);
}
/**
@@ -915,7 +1042,6 @@
}
}
-
/**
* Find out if the string content means true or false
*
@@ -930,4 +1056,150 @@
return !in_array($str, array('false', '0', 'no', 'off', 'nein', ''), true);
}
+ /**
+ * OS-dependent absolute path detection
+ */
+ public static function is_absolute_path($path)
+ {
+ if (strtoupper(substr(PHP_OS, 0, 3)) == 'WIN') {
+ return (bool) preg_match('!^[a-z]:[\\\\/]!i', $path);
+ }
+ else {
+ return $path[0] == '/';
+ }
+ }
+
+ /**
+ * Resolve relative URL
+ *
+ * @param string $url Relative URL
+ *
+ * @return string Absolute URL
+ */
+ public static function resolve_url($url)
+ {
+ // prepend protocol://hostname:port
+ if (!preg_match('|^https?://|', $url)) {
+ $schema = 'http';
+ $default_port = 80;
+
+ if (self::https_check()) {
+ $schema = 'https';
+ $default_port = 443;
+ }
+
+ $prefix = $schema . '://' . preg_replace('/:\d+$/', '', $_SERVER['HTTP_HOST']);
+ if ($_SERVER['SERVER_PORT'] != $default_port) {
+ $prefix .= ':' . $_SERVER['SERVER_PORT'];
+ }
+
+ $url = $prefix . ($url[0] == '/' ? '' : '/') . $url;
+ }
+
+ return $url;
+ }
+
+ /**
+ * Generate a random string
+ *
+ * @param int $length String length
+ * @param bool $raw Return RAW data instead of ascii
+ *
+ * @return string The generated random string
+ */
+ public static function random_bytes($length, $raw = false)
+ {
+ // Use PHP7 true random generator
+ if (function_exists('random_bytes')) {
+ // random_bytes() can throw an Error/TypeError/Exception in some cases
+ try {
+ $random = random_bytes($length);
+ }
+ catch (Throwable $e) {}
+ }
+
+ if (!$random) {
+ $random = openssl_random_pseudo_bytes($length);
+ }
+
+ if ($raw) {
+ return $random;
+ }
+
+ $random = self::bin2ascii($random);
+
+ // truncate to the specified size...
+ if ($length < strlen($random)) {
+ $random = substr($random, 0, $length);
+ }
+
+ return $random;
+ }
+
+ /**
+ * Convert binary data into readable form (containing a-zA-Z0-9 characters)
+ *
+ * @param string $input Binary input
+ *
+ * @return string Readable output
+ */
+ public static function bin2ascii($input)
+ {
+ // Above method returns "hexits".
+ // Based on bin_to_readable() function in ext/session/session.c.
+ // Note: removed ",-" characters from hextab
+ $hextab = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ $nbits = 6; // can be 4, 5 or 6
+ $length = strlen($input);
+ $result = '';
+ $char = 0;
+ $i = 0;
+ $have = 0;
+ $mask = (1 << $nbits) - 1;
+
+ while (true) {
+ if ($have < $nbits) {
+ if ($i < $length) {
+ $char |= ord($input[$i++]) << $have;
+ $have += 8;
+ }
+ else if (!$have) {
+ break;
+ }
+ else {
+ $have = $nbits;
+ }
+ }
+
+ // consume nbits
+ $result .= $hextab[$char & $mask];
+ $char >>= $nbits;
+ $have -= $nbits;
+ }
+
+ return $result;
+ }
+
+ /**
+ * Format current date according to specified format.
+ * This method supports microseconds (u).
+ *
+ * @param string $format Date format (default: 'd-M-Y H:i:s O')
+ *
+ * @return string Formatted date
+ */
+ public static function date_format($format = null)
+ {
+ if (empty($format)) {
+ $format = 'd-M-Y H:i:s O';
+ }
+
+ if (strpos($format, 'u') !== false
+ && ($date = date_create_from_format('U.u.e', microtime(true) . '.' . date_default_timezone_get()))
+ ) {
+ return $date->format($format);
+ }
+
+ return date($format);
+ }
}
--
Gitblit v1.9.1