From 5570ad60b496974607790dab49fc80cce8f5c700 Mon Sep 17 00:00:00 2001
From: thomascube <thomas@roundcube.net>
Date: Tue, 26 Jan 2010 02:19:56 -0500
Subject: [PATCH] Improved charset detection in vcard import + added unit tests for it

---
 program/include/rcube_vcard.php |   64 ++++++++++++++++++++++---------
 1 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/program/include/rcube_vcard.php b/program/include/rcube_vcard.php
index dde14c5..320607b 100644
--- a/program/include/rcube_vcard.php
+++ b/program/include/rcube_vcard.php
@@ -47,10 +47,10 @@
   /**
    * Constructor
    */
-  public function __construct($vcard = null)
+  public function __construct($vcard = null, $charset = RCMAIL_CHARSET)
   {
     if (!empty($vcard))
-      $this->load($vcard);
+      $this->load($vcard, $charset);
   }
 
 
@@ -59,18 +59,22 @@
    *
    * @param string vCard string to parse
    */
-  public function load($vcard)
+  public function load($vcard, $charset = RCMAIL_CHARSET)
   {
     $this->raw = self::vcard_decode($vcard);
+    
+    // resolve charset parameters
+    if ($charset == null)
+      $this->raw = $this->charset_convert($this->raw);
 
     // find well-known address fields
-    $this->displayname = $this->raw['FN'][0];
+    $this->displayname = $this->raw['FN'][0][0];
     $this->surname = $this->raw['N'][0][0];
     $this->firstname = $this->raw['N'][0][1];
     $this->middlename = $this->raw['N'][0][2];
-    $this->nickname = $this->raw['NICKNAME'][0];
-    $this->organization = $this->raw['ORG'][0];
-    $this->business = ($this->raw['X-ABShowAs'][0] == 'COMPANY') || (join('', (array)$this->raw['N'][0]) == '' && !empty($this->organization));
+    $this->nickname = $this->raw['NICKNAME'][0][0];
+    $this->organization = $this->raw['ORG'][0][0];
+    $this->business = ($this->raw['X-ABSHOWAS'][0][0] == 'COMPANY') || (join('', (array)$this->raw['N'][0]) == '' && !empty($this->organization));
     
     foreach ((array)$this->raw['EMAIL'] as $i => $raw_email)
       $this->email[$i] = is_array($raw_email) ? $raw_email[0] : $raw_email;
@@ -106,7 +110,7 @@
     switch ($field) {
       case 'name':
       case 'displayname':
-        $this->raw['FN'][0] = $value;
+        $this->raw['FN'][0][0] = $value;
         break;
         
       case 'firstname':
@@ -118,11 +122,11 @@
         break;
       
       case 'nickname':
-        $this->raw['NICKNAME'][0] = $value;
+        $this->raw['NICKNAME'][0][0] = $value;
         break;
         
       case 'organization':
-        $this->raw['ORG'][0] = $value;
+        $this->raw['ORG'][0][0] = $value;
         break;
         
       case 'email':
@@ -156,6 +160,28 @@
     
     return $result;
   }
+  
+  
+  /**
+   * Convert a whole vcard (array) to UTF-8; only values of entries with a charset parameter are converted.
+   * @param array Parsed vcard data (field name => list of entries)
+   * @return array Same structure with values converted and the consumed charset parameters removed
+   */
+  private function charset_convert($card)
+  {
+    foreach ($card as $key => $node) {
+      foreach ($node as $i => $subnode) {
+        if (is_array($subnode) && !empty($subnode['charset'][0])) {  // !empty(): no notice when charset is absent
+          foreach ($subnode as $j => $value) {
+            if (is_numeric($j) && is_string($value))  // numeric keys hold values, named keys hold parameters
+              $card[$key][$i][$j] = rcube_charset_convert($value, $subnode['charset'][0]);
+          }
+          unset($card[$key][$i]['charset']);
+        }
+      }
+    }
+    return $card;
+  }
 
 
   /**
@@ -168,11 +194,14 @@
   {
     $out = array();
 
+    // check if charsets are specified (usually vcard version < 3.0 but this is not reliable)
+    if (preg_match('/charset=/i', substr($data, 0, 2048)))
+      $charset = null;
     // detect charset and convert to utf-8
-    $encoding = self::detect_encoding($data);
-    if ($encoding && $encoding != RCMAIL_CHARSET) {
-      $data = rcube_charset_convert($data, $encoding);
+    else if (($charset = self::detect_encoding($data)) && $charset != RCMAIL_CHARSET) {
+      $data = rcube_charset_convert($data, $charset);
       $data = preg_replace(array('/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'), '', $data); // also remove BOM
+      $charset = RCMAIL_CHARSET;
     }
 
     $vcard_block = '';
@@ -184,7 +213,7 @@
 
       if (trim($line) == 'END:VCARD') {
         // parse vcard
-        $obj = new rcube_vcard(self::cleanup($vcard_block));
+        $obj = new rcube_vcard(self::cleanup($vcard_block), $charset);
         if (!empty($obj->displayname))
           $out[] = $obj;
 
@@ -217,9 +246,6 @@
     // Remove cruft like item1.X-AB*, item1.ADR instead of ADR, and empty lines
     $vcard = preg_replace(array('/^item\d*\.X-AB.*$/m', '/^item\d*\./m', "/\n+/"), array('', '', "\n"), $vcard);
 
-    // remove vcard 2.1 charset definitions
-    $vcard = preg_replace('/;CHARSET=[^:;]+/', '', $vcard);
-    
     // if N doesn't have any semicolons, add some 
     $vcard = preg_replace('/^(N:[^;\R]*)$/m', '\1;;;;', $vcard);
 
@@ -269,7 +295,7 @@
 
         foreach($regs2[1] as $attrid => $attr) {
           if ((list($key, $value) = explode('=', $attr)) && $value) {
-	    $value = trim($value);
+            $value = trim($value);
             if ($key == 'ENCODING') {
               // add next line(s) to value string if QP line end detected
               while ($value == 'QUOTED-PRINTABLE' && preg_match('/=$/', $lines[$i]))
@@ -286,7 +312,7 @@
         }
 
         $entry = array_merge($entry, (array)self::vcard_unquote($line[2]));
-        $data[$field][] = count($entry) > 1 ? $entry : $entry[0];
+        $data[$field][] = $entry;
       }
     }
 

--
Gitblit v1.9.1