From 1e3271edfb56f670b741139752041759a0feb448 Mon Sep 17 00:00:00 2001
From: alecpl <alec@alec.pl>
Date: Wed, 10 Feb 2010 09:17:45 -0500
Subject: [PATCH] - support more charset aliases

---
 program/include/main.inc |   20 +++++++++++++++++---
 1 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/program/include/main.inc b/program/include/main.inc
index f09db8c..f81e95b 100644
--- a/program/include/main.inc
+++ b/program/include/main.inc
@@ -332,7 +332,7 @@
   $charset = preg_replace(array(
     '/^[^0-9A-Z]+/',	// e.g. _ISO-8859-JP$SIO
     '/\$.*$/',		// e.g. _ISO-8859-JP$SIO
-    '/UNICODE-1-1-/',	// RFC1642
+    '/UNICODE-1-1-*/',	// RFC1641/1642
     ), '', $charset);
 
   # Aliases: some of them from HTML5 spec.
@@ -352,15 +352,24 @@
     'ISO88599'	    => 'WINDOWS-1254',
     'ISO885911'	    => 'WINDOWS-874',
     'MACROMAN'	    => 'MACINTOSH',
+    '238'           => 'WINDOWS-1250',
+    '178'           => 'WINDOWS-1256',
+    '177'           => 'WINDOWS-1255',
+    '204'           => 'WINDOWS-1251',
+    '161'           => 'WINDOWS-1253',
+    '222'           => 'WINDOWS-874',
+    '134'           => 'GBK',
+    '238'           => 'WINDOWS-1250',
+    '128'           => 'SHIFT-JIS'
   );
 
   // allow a-z and 0-9 only and remove X- prefix (e.g. X-ROMAN8 => ROMAN8)
-  $str = preg_replace(array('/[^a-z0-9]/i', '/^x+/i'), '', $charset);
+  $str = preg_replace(array('/[^A-Z0-9]/', '/^X+/'), '', $charset);
 
   if (isset($aliases[$str]))
     return $aliases[$str];
 
-  if (preg_match('/UTF(7|8|16|32)(BE|LE)*/', $str, $m))
+  if (preg_match('/U[A-Z][A-Z](7|8|16|32)(BE|LE)*/', $str, $m))
     return 'UTF-' . $m[1] . $m[2];
 
   if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {
@@ -370,6 +379,11 @@
     return $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;
     }
 
+  // handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE
+  if (preg_match('/WINDOWS([0-9]+)/', $str, $m)) {
+    return 'WINDOWS-' . $m[1];
+    }
+
   return $charset;
   }
 

--
Gitblit v1.9.1