[MLton-commit] r4253

Wesley Terpstra MLton@mlton.org
Thu, 24 Nov 2005 07:45:22 -0800


The converter signature and beginnings of converter implementation.
----------------------------------------------------------------------

A   mlton/branches/unicode/basis-library/i18n/
A   mlton/branches/unicode/basis-library/i18n/converter.sig
A   mlton/branches/unicode/basis-library/i18n/converter.sml
A   mlton/branches/unicode/basis-library/i18n.mlb

----------------------------------------------------------------------

Added: mlton/branches/unicode/basis-library/i18n/converter.sig
===================================================================
--- mlton/branches/unicode/basis-library/i18n/converter.sig	2005-11-24 01:08:59 UTC (rev 4252)
+++ mlton/branches/unicode/basis-library/i18n/converter.sig	2005-11-24 15:45:21 UTC (rev 4253)
@@ -0,0 +1,70 @@
+signature CHARSET_CONVERTER =
+   sig
+      structure Encoding:
+         sig
+            type t
+
+            val equals: t * t -> bool
+            
+            val fromName: string -> t option
+            val toName: t -> string
+            
+            (* also needed for bare minimum support (in order of usefulness):
+            val punycode: t
+            val utf7: t
+            val gb18030: t
+            val cesu8: t
+            val scsu: t
+            *)
+            val utf8: t
+            (* the "be" and "le" control endian in the absense of FFFE *)
+            val utf16be: t
+            val utf16le: t
+            val utf32be: t
+            val utf32le: t
+         end
+      
+      (* Unfortunately, unlike all of the StringCvt methods provided in the
+       * basis, charset encodings can be stateful. For example, consider a
+       * fictituous charset consisting of letters A-Z and a 'uppercase'
+       * and 'lowercase' char (u & l respectively). Then "BlBBuCB" = "BbbCB".
+       * For this reason, decoders need to keep a state in addition to the
+       * stream position. Encoders also need to be 'flush'ed at the end of
+       * encoding to restore a stateful output stream to the initial state.
+       *)
+      type state
+      val embed: unit -> ('a -> state) * (state -> 'a option)
+      
+      type 'a decoder = {
+         initial: state,
+         decoder:  (Word8.word, 'a) reader -> (WideChar.char, 'a * state) reader
+      }
+      val decoder: Encoding.t -> 'a decoder
+      
+      (* The encoder will only write up to the first unicode character which
+       * cannot be represented in the output charset.
+       *)
+      type encoder = {
+         initial: state * Word8Vector.vector,
+         encoder: state * WideSubtring.substring -> 
+                  state * WideSubstring.substring * Word8Vector.vector,
+         flush:   state -> Word8Vector.vector
+         }
+      val encoder: Encoding.t -> encoder
+      
+      (* Convenience functions *)
+      val decode: Encoding.t * Word8VectorSlice.vector_slice -> WideString.string option
+      val encode: Encoding.t * WideSubstring.substring -> Word8Vector.vector option
+      
+      (* The register method allows you to add support for new encodings.
+       * The name is used case insensitively.
+       * The decoder has concrete type "state decoder" to work around SML's
+       * lack of higher order types. However, you must not peek inside it.
+       *)
+      type user_coder = {
+         name: string,
+         decoder: state decoder,
+         encoder: encoder
+         }
+      val register: user_coder -> unit
+   end

Added: mlton/branches/unicode/basis-library/i18n/converter.sml
===================================================================
--- mlton/branches/unicode/basis-library/i18n/converter.sml	2005-11-24 01:08:59 UTC (rev 4252)
+++ mlton/branches/unicode/basis-library/i18n/converter.sml	2005-11-24 15:45:21 UTC (rev 4253)
@@ -0,0 +1,94 @@
+structure CharsetConverter :> CHARSET_CONVERTER =
+   struct
+      (* http://mlton.org/UniversalType *)
+      type state = exn
+      fun 'a embed () =
+         let
+            exception E of 'a
+            fun project (e: t): 'a option =
+               case e of
+                  E a => SOME a
+                | _ => NONE
+         in
+            (E, project)
+         end
+      
+      type 'a decoder = {
+         initial: state,
+         decoder:  (Word8.word, 'a) reader -> (WideChar.char, 'a * state) reader
+      }
+      
+      type encoder = {
+         initial: state * Word8Vector.vector,
+         encoder: state * WideSubtring.substring -> 
+                  state * WideSubstring.substring * Word8Vector.vector,
+         flush:   state -> Word8Vector.vector
+         }
+      
+      type user_coder = {
+         name: string,
+         decoder: state decoder,
+         encoder: encoder
+         }
+      
+      val coders : user_coder list ref = ref []
+      fun register x = coders := x :: (!coders)
+      
+      structure Encoding =
+         struct
+            datatype t = 
+               UTF8 | UTF16BE | UTF16LE | UTF32BE | UTF32LE |
+               USER of user_coder
+            
+            val equals = op =
+            
+            fun canonName s =
+               String.translate
+                  (fn #"_" => ""
+                    | #"-" => ""
+                    | x => String.str (Char.toUpper x)) s
+                  
+            fun fromName s = case canonName s of
+               "UTF8" => SOME UTF8
+             | "UCS2" => SOME UTF16LE
+             | "UCS2LE" => SOME UTF16LE
+             | "UCS2BE" => SOME UTF16BE
+             | "UTF16" => SOME UTF16LE (* guess little-endian for now *)
+             | "UTF16LE" => SOME UTF16LE
+             | "UTF16BE" => SOME UTF16BE
+             | "UCS4" => SOME UTF32LE
+             | "UCS4LE" => SOME UTF32LE
+             | "UCS4BE" => SOME UTF32BE
+             | "UTF32" => SOME UTF32LE (* guess little-endian for now *)
+             | "UTF32LE" => SOME UTF32LE
+             | "UTF32BE" => SOME UTF32BE
+             | s => 
+                  case List.find (fn {name, ...} => name = s) (!coders) of
+                     NONE => NONE
+                   | SOME x => SOME (USER x)
+            
+            fun toName UTF8 = "UTF-8"
+              | toName UTF16BE = "UTF-16BE"
+              | toName UTF16LE = "UTF-16LE"
+              | toName UTF32BE = "UTF-32BE"
+              | toName UTF32LE = "UTF-32LE"
+              | toName (User {name, ...}) => name
+            
+            val utf8 = UTF8
+            val utf16be = UTF16BE
+            val utf16le = UTF16LE
+            val utf32be = UTF32BE
+            val utf32le = UTF32LE
+         end
+      
+      fun decode (e, vs) =
+         let
+            val { initial, decoder } = decoder e
+            fun get vs =
+               if Word8VectorSlice.length vs = 0 then NONE else
+               SOME (Word8VectorSlice.sub (vs, 0), 
+                     Word8VectorSlice.subslice (vs, 1, NONE))
+         in
+            ()
+         end
+   end

Added: mlton/branches/unicode/basis-library/i18n.mlb
===================================================================
--- mlton/branches/unicode/basis-library/i18n.mlb	2005-11-24 01:08:59 UTC (rev 4252)
+++ mlton/branches/unicode/basis-library/i18n.mlb	2005-11-24 15:45:21 UTC (rev 4253)
@@ -0,0 +1,50 @@
+(* Copyright (C) 2004-2005 Henry Cejtin, Matthew Fluet, Suresh
+ *    Jagannathan, and Stephen Weeks.
+ *
+ * MLton is released under a BSD-style license.
+ * See the file MLton-LICENSE for details.
+ *)
+
+ann  
+   "deadCode true"
+   "sequenceNonUnit warn"
+   "nonexhaustiveMatch warn" "redundantMatch warn"
+   "warnUnused true" "forceUsed"
+in
+   local
+      libs/basis-extra/basis-extra.mlb
+   in
+      structure Char4 : CHAR
+      structure Char4Array : MONO_ARRAY
+      structure Char4Array2 : MONO_ARRAY2
+      structure Char4ArraySlice : MONO_ARRAY_SLICE
+      structure Char4Vector : MONO_VECTOR
+      structure Char4VectorSlice : MONO_VECTOR_SLICE
+      structure String4 : STRING
+      structure Substring4 : SUBSTRING
+      structure Text4 : TEXT
+
+      structure Char2 : CHAR
+      structure Char2Array : MONO_ARRAY
+      structure Char2Array2 : MONO_ARRAY2
+      structure Char2ArraySlice : MONO_ARRAY_SLICE
+      structure Char2Vector : MONO_VECTOR
+      structure Char2VectorSlice : MONO_VECTOR_SLICE
+      structure String2 : STRING
+      structure Substring2 : SUBSTRING
+      structure Text2 : TEXT
+
+      structure Char1 : CHAR
+      structure Char1Array : MONO_ARRAY
+      structure Char1Array2 : MONO_ARRAY2
+      structure Char1ArraySlice : MONO_ARRAY_SLICE
+      structure Char1Vector : MONO_VECTOR
+      structure Char1VectorSlice : MONO_VECTOR_SLICE
+      structure String1 : STRING
+      structure Substring1 : SUBSTRING
+      structure Text1 : TEXT
+      
+      signature CHARSET_CONVERTER
+      structure CharsetConverter : CHARSET_CONVERTER
+   end
+end