Finished flattening D1/D2 differences. Except for one or two lines, all differences...

[software/druntime.git] / src / compiler / dmd / util / utf.d
diff --git a/src/compiler/dmd/util/utf.d b/src/compiler/dmd/util/utf.d

index 84ace7c7d0c701d607465b7c5400e509dc381032..50d4a51972eed92dda777cd47d1758b18630cb7c 100644 (file)
--- a/src/compiler/dmd/util/utf.d
+++ b/src/compiler/dmd/util/utf.d
@@ -1,4 +1,4 @@
-// utf.d
+// Written in the D programming language
  
  /*
   *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
  
  /*
   *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
@@ -22,16 +22,38 @@
   *     distribution.
   */
  
   *     distribution.
   */
  
-// Description of UTF-8 at:
-// http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
-// http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
-
+/********************************************
+ * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
+ *
+ * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
+ * wchar type.
+ * For linux systems, the C wchar_t type is UTF-32 and corresponds to
+ * the D utf.dchar type.
+ *
+ * UTF character support is restricted to (\u0000 &lt;= character &lt;= \U0010FFFF).
+ *
+ * See_Also:
+ *     $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
+ *     $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
+ *     $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
+ * Macros:
+ *     WIKI = Phobos/StdUtf
+ */
  
  module rt.util.utf;
  
  
  
  module rt.util.utf;
  
  
-extern (C) void onUnicodeError( char[] msg, size_t idx );
+extern (C) void onUnicodeError( string msg, size_t idx );
  
  
+/*******************************
+ * Test if c is a valid UTF-32 character.
+ *
+ * \uFFFE and \uFFFF are considered valid by this function,
+ * as they are permitted for internal use by an application,
+ * but they are not allowed for interchange by the Unicode standard.
+ *
+ * Returns: true if it is, false if not.
+ */
  
  bool isValidDchar(dchar c)
  {
  
  bool isValidDchar(dchar c)
  {
@@ -53,14 +75,10 @@ unittest
  }
  
  
  }
  
  
-/* This array gives the length of a UTF-8 sequence indexed by the value
- * of the leading byte. An FF represents an illegal starting value of
- * a UTF-8 sequence.
- * FF is used instead of 0 to avoid having loops hang.
- */
  
  
-ubyte[256] UTF8stride =
+auto UTF8stride =
  [
  [
+    cast(ubyte)
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -79,77 +97,89 @@ ubyte[256] UTF8stride =
      4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
  ];
  
      4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
  ];
  
-uint stride(char[] s, size_t i)
+/**
+ * stride() returns the length of a UTF-8 sequence starting at index i
+ * in string s.
+ * Returns:
+ *     The number of bytes in the UTF-8 sequence or
+ *     0xFF meaning s[i] is not the start of of UTF-8 sequence.
+ */
+uint stride(in char[] s, size_t i)
  {
      return UTF8stride[s[i]];
  }
  
  {
      return UTF8stride[s[i]];
  }
  
-uint stride(wchar[] s, size_t i)
+/**
+ * stride() returns the length of a UTF-16 sequence starting at index i
+ * in string s.
+ */
+uint stride(in wchar[] s, size_t i)
  {   uint u = s[i];
      return 1 + (u >= 0xD800 && u <= 0xDBFF);
  }
  
  {   uint u = s[i];
      return 1 + (u >= 0xD800 && u <= 0xDBFF);
  }
  
-uint stride(dchar[] s, size_t i)
+/**
+ * stride() returns the length of a UTF-32 sequence starting at index i
+ * in string s.
+ * Returns: The return value will always be 1.
+ */
+uint stride(in dchar[] s, size_t i)
  {
      return 1;
  }
  
  /*******************************************
  {
      return 1;
  }
  
  /*******************************************
- * Given an index into an array of char's,
- * and assuming that index is at the start of a UTF character,
- * determine the number of UCS characters up to that index.
+ * Given an index i into an array of characters s[],
+ * and assuming that index i is at the start of a UTF character,
+ * determine the number of UCS characters up to that index i.
   */
  
   */
  
-size_t toUCSindex(char[] s, size_t i)
+size_t toUCSindex(in char[] s, size_t i)
  {
      size_t n;
      size_t j;
  {
      size_t n;
      size_t j;
-    size_t stride;
  
  
-    for (j = 0; j < i; j += stride)
+    for (j = 0; j < i; )
      {
      {
-       stride = UTF8stride[s[j]];
-       if (stride == 0xFF)
-           goto Lerr;
+       j += stride(s, j);
         n++;
      }
      if (j > i)
      {
         n++;
      }
      if (j > i)
      {
-      Lerr:
-      onUnicodeError("invalid UTF-8 sequence", j);
+        onUnicodeError("invalid UTF-8 sequence", j);
      }
      return n;
  }
  
      }
      return n;
  }
  
-size_t toUCSindex(wchar[] s, size_t i)
+/** ditto */
+size_t toUCSindex(in wchar[] s, size_t i)
  {
      size_t n;
      size_t j;
  
      for (j = 0; j < i; )
  {
      size_t n;
      size_t j;
  
      for (j = 0; j < i; )
-    {  uint u = s[j];
-
-       j += 1 + (u >= 0xD800 && u <= 0xDBFF);
+    {
+       j += stride(s, j);
         n++;
      }
      if (j > i)
      {
         n++;
      }
      if (j > i)
      {
-      Lerr:
-      onUnicodeError("invalid UTF-16 sequence", j);
+        onUnicodeError("invalid UTF-16 sequence", j);
      }
      return n;
  }
  
      }
      return n;
  }
  
-size_t toUCSindex(dchar[] s, size_t i)
+/** ditto */
+size_t toUCSindex(in dchar[] s, size_t i)
  {
      return i;
  }
  
  /******************************************
  {
      return i;
  }
  
  /******************************************
- * Given a UCS index into an array of characters, return the UTF index.
+ * Given a UCS index n into an array of characters s[], return the UTF index.
   */
  
   */
  
-size_t toUTFindex(char[] s, size_t n)
+size_t toUTFindex(in char[] s, size_t n)
  {
      size_t i;
  
  {
      size_t i;
  
@@ -163,7 +193,8 @@ size_t toUTFindex(char[] s, size_t n)
      return i;
  }
  
      return i;
  }
  
-size_t toUTFindex(wchar[] s, size_t n)
+/** ditto */
+size_t toUTFindex(in wchar[] s, size_t n)
  {
      size_t i;
  
  {
      size_t i;
  
@@ -175,14 +206,20 @@ size_t toUTFindex(wchar[] s, size_t n)
      return i;
  }
  
      return i;
  }
  
-size_t toUTFindex(dchar[] s, size_t n)
+/** ditto */
+size_t toUTFindex(in dchar[] s, size_t n)
  {
      return n;
  }
  
  /* =================== Decode ======================= */
  
  {
      return n;
  }
  
  /* =================== Decode ======================= */
  
-dchar decode(char[] s, inout size_t idx)
+/***************
+ * Decodes and returns character starting at s[idx]. idx is advanced past the
+ * decoded character. If the character is not well formed, a UtfException is
+ * thrown and idx remains unchanged.
+ */
+dchar decode(in char[] s, inout size_t idx)
      in
      {
         assert(idx >= 0 && idx < s.length);
      in
      {
         assert(idx >= 0 && idx < s.length);
@@ -275,7 +312,7 @@ unittest
  
      debug(utf) printf("utf.decode.unittest\n");
  
  
      debug(utf) printf("utf.decode.unittest\n");
  
-    static char[] s1 = "abcd";
+    static s1 = "abcd"c;
      i = 0;
      c = decode(s1, i);
      assert(c == cast(dchar)'a');
      i = 0;
      c = decode(s1, i);
      assert(c == cast(dchar)'a');
@@ -284,20 +321,20 @@ unittest
      assert(c == cast(dchar)'b');
      assert(i == 2);
  
      assert(c == cast(dchar)'b');
      assert(i == 2);
  
-    static char[] s2 = "\xC2\xA9";
+    static s2 = "\xC2\xA9"c;
      i = 0;
      c = decode(s2, i);
      assert(c == cast(dchar)'\u00A9');
      assert(i == 2);
  
      i = 0;
      c = decode(s2, i);
      assert(c == cast(dchar)'\u00A9');
      assert(i == 2);
  
-    static char[] s3 = "\xE2\x89\xA0";
+    static s3 = "\xE2\x89\xA0"c;
      i = 0;
      c = decode(s3, i);
      assert(c == cast(dchar)'\u2260');
      assert(i == 3);
  
      i = 0;
      c = decode(s3, i);
      assert(c == cast(dchar)'\u2260');
      assert(i == 3);
  
-    static char[][] s4 =
-    [  "\xE2\x89",             // too short
+    static s4 =
+    [  "\xE2\x89"c,            // too short
         "\xC0\x8A",
         "\xE0\x80\x8A",
         "\xF0\x80\x80\x8A",
         "\xC0\x8A",
         "\xE0\x80\x8A",
         "\xF0\x80\x80\x8A",
@@ -321,9 +358,9 @@ unittest
      }
  }
  
      }
  }
  
-/********************************************************/
+/** ditto */
  
  
-dchar decode(wchar[] s, inout size_t idx)
+dchar decode(in wchar[] s, inout size_t idx)
      in
      {
         assert(idx >= 0 && idx < s.length);
      in
      {
         assert(idx >= 0 && idx < s.length);
@@ -334,7 +371,7 @@ dchar decode(wchar[] s, inout size_t idx)
      }
      body
      {
      }
      body
      {
-       char[] msg;
+       string msg;
         dchar V;
         size_t i = idx;
         uint u = s[i];
         dchar V;
         size_t i = idx;
         uint u = s[i];
@@ -379,9 +416,9 @@ dchar decode(wchar[] s, inout size_t idx)
         return cast(dchar)u; // dummy return
      }
  
         return cast(dchar)u; // dummy return
      }
  
-/********************************************************/
+/** ditto */
  
  
-dchar decode(dchar[] s, inout size_t idx)
+dchar decode(in dchar[] s, inout size_t idx)
      in
      {
         assert(idx >= 0 && idx < s.length);
      in
      {
         assert(idx >= 0 && idx < s.length);
@@ -404,6 +441,9 @@ dchar decode(dchar[] s, inout size_t idx)
  
  /* =================== Encode ======================= */
  
  
  /* =================== Encode ======================= */
  
+/*******************************
+ * Encodes character c and appends it to array s[].
+ */
  void encode(inout char[] s, dchar c)
      in
      {
  void encode(inout char[] s, dchar c)
      in
      {
@@ -456,7 +496,7 @@ unittest
  {
      debug(utf) printf("utf.encode.unittest\n");
  
  {
      debug(utf) printf("utf.encode.unittest\n");
  
-    char[] s = "abcd";
+    char[] s = "abcd".dup;
      encode(s, cast(dchar)'a');
      assert(s.length == 5);
      assert(s == "abcda");
      encode(s, cast(dchar)'a');
      assert(s.length == 5);
      assert(s == "abcda");
@@ -471,7 +511,7 @@ unittest
      assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
  }
  
      assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
  }
  
-/********************************************************/
+/** ditto */
  
  void encode(inout wchar[] s, dchar c)
      in
  
  void encode(inout wchar[] s, dchar c)
      in
@@ -497,6 +537,7 @@ void encode(inout wchar[] s, dchar c)
         s = r;
      }
  
         s = r;
      }
  
+/** ditto */
  void encode(inout dchar[] s, dchar c)
      in
      {
  void encode(inout dchar[] s, dchar c)
      in
      {
@@ -507,36 +548,46 @@ void encode(inout dchar[] s, dchar c)
         s ~= c;
      }
  
         s ~= c;
      }
  
-/* =================== Validation ======================= */
+/**
+Returns the code length of $(D c) in the encoding using $(D C) as a
+code point. The code is returned in character count, not in bytes.
+ */
  
  
-void validate(char[] s)
+ubyte codeLength(C)(dchar c)
  {
  {
-    size_t len = s.length;
-    size_t i;
  
  
-    for (i = 0; i < len; )
+    static if (C.sizeof == 1)
      {
      {
-       decode(s, i);
-    }
+        return
+            c <= 0x7F ? 1
+            : c <= 0x7FF ? 2
+            : c <= 0xFFFF ? 3
+            : c <= 0x10FFFF ? 4
+            : (assert(false), 6);
  }
  
  }
  
-void validate(wchar[] s)
+    else static if (C.sizeof == 2)
  {
  {
-    size_t len = s.length;
-    size_t i;
-
-    for (i = 0; i < len; )
+       return c <= 0xFFFF ? 1 : 2;
+    }
+    else
      {
      {
-       decode(s, i);
+        static assert(C.sizeof == 4);
+        return 1;
      }
  }
  
      }
  }
  
-void validate(dchar[] s)
-{
-    size_t len = s.length;
-    size_t i;
+/* =================== Validation ======================= */
  
  
-    for (i = 0; i < len; )
+/***********************************
+Checks to see if string is well formed or not. $(D S) can be an array
+ of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
+ if it is not. Use to check all untrusted input for correctness.
+ */
+void validate(S)(in S s)
+{
+    invariant len = s.length;
+    for (size_t i = 0; i < len; )
      {
         decode(s, i);
      }
      {
         decode(s, i);
      }
@@ -580,7 +631,10 @@ char[] toUTF8(char[4] buf, dchar c)
         assert(0);
      }
  
         assert(0);
      }
  
-char[] toUTF8(char[] s)
+/*******************
+ * Encodes string s into UTF-8 and returns the encoded string.
+ */
+string toUTF8(string s)
      in
      {
         validate(s);
      in
      {
         validate(s);
@@ -590,7 +644,8 @@ char[] toUTF8(char[] s)
         return s;
      }
  
         return s;
      }
  
-char[] toUTF8(wchar[] s)
+/** ditto */
+string toUTF8(in wchar[] s)
  {
      char[] r;
      size_t i;
  {
      char[] r;
      size_t i;
@@ -613,10 +668,11 @@ char[] toUTF8(wchar[] s)
             break;
         }
      }
             break;
         }
      }
-    return r;
+    return cast(string)r;
  }
  
  }
  
-char[] toUTF8(dchar[] s)
+/** ditto */
+string toUTF8(in dchar[] s)
  {
      char[] r;
      size_t i;
  {
      char[] r;
      size_t i;
@@ -639,7 +695,7 @@ char[] toUTF8(dchar[] s)
             break;
         }
      }
             break;
         }
      }
-    return r;
+    return cast(string)r;
  }
  
  /* =================== Conversion to UTF16 ======================= */
  }
  
  /* =================== Conversion to UTF16 ======================= */
@@ -664,7 +720,12 @@ wchar[] toUTF16(wchar[2] buf, dchar c)
         }
      }
  
         }
      }
  
-wchar[] toUTF16(char[] s)
+/****************
+ * Encodes string s into UTF-16 and returns the encoded string.
+ * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
+ * an LPWSTR or LPCWSTR argument.
+ */
+wstring toUTF16(in char[] s)
  {
      wchar[] r;
      size_t slen = s.length;
  {
      wchar[] r;
      size_t slen = s.length;
@@ -685,10 +746,12 @@ wchar[] toUTF16(char[] s)
             encode(r, c);
         }
      }
             encode(r, c);
         }
      }
-    return r;
+    return cast(wstring)r;
  }
  
  }
  
-wchar* toUTF16z(char[] s)
+alias const(wchar)* wptr;
+/** ditto */
+wptr toUTF16z(in char[] s)
  {
      wchar[] r;
      size_t slen = s.length;
  {
      wchar[] r;
      size_t slen = s.length;
@@ -713,7 +776,8 @@ wchar* toUTF16z(char[] s)
      return r.ptr;
  }
  
      return r.ptr;
  }
  
-wchar[] toUTF16(wchar[] s)
+/** ditto */
+wstring toUTF16(wstring s)
      in
      {
         validate(s);
      in
      {
         validate(s);
@@ -723,7 +787,8 @@ wchar[] toUTF16(wchar[] s)
         return s;
      }
  
         return s;
      }
  
-wchar[] toUTF16(dchar[] s)
+/** ditto */
+wstring toUTF16(in dchar[] s)
  {
      wchar[] r;
      size_t slen = s.length;
  {
      wchar[] r;
      size_t slen = s.length;
@@ -734,12 +799,15 @@ wchar[] toUTF16(dchar[] s)
      {
         encode(r, s[i]);
      }
      {
         encode(r, s[i]);
      }
-    return r;
+    return cast(wstring)r;
  }
  
  /* =================== Conversion to UTF32 ======================= */
  
  }
  
  /* =================== Conversion to UTF32 ======================= */
  
-dchar[] toUTF32(char[] s)
+/*****
+ * Encodes string s into UTF-32 and returns the encoded string.
+ */
+dstring toUTF32(in char[] s)
  {
      dchar[] r;
      size_t slen = s.length;
  {
      dchar[] r;
      size_t slen = s.length;
@@ -755,10 +823,11 @@ dchar[] toUTF32(char[] s)
             i++;                // c is ascii, no need for decode
         r[j++] = c;
      }
             i++;                // c is ascii, no need for decode
         r[j++] = c;
      }
-    return r[0 .. j];
+    return cast(dstring)r[0 .. j];
  }
  
  }
  
-dchar[] toUTF32(wchar[] s)
+/** ditto */
+dstring toUTF32(in wchar[] s)
  {
      dchar[] r;
      size_t slen = s.length;
  {
      dchar[] r;
      size_t slen = s.length;
@@ -774,10 +843,11 @@ dchar[] toUTF32(wchar[] s)
             i++;                // c is ascii, no need for decode
         r[j++] = c;
      }
             i++;                // c is ascii, no need for decode
         r[j++] = c;
      }
-    return r[0 .. j];
+    return cast(dstring)r[0 .. j];
  }
  
  }
  
-dchar[] toUTF32(dchar[] s)
+/** ditto */
+dstring toUTF32(dstring s)
      in
      {
         validate(s);
      in
      {
         validate(s);
@@ -793,14 +863,10 @@ unittest
  {
      debug(utf) printf("utf.toUTF.unittest\n");
  
  {
      debug(utf) printf("utf.toUTF.unittest\n");
  
-    char[] c;
-    wchar[] w;
-    dchar[] d;
-
-    c = "hello";
-    w = toUTF16(c);
+    auto c = "hello"c;
+    auto w = toUTF16(c);
      assert(w == "hello");
      assert(w == "hello");
-    d = toUTF32(c);
+    auto d = toUTF32(c);
      assert(d == "hello");
  
      c = toUTF8(w);
      assert(d == "hello");
  
      c = toUTF8(w);
@@ -834,7 +900,7 @@ unittest
      c = "he\U0010AAAAllo";
      w = toUTF16(c);
      //foreach (wchar c; w) printf("c = x%x\n", c);
      c = "he\U0010AAAAllo";
      w = toUTF16(c);
      //foreach (wchar c; w) printf("c = x%x\n", c);
-    //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c);
+    //foreach (wchar c; cast(wstring)"he\U0010AAAAllo") printf("c = x%x\n", c);
      assert(w == "he\U0010AAAAllo");
      d = toUTF32(c);
      assert(d == "he\U0010AAAAllo");
      assert(w == "he\U0010AAAAllo");
      d = toUTF32(c);
      assert(d == "he\U0010AAAAllo");