src/compiler/dmd/util/utf.d

   1 // Written in the D programming language
   2
   3 /*
   4  *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
   5  *  Written by Walter Bright
   6  *
   7  *  This software is provided 'as-is', without any express or implied
   8  *  warranty. In no event will the authors be held liable for any damages
   9  *  arising from the use of this software.
  10  *
  11  *  Permission is granted to anyone to use this software for any purpose,
  12  *  including commercial applications, and to alter it and redistribute it
  13  *  freely, subject to the following restrictions:
  14  *
  15  *  o  The origin of this software must not be misrepresented; you must not
  16  *     claim that you wrote the original software. If you use this software
  17  *     in a product, an acknowledgment in the product documentation would be
  18  *     appreciated but is not required.
  19  *  o  Altered source versions must be plainly marked as such, and must not
  20  *     be misrepresented as being the original software.
  21  *  o  This notice may not be removed or altered from any source
  22  *     distribution.
  23  */
  24
  25 /********************************************
  26  * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
  27  *
  28  * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
  29  * wchar type.
  30  * For linux systems, the C wchar_t type is UTF-32 and corresponds to
  31  * the D utf.dchar type.
  32  *
  33  * UTF character support is restricted to (\u0000 &lt;= character &lt;= \U0010FFFF).
  34  *
  35  * See_Also:
  36  *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
  37  *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
  38  *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
  39  * Macros:
  40  *      WIKI = Phobos/StdUtf
  41  */
  42
  43 module rt.util.utf;
  44
  45
  46 extern (C) void onUnicodeError( string msg, size_t idx );
  47
  48 /*******************************
  49  * Test if c is a valid UTF-32 character.
  50  *
  51  * \uFFFE and \uFFFF are considered valid by this function,
  52  * as they are permitted for internal use by an application,
  53  * but they are not allowed for interchange by the Unicode standard.
  54  *
  55  * Returns: true if it is, false if not.
  56  */
  57
  58 bool isValidDchar(dchar c)
  59 {
  60     /* Note: FFFE and FFFF are specifically permitted by the
  61      * Unicode standard for application internal use, but are not
  62      * allowed for interchange.
  63      * (thanks to Arcane Jill)
  64      */
  65
  66     return c < 0xD800 ||
  67         (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
  68 }
  69
  70 unittest
  71 {
  72     debug(utf) printf("utf.isValidDchar.unittest\n");
  73     assert(isValidDchar(cast(dchar)'a') == true);
  74     assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
  75 }
  76
   77 // Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   78 // the sequence length that byte announces, or 0xFF if it cannot start one.
   79 auto UTF8stride =
   80 [
   81     cast(ubyte)
   82     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x00 .. 0x0F  (0xxxxxxx: one byte)
   83     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x10 .. 0x1F
   84     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x20 .. 0x2F
   85     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x30 .. 0x3F
   86     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x40 .. 0x4F
   87     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x50 .. 0x5F
   88     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x60 .. 0x6F
   89     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x70 .. 0x7F
   90     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,   // 0x80 .. 0x8F  (10xxxxxx: trailing byte)
   91     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,   // 0x90 .. 0x9F
   92     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,   // 0xA0 .. 0xAF
   93     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,   // 0xB0 .. 0xBF
   94     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   // 0xC0 .. 0xCF  (110xxxxx)
   95     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   // 0xD0 .. 0xDF
   96     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,   // 0xE0 .. 0xEF  (1110xxxx)
   97     4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,   // 0xF0 .. 0xFF  (4: F0-F7, 5: F8-FB, 6: FC-FD)
   98 ];
  99
 100 /**
 101  * stride() returns the length of a UTF-8 sequence starting at index i
 102  * in string s.
 103  * Returns:
 104  *      The number of bytes in the UTF-8 sequence or
 105  *      0xFF meaning s[i] is not the start of of UTF-8 sequence.
 106  */
 107 uint stride(in char[] s, size_t i)
 108 {
 109     return UTF8stride[s[i]];
 110 }
 111
 112 /**
 113  * stride() returns the length of a UTF-16 sequence starting at index i
 114  * in string s.
 115  */
 116 uint stride(in wchar[] s, size_t i)
 117 {   uint u = s[i];
 118     return 1 + (u >= 0xD800 && u <= 0xDBFF);
 119 }
 120
 121 /**
 122  * stride() returns the length of a UTF-32 sequence starting at index i
 123  * in string s.
 124  * Returns: The return value will always be 1.
 125  */
 126 uint stride(in dchar[] s, size_t i)
 127 {
 128     return 1;
 129 }
 130
 131 /*******************************************
 132  * Given an index i into an array of characters s[],
 133  * and assuming that index i is at the start of a UTF character,
 134  * determine the number of UCS characters up to that index i.
 135  */
 136
 137 size_t toUCSindex(in char[] s, size_t i)
 138 {
 139     size_t n;
 140     size_t j;
 141
 142     for (j = 0; j < i; )
 143     {
 144         j += stride(s, j);
 145         n++;
 146     }
 147     if (j > i)
 148     {
 149         onUnicodeError("invalid UTF-8 sequence", j);
 150     }
 151     return n;
 152 }
 153
 154 /** ditto */
 155 size_t toUCSindex(in wchar[] s, size_t i)
 156 {
 157     size_t n;
 158     size_t j;
 159
 160     for (j = 0; j < i; )
 161     {
 162         j += stride(s, j);
 163         n++;
 164     }
 165     if (j > i)
 166     {
 167         onUnicodeError("invalid UTF-16 sequence", j);
 168     }
 169     return n;
 170 }
 171
 172 /** ditto */
 173 size_t toUCSindex(in dchar[] s, size_t i)
 174 {
 175     return i;
 176 }
 177
 178 /******************************************
 179  * Given a UCS index n into an array of characters s[], return the UTF index.
 180  */
 181
 182 size_t toUTFindex(in char[] s, size_t n)
 183 {
 184     size_t i;
 185
 186     while (n--)
 187     {
 188         uint j = UTF8stride[s[i]];
 189         if (j == 0xFF)
 190             onUnicodeError("invalid UTF-8 sequence", i);
 191         i += j;
 192     }
 193     return i;
 194 }
 195
 196 /** ditto */
 197 size_t toUTFindex(in wchar[] s, size_t n)
 198 {
 199     size_t i;
 200
 201     while (n--)
 202     {   wchar u = s[i];
 203
 204         i += 1 + (u >= 0xD800 && u <= 0xDBFF);
 205     }
 206     return i;
 207 }
 208
 209 /** ditto */
 210 size_t toUTFindex(in dchar[] s, size_t n)
 211 {
 212     return n;
 213 }
 214
 215 /* =================== Decode ======================= */
 216
 217 /***************
 218  * Decodes and returns character starting at s[idx]. idx is advanced past the
 219  * decoded character. If the character is not well formed, a UtfException is
 220  * thrown and idx remains unchanged.
 221  */
 222 dchar decode(in char[] s, inout size_t idx)
 223     in
 224     {
 225         assert(idx >= 0 && idx < s.length);
 226     }
 227     out (result)
 228     {
 229         assert(isValidDchar(result));
 230     }
 231     body
 232     {
 233         size_t len = s.length;
 234         dchar V;
 235         size_t i = idx;
 236         char u = s[i];
 237
 238         if (u & 0x80)
 239         {   uint n;
 240             char u2;
 241
 242             /* The following encodings are valid, except for the 5 and 6 byte
 243              * combinations:
 244              *  0xxxxxxx
 245              *  110xxxxx 10xxxxxx
 246              *  1110xxxx 10xxxxxx 10xxxxxx
 247              *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 248              *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 249              *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 250              */
 251             for (n = 1; ; n++)
 252             {
 253                 if (n > 4)
 254                     goto Lerr;          // only do the first 4 of 6 encodings
 255                 if (((u << n) & 0x80) == 0)
 256                 {
 257                     if (n == 1)
 258                         goto Lerr;
 259                     break;
 260                 }
 261             }
 262
 263             // Pick off (7 - n) significant bits of B from first byte of octet
 264             V = cast(dchar)(u & ((1 << (7 - n)) - 1));
 265
 266             if (i + (n - 1) >= len)
 267                 goto Lerr;                      // off end of string
 268
 269             /* The following combinations are overlong, and illegal:
 270              *  1100000x (10xxxxxx)
 271              *  11100000 100xxxxx (10xxxxxx)
 272              *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
 273              *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
 274              *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
 275              */
 276             u2 = s[i + 1];
 277             if ((u & 0xFE) == 0xC0 ||
 278                 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
 279                 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
 280                 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
 281                 (u == 0xFC && (u2 & 0xFC) == 0x80))
 282                 goto Lerr;                      // overlong combination
 283
 284             for (uint j = 1; j != n; j++)
 285             {
 286                 u = s[i + j];
 287                 if ((u & 0xC0) != 0x80)
 288                     goto Lerr;                  // trailing bytes are 10xxxxxx
 289                 V = (V << 6) | (u & 0x3F);
 290             }
 291             if (!isValidDchar(V))
 292                 goto Lerr;
 293             i += n;
 294         }
 295         else
 296         {
 297             V = cast(dchar) u;
 298             i++;
 299         }
 300
 301         idx = i;
 302         return V;
 303
 304       Lerr:
 305       onUnicodeError("invalid UTF-8 sequence", i);
 306     return V; // dummy return
 307     }
 308
 309 unittest
 310 {   size_t i;
 311     dchar c;
 312
 313     debug(utf) printf("utf.decode.unittest\n");
 314
 315     static s1 = "abcd"c;
 316     i = 0;
 317     c = decode(s1, i);
 318     assert(c == cast(dchar)'a');
 319     assert(i == 1);
 320     c = decode(s1, i);
 321     assert(c == cast(dchar)'b');
 322     assert(i == 2);
 323
 324     static s2 = "\xC2\xA9"c;
 325     i = 0;
 326     c = decode(s2, i);
 327     assert(c == cast(dchar)'\u00A9');
 328     assert(i == 2);
 329
 330     static s3 = "\xE2\x89\xA0"c;
 331     i = 0;
 332     c = decode(s3, i);
 333     assert(c == cast(dchar)'\u2260');
 334     assert(i == 3);
 335
 336     static s4 =
 337     [   "\xE2\x89"c,            // too short
 338         "\xC0\x8A",
 339         "\xE0\x80\x8A",
 340         "\xF0\x80\x80\x8A",
 341         "\xF8\x80\x80\x80\x8A",
 342         "\xFC\x80\x80\x80\x80\x8A",
 343     ];
 344
 345     for (int j = 0; j < s4.length; j++)
 346     {
 347         try
 348         {
 349             i = 0;
 350             c = decode(s4[j], i);
 351             assert(0);
 352         }
 353         catch (Object o)
 354         {
 355             i = 23;
 356         }
 357         assert(i == 23);
 358     }
 359 }
 360
 361 /** ditto */
 362
 363 dchar decode(in wchar[] s, inout size_t idx)
 364     in
 365     {
 366         assert(idx >= 0 && idx < s.length);
 367     }
 368     out (result)
 369     {
 370         assert(isValidDchar(result));
 371     }
 372     body
 373     {
 374         string msg;
 375         dchar V;
 376         size_t i = idx;
 377         uint u = s[i];
 378
 379         if (u & ~0x7F)
 380         {   if (u >= 0xD800 && u <= 0xDBFF)
 381             {   uint u2;
 382
 383                 if (i + 1 == s.length)
 384                 {   msg = "surrogate UTF-16 high value past end of string";
 385                     goto Lerr;
 386                 }
 387                 u2 = s[i + 1];
 388                 if (u2 < 0xDC00 || u2 > 0xDFFF)
 389                 {   msg = "surrogate UTF-16 low value out of range";
 390                     goto Lerr;
 391                 }
 392                 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
 393                 i += 2;
 394             }
 395             else if (u >= 0xDC00 && u <= 0xDFFF)
 396             {   msg = "unpaired surrogate UTF-16 value";
 397                 goto Lerr;
 398             }
 399             else if (u == 0xFFFE || u == 0xFFFF)
 400             {   msg = "illegal UTF-16 value";
 401                 goto Lerr;
 402             }
 403             else
 404                 i++;
 405         }
 406         else
 407         {
 408             i++;
 409         }
 410
 411         idx = i;
 412         return cast(dchar)u;
 413
 414       Lerr:
 415           onUnicodeError(msg, i);
 416         return cast(dchar)u; // dummy return
 417     }
 418
 419 /** ditto */
 420
 421 dchar decode(in dchar[] s, inout size_t idx)
 422     in
 423     {
 424         assert(idx >= 0 && idx < s.length);
 425     }
 426     body
 427     {
 428         size_t i = idx;
 429         dchar c = s[i];
 430
 431         if (!isValidDchar(c))
 432             goto Lerr;
 433         idx = i + 1;
 434         return c;
 435
 436       Lerr:
 437           onUnicodeError("invalid UTF-32 value", i);
 438         return c; // dummy return
 439     }
 440
 441
 442 /* =================== Encode ======================= */
 443
 444 /*******************************
 445  * Encodes character c and appends it to array s[].
 446  */
 447 void encode(inout char[] s, dchar c)
 448     in
 449     {
 450         assert(isValidDchar(c));
 451     }
 452     body
 453     {
 454         char[] r = s;
 455
 456         if (c <= 0x7F)
 457         {
 458             r ~= cast(char) c;
 459         }
 460         else
 461         {
 462             char[4] buf;
 463             uint L;
 464
 465             if (c <= 0x7FF)
 466             {
 467                 buf[0] = cast(char)(0xC0 | (c >> 6));
 468                 buf[1] = cast(char)(0x80 | (c & 0x3F));
 469                 L = 2;
 470             }
 471             else if (c <= 0xFFFF)
 472             {
 473                 buf[0] = cast(char)(0xE0 | (c >> 12));
 474                 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
 475                 buf[2] = cast(char)(0x80 | (c & 0x3F));
 476                 L = 3;
 477             }
 478             else if (c <= 0x10FFFF)
 479             {
 480                 buf[0] = cast(char)(0xF0 | (c >> 18));
 481                 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
 482                 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
 483                 buf[3] = cast(char)(0x80 | (c & 0x3F));
 484                 L = 4;
 485             }
 486             else
 487             {
 488                 assert(0);
 489             }
 490             r ~= buf[0 .. L];
 491         }
 492         s = r;
 493     }
 494
 495 unittest
 496 {
 497     debug(utf) printf("utf.encode.unittest\n");
 498
 499     char[] s = "abcd".dup;
 500     encode(s, cast(dchar)'a');
 501     assert(s.length == 5);
 502     assert(s == "abcda");
 503
 504     encode(s, cast(dchar)'\u00A9');
 505     assert(s.length == 7);
 506     assert(s == "abcda\xC2\xA9");
 507     //assert(s == "abcda\u00A9");       // BUG: fix compiler
 508
 509     encode(s, cast(dchar)'\u2260');
 510     assert(s.length == 10);
 511     assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
 512 }
 513
 514 /** ditto */
 515
 516 void encode(inout wchar[] s, dchar c)
 517     in
 518     {
 519         assert(isValidDchar(c));
 520     }
 521     body
 522     {
 523         wchar[] r = s;
 524
 525         if (c <= 0xFFFF)
 526         {
 527             r ~= cast(wchar) c;
 528         }
 529         else
 530         {
 531             wchar[2] buf;
 532
 533             buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
 534             buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
 535             r ~= buf;
 536         }
 537         s = r;
 538     }
 539
 540 /** ditto */
 541 void encode(inout dchar[] s, dchar c)
 542     in
 543     {
 544         assert(isValidDchar(c));
 545     }
 546     body
 547     {
 548         s ~= c;
 549     }
 550
 551 /**
 552 Returns the code length of $(D c) in the encoding using $(D C) as a
 553 code point. The code is returned in character count, not in bytes.
 554  */
 555
 556 ubyte codeLength(C)(dchar c)
 557 {
 558
 559     static if (C.sizeof == 1)
 560     {
 561         return
 562             c <= 0x7F ? 1
 563             : c <= 0x7FF ? 2
 564             : c <= 0xFFFF ? 3
 565             : c <= 0x10FFFF ? 4
 566             : (assert(false), 6);
 567 }
 568
 569     else static if (C.sizeof == 2)
 570 {
 571         return c <= 0xFFFF ? 1 : 2;
 572     }
 573     else
 574     {
 575         static assert(C.sizeof == 4);
 576         return 1;
 577     }
 578 }
 579
 580 /* =================== Validation ======================= */
 581
 582 /***********************************
 583 Checks to see if string is well formed or not. $(D S) can be an array
 584  of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
 585  if it is not. Use to check all untrusted input for correctness.
 586  */
 587 void validate(S)(in S s)
 588 {
 589     invariant len = s.length;
 590     for (size_t i = 0; i < len; )
 591     {
 592         decode(s, i);
 593     }
 594 }
 595
 596 /* =================== Conversion to UTF8 ======================= */
 597
 598 char[] toUTF8(char[4] buf, dchar c)
 599     in
 600     {
 601         assert(isValidDchar(c));
 602     }
 603     body
 604     {
 605         if (c <= 0x7F)
 606         {
 607             buf[0] = cast(char) c;
 608             return buf[0 .. 1];
 609         }
 610         else if (c <= 0x7FF)
 611         {
 612             buf[0] = cast(char)(0xC0 | (c >> 6));
 613             buf[1] = cast(char)(0x80 | (c & 0x3F));
 614             return buf[0 .. 2];
 615         }
 616         else if (c <= 0xFFFF)
 617         {
 618             buf[0] = cast(char)(0xE0 | (c >> 12));
 619             buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
 620             buf[2] = cast(char)(0x80 | (c & 0x3F));
 621             return buf[0 .. 3];
 622         }
 623         else if (c <= 0x10FFFF)
 624         {
 625             buf[0] = cast(char)(0xF0 | (c >> 18));
 626             buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
 627             buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
 628             buf[3] = cast(char)(0x80 | (c & 0x3F));
 629             return buf[0 .. 4];
 630         }
 631         assert(0);
 632     }
 633
 634 /*******************
 635  * Encodes string s into UTF-8 and returns the encoded string.
 636  */
 637 string toUTF8(string s)
 638     in
 639     {
 640         validate(s);
 641     }
 642     body
 643     {
 644         return s;
 645     }
 646
 647 /** ditto */
 648 string toUTF8(in wchar[] s)
 649 {
 650     char[] r;
 651     size_t i;
 652     size_t slen = s.length;
 653
 654     r.length = slen;
 655
 656     for (i = 0; i < slen; i++)
 657     {   wchar c = s[i];
 658
 659         if (c <= 0x7F)
 660             r[i] = cast(char)c;         // fast path for ascii
 661         else
 662         {
 663             r.length = i;
 664             foreach (dchar c; s[i .. slen])
 665             {
 666                 encode(r, c);
 667             }
 668             break;
 669         }
 670     }
 671     return cast(string)r;
 672 }
 673
 674 /** ditto */
 675 string toUTF8(in dchar[] s)
 676 {
 677     char[] r;
 678     size_t i;
 679     size_t slen = s.length;
 680
 681     r.length = slen;
 682
 683     for (i = 0; i < slen; i++)
 684     {   dchar c = s[i];
 685
 686         if (c <= 0x7F)
 687             r[i] = cast(char)c;         // fast path for ascii
 688         else
 689         {
 690             r.length = i;
 691             foreach (dchar d; s[i .. slen])
 692             {
 693                 encode(r, d);
 694             }
 695             break;
 696         }
 697     }
 698     return cast(string)r;
 699 }
 700
 701 /* =================== Conversion to UTF16 ======================= */
 702
 703 wchar[] toUTF16(wchar[2] buf, dchar c)
 704     in
 705     {
 706         assert(isValidDchar(c));
 707     }
 708     body
 709     {
 710         if (c <= 0xFFFF)
 711         {
 712             buf[0] = cast(wchar) c;
 713             return buf[0 .. 1];
 714         }
 715         else
 716         {
 717             buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
 718             buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
 719             return buf[0 .. 2];
 720         }
 721     }
 722
 723 /****************
 724  * Encodes string s into UTF-16 and returns the encoded string.
 725  * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 726  * an LPWSTR or LPCWSTR argument.
 727  */
 728 wstring toUTF16(in char[] s)
 729 {
 730     wchar[] r;
 731     size_t slen = s.length;
 732
 733     r.length = slen;
 734     r.length = 0;
 735     for (size_t i = 0; i < slen; )
 736     {
 737         dchar c = s[i];
 738         if (c <= 0x7F)
 739         {
 740             i++;
 741             r ~= cast(wchar)c;
 742         }
 743         else
 744         {
 745             c = decode(s, i);
 746             encode(r, c);
 747         }
 748     }
 749     return cast(wstring)r;
 750 }
 751
 752 alias const(wchar)* wptr;
 753 /** ditto */
 754 wptr toUTF16z(in char[] s)
 755 {
 756     wchar[] r;
 757     size_t slen = s.length;
 758
 759     r.length = slen + 1;
 760     r.length = 0;
 761     for (size_t i = 0; i < slen; )
 762     {
 763         dchar c = s[i];
 764         if (c <= 0x7F)
 765         {
 766             i++;
 767             r ~= cast(wchar)c;
 768         }
 769         else
 770         {
 771             c = decode(s, i);
 772             encode(r, c);
 773         }
 774     }
 775     r ~= "\000";
 776     return r.ptr;
 777 }
 778
 779 /** ditto */
 780 wstring toUTF16(wstring s)
 781     in
 782     {
 783         validate(s);
 784     }
 785     body
 786     {
 787         return s;
 788     }
 789
 790 /** ditto */
 791 wstring toUTF16(in dchar[] s)
 792 {
 793     wchar[] r;
 794     size_t slen = s.length;
 795
 796     r.length = slen;
 797     r.length = 0;
 798     for (size_t i = 0; i < slen; i++)
 799     {
 800         encode(r, s[i]);
 801     }
 802     return cast(wstring)r;
 803 }
 804
 805 /* =================== Conversion to UTF32 ======================= */
 806
 807 /*****
 808  * Encodes string s into UTF-32 and returns the encoded string.
 809  */
 810 dstring toUTF32(in char[] s)
 811 {
 812     dchar[] r;
 813     size_t slen = s.length;
 814     size_t j = 0;
 815
 816     r.length = slen;            // r[] will never be longer than s[]
 817     for (size_t i = 0; i < slen; )
 818     {
 819         dchar c = s[i];
 820         if (c >= 0x80)
 821             c = decode(s, i);
 822         else
 823             i++;                // c is ascii, no need for decode
 824         r[j++] = c;
 825     }
 826     return cast(dstring)r[0 .. j];
 827 }
 828
 829 /** ditto */
 830 dstring toUTF32(in wchar[] s)
 831 {
 832     dchar[] r;
 833     size_t slen = s.length;
 834     size_t j = 0;
 835
 836     r.length = slen;            // r[] will never be longer than s[]
 837     for (size_t i = 0; i < slen; )
 838     {
 839         dchar c = s[i];
 840         if (c >= 0x80)
 841             c = decode(s, i);
 842         else
 843             i++;                // c is ascii, no need for decode
 844         r[j++] = c;
 845     }
 846     return cast(dstring)r[0 .. j];
 847 }
 848
 849 /** ditto */
 850 dstring toUTF32(dstring s)
 851     in
 852     {
 853         validate(s);
 854     }
 855     body
 856     {
 857         return s;
 858     }
 859
 860 /* ================================ tests ================================== */
 861
 862 unittest
 863 {
 864     debug(utf) printf("utf.toUTF.unittest\n");
 865
 866     auto c = "hello"c;
 867     auto w = toUTF16(c);
 868     assert(w == "hello");
 869     auto d = toUTF32(c);
 870     assert(d == "hello");
 871
 872     c = toUTF8(w);
 873     assert(c == "hello");
 874     d = toUTF32(w);
 875     assert(d == "hello");
 876
 877     c = toUTF8(d);
 878     assert(c == "hello");
 879     w = toUTF16(d);
 880     assert(w == "hello");
 881
 882
 883     c = "hel\u1234o";
 884     w = toUTF16(c);
 885     assert(w == "hel\u1234o");
 886     d = toUTF32(c);
 887     assert(d == "hel\u1234o");
 888
 889     c = toUTF8(w);
 890     assert(c == "hel\u1234o");
 891     d = toUTF32(w);
 892     assert(d == "hel\u1234o");
 893
 894     c = toUTF8(d);
 895     assert(c == "hel\u1234o");
 896     w = toUTF16(d);
 897     assert(w == "hel\u1234o");
 898
 899
 900     c = "he\U0010AAAAllo";
 901     w = toUTF16(c);
 902     //foreach (wchar c; w) printf("c = x%x\n", c);
 903     //foreach (wchar c; cast(wstring)"he\U0010AAAAllo") printf("c = x%x\n", c);
 904     assert(w == "he\U0010AAAAllo");
 905     d = toUTF32(c);
 906     assert(d == "he\U0010AAAAllo");
 907
 908     c = toUTF8(w);
 909     assert(c == "he\U0010AAAAllo");
 910     d = toUTF32(w);
 911     assert(d == "he\U0010AAAAllo");
 912
 913     c = toUTF8(d);
 914     assert(c == "he\U0010AAAAllo");
 915     w = toUTF16(d);
 916     assert(w == "he\U0010AAAAllo");
 917 }