1 // Written in the D programming language
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
5 * Written by Walter Bright
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
15 * o The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * o Altered source versions must be plainly marked as such, and must not
20 * be misrepresented as being the original software.
21 * o This notice may not be removed or altered from any source
25 /********************************************
26 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
28 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
30 * For linux systems, the C wchar_t type is UTF-32 and corresponds to
31 * the D utf.dchar type.
33 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
36 * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
37 * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
38 * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
40 * WIKI = Phobos/StdUtf
46 extern (C) void onUnicodeError( string msg, size_t idx );
48 /*******************************
49 * Test if c is a valid UTF-32 character.
51 * \uFFFE and \uFFFF are considered valid by this function,
52 * as they are permitted for internal use by an application,
53 * but they are not allowed for interchange by the Unicode standard.
55 * Returns: true if it is, false if not.
58 bool isValidDchar(dchar c)
60 /* Note: FFFE and FFFF are specifically permitted by the
61 * Unicode standard for application internal use, but are not
62 * allowed for interchange.
63 * (thanks to Arcane Jill)
67 (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
72 debug(utf) printf("utf.isValidDchar.unittest\n");
73 assert(isValidDchar(cast(dchar)'a') == true);
74 assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
82 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
83 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
84 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
85 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
86 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
87 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
88 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
89 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
90 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
91 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
92 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
93 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
94 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
95 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
96 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
97 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
101 * stride() returns the length of a UTF-8 sequence starting at index i
104 * The number of bytes in the UTF-8 sequence or
105 * 0xFF meaning s[i] is not the start of a UTF-8 sequence.
107 uint stride(in char[] s, size_t i)
109 return UTF8stride[s[i]];
113 * stride() returns the length of a UTF-16 sequence starting at index i
116 uint stride(in wchar[] s, size_t i)
118 return 1 + (u >= 0xD800 && u <= 0xDBFF);
122 * stride() returns the length of a UTF-32 sequence starting at index i
124 * Returns: The return value will always be 1.
126 uint stride(in dchar[] s, size_t i)
131 /*******************************************
132 * Given an index i into an array of characters s[],
133 * and assuming that index i is at the start of a UTF character,
134 * determine the number of UCS characters up to that index i.
137 size_t toUCSindex(in char[] s, size_t i)
149 onUnicodeError("invalid UTF-8 sequence", j);
155 size_t toUCSindex(in wchar[] s, size_t i)
167 onUnicodeError("invalid UTF-16 sequence", j);
173 size_t toUCSindex(in dchar[] s, size_t i)
178 /******************************************
179 * Given a UCS index n into an array of characters s[], return the UTF index.
182 size_t toUTFindex(in char[] s, size_t n)
188 uint j = UTF8stride[s[i]];
190 onUnicodeError("invalid UTF-8 sequence", i);
197 size_t toUTFindex(in wchar[] s, size_t n)
204 i += 1 + (u >= 0xD800 && u <= 0xDBFF);
210 size_t toUTFindex(in dchar[] s, size_t n)
215 /* =================== Decode ======================= */
218 * Decodes and returns character starting at s[idx]. idx is advanced past the
219 * decoded character. If the character is not well formed, a UtfException is
220 * thrown and idx remains unchanged.
222 dchar decode(in char[] s, inout size_t idx)
225 assert(idx >= 0 && idx < s.length);
229 assert(isValidDchar(result));
233 size_t len = s.length;
242 /* The following encodings are valid, except for the 5 and 6 byte
246 * 1110xxxx 10xxxxxx 10xxxxxx
247 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
248 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
249 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
254 goto Lerr; // only do the first 4 of 6 encodings
255 if (((u << n) & 0x80) == 0)
263 // Pick off (7 - n) significant bits of B from first byte of octet
264 V = cast(dchar)(u & ((1 << (7 - n)) - 1));
266 if (i + (n - 1) >= len)
267 goto Lerr; // off end of string
269 /* The following combinations are overlong, and illegal:
270 * 1100000x (10xxxxxx)
271 * 11100000 100xxxxx (10xxxxxx)
272 * 11110000 1000xxxx (10xxxxxx 10xxxxxx)
273 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
274 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
277 if ((u & 0xFE) == 0xC0 ||
278 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
279 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
280 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
281 (u == 0xFC && (u2 & 0xFC) == 0x80))
282 goto Lerr; // overlong combination
284 for (uint j = 1; j != n; j++)
287 if ((u & 0xC0) != 0x80)
288 goto Lerr; // trailing bytes are 10xxxxxx
289 V = (V << 6) | (u & 0x3F);
291 if (!isValidDchar(V))
305 onUnicodeError("invalid UTF-8 sequence", i);
306 return V; // dummy return
313 debug(utf) printf("utf.decode.unittest\n");
318 assert(c == cast(dchar)'a');
321 assert(c == cast(dchar)'b');
324 static s2 = "\xC2\xA9"c;
327 assert(c == cast(dchar)'\u00A9');
330 static s3 = "\xE2\x89\xA0"c;
333 assert(c == cast(dchar)'\u2260');
337 [ "\xE2\x89"c[], // too short
341 "\xF8\x80\x80\x80\x8A",
342 "\xFC\x80\x80\x80\x80\x8A",
345 for (int j = 0; j < s4.length; j++)
350 c = decode(s4[j], i);
363 dchar decode(in wchar[] s, inout size_t idx)
366 assert(idx >= 0 && idx < s.length);
370 assert(isValidDchar(result));
380 { if (u >= 0xD800 && u <= 0xDBFF)
383 if (i + 1 == s.length)
384 { msg = "surrogate UTF-16 high value past end of string";
388 if (u2 < 0xDC00 || u2 > 0xDFFF)
389 { msg = "surrogate UTF-16 low value out of range";
392 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
395 else if (u >= 0xDC00 && u <= 0xDFFF)
396 { msg = "unpaired surrogate UTF-16 value";
399 else if (u == 0xFFFE || u == 0xFFFF)
400 { msg = "illegal UTF-16 value";
415 onUnicodeError(msg, i);
416 return cast(dchar)u; // dummy return
421 dchar decode(in dchar[] s, inout size_t idx)
424 assert(idx >= 0 && idx < s.length);
431 if (!isValidDchar(c))
437 onUnicodeError("invalid UTF-32 value", i);
438 return c; // dummy return
442 /* =================== Encode ======================= */
444 /*******************************
445 * Encodes character c and appends it to array s[].
447 void encode(inout char[] s, dchar c)
450 assert(isValidDchar(c));
467 buf[0] = cast(char)(0xC0 | (c >> 6));
468 buf[1] = cast(char)(0x80 | (c & 0x3F));
471 else if (c <= 0xFFFF)
473 buf[0] = cast(char)(0xE0 | (c >> 12));
474 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
475 buf[2] = cast(char)(0x80 | (c & 0x3F));
478 else if (c <= 0x10FFFF)
480 buf[0] = cast(char)(0xF0 | (c >> 18));
481 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
482 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
483 buf[3] = cast(char)(0x80 | (c & 0x3F));
497 debug(utf) printf("utf.encode.unittest\n");
499 char[] s = "abcd".dup;
500 encode(s, cast(dchar)'a');
501 assert(s.length == 5);
502 assert(s == "abcda");
504 encode(s, cast(dchar)'\u00A9');
505 assert(s.length == 7);
506 assert(s == "abcda\xC2\xA9");
507 //assert(s == "abcda\u00A9"); // BUG: fix compiler
509 encode(s, cast(dchar)'\u2260');
510 assert(s.length == 10);
511 assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
516 void encode(inout wchar[] s, dchar c)
519 assert(isValidDchar(c));
533 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
534 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
541 void encode(inout dchar[] s, dchar c)
544 assert(isValidDchar(c));
552 Returns the number of code units needed to encode $(D c) when $(D C) is
553 the code unit type. The length is counted in code units, not in bytes.
556 ubyte codeLength(C)(dchar c)
559 static if (C.sizeof == 1)
566 : (assert(false), 6);
569 else static if (C.sizeof == 2)
571 return c <= 0xFFFF ? 1 : 2;
575 static assert(C.sizeof == 4);
580 /* =================== Validation ======================= */
582 /***********************************
583 Checks to see if string is well formed or not. $(D S) can be an array
584 of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
585 if it is not. Use to check all untrusted input for correctness.
587 void validate(S)(in S s)
590 for (size_t i = 0; i < len; )
596 /* =================== Conversion to UTF8 ======================= */
598 char[] toUTF8(char[4] buf, dchar c)
601 assert(isValidDchar(c));
607 buf[0] = cast(char) c;
612 buf[0] = cast(char)(0xC0 | (c >> 6));
613 buf[1] = cast(char)(0x80 | (c & 0x3F));
616 else if (c <= 0xFFFF)
618 buf[0] = cast(char)(0xE0 | (c >> 12));
619 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
620 buf[2] = cast(char)(0x80 | (c & 0x3F));
623 else if (c <= 0x10FFFF)
625 buf[0] = cast(char)(0xF0 | (c >> 18));
626 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
627 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
628 buf[3] = cast(char)(0x80 | (c & 0x3F));
635 * Encodes string s into UTF-8 and returns the encoded string.
637 string toUTF8(string s)
648 string toUTF8(in wchar[] s)
652 size_t slen = s.length;
656 for (i = 0; i < slen; i++)
660 r[i] = cast(char)c; // fast path for ascii
664 foreach (dchar c; s[i .. slen])
671 return cast(string)r;
675 string toUTF8(in dchar[] s)
679 size_t slen = s.length;
683 for (i = 0; i < slen; i++)
687 r[i] = cast(char)c; // fast path for ascii
691 foreach (dchar d; s[i .. slen])
698 return cast(string)r;
701 /* =================== Conversion to UTF16 ======================= */
703 wchar[] toUTF16(wchar[2] buf, dchar c)
706 assert(isValidDchar(c));
712 buf[0] = cast(wchar) c;
717 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
718 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
724 * Encodes string s into UTF-16 and returns the encoded string.
725 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
726 * an LPWSTR or LPCWSTR argument.
728 wstring toUTF16(in char[] s)
731 size_t slen = s.length;
735 for (size_t i = 0; i < slen; )
749 return cast(wstring)r;
752 alias const(wchar)* wptr;
754 wptr toUTF16z(in char[] s)
757 size_t slen = s.length;
761 for (size_t i = 0; i < slen; )
780 wstring toUTF16(wstring s)
791 wstring toUTF16(in dchar[] s)
794 size_t slen = s.length;
798 for (size_t i = 0; i < slen; i++)
802 return cast(wstring)r;
805 /* =================== Conversion to UTF32 ======================= */
808 * Encodes string s into UTF-32 and returns the encoded string.
810 dstring toUTF32(in char[] s)
813 size_t slen = s.length;
816 r.length = slen; // r[] will never be longer than s[]
817 for (size_t i = 0; i < slen; )
823 i++; // c is ascii, no need for decode
826 return cast(dstring)r[0 .. j];
830 dstring toUTF32(in wchar[] s)
833 size_t slen = s.length;
836 r.length = slen; // r[] will never be longer than s[]
837 for (size_t i = 0; i < slen; )
843 i++; // c is ascii, no need for decode
846 return cast(dstring)r[0 .. j];
850 dstring toUTF32(dstring s)
860 /* ================================ tests ================================== */
864 debug(utf) printf("utf.toUTF.unittest\n");
868 assert(w == "hello");
870 assert(d == "hello");
873 assert(c == "hello");
875 assert(d == "hello");
878 assert(c == "hello");
880 assert(w == "hello");
885 assert(w == "hel\u1234o");
887 assert(d == "hel\u1234o");
890 assert(c == "hel\u1234o");
892 assert(d == "hel\u1234o");
895 assert(c == "hel\u1234o");
897 assert(w == "hel\u1234o");
900 c = "he\U0010AAAAllo";
902 //foreach (wchar c; w) printf("c = x%x\n", c);
903 //foreach (wchar c; cast(wstring)"he\U0010AAAAllo") printf("c = x%x\n", c);
904 assert(w == "he\U0010AAAAllo");
906 assert(d == "he\U0010AAAAllo");
909 assert(c == "he\U0010AAAAllo");
911 assert(d == "he\U0010AAAAllo");
914 assert(c == "he\U0010AAAAllo");
916 assert(w == "he\U0010AAAAllo");