X-Git-Url: https://git.llucax.com/software/druntime.git/blobdiff_plain/0b5d833e0947d44cd5bee6562c22ed4a14ca23bb..1f3a9af059166187064490d1cae80bd11fb8737c:/src/compiler/dmd/util/utf.d diff --git a/src/compiler/dmd/util/utf.d b/src/compiler/dmd/util/utf.d index 84ace7c..50d4a51 100644 --- a/src/compiler/dmd/util/utf.d +++ b/src/compiler/dmd/util/utf.d @@ -1,4 +1,4 @@ -// utf.d +// Written in the D programming language /* * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com @@ -22,16 +22,38 @@ * distribution. */ -// Description of UTF-8 at: -// http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 -// http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335 - +/******************************************** + * Encode and decode UTF-8, UTF-16 and UTF-32 strings. + * + * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D + * wchar type. + * For linux systems, the C wchar_t type is UTF-32 and corresponds to + * the D utf.dchar type. + * + * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). + * + * See_Also: + * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)
+ * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)
+ * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) + * Macros: + * WIKI = Phobos/StdUtf + */ module rt.util.utf; -extern (C) void onUnicodeError( char[] msg, size_t idx ); +extern (C) void onUnicodeError( string msg, size_t idx ); +/******************************* + * Test if c is a valid UTF-32 character. + * + * \uFFFE and \uFFFF are considered valid by this function, + * as they are permitted for internal use by an application, + * but they are not allowed for interchange by the Unicode standard. + * + * Returns: true if it is, false if not. + */ bool isValidDchar(dchar c) { @@ -53,14 +75,10 @@ unittest } -/* This array gives the length of a UTF-8 sequence indexed by the value - * of the leading byte. An FF represents an illegal starting value of - * a UTF-8 sequence. - * FF is used instead of 0 to avoid having loops hang. - */ -ubyte[256] UTF8stride = +auto UTF8stride = [ + cast(ubyte) 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -79,77 +97,89 @@ ubyte[256] UTF8stride = 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF, ]; -uint stride(char[] s, size_t i) +/** + * stride() returns the length of a UTF-8 sequence starting at index i + * in string s. + * Returns: + * The number of bytes in the UTF-8 sequence or + * 0xFF meaning s[i] is not the start of a UTF-8 sequence. + */ +uint stride(in char[] s, size_t i) { return UTF8stride[s[i]]; } -uint stride(wchar[] s, size_t i) +/** + * stride() returns the length of a UTF-16 sequence starting at index i + * in string s. + */ +uint stride(in wchar[] s, size_t i) { uint u = s[i]; return 1 + (u >= 0xD800 && u <= 0xDBFF); } -uint stride(dchar[] s, size_t i) +/** + * stride() returns the length of a UTF-32 sequence starting at index i + * in string s. + * Returns: The return value will always be 1. 
+ */ +uint stride(in dchar[] s, size_t i) { return 1; } /******************************************* - * Given an index into an array of char's, - * and assuming that index is at the start of a UTF character, - * determine the number of UCS characters up to that index. + * Given an index i into an array of characters s[], + * and assuming that index i is at the start of a UTF character, + * determine the number of UCS characters up to that index i. */ -size_t toUCSindex(char[] s, size_t i) +size_t toUCSindex(in char[] s, size_t i) { size_t n; size_t j; - size_t stride; - for (j = 0; j < i; j += stride) + for (j = 0; j < i; ) { - stride = UTF8stride[s[j]]; - if (stride == 0xFF) - goto Lerr; + j += stride(s, j); n++; } if (j > i) { - Lerr: - onUnicodeError("invalid UTF-8 sequence", j); + onUnicodeError("invalid UTF-8 sequence", j); } return n; } -size_t toUCSindex(wchar[] s, size_t i) +/** ditto */ +size_t toUCSindex(in wchar[] s, size_t i) { size_t n; size_t j; for (j = 0; j < i; ) - { uint u = s[j]; - - j += 1 + (u >= 0xD800 && u <= 0xDBFF); + { + j += stride(s, j); n++; } if (j > i) { - Lerr: - onUnicodeError("invalid UTF-16 sequence", j); + onUnicodeError("invalid UTF-16 sequence", j); } return n; } -size_t toUCSindex(dchar[] s, size_t i) +/** ditto */ +size_t toUCSindex(in dchar[] s, size_t i) { return i; } /****************************************** - * Given a UCS index into an array of characters, return the UTF index. + * Given a UCS index n into an array of characters s[], return the UTF index. 
*/ -size_t toUTFindex(char[] s, size_t n) +size_t toUTFindex(in char[] s, size_t n) { size_t i; @@ -163,7 +193,8 @@ size_t toUTFindex(char[] s, size_t n) return i; } -size_t toUTFindex(wchar[] s, size_t n) +/** ditto */ +size_t toUTFindex(in wchar[] s, size_t n) { size_t i; @@ -175,14 +206,20 @@ size_t toUTFindex(wchar[] s, size_t n) return i; } -size_t toUTFindex(dchar[] s, size_t n) +/** ditto */ +size_t toUTFindex(in dchar[] s, size_t n) { return n; } /* =================== Decode ======================= */ -dchar decode(char[] s, inout size_t idx) +/*************** + * Decodes and returns character starting at s[idx]. idx is advanced past the + * decoded character. If the character is not well formed, a UtfException is + * thrown and idx remains unchanged. + */ +dchar decode(in char[] s, inout size_t idx) in { assert(idx >= 0 && idx < s.length); @@ -275,7 +312,7 @@ unittest debug(utf) printf("utf.decode.unittest\n"); - static char[] s1 = "abcd"; + static s1 = "abcd"c; i = 0; c = decode(s1, i); assert(c == cast(dchar)'a'); @@ -284,20 +321,20 @@ unittest assert(c == cast(dchar)'b'); assert(i == 2); - static char[] s2 = "\xC2\xA9"; + static s2 = "\xC2\xA9"c; i = 0; c = decode(s2, i); assert(c == cast(dchar)'\u00A9'); assert(i == 2); - static char[] s3 = "\xE2\x89\xA0"; + static s3 = "\xE2\x89\xA0"c; i = 0; c = decode(s3, i); assert(c == cast(dchar)'\u2260'); assert(i == 3); - static char[][] s4 = - [ "\xE2\x89", // too short + static s4 = + [ "\xE2\x89"c, // too short "\xC0\x8A", "\xE0\x80\x8A", "\xF0\x80\x80\x8A", @@ -321,9 +358,9 @@ unittest } } -/********************************************************/ +/** ditto */ -dchar decode(wchar[] s, inout size_t idx) +dchar decode(in wchar[] s, inout size_t idx) in { assert(idx >= 0 && idx < s.length); @@ -334,7 +371,7 @@ dchar decode(wchar[] s, inout size_t idx) } body { - char[] msg; + string msg; dchar V; size_t i = idx; uint u = s[i]; @@ -379,9 +416,9 @@ dchar decode(wchar[] s, inout size_t idx) return 
cast(dchar)u; // dummy return } -/********************************************************/ +/** ditto */ -dchar decode(dchar[] s, inout size_t idx) +dchar decode(in dchar[] s, inout size_t idx) in { assert(idx >= 0 && idx < s.length); @@ -404,6 +441,9 @@ dchar decode(dchar[] s, inout size_t idx) /* =================== Encode ======================= */ +/******************************* + * Encodes character c and appends it to array s[]. + */ void encode(inout char[] s, dchar c) in { @@ -456,7 +496,7 @@ unittest { debug(utf) printf("utf.encode.unittest\n"); - char[] s = "abcd"; + char[] s = "abcd".dup; encode(s, cast(dchar)'a'); assert(s.length == 5); assert(s == "abcda"); @@ -471,7 +511,7 @@ unittest assert(s == "abcda\xC2\xA9\xE2\x89\xA0"); } -/********************************************************/ +/** ditto */ void encode(inout wchar[] s, dchar c) in @@ -497,6 +537,7 @@ void encode(inout wchar[] s, dchar c) s = r; } +/** ditto */ void encode(inout dchar[] s, dchar c) in { @@ -507,36 +548,46 @@ void encode(inout dchar[] s, dchar c) s ~= c; } -/* =================== Validation ======================= */ +/** +Returns the code length of $(D c) in the encoding using $(D C) as a +code point. The code is returned in character count, not in bytes. + */ -void validate(char[] s) +ubyte codeLength(C)(dchar c) { - size_t len = s.length; - size_t i; - for (i = 0; i < len; ) + static if (C.sizeof == 1) { - decode(s, i); - } + return + c <= 0x7F ? 1 + : c <= 0x7FF ? 2 + : c <= 0xFFFF ? 3 + : c <= 0x10FFFF ? 4 + : (assert(false), 6); } -void validate(wchar[] s) + else static if (C.sizeof == 2) { - size_t len = s.length; - size_t i; - - for (i = 0; i < len; ) + return c <= 0xFFFF ? 
1 : 2; + } + else { - decode(s, i); + static assert(C.sizeof == 4); + return 1; } } -void validate(dchar[] s) -{ - size_t len = s.length; - size_t i; +/* =================== Validation ======================= */ - for (i = 0; i < len; ) +/*********************************** +Checks to see if string is well formed or not. $(D S) can be an array + of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException) + if it is not. Use to check all untrusted input for correctness. + */ +void validate(S)(in S s) +{ + invariant len = s.length; + for (size_t i = 0; i < len; ) { decode(s, i); } @@ -580,7 +631,10 @@ char[] toUTF8(char[4] buf, dchar c) assert(0); } -char[] toUTF8(char[] s) +/******************* + * Encodes string s into UTF-8 and returns the encoded string. + */ +string toUTF8(string s) in { validate(s); @@ -590,7 +644,8 @@ char[] toUTF8(char[] s) return s; } -char[] toUTF8(wchar[] s) +/** ditto */ +string toUTF8(in wchar[] s) { char[] r; size_t i; @@ -613,10 +668,11 @@ char[] toUTF8(wchar[] s) break; } } - return r; + return cast(string)r; } -char[] toUTF8(dchar[] s) +/** ditto */ +string toUTF8(in dchar[] s) { char[] r; size_t i; @@ -639,7 +695,7 @@ char[] toUTF8(dchar[] s) break; } } - return r; + return cast(string)r; } /* =================== Conversion to UTF16 ======================= */ @@ -664,7 +720,12 @@ wchar[] toUTF16(wchar[2] buf, dchar c) } } -wchar[] toUTF16(char[] s) +/**************** + * Encodes string s into UTF-16 and returns the encoded string. + * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take + * an LPWSTR or LPCWSTR argument. 
+ */ +wstring toUTF16(in char[] s) { wchar[] r; size_t slen = s.length; @@ -685,10 +746,12 @@ wchar[] toUTF16(char[] s) encode(r, c); } } - return r; + return cast(wstring)r; } -wchar* toUTF16z(char[] s) +alias const(wchar)* wptr; +/** ditto */ +wptr toUTF16z(in char[] s) { wchar[] r; size_t slen = s.length; @@ -713,7 +776,8 @@ wchar* toUTF16z(char[] s) return r.ptr; } -wchar[] toUTF16(wchar[] s) +/** ditto */ +wstring toUTF16(wstring s) in { validate(s); @@ -723,7 +787,8 @@ wchar[] toUTF16(wchar[] s) return s; } -wchar[] toUTF16(dchar[] s) +/** ditto */ +wstring toUTF16(in dchar[] s) { wchar[] r; size_t slen = s.length; @@ -734,12 +799,15 @@ wchar[] toUTF16(dchar[] s) { encode(r, s[i]); } - return r; + return cast(wstring)r; } /* =================== Conversion to UTF32 ======================= */ -dchar[] toUTF32(char[] s) +/***** + * Encodes string s into UTF-32 and returns the encoded string. + */ +dstring toUTF32(in char[] s) { dchar[] r; size_t slen = s.length; @@ -755,10 +823,11 @@ dchar[] toUTF32(char[] s) i++; // c is ascii, no need for decode r[j++] = c; } - return r[0 .. j]; + return cast(dstring)r[0 .. j]; } -dchar[] toUTF32(wchar[] s) +/** ditto */ +dstring toUTF32(in wchar[] s) { dchar[] r; size_t slen = s.length; @@ -774,10 +843,11 @@ dchar[] toUTF32(wchar[] s) i++; // c is ascii, no need for decode r[j++] = c; } - return r[0 .. j]; + return cast(dstring)r[0 .. 
j]; } -dchar[] toUTF32(dchar[] s) +/** ditto */ +dstring toUTF32(dstring s) in { validate(s); @@ -793,14 +863,10 @@ unittest { debug(utf) printf("utf.toUTF.unittest\n"); - char[] c; - wchar[] w; - dchar[] d; - - c = "hello"; - w = toUTF16(c); + auto c = "hello"c; + auto w = toUTF16(c); assert(w == "hello"); - d = toUTF32(c); + auto d = toUTF32(c); assert(d == "hello"); c = toUTF8(w); @@ -834,7 +900,7 @@ unittest c = "he\U0010AAAAllo"; w = toUTF16(c); //foreach (wchar c; w) printf("c = x%x\n", c); - //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c); + //foreach (wchar c; cast(wstring)"he\U0010AAAAllo") printf("c = x%x\n", c); assert(w == "he\U0010AAAAllo"); d = toUTF32(c); assert(d == "he\U0010AAAAllo");