X-Git-Url: https://git.llucax.com/software/druntime.git/blobdiff_plain/5a07183e26b7ca151959629420a83f59af5339f3..HEAD:/src/compiler/dmd/util/utf.d?ds=sidebyside
diff --git a/src/compiler/dmd/util/utf.d b/src/compiler/dmd/util/utf.d
index 84ace7c..552ab34 100644
--- a/src/compiler/dmd/util/utf.d
+++ b/src/compiler/dmd/util/utf.d
@@ -1,4 +1,4 @@
-// utf.d
+// Written in the D programming language
/*
* Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
@@ -22,16 +22,38 @@
* distribution.
*/
-// Description of UTF-8 at:
-// http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
-// http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
-
+/********************************************
+ * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
+ *
+ * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
+ * wchar type.
+ * For linux systems, the C wchar_t type is UTF-32 and corresponds to
+ * the D utf.dchar type.
+ *
+ * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
+ *
+ * See_Also:
+ * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)
+ * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)
+ * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
+ * Macros:
+ * WIKI = Phobos/StdUtf
+ */
module rt.util.utf;
-extern (C) void onUnicodeError( char[] msg, size_t idx );
+extern (C) void onUnicodeError( string msg, size_t idx );
+/*******************************
+ * Test if c is a valid UTF-32 character.
+ *
+ * \uFFFE and \uFFFF are considered valid by this function,
+ * as they are permitted for internal use by an application,
+ * but they are not allowed for interchange by the Unicode standard.
+ *
+ * Returns: true if it is, false if not.
+ */
bool isValidDchar(dchar c)
{
@@ -53,14 +75,10 @@ unittest
}
-/* This array gives the length of a UTF-8 sequence indexed by the value
- * of the leading byte. An FF represents an illegal starting value of
- * a UTF-8 sequence.
- * FF is used instead of 0 to avoid having loops hang.
- */
-ubyte[256] UTF8stride =
+auto UTF8stride =
[
+ cast(ubyte)
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -79,77 +97,89 @@ ubyte[256] UTF8stride =
4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
-uint stride(char[] s, size_t i)
+/**
+ * stride() returns the length of a UTF-8 sequence starting at index i
+ * in string s.
+ * Returns:
+ * The number of bytes in the UTF-8 sequence or
+ * 0xFF meaning s[i] is not the start of of UTF-8 sequence.
+ */
+uint stride(in char[] s, size_t i)
{
return UTF8stride[s[i]];
}
-uint stride(wchar[] s, size_t i)
+/**
+ * stride() returns the length of a UTF-16 sequence starting at index i
+ * in string s.
+ */
+uint stride(in wchar[] s, size_t i)
{ uint u = s[i];
return 1 + (u >= 0xD800 && u <= 0xDBFF);
}
-uint stride(dchar[] s, size_t i)
+/**
+ * stride() returns the length of a UTF-32 sequence starting at index i
+ * in string s.
+ * Returns: The return value will always be 1.
+ */
+uint stride(in dchar[] s, size_t i)
{
return 1;
}
/*******************************************
- * Given an index into an array of char's,
- * and assuming that index is at the start of a UTF character,
- * determine the number of UCS characters up to that index.
+ * Given an index i into an array of characters s[],
+ * and assuming that index i is at the start of a UTF character,
+ * determine the number of UCS characters up to that index i.
*/
-size_t toUCSindex(char[] s, size_t i)
+size_t toUCSindex(in char[] s, size_t i)
{
size_t n;
size_t j;
- size_t stride;
- for (j = 0; j < i; j += stride)
+ for (j = 0; j < i; )
{
- stride = UTF8stride[s[j]];
- if (stride == 0xFF)
- goto Lerr;
+ j += stride(s, j);
n++;
}
if (j > i)
{
- Lerr:
- onUnicodeError("invalid UTF-8 sequence", j);
+ onUnicodeError("invalid UTF-8 sequence", j);
}
return n;
}
-size_t toUCSindex(wchar[] s, size_t i)
+/** ditto */
+size_t toUCSindex(in wchar[] s, size_t i)
{
size_t n;
size_t j;
for (j = 0; j < i; )
- { uint u = s[j];
-
- j += 1 + (u >= 0xD800 && u <= 0xDBFF);
+ {
+ j += stride(s, j);
n++;
}
if (j > i)
{
- Lerr:
- onUnicodeError("invalid UTF-16 sequence", j);
+ onUnicodeError("invalid UTF-16 sequence", j);
}
return n;
}
-size_t toUCSindex(dchar[] s, size_t i)
+/** ditto */
+size_t toUCSindex(in dchar[] s, size_t i)
{
return i;
}
/******************************************
- * Given a UCS index into an array of characters, return the UTF index.
+ * Given a UCS index n into an array of characters s[], return the UTF index.
*/
-size_t toUTFindex(char[] s, size_t n)
+size_t toUTFindex(in char[] s, size_t n)
{
size_t i;
@@ -163,7 +193,8 @@ size_t toUTFindex(char[] s, size_t n)
return i;
}
-size_t toUTFindex(wchar[] s, size_t n)
+/** ditto */
+size_t toUTFindex(in wchar[] s, size_t n)
{
size_t i;
@@ -175,14 +206,20 @@ size_t toUTFindex(wchar[] s, size_t n)
return i;
}
-size_t toUTFindex(dchar[] s, size_t n)
+/** ditto */
+size_t toUTFindex(in dchar[] s, size_t n)
{
return n;
}
/* =================== Decode ======================= */
-dchar decode(char[] s, inout size_t idx)
+/***************
+ * Decodes and returns character starting at s[idx]. idx is advanced past the
+ * decoded character. If the character is not well formed, a UtfException is
+ * thrown and idx remains unchanged.
+ */
+dchar decode(in char[] s, inout size_t idx)
in
{
assert(idx >= 0 && idx < s.length);
@@ -275,7 +312,7 @@ unittest
debug(utf) printf("utf.decode.unittest\n");
- static char[] s1 = "abcd";
+ static s1 = "abcd"c;
i = 0;
c = decode(s1, i);
assert(c == cast(dchar)'a');
@@ -284,20 +321,20 @@ unittest
assert(c == cast(dchar)'b');
assert(i == 2);
- static char[] s2 = "\xC2\xA9";
+ static s2 = "\xC2\xA9"c;
i = 0;
c = decode(s2, i);
assert(c == cast(dchar)'\u00A9');
assert(i == 2);
- static char[] s3 = "\xE2\x89\xA0";
+ static s3 = "\xE2\x89\xA0"c;
i = 0;
c = decode(s3, i);
assert(c == cast(dchar)'\u2260');
assert(i == 3);
- static char[][] s4 =
- [ "\xE2\x89", // too short
+ static s4 =
+ [ "\xE2\x89"c[], // too short
"\xC0\x8A",
"\xE0\x80\x8A",
"\xF0\x80\x80\x8A",
@@ -321,9 +358,9 @@ unittest
}
}
-/********************************************************/
+/** ditto */
-dchar decode(wchar[] s, inout size_t idx)
+dchar decode(in wchar[] s, inout size_t idx)
in
{
assert(idx >= 0 && idx < s.length);
@@ -334,7 +371,7 @@ dchar decode(wchar[] s, inout size_t idx)
}
body
{
- char[] msg;
+ string msg;
dchar V;
size_t i = idx;
uint u = s[i];
@@ -379,9 +416,9 @@ dchar decode(wchar[] s, inout size_t idx)
return cast(dchar)u; // dummy return
}
-/********************************************************/
+/** ditto */
-dchar decode(dchar[] s, inout size_t idx)
+dchar decode(in dchar[] s, inout size_t idx)
in
{
assert(idx >= 0 && idx < s.length);
@@ -404,6 +441,9 @@ dchar decode(dchar[] s, inout size_t idx)
/* =================== Encode ======================= */
+/*******************************
+ * Encodes character c and appends it to array s[].
+ */
void encode(inout char[] s, dchar c)
in
{
@@ -456,7 +496,7 @@ unittest
{
debug(utf) printf("utf.encode.unittest\n");
- char[] s = "abcd";
+ char[] s = "abcd".dup;
encode(s, cast(dchar)'a');
assert(s.length == 5);
assert(s == "abcda");
@@ -471,7 +511,7 @@ unittest
assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}
-/********************************************************/
+/** ditto */
void encode(inout wchar[] s, dchar c)
in
@@ -497,6 +537,7 @@ void encode(inout wchar[] s, dchar c)
s = r;
}
+/** ditto */
void encode(inout dchar[] s, dchar c)
in
{
@@ -507,36 +548,46 @@ void encode(inout dchar[] s, dchar c)
s ~= c;
}
-/* =================== Validation ======================= */
+/**
+Returns the code length of $(D c) in the encoding using $(D C) as a
+code point. The code is returned in character count, not in bytes.
+ */
-void validate(char[] s)
+ubyte codeLength(C)(dchar c)
{
- size_t len = s.length;
- size_t i;
- for (i = 0; i < len; )
+ static if (C.sizeof == 1)
{
- decode(s, i);
- }
+ return
+ c <= 0x7F ? 1
+ : c <= 0x7FF ? 2
+ : c <= 0xFFFF ? 3
+ : c <= 0x10FFFF ? 4
+ : (assert(false), 6);
}
-void validate(wchar[] s)
+ else static if (C.sizeof == 2)
{
- size_t len = s.length;
- size_t i;
-
- for (i = 0; i < len; )
+ return c <= 0xFFFF ? 1 : 2;
+ }
+ else
{
- decode(s, i);
+ static assert(C.sizeof == 4);
+ return 1;
}
}
-void validate(dchar[] s)
-{
- size_t len = s.length;
- size_t i;
+/* =================== Validation ======================= */
- for (i = 0; i < len; )
+/***********************************
+Checks to see if string is well formed or not. $(D S) can be an array
+ of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
+ if it is not. Use to check all untrusted input for correctness.
+ */
+void validate(S)(in S s)
+{
+ auto len = s.length;
+ for (size_t i = 0; i < len; )
{
decode(s, i);
}
@@ -580,7 +631,10 @@ char[] toUTF8(char[4] buf, dchar c)
assert(0);
}
-char[] toUTF8(char[] s)
+/*******************
+ * Encodes string s into UTF-8 and returns the encoded string.
+ */
+string toUTF8(string s)
in
{
validate(s);
@@ -590,7 +644,8 @@ char[] toUTF8(char[] s)
return s;
}
-char[] toUTF8(wchar[] s)
+/** ditto */
+string toUTF8(in wchar[] s)
{
char[] r;
size_t i;
@@ -613,10 +668,11 @@ char[] toUTF8(wchar[] s)
break;
}
}
- return r;
+ return cast(string)r;
}
-char[] toUTF8(dchar[] s)
+/** ditto */
+string toUTF8(in dchar[] s)
{
char[] r;
size_t i;
@@ -639,7 +695,7 @@ char[] toUTF8(dchar[] s)
break;
}
}
- return r;
+ return cast(string)r;
}
/* =================== Conversion to UTF16 ======================= */
@@ -664,7 +720,12 @@ wchar[] toUTF16(wchar[2] buf, dchar c)
}
}
-wchar[] toUTF16(char[] s)
+/****************
+ * Encodes string s into UTF-16 and returns the encoded string.
+ * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
+ * an LPWSTR or LPCWSTR argument.
+ */
+wstring toUTF16(in char[] s)
{
wchar[] r;
size_t slen = s.length;
@@ -685,10 +746,12 @@ wchar[] toUTF16(char[] s)
encode(r, c);
}
}
- return r;
+ return cast(wstring)r;
}
-wchar* toUTF16z(char[] s)
+alias const(wchar)* wptr;
+/** ditto */
+wptr toUTF16z(in char[] s)
{
wchar[] r;
size_t slen = s.length;
@@ -713,7 +776,8 @@ wchar* toUTF16z(char[] s)
return r.ptr;
}
-wchar[] toUTF16(wchar[] s)
+/** ditto */
+wstring toUTF16(wstring s)
in
{
validate(s);
@@ -723,7 +787,8 @@ wchar[] toUTF16(wchar[] s)
return s;
}
-wchar[] toUTF16(dchar[] s)
+/** ditto */
+wstring toUTF16(in dchar[] s)
{
wchar[] r;
size_t slen = s.length;
@@ -734,12 +799,15 @@ wchar[] toUTF16(dchar[] s)
{
encode(r, s[i]);
}
- return r;
+ return cast(wstring)r;
}
/* =================== Conversion to UTF32 ======================= */
-dchar[] toUTF32(char[] s)
+/*****
+ * Encodes string s into UTF-32 and returns the encoded string.
+ */
+dstring toUTF32(in char[] s)
{
dchar[] r;
size_t slen = s.length;
@@ -755,10 +823,11 @@ dchar[] toUTF32(char[] s)
i++; // c is ascii, no need for decode
r[j++] = c;
}
- return r[0 .. j];
+ return cast(dstring)r[0 .. j];
}
-dchar[] toUTF32(wchar[] s)
+/** ditto */
+dstring toUTF32(in wchar[] s)
{
dchar[] r;
size_t slen = s.length;
@@ -774,10 +843,11 @@ dchar[] toUTF32(wchar[] s)
i++; // c is ascii, no need for decode
r[j++] = c;
}
- return r[0 .. j];
+ return cast(dstring)r[0 .. j];
}
-dchar[] toUTF32(dchar[] s)
+/** ditto */
+dstring toUTF32(dstring s)
in
{
validate(s);
@@ -793,14 +863,10 @@ unittest
{
debug(utf) printf("utf.toUTF.unittest\n");
- char[] c;
- wchar[] w;
- dchar[] d;
-
- c = "hello";
- w = toUTF16(c);
+ auto c = "hello"c[];
+ auto w = toUTF16(c);
assert(w == "hello");
- d = toUTF32(c);
+ auto d = toUTF32(c);
assert(d == "hello");
c = toUTF8(w);
@@ -834,7 +900,7 @@ unittest
c = "he\U0010AAAAllo";
w = toUTF16(c);
//foreach (wchar c; w) printf("c = x%x\n", c);
- //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c);
+ //foreach (wchar c; cast(wstring)"he\U0010AAAAllo") printf("c = x%x\n", c);
assert(w == "he\U0010AAAAllo");
d = toUTF32(c);
assert(d == "he\U0010AAAAllo");