1 module mir.utf;
2 
/++
Decodes a single UTF-8 code point from the front of `str` and advances `str`
past the code units that were read.

Params:
    assumeNotEmpty = when `true`, the run-time emptiness check is replaced by an `assert`
    assumeFrontNotAscii = when `true`, the single-byte (`0xxxxxxx`) fast path is skipped
    str = UTF-8 text to decode from; code units that have been read are removed
        from its front, also when decoding fails part-way
    value = receives the decoded code point; its content is not meaningful
        when the function returns `false`

Returns:
    `true` when a code point was decoded. `false` when `str` is empty, when the
    first byte is not a valid lead byte (a continuation byte or `0xF8 .. 0xFF`),
    when fewer continuation bytes remain than the lead byte announces, or when
    a continuation byte does not match `10xxxxxx`.

Note:
    Overlong encodings, surrogate code points and values above U+10FFFF are
    not rejected by this function.
+/
bool decodeFrontImpl(bool assumeNotEmpty = false, bool assumeFrontNotAscii = false)(scope ref inout(char)[] str, out dchar value) @safe pure nothrow @nogc
{
    /* The following encodings are valid:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    import mir.bitop: ctlz;

    static if (!assumeNotEmpty)
    {
        if (str.length == 0)
            return false;
    }
    else
    {
        assert(str.length);
    }

    uint f = str[0];
    value = f;
    str = str[1 .. $];

    static if (!assumeFrontNotAscii)
    {
        if (f < 0x80)
            return true;
    }

    // Shifting by 25 moves bit 6 of the lead byte into bit 31, so the number
    // of leading zeros of the complement equals the number of 1 bits that
    // follow the top bit, i.e. the number of continuation bytes announced.
    uint len = ctlz(~(f << 25));

    // len == 0          : a continuation byte in lead position
    // len > 3           : lead bytes 0xF8 .. 0xFF, never valid in UTF-8
    // len > str.length  : the sequence is truncated
    if (len == 0 || len > 3 || len > str.length) // invalid UTF-8
        return false;

    // Keep only the payload bits of the lead byte (5, 4 or 3 of them).
    value &= (1 << (6 - len)) - 1;

    do
    {
        auto c = str[0];
        str = str[1 .. $];
        value <<= 6;
        if ((c & 0xC0) != 0x80)
            return false;
        value |= c & 0x3F;
    }
    while(--len);
    return true;
}
54 
version (D_Exceptions):

/// Preallocated exception instance thrown by `decodeFront` so that it can stay `@nogc`.
package static immutable Exception utfException = new Exception("Invalid UTF-8 sequence");
58 
/++
Decodes one UTF-8 code point from the front of `str` and advances `str`
past it.

Params:
    str = UTF-8 text; passed on to `decodeFrontImpl`, which consumes the
        code units it reads
Returns:
    the decoded code point
Throws:
    the shared `utfException` instance when `decodeFrontImpl` reports failure
+/
dchar decodeFront(scope ref inout(char)[] str) @safe pure @nogc @property
{
    import mir.exception: toMutable;

    dchar decoded;
    // Guard clause: anything the implementation rejects becomes an exception.
    if (!decodeFrontImpl(str, decoded))
        throw utfException.toMutable;
    return decoded;
}
70 
///
@safe pure unittest
{
    string str = "Hello, World!";

    // ASCII: one code unit is consumed
    assert(str.decodeFront == 'H' && str == "ello, World!");

    // two-byte code point that is the whole input
    str = "å";
    assert(str.decodeFront == 'å' && str.length == 0);

    // two-byte code point followed by more text
    str = "åb";
    assert(str.decodeFront == 'å' && str == "b");

    // a continuation byte in lead position is invalid UTF-8
    const(char)[] bad = [cast(char) 0x80];
    bool thrown;
    try
    {
        auto ignored = bad.decodeFront;
    }
    catch (Exception e)
    {
        thrown = true;
    }
    assert(thrown);
}