module mir.utf;

/++
Decodes one UTF-8 encoded code point from the front of `str` and removes the
code units that were read from it.

Only the structure of the sequence is validated: the shape of the lead byte,
the number of code units available, and the `10xxxxxx` pattern of every
continuation byte. Overlong encodings and surrogate code points are not
rejected.

Params:
    assumeNotEmpty = when `true`, the emptiness check is replaced by an `assert`
    assumeFrontNotAscii = when `true`, the single code unit (ASCII) fast path is
        skipped; the caller must guarantee that the first code unit is `>= 0x80`
    str = input slice; it is advanced past every code unit that was read, also
        when decoding fails part-way through a sequence
    value = receives the decoded code point; its content is meaningful only
        when `true` is returned

Returns:
    `true` if a code point was decoded, `false` if `str` is empty or its front
    is not a structurally valid UTF-8 sequence.
+/
bool decodeFrontImpl(bool assumeNotEmpty = false, bool assumeFrontNotAscii = false)(scope ref inout(char)[] str, out dchar value) @safe pure nothrow @nogc
{
    /* The following encodings are valid:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    import mir.bitop: ctlz;

    static if (!assumeNotEmpty)
    {
        if (str.length == 0)
            return false;
    }
    else
    {
        assert(str.length);
    }

    uint f = str[0];
    value = f;
    str = str[1 .. $];

    static if (!assumeFrontNotAscii)
    {
        if (f < 0x80)
            return true;
    }

    // Bit 6 of the lead byte is moved to bit 31; after inversion the number of
    // leading zeros equals the run of one-bits that starts at bit 6, i.e. the
    // number of continuation bytes announced by the lead byte.
    uint len = ctlz(~(f << 25));

    // len == 0        : lead byte is itself a continuation byte (10xxxxxx)
    // len > 3         : lead byte 0xF8 .. 0xFF, not a valid UTF-8 lead byte
    // len > str.length: the sequence is truncated
    // Both upper limits must hold, otherwise the loop below would either read
    // past the end of `str` or accept sequences longer than four code units.
    if (len == 0 || len > 3 || len > str.length) // invalid UTF-8
        return false;

    // Keep only the payload bits of the lead byte: 5, 4 or 3 bits.
    value &= (1 << (6 - len)) - 1;

    do
    {
        auto c = str[0];
        str = str[1 .. $];
        value <<= 6;
        if ((c & 0xC0) != 0x80)
            return false;
        value |= c & 0x3F;
    }
    while(--len);
    return true;
}

// Malformed input has to be reported through the return value.
@safe pure unittest
{
    dchar value;

    // Lead byte of a two-byte sequence without its continuation byte.
    static immutable char[1] truncated = [0xC3];
    const(char)[] str = truncated[];
    assert(!decodeFrontImpl(str, value));

    // 0xF8 announces four continuation bytes, which UTF-8 does not allow.
    static immutable char[5] fiveBytes = [0xF8, 0x88, 0x80, 0x80, 0x80];
    str = fiveBytes[];
    assert(!decodeFrontImpl(str, value));

    // 0xFF is never a valid lead byte, no matter how many bytes follow.
    static immutable char[8] allOnes = [0xFF, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80];
    str = allOnes[];
    assert(!decodeFrontImpl(str, value));

    // A well-formed four-byte sequence is still accepted (U+1F600).
    static immutable char[4] fourBytes = [0xF0, 0x9F, 0x98, 0x80];
    str = fourBytes[];
    assert(decodeFrontImpl(str, value));
    assert(value == 0x1F600 && str.length == 0);
}

version (D_Exceptions):

// Preallocated exception instance thrown by `decodeFront`.
package static immutable utfException = new Exception("Invalid UTF-8 sequence");

/++
Decodes one UTF-8 encoded code point from the front of `str` and removes the
code units that were read from it.

Params:
    str = input slice; advanced as described for $(LREF decodeFrontImpl)
Returns:
    The decoded code point.
Throws:
    A preallocated `Exception` with the message "Invalid UTF-8 sequence" when
    $(LREF decodeFrontImpl) reports failure, which includes an empty `str`.
+/
dchar decodeFront(scope ref inout(char)[] str) @safe pure @nogc @property
{
    dchar ret;
    if (decodeFrontImpl(str, ret))
    {
        return ret;
    }
    import mir.exception: toMutable;
    throw utfException.toMutable;
}

///
@safe pure unittest
{
    string str = "Hello, World!";

    assert(str.decodeFront == 'H' && str == "ello, World!");
    str = "å";
    assert(str.decodeFront == 'å' && str.length == 0);
}