mir.deser.text.readers source code

1 /+
2 Helpers for reading values from a given Ion token.
3 
4 Authors: Harrison Ford
5 +/
6 module mir.deser.text.readers;
7 
8 import mir.deser.text.skippers;
9 import mir.deser.text.tokenizer;
10 import mir.deser.text.tokens;
11 
/+
Tell whether `c` is a Unicode scalar value.

Returns:
    `true` when `c` is at most 0x10FFFF and lies outside the
    surrogate range 0xD800 .. 0xDFFF, `false` otherwise.
+/
private bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    immutable bool inUnicodeRange = c <= 0x10FFFF;
    immutable bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    return inUnicodeRange && !isSurrogate;
}
16 
/+
Dispatch to the reader matching a token type, and return what that reader produced.

$(WARNING This function does not check that the tokenizer's current token
actually is the token type you pass in. Use with caution.)
Params:
    token = The token type to read from the input range (compile-time argument).
    t = The tokenizer
Returns:
    The value returned by the `read*` function selected for `token`.
+/
auto readValue(IonTokenType token)(return ref IonTokenizer t) @nogc @safe pure
{
    import std.traits : EnumMembers;
    import std.string : chompPrefix;
    static foreach(idx, candidate; EnumMembers!IonTokenType) {{
        // A reader is looked up only for members ordered before TokenComma,
        // minus the invalid/EOF markers and the special float tokens.
        enum bool hasReader =
               candidate != IonTokenType.TokenInvalid
            && candidate != IonTokenType.TokenEOF
            && candidate != IonTokenType.TokenFloatInf
            && candidate != IonTokenType.TokenFloatMinusInf
            && candidate != IonTokenType.TokenFloatNaN
            && candidate < IonTokenType.TokenComma;

        static if (hasReader)
        {
            enum memberName = __traits(identifier, EnumMembers!IonTokenType[idx]);
            static if (token == candidate) {
                static if (candidate == IonTokenType.TokenDot) {
                    // TokenDot is read through the symbol-operator reader.
                    auto result = t.readSymbolOperator();
                }
                else {
                    // "TokenFoo" maps onto the call "t.readFoo()".
                    auto result = mixin("t.read" ~ memberName.chompPrefix("Token") ~ "()");
                }
                t.finished = true;

                return result;
            }
        }
    }}
    // No reader was selected for this token type.
    assert(0);
}
///
version(mir_ion_parser_test) unittest {
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // Tokenize `source`, read the first token as `kind`, then check both the
    // matched text and the character left in the input afterwards.
    void checkValue(IonTokenType kind)(string source, string wanted, char trailing) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == kind);
        auto value = readValue!(kind)(tokenizer);
        assert(value.matchedText == wanted);
        assert(tokenizer.readInput() == trailing);
    }

    checkValue!(IonTokenType.TokenNumber)("123123", "123123", 0);
}
70 
/+
Read an escaped character from the input range using the clob rules.

This is `readEscapedChar` instantiated with `isClob = true`.
Params:
    t = The tokenizer
Returns:
    a UTF-32 code-point
+/
dchar readEscapedClobChar(return ref IonTokenizer t) @nogc @safe pure {
    enum bool clobRules = true;
    return readEscapedChar!(clobRules)(t);
}
81 
/+
Decode the escape whose introducing backslash has already been consumed.

For simplicity's sake the result is always returned as the widest type (a UTF-32 code-point),
whichever escape form was used.
Params:
    isClob = When true, the escape character is read through `expect` with a predicate
             that rejects `U` and `u` (no unicode escapes inside clobs).
    t = The tokenizer
Returns:
    The code-point denoted by the escape.
Throws:
    IonTokenizerException (`invalidHexEscape`) if the escape character is not recognized.
+/
dchar readEscapedChar(bool isClob = false)(return ref IonTokenizer t) @nogc @safe pure 
{
    // Consume exactly `digitCount` hex digits, most significant first,
    // and fold them into a single code-point.
    dchar hexEscape(int digitCount)() @nogc @safe pure {
        dchar codePoint = 0;
        foreach (digitIndex; 0 .. digitCount) {
            const(char) digit = t.expect!isHexDigit;
            const(char) nibble = hexLiteral(digit);
            codePoint = (codePoint << 4) | nibble;
        }
        return codePoint;
    }

    char escape;
    static if (isClob) {
        escape = t.expect!"a != 'U' && a != 'u'"; // cannot have unicode escapes within clobs
    } else {
        escape = t.readInput();
    }

    switch (escape) {
        // Numeric escapes: \xHH, \uHHHH, \UHHHHHHHH
        case 'x':
            return hexEscape!2;
        case 'u':
            return hexEscape!4;
        case 'U':
            return hexEscape!8;
        case '0':
            // TODO: \0 should not normally exist (except in its escaped form) --
            // determine whether returning NUL here can confuse later stages.
            return '\0'; 
        // Control-character escapes map onto the D escape of the same letter.
        static foreach(letter; ['a', 'b', 't', 'n', 'f', 'r', 'v']) {
            case letter:
                return mixin("'\\" ~ letter ~ "'");
        }
        // These characters stand for themselves once escaped.
        static foreach(literal; ['?', '/', '\'', '"', '\\']) {
            case literal:
                return literal;
        }
        default:
            throw IonTokenizerErrorCode.invalidHexEscape.ionTokenizerException;
    }
}
// Test reading a unicode escape
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenizerException;
    import std.exception : assertThrown;

    // An escape body (without the backslash) and the code-point it must decode to.
    static struct Accepted
    {
        string input;
        dchar codePoint;
    }

    foreach (sample; [
        Accepted("U0001F44D", '\U0001F44D'),
        Accepted("u2248", '\u2248'),
        Accepted("x20", '\x20'),
        Accepted("a", '\a'),
        Accepted("b", '\b'),
        Accepted("?", '?'),
        Accepted("\"", '"'),
        Accepted("0", '\0'),
    ]) {
        auto tokenizer = tokenizeString(sample.input);
        assert(tokenizer.readEscapedChar() == sample.codePoint);
    }

    // Escape characters that are not recognized must be rejected.
    foreach (rejected; ["c0101", "d21231", "!"]) {
        auto tokenizer = tokenizeString(rejected);
        assertThrown!IonTokenizerException(tokenizer.readEscapedChar());
    }
}
166 
/+
Read an escape sequence and store its UTF-8 encoding in the tokenizer's escape buffer.

An escaped line ending (`\n`, `\r` or `\r\n` right after the backslash) is consumed
and produces no output.
Params:
    isClob = Forwarded to `readEscapedChar`.
    t = The tokenizer
Returns:
    The number of bytes written to `t.escapeSequence` (1 to 4),
    or 0 if an escaped line ending was consumed.
Throws:
    IonTokenizerException if the escape decodes to a surrogate code-point
    (`encodingSurrogateCode`) or to a value above 0x10FFFF (`encodingInvalidCode`).
+/
size_t readEscapeSeq(bool isClob = false)(return ref IonTokenizer t) @nogc @safe pure
{
    // Escaped line endings are swallowed whole.
    const(char) next = t.peekOne();
    if (next == '\n') {
        t.skipOne();
        return 0;
    }
    if (next == '\r') {
        const(char)[] pair = t.peekMax(2);
        if (pair.length == 2 && pair == "\r\n") {
            t.skipExactly(2);
        } else {
            t.skipOne();
        }
        return 0;
    }

    const(dchar) cp = readEscapedChar!(isClob)(t);
    // Zero out the escape sequence (since this buffer is re-used)
    t.resetEscapeBuffer();

    // Reject everything that has no UTF-8 encoding before touching the buffer.
    if (cp > 0x10FFFF) {
        assert(!isValidDchar(cp));
        throw IonTokenizerErrorCode.encodingInvalidCode.ionTokenizerException;
    }
    if (cp >= 0xD800 && cp <= 0xDFFF) {
        throw IonTokenizerErrorCode.encodingSurrogateCode.ionTokenizerException;
    }
    assert(isValidDchar(cp));

    // Encoding logic extracted from std.utf.encode
    if (cp <= 0x7F) {
        t.escapeSequence[0] = cast(char) cp;
        return 1;
    }
    if (cp <= 0x7FF) {
        t.escapeSequence[0] = cast(char)(0xC0 | (cp >> 6));
        t.escapeSequence[1] = cast(char)(0x80 | (cp & 0x3F));
        return 2;
    }
    if (cp <= 0xFFFF) {
        t.escapeSequence[0] = cast(char)(0xE0 | (cp >> 12));
        t.escapeSequence[1] = cast(char)(0x80 | ((cp >> 6) & 0x3F));
        t.escapeSequence[2] = cast(char)(0x80 | (cp & 0x3F));
        return 3;
    }

    // 0x10000 .. 0x10FFFF
    t.escapeSequence[0] = cast(char)(0xF0 | (cp >> 18));
    t.escapeSequence[1] = cast(char)(0x80 | ((cp >> 12) & 0x3F));
    t.escapeSequence[2] = cast(char)(0x80 | ((cp >> 6) & 0x3F));
    t.escapeSequence[3] = cast(char)(0x80 | (cp & 0x3F));
    return 4;
}
235 
/+
    Read a non-quoted symbol from our input range.

    The symbol is the longest run of identifier characters at the start of the
    tokenizer's window; it is consumed from the input.
    Params:
        t = The tokenizer
    Returns:
        The un-quoted symbol: its text as a slice of `t.input` and the index it starts at.
        A default-initialized value if the window is empty.
+/
IonTextSymbol readSymbol(return ref IonTokenizer t) @safe pure @nogc
{
    IonTextSymbol symbol;
    const(char)[] remaining = t.window;
    if (remaining.length == 0) return symbol;

    // Count the leading identifier characters.
    size_t length = 0;
    while (length < remaining.length && remaining[length].isIdentifierPart()) {
        length++;
    }

    const size_t stop = t.position + length;
    if (length > t.window.length || stop > t.input.length) {
        assert(0); // should never happen
    }

    symbol.matchedIndex = t.position;
    symbol.matchedText = t.input[t.position .. stop];
    t.skipExactly(length);

    return symbol;
}
// Test reading a symbol
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenizerException, IonTokenType;
    import std.exception : assertNotThrown;

    // Input, the symbol text expected from it, and the token type that must follow.
    static struct Sample
    {
        string input;
        string symbol;
        IonTokenType following;
    }

    foreach (sample; [
        Sample("hello", "hello", IonTokenType.TokenEOF),
        Sample("a", "a", IonTokenType.TokenEOF),
        Sample("abc", "abc", IonTokenType.TokenEOF),
        Sample("null +inf", "null", IonTokenType.TokenFloatInf),
        Sample("false,", "false", IonTokenType.TokenComma),
        // nan should not be a symbol -- we should treat it as its own case
        // Sample("nan]", "nan", IonTokenType.TokenCloseBracket),
    ]) {
        auto tokenizer = tokenizeString(sample.input);
        assertNotThrown!IonTokenizerException(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenSymbol);
        assert(tokenizer.readSymbol().matchedText == sample.symbol);
        assertNotThrown!IonTokenizerException(tokenizer.nextToken());
        assert(tokenizer.currentToken == sample.following);
    }
}
292 
/+
Read one segment of a quoted symbol from our input range,
decoding the escape sequence it starts with, if any.

Each call returns either a run of plain characters (a slice of `t.input`) or a
single decoded escape (a slice of `t.escapeSequence`, flagged with `isEscapeSequence`).
`isFinal` is set once the closing quote has been consumed; while it is false,
call again to get the next segment.

Params:
    t = The tokenizer
Returns:
    The next segment of the quoted symbol.
+/
IonTextQuotedSymbol readSymbolQuoted(return ref IonTokenizer t) @nogc @safe pure
{
    IonTextQuotedSymbol segment;
    segment.isFinal = true;
    const size_t begin = t.position;
    size_t plainCount = 0, finish = 0;

    while (true) {
        char c = t.expect!"a != 0 && a != '\\n'";

        if (c == '\'') {
            // found the end
            break;
        }
        if (c != '\\') {
            plainCount++;
            continue;
        }

        // A backslash after plain text closes the plain run; the escape
        // itself is left in the input for the next call.
        if (plainCount != 0) {
            t.unread(c);
            segment.isFinal = false;
            finish = t.position;
            break;
        }

        size_t esc = t.readEscapeSeq();
        if (esc == 0) {
            // escaped line ending: contributes nothing, keep scanning
            continue;
        }

        segment.matchedText = t.escapeSequence[0 .. esc];
        segment.matchedIndex = begin;
        segment.isEscapeSequence = true;
        // The symbol is finished only if the closing quote follows directly.
        if (t.peekOne() == '\'') {
            t.skipOne();
            segment.isFinal = true;
        } else {
            segment.isFinal = false;
        }
        return segment;
    }

    if (finish == 0) {
        // Stopped on the closing quote, which has already been consumed.
        finish = t.position - 1;
    }

    segment.matchedText = t.input[begin .. finish];
    segment.matchedIndex = begin;
    return segment;
}
// Test reading quoted symbols
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // A symbol without escapes must come back as one final, non-escape segment.
    void expectWhole(string source, string wanted, char trailing) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenSymbolQuoted);
        auto segment = tokenizer.readSymbolQuoted();
        assert(segment.matchedText == wanted);
        assert(segment.isFinal);
        assert(!segment.isEscapeSequence);
        assert(tokenizer.readInput() == trailing);
    }

    // A symbol with one escape in the middle must come back as three segments,
    // of which only the last is final.
    void expectThreeParts(string source, string[3] parts, char trailing) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenSymbolQuoted);

        foreach (index, part; parts) {
            auto segment = tokenizer.readSymbolQuoted();
            assert(segment.matchedText == part);
            if (index + 1 == parts.length) {
                assert(segment.isFinal);
            } else {
                assert(!segment.isFinal);
            }
        }
        assert(tokenizer.readInput() == trailing);
    }

    expectWhole("'a'", "a", 0);
    expectWhole("'a b c'", "a b c", 0);
    expectWhole("'null' ", "null", ' ');
    expectWhole("'false',", "false", ',');
    expectWhole("'nan']", "nan", ']');

    expectThreeParts("'a\\'b'", ["a", "'", "b"], 0);
    expectThreeParts(`'a\nb'`, ["a", "\n", "b"], 0);
    expectThreeParts("'a\\\\b'", ["a", "\\", "b"], 0);
    expectThreeParts(`'a\x20b'`, ["a", " ", "b"], 0);
    expectThreeParts(`'a\u2248b'`, ["a", "≈", "b"], 0);
    expectThreeParts(`'a\U0001F44Db'`, ["a", "👍", "b"], 0);
}
394 
/+
Read a symbol operator from the input range.

Consumes operator characters for as long as the next character is one.
Params:
    t = The tokenizer
Returns:
    The run of operator characters that was consumed (as a slice of `t.input`,
    possibly empty) together with the index it starts at.
+/
IonTextSymbolOperator readSymbolOperator(return ref IonTokenizer t) @safe @nogc pure
{
    IonTextSymbolOperator op;
    const size_t begin = t.position;
    op.matchedIndex = begin;

    for (char next = t.peekOne(); next.isOperatorChar(); next = t.peekOne()) {
        t.skipOne();
    }

    op.matchedText = t.input[begin .. t.position];
    return op;
}
416 
/+
Read one segment of a string (or clob) from the input range, decoding the
escape sequence it starts with, if any.

Each call returns either a run of plain characters (a slice of `t.input`), a single
decoded escape (a slice of `t.escapeSequence`, flagged with `isEscapeSequence`), or,
for long non-clob strings, a normalized new-line (flagged with `isNormalizedNewLine`).
`isFinal` is set once the closing quote(s) have been consumed; while it is false,
call again to get the next segment.

Params:
    longString = Is this string a 'long' string, defined by 3 single-quotes?
    isClob = Is this a clob? (unicode escapes are rejected, and an `IonTextClob` is returned)
    t = The tokenizer
Returns:
    The next segment of the string, as an `IonTextString` (or `IonTextClob` when `isClob`).
+/
auto readString(bool longString = false, bool isClob = false)(return ref IonTokenizer t) @safe @nogc pure
{
    static if (isClob) {
        IonTextClob val;
    } else {
        IonTextString val;
    } 

    val.isFinal = true;
    static if (longString && !isClob) {
        val.isLongString = true;
    }

    // `read` counts the plain characters seen so far in this segment;
    // `endIndex == 0` means "not decided yet" and is resolved after the loop.
    size_t read = 0, startIndex = t.position, endIndex = 0;
    loop: while (true) {
        char c = t.expect!"a != 0";

        static if (!longString) {
            // short strings may not contain a raw new-line
            t.expectFalse!(isNewLine, true)(c);
        }

        /*
        static if (isClob) {
            //t.expectFalse!(isInvalidChar, true)(c);
            t.expect!(isASCIIChar, true)(c);
        } else {
            t.expectFalse!(isInvalidChar, true)(c);
        }
        */

        s: switch (c) {
            static if (!longString) {
                case '"':
                    break loop;
            } else {
                static if (!isClob) {
                    case '\r':
                        // A carriage return after plain text closes the plain run;
                        // the new-line itself is handled by the next call.
                        if (read != 0) {
                            t.unread(c);
                            endIndex = t.position;
                            val.isFinal = false;
                            break loop;
                        }

                        const(char)[] v = t.peekMax(1);
                        if (v.length == 1 && v[0] == '\n') { // see if this is \r\n or just \r
                            t.skipOne();
                        }

                        // \r and \r\n are both reported as a single '\n'
                        t.resetEscapeBuffer();
                        t.escapeSequence[0] = '\n';
                        val.matchedText = t.escapeSequence[0 .. 1];
                        val.isNormalizedNewLine = true;
                        val.isFinal = false;

                        // do the same check, and see if this string ends *directly* after this newline
                        // again, peekExactly is acceptable here because the long string *MUST* end with
                        // a sequence of 3 quotes, and we should throw if it's not there.
                        if (t.peekExactly(3) == "'''") {
                            // consume the closing quotes, and skip whitespace.
                            // The call is kept outside of the assert so that it still
                            // runs when asserts are compiled out.
                            const consumed = t.skipExactly(3);
                            assert(consumed);
                            val.isFinal = true;
                            c = t.skipWhitespace!(true, false);
                            t.unread(c);
                        }
                        return val;
                }
                case '\'':
                    const(char)[] v = t.peekMax(2);
                    if (v.length == 2 && v[0] == '\'' && v[1] == '\'') {
                        // three quotes in a row: end of this long string
                        val.isFinal = true;
                        endIndex = t.position - 1;
                        t.skipExactly(2);
                        static if (isClob) {
                            c = t.skipWhitespace!(false, true);
                            if (c) {
                                t.unread(c);
                                break loop;
                            } else {
                                break s;
                            }
                        } else {
                            break loop;
                        }
                    } else {
                        // a lone quote is ordinary content
                        goto default;
                    }
            }
            case '\\':
                // A backslash after plain text closes the plain run;
                // the escape itself is left in the input for the next call.
                if (read != 0) {
                    t.unread(c);
                    endIndex = t.position;
                    val.isFinal = false;
                    break loop;
                }

                size_t esc = readEscapeSeq!(isClob)(t);
                static if (isClob) {
                    if (esc == 2) {
                        // XXX: hack
                        // Since we can't have unicode escapes, this HAS to be \x80 - \xFF.
                        // We shouldn't convert this into a UTF codepoint, and we should keep it as is.
                        break s;
                    } 
                }

                assert(esc <= 4); // sanity check that we do not have an escape larger then 4 chars
            
                val.matchedText = t.escapeSequence[0 .. esc];
                val.matchedIndex = startIndex;
                val.isEscapeSequence = true;
                val.isFinal = false;
                // check if the string ends *directly* after this escape,
                // if so, just consume the quotations, and call it a day
                static if (longString) {
                    // if this is a long string, there should be *at least* 3 extra
                    // characters left (for the ending quotes). this will throw 
                    // if they are not there.
                    if (t.peekExactly(3) == "'''") { 
                        // consume the closing quotes, and skip whitespace.
                        // The call is kept outside of the assert so that it still
                        // runs when asserts are compiled out.
                        const consumed = t.skipExactly(3);
                        assert(consumed);
                        val.isFinal = true;
                        static if (isClob) {
                            c = t.skipWhitespace!(false, true);
                        } else {
                            c = t.skipWhitespace!(true, false);
                        }
                        t.unread(c);
                    }
                } else {
                    if (t.peekOne() == '"') {
                        // consume the closing quote (outside of the assert, see above)
                        const consumed = t.skipOne();
                        assert(consumed);
                        val.isFinal = true;
                    }
                }
                if (esc >= 1) {
                    val.escapeSequenceType = IonTextEscapeType.UTF;
                } else {
                    val.escapeSequenceType = IonTextEscapeType.Hex;
                }
                return val;
            default:
                read++;
                break s;
        }
    }

    if (endIndex == 0) {
        // Stopped on a closing quote, which has already been consumed.
        endIndex = t.position - 1;
    }

    val.matchedText = t.input[startIndex .. endIndex];
    val.matchedIndex = startIndex;
    return val;
}
// Test reading a string
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // A string without escapes: one segment, then check the character that follows.
    void expectWhole(string source, string wanted, char trailing) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenString);
        auto segment = tokenizer.readString();
        assert(segment.matchedText == wanted);
        assert(tokenizer.readInput() == trailing);
    }

    // A plain run followed by an escape that ends the string:
    // first segment is plain and not final, second is the decoded escape and final.
    void expectPlainThenEscape(string source, string plain, string decoded, char trailing) {
        auto tokenizer = tokenizeString(source);

        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenString);

        auto head = tokenizer.readString();
        assert(head.matchedText == plain);
        assert(!head.isEscapeSequence);
        assert(!head.isFinal);

        auto tail = tokenizer.readString();
        assert(tail.matchedText == decoded);
        assert(tail.isEscapeSequence);
        assert(tail.isFinal);

        assert(tokenizer.readInput() == trailing);
    }

    expectWhole(`"Hello, world"`, "Hello, world", 0);
    expectPlainThenEscape(`"Hello! \U0001F44D"`, "Hello! ", "👍", 0);
    expectWhole(`"0xFOOBAR",`, "0xFOOBAR", ',');
}
618 
/+
Read one segment of a long string (a string whose contents are surrounded by three single quotes).

This is `readString` instantiated with `longString = true` and `isClob = false`.

$(NOTE As per the Ion specification, adjacent long strings separated only by whitespace
form a single value.)

Params:
    t = The tokenizer
Returns:
    The next segment of the long string.
+/
IonTextString readLongString(return ref IonTokenizer t) @safe @nogc pure
{
    enum bool isLong = true;
    enum bool isClob = false;
    return readString!(isLong, isClob)(t);
}
// Test reading a long string
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // A single long string read in one final segment.
    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str = t.readLongString();
        t.finished = true;
        assert(str.matchedText == expected);
        assert(t.readInput() == after);
        assert(str.isFinal);
    }

    // Two adjacent long strings, each read in one final segment.
    void testMultiPart(string ts, string expected1, string expected2, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);

        auto str = t.readLongString();
        t.finished = true;
        assert(str.matchedText == expected1);
        assert(str.isFinal);

        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str2 = t.readLongString();
        assert(str2.matchedText == expected2);
        assert(t.readInput() == after);
        // check the *second* segment here; the first one was already checked above
        assert(str2.isFinal);
    }

    // A long string containing a line ending.
    //   normalized = the line ending is reported as its own "\n" segment
    //   eofFinal   = that "\n" segment is directly followed by the closing quotes
    void testNewLine(string ts, string expected1, string expected2, bool normalized, bool eofFinal, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str = t.readLongString();
        assert(str.matchedText == expected1);
        t.finished = true;
        if (normalized) {
            assert(!str.isFinal);
            auto newLine = t.readLongString();
            t.finished = true;
            assert(newLine.isNormalizedNewLine);
            assert(newLine.matchedText == "\n");
            if (eofFinal) {
                assert(newLine.isFinal);
                assert(t.nextToken());
                assert(t.currentToken == IonTokenType.TokenLongString);
            } else {
                assert(!newLine.isFinal);
            }
        } else {
            assert(str.isFinal);
            assert(t.nextToken());
            assert(t.currentToken == IonTokenType.TokenLongString);
        }
        auto rest = t.readLongString();
        assert(rest.matchedText == expected2);
        assert(rest.isFinal);
        assert(t.readInput() == after);
    }

    test(`'''Hello, world'''`, "Hello, world", 0);
    testMultiPart(`'''Hello! ''''''\U0001F44D'''`, "Hello! ", "👍", 0);
    test(`'''0xFOOBAR''',`, "0xFOOBAR", ',');
    test(`'''Hello, 'world'!'''`, "Hello, \'world\'!", 0);
    testMultiPart(`'''Hello,'''''' world!'''`, "Hello,", " world!", 0);
    testMultiPart(`'''Hello,'''     ''' world!'''`, "Hello,", " world!", 0);
    // Test the normalization of new-lines in long strings here.
    testNewLine("'''Hello, \r\n''' '''world!'''", "Hello, ", "world!", true, true, 0); // normalized, crlf precedes end of string
    testNewLine("'''Hello, \r\n world!'''", "Hello, ", " world!", true, false, 0); // normalized, but there is extra text
    testNewLine("'''Hello, \n''' '''world!'''", "Hello, \n", "world!", false, false, 0); // not normalized, no extra text
    testNewLine("'''Hello, \r''' '''world!'''", "Hello, ", "world!", true, true, 0); // normalized, bare cr precedes end of string
    testNewLine("'''Hello, \r \nworld!'''", "Hello, ", " \nworld!", true, false, 0); // normalized, but there is extra text
}
714 
/+
Read one segment of a clob, and consume the closing `}}` once the clob is complete.

$(NOTE As per Ion specification, a clob does not contain Base64 data. Use readBlob if you are expecting to decode Base64 data.)

Params:
    longClob = Is the clob written with long (triple-quoted) strings? If so, a following
               triple-quoted string is treated as a continuation of the same clob.
    t = The tokenizer
Returns:
    The next segment of the clob. While `isFinal` is false, call again for more.
+/

IonTextClob readClob(bool longClob = false)(return ref IonTokenizer t) @safe @nogc pure
{
    // Always read out bytes, as clobs are octet-based (and not necessarily a string)
    auto clob = readString!(longClob, true)(t);
    char next;
    static if (longClob) {
        clob.isLongClob = true;
        if (clob.isFinal) {
            next = t.skipLobWhitespace();
            // another triple-quoted string follows: the clob is not finished yet
            if (next == '\'' && t.isTripleQuote()) {
                clob.isFinal = false;
                return clob;
            }
        }
    }

    // the closing }} is read ONLY once the last segment has been produced
    if (!clob.isFinal) {
        return clob;
    }

    static if (longClob) {
        // whitespace was already skipped above; `next` must be the first '}'
        next = t.expect!("a == '}'", true)(next);
    } else {
        // after skipping any whitespace, it should be the terminator ('}')
        next = t.expect!("a == '}'", true)(t.skipLobWhitespace());
    }
    // and no whitespace may sit between one bracket and the other
    next = t.expect!"a == '}'";

    return clob;
}
// Test reading a short clob
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // Clob body (from its opening quote on), expected contents, and the
    // character expected right after the closing }}.
    static struct Sample
    {
        string input;
        string contents;
        char trailing;
    }

    foreach (sample; [
        Sample(`"Hello, world"}}`, "Hello, world", 0),
        Sample(`"0xF00BAR"}}, `, "0xF00BAR", ','),
    ]) {
        auto tokenizer = tokenizeString(sample.input);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenString);
        assert(tokenizer.readClob().matchedText == sample.contents);
        assert(tokenizer.readInput() == sample.trailing);
    }
}
772 
/+
Helper to read a long clob from the input stream.

This is `readClob` instantiated with `longClob = true`; see [readClob] for any notes.
Params:
    t = The tokenizer
Returns:
    The next segment of the clob.
+/
IonTextClob readLongClob(return ref IonTokenizer t) @safe @nogc pure
{
    enum bool isLong = true;
    return readClob!(isLong)(t);
}
786 
/+
Read a blob from the input stream, and return its (still encoded) contents.

Characters are consumed, skipping lob whitespace, up to and including the closing `}}`.

$(NOTE This function does not verify if the Base64 contained is valid, or if it is even Base64.)
Params:
    t = The tokenizer
Returns:
    The text of the blob as a slice of `t.input`, running from the starting position up to
    the position recorded after the last non-`}` character, and the index it starts at.
+/
IonTextBlob readBlob(return ref IonTokenizer t) @safe @nogc pure
{
    IonTextBlob blob;
    const size_t begin = t.position;
    size_t finish = begin;

    for (;;) {
        char c = t.expect!("a != 0", true)(t.skipLobWhitespace());
        if (c == '}') {
            break;
        }
        finish = t.position; // grow our end index as we get more data
    }

    // the second closing brace must follow immediately
    char closing = t.expect!"a == '}'";
    t.finished = true;

    blob.matchedText = t.input[begin .. finish];
    blob.matchedIndex = begin;
    return blob;
}
/+
Read a number from the input stream.

Accepts an optional sign, an integer part without leading zeros, an optional
fractional part and an optional exponent introduced by `e`, `E`, `d` or `D`.
The number must be followed by a stop character, which is left in the input.

Params:
    t = The tokenizer
Returns:
    The text of the number (as a slice of `t.input`) and the index it starts at.
Throws:
    IonTokenizerException (`invalidLeadingZeros`) if the integer part starts with `0`
    but is more than one digit long.
+/

IonTextNumber readNumber(return ref IonTokenizer t) @safe @nogc pure
{
    import mir.ion.type_code : IonTypeCode;
    IonTextNumber result;
    const size_t begin = t.position;

    // Exponent: an optional sign, then digits.
    void readExponent() @safe @nogc pure {
        char e = t.readInput();
        if (e == '+' || e == '-') {
            e = t.expect!"a != 0";
        }

        readDigits(t, e);
    }

    // optional sign
    char c = t.readInput();
    if (c == '-' || c == '+') {
        c = t.readInput();
    }

    // integer part
    immutable char leader = c;
    const(char)[] digits = readDigits(t, leader);
    // anything but a plain 0 fails here, since leading zeros are not supported
    if (leader == '0' && digits.length != 1) {
        throw IonTokenizerErrorCode.invalidLeadingZeros.ionTokenizerException;
    }

    // optional fractional part (the digits after the dot may be absent)
    c = t.readInput();
    if (c == '.') {
        c = t.readInput();
        if (c.isDigit) {
            immutable char decimalLeader = t.expect!("a != 0", true)(c);
            readDigits(t, decimalLeader);
            c = t.readInput();
        }
    }

    // optional exponent
    if (c == 'e' || c == 'E' || c == 'd' || c == 'D') {
        readExponent();
    } else {
        // this is not a character we want, so unread it (for now)
        t.unread(c); 
    }

    // the number has to be followed by a stop character, which is not consumed
    c = t.expect!(t.isStopChar);
    t.unread(c);

    result.matchedText = t.input[begin .. t.position];
    result.matchedIndex = begin;

    return result; 
}
884 // Test reading numbers
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;
    import mir.ion.type_code : IonTypeCode;

    // One row per case: the raw input, the text the number is expected to
    // match, and the character expected from the next readInput() call
    // (0 in the rows where nothing follows the number).
    static struct Case { string input; string matched; char next; }

    Case[] cases = [
        Case("12341", "12341", 0),
        Case("-12312", "-12312", 0),
        Case("0.420d2", "0.420d2", 0),
        Case("1.1999999999999999555910790149937383830547332763671875e0",
             "1.1999999999999999555910790149937383830547332763671875e0", 0),
        Case("1.1999999999999999e0, ", "1.1999999999999999e0", ','),
    ];

    foreach (tc; cases) {
        auto tok = tokenizeString(tc.input);
        assert(tok.nextToken());
        assert(tok.currentToken == IonTokenType.TokenNumber);
        auto number = tok.readNumber();
        assert(number.matchedText == tc.matched);
        assert(tok.readInput() == tc.next);
    }
}
907 
/+
Read a run of digits from the input stream, given the first digit of the run.

The leader is validated and pushed back onto the input, after which the whole
run is consumed by `readRadixDigits` (using its default digit validator).
Params:
    t = The tokenizer
    leader = The first digit of the run, already read from the input
Returns:
    The slice of the input that `readRadixDigits` returns for this run.
Throws:
    A tokenizer exception (`expectedValidLeader`) if `leader` is not a digit.
+/
const(char)[] readDigits(return ref IonTokenizer t, char leader) @safe @nogc pure
{
    if (isDigit(leader)) {
        // Hand the leader back so readRadixDigits sees the run from its start.
        t.unread(leader);
        return readRadixDigits(t);
    }
    throw IonTokenizerErrorCode.expectedValidLeader.ionTokenizerException;
}
928 
/+
Read as many digits from the input stream as possible, given a validator.

Characters are consumed until one is rejected by the validator; that character
is pushed back with `unread`. An underscore is skipped over, provided the
character right after it (peeked, not consumed) passes the validator.
Params:
    isValid = The validation function deciding whether a character is a digit.
    t = The tokenizer
Returns:
    The slice of the input between the position on entry and the position on
    exit. Any underscores that were skipped are part of that slice.
+/
const(char)[] readRadixDigits(alias isValid = isDigit)(return ref IonTokenizer t) 
{
    import mir.functional : naryFun;
    immutable size_t begin = t.position;
    for (;;) {
        char ch = t.readInput();
        if (ch != '_') {
            if (naryFun!isValid(ch)) {
                continue; // still inside the run
            }
            // First character that is not a digit: give it back and stop.
            t.unread(ch);
            return t.input[begin .. t.position];
        }
        // A separator has to be followed by another digit.
        t.expect!(isValid, true)(t.peekOne());
    }
}
957 
/+
Read a radix number, given two validation functions for its marker and the validity of each digit read.

The expected shape is an optional '-', a '0', the radix marker, and then at
least one digit. When a character follows the digits it is checked against
`t.isStopChar` and pushed back with `unread`.

Params:
    isMarker = A validation function to check if the marker is valid (0b/0x/etc)
    isValid = A validation function to check if every digit found is valid (0-1/0-9A-F/etc)
    t = The tokenizer
Returns:
    The slice of the input covering the full radix number (including any sign,
    the leading 0 and the marker).
+/
const(char)[] readRadix(alias isMarker, alias isValid)(return ref IonTokenizer t) @safe @nogc pure
{
    size_t startIndex = t.position;
    char c = t.readInput();
    if (c == '-') {
        c = t.readInput();
    }

    // 0
    t.expect!("a == '0'", true)(c);
    // 0(b || x)
    t.expect!isMarker;
    // The marker must be followed directly by a digit. This rejects a leading
    // separator (0x_ / 0b_) as well as a literal with no digits at all (a bare
    // 0x / 0b), which would otherwise be returned as if it were a number.
    t.expect!(isValid, true)(t.peekOne());
    readRadixDigits!(isValid)(t);
    c = t.readInput();
    if (c) {
        // Something follows the digits: it has to be a stop character, and it
        // is left in the input for the caller.
        t.expect!(t.isStopChar, true)(c);
        t.unread(c);
    }

    return t.input[startIndex .. t.position];
}
990 
/+
Read a binary number (marked by '0b' or '0B') from the input stream.

This is `readRadix` instantiated with a marker check for 'b'/'B' and a digit
check for '0'/'1'.

Params:
    t = The tokenizer
Returns:
    The slice of the input that `readRadix` returns for the binary number.
+/
const(char)[] readBinary(return ref IonTokenizer t) @safe @nogc pure
{
    enum binaryMarker = "a == 'b' || a == 'B'";
    enum binaryDigit = "a == '0' || a == '1'";
    return t.readRadix!(binaryMarker, binaryDigit);
}
1003 // Test reading a binary number
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // One row per case: the raw input, the text readBinary is expected to
    // return, and the character expected from the next readInput() call.
    static struct Case { string input; string matched; char next; }

    Case[] cases = [
        Case("0b101011010", "0b101011010", 0),
        Case("0b100000101000001010000010100000101000001 ", "0b100000101000001010000010100000101000001", ' '),
        Case("0b11011110101011011011111011101111,", "0b11011110101011011011111011101111", ','),
        Case("      0b11011110101011011011111011101111,", "0b11011110101011011011111011101111", ','),
    ];

    foreach (tc; cases) {
        auto tok = tokenizeString(tc.input);
        assert(tok.nextToken());
        assert(tok.currentToken == IonTokenType.TokenBinary);
        assert(tok.readBinary() == tc.matched);
        assert(tok.readInput() == tc.next);
    }
}
1022 
/+
Read a hex number (marked by '0x' or '0X') from the input stream.

This is `readRadix` instantiated with a marker check for 'x'/'X' and
`isHexDigit` as the digit check.

Params:
    t = The tokenizer
Returns:
    The slice of the input that `readRadix` returns for the hex number.
+/
const(char)[] readHex(return ref IonTokenizer t) @safe @nogc pure
{
    enum hexMarker = "a == 'x' || a == 'X'";
    return t.readRadix!(hexMarker, isHexDigit);
}
1035 // Test reading a hex number
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // One row per single-literal case: the raw input, the text readHex is
    // expected to return, and the character expected from the next
    // readInput() call.
    static struct Case { string input; string matched; char next; }

    Case[] cases = [
        Case("0xBADBABE", "0xBADBABE", 0),
        Case("0x414141", "0x414141", 0),
        Case("0x0", "0x0", 0),
        Case("     0x414141", "0x414141", 0),
        Case("     0x414141,", "0x414141", ','),
    ];

    foreach (tc; cases) {
        auto tok = tokenizeString(tc.input);
        assert(tok.nextToken());
        assert(tok.currentToken == IonTokenType.TokenHex);
        assert(tok.readHex() == tc.matched);
        assert(tok.readInput() == tc.next);
    }

    // Two literals split by a comma: after the separator has been read,
    // readHex is called a second time on the same tokenizer.
    {
        auto tok = tokenizeString("     0x414141,0x414142");
        assert(tok.nextToken());
        assert(tok.currentToken == IonTokenType.TokenHex);
        assert(tok.readHex() == "0x414141");
        assert(tok.readInput() == ',');
        assert(tok.readHex() == "0x414142");
    }
}
1065 
/+
Read a ISO-8601 extended timestamp from the input stream.

The accepted shapes are `yyyyT`, `yyyy-mmT`, `yyyy-mm-dd`, `yyyy-mm-ddT`
optionally followed by a time (`hh:mm`, `hh:mm:ss`, `hh:mm:ss.fff`) with an
offset or `Z`, and a bare "time of day" that starts directly at `hh:mm`.

$(NOTE This function does some rudimentary checks to see if the timestamp is valid
(digit counts and separators), but it does nothing more than that.)

Params:
    t = The tokenizer
Returns:
    An `IonTextTimestamp` whose `matchedText` is the slice of the input covering
    the timestamp, and whose `matchedIndex` is the position that slice starts at.
+/
IonTextTimestamp readTimestamp(return ref IonTokenizer t) @safe @nogc pure 
{
    IonTextTimestamp val;
    size_t startIndex = t.position;

    // Consume exactly `nums` digits, then return the character that follows.
    char readTSDigits(int nums) @safe @nogc pure {
        for (int i = 0; i < nums; i++) {
            t.expect!isDigit;
        }
        return t.readInput();
    }

    // If `c` starts an offset (+/-), consume "hh:mm" and return the character
    // after it; otherwise return `c` untouched.
    char readTSOffset(char c) @safe @nogc pure {
        if (c != '-' && c != '+') {
            return c; 
        }
        t.expect!("a == ':'", true)(readTSDigits(2));
        return readTSDigits(2);
    }

    // `c` must begin an offset or be 'z'/'Z'; returns the character that
    // follows the offset / zulu marker.
    char readTSOffsetOrZ(char c) @safe @nogc pure {
        t.expect!("a == '-' || a == '+' || a == 'z' || a == 'Z'", true)(c);
        if (c == '-' || c == '+') {
            return readTSOffset(c);
        }
        if (c == 'z' || c == 'Z') {
            return t.readInput();
        }
        assert(0);
    }

    // Record where the timestamp started and the text consumed so far.
    // Every return path goes through here so both fields are always filled in.
    IonTextTimestamp readTSMatched() @safe @nogc pure {
        val.matchedIndex = startIndex;
        val.matchedText = t.input[startIndex .. t.position];
        return val;
    }

    // `c` is the character after the timestamp: check it against the stop
    // characters, push it back, and record the match.
    IonTextTimestamp readTSFinish(char c) @safe @nogc pure {
        t.expect!(t.isStopChar, true)(c);
        t.unread(c);
        return readTSMatched();
    }

    // could be either:
    // yyyy(T || -)
    // or hh
    char c = readTSDigits(2);
    // is this a year? if so, then the character after
    // the first two digits should be a digit... if not,
    // we can just assume that it's a "time of day"
    if (c.isDigit())
    {
        c = t.expect!("a == 'T' || a == '-'", true)(readTSDigits(1));
        if (c == 'T') {
            // yyyyT
            return readTSMatched();
        }
        // yyyy-mm(T || -)
        c = t.expect!("a == 'T' || a == '-'", true)(readTSDigits(2));
        if (c == 'T') {
            // yyyy-mmT
            return readTSMatched();
        }
        // yyyy-mm-dd(T)?
        c = readTSDigits(2);
        if (c != 'T') {
            return readTSFinish(c);
        }
        // yyyy-mm-ddT 
        c = t.readInput();
        if (!c.isDigit()) {
            // yyyy-mm-ddT(+ || -)hh:mm
            c = readTSOffset(c);
            return readTSFinish(c);
        }
        // (yyyy-mm-ddT)?hh
        c = t.expect!("a == ':'", true)(readTSDigits(1));
    }
    else
    {
        // hh
        c = t.expect!("a == ':'", true)(c);
    }

    // (yyyy-mm-ddT)?hh:mm
    c = readTSDigits(2);
    if (c != ':') {
        // (yyyy-mm-ddT)?hh:mm(+-|Z)?
        if (c) {
            c = readTSOffsetOrZ(c);
        }
        return readTSFinish(c);
    }

    // (yyyy-mm-ddT)?hh:mm:ss
    c = readTSDigits(2);

    if (c != '.') {
        // (yyyy-mm-ddT)?hh:mm:ss(Z)?
        if (c) {
            c = readTSOffsetOrZ(c);
        }
        return readTSFinish(c);
    }

    // (yyyy-mm-ddT)?hh:mm:ss.ssssZ
    c = t.readInput();
    if (c.isDigit()) {
        readDigits(t, c);
        // Only fetch a fresh character once the fraction digits have been
        // consumed; when no digit follows the '.', `c` already holds the
        // character to examine and must not be thrown away.
        c = t.readInput();
    }

    if (c) {
        c = readTSOffsetOrZ(c);
    } 

    return readTSFinish(c);
}
1193 // Test reading timestamps
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // One row per case: the raw input, the text the timestamp is expected to
    // match, and the character expected from the next readInput() call
    // (0 in the rows where nothing follows the timestamp).
    static struct Case { string input; string matched; char next; }

    Case[] cases = [
        // Date-based timestamps
        Case("2001T", "2001T", 0),
        Case("2001-01T,", "2001-01T", ','),
        Case("2001-01-02}", "2001-01-02", '}'),
        Case("2001-01-02T ", "2001-01-02T", ' '),
        Case("2001-01-02T+00:00\t", "2001-01-02T+00:00", '\t'),
        Case("2001-01-02T-00:00\n", "2001-01-02T-00:00", '\n'),
        Case("2001-01-02T03:04+00:00 ", "2001-01-02T03:04+00:00", ' '),
        Case("2001-01-02T03:04-00:00 ", "2001-01-02T03:04-00:00", ' '),
        Case("2001-01-02T03:04Z ", "2001-01-02T03:04Z", ' '),
        Case("2001-01-02T03:04z ", "2001-01-02T03:04z", ' '),
        Case("2001-01-02T03:04:05Z ", "2001-01-02T03:04:05Z", ' '),
        Case("2001-01-02T03:04:05+00:00 ", "2001-01-02T03:04:05+00:00", ' '),
        Case("2001-01-02T03:04:05.666Z ", "2001-01-02T03:04:05.666Z", ' '),
        Case("2001-01-02T03:04:05.666666z ", "2001-01-02T03:04:05.666666z", ' '),

        // Test new "time of day" timestamps
        Case("03:04+00:00", "03:04+00:00", 0),
        Case("03:04-00:00", "03:04-00:00", 0),
        Case("03:04Z", "03:04Z", 0),
        Case("03:04z", "03:04z", 0),
        Case("03:04:05Z", "03:04:05Z", 0),
        Case("03:04:05+00:00", "03:04:05+00:00", 0),
        Case("03:04:05.666Z", "03:04:05.666Z", 0),
        Case("03:04:05.666z", "03:04:05.666z", 0),
        Case("03:04:05.666666Z", "03:04:05.666666Z", 0),
        Case("03:04:05.666666z", "03:04:05.666666z", 0),
    ];

    foreach (tc; cases) {
        auto tok = tokenizeString(tc.input);
        assert(tok.nextToken());
        assert(tok.currentToken == IonTokenType.TokenTimestamp);
        assert(tok.readTimestamp().matchedText == tc.matched);
        assert(tok.readInput() == tc.next);
    }
}