1 /+
2 Helpers for reading values from a given Ion token.
4 Authors: Harrison Ford
5 +/
6 module mir.deser.text.readers;
8 import mir.deser.text.skippers;
9 import mir.deser.text.tokenizer;
10 import mir.deser.text.tokens;
// Reports whether `c` is a Unicode scalar value: no greater than U+10FFFF
// and not inside the surrogate range U+D800 .. U+DFFF.
private bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    if (isSurrogate)
        return false;
    return c <= 0x10FFFF;
}
/+
Read the contents of a token whose type is chosen at compile time.

For `TokenDot` this calls `t.readSymbolOperator()`. For every other supported
token type it calls `t.readXxx()`, where `Xxx` is the enum member's name with
its `Token` prefix removed (e.g. `TokenNumber` calls `t.readNumber()`).
After the read, `t.finished` is set to `true`.

$(WARNING This function does not check that the tokenizer's current token
is the token type you pass in. Use with caution.)
Params:
    token = The token type to read from the input range (template argument).
    t = The tokenizer
Returns:
    Whatever the selected reader returns for the given token type.
    Token types excluded from dispatch (`TokenInvalid`, `TokenEOF`,
    `TokenFloatInf`, `TokenFloatMinusInf`, `TokenFloatNaN` and every member
    that is not less than `TokenComma`) end up in `assert(0)`.
+/
auto readValue(IonTokenType token)(return ref IonTokenizer t) @nogc @safe pure
{
    import std.string : chompPrefix;
    import std.traits : EnumMembers;

    static foreach (idx, candidate; EnumMembers!IonTokenType) {{
        // Only these members have a reader this function may dispatch to.
        enum bool dispatchable =
               candidate != IonTokenType.TokenInvalid
            && candidate != IonTokenType.TokenEOF
            && candidate != IonTokenType.TokenFloatInf
            && candidate != IonTokenType.TokenFloatMinusInf
            && candidate != IonTokenType.TokenFloatNaN
            && candidate < IonTokenType.TokenComma;

        static if (dispatchable && token == candidate) {
            static if (candidate == IonTokenType.TokenDot) {
                auto result = t.readSymbolOperator();
            }
            else {
                // Build the call "t.read<Name>()" from the enum member's identifier.
                enum memberName = __traits(identifier, EnumMembers!IonTokenType[idx]);
                auto result = mixin("t.read" ~ memberName.chompPrefix("Token") ~ "()");
            }
            t.finished = true;
            return result;
        }
    }}
    assert(0);
}
///
version(mir_ion_parser_test) unittest {
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // Tokenize `source`, check that the first token has type `token`, read it
    // through readValue, then compare the matched text and the next input char.
    void check(IonTokenType token)(string source, string wantText, char wantNext) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == token);
        auto value = readValue!(token)(tokenizer);
        assert(value.matchedText == wantText);
        assert(tokenizer.readInput() == wantNext);
    }

    check!(IonTokenType.TokenNumber)("123123", "123123", 0);
}
/+
Read one escaped character inside a clob.

This simply forwards to `readEscapedChar` with its `isClob` flag set.
Params:
    t = The tokenizer
Returns:
    The code-point produced by `readEscapedChar!(true)`.
+/
dchar readEscapedClobChar(return ref IonTokenizer t) @nogc @safe pure {
    enum bool clobMode = true;
    return readEscapedChar!(clobMode)(t);
}
/+
Decode the character that follows a backslash in the input range.

The selector character picks the escape: a single-letter escape (`0`, `a`, `b`,
`t`, `n`, `f`, `r`, `v`), a character that stands for itself (`?`, `/`, `'`,
`"`, `\`), or a hex escape of 2 (`x`), 4 (`u`) or 8 (`U`) hex digits.
The result is always returned as a UTF-32 code-point, the widest character type.
Params:
    isClob = When true, the selector is read with a check that rejects `U` and `u`.
    t = The tokenizer
Returns:
    The code-point represented by the escape.
Throws:
    IonTokenizerException (code `invalidHexEscape`) for an unknown selector.
    The `expect` calls used here may also throw; see the tokenizer.
+/
dchar readEscapedChar(bool isClob = false)(return ref IonTokenizer t) @nogc @safe pure 
{
    // Consume exactly `digitCount` hex digits and fold them, most significant
    // digit first, into a single code-point.
    dchar readHexEscapeLiteral(int digitCount)() @nogc @safe pure { 
        dchar accumulated = 0;
        foreach (step; 0 .. digitCount) {
            const(char) digit = t.expect!isHexDigit;
            const(char) nibble = hexLiteral(digit);
            accumulated = (accumulated << 4) | nibble;
        }
        return accumulated;
    }

    char selector;
    static if (isClob) {
        selector = t.expect!"a != 'U' && a != 'u'"; // cannot have unicode escapes within clobs
    } else {
        selector = t.readInput();
    }

    switch (selector) {
        // NUL is produced as-is; whether callers cope with an embedded '\0'
        // is not checked here.
        case '0': return '\0';

        // Single-letter escapes map to the control character of the same name.
        case 'a': return '\a';
        case 'b': return '\b';
        case 't': return '\t';
        case 'n': return '\n';
        case 'f': return '\f';
        case 'r': return '\r';
        case 'v': return '\v';

        // These characters escape to themselves.
        case '?':
        case '/':
        case '\'':
        case '"':
        case '\\':
            return selector;

        // Hex escapes of fixed width.
        case 'U': return readHexEscapeLiteral!8;
        case 'u': return readHexEscapeLiteral!4;
        case 'x': return readHexEscapeLiteral!2;

        default:
            throw IonTokenizerErrorCode.invalidHexEscape.ionTokenizerException;
    }
}
// Test reading escapes (the input is the text that follows the backslash)
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenizerException;
    import std.exception : assertThrown;

    static struct Case {
        string input;
        dchar want;
    }

    // Escapes that must decode to the given code-point.
    foreach (c; [
        Case("U0001F44D", '\U0001F44D'),
        Case("u2248", '\u2248'),
        Case("x20", '\x20'),
        Case("a", '\a'),
        Case("b", '\b'),
        Case("?", '?'),
        Case("\"", '"'),
        Case("0", '\0'),
    ]) {
        auto tokenizer = tokenizeString(c.input);
        assert(tokenizer.readEscapedChar() == c.want);
    }

    // Unknown escape selectors must be rejected.
    foreach (bad; ["c0101", "d21231", "!"]) {
        auto tokenizer = tokenizeString(bad);
        assertThrown!IonTokenizerException(tokenizer.readEscapedChar());
    }
}
/+
Read one escape sequence (the backslash has already been consumed) and store
its UTF-8 encoding in `t.escapeSequence`.

An escaped line terminator (`\r\n`, `\r` or `\n`) is consumed and produces
no output. Any other escape is decoded with `readEscapedChar`, the escape
buffer is reset, and the code-point is written as 1 to 4 UTF-8 code units.
Params:
    isClob = Forwarded to `readEscapedChar`.
    t = The tokenizer
Returns:
    The number of bytes written to `t.escapeSequence` (1 .. 4), or 0 if the
    escape was a line terminator.
Throws:
    IonTokenizerException with code `encodingSurrogateCode` for a code-point in
    U+D800 .. U+DFFF, or `encodingInvalidCode` for one above U+10FFFF.
+/
size_t readEscapeSeq(bool isClob = false)(return ref IonTokenizer t) @nogc @safe pure
{
    // An escaped line terminator is swallowed and yields nothing.
    switch (t.peekOne()) {
        case '\r':
            const(char)[] lookahead = t.peekMax(2);
            if (lookahead == "\r\n")
                t.skipExactly(2);
            else
                t.skipOne();
            return 0;
        case '\n':
            t.skipOne();
            return 0;
        default:
            break;
    }

    const(dchar) codePoint = readEscapedChar!(isClob)(t);

    // The buffer is shared between calls, so clear it before encoding.
    t.resetEscapeBuffer();

    // Reject anything that is not a Unicode scalar value.
    if (codePoint > 0x10FFFF)
    {
        assert(!isValidDchar(codePoint));
        throw IonTokenizerErrorCode.encodingInvalidCode.ionTokenizerException;
    }
    if (0xD800 <= codePoint && codePoint <= 0xDFFF)
        throw IonTokenizerErrorCode.encodingSurrogateCode.ionTokenizerException;
    assert(isValidDchar(codePoint));

    // UTF-8 encoding (same scheme as std.utf.encode).
    if (codePoint <= 0x7F)
    {
        t.escapeSequence[0] = cast(char) codePoint;
        return 1;
    }
    else if (codePoint <= 0x7FF)
    {
        t.escapeSequence[0] = cast(char)(0xC0 | (codePoint >> 6));
        t.escapeSequence[1] = cast(char)(0x80 | (codePoint & 0x3F));
        return 2;
    }
    else if (codePoint <= 0xFFFF)
    {
        t.escapeSequence[0] = cast(char)(0xE0 | (codePoint >> 12));
        t.escapeSequence[1] = cast(char)(0x80 | ((codePoint >> 6) & 0x3F));
        t.escapeSequence[2] = cast(char)(0x80 | (codePoint & 0x3F));
        return 3;
    }
    else
    {
        t.escapeSequence[0] = cast(char)(0xF0 | (codePoint >> 18));
        t.escapeSequence[1] = cast(char)(0x80 | ((codePoint >> 12) & 0x3F));
        t.escapeSequence[2] = cast(char)(0x80 | ((codePoint >> 6) & 0x3F));
        t.escapeSequence[3] = cast(char)(0x80 | (codePoint & 0x3F));
        return 4;
    }
}
/+
    Read a non-quoted symbol from our input range.

    Consumes the longest prefix of the tokenizer's window for which
    `isIdentifierPart` holds.
    Params:
        t = The tokenizer
    Returns:
        An IonTextSymbol whose `matchedText` is the slice of `t.input` covering
        the symbol and whose `matchedIndex` is its starting position. If the
        window is empty, a default-initialised IonTextSymbol is returned.
+/
IonTextSymbol readSymbol(return ref IonTokenizer t) @safe pure @nogc
{
    IonTextSymbol symbol;
    const(char)[] remaining = t.window;

    if (remaining.length == 0)
        return symbol;

    // Count the leading identifier characters.
    size_t matched = 0;
    while (matched < remaining.length && remaining[matched].isIdentifierPart())
        ++matched;

    immutable size_t stop = t.position + matched;
    if (!(matched <= t.window.length && stop <= t.input.length)) {
        assert(0); // should never happen
    }

    symbol.matchedIndex = t.position;
    symbol.matchedText = t.input[t.position .. stop];
    t.skipExactly(matched);

    return symbol;
}
// Test reading a symbol
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenizerException, IonTokenType;
    import std.exception : assertNotThrown;

    // The first token must be a symbol with text `wantSymbol`; the token that
    // follows it must have type `wantNext`.
    void check(string source, string wantSymbol, IonTokenType wantNext) {
        auto tokenizer = tokenizeString(source); 
        assertNotThrown!IonTokenizerException(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenSymbol);
        auto symbol = tokenizer.readSymbol();
        assert(symbol.matchedText == wantSymbol);
        assertNotThrown!IonTokenizerException(tokenizer.nextToken());
        assert(tokenizer.currentToken == wantNext);
    }

    with (IonTokenType) {
        check("hello", "hello", TokenEOF);
        check("a", "a", TokenEOF);
        check("abc", "abc", TokenEOF);
        check("null +inf", "null", TokenFloatInf);
        check("false,", "false", TokenComma);
    }
    // nan should not be a symbol -- we should treat it as its own case
    // check("nan]", "nan", IonTokenType.TokenCloseBracket);
}
/+
Read one chunk of a quoted symbol from our input range.

The symbol is returned piecewise. A call returns either a slice of the raw
input (up to, but not including, the next backslash or the closing quote) or
one decoded escape sequence stored in `t.escapeSequence`. Callers keep calling
until the returned chunk has `isFinal` set.

Params:
    t = The tokenizer
Returns:
    An IonTextQuotedSymbol describing the chunk:
    `matchedText` is the chunk's text, `matchedIndex` the position it started at,
    `isEscapeSequence` is set when the text is a decoded escape, and
    `isFinal` is set when the closing quote has been consumed.
+/
IonTextQuotedSymbol readSymbolQuoted(return ref IonTokenizer t) @nogc @safe pure
{
    IonTextQuotedSymbol val;
    val.isFinal = true;
    size_t read, startIndex = t.position, endIndex = 0;
    loop: while (true) {
        char c = t.expect!"a != 0 && a != '\\n'";
        s: switch (c) {
            case '\'': // found the end 
                break loop;
            case '\\':
                if (read != 0) {
                    // Plain text precedes this escape: hand that text back first
                    // and leave the backslash for the next call.
                    t.unread(c);
                    val.isFinal = false;
                    endIndex = t.position;
                    break loop;
                }

                size_t esc = t.readEscapeSeq();
                if (esc == 0) {
                    // Escaped line terminator: it contributes nothing to the symbol.
                    // Move the slice start past it, otherwise the raw backslash and
                    // newline would be included in the text returned below.
                    startIndex = t.position;
                    continue loop;
                }
                val.matchedText = t.escapeSequence[0 .. esc];
                val.matchedIndex = startIndex;
                val.isEscapeSequence = true;
                val.isFinal = false;
                // If the symbol ends directly after the escape, consume the quote now.
                if (t.peekOne() == '\'') {
                    t.skipOne();
                    val.isFinal = true;
                }
                return val;
            default:
                read++;
                break s;
        }
    }

    // endIndex == 0 means we stopped on the closing quote, which has already
    // been consumed; exclude it from the slice.
    if (endIndex == 0) {
        endIndex = t.position - 1;
    }

    val.matchedText = t.input[startIndex .. endIndex];
    val.matchedIndex = startIndex;
    return val;
}
// Test reading quoted symbols
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // A quoted symbol without escapes is returned as one final chunk.
    void checkWhole(string source, string wantText, char wantNext) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenSymbolQuoted);
        auto chunk = tokenizer.readSymbolQuoted();
        assert(chunk.matchedText == wantText);
        assert(chunk.isFinal);
        assert(!chunk.isEscapeSequence);
        assert(tokenizer.readInput() == wantNext);
    }

    // A quoted symbol with one escape in the middle is returned as three
    // chunks; only the last one is final.
    void checkThreeParts(string source, string first, string second, string third, char wantNext) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenSymbolQuoted);

        const string[3] wanted = [first, second, third];
        foreach (i, want; wanted) {
            // Compare right after each read: an escape chunk points into the
            // tokenizer's escape buffer.
            auto chunk = tokenizer.readSymbolQuoted();
            assert(chunk.matchedText == want);
            if (i + 1 == wanted.length)
                assert(chunk.isFinal);
            else
                assert(!chunk.isFinal);
        }
        assert(tokenizer.readInput() == wantNext);
    }

    checkWhole("'a'", "a", 0);
    checkWhole("'a b c'", "a b c", 0);
    checkWhole("'null' ", "null", ' ');
    checkWhole("'false',", "false", ',');
    checkWhole("'nan']", "nan", ']');

    checkThreeParts("'a\\'b'", "a", "'", "b", 0);
    checkThreeParts(`'a\nb'`, "a", "\n", "b", 0);
    checkThreeParts("'a\\\\b'", "a", "\\", "b", 0);
    checkThreeParts(`'a\x20b'`, "a", " ", "b", 0);
    checkThreeParts(`'a\u2248b'`, "a", "≈", "b", 0);
    checkThreeParts(`'a\U0001F44Db'`, "a", "👍", "b", 0);
}
/+
Read a symbol operator from the input range.

Consumes characters for as long as `isOperatorChar` holds for the next one.
Params:
    t = The tokenizer
Returns:
    An IonTextSymbolOperator whose `matchedText` is the slice of `t.input`
    covering the consumed operator characters (possibly empty) and whose
    `matchedIndex` is the position where reading started.
+/
IonTextSymbolOperator readSymbolOperator(return ref IonTokenizer t) @safe @nogc pure
{
    immutable size_t begin = t.position;

    // Swallow the whole run of operator characters at the cursor.
    while (t.peekOne().isOperatorChar())
        t.skipOne();

    IonTextSymbolOperator op;
    op.matchedIndex = begin;
    op.matchedText = t.input[begin .. t.position];
    return op;
}
/+
Read one chunk of a string (or of a clob body) from the input range.

The content is returned piecewise. A call returns one of:
$(UL
    $(LI a slice of the raw input, ending before the next escape / carriage
         return or at the closing delimiter;)
    $(LI a single decoded escape sequence, stored in `t.escapeSequence`;)
    $(LI for long non-clob strings, a normalized new-line (`\r` or `\r\n`
         turned into `\n`), also stored in `t.escapeSequence`.)
)
Callers keep calling until the returned chunk has `isFinal` set.

Params:
    longString = Is this string a 'long' string, delimited by 3 single-quotes?
                 Otherwise it is delimited by a double quote and must not
                 contain a new-line.
    isClob = Is this the body of a clob? Selects the return type and is
             forwarded to `readEscapeSeq`.
    t = The tokenizer
Returns:
    An `IonTextClob` when `isClob` is set, otherwise an `IonTextString`,
    describing the chunk that was read.
+/
auto readString(bool longString = false, bool isClob = false)(return ref IonTokenizer t) @safe @nogc pure
{
    static if (isClob) {
        IonTextClob val;
    } else {
        IonTextString val;
    } 

    val.isFinal = true;
    static if (longString && !isClob) {
        val.isLongString = true;
    }

    // `read` counts plain characters seen so far in this chunk; `endIndex == 0`
    // means "not decided yet" and is resolved after the loop.
    size_t read = 0, startIndex = t.position, endIndex = 0;
    loop: while (true) {
        char c = t.expect!"a != 0";

        static if (!longString) {
            t.expectFalse!(isNewLine, true)(c);
        }

        /*
        static if (isClob) {
            //t.expectFalse!(isInvalidChar, true)(c);
            t.expect!(isASCIIChar, true)(c);
        } else {
            t.expectFalse!(isInvalidChar, true)(c);
        }
        */

        s: switch (c) {
            static if (!longString) {
                case '"':
                    break loop;
            } else {
                static if (!isClob) {
                    case '\r':
                        if (read != 0) {
                            // Return the text before the carriage return first and
                            // leave the '\r' for the next call.
                            t.unread(c);
                            endIndex = t.position;
                            val.isFinal = false;
                            break loop;
                        }

                        const(char)[] v = t.peekMax(1);
                        if (v.length == 1 && v[0] == '\n') { // see if this is \r\n or just \r
                            t.skipOne();
                        }

                        // NOTE(review): matchedIndex is not assigned on this path.
                        t.resetEscapeBuffer();
                        t.escapeSequence[0] = '\n';
                        val.matchedText = t.escapeSequence[0 .. 1];
                        val.isNormalizedNewLine = true;
                        val.isFinal = false;

                        // do the same check, and see if this string ends *directly* after this newline
                        // again, peekExactly is acceptable here because the long string *MUST* end with
                        // a sequence of 3 quotes, and we should throw if it's not there.
                        if (t.peekExactly(3) == "'''") {
                            // Consume the closing quotes. The call is made outside of
                            // `assert` so that it still happens when asserts are
                            // compiled out (release builds).
                            const skippedQuotes = t.skipExactly(3);
                            assert(skippedQuotes);
                            val.isFinal = true;
                            c = t.skipWhitespace!(true, false);
                            t.unread(c);
                        }
                        return val;
                }
                case '\'':
                    const(char)[] v = t.peekMax(2);
                    if (v.length == 2 && v[0] == '\'' && v[1] == '\'') {
                        // Three quotes in a row: end of this long string segment.
                        val.isFinal = true;
                        endIndex = t.position - 1;
                        t.skipExactly(2);
                        static if (isClob) {
                            c = t.skipWhitespace!(false, true);
                            if (c) {
                                t.unread(c);
                                break loop;
                            } else {
                                break s;
                            }
                        } else {
                            break loop;
                        }
                    } else {
                        // A lone quote is ordinary content.
                        goto default;
                    }
            }
            case '\\':
                if (read != 0) {
                    // Return the text before the escape first and leave the
                    // backslash for the next call.
                    t.unread(c);
                    endIndex = t.position;
                    val.isFinal = false;
                    break loop;
                }

                size_t esc = readEscapeSeq!(isClob)(t);
                static if (isClob) {
                    if (esc == 2) {
                        // XXX: hack
                        // Since we can't have unicode escapes, this HAS to be \x80 - \xFF.
                        // We shouldn't convert this into a UTF codepoint, and we should keep it as is.
                        break s;
                    } 
                }

                assert(esc <= 4); // sanity check that we do not have an escape larger then 4 chars

                val.matchedText = t.escapeSequence[0 .. esc];
                val.matchedIndex = startIndex;
                val.isEscapeSequence = true;
                val.isFinal = false;
                // check if the string ends *directly* after this escape,
                // if so, just consume the quotations, and call it a day
                static if (longString) {
                    // if this is a long string, there should be *at least* 3 extra
                    // characters left (for the ending quotes). this will throw 
                    // if they are not there.
                    if (t.peekExactly(3) == "'''") { 
                        // Consume the closing quotes outside of `assert` so the
                        // skip is not dropped together with the assertion.
                        const skippedQuotes = t.skipExactly(3);
                        assert(skippedQuotes);
                        val.isFinal = true;
                        static if (isClob) {
                            c = t.skipWhitespace!(false, true);
                        } else {
                            c = t.skipWhitespace!(true, false);
                        }
                        t.unread(c);
                    }
                } else {
                    if (t.peekOne() == '"') {
                        // Same as above: the skip must not live inside `assert`.
                        const skippedQuote = t.skipOne();
                        assert(skippedQuote);
                        val.isFinal = true;
                    }
                }
                // NOTE(review): readEscapeSeq returns 0 only for an escaped line
                // terminator, so every non-empty escape is tagged UTF here and
                // Hex is used only for the empty one -- confirm this is intended.
                if (esc >= 1) {
                    val.escapeSequenceType = IonTextEscapeType.UTF;
                } else {
                    val.escapeSequenceType = IonTextEscapeType.Hex;
                }
                return val;
                //break s;
            default:
                read++;
                break s;
        }
    }

    // Stopped on a closing double quote, which is already consumed: exclude it.
    if (endIndex == 0) {
        endIndex = t.position - 1;
    }

    val.matchedText = t.input[startIndex .. endIndex];
    val.matchedIndex = startIndex;
    return val;
}
// Test reading a string
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // A string without escapes: one chunk, then `wantNext` follows in the input.
    void checkWhole(string source, string wantText, char wantNext) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenString);
        auto chunk = tokenizer.readString();
        assert(chunk.matchedText == wantText);
        assert(tokenizer.readInput() == wantNext);
    }

    // A string made of plain text followed by one trailing escape: the text
    // comes back as a non-final chunk, the escape as the final one.
    void checkTextThenEscape(string source, string wantText, string wantEscape, char wantNext) {
        auto tokenizer = tokenizeString(source);

        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenString);

        auto textChunk = tokenizer.readString();
        assert(textChunk.matchedText == wantText);
        assert(!textChunk.isEscapeSequence);
        assert(!textChunk.isFinal);

        auto escapeChunk = tokenizer.readString();
        assert(escapeChunk.matchedText == wantEscape);
        assert(escapeChunk.isEscapeSequence);
        assert(escapeChunk.isFinal);

        assert(tokenizer.readInput() == wantNext);
    }

    checkWhole(`"Hello, world"`, "Hello, world", 0);
    checkTextThenEscape(`"Hello! \U0001F44D"`, "Hello! ", "👍", 0);
    checkWhole(`"0xFOOBAR",`, "0xFOOBAR", ',');
}
/+
Read one chunk of a long string (a string delimited by three single quotes).

This is `readString` with its `longString` flag set and `isClob` left at its
default; see `readString` for how the content is split into chunks.

Params:
    t = The tokenizer
Returns:
    The IonTextString chunk produced by `readString!(true)`.
+/
IonTextString readLongString(return ref IonTokenizer t) @safe @nogc pure
{
    enum bool isLong = true;
    return readString!(isLong)(t);
}
// Test reading a long string
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // A single long string returned as one final chunk.
    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str = t.readLongString();
        t.finished = true;
        assert(str.matchedText == expected);
        assert(t.readInput() == after);
        assert(str.isFinal);
    }

    // Two adjacent long strings, each read as its own token and final chunk.
    void testMultiPart(string ts, string expected1, string expected2, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);

        auto str = t.readLongString();
        t.finished = true;
        assert(str.matchedText == expected1);
        assert(str.isFinal);

        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str2 = t.readLongString();
        assert(str2.matchedText == expected2);
        assert(t.readInput() == after);
        // Check the second chunk here; the first one was already checked above.
        assert(str2.isFinal);
    }

    // Long strings containing a line terminator.
    //   normalized: the terminator comes back as its own "\n" chunk
    //   eofFinal:   that "\n" chunk is directly followed by the closing quotes
    void testNewLine(string ts, string expected1, string expected2, bool normalized, bool eofFinal, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str = t.readLongString();
        assert(str.matchedText == expected1);
        t.finished = true;
        if (normalized) {
            assert(!str.isFinal);
            auto str1 = t.readLongString();
            t.finished = true;
            assert(str1.isNormalizedNewLine);
            assert(str1.matchedText == "\n");
            if (eofFinal) {
                assert(str1.isFinal);
                assert(t.nextToken());
                assert(t.currentToken == IonTokenType.TokenLongString);
            } else {
                assert(!str1.isFinal);
            }
        } else {
            assert(str.isFinal);
            assert(t.nextToken());
            assert(t.currentToken == IonTokenType.TokenLongString);
        }
        auto str1 = t.readLongString();
        assert(str1.matchedText == expected2);
        assert(str1.isFinal);
        assert(t.readInput() == after);
    }

    test(`'''Hello, world'''`, "Hello, world", 0);
    testMultiPart(`'''Hello! ''''''\U0001F44D'''`, "Hello! ", "👍", 0);
    test(`'''0xFOOBAR''',`, "0xFOOBAR", ',');
    test(`'''Hello, 'world'!'''`, "Hello, \'world\'!", 0);
    testMultiPart(`'''Hello,'''''' world!'''`, "Hello,", " world!", 0);
    testMultiPart(`'''Hello,'''     ''' world!'''`, "Hello,", " world!", 0);
    // Test the normalization of new-lines in long strings here.
    testNewLine("'''Hello, \r\n''' '''world!'''", "Hello, ", "world!", true, true, 0); // normalized, crlf precedes end of string
    testNewLine("'''Hello, \r\n world!'''", "Hello, ", " world!", true, false, 0); // normalized, but there is extra text
    testNewLine("'''Hello, \n''' '''world!'''", "Hello, \n", "world!", false, false, 0); // not normalized, no extra text
    testNewLine("'''Hello, \r''' '''world!'''", "Hello, ", "world!", true, true, 0); // normalized, cr precedes end of string
    testNewLine("'''Hello, \r \nworld!'''", "Hello, ", " \nworld!", true, false, 0); // normalized, but there is extra text
}
/+
Read one chunk of a clob and, once the chunk is final, consume the closing `}}`.

$(NOTE As per Ion specification, a clob does not contain Base64 data. Use readBlob if you are expecting to decode Base64 data.)

The body is read with `readString!(longClob, true)`. For long clobs, a final
chunk that is followed (after lob whitespace) by another triple quote is
turned back into a non-final chunk, so the caller continues with the next segment.

Params:
    longClob = Is the clob body written as long strings (three single quotes)?
    t = The tokenizer
Returns:
    The IonTextClob chunk that was read. When it is final, the terminating
    `}}` has been consumed as well.
+/

IonTextClob readClob(bool longClob = false)(return ref IonTokenizer t) @safe @nogc pure
{
    // Always read out bytes, as clobs are octet-based (and not necessarily a string)
    auto clob = readString!(longClob, true)(t);
    static if (longClob) {
        clob.isLongClob = true;
    }

    // A non-final chunk (e.g. one produced by an escape) leaves the terminator alone.
    if (!clob.isFinal) {
        return clob;
    }

    char next = t.skipLobWhitespace();
    static if (longClob) {
        // Another long-string segment follows: this chunk was not the last one.
        if (next == '\'' && t.isTripleQuote()) {
            clob.isFinal = false;
            return clob;
        }
    }

    // After any whitespace the first terminator character must follow ...
    next = t.expect!("a == '}'", true)(next);
    // ... immediately followed by the second one, with no whitespace in between.
    next = t.expect!"a == '}'";

    return clob;
}
// Test reading a short clob
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // The input starts at the clob's opening double quote; after readClob the
    // closing "}}" must be consumed and `wantNext` must follow.
    void check(string source, string wantText, char wantNext) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenString);
        auto clob = tokenizer.readClob();
        assert(clob.matchedText == wantText);
        assert(tokenizer.readInput() == wantNext);
    }

    check(`"Hello, world"}}`, "Hello, world", 0);
    check(`"0xF00BAR"}}, `, "0xF00BAR", ',');
}
/+
Helper to read a long clob from the input stream.

This is `readClob` with its `longClob` flag set; see [readClob] for any notes.
Params:
    t = The tokenizer
Returns:
    The IonTextClob chunk produced by `readClob!(true)`.
+/
IonTextClob readLongClob(return ref IonTokenizer t) @safe @nogc pure
{
    enum bool isLong = true;
    return readClob!(isLong)(t);
}
/+
Read a blob from the input stream, and return its (still encoded) contents.

$(NOTE This function does not verify if the Base64 contained is valid, or if it is even Base64.)

Characters are consumed, skipping lob whitespace, until a `}` is found; a
second `}` must follow directly. `t.finished` is then set to `true`.
Params:
    t = The tokenizer
Returns:
    An IonTextBlob whose `matchedText` is the slice of `t.input` from the
    starting position up to the last character read before the closing `}}`,
    and whose `matchedIndex` is the starting position.
+/
IonTextBlob readBlob(return ref IonTokenizer t) @safe @nogc pure
{
    // Next character after any lob whitespace; end of input is rejected.
    char nextContentChar() @safe @nogc pure {
        return t.expect!("a != 0", true)(t.skipLobWhitespace());
    }

    immutable size_t begin = t.position;
    size_t stop = begin;

    // Extend the slice over every character that is not the terminator.
    for (char c = nextContentChar(); c != '}'; c = nextContentChar()) {
        stop = t.position;
    }

    // The second closing brace must follow the first one directly.
    char closing = t.expect!"a == '}'";
    t.finished = true;

    IonTextBlob blob;
    blob.matchedText = t.input[begin .. stop];
    blob.matchedIndex = begin;
    return blob;
}
/+
Read the text of a number from the input stream.

Accepted shape: an optional sign, a run of digits, an optional `.` with an
optional run of fraction digits, and an optional exponent introduced by
`e`, `E`, `d` or `D` (itself with an optional sign). The number must be
followed by a stop character, which is left unread in the input.

No conversion is performed here; only the matched text is returned.

Params:
    t = The tokenizer
Returns:
    An IonTextNumber whose `matchedText` is the slice of `t.input` covering the
    number and whose `matchedIndex` is the position it started at.
Throws:
    IonTokenizerException with code `invalidLeadingZeros` when the integer part
    starts with `0` but `readDigits` returned more than one character.
    `readDigits` and the `expect` calls used here may throw as well.
+/

IonTextNumber readNumber(return ref IonTokenizer t) @safe @nogc pure
{
    IonTextNumber num;
    size_t startIndex = t.position;

    // Reads the part after the exponent marker: optional sign, then digits.
    void readExponent() @safe @nogc pure {
        char c = t.readInput();
        if (c == '+' || c == '-') {
            c = t.expect!"a != 0";
        }

        readDigits(t, c);
    }

    // Optional leading sign.
    char c = t.readInput();
    if (c == '-' || c == '+') {
        c = t.readInput();
    }

    // Integer part.
    immutable char leader = c;
    const(char)[] digits = readDigits(t, leader);
    if (leader == '0' && digits.length != 1) {
        // if it is not just a plain 0, fail since we don't support leading zeros
        throw IonTokenizerErrorCode.invalidLeadingZeros.ionTokenizerException;
    }

    // Optional fraction; a '.' with no digits after it is accepted.
    c = t.readInput();
    if (c == '.') {
        c = t.readInput();
        if (c.isDigit) {
            immutable char decimalLeader = t.expect!("a != 0", true)(c);
            readDigits(t, decimalLeader);
            c = t.readInput();
        }
    }

    // Optional exponent.
    switch (c) {
        case 'e':
        case 'E':
        case 'd':
        case 'D':
            readExponent();
            break;
        default:
            // this is not a character we want, so unread it (for now)
            t.unread(c); 
            break;
    }

    // The number has to be followed by a stop character, which stays in the input.
    c = t.expect!(t.isStopChar);
    t.unread(c);
    num.matchedText = t.input[startIndex .. t.position];
    num.matchedIndex = startIndex;

    return num; 
}
// Test reading numbers
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;
    import mir.ion.type_code : IonTypeCode;

    // The first token must be a number whose text is `wantText`; `wantNext`
    // must be the next character left in the input afterwards.
    void check(string source, string wantText, char wantNext) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenNumber);
        auto number = tokenizer.readNumber();
        assert(number.matchedText == wantText);
        assert(tokenizer.readInput() == wantNext);
    }

    // Inputs that are matched in full, up to end of input.
    foreach (literal; [
        "12341",
        "-12312",
        "0.420d2",
        "1.1999999999999999555910790149937383830547332763671875e0",
    ]) {
        check(literal, literal, 0);
    }

    // A number followed by a stop character.
    check("1.1999999999999999e0, ", "1.1999999999999999e0", ',');
}
/+
Read a run of decimal digits from the input stream, given the first digit of the run.

The leader must already have been read from the input; it is pushed back
so that the whole run (leader included) is consumed by `readRadixDigits`.
Params:
    t = The tokenizer
    leader = The first digit of the run, as already read from the input
Returns:
    The slice of the tokenizer input produced by `readRadixDigits` (it starts at the leader).
Throws:
    The `expectedValidLeader` tokenizer exception if `leader` is not a decimal digit.
+/
const(char)[] readDigits(return ref IonTokenizer t, char leader) @safe @nogc pure
{
    if (isDigit(leader)) {
        t.unread(leader); // unread so that readRadixDigits can consume it
        return readRadixDigits(t);
    }
    throw IonTokenizerErrorCode.expectedValidLeader.ionTokenizerException;
}
/+
Read as many digits from the input stream as possible, given a validator.

Reading stops at the first character that is neither an underscore nor accepted
by the validator; that character is pushed back onto the input.
An underscore is skipped, provided the character right after it is accepted by the validator.
Params:
    isValid = The validation function deciding whether a character is a digit.
    t = The tokenizer
Returns:
    The slice of the tokenizer input that was consumed (underscores included).
+/
const(char)[] readRadixDigits(alias isValid = isDigit)(return ref IonTokenizer t) 
{
    import mir.functional : naryFun;
    size_t begin = t.position;
    for (;;) {
        char c = t.readInput();
        if (c != '_') {
            if (naryFun!isValid(c)) {
                continue; // still inside the digit run
            }
            // first character that is not part of the run: put it back and stop
            t.unread(c);
            return t.input[begin .. t.position];
        }
        // a separator has to be directly followed by a valid digit
        t.expect!(isValid, true)(t.peekOne());
    }
}
/+
Read a radix number, given two validation functions for its marker and the validity of each digit read.

The expected shape is an optional '-', a '0', the radix marker, and then at least one digit.
The character directly after the marker must be a valid digit, so inputs such as
`0x`, `0x_1` or `0b,` are rejected through `t.expect`.
Params:
    isMarker = A validation function to check if the marker is valid (0b/0x/etc)
    isValid = A validation function to check if every digit found is valid (0-1/0-9A-F/etc)
    t = The tokenizer
Returns:
    A string containing the full radix number (including the sign, the leading 0 and the marker).
+/
const(char)[] readRadix(alias isMarker, alias isValid)(return ref IonTokenizer t) @safe @nogc pure
{
    size_t startIndex = t.position;
    char c = t.readInput();
    if (c == '-') {
        c = t.readInput();
    }

    // 0
    t.expect!("a == '0'", true)(c);
    // 0(b || x)
    t.expect!isMarker;
    // The marker has to be followed directly by a digit:
    // it cannot be 0x_ or 0b_, and it cannot be a marker without any digits.
    immutable char firstDigit = t.peekOne();
    t.expect!("a != '_'", true)(firstDigit);
    t.expect!(isValid, true)(firstDigit);
    readRadixDigits!(isValid)(t);
    // Anything other than end-of-input has to be a stop character, which is left in the input.
    c = t.readInput();
    if (c) {
        c = t.expect!(t.isStopChar, true)(c);
        t.unread(c);
    }

    return t.input[startIndex .. t.position];
}
/+
Read a binary number (marked by '0b' or '0B') from the input stream.

This forwards to `readRadix`, accepting 'b'/'B' as the marker and '0'/'1' as digits.
Params:
    t = The tokenizer
Returns:
    A string containing the entire binary number read.
+/
const(char)[] readBinary(return ref IonTokenizer t) @safe @nogc pure
{
    return t.readRadix!(
        "a == 'b' || a == 'B'", // marker
        "a == '0' || a == '1'"  // digits
    )();
}
// Test reading a binary number
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // Tokenize `source`, read one binary number, and verify the text returned
    // as well as the character left in the input right after it.
    void check(string source, string matched, char next) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenBinary);
        assert(tokenizer.readBinary() == matched);
        assert(tokenizer.readInput() == next);
    }

    // terminated by end-of-input
    check("0b101011010", "0b101011010", '\0');
    // terminated by whitespace / a comma
    check("0b100000101000001010000010100000101000001 ", "0b100000101000001010000010100000101000001", ' ');
    check("0b11011110101011011011111011101111,", "0b11011110101011011011111011101111", ',');
    // leading whitespace is not part of the value
    check("      0b11011110101011011011111011101111,", "0b11011110101011011011111011101111", ',');
}
/+
Read a hex number (marked by '0x' or '0X') from the input stream.

This forwards to `readRadix`, accepting 'x'/'X' as the marker and `isHexDigit` as the digit check.
Params:
    t = The tokenizer
Returns:
    A string containing the entire hex number read.
+/
const(char)[] readHex(return ref IonTokenizer t) @safe @nogc pure
{
    return t.readRadix!(
        "a == 'x' || a == 'X'", // marker
        isHexDigit              // digits
    )();
}
// Test reading a hex number
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // Tokenize `source`, read one hex number, and verify the text returned
    // as well as the character left in the input right after it.
    void check(string source, string matched, char next) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenHex);
        assert(tokenizer.readHex() == matched);
        assert(tokenizer.readInput() == next);
    }

    // Same as `check`, but reads a second hex number directly after the separator.
    void checkPair(string source, string first, char separator, string second) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenHex);
        assert(tokenizer.readHex() == first);
        assert(tokenizer.readInput() == separator);
        assert(tokenizer.readHex() == second);
    }

    // terminated by end-of-input
    check("0xBADBABE", "0xBADBABE", '\0');
    check("0x414141", "0x414141", '\0');
    check("0x0", "0x0", '\0');
    // leading whitespace is not part of the value
    check("     0x414141", "0x414141", '\0');
    check("     0x414141,", "0x414141", ',');
    // two values separated by a comma
    checkPair("     0x414141,0x414142", "0x414141", ',', "0x414142");
}
/+
Read an ISO-8601 extended timestamp from the input stream.

$(NOTE This function only does rudimentary checks on the shape of the timestamp
(digit counts and separators), and it does nothing more than that.)

Accepted shapes (as read by the code below): `yyyyT`, `yyyy-mmT`, `yyyy-mm-dd`,
`yyyy-mm-ddT`, optionally followed by an offset or by a time of day
(`hh:mm`, `hh:mm:ss`, `hh:mm:ss.fff`) with an offset or `Z`; a bare time of day is accepted too.

Params:
    t = The tokenizer
Returns:
    A struct holding the text of the entire timestamp read from the input stream
    (`matchedText`) and the index at which it starts (`matchedIndex`).
+/
IonTextTimestamp readTimestamp(return ref IonTokenizer t) @safe @nogc pure 
{
    IonTextTimestamp val;
    size_t startIndex = t.position;

    // Expect `nums` digits, then return the character that follows them.
    char readTSDigits(int nums) @safe @nogc pure {
        for (int i = 0; i < nums; i++) {
            t.expect!isDigit;
        }
        return t.readInput();
    }

    // If `c` is a sign, read an `hh:mm` offset and return the character after it;
    // otherwise hand `c` back untouched.
    char readTSOffset(char c) @safe @nogc pure {
        if (c != '-' && c != '+') {
            return c; 
        }
        t.expect!("a == ':'", true)(readTSDigits(2));
        return readTSDigits(2);
    }

    // `c` has to start an offset (+/-) or be a Z/z; returns the character after it.
    char readTSOffsetOrZ(char c) @safe @nogc pure {
        t.expect!("a == '-' || a == '+' || a == 'z' || a == 'Z'", true)(c);
        if (c == '-' || c == '+') {
            return readTSOffset(c);
        }
        if (c == 'z' || c == 'Z') {
            return t.readInput();
        }
        assert(0);
    }

    // Record everything consumed so far as the matched timestamp.
    // Both the text and the start index are always set together.
    IonTextTimestamp readTSMatched() @safe @nogc pure {
        val.matchedIndex = startIndex;
        val.matchedText = t.input[startIndex .. t.position];
        return val;
    }

    // `c` has to be a stop character; it is left in the input and the match is recorded.
    IonTextTimestamp readTSFinish(char c) @safe @nogc pure {
        t.expect!(t.isStopChar, true)(c);
        t.unread(c);
        return readTSMatched();
    }

    // could be either:
    // yyyy(T || -)
    // or hh
    char c = readTSDigits(2);
    // is this a year? if so, then the character after
    // the first two digits should be a digit... if not,
    // we can just assume that it's a "time of day"
    if (c.isDigit())
    {
        c = t.expect!("a == 'T' || a == '-'", true)(readTSDigits(1));
        if (c == 'T') {
            // yyyyT
            return readTSMatched();
        }
        // yyyy-mm(T || -)
        c = t.expect!("a == 'T' || a == '-'", true)(readTSDigits(2));
        if (c == 'T') {
            // yyyy-mmT
            return readTSMatched();
        }
        // yyyy-mm-dd(T)?
        c = readTSDigits(2);
        if (c != 'T') {
            return readTSFinish(c);
        }
        // yyyy-mm-ddT 
        c = t.readInput();
        if (!c.isDigit()) {
            // yyyy-mm-ddT(+ || -)hh:mm
            c = readTSOffset(c);
            return readTSFinish(c);
        }
        // (yyyy-mm-ddT)?hh
        c = t.expect!("a == ':'", true)(readTSDigits(1));
    }
    else
    {
        // hh
        c = t.expect!("a == ':'", true)(c);
    }

    // (yyyy-mm-ddT)?hh:mm
    c = readTSDigits(2);
    if (c != ':') {
        // (yyyy-mm-ddT)?hh:mm(+-|Z)?
        if (c) {
            c = readTSOffsetOrZ(c);
        }
        return readTSFinish(c);
    }

    // (yyyy-mm-ddT)?hh:mm:ss
    c = readTSDigits(2);

    if (c != '.') {
        // (yyyy-mm-ddT)?hh:mm:ss(Z)?
        if (c) {
            c = readTSOffsetOrZ(c);
        }
        return readTSFinish(c);
    }

    // (yyyy-mm-ddT)?hh:mm:ss.ssssZ
    // The '.' has to be followed by at least one digit; requiring it here also
    // keeps a non-digit character from being consumed and silently dropped.
    c = t.expect!isDigit;
    readDigits(t, c);

    c = t.readInput();
    if (c) {
        c = readTSOffsetOrZ(c);
    } 

    return readTSFinish(c);
}
// Test reading timestamps
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // Tokenize `source`, read one timestamp, and verify the matched text
    // as well as the character left in the input right after it.
    void check(string source, string matched, char next) {
        auto tokenizer = tokenizeString(source);
        assert(tokenizer.nextToken());
        assert(tokenizer.currentToken == IonTokenType.TokenTimestamp);
        assert(tokenizer.readTimestamp().matchedText == matched);
        assert(tokenizer.readInput() == next);
    }

    // date-only precisions
    check("2001T", "2001T", '\0');
    check("2001-01T,", "2001-01T", ',');
    check("2001-01-02}", "2001-01-02", '}');
    check("2001-01-02T ", "2001-01-02T", ' ');
    // date followed directly by an offset
    check("2001-01-02T+00:00\t", "2001-01-02T+00:00", '\t');
    check("2001-01-02T-00:00\n", "2001-01-02T-00:00", '\n');
    // date and time, minute precision
    check("2001-01-02T03:04+00:00 ", "2001-01-02T03:04+00:00", ' ');
    check("2001-01-02T03:04-00:00 ", "2001-01-02T03:04-00:00", ' ');
    check("2001-01-02T03:04Z ", "2001-01-02T03:04Z", ' ');
    check("2001-01-02T03:04z ", "2001-01-02T03:04z", ' ');
    // date and time, second and fractional-second precision
    check("2001-01-02T03:04:05Z ", "2001-01-02T03:04:05Z", ' ');
    check("2001-01-02T03:04:05+00:00 ", "2001-01-02T03:04:05+00:00", ' ');
    check("2001-01-02T03:04:05.666Z ", "2001-01-02T03:04:05.666Z", ' ');
    check("2001-01-02T03:04:05.666666z ", "2001-01-02T03:04:05.666666z", ' ');

    // Test new "time of day" timestamps
    check("03:04+00:00", "03:04+00:00", '\0');
    check("03:04-00:00", "03:04-00:00", '\0');
    check("03:04Z", "03:04Z", '\0');
    check("03:04z", "03:04z", '\0');
    check("03:04:05Z", "03:04:05Z", '\0');
    check("03:04:05+00:00", "03:04:05+00:00", '\0');
    check("03:04:05.666Z", "03:04:05.666Z", '\0');
    check("03:04:05.666z", "03:04:05.666z", '\0');
    check("03:04:05.666666Z", "03:04:05.666666Z", '\0');
    check("03:04:05.666666z", "03:04:05.666666z", '\0');
}