/+
    Helpers for reading values from a given Ion token.

    Authors: Harrison Ford
+/
module mir.deser.text.readers;

import mir.deser.text.skippers;
import mir.deser.text.tokenizer;
import mir.deser.text.tokens;

// True when `c` is at most 0x10FFFF and lies outside the surrogate
// range 0xD800 .. 0xDFFF.
private bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    immutable bool belowSurrogates = c < 0xD800;
    immutable bool aboveSurrogates = c > 0xDFFF && c <= 0x10FFFF;
    return belowSurrogates || aboveSurrogates;
}

/+
    Read the contents of a given token from the input range.

    $(WARNING This function does not check that the current token
    is of the token type that you pass in. Use with caution.)
    Params:
        t = The tokenizer
        token = The token type to read from the input range.
    Returns:
        Whatever the matching `read*` helper returns for that token type.
        The tokenizer is marked as `finished` before returning.
+/
auto readValue(IonTokenType token)(return ref IonTokenizer t) @nogc @safe pure
{
    import std.traits : EnumMembers;
    import std.string : chompPrefix;
    static foreach(i, member; EnumMembers!IonTokenType) {{
        // Only members ordered before TokenComma are dispatched, minus the
        // invalid/EOF markers and the three float keywords.
        enum bool dispatchable = member < IonTokenType.TokenComma
            && member != IonTokenType.TokenInvalid
            && member != IonTokenType.TokenEOF
            && member != IonTokenType.TokenFloatInf
            && member != IonTokenType.TokenFloatMinusInf
            && member != IonTokenType.TokenFloatNaN;
        static if (dispatchable && token == member)
        {
            static if (member == IonTokenType.TokenDot) {
                // A dot is read through the operator reader.
                auto val = t.readSymbolOperator();
            }
            else {
                // TokenFoo is read by t.readFoo(): strip the "Token" prefix
                // from the enum member's name to build the call.
                enum name = __traits(identifier, EnumMembers!IonTokenType[i]);
                auto val = mixin("t.read" ~ name.chompPrefix("Token") ~ "()");
            }
            t.finished = true;

            return val;
        }
    }}
    // Reached only for token types that have no reader above.
    assert(0);
}
///
version(mir_ion_parser_test) unittest {
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    void testVal(IonTokenType token)(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == token);
        auto v = readValue!(token)(t);
        assert(v.matchedText == expected);
        assert(t.readInput() == after);
    }
    with (IonTokenType) {
        testVal!(TokenNumber)("123123", "123123", 0);
    }
}

/+
    Read a UTF-32 code-point from the input range (for clobs).

    Same as `readEscapedChar`, instantiated with `isClob = true`.
    Params:
        t = The tokenizer
    Returns:
        a UTF-32 code-point
+/
dchar readEscapedClobChar(return ref IonTokenizer t) @nogc @safe pure {
    return t.readEscapedChar!(true)();
}

/+
    Read the character named by an escape, starting at the character that
    follows the backslash in our input range.

    For simplicity's sake, this will return the largest type possible (a UTF-32 code-point).
    Params:
        isClob = When true, the 'U' and 'u' escapes are rejected.
        t = The tokenizer
    Returns:
        a code-point representing the escape value that was read
    Throws:
        IonTokenizerException if an invalid escape value was found.
+/
dchar readEscapedChar(bool isClob = false)(return ref IonTokenizer t) @nogc @safe pure
{
    // Reads exactly `length` hex digits and folds them, most significant
    // digit first, into a single code point.
    dchar readHexEscapeLiteral(int length)() @nogc @safe pure {
        dchar codePoint = 0;
        foreach (i; 0 .. length) {
            const(char) digit = t.expect!isHexDigit;
            const(char) nibble = hexLiteral(digit);
            codePoint = (codePoint << 4) | nibble;
        }
        return codePoint;
    }

    static if (isClob) {
        char c = t.expect!"a != 'U' && a != 'u'"; // cannot have unicode escapes within clobs
    } else {
        char c = t.readInput();
    }

    switch (c) {
        case '0':
            // TODO: will this cause an error and make our code confused?
            // \0 should not normally exist (except in its escaped form) -- determine if this is expected behavior
            return '\0';
        // Single-letter escapes that map onto the D escape of the same name.
        static foreach(member; ['a', 'b', 't', 'n', 'f', 'r', 'v']) {
            case member:
                return mixin("'\\" ~ member ~ "'");
        }
        // Escapes that stand for the character itself.
        static foreach(member; ['?', '/', '\'', '"', '\\']) {
            case member:
                return member;
        }
        case 'U':
            return readHexEscapeLiteral!8;
        case 'u':
            return readHexEscapeLiteral!4;
        case 'x':
            return readHexEscapeLiteral!2;
        default:
            throw IonTokenizerErrorCode.invalidHexEscape.ionTokenizerException;
    }
}
// Test reading a unicode escape
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenizerException;

    void test(string ts, dchar expected) {
        auto t = tokenizeString(ts);
        assert(t.readEscapedChar() == expected);
    }

    void testFail(string ts) {
        import std.exception : assertThrown;
        auto t = tokenizeString(ts);
        assertThrown!IonTokenizerException(t.readEscapedChar());
    }

    test("U0001F44D", '\U0001F44D');
    test("u2248", '\u2248');
    test("x20", '\x20');
    test("a", '\a');
    test("b", '\b');
    test("?", '?');
    test("\"", '"');
    test("0", '\0');

    testFail("c0101");
    testFail("d21231");
    testFail("!");
}

/+
    Read an escape sequence and store it, encoded as UTF-8, in the tokenizer's
    `escapeSequence` buffer.
    Params:
        isClob = Forwarded to `readEscapedChar`.
        t = The tokenizer
    Returns:
        The number of UTF-8 code units written to `t.escapeSequence` (1 to 4),
        or 0 if the escape was an escaped new-line, which produces no character.
+/
size_t readEscapeSeq(bool isClob = false)(return ref IonTokenizer t) @nogc @safe pure
{
    // An escaped line break (\r\n, \r or \n after the backslash) is consumed
    // and yields no character at all.
    switch (t.peekOne()) {
        case '\r':
            if (t.peekMax(2) == "\r\n") {
                t.skipExactly(2);
            } else {
                t.skipOne();
            }
            return 0;
        case '\n':
            t.skipOne();
            return 0;
        default:
            break;
    }

    // toUTF8 cannot take a single UTF-32 code-point, so the encoding below
    // is the logic of std.utf.encode written out by hand.
    const(dchar) c = readEscapedChar!(isClob)(t);
    // Zero out the escape sequence (since we re-use this buffer)
    t.resetEscapeBuffer();

    if (c > 0x10FFFF)
    {
        // beyond the last code point
        assert(!isValidDchar(c));
        throw IonTokenizerErrorCode.encodingInvalidCode.ionTokenizerException;
    }
    else if (c <= 0x7F)
    {
        // one byte: 0xxxxxxx
        assert(isValidDchar(c));
        t.escapeSequence[0] = cast(char) c;
        return 1;
    }
    else if (c <= 0x7FF)
    {
        // two bytes: 110xxxxx 10xxxxxx
        assert(isValidDchar(c));
        t.escapeSequence[0] = cast(char)(0xC0 | (c >> 6));
        t.escapeSequence[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }
    else if (c <= 0xFFFF)
    {
        // surrogate halves cannot be encoded on their own
        if (0xD800 <= c && c <= 0xDFFF)
            throw IonTokenizerErrorCode.encodingSurrogateCode.ionTokenizerException;

        // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        assert(isValidDchar(c));
        t.escapeSequence[0] = cast(char)(0xE0 | (c >> 12));
        t.escapeSequence[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        t.escapeSequence[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }
    else
    {
        // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        assert(isValidDchar(c));
        t.escapeSequence[0] = cast(char)(0xF0 | (c >> 18));
        t.escapeSequence[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        t.escapeSequence[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        t.escapeSequence[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }
}

/+
    Read a non-quoted symbol from our input range.
    Params:
        t = The tokenizer
    Returns:
        The un-quoted symbol found at the tokenizer's current position
        (its text, and the index in the input where it starts).
+/
IonTextSymbol readSymbol(return ref IonTokenizer t) @safe pure @nogc
{
    IonTextSymbol val;
    const(char)[] window = t.window;

    // Nothing left to look at: hand back the default-initialised symbol.
    if (window.length == 0) return val;

    // Count how many leading characters of the window belong to an identifier.
    size_t len = 0;
    while (len < window.length && window[len].isIdentifierPart()) {
        len++;
    }

    immutable size_t endPos = t.position + len;
    if (len > t.window.length || endPos > t.input.length) {
        assert(0); // should never happen
    }

    val.matchedIndex = t.position;
    val.matchedText = t.input[t.position .. endPos];
    // Consume exactly the characters that were matched.
    t.skipExactly(len);

    return val;
}
// Test reading a symbol
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenizerException, IonTokenType;

    void test(string ts, string expected, IonTokenType after) {
        import std.exception : assertNotThrown;
        auto t = tokenizeString(ts);
        assertNotThrown!IonTokenizerException(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenSymbol);
        assert(t.readSymbol().matchedText == expected);
        assertNotThrown!IonTokenizerException(t.nextToken());
        assert(t.currentToken == after);
    }

    test("hello", "hello", IonTokenType.TokenEOF);
    test("a", "a", IonTokenType.TokenEOF);
    test("abc", "abc", IonTokenType.TokenEOF);
    test("null +inf", "null", IonTokenType.TokenFloatInf);
    test("false,", "false", IonTokenType.TokenComma);
    // nan should not be a symbol -- we should treat it as its own case
    // test("nan]", "nan", IonTokenType.TokenCloseBracket);
}

/+
    Read a quoted symbol from our input range,
    and automatically decode any escape sequences found.

    Params:
        t = The tokenizer
    Returns:
        A string containing the quoted symbol.
+/
// Review notes on the function below:
//  - The symbol is handed back in pieces. A call returns either a run of plain
//    text (a slice of the input) or a single decoded escape (a slice of
//    t.escapeSequence, with isEscapeSequence set). `isFinal` is true only on
//    the piece after which the closing quote has been consumed.
//  - An escaped line break decodes to nothing; the text piece that follows it
//    starts after the line break, so the raw escape never leaks into matchedText.
IonTextQuotedSymbol readSymbolQuoted(return ref IonTokenizer t) @nogc @safe pure
{
    IonTextQuotedSymbol val;
    val.isFinal = true;
    size_t read, startIndex = t.position, endIndex = 0;
    loop: while (true) {
        // neither end-of-input nor a raw new-line may appear inside the quotes
        char c = t.expect!"a != 0 && a != '\\n'";
        s: switch (c) {
            case '\'': // found the end
                break loop;
            case '\\':
                if (read != 0) {
                    // Plain text was collected before this escape: put the
                    // backslash back and return the text first; the escape is
                    // decoded by the next call.
                    t.unread(c);
                    val.isFinal = false;
                    endIndex = t.position;
                    break loop;
                }

                size_t esc = t.readEscapeSeq();
                if (esc == 0) {
                    // Escaped line break: it contributes no character.
                    // Restart the text run *after* it, otherwise the slice
                    // taken below would still contain the backslash and the
                    // line break verbatim.
                    startIndex = t.position;
                    continue;
                }
                val.matchedText = t.escapeSequence[0 .. esc];
                val.matchedIndex = startIndex;
                val.isEscapeSequence = true;
                val.isFinal = false;
                // the symbol may end directly after the escape
                if (t.peekOne() == '\'') {
                    t.skipOne();
                    val.isFinal = true;
                }
                return val;
            default:
                read++;
                break s;
        }
    }

    // endIndex == 0 means we stopped on the closing quote, which has already
    // been consumed -- the text ends one character before the current position.
    if (endIndex == 0) {
        endIndex = t.position - 1;
    }

    val.matchedText = t.input[startIndex .. endIndex];
    val.matchedIndex = startIndex;
    return val;
}
// Test reading quoted symbols
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenSymbolQuoted);
        auto val = t.readSymbolQuoted();
        assert(val.matchedText == expected);
        assert(val.isFinal);
        assert(!val.isEscapeSequence);
        assert(t.readInput() == after);
    }

    void testMultipart(string ts, string expected1, string expected2, string expected3, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenSymbolQuoted);

        auto val = t.readSymbolQuoted();
        assert(val.matchedText == expected1);
        assert(!val.isFinal);

        auto val2 = t.readSymbolQuoted();
        assert(val2.matchedText == expected2);
        assert(!val2.isFinal);

        auto val3 = t.readSymbolQuoted();
        assert(val3.matchedText == expected3);
        assert(val3.isFinal);
        assert(t.readInput() == after);
    }

    test("'a'", "a", 0);
    test("'a b c'", "a b c", 0);
    test("'null' ", "null", ' ');
    test("'false',", "false", ',');
    test("'nan']", "nan", ']');

    testMultipart("'a\\'b'", "a", "'", "b", 0);
    testMultipart(`'a\nb'`, "a", "\n", "b", 0);
    testMultipart("'a\\\\b'", "a", "\\", "b", 0);
    testMultipart(`'a\x20b'`, "a", " ", "b", 0);
    testMultipart(`'a\u2248b'`, "a", "≈", "b", 0);
    testMultipart(`'a\U0001F44Db'`, "a", "👍", "b", 0);
}

/+
    Read a symbol operator from the input range.

    Consumes characters for as long as the next one is an operator character.
    Params:
        t = The tokenizer
    Returns:
        The run of operator characters that was consumed (empty if the next
        character is not an operator character) and the index where it starts.
+/
IonTextSymbolOperator readSymbolOperator(return ref IonTokenizer t) @safe @nogc pure
{
    IonTextSymbolOperator val;
    size_t startIndex = t.position;
    val.matchedIndex = startIndex;
    char c = t.peekOne();
    while (c.isOperatorChar()) {
        t.skipOne();
        c = t.peekOne();
    }

    val.matchedText = t.input[startIndex .. t.position];
    return val;
}

/+
    Read a string from the input range and automatically decode any UTF escapes.
    Params:
        longString = Is this string a 'long' string, defined by 3 single-quotes?
        isClob = Is this string allowed to have UTF escapes?
        t = The tokenizer
    Returns:
        The string's content from the input range.
+/
// Review notes on the function below:
//  - The string is handed back in pieces. A call returns either a run of plain
//    text (a slice of the input), a single decoded escape (a slice of
//    t.escapeSequence, isEscapeSequence set) or -- for long strings that are
//    not clobs -- a normalised new-line (isNormalizedNewLine set). `isFinal`
//    is true only on the piece after which the closing quote(s) were consumed.
//  - Calls that must consume input are never placed inside `assert`: assert
//    expressions are not evaluated in -release builds, which would leave the
//    closing quotes unread.
auto readString(bool longString = false, bool isClob = false)(return ref IonTokenizer t) @safe @nogc pure
{
    static if (isClob) {
        IonTextClob val;
    } else {
        IonTextString val;
    }

    val.isFinal = true;
    static if (longString && !isClob) {
        val.isLongString = true;
    }

    size_t read = 0, startIndex = t.position, endIndex = 0;
    loop: while (true) {
        char c = t.expect!"a != 0";

        // short strings may not contain a raw new-line
        static if (!longString) {
            t.expectFalse!(isNewLine, true)(c);
        }

        /*
        static if (isClob) {
            //t.expectFalse!(isInvalidChar, true)(c);
            t.expect!(isASCIIChar, true)(c);
        } else {
            t.expectFalse!(isInvalidChar, true)(c);
        }
        */

        s: switch (c) {
            static if (!longString) {
                case '"':
                    break loop;
            } else {
                static if (!isClob) {
                    case '\r':
                        if (read != 0) {
                            // return the text collected so far; the new-line
                            // is handled by the next call
                            t.unread(c);
                            endIndex = t.position;
                            val.isFinal = false;
                            break loop;
                        }

                        const(char)[] v = t.peekMax(1);
                        if (v.length == 1 && v[0] == '\n') { // see if this is \r\n or just \r
                            t.skipOne();
                        }

                        // \r and \r\n are both reported as a single '\n'
                        t.resetEscapeBuffer();
                        t.escapeSequence[0] = '\n';
                        val.matchedText = t.escapeSequence[0 .. 1];
                        // report where this piece started, like every other return path
                        val.matchedIndex = startIndex;
                        val.isNormalizedNewLine = true;
                        val.isFinal = false;

                        // do the same check, and see if this string ends *directly* after this newline
                        // again, peekExactly is acceptable here because the long string *MUST* end with
                        // a sequence of 3 quotes, and we should throw if it's not there.
                        if (t.peekExactly(3) == "'''") {
                            // consume the closing quotes, and skip whitespace
                            auto consumed = t.skipExactly(3);
                            assert(consumed);
                            val.isFinal = true;
                            c = t.skipWhitespace!(true, false);
                            t.unread(c);
                        }
                        return val;
                }
                case '\'':
                    const(char)[] v = t.peekMax(2);
                    if (v.length == 2 && v[0] == '\'' && v[1] == '\'') {
                        // ''' -- the end of this long string
                        val.isFinal = true;
                        endIndex = t.position - 1;
                        t.skipExactly(2);
                        static if (isClob) {
                            c = t.skipWhitespace!(false, true);
                            if (c) {
                                t.unread(c);
                                break loop;
                            } else {
                                break s;
                            }
                        } else {
                            break loop;
                        }
                    } else {
                        // a lone quote is ordinary content
                        goto default;
                    }
            }
            case '\\':
                if (read != 0) {
                    // return the text collected so far; the escape is decoded
                    // by the next call
                    t.unread(c);
                    endIndex = t.position;
                    val.isFinal = false;
                    break loop;
                }

                size_t esc = readEscapeSeq!(isClob)(t);
                static if (isClob) {
                    if (esc == 2) {
                        // XXX: hack
                        // Since we can't have unicode escapes, this HAS to be \x80 - \xFF.
                        // We shouldn't convert this into a UTF codepoint, and we should keep it as is.
                        break s;
                    }
                }

                assert(esc <= 4); // sanity check that we do not have an escape larger than 4 chars

                val.matchedText = t.escapeSequence[0 .. esc];
                val.matchedIndex = startIndex;
                val.isEscapeSequence = true;
                val.isFinal = false;
                // check if the string ends *directly* after this escape,
                // if so, just consume the quotations, and call it a day
                static if (longString) {
                    // if this is a long string, there should be *at least* 3 extra
                    // characters left (for the ending quotes). this will throw
                    // if they are not there.
                    if (t.peekExactly(3) == "'''") {
                        // consume the closing quotes, and skip whitespace
                        auto consumed = t.skipExactly(3);
                        assert(consumed);
                        val.isFinal = true;
                        static if (isClob) {
                            c = t.skipWhitespace!(false, true);
                        } else {
                            c = t.skipWhitespace!(true, false);
                        }
                        t.unread(c);
                    }
                } else {
                    if (t.peekOne() == '"') {
                        // consume the closing quote
                        auto consumed = t.skipOne();
                        assert(consumed);
                        val.isFinal = true;
                    }
                }
                // NOTE(review): esc == 0 is what readEscapeSeq returns for an escaped
                // line break; labelling that case "Hex" looks odd -- confirm intent.
                if (esc >= 1) {
                    val.escapeSequenceType = IonTextEscapeType.UTF;
                } else {
                    val.escapeSequenceType = IonTextEscapeType.Hex;
                }
                return val;
                //break s;
            default:
                read++;
                break s;
        }
    }

    // endIndex == 0 means we stopped on a closing quote that was already
    // consumed -- the text ends one character before the current position.
    if (endIndex == 0) {
        endIndex = t.position - 1;
    }

    val.matchedText = t.input[startIndex .. endIndex];
    val.matchedIndex = startIndex;
    return val;
}
// Test reading a string
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenString);
        auto str = t.readString();
        assert(str.matchedText == expected);
        assert(t.readInput() == after);
    }

    void testMultiPart(string ts, string expected, string after, char last) {
        auto t = tokenizeString(ts);

        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenString);
        auto str = t.readString();
        assert(str.matchedText == expected);
        assert(!str.isEscapeSequence);
        assert(!str.isFinal);

        auto str2 = t.readString();
        assert(str2.matchedText == after);
        assert(str2.isEscapeSequence);
        assert(str2.isFinal);
        assert(t.readInput() == last);
    }

    test(`"Hello, world"`, "Hello, world", 0);
    testMultiPart(`"Hello! \U0001F44D"`, "Hello! ", "👍", 0);
    test(`"0xFOOBAR",`, "0xFOOBAR", ',');
}

/+
    Read a long string (defined by having three single quotes surrounding its contents).

    This is `readString` instantiated with `longString = true`; like that
    function, it returns the string one piece at a time.

    Params:
        t = The tokenizer
    Returns:
        The next piece of the long string.
+/
IonTextString readLongString(return ref IonTokenizer t) @safe @nogc pure
{
    return readString!(true)(t);
}
// Test reading a long string
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str = t.readLongString();
        t.finished = true;
        assert(str.matchedText == expected);
        assert(t.readInput() == after);
        assert(str.isFinal);
    }

    void testMultiPart(string ts, string expected1, string expected2, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);

        auto str = t.readLongString();
        t.finished = true;
        assert(str.matchedText == expected1);
        assert(str.isFinal);

        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str2 = t.readLongString();
        assert(str2.matchedText == expected2);
        assert(t.readInput() == after);
        assert(str.isFinal);
    }

    void testNewLine(string ts, string expected1, string expected2, bool normalized, bool eofFinal, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenLongString);
        auto str = t.readLongString();
        assert(str.matchedText == expected1);
        t.finished = true;
        if (normalized) {
            assert(!str.isFinal);
            auto str1 = t.readLongString();
            t.finished = true;
            assert(str1.isNormalizedNewLine);
            assert(str1.matchedText == "\n");
            if (eofFinal) {
                assert(str1.isFinal);
                assert(t.nextToken());
                assert(t.currentToken == IonTokenType.TokenLongString);
            } else {
                assert(!str1.isFinal);
            }
        } else {
            assert(str.isFinal);
            assert(t.nextToken());
            assert(t.currentToken == IonTokenType.TokenLongString);
        }
        auto str1 = t.readLongString();
        assert(str1.matchedText == expected2);
        assert(str1.isFinal);
        assert(t.readInput() == after);
    }

    test(`'''Hello, world'''`, "Hello, world", 0);
    testMultiPart(`'''Hello! ''''''\U0001F44D'''`, "Hello! ", "👍", 0);
    test(`'''0xFOOBAR''',`, "0xFOOBAR", ',');
    test(`'''Hello, 'world'!'''`, "Hello, \'world\'!", 0);
    testMultiPart(`'''Hello,'''''' world!'''`, "Hello,", " world!", 0);
    testMultiPart(`'''Hello,''' ''' world!'''`, "Hello,", " world!", 0);
    // Test the normalization of new-lines in long strings here.
    testNewLine("'''Hello, \r\n''' '''world!'''", "Hello, ", "world!", true, true, 0); // normalized, crlf precedes end of string
    testNewLine("'''Hello, \r\n world!'''", "Hello, ", " world!", true, false, 0); // normalized, but there is extra text
    testNewLine("'''Hello, \n''' '''world!'''", "Hello, \n", "world!", false, false, 0); // not normalized, no extra text
    testNewLine("'''Hello, \r''' '''world!'''", "Hello, ", "world!", true, true, 0); // normalized, crlf precedes end of string
    testNewLine("'''Hello, \r \nworld!'''", "Hello, ", " \nworld!", true, false, 0); // normalized, but there is extra text
}

/+
    Read the contents of a clob, and return it as an untyped array.
717 718 $(NOTE As per Ion specification, a clob does not contain Base64 data. Use readBlob if you are expecting to decode Base64 data.) 719 720 Params: 721 longClob = Should this function concatenate the contents of multiple clobs within the brackets? 722 t = The tokenizer 723 Returns: 724 An untyped array containing the contents of the clob. This array is guaranteed to have no UTF-8/UTF-32 characters -- only ASCII characters. 725 +/ 726 727 IonTextClob readClob(bool longClob = false)(return ref IonTokenizer t) @safe @nogc pure 728 { 729 // Always read out bytes, as clobs are octet-based (and not necessarily a string) 730 auto data = readString!(longClob, true)(t); 731 char c; 732 static if (longClob) { 733 data.isLongClob = true; 734 if (data.isFinal) { 735 c = t.skipLobWhitespace(); 736 if (c == '\'' && t.isTripleQuote()) { 737 data.isFinal = false; 738 return data; 739 } 740 } 741 } 742 743 // read out the following }} ONLY if we don't encounter an escape sequence 744 if (data.isFinal) { 745 static if (longClob) { 746 c = t.expect!("a == '}'", true)(c); 747 } else { 748 c = t.expect!("a == '}'", true)(t.skipLobWhitespace()); // after skipping any whitespace, it should be the terminator ('}') 749 } 750 c = t.expect!"a == '}'"; // and no whitespace should between one bracket and another 751 } 752 753 return data; 754 } 755 // Test reading a short clob 756 version(mir_ion_parser_test) unittest 757 { 758 import mir.deser.text.tokenizer : tokenizeString; 759 import mir.deser.text.tokens : IonTokenType; 760 761 void test(string ts, string expected, char after) { 762 auto t = tokenizeString(ts); 763 assert(t.nextToken()); 764 assert(t.currentToken == IonTokenType.TokenString); 765 assert(t.readClob().matchedText == expected); 766 assert(t.readInput() == after); 767 } 768 769 test(`"Hello, world"}}`, "Hello, world", 0); 770 test(`"0xF00BAR"}}, `, "0xF00BAR", ','); 771 } 772 773 /+ 774 Helper to read a long clob from the input stream. 
775 776 See [readClob] for any notes. 777 Params: 778 t = The tokenizer 779 Returns: 780 An untyped array holding the contents of the clob. 781 +/ 782 IonTextClob readLongClob(return ref IonTokenizer t) @safe @nogc pure 783 { 784 return readClob!(true)(t); 785 } 786 787 /+ 788 Read a blob from the input stream, and return the Base64 contents. 789 790 $(NOTE This function does not verify if the Base64 contained is valid, or if it is even Base64.) 791 Params: 792 t = The tokenizer 793 Returns: 794 An untyped array containing the Base64 contents of the blob. 795 +/ 796 IonTextBlob readBlob(return ref IonTokenizer t) @safe @nogc pure 797 { 798 IonTextBlob val; 799 size_t startIndex = t.position, endIndex = t.position; 800 char c; 801 while (true) { 802 c = t.expect!("a != 0", true)(t.skipLobWhitespace()); 803 if (c == '}') { 804 break; 805 } else { 806 endIndex = t.position; // grow our end index as we get more data 807 } 808 } 809 810 c = t.expect!"a == '}'"; 811 t.finished = true; 812 val.matchedText = t.input[startIndex .. endIndex]; 813 val.matchedIndex = startIndex; 814 return val; 815 } 816 /+ 817 Read a number from the input stream, and return the type of number, as well as the number itself. 818 819 Params: 820 t = The tokenizer 821 Returns: 822 A struct holding the type and value of the number. 823 See the examples below on how to access the type/value. 
+/

IonTextNumber readNumber(return ref IonTokenizer t) @safe @nogc pure
{
    import mir.ion.type_code : IonTypeCode;
    IonTextNumber num;
    immutable size_t startIndex = t.position;

    // Exponent part: an optional sign followed by at least one digit.
    void readExponent() @safe @nogc pure {
        char e = t.readInput();
        if (e == '+' || e == '-') {
            e = t.expect!"a != 0";
        }

        readDigits(t, e);
    }

    // optional sign
    char ch = t.readInput();
    if (ch == '-' || ch == '+') {
        ch = t.readInput();
    }

    // integer part
    immutable char leader = ch;
    const(char)[] digits = readDigits(t, leader);
    // if it is not just a plain 0, fail since we don't support leading zeros
    if (leader == '0' && digits.length != 1) {
        throw IonTokenizerErrorCode.invalidLeadingZeros.ionTokenizerException;
    }

    // optional fraction; the digits after the '.' may be absent
    ch = t.readInput();
    if (ch == '.') {
        ch = t.readInput();
        if (ch.isDigit) {
            immutable char decimalLeader = t.expect!("a != 0", true)(ch);
            readDigits(t, decimalLeader);
            ch = t.readInput();
        }
    }

    // optional exponent, introduced by e/E or d/D
    if (ch == 'e' || ch == 'E' || ch == 'd' || ch == 'D') {
        readExponent();
    } else {
        // this is not a character we want, so unread it (for now)
        t.unread(ch);
    }

    // the number must be followed by a stop character, which is left unread
    ch = t.expect!(t.isStopChar);
    t.unread(ch);
    num.matchedText = t.input[startIndex .. t.position];
    num.matchedIndex = startIndex;

    return num;
}
// Test reading numbers
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;
    import mir.ion.type_code : IonTypeCode;

    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenNumber);
        auto n = t.readNumber();
        assert(n.matchedText == expected);
        assert(t.readInput() == after);
    }

    test("12341", "12341", 0);
    test("-12312", "-12312", 0);
    test("0.420d2", "0.420d2", 0);
    test("1.1999999999999999555910790149937383830547332763671875e0",
        "1.1999999999999999555910790149937383830547332763671875e0", 0);
    test("1.1999999999999999e0, ", "1.1999999999999999e0", ',');
}

/+
    Read as many digits from the input stream as possible, given the first digit of the digits.

    The leader is validated, pushed back onto the input, and the whole run is
    then read by `readRadixDigits` with its default validator.
    Params:
        t = The tokenizer
        leader = The leading digit of the sequence, already read from the input
    Returns:
        The slice of the input that `readRadixDigits` consumed, starting at the leader.
    Throws:
        IonTokenizerException if `leader` is not a digit.
+/
const(char)[] readDigits(return ref IonTokenizer t, char leader) @safe @nogc pure
{
    if (!isDigit(leader)) {
        throw IonTokenizerErrorCode.expectedValidLeader.ionTokenizerException;
    }
    t.unread(leader); // unread so the readRadixDigits can consume it
    return readRadixDigits(t);
}

/+
    Read as many digits from the input stream as possible, given a validator.

    This function will stop reading digits as soon as the validator returns false.
    Params:
        isValid = The validation function which is called to determine if the reader should halt.
935 t = The tokenizer 936 buf = The appender on which this function will put it's output 937 Returns: 938 A character located after it has read every single digit in a sequence. 939 +/ 940 const(char)[] readRadixDigits(alias isValid = isDigit)(return ref IonTokenizer t) 941 { 942 import mir.functional : naryFun; 943 size_t startIndex = t.position; 944 while (true) { 945 char c = t.readInput(); 946 if (c == '_') { 947 t.expect!(isValid, true)(t.peekOne()); 948 continue; // skip over it 949 } 950 951 if (!naryFun!isValid(c)) { 952 t.unread(c); 953 return t.input[startIndex .. t.position]; 954 } 955 } 956 } 957 958 /+ 959 Read a radix number, given two validation functions for it's marker and the validity of each digit read. 960 961 Params: 962 isMarker = A validation function to check if the marker is valid (0b/0x/etc) 963 isValid = A validation function to check if every digit found is valid (0-1/0-9A-F/etc) 964 t = The tokenizer 965 Returns: 966 A string containing the full radix number (including the leading 0 and marker). 967 +/ 968 const(char)[] readRadix(alias isMarker, alias isValid)(return ref IonTokenizer t) @safe @nogc pure 969 { 970 size_t startIndex = t.position; 971 char c = t.readInput(); 972 if (c == '-') { 973 c = t.readInput(); 974 } 975 976 // 0 977 t.expect!("a == '0'", true)(c); 978 // 0(b || x) 979 c = t.expect!isMarker; 980 t.expect!("a != '_'", true)(t.peekOne()); // cannot be 0x_ or 0b_ 981 const(char)[] val = readRadixDigits!(isValid)(t); 982 c = t.readInput(); 983 if (c) { 984 c = t.expect!(t.isStopChar, true)(c); 985 t.unread(c); 986 } 987 988 return t.input[startIndex .. t.position]; 989 } 990 991 /+ 992 Read a binary number (marked by '0b') from the input stream. 993 994 Params: 995 t = The tokenizer 996 Returns: 997 A string containing the entire binary number read. 
+/
// Thin wrapper around readRadix: the marker predicate accepts 'b' or 'B',
// the digit predicate accepts '0' or '1'.
const(char)[] readBinary(return ref IonTokenizer t) @safe @nogc pure
{
    return readRadix!("a == 'b' || a == 'B'", "a == '0' || a == '1'")(t);
}
// Test reading a binary number
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // Tokenizes `ts`, checks that the first token is a binary number whose
    // text is `expected`, and that `after` is the next character of the input.
    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenBinary);
        assert(t.readBinary() == expected);
        assert(t.readInput() == after);
    }

    test("0b101011010", "0b101011010", 0);
    test("0b100000101000001010000010100000101000001 ", "0b100000101000001010000010100000101000001", ' ');
    test("0b11011110101011011011111011101111,", "0b11011110101011011011111011101111", ',');
    // leading whitespace is not part of the matched text
    test(" 0b11011110101011011011111011101111,", "0b11011110101011011011111011101111", ',');
}

/+
    Read a hex number (marked by '0x') from the input stream.

    Params:
        t = The tokenizer
    Returns:
        A string containing the entire hex number read.
+/
// Thin wrapper around readRadix: the marker predicate accepts 'x' or 'X',
// digits are validated with isHexDigit.
const(char)[] readHex(return ref IonTokenizer t) @safe @nogc pure
{
    return readRadix!("a == 'x' || a == 'X'", isHexDigit)(t);
}
// Test reading a hex number
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    // Tokenizes `ts`, checks that the first token is a hex number whose text
    // is `expected`, and that `after` is the next character of the input.
    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenHex);
        assert(t.readHex() == expected);
        assert(t.readInput() == after);
    }

    // Same as `test`, then reads a second hex number directly after `after`
    // (without going through nextToken again).
    void testMultipart(string ts, string expected1, char after, string expected2) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenHex);
        assert(t.readHex() == expected1);
        assert(t.readInput() == after);
        assert(t.readHex() == expected2);
    }

    test("0xBADBABE", "0xBADBABE", 0);
    test("0x414141", "0x414141", 0);
    test("0x0", "0x0", 0);
    // leading whitespace is not part of the matched text
    test(" 0x414141", "0x414141", 0);
    test(" 0x414141,", "0x414141", ',');
    testMultipart(" 0x414141,0x414142", "0x414141", ',', "0x414142");
}

/+
    Read a ISO-8601 extended timestamp from the input stream.

    $(NOTE This function does some rudimentary checks to see if the timestamp is valid,
    but it does nothing more than that.)

    Params:
        t = The tokenizer
    Returns:
        A string containing the entire timestamp read from the input stream.
+/

IonTextTimestamp readTimestamp(return ref IonTokenizer t) @safe @nogc pure
{
    IonTextTimestamp val;
    // Remember where the token starts so the matched text can be sliced
    // out of the input once the end of the timestamp has been found.
    size_t startIndex = t.position;

    // Expect `nums` digits in a row, then read and return the character
    // that follows them.
    char readTSDigits(int nums) @safe @nogc pure {
        for (int i = 0; i < nums; i++) {
            t.expect!isDigit;
        }
        return t.readInput();
    }

    // If `c` is a sign, read an "hh:mm" offset and return the character
    // after it; otherwise hand `c` back untouched.
    char readTSOffset(char c) @safe @nogc pure {
        if (c != '-' && c != '+') {
            return c;
        }
        // hh must be followed by ':' (the validated character itself is not needed)
        t.expect!("a == ':'", true)(readTSDigits(2));
        return readTSDigits(2);
    }

    // `c` must start either a signed offset or be a 'Z'/'z' marker.
    // Returns the character following the offset / marker.
    char readTSOffsetOrZ(char c) @safe @nogc pure {
        t.expect!("a == '-' || a == '+' || a == 'z' || a == 'Z'", true)(c);
        if (c == '-' || c == '+') {
            return readTSOffset(c);
        }
        if (c == 'z' || c == 'Z') {
            return t.readInput();
        }
        assert(0);
    }

    // Record everything consumed so far (from startIndex up to the current
    // position) as the matched timestamp. Every exit path goes through here
    // so that matchedIndex and matchedText are always filled in together.
    IonTextTimestamp finishAtPosition() @safe @nogc pure {
        val.matchedIndex = startIndex;
        val.matchedText = t.input[startIndex .. t.position];
        return val;
    }

    // `c` is the character read just past the timestamp: it has to be a stop
    // character, and it is handed back to the tokenizer before finishing.
    IonTextTimestamp readTSFinish(char c) @safe @nogc pure {
        t.expect!(t.isStopChar, true)(c);
        t.unread(c);
        return finishAtPosition();
    }

    // could be either:
    // yyyy(T || -)
    // or hh
    char c = readTSDigits(2);
    // is this a year? if so, then the character after
    // the first two digits should be a digit... if not,
    // we can just assume that it's a "time of day"
    if (c.isDigit())
    {
        c = t.expect!("a == 'T' || a == '-'", true)(readTSDigits(1));
        if (c == 'T') {
            // yyyyT
            return finishAtPosition();
        }
        // yyyy-mm(T || -)
        c = t.expect!("a == 'T' || a == '-'", true)(readTSDigits(2));
        if (c == 'T') {
            // yyyy-mmT
            return finishAtPosition();
        }
        // yyyy-mm-dd(T)?
        c = readTSDigits(2);
        if (c != 'T') {
            return readTSFinish(c);
        }
        // yyyy-mm-ddT
        c = t.readInput();
        if (!c.isDigit()) {
            // yyyy-mm-ddT(+ || -)hh:mm
            c = readTSOffset(c);
            return readTSFinish(c);
        }
        // (yyyy-mm-ddT)?hh
        c = t.expect!("a == ':'", true)(readTSDigits(1));
    }
    else
    {
        // hh
        c = t.expect!("a == ':'", true)(c);
    }

    // (yyyy-mm-ddT)?hh:mm
    c = readTSDigits(2);
    if (c != ':') {
        // (yyyy-mm-ddT)?hh:mm(+-|Z)?
        if (c) {
            c = readTSOffsetOrZ(c);
        }
        return readTSFinish(c);
    }

    // (yyyy-mm-ddT)?hh:mm:ss
    c = readTSDigits(2);

    if (c != '.') {
        // (yyyy-mm-ddT)?hh:mm:ss(Z)?
        if (c) {
            c = readTSOffsetOrZ(c);
        }
        return readTSFinish(c);
    }

    // (yyyy-mm-ddT)?hh:mm:ss.ssssZ
    c = t.readInput();
    if (c.isDigit()) {
        readDigits(t, c);
    }

    c = t.readInput();
    if (c) {
        c = readTSOffsetOrZ(c);
    }

    return readTSFinish(c);
}
// Test reading timestamps
version(mir_ion_parser_test) unittest
{
    import mir.deser.text.tokenizer : tokenizeString;
    import mir.deser.text.tokens : IonTokenType;

    void test(string ts, string expected, char after) {
        auto t = tokenizeString(ts);
        assert(t.nextToken());
        assert(t.currentToken == IonTokenType.TokenTimestamp);
        auto stamp = t.readTimestamp();
        assert(stamp.matchedText == expected);
        // matchedIndex has to point at the start of the matched text,
        // no matter which form of timestamp was read
        assert(t.input[stamp.matchedIndex .. stamp.matchedIndex + expected.length] == expected);
        assert(t.readInput() == after);
    }

    test("2001T", "2001T", 0);
    test("2001-01T,", "2001-01T", ',');
    test("2001-01-02}", "2001-01-02", '}');
    test("2001-01-02T ", "2001-01-02T", ' ');
    test("2001-01-02T+00:00\t", "2001-01-02T+00:00", '\t');
    test("2001-01-02T-00:00\n", "2001-01-02T-00:00", '\n');
    test("2001-01-02T03:04+00:00 ", "2001-01-02T03:04+00:00", ' ');
    test("2001-01-02T03:04-00:00 ", "2001-01-02T03:04-00:00", ' ');
    test("2001-01-02T03:04Z ", "2001-01-02T03:04Z", ' ');
    test("2001-01-02T03:04z ", "2001-01-02T03:04z", ' ');
    test("2001-01-02T03:04:05Z ", "2001-01-02T03:04:05Z", ' ');
    test("2001-01-02T03:04:05+00:00 ", "2001-01-02T03:04:05+00:00", ' ');
    test("2001-01-02T03:04:05.666Z ", "2001-01-02T03:04:05.666Z", ' ');
    test("2001-01-02T03:04:05.666666z ", "2001-01-02T03:04:05.666666z", ' ');

    // Short forms that do not start at index 0 of the input, so that a
    // missing matchedIndex would be noticed
    test(" 2001T,", "2001T", ',');
    test(" 2001-01T,", "2001-01T", ',');

    // Test new "time of day" timestamps
    test("03:04+00:00", "03:04+00:00", 0);
    test("03:04-00:00", "03:04-00:00", 0);
    test("03:04Z", "03:04Z", 0);
    test("03:04z", "03:04z", 0);
    test("03:04:05Z", "03:04:05Z", 0);
    test("03:04:05+00:00", "03:04:05+00:00", 0);
    test("03:04:05.666Z", "03:04:05.666Z", 0);
    test("03:04:05.666z", "03:04:05.666z", 0);
    test("03:04:05.666666Z", "03:04:05.666666Z", 0);
    test("03:04:05.666666z", "03:04:05.666666z", 0);
}