/+
Tokenizer to split up the contents of an Ion Text file into tokens

Authors: Harrison Ford
+/
module mir.deser.text.tokenizer;

import mir.deser.text.readers;
import mir.deser.text.skippers;
import mir.deser.text.tokens;

/+
Create a tokenizer for a given UTF-8 string.

The tokenizer stores a reference to (a slice of) the given string; the
input is not copied.

$(NOTE If this string is not a UTF-8 string, consider using the overload which accepts a UTF-16/UTF-32 string.)

Params:
    input = String to tokenize
Returns:
    [IonTokenizer]
+/
IonTokenizer tokenizeString(const(char)[] input) @safe @nogc pure {
    return IonTokenizer(input);
}

/+
Tokenizer based off of how ion-go handles tokenization
+/
struct IonTokenizer {
    /+ Our input range that we read from +/
    const(char)[] input;

    /+ The current window that we're reading from (sliding window over `input`, starting at `position`) +/
    const(char)[] window;

    /+ The escape sequence that we're reading from the wire +/
    char[4] escapeSequence;

    /+ Bool specifying if we want to read through the contents of the current token +/
    bool finished;

    /+ Current position within our input range +/
    size_t position;

    /+ Current token that we're located on +/
    IonTokenType currentToken;

    /+
    Constructor

    Stores the input slice (no copy is made) and positions the window at
    the start of the input.
    Params:
        input = The input range to read over
    +/
    this(const(char)[] input) @safe @nogc pure {
        this.input = input;
        this.finished = true;
        resizeWindow(0);
    }

    /+
    Update the sliding window's beginning index
    Params:
        start = The beginning index to start at
    Throws:
        [IonTokenizerException] if `start` is past the end of the input
    +/
    void resizeWindow(size_t start) @safe @nogc pure {
        if (start > input.length) {
            throw IonTokenizerErrorCode.cannotUpdateWindow.ionTokenizerException;
        }

        window = input[start .. $];
        this.position = start;
    }

    /+
    Clear out the escape sequence buffer (fill it with NUL bytes).
    +/
    void resetEscapeBuffer() @safe @nogc pure {
        this.escapeSequence[0] = '\0';
        this.escapeSequence[1] = '\0';
        this.escapeSequence[2] = '\0';
        this.escapeSequence[3] = '\0';
    }

    /+
    Check if we are at the end of our range.
    Returns:
        true if end of file, false otherwise
    +/
    bool isEOF() @safe @nogc pure {
        return this.window.length == 0
            || this.currentToken == IonTokenType.TokenEOF
            || this.position >= this.input.length;
    }

    /+
    Unread a given character by stepping the sliding window back one
    character (there is no separate peek buffer).

    $(NOTE Unreading EOF (`c == 0`) is a no-op; the position is left unchanged.)
    Params:
        c = Character to unread; must be the character most recently read.
    Throws:
        [IonTokenizerException] if called while at position 0
    +/
    void unread(char c) @safe @nogc pure {
        if (this.position <= 0) {
            throw IonTokenizerErrorCode.cannotUnreadAtPos0.ionTokenizerException;
        }

        if (c == 0) {
            return;
        } else {
            resizeWindow(this.position - 1);
        }
    }
    // Test reading / unreading bytes
    version(mir_ion_parser_test) unittest
    {
        auto t = tokenizeString("abc\rd\ne\r\n");

        t.testRead('a');
        t.unread('a');

        t.testRead('a');
        t.testRead('b');
        t.testRead('c');
        t.unread('c');
        t.unread('b');

        t.testRead('b');
        t.testRead('c');
        t.testRead('\r');
        t.unread('\r');

        t.testRead('\r');
        t.testRead('d');
        t.testRead('\n');
        t.testRead('e');
        t.testRead('\r');
        t.testRead('\n');
        t.testRead(0); // test EOF

        t.unread(0); // unread EOF (no-op)
        t.unread('\n');

        t.testRead('\n');
        t.testRead(0); // test EOF
        t.testRead(0); // test EOF
    }

    /+
    Skip a single character within our input range, and discard it
    Returns:
        true if it was able to skip a single character,
        false if it was unable (due to hitting an EOF or the like)
    +/
    bool skipOne() @safe @nogc pure {
        const(char) c = readInput();
        if (c == 0) {
            return false;
        }
        return true;
    }

    /+
    Skip exactly n input characters from the input range

    $(NOTE
        This function will only return true IF it is able to skip *the entire amount specified*)
    Params:
        n = Number of characters to skip
    Returns:
        true if skipped the entire range,
        false if unable to skip the full range specified.
    +/
    bool skipExactly(size_t n) @safe @nogc pure {
        for (size_t i = 0; i < n; i++) {
            if (!skipOne()) {
                return false;
            }
        }
        return true;
    }

    /+
    Read ahead at most n characters from the input range without discarding them.

    $(NOTE
        This function does not require n characters to be present.
        If it encounters an EOF, it will simply return a shorter range.)
    Params:
        wanted = Max number of characters to peek
    Returns:
        Array of peeked characters
    +/
    auto peekMax(size_t wanted = 4096) @safe @nogc pure {
        size_t n = wanted;
        if (n >= window.length) {
            n = window.length;
        }

        auto arr = window[0 .. n];
        return arr;
    }

    /+
    Read ahead exactly n characters from the input range without discarding them.

    $(NOTE
        This function will throw if all n characters are not present.
        If you would like to peek as many as possible, use [peekMax] instead.)
    Params:
        required = Number of characters to peek
    Returns:
        An array filled with n characters.
    Throws:
        [IonTokenizerException]
    +/
    auto peekExactly(size_t required = 4096) @safe @nogc pure {
        size_t n = required;
        if (n > window.length) {
            unexpectedEOF(); // throws; the slice below is only reached when enough input remains
        }

        auto buf = window[0 .. n];

        return buf;
    }
    // Test peekExactly
    version(mir_ion_parser_test) unittest
    {
        import std.exception : assertThrown;
        import mir.exception : enforce;
        import mir.deser.text.tokens : IonTokenizerException;

        auto t = tokenizeString("abc\r\ndef");

        assert(t.peekExactly(1).ptr == t.window.ptr);
        assert(t.peekExactly(1) == "a");
        assert(t.peekExactly(2) == "ab");
        assert(t.peekExactly(3) == "abc");

        t.testRead('a');
        t.testRead('b');

        assert(t.peekExactly(3).ptr == t.window.ptr);
        assert(t.peekExactly(3) == "c\r\n");
        assert(t.peekExactly(2) == "c\r");
        assert(t.peekExactly(3) == "c\r\n");

        t.testRead('c');
        t.testRead('\r');
        t.testRead('\n');
        t.testRead('d');

        assertThrown!IonTokenizerException(t.peekExactly(3));
        assertThrown!IonTokenizerException(t.peekExactly(3));
        assert(t.peekExactly(2) == "ef");

        t.testRead('e');
        t.testRead('f');
        t.testRead(0);

        assertThrown!IonTokenizerException(t.peekExactly(10));
    }

    /+
    Read ahead one character from the input range without discarding it.

    $(NOTE
        This function will throw if it cannot read one character ahead.
        Use [peekMax] if you want to read without throwing.)
    Returns:
        A single character read ahead from the input range.
    Throws:
        [IonTokenizerException]
    +/
    char peekOne() @safe @nogc pure {
        if (isEOF) {
            this.unexpectedEOF();
        }

        char c;
        c = readInput();
        unread(c);

        return c;
    }
    // Test peeking the next byte in the stream
    version(mir_ion_parser_test) unittest
    {
        import std.exception : assertThrown;
        import mir.deser.text.tokens : IonTokenizerException;

        auto t = tokenizeString("abc");

        t.testPeek('a');
        t.testPeek('a');
        t.testRead('a');

        t.testPeek('b');
        t.unread('a');

        t.testPeek('a');
        t.testRead('a');
        t.testRead('b');
        t.testPeek('c');
        t.testPeek('c');
        t.testRead('c');

        assertThrown!IonTokenizerException(t.peekOne() == 0);
        assertThrown!IonTokenizerException(t.peekOne() == 0);
        assert(t.readInput() == 0);
    }

    /+
    Read a single character from the input range and advance the window past it.

    $(NOTE `readInput` does NOT normalize CRLF to a simple new-line.)
    Returns:
        a single character from the input range, or 0 if the EOF is encountered.
    Throws:
        [IonTokenizerException]
    +/
    char readInput() @safe @nogc pure {
        if (isEOF) {
            return 0;
        }

        immutable char c = this.window[0];
        resizeWindow(this.position + 1);
        /*
        if (c == '\r') {
            // EOFs should've been normalized at the first stage
            throw Mir(IonTokenizerErrorCode.normalizeEOFFail);
        }
        */

        return c;
    }
    // Test reading bytes off of a range
    version(mir_ion_parser_test) unittest
    {
        auto t = tokenizeString("abcdefghijklmopqrstuvwxyz1234567890");
        t.testRead('a');
        t.testRead('b');
        t.testRead('c');
        t.testRead('d');
        t.testRead('e');
        t.testRead('f');
        t.testRead('g');
        t.testRead('h');
        t.testRead('i');
    }
    // Test that CRLF sequences are passed through unmodified (readInput does NOT normalize them)
    version(mir_ion_parser_test) unittest
    {
        auto t = tokenizeString("a\r\nb\r\nc\rd");
        t.testRead('a');
        t.testRead('\r');
        t.testRead('\n');
        t.testRead('b');
        t.testRead('\r');
        t.testRead('\n');
        t.testRead('c');
        t.testRead('\r');
        t.testRead('d');
        t.testRead(0);
    }

    /+
    Skip any whitespace that is present between our current token and the next valid token.

    Additionally, skip comments (or fail on comments).

    $(NOTE `skipComments` and `failOnComment` cannot both be true.)
    Returns:
        The character located directly after the whitespace.
    Throws:
        [IonTokenizerException]
    +/
    char skipWhitespace(bool skipComments = true, bool failOnComment = false)() @safe @nogc pure
    if (skipComments != failOnComment || (skipComments == false && skipComments == failOnComment)) { // just a sanity check, we cannot skip comments and also fail on comments -- it is one or another (fail or skip)
        while (true) {
            char c = readInput();
            sw: switch(c) {
                static foreach(member; ION_WHITESPACE) {
                    case member:
                        break sw;
                }

                case '/': {
                    static if (failOnComment) {
                        throw IonTokenizerErrorCode.commentsNotAllowed.ionTokenizerException;
                    } else static if(skipComments) {
                        // Peek on the next letter, and check if it's a second slash / star
                        // This may fail if we read a comment and do not find the end (newline / '*/')
                        // Undetermined if I need to unread the last char if this happens?
                        if (this.skipComment())
                            break;
                        else
                            goto default;
                    }
                    else {
                        return '/';
                    }
                }
                // If this is a non-whitespace character, return it to the caller
                default:
                    return c;
            }
        }
        return 0;
    }
    // Test skipping over whitespace
    version(mir_ion_parser_test) unittest
    {
        import std.exception : assertNotThrown;
        import mir.exception : enforce;
        import mir.deser.text.tokens : IonTokenizerException;
        void test(string txt, char expectedChar) {
            auto t = tokenizeString(txt);
            assertNotThrown!IonTokenizerException(
                enforce!"skipWhitespace did not return expected character"(t.skipWhitespace() == expectedChar)
            );
        }

        test("/ 0)", '/');
        test("xyz_", 'x');
        test(" / 0)", '/');
        test(" xyz_", 'x');
        test(" \t\r\n / 0)", '/');
        test("\t\t // comment\t\r\n\t\t x", 'x');
        test(" \r\n /* comment *//* \r\n comment */x", 'x');
    }

    /+
    Skip whitespace within a clob/blob.

    This is a wrapper around skipWhitespace with comment handling disabled
    entirely (`skipWhitespace!(false, false)`): comments are neither skipped
    nor rejected here, so a leading '/' is simply returned to the caller
    (see the unittest below, where "//=" yields '/').
    Returns:
        a character located after the whitespace within a clob/blob
    Throws:
        IonTokenizerException (propagated from the underlying reads)
    +/
    char skipLobWhitespace() @safe @nogc pure {
        return skipWhitespace!(false, false);
    }
    // Test skipping over whitespace within a (c|b)lob
    version(mir_ion_parser_test) unittest
    {
        import std.exception : assertNotThrown;
        import mir.exception : enforce;
        import mir.deser.text.tokens : IonTokenizerException;
        void test(string txt, char expectedChar)() {
            auto t = tokenizeString(txt);
            assertNotThrown!IonTokenizerException(
                enforce!"Lob whitespace did not match expected character"(t.skipLobWhitespace() == expectedChar)
            );
        }

        test!("//=", '/');
        test!("xyz_", 'x');
        test!(" //=", '/');
        test!(" xyz_", 'x');
        test!("\r\n\t//=", '/');
        test!("\r\n\txyz_", 'x');
    }

    /+
    Check if the next characters within the input range are a double colon, representing an annotation.
    Returns:
        true if it finds a double colon, false if it does not.
    +/
    bool isDoubleColon() @safe @nogc pure {
        char c = skipWhitespace();
        unread(c);

        auto cs = peekMax(2);
        if (cs.length == 2 && cs[0] == ':' && cs[1] == ':') {
            return true;
        }
        return false;
    }

    /+
    Check if the next characters within the input range are the special "nan" type.
    Params:
        c = The last character read off of the stream (typically 'n')
    Returns:
        true if it is the nan type, false if it is not.
    +/
    bool isNAN(char c) @safe @nogc pure {
        if (c != 'n') return false;

        auto cs = peekMax(4);

        if (cs.length < 2 || cs[0] != 'a' || cs[1] != 'n') {
            return false;
        }

        if (cs.length == 2) { // is this just 'an' + EOF?
            skipExactly(2);
            return true;
        } else if (cs.length == 3 && isStopChar(cs[2])) { // is this 'an' + stop char
            // NOTE(review): unlike isInfinity (which uses `cs.length > 3`), this branch
            // requires exactly one trailing character, so "nan" followed by a stop char
            // plus further input falls through to `return false` — verify this is intended.
            skipExactly(2);
            return true;
        // is this 'an' + comment (block or regular)
        } else if ((cs.length > 2 && cs[2] == '/') && cs.length > 3 && (cs[3] == '/' || cs[3] == '*')) {
            skipExactly(2);
            return true;
        }

        return false;
    }
    // Test scanning for nan
    version(mir_ion_parser_test) unittest
    {
        void test(string txt, bool nan, char after) {
            auto t = tokenizeString(txt);
            auto c = t.readInput();
            assert(t.isNAN(c) == nan);
            assert(t.readInput() == after);
        }

        test("nan", true, 0);
        test("nan/*", true, '/');
        test("nan\t", true, '\t');
        test("nan\n", true, '\n');
        test("nan ", true, ' ');

        test("-nan", false, 'n');
        test("+nan", false, 'n');
        test("nat\t", false, 'a');
        test("nat/*", false, 'a');
        test("nat//", false, 'a');
        test("na", false, 'a');
        test("n", false, 0);
    }


    /+
    Check if the next characters within the input range are the special "infinity" type.

    Params:
        c = The last character read off of the stream (typically '+' or '-')
    Returns:
        true if it is the infinity type, false if it is not.
    +/
    bool isInfinity(char c) @safe @nogc pure {
        if (c != '+' && c != '-') return false;

        auto cs = peekMax(5);

        if (cs.length < 3 || cs[0] != 'i' || cs[1] != 'n' || cs[2] != 'f') {
            return false;
        }

        if (cs.length == 3) {
            skipExactly(3);
            return true;
        } else if (cs.length > 3 && isStopChar(cs[3])) { // cleanly terminated with a stop char
            skipExactly(3);
            return true;
        } else if ((cs.length > 3 && cs[3] == '/') && cs.length > 4 && (cs[4] == '/' || cs[4] == '*')) {
            skipExactly(3);
            return true;
        }

        return false;
    }
    // Test scanning for inf
    version(mir_ion_parser_test) unittest
    {
        void test(string txt, bool inf, char after) {
            auto t = tokenizeString(txt);
            auto c = t.readInput();
            assert(t.isInfinity(c) == inf);
            assert(t.readInput() == after);
        }

        test("+inf", true, 0);
        test("-inf", true, 0);
        test("+inf ", true, ' ');
        test("-inf\t", true, '\t');
        test("-inf\n", true, '\n');
        test("+inf,", true, ',');
        test("-inf}", true, '}');
        test("+inf)", true, ')');
        test("-inf]", true, ']');
        test("+inf//", true, '/');
        test("+inf/*", true, '/');

        test("+inf/", false, 'i');
        test("-inf/0", false, 'i');
        test("+int//", false, 'i');
        test("+int/*", false, 'i');
        test("+int", false, 'i');
        test("-iot", false, 'i');
        test("+unf", false, 'u');
        test("_inf", false, 'i');

        test("-in", false, 'i');
        test("+i", false, 'i');
        test("+", false, 0);
        test("-", false, 0);
    }

    /+
    Check if the current character selected is part of a triple quote (''')

    $(NOTE This function will not throw if an EOF is hit. It will simply return false.)
    Returns:
        true if the character is part of a triple quote,
        false if it is not.
    +/
    bool isTripleQuote() @safe @nogc pure {
        auto cs = peekMax(2);

        // If the next two characters are '', then it is a triple-quote.
        if (cs.length == 2 && cs[0] == '\'' && cs[1] == '\'') {
            skipExactly(2);
            return true;
        }

        return false;
    }

    /+
    Check if the current character selected is part of a whole number.

    If it is part of a whole number, then return the type of number (hex, binary, timestamp, number)
    Params:
        c = The last character read from the range
    Returns:
        the corresponding number type (or invalid)
    +/
    IonTokenType scanForNumber(char c) @safe @nogc pure
    in {
        assert(isDigit(c), "Scan for number called with non-digit number");
    } do {
        const(char)[] cs;
        try {
            cs = peekMax(4);
        } catch(IonTokenizerException e) {
            return IonTokenType.TokenInvalid;
        }

        // Check if the first character is a 0, then check if the next character is a radix identifier (binary / hex)
        if (c == '0' && cs.length > 0) {
            switch(cs[0]) {
                case 'b':
                case 'B':
                    return IonTokenType.TokenBinary;

                case 'x':
                case 'X':
                    return IonTokenType.TokenHex;

                default:
                    break;
            }
        }

        // Otherwise, it's not, and we check if it's a timestamp or just a plain number.
        if (cs.length == 4) {
            if (!isDigit(cs[0])) return IonTokenType.TokenNumber;

            // "time-of-day" extension
            if (cs[1] == ':')
            {
                return IonTokenType.TokenTimestamp;
            }
            else
            {
                foreach(i; 1 .. 3)
                {
                    if (!isDigit(cs[i])) return IonTokenType.TokenNumber;
                }
                if (cs[3] == '-' || cs[3] == 'T') {
                    return IonTokenType.TokenTimestamp;
                }
            }

        }
        return IonTokenType.TokenNumber;

    }
    // Test scanning for numbers
    version(mir_ion_parser_test) unittest
    {
        import mir.deser.text.tokens : IonTokenType;

        void test(string txt, IonTokenType expectedToken) {
            auto t = tokenizeString(txt);
            auto c = t.readInput();
            assert(t.scanForNumber(c) == expectedToken);
        }

        test("0b0101", IonTokenType.TokenBinary);
        test("0B", IonTokenType.TokenBinary);
        test("0xABCD", IonTokenType.TokenHex);
        test("0X", IonTokenType.TokenHex);
        test("0000-00-00", IonTokenType.TokenTimestamp);
        test("0000T", IonTokenType.TokenTimestamp);

        test("0", IonTokenType.TokenNumber);
        test("1b0101", IonTokenType.TokenNumber);
        test("1B", IonTokenType.TokenNumber);
        test("1x0101", IonTokenType.TokenNumber);
        test("1X", IonTokenType.TokenNumber);
        test("1234", IonTokenType.TokenNumber);
        test("12345", IonTokenType.TokenNumber);
        test("1,23T", IonTokenType.TokenNumber);
        test("12,3T", IonTokenType.TokenNumber);
        test("123,T", IonTokenType.TokenNumber);
    }

    /+
    Set the current token, and if we want to go into the token.
    Params:
        token = The updated token type
        finished = Whether or not we want to go into the token (and parse it)
    +/
    void ok(IonTokenType token, bool finished) @safe @nogc pure {
        this.currentToken = token;
        this.finished = finished;
    }

    /+
    Read the next token from the range.
    Returns:
        true if it was able to read a valid token from the range.
    +/
    bool nextToken() @safe @nogc pure {
        char c;
        // if we're finished with the current value, then skip over the rest of it and go to the next token
        // this typically happens when we hit commas (or the like) and don't have anything to extract
        if (!this.finished) {
            c = this.skipValue();
        } else {
            c = skipWhitespace();
        }

        // NOTE: these variable declarations are up here
        // since we would miss them within the switch decl.

        // have we hit an inf?
        bool inf;

        // second character
        char cs;

        with(IonTokenType) switch(c) {
            case 0:
                ok(TokenEOF, false);
                return true;
            case ':':
                cs = peekOne();
                if (cs == ':') {
                    skipOne();
                    ok(TokenDoubleColon, true);
                } else {
                    ok(TokenColon, true);
                }
                return true;
            case '{':
                cs = peekOne();
                if (cs == '{') {
                    skipOne();
                    ok(TokenOpenDoubleBrace, false);
                } else {
                    ok(TokenOpenBrace, false);
                }
                return true;
            case '}':
                ok(TokenCloseBrace, true);
                return true;
            case '[':
                ok(TokenOpenBracket, false);
                return true;
            case ']':
                ok(TokenCloseBracket, false);
                return true;
            case '(':
                ok(TokenOpenParen, false);
                return true;
            case ')':
                ok(TokenCloseParen, false);
                return true;
            case ',':
                ok(TokenComma, true);
                return true;
            case '.':
                cs = peekOne();
                if (isOperatorChar(cs)) {
                    unread(cs);
                    ok(TokenSymbolOperator, false);
                    return true;
                }

                if (cs == ' ' || isIdentifierPart(cs)) {
                    unread(cs);
                }
                ok(TokenDot, true);
                return true;
            case '\'':
                if (isTripleQuote()) {
                    ok(TokenLongString, false);
                    return true;
                }
                ok(TokenSymbolQuoted, false);
                return true;
            case '+':
                inf = isInfinity(c);
                if (inf) {
                    ok(TokenFloatInf, true);
                    return true;
                }
                unread(c);
                ok(TokenSymbolOperator, false);
                return true;
            case '-':
                cs = peekOne();
                if (isDigit(cs)) {
                    skipOne();
                    IonTokenType tokenType = scanForNumber(cs);
                    if (tokenType == TokenTimestamp) {
                        throw IonTokenizerErrorCode.negativeTimestamp.ionTokenizerException;
                    }
                    unread(cs);
                    unread(c);
                    ok(tokenType, false);
                    return true;
                }

                inf = isInfinity(c);
                if (inf) {
                    ok(TokenFloatMinusInf, true);
                    return true;
                }
                unread(c);
                ok(TokenSymbolOperator, false);
                return true;

            // '+', '-', '"' and '.' already have dedicated cases above
            static foreach(member; ION_OPERATOR_CHARS) {
                static if (member != '+' && member != '-' && member != '"' && member != '.') {
                    case member:
                        unread(c);
                        ok(TokenSymbolOperator, false);
                        return true;
                }
            }

            case '"':
                ok(TokenString, false);
                return true;

            static foreach(member; ION_IDENTIFIER_START_CHARS) {
                case member:
                    static if (member == 'n') {
                        if (isNAN(c)) {
                            ok(TokenFloatNaN, false);
                            return true;
                        }
                    }
                    unread(c);
                    ok(TokenSymbol, false);
                    return true;
            }

            static foreach(member; ION_DIGITS) {
                case member:
                    IonTokenType t = scanForNumber(c);
                    unread(c);
                    ok(t, false);
                    return true;
            }

            default:
                unexpectedChar(c);
                return false;
        }
    }

    /+
    Finish reading the current token, and skip to the end of it.

    This function will only work if we are in the middle of reading a token.
    Returns:
        false if we already finished with a token,
        true if we were able to skip to the end of it.
    Throws:
        IonTokenizerException if we were not able to skip to the end.
    +/
    bool finish() @safe @nogc pure {
        if (finished) {
            return false;
        }

        immutable char c = this.skipValue();
        unread(c);
        finished = true;
        return true;
    }

    /+
    Check if the given character is a "stop" character.

    Stop characters are typically terminators of objects, but here we overload and check if there's a comment after our character.
    Params:
        c = The last character read from the input range.
    Returns:
        true if the character is the "stop" character.
    +/
    bool isStopChar(char c) @safe @nogc pure {
        if (mir.deser.text.tokens.isStopChar(c)) { // make sure
            return true;
        }

        if (c == '/') {
            const(char) c2 = peekOne();
            if (c2 == '/' || c2 == '*') {
                return true;
            }
        }

        return false;
    }

    /+
    Helper to generate a thrown exception (if an unexpected character is hit)

    $(NOTE `c == 0` is reported as an unexpected EOF rather than an unexpected character.)
    +/
    void unexpectedChar(char c, size_t pos = -1, string file = __FILE__, int line = __LINE__) @safe @nogc pure {
        // Prefer a GC-allocated exception when @nogc allocation of exceptions compiles (e.g. with dip1008)
        static if (__traits(compiles, ()@nogc { throw new Exception(""); }))
            throw new IonTokenizerException(c ? IonTokenizerErrorCode.unexpectedCharacter : IonTokenizerErrorCode.unexpectedEOF, file, line);
        else
            throw ionTokenizerException(c ? IonTokenizerErrorCode.unexpectedCharacter : IonTokenizerErrorCode.unexpectedEOF, /+file, line+/);
    }

    /+
    Helper to throw if an unexpected end-of-file is hit.
    +/
    void unexpectedEOF(size_t pos = -1, string file = __FILE__, int line = __LINE__) @safe @nogc pure {
        if (pos == -1)
            pos = this.position;
        unexpectedChar(0, pos, file, line);
    }

    /+
    Ensure that the next item in the range fulfills the predicate given.
    Params:
        pred = A predicate that the next character in the range must fulfill
    Throws:
        [IonTokenizerException] if the predicate is not fulfilled
    +/
    template expect(alias pred = "a", bool noRead = false) {
        import mir.functional : naryFun;
        static if (noRead) {
            char expect(char c, string file = __FILE__, int line = __LINE__) @trusted @nogc pure {
                if (!naryFun!pred(c)) {
                    unexpectedChar(c, -1, file, line);
                }

                return c;
            }
        } else {
            char expect(string file = __FILE__, int line = __LINE__) @trusted @nogc pure {
                char c = readInput();
                if (!naryFun!pred(c)) {
                    unexpectedChar(c, -1, file, line);
                }

                return c;
            }
        }
    }
    // Test expect()
    version(mir_ion_parser_test) unittest
    {
        import mir.deser.text.tokens : IonTokenizerException, isHexDigit;

        void testIsHex(string ts) {
            auto t = tokenizeString(ts);
            while (!t.isEOF) {
                import std.exception : assertNotThrown;
                assertNotThrown!IonTokenizerException(t.expect!(isHexDigit));
            }
        }

        void testFailHex(string ts) {
            auto t = tokenizeString(ts);
            while (!t.isEOF) {
                import std.exception : assertThrown;
                assertThrown!IonTokenizerException(t.expect!(isHexDigit));
            }
        }

        testIsHex("1231231231");
        testIsHex("BADBAB3");
        testIsHex("F00BAD");
        testIsHex("420");
        testIsHex("41414141");
        testIsHex("BADF00D");
        testIsHex("BaDf00D");
        testIsHex("badf00d");
        testIsHex("AbCdEf123");

        testFailHex("HIWORLT");
        testFailHex("Tst");
    }

    /+
    Ensure that the next item in the range does NOT fulfill the predicate given.

    This is the opposite of `expect` - which expects that the predicate is fulfilled.
    However, for all intents and purposes, the functionality of `expectFalse` is identical to `expect`.
    Params:
        pred = A predicate that the next character in the range must NOT fulfill.
    Throws:
        [IonTokenizerException] if the predicate is fulfilled.
    +/
    template expectFalse(alias pred = "a", bool noRead = false, string file = __FILE__, int line = __LINE__) {
        import mir.functional : naryFun;
        static if (noRead) {
            char expectFalse(char c) @trusted @nogc pure {
                if (naryFun!pred(c)) {
                    unexpectedChar(c, -1, file, line);
                }

                return c;
            }
        } else {
            char expectFalse() @trusted @nogc pure {
                char c = readInput();
                if (naryFun!pred(c)) {
                    unexpectedChar(c, -1, file, line);
                }

                return c;
            }
        }
    }
}

/+
Generic helper to verify the functionality of the parsing code in unit-tests

Reads one character via `t.readInput()` and throws a MirError if it does not
match `expected`.
+/
void testRead(T)(ref T t, char expected, string file = __FILE__, int line = __LINE__) {
    import mir.exception : MirError;
    char v = t.readInput();
    if (v != expected) {
        import mir.format : stringBuf, print;
        auto buf = stringBuf;
        buf.print("Expected ", expected, " but got ", v);
        throw new MirError(buf.data, file, line);
    }
}

/+
Generic helper to verify the functionality of the parsing code in unit-tests

Peeks one character via `t.peekOne()` (without consuming it) and throws a
MirError if it does not match `expected`.
+/
void testPeek(T)(ref T t, char expected, string file = __FILE__, int line = __LINE__) {
    import mir.exception : MirError;
    char v = t.peekOne();
    if (v != expected) {
        import mir.format : stringBuf, print;
        auto buf = stringBuf;
        buf.print("Expected ", expected, " but got ", v);
        throw new MirError(buf.data, file, line);
    }
}