/+
Token definitions for parsing Ion Text.

Authors: Harrison Ford
+/
module mir.deser.text.tokens;
import mir.ion.type_code : IonTypeCode;

/+
Ion Token Types

Each member names one kind of token that can appear in Ion text. Members are
numbered implicitly from zero in declaration order, so `TokenInvalid` is both
`0` and the enum's `.init` value (the unittest below pins this).
+/
enum IonTokenType : ubyte
{
    /+ Invalid token +/
    TokenInvalid,

    /+ EOF +/
    TokenEOF,

    /+ numbers +/
    TokenNumber,

    /+ 0b[01]+ +/
    TokenBinary,

    /+ 0x[0-9a-fA-F]+ +/
    TokenHex,

    /+ +inf +/
    TokenFloatInf,

    /+ -inf +/
    TokenFloatMinusInf,

    /+ nan +/
    TokenFloatNaN,

    /+
    2020-01-01T00:00:00.000Z

    All timestamps *must* be compliant to ISO-8601
    +/
    TokenTimestamp,

    /+ [a-zA-Z_]+ +/
    TokenSymbol,

    /+ '[^']+' +/
    TokenSymbolQuoted,

    /+ [+-/*] +/
    TokenSymbolOperator,

    /+ "[^"]+" +/
    TokenString,

    /+ '''[^']+''' +/
    TokenLongString,

    /+ [.] +/
    TokenDot,

    /+ [,] +/
    TokenComma,

    /+ : +/
    TokenColon,

    /+ :: +/
    TokenDoubleColon,

    /+ ( +/
    TokenOpenParen,

    /+ ) +/
    TokenCloseParen,

    /+ { +/
    TokenOpenBrace,

    /+ } +/
    TokenCloseBrace,

    /+ [ +/
    TokenOpenBracket,

    /+ ] +/
    TokenCloseBracket,

    /+ {{ +/
    TokenOpenDoubleBrace,

    /+ }} +/
    TokenCloseDoubleBrace
}
///
version(mir_ion_test) unittest
{
    // The invalid token must be the zero / default value; every real token is non-zero.
    static assert(IonTokenType.TokenInvalid == IonTokenType.init);
    static assert(!IonTokenType.TokenInvalid);
    static assert(IonTokenType.TokenEOF > 0);
}

/+
Get a stringified version of a token.
Params:
    token = $(LREF IonTokenType)
Returns:
    Stringified version of the token
+/

string ionTokenMsg(IonTokenType token) @property
@safe pure nothrow @nogc
{
    // One entry per `IonTokenType` member, in declaration order. The trailing
    // "<error>" entry has no matching enum member visible in this file.
    static immutable string[] names = [
        "<invalid>",
        "<EOF>",
        "<number>",
        "<binary>",
        "<hex>",
        "+inf",
        "-inf",
        "nan",
        "<timestamp>",
        "<symbol>",
        "<quoted-symbol>",
        "<operator>",
        "<string>",
        "<long-string>",
        ".",
        ",",
        ":",
        "::",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "{{",
        "}}",
        "<error>"
    ];
    // Rebase on the smallest enum value so the table always starts at index 0.
    immutable index = token - IonTokenType.min;
    return names[index];
}
///
@safe pure nothrow @nogc
version(mir_ion_test) unittest
{
    // First and last enum members map to the first and last real table entries.
    static assert(ionTokenMsg(IonTokenType.TokenInvalid) == "<invalid>");
    static assert(ionTokenMsg(IonTokenType.TokenCloseDoubleBrace) == "}}");
}

/+
All valid Ion operator characters.
+/
static immutable ION_OPERATOR_CHARS = [
    '!', '#', '%', '&', '*', '+', '-', '.', '/', ';', '<', '=',
    '>', '?', '@', '^', '`', '|', '~'
];

/+
All characters that Ion considers to be whitespace
+/
static immutable ION_WHITESPACE = [' ', '\t', '\n', '\r'];

/+
All characters that Ion considers to be the end of a token (stop chars).
Every whitespace character is also a stop character.
+/
static immutable ION_STOP_CHARS =
    ['\0', '{', '}', '[', ']', '(', ')', ',', '"', '\''] ~ ION_WHITESPACE;

/+
All valid digits within Ion (0-9)
+/
static immutable ION_DIGITS = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];

/+
All valid hex digits within Ion ([a-fA-F0-9])
+/
static immutable ION_HEX_DIGITS =
    ION_DIGITS ~ ['a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F'];

/+
All valid lowercase letters within Ion
+/
static immutable ION_LOWERCASE = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
];

/+
All valid
uppercase letters within Ion
+/
static immutable ION_UPPERCASE = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
];

/+
All valid characters which can be the beginning of an identifier (a-zA-Z_$)
+/
static immutable ION_IDENTIFIER_START_CHARS = ION_LOWERCASE ~ ION_UPPERCASE ~ ['_', '$'];

/+
All symbols which must be surrounded by quotes.
The empty symbol is part of this list.
+/
static immutable ION_QUOTED_SYMBOLS = ["", "null", "true", "false", "nan"];

/+
Carriage-Return + Line-Feed
+/
static immutable ubyte[] ION_CR_LF = ION_CR ~ ION_LF;

/+
Carriage-Return
+/
static immutable ubyte[] ION_CR = ['\r'];

/+
Line-Feed
+/
static immutable ubyte[] ION_LF = ['\n'];

/+
Check if a character is considered by Ion to be a digit.
Params:
    c = The character to check
Returns:
    true if the character is one of the entries of `ION_DIGITS`.
+/
bool isDigit(char c) @safe @nogc pure nothrow {
    switch (c) {
        // One `case` label is generated per table entry at compile time.
        static foreach (digit; ION_DIGITS) {
            case digit: return true;
        }
        default:
            return false;
    }
}

/+
Check if a character is considered by Ion to be a hex digit.
Params:
    c = The character to check
Returns:
    true if the character is one of the entries of `ION_HEX_DIGITS`.
+/
bool isHexDigit(char c) @safe @nogc pure nothrow {
    switch (c) {
        // One `case` label is generated per table entry at compile time.
        static foreach (hexDigit; ION_HEX_DIGITS) {
            case hexDigit: return true;
        }
        default:
            return false;
    }
}

/+
Check if a character is considered by Ion to be a valid start to an identifier.
Params:
    c = The character to check
Returns:
    true if the character is considered by Ion to be a valid start to an identifier.
+/
bool isIdentifierStart(char c) @safe @nogc pure nothrow {
    switch (c) {
        // One `case` label is generated per entry of ION_IDENTIFIER_START_CHARS.
        static foreach (startChar; ION_IDENTIFIER_START_CHARS) {
            case startChar: return true;
        }
        default:
            return false;
    }
}

/+
Check if a character is considered by Ion to be a valid part of an identifier.
Params:
    c = The character to check
Returns:
    true if the character is a digit or a valid identifier start character.
+/
bool isIdentifierPart(char c) @safe @nogc pure nothrow {
    // Both predicates are pure, so the evaluation order does not matter.
    return isDigit(c) || isIdentifierStart(c);
}

/+
Check if a character is considered by Ion to be a symbol operator character.
Params:
    c = The character to check
Returns:
    true if the character is one of the entries of `ION_OPERATOR_CHARS`.
+/
bool isOperatorChar(char c) @safe @nogc pure nothrow {
    switch (c) {
        static foreach (operatorChar; ION_OPERATOR_CHARS) {
            case operatorChar: return true;
        }
        default:
            return false;
    }
}

/+
Check if a character is considered by Ion to be a "stop" character.
Params:
    c = The character to check
Returns:
    true if the character is one of the entries of `ION_STOP_CHARS`.
+/
bool isStopChar(char c) @safe @nogc pure nothrow {
    switch (c) {
        static foreach (stopChar; ION_STOP_CHARS) {
            case stopChar: return true;
        }
        default:
            return false;
    }
}

/+
Check if a character is considered by Ion to be whitespace.
Params:
    c = The character to check
Returns:
    true if the character is one of the entries of `ION_WHITESPACE`.
+/
bool isWhitespace(char c) @safe @nogc pure nothrow {
    switch (c) {
        static foreach (blank; ION_WHITESPACE) {
            case blank: return true;
        }
        default:
            return false;
    }
}

/+
Check if a symbol found needs to be surrounded in quotes.
Params:
    symbol = The symbol to check
Returns:
    true if the symbol requires quotes around it.
+/
bool symbolNeedsQuotes(scope const(char)[] symbol) @safe @nogc pure nothrow {
    // The empty symbol always needs quotes. It is also listed in
    // ION_QUOTED_SYMBOLS; the explicit guard keeps `symbol[0]` below in bounds
    // even if that table is ever edited.
    if (symbol.length == 0)
        return true;

    // Reserved words must be quoted to be read back as symbols.
    static foreach (member; ION_QUOTED_SYMBOLS) {
        if (symbol == member) return true;
    }

    // Anything that is not a plain identifier must be quoted.
    if (!isIdentifierStart(symbol[0]))
        return true;
    foreach (char c; symbol)
        if (!isIdentifierPart(c))
            return true;
    return false;
}

/+
Check if a symbol found has quotes surrounding it.

A symbol only counts as quoted when it has a distinct opening and closing
single quote, so the empty string and a lone `'` are reported as not quoted.
Params:
    symbol = The symbol to check
Returns:
    true if the symbol has quotes surrounding it.
+/
bool symbolHasQuotes(const(char)[] symbol) @safe @nogc pure nothrow {
    // Fewer than two characters cannot hold both quotes; this also avoids
    // indexing into an empty slice.
    if (symbol.length < 2)
        return false;
    return symbol[0] == '\'' && symbol[$ - 1] == '\'';
}

/+
Check if a character is a new-line character.

Params:
    c = The character to check
Returns:
    true if a character is considered to be a new-line
    (line feed 0x0A or carriage return 0x0D).
+/
bool isNewLine(char c) @safe @nogc pure nothrow {
    switch (c) {
        case 0x0A, 0x0D:
            return true;
        default:
            return false;
    }
}

/+
Check if a character is printable whitespace within a string.

Params:
    c = The character to check
Returns:
    true if a character is considered to be printable whitespace
    (0x09, 0x0B or 0x0C).
+/
bool isStringWhitespace(char c) @safe @nogc pure nothrow {
    switch (c) {
        case 0x09, 0x0B, 0x0C:
            return true;
        default:
            return false;
    }
}

/+
Check if a character is a control character.

Params:
    c = The character to check
Returns:
    true if a character is considered a control character
    (below 0x20, or exactly 0x7F).
+/
bool isControlChar(char c) @safe @nogc pure nothrow {
    return c == 0x7F || c < 0x20;
}

/+
Check if a character is within the valid ASCII range (0x00 - 0x7F)

Params:
    c = The character to check
Returns:
    true if a character is considered to be valid ASCII.
+/
bool isASCIIChar(char c) @safe @nogc pure nothrow {
    // ASCII characters are exactly those with the top bit clear.
    return (c & 0x80) == 0;
}

/+
Check if a character is invalid (non-printable).
Params:
    c = The character to check
Returns:
    true if a character is invalid, false otherwise
+/
bool isInvalidChar(char c) @safe @nogc pure nothrow {
    // A control character is invalid unless it is string whitespace or a new-line.
    return isControlChar(c) && !isStringWhitespace(c) && !isNewLine(c);
}

/+
Convert a character that represents a hex-digit into its actual form.

This is to convert a hex-literal as fast as possible.
Params:
    c = a hex character
Returns:
    The numeric value (0-15) of the hex digit, stored in a `char`.
Throws:
    The `invalidHexLiteral` tokenizer exception if `c` is not in [0-9a-fA-F].
+/
char hexLiteral(char c) @safe @nogc pure {
    if (isDigit(c))
        return cast(char)(c - ION_DIGITS[0]);
    // Letters stand for the values 10 through 15.
    if (c >= 'a' && c <= 'f')
        return cast(char)(c - ION_LOWERCASE[0] + 10);
    if (c >= 'A' && c <= 'F')
        return cast(char)(c - ION_UPPERCASE[0] + 10);
    throw ionTokenizerException(IonTokenizerErrorCode.invalidHexLiteral);
}

/+
Represents Ion Text token read from the tokenizer.
+/
mixin template IonTextToken() {
    /+
    What text did we match while scanning for this token?
    +/
    const(char)[] matchedText;
    /+
    Where in the input stream did we match it?
    +/
    size_t matchedIndex;
}

/+
For Ion Text Tokens which allow escape characters, what type is it?
(either a hex '\xFF', or a Unicode '\uXXXX' '\UXXXXXXXX')
+/
enum IonTextEscapeType {
    Hex,
    UTF
}

/+
Represents Ion Text tokens which can have escape characters.
Includes every field of `IonTextToken`.
+/
mixin template IonTextWithEscapeToken() {
    mixin IonTextToken;
    /+
    Have we reached the end of the token, or is there more to be read?
    +/
    bool isFinal;
    /+
    Is this token only an escape sequence (and nothing else?)
    +/
    bool isEscapeSequence;
    /+
    If this is an escape sequence, what type is it?
    +/
    IonTextEscapeType escapeSequenceType;
}

/+
The Ion Text Number token
+/
struct IonTextNumber {
    mixin IonTextToken;
}

/+
The Ion Text Timestamp token
+/
struct IonTextTimestamp {
    mixin IonTextToken;
}

/+
The Ion Text Blob token
+/
struct IonTextBlob {
    mixin IonTextToken;
}

/+
The Ion Text Symbol token
+/
struct IonTextSymbol {
    mixin IonTextToken;
}

/+
The Ion Text Symbol (with quotes surrounding it) token
+/
struct IonTextQuotedSymbol {
    mixin IonTextWithEscapeToken;
}

/+
The Ion Text Symbol Operator token
+/
struct IonTextSymbolOperator {
    mixin IonTextToken;
}

/+
The Ion Text String token. Can represent a long string as well.
+/
struct IonTextString {
    mixin IonTextWithEscapeToken;
    /+
    Is this a long string?
    +/
    bool isLongString;
    /+
    Is this long string a normalized new line?
    +/
    bool isNormalizedNewLine;
}

/+
The Ion Text Clob token. Can represent a long clob as well.
+/
struct IonTextClob {
    mixin IonTextWithEscapeToken;
    /+
    Is this a long clob?
    +/
    bool isLongClob;
}

// Everything below this line is only compiled when exceptions are available.
version(D_Exceptions):
import mir.ion.exception;

/+
All possible exceptions within the deserializer.
+/
enum IonDeserializerErrorCode {
    none,
    unexpectedState,
    unexpectedToken,
    unexpectedDecimalValue,
    twoHandlersState,
    nestedAnnotations,
    requiresQuotes,
    invalidNullType,
    unexpectedEOF,
    unimplemented,
    invalidBase64Length
}

/+
Get the human-readable message for a deserializer error code.
Params:
    error = $(LREF IonDeserializerErrorCode)
Returns:
    The message for the error code; `null` for `none`.
+/
string ionDeserializerMsg(IonDeserializerErrorCode error) @property
@safe pure nothrow @nogc
{
    // One entry per enum member, in declaration order.
    static immutable string[] errors = [
        null,
        "unexpected state",
        "unexpected token",
        "unexpected decimal value",
        "two handlers for one state is not supported",
        "nested annotations are not supported",
        "keyword requires quotes when used as field name / annotation",
        "invalid null type specified",
        "unexpected end of file",
        "feature unimplemented",
        "invalid base64 length (maybe missing padding?)",
    ];
    // Fail the build, not the lookup, if the enum and the table drift apart.
    static assert(errors.length == IonDeserializerErrorCode.max - IonDeserializerErrorCode.min + 1,
        "every IonDeserializerErrorCode member needs exactly one message");

    return errors[error - IonDeserializerErrorCode.min];
}

/+
Mir Ion Text Deserializer Exception
+/
class IonDeserializerException : IonException
{
    /// Construct from an error code; the message is looked up with `ionDeserializerMsg`.
    this(
        IonDeserializerErrorCode code,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable next = null) pure nothrow @nogc @safe
    {
        super(code.ionDeserializerMsg, file, line, next);
    }

    /// Construct from an explicit message.
    this(
        string msg,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable next = null) pure nothrow @nogc @safe
    {
        super(msg, file, line, next);
    }

    /// Construct from an explicit message and a chained `next` throwable.
    this(
        string msg,
        Throwable next,
        string file = __FILE__,
        size_t line = __LINE__,
        ) pure nothrow @nogc @safe
    {
        this(msg, file, line, next);
    }
}

/+
Get the pre-allocated exception instance for a deserializer error code.

The instances are built once as immutable static data, so nothing is
allocated at the call site.
Params:
    code = $(LREF IonDeserializerErrorCode)
Returns:
    The exception instance for `code`. The table entry for `none` is `null`.
+/
IonDeserializerException ionDeserializerException(IonDeserializerErrorCode code) @safe pure nothrow @nogc
{
    import mir.array.allocation: array;
    import mir.ndslice.topology: map;
    import std.traits: EnumMembers;

    static immutable IonDeserializerException[] exceptions =
        [EnumMembers!IonDeserializerErrorCode]
        .map!(member => member ? new immutable IonDeserializerException(member) : null)
        .array;
    return unqualException(exceptions[code - IonDeserializerErrorCode.min]);
}

/+
All possible exceptions within the tokenizer.
+/
enum IonTokenizerErrorCode {
    none,
    unexpectedEOF,
    invalidHexLiteral,
    unexpectedCharacter,
    negativeTimestamp,
    commentsNotAllowed,
    normalizeEOFFail,
    cannotUnreadAtPos0,
    invalidHexEscape,
    invalidLeadingZeros,
    cannotUpdateWindow,
    encodingSurrogateCode,
    encodingInvalidCode,
    cannotSkipWhitespace,
    cannotSkipLongString,
    expectedValidLeader,
    invalidTimestampOffset,
}

/+
Get the human-readable message for a tokenizer error code.
Params:
    error = $(LREF IonTokenizerErrorCode)
Returns:
    The message for the error code; `null` for `none`.
+/
string ionTokenizerMsg(IonTokenizerErrorCode error) @property
@safe pure nothrow @nogc
{
    // One entry per enum member, in declaration order.
    static immutable string[] errors = [
        null,
        "unexpected EOF",
        "invalid hex literal",
        "unexpected character",
        "encountered negative timestamp",
        "encountered unexpected comment",
        "could not normalize EOF",
        // Was "pos >= 0", which is always true and contradicts `cannotUnreadAtPos0`.
        "cannot unread when pos == 0",
        "invalid hex escape",
        "invalid leading zeros in integer literal",
        "cannot update sliding window",
        "encoding a surrogate code point in UTF-8",
        "encoding an invalid code point in UTF-8",
        "could not skip over whitespace",
        "could not skip to end of long string",
        "expected a valid digit leader",
        "invalid timestamp offset",
    ];
    // Fail the build, not the lookup, if the enum and the table drift apart.
    static assert(errors.length == IonTokenizerErrorCode.max - IonTokenizerErrorCode.min + 1,
        "every IonTokenizerErrorCode member needs exactly one message");

    return errors[error - IonTokenizerErrorCode.min];
}

/+
Mir Ion Tokenizer Exception
+/
class IonTokenizerException : IonException
{
    /// Construct from an error code; the message is looked up with `ionTokenizerMsg`.
    this(
        IonTokenizerErrorCode code,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable next = null) pure nothrow @nogc @safe
    {
        super(code.ionTokenizerMsg, file, line, next);
    }

    /// Construct from an explicit message.
    this(
        string msg,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable next = null) pure nothrow @nogc @safe
    {
        super(msg, file, line, next);
    }

    /// Construct from an explicit message and a chained `next` throwable.
    this(
        string msg,
        Throwable next,
        string file = __FILE__,
        size_t line = __LINE__,
        ) pure nothrow @nogc @safe
    {
        this(msg, file, line, next);
    }
}

/+
Get the pre-allocated exception instance for a tokenizer error code.

The instances are built once as immutable static data, so nothing is
allocated at the call site.
Params:
    code = $(LREF IonTokenizerErrorCode)
Returns:
    The exception instance for `code`. The table entry for `none` is `null`.
+/
IonTokenizerException ionTokenizerException(IonTokenizerErrorCode code) @safe pure nothrow @nogc
{
    import mir.array.allocation: array;
    import mir.ndslice.topology: map;
    import std.traits: EnumMembers;

    static immutable IonTokenizerException[] exceptions =
        [EnumMembers!IonTokenizerErrorCode]
        .map!(member => member ? new immutable IonTokenizerException(member) : null)
        .array;
    // Rebase on this function's own enum; the original used the unrelated
    // `IonErrorCode.min` here.
    return unqualException(exceptions[code - IonTokenizerErrorCode.min]);
}