1 /+
2 Token definitions for parsing Ion Text.
3 
4 Authors: Harrison Ford
5 +/
6 module mir.deser.text.tokens;
7 import mir.ion.type_code : IonTypeCode;
/+
Ion Token Types

Every kind of token that can be reported while scanning Ion text.
The enum is backed by `ubyte`. `TokenInvalid` is the first member, so it is
the zero value and the `.init` value of the type.

`ionTokenMsg` indexes its message table with these values, so the order of
the members here must stay in sync with that table.
+/
enum IonTokenType : ubyte 
{
    /+ Invalid token (also the default / zero value) +/
    TokenInvalid,

    /+ EOF: end of the input +/
    TokenEOF,

    /+ numbers +/
    TokenNumber,

    /+ Binary literal: 0b[01]+ +/
    TokenBinary,

    /+ Hex literal: 0x[0-9a-fA-F]+ +/
    TokenHex,

    /+ Positive infinity, spelled "+inf" +/
    TokenFloatInf,

    /+ Negative infinity, spelled "-inf" +/
    TokenFloatMinusInf,

    /+ Not-a-number, spelled "nan" +/
    TokenFloatNaN,

    /+
    2020-01-01T00:00:00.000Z

    All timestamps *must* be compliant to ISO-8601
    +/
    TokenTimestamp,

    /+ Unquoted symbol, e.g. an identifier such as [a-zA-Z_]+ +/
    TokenSymbol,

    /+ Symbol wrapped in single quotes: '[^']+' +/
    TokenSymbolQuoted,

    /+ Symbol made of operator characters (presumably those in ION_OPERATOR_CHARS) +/
    TokenSymbolOperator,

    /+ String wrapped in double quotes: "[^"]+" +/
    TokenString,

    /+ Long string wrapped in triple single quotes: '''[^']+''' +/
    TokenLongString,

    /+ [.] +/
    TokenDot,

    /+ [,] +/
    TokenComma,

    /+ : +/
    TokenColon,

    /+ :: +/
    TokenDoubleColon,

    /+ ( +/
    TokenOpenParen,

    /+ ) +/
    TokenCloseParen,

    /+ { +/
    TokenOpenBrace,

    /+ } +/
    TokenCloseBrace,

    /+ [ +/
    TokenOpenBracket,

    /+ ] +/
    TokenCloseBracket,

    /+ {{ +/ 
    TokenOpenDoubleBrace,

    /+ }} +/ 
    TokenCloseDoubleBrace
}
///
version(mir_ion_test) unittest 
{
    // The invalid token has to be the default value of the type, so that a
    // token that was never assigned reads as invalid.
    static assert(IonTokenType.init == IonTokenType.TokenInvalid);
    static assert(IonTokenType.TokenInvalid == 0);
    // Every real token, starting with EOF, is non-zero.
    static assert(0 < IonTokenType.TokenEOF);
}
102 
/+
Get a stringified version of a token type.

Token classes are rendered as a placeholder in angle brackets (for example
"<number>"); punctuation tokens are rendered as their literal text (for
example "::").

Params:
    token = $(LREF IonTokenType) to describe
Returns:
    Stringified version of the token
+/

string ionTokenMsg(IonTokenType token) @property
@safe pure nothrow @nogc
{
    // One entry per IonTokenType member, in declaration order.
    // The trailing "<error>" entry has no matching member in the enum.
    static immutable string[] messages = [
        "<invalid>", "<EOF>",
        "<number>", "<binary>", "<hex>",
        "+inf", "-inf", "nan",
        "<timestamp>",
        "<symbol>", "<quoted-symbol>", "<operator>",
        "<string>", "<long-string>",
        ".", ",", ":", "::",
        "(", ")",
        "{", "}",
        "[", "]",
        "{{", "}}",
        "<error>"
    ];

    // Rebase on the smallest enum value so the lookup does not rely on the
    // enum starting at zero.
    immutable index = token - IonTokenType.min;
    return messages[index];
}
///
@safe pure nothrow @nogc
version(mir_ion_test) unittest
{
    // Check the last and the first enum member, i.e. both ends of the
    // range of real tokens in the message table.
    static assert(ionTokenMsg(IonTokenType.TokenCloseDoubleBrace) == "}}");
    static assert(ionTokenMsg(IonTokenType.TokenInvalid) == "<invalid>");
}
152 
/+
All valid Ion operator characters.

This is the set tested by `isOperatorChar`.
+/
static immutable ION_OPERATOR_CHARS = ['!', '#', '%', '&', '*', '+', '-', '.', '/', ';', '<', '=',
        '>', '?', '@', '^', '`', '|', '~'];

/+
All characters that Ion considers to be whitespace:
space, horizontal tab, line feed and carriage return.
+/
static immutable ION_WHITESPACE = [' ', '\t', '\n', '\r'];

/+
All characters that Ion considers to be the end of a token (stop chars):
NUL, the bracket characters, the comma, both quote characters,
and every whitespace character from ION_WHITESPACE.
+/
static immutable ION_STOP_CHARS = ['\0', '{', '}', '[', ']', '(', ')', ',', '"', '\''] ~ ION_WHITESPACE;

/+
All valid digits within Ion (0-9)
+/
static immutable ION_DIGITS = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];

/+
All valid hex digits within Ion ([a-fA-F0-9]):
the decimal digits followed by the lowercase and uppercase letters a-f.
+/
static immutable ION_HEX_DIGITS = ION_DIGITS ~ ['a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F'];

/+
All valid lowercase letters within Ion (a-z)
+/
static immutable ION_LOWERCASE = 
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'];

/+
All valid uppercase letters within Ion (A-Z)
+/
static immutable ION_UPPERCASE =
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];

/+
All valid characters which can be the beginning of an identifier (a-zA-Z_$)
+/
static immutable ION_IDENTIFIER_START_CHARS = ION_LOWERCASE ~ ION_UPPERCASE ~ ['_', '$'];

/+
All symbols which must be surrounded by quotes:
the empty symbol and the texts "null", "true", "false" and "nan".
`symbolNeedsQuotes` checks against this list first.
+/
static immutable ION_QUOTED_SYMBOLS = ["", "null", "true", "false", "nan"];

/+
Carriage-Return followed by Line-Feed.

Built from ION_CR and ION_LF, which are declared further down.
Unlike the character tables above, these three are `ubyte[]`.
+/
static immutable ubyte[] ION_CR_LF = ION_CR ~ ION_LF;

/+
Carriage-Return (a single '\r' byte)
+/
static immutable ubyte[] ION_CR = ['\r'];

/+
Line-Feed (a single '\n' byte)
+/
static immutable ubyte[] ION_LF = ['\n'];
215 
/+
Test whether a character is an Ion digit.
Params:
    c = The character to check
Returns:
    true if `c` is one of the characters listed in ION_DIGITS, false otherwise.
+/
bool isDigit(char c) @safe @nogc pure nothrow {
    switch (c) {
        // One case label is generated per entry of the digit table.
        static foreach (digit; ION_DIGITS) {
            case digit: return true;
        }
        default:
            return false;
    }
}
229 
/+
Test whether a character is an Ion hex digit.
Params:
    c = The character to check
Returns:
    true if `c` is one of the characters listed in ION_HEX_DIGITS
    (0-9, a-f, A-F), false otherwise.
+/
bool isHexDigit(char c) @safe @nogc pure nothrow {
    foreach (immutable candidate; ION_HEX_DIGITS)
        if (candidate == c)
            return true;
    return false;
}
243 
/+
Test whether a character may begin an Ion identifier.
Params:
    c = The character to check
Returns:
    true if `c` is one of the characters listed in ION_IDENTIFIER_START_CHARS
    (a letter, '_' or '$'), false otherwise.
+/
bool isIdentifierStart(char c) @safe @nogc pure nothrow {
    switch (c) {
        // One case label is generated per allowed starting character.
        static foreach (starter; ION_IDENTIFIER_START_CHARS) {
            case starter: return true;
        }
        default:
            return false;
    }
}
257 
/+
Test whether a character may appear inside an Ion identifier.

Any character that may start an identifier qualifies, and so does any digit.
Params:
    c = The character to check
Returns:
    true if the character is considered by Ion to be a valid part of an identifier.
+/
bool isIdentifierPart(char c) @safe @nogc pure nothrow {
    if (isDigit(c))
        return true;
    return isIdentifierStart(c);
}
268 
/+
Test whether a character is an Ion symbol operator character.
Params:
    c = The character to check
Returns:
    true if `c` is one of the characters listed in ION_OPERATOR_CHARS, false otherwise.
+/
bool isOperatorChar(char c) @safe @nogc pure nothrow {
    foreach (immutable op; ION_OPERATOR_CHARS)
        if (op == c)
            return true;
    return false;
}
282 
/+
Test whether a character is an Ion "stop" character, i.e. one that ends a token.
Params:
    c = The character to check
Returns:
    true if `c` is one of the characters listed in ION_STOP_CHARS, false otherwise.
+/
bool isStopChar(char c) @safe @nogc pure nothrow {
    switch (c) {
        // One case label is generated per stop character.
        static foreach (stopper; ION_STOP_CHARS) {
            case stopper: return true;
        }
        default:
            return false;
    }
}
297 
/+
Test whether a character is Ion whitespace.
Params:
    c = The character to check
Returns:
    true if `c` is one of the characters listed in ION_WHITESPACE
    (space, tab, line feed, carriage return), false otherwise.
+/
bool isWhitespace(char c) @safe @nogc pure nothrow {
    foreach (immutable blank; ION_WHITESPACE)
        if (blank == c)
            return true;
    return false;
}
311 
/+
Decide whether a symbol has to be written with surrounding quotes.

A symbol needs quotes when it is one of the texts in ION_QUOTED_SYMBOLS
(which includes the empty symbol), when its first character cannot start an
identifier, or when any of its characters cannot be part of an identifier.
Params:
    symbol = The symbol to check
Returns:
    true if the symbol requires quotes around it.
+/
bool symbolNeedsQuotes(scope const(char)[] symbol) @safe @nogc pure nothrow {
    // Reserved texts can never be written bare. The empty symbol is in this
    // list, so everything below only ever sees a non-empty symbol.
    foreach (reserved; ION_QUOTED_SYMBOLS) {
        if (symbol == reserved)
            return true;
    }

    foreach (position, char ch; symbol) {
        // The first character has the stricter rule (no digits allowed).
        immutable allowed = position == 0 ? isIdentifierStart(ch) : isIdentifierPart(ch);
        if (!allowed)
            return true;
    }
    return false;
}
331 
/+
Check if a symbol found has quotes surrounding it.

A symbol counts as quoted only when it both starts and ends with a single
quote and those are two distinct characters. Input shorter than two
characters (the empty slice, or a lone quote character) is therefore never
considered quoted; it cannot contain an opening and a closing quote.
Params:
    symbol = The symbol to check
Returns:
    true if the symbol has quotes surrounding it.
+/
bool symbolHasQuotes(const(char)[] symbol) @safe @nogc pure nothrow {
    // Guard first: indexing an empty slice would raise a range error, and a
    // single "'" would otherwise match as both the opening and closing quote.
    if (symbol.length < 2)
        return false;
    return symbol[0] == '\'' && symbol[$ - 1] == '\'';
}
344 
/+
Test whether a character is a new-line character.

Params:
    c = The character to check
Returns:
    true if `c` is a line feed (0x0A) or a carriage return (0x0D).
+/
bool isNewLine(char c) @safe @nogc pure nothrow {
    switch (c) {
        case '\n', '\r':
            return true;
        default:
            return false;
    }
}
356 
/+
Test whether a character is printable whitespace within a string.

Params:
    c = The character to check
Returns:
    true if `c` is a horizontal tab (0x09), a vertical tab (0x0B)
    or a form feed (0x0C).
+/
bool isStringWhitespace(char c) @safe @nogc pure nothrow {
    switch (c) {
        case '\t', '\v', '\f':
            return true;
        default:
            return false;
    }
}
368 
/+
Test whether a character is a control character.

Params:
    c = The character to check
Returns:
    true if `c` is below 0x20 or is 0x7F (DEL).
+/
bool isControlChar(char c) @safe @nogc pure nothrow {
    enum char firstNonControl = 0x20;
    enum char del = 0x7F;

    if (c == del)
        return true;
    return c < firstNonControl;
}
380 
/+
Test whether a character is within the valid ASCII range (0x00 - 0x7F)

Params:
    c = The character to check
Returns:
    true if the value of `c` does not exceed 0x7F.
+/
bool isASCIIChar(char c) @safe @nogc pure nothrow {
    // A char is 8 bits wide, so "at most 0x7F" is the same as
    // "the top bit is clear".
    return (c & 0x80) == 0;
}
392 
/+
Test whether a character is invalid (non-printable).

A character is invalid when it is a control character that is neither
string whitespace (see isStringWhitespace) nor a new-line (see isNewLine).
Params:
    c = The character to check
Returns:
    true if a character is invalid, false otherwise
+/
bool isInvalidChar(char c) @safe @nogc pure nothrow {
    immutable tolerated = isStringWhitespace(c) || isNewLine(c);
    return !tolerated && isControlChar(c);
}
405 
/+
Convert a character that represents a hex digit into its numeric value.

This is to convert a hex-literal as fast as possible.
Params:
    c = a hex character ('0'-'9', 'a'-'f' or 'A'-'F')
Returns:
    The value of the digit (0 through 15), carried in a `char`.
Throws:
    The tokenizer exception for `IonTokenizerErrorCode.invalidHexLiteral`
    when `c` is not a hex digit.
+/
char hexLiteral(char c) @safe @nogc pure {
    switch (c) {
        case '0': .. case '9':
            // Distance from the first digit is the digit's value.
            return cast(char)(c - ION_DIGITS[0]);
        case 'a': .. case 'f':
            // Letters continue after 9, so 'a' maps to 10.
            return cast(char)(10 + (c - ION_LOWERCASE[0]));
        case 'A': .. case 'F':
            return cast(char)(10 + (c - ION_UPPERCASE[0]));
        default:
            throw IonTokenizerErrorCode.invalidHexLiteral.ionTokenizerException;
    }
}
419 
/+
Represents Ion Text token read from the tokenizer.

This is a mixin template: mixing it into a struct gives that struct the two
fields below. The token structs in this module are built this way.
+/
mixin template IonTextToken() {
    /+
    What text did we match while scanning for this token?
    +/
    const(char)[] matchedText;
    /+
    Where in the input stream did we match it?
    +/
    size_t matchedIndex;
}
433 
/+
For Ion Text Tokens which allow escape characters, what type is it? 
(either a hex '\xFF', or a Unicode '\uXXXX' '\UXXXXXXXX')

`Hex` is the first member and therefore the default value.
+/
enum IonTextEscapeType {
    // Hex escape, written '\xFF'
    Hex,
    // Unicode escape, written '\uXXXX' or '\UXXXXXXXX'
    UTF
}
442 
/+
Represents Ion Text tokens which can have escape characters.

Mixes in IonTextToken (matched text and index) and adds the escape-related
fields below.
+/
mixin template IonTextWithEscapeToken() {
    mixin IonTextToken;
    /+
    Have we reached the end of the token, or is there more to be read?
    +/
    bool isFinal;
    /+
    Is this token only an escape sequence (and nothing else?)
    +/
    bool isEscapeSequence;
    /+
    If this is an escape sequence, what type is it?
    +/
    IonTextEscapeType escapeSequenceType;
}
461 
/+
The Ion Text Number token.

Carries only the fields of IonTextToken (matched text and its index).
+/
struct IonTextNumber {
    mixin IonTextToken;
}
468 
/+
The Ion Text Timestamp token.

Carries only the fields of IonTextToken (matched text and its index).
+/
struct IonTextTimestamp {
    mixin IonTextToken;
}
475 
/+
The Ion Text Blob token.

Carries only the fields of IonTextToken (matched text and its index).
+/
struct IonTextBlob {
    mixin IonTextToken;
}
482 
/+
The Ion Text Symbol token.

Carries only the fields of IonTextToken (matched text and its index).
+/
struct IonTextSymbol {
    mixin IonTextToken;
}
489 
/+
The Ion Text Symbol (with quotes surrounding it) token.

Uses IonTextWithEscapeToken, so besides the matched text and index it also
carries the escape-sequence fields.
+/
struct IonTextQuotedSymbol {
    mixin IonTextWithEscapeToken;
}
496 
/+
The Ion Text Symbol Operator token.

Carries only the fields of IonTextToken (matched text and its index).
+/
struct IonTextSymbolOperator {
    mixin IonTextToken;
}
503 
/+
The Ion Text String token. Can represent a long string as well.

Uses IonTextWithEscapeToken, so besides the matched text and index it also
carries the escape-sequence fields.
+/
struct IonTextString {
    mixin IonTextWithEscapeToken;
    /+
    Is this a long string?
    +/
    bool isLongString;
    /+
    Is this long string a normalized new line?
    +/
    bool isNormalizedNewLine;
}
518 
/+
The Ion Text Clob token. Can represent a long clob as well.

Uses IonTextWithEscapeToken, so besides the matched text and index it also
carries the escape-sequence fields.
+/
struct IonTextClob {
    mixin IonTextWithEscapeToken;
    /+
    Is this a long clob?
    +/
    bool isLongClob;
}
529 
530 version(D_Exceptions):
531 import mir.ion.exception;
532 
/+
All possible exceptions within the deserializer.

`none` is the first member and means "no error". `ionDeserializerMsg` and
`ionDeserializerException` index their tables with these values, so the
member order must stay in sync with the message table.
+/
enum IonDeserializerErrorCode {
    none,
    unexpectedState,
    unexpectedToken,
    unexpectedDecimalValue,
    twoHandlersState,
    nestedAnnotations,
    requiresQuotes,
    invalidNullType,
    unexpectedEOF,
    unimplemented,
    invalidBase64Length
};
549 
/+
Get the message text for a deserializer error code.
Params:
    error = $(LREF IonDeserializerErrorCode) to describe
Returns:
    The message for the code; `null` for `IonDeserializerErrorCode.none`.
+/
string ionDeserializerMsg(IonDeserializerErrorCode error) @property
@safe pure nothrow @nogc
{
    // One entry per IonDeserializerErrorCode member, in declaration order.
    static immutable string[] messages = [
        null,
        "unexpected state",
        "unexpected token",
        "unexpected decimal value",
        "two handlers for one state is not supported",
        "nested annotations are not supported",
        "keyword requires quotes when used as field name / annotation",
        "invalid null type specified",
        "unexpected end of file",
        "feature unimplemented",
        "invalid base64 length (maybe missing padding?)",
    ];

    // Rebase on the smallest enum value so the lookup does not rely on the
    // enum starting at zero.
    immutable index = error - IonDeserializerErrorCode.min;
    return messages[index];
}
569 
/+
Mir Ion Text Deserializer Exception

Derives from IonException and offers three ways to construct it: from an
error code, from a message, and from a message with the chained throwable
placed before the file/line arguments.
+/
class IonDeserializerException : IonException
{
    /// Construct from an error code; the message is looked up with `ionDeserializerMsg`.
    this(
        IonDeserializerErrorCode code,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable next = null) pure nothrow @nogc @safe 
    {
        super(code.ionDeserializerMsg, file, line, next);
    }

    /// Construct from an explicit message.
    this(
        string msg,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable next = null) pure nothrow @nogc @safe 
    {
        super(msg, file, line, next);
    }

    /// Construct from a message and a chained throwable; forwards to the (msg, file, line, next) constructor.
    this(
        string msg,
        Throwable next,
        string file = __FILE__,
        size_t line = __LINE__,
        ) pure nothrow @nogc @safe 
    {
        this(msg, file, line, next);
    }
}
606 
/+
Get the exception object that belongs to a deserializer error code.

The exceptions live in a static immutable table with one slot per
IonDeserializerErrorCode member, so nothing is allocated when this is called.
The slot for a code that converts to false (i.e. `none`) holds `null`.
Params:
    code = The error code to look up
Returns:
    The table entry for `code`, passed through `unqualException`
    (presumably to strip the `immutable` qualifier; see mir.ion.exception).
+/
///
IonDeserializerException ionDeserializerException(IonDeserializerErrorCode code) @safe pure nothrow @nogc
{
    import mir.array.allocation: array;
    import mir.ndslice.topology: map;
    import std.traits: EnumMembers;

    static immutable IonDeserializerException[] exceptions =
        [EnumMembers!IonDeserializerErrorCode]
        .map!(member => member ? new immutable IonDeserializerException(member) : null)
        .array;

    // Rebase on the smallest enum value so the lookup does not rely on the
    // enum starting at zero.
    immutable slot = code - IonDeserializerErrorCode.min;
    return unqualException(exceptions[slot]);
}
620 
/+
All possible exceptions within the tokenizer.

`none` is the first member and means "no error". `ionTokenizerMsg` and
`ionTokenizerException` index their tables with these values, so the member
order must stay in sync with the message table.
+/
enum IonTokenizerErrorCode {
    none,
    unexpectedEOF,
    invalidHexLiteral,
    unexpectedCharacter,
    negativeTimestamp,
    commentsNotAllowed,
    normalizeEOFFail,
    cannotUnreadAtPos0,
    invalidHexEscape,
    invalidLeadingZeros,
    cannotUpdateWindow,
    encodingSurrogateCode,
    encodingInvalidCode,
    cannotSkipWhitespace,
    cannotSkipLongString,
    expectedValidLeader,
    invalidTimestampOffset,
}
643 
/+
Get the message text for a tokenizer error code.
Params:
    error = $(LREF IonTokenizerErrorCode) to describe
Returns:
    The message for the code; `null` for `IonTokenizerErrorCode.none`.
+/
string ionTokenizerMsg(IonTokenizerErrorCode error) @property
@safe pure nothrow @nogc
{
    // One entry per IonTokenizerErrorCode member, in declaration order.
    static immutable string[] errors = [
        null,
        "unexpected EOF",
        "invalid hex literal",
        "unexpected character",
        "encountered negative timestamp",
        "encountered unexpected comment",
        "could not normalize EOF",
        // Message for cannotUnreadAtPos0. It used to read "pos >= 0", which
        // describes a condition that always holds and contradicts the code name.
        "cannot unread at position 0",
        "invalid hex escape",
        "invalid leading zeros in integer literal",
        "cannot update sliding window",
        "encoding a surrogate code point in UTF-8",
        "encoding an invalid code point in UTF-8",
        "could not skip over whitespace",
        "could not skip to end of long string",
        "expected a valid digit leader",
        "invalid timestamp offset",
    ];

    // Rebase on the smallest enum value so the lookup does not rely on the
    // enum starting at zero.
    return errors[error - IonTokenizerErrorCode.min];
}
669 
/+
Mir Ion Tokenizer Exception

Derives from IonException and offers three ways to construct it: from an
error code, from a message, and from a message with the chained throwable
placed before the file/line arguments.
+/
class IonTokenizerException : IonException
{
    /// Construct from an error code; the message is looked up with `ionTokenizerMsg`.
    this(
        IonTokenizerErrorCode code,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable next = null) pure nothrow @nogc @safe 
    {
        super(code.ionTokenizerMsg, file, line, next);
    }

    /// Construct from an explicit message.
    this(
        string msg,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable next = null) pure nothrow @nogc @safe 
    {
        super(msg, file, line, next);
    }

    /// Construct from a message and a chained throwable; forwards to the (msg, file, line, next) constructor.
    this(
        string msg,
        Throwable next,
        string file = __FILE__,
        size_t line = __LINE__,
        ) pure nothrow @nogc @safe 
    {
        this(msg, file, line, next);
    }
}
706 
/+
Get the exception object that belongs to a tokenizer error code.

The exceptions live in a static immutable table with one slot per
IonTokenizerErrorCode member, so nothing is allocated when this is called.
The slot for a code that converts to false (i.e. `none`) holds `null`.
Params:
    code = The error code to look up
Returns:
    The table entry for `code`, passed through `unqualException`
    (presumably to strip the `immutable` qualifier; see mir.ion.exception).
+/
///
IonTokenizerException ionTokenizerException(IonTokenizerErrorCode code) @safe pure nothrow @nogc
{
    import mir.array.allocation: array;
    import mir.ndslice.topology: map;
    import std.traits: EnumMembers;

    static immutable IonTokenizerException[] exceptions =
        [EnumMembers!IonTokenizerErrorCode]
        .map!(member => member ? new immutable IonTokenizerException(member) : null)
        .array;

    // The table is built from IonTokenizerErrorCode, so the index has to be
    // rebased on that same enum. The previous code used the minimum of the
    // unrelated IonErrorCode enum, which is only correct if both minimums
    // happen to be equal.
    return unqualException(exceptions[code - IonTokenizerErrorCode.min]);
}