1 /++
2 +/
3 module mir.ion.symbol_table;
4 
5 import mir.utility: min, max, swap, _expect;
6 import mir.internal.memory;
7 
8 // hash - string hash
9 // key - string start position
10 // value - symbol id
11 
12 /++
13 Each version of the Ion specification defines the corresponding system symbol table version.
14 Ion 1.0 uses the `"$ion"` symbol table, version 1,
15 and future versions of Ion will use larger versions of the `"$ion"` symbol table.
16 `$ion_1_1` will probably use version 2, while `$ion_2_0` might use version 5.
17 
18 Applications and users should never have to care about these symbol table versions,
19 since they are never explicit in user data: this specification disallows (by ignoring) imports named `"$ion"`.
20 
21 Here are the system symbols for Ion 1.0.
22 +/
23 static immutable string[] IonSystemSymbolTable_v1 = [
24     "$0",
25     "$ion",
26     "$ion_1_0",
27     "$ion_symbol_table",
28     "name",
29     "version",
30     "imports",
31     "symbols",
32     "max_id",
33     "$ion_shared_symbol_table",
34 ];
35 
36 /++
37 +/
38 enum IonSystemSymbol : ubyte
39 {
40     ///
41     zero,
42     ///
43     ion,
44     ///
45     ion_1_0,
46     ///
47     ion_symbol_table,
48     ///
49     name,
50     ///
51     version_,
52     ///
53     imports,
54     ///
55     symbols,
56     ///
57     max_id,
58     ///
59     ion_shared_symbol_table,
60 }
61 
62 package(mir) string[] removeSystemSymbols(const(string)[] keys) @safe pure nothrow
63 {
64     string[] ret;
65     F: foreach (key; keys) switch(key)
66     {
67         static foreach (skey; IonSystemSymbolTable_v1)
68         {
69             case skey: continue F;
70         }
71         default:
72             ret ~= key;
73     }
74     return ret;
75 }
76 
77 struct IonSymbolTableSequental
78 {
79     import mir.ser.ion: IonSerializer;
80     import mir.ndslice.slice;
81     import core.stdc.string;
82     import core.stdc.stdio;
83 
84     static struct Entry
85     {
86         ulong* data;
87         uint[] ids;
88     }
89 
90     ulong[] temporalStorage;
91     Entry[] entries;
92     uint nextID = IonSystemSymbol.max + 1;
93     IonSerializer!(1024, null, false) serializer = void;
94     enum size_t annotationWrapperState = 0;
95     enum size_t annotationsState = 5;
96     enum size_t structState = 5;
97     enum size_t listState = 9;
98 
99 @trusted pure nothrow @nogc:
100 
101     // $ion_symbol_table::
102     // {
103     //     symbols:[ ... ]
104     // }
105     void initialize()(size_t n = 64)
106     {
107         pragma(inline, true);
108         auto llen = n / ulong.sizeof + (n % ulong.sizeof != 0);
109         auto temporalStoragePtr = cast(ulong*) malloc(llen * ulong.sizeof);
110         this.temporalStorage = temporalStoragePtr[0 .. llen];
111         auto entriesPtr = cast(Entry*) malloc(n * Entry.sizeof);
112         this.entries = entriesPtr[0 .. n];
113         this.entries[] = Entry.init;
114         this.nextID = IonSystemSymbol.max + 1;
115         this.serializer.initializeNoTable;
116 
117         auto annotationWrapperState = serializer.annotationWrapperBegin;
118         assert(annotationWrapperState == this.annotationWrapperState);
119         serializer.putAnnotationId(IonSystemSymbol.ion_symbol_table);
120         auto annotationsState = serializer.annotationsEnd(annotationWrapperState);
121         assert(annotationsState == this.annotationsState);
122         auto structState = serializer.structBegin();
123         assert(structState == this.structState);
124         serializer.putKeyId(IonSystemSymbol.symbols);
125         auto listState = serializer.listBegin();
126         assert(listState == this.listState);
127     }
128 
129     void finalize()()
130     {
131         pragma(inline, true);
132         if (nextID > IonSystemSymbol.max + 1)
133         {
134             serializer.listEnd(listState);
135             serializer.structEnd(structState);
136             serializer.annotationWrapperEnd(annotationsState, annotationWrapperState);
137         }
138         else
139         {
140             serializer.buffer._currentLength = 0;
141         }
142 
143         temporalStorage.ptr.free;
144         foreach(ref e; entries)
145         {
146             e.data.free;
147             e.ids.ptr.free;
148         }
149         entries.ptr.free;
150     }
151 
152     uint insert()(scope const(char)[] str)
153     {
154         pragma(inline, true);
155         auto n = str.length;
156         auto llen = n / ulong.sizeof + (n % ulong.sizeof != 0);
157         if (_expect(n >= entries.length, false))
158         {
159             auto oldLength = entries.length;
160             auto temporalStoragePtr = cast(ulong*) realloc(temporalStorage.ptr, llen * ulong.sizeof);
161             this.temporalStorage = temporalStoragePtr[0 .. llen];
162             this.temporalStorage.ptr[0] = 0;
163             auto entriesPtr = cast(Entry*) realloc(entries.ptr, (n + 1) * Entry.sizeof);
164             this.entries = entriesPtr[0 .. n + 1];
165             this.entries[oldLength .. $] = Entry.init;
166         }
167         temporalStorage.ptr[0] = 0;
168         memcpy(cast(ubyte*)(temporalStorage.ptr + llen) - str.length, str.ptr, str.length);
169 
170         // {
171         //     auto tempPtr0 = cast(ubyte*)(temporalStorage.ptr + llen) - str.length;
172         //     foreach (i; 0 .. str.length)
173         //         tempPtr0[i] = str[i];
174         // }
175 
176         with(entries[n])
177         {
178             if (_expect(ids.length == 0, false))
179             {
180                 auto idsPtr = cast(uint*)malloc(uint.sizeof);
181                 ids = idsPtr[0 .. 1];
182                 if (llen)
183                 {
184                     data = cast(ulong*) malloc(ulong.sizeof * llen);
185                     memcpy(data, temporalStorage.ptr, llen * ulong.sizeof);
186                 }
187                 goto R;
188             }
189             if (llen == 0)
190                 return ids[0];
191             {
192                 sizediff_t i = ids.length - 1;
193                 L: do
194                 {
195                     sizediff_t j = llen - 1;
196                     auto datai = data + i * llen;
197                     for(;;)
198                     {
199                         if (datai[j] == temporalStorage[j])
200                         {
201                             if (--j >= 0)
202                                 continue;
203                             return ids.ptr[i];
204                         }
205                         break;
206                     }
207                 }
208                 while (--i >= 0);
209             }
210 
211             {
212                 auto idsPtr = cast(uint*)realloc(ids.ptr, (ids.length + 1) * uint.sizeof);
213                 ids = idsPtr[0 .. ids.length + 1];
214                 data = cast(ulong*) realloc(data, ids.length * ulong.sizeof * llen);
215                 memcpy(data + llen * (ids.length - 1), temporalStorage.ptr, llen * ulong.sizeof);
216             }
217         R:
218             serializer.putValue(str);
219             return ids.ptr[ids.length - 1] = nextID++;
220         }
221     }
222 }
223 
224 unittest
225 {
226     import mir.test;
227     import mir.ion.conv: text2ion;
228 
229     {
230         IonSymbolTableSequental symbolTable = void;
231         symbolTable.initialize;
232         symbolTable.finalize;
233         symbolTable.serializer.data.should == [];
234     }
235 
236 
237     {
238         IonSymbolTableSequental symbolTable = void;
239         symbolTable.initialize;
240         symbolTable.insert(`id`);
241         symbolTable.finalize;
242         symbolTable.serializer.data.should == [0xE8, 0x81, 0x83, 0xD5, 0x87, 0xB3, 0x82, 0x69, 0x64];
243     }
244 
245     {
246         IonSymbolTableSequental symbolTable = void;
247         symbolTable.initialize;
248         symbolTable.insert(`id`);
249         symbolTable.insert(`id`);
250         symbolTable.insert(`id`);
251         symbolTable.insert(`id`);
252         symbolTable.finalize;
253         symbolTable.serializer.data.should == [0xE8, 0x81, 0x83, 0xD5, 0x87, 0xB3, 0x82, 0x69, 0x64];
254     }
255 
256     {
257         IonSymbolTableSequental symbolTable = void;
258         symbolTable.initialize;
259         auto d = symbolTable.insert(`id`);
260         auto D = symbolTable.insert(`qwertyuioplkjhgfdszxcvbnm`);
261         assert(symbolTable.insert(`id`) == d);
262         assert(symbolTable.insert(`qwertyuioplkjhgfdszxcvbnm`) == D);
263         symbolTable.finalize;
264         symbolTable.serializer.data.should == [0xEE, 0xA5, 0x81, 0x83, 0xDE, 0xA1, 0x87, 0xBE, 0x9E, 0x82, 0x69, 0x64, 0x8E, 0x99, 0x71, 0x77, 0x65, 0x72, 0x74, 0x79, 0x75, 0x69, 0x6F, 0x70, 0x6C, 0x6B, 0x6A, 0x68, 0x67, 0x66, 0x64, 0x73, 0x7A, 0x78, 0x63, 0x76, 0x62, 0x6E, 0x6D];
265     }
266 
267     {
268         IonSymbolTableSequental symbolTable = void;
269         symbolTable.initialize;
270         auto d = symbolTable.insert(`id`);
271         assert(symbolTable.insert(`qwertyuioOlkjhgfdszxcvbnm`) == d + 1);
272         assert(symbolTable.insert(`ID`) == d + 2);
273         assert(symbolTable.insert(`qwertyuioplkjhgfdszxcvbnm`) == d + 3);
274         symbolTable.finalize;
275         symbolTable.serializer.data.should == `[id, qwertyuioOlkjhgfdszxcvbnm, ID, qwertyuioplkjhgfdszxcvbnm]`.text2ion[4 .. $ - 9];
276     }
277 }
278 
279 
280 /++
281 +/
282 struct IonSymbolTable(bool gc)
283 {
284     private static align(16) struct Entry
285     {
286         int probeCount = -1;
287         uint hash;
288         uint keyPosition;
289         uint keyLength;
290         uint value;
291 
292     @safe pure nothrow @nogc @property:
293 
294         bool empty() const scope
295         {
296             return probeCount < 0;
297         }
298     }
299 
300     enum double maxLoadFactor = 0.8;
301     enum uint initialMaxProbe = 8;
302     enum uint initialLength = 1 << initialMaxProbe;
303 
304     Entry* entries;
305     uint nextKeyPosition;
306     uint lengthMinusOne = initialLength - 1;
307     uint maxProbe = initialMaxProbe;
308     uint elementCount;
309     uint startId = IonSystemSymbol.max + 1;
310     ubyte[] keySpace;
311 
312     static if (!gc)
313     {
314         Entry[initialLength + initialMaxProbe] initialStackSpace = void;
315         ubyte[8192] initialKeysSpace = void;
316     }
317 
318     import mir.ion.type_code: IonTypeCode;
319     import mir.ion.tape: ionPut, ionPutEnd, ionPutVarUInt, ionPutAnnotationsListEnd, ionPutStartLength, ionPutAnnotationsListStartLength;
320 
321     @disable this(this);
322 
323 pure nothrow:
324 
325     static if (!gc)
326     ~this() @trusted
327     {
328         if (entries != initialStackSpace.ptr)
329             free(entries);
330         if (keySpace.ptr != initialKeysSpace.ptr)
331             free(keySpace.ptr);
332         entries = null;
333         keySpace = null;
334     }
335 
336     ///
337     bool initialized() @property
338     {
339         return keySpace.length != 0;
340     }
341 
342     ///
343     void initializeNull() @property
344     {
345         entries = null;
346         nextKeyPosition = 0;
347         lengthMinusOne = initialLength - 1;
348         maxProbe = initialMaxProbe;
349         elementCount = 0;
350         startId = IonSystemSymbol.max + 1;
351         keySpace = null;
352     }
353 
354     ///
355     void initialize() scope
356     {
357         initializeNull;
358         static if (gc)
359         {
360             entries = new Entry[initialLength + initialMaxProbe].ptr;
361             keySpace = new ubyte[1024];
362         }
363         else
364         {
365             initialStackSpace[] = Entry.init;
366             entries = initialStackSpace.ptr;
367             keySpace = initialKeysSpace[];
368         }
369 
370         assert(nextKeyPosition == 0);
371         nextKeyPosition += ionPutStartLength; // annotation object
372         nextKeyPosition += ionPutVarUInt(keySpace.ptr + nextKeyPosition, ubyte(1u));
373         nextKeyPosition += ionPutVarUInt(keySpace.ptr + nextKeyPosition, ubyte(IonSystemSymbol.ion_symbol_table));
374         assert(nextKeyPosition == 5);
375         nextKeyPosition += ionPutStartLength; // object
376         nextKeyPosition += ionPutVarUInt(keySpace.ptr + nextKeyPosition, IonSystemSymbol.symbols);
377         assert(nextKeyPosition == 9);
378         nextKeyPosition += ionPutStartLength; // symbol array
379         assert(nextKeyPosition == unfinilizedFirstKeyPosition);
380     }
381 
382     package enum unfinilizedFirstKeyPosition = 12;
383 
384     const(ubyte)[] unfinilizedKeysData() scope const {
385         return keySpace[unfinilizedFirstKeyPosition .. nextKeyPosition];
386     }
387 
388     /++
389     Prepare the table for writing.
390     The table shouldn't be used after that.
391     +/
392     void finalize() @trusted
393     {
394         if (nextKeyPosition == unfinilizedFirstKeyPosition)
395         {
396             nextKeyPosition = 0;
397             return;
398         }
399         {
400             auto shift = 9;
401             auto length = nextKeyPosition - (shift + ionPutStartLength);
402             nextKeyPosition = cast(uint)(shift + ionPutEnd(keySpace.ptr + shift, IonTypeCode.list, length));
403         }
404 
405         {
406             auto shift = 5;
407             auto length = nextKeyPosition - (shift + ionPutStartLength);
408             nextKeyPosition = cast(uint)(shift + ionPutEnd(keySpace.ptr + shift, IonTypeCode.struct_, length));
409         }
410 
411         {
412             auto shift = 0;
413             auto length = nextKeyPosition - (shift + ionPutStartLength);
414             nextKeyPosition = cast(uint)(shift + ionPutEnd(keySpace.ptr + shift, IonTypeCode.annotations, length));
415         }
416     }
417 
418     inout(ubyte)[] data() inout @property
419     {
420         return keySpace[0 .. nextKeyPosition];
421     }
422 
423     private inout(Entry)[] currentEntries() inout @property
424     {
425         return entries[0 .. lengthMinusOne + 1 + maxProbe];
426     }
427 
428     private void grow()
429     {
430         pragma(inline, false);
431         auto currentEntries = this.currentEntries[0 .. $-1];
432 
433         lengthMinusOne = lengthMinusOne * 2 + 1;
434         maxProbe++;
435 
436         static if (gc)
437         {
438             entries = new Entry[lengthMinusOne + 1 + maxProbe].ptr;
439         }
440         else
441         {
442             entries = cast(Entry*)malloc((lengthMinusOne + 1 + maxProbe) * Entry.sizeof);
443             if (entries is null)
444                 assert(0);
445         }
446 
447         this.currentEntries[] = Entry.init;
448         this.currentEntries[$ - 1].probeCount = 0;
449 
450         foreach (i, ref entry; currentEntries)
451         {
452             if (!entry.empty)
453             {
454                 auto current = entries + (entry.hash & lengthMinusOne);
455                 int probeCount;
456 
457                 while (current[probeCount].probeCount >= probeCount)
458                     probeCount++;
459 
460                 assert (elementCount + 1 < (lengthMinusOne + 1) * maxLoadFactor && probeCount < maxProbe);
461 
462                 entry.probeCount = probeCount;
463                 current += entry.probeCount;
464                 if (current.empty)
465                 {
466                     *current = entry;
467                     continue;
468                 }
469                 if (entry.probeCount <= current.probeCount)
470                 {
471             L:
472                     do {
473                         entry.probeCount++;
474                         current++;
475                     }
476                     while (entry.probeCount <= current.probeCount);
477                 }
478 
479                 assert (current <= entries + lengthMinusOne + maxProbe);
480 
481                 swap(*current, entry);
482                 if (!entry.empty)
483                     goto L;
484             }
485         }
486 
487         static if (!gc)
488         {
489             if (currentEntries.ptr != initialStackSpace.ptr)
490                 free(currentEntries.ptr);
491         }
492     }
493 
494     ///
495     uint insert(scope const(char)[] key)
496     {
497         pragma(inline, true);
498         uint ret = insert(key, cast(uint)dlang_hll_murmurhash(key));
499         return ret;
500     }
501 
502     ///
503     uint insert(scope const(char)[] key, uint hash)
504     {
505     L0:
506         pragma(inline, true);
507         auto current = entries + (hash & lengthMinusOne);
508         int probeCount;
509         for (;;)
510         {
511             if (current[probeCount].probeCount < probeCount)
512                 break;
513             probeCount++;
514             if (hash != current[probeCount - 1].hash)
515                 continue;
516             auto pos = current[probeCount - 1].keyPosition;
517             auto len = current[probeCount - 1].keyLength;
518             if (key == keySpace[pos .. pos + len])
519                 return current[probeCount - 1].value;
520         }
521 
522         if (_expect(elementCount + 1 > (lengthMinusOne + 1) * maxLoadFactor || probeCount == maxProbe, false))
523         {
524             grow();
525             goto L0;
526         }
527 
528         // add key
529         if (_expect(nextKeyPosition + key.length + 16 > keySpace.length, false))
530         {
531             auto newLength = max(nextKeyPosition + key.length + 16, keySpace.length * 2);
532             static if (gc)
533             {
534                 keySpace.length = newLength;
535             }
536             else
537             {
538                 if (keySpace.ptr == initialKeysSpace.ptr)
539                 {
540                     import core.stdc.string: memcpy;
541                     keySpace = (cast(ubyte*)malloc(newLength))[0 .. newLength];
542                     if (keySpace.ptr is null)
543                         assert(0);
544                     memcpy(keySpace.ptr, initialKeysSpace.ptr, initialKeysSpace.length);
545                 }
546                 else
547                 {
548                     if (keySpace.length == 0)
549                         assert(0);
550                     keySpace = (cast(ubyte*)realloc(keySpace.ptr, newLength))[0 .. newLength];
551                     if (keySpace.ptr is null)
552                         assert(0);
553                 }
554             }
555         }
556         nextKeyPosition += cast(uint) ionPut(keySpace.ptr + nextKeyPosition, key);
557 
558         Entry entry;
559         entry.probeCount = probeCount;
560         entry.hash = hash;
561         entry.value = elementCount++ + startId;
562         entry.keyPosition = nextKeyPosition - cast(uint)key.length;
563         entry.keyLength = cast(uint)key.length;
564         current += entry.probeCount;
565 
566         auto ret = entry.value;
567 
568         if (current.empty)
569         {
570             *current = entry;
571             return ret;
572         }
573     L1:
574         if (entry.probeCount <= current.probeCount)
575         {
576     L2:
577             do {
578                 entry.probeCount++;
579                 current++;
580             }
581             while (entry.probeCount <= current.probeCount);
582         }
583         if (_expect(current < entries + lengthMinusOne + maxProbe, true))
584         {
585             swap(*current, entry);
586             if (!entry.empty)
587                 goto L2;
588             return ret;
589         }
590         grow();
591         current = entries + (entry.hash & lengthMinusOne);
592         entry.probeCount = 0;
593         for (;;)
594         {
595             if (current.probeCount < entry.probeCount)
596                 break;
597             current++;
598             entry.probeCount++;
599         }
600         goto L1;
601     }
602 }
603 
604 version(mir_ion_test) unittest
605 {
606     IonSymbolTable!true table;
607     table.initialize;
608 
609     foreach (i; 0 .. 20)
610     {
611         auto id = table.insert("a key a bit larger then 14");
612         assert(id == IonSystemSymbol.max + 1);
613     }
614 
615     table.finalize;
616 }
617 
618 package(mir) auto findKey()(const string[] symbolTable, string key)
619 {
620     import mir.algorithm.iteration: findIndex;
621     auto ret = symbolTable.findIndex!(a => a == key);
622     assert(ret != size_t.max, "Missing key: " ~ key);
623     return ret;
624 }
625 
626 private:
627 
628 pragma(inline, true)
629 uint dlang_hll_murmurhash(scope const(char)[] data)
630     @trusted pure nothrow @nogc
631 {
632     if (__ctfe)
633         return cast(uint) hashOf(data);
634     return murmur3_32(cast(const ubyte*)data.ptr, data.length);
635 }
636 
637 
638 pragma(inline, true)
639     @safe pure nothrow @nogc
640 static uint murmur_32_scramble(uint k) {
641     k *= 0xcc9e2d51;
642     k = (k << 15) | (k >> 17);
643     k *= 0x1b873593;
644     return k;
645 }
646 pragma(inline, true)
647     @trusted pure nothrow @nogc
648 uint murmur3_32(const(ubyte)* key, size_t len, uint seed = 0)
649 {
650 	uint h = seed;
651     uint k;
652     /* Read in groups of 4. */
653     for (size_t i = len >> 2; i; i--) {
654         import core.stdc.string;
655         // Here is a source of differing results across endiannesses.
656         // A swap here has no effects on hash properties though.
657         k = *cast(const uint*) key;
658         key += 4;
659         h ^= murmur_32_scramble(k);
660         h = (h << 13) | (h >> 19);
661         h = h * 5 + 0xe6546b64;
662     }
663     /* Read the rest. */
664     k = 0;
665     for (size_t i = len & 3; i; i--) {
666         k <<= 8;
667         k |= key[i - 1];
668     }
669     // A swap is *not* necessary here because the preceding loop already
670     // places the low bytes in the low places according to whatever endianness
671     // we use. Swaps only apply when the memory is copied in a chunk.
672     h ^= murmur_32_scramble(k);
673     /* Finalize. */
674 	h ^= len;
675 	h ^= h >> 16;
676 	h *= 0x85ebca6b;
677 	h ^= h >> 13;
678 	h *= 0xc2b2ae35;
679 	h ^= h >> 16;
680 	return h;
681 }