| /* | 
 |     Kickstart is a coarse-grained "filter" engine that finds likely matches | 
 |     to be verified by full-blown matcher. | 
 | */ | 
 | module std.regex.internal.kickstart; | 
 |  | 
 | package(std.regex): | 
 |  | 
 | import std.range.primitives, std.utf; | 
 | import std.regex.internal.ir; | 
 |  | 
 | //utility for shiftOr, returns a minimum number of bytes to test in a Char | 
 | uint effectiveSize(Char)() | 
 | { | 
 |     static if (is(Char == char)) | 
 |         return 1; | 
 |     else static if (is(Char == wchar)) | 
 |         return 2; | 
 |     else static if (is(Char == dchar)) | 
 |         return 3; | 
 |     else | 
 |         static assert(0); | 
 | } | 
 |  | 
 | /* | 
 |     Kickstart engine using ShiftOr algorithm, | 
 |     a bit parallel technique for inexact string searching. | 
 | */ | 
 | struct ShiftOr(Char) | 
 | { | 
 | private: | 
 |     uint[] table; | 
 |     uint fChar; | 
 |     uint n_length; | 
 |     enum charSize =  effectiveSize!Char(); | 
 |     //maximum number of chars in CodepointSet to process | 
 |     enum uint charsetThreshold = 32_000; | 
 |     static struct ShiftThread | 
 |     { | 
 |         uint[] tab; | 
 |         uint mask; | 
 |         uint idx; | 
 |         uint pc, counter, hops; | 
 |         this(uint newPc, uint newCounter, uint[] table) | 
 |         { | 
 |             pc = newPc; | 
 |             counter = newCounter; | 
 |             mask = 1; | 
 |             idx = 0; | 
 |             hops = 0; | 
 |             tab = table; | 
 |         } | 
 |  | 
 |         void setMask(uint idx, uint mask) | 
 |         { | 
 |             tab[idx] |= mask; | 
 |         } | 
 |  | 
 |         void setInvMask(uint idx, uint mask) | 
 |         { | 
 |             tab[idx] &= ~mask; | 
 |         } | 
 |  | 
 |         void set(alias setBits = setInvMask)(dchar ch) | 
 |         { | 
 |             static if (charSize == 3) | 
 |             { | 
 |                 uint val = ch, tmask = mask; | 
 |                 setBits(val&0xFF, tmask); | 
 |                 tmask <<= 1; | 
 |                 val >>= 8; | 
 |                 setBits(val&0xFF, tmask); | 
 |                 tmask <<= 1; | 
 |                 val >>= 8; | 
 |                 assert(val <= 0x10); | 
 |                 setBits(val, tmask); | 
 |                 tmask <<= 1; | 
 |             } | 
 |             else | 
 |             { | 
 |                 Char[dchar.sizeof/Char.sizeof] buf; | 
 |                 uint tmask = mask; | 
 |                 size_t total = encode(buf, ch); | 
 |                 for (size_t i = 0; i < total; i++, tmask<<=1) | 
 |                 { | 
 |                     static if (charSize == 1) | 
 |                         setBits(buf[i], tmask); | 
 |                     else static if (charSize == 2) | 
 |                     { | 
 |                         setBits(buf[i]&0xFF, tmask); | 
 |                         tmask <<= 1; | 
 |                         setBits(buf[i]>>8, tmask); | 
 |                     } | 
 |                 } | 
 |             } | 
 |         } | 
 |         void add(dchar ch){ return set!setInvMask(ch); } | 
 |         void advance(uint s) | 
 |         { | 
 |             mask <<= s; | 
 |             idx += s; | 
 |         } | 
 |         @property bool full(){    return !mask; } | 
 |     } | 
 |  | 
 |     static ShiftThread fork(ShiftThread t, uint newPc, uint newCounter) | 
 |     { | 
 |         ShiftThread nt = t; | 
 |         nt.pc = newPc; | 
 |         nt.counter = newCounter; | 
 |         return nt; | 
 |     } | 
 |  | 
 |     @trusted static ShiftThread fetch(ref ShiftThread[] worklist) | 
 |     { | 
 |         auto t = worklist[$-1]; | 
 |         worklist.length -= 1; | 
 |         if (!__ctfe) | 
 |             cast(void) worklist.assumeSafeAppend(); | 
 |         return t; | 
 |     } | 
 |  | 
 |     static uint charLen(uint ch) | 
 |     { | 
 |         assert(ch <= 0x10FFFF); | 
 |         return codeLength!Char(cast(dchar) ch)*charSize; | 
 |     } | 
 |  | 
 | public: | 
 |     @trusted this(ref Regex!Char re, uint[] memory) | 
 |     { | 
 |         static import std.algorithm.comparison; | 
 |         import std.algorithm.searching : countUntil; | 
 |         import std.conv : text; | 
 |         import std.range : assumeSorted; | 
 |         assert(memory.length == 256); | 
 |         fChar = uint.max; | 
 |         // FNV-1a flavored hash (uses 32bits at a time) | 
 |         ulong hash(uint[] tab) | 
 |         { | 
 |             ulong h = 0xcbf29ce484222325; | 
 |             foreach (v; tab) | 
 |             { | 
 |                 h ^= v; | 
 |                 h *= 0x100000001b3; | 
 |             } | 
 |             return h; | 
 |         } | 
 |     L_FindChar: | 
 |         for (size_t i = 0;;) | 
 |         { | 
 |             switch (re.ir[i].code) | 
 |             { | 
 |                 case IR.Char: | 
 |                     fChar = re.ir[i].data; | 
 |                     static if (charSize != 3) | 
 |                     { | 
 |                         Char[dchar.sizeof/Char.sizeof] buf; | 
 |                         encode(buf, fChar); | 
 |                         fChar = buf[0]; | 
 |                     } | 
 |                     fChar = fChar & 0xFF; | 
 |                     break L_FindChar; | 
 |                 case IR.GroupStart, IR.GroupEnd: | 
 |                     i += IRL!(IR.GroupStart); | 
 |                     break; | 
 |                 case IR.Bof, IR.Bol, IR.Wordboundary, IR.Notwordboundary: | 
 |                     i += IRL!(IR.Bol); | 
 |                     break; | 
 |                 default: | 
 |                     break L_FindChar; | 
 |             } | 
 |         } | 
 |         table = memory; | 
 |         table[] =  uint.max; | 
 |         alias MergeTab = bool[ulong]; | 
 |         // use reasonably complex hash to identify equivalent tables | 
 |         auto merge = new MergeTab[re.hotspotTableSize]; | 
 |         ShiftThread[] trs; | 
 |         ShiftThread t = ShiftThread(0, 0, table); | 
 |         //locate first fixed char if any | 
 |         n_length = 32; | 
 |         for (;;) | 
 |         { | 
 |         L_Eval_Thread: | 
 |             for (;;) | 
 |             { | 
 |                 switch (re.ir[t.pc].code) | 
 |                 { | 
 |                 case IR.Char: | 
 |                     uint s = charLen(re.ir[t.pc].data); | 
 |                     if (t.idx+s > n_length) | 
 |                         goto L_StopThread; | 
 |                     t.add(re.ir[t.pc].data); | 
 |                     t.advance(s); | 
 |                     t.pc += IRL!(IR.Char); | 
 |                     break; | 
 |                 case IR.OrChar://assumes IRL!(OrChar) == 1 | 
 |                     uint len = re.ir[t.pc].sequence; | 
 |                     uint end = t.pc + len; | 
 |                     uint[Bytecode.maxSequence] s; | 
 |                     uint numS; | 
 |                     for (uint i = 0; i < len; i++) | 
 |                     { | 
 |                         auto x = charLen(re.ir[t.pc+i].data); | 
 |                         if (countUntil(s[0 .. numS], x) < 0) | 
 |                            s[numS++] = x; | 
 |                     } | 
 |                     for (uint i = t.pc; i < end; i++) | 
 |                     { | 
 |                         t.add(re.ir[i].data); | 
 |                     } | 
 |                     for (uint i = 0; i < numS; i++) | 
 |                     { | 
 |                         auto tx = fork(t, t.pc + len, t.counter); | 
 |                         if (tx.idx + s[i] <= n_length) | 
 |                         { | 
 |                             tx.advance(s[i]); | 
 |                             trs ~= tx; | 
 |                         } | 
 |                     } | 
 |                     if (!trs.empty) | 
 |                         t = fetch(trs); | 
 |                     else | 
 |                         goto L_StopThread; | 
 |                     break; | 
 |                 case IR.CodepointSet: | 
 |                 case IR.Trie: | 
 |                     auto set = re.charsets[re.ir[t.pc].data]; | 
 |                     uint[4] s; | 
 |                     uint numS; | 
 |                     static if (charSize == 3) | 
 |                     { | 
 |                         s[0] = charSize; | 
 |                         numS = 1; | 
 |                     } | 
 |                     else | 
 |                     { | 
 |  | 
 |                         static if (charSize == 1) | 
 |                             static immutable codeBounds = [0x0, 0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF]; | 
 |                         else //== 2 | 
 |                             static immutable codeBounds = [0x0, 0xFFFF, 0x10000, 0x10FFFF]; | 
 |                         uint[] arr = new uint[set.byInterval.length * 2]; | 
 |                         size_t ofs = 0; | 
 |                         foreach (ival; set.byInterval) | 
 |                         { | 
 |                             arr[ofs++] = ival.a; | 
 |                             arr[ofs++] = ival.b; | 
 |                         } | 
 |                         auto srange = assumeSorted!"a <= b"(arr); | 
 |                         for (uint i = 0; i < codeBounds.length/2; i++) | 
 |                         { | 
 |                             auto start = srange.lowerBound(codeBounds[2*i]).length; | 
 |                             auto end = srange.lowerBound(codeBounds[2*i+1]).length; | 
 |                             if (end > start || (end == start && (end & 1))) | 
 |                                s[numS++] = (i+1)*charSize; | 
 |                         } | 
 |                     } | 
 |                     if (numS == 0 || t.idx + s[numS-1] > n_length) | 
 |                         goto L_StopThread; | 
 |                     auto  chars = set.length; | 
 |                     if (chars > charsetThreshold) | 
 |                         goto L_StopThread; | 
 |                     foreach (ch; set.byCodepoint) | 
 |                     { | 
 |                         //avoid surrogate pairs | 
 |                         if (0xD800 <= ch && ch <= 0xDFFF) | 
 |                             continue; | 
 |                         t.add(ch); | 
 |                     } | 
 |                     for (uint i = 0; i < numS; i++) | 
 |                     { | 
 |                         auto tx =  fork(t, t.pc + IRL!(IR.CodepointSet), t.counter); | 
 |                         tx.advance(s[i]); | 
 |                         trs ~= tx; | 
 |                     } | 
 |                     if (!trs.empty) | 
 |                         t = fetch(trs); | 
 |                     else | 
 |                         goto L_StopThread; | 
 |                     break; | 
 |                 case IR.Any: | 
 |                     goto L_StopThread; | 
 |  | 
 |                 case IR.GotoEndOr: | 
 |                     t.pc += IRL!(IR.GotoEndOr)+re.ir[t.pc].data; | 
 |                     assert(re.ir[t.pc].code == IR.OrEnd); | 
 |                     goto case; | 
 |                 case IR.OrEnd: | 
 |                     auto slot = re.ir[t.pc+1].raw+t.counter; | 
 |                     auto val = hash(t.tab); | 
 |                     if (val in merge[slot]) | 
 |                         goto L_StopThread; // merge equivalent | 
 |                     merge[slot][val] = true; | 
 |                     t.pc += IRL!(IR.OrEnd); | 
 |                     break; | 
 |                 case IR.OrStart: | 
 |                     t.pc += IRL!(IR.OrStart); | 
 |                     goto case; | 
 |                 case IR.Option: | 
 |                     uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option); | 
 |                     //queue next Option | 
 |                     if (re.ir[next].code == IR.Option) | 
 |                     { | 
 |                         trs ~= fork(t, next, t.counter); | 
 |                     } | 
 |                     t.pc += IRL!(IR.Option); | 
 |                     break; | 
 |                 case IR.RepeatStart:case IR.RepeatQStart: | 
 |                     t.pc += IRL!(IR.RepeatStart)+re.ir[t.pc].data; | 
 |                     goto case IR.RepeatEnd; | 
 |                 case IR.RepeatEnd: | 
 |                 case IR.RepeatQEnd: | 
 |                     auto slot = re.ir[t.pc+1].raw+t.counter; | 
 |                     auto val = hash(t.tab); | 
 |                     if (val in merge[slot]) | 
 |                         goto L_StopThread; // merge equivalent | 
 |                     merge[slot][val] = true; | 
 |                     uint len = re.ir[t.pc].data; | 
 |                     uint step = re.ir[t.pc+2].raw; | 
 |                     uint min = re.ir[t.pc+3].raw; | 
 |                     if (t.counter < min) | 
 |                     { | 
 |                         t.counter += step; | 
 |                         t.pc -= len; | 
 |                         break; | 
 |                     } | 
 |                     uint max = re.ir[t.pc+4].raw; | 
 |                     if (t.counter < max) | 
 |                     { | 
 |                         trs ~= fork(t, t.pc - len, t.counter + step); | 
 |                         t.counter = t.counter%step; | 
 |                         t.pc += IRL!(IR.RepeatEnd); | 
 |                     } | 
 |                     else | 
 |                     { | 
 |                         t.counter = t.counter%step; | 
 |                         t.pc += IRL!(IR.RepeatEnd); | 
 |                     } | 
 |                     break; | 
 |                 case IR.InfiniteStart, IR.InfiniteQStart: | 
 |                     t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); | 
 |                     goto case IR.InfiniteEnd; //both Q and non-Q | 
 |                 case IR.InfiniteEnd: | 
 |                 case IR.InfiniteQEnd: | 
 |                     auto slot = re.ir[t.pc+1].raw+t.counter; | 
 |                     auto val = hash(t.tab); | 
 |                     if (val in merge[slot]) | 
 |                         goto L_StopThread; // merge equivalent | 
 |                     merge[slot][val] = true; | 
 |                     uint len = re.ir[t.pc].data; | 
 |                     uint pc1, pc2; //branches to take in priority order | 
 |                     if (++t.hops == 32) | 
 |                         goto L_StopThread; | 
 |                     pc1 = t.pc + IRL!(IR.InfiniteEnd); | 
 |                     pc2 = t.pc - len; | 
 |                     trs ~= fork(t, pc2, t.counter); | 
 |                     t.pc = pc1; | 
 |                     break; | 
 |                 case IR.GroupStart, IR.GroupEnd: | 
 |                     t.pc += IRL!(IR.GroupStart); | 
 |                     break; | 
 |                 case IR.Bof, IR.Bol, IR.Wordboundary, IR.Notwordboundary: | 
 |                     t.pc += IRL!(IR.Bol); | 
 |                     break; | 
 |                 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: | 
 |                     t.pc += IRL!(IR.LookaheadStart) + IRL!(IR.LookaheadEnd) + re.ir[t.pc].data; | 
 |                     break; | 
 |                 default: | 
 |                 L_StopThread: | 
 |                     assert(re.ir[t.pc].code >= 0x80, text(re.ir[t.pc].code)); | 
 |                     debug (fred_search) writeln("ShiftOr stumbled on ",re.ir[t.pc].mnemonic); | 
 |                     n_length = std.algorithm.comparison.min(t.idx, n_length); | 
 |                     break L_Eval_Thread; | 
 |                 } | 
 |             } | 
 |             if (trs.empty) | 
 |                 break; | 
 |             t = fetch(trs); | 
 |         } | 
 |         debug(std_regex_search) | 
 |         { | 
 |             writeln("Min length: ", n_length); | 
 |         } | 
 |     } | 
 |  | 
 |     @property bool empty() const {  return n_length == 0; } | 
 |  | 
 |     @property uint length() const{ return n_length/charSize; } | 
 |  | 
 |     // lookup compatible bit pattern in haystack, return starting index | 
 |     // has a useful trait: if supplied with valid UTF indexes, | 
 |     // returns only valid UTF indexes | 
 |     // (that given the haystack in question is valid UTF string) | 
 |     @trusted size_t search(const(Char)[] haystack, size_t idx) const | 
 |     {//@BUG: apparently assumes little endian machines | 
 |         import core.stdc.string : memchr; | 
 |         import std.conv : text; | 
 |         assert(!empty); | 
 |         auto p = cast(const(ubyte)*)(haystack.ptr+idx); | 
 |         uint state = uint.max; | 
 |         uint limit = 1u<<(n_length - 1u); | 
 |         debug(std_regex_search) writefln("Limit: %32b",limit); | 
 |         if (fChar != uint.max) | 
 |         { | 
 |             const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); | 
 |             const orginalAlign = cast(size_t) p & (Char.sizeof-1); | 
 |             while (p != end) | 
 |             { | 
 |                 if (!~state) | 
 |                 {//speed up seeking first matching place | 
 |                     for (;;) | 
 |                     { | 
 |                         assert(p <= end, text(p," vs ", end)); | 
 |                         p = cast(ubyte*) memchr(p, fChar, end - p); | 
 |                         if (!p) | 
 |                             return haystack.length; | 
 |                         if ((cast(size_t) p & (Char.sizeof-1)) == orginalAlign) | 
 |                             break; | 
 |                         if (++p == end) | 
 |                             return haystack.length; | 
 |                     } | 
 |                     state = ~1u; | 
 |                     assert((cast(size_t) p & (Char.sizeof-1)) == orginalAlign); | 
 |                     static if (charSize == 3) | 
 |                     { | 
 |                         state = (state << 1) | table[p[1]]; | 
 |                         state = (state << 1) | table[p[2]]; | 
 |                         p += 4; | 
 |                     } | 
 |                     else | 
 |                         p++; | 
 |                     //first char is tested, see if that's all | 
 |                     if (!(state & limit)) | 
 |                         return (p-cast(ubyte*) haystack.ptr)/Char.sizeof | 
 |                             -length; | 
 |                 } | 
 |                 else | 
 |                 {//have some bits/states for possible matches, | 
 |                  //use the usual shift-or cycle | 
 |                     static if (charSize == 3) | 
 |                     { | 
 |                         state = (state << 1) | table[p[0]]; | 
 |                         state = (state << 1) | table[p[1]]; | 
 |                         state = (state << 1) | table[p[2]]; | 
 |                         p += 4; | 
 |                     } | 
 |                     else | 
 |                     { | 
 |                         state = (state << 1) | table[p[0]]; | 
 |                         p++; | 
 |                     } | 
 |                     if (!(state & limit)) | 
 |                         return (p-cast(ubyte*) haystack.ptr)/Char.sizeof | 
 |                             -length; | 
 |                 } | 
 |                 debug(std_regex_search) writefln("State: %32b", state); | 
 |             } | 
 |         } | 
 |         else | 
 |         { | 
 |             //normal path, partially unrolled for char/wchar | 
 |             static if (charSize == 3) | 
 |             { | 
 |                 const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); | 
 |                 while (p != end) | 
 |                 { | 
 |                     state = (state << 1) | table[p[0]]; | 
 |                     state = (state << 1) | table[p[1]]; | 
 |                     state = (state << 1) | table[p[2]]; | 
 |                     p += 4; | 
 |                     if (!(state & limit))//division rounds down for dchar | 
 |                         return (p-cast(ubyte*) haystack.ptr)/Char.sizeof | 
 |                         -length; | 
 |                 } | 
 |             } | 
 |             else | 
 |             { | 
 |                 auto len = cast(ubyte*)(haystack.ptr + haystack.length) - p; | 
 |                 size_t i  = 0; | 
 |                 if (len & 1) | 
 |                 { | 
 |                     state = (state << 1) | table[p[i++]]; | 
 |                     if (!(state & limit)) | 
 |                         return idx+i/Char.sizeof-length; | 
 |                 } | 
 |                 while (i < len) | 
 |                 { | 
 |                     state = (state << 1) | table[p[i++]]; | 
 |                     if (!(state & limit)) | 
 |                         return idx+i/Char.sizeof | 
 |                             -length; | 
 |                     state = (state << 1) | table[p[i++]]; | 
 |                     if (!(state & limit)) | 
 |                         return idx+i/Char.sizeof | 
 |                             -length; | 
 |                     debug(std_regex_search) writefln("State: %32b", state); | 
 |                 } | 
 |             } | 
 |         } | 
 |         return haystack.length; | 
 |     } | 
 |  | 
 |     @system debug static void dump(uint[] table) | 
 |     {//@@@BUG@@@ writef(ln) is @system | 
 |         import std.stdio : writefln; | 
 |         for (size_t i = 0; i < table.length; i += 4) | 
 |         { | 
 |             writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]); | 
 |         } | 
 |     } | 
 | } | 
 |  | 
 | @system unittest | 
 | { | 
 |     import std.conv, std.regex; | 
 |     @trusted void test_fixed(alias Kick)() | 
 |     { | 
 |         static foreach (i, v; AliasSeq!(char, wchar, dchar)) | 
 |         {{ | 
 |             alias Char = v; | 
 |             alias String = immutable(v)[]; | 
 |             auto r = regex(to!String(`abc$`)); | 
 |             auto kick = Kick!Char(r, new uint[256]); | 
 |             assert(kick.length == 3, text(Kick.stringof," ",v.stringof, " == ", kick.length)); | 
 |             auto r2 = regex(to!String(`(abc){2}a+`)); | 
 |             kick = Kick!Char(r2, new uint[256]); | 
 |             assert(kick.length == 7, text(Kick.stringof,v.stringof," == ", kick.length)); | 
 |             auto r3 = regex(to!String(`\b(a{2}b{3}){2,4}`)); | 
 |             kick = Kick!Char(r3, new uint[256]); | 
 |             assert(kick.length == 10, text(Kick.stringof,v.stringof," == ", kick.length)); | 
 |             auto r4 = regex(to!String(`\ba{2}c\bxyz`)); | 
 |             kick = Kick!Char(r4, new uint[256]); | 
 |             assert(kick.length == 6, text(Kick.stringof,v.stringof, " == ", kick.length)); | 
 |             auto r5 = regex(to!String(`\ba{2}c\b`)); | 
 |             kick = Kick!Char(r5, new uint[256]); | 
 |             size_t x = kick.search("aabaacaa", 0); | 
 |             assert(x == 3, text(Kick.stringof,v.stringof," == ", kick.length)); | 
 |             x = kick.search("aabaacaa", x+1); | 
 |             assert(x == 8, text(Kick.stringof,v.stringof," == ", kick.length)); | 
 |         }} | 
 |     } | 
 |     @trusted void test_flex(alias Kick)() | 
 |     { | 
 |         static foreach (i, v; AliasSeq!(char, wchar, dchar)) | 
 |         {{ | 
 |             alias Char = v; | 
 |             alias String = immutable(v)[]; | 
 |             auto r = regex(to!String(`abc[a-z]`)); | 
 |             auto kick = Kick!Char(r, new uint[256]); | 
 |             auto x = kick.search(to!String("abbabca"), 0); | 
 |             assert(x == 3, text("real x is ", x, " ",v.stringof)); | 
 |  | 
 |             auto r2 = regex(to!String(`(ax|bd|cdy)`)); | 
 |             String s2 = to!String("abdcdyabax"); | 
 |             kick = Kick!Char(r2, new uint[256]); | 
 |             x = kick.search(s2, 0); | 
 |             assert(x == 1, text("real x is ", x)); | 
 |             x = kick.search(s2, x+1); | 
 |             assert(x == 3, text("real x is ", x)); | 
 |             x = kick.search(s2, x+1); | 
 |             assert(x == 8, text("real x is ", x)); | 
 |             auto rdot = regex(to!String(`...`)); | 
 |             kick = Kick!Char(rdot, new uint[256]); | 
 |             assert(kick.length == 0); | 
 |             auto rN = regex(to!String(`a(b+|c+)x`)); | 
 |             kick = Kick!Char(rN, new uint[256]); | 
 |             assert(kick.length == 3, to!string(kick.length)); | 
 |             assert(kick.search("ababx",0) == 2); | 
 |             assert(kick.search("abaacba",0) == 3);//expected inexact | 
 |  | 
 |         }} | 
 |     } | 
 |     test_fixed!(ShiftOr)(); | 
 |     test_flex!(ShiftOr)(); | 
 | } | 
 |  | 
 | alias Kickstart = ShiftOr; |