diff options
Diffstat (limited to 'node_modules/streamsearch/lib')
-rw-r--r-- | node_modules/streamsearch/lib/sbmh.js | 267 |
1 files changed, 267 insertions, 0 deletions
diff --git a/node_modules/streamsearch/lib/sbmh.js b/node_modules/streamsearch/lib/sbmh.js new file mode 100644 index 0000000..510cae2 --- /dev/null +++ b/node_modules/streamsearch/lib/sbmh.js @@ -0,0 +1,267 @@ +'use strict'; +/* + Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation + by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool +*/ +function memcmp(buf1, pos1, buf2, pos2, num) { + for (let i = 0; i < num; ++i) { + if (buf1[pos1 + i] !== buf2[pos2 + i]) + return false; + } + return true; +} + +class SBMH { + constructor(needle, cb) { + if (typeof cb !== 'function') + throw new Error('Missing match callback'); + + if (typeof needle === 'string') + needle = Buffer.from(needle); + else if (!Buffer.isBuffer(needle)) + throw new Error(`Expected Buffer for needle, got ${typeof needle}`); + + const needleLen = needle.length; + + this.maxMatches = Infinity; + this.matches = 0; + + this._cb = cb; + this._lookbehindSize = 0; + this._needle = needle; + this._bufPos = 0; + + this._lookbehind = Buffer.allocUnsafe(needleLen); + + // Initialize occurrence table. + this._occ = [ + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, + needleLen, needleLen, needleLen, needleLen + ]; + + // Populate occurrence table with analysis of the needle, ignoring the last + // letter. + if (needleLen > 1) { + for (let i = 0; i < needleLen - 1; ++i) + this._occ[needle[i]] = needleLen - 1 - i; + } + } + + reset() { + this.matches = 0; + this._lookbehindSize = 0; + this._bufPos = 0; + } + + push(chunk, pos) { + let result; + if (!Buffer.isBuffer(chunk)) + chunk = Buffer.from(chunk, 'latin1'); + const chunkLen = chunk.length; + this._bufPos = pos || 0; + while (result !== chunkLen && this.matches < this.maxMatches) + result = feed(this, chunk); + return result; + } + + destroy() { + const lbSize = this._lookbehindSize; + if (lbSize) + this._cb(false, this._lookbehind, 0, lbSize, false); + this.reset(); + } +} + +function feed(self, data) { + const len = data.length; + const needle = self._needle; + const needleLen = needle.length; + + // Positive: points to a position in `data` + // pos == 3 points to data[3] + // Negative: points to a position in the lookbehind buffer + // pos == -2 points to lookbehind[lookbehindSize - 2] + let pos = -self._lookbehindSize; + const lastNeedleCharPos = needleLen - 1; + const lastNeedleChar = needle[lastNeedleCharPos]; + const end = len - needleLen; + const occ = self._occ; + const lookbehind = self._lookbehind; + + if (pos < 0) { + // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool + // search with character lookup code that considers both the + // lookbehind buffer and the current round's haystack data. + // + // Loop until + // there is a match. + // or until + // we've moved past the position that requires the + // lookbehind buffer. In this case we switch to the + // optimized loop. + // or until + // the character to look at lies outside the haystack. + while (pos < 0 && pos <= end) { + const nextPos = pos + lastNeedleCharPos; + const ch = (nextPos < 0 + ? lookbehind[self._lookbehindSize + nextPos] + : data[nextPos]); + + if (ch === lastNeedleChar + && matchNeedle(self, data, pos, lastNeedleCharPos)) { + self._lookbehindSize = 0; + ++self.matches; + if (pos > -self._lookbehindSize) + self._cb(true, lookbehind, 0, self._lookbehindSize + pos, false); + else + self._cb(true, undefined, 0, 0, true); + + return (self._bufPos = pos + needleLen); + } + + pos += occ[ch]; + } + + // No match. + + // There's too few data for Boyer-Moore-Horspool to run, + // so let's use a different algorithm to skip as much as + // we can. + // Forward pos until + // the trailing part of lookbehind + data + // looks like the beginning of the needle + // or until + // pos == 0 + while (pos < 0 && !matchNeedle(self, data, pos, len - pos)) + ++pos; + + if (pos < 0) { + // Cut off part of the lookbehind buffer that has + // been processed and append the entire haystack + // into it. + const bytesToCutOff = self._lookbehindSize + pos; + + if (bytesToCutOff > 0) { + // The cut off data is guaranteed not to contain the needle. + self._cb(false, lookbehind, 0, bytesToCutOff, false); + } + + self._lookbehindSize -= bytesToCutOff; + lookbehind.copy(lookbehind, 0, bytesToCutOff, self._lookbehindSize); + lookbehind.set(data, self._lookbehindSize); + self._lookbehindSize += len; + + self._bufPos = len; + return len; + } + + // Discard lookbehind buffer. + self._cb(false, lookbehind, 0, self._lookbehindSize, false); + self._lookbehindSize = 0; + } + + pos += self._bufPos; + + const firstNeedleChar = needle[0]; + + // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool + // search with optimized character lookup code that only considers + // the current round's haystack data. + while (pos <= end) { + const ch = data[pos + lastNeedleCharPos]; + + if (ch === lastNeedleChar + && data[pos] === firstNeedleChar + && memcmp(needle, 0, data, pos, lastNeedleCharPos)) { + ++self.matches; + if (pos > 0) + self._cb(true, data, self._bufPos, pos, true); + else + self._cb(true, undefined, 0, 0, true); + + return (self._bufPos = pos + needleLen); + } + + pos += occ[ch]; + } + + // There was no match. If there's trailing haystack data that we cannot + // match yet using the Boyer-Moore-Horspool algorithm (because the trailing + // data is less than the needle size) then match using a modified + // algorithm that starts matching from the beginning instead of the end. + // Whatever trailing data is left after running this algorithm is added to + // the lookbehind buffer. + while (pos < len) { + if (data[pos] !== firstNeedleChar + || !memcmp(data, pos, needle, 0, len - pos)) { + ++pos; + continue; + } + data.copy(lookbehind, 0, pos, len); + self._lookbehindSize = len - pos; + break; + } + + // Everything until `pos` is guaranteed not to contain needle data. + if (pos > 0) + self._cb(false, data, self._bufPos, pos < len ? pos : len, true); + + self._bufPos = len; + return len; +} + +function matchNeedle(self, data, pos, len) { + const lb = self._lookbehind; + const lbSize = self._lookbehindSize; + const needle = self._needle; + + for (let i = 0; i < len; ++i, ++pos) { + const ch = (pos < 0 ? lb[lbSize + pos] : data[pos]); + if (ch !== needle[i]) + return false; + } + return true; +} + +module.exports = SBMH; |