mirror of
https://github.com/IRBorisov/ConceptCore.git
synced 2025-06-26 09:10:37 +03:00
Update re-flex
This commit is contained in:
parent
735d315862
commit
0c6d998cc5
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -30,7 +30,7 @@
|
||||||
@file absmatcher.h
|
@file absmatcher.h
|
||||||
@brief RE/flex abstract matcher base class and pattern matcher class
|
@brief RE/flex abstract matcher base class and pattern matcher class
|
||||||
@author Robert van Engelen - engelen@genivia.com
|
@author Robert van Engelen - engelen@genivia.com
|
||||||
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved.
|
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
|
||||||
@copyright (c) BSD-3 License - see LICENSE.txt
|
@copyright (c) BSD-3 License - see LICENSE.txt
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -42,12 +42,12 @@
|
||||||
#define WITH_REALLOC 1
|
#define WITH_REALLOC 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/// This compile-time option speeds up matching, but slows input().
|
/// This compile-time option speeds up matching, but slows input() somewhat.
|
||||||
#ifndef WITH_FAST_GET
|
#ifndef WITH_FAST_GET
|
||||||
#define WITH_FAST_GET 1
|
#define WITH_FAST_GET 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/// This compile-time option adds span(), line(), wline(), bol(), eol()
|
/// This compile-time option adds span(), line(), wline(), bol(), eol().
|
||||||
#ifndef WITH_SPAN
|
#ifndef WITH_SPAN
|
||||||
#define WITH_SPAN 1
|
#define WITH_SPAN 1
|
||||||
#endif
|
#endif
|
||||||
|
@ -111,7 +111,7 @@ class AbstractMatcher {
|
||||||
static const int EOB = EOF; ///< end of buffer meta-char marker
|
static const int EOB = EOF; ///< end of buffer meta-char marker
|
||||||
static const size_t BLOCK = 4096; ///< minimum remaining unused space in the buffer, to prevent excessive shifting
|
static const size_t BLOCK = 4096; ///< minimum remaining unused space in the buffer, to prevent excessive shifting
|
||||||
#ifndef REFLEX_BUFSZ
|
#ifndef REFLEX_BUFSZ
|
||||||
static const size_t BUFSZ = (128*1024); ///< initial buffer size, at least 4096 bytes
|
static const size_t BUFSZ = (256*1024); ///< initial buffer size, at least 4096 bytes
|
||||||
#else
|
#else
|
||||||
static const size_t BUFSZ = REFLEX_BUFSZ;
|
static const size_t BUFSZ = REFLEX_BUFSZ;
|
||||||
#endif
|
#endif
|
||||||
|
@ -154,11 +154,13 @@ class AbstractMatcher {
|
||||||
A(false),
|
A(false),
|
||||||
N(false),
|
N(false),
|
||||||
W(false),
|
W(false),
|
||||||
|
X(false),
|
||||||
T(8)
|
T(8)
|
||||||
{ }
|
{ }
|
||||||
bool A; ///< accept any/all (?^X) negative patterns as Const::REDO accept index codes
|
bool A; ///< accept any/all (?^X) negative patterns as Const::REDO accept index codes
|
||||||
bool N; ///< nullable, find may return empty match (N/A to scan, split, matches)
|
bool N; ///< nullable, find may return empty match (N/A to scan, split, matches)
|
||||||
bool W; ///< half-check for "whole words", check only left of \< and right of \> for non-word character
|
bool W; ///< reflex::Matcher matches whole words as if bound by \< and \>
|
||||||
|
bool X; ///< reflex::LineMatcher matches empty lines
|
||||||
char T; ///< tab size, must be a power of 2, default is 8, for column count and indent \i, \j, and \k
|
char T; ///< tab size, must be a power of 2, default is 8, for column count and indent \i, \j, and \k
|
||||||
};
|
};
|
||||||
/// AbstractMatcher::Iterator class for scanning, searching, and splitting input character sequences.
|
/// AbstractMatcher::Iterator class for scanning, searching, and splitting input character sequences.
|
||||||
|
@ -356,7 +358,8 @@ class AbstractMatcher {
|
||||||
{
|
{
|
||||||
opt_.A = false; // when true: accept any/all (?^X) negative patterns as Const::REDO accept index codes
|
opt_.A = false; // when true: accept any/all (?^X) negative patterns as Const::REDO accept index codes
|
||||||
opt_.N = false; // when true: find may return empty match (N/A to scan, split, matches)
|
opt_.N = false; // when true: find may return empty match (N/A to scan, split, matches)
|
||||||
opt_.W = false; // when true: half-check for "whole words", check only left of \< and right of \> for non-word character
|
opt_.W = false; // when true: reflex::Matcher matches whole words as if bound by \< and \>
|
||||||
|
opt_.X = false; // when true: reflex::LineMatcher matches empty lines
|
||||||
opt_.T = 8; // tab size 1, 2, 4, or 8
|
opt_.T = 8; // tab size 1, 2, 4, or 8
|
||||||
if (opt)
|
if (opt)
|
||||||
{
|
{
|
||||||
|
@ -373,6 +376,9 @@ class AbstractMatcher {
|
||||||
case 'W':
|
case 'W':
|
||||||
opt_.W = true;
|
opt_.W = true;
|
||||||
break;
|
break;
|
||||||
|
case 'X':
|
||||||
|
opt_.X = true;
|
||||||
|
break;
|
||||||
case 'T':
|
case 'T':
|
||||||
opt_.T = isdigit(*(s += (s[1] == '=') + 1)) ? static_cast<char>(*s - '0') : 0;
|
opt_.T = isdigit(*(s += (s[1] == '=') + 1)) ? static_cast<char>(*s - '0') : 0;
|
||||||
break;
|
break;
|
||||||
|
@ -422,6 +428,7 @@ class AbstractMatcher {
|
||||||
own_ = true;
|
own_ = true;
|
||||||
eof_ = false;
|
eof_ = false;
|
||||||
mat_ = false;
|
mat_ = false;
|
||||||
|
cml_ = false;
|
||||||
}
|
}
|
||||||
/// Set buffer block size for reading: use 0 (or omit argument) to buffer all input in which case returns true if all the data could be read and false if a read error occurred.
|
/// Set buffer block size for reading: use 0 (or omit argument) to buffer all input in which case returns true if all the data could be read and false if a read error occurred.
|
||||||
bool buffer(size_t blk = 0) ///< new block size between 1 and Const::BLOCK, or 0 to buffer all input (default)
|
bool buffer(size_t blk = 0) ///< new block size between 1 and Const::BLOCK, or 0 to buffer all input (default)
|
||||||
|
@ -486,7 +493,7 @@ class AbstractMatcher {
|
||||||
(void)buffer(1);
|
(void)buffer(1);
|
||||||
}
|
}
|
||||||
/// Flush the buffer's remaining content.
|
/// Flush the buffer's remaining content.
|
||||||
void flush()
|
inline void flush()
|
||||||
{
|
{
|
||||||
DBGLOG("AbstractMatcher::flush()");
|
DBGLOG("AbstractMatcher::flush()");
|
||||||
pos_ = end_;
|
pos_ = end_;
|
||||||
|
@ -560,6 +567,7 @@ class AbstractMatcher {
|
||||||
own_ = false;
|
own_ = false;
|
||||||
eof_ = true;
|
eof_ = true;
|
||||||
mat_ = false;
|
mat_ = false;
|
||||||
|
cml_ = false;
|
||||||
}
|
}
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
@ -645,6 +653,13 @@ class AbstractMatcher {
|
||||||
{
|
{
|
||||||
return utf8(txt_);
|
return utf8(txt_);
|
||||||
}
|
}
|
||||||
|
#if WITH_SPAN
|
||||||
|
/// Set or reset mode to count matching lines only and skip other (e.g. for speed).
|
||||||
|
inline void lineno_skip(bool f = false)
|
||||||
|
{
|
||||||
|
cml_ = f;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
/// Set or change the starting line number of the last match.
|
/// Set or change the starting line number of the last match.
|
||||||
inline void lineno(size_t n) ///< new line number
|
inline void lineno(size_t n) ///< new line number
|
||||||
{
|
{
|
||||||
|
@ -652,7 +667,7 @@ class AbstractMatcher {
|
||||||
lno_ = n;
|
lno_ = n;
|
||||||
}
|
}
|
||||||
/// Updates and returns the starting line number of the match in the input character sequence.
|
/// Updates and returns the starting line number of the match in the input character sequence.
|
||||||
inline size_t lineno()
|
size_t lineno()
|
||||||
/// @returns line number
|
/// @returns line number
|
||||||
{
|
{
|
||||||
#if WITH_SPAN
|
#if WITH_SPAN
|
||||||
|
@ -661,55 +676,16 @@ class AbstractMatcher {
|
||||||
const char *s = lpb_;
|
const char *s = lpb_;
|
||||||
const char *t = txt_;
|
const char *t = txt_;
|
||||||
size_t n = 0;
|
size_t n = 0;
|
||||||
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
|
if (cml_)
|
||||||
if (have_HW_AVX512BW())
|
{
|
||||||
n = simd_nlcount_avx512bw(s, t);
|
// count number of matching lines only, not line numbers
|
||||||
else if (have_HW_AVX2())
|
n = std::memchr(s, '\n', t - s) != NULL;
|
||||||
n = simd_nlcount_avx2(s, t);
|
}
|
||||||
else
|
else
|
||||||
n = simd_nlcount_sse2(s, t);
|
|
||||||
#elif defined(HAVE_AVX2)
|
|
||||||
if (have_HW_AVX2())
|
|
||||||
n = simd_nlcount_avx2(s, t);
|
|
||||||
else
|
|
||||||
n = simd_nlcount_sse2(s, t);
|
|
||||||
#elif defined(HAVE_SSE2)
|
|
||||||
n = simd_nlcount_sse2(s, t);
|
|
||||||
#endif
|
|
||||||
#if defined(HAVE_NEON)
|
|
||||||
// no ARM AArch64/NEON SIMD optimized loop? - no code that runs faster than the code below?!
|
|
||||||
uint32_t n0 = 0, n1 = 0;
|
|
||||||
while (s < t - 1)
|
|
||||||
{
|
{
|
||||||
n0 += s[0] == '\n';
|
// count line numbers
|
||||||
n1 += s[1] == '\n';
|
n = nlcount(s, t);
|
||||||
s += 2;
|
|
||||||
}
|
}
|
||||||
n += n0 + n1 + (s < t && *s == '\n');
|
|
||||||
#else
|
|
||||||
// clang/gcc 4-way auto-vectorizable loop
|
|
||||||
uint32_t n0 = 0, n1 = 0, n2 = 0, n3 = 0;
|
|
||||||
while (s < t - 3)
|
|
||||||
{
|
|
||||||
n0 += s[0] == '\n';
|
|
||||||
n1 += s[1] == '\n';
|
|
||||||
n2 += s[2] == '\n';
|
|
||||||
n3 += s[3] == '\n';
|
|
||||||
s += 4;
|
|
||||||
}
|
|
||||||
n += n0 + n1 + n2 + n3;
|
|
||||||
// epilogue
|
|
||||||
if (s < t)
|
|
||||||
{
|
|
||||||
n += *s == '\n';
|
|
||||||
if (++s < t)
|
|
||||||
{
|
|
||||||
n += *s == '\n';
|
|
||||||
if (++s < t)
|
|
||||||
n += *s == '\n';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
// if newlines are detected, then find begin of the last line to adjust bol
|
// if newlines are detected, then find begin of the last line to adjust bol
|
||||||
if (n > 0)
|
if (n > 0)
|
||||||
{
|
{
|
||||||
|
@ -956,13 +932,13 @@ class AbstractMatcher {
|
||||||
else if (got_ == '\n')
|
else if (got_ == '\n')
|
||||||
got_ = Const::UNK;
|
got_ = Const::UNK;
|
||||||
}
|
}
|
||||||
/// Returns true if this matcher matched text that begins a word.
|
/// Returns true if this matcher matched text that begins an ASCII word.
|
||||||
inline bool at_bow()
|
inline bool at_bow()
|
||||||
/// @returns true if this matcher matched text that begins a word
|
/// @returns true if this matcher matched text that begins a word
|
||||||
{
|
{
|
||||||
return !isword(got_) && isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more());
|
return !isword(got_) && isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more());
|
||||||
}
|
}
|
||||||
/// Returns true if this matcher matched text that ends a word.
|
/// Returns true if this matcher matched text that ends an ASCII word.
|
||||||
inline bool at_eow()
|
inline bool at_eow()
|
||||||
/// @returns true if this matcher matched text that ends a word
|
/// @returns true if this matcher matched text that ends a word
|
||||||
{
|
{
|
||||||
|
@ -1116,21 +1092,37 @@ class AbstractMatcher {
|
||||||
}
|
}
|
||||||
return buf_ + end_;
|
return buf_ + end_;
|
||||||
}
|
}
|
||||||
|
/// Return number of bytes available given number of bytes to fetch ahead, limited by input size and buffer size
|
||||||
|
inline size_t fetch(size_t len)
|
||||||
|
/// @returns number of bytes available after fetching.
|
||||||
|
{
|
||||||
|
DBGLOG("AbstractMatcher::fetch(%zu)", len);
|
||||||
|
if (eof_)
|
||||||
|
return 0;
|
||||||
|
if (len <= end_ - (txt_ - buf_))
|
||||||
|
return end_ - (txt_ - buf_);
|
||||||
|
if (end_ + len + 1 >= max_)
|
||||||
|
(void)grow();
|
||||||
|
if (end_ + len + 1 >= max_)
|
||||||
|
len = max_ - end_ - 1;
|
||||||
|
end_ += get(buf_ + end_, len);
|
||||||
|
return avail();
|
||||||
|
}
|
||||||
/// Returns the number of bytes in the buffer available to search from the current begin()/text() position.
|
/// Returns the number of bytes in the buffer available to search from the current begin()/text() position.
|
||||||
size_t avail()
|
inline size_t avail()
|
||||||
{
|
{
|
||||||
if (peek() == EOF)
|
if (peek() == EOF)
|
||||||
return 0;
|
return 0;
|
||||||
return end_ - (txt_ - buf_);
|
return end_ - (txt_ - buf_);
|
||||||
}
|
}
|
||||||
/// Returns the byte offset of the match from the start of the line.
|
/// Returns the byte offset of the match from the start of the line.
|
||||||
size_t border()
|
inline size_t border()
|
||||||
/// @returns border offset
|
/// @returns border offset
|
||||||
{
|
{
|
||||||
return txt_ - bol();
|
return txt_ - bol();
|
||||||
}
|
}
|
||||||
/// Enlarge the match to span the entire line of input (excluding \n), return text().
|
/// Enlarge the match to span the entire line of input (excluding \n), return text().
|
||||||
const char *span()
|
inline const char *span()
|
||||||
/// @returns const char* span of text for the entire line
|
/// @returns const char* span of text for the entire line
|
||||||
{
|
{
|
||||||
DBGLOG("AbstractMatcher::span()");
|
DBGLOG("AbstractMatcher::span()");
|
||||||
|
@ -1146,7 +1138,7 @@ class AbstractMatcher {
|
||||||
return text();
|
return text();
|
||||||
}
|
}
|
||||||
/// Returns the line of input (excluding \n) as a string containing the matched text as a substring.
|
/// Returns the line of input (excluding \n) as a string containing the matched text as a substring.
|
||||||
std::string line()
|
inline std::string line()
|
||||||
/// @returns matching line as a string
|
/// @returns matching line as a string
|
||||||
{
|
{
|
||||||
DBGLOG("AbstractMatcher::line()");
|
DBGLOG("AbstractMatcher::line()");
|
||||||
|
@ -1156,7 +1148,7 @@ class AbstractMatcher {
|
||||||
return std::string(b, e - b);
|
return std::string(b, e - b);
|
||||||
}
|
}
|
||||||
/// Returns the line of input (excluding \n) as a wide string containing the matched text as a substring.
|
/// Returns the line of input (excluding \n) as a wide string containing the matched text as a substring.
|
||||||
std::wstring wline()
|
inline std::wstring wline()
|
||||||
/// @returns matching line as a wide string
|
/// @returns matching line as a wide string
|
||||||
{
|
{
|
||||||
DBGLOG("AbstractMatcher::wline()");
|
DBGLOG("AbstractMatcher::wline()");
|
||||||
|
@ -1252,12 +1244,12 @@ class AbstractMatcher {
|
||||||
return text();
|
return text();
|
||||||
}
|
}
|
||||||
/// Append the next match to the currently matched text returned by AbstractMatcher::text, when the next match found is adjacent to the current match.
|
/// Append the next match to the currently matched text returned by AbstractMatcher::text, when the next match found is adjacent to the current match.
|
||||||
void more()
|
inline void more()
|
||||||
{
|
{
|
||||||
cur_ = txt_ - buf_;
|
cur_ = txt_ - buf_;
|
||||||
}
|
}
|
||||||
/// Truncate the AbstractMatcher::text length of the match to n characters in length and reposition for next match.
|
/// Truncate the AbstractMatcher::text length of the match to n characters in length and reposition for next match.
|
||||||
void less(size_t n) ///< truncated string length
|
inline void less(size_t n) ///< truncated string length
|
||||||
{
|
{
|
||||||
if (n < len_)
|
if (n < len_)
|
||||||
{
|
{
|
||||||
|
@ -1270,80 +1262,80 @@ class AbstractMatcher {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/// Cast this matcher to positive integer indicating the nonzero capture index of the matched text in the pattern, same as AbstractMatcher::accept.
|
/// Cast this matcher to positive integer indicating the nonzero capture index of the matched text in the pattern, same as AbstractMatcher::accept.
|
||||||
operator size_t() const
|
inline operator size_t() const
|
||||||
/// @returns nonzero capture index of a match, which may be matcher dependent, or zero for a mismatch
|
/// @returns nonzero capture index of a match, which may be matcher dependent, or zero for a mismatch
|
||||||
{
|
{
|
||||||
return accept();
|
return accept();
|
||||||
}
|
}
|
||||||
/// Cast this matcher to a std::string of the text matched by this matcher.
|
/// Cast this matcher to a std::string of the text matched by this matcher.
|
||||||
operator std::string() const
|
inline operator std::string() const
|
||||||
/// @returns std::string with matched text
|
/// @returns std::string with matched text
|
||||||
{
|
{
|
||||||
return str();
|
return str();
|
||||||
}
|
}
|
||||||
/// Cast this matcher to a std::wstring of the text matched by this matcher.
|
/// Cast this matcher to a std::wstring of the text matched by this matcher.
|
||||||
operator std::wstring() const
|
inline operator std::wstring() const
|
||||||
/// @returns std::wstring converted to UCS from the 0-terminated matched UTF-8 text
|
/// @returns std::wstring converted to UCS from the 0-terminated matched UTF-8 text
|
||||||
{
|
{
|
||||||
return wstr();
|
return wstr();
|
||||||
}
|
}
|
||||||
/// Cast the match to std::pair<size_t,std::string>(accept(), str()), useful for tokenization into containers.
|
/// Cast the match to std::pair<size_t,std::string>(accept(), str()), useful for tokenization into containers.
|
||||||
operator std::pair<size_t,std::string>() const
|
inline operator std::pair<size_t,std::string>() const
|
||||||
/// @returns std::pair<size_t,std::string>(accept(), str())
|
/// @returns std::pair<size_t,std::string>(accept(), str())
|
||||||
{
|
{
|
||||||
return pair();
|
return pair();
|
||||||
}
|
}
|
||||||
/// Returns true if matched text is equal to a string, useful for std::algorithm.
|
/// Returns true if matched text is equal to a string, useful for std::algorithm.
|
||||||
bool operator==(const char *rhs) ///< rhs string to compare to
|
inline bool operator==(const char *rhs) ///< rhs string to compare to
|
||||||
/// @returns true if matched text is equal to rhs string
|
/// @returns true if matched text is equal to rhs string
|
||||||
const
|
const
|
||||||
{
|
{
|
||||||
return std::strncmp(rhs, txt_, len_) == 0 && rhs[len_] == '\0';
|
return std::strncmp(rhs, txt_, len_) == 0 && rhs[len_] == '\0';
|
||||||
}
|
}
|
||||||
/// Returns true if matched text is equal to a string, useful for std::algorithm.
|
/// Returns true if matched text is equal to a string, useful for std::algorithm.
|
||||||
bool operator==(const std::string& rhs) ///< rhs string to compare to
|
inline bool operator==(const std::string& rhs) ///< rhs string to compare to
|
||||||
/// @returns true if matched text is equal to rhs string
|
/// @returns true if matched text is equal to rhs string
|
||||||
const
|
const
|
||||||
{
|
{
|
||||||
return rhs.size() == len_ && rhs.compare(0, std::string::npos, txt_, len_) == 0;
|
return rhs.size() == len_ && rhs.compare(0, std::string::npos, txt_, len_) == 0;
|
||||||
}
|
}
|
||||||
/// Returns true if capture index is equal to a given size_t value, useful for std::algorithm.
|
/// Returns true if capture index is equal to a given size_t value, useful for std::algorithm.
|
||||||
bool operator==(size_t rhs) ///< capture index to compare accept() to
|
inline bool operator==(size_t rhs) ///< capture index to compare accept() to
|
||||||
/// @returns true if capture index is equal to rhs
|
/// @returns true if capture index is equal to rhs
|
||||||
const
|
const
|
||||||
{
|
{
|
||||||
return accept() == rhs;
|
return accept() == rhs;
|
||||||
}
|
}
|
||||||
/// Returns true if capture index is equal to a given int value, useful for std::algorithm.
|
/// Returns true if capture index is equal to a given int value, useful for std::algorithm.
|
||||||
bool operator==(int rhs) ///< capture index to compare accept() to
|
inline bool operator==(int rhs) ///< capture index to compare accept() to
|
||||||
/// @returns true if capture index is equal to rhs
|
/// @returns true if capture index is equal to rhs
|
||||||
const
|
const
|
||||||
{
|
{
|
||||||
return static_cast<int>(accept()) == rhs;
|
return static_cast<int>(accept()) == rhs;
|
||||||
}
|
}
|
||||||
/// Returns true if matched text is not equal to a string, useful for std::algorithm.
|
/// Returns true if matched text is not equal to a string, useful for std::algorithm.
|
||||||
bool operator!=(const char *rhs) ///< rhs string to compare to
|
inline bool operator!=(const char *rhs) ///< rhs string to compare to
|
||||||
/// @returns true if matched text is not equal to rhs string
|
/// @returns true if matched text is not equal to rhs string
|
||||||
const
|
const
|
||||||
{
|
{
|
||||||
return std::strncmp(rhs, txt_, len_) != 0 || rhs[len_] != '\0'; // if static checkers complain here, they are wrong
|
return std::strncmp(rhs, txt_, len_) != 0 || rhs[len_] != '\0'; // if static checkers complain here, they are wrong
|
||||||
}
|
}
|
||||||
/// Returns true if matched text is not equal to a string, useful for std::algorithm.
|
/// Returns true if matched text is not equal to a string, useful for std::algorithm.
|
||||||
bool operator!=(const std::string& rhs) ///< rhs string to compare to
|
inline bool operator!=(const std::string& rhs) ///< rhs string to compare to
|
||||||
/// @returns true if matched text is not equal to rhs string
|
/// @returns true if matched text is not equal to rhs string
|
||||||
const
|
const
|
||||||
{
|
{
|
||||||
return rhs.size() > len_ || rhs.compare(0, std::string::npos, txt_, len_) != 0;
|
return rhs.size() > len_ || rhs.compare(0, std::string::npos, txt_, len_) != 0;
|
||||||
}
|
}
|
||||||
/// Returns true if capture index is not equal to a given size_t value, useful for std::algorithm.
|
/// Returns true if capture index is not equal to a given size_t value, useful for std::algorithm.
|
||||||
bool operator!=(size_t rhs) ///< capture index to compare accept() to
|
inline bool operator!=(size_t rhs) ///< capture index to compare accept() to
|
||||||
/// @returns true if capture index is not equal to rhs
|
/// @returns true if capture index is not equal to rhs
|
||||||
const
|
const
|
||||||
{
|
{
|
||||||
return accept() != rhs;
|
return accept() != rhs;
|
||||||
}
|
}
|
||||||
/// Returns true if capture index is not equal to a given int value, useful for std::algorithm.
|
/// Returns true if capture index is not equal to a given int value, useful for std::algorithm.
|
||||||
bool operator!=(int rhs) ///< capture index to compare accept() to
|
inline bool operator!=(int rhs) ///< capture index to compare accept() to
|
||||||
/// @returns true if capture index is not equal to rhs
|
/// @returns true if capture index is not equal to rhs
|
||||||
const
|
const
|
||||||
{
|
{
|
||||||
|
@ -1632,6 +1624,7 @@ class AbstractMatcher {
|
||||||
bool own_; ///< true if AbstractMatcher::buf_ was allocated and should be deleted
|
bool own_; ///< true if AbstractMatcher::buf_ was allocated and should be deleted
|
||||||
bool eof_; ///< input has reached EOF
|
bool eof_; ///< input has reached EOF
|
||||||
bool mat_; ///< true if AbstractMatcher::matches() was successful
|
bool mat_; ///< true if AbstractMatcher::matches() was successful
|
||||||
|
bool cml_; ///< true when counting matching lines instead of line numbers
|
||||||
};
|
};
|
||||||
|
|
||||||
/// The pattern matcher class template extends abstract matcher base class.
|
/// The pattern matcher class template extends abstract matcher base class.
|
||||||
|
@ -1656,7 +1649,7 @@ class PatternMatcher : public AbstractMatcher {
|
||||||
delete pat_;
|
delete pat_;
|
||||||
}
|
}
|
||||||
/// Assign a matcher, the underlying pattern object is shared (not deep copied).
|
/// Assign a matcher, the underlying pattern object is shared (not deep copied).
|
||||||
PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
|
virtual PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
|
||||||
{
|
{
|
||||||
scan.init(this, Const::SCAN);
|
scan.init(this, Const::SCAN);
|
||||||
find.init(this, Const::FIND);
|
find.init(this, Const::FIND);
|
||||||
|
@ -1664,9 +1657,7 @@ class PatternMatcher : public AbstractMatcher {
|
||||||
in = matcher.in;
|
in = matcher.in;
|
||||||
reset();
|
reset();
|
||||||
opt_ = matcher.opt_;
|
opt_ = matcher.opt_;
|
||||||
pat_ = matcher.pat_,
|
return pattern(matcher.pat_);
|
||||||
own_ = false;
|
|
||||||
return *this;
|
|
||||||
}
|
}
|
||||||
/// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
|
/// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
|
||||||
virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
|
virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
|
||||||
|
@ -1726,19 +1717,19 @@ class PatternMatcher : public AbstractMatcher {
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
/// Returns true if this matcher has a pattern.
|
/// Returns true if this matcher has a pattern.
|
||||||
bool has_pattern() const
|
inline bool has_pattern() const
|
||||||
/// @returns true if this matcher has a pattern
|
/// @returns true if this matcher has a pattern
|
||||||
{
|
{
|
||||||
return pat_ != NULL;
|
return pat_ != NULL;
|
||||||
}
|
}
|
||||||
/// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
|
/// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
|
||||||
bool own_pattern() const
|
inline bool own_pattern() const
|
||||||
/// @returns true if this matcher has its own pattern
|
/// @returns true if this matcher has its own pattern
|
||||||
{
|
{
|
||||||
return own_ && pat_ != NULL;
|
return own_ && pat_ != NULL;
|
||||||
}
|
}
|
||||||
/// Returns a reference to the pattern object associated with this matcher.
|
/// Returns a reference to the pattern object associated with this matcher.
|
||||||
const Pattern& pattern() const
|
virtual const Pattern& pattern() const
|
||||||
/// @returns reference to pattern object
|
/// @returns reference to pattern object
|
||||||
{
|
{
|
||||||
ASSERT(pat_ != NULL);
|
ASSERT(pat_ != NULL);
|
||||||
|
@ -1809,7 +1800,7 @@ class PatternMatcher<std::string> : public AbstractMatcher {
|
||||||
delete pat_;
|
delete pat_;
|
||||||
}
|
}
|
||||||
/// Assign a matcher, the underlying pattern string is shared (not deep copied).
|
/// Assign a matcher, the underlying pattern string is shared (not deep copied).
|
||||||
PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
|
virtual PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
|
||||||
{
|
{
|
||||||
scan.init(this, Const::SCAN);
|
scan.init(this, Const::SCAN);
|
||||||
find.init(this, Const::FIND);
|
find.init(this, Const::FIND);
|
||||||
|
@ -1817,9 +1808,7 @@ class PatternMatcher<std::string> : public AbstractMatcher {
|
||||||
in = matcher.in;
|
in = matcher.in;
|
||||||
reset();
|
reset();
|
||||||
opt_ = matcher.opt_;
|
opt_ = matcher.opt_;
|
||||||
pat_ = matcher.pat_,
|
return pattern(matcher.pat_);
|
||||||
own_ = false;
|
|
||||||
return *this;
|
|
||||||
}
|
}
|
||||||
/// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
|
/// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
|
||||||
virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
|
virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
|
||||||
|
@ -1865,19 +1854,19 @@ class PatternMatcher<std::string> : public AbstractMatcher {
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
/// Returns true if this matcher has a pattern.
|
/// Returns true if this matcher has a pattern.
|
||||||
bool has_pattern() const
|
inline bool has_pattern() const
|
||||||
/// @returns true if this matcher has a pattern
|
/// @returns true if this matcher has a pattern
|
||||||
{
|
{
|
||||||
return pat_ != NULL;
|
return pat_ != NULL;
|
||||||
}
|
}
|
||||||
/// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
|
/// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
|
||||||
bool own_pattern() const
|
inline bool own_pattern() const
|
||||||
/// @returns true if this matcher has its own pattern
|
/// @returns true if this matcher has its own pattern
|
||||||
{
|
{
|
||||||
return own_ && pat_ != NULL;
|
return own_ && pat_ != NULL;
|
||||||
}
|
}
|
||||||
/// Returns a reference to the pattern string associated with this matcher.
|
/// Returns a reference to the pattern string associated with this matcher.
|
||||||
const Pattern& pattern() const
|
virtual const Pattern& pattern() const
|
||||||
/// @returns reference to pattern string
|
/// @returns reference to pattern string
|
||||||
{
|
{
|
||||||
ASSERT(pat_ != NULL);
|
ASSERT(pat_ != NULL);
|
||||||
|
|
1178
ccl/rslang/import/reflex/include/reflex/fuzzymatcher.h
Normal file
1178
ccl/rslang/import/reflex/include/reflex/fuzzymatcher.h
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -138,8 +138,8 @@ find:
|
||||||
// option N also finds empty lines
|
// option N also finds empty lines
|
||||||
if (n == 0 && !opt_.N)
|
if (n == 0 && !opt_.N)
|
||||||
goto find;
|
goto find;
|
||||||
// option W only finds empty lines
|
// option X only finds empty lines
|
||||||
if (n > 0 && opt_.W)
|
if (n > 0 && opt_.X)
|
||||||
goto find;
|
goto find;
|
||||||
break;
|
break;
|
||||||
case Const::SPLIT:
|
case Const::SPLIT:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -365,8 +365,8 @@ class Pattern {
|
||||||
f |= pmh[h] & 4;
|
f |= pmh[h] & 4;
|
||||||
h = hash(h, static_cast<uint8_t>(*++s));
|
h = hash(h, static_cast<uint8_t>(*++s));
|
||||||
f |= pmh[h] & 8;
|
f |= pmh[h] & 8;
|
||||||
Pred m = 16;
|
|
||||||
const char *e = s + n - 3;
|
const char *e = s + n - 3;
|
||||||
|
Pred m = 16;
|
||||||
while (f == 0 && ++s < e)
|
while (f == 0 && ++s < e)
|
||||||
{
|
{
|
||||||
h = hash(h, static_cast<uint8_t>(*s));
|
h = hash(h, static_cast<uint8_t>(*s));
|
||||||
|
@ -941,8 +941,7 @@ class Pattern {
|
||||||
void check_dfa_closure(
|
void check_dfa_closure(
|
||||||
const DFA::State *state,
|
const DFA::State *state,
|
||||||
int nest,
|
int nest,
|
||||||
bool& peek,
|
bool& peek) const;
|
||||||
bool& prev) const;
|
|
||||||
void gencode_dfa_closure(
|
void gencode_dfa_closure(
|
||||||
FILE *fd,
|
FILE *fd,
|
||||||
const DFA::State *start,
|
const DFA::State *start,
|
||||||
|
@ -1172,7 +1171,7 @@ class Pattern {
|
||||||
Index cut_; ///< DFA s-t cut to improve predict match and HFA accuracy together with lbk_ and cbk_
|
Index cut_; ///< DFA s-t cut to improve predict match and HFA accuracy together with lbk_ and cbk_
|
||||||
size_t len_; ///< length of chr_[], less than or equal to 255
|
size_t len_; ///< length of chr_[], less than or equal to 255
|
||||||
size_t min_; ///< patterns after the prefix are at least this long but no more than 8
|
size_t min_; ///< patterns after the prefix are at least this long but no more than 8
|
||||||
size_t pin_; ///< number of needles
|
size_t pin_; ///< number of needles, 0 to 16
|
||||||
std::bitset<256> cbk_; ///< characters to look back over when lbk_ > 0, never includes \n
|
std::bitset<256> cbk_; ///< characters to look back over when lbk_ > 0, never includes \n
|
||||||
std::bitset<256> fst_; ///< the beginning characters of the pattern
|
std::bitset<256> fst_; ///< the beginning characters of the pattern
|
||||||
char chr_[256]; ///< pattern prefix string or character needles for needle-based search
|
char chr_[256]; ///< pattern prefix string or character needles for needle-based search
|
||||||
|
@ -1183,7 +1182,7 @@ class Pattern {
|
||||||
uint16_t lbm_; ///< loopback minimum distance when lbk_ > 0
|
uint16_t lbm_; ///< loopback minimum distance when lbk_ > 0
|
||||||
uint16_t lcp_; ///< primary least common character position in the pattern or 0xffff
|
uint16_t lcp_; ///< primary least common character position in the pattern or 0xffff
|
||||||
uint16_t lcs_; ///< secondary least common character position in the pattern or 0xffff
|
uint16_t lcs_; ///< secondary least common character position in the pattern or 0xffff
|
||||||
size_t bmd_; ///< Boyer-Moore jump distance on mismatch, B-M is enabled when bmd_ > 0
|
size_t bmd_; ///< Boyer-Moore jump distance on mismatch, B-M is enabled when bmd_ > 0 (<= 255)
|
||||||
uint8_t bms_[256]; ///< Boyer-Moore skip array
|
uint8_t bms_[256]; ///< Boyer-Moore skip array
|
||||||
float pms_; ///< ms elapsed time to parse regex
|
float pms_; ///< ms elapsed time to parse regex
|
||||||
float vms_; ///< ms elapsed time to compile DFA vertices
|
float vms_; ///< ms elapsed time to compile DFA vertices
|
||||||
|
@ -1192,6 +1191,7 @@ class Pattern {
|
||||||
float ams_; ///< ms elapsed time to analyze DFA for predict match and HFA
|
float ams_; ///< ms elapsed time to analyze DFA for predict match and HFA
|
||||||
size_t npy_; ///< entropy derived from the bitap array bit_[]
|
size_t npy_; ///< entropy derived from the bitap array bit_[]
|
||||||
bool one_; ///< true if matching one string stored in chr_[] without meta/anchors
|
bool one_; ///< true if matching one string stored in chr_[] without meta/anchors
|
||||||
|
bool bol_; ///< true if matching all patterns at the begin of a line with anchor ^
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace reflex
|
} // namespace reflex
|
||||||
|
|
|
@ -28,15 +28,19 @@
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@file simd.h
|
@file simd.h
|
||||||
@brief RE/flex SIMD intrinsics
|
@brief RE/flex SIMD primitives
|
||||||
@author Robert van Engelen - engelen@genivia.com
|
@author Robert van Engelen - engelen@genivia.com
|
||||||
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved.
|
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
|
||||||
@copyright (c) BSD-3 License - see LICENSE.txt
|
@copyright (c) BSD-3 License - see LICENSE.txt
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef SIMD_H
|
#ifndef SIMD_H
|
||||||
#define SIMD_H
|
#define SIMD_H
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstdlib>
|
||||||
|
|
||||||
#if defined(HAVE_AVX512BW)
|
#if defined(HAVE_AVX512BW)
|
||||||
# include <immintrin.h>
|
# include <immintrin.h>
|
||||||
#elif defined(HAVE_AVX2)
|
#elif defined(HAVE_AVX2)
|
||||||
|
@ -133,8 +137,7 @@ inline uint32_t popcountl(uint64_t x)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
|
// Partially count newlines in string b up to e, updates b close to e with uncounted part
|
||||||
extern size_t simd_nlcount_sse2(const char*& b, const char *e);
|
|
||||||
extern size_t simd_nlcount_avx2(const char *&b, const char *e);
|
extern size_t simd_nlcount_avx2(const char *&b, const char *e);
|
||||||
extern size_t simd_nlcount_avx512bw(const char *&b, const char *e);
|
extern size_t simd_nlcount_avx512bw(const char *&b, const char *e);
|
||||||
|
|
||||||
|
@ -142,6 +145,11 @@ extern size_t simd_nlcount_avx512bw(const char*& b, const char *e);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
namespace reflex {
|
||||||
|
|
||||||
|
// Count newlines in string s up to t
|
||||||
|
extern size_t nlcount(const char *s, const char *t);
|
||||||
|
|
||||||
|
} // namespace reflex
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -778,7 +778,11 @@ static void insert_posix_class(const char *pattern, size_t len, size_t& pos, con
|
||||||
else if (name[0] == 'A' && name[1] == 's')
|
else if (name[0] == 'A' && name[1] == 's')
|
||||||
name = const_cast<char*>("ASCII");
|
name = const_cast<char*>("ASCII");
|
||||||
}
|
}
|
||||||
const int *wc = Posix::range(name);
|
const int *wc = NULL;
|
||||||
|
if ((flags & convert_flag::unicode))
|
||||||
|
wc = Unicode::range(name);
|
||||||
|
if (wc == NULL)
|
||||||
|
wc = Posix::range(name);
|
||||||
if (wc == NULL)
|
if (wc == NULL)
|
||||||
throw regex_error(regex_error::invalid_class, pattern, pos);
|
throw regex_error(regex_error::invalid_class, pattern, pos);
|
||||||
if (*buf == '^')
|
if (*buf == '^')
|
||||||
|
|
|
@ -28,7 +28,7 @@
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@file input.cpp
|
@file input.cpp
|
||||||
@brief RE/flex input character sequence class and simd.h CPUID check
|
@brief RE/flex input character sequence class
|
||||||
@author Robert van Engelen - engelen@genivia.com
|
@author Robert van Engelen - engelen@genivia.com
|
||||||
@copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
|
@copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
|
||||||
@copyright (c) BSD-3 License - see LICENSE.txt
|
@copyright (c) BSD-3 License - see LICENSE.txt
|
||||||
|
@ -1361,27 +1361,4 @@ void Input::file_encoding(unsigned short enc, const unsigned short *page)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
|
|
||||||
|
|
||||||
#include <reflex/simd.h>
|
|
||||||
|
|
||||||
// simd.h get_HW()
|
|
||||||
static uint64_t get_HW()
|
|
||||||
{
|
|
||||||
int CPUInfo1[4] = { 0, 0, 0, 0 };
|
|
||||||
int CPUInfo7[4] = { 0, 0, 0, 0 };
|
|
||||||
cpuidex(CPUInfo1, 0, 0);
|
|
||||||
int n = CPUInfo1[0];
|
|
||||||
if (n <= 0)
|
|
||||||
return 0ULL;
|
|
||||||
cpuidex(CPUInfo1, 1, 0); // cpuid EAX=1
|
|
||||||
if (n >= 7)
|
|
||||||
cpuidex(CPUInfo7, 7, 0); // cpuid EAX=7, ECX=0
|
|
||||||
return static_cast<uint32_t>(CPUInfo1[2]) | (static_cast<uint64_t>(static_cast<uint32_t>(CPUInfo7[1])) << 32);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t HW = get_HW();
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
} // namespace reflex
|
} // namespace reflex
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -27,10 +27,10 @@
|
||||||
\******************************************************************************/
|
\******************************************************************************/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@file matcher.cpp, matcher_avx2.cpp, matcher_avx512bw.cpp
|
@file matcher_avx512bw.cpp
|
||||||
@brief RE/flex matcher engine
|
@brief RE/flex matcher engine
|
||||||
@author Robert van Engelen - engelen@genivia.com
|
@author Robert van Engelen - engelen@genivia.com
|
||||||
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved.
|
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
|
||||||
@copyright (c) BSD-3 License - see LICENSE.txt
|
@copyright (c) BSD-3 License - see LICENSE.txt
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -40,5 +40,344 @@
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define COMPILE_AVX512BW
|
#include <reflex/matcher.h>
|
||||||
#include "matcher.cpp"
|
|
||||||
|
namespace reflex {
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
|
||||||
|
|
||||||
|
// AVX512BW runtime optimized function callback overrides
|
||||||
|
void Matcher::simd_init_advance_avx512bw()
|
||||||
|
{
|
||||||
|
if (pat_->len_ == 0)
|
||||||
|
{
|
||||||
|
// no specialization
|
||||||
|
}
|
||||||
|
else if (pat_->len_ == 1)
|
||||||
|
{
|
||||||
|
// no specialization
|
||||||
|
}
|
||||||
|
else if (pat_->len_ == 2)
|
||||||
|
{
|
||||||
|
if (pat_->min_ == 0)
|
||||||
|
adv_ = &Matcher::simd_advance_chars_avx512bw<2>;
|
||||||
|
else if (pat_->min_ < 4)
|
||||||
|
adv_ = &Matcher::simd_advance_chars_pma_avx512bw<2>;
|
||||||
|
else
|
||||||
|
adv_ = &Matcher::simd_advance_chars_pmh_avx512bw<2>;
|
||||||
|
}
|
||||||
|
else if (pat_->len_ == 3)
|
||||||
|
{
|
||||||
|
if (pat_->min_ == 0)
|
||||||
|
adv_ = &Matcher::simd_advance_chars_avx512bw<3>;
|
||||||
|
else if (pat_->min_ < 4)
|
||||||
|
adv_ = &Matcher::simd_advance_chars_pma_avx512bw<3>;
|
||||||
|
else
|
||||||
|
adv_ = &Matcher::simd_advance_chars_pmh_avx512bw<3>;
|
||||||
|
}
|
||||||
|
else if (pat_->bmd_ == 0)
|
||||||
|
{
|
||||||
|
#if defined(WITH_STRING_PM)
|
||||||
|
if (pat_->min_ >= 4)
|
||||||
|
adv_ = &Matcher::simd_advance_string_pmh_avx512bw;
|
||||||
|
else if (pat_->min_ > 0)
|
||||||
|
adv_ = &Matcher::simd_advance_string_pma_avx512bw;
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
adv_ = &Matcher::simd_advance_string_avx512bw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Few chars
|
||||||
|
template<uint8_t LEN>
|
||||||
|
bool Matcher::simd_advance_chars_avx512bw(size_t loc)
|
||||||
|
{
|
||||||
|
static const uint16_t lcp = 0;
|
||||||
|
static const uint16_t lcs = LEN - 1;
|
||||||
|
const char *chr = pat_->chr_;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *s = buf_ + loc + lcp;
|
||||||
|
const char *e = buf_ + end_ + lcp - LEN + 1;
|
||||||
|
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
|
||||||
|
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
|
||||||
|
while (s <= e - 64)
|
||||||
|
{
|
||||||
|
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
|
||||||
|
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
|
||||||
|
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
|
||||||
|
while (mask != 0)
|
||||||
|
{
|
||||||
|
uint32_t offset = ctzl(mask);
|
||||||
|
if (LEN == 2 ||
|
||||||
|
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
|
||||||
|
{
|
||||||
|
loc = s - lcp + offset - buf_;
|
||||||
|
set_current(loc);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
mask &= mask - 1;
|
||||||
|
}
|
||||||
|
s += 64;
|
||||||
|
}
|
||||||
|
s -= lcp;
|
||||||
|
loc = s - buf_;
|
||||||
|
set_current_and_peek_more(loc - 1);
|
||||||
|
loc = cur_ + 1;
|
||||||
|
if (loc + LEN > end_)
|
||||||
|
return false;
|
||||||
|
if (loc + LEN + 63 > end_)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return advance_chars<LEN>(loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Few chars followed by 2 to 3 minimal char pattern
|
||||||
|
template<uint8_t LEN>
|
||||||
|
bool Matcher::simd_advance_chars_pma_avx512bw(size_t loc)
|
||||||
|
{
|
||||||
|
static const uint16_t lcp = 0;
|
||||||
|
static const uint16_t lcs = LEN - 1;
|
||||||
|
const Pattern::Pred *pma = pat_->pma_;
|
||||||
|
const char *chr = pat_->chr_;
|
||||||
|
size_t min = pat_->min_;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *s = buf_ + loc + lcp;
|
||||||
|
const char *e = buf_ + end_ + lcp - LEN + 1;
|
||||||
|
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
|
||||||
|
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
|
||||||
|
while (s <= e - 64)
|
||||||
|
{
|
||||||
|
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
|
||||||
|
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
|
||||||
|
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
|
||||||
|
while (mask != 0)
|
||||||
|
{
|
||||||
|
uint32_t offset = ctzl(mask);
|
||||||
|
if (LEN == 2 ||
|
||||||
|
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
|
||||||
|
{
|
||||||
|
loc = s - lcp + offset - buf_;
|
||||||
|
if (loc + LEN + 4 > end_ || Pattern::predict_match(pma, &buf_[loc + LEN]) == 0)
|
||||||
|
{
|
||||||
|
set_current(loc);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mask &= mask - 1;
|
||||||
|
}
|
||||||
|
s += 64;
|
||||||
|
}
|
||||||
|
s -= lcp;
|
||||||
|
loc = s - buf_;
|
||||||
|
set_current_and_peek_more(loc - 1);
|
||||||
|
loc = cur_ + 1;
|
||||||
|
if (loc + LEN + min > end_)
|
||||||
|
return false;
|
||||||
|
if (loc + LEN + min + 63 > end_)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return advance_chars_pma<LEN>(loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Few chars followed by 4 minimal char pattern
|
||||||
|
template<uint8_t LEN>
|
||||||
|
bool Matcher::simd_advance_chars_pmh_avx512bw(size_t loc)
|
||||||
|
{
|
||||||
|
static const uint16_t lcp = 0;
|
||||||
|
static const uint16_t lcs = LEN - 1;
|
||||||
|
const Pattern::Pred *pmh = pat_->pmh_;
|
||||||
|
const char *chr = pat_->chr_;
|
||||||
|
size_t min = pat_->min_;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *s = buf_ + loc + lcp;
|
||||||
|
const char *e = buf_ + end_ + lcp - LEN + 1;
|
||||||
|
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
|
||||||
|
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
|
||||||
|
while (s <= e - 64)
|
||||||
|
{
|
||||||
|
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
|
||||||
|
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
|
||||||
|
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
|
||||||
|
while (mask != 0)
|
||||||
|
{
|
||||||
|
uint32_t offset = ctzl(mask);
|
||||||
|
if (LEN == 2 ||
|
||||||
|
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
|
||||||
|
{
|
||||||
|
loc = s - lcp + offset - buf_;
|
||||||
|
if (loc + LEN + min > end_ || Pattern::predict_match(pmh, &buf_[loc + LEN], min))
|
||||||
|
{
|
||||||
|
set_current(loc);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mask &= mask - 1;
|
||||||
|
}
|
||||||
|
s += 64;
|
||||||
|
}
|
||||||
|
s -= lcp;
|
||||||
|
loc = s - buf_;
|
||||||
|
set_current_and_peek_more(loc - 1);
|
||||||
|
loc = cur_ + 1;
|
||||||
|
if (loc + LEN + min > end_)
|
||||||
|
return false;
|
||||||
|
if (loc + LEN + min + 63 > end_)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return advance_chars_pmh<LEN>(loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
|
||||||
|
bool Matcher::simd_advance_string_avx512bw(size_t loc)
|
||||||
|
{
|
||||||
|
const char *chr = pat_->chr_;
|
||||||
|
size_t len = pat_->len_;
|
||||||
|
uint16_t lcp = pat_->lcp_;
|
||||||
|
uint16_t lcs = pat_->lcs_;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *s = buf_ + loc + lcp;
|
||||||
|
const char *e = buf_ + end_ + lcp - len + 1;
|
||||||
|
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
|
||||||
|
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
|
||||||
|
while (s <= e - 64)
|
||||||
|
{
|
||||||
|
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
|
||||||
|
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
|
||||||
|
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
|
||||||
|
while (mask != 0)
|
||||||
|
{
|
||||||
|
uint32_t offset = ctzl(mask);
|
||||||
|
if (std::memcmp(s - lcp + offset, chr, len) == 0)
|
||||||
|
{
|
||||||
|
loc = s - lcp + offset - buf_;
|
||||||
|
set_current(loc);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
mask &= mask - 1;
|
||||||
|
}
|
||||||
|
s += 64;
|
||||||
|
}
|
||||||
|
s -= lcp;
|
||||||
|
loc = s - buf_;
|
||||||
|
set_current_and_peek_more(loc - 1);
|
||||||
|
loc = cur_ + 1;
|
||||||
|
if (loc + len > end_)
|
||||||
|
return false;
|
||||||
|
if (loc + len + 63 > end_)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return advance_string(loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(WITH_STRING_PM)
|
||||||
|
|
||||||
|
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
|
||||||
|
bool Matcher::simd_advance_string_pma_avx512bw(size_t loc)
|
||||||
|
{
|
||||||
|
const Pattern::Pred *pma = pat_->pma_;
|
||||||
|
const char *chr = pat_->chr_;
|
||||||
|
size_t len = pat_->len_;
|
||||||
|
size_t min = pat_->min_;
|
||||||
|
uint16_t lcp = pat_->lcp_;
|
||||||
|
uint16_t lcs = pat_->lcs_;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *s = buf_ + loc + lcp;
|
||||||
|
const char *e = buf_ + end_ + lcp - len + 1;
|
||||||
|
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
|
||||||
|
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
|
||||||
|
while (s <= e - 64)
|
||||||
|
{
|
||||||
|
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
|
||||||
|
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
|
||||||
|
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
|
||||||
|
while (mask != 0)
|
||||||
|
{
|
||||||
|
uint32_t offset = ctzl(mask);
|
||||||
|
if (std::memcmp(s - lcp + offset, chr, len) == 0)
|
||||||
|
{
|
||||||
|
loc = s - lcp + offset - buf_;
|
||||||
|
if (loc + len + 4 > end_ || Pattern::predict_match(pma, &buf_[loc + len]) == 0)
|
||||||
|
{
|
||||||
|
set_current(loc);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mask &= mask - 1;
|
||||||
|
}
|
||||||
|
s += 64;
|
||||||
|
}
|
||||||
|
s -= lcp;
|
||||||
|
loc = s - buf_;
|
||||||
|
set_current_and_peek_more(loc - 1);
|
||||||
|
loc = cur_ + 1;
|
||||||
|
if (loc + len + min > end_)
|
||||||
|
return false;
|
||||||
|
if (loc + len + min + 63 > end_)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return advance_string_pma(loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
|
||||||
|
bool Matcher::simd_advance_string_pmh_avx512bw(size_t loc)
|
||||||
|
{
|
||||||
|
const Pattern::Pred *pmh = pat_->pmh_;
|
||||||
|
const char *chr = pat_->chr_;
|
||||||
|
size_t len = pat_->len_;
|
||||||
|
size_t min = pat_->min_;
|
||||||
|
uint16_t lcp = pat_->lcp_;
|
||||||
|
uint16_t lcs = pat_->lcs_;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *s = buf_ + loc + lcp;
|
||||||
|
const char *e = buf_ + end_ + lcp - len + 1;
|
||||||
|
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
|
||||||
|
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
|
||||||
|
while (s <= e - 64)
|
||||||
|
{
|
||||||
|
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
|
||||||
|
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
|
||||||
|
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
|
||||||
|
while (mask != 0)
|
||||||
|
{
|
||||||
|
uint32_t offset = ctzl(mask);
|
||||||
|
if (std::memcmp(s - lcp + offset, chr, len) == 0)
|
||||||
|
{
|
||||||
|
loc = s - lcp + offset - buf_;
|
||||||
|
if (loc + len + min > end_ || Pattern::predict_match(pmh, &buf_[loc + len], min))
|
||||||
|
{
|
||||||
|
set_current(loc);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mask &= mask - 1;
|
||||||
|
}
|
||||||
|
s += 64;
|
||||||
|
}
|
||||||
|
s -= lcp;
|
||||||
|
loc = s - buf_;
|
||||||
|
set_current_and_peek_more(loc - 1);
|
||||||
|
loc = cur_ + 1;
|
||||||
|
if (loc + len + min > end_)
|
||||||
|
return false;
|
||||||
|
if (loc + len + min + 63 > end_)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return advance_string_pmh(loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // WITH_STRING_PM
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
// appease ranlib "has no symbols"
|
||||||
|
void matcher_not_compiled_with_avx512bw() { }
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // namespace reflex
|
||||||
|
|
|
@ -45,10 +45,10 @@
|
||||||
/// DFA compaction: -1 == reverse order edge compression (best); 1 == edge compression; 0 == no edge compression.
|
/// DFA compaction: -1 == reverse order edge compression (best); 1 == edge compression; 0 == no edge compression.
|
||||||
/** Edge compression reorders edges to produce fewer tests when executed in the compacted order.
|
/** Edge compression reorders edges to produce fewer tests when executed in the compacted order.
|
||||||
For example ([a-cg-ik]|d|[e-g]|j|y|[x-z]) after reverse edge compression has only 2 edges:
|
For example ([a-cg-ik]|d|[e-g]|j|y|[x-z]) after reverse edge compression has only 2 edges:
|
||||||
c1 = m.FSM_CHAR();
|
c = m.FSM_CHAR();
|
||||||
if ('x' <= c1 && c1 <= 'z') goto S3;
|
if ('x' <= c && c <= 'z') goto S3;
|
||||||
if ('a' <= c1 && c1 <= 'k') goto S3;
|
if ('a' <= c && c <= 'k') goto S3;
|
||||||
return m.FSM_HALT(c1);
|
return m.FSM_HALT(c);
|
||||||
*/
|
*/
|
||||||
#define WITH_COMPACT_DFA -1
|
#define WITH_COMPACT_DFA -1
|
||||||
|
|
||||||
|
@ -177,6 +177,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
bmd_ = 0;
|
bmd_ = 0;
|
||||||
npy_ = 0;
|
npy_ = 0;
|
||||||
one_ = false;
|
one_ = false;
|
||||||
|
bol_ = false;
|
||||||
vno_ = 0;
|
vno_ = 0;
|
||||||
eno_ = 0;
|
eno_ = 0;
|
||||||
hno_ = 0;
|
hno_ = 0;
|
||||||
|
@ -197,36 +198,43 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
len_ = pred[0];
|
len_ = pred[0];
|
||||||
min_ = pred[1] & 0x0f;
|
min_ = pred[1] & 0x0f;
|
||||||
one_ = pred[1] & 0x10;
|
one_ = pred[1] & 0x10;
|
||||||
|
bol_ = pred[1] & 0x40;
|
||||||
memcpy(chr_, pred + 2, len_);
|
memcpy(chr_, pred + 2, len_);
|
||||||
size_t n = len_ + 2;
|
size_t n = 2 + len_;
|
||||||
if (len_ == 0)
|
if (len_ == 0)
|
||||||
{
|
{
|
||||||
|
// get bitap bit_[] parameters
|
||||||
for (size_t i = 0; i < 256; ++i)
|
for (size_t i = 0; i < 256; ++i)
|
||||||
bit_[i] = ~pred[i + n];
|
bit_[i] = ~pred[i + n];
|
||||||
n += 256;
|
n += 256;
|
||||||
}
|
}
|
||||||
if (min_ >= 4)
|
if (min_ < 4)
|
||||||
{
|
|
||||||
for (size_t i = 0; i < Const::HASH; ++i)
|
|
||||||
pmh_[i] = ~pred[i + n];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
|
// get predict match PM4 pma_[] parameters
|
||||||
for (size_t i = 0; i < Const::HASH; ++i)
|
for (size_t i = 0; i < Const::HASH; ++i)
|
||||||
pma_[i] = ~pred[i + n];
|
pma_[i] = ~pred[i + n];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// get predict match hash pmh_[] parameters
|
||||||
|
for (size_t i = 0; i < Const::HASH; ++i)
|
||||||
|
pmh_[i] = ~pred[i + n];
|
||||||
|
}
|
||||||
|
n += Const::HASH;
|
||||||
if ((pred[1] & 0x20) != 0)
|
if ((pred[1] & 0x20) != 0)
|
||||||
{
|
{
|
||||||
n += Const::HASH;
|
// get lookback parameters lbk_ lbm_ and cbk_[] after s-t cut and first s-t cut pattern characters fst_[]
|
||||||
lbk_ = pred[n + 0] | (pred[n + 1] << 8);
|
lbk_ = pred[n + 0] | (pred[n + 1] << 8);
|
||||||
lbm_ = pred[n + 2] | (pred[n + 3] << 8);
|
lbm_ = pred[n + 2] | (pred[n + 3] << 8);
|
||||||
for (size_t i = 0; i < 256; ++i)
|
for (size_t i = 0; i < 256; ++i)
|
||||||
cbk_.set(i, pred[n + 4 + (i >> 3)] & (1 << (i & 7)));
|
cbk_.set(i, pred[n + 4 + (i >> 3)] & (1 << (i & 7)));
|
||||||
for (size_t i = 0; i < 256; ++i)
|
for (size_t i = 0; i < 256; ++i)
|
||||||
fst_.set(i, pred[n + 32 + 4 + (i >> 3)] & (1 << (i & 7)));
|
fst_.set(i, pred[n + 4 + 32 + (i >> 3)] & (1 << (i & 7)));
|
||||||
|
n += 4 + 32 + 32;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// get first pattern characters fst_[] from bitap
|
||||||
for (size_t i = 0; i < 256; ++i)
|
for (size_t i = 0; i < 256; ++i)
|
||||||
fst_.set(i, (bit_[i] & 1) == 0);
|
fst_.set(i, (bit_[i] & 1) == 0);
|
||||||
}
|
}
|
||||||
|
@ -311,8 +319,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
}
|
}
|
||||||
// needle count and frequency thresholds to enable needle-based search
|
// needle count and frequency thresholds to enable needle-based search
|
||||||
uint16_t pinmax = 8;
|
uint16_t pinmax = 8;
|
||||||
uint8_t freqmax1 = 91; // one position
|
uint8_t freqmax = 251;
|
||||||
uint8_t freqmax2 = 251; // two positions
|
|
||||||
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
|
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
|
||||||
if (have_HW_AVX512BW() || have_HW_AVX2())
|
if (have_HW_AVX512BW() || have_HW_AVX2())
|
||||||
pinmax = 16;
|
pinmax = 16;
|
||||||
|
@ -331,7 +338,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
lcs_ = 0;
|
lcs_ = 0;
|
||||||
uint16_t nlcp = 65535; // max and undefined
|
uint16_t nlcp = 65535; // max and undefined
|
||||||
uint16_t nlcs = 65535; // max and undefined
|
uint16_t nlcs = 65535; // max and undefined
|
||||||
uint16_t freqsum = 0;
|
|
||||||
uint8_t freqlcp = 255; // max
|
uint8_t freqlcp = 255; // max
|
||||||
uint8_t freqlcs = 255; // max
|
uint8_t freqlcs = 255; // max
|
||||||
size_t min = (min_ == 0 ? 1 : min_);
|
size_t min = (min_ == 0 ? 1 : min_);
|
||||||
|
@ -339,7 +345,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
{
|
{
|
||||||
Pred mask = 1 << k;
|
Pred mask = 1 << k;
|
||||||
uint16_t n = 0;
|
uint16_t n = 0;
|
||||||
uint16_t sum = 0;
|
|
||||||
uint8_t max = 0;
|
uint8_t max = 0;
|
||||||
// at position k count the matching characters and find the max character frequency
|
// at position k count the matching characters and find the max character frequency
|
||||||
for (uint16_t i = 0; i < 256; ++i)
|
for (uint16_t i = 0; i < 256; ++i)
|
||||||
|
@ -348,14 +353,13 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
{
|
{
|
||||||
++n;
|
++n;
|
||||||
uint8_t freq = frequency(static_cast<uint8_t>(i));
|
uint8_t freq = frequency(static_cast<uint8_t>(i));
|
||||||
sum += freq;
|
|
||||||
if (freq > max)
|
if (freq > max)
|
||||||
max = freq;
|
max = freq;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (n <= pinmax)
|
if (n <= pinmax)
|
||||||
{
|
{
|
||||||
// pick the fewest and rarest (least frequently occurring) needles to search
|
// pick the fewest and rarest (less frequently occurring) needles to search
|
||||||
if (max < freqlcp || (n < nlcp && max == freqlcp))
|
if (max < freqlcp || (n < nlcp && max == freqlcp))
|
||||||
{
|
{
|
||||||
lcs_ = lcp_;
|
lcs_ = lcp_;
|
||||||
|
@ -363,7 +367,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
freqlcs = freqlcp;
|
freqlcs = freqlcp;
|
||||||
lcp_ = static_cast<uint8_t>(k);
|
lcp_ = static_cast<uint8_t>(k);
|
||||||
nlcp = n;
|
nlcp = n;
|
||||||
freqsum = sum;
|
|
||||||
freqlcp = max;
|
freqlcp = max;
|
||||||
}
|
}
|
||||||
else if (n < nlcs ||
|
else if (n < nlcs ||
|
||||||
|
@ -377,8 +380,8 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// one position to pin: make lcp and lcs equal (compared and optimized later)
|
// one position to pin: make lcp and lcs equal to 0 (only one position at 0)
|
||||||
if (min == 1 || ((freqsum <= freqlcp || nlcs == 65535) && freqsum <= freqmax1))
|
if (min == 1 || nlcs == 65535)
|
||||||
{
|
{
|
||||||
nlcs = nlcp;
|
nlcs = nlcp;
|
||||||
lcs_ = lcp_;
|
lcs_ = lcp_;
|
||||||
|
@ -387,7 +390,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
uint16_t n = nlcp > nlcs ? nlcp : nlcs;
|
uint16_t n = nlcp > nlcs ? nlcp : nlcs;
|
||||||
DBGLOG("min=%zu lcp=%hu(%hu) pin=%hu nlcp=%hu(%hu) freq=%hu(%hu) freqsum=%hu npy=%zu", min, lcp_, lcs_, n, nlcp, nlcs, freqlcp, freqlcs, freqsum, npy_);
|
DBGLOG("min=%zu lcp=%hu(%hu) pin=%hu nlcp=%hu(%hu) freq=%hu(%hu) freqsum=%hu npy=%zu", min, lcp_, lcs_, n, nlcp, nlcs, freqlcp, freqlcs, freqsum, npy_);
|
||||||
// determine if a needle-based search is worthwhile, below or meeting the thresholds
|
// determine if a needle-based search is worthwhile, below or meeting the thresholds
|
||||||
if (n <= pinmax && freqlcp <= freqmax2)
|
if (n <= pinmax && freqlcp <= freqmax)
|
||||||
{
|
{
|
||||||
// bridge the gap from 9 to 16 to handle 9 to 16 combined
|
// bridge the gap from 9 to 16 to handle 9 to 16 combined
|
||||||
if (n > 8)
|
if (n > 8)
|
||||||
|
@ -412,7 +415,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
}
|
}
|
||||||
else if (len_ > 1)
|
else if (len_ > 1)
|
||||||
{
|
{
|
||||||
// Boyer-Moore preprocessing of the given string pattern pat of length len, generates bmd_ > 0 and bms_[] shifts
|
// produce lcp and lcs positions and Boyer-Moore bms_[] shifts when bmd_ > 0
|
||||||
uint8_t n = static_cast<uint8_t>(len_); // okay to cast: actually never more than 255
|
uint8_t n = static_cast<uint8_t>(len_); // okay to cast: actually never more than 255
|
||||||
uint16_t i;
|
uint16_t i;
|
||||||
for (i = 0; i < 256; ++i)
|
for (i = 0; i < 256; ++i)
|
||||||
|
@ -433,13 +436,14 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
lcs_ = lcp_;
|
lcs_ = lcp_;
|
||||||
lcp_ = i;
|
lcp_ = i;
|
||||||
}
|
}
|
||||||
else if (lcpch != pch && frequency(lcsch) > freqpch)
|
else if (frequency(lcsch) > freqpch ||
|
||||||
|
(frequency(lcsch) == freqpch &&
|
||||||
|
abs(static_cast<int>(lcp_) - static_cast<int>(lcs_)) < abs(static_cast<int>(lcp_) - static_cast<int>(i))))
|
||||||
{
|
{
|
||||||
lcs_ = i;
|
lcs_ = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
DBGLOG("len=%zu lcp=%hu(%hu)", len_, lcp_, lcs_);
|
|
||||||
uint16_t j;
|
uint16_t j;
|
||||||
for (i = n - 1, j = i; j > 0; --j)
|
for (i = n - 1, j = i; j > 0; --j)
|
||||||
if (chr_[j - 1] == chr_[i])
|
if (chr_[j - 1] == chr_[i])
|
||||||
|
@ -469,7 +473,34 @@ void Pattern::init(const char *options, const uint8_t *pred)
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
if (lcs_ < 0xffff)
|
if (lcs_ < 0xffff)
|
||||||
bmd_ = 0; // do not use B-M
|
{
|
||||||
|
// do not use B-M
|
||||||
|
bmd_ = 0;
|
||||||
|
// spread lcp and lcs apart if lcp and lcs are adjacent (chars are possibly correlated)
|
||||||
|
if (len_ == 3 && (lcp_ == 1 || lcs_ == 1))
|
||||||
|
{
|
||||||
|
lcp_ = 0;
|
||||||
|
lcs_ = 2;
|
||||||
|
}
|
||||||
|
else if (len_ > 3 && (lcp_ + 1 == lcs_ || lcs_ + 1 == lcp_))
|
||||||
|
{
|
||||||
|
uint8_t freqlcs = 255;
|
||||||
|
for (i = 0; i < n; ++i)
|
||||||
|
{
|
||||||
|
if (i > lcp_ + 1 || i + 1 < lcp_)
|
||||||
|
{
|
||||||
|
uint8_t pch = static_cast<uint8_t>(chr_[i]);
|
||||||
|
uint8_t freqpch = frequency(pch);
|
||||||
|
if (freqlcs > freqpch)
|
||||||
|
{
|
||||||
|
lcs_ = i;
|
||||||
|
freqlcs = freqpch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DBGLOG("len=%zu bmd=%zu lcp=%hu(%hu)", len_, bmd_, lcp_, lcs_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -629,6 +660,7 @@ void Pattern::parse(
|
||||||
loc = 0;
|
loc = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
bol_ = at(loc) == '^';
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
Location end = loc;
|
Location end = loc;
|
||||||
|
@ -733,6 +765,8 @@ void Pattern::parse(
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
if (at(loc) != '^')
|
||||||
|
bol_ = false;
|
||||||
parse2(
|
parse2(
|
||||||
true,
|
true,
|
||||||
loc,
|
loc,
|
||||||
|
@ -2961,8 +2995,8 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
::fprintf(file,
|
::fprintf(file,
|
||||||
"void reflex_code_%s(reflex::Matcher& m)\n"
|
"void reflex_code_%s(reflex::Matcher& m)\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
" int c0 = 0, c1 = 0;\n"
|
" int c = 0;\n"
|
||||||
" m.FSM_INIT(c1);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str());
|
" m.FSM_INIT(c);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str());
|
||||||
for (const DFA::State *state = start; state != NULL; state = state->next)
|
for (const DFA::State *state = start; state != NULL; state = state->next)
|
||||||
{
|
{
|
||||||
::fprintf(file, "\nS%u:\n", state->index);
|
::fprintf(file, "\nS%u:\n", state->index);
|
||||||
|
@ -2978,8 +3012,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
::fprintf(file, " m.FSM_HEAD(%u);\n", *i);
|
::fprintf(file, " m.FSM_HEAD(%u);\n", *i);
|
||||||
if (state->edges.rbegin() != state->edges.rend() && state->edges.rbegin()->first == META_DED)
|
if (state->edges.rbegin() != state->edges.rend() && state->edges.rbegin()->first == META_DED)
|
||||||
::fprintf(file, " if (m.FSM_DENT()) goto S%u;\n", state->edges.rbegin()->second.second->index);
|
::fprintf(file, " if (m.FSM_DENT()) goto S%u;\n", state->edges.rbegin()->second.second->index);
|
||||||
bool peek = false; // if we need to read a character into c1
|
bool peek = false; // if we need to read a character into c
|
||||||
bool prev = false; // if we need to keep the previous character in c0
|
|
||||||
for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
|
for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
|
||||||
{
|
{
|
||||||
#if WITH_COMPACT_DFA == -1
|
#if WITH_COMPACT_DFA == -1
|
||||||
|
@ -2993,13 +3026,12 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
{
|
{
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (lo == META_EOB || lo == META_EOL)
|
if (lo == META_EOB || lo == META_EOL || lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
|
||||||
|
{
|
||||||
peek = true;
|
peek = true;
|
||||||
else if (lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
|
|
||||||
prev = peek = true;
|
|
||||||
if (prev && peek)
|
|
||||||
break;
|
break;
|
||||||
check_dfa_closure(i->second.second, 1, peek, prev);
|
}
|
||||||
|
check_dfa_closure(i->second.second, 1, peek);
|
||||||
} while (++lo <= hi);
|
} while (++lo <= hi);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -3025,10 +3057,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
target_index = i->second.second->index;
|
target_index = i->second.second->index;
|
||||||
if (read)
|
if (read)
|
||||||
{
|
{
|
||||||
if (prev)
|
::fprintf(file, " c = m.FSM_CHAR();\n");
|
||||||
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
|
|
||||||
else
|
|
||||||
::fprintf(file, " c1 = m.FSM_CHAR();\n");
|
|
||||||
read = false;
|
read = false;
|
||||||
}
|
}
|
||||||
if (is_meta(lo))
|
if (is_meta(lo))
|
||||||
|
@ -3039,14 +3068,6 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
{
|
{
|
||||||
case META_EOB:
|
case META_EOB:
|
||||||
case META_EOL:
|
case META_EOL:
|
||||||
::fprintf(file, " ");
|
|
||||||
if (elif)
|
|
||||||
::fprintf(file, "else ");
|
|
||||||
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
|
|
||||||
gencode_dfa_closure(file, i->second.second, 2, peek);
|
|
||||||
::fprintf(file, " }\n");
|
|
||||||
elif = true;
|
|
||||||
break;
|
|
||||||
case META_EWE:
|
case META_EWE:
|
||||||
case META_BWE:
|
case META_BWE:
|
||||||
case META_NWE:
|
case META_NWE:
|
||||||
|
@ -3054,7 +3075,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
::fprintf(file, " ");
|
::fprintf(file, " ");
|
||||||
if (elif)
|
if (elif)
|
||||||
::fprintf(file, "else ");
|
::fprintf(file, "else ");
|
||||||
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
|
::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
|
||||||
gencode_dfa_closure(file, i->second.second, 2, peek);
|
gencode_dfa_closure(file, i->second.second, 2, peek);
|
||||||
::fprintf(file, " }\n");
|
::fprintf(file, " }\n");
|
||||||
elif = true;
|
elif = true;
|
||||||
|
@ -3077,7 +3098,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
break;
|
break;
|
||||||
if (lo == hi)
|
if (lo == hi)
|
||||||
{
|
{
|
||||||
::fprintf(file, " if (c1 == ");
|
::fprintf(file, " if (c == ");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, ")");
|
::fprintf(file, ")");
|
||||||
}
|
}
|
||||||
|
@ -3085,20 +3106,20 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
{
|
{
|
||||||
::fprintf(file, " if (");
|
::fprintf(file, " if (");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, " <= c1)");
|
::fprintf(file, " <= c)");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
::fprintf(file, " if (");
|
::fprintf(file, " if (");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, " <= c1 && c1 <= ");
|
::fprintf(file, " <= c && c <= ");
|
||||||
print_char(file, hi);
|
print_char(file, hi);
|
||||||
::fprintf(file, ")");
|
::fprintf(file, ")");
|
||||||
}
|
}
|
||||||
if (target_index == Const::IMAX)
|
if (target_index == Const::IMAX)
|
||||||
{
|
{
|
||||||
if (peek)
|
if (peek)
|
||||||
::fprintf(file, " return m.FSM_HALT(c1);\n");
|
::fprintf(file, " return m.FSM_HALT(c);\n");
|
||||||
else
|
else
|
||||||
::fprintf(file, " return m.FSM_HALT();\n");
|
::fprintf(file, " return m.FSM_HALT();\n");
|
||||||
}
|
}
|
||||||
|
@ -3117,10 +3138,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
{
|
{
|
||||||
if (read)
|
if (read)
|
||||||
{
|
{
|
||||||
if (prev)
|
::fprintf(file, " c = m.FSM_CHAR();\n");
|
||||||
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
|
|
||||||
else
|
|
||||||
::fprintf(file, " c1 = m.FSM_CHAR();\n");
|
|
||||||
read = false;
|
read = false;
|
||||||
}
|
}
|
||||||
do
|
do
|
||||||
|
@ -3129,14 +3147,6 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
{
|
{
|
||||||
case META_EOB:
|
case META_EOB:
|
||||||
case META_EOL:
|
case META_EOL:
|
||||||
::fprintf(file, " ");
|
|
||||||
if (elif)
|
|
||||||
::fprintf(file, "else ");
|
|
||||||
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
|
|
||||||
gencode_dfa_closure(file, i->second.second, 2, peek);
|
|
||||||
::fprintf(file, " }\n");
|
|
||||||
elif = true;
|
|
||||||
break;
|
|
||||||
case META_EWE:
|
case META_EWE:
|
||||||
case META_BWE:
|
case META_BWE:
|
||||||
case META_NWE:
|
case META_NWE:
|
||||||
|
@ -3144,7 +3154,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
::fprintf(file, " ");
|
::fprintf(file, " ");
|
||||||
if (elif)
|
if (elif)
|
||||||
::fprintf(file, "else ");
|
::fprintf(file, "else ");
|
||||||
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
|
::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
|
||||||
gencode_dfa_closure(file, i->second.second, 2, peek);
|
gencode_dfa_closure(file, i->second.second, 2, peek);
|
||||||
::fprintf(file, " }\n");
|
::fprintf(file, " }\n");
|
||||||
elif = true;
|
elif = true;
|
||||||
|
@ -3170,10 +3180,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
target_index = i->second.second->index;
|
target_index = i->second.second->index;
|
||||||
if (read)
|
if (read)
|
||||||
{
|
{
|
||||||
if (prev)
|
::fprintf(file, " c = m.FSM_CHAR();\n");
|
||||||
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
|
|
||||||
else
|
|
||||||
::fprintf(file, " c1 = m.FSM_CHAR();\n");
|
|
||||||
read = false;
|
read = false;
|
||||||
}
|
}
|
||||||
if (!is_meta(lo))
|
if (!is_meta(lo))
|
||||||
|
@ -3183,7 +3190,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
break;
|
break;
|
||||||
if (lo == hi)
|
if (lo == hi)
|
||||||
{
|
{
|
||||||
::fprintf(file, " if (c1 == ");
|
::fprintf(file, " if (c == ");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, ")");
|
::fprintf(file, ")");
|
||||||
}
|
}
|
||||||
|
@ -3191,20 +3198,20 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
{
|
{
|
||||||
::fprintf(file, " if (");
|
::fprintf(file, " if (");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, " <= c1)");
|
::fprintf(file, " <= c)");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
::fprintf(file, " if (");
|
::fprintf(file, " if (");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, " <= c1 && c1 <= ");
|
::fprintf(file, " <= c && c <= ");
|
||||||
print_char(file, hi);
|
print_char(file, hi);
|
||||||
::fprintf(file, ")");
|
::fprintf(file, ")");
|
||||||
}
|
}
|
||||||
if (target_index == Const::IMAX)
|
if (target_index == Const::IMAX)
|
||||||
{
|
{
|
||||||
if (peek)
|
if (peek)
|
||||||
::fprintf(file, " return m.FSM_HALT(c1);\n");
|
::fprintf(file, " return m.FSM_HALT(c);\n");
|
||||||
else
|
else
|
||||||
::fprintf(file, " return m.FSM_HALT();\n");
|
::fprintf(file, " return m.FSM_HALT();\n");
|
||||||
}
|
}
|
||||||
|
@ -3216,7 +3223,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if (peek)
|
if (peek)
|
||||||
::fprintf(file, " return m.FSM_HALT(c1);\n");
|
::fprintf(file, " return m.FSM_HALT(c);\n");
|
||||||
else
|
else
|
||||||
::fprintf(file, " return m.FSM_HALT();\n");
|
::fprintf(file, " return m.FSM_HALT();\n");
|
||||||
}
|
}
|
||||||
|
@ -3234,7 +3241,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef WITH_NO_CODEGEN
|
#ifndef WITH_NO_CODEGEN
|
||||||
void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, bool& prev) const
|
void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek) const
|
||||||
{
|
{
|
||||||
if (nest > 5)
|
if (nest > 5)
|
||||||
return;
|
return;
|
||||||
|
@ -3251,13 +3258,12 @@ void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, b
|
||||||
{
|
{
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (lo == META_EOB || lo == META_EOL)
|
if (lo == META_EOB || lo == META_EOL || lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
|
||||||
|
{
|
||||||
peek = true;
|
peek = true;
|
||||||
else if (lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
|
|
||||||
prev = peek = true;
|
|
||||||
if (prev && peek)
|
|
||||||
break;
|
break;
|
||||||
check_dfa_closure(i->second.second, nest + 1, peek, prev);
|
}
|
||||||
|
check_dfa_closure(i->second.second, nest + 1, peek);
|
||||||
} while (++lo <= hi);
|
} while (++lo <= hi);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3271,14 +3277,14 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
|
||||||
if (state->redo)
|
if (state->redo)
|
||||||
{
|
{
|
||||||
if (peek)
|
if (peek)
|
||||||
::fprintf(file, "%*sm.FSM_REDO(c1);\n", 2*nest, "");
|
::fprintf(file, "%*sm.FSM_REDO(c);\n", 2*nest, "");
|
||||||
else
|
else
|
||||||
::fprintf(file, "%*sm.FSM_REDO();\n", 2*nest, "");
|
::fprintf(file, "%*sm.FSM_REDO();\n", 2*nest, "");
|
||||||
}
|
}
|
||||||
else if (state->accept > 0)
|
else if (state->accept > 0)
|
||||||
{
|
{
|
||||||
if (peek)
|
if (peek)
|
||||||
::fprintf(file, "%*sm.FSM_TAKE(%u, c1);\n", 2*nest, "", state->accept);
|
::fprintf(file, "%*sm.FSM_TAKE(%u, c);\n", 2*nest, "", state->accept);
|
||||||
else
|
else
|
||||||
::fprintf(file, "%*sm.FSM_TAKE(%u);\n", 2*nest, "", state->accept);
|
::fprintf(file, "%*sm.FSM_TAKE(%u);\n", 2*nest, "", state->accept);
|
||||||
}
|
}
|
||||||
|
@ -3303,14 +3309,6 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
|
||||||
{
|
{
|
||||||
case META_EOB:
|
case META_EOB:
|
||||||
case META_EOL:
|
case META_EOL:
|
||||||
::fprintf(file, "%*s", 2*nest, "");
|
|
||||||
if (elif)
|
|
||||||
::fprintf(file, "else ");
|
|
||||||
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
|
|
||||||
gencode_dfa_closure(file, i->second.second, nest + 1, peek);
|
|
||||||
::fprintf(file, "%*s}\n", 2*nest, "");
|
|
||||||
elif = true;
|
|
||||||
break;
|
|
||||||
case META_EWE:
|
case META_EWE:
|
||||||
case META_BWE:
|
case META_BWE:
|
||||||
case META_NWE:
|
case META_NWE:
|
||||||
|
@ -3318,7 +3316,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
|
||||||
::fprintf(file, "%*s", 2*nest, "");
|
::fprintf(file, "%*s", 2*nest, "");
|
||||||
if (elif)
|
if (elif)
|
||||||
::fprintf(file, "else ");
|
::fprintf(file, "else ");
|
||||||
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
|
::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
|
||||||
gencode_dfa_closure(file, i->second.second, nest + 1, peek);
|
gencode_dfa_closure(file, i->second.second, nest + 1, peek);
|
||||||
::fprintf(file, "%*s}\n", 2*nest, "");
|
::fprintf(file, "%*s}\n", 2*nest, "");
|
||||||
elif = true;
|
elif = true;
|
||||||
|
@ -3346,7 +3344,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
|
||||||
::fprintf(file, "%*s", 2*nest, "");
|
::fprintf(file, "%*s", 2*nest, "");
|
||||||
if (lo == hi)
|
if (lo == hi)
|
||||||
{
|
{
|
||||||
::fprintf(file, "if (c1 == ");
|
::fprintf(file, "if (c == ");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, ")");
|
::fprintf(file, ")");
|
||||||
}
|
}
|
||||||
|
@ -3354,20 +3352,20 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
|
||||||
{
|
{
|
||||||
::fprintf(file, "if (");
|
::fprintf(file, "if (");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, " <= c1)");
|
::fprintf(file, " <= c)");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
::fprintf(file, "if (");
|
::fprintf(file, "if (");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, " <= c1 && c1 <= ");
|
::fprintf(file, " <= c && c <= ");
|
||||||
print_char(file, hi);
|
print_char(file, hi);
|
||||||
::fprintf(file, ")");
|
::fprintf(file, ")");
|
||||||
}
|
}
|
||||||
if (target_index == Const::IMAX)
|
if (target_index == Const::IMAX)
|
||||||
{
|
{
|
||||||
if (peek)
|
if (peek)
|
||||||
::fprintf(file, " return m.FSM_HALT(c1);\n");
|
::fprintf(file, " return m.FSM_HALT(c);\n");
|
||||||
else
|
else
|
||||||
::fprintf(file, " return m.FSM_HALT();\n");
|
::fprintf(file, " return m.FSM_HALT();\n");
|
||||||
}
|
}
|
||||||
|
@ -3394,7 +3392,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
|
||||||
::fprintf(file, "%*s", 2*nest, "");
|
::fprintf(file, "%*s", 2*nest, "");
|
||||||
if (lo == hi)
|
if (lo == hi)
|
||||||
{
|
{
|
||||||
::fprintf(file, "if (c1 == ");
|
::fprintf(file, "if (c == ");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, ")");
|
::fprintf(file, ")");
|
||||||
}
|
}
|
||||||
|
@ -3402,20 +3400,20 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
|
||||||
{
|
{
|
||||||
::fprintf(file, "if (");
|
::fprintf(file, "if (");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, " <= c1)");
|
::fprintf(file, " <= c)");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
::fprintf(file, "if (");
|
::fprintf(file, "if (");
|
||||||
print_char(file, lo);
|
print_char(file, lo);
|
||||||
::fprintf(file, " <= c1 && c1 <= ");
|
::fprintf(file, " <= c && c <= ");
|
||||||
print_char(file, hi);
|
print_char(file, hi);
|
||||||
::fprintf(file, ")");
|
::fprintf(file, ")");
|
||||||
}
|
}
|
||||||
if (target_index == Const::IMAX)
|
if (target_index == Const::IMAX)
|
||||||
{
|
{
|
||||||
if (peek)
|
if (peek)
|
||||||
::fprintf(file, " return m.FSM_HALT(c1);\n");
|
::fprintf(file, " return m.FSM_HALT(c);\n");
|
||||||
else
|
else
|
||||||
::fprintf(file, " return m.FSM_HALT();\n");
|
::fprintf(file, " return m.FSM_HALT();\n");
|
||||||
}
|
}
|
||||||
|
@ -4560,26 +4558,31 @@ bool Pattern::match_hfa_transitions(size_t level, const HFA::Hashes& hashes, con
|
||||||
void Pattern::write_predictor(FILE *file) const
|
void Pattern::write_predictor(FILE *file) const
|
||||||
{
|
{
|
||||||
::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (len_ == 0) * 256 + Const::HASH + (lbk_ > 0) * 68);
|
::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (len_ == 0) * 256 + Const::HASH + (lbk_ > 0) * 68);
|
||||||
::fprintf(file, "\n %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4) | ((lbk_ > 0) << 5))));
|
::fprintf(file, "\n %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4) | ((lbk_ > 0) << 5) | (bol_ << 6))));
|
||||||
|
// save match characters chr_[0..len_-1]
|
||||||
for (size_t i = 0; i < len_; ++i)
|
for (size_t i = 0; i < len_; ++i)
|
||||||
::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n ", static_cast<uint8_t>(chr_[i]));
|
::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n ", static_cast<uint8_t>(chr_[i]));
|
||||||
if (len_ == 0)
|
if (len_ == 0)
|
||||||
{
|
{
|
||||||
|
// save bitap bit_[] parameters
|
||||||
for (Char i = 0; i < 256; ++i)
|
for (Char i = 0; i < 256; ++i)
|
||||||
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~bit_[i]));
|
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~bit_[i]));
|
||||||
}
|
}
|
||||||
if (min_ >= 4)
|
if (min_ < 4)
|
||||||
{
|
|
||||||
for (Hash i = 0; i < Const::HASH; ++i)
|
|
||||||
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pmh_[i]));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
|
// save predict match PM4 pma_[] parameters
|
||||||
for (Hash i = 0; i < Const::HASH; ++i)
|
for (Hash i = 0; i < Const::HASH; ++i)
|
||||||
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pma_[i]));
|
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pma_[i]));
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// save predict match hash pmh_[] parameters
|
||||||
|
for (Hash i = 0; i < Const::HASH; ++i)
|
||||||
|
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pmh_[i]));
|
||||||
|
}
|
||||||
if (lbk_ > 0)
|
if (lbk_ > 0)
|
||||||
{
|
{
|
||||||
|
// save lookback parameters lbk_ lbm_ cbk_[] after s-t cut and first s-t cut pattern characters fst_[]
|
||||||
::fprintf(file, "\n %3hhu,%3hhu,%3hhu,%3hhu,", static_cast<uint8_t>(lbk_ & 0xff), static_cast<uint8_t>(lbk_ >> 8), static_cast<uint8_t>(lbm_ & 0xff), static_cast<uint8_t>(lbm_ >> 8));
|
::fprintf(file, "\n %3hhu,%3hhu,%3hhu,%3hhu,", static_cast<uint8_t>(lbk_ & 0xff), static_cast<uint8_t>(lbk_ >> 8), static_cast<uint8_t>(lbm_ & 0xff), static_cast<uint8_t>(lbm_ >> 8));
|
||||||
for (size_t i = 0; i < 256; i += 8)
|
for (size_t i = 0; i < 256; i += 8)
|
||||||
{
|
{
|
||||||
|
|
157
ccl/rslang/import/reflex/lib/simd.cpp
Normal file
157
ccl/rslang/import/reflex/lib/simd.cpp
Normal file
|
@ -0,0 +1,157 @@
|
||||||
|
/******************************************************************************\
|
||||||
|
* Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved. *
|
||||||
|
* *
|
||||||
|
* Redistribution and use in source and binary forms, with or without *
|
||||||
|
* modification, are permitted provided that the following conditions are met: *
|
||||||
|
* *
|
||||||
|
* (1) Redistributions of source code must retain the above copyright notice, *
|
||||||
|
* this list of conditions and the following disclaimer. *
|
||||||
|
* *
|
||||||
|
* (2) Redistributions in binary form must reproduce the above copyright *
|
||||||
|
* notice, this list of conditions and the following disclaimer in the *
|
||||||
|
* documentation and/or other materials provided with the distribution. *
|
||||||
|
* *
|
||||||
|
* (3) The name of the author may not be used to endorse or promote products *
|
||||||
|
* derived from this software without specific prior written permission. *
|
||||||
|
* *
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF *
|
||||||
|
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO *
|
||||||
|
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; *
|
||||||
|
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, *
|
||||||
|
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR *
|
||||||
|
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF *
|
||||||
|
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
|
||||||
|
\******************************************************************************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
@file simd.cpp
|
||||||
|
@brief RE/flex SIMD primitives
|
||||||
|
@author Robert van Engelen - engelen@genivia.com
|
||||||
|
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
|
||||||
|
@copyright (c) BSD-3 License - see LICENSE.txt
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <reflex/simd.h>
|
||||||
|
|
||||||
|
namespace reflex {
|
||||||
|
|
||||||
|
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
|
||||||
|
|
||||||
|
// simd.h get_HW()
|
||||||
|
static uint64_t get_HW()
|
||||||
|
{
|
||||||
|
int CPUInfo1[4] = { 0, 0, 0, 0 };
|
||||||
|
int CPUInfo7[4] = { 0, 0, 0, 0 };
|
||||||
|
cpuidex(CPUInfo1, 0, 0);
|
||||||
|
int n = CPUInfo1[0];
|
||||||
|
if (n <= 0)
|
||||||
|
return 0ULL;
|
||||||
|
cpuidex(CPUInfo1, 1, 0); // cpuid EAX=1
|
||||||
|
if (n >= 7)
|
||||||
|
cpuidex(CPUInfo7, 7, 0); // cpuid EAX=7, ECX=0
|
||||||
|
return static_cast<uint32_t>(CPUInfo1[2]) | (static_cast<uint64_t>(static_cast<uint32_t>(CPUInfo7[1])) << 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
// CPU feature word computed once by get_HW() when this translation unit is initialized
// NOTE(review): presumably the value read by the have_HW_*() checks declared in simd.h - confirm
uint64_t HW = get_HW();
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Count the newline characters ('\n') in the byte range [s, t).
/// @param s points to the first byte to examine
/// @param t points one past the last byte to examine
/// @return the number of bytes in [s, t) equal to '\n'; 0 when the range is empty
///
/// Ranges of at least 256 bytes are first reduced with SIMD code when the library is
/// built with HAVE_AVX512BW, HAVE_AVX2, HAVE_SSE2 or HAVE_NEON. Whatever the SIMD code
/// leaves uncounted is handled by the scalar loops at the end.
size_t nlcount(const char *s, const char *t)
{
  size_t n = 0;
  // compare the length instead of forming t - 256: for a short buffer that pointer
  // would lie before the start of the buffer, which is undefined behavior
  if (t - s >= 256)
  {
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
    // the simd_nlcount_* helpers take s by reference and advance it past the counted part
    if (have_HW_AVX512BW())
      n = simd_nlcount_avx512bw(s, t);
    else if (have_HW_AVX2())
      n = simd_nlcount_avx2(s, t);
    else
#elif defined(HAVE_AVX512BW) || defined(HAVE_AVX2)
    // the simd_nlcount_* helpers take s by reference and advance it past the counted part
    if (have_HW_AVX2())
      n = simd_nlcount_avx2(s, t);
    else
#endif
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
    {
      // SSE2 path: 64 bytes per iteration; e is the last position where a full block fits
      const char *e = t - 64;
      // align on 16 bytes
      while ((reinterpret_cast<std::ptrdiff_t>(s) & 0x0f) != 0)
        n += (*s++ == '\n');
      __m128i vlcn = _mm_set1_epi8('\n');
      while (s <= e)
      {
        __m128i vlcm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
        __m128i vlcm2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 16));
        __m128i vlcm3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 32));
        __m128i vlcm4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 48));
        __m128i vlceq1 = _mm_cmpeq_epi8(vlcm1, vlcn);
        __m128i vlceq2 = _mm_cmpeq_epi8(vlcm2, vlcn);
        __m128i vlceq3 = _mm_cmpeq_epi8(vlcm3, vlcn);
        __m128i vlceq4 = _mm_cmpeq_epi8(vlcm4, vlcn);
        // one mask bit per matching byte, so the popcounts add up to the newlines in the block
        n += popcount(_mm_movemask_epi8(vlceq1))
          +  popcount(_mm_movemask_epi8(vlceq2))
          +  popcount(_mm_movemask_epi8(vlceq3))
          +  popcount(_mm_movemask_epi8(vlceq4));
        s += 64;
      }
    }
#elif defined(HAVE_NEON)
    // NEON path: 64 bytes per iteration; e is the last position where a full block fits
    const char *e = t - 64;
    uint8x16_t vlcn = vdupq_n_u8('\n');
    while (s <= e)
    {
      uint8x16_t vlcm0 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq0 = vceqq_u8(vlcm0, vlcn);
      s += 16;
      uint8x16_t vlcm1 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq1 = vceqq_u8(vlcm1, vlcn);
      s += 16;
      uint8x16_t vlcm2 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq2 = vceqq_u8(vlcm2, vlcn);
      s += 16;
      uint8x16_t vlcm3 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq3 = vceqq_u8(vlcm3, vlcn);
      s += 16;
#if defined(__aarch64__)
      n += vaddvq_s8(vqabsq_s8(vreinterpretq_s8_u8(vaddq_u8(vleq0, vaddq_u8(vleq1, vaddq_u8(vleq2, vleq3))))));
#else
      // my homebrew horizontal sum (we have a very limited range 0..4 to sum to a total max 4x16=64 < 256)
      uint64x2_t vsum = vreinterpretq_u64_s8(vqabsq_s8(vreinterpretq_s8_u8(vaddq_u8(vleq0, vaddq_u8(vleq1, vaddq_u8(vleq2, vleq3))))));
      uint64_t sum0 = vgetq_lane_u64(vsum, 0) + vgetq_lane_u64(vsum, 1);
      uint32_t sum1 = static_cast<uint32_t>(sum0) + (sum0 >> 32);
      uint16_t sum2 = static_cast<uint16_t>(sum1) + (sum1 >> 16);
      n += static_cast<uint8_t>(sum2) + (sum2 >> 8);
#endif
    }
#endif
  }
  // 4-way auto-vectorizable loop
  uint32_t n0 = 0, n1 = 0, n2 = 0, n3 = 0;
  // runs while at least 4 bytes remain; length comparison avoids forming t - 3 on tiny buffers
  while (t - s > 3)
  {
    n0 += s[0] == '\n';
    n1 += s[1] == '\n';
    n2 += s[2] == '\n';
    n3 += s[3] == '\n';
    s += 4;
  }
  n += n0 + n1 + n2 + n3;
  // epilogue: at most 3 bytes are left
  if (s < t)
  {
    n += *s == '\n';
    if (++s < t)
    {
      n += *s == '\n';
      if (++s < t)
        n += *s == '\n';
    }
  }
  return n;
}
|
||||||
|
|
||||||
|
} // namespace reflex
|
|
@ -34,15 +34,20 @@
|
||||||
@copyright (c) BSD-3 License - see LICENSE.txt
|
@copyright (c) BSD-3 License - see LICENSE.txt
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <reflex/absmatcher.h>
|
#if defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
|
||||||
#include <cstddef>
|
# if !defined(__AVX2__) && !defined(__AVX512BW__)
|
||||||
|
# error simd_avx2.cpp must be compiled with -mavx2 or /arch:avx2.
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <reflex/simd.h>
|
||||||
|
|
||||||
namespace reflex {
|
namespace reflex {
|
||||||
|
|
||||||
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
|
// Partially count newlines in string b up to e, updates b close to e with uncounted part
|
||||||
size_t simd_nlcount_avx2(const char*& b, const char *e)
|
size_t simd_nlcount_avx2(const char*& b, const char *e)
|
||||||
{
|
{
|
||||||
#if defined(HAVE_AVX2)
|
#if defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
|
||||||
const char *s = b;
|
const char *s = b;
|
||||||
e -= 128;
|
e -= 128;
|
||||||
if (s > e)
|
if (s > e)
|
||||||
|
@ -73,42 +78,4 @@ size_t simd_nlcount_avx2(const char*& b, const char *e)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
|
|
||||||
/// Count newlines in b with SSE2, 64 bytes at a time, stopping once fewer than 64 bytes remain before e.
/// @param b start of the text; on return it points to the first byte that was not counted
/// @param e end of the text
/// @return the number of '\n' bytes counted; 0 with b unchanged when fewer than 64 bytes are
///         available or when the library is built without HAVE_SSE2
size_t simd_nlcount_sse2(const char*& b, const char *e)
{
#if defined(HAVE_SSE2)
  // last position at which a full 64-byte block still fits before e
  const char *const last = e - 64;
  const char *p = b;
  if (p > last)
    return 0;
  size_t count = 0;
  // count byte by byte until p is aligned on 16 bytes
  while ((reinterpret_cast<std::ptrdiff_t>(p) & 0x0f) != 0)
  {
    count += (*p == '\n');
    ++p;
  }
  const __m128i newline = _mm_set1_epi8('\n');
  for (; p <= last; p += 64)
  {
    const __m128i *block = reinterpret_cast<const __m128i*>(p);
    // four 16-byte compares per block; each mask has one bit set per matching byte
    for (int i = 0; i < 4; ++i)
      count += popcount(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128(block + i), newline)));
  }
  b = p;
  return count;
#else
  // no SSE2 support compiled in: count nothing and leave b untouched
  (void)b;
  (void)e;
  return 0;
#endif
}
|
|
||||||
|
|
||||||
} // namespace reflex
|
} // namespace reflex
|
||||||
|
|
|
@ -34,12 +34,17 @@
|
||||||
@copyright (c) BSD-3 License - see LICENSE.txt
|
@copyright (c) BSD-3 License - see LICENSE.txt
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <reflex/absmatcher.h>
|
#if defined(HAVE_AVX512BW)
|
||||||
#include <cstddef>
|
# if !defined(__AVX512BW__)
|
||||||
|
# error simd_avx512bw.cpp must be compiled with -mavx512bw or /arch:avx512.
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <reflex/simd.h>
|
||||||
|
|
||||||
namespace reflex {
|
namespace reflex {
|
||||||
|
|
||||||
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
|
// Partially count newlines in string b up to e, updates b close to e with uncounted part
|
||||||
size_t simd_nlcount_avx512bw(const char*& b, const char *e)
|
size_t simd_nlcount_avx512bw(const char*& b, const char *e)
|
||||||
{
|
{
|
||||||
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
|
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
|
||||||
|
|
|
@ -83,9 +83,15 @@ Tables::Tables()
|
||||||
range["Control"] = range["Cc"];
|
range["Control"] = range["Cc"];
|
||||||
range["Format"] = range["Cf"];
|
range["Format"] = range["Cf"];
|
||||||
|
|
||||||
range["d"] = range["Decimal_Digit_Number"];
|
range["Cntrl"] = range["C"];
|
||||||
range["l"] = range["Lowercase_Letter"];
|
range["Digit"] = range["Nd"];
|
||||||
range["u"] = range["Uppercase_Letter"];
|
range["Lower"] = range["Ll"];
|
||||||
|
range["Punct"] = range["P"];
|
||||||
|
range["Upper"] = range["Lu"];
|
||||||
|
|
||||||
|
range["d"] = range["Digit"];
|
||||||
|
range["l"] = range["Lower"];
|
||||||
|
range["u"] = range["Upper"];
|
||||||
range["s"] = range["Space"];
|
range["s"] = range["Space"];
|
||||||
range["w"] = range["Word"];
|
range["w"] = range["Word"];
|
||||||
}
|
}
|
||||||
|
|
|
@ -2107,6 +2107,36 @@ void reflex::Unicode::Tables::language_scripts(void)
|
||||||
0, 0
|
0, 0
|
||||||
};
|
};
|
||||||
range["Grantha"] = Grantha;
|
range["Grantha"] = Grantha;
|
||||||
|
static const int Graph[] = {
|
||||||
|
33, 126,
|
||||||
|
161, 172,
|
||||||
|
174, 1535,
|
||||||
|
1542, 1563,
|
||||||
|
1565, 1756,
|
||||||
|
1758, 1806,
|
||||||
|
1808, 2191,
|
||||||
|
2194, 2273,
|
||||||
|
2275, 5759,
|
||||||
|
5761, 6157,
|
||||||
|
6159, 8191,
|
||||||
|
8208, 8231,
|
||||||
|
8240, 8286,
|
||||||
|
8293, 8293,
|
||||||
|
8304, 12287,
|
||||||
|
12289, 55295,
|
||||||
|
57344, 65278,
|
||||||
|
65280, 65528,
|
||||||
|
65532, 69820,
|
||||||
|
69822, 69836,
|
||||||
|
69838, 78895,
|
||||||
|
78912, 113823,
|
||||||
|
113828, 119154,
|
||||||
|
119163, 917504,
|
||||||
|
917506, 917535,
|
||||||
|
917632, 1114111,
|
||||||
|
0, 0
|
||||||
|
};
|
||||||
|
range["Graph"] = Graph;
|
||||||
static const int Greek[] = {
|
static const int Greek[] = {
|
||||||
880, 883,
|
880, 883,
|
||||||
885, 887,
|
885, 887,
|
||||||
|
@ -7430,6 +7460,34 @@ void reflex::Unicode::Tables::language_scripts(void)
|
||||||
0, 0
|
0, 0
|
||||||
};
|
};
|
||||||
range["Po"] = Po;
|
range["Po"] = Po;
|
||||||
|
static const int Print[] = {
|
||||||
|
32, 126,
|
||||||
|
160, 172,
|
||||||
|
174, 1535,
|
||||||
|
1542, 1563,
|
||||||
|
1565, 1756,
|
||||||
|
1758, 1806,
|
||||||
|
1808, 2191,
|
||||||
|
2194, 2273,
|
||||||
|
2275, 6157,
|
||||||
|
6159, 8202,
|
||||||
|
8208, 8233,
|
||||||
|
8239, 8287,
|
||||||
|
8293, 8293,
|
||||||
|
8304, 55295,
|
||||||
|
57344, 65278,
|
||||||
|
65280, 65528,
|
||||||
|
65532, 69820,
|
||||||
|
69822, 69836,
|
||||||
|
69838, 78895,
|
||||||
|
78912, 113823,
|
||||||
|
113828, 119154,
|
||||||
|
119163, 917504,
|
||||||
|
917506, 917535,
|
||||||
|
917632, 1114111,
|
||||||
|
0, 0
|
||||||
|
};
|
||||||
|
range["Print"] = Print;
|
||||||
static const int Ps[] = {
|
static const int Ps[] = {
|
||||||
40, 40,
|
40, 40,
|
||||||
91, 91,
|
91, 91,
|
||||||
|
|
|
@ -2,6 +2,377 @@
|
||||||
#include <reflex/unicode.h>
|
#include <reflex/unicode.h>
|
||||||
void reflex::Unicode::Tables::letter_scripts(void)
|
void reflex::Unicode::Tables::letter_scripts(void)
|
||||||
{
|
{
|
||||||
|
static const int Alnum[] = {
|
||||||
|
48, 57,
|
||||||
|
65, 90,
|
||||||
|
97, 122,
|
||||||
|
181, 181,
|
||||||
|
192, 214,
|
||||||
|
216, 246,
|
||||||
|
248, 442,
|
||||||
|
444, 447,
|
||||||
|
452, 452,
|
||||||
|
454, 455,
|
||||||
|
457, 458,
|
||||||
|
460, 497,
|
||||||
|
499, 659,
|
||||||
|
661, 687,
|
||||||
|
880, 883,
|
||||||
|
886, 887,
|
||||||
|
891, 893,
|
||||||
|
895, 895,
|
||||||
|
902, 902,
|
||||||
|
904, 906,
|
||||||
|
908, 908,
|
||||||
|
910, 929,
|
||||||
|
931, 1013,
|
||||||
|
1015, 1153,
|
||||||
|
1162, 1327,
|
||||||
|
1329, 1366,
|
||||||
|
1376, 1416,
|
||||||
|
1632, 1641,
|
||||||
|
1776, 1785,
|
||||||
|
1984, 1993,
|
||||||
|
2406, 2415,
|
||||||
|
2534, 2543,
|
||||||
|
2662, 2671,
|
||||||
|
2790, 2799,
|
||||||
|
2918, 2927,
|
||||||
|
3046, 3055,
|
||||||
|
3174, 3183,
|
||||||
|
3302, 3311,
|
||||||
|
3430, 3439,
|
||||||
|
3558, 3567,
|
||||||
|
3664, 3673,
|
||||||
|
3792, 3801,
|
||||||
|
3872, 3881,
|
||||||
|
4160, 4169,
|
||||||
|
4240, 4249,
|
||||||
|
4256, 4293,
|
||||||
|
4295, 4295,
|
||||||
|
4301, 4301,
|
||||||
|
4304, 4346,
|
||||||
|
4349, 4351,
|
||||||
|
5024, 5109,
|
||||||
|
5112, 5117,
|
||||||
|
6112, 6121,
|
||||||
|
6160, 6169,
|
||||||
|
6470, 6479,
|
||||||
|
6608, 6617,
|
||||||
|
6784, 6793,
|
||||||
|
6800, 6809,
|
||||||
|
6992, 7001,
|
||||||
|
7088, 7097,
|
||||||
|
7232, 7241,
|
||||||
|
7248, 7257,
|
||||||
|
7296, 7304,
|
||||||
|
7312, 7354,
|
||||||
|
7357, 7359,
|
||||||
|
7424, 7467,
|
||||||
|
7531, 7543,
|
||||||
|
7545, 7578,
|
||||||
|
7680, 7957,
|
||||||
|
7960, 7965,
|
||||||
|
7968, 8005,
|
||||||
|
8008, 8013,
|
||||||
|
8016, 8023,
|
||||||
|
8025, 8025,
|
||||||
|
8027, 8027,
|
||||||
|
8029, 8029,
|
||||||
|
8031, 8061,
|
||||||
|
8064, 8071,
|
||||||
|
8080, 8087,
|
||||||
|
8096, 8103,
|
||||||
|
8112, 8116,
|
||||||
|
8118, 8123,
|
||||||
|
8126, 8126,
|
||||||
|
8130, 8132,
|
||||||
|
8134, 8139,
|
||||||
|
8144, 8147,
|
||||||
|
8150, 8155,
|
||||||
|
8160, 8172,
|
||||||
|
8178, 8180,
|
||||||
|
8182, 8187,
|
||||||
|
8450, 8450,
|
||||||
|
8455, 8455,
|
||||||
|
8458, 8467,
|
||||||
|
8469, 8469,
|
||||||
|
8473, 8477,
|
||||||
|
8484, 8484,
|
||||||
|
8486, 8486,
|
||||||
|
8488, 8488,
|
||||||
|
8490, 8493,
|
||||||
|
8495, 8500,
|
||||||
|
8505, 8505,
|
||||||
|
8508, 8511,
|
||||||
|
8517, 8521,
|
||||||
|
8526, 8526,
|
||||||
|
8579, 8580,
|
||||||
|
11264, 11387,
|
||||||
|
11390, 11492,
|
||||||
|
11499, 11502,
|
||||||
|
11506, 11507,
|
||||||
|
11520, 11557,
|
||||||
|
11559, 11559,
|
||||||
|
11565, 11565,
|
||||||
|
42528, 42537,
|
||||||
|
42560, 42605,
|
||||||
|
42624, 42651,
|
||||||
|
42786, 42863,
|
||||||
|
42865, 42887,
|
||||||
|
42891, 42894,
|
||||||
|
42896, 42954,
|
||||||
|
42960, 42961,
|
||||||
|
42963, 42963,
|
||||||
|
42965, 42969,
|
||||||
|
42997, 42998,
|
||||||
|
43002, 43002,
|
||||||
|
43216, 43225,
|
||||||
|
43264, 43273,
|
||||||
|
43472, 43481,
|
||||||
|
43504, 43513,
|
||||||
|
43600, 43609,
|
||||||
|
43824, 43866,
|
||||||
|
43872, 43880,
|
||||||
|
43888, 43967,
|
||||||
|
44016, 44025,
|
||||||
|
64256, 64262,
|
||||||
|
64275, 64279,
|
||||||
|
65296, 65305,
|
||||||
|
65313, 65338,
|
||||||
|
65345, 65370,
|
||||||
|
66560, 66639,
|
||||||
|
66720, 66729,
|
||||||
|
66736, 66771,
|
||||||
|
66776, 66811,
|
||||||
|
66928, 66938,
|
||||||
|
66940, 66954,
|
||||||
|
66956, 66962,
|
||||||
|
66964, 66965,
|
||||||
|
66967, 66977,
|
||||||
|
66979, 66993,
|
||||||
|
66995, 67001,
|
||||||
|
67003, 67004,
|
||||||
|
68736, 68786,
|
||||||
|
68800, 68850,
|
||||||
|
68912, 68921,
|
||||||
|
69734, 69743,
|
||||||
|
69872, 69881,
|
||||||
|
69942, 69951,
|
||||||
|
70096, 70105,
|
||||||
|
70384, 70393,
|
||||||
|
70736, 70745,
|
||||||
|
70864, 70873,
|
||||||
|
71248, 71257,
|
||||||
|
71360, 71369,
|
||||||
|
71472, 71481,
|
||||||
|
71840, 71913,
|
||||||
|
72016, 72025,
|
||||||
|
72784, 72793,
|
||||||
|
73040, 73049,
|
||||||
|
73120, 73129,
|
||||||
|
73552, 73561,
|
||||||
|
92768, 92777,
|
||||||
|
92864, 92873,
|
||||||
|
93008, 93017,
|
||||||
|
93760, 93823,
|
||||||
|
119808, 119892,
|
||||||
|
119894, 119964,
|
||||||
|
119966, 119967,
|
||||||
|
119970, 119970,
|
||||||
|
119973, 119974,
|
||||||
|
119977, 119980,
|
||||||
|
119982, 119993,
|
||||||
|
119995, 119995,
|
||||||
|
119997, 120003,
|
||||||
|
120005, 120069,
|
||||||
|
120071, 120074,
|
||||||
|
120077, 120084,
|
||||||
|
120086, 120092,
|
||||||
|
120094, 120121,
|
||||||
|
120123, 120126,
|
||||||
|
120128, 120132,
|
||||||
|
120134, 120134,
|
||||||
|
120138, 120144,
|
||||||
|
120146, 120485,
|
||||||
|
120488, 120512,
|
||||||
|
120514, 120538,
|
||||||
|
120540, 120570,
|
||||||
|
120572, 120596,
|
||||||
|
120598, 120628,
|
||||||
|
120630, 120654,
|
||||||
|
120656, 120686,
|
||||||
|
120688, 120712,
|
||||||
|
120714, 120744,
|
||||||
|
120746, 120770,
|
||||||
|
120772, 120779,
|
||||||
|
120782, 120831,
|
||||||
|
122624, 122633,
|
||||||
|
122635, 122654,
|
||||||
|
122661, 122666,
|
||||||
|
123200, 123209,
|
||||||
|
123632, 123641,
|
||||||
|
124144, 124153,
|
||||||
|
125184, 125251,
|
||||||
|
125264, 125273,
|
||||||
|
130032, 130041,
|
||||||
|
0, 0
|
||||||
|
};
|
||||||
|
range["Alnum"] = Alnum;
|
||||||
|
static const int Alpha[] = {
|
||||||
|
65, 90,
|
||||||
|
97, 122,
|
||||||
|
181, 181,
|
||||||
|
192, 214,
|
||||||
|
216, 246,
|
||||||
|
248, 442,
|
||||||
|
444, 447,
|
||||||
|
452, 452,
|
||||||
|
454, 455,
|
||||||
|
457, 458,
|
||||||
|
460, 497,
|
||||||
|
499, 659,
|
||||||
|
661, 687,
|
||||||
|
880, 883,
|
||||||
|
886, 887,
|
||||||
|
891, 893,
|
||||||
|
895, 895,
|
||||||
|
902, 902,
|
||||||
|
904, 906,
|
||||||
|
908, 908,
|
||||||
|
910, 929,
|
||||||
|
931, 1013,
|
||||||
|
1015, 1153,
|
||||||
|
1162, 1327,
|
||||||
|
1329, 1366,
|
||||||
|
1376, 1416,
|
||||||
|
4256, 4293,
|
||||||
|
4295, 4295,
|
||||||
|
4301, 4301,
|
||||||
|
4304, 4346,
|
||||||
|
4349, 4351,
|
||||||
|
5024, 5109,
|
||||||
|
5112, 5117,
|
||||||
|
7296, 7304,
|
||||||
|
7312, 7354,
|
||||||
|
7357, 7359,
|
||||||
|
7424, 7467,
|
||||||
|
7531, 7543,
|
||||||
|
7545, 7578,
|
||||||
|
7680, 7957,
|
||||||
|
7960, 7965,
|
||||||
|
7968, 8005,
|
||||||
|
8008, 8013,
|
||||||
|
8016, 8023,
|
||||||
|
8025, 8025,
|
||||||
|
8027, 8027,
|
||||||
|
8029, 8029,
|
||||||
|
8031, 8061,
|
||||||
|
8064, 8071,
|
||||||
|
8080, 8087,
|
||||||
|
8096, 8103,
|
||||||
|
8112, 8116,
|
||||||
|
8118, 8123,
|
||||||
|
8126, 8126,
|
||||||
|
8130, 8132,
|
||||||
|
8134, 8139,
|
||||||
|
8144, 8147,
|
||||||
|
8150, 8155,
|
||||||
|
8160, 8172,
|
||||||
|
8178, 8180,
|
||||||
|
8182, 8187,
|
||||||
|
8450, 8450,
|
||||||
|
8455, 8455,
|
||||||
|
8458, 8467,
|
||||||
|
8469, 8469,
|
||||||
|
8473, 8477,
|
||||||
|
8484, 8484,
|
||||||
|
8486, 8486,
|
||||||
|
8488, 8488,
|
||||||
|
8490, 8493,
|
||||||
|
8495, 8500,
|
||||||
|
8505, 8505,
|
||||||
|
8508, 8511,
|
||||||
|
8517, 8521,
|
||||||
|
8526, 8526,
|
||||||
|
8579, 8580,
|
||||||
|
11264, 11387,
|
||||||
|
11390, 11492,
|
||||||
|
11499, 11502,
|
||||||
|
11506, 11507,
|
||||||
|
11520, 11557,
|
||||||
|
11559, 11559,
|
||||||
|
11565, 11565,
|
||||||
|
42560, 42605,
|
||||||
|
42624, 42651,
|
||||||
|
42786, 42863,
|
||||||
|
42865, 42887,
|
||||||
|
42891, 42894,
|
||||||
|
42896, 42954,
|
||||||
|
42960, 42961,
|
||||||
|
42963, 42963,
|
||||||
|
42965, 42969,
|
||||||
|
42997, 42998,
|
||||||
|
43002, 43002,
|
||||||
|
43824, 43866,
|
||||||
|
43872, 43880,
|
||||||
|
43888, 43967,
|
||||||
|
64256, 64262,
|
||||||
|
64275, 64279,
|
||||||
|
65313, 65338,
|
||||||
|
65345, 65370,
|
||||||
|
66560, 66639,
|
||||||
|
66736, 66771,
|
||||||
|
66776, 66811,
|
||||||
|
66928, 66938,
|
||||||
|
66940, 66954,
|
||||||
|
66956, 66962,
|
||||||
|
66964, 66965,
|
||||||
|
66967, 66977,
|
||||||
|
66979, 66993,
|
||||||
|
66995, 67001,
|
||||||
|
67003, 67004,
|
||||||
|
68736, 68786,
|
||||||
|
68800, 68850,
|
||||||
|
71840, 71903,
|
||||||
|
93760, 93823,
|
||||||
|
119808, 119892,
|
||||||
|
119894, 119964,
|
||||||
|
119966, 119967,
|
||||||
|
119970, 119970,
|
||||||
|
119973, 119974,
|
||||||
|
119977, 119980,
|
||||||
|
119982, 119993,
|
||||||
|
119995, 119995,
|
||||||
|
119997, 120003,
|
||||||
|
120005, 120069,
|
||||||
|
120071, 120074,
|
||||||
|
120077, 120084,
|
||||||
|
120086, 120092,
|
||||||
|
120094, 120121,
|
||||||
|
120123, 120126,
|
||||||
|
120128, 120132,
|
||||||
|
120134, 120134,
|
||||||
|
120138, 120144,
|
||||||
|
120146, 120485,
|
||||||
|
120488, 120512,
|
||||||
|
120514, 120538,
|
||||||
|
120540, 120570,
|
||||||
|
120572, 120596,
|
||||||
|
120598, 120628,
|
||||||
|
120630, 120654,
|
||||||
|
120656, 120686,
|
||||||
|
120688, 120712,
|
||||||
|
120714, 120744,
|
||||||
|
120746, 120770,
|
||||||
|
120772, 120779,
|
||||||
|
122624, 122633,
|
||||||
|
122635, 122654,
|
||||||
|
122661, 122666,
|
||||||
|
125184, 125251,
|
||||||
|
0, 0
|
||||||
|
};
|
||||||
|
range["Alpha"] = Alpha;
|
||||||
static const int Ll[] = {
|
static const int Ll[] = {
|
||||||
97, 122,
|
97, 122,
|
||||||
181, 181,
|
181, 181,
|
||||||
|
|
|
@ -31,6 +31,7 @@
|
||||||
#include "../lib/error.cpp"
|
#include "../lib/error.cpp"
|
||||||
#include "../lib/input.cpp"
|
#include "../lib/input.cpp"
|
||||||
#include "../lib/matcher.cpp"
|
#include "../lib/matcher.cpp"
|
||||||
|
#include "../lib/simd.cpp"
|
||||||
|
|
||||||
#undef min
|
#undef min
|
||||||
#undef max
|
#undef max
|
||||||
|
|
|
@ -2,6 +2,11 @@
|
||||||
|
|
||||||
$workDir = Resolve-Path -Path "${PSScriptRoot}\..\ccl\rslang\src"
|
$workDir = Resolve-Path -Path "${PSScriptRoot}\..\ccl\rslang\src"
|
||||||
|
|
||||||
|
# Change default relative path according to your work directory setup
|
||||||
|
# Re-flex repository: https://github.com/Genivia/RE-flex
|
||||||
|
$reflexRelative = Resolve-Path -Path "${PSScriptRoot}\..\..\GH-RE-flex\bin\win64"
|
||||||
|
$Env:PATH += ";${reflexRelative}"
|
||||||
|
|
||||||
function BuildLexers {
|
function BuildLexers {
|
||||||
Set-Location -Path ${workDir}
|
Set-Location -Path ${workDir}
|
||||||
BuildSyntax('AsciiLexer')
|
BuildSyntax('AsciiLexer')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user