Update re-flex

This commit is contained in:
Ivan 2024-07-14 12:30:40 +03:00
parent 735d315862
commit 0c6d998cc5
22 changed files with 8736 additions and 4394 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -30,7 +30,7 @@
@file absmatcher.h @file absmatcher.h
@brief RE/flex abstract matcher base class and pattern matcher class @brief RE/flex abstract matcher base class and pattern matcher class
@author Robert van Engelen - engelen@genivia.com @author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved. @copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt @copyright (c) BSD-3 License - see LICENSE.txt
*/ */
@ -42,12 +42,12 @@
#define WITH_REALLOC 1 #define WITH_REALLOC 1
#endif #endif
/// This compile-time option speeds up matching, but slows input(). /// This compile-time option speeds up matching, but slows input() somewhat.
#ifndef WITH_FAST_GET #ifndef WITH_FAST_GET
#define WITH_FAST_GET 1 #define WITH_FAST_GET 1
#endif #endif
/// This compile-time option adds span(), line(), wline(), bol(), eol() /// This compile-time option adds span(), line(), wline(), bol(), eol().
#ifndef WITH_SPAN #ifndef WITH_SPAN
#define WITH_SPAN 1 #define WITH_SPAN 1
#endif #endif
@ -111,7 +111,7 @@ class AbstractMatcher {
static const int EOB = EOF; ///< end of buffer meta-char marker static const int EOB = EOF; ///< end of buffer meta-char marker
static const size_t BLOCK = 4096; ///< minimum remaining unused space in the buffer, to prevent excessive shifting static const size_t BLOCK = 4096; ///< minimum remaining unused space in the buffer, to prevent excessive shifting
#ifndef REFLEX_BUFSZ #ifndef REFLEX_BUFSZ
static const size_t BUFSZ = (128*1024); ///< initial buffer size, at least 4096 bytes static const size_t BUFSZ = (256*1024); ///< initial buffer size, at least 4096 bytes
#else #else
static const size_t BUFSZ = REFLEX_BUFSZ; static const size_t BUFSZ = REFLEX_BUFSZ;
#endif #endif
@ -154,11 +154,13 @@ class AbstractMatcher {
A(false), A(false),
N(false), N(false),
W(false), W(false),
X(false),
T(8) T(8)
{ } { }
bool A; ///< accept any/all (?^X) negative patterns as Const::REDO accept index codes bool A; ///< accept any/all (?^X) negative patterns as Const::REDO accept index codes
bool N; ///< nullable, find may return empty match (N/A to scan, split, matches) bool N; ///< nullable, find may return empty match (N/A to scan, split, matches)
bool W; ///< half-check for "whole words", check only left of \< and right of \> for non-word character bool W; ///< reflex::Matcher matches whole words as if bound by \< and \>
bool X; ///< reflex::LineMatcher matches empty lines
char T; ///< tab size, must be a power of 2, default is 8, for column count and indent \i, \j, and \k char T; ///< tab size, must be a power of 2, default is 8, for column count and indent \i, \j, and \k
}; };
/// AbstractMatcher::Iterator class for scanning, searching, and splitting input character sequences. /// AbstractMatcher::Iterator class for scanning, searching, and splitting input character sequences.
@ -356,7 +358,8 @@ class AbstractMatcher {
{ {
opt_.A = false; // when true: accept any/all (?^X) negative patterns as Const::REDO accept index codes opt_.A = false; // when true: accept any/all (?^X) negative patterns as Const::REDO accept index codes
opt_.N = false; // when true: find may return empty match (N/A to scan, split, matches) opt_.N = false; // when true: find may return empty match (N/A to scan, split, matches)
opt_.W = false; // when true: half-check for "whole words", check only left of \< and right of \> for non-word character opt_.W = false; // when true: reflex::Matcher matches whole words as if bound by \< and \>
opt_.X = false; // when true: reflex::LineMatcher matches empty lines
opt_.T = 8; // tab size 1, 2, 4, or 8 opt_.T = 8; // tab size 1, 2, 4, or 8
if (opt) if (opt)
{ {
@ -373,6 +376,9 @@ class AbstractMatcher {
case 'W': case 'W':
opt_.W = true; opt_.W = true;
break; break;
case 'X':
opt_.X = true;
break;
case 'T': case 'T':
opt_.T = isdigit(*(s += (s[1] == '=') + 1)) ? static_cast<char>(*s - '0') : 0; opt_.T = isdigit(*(s += (s[1] == '=') + 1)) ? static_cast<char>(*s - '0') : 0;
break; break;
@ -422,6 +428,7 @@ class AbstractMatcher {
own_ = true; own_ = true;
eof_ = false; eof_ = false;
mat_ = false; mat_ = false;
cml_ = false;
} }
/// Set buffer block size for reading: use 0 (or omit argument) to buffer all input in which case returns true if all the data could be read and false if a read error occurred. /// Set buffer block size for reading: use 0 (or omit argument) to buffer all input in which case returns true if all the data could be read and false if a read error occurred.
bool buffer(size_t blk = 0) ///< new block size between 1 and Const::BLOCK, or 0 to buffer all input (default) bool buffer(size_t blk = 0) ///< new block size between 1 and Const::BLOCK, or 0 to buffer all input (default)
@ -486,7 +493,7 @@ class AbstractMatcher {
(void)buffer(1); (void)buffer(1);
} }
/// Flush the buffer's remaining content. /// Flush the buffer's remaining content.
void flush() inline void flush()
{ {
DBGLOG("AbstractMatcher::flush()"); DBGLOG("AbstractMatcher::flush()");
pos_ = end_; pos_ = end_;
@ -560,6 +567,7 @@ class AbstractMatcher {
own_ = false; own_ = false;
eof_ = true; eof_ = true;
mat_ = false; mat_ = false;
cml_ = false;
} }
return *this; return *this;
} }
@ -645,6 +653,13 @@ class AbstractMatcher {
{ {
return utf8(txt_); return utf8(txt_);
} }
#if WITH_SPAN
/// Set or reset mode to count matching lines only and skip other (e.g. for speed).
inline void lineno_skip(bool f = false)
{
cml_ = f;
}
#endif
/// Set or change the starting line number of the last match. /// Set or change the starting line number of the last match.
inline void lineno(size_t n) ///< new line number inline void lineno(size_t n) ///< new line number
{ {
@ -652,7 +667,7 @@ class AbstractMatcher {
lno_ = n; lno_ = n;
} }
/// Updates and returns the starting line number of the match in the input character sequence. /// Updates and returns the starting line number of the match in the input character sequence.
inline size_t lineno() size_t lineno()
/// @returns line number /// @returns line number
{ {
#if WITH_SPAN #if WITH_SPAN
@ -661,55 +676,16 @@ class AbstractMatcher {
const char *s = lpb_; const char *s = lpb_;
const char *t = txt_; const char *t = txt_;
size_t n = 0; size_t n = 0;
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64)) if (cml_)
if (have_HW_AVX512BW()) {
n = simd_nlcount_avx512bw(s, t); // count number of matching lines only, not line numbers
else if (have_HW_AVX2()) n = std::memchr(s, '\n', t - s) != NULL;
n = simd_nlcount_avx2(s, t); }
else else
n = simd_nlcount_sse2(s, t);
#elif defined(HAVE_AVX2)
if (have_HW_AVX2())
n = simd_nlcount_avx2(s, t);
else
n = simd_nlcount_sse2(s, t);
#elif defined(HAVE_SSE2)
n = simd_nlcount_sse2(s, t);
#endif
#if defined(HAVE_NEON)
// no ARM AArch64/NEON SIMD optimized loop? - no code that runs faster than the code below?!
uint32_t n0 = 0, n1 = 0;
while (s < t - 1)
{ {
n0 += s[0] == '\n'; // count line numbers
n1 += s[1] == '\n'; n = nlcount(s, t);
s += 2;
} }
n += n0 + n1 + (s < t && *s == '\n');
#else
// clang/gcc 4-way auto-vectorizable loop
uint32_t n0 = 0, n1 = 0, n2 = 0, n3 = 0;
while (s < t - 3)
{
n0 += s[0] == '\n';
n1 += s[1] == '\n';
n2 += s[2] == '\n';
n3 += s[3] == '\n';
s += 4;
}
n += n0 + n1 + n2 + n3;
// epilogue
if (s < t)
{
n += *s == '\n';
if (++s < t)
{
n += *s == '\n';
if (++s < t)
n += *s == '\n';
}
}
#endif
// if newlines are detected, then find begin of the last line to adjust bol // if newlines are detected, then find begin of the last line to adjust bol
if (n > 0) if (n > 0)
{ {
@ -956,13 +932,13 @@ class AbstractMatcher {
else if (got_ == '\n') else if (got_ == '\n')
got_ = Const::UNK; got_ = Const::UNK;
} }
/// Returns true if this matcher matched text that begins a word. /// Returns true if this matcher matched text that begins an ASCII word.
inline bool at_bow() inline bool at_bow()
/// @returns true if this matcher matched text that begins a word /// @returns true if this matcher matched text that begins a word
{ {
return !isword(got_) && isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more()); return !isword(got_) && isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more());
} }
/// Returns true if this matcher matched text that ends a word. /// Returns true if this matcher matched text that ends an ASCII word.
inline bool at_eow() inline bool at_eow()
/// @returns true if this matcher matched text that ends a word /// @returns true if this matcher matched text that ends a word
{ {
@ -1116,21 +1092,37 @@ class AbstractMatcher {
} }
return buf_ + end_; return buf_ + end_;
} }
/// Return number of bytes available given number of bytes to fetch ahead, limited by input size and buffer size
inline size_t fetch(size_t len)
/// @returns number of bytes available after fetching.
{
DBGLOG("AbstractMatcher::fetch(%zu)", len);
if (eof_)
return 0;
if (len <= end_ - (txt_ - buf_))
return end_ - (txt_ - buf_);
if (end_ + len + 1 >= max_)
(void)grow();
if (end_ + len + 1 >= max_)
len = max_ - end_ - 1;
end_ += get(buf_ + end_, len);
return avail();
}
/// Returns the number of bytes in the buffer available to search from the current begin()/text() position. /// Returns the number of bytes in the buffer available to search from the current begin()/text() position.
size_t avail() inline size_t avail()
{ {
if (peek() == EOF) if (peek() == EOF)
return 0; return 0;
return end_ - (txt_ - buf_); return end_ - (txt_ - buf_);
} }
/// Returns the byte offset of the match from the start of the line. /// Returns the byte offset of the match from the start of the line.
size_t border() inline size_t border()
/// @returns border offset /// @returns border offset
{ {
return txt_ - bol(); return txt_ - bol();
} }
/// Enlarge the match to span the entire line of input (excluding \n), return text(). /// Enlarge the match to span the entire line of input (excluding \n), return text().
const char *span() inline const char *span()
/// @returns const char* span of text for the entire line /// @returns const char* span of text for the entire line
{ {
DBGLOG("AbstractMatcher::span()"); DBGLOG("AbstractMatcher::span()");
@ -1146,7 +1138,7 @@ class AbstractMatcher {
return text(); return text();
} }
/// Returns the line of input (excluding \n) as a string containing the matched text as a substring. /// Returns the line of input (excluding \n) as a string containing the matched text as a substring.
std::string line() inline std::string line()
/// @returns matching line as a string /// @returns matching line as a string
{ {
DBGLOG("AbstractMatcher::line()"); DBGLOG("AbstractMatcher::line()");
@ -1156,7 +1148,7 @@ class AbstractMatcher {
return std::string(b, e - b); return std::string(b, e - b);
} }
/// Returns the line of input (excluding \n) as a wide string containing the matched text as a substring. /// Returns the line of input (excluding \n) as a wide string containing the matched text as a substring.
std::wstring wline() inline std::wstring wline()
/// @returns matching line as a wide string /// @returns matching line as a wide string
{ {
DBGLOG("AbstractMatcher::wline()"); DBGLOG("AbstractMatcher::wline()");
@ -1252,12 +1244,12 @@ class AbstractMatcher {
return text(); return text();
} }
/// Append the next match to the currently matched text returned by AbstractMatcher::text, when the next match found is adjacent to the current match. /// Append the next match to the currently matched text returned by AbstractMatcher::text, when the next match found is adjacent to the current match.
void more() inline void more()
{ {
cur_ = txt_ - buf_; cur_ = txt_ - buf_;
} }
/// Truncate the AbstractMatcher::text length of the match to n characters in length and reposition for next match. /// Truncate the AbstractMatcher::text length of the match to n characters in length and reposition for next match.
void less(size_t n) ///< truncated string length inline void less(size_t n) ///< truncated string length
{ {
if (n < len_) if (n < len_)
{ {
@ -1270,80 +1262,80 @@ class AbstractMatcher {
} }
} }
/// Cast this matcher to positive integer indicating the nonzero capture index of the matched text in the pattern, same as AbstractMatcher::accept. /// Cast this matcher to positive integer indicating the nonzero capture index of the matched text in the pattern, same as AbstractMatcher::accept.
operator size_t() const inline operator size_t() const
/// @returns nonzero capture index of a match, which may be matcher dependent, or zero for a mismatch /// @returns nonzero capture index of a match, which may be matcher dependent, or zero for a mismatch
{ {
return accept(); return accept();
} }
/// Cast this matcher to a std::string of the text matched by this matcher. /// Cast this matcher to a std::string of the text matched by this matcher.
operator std::string() const inline operator std::string() const
/// @returns std::string with matched text /// @returns std::string with matched text
{ {
return str(); return str();
} }
/// Cast this matcher to a std::wstring of the text matched by this matcher. /// Cast this matcher to a std::wstring of the text matched by this matcher.
operator std::wstring() const inline operator std::wstring() const
/// @returns std::wstring converted to UCS from the 0-terminated matched UTF-8 text /// @returns std::wstring converted to UCS from the 0-terminated matched UTF-8 text
{ {
return wstr(); return wstr();
} }
/// Cast the match to the std::pair<size_t,std::string> returned by pair(), useful for tokenization into containers. /// Cast the match to the std::pair<size_t,std::string> returned by pair(), useful for tokenization into containers.
operator std::pair<size_t,std::string>() const inline operator std::pair<size_t,std::string>() const
/// @returns the std::pair<size_t,std::string> returned by pair() /// @returns the std::pair<size_t,std::string> returned by pair()
{ {
return pair(); return pair();
} }
/// Returns true if matched text is equal to a string, useful for std::algorithm. /// Returns true if matched text is equal to a string, useful for std::algorithm.
bool operator==(const char *rhs) ///< rhs string to compare to inline bool operator==(const char *rhs) ///< rhs string to compare to
/// @returns true if matched text is equal to rhs string /// @returns true if matched text is equal to rhs string
const const
{ {
return std::strncmp(rhs, txt_, len_) == 0 && rhs[len_] == '\0'; return std::strncmp(rhs, txt_, len_) == 0 && rhs[len_] == '\0';
} }
/// Returns true if matched text is equal to a string, useful for std::algorithm. /// Returns true if matched text is equal to a string, useful for std::algorithm.
bool operator==(const std::string& rhs) ///< rhs string to compare to inline bool operator==(const std::string& rhs) ///< rhs string to compare to
/// @returns true if matched text is equal to rhs string /// @returns true if matched text is equal to rhs string
const const
{ {
return rhs.size() == len_ && rhs.compare(0, std::string::npos, txt_, len_) == 0; return rhs.size() == len_ && rhs.compare(0, std::string::npos, txt_, len_) == 0;
} }
/// Returns true if capture index is equal to a given size_t value, useful for std::algorithm. /// Returns true if capture index is equal to a given size_t value, useful for std::algorithm.
bool operator==(size_t rhs) ///< capture index to compare accept() to inline bool operator==(size_t rhs) ///< capture index to compare accept() to
/// @returns true if capture index is equal to rhs /// @returns true if capture index is equal to rhs
const const
{ {
return accept() == rhs; return accept() == rhs;
} }
/// Returns true if capture index is equal to a given int value, useful for std::algorithm. /// Returns true if capture index is equal to a given int value, useful for std::algorithm.
bool operator==(int rhs) ///< capture index to compare accept() to inline bool operator==(int rhs) ///< capture index to compare accept() to
/// @returns true if capture index is equal to rhs /// @returns true if capture index is equal to rhs
const const
{ {
return static_cast<int>(accept()) == rhs; return static_cast<int>(accept()) == rhs;
} }
/// Returns true if matched text is not equal to a string, useful for std::algorithm. /// Returns true if matched text is not equal to a string, useful for std::algorithm.
bool operator!=(const char *rhs) ///< rhs string to compare to inline bool operator!=(const char *rhs) ///< rhs string to compare to
/// @returns true if matched text is not equal to rhs string /// @returns true if matched text is not equal to rhs string
const const
{ {
return std::strncmp(rhs, txt_, len_) != 0 || rhs[len_] != '\0'; // if static checkers complain here, they are wrong return std::strncmp(rhs, txt_, len_) != 0 || rhs[len_] != '\0'; // if static checkers complain here, they are wrong
} }
/// Returns true if matched text is not equal to a string, useful for std::algorithm. /// Returns true if matched text is not equal to a string, useful for std::algorithm.
bool operator!=(const std::string& rhs) ///< rhs string to compare to inline bool operator!=(const std::string& rhs) ///< rhs string to compare to
/// @returns true if matched text is not equal to rhs string /// @returns true if matched text is not equal to rhs string
const const
{ {
return rhs.size() > len_ || rhs.compare(0, std::string::npos, txt_, len_) != 0; return rhs.size() > len_ || rhs.compare(0, std::string::npos, txt_, len_) != 0;
} }
/// Returns true if capture index is not equal to a given size_t value, useful for std::algorithm. /// Returns true if capture index is not equal to a given size_t value, useful for std::algorithm.
bool operator!=(size_t rhs) ///< capture index to compare accept() to inline bool operator!=(size_t rhs) ///< capture index to compare accept() to
/// @returns true if capture index is not equal to rhs /// @returns true if capture index is not equal to rhs
const const
{ {
return accept() != rhs; return accept() != rhs;
} }
/// Returns true if capture index is not equal to a given int value, useful for std::algorithm. /// Returns true if capture index is not equal to a given int value, useful for std::algorithm.
bool operator!=(int rhs) ///< capture index to compare accept() to inline bool operator!=(int rhs) ///< capture index to compare accept() to
/// @returns true if capture index is not equal to rhs /// @returns true if capture index is not equal to rhs
const const
{ {
@ -1632,6 +1624,7 @@ class AbstractMatcher {
bool own_; ///< true if AbstractMatcher::buf_ was allocated and should be deleted bool own_; ///< true if AbstractMatcher::buf_ was allocated and should be deleted
bool eof_; ///< input has reached EOF bool eof_; ///< input has reached EOF
bool mat_; ///< true if AbstractMatcher::matches() was successful bool mat_; ///< true if AbstractMatcher::matches() was successful
bool cml_; ///< true when counting matching lines instead of line numbers
}; };
/// The pattern matcher class template extends abstract matcher base class. /// The pattern matcher class template extends abstract matcher base class.
@ -1656,7 +1649,7 @@ class PatternMatcher : public AbstractMatcher {
delete pat_; delete pat_;
} }
/// Assign a matcher, the underlying pattern object is shared (not deep copied). /// Assign a matcher, the underlying pattern object is shared (not deep copied).
PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared) virtual PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
{ {
scan.init(this, Const::SCAN); scan.init(this, Const::SCAN);
find.init(this, Const::FIND); find.init(this, Const::FIND);
@ -1664,9 +1657,7 @@ class PatternMatcher : public AbstractMatcher {
in = matcher.in; in = matcher.in;
reset(); reset();
opt_ = matcher.opt_; opt_ = matcher.opt_;
pat_ = matcher.pat_, return pattern(matcher.pat_);
own_ = false;
return *this;
} }
/// Set the pattern to use with this matcher as a shared pointer to another matcher pattern. /// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
@ -1726,19 +1717,19 @@ class PatternMatcher : public AbstractMatcher {
return *this; return *this;
} }
/// Returns true if this matcher has a pattern. /// Returns true if this matcher has a pattern.
bool has_pattern() const inline bool has_pattern() const
/// @returns true if this matcher has a pattern /// @returns true if this matcher has a pattern
{ {
return pat_ != NULL; return pat_ != NULL;
} }
/// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete). /// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
bool own_pattern() const inline bool own_pattern() const
/// @returns true if this matcher has its own pattern /// @returns true if this matcher has its own pattern
{ {
return own_ && pat_ != NULL; return own_ && pat_ != NULL;
} }
/// Returns a reference to the pattern object associated with this matcher. /// Returns a reference to the pattern object associated with this matcher.
const Pattern& pattern() const virtual const Pattern& pattern() const
/// @returns reference to pattern object /// @returns reference to pattern object
{ {
ASSERT(pat_ != NULL); ASSERT(pat_ != NULL);
@ -1809,7 +1800,7 @@ class PatternMatcher<std::string> : public AbstractMatcher {
delete pat_; delete pat_;
} }
/// Assign a matcher, the underlying pattern string is shared (not deep copied). /// Assign a matcher, the underlying pattern string is shared (not deep copied).
PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared) virtual PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
{ {
scan.init(this, Const::SCAN); scan.init(this, Const::SCAN);
find.init(this, Const::FIND); find.init(this, Const::FIND);
@ -1817,9 +1808,7 @@ class PatternMatcher<std::string> : public AbstractMatcher {
in = matcher.in; in = matcher.in;
reset(); reset();
opt_ = matcher.opt_; opt_ = matcher.opt_;
pat_ = matcher.pat_, return pattern(matcher.pat_);
own_ = false;
return *this;
} }
/// Set the pattern to use with this matcher as a shared pointer to another matcher pattern. /// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
@ -1865,19 +1854,19 @@ class PatternMatcher<std::string> : public AbstractMatcher {
return *this; return *this;
} }
/// Returns true if this matcher has a pattern. /// Returns true if this matcher has a pattern.
bool has_pattern() const inline bool has_pattern() const
/// @returns true if this matcher has a pattern /// @returns true if this matcher has a pattern
{ {
return pat_ != NULL; return pat_ != NULL;
} }
/// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete). /// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
bool own_pattern() const inline bool own_pattern() const
/// @returns true if this matcher has its own pattern /// @returns true if this matcher has its own pattern
{ {
return own_ && pat_ != NULL; return own_ && pat_ != NULL;
} }
/// Returns a reference to the pattern string associated with this matcher. /// Returns a reference to the pattern string associated with this matcher.
const Pattern& pattern() const virtual const Pattern& pattern() const
/// @returns reference to pattern string /// @returns reference to pattern string
{ {
ASSERT(pat_ != NULL); ASSERT(pat_ != NULL);

File diff suppressed because it is too large Load Diff

View File

@ -138,8 +138,8 @@ find:
// option N also finds empty lines // option N also finds empty lines
if (n == 0 && !opt_.N) if (n == 0 && !opt_.N)
goto find; goto find;
// option W only finds empty lines // option X only finds empty lines
if (n > 0 && opt_.W) if (n > 0 && opt_.X)
goto find; goto find;
break; break;
case Const::SPLIT: case Const::SPLIT:

File diff suppressed because it is too large Load Diff

View File

@ -365,8 +365,8 @@ class Pattern {
f |= pmh[h] & 4; f |= pmh[h] & 4;
h = hash(h, static_cast<uint8_t>(*++s)); h = hash(h, static_cast<uint8_t>(*++s));
f |= pmh[h] & 8; f |= pmh[h] & 8;
Pred m = 16;
const char *e = s + n - 3; const char *e = s + n - 3;
Pred m = 16;
while (f == 0 && ++s < e) while (f == 0 && ++s < e)
{ {
h = hash(h, static_cast<uint8_t>(*s)); h = hash(h, static_cast<uint8_t>(*s));
@ -941,8 +941,7 @@ class Pattern {
void check_dfa_closure( void check_dfa_closure(
const DFA::State *state, const DFA::State *state,
int nest, int nest,
bool& peek, bool& peek) const;
bool& prev) const;
void gencode_dfa_closure( void gencode_dfa_closure(
FILE *fd, FILE *fd,
const DFA::State *start, const DFA::State *start,
@ -1172,7 +1171,7 @@ class Pattern {
Index cut_; ///< DFA s-t cut to improve predict match and HFA accuracy together with lbk_ and cbk_ Index cut_; ///< DFA s-t cut to improve predict match and HFA accuracy together with lbk_ and cbk_
size_t len_; ///< length of chr_[], less or equal to 255 size_t len_; ///< length of chr_[], less or equal to 255
size_t min_; ///< patterns after the prefix are at least this long but no more than 8 size_t min_; ///< patterns after the prefix are at least this long but no more than 8
size_t pin_; ///< number of needles size_t pin_; ///< number of needles, 0 to 16
std::bitset<256> cbk_; ///< characters to look back over when lbk_ > 0, never includes \n std::bitset<256> cbk_; ///< characters to look back over when lbk_ > 0, never includes \n
std::bitset<256> fst_; ///< the beginning characters of the pattern std::bitset<256> fst_; ///< the beginning characters of the pattern
char chr_[256]; ///< pattern prefix string or character needles for needle-based search char chr_[256]; ///< pattern prefix string or character needles for needle-based search
@ -1183,7 +1182,7 @@ class Pattern {
uint16_t lbm_; ///< loopback minimum distance when lbk_ > 0 uint16_t lbm_; ///< loopback minimum distance when lbk_ > 0
uint16_t lcp_; ///< primary least common character position in the pattern or 0xffff uint16_t lcp_; ///< primary least common character position in the pattern or 0xffff
uint16_t lcs_; ///< secondary least common character position in the pattern or 0xffff uint16_t lcs_; ///< secondary least common character position in the pattern or 0xffff
size_t bmd_; ///< Boyer-Moore jump distance on mismatch, B-M is enabled when bmd_ > 0 size_t bmd_; ///< Boyer-Moore jump distance on mismatch, B-M is enabled when bmd_ > 0 (<= 255)
uint8_t bms_[256]; ///< Boyer-Moore skip array uint8_t bms_[256]; ///< Boyer-Moore skip array
float pms_; ///< ms elapsed time to parse regex float pms_; ///< ms elapsed time to parse regex
float vms_; ///< ms elapsed time to compile DFA vertices float vms_; ///< ms elapsed time to compile DFA vertices
@ -1192,6 +1191,7 @@ class Pattern {
float ams_; ///< ms elapsed time to analyze DFA for predict match and HFA float ams_; ///< ms elapsed time to analyze DFA for predict match and HFA
size_t npy_; ///< entropy derived from the bitap array bit_[] size_t npy_; ///< entropy derived from the bitap array bit_[]
bool one_; ///< true if matching one string stored in chr_[] without meta/anchors bool one_; ///< true if matching one string stored in chr_[] without meta/anchors
bool bol_; ///< true if matching all patterns at the begin of a line with anchor ^
}; };
} // namespace reflex } // namespace reflex

View File

@ -28,15 +28,19 @@
/** /**
@file simd.h @file simd.h
@brief RE/flex SIMD intrinsics @brief RE/flex SIMD primitives
@author Robert van Engelen - engelen@genivia.com @author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved. @copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt @copyright (c) BSD-3 License - see LICENSE.txt
*/ */
#ifndef SIMD_H #ifndef SIMD_H
#define SIMD_H #define SIMD_H
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#if defined(HAVE_AVX512BW) #if defined(HAVE_AVX512BW)
# include <immintrin.h> # include <immintrin.h>
#elif defined(HAVE_AVX2) #elif defined(HAVE_AVX2)
@ -133,15 +137,19 @@ inline uint32_t popcountl(uint64_t x)
} }
#endif #endif
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part // Partially count newlines in string b up to e, updates b close to e with uncounted part
extern size_t simd_nlcount_sse2(const char*& b, const char *e); extern size_t simd_nlcount_avx2(const char *&b, const char *e);
extern size_t simd_nlcount_avx2(const char*& b, const char *e); extern size_t simd_nlcount_avx512bw(const char *&b, const char *e);
extern size_t simd_nlcount_avx512bw(const char*& b, const char *e);
} // namespace reflex } // namespace reflex
#endif #endif
namespace reflex {
// Count newlines in string s up to t
extern size_t nlcount(const char *s, const char *t);
} // namespace reflex
#endif #endif

View File

@ -778,7 +778,11 @@ static void insert_posix_class(const char *pattern, size_t len, size_t& pos, con
else if (name[0] == 'A' && name[1] == 's') else if (name[0] == 'A' && name[1] == 's')
name = const_cast<char*>("ASCII"); name = const_cast<char*>("ASCII");
} }
const int *wc = Posix::range(name); const int *wc = NULL;
if ((flags & convert_flag::unicode))
wc = Unicode::range(name);
if (wc == NULL)
wc = Posix::range(name);
if (wc == NULL) if (wc == NULL)
throw regex_error(regex_error::invalid_class, pattern, pos); throw regex_error(regex_error::invalid_class, pattern, pos);
if (*buf == '^') if (*buf == '^')

View File

@ -28,7 +28,7 @@
/** /**
@file input.cpp @file input.cpp
@brief RE/flex input character sequence class and simd.h CPUID check @brief RE/flex input character sequence class
@author Robert van Engelen - engelen@genivia.com @author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved. @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt @copyright (c) BSD-3 License - see LICENSE.txt
@ -1361,27 +1361,4 @@ void Input::file_encoding(unsigned short enc, const unsigned short *page)
} }
} }
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
#include <reflex/simd.h>
// simd.h get_HW()
static uint64_t get_HW()
{
int CPUInfo1[4] = { 0, 0, 0, 0 };
int CPUInfo7[4] = { 0, 0, 0, 0 };
cpuidex(CPUInfo1, 0, 0);
int n = CPUInfo1[0];
if (n <= 0)
return 0ULL;
cpuidex(CPUInfo1, 1, 0); // cpuid EAX=1
if (n >= 7)
cpuidex(CPUInfo7, 7, 0); // cpuid EAX=7, ECX=0
return static_cast<uint32_t>(CPUInfo1[2]) | (static_cast<uint64_t>(static_cast<uint32_t>(CPUInfo7[1])) << 32);
}
uint64_t HW = get_HW();
#endif
} // namespace reflex } // namespace reflex

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -27,10 +27,10 @@
\******************************************************************************/ \******************************************************************************/
/** /**
@file matcher.cpp, matcher_avx2.cpp, matcher_avx512bw.cpp @file matcher_avx512bw.cpp
@brief RE/flex matcher engine @brief RE/flex matcher engine
@author Robert van Engelen - engelen@genivia.com @author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved. @copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt @copyright (c) BSD-3 License - see LICENSE.txt
*/ */
@ -40,5 +40,344 @@
# endif # endif
#endif #endif
#define COMPILE_AVX512BW #include <reflex/matcher.h>
#include "matcher.cpp"
namespace reflex {
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
// AVX512BW runtime optimized function callback overrides
/// Pick the AVX512BW advance method for the current pattern, leaving adv_ untouched when no specialization applies.
/**
  Dispatch is on the literal prefix length pat_->len_:
  - 0 or 1 chars: no specialization, adv_ is not modified;
  - 2 or 3 chars: one of the simd_advance_chars*<LEN> variants, chosen by pat_->min_;
  - longer: one of the simd_advance_string* variants, but only when pat_->bmd_ is zero.
*/
void Matcher::simd_init_advance_avx512bw()
{
  switch (pat_->len_)
  {
    case 0:
    case 1:
      // no specialization
      break;
    case 2:
      if (pat_->min_ >= 4)
        adv_ = &Matcher::simd_advance_chars_pmh_avx512bw<2>;
      else if (pat_->min_ != 0)
        adv_ = &Matcher::simd_advance_chars_pma_avx512bw<2>;
      else
        adv_ = &Matcher::simd_advance_chars_avx512bw<2>;
      break;
    case 3:
      if (pat_->min_ >= 4)
        adv_ = &Matcher::simd_advance_chars_pmh_avx512bw<3>;
      else if (pat_->min_ != 0)
        adv_ = &Matcher::simd_advance_chars_pma_avx512bw<3>;
      else
        adv_ = &Matcher::simd_advance_chars_avx512bw<3>;
      break;
    default:
      // a nonzero bmd_ keeps whatever advance method was already selected
      if (pat_->bmd_ != 0)
        break;
#if defined(WITH_STRING_PM)
      if (pat_->min_ >= 4)
      {
        adv_ = &Matcher::simd_advance_string_pmh_avx512bw;
        break;
      }
      if (pat_->min_ > 0)
      {
        adv_ = &Matcher::simd_advance_string_pma_avx512bw;
        break;
      }
#endif
      adv_ = &Matcher::simd_advance_string_avx512bw;
      break;
  }
}
/// Few chars
template<uint8_t LEN>
bool Matcher::simd_advance_chars_avx512bw(size_t loc)
{
static const uint16_t lcp = 0;
static const uint16_t lcs = LEN - 1;
const char *chr = pat_->chr_;
while (true)
{
const char *s = buf_ + loc + lcp;
const char *e = buf_ + end_ + lcp - LEN + 1;
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
uint32_t offset = ctzl(mask);
if (LEN == 2 ||
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
{
loc = s - lcp + offset - buf_;
set_current(loc);
return true;
}
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
if (loc + LEN > end_)
return false;
if (loc + LEN + 63 > end_)
break;
}
return advance_chars<LEN>(loc);
}
/// Few chars followed by 2 to 3 minimal char pattern
template<uint8_t LEN>
bool Matcher::simd_advance_chars_pma_avx512bw(size_t loc)
{
static const uint16_t lcp = 0;
static const uint16_t lcs = LEN - 1;
const Pattern::Pred *pma = pat_->pma_;
const char *chr = pat_->chr_;
size_t min = pat_->min_;
while (true)
{
const char *s = buf_ + loc + lcp;
const char *e = buf_ + end_ + lcp - LEN + 1;
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
uint32_t offset = ctzl(mask);
if (LEN == 2 ||
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
{
loc = s - lcp + offset - buf_;
if (loc + LEN + 4 > end_ || Pattern::predict_match(pma, &buf_[loc + LEN]) == 0)
{
set_current(loc);
return true;
}
}
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
if (loc + LEN + min > end_)
return false;
if (loc + LEN + min + 63 > end_)
break;
}
return advance_chars_pma<LEN>(loc);
}
/// Few chars followed by 4 minimal char pattern
template<uint8_t LEN>
bool Matcher::simd_advance_chars_pmh_avx512bw(size_t loc)
{
static const uint16_t lcp = 0;
static const uint16_t lcs = LEN - 1;
const Pattern::Pred *pmh = pat_->pmh_;
const char *chr = pat_->chr_;
size_t min = pat_->min_;
while (true)
{
const char *s = buf_ + loc + lcp;
const char *e = buf_ + end_ + lcp - LEN + 1;
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
uint32_t offset = ctzl(mask);
if (LEN == 2 ||
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
{
loc = s - lcp + offset - buf_;
if (loc + LEN + min > end_ || Pattern::predict_match(pmh, &buf_[loc + LEN], min))
{
set_current(loc);
return true;
}
}
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
if (loc + LEN + min > end_)
return false;
if (loc + LEN + min + 63 > end_)
break;
}
return advance_chars_pmh<LEN>(loc);
}
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
bool Matcher::simd_advance_string_avx512bw(size_t loc)
{
const char *chr = pat_->chr_;
size_t len = pat_->len_;
uint16_t lcp = pat_->lcp_;
uint16_t lcs = pat_->lcs_;
while (true)
{
const char *s = buf_ + loc + lcp;
const char *e = buf_ + end_ + lcp - len + 1;
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
uint32_t offset = ctzl(mask);
if (std::memcmp(s - lcp + offset, chr, len) == 0)
{
loc = s - lcp + offset - buf_;
set_current(loc);
return true;
}
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
if (loc + len > end_)
return false;
if (loc + len + 63 > end_)
break;
}
return advance_string(loc);
}
#if defined(WITH_STRING_PM)
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
bool Matcher::simd_advance_string_pma_avx512bw(size_t loc)
{
const Pattern::Pred *pma = pat_->pma_;
const char *chr = pat_->chr_;
size_t len = pat_->len_;
size_t min = pat_->min_;
uint16_t lcp = pat_->lcp_;
uint16_t lcs = pat_->lcs_;
while (true)
{
const char *s = buf_ + loc + lcp;
const char *e = buf_ + end_ + lcp - len + 1;
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
uint32_t offset = ctzl(mask);
if (std::memcmp(s - lcp + offset, chr, len) == 0)
{
loc = s - lcp + offset - buf_;
if (loc + len + 4 > end_ || Pattern::predict_match(pma, &buf_[loc + len]) == 0)
{
set_current(loc);
return true;
}
}
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
if (loc + len + min > end_)
return false;
if (loc + len + min + 63 > end_)
break;
}
return advance_string_pma(loc);
}
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
bool Matcher::simd_advance_string_pmh_avx512bw(size_t loc)
{
const Pattern::Pred *pmh = pat_->pmh_;
const char *chr = pat_->chr_;
size_t len = pat_->len_;
size_t min = pat_->min_;
uint16_t lcp = pat_->lcp_;
uint16_t lcs = pat_->lcs_;
while (true)
{
const char *s = buf_ + loc + lcp;
const char *e = buf_ + end_ + lcp - len + 1;
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
uint32_t offset = ctzl(mask);
if (std::memcmp(s - lcp + offset, chr, len) == 0)
{
loc = s - lcp + offset - buf_;
if (loc + len + min > end_ || Pattern::predict_match(pmh, &buf_[loc + len], min))
{
set_current(loc);
return true;
}
}
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
if (loc + len + min > end_)
return false;
if (loc + len + min + 63 > end_)
break;
}
return advance_string_pmh(loc);
}
#endif // WITH_STRING_PM
#else
// Intentionally empty placeholder: gives this translation unit a symbol when the
// AVX512BW code above is compiled out, to appease ranlib "has no symbols".
void matcher_not_compiled_with_avx512bw()
{
}
#endif
} // namespace reflex

View File

@ -45,10 +45,10 @@
/// DFA compaction: -1 == reverse order edge compression (best); 1 == edge compression; 0 == no edge compression. /// DFA compaction: -1 == reverse order edge compression (best); 1 == edge compression; 0 == no edge compression.
/** Edge compression reorders edges to produce fewer tests when executed in the compacted order. /** Edge compression reorders edges to produce fewer tests when executed in the compacted order.
For example ([a-cg-ik]|d|[e-g]|j|y|[x-z]) after reverse edge compression has only 2 edges: For example ([a-cg-ik]|d|[e-g]|j|y|[x-z]) after reverse edge compression has only 2 edges:
c1 = m.FSM_CHAR(); c = m.FSM_CHAR();
if ('x' <= c1 && c1 <= 'z') goto S3; if ('x' <= c && c <= 'z') goto S3;
if ('a' <= c1 && c1 <= 'k') goto S3; if ('a' <= c && c <= 'k') goto S3;
return m.FSM_HALT(c1); return m.FSM_HALT(c);
*/ */
#define WITH_COMPACT_DFA -1 #define WITH_COMPACT_DFA -1
@ -177,6 +177,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
bmd_ = 0; bmd_ = 0;
npy_ = 0; npy_ = 0;
one_ = false; one_ = false;
bol_ = false;
vno_ = 0; vno_ = 0;
eno_ = 0; eno_ = 0;
hno_ = 0; hno_ = 0;
@ -197,36 +198,43 @@ void Pattern::init(const char *options, const uint8_t *pred)
len_ = pred[0]; len_ = pred[0];
min_ = pred[1] & 0x0f; min_ = pred[1] & 0x0f;
one_ = pred[1] & 0x10; one_ = pred[1] & 0x10;
bol_ = pred[1] & 0x40;
memcpy(chr_, pred + 2, len_); memcpy(chr_, pred + 2, len_);
size_t n = len_ + 2; size_t n = 2 + len_;
if (len_ == 0) if (len_ == 0)
{ {
// get bitap bit_[] parameters
for (size_t i = 0; i < 256; ++i) for (size_t i = 0; i < 256; ++i)
bit_[i] = ~pred[i + n]; bit_[i] = ~pred[i + n];
n += 256; n += 256;
} }
if (min_ >= 4) if (min_ < 4)
{
for (size_t i = 0; i < Const::HASH; ++i)
pmh_[i] = ~pred[i + n];
}
else
{ {
// get predict match PM4 pma_[] parameters
for (size_t i = 0; i < Const::HASH; ++i) for (size_t i = 0; i < Const::HASH; ++i)
pma_[i] = ~pred[i + n]; pma_[i] = ~pred[i + n];
} }
else
{
// get predict match hash pmh_[] parameters
for (size_t i = 0; i < Const::HASH; ++i)
pmh_[i] = ~pred[i + n];
}
n += Const::HASH;
if ((pred[1] & 0x20) != 0) if ((pred[1] & 0x20) != 0)
{ {
n += Const::HASH; // get lookback parameters lbk_ lbm_ and cbk_[] after s-t cut and first s-t cut pattern characters fst_[]
lbk_ = pred[n + 0] | (pred[n + 1] << 8); lbk_ = pred[n + 0] | (pred[n + 1] << 8);
lbm_ = pred[n + 2] | (pred[n + 3] << 8); lbm_ = pred[n + 2] | (pred[n + 3] << 8);
for (size_t i = 0; i < 256; ++i) for (size_t i = 0; i < 256; ++i)
cbk_.set(i, pred[n + 4 + (i >> 3)] & (1 << (i & 7))); cbk_.set(i, pred[n + 4 + (i >> 3)] & (1 << (i & 7)));
for (size_t i = 0; i < 256; ++i) for (size_t i = 0; i < 256; ++i)
fst_.set(i, pred[n + 32 + 4 + (i >> 3)] & (1 << (i & 7))); fst_.set(i, pred[n + 4 + 32 + (i >> 3)] & (1 << (i & 7)));
n += 4 + 32 + 32;
} }
else else
{ {
// get first pattern characters fst_[] from bitap
for (size_t i = 0; i < 256; ++i) for (size_t i = 0; i < 256; ++i)
fst_.set(i, (bit_[i] & 1) == 0); fst_.set(i, (bit_[i] & 1) == 0);
} }
@ -311,8 +319,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
} }
// needle count and frequency thresholds to enable needle-based search // needle count and frequency thresholds to enable needle-based search
uint16_t pinmax = 8; uint16_t pinmax = 8;
uint8_t freqmax1 = 91; // one position uint8_t freqmax = 251;
uint8_t freqmax2 = 251; // two positions
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2) #if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
if (have_HW_AVX512BW() || have_HW_AVX2()) if (have_HW_AVX512BW() || have_HW_AVX2())
pinmax = 16; pinmax = 16;
@ -331,7 +338,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
lcs_ = 0; lcs_ = 0;
uint16_t nlcp = 65535; // max and undefined uint16_t nlcp = 65535; // max and undefined
uint16_t nlcs = 65535; // max and undefined uint16_t nlcs = 65535; // max and undefined
uint16_t freqsum = 0;
uint8_t freqlcp = 255; // max uint8_t freqlcp = 255; // max
uint8_t freqlcs = 255; // max uint8_t freqlcs = 255; // max
size_t min = (min_ == 0 ? 1 : min_); size_t min = (min_ == 0 ? 1 : min_);
@ -339,7 +345,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
{ {
Pred mask = 1 << k; Pred mask = 1 << k;
uint16_t n = 0; uint16_t n = 0;
uint16_t sum = 0;
uint8_t max = 0; uint8_t max = 0;
// at position k count the matching characters and find the max character frequency // at position k count the matching characters and find the max character frequency
for (uint16_t i = 0; i < 256; ++i) for (uint16_t i = 0; i < 256; ++i)
@ -348,14 +353,13 @@ void Pattern::init(const char *options, const uint8_t *pred)
{ {
++n; ++n;
uint8_t freq = frequency(static_cast<uint8_t>(i)); uint8_t freq = frequency(static_cast<uint8_t>(i));
sum += freq;
if (freq > max) if (freq > max)
max = freq; max = freq;
} }
} }
if (n <= pinmax) if (n <= pinmax)
{ {
// pick the fewest and rarest (least frequently occurring) needles to search // pick the fewest and rarest (less frequently occurring) needles to search
if (max < freqlcp || (n < nlcp && max == freqlcp)) if (max < freqlcp || (n < nlcp && max == freqlcp))
{ {
lcs_ = lcp_; lcs_ = lcp_;
@ -363,7 +367,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
freqlcs = freqlcp; freqlcs = freqlcp;
lcp_ = static_cast<uint8_t>(k); lcp_ = static_cast<uint8_t>(k);
nlcp = n; nlcp = n;
freqsum = sum;
freqlcp = max; freqlcp = max;
} }
else if (n < nlcs || else if (n < nlcs ||
@ -377,8 +380,8 @@ void Pattern::init(const char *options, const uint8_t *pred)
} }
} }
} }
// one position to pin: make lcp and lcs equal (compared and optimized later) // one position to pin: make lcp and lcs equal to 0 (only one position at 0)
if (min == 1 || ((freqsum <= freqlcp || nlcs == 65535) && freqsum <= freqmax1)) if (min == 1 || nlcs == 65535)
{ {
nlcs = nlcp; nlcs = nlcp;
lcs_ = lcp_; lcs_ = lcp_;
@ -387,7 +390,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
uint16_t n = nlcp > nlcs ? nlcp : nlcs; uint16_t n = nlcp > nlcs ? nlcp : nlcs;
DBGLOG("min=%zu lcp=%hu(%hu) pin=%hu nlcp=%hu(%hu) freq=%hu(%hu) freqsum=%hu npy=%zu", min, lcp_, lcs_, n, nlcp, nlcs, freqlcp, freqlcs, freqsum, npy_); DBGLOG("min=%zu lcp=%hu(%hu) pin=%hu nlcp=%hu(%hu) freq=%hu(%hu) freqsum=%hu npy=%zu", min, lcp_, lcs_, n, nlcp, nlcs, freqlcp, freqlcs, freqsum, npy_);
// determine if a needle-based search is worthwhile, below or meeting the thresholds // determine if a needle-based search is worthwhile, below or meeting the thresholds
if (n <= pinmax && freqlcp <= freqmax2) if (n <= pinmax && freqlcp <= freqmax)
{ {
// bridge the gap from 9 to 16 to handle 9 to 16 combined // bridge the gap from 9 to 16 to handle 9 to 16 combined
if (n > 8) if (n > 8)
@ -412,7 +415,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
} }
else if (len_ > 1) else if (len_ > 1)
{ {
// Boyer-Moore preprocessing of the given string pattern pat of length len, generates bmd_ > 0 and bms_[] shifts // produce lcp and lcs positions and Boyer-Moore bms_[] shifts when bmd_ > 0
uint8_t n = static_cast<uint8_t>(len_); // okay to cast: actually never more than 255 uint8_t n = static_cast<uint8_t>(len_); // okay to cast: actually never more than 255
uint16_t i; uint16_t i;
for (i = 0; i < 256; ++i) for (i = 0; i < 256; ++i)
@ -433,13 +436,14 @@ void Pattern::init(const char *options, const uint8_t *pred)
lcs_ = lcp_; lcs_ = lcp_;
lcp_ = i; lcp_ = i;
} }
else if (lcpch != pch && frequency(lcsch) > freqpch) else if (frequency(lcsch) > freqpch ||
(frequency(lcsch) == freqpch &&
abs(static_cast<int>(lcp_) - static_cast<int>(lcs_)) < abs(static_cast<int>(lcp_) - static_cast<int>(i))))
{ {
lcs_ = i; lcs_ = i;
} }
} }
} }
DBGLOG("len=%zu lcp=%hu(%hu)", len_, lcp_, lcs_);
uint16_t j; uint16_t j;
for (i = n - 1, j = i; j > 0; --j) for (i = n - 1, j = i; j > 0; --j)
if (chr_[j - 1] == chr_[i]) if (chr_[j - 1] == chr_[i])
@ -469,7 +473,34 @@ void Pattern::init(const char *options, const uint8_t *pred)
#endif #endif
#endif #endif
if (lcs_ < 0xffff) if (lcs_ < 0xffff)
bmd_ = 0; // do not use B-M {
// do not use B-M
bmd_ = 0;
// spread lcp and lcs apart if lcp and lcs are adjacent (chars are possibly correlated)
if (len_ == 3 && (lcp_ == 1 || lcs_ == 1))
{
lcp_ = 0;
lcs_ = 2;
}
else if (len_ > 3 && (lcp_ + 1 == lcs_ || lcs_ + 1 == lcp_))
{
uint8_t freqlcs = 255;
for (i = 0; i < n; ++i)
{
if (i > lcp_ + 1 || i + 1 < lcp_)
{
uint8_t pch = static_cast<uint8_t>(chr_[i]);
uint8_t freqpch = frequency(pch);
if (freqlcs > freqpch)
{
lcs_ = i;
freqlcs = freqpch;
}
}
}
}
}
DBGLOG("len=%zu bmd=%zu lcp=%hu(%hu)", len_, bmd_, lcp_, lcs_);
} }
} }
@ -629,6 +660,7 @@ void Pattern::parse(
loc = 0; loc = 0;
} }
} }
bol_ = at(loc) == '^';
do do
{ {
Location end = loc; Location end = loc;
@ -733,6 +765,8 @@ void Pattern::parse(
} }
else else
{ {
if (at(loc) != '^')
bol_ = false;
parse2( parse2(
true, true,
loc, loc,
@ -2961,8 +2995,8 @@ void Pattern::gencode_dfa(const DFA::State *start) const
::fprintf(file, ::fprintf(file,
"void reflex_code_%s(reflex::Matcher& m)\n" "void reflex_code_%s(reflex::Matcher& m)\n"
"{\n" "{\n"
" int c0 = 0, c1 = 0;\n" " int c = 0;\n"
" m.FSM_INIT(c1);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str()); " m.FSM_INIT(c);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str());
for (const DFA::State *state = start; state != NULL; state = state->next) for (const DFA::State *state = start; state != NULL; state = state->next)
{ {
::fprintf(file, "\nS%u:\n", state->index); ::fprintf(file, "\nS%u:\n", state->index);
@ -2978,8 +3012,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
::fprintf(file, " m.FSM_HEAD(%u);\n", *i); ::fprintf(file, " m.FSM_HEAD(%u);\n", *i);
if (state->edges.rbegin() != state->edges.rend() && state->edges.rbegin()->first == META_DED) if (state->edges.rbegin() != state->edges.rend() && state->edges.rbegin()->first == META_DED)
::fprintf(file, " if (m.FSM_DENT()) goto S%u;\n", state->edges.rbegin()->second.second->index); ::fprintf(file, " if (m.FSM_DENT()) goto S%u;\n", state->edges.rbegin()->second.second->index);
bool peek = false; // if we need to read a character into c1 bool peek = false; // if we need to read a character into c
bool prev = false; // if we need to keep the previous character in c0
for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i) for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
{ {
#if WITH_COMPACT_DFA == -1 #if WITH_COMPACT_DFA == -1
@ -2993,13 +3026,12 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{ {
do do
{ {
if (lo == META_EOB || lo == META_EOL) if (lo == META_EOB || lo == META_EOL || lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
{
peek = true; peek = true;
else if (lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
prev = peek = true;
if (prev && peek)
break; break;
check_dfa_closure(i->second.second, 1, peek, prev); }
check_dfa_closure(i->second.second, 1, peek);
} while (++lo <= hi); } while (++lo <= hi);
} }
else else
@ -3025,10 +3057,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
target_index = i->second.second->index; target_index = i->second.second->index;
if (read) if (read)
{ {
if (prev) ::fprintf(file, " c = m.FSM_CHAR();\n");
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
else
::fprintf(file, " c1 = m.FSM_CHAR();\n");
read = false; read = false;
} }
if (is_meta(lo)) if (is_meta(lo))
@ -3039,14 +3068,6 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{ {
case META_EOB: case META_EOB:
case META_EOL: case META_EOL:
::fprintf(file, " ");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, 2, peek);
::fprintf(file, " }\n");
elif = true;
break;
case META_EWE: case META_EWE:
case META_BWE: case META_BWE:
case META_NWE: case META_NWE:
@ -3054,7 +3075,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
::fprintf(file, " "); ::fprintf(file, " ");
if (elif) if (elif)
::fprintf(file, "else "); ::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]); ::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, 2, peek); gencode_dfa_closure(file, i->second.second, 2, peek);
::fprintf(file, " }\n"); ::fprintf(file, " }\n");
elif = true; elif = true;
@ -3077,7 +3098,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
break; break;
if (lo == hi) if (lo == hi)
{ {
::fprintf(file, " if (c1 == "); ::fprintf(file, " if (c == ");
print_char(file, lo); print_char(file, lo);
::fprintf(file, ")"); ::fprintf(file, ")");
} }
@ -3085,20 +3106,20 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{ {
::fprintf(file, " if ("); ::fprintf(file, " if (");
print_char(file, lo); print_char(file, lo);
::fprintf(file, " <= c1)"); ::fprintf(file, " <= c)");
} }
else else
{ {
::fprintf(file, " if ("); ::fprintf(file, " if (");
print_char(file, lo); print_char(file, lo);
::fprintf(file, " <= c1 && c1 <= "); ::fprintf(file, " <= c && c <= ");
print_char(file, hi); print_char(file, hi);
::fprintf(file, ")"); ::fprintf(file, ")");
} }
if (target_index == Const::IMAX) if (target_index == Const::IMAX)
{ {
if (peek) if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n"); ::fprintf(file, " return m.FSM_HALT(c);\n");
else else
::fprintf(file, " return m.FSM_HALT();\n"); ::fprintf(file, " return m.FSM_HALT();\n");
} }
@ -3117,10 +3138,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{ {
if (read) if (read)
{ {
if (prev) ::fprintf(file, " c = m.FSM_CHAR();\n");
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
else
::fprintf(file, " c1 = m.FSM_CHAR();\n");
read = false; read = false;
} }
do do
@ -3129,14 +3147,6 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{ {
case META_EOB: case META_EOB:
case META_EOL: case META_EOL:
::fprintf(file, " ");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, 2, peek);
::fprintf(file, " }\n");
elif = true;
break;
case META_EWE: case META_EWE:
case META_BWE: case META_BWE:
case META_NWE: case META_NWE:
@ -3144,7 +3154,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
::fprintf(file, " "); ::fprintf(file, " ");
if (elif) if (elif)
::fprintf(file, "else "); ::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]); ::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, 2, peek); gencode_dfa_closure(file, i->second.second, 2, peek);
::fprintf(file, " }\n"); ::fprintf(file, " }\n");
elif = true; elif = true;
@ -3170,10 +3180,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
target_index = i->second.second->index; target_index = i->second.second->index;
if (read) if (read)
{ {
if (prev) ::fprintf(file, " c = m.FSM_CHAR();\n");
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
else
::fprintf(file, " c1 = m.FSM_CHAR();\n");
read = false; read = false;
} }
if (!is_meta(lo)) if (!is_meta(lo))
@ -3183,7 +3190,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
break; break;
if (lo == hi) if (lo == hi)
{ {
::fprintf(file, " if (c1 == "); ::fprintf(file, " if (c == ");
print_char(file, lo); print_char(file, lo);
::fprintf(file, ")"); ::fprintf(file, ")");
} }
@ -3191,20 +3198,20 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{ {
::fprintf(file, " if ("); ::fprintf(file, " if (");
print_char(file, lo); print_char(file, lo);
::fprintf(file, " <= c1)"); ::fprintf(file, " <= c)");
} }
else else
{ {
::fprintf(file, " if ("); ::fprintf(file, " if (");
print_char(file, lo); print_char(file, lo);
::fprintf(file, " <= c1 && c1 <= "); ::fprintf(file, " <= c && c <= ");
print_char(file, hi); print_char(file, hi);
::fprintf(file, ")"); ::fprintf(file, ")");
} }
if (target_index == Const::IMAX) if (target_index == Const::IMAX)
{ {
if (peek) if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n"); ::fprintf(file, " return m.FSM_HALT(c);\n");
else else
::fprintf(file, " return m.FSM_HALT();\n"); ::fprintf(file, " return m.FSM_HALT();\n");
} }
@ -3216,7 +3223,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
} }
#endif #endif
if (peek) if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n"); ::fprintf(file, " return m.FSM_HALT(c);\n");
else else
::fprintf(file, " return m.FSM_HALT();\n"); ::fprintf(file, " return m.FSM_HALT();\n");
} }
@ -3234,7 +3241,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
} }
#ifndef WITH_NO_CODEGEN #ifndef WITH_NO_CODEGEN
void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, bool& prev) const void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek) const
{ {
if (nest > 5) if (nest > 5)
return; return;
@ -3251,13 +3258,12 @@ void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, b
{ {
do do
{ {
if (lo == META_EOB || lo == META_EOL) if (lo == META_EOB || lo == META_EOL || lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
{
peek = true; peek = true;
else if (lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
prev = peek = true;
if (prev && peek)
break; break;
check_dfa_closure(i->second.second, nest + 1, peek, prev); }
check_dfa_closure(i->second.second, nest + 1, peek);
} while (++lo <= hi); } while (++lo <= hi);
} }
} }
@ -3271,14 +3277,14 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
if (state->redo) if (state->redo)
{ {
if (peek) if (peek)
::fprintf(file, "%*sm.FSM_REDO(c1);\n", 2*nest, ""); ::fprintf(file, "%*sm.FSM_REDO(c);\n", 2*nest, "");
else else
::fprintf(file, "%*sm.FSM_REDO();\n", 2*nest, ""); ::fprintf(file, "%*sm.FSM_REDO();\n", 2*nest, "");
} }
else if (state->accept > 0) else if (state->accept > 0)
{ {
if (peek) if (peek)
::fprintf(file, "%*sm.FSM_TAKE(%u, c1);\n", 2*nest, "", state->accept); ::fprintf(file, "%*sm.FSM_TAKE(%u, c);\n", 2*nest, "", state->accept);
else else
::fprintf(file, "%*sm.FSM_TAKE(%u);\n", 2*nest, "", state->accept); ::fprintf(file, "%*sm.FSM_TAKE(%u);\n", 2*nest, "", state->accept);
} }
@ -3303,14 +3309,6 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
{ {
case META_EOB: case META_EOB:
case META_EOL: case META_EOL:
::fprintf(file, "%*s", 2*nest, "");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, nest + 1, peek);
::fprintf(file, "%*s}\n", 2*nest, "");
elif = true;
break;
case META_EWE: case META_EWE:
case META_BWE: case META_BWE:
case META_NWE: case META_NWE:
@ -3318,7 +3316,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
::fprintf(file, "%*s", 2*nest, ""); ::fprintf(file, "%*s", 2*nest, "");
if (elif) if (elif)
::fprintf(file, "else "); ::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]); ::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, nest + 1, peek); gencode_dfa_closure(file, i->second.second, nest + 1, peek);
::fprintf(file, "%*s}\n", 2*nest, ""); ::fprintf(file, "%*s}\n", 2*nest, "");
elif = true; elif = true;
@ -3346,7 +3344,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
::fprintf(file, "%*s", 2*nest, ""); ::fprintf(file, "%*s", 2*nest, "");
if (lo == hi) if (lo == hi)
{ {
::fprintf(file, "if (c1 == "); ::fprintf(file, "if (c == ");
print_char(file, lo); print_char(file, lo);
::fprintf(file, ")"); ::fprintf(file, ")");
} }
@ -3354,20 +3352,20 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
{ {
::fprintf(file, "if ("); ::fprintf(file, "if (");
print_char(file, lo); print_char(file, lo);
::fprintf(file, " <= c1)"); ::fprintf(file, " <= c)");
} }
else else
{ {
::fprintf(file, "if ("); ::fprintf(file, "if (");
print_char(file, lo); print_char(file, lo);
::fprintf(file, " <= c1 && c1 <= "); ::fprintf(file, " <= c && c <= ");
print_char(file, hi); print_char(file, hi);
::fprintf(file, ")"); ::fprintf(file, ")");
} }
if (target_index == Const::IMAX) if (target_index == Const::IMAX)
{ {
if (peek) if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n"); ::fprintf(file, " return m.FSM_HALT(c);\n");
else else
::fprintf(file, " return m.FSM_HALT();\n"); ::fprintf(file, " return m.FSM_HALT();\n");
} }
@ -3394,7 +3392,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
::fprintf(file, "%*s", 2*nest, ""); ::fprintf(file, "%*s", 2*nest, "");
if (lo == hi) if (lo == hi)
{ {
::fprintf(file, "if (c1 == "); ::fprintf(file, "if (c == ");
print_char(file, lo); print_char(file, lo);
::fprintf(file, ")"); ::fprintf(file, ")");
} }
@ -3402,20 +3400,20 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
{ {
::fprintf(file, "if ("); ::fprintf(file, "if (");
print_char(file, lo); print_char(file, lo);
::fprintf(file, " <= c1)"); ::fprintf(file, " <= c)");
} }
else else
{ {
::fprintf(file, "if ("); ::fprintf(file, "if (");
print_char(file, lo); print_char(file, lo);
::fprintf(file, " <= c1 && c1 <= "); ::fprintf(file, " <= c && c <= ");
print_char(file, hi); print_char(file, hi);
::fprintf(file, ")"); ::fprintf(file, ")");
} }
if (target_index == Const::IMAX) if (target_index == Const::IMAX)
{ {
if (peek) if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n"); ::fprintf(file, " return m.FSM_HALT(c);\n");
else else
::fprintf(file, " return m.FSM_HALT();\n"); ::fprintf(file, " return m.FSM_HALT();\n");
} }
@ -4560,26 +4558,31 @@ bool Pattern::match_hfa_transitions(size_t level, const HFA::Hashes& hashes, con
void Pattern::write_predictor(FILE *file) const void Pattern::write_predictor(FILE *file) const
{ {
::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (len_ == 0) * 256 + Const::HASH + (lbk_ > 0) * 68); ::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (len_ == 0) * 256 + Const::HASH + (lbk_ > 0) * 68);
::fprintf(file, "\n %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4) | ((lbk_ > 0) << 5)))); ::fprintf(file, "\n %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4) | ((lbk_ > 0) << 5) | (bol_ << 6))));
// save match characters chr_[0..len_-1]
for (size_t i = 0; i < len_; ++i) for (size_t i = 0; i < len_; ++i)
::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n ", static_cast<uint8_t>(chr_[i])); ::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n ", static_cast<uint8_t>(chr_[i]));
if (len_ == 0) if (len_ == 0)
{ {
// save bitap bit_[] parameters
for (Char i = 0; i < 256; ++i) for (Char i = 0; i < 256; ++i)
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~bit_[i])); ::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~bit_[i]));
} }
if (min_ >= 4) if (min_ < 4)
{
for (Hash i = 0; i < Const::HASH; ++i)
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pmh_[i]));
}
else
{ {
// save predict match PM4 pma_[] parameters
for (Hash i = 0; i < Const::HASH; ++i) for (Hash i = 0; i < Const::HASH; ++i)
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pma_[i])); ::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pma_[i]));
} }
else
{
// save predict match hash pmh_[] parameters
for (Hash i = 0; i < Const::HASH; ++i)
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pmh_[i]));
}
if (lbk_ > 0) if (lbk_ > 0)
{ {
// save lookback parameters lbk_ lbm_ cbk_[] after s-t cut and first s-t cut pattern characters fst_[]
::fprintf(file, "\n %3hhu,%3hhu,%3hhu,%3hhu,", static_cast<uint8_t>(lbk_ & 0xff), static_cast<uint8_t>(lbk_ >> 8), static_cast<uint8_t>(lbm_ & 0xff), static_cast<uint8_t>(lbm_ >> 8)); ::fprintf(file, "\n %3hhu,%3hhu,%3hhu,%3hhu,", static_cast<uint8_t>(lbk_ & 0xff), static_cast<uint8_t>(lbk_ >> 8), static_cast<uint8_t>(lbm_ & 0xff), static_cast<uint8_t>(lbm_ >> 8));
for (size_t i = 0; i < 256; i += 8) for (size_t i = 0; i < 256; i += 8)
{ {

View File

@ -0,0 +1,157 @@
/******************************************************************************\
* Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions are met: *
* *
* (1) Redistributions of source code must retain the above copyright notice, *
* this list of conditions and the following disclaimer. *
* *
* (2) Redistributions in binary form must reproduce the above copyright *
* notice, this list of conditions and the following disclaimer in the *
* documentation and/or other materials provided with the distribution. *
* *
* (3) The name of the author may not be used to endorse or promote products *
* derived from this software without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF *
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO *
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; *
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, *
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR *
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF *
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
\******************************************************************************/
/**
@file simd.cpp
@brief RE/flex SIMD primitives
@author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt
*/
#include <reflex/simd.h>
namespace reflex {
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
// simd.h get_HW()
/// Query the CPU identification leaves once and pack the feature words into a single 64-bit value.
/// @returns word [2] of the EAX=1 query in the low 32 bits and word [1] of the EAX=7, ECX=0 query
///          in the high 32 bits, or 0 when the first query reports no usable leaf.
/// NOTE(review): presumably cpuidex() fills the array in EAX, EBX, ECX, EDX order, which would make
/// these the leaf-1 ECX and leaf-7 EBX feature flags; confirm against the cpuidex() wrapper in simd.h.
static uint64_t get_HW()
{
  int leaf1[4] = { 0, 0, 0, 0 };
  int leaf7[4] = { 0, 0, 0, 0 };
  // the first query (EAX=0) yields the value that gates the queries below
  cpuidex(leaf1, 0, 0);
  const int max_leaf = leaf1[0];
  if (max_leaf > 0)
  {
    cpuidex(leaf1, 1, 0); // cpuid EAX=1
    // the extended query is only issued when the reported maximum covers it, otherwise leaf7 stays zero
    if (max_leaf >= 7)
      cpuidex(leaf7, 7, 0); // cpuid EAX=7, ECX=0
    // go through uint32_t first so that a negative int is not sign-extended into the other half
    const uint64_t lo = static_cast<uint32_t>(leaf1[2]);
    const uint64_t hi = static_cast<uint32_t>(leaf7[1]);
    return lo | (hi << 32);
  }
  return 0ULL;
}
uint64_t HW = get_HW();
#endif
/// Count the newline characters in the half-open byte range [s, t).
/// @param s points to the first byte to inspect
/// @param t points one past the last byte to inspect
/// @returns the number of '\n' bytes found in [s, t); 0 when the range is empty
///
/// Ranges of at least 256 bytes are first processed in bulk by a SIMD routine selected at compile
/// time (HAVE_AVX512BW / HAVE_AVX2 / HAVE_SSE2 / HAVE_NEON) and, for the AVX variants, confirmed at
/// run time by have_HW_AVX512BW() / have_HW_AVX2().  Whatever the bulk step leaves uncounted is
/// finished by the portable scalar code at the end.
size_t nlcount(const char *s, const char *t)
{
  size_t n = 0;
  // Compare the distance t - s instead of forming t - 256: for a range shorter than 256 bytes that
  // pointer would lie before the start of the buffer, which is undefined behavior.
  if (t - s >= 256)
  {
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
    // the simd_nlcount_*() helpers take s by reference and advance it past the part they counted
    if (have_HW_AVX512BW())
      n = simd_nlcount_avx512bw(s, t);
    else if (have_HW_AVX2())
      n = simd_nlcount_avx2(s, t);
    else
#elif defined(HAVE_AVX512BW) || defined(HAVE_AVX2)
    if (have_HW_AVX2())
      n = simd_nlcount_avx2(s, t);
    else
#endif
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
    {
      // SSE2 fallback: 64 bytes per iteration; safe to form t - 64 since the range holds >= 256 bytes
      const char *e = t - 64;
      // align on 16 bytes
      while ((reinterpret_cast<std::ptrdiff_t>(s) & 0x0f) != 0)
        n += (*s++ == '\n');
      __m128i vlcn = _mm_set1_epi8('\n');
      while (s <= e)
      {
        __m128i vlcm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
        __m128i vlcm2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 16));
        __m128i vlcm3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 32));
        __m128i vlcm4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 48));
        __m128i vlceq1 = _mm_cmpeq_epi8(vlcm1, vlcn);
        __m128i vlceq2 = _mm_cmpeq_epi8(vlcm2, vlcn);
        __m128i vlceq3 = _mm_cmpeq_epi8(vlcm3, vlcn);
        __m128i vlceq4 = _mm_cmpeq_epi8(vlcm4, vlcn);
        // each movemask has one bit per matching byte, so the popcounts add up to the newline count
        n += popcount(_mm_movemask_epi8(vlceq1))
          +  popcount(_mm_movemask_epi8(vlceq2))
          +  popcount(_mm_movemask_epi8(vlceq3))
          +  popcount(_mm_movemask_epi8(vlceq4));
        s += 64;
      }
    }
#elif defined(HAVE_NEON)
    // NEON: 64 bytes per iteration; safe to form t - 64 since the range holds >= 256 bytes
    const char *e = t - 64;
    uint8x16_t vlcn = vdupq_n_u8('\n');
    while (s <= e)
    {
      uint8x16_t vlcm0 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq0 = vceqq_u8(vlcm0, vlcn);
      s += 16;
      uint8x16_t vlcm1 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq1 = vceqq_u8(vlcm1, vlcn);
      s += 16;
      uint8x16_t vlcm2 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq2 = vceqq_u8(vlcm2, vlcn);
      s += 16;
      uint8x16_t vlcm3 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq3 = vceqq_u8(vlcm3, vlcn);
      s += 16;
#if defined(__aarch64__)
      n += vaddvq_s8(vqabsq_s8(vreinterpretq_s8_u8(vaddq_u8(vleq0, vaddq_u8(vleq1, vaddq_u8(vleq2, vleq3))))));
#else
      // my homebrew horizontal sum (we have a very limited range 0..4 to sum to a total max 4x16=64 < 256)
      uint64x2_t vsum = vreinterpretq_u64_s8(vqabsq_s8(vreinterpretq_s8_u8(vaddq_u8(vleq0, vaddq_u8(vleq1, vaddq_u8(vleq2, vleq3))))));
      uint64_t sum0 = vgetq_lane_u64(vsum, 0) + vgetq_lane_u64(vsum, 1);
      uint32_t sum1 = static_cast<uint32_t>(sum0) + (sum0 >> 32);
      uint16_t sum2 = static_cast<uint16_t>(sum1) + (sum1 >> 16);
      n += static_cast<uint8_t>(sum2) + (sum2 >> 8);
#endif
    }
#endif
  }
  // 4-way auto-vectorizable loop; t - 3 is only formed once at least 4 bytes are known to remain
  if (t - s >= 4)
  {
    const char *e = t - 3;
    uint32_t n0 = 0, n1 = 0, n2 = 0, n3 = 0;
    while (s < e)
    {
      n0 += s[0] == '\n';
      n1 += s[1] == '\n';
      n2 += s[2] == '\n';
      n3 += s[3] == '\n';
      s += 4;
    }
    n += n0 + n1 + n2 + n3;
  }
  // epilogue: at most three bytes are left at this point
  while (s < t)
    n += (*s++ == '\n');
  return n;
}
} // namespace reflex

View File

@ -34,15 +34,20 @@
@copyright (c) BSD-3 License - see LICENSE.txt @copyright (c) BSD-3 License - see LICENSE.txt
*/ */
#include <reflex/absmatcher.h> #if defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
#include <cstddef> # if !defined(__AVX2__) && !defined(__AVX512BW__)
# error simd_avx2.cpp must be compiled with -mavx2 or /arch:avx2.
# endif
#endif
#include <reflex/simd.h>
namespace reflex { namespace reflex {
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part // Partially count newlines in string b up to e, updates b close to e with uncounted part
size_t simd_nlcount_avx2(const char*& b, const char *e) size_t simd_nlcount_avx2(const char*& b, const char *e)
{ {
#if defined(HAVE_AVX2) #if defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
const char *s = b; const char *s = b;
e -= 128; e -= 128;
if (s > e) if (s > e)
@ -73,42 +78,4 @@ size_t simd_nlcount_avx2(const char*& b, const char *e)
#endif #endif
} }
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
size_t simd_nlcount_sse2(const char*& b, const char *e)
{
#if defined(HAVE_SSE2)
const char *s = b;
e -= 64;
if (s > e)
return 0;
size_t n = 0;
// align on 16 bytes
while ((reinterpret_cast<std::ptrdiff_t>(s) & 0x0f) != 0)
n += (*s++ == '\n');
__m128i vlcn = _mm_set1_epi8('\n');
while (s <= e)
{
__m128i vlcm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
__m128i vlcm2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 16));
__m128i vlcm3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 32));
__m128i vlcm4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 48));
__m128i vlceq1 = _mm_cmpeq_epi8(vlcm1, vlcn);
__m128i vlceq2 = _mm_cmpeq_epi8(vlcm2, vlcn);
__m128i vlceq3 = _mm_cmpeq_epi8(vlcm3, vlcn);
__m128i vlceq4 = _mm_cmpeq_epi8(vlcm4, vlcn);
n += popcount(_mm_movemask_epi8(vlceq1))
+ popcount(_mm_movemask_epi8(vlceq2))
+ popcount(_mm_movemask_epi8(vlceq3))
+ popcount(_mm_movemask_epi8(vlceq4));
s += 64;
}
b = s;
return n;
#else
(void)b;
(void)e;
return 0;
#endif
}
} // namespace reflex } // namespace reflex

View File

@ -34,12 +34,17 @@
@copyright (c) BSD-3 License - see LICENSE.txt @copyright (c) BSD-3 License - see LICENSE.txt
*/ */
#include <reflex/absmatcher.h> #if defined(HAVE_AVX512BW)
#include <cstddef> # if !defined(__AVX512BW__)
# error simd_avx512bw.cpp must be compiled with -mavx512bw or /arch:avx512.
# endif
#endif
#include <reflex/simd.h>
namespace reflex { namespace reflex {
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part // Partially count newlines in string b up to e, updates b close to e with uncounted part
size_t simd_nlcount_avx512bw(const char*& b, const char *e) size_t simd_nlcount_avx512bw(const char*& b, const char *e)
{ {
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64)) #if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))

View File

@ -83,9 +83,15 @@ Tables::Tables()
range["Control"] = range["Cc"]; range["Control"] = range["Cc"];
range["Format"] = range["Cf"]; range["Format"] = range["Cf"];
range["d"] = range["Decimal_Digit_Number"]; range["Cntrl"] = range["C"];
range["l"] = range["Lowercase_Letter"]; range["Digit"] = range["Nd"];
range["u"] = range["Uppercase_Letter"]; range["Lower"] = range["Ll"];
range["Punct"] = range["P"];
range["Upper"] = range["Lu"];
range["d"] = range["Digit"];
range["l"] = range["Lower"];
range["u"] = range["Upper"];
range["s"] = range["Space"]; range["s"] = range["Space"];
range["w"] = range["Word"]; range["w"] = range["Word"];
} }

View File

@ -2107,6 +2107,36 @@ void reflex::Unicode::Tables::language_scripts(void)
0, 0 0, 0
}; };
range["Grantha"] = Grantha; range["Grantha"] = Grantha;
static const int Graph[] = {
33, 126,
161, 172,
174, 1535,
1542, 1563,
1565, 1756,
1758, 1806,
1808, 2191,
2194, 2273,
2275, 5759,
5761, 6157,
6159, 8191,
8208, 8231,
8240, 8286,
8293, 8293,
8304, 12287,
12289, 55295,
57344, 65278,
65280, 65528,
65532, 69820,
69822, 69836,
69838, 78895,
78912, 113823,
113828, 119154,
119163, 917504,
917506, 917535,
917632, 1114111,
0, 0
};
range["Graph"] = Graph;
static const int Greek[] = { static const int Greek[] = {
880, 883, 880, 883,
885, 887, 885, 887,
@ -7430,6 +7460,34 @@ void reflex::Unicode::Tables::language_scripts(void)
0, 0 0, 0
}; };
range["Po"] = Po; range["Po"] = Po;
static const int Print[] = {
32, 126,
160, 172,
174, 1535,
1542, 1563,
1565, 1756,
1758, 1806,
1808, 2191,
2194, 2273,
2275, 6157,
6159, 8202,
8208, 8233,
8239, 8287,
8293, 8293,
8304, 55295,
57344, 65278,
65280, 65528,
65532, 69820,
69822, 69836,
69838, 78895,
78912, 113823,
113828, 119154,
119163, 917504,
917506, 917535,
917632, 1114111,
0, 0
};
range["Print"] = Print;
static const int Ps[] = { static const int Ps[] = {
40, 40, 40, 40,
91, 91, 91, 91,

View File

@ -2,6 +2,377 @@
#include <reflex/unicode.h> #include <reflex/unicode.h>
void reflex::Unicode::Tables::letter_scripts(void) void reflex::Unicode::Tables::letter_scripts(void)
{ {
static const int Alnum[] = {
48, 57,
65, 90,
97, 122,
181, 181,
192, 214,
216, 246,
248, 442,
444, 447,
452, 452,
454, 455,
457, 458,
460, 497,
499, 659,
661, 687,
880, 883,
886, 887,
891, 893,
895, 895,
902, 902,
904, 906,
908, 908,
910, 929,
931, 1013,
1015, 1153,
1162, 1327,
1329, 1366,
1376, 1416,
1632, 1641,
1776, 1785,
1984, 1993,
2406, 2415,
2534, 2543,
2662, 2671,
2790, 2799,
2918, 2927,
3046, 3055,
3174, 3183,
3302, 3311,
3430, 3439,
3558, 3567,
3664, 3673,
3792, 3801,
3872, 3881,
4160, 4169,
4240, 4249,
4256, 4293,
4295, 4295,
4301, 4301,
4304, 4346,
4349, 4351,
5024, 5109,
5112, 5117,
6112, 6121,
6160, 6169,
6470, 6479,
6608, 6617,
6784, 6793,
6800, 6809,
6992, 7001,
7088, 7097,
7232, 7241,
7248, 7257,
7296, 7304,
7312, 7354,
7357, 7359,
7424, 7467,
7531, 7543,
7545, 7578,
7680, 7957,
7960, 7965,
7968, 8005,
8008, 8013,
8016, 8023,
8025, 8025,
8027, 8027,
8029, 8029,
8031, 8061,
8064, 8071,
8080, 8087,
8096, 8103,
8112, 8116,
8118, 8123,
8126, 8126,
8130, 8132,
8134, 8139,
8144, 8147,
8150, 8155,
8160, 8172,
8178, 8180,
8182, 8187,
8450, 8450,
8455, 8455,
8458, 8467,
8469, 8469,
8473, 8477,
8484, 8484,
8486, 8486,
8488, 8488,
8490, 8493,
8495, 8500,
8505, 8505,
8508, 8511,
8517, 8521,
8526, 8526,
8579, 8580,
11264, 11387,
11390, 11492,
11499, 11502,
11506, 11507,
11520, 11557,
11559, 11559,
11565, 11565,
42528, 42537,
42560, 42605,
42624, 42651,
42786, 42863,
42865, 42887,
42891, 42894,
42896, 42954,
42960, 42961,
42963, 42963,
42965, 42969,
42997, 42998,
43002, 43002,
43216, 43225,
43264, 43273,
43472, 43481,
43504, 43513,
43600, 43609,
43824, 43866,
43872, 43880,
43888, 43967,
44016, 44025,
64256, 64262,
64275, 64279,
65296, 65305,
65313, 65338,
65345, 65370,
66560, 66639,
66720, 66729,
66736, 66771,
66776, 66811,
66928, 66938,
66940, 66954,
66956, 66962,
66964, 66965,
66967, 66977,
66979, 66993,
66995, 67001,
67003, 67004,
68736, 68786,
68800, 68850,
68912, 68921,
69734, 69743,
69872, 69881,
69942, 69951,
70096, 70105,
70384, 70393,
70736, 70745,
70864, 70873,
71248, 71257,
71360, 71369,
71472, 71481,
71840, 71913,
72016, 72025,
72784, 72793,
73040, 73049,
73120, 73129,
73552, 73561,
92768, 92777,
92864, 92873,
93008, 93017,
93760, 93823,
119808, 119892,
119894, 119964,
119966, 119967,
119970, 119970,
119973, 119974,
119977, 119980,
119982, 119993,
119995, 119995,
119997, 120003,
120005, 120069,
120071, 120074,
120077, 120084,
120086, 120092,
120094, 120121,
120123, 120126,
120128, 120132,
120134, 120134,
120138, 120144,
120146, 120485,
120488, 120512,
120514, 120538,
120540, 120570,
120572, 120596,
120598, 120628,
120630, 120654,
120656, 120686,
120688, 120712,
120714, 120744,
120746, 120770,
120772, 120779,
120782, 120831,
122624, 122633,
122635, 122654,
122661, 122666,
123200, 123209,
123632, 123641,
124144, 124153,
125184, 125251,
125264, 125273,
130032, 130041,
0, 0
};
range["Alnum"] = Alnum;
static const int Alpha[] = {
65, 90,
97, 122,
181, 181,
192, 214,
216, 246,
248, 442,
444, 447,
452, 452,
454, 455,
457, 458,
460, 497,
499, 659,
661, 687,
880, 883,
886, 887,
891, 893,
895, 895,
902, 902,
904, 906,
908, 908,
910, 929,
931, 1013,
1015, 1153,
1162, 1327,
1329, 1366,
1376, 1416,
4256, 4293,
4295, 4295,
4301, 4301,
4304, 4346,
4349, 4351,
5024, 5109,
5112, 5117,
7296, 7304,
7312, 7354,
7357, 7359,
7424, 7467,
7531, 7543,
7545, 7578,
7680, 7957,
7960, 7965,
7968, 8005,
8008, 8013,
8016, 8023,
8025, 8025,
8027, 8027,
8029, 8029,
8031, 8061,
8064, 8071,
8080, 8087,
8096, 8103,
8112, 8116,
8118, 8123,
8126, 8126,
8130, 8132,
8134, 8139,
8144, 8147,
8150, 8155,
8160, 8172,
8178, 8180,
8182, 8187,
8450, 8450,
8455, 8455,
8458, 8467,
8469, 8469,
8473, 8477,
8484, 8484,
8486, 8486,
8488, 8488,
8490, 8493,
8495, 8500,
8505, 8505,
8508, 8511,
8517, 8521,
8526, 8526,
8579, 8580,
11264, 11387,
11390, 11492,
11499, 11502,
11506, 11507,
11520, 11557,
11559, 11559,
11565, 11565,
42560, 42605,
42624, 42651,
42786, 42863,
42865, 42887,
42891, 42894,
42896, 42954,
42960, 42961,
42963, 42963,
42965, 42969,
42997, 42998,
43002, 43002,
43824, 43866,
43872, 43880,
43888, 43967,
64256, 64262,
64275, 64279,
65313, 65338,
65345, 65370,
66560, 66639,
66736, 66771,
66776, 66811,
66928, 66938,
66940, 66954,
66956, 66962,
66964, 66965,
66967, 66977,
66979, 66993,
66995, 67001,
67003, 67004,
68736, 68786,
68800, 68850,
71840, 71903,
93760, 93823,
119808, 119892,
119894, 119964,
119966, 119967,
119970, 119970,
119973, 119974,
119977, 119980,
119982, 119993,
119995, 119995,
119997, 120003,
120005, 120069,
120071, 120074,
120077, 120084,
120086, 120092,
120094, 120121,
120123, 120126,
120128, 120132,
120134, 120134,
120138, 120144,
120146, 120485,
120488, 120512,
120514, 120538,
120540, 120570,
120572, 120596,
120598, 120628,
120630, 120654,
120656, 120686,
120688, 120712,
120714, 120744,
120746, 120770,
120772, 120779,
122624, 122633,
122635, 122654,
122661, 122666,
125184, 125251,
0, 0
};
range["Alpha"] = Alpha;
static const int Ll[] = { static const int Ll[] = {
97, 122, 97, 122,
181, 181, 181, 181,

View File

@ -31,6 +31,7 @@
#include "../lib/error.cpp" #include "../lib/error.cpp"
#include "../lib/input.cpp" #include "../lib/input.cpp"
#include "../lib/matcher.cpp" #include "../lib/matcher.cpp"
#include "../lib/simd.cpp"
#undef min #undef min
#undef max #undef max

View File

@ -2,6 +2,11 @@
$workDir = Resolve-Path -Path "${PSScriptRoot}\..\ccl\rslang\src" $workDir = Resolve-Path -Path "${PSScriptRoot}\..\ccl\rslang\src"
# Change default relative path according to your work directory setup
# Re-flex repository: https://github.com/Genivia/RE-flex
$reflexRelative = Resolve-Path -Path "${PSScriptRoot}\..\..\GH-RE-flex\bin\win64"
$Env:PATH += ";${reflexRelative}"
function BuildLexers { function BuildLexers {
Set-Location -Path ${workDir} Set-Location -Path ${workDir}
BuildSyntax('AsciiLexer') BuildSyntax('AsciiLexer')
@ -21,4 +26,4 @@ function BuildSyntax([string] $lexerName) {
Move-Item -Path "${lexer}.hpp" -Destination "..\header\${lexer}.hpp" -Force Move-Item -Path "${lexer}.hpp" -Destination "..\header\${lexer}.hpp" -Force
} }
BuildLexers BuildLexers