Update re-flex

This commit is contained in:
Ivan 2024-07-14 12:30:40 +03:00
parent 735d315862
commit 0c6d998cc5
22 changed files with 8736 additions and 4394 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -30,7 +30,7 @@
@file absmatcher.h
@brief RE/flex abstract matcher base class and pattern matcher class
@author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt
*/
@ -42,12 +42,12 @@
#define WITH_REALLOC 1
#endif
/// This compile-time option speeds up matching, but slows input().
/// This compile-time option speeds up matching, but slows input() somewhat.
#ifndef WITH_FAST_GET
#define WITH_FAST_GET 1
#endif
/// This compile-time option adds span(), line(), wline(), bol(), eol()
/// This compile-time option adds span(), line(), wline(), bol(), eol().
#ifndef WITH_SPAN
#define WITH_SPAN 1
#endif
@ -111,7 +111,7 @@ class AbstractMatcher {
static const int EOB = EOF; ///< end of buffer meta-char marker
static const size_t BLOCK = 4096; ///< minimum remaining unused space in the buffer, to prevent excessive shifting
#ifndef REFLEX_BUFSZ
static const size_t BUFSZ = (128*1024); ///< initial buffer size, at least 4096 bytes
static const size_t BUFSZ = (256*1024); ///< initial buffer size, at least 4096 bytes
#else
static const size_t BUFSZ = REFLEX_BUFSZ;
#endif
@ -154,11 +154,13 @@ class AbstractMatcher {
A(false),
N(false),
W(false),
X(false),
T(8)
{ }
bool A; ///< accept any/all (?^X) negative patterns as Const::REDO accept index codes
bool N; ///< nullable, find may return empty match (N/A to scan, split, matches)
bool W; ///< half-check for "whole words", check only left of \< and right of \> for non-word character
bool W; ///< reflex::Matcher matches whole words as if bound by \< and \>
bool X; ///< reflex::LineMatcher matches empty lines
char T; ///< tab size, must be a power of 2, default is 8, for column count and indent \i, \j, and \k
};
/// AbstractMatcher::Iterator class for scanning, searching, and splitting input character sequences.
@ -356,7 +358,8 @@ class AbstractMatcher {
{
opt_.A = false; // when true: accept any/all (?^X) negative patterns as Const::REDO accept index codes
opt_.N = false; // when true: find may return empty match (N/A to scan, split, matches)
opt_.W = false; // when true: half-check for "whole words", check only left of \< and right of \> for non-word character
opt_.W = false; // when true: reflex::Matcher matches whole words as if bound by \< and \>
opt_.X = false; // when true: reflex::LineMatcher matches empty lines
opt_.T = 8; // tab size 1, 2, 4, or 8
if (opt)
{
@ -373,6 +376,9 @@ class AbstractMatcher {
case 'W':
opt_.W = true;
break;
case 'X':
opt_.X = true;
break;
case 'T':
opt_.T = isdigit(*(s += (s[1] == '=') + 1)) ? static_cast<char>(*s - '0') : 0;
break;
@ -422,6 +428,7 @@ class AbstractMatcher {
own_ = true;
eof_ = false;
mat_ = false;
cml_ = false;
}
/// Set buffer block size for reading: use 0 (or omit argument) to buffer all input in which case returns true if all the data could be read and false if a read error occurred.
bool buffer(size_t blk = 0) ///< new block size between 1 and Const::BLOCK, or 0 to buffer all input (default)
@ -486,7 +493,7 @@ class AbstractMatcher {
(void)buffer(1);
}
/// Flush the buffer's remaining content.
void flush()
inline void flush()
{
DBGLOG("AbstractMatcher::flush()");
pos_ = end_;
@ -560,6 +567,7 @@ class AbstractMatcher {
own_ = false;
eof_ = true;
mat_ = false;
cml_ = false;
}
return *this;
}
@ -645,6 +653,13 @@ class AbstractMatcher {
{
return utf8(txt_);
}
#if WITH_SPAN
/// Enable or disable the mode in which only matching lines are counted instead of tracking actual line numbers (e.g. for speed).
inline void lineno_skip(bool f = false) ///< true to count matching lines only, false (the default) to restore regular line numbering
{
cml_ = f; // cml_ is the flag documented as "counting matching lines instead of line numbers"
}
#endif
/// Set or change the starting line number of the last match.
inline void lineno(size_t n) ///< new line number
{
@ -652,7 +667,7 @@ class AbstractMatcher {
lno_ = n;
}
/// Updates and returns the starting line number of the match in the input character sequence.
inline size_t lineno()
size_t lineno()
/// @returns line number
{
#if WITH_SPAN
@ -661,55 +676,16 @@ class AbstractMatcher {
const char *s = lpb_;
const char *t = txt_;
size_t n = 0;
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
if (have_HW_AVX512BW())
n = simd_nlcount_avx512bw(s, t);
else if (have_HW_AVX2())
n = simd_nlcount_avx2(s, t);
if (cml_)
{
// count number of matching lines only, not line numbers
n = std::memchr(s, '\n', t - s) != NULL;
}
else
n = simd_nlcount_sse2(s, t);
#elif defined(HAVE_AVX2)
if (have_HW_AVX2())
n = simd_nlcount_avx2(s, t);
else
n = simd_nlcount_sse2(s, t);
#elif defined(HAVE_SSE2)
n = simd_nlcount_sse2(s, t);
#endif
#if defined(HAVE_NEON)
// no ARM AArch64/NEON SIMD optimized loop? - no code that runs faster than the code below?!
uint32_t n0 = 0, n1 = 0;
while (s < t - 1)
{
n0 += s[0] == '\n';
n1 += s[1] == '\n';
s += 2;
// count line numbers
n = nlcount(s, t);
}
n += n0 + n1 + (s < t && *s == '\n');
#else
// clang/gcc 4-way auto-vectorizable loop
uint32_t n0 = 0, n1 = 0, n2 = 0, n3 = 0;
while (s < t - 3)
{
n0 += s[0] == '\n';
n1 += s[1] == '\n';
n2 += s[2] == '\n';
n3 += s[3] == '\n';
s += 4;
}
n += n0 + n1 + n2 + n3;
// epilogue
if (s < t)
{
n += *s == '\n';
if (++s < t)
{
n += *s == '\n';
if (++s < t)
n += *s == '\n';
}
}
#endif
// if newlines are detected, then find begin of the last line to adjust bol
if (n > 0)
{
@ -956,13 +932,13 @@ class AbstractMatcher {
else if (got_ == '\n')
got_ = Const::UNK;
}
/// Returns true if this matcher matched text that begins a word.
/// Returns true if this matcher matched text that begins an ASCII word.
inline bool at_bow()
/// @returns true if this matcher matched text that begins a word
{
return !isword(got_) && isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more());
}
/// Returns true if this matcher matched text that ends a word.
/// Returns true if this matcher matched text that ends an ASCII word.
inline bool at_eow()
/// @returns true if this matcher matched text that ends a word
{
@ -1116,21 +1092,37 @@ class AbstractMatcher {
}
return buf_ + end_;
}
/// Fetch ahead so that up to len bytes are buffered from the current text() position, limited by the input size and the buffer size.
inline size_t fetch(size_t len) ///< number of bytes wanted ahead of the current text() position
  /// @returns number of bytes available after fetching, or 0 when EOF was already flagged.
{
  DBGLOG("AbstractMatcher::fetch(%zu)", len);
  if (eof_)
    return 0;
  // bytes already buffered from the current text() position onward
  const size_t buffered = end_ - (txt_ - buf_);
  if (buffered >= len)
    return buffered;
  if (end_ + len + 1 >= max_)
  {
    // the request does not fit: let grow() make room first (its result is ignored)
    (void)grow();
    // still does not fit: clamp the request to the space left, keeping one byte spare
    if (end_ + len + 1 >= max_)
      len = max_ - end_ - 1;
  }
  end_ += get(buf_ + end_, len);
  return avail();
}
/// Returns the number of bytes in the buffer available to search from the current begin()/text() position.
size_t avail()
inline size_t avail()
{
if (peek() == EOF)
return 0;
return end_ - (txt_ - buf_);
}
/// Returns the byte offset of the match from the start of the line.
size_t border()
inline size_t border()
/// @returns border offset
{
return txt_ - bol();
}
/// Enlarge the match to span the entire line of input (excluding \n), return text().
const char *span()
inline const char *span()
/// @returns const char* span of text for the entire line
{
DBGLOG("AbstractMatcher::span()");
@ -1146,7 +1138,7 @@ class AbstractMatcher {
return text();
}
/// Returns the line of input (excluding \n) as a string containing the matched text as a substring.
std::string line()
inline std::string line()
/// @returns matching line as a string
{
DBGLOG("AbstractMatcher::line()");
@ -1156,7 +1148,7 @@ class AbstractMatcher {
return std::string(b, e - b);
}
/// Returns the line of input (excluding \n) as a wide string containing the matched text as a substring.
std::wstring wline()
inline std::wstring wline()
/// @returns matching line as a wide string
{
DBGLOG("AbstractMatcher::wline()");
@ -1252,12 +1244,12 @@ class AbstractMatcher {
return text();
}
/// Append the next match to the currently matched text returned by AbstractMatcher::text, when the next match found is adjacent to the current match.
void more()
inline void more()
{
cur_ = txt_ - buf_;
}
/// Truncate the AbstractMatcher::text length of the match to n characters in length and reposition for next match.
void less(size_t n) ///< truncated string length
inline void less(size_t n) ///< truncated string length
{
if (n < len_)
{
@ -1270,80 +1262,80 @@ class AbstractMatcher {
}
}
/// Cast this matcher to positive integer indicating the nonzero capture index of the matched text in the pattern, same as AbstractMatcher::accept.
operator size_t() const
inline operator size_t() const
/// @returns nonzero capture index of a match, which may be matcher dependent, or zero for a mismatch
{
return accept();
}
/// Cast this matcher to a std::string of the text matched by this matcher.
operator std::string() const
inline operator std::string() const
/// @returns std::string with matched text
{
return str();
}
/// Cast this matcher to a std::wstring of the text matched by this matcher.
operator std::wstring() const
inline operator std::wstring() const
/// @returns std::wstring converted to UCS from the 0-terminated matched UTF-8 text
{
return wstr();
}
/// Cast the match to std::pair<size_t,std::string>, same as AbstractMatcher::pair, useful for tokenization into containers.
operator std::pair<size_t,std::string>() const
inline operator std::pair<size_t,std::string>() const
/// @returns std::pair<size_t,std::string> as returned by pair()
{
return pair();
}
/// Returns true if matched text is equal to a string, useful for std::algorithm.
bool operator==(const char *rhs) ///< rhs string to compare to
inline bool operator==(const char *rhs) ///< rhs string to compare to
/// @returns true if matched text is equal to rhs string
const
{
return std::strncmp(rhs, txt_, len_) == 0 && rhs[len_] == '\0';
}
/// Returns true if matched text is equal to a string, useful for std::algorithm.
bool operator==(const std::string& rhs) ///< rhs string to compare to
inline bool operator==(const std::string& rhs) ///< rhs string to compare to
/// @returns true if matched text is equal to rhs string
const
{
return rhs.size() == len_ && rhs.compare(0, std::string::npos, txt_, len_) == 0;
}
/// Returns true if capture index is equal to a given size_t value, useful for std::algorithm.
bool operator==(size_t rhs) ///< capture index to compare accept() to
inline bool operator==(size_t rhs) ///< capture index to compare accept() to
/// @returns true if capture index is equal to rhs
const
{
return accept() == rhs;
}
/// Returns true if capture index is equal to a given int value, useful for std::algorithm.
bool operator==(int rhs) ///< capture index to compare accept() to
inline bool operator==(int rhs) ///< capture index to compare accept() to
/// @returns true if capture index is equal to rhs
const
{
return static_cast<int>(accept()) == rhs;
}
/// Returns true if matched text is not equal to a string, useful for std::algorithm.
bool operator!=(const char *rhs) ///< rhs string to compare to
inline bool operator!=(const char *rhs) ///< rhs string to compare to
/// @returns true if matched text is not equal to rhs string
const
{
return std::strncmp(rhs, txt_, len_) != 0 || rhs[len_] != '\0'; // if static checkers complain here, they are wrong
}
/// Returns true if matched text is not equal to a string, useful for std::algorithm.
bool operator!=(const std::string& rhs) ///< rhs string to compare to
inline bool operator!=(const std::string& rhs) ///< rhs string to compare to
/// @returns true if matched text is not equal to rhs string
const
{
return rhs.size() > len_ || rhs.compare(0, std::string::npos, txt_, len_) != 0;
}
/// Returns true if capture index is not equal to a given size_t value, useful for std::algorithm.
bool operator!=(size_t rhs) ///< capture index to compare accept() to
inline bool operator!=(size_t rhs) ///< capture index to compare accept() to
/// @returns true if capture index is not equal to rhs
const
{
return accept() != rhs;
}
/// Returns true if capture index is not equal to a given int value, useful for std::algorithm.
bool operator!=(int rhs) ///< capture index to compare accept() to
inline bool operator!=(int rhs) ///< capture index to compare accept() to
/// @returns true if capture index is not equal to rhs
const
{
@ -1632,6 +1624,7 @@ class AbstractMatcher {
bool own_; ///< true if AbstractMatcher::buf_ was allocated and should be deleted
bool eof_; ///< input has reached EOF
bool mat_; ///< true if AbstractMatcher::matches() was successful
bool cml_; ///< true when counting matching lines instead of line numbers
};
/// The pattern matcher class template extends abstract matcher base class.
@ -1656,7 +1649,7 @@ class PatternMatcher : public AbstractMatcher {
delete pat_;
}
/// Assign a matcher, the underlying pattern object is shared (not deep copied).
PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
virtual PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
{
scan.init(this, Const::SCAN);
find.init(this, Const::FIND);
@ -1664,9 +1657,7 @@ class PatternMatcher : public AbstractMatcher {
in = matcher.in;
reset();
opt_ = matcher.opt_;
pat_ = matcher.pat_,
own_ = false;
return *this;
return pattern(matcher.pat_);
}
/// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
@ -1726,19 +1717,19 @@ class PatternMatcher : public AbstractMatcher {
return *this;
}
/// Returns true if this matcher has a pattern.
bool has_pattern() const
inline bool has_pattern() const
/// @returns true if this matcher has a pattern
{
return pat_ != NULL;
}
/// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
bool own_pattern() const
inline bool own_pattern() const
/// @returns true if this matcher has its own pattern
{
return own_ && pat_ != NULL;
}
/// Returns a reference to the pattern object associated with this matcher.
const Pattern& pattern() const
virtual const Pattern& pattern() const
/// @returns reference to pattern object
{
ASSERT(pat_ != NULL);
@ -1809,7 +1800,7 @@ class PatternMatcher<std::string> : public AbstractMatcher {
delete pat_;
}
/// Assign a matcher, the underlying pattern string is shared (not deep copied).
PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
virtual PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
{
scan.init(this, Const::SCAN);
find.init(this, Const::FIND);
@ -1817,9 +1808,7 @@ class PatternMatcher<std::string> : public AbstractMatcher {
in = matcher.in;
reset();
opt_ = matcher.opt_;
pat_ = matcher.pat_,
own_ = false;
return *this;
return pattern(matcher.pat_);
}
/// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
@ -1865,19 +1854,19 @@ class PatternMatcher<std::string> : public AbstractMatcher {
return *this;
}
/// Returns true if this matcher has a pattern.
bool has_pattern() const
inline bool has_pattern() const
/// @returns true if this matcher has a pattern
{
return pat_ != NULL;
}
/// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
bool own_pattern() const
inline bool own_pattern() const
/// @returns true if this matcher has its own pattern
{
return own_ && pat_ != NULL;
}
/// Returns a reference to the pattern string associated with this matcher.
const Pattern& pattern() const
virtual const Pattern& pattern() const
/// @returns reference to pattern string
{
ASSERT(pat_ != NULL);

File diff suppressed because it is too large Load Diff

View File

@ -138,8 +138,8 @@ find:
// option N also finds empty lines
if (n == 0 && !opt_.N)
goto find;
// option W only finds empty lines
if (n > 0 && opt_.W)
// option X only finds empty lines
if (n > 0 && opt_.X)
goto find;
break;
case Const::SPLIT:

File diff suppressed because it is too large Load Diff

View File

@ -365,8 +365,8 @@ class Pattern {
f |= pmh[h] & 4;
h = hash(h, static_cast<uint8_t>(*++s));
f |= pmh[h] & 8;
Pred m = 16;
const char *e = s + n - 3;
Pred m = 16;
while (f == 0 && ++s < e)
{
h = hash(h, static_cast<uint8_t>(*s));
@ -941,8 +941,7 @@ class Pattern {
void check_dfa_closure(
const DFA::State *state,
int nest,
bool& peek,
bool& prev) const;
bool& peek) const;
void gencode_dfa_closure(
FILE *fd,
const DFA::State *start,
@ -1172,7 +1171,7 @@ class Pattern {
Index cut_; ///< DFA s-t cut to improve predict match and HFA accuracy together with lbk_ and cbk_
size_t len_; ///< length of chr_[], less or equal to 255
size_t min_; ///< patterns after the prefix are at least this long but no more than 8
size_t pin_; ///< number of needles
size_t pin_; ///< number of needles, 0 to 16
std::bitset<256> cbk_; ///< characters to look back over when lbk_ > 0, never includes \n
std::bitset<256> fst_; ///< the beginning characters of the pattern
char chr_[256]; ///< pattern prefix string or character needles for needle-based search
@ -1183,7 +1182,7 @@ class Pattern {
uint16_t lbm_; ///< loopback minimum distance when lbk_ > 0
uint16_t lcp_; ///< primary least common character position in the pattern or 0xffff
uint16_t lcs_; ///< secondary least common character position in the pattern or 0xffff
size_t bmd_; ///< Boyer-Moore jump distance on mismatch, B-M is enabled when bmd_ > 0
size_t bmd_; ///< Boyer-Moore jump distance on mismatch, B-M is enabled when bmd_ > 0 (<= 255)
uint8_t bms_[256]; ///< Boyer-Moore skip array
float pms_; ///< ms elapsed time to parse regex
float vms_; ///< ms elapsed time to compile DFA vertices
@ -1192,6 +1191,7 @@ class Pattern {
float ams_; ///< ms elapsed time to analyze DFA for predict match and HFA
size_t npy_; ///< entropy derived from the bitap array bit_[]
bool one_; ///< true if matching one string stored in chr_[] without meta/anchors
bool bol_; ///< true if matching all patterns at the begin of a line with anchor ^
};
} // namespace reflex

View File

@ -28,15 +28,19 @@
/**
@file simd.h
@brief RE/flex SIMD intrinsics
@brief RE/flex SIMD primitives
@author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt
*/
#ifndef SIMD_H
#define SIMD_H
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#if defined(HAVE_AVX512BW)
# include <immintrin.h>
#elif defined(HAVE_AVX2)
@ -133,8 +137,7 @@ inline uint32_t popcountl(uint64_t x)
}
#endif
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
extern size_t simd_nlcount_sse2(const char*& b, const char *e);
// Partially count newlines in string b up to e, updates b close to e with uncounted part
extern size_t simd_nlcount_avx2(const char *&b, const char *e);
extern size_t simd_nlcount_avx512bw(const char *&b, const char *e);
@ -142,6 +145,11 @@ extern size_t simd_nlcount_avx512bw(const char*& b, const char *e);
#endif
namespace reflex {
// Count newlines in string s up to t
extern size_t nlcount(const char *s, const char *t);
} // namespace reflex
#endif

View File

@ -778,7 +778,11 @@ static void insert_posix_class(const char *pattern, size_t len, size_t& pos, con
else if (name[0] == 'A' && name[1] == 's')
name = const_cast<char*>("ASCII");
}
const int *wc = Posix::range(name);
const int *wc = NULL;
if ((flags & convert_flag::unicode))
wc = Unicode::range(name);
if (wc == NULL)
wc = Posix::range(name);
if (wc == NULL)
throw regex_error(regex_error::invalid_class, pattern, pos);
if (*buf == '^')

View File

@ -28,7 +28,7 @@
/**
@file input.cpp
@brief RE/flex input character sequence class and simd.h CPUID check
@brief RE/flex input character sequence class
@author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt
@ -1361,27 +1361,4 @@ void Input::file_encoding(unsigned short enc, const unsigned short *page)
}
}
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
#include <reflex/simd.h>
// simd.h get_HW(): collect CPU feature bits with cpuid into one 64-bit word
static uint64_t get_HW()
{
  int leaf1[4] = { 0, 0, 0, 0 };
  int leaf7[4] = { 0, 0, 0, 0 };
  // cpuid EAX=0: element [0] is used as the highest leaf that may be queried
  cpuidex(leaf1, 0, 0);
  const int maxleaf = leaf1[0];
  if (maxleaf <= 0)
    return 0ULL;
  // cpuid EAX=1
  cpuidex(leaf1, 1, 0);
  // cpuid EAX=7, ECX=0, only queried when that leaf is reported as available
  if (maxleaf >= 7)
    cpuidex(leaf7, 7, 0);
  // low 32 bits: element [2] of leaf 1, high 32 bits: element [1] of leaf 7 (all zero when leaf 7 was not queried)
  const uint64_t lo = static_cast<uint32_t>(leaf1[2]);
  const uint64_t hi = static_cast<uint64_t>(static_cast<uint32_t>(leaf7[1])) << 32;
  return lo | hi;
}
uint64_t HW = get_HW();
#endif
} // namespace reflex

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -27,10 +27,10 @@
\******************************************************************************/
/**
@file matcher.cpp, matcher_avx2.cpp, matcher_avx512bw.cpp
@file matcher_avx512bw.cpp
@brief RE/flex matcher engine
@author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2022, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt
*/
@ -40,5 +40,344 @@
# endif
#endif
#define COMPILE_AVX512BW
#include "matcher.cpp"
#include <reflex/matcher.h>
namespace reflex {
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
// AVX512BW runtime optimized function callback overrides
void Matcher::simd_init_advance_avx512bw()
{
  // pick the AVX512BW advance callback from the pattern prefix length len_ and the minimum pattern length min_;
  // adv_ is left untouched whenever no AVX512BW specialization applies
  const size_t min = pat_->min_;
  switch (pat_->len_)
  {
    case 0:
    case 1:
      // no specialization
      break;
    case 2:
      if (min == 0)
        adv_ = &Matcher::simd_advance_chars_avx512bw<2>;
      else if (min < 4)
        adv_ = &Matcher::simd_advance_chars_pma_avx512bw<2>;
      else
        adv_ = &Matcher::simd_advance_chars_pmh_avx512bw<2>;
      break;
    case 3:
      if (min == 0)
        adv_ = &Matcher::simd_advance_chars_avx512bw<3>;
      else if (min < 4)
        adv_ = &Matcher::simd_advance_chars_pma_avx512bw<3>;
      else
        adv_ = &Matcher::simd_advance_chars_pmh_avx512bw<3>;
      break;
    default:
      // longer prefix strings: only specialized when Boyer-Moore is not enabled (bmd_ == 0)
      if (pat_->bmd_ == 0)
      {
#if defined(WITH_STRING_PM)
        if (min >= 4)
          adv_ = &Matcher::simd_advance_string_pmh_avx512bw;
        else if (min > 0)
          adv_ = &Matcher::simd_advance_string_pma_avx512bw;
        else
#endif
          adv_ = &Matcher::simd_advance_string_avx512bw;
      }
      break;
  }
}
/// Few chars
/// Searches the buffer from loc for the LEN-byte prefix stored in pat_->chr_, testing 64 candidate positions per AVX512BW iteration.
/// Calls set_current() on the match position and returns true when found; returns false when fewer than LEN bytes remain after trying to get more input.
template<uint8_t LEN>
bool Matcher::simd_advance_chars_avx512bw(size_t loc)
{
// probe positions within the prefix: the first byte (lcp) and the last byte (lcs)
static const uint16_t lcp = 0;
static const uint16_t lcs = LEN - 1;
const char *chr = pat_->chr_;
while (true)
{
const char *s = buf_ + loc + lcp;
// e is one past the last position of s for which a full LEN-byte prefix still fits in the buffered data
const char *e = buf_ + end_ + lcp - LEN + 1;
// broadcast both probe bytes to all 64 lanes
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
// unaligned loads of 64 bytes at the first-probe position and at the second-probe distance
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
// bit i is set when both probe bytes match for the candidate starting at s + i
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
// ctzl() presumably yields the index of the lowest set bit, i.e. the first remaining candidate
uint32_t offset = ctzl(mask);
// LEN == 2 is fully verified by the two probes; otherwise compare the bytes in between
if (LEN == 2 ||
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
{
loc = s - lcp + offset - buf_;
set_current(loc);
return true;
}
// clear the lowest set bit to move on to the next candidate
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
// NOTE(review): presumably updates the current position and tries to read more input into the buffer - confirm
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
// not enough data left for a complete prefix
if (loc + LEN > end_)
return false;
// less than a full 64-candidate window left: leave the tail to advance_chars<LEN>()
if (loc + LEN + 63 > end_)
break;
}
return advance_chars<LEN>(loc);
}
/// Few chars followed by 2 to 3 minimal char pattern
/// Searches the buffer from loc for the LEN-byte prefix stored in pat_->chr_ with AVX512BW, then filters each hit with Pattern::predict_match() on the pma_ table.
/// Calls set_current() on the accepted position and returns true; returns false when fewer than LEN + min_ bytes remain after trying to get more input.
template<uint8_t LEN>
bool Matcher::simd_advance_chars_pma_avx512bw(size_t loc)
{
// probe positions within the prefix: the first byte (lcp) and the last byte (lcs)
static const uint16_t lcp = 0;
static const uint16_t lcs = LEN - 1;
const Pattern::Pred *pma = pat_->pma_;
const char *chr = pat_->chr_;
size_t min = pat_->min_;
while (true)
{
const char *s = buf_ + loc + lcp;
// e is one past the last position of s for which a full LEN-byte prefix still fits in the buffered data
const char *e = buf_ + end_ + lcp - LEN + 1;
// broadcast both probe bytes to all 64 lanes
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
// unaligned loads of 64 bytes at the first-probe position and at the second-probe distance
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
// bit i is set when both probe bytes match for the candidate starting at s + i
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
// ctzl() presumably yields the index of the lowest set bit, i.e. the first remaining candidate
uint32_t offset = ctzl(mask);
// LEN == 2 is fully verified by the two probes; otherwise compare the bytes in between
if (LEN == 2 ||
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
{
loc = s - lcp + offset - buf_;
// accept when fewer than 4 bytes follow the prefix (predictor is skipped) or when predict_match() returns 0
if (loc + LEN + 4 > end_ || Pattern::predict_match(pma, &buf_[loc + LEN]) == 0)
{
set_current(loc);
return true;
}
}
// clear the lowest set bit to move on to the next candidate
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
// NOTE(review): presumably updates the current position and tries to read more input into the buffer - confirm
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
// not enough data left for the prefix plus the minimum pattern length
if (loc + LEN + min > end_)
return false;
// less than a full 64-candidate window left: leave the tail to advance_chars_pma<LEN>()
if (loc + LEN + min + 63 > end_)
break;
}
return advance_chars_pma<LEN>(loc);
}
/// Few chars followed by 4 minimal char pattern
/// Searches the buffer from loc for the LEN-byte prefix stored in pat_->chr_ with AVX512BW, then filters each hit with Pattern::predict_match() on the pmh_ hash table over min_ bytes.
/// Calls set_current() on the accepted position and returns true; returns false when fewer than LEN + min_ bytes remain after trying to get more input.
template<uint8_t LEN>
bool Matcher::simd_advance_chars_pmh_avx512bw(size_t loc)
{
// probe positions within the prefix: the first byte (lcp) and the last byte (lcs)
static const uint16_t lcp = 0;
static const uint16_t lcs = LEN - 1;
const Pattern::Pred *pmh = pat_->pmh_;
const char *chr = pat_->chr_;
size_t min = pat_->min_;
while (true)
{
const char *s = buf_ + loc + lcp;
// e is one past the last position of s for which a full LEN-byte prefix still fits in the buffered data
const char *e = buf_ + end_ + lcp - LEN + 1;
// broadcast both probe bytes to all 64 lanes
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
// unaligned loads of 64 bytes at the first-probe position and at the second-probe distance
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
// bit i is set when both probe bytes match for the candidate starting at s + i
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
// ctzl() presumably yields the index of the lowest set bit, i.e. the first remaining candidate
uint32_t offset = ctzl(mask);
// LEN == 2 is fully verified by the two probes; otherwise compare the bytes in between
if (LEN == 2 ||
(LEN == 3 ? s[offset + 1 - lcp] == chr[1] : std::memcmp(s + 1 - lcp + offset, chr + 1, LEN - 2) == 0))
{
loc = s - lcp + offset - buf_;
// accept when fewer than min bytes follow the prefix (predictor is skipped) or when predict_match() reports a possible match
if (loc + LEN + min > end_ || Pattern::predict_match(pmh, &buf_[loc + LEN], min))
{
set_current(loc);
return true;
}
}
// clear the lowest set bit to move on to the next candidate
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
// NOTE(review): presumably updates the current position and tries to read more input into the buffer - confirm
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
// not enough data left for the prefix plus the minimum pattern length
if (loc + LEN + min > end_)
return false;
// less than a full 64-candidate window left: leave the tail to advance_chars_pmh<LEN>()
if (loc + LEN + min + 63 > end_)
break;
}
return advance_chars_pmh<LEN>(loc);
}
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
/// Searches the buffer from loc for the len_-byte string stored in pat_->chr_: two probe bytes at positions lcp_ and lcs_ are compared for 64 candidates at once, hits are verified with memcmp().
/// Calls set_current() on the match position and returns true when found; returns false when fewer than len_ bytes remain after trying to get more input.
bool Matcher::simd_advance_string_avx512bw(size_t loc)
{
const char *chr = pat_->chr_;
size_t len = pat_->len_;
// NOTE(review): lcp_/lcs_ index chr[] below, so this assumes both hold valid positions (not the 0xffff "undefined" value) - confirm
uint16_t lcp = pat_->lcp_;
uint16_t lcs = pat_->lcs_;
while (true)
{
// s points at the lcp probe byte of the first candidate
const char *s = buf_ + loc + lcp;
// e is one past the last position of s for which the full string still fits in the buffered data
const char *e = buf_ + end_ + lcp - len + 1;
// broadcast both probe bytes to all 64 lanes
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
// unaligned loads of 64 bytes at the lcp probe position and at the lcs probe position
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
// bit i is set when both probe bytes match for candidate i
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
// ctzl() presumably yields the index of the lowest set bit, i.e. the first remaining candidate
uint32_t offset = ctzl(mask);
// verify the whole string at the candidate start (s - lcp + offset)
if (std::memcmp(s - lcp + offset, chr, len) == 0)
{
loc = s - lcp + offset - buf_;
set_current(loc);
return true;
}
// clear the lowest set bit to move on to the next candidate
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
// NOTE(review): presumably updates the current position and tries to read more input into the buffer - confirm
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
// not enough data left for a complete string
if (loc + len > end_)
return false;
// less than a full 64-candidate window left: leave the tail to advance_string()
if (loc + len + 63 > end_)
break;
}
return advance_string(loc);
}
#if defined(WITH_STRING_PM)
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
/// Same two-probe search for the len_-byte string in pat_->chr_ as simd_advance_string_avx512bw, with each verified hit additionally filtered by Pattern::predict_match() on the pma_ table.
/// Calls set_current() on the accepted position and returns true; returns false when fewer than len_ + min_ bytes remain after trying to get more input.
bool Matcher::simd_advance_string_pma_avx512bw(size_t loc)
{
const Pattern::Pred *pma = pat_->pma_;
const char *chr = pat_->chr_;
size_t len = pat_->len_;
size_t min = pat_->min_;
uint16_t lcp = pat_->lcp_;
uint16_t lcs = pat_->lcs_;
while (true)
{
// s points at the lcp probe byte of the first candidate
const char *s = buf_ + loc + lcp;
// e is one past the last position of s for which the full string still fits in the buffered data
const char *e = buf_ + end_ + lcp - len + 1;
// broadcast both probe bytes to all 64 lanes
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
// unaligned loads of 64 bytes at the lcp probe position and at the lcs probe position
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
// bit i is set when both probe bytes match for candidate i
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
// ctzl() presumably yields the index of the lowest set bit, i.e. the first remaining candidate
uint32_t offset = ctzl(mask);
// verify the whole string at the candidate start (s - lcp + offset)
if (std::memcmp(s - lcp + offset, chr, len) == 0)
{
loc = s - lcp + offset - buf_;
// accept when fewer than 4 bytes follow the string (predictor is skipped) or when predict_match() returns 0
if (loc + len + 4 > end_ || Pattern::predict_match(pma, &buf_[loc + len]) == 0)
{
set_current(loc);
return true;
}
}
// clear the lowest set bit to move on to the next candidate
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
// NOTE(review): presumably updates the current position and tries to read more input into the buffer - confirm
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
// not enough data left for the string plus the minimum pattern length
if (loc + len + min > end_)
return false;
// less than a full 64-candidate window left: leave the tail to advance_string_pma()
if (loc + len + min + 63 > end_)
break;
}
return advance_string_pma(loc);
}
/// Implements AVX512BW string search scheme based on http://0x80.pl/articles/simd-friendly-karp-rabin.html
/// Same two-probe search for the len_-byte string in pat_->chr_ as simd_advance_string_avx512bw, with each verified hit additionally filtered by Pattern::predict_match() on the pmh_ hash table over min_ bytes.
/// Calls set_current() on the accepted position and returns true; returns false when fewer than len_ + min_ bytes remain after trying to get more input.
bool Matcher::simd_advance_string_pmh_avx512bw(size_t loc)
{
const Pattern::Pred *pmh = pat_->pmh_;
const char *chr = pat_->chr_;
size_t len = pat_->len_;
size_t min = pat_->min_;
uint16_t lcp = pat_->lcp_;
uint16_t lcs = pat_->lcs_;
while (true)
{
// s points at the lcp probe byte of the first candidate
const char *s = buf_ + loc + lcp;
// e is one past the last position of s for which the full string still fits in the buffered data
const char *e = buf_ + end_ + lcp - len + 1;
// broadcast both probe bytes to all 64 lanes
__m512i vlcp = _mm512_set1_epi8(chr[lcp]);
__m512i vlcs = _mm512_set1_epi8(chr[lcs]);
while (s <= e - 64)
{
// unaligned loads of 64 bytes at the lcp probe position and at the lcs probe position
__m512i vlcpm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s));
__m512i vlcsm = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(s + lcs - lcp));
// bit i is set when both probe bytes match for candidate i
uint64_t mask = _mm512_cmpeq_epi8_mask(vlcp, vlcpm) & _mm512_cmpeq_epi8_mask(vlcs, vlcsm);
while (mask != 0)
{
// ctzl() presumably yields the index of the lowest set bit, i.e. the first remaining candidate
uint32_t offset = ctzl(mask);
// verify the whole string at the candidate start (s - lcp + offset)
if (std::memcmp(s - lcp + offset, chr, len) == 0)
{
loc = s - lcp + offset - buf_;
// accept when fewer than min bytes follow the string (predictor is skipped) or when predict_match() reports a possible match
if (loc + len + min > end_ || Pattern::predict_match(pmh, &buf_[loc + len], min))
{
set_current(loc);
return true;
}
}
// clear the lowest set bit to move on to the next candidate
mask &= mask - 1;
}
s += 64;
}
s -= lcp;
loc = s - buf_;
// NOTE(review): presumably updates the current position and tries to read more input into the buffer - confirm
set_current_and_peek_more(loc - 1);
loc = cur_ + 1;
// not enough data left for the string plus the minimum pattern length
if (loc + len + min > end_)
return false;
// less than a full 64-candidate window left: leave the tail to advance_string_pmh()
if (loc + len + min + 63 > end_)
break;
}
return advance_string_pmh(loc);
}
#endif // WITH_STRING_PM
#else
// appease ranlib "has no symbols"
void matcher_not_compiled_with_avx512bw() { }
#endif
} // namespace reflex

View File

@ -45,10 +45,10 @@
/// DFA compaction: -1 == reverse order edge compression (best); 1 == edge compression; 0 == no edge compression.
/** Edge compression reorders edges to produce fewer tests when executed in the compacted order.
For example ([a-cg-ik]|d|[e-g]|j|y|[x-z]) after reverse edge compression has only 2 edges:
c1 = m.FSM_CHAR();
if ('x' <= c1 && c1 <= 'z') goto S3;
if ('a' <= c1 && c1 <= 'k') goto S3;
return m.FSM_HALT(c1);
c = m.FSM_CHAR();
if ('x' <= c && c <= 'z') goto S3;
if ('a' <= c && c <= 'k') goto S3;
return m.FSM_HALT(c);
*/
#define WITH_COMPACT_DFA -1
@ -177,6 +177,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
bmd_ = 0;
npy_ = 0;
one_ = false;
bol_ = false;
vno_ = 0;
eno_ = 0;
hno_ = 0;
@ -197,36 +198,43 @@ void Pattern::init(const char *options, const uint8_t *pred)
len_ = pred[0];
min_ = pred[1] & 0x0f;
one_ = pred[1] & 0x10;
bol_ = pred[1] & 0x40;
memcpy(chr_, pred + 2, len_);
size_t n = len_ + 2;
size_t n = 2 + len_;
if (len_ == 0)
{
// get bitap bit_[] parameters
for (size_t i = 0; i < 256; ++i)
bit_[i] = ~pred[i + n];
n += 256;
}
if (min_ >= 4)
{
for (size_t i = 0; i < Const::HASH; ++i)
pmh_[i] = ~pred[i + n];
}
else
if (min_ < 4)
{
// get predict match PM4 pma_[] parameters
for (size_t i = 0; i < Const::HASH; ++i)
pma_[i] = ~pred[i + n];
}
else
{
// get predict match hash pmh_[] parameters
for (size_t i = 0; i < Const::HASH; ++i)
pmh_[i] = ~pred[i + n];
}
n += Const::HASH;
if ((pred[1] & 0x20) != 0)
{
n += Const::HASH;
// get lookback parameters lbk_ lbm_ and cbk_[] after s-t cut and first s-t cut pattern characters fst_[]
lbk_ = pred[n + 0] | (pred[n + 1] << 8);
lbm_ = pred[n + 2] | (pred[n + 3] << 8);
for (size_t i = 0; i < 256; ++i)
cbk_.set(i, pred[n + 4 + (i >> 3)] & (1 << (i & 7)));
for (size_t i = 0; i < 256; ++i)
fst_.set(i, pred[n + 32 + 4 + (i >> 3)] & (1 << (i & 7)));
fst_.set(i, pred[n + 4 + 32 + (i >> 3)] & (1 << (i & 7)));
n += 4 + 32 + 32;
}
else
{
// get first pattern characters fst_[] from bitap
for (size_t i = 0; i < 256; ++i)
fst_.set(i, (bit_[i] & 1) == 0);
}
@ -311,8 +319,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
}
// needle count and frequency thresholds to enable needle-based search
uint16_t pinmax = 8;
uint8_t freqmax1 = 91; // one position
uint8_t freqmax2 = 251; // two positions
uint8_t freqmax = 251;
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
if (have_HW_AVX512BW() || have_HW_AVX2())
pinmax = 16;
@ -331,7 +338,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
lcs_ = 0;
uint16_t nlcp = 65535; // max and undefined
uint16_t nlcs = 65535; // max and undefined
uint16_t freqsum = 0;
uint8_t freqlcp = 255; // max
uint8_t freqlcs = 255; // max
size_t min = (min_ == 0 ? 1 : min_);
@ -339,7 +345,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
{
Pred mask = 1 << k;
uint16_t n = 0;
uint16_t sum = 0;
uint8_t max = 0;
// at position k count the matching characters and find the max character frequency
for (uint16_t i = 0; i < 256; ++i)
@ -348,14 +353,13 @@ void Pattern::init(const char *options, const uint8_t *pred)
{
++n;
uint8_t freq = frequency(static_cast<uint8_t>(i));
sum += freq;
if (freq > max)
max = freq;
}
}
if (n <= pinmax)
{
// pick the fewest and rarest (least frequently occurring) needles to search
// pick the fewest and rarest (less frequently occurring) needles to search
if (max < freqlcp || (n < nlcp && max == freqlcp))
{
lcs_ = lcp_;
@ -363,7 +367,6 @@ void Pattern::init(const char *options, const uint8_t *pred)
freqlcs = freqlcp;
lcp_ = static_cast<uint8_t>(k);
nlcp = n;
freqsum = sum;
freqlcp = max;
}
else if (n < nlcs ||
@ -377,8 +380,8 @@ void Pattern::init(const char *options, const uint8_t *pred)
}
}
}
// one position to pin: make lcp and lcs equal (compared and optimized later)
if (min == 1 || ((freqsum <= freqlcp || nlcs == 65535) && freqsum <= freqmax1))
// one position to pin: make lcp and lcs equal to 0 (only one position at 0)
if (min == 1 || nlcs == 65535)
{
nlcs = nlcp;
lcs_ = lcp_;
@ -387,7 +390,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
uint16_t n = nlcp > nlcs ? nlcp : nlcs;
// trace the selected needle positions together with their needle counts and max character frequencies
// (freqsum is not logged: this function no longer declares it)
DBGLOG("min=%zu lcp=%hu(%hu) pin=%hu nlcp=%hu(%hu) freq=%hu(%hu) npy=%zu", min, lcp_, lcs_, n, nlcp, nlcs, freqlcp, freqlcs, npy_);
// determine if a needle-based search is worthwhile, below or meeting the thresholds
if (n <= pinmax && freqlcp <= freqmax2)
if (n <= pinmax && freqlcp <= freqmax)
{
// bridge the gap from 9 to 16 to handle 9 to 16 combined
if (n > 8)
@ -412,7 +415,7 @@ void Pattern::init(const char *options, const uint8_t *pred)
}
else if (len_ > 1)
{
// Boyer-Moore preprocessing of the given string pattern pat of length len, generates bmd_ > 0 and bms_[] shifts
// produce lcp and lcs positions and Boyer-Moore bms_[] shifts when bmd_ > 0
uint8_t n = static_cast<uint8_t>(len_); // okay to cast: actually never more than 255
uint16_t i;
for (i = 0; i < 256; ++i)
@ -433,13 +436,14 @@ void Pattern::init(const char *options, const uint8_t *pred)
lcs_ = lcp_;
lcp_ = i;
}
else if (lcpch != pch && frequency(lcsch) > freqpch)
else if (frequency(lcsch) > freqpch ||
(frequency(lcsch) == freqpch &&
abs(static_cast<int>(lcp_) - static_cast<int>(lcs_)) < abs(static_cast<int>(lcp_) - static_cast<int>(i))))
{
lcs_ = i;
}
}
}
DBGLOG("len=%zu lcp=%hu(%hu)", len_, lcp_, lcs_);
uint16_t j;
for (i = n - 1, j = i; j > 0; --j)
if (chr_[j - 1] == chr_[i])
@ -469,7 +473,34 @@ void Pattern::init(const char *options, const uint8_t *pred)
#endif
#endif
if (lcs_ < 0xffff)
bmd_ = 0; // do not use B-M
{
// do not use B-M
bmd_ = 0;
// spread lcp and lcs apart if lcp and lcs are adjacent (chars are possibly correlated)
if (len_ == 3 && (lcp_ == 1 || lcs_ == 1))
{
lcp_ = 0;
lcs_ = 2;
}
else if (len_ > 3 && (lcp_ + 1 == lcs_ || lcs_ + 1 == lcp_))
{
uint8_t freqlcs = 255;
for (i = 0; i < n; ++i)
{
if (i > lcp_ + 1 || i + 1 < lcp_)
{
uint8_t pch = static_cast<uint8_t>(chr_[i]);
uint8_t freqpch = frequency(pch);
if (freqlcs > freqpch)
{
lcs_ = i;
freqlcs = freqpch;
}
}
}
}
}
DBGLOG("len=%zu bmd=%zu lcp=%hu(%hu)", len_, bmd_, lcp_, lcs_);
}
}
@ -629,6 +660,7 @@ void Pattern::parse(
loc = 0;
}
}
bol_ = at(loc) == '^';
do
{
Location end = loc;
@ -733,6 +765,8 @@ void Pattern::parse(
}
else
{
if (at(loc) != '^')
bol_ = false;
parse2(
true,
loc,
@ -2961,8 +2995,8 @@ void Pattern::gencode_dfa(const DFA::State *start) const
::fprintf(file,
"void reflex_code_%s(reflex::Matcher& m)\n"
"{\n"
" int c0 = 0, c1 = 0;\n"
" m.FSM_INIT(c1);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str());
" int c = 0;\n"
" m.FSM_INIT(c);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str());
for (const DFA::State *state = start; state != NULL; state = state->next)
{
::fprintf(file, "\nS%u:\n", state->index);
@ -2978,8 +3012,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
::fprintf(file, " m.FSM_HEAD(%u);\n", *i);
if (state->edges.rbegin() != state->edges.rend() && state->edges.rbegin()->first == META_DED)
::fprintf(file, " if (m.FSM_DENT()) goto S%u;\n", state->edges.rbegin()->second.second->index);
bool peek = false; // if we need to read a character into c1
bool prev = false; // if we need to keep the previous character in c0
bool peek = false; // if we need to read a character into c
for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
{
#if WITH_COMPACT_DFA == -1
@ -2993,13 +3026,12 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{
do
{
if (lo == META_EOB || lo == META_EOL)
if (lo == META_EOB || lo == META_EOL || lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
{
peek = true;
else if (lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
prev = peek = true;
if (prev && peek)
break;
check_dfa_closure(i->second.second, 1, peek, prev);
}
check_dfa_closure(i->second.second, 1, peek);
} while (++lo <= hi);
}
else
@ -3025,10 +3057,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
target_index = i->second.second->index;
if (read)
{
if (prev)
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
else
::fprintf(file, " c1 = m.FSM_CHAR();\n");
::fprintf(file, " c = m.FSM_CHAR();\n");
read = false;
}
if (is_meta(lo))
@ -3039,14 +3068,6 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{
case META_EOB:
case META_EOL:
::fprintf(file, " ");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, 2, peek);
::fprintf(file, " }\n");
elif = true;
break;
case META_EWE:
case META_BWE:
case META_NWE:
@ -3054,7 +3075,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
::fprintf(file, " ");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, 2, peek);
::fprintf(file, " }\n");
elif = true;
@ -3077,7 +3098,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
break;
if (lo == hi)
{
::fprintf(file, " if (c1 == ");
::fprintf(file, " if (c == ");
print_char(file, lo);
::fprintf(file, ")");
}
@ -3085,20 +3106,20 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{
::fprintf(file, " if (");
print_char(file, lo);
::fprintf(file, " <= c1)");
::fprintf(file, " <= c)");
}
else
{
::fprintf(file, " if (");
print_char(file, lo);
::fprintf(file, " <= c1 && c1 <= ");
::fprintf(file, " <= c && c <= ");
print_char(file, hi);
::fprintf(file, ")");
}
if (target_index == Const::IMAX)
{
if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n");
::fprintf(file, " return m.FSM_HALT(c);\n");
else
::fprintf(file, " return m.FSM_HALT();\n");
}
@ -3117,10 +3138,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{
if (read)
{
if (prev)
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
else
::fprintf(file, " c1 = m.FSM_CHAR();\n");
::fprintf(file, " c = m.FSM_CHAR();\n");
read = false;
}
do
@ -3129,14 +3147,6 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{
case META_EOB:
case META_EOL:
::fprintf(file, " ");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, 2, peek);
::fprintf(file, " }\n");
elif = true;
break;
case META_EWE:
case META_BWE:
case META_NWE:
@ -3144,7 +3154,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
::fprintf(file, " ");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, 2, peek);
::fprintf(file, " }\n");
elif = true;
@ -3170,10 +3180,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
target_index = i->second.second->index;
if (read)
{
if (prev)
::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
else
::fprintf(file, " c1 = m.FSM_CHAR();\n");
::fprintf(file, " c = m.FSM_CHAR();\n");
read = false;
}
if (!is_meta(lo))
@ -3183,7 +3190,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
break;
if (lo == hi)
{
::fprintf(file, " if (c1 == ");
::fprintf(file, " if (c == ");
print_char(file, lo);
::fprintf(file, ")");
}
@ -3191,20 +3198,20 @@ void Pattern::gencode_dfa(const DFA::State *start) const
{
::fprintf(file, " if (");
print_char(file, lo);
::fprintf(file, " <= c1)");
::fprintf(file, " <= c)");
}
else
{
::fprintf(file, " if (");
print_char(file, lo);
::fprintf(file, " <= c1 && c1 <= ");
::fprintf(file, " <= c && c <= ");
print_char(file, hi);
::fprintf(file, ")");
}
if (target_index == Const::IMAX)
{
if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n");
::fprintf(file, " return m.FSM_HALT(c);\n");
else
::fprintf(file, " return m.FSM_HALT();\n");
}
@ -3216,7 +3223,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
}
#endif
if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n");
::fprintf(file, " return m.FSM_HALT(c);\n");
else
::fprintf(file, " return m.FSM_HALT();\n");
}
@ -3234,7 +3241,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const
}
#ifndef WITH_NO_CODEGEN
void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, bool& prev) const
void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek) const
{
if (nest > 5)
return;
@ -3251,13 +3258,12 @@ void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, b
{
do
{
if (lo == META_EOB || lo == META_EOL)
if (lo == META_EOB || lo == META_EOL || lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
{
peek = true;
else if (lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE)
prev = peek = true;
if (prev && peek)
break;
check_dfa_closure(i->second.second, nest + 1, peek, prev);
}
check_dfa_closure(i->second.second, nest + 1, peek);
} while (++lo <= hi);
}
}
@ -3271,14 +3277,14 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
if (state->redo)
{
if (peek)
::fprintf(file, "%*sm.FSM_REDO(c1);\n", 2*nest, "");
::fprintf(file, "%*sm.FSM_REDO(c);\n", 2*nest, "");
else
::fprintf(file, "%*sm.FSM_REDO();\n", 2*nest, "");
}
else if (state->accept > 0)
{
if (peek)
::fprintf(file, "%*sm.FSM_TAKE(%u, c1);\n", 2*nest, "", state->accept);
::fprintf(file, "%*sm.FSM_TAKE(%u, c);\n", 2*nest, "", state->accept);
else
::fprintf(file, "%*sm.FSM_TAKE(%u);\n", 2*nest, "", state->accept);
}
@ -3303,14 +3309,6 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
{
case META_EOB:
case META_EOL:
::fprintf(file, "%*s", 2*nest, "");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, nest + 1, peek);
::fprintf(file, "%*s}\n", 2*nest, "");
elif = true;
break;
case META_EWE:
case META_BWE:
case META_NWE:
@ -3318,7 +3316,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
::fprintf(file, "%*s", 2*nest, "");
if (elif)
::fprintf(file, "else ");
::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]);
gencode_dfa_closure(file, i->second.second, nest + 1, peek);
::fprintf(file, "%*s}\n", 2*nest, "");
elif = true;
@ -3346,7 +3344,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
::fprintf(file, "%*s", 2*nest, "");
if (lo == hi)
{
::fprintf(file, "if (c1 == ");
::fprintf(file, "if (c == ");
print_char(file, lo);
::fprintf(file, ")");
}
@ -3354,20 +3352,20 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
{
::fprintf(file, "if (");
print_char(file, lo);
::fprintf(file, " <= c1)");
::fprintf(file, " <= c)");
}
else
{
::fprintf(file, "if (");
print_char(file, lo);
::fprintf(file, " <= c1 && c1 <= ");
::fprintf(file, " <= c && c <= ");
print_char(file, hi);
::fprintf(file, ")");
}
if (target_index == Const::IMAX)
{
if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n");
::fprintf(file, " return m.FSM_HALT(c);\n");
else
::fprintf(file, " return m.FSM_HALT();\n");
}
@ -3394,7 +3392,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
::fprintf(file, "%*s", 2*nest, "");
if (lo == hi)
{
::fprintf(file, "if (c1 == ");
::fprintf(file, "if (c == ");
print_char(file, lo);
::fprintf(file, ")");
}
@ -3402,20 +3400,20 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest,
{
::fprintf(file, "if (");
print_char(file, lo);
::fprintf(file, " <= c1)");
::fprintf(file, " <= c)");
}
else
{
::fprintf(file, "if (");
print_char(file, lo);
::fprintf(file, " <= c1 && c1 <= ");
::fprintf(file, " <= c && c <= ");
print_char(file, hi);
::fprintf(file, ")");
}
if (target_index == Const::IMAX)
{
if (peek)
::fprintf(file, " return m.FSM_HALT(c1);\n");
::fprintf(file, " return m.FSM_HALT(c);\n");
else
::fprintf(file, " return m.FSM_HALT();\n");
}
@ -4560,26 +4558,31 @@ bool Pattern::match_hfa_transitions(size_t level, const HFA::Hashes& hashes, con
void Pattern::write_predictor(FILE *file) const
{
::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (len_ == 0) * 256 + Const::HASH + (lbk_ > 0) * 68);
::fprintf(file, "\n %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4) | ((lbk_ > 0) << 5))));
::fprintf(file, "\n %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4) | ((lbk_ > 0) << 5) | (bol_ << 6))));
// save match characters chr_[0..len_-1]
for (size_t i = 0; i < len_; ++i)
::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n ", static_cast<uint8_t>(chr_[i]));
if (len_ == 0)
{
// save bitap bit_[] parameters
for (Char i = 0; i < 256; ++i)
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~bit_[i]));
}
if (min_ >= 4)
{
for (Hash i = 0; i < Const::HASH; ++i)
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pmh_[i]));
}
else
if (min_ < 4)
{
// save predict match PM4 pma_[] parameters
for (Hash i = 0; i < Const::HASH; ++i)
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pma_[i]));
}
else
{
// save predict match hash pmh_[] parameters
for (Hash i = 0; i < Const::HASH; ++i)
::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pmh_[i]));
}
if (lbk_ > 0)
{
// save lookback parameters lbk_ lbm_ cbk_[] after s-t cut and first s-t cut pattern characters fst_[]
::fprintf(file, "\n %3hhu,%3hhu,%3hhu,%3hhu,", static_cast<uint8_t>(lbk_ & 0xff), static_cast<uint8_t>(lbk_ >> 8), static_cast<uint8_t>(lbm_ & 0xff), static_cast<uint8_t>(lbm_ >> 8));
for (size_t i = 0; i < 256; i += 8)
{

View File

@ -0,0 +1,157 @@
/******************************************************************************\
* Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions are met: *
* *
* (1) Redistributions of source code must retain the above copyright notice, *
* this list of conditions and the following disclaimer. *
* *
* (2) Redistributions in binary form must reproduce the above copyright *
* notice, this list of conditions and the following disclaimer in the *
* documentation and/or other materials provided with the distribution. *
* *
* (3) The name of the author may not be used to endorse or promote products *
* derived from this software without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF *
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO *
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; *
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, *
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR *
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF *
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
\******************************************************************************/
/**
@file simd.cpp
@brief RE/flex SIMD primitives
@author Robert van Engelen - engelen@genivia.com
@copyright (c) 2016-2024, Robert van Engelen, Genivia Inc. All rights reserved.
@copyright (c) BSD-3 License - see LICENSE.txt
*/
#include <reflex/simd.h>
namespace reflex {
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
// simd.h get_HW()
/// Collect CPU feature bits with cpuidex() into one 64-bit word.
/// The low 32 bits are word [2] of the leaf 1 result and the high 32 bits are
/// word [1] of the leaf 7 (subleaf 0) result; the high half stays zero when
/// leaf 7 is not reported as available.  Returns 0 when leaf 0 reports no
/// usable leaves.
/// NOTE(review): presumably cpuidex() fills { EAX, EBX, ECX, EDX } in that
/// order, making these ECX of leaf 1 and EBX of leaf 7 - confirm in simd.h.
static uint64_t get_HW()
{
  int leaf1[4] = { 0, 0, 0, 0 };
  int leaf7[4] = { 0, 0, 0, 0 };
  // leaf 0: word [0] is used as the highest leaf number we may query
  cpuidex(leaf1, 0, 0);
  const int max_leaf = leaf1[0];
  if (max_leaf <= 0)
    return 0ULL;
  // leaf 1 is always queried once any leaf is available
  cpuidex(leaf1, 1, 0);
  // leaf 7 subleaf 0 is queried only when the CPU reports it
  if (max_leaf >= 7)
    cpuidex(leaf7, 7, 0);
  // go through uint32_t first so a negative int does not sign-extend into the other half
  const uint64_t lower = static_cast<uint32_t>(leaf1[2]);
  const uint64_t upper = static_cast<uint32_t>(leaf7[1]);
  return lower | (upper << 32);
}
// feature word, computed once during static initialization
uint64_t HW = get_HW();
#endif
/// Count the newline characters in the half-open range [s, t).
/// @param s points to the first character to inspect
/// @param t points one past the last character to inspect
/// @returns the number of '\n' characters found in [s, t)
///
/// Ranges of at least 256 bytes are first processed in 64-byte strides by a
/// SIMD routine selected at compile time (and, for AVX, at run time); whatever
/// remains is counted by the scalar loops at the end.  All range checks compare
/// the distance t - s rather than forming t - K, so no pointer before the start
/// of a short buffer is ever computed.
size_t nlcount(const char *s, const char *t)
{
  size_t n = 0;
  if (t - s >= 256)
  {
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
    // the simd_nlcount_* helpers advance s past the part they counted
    if (have_HW_AVX512BW())
      n = simd_nlcount_avx512bw(s, t);
    else if (have_HW_AVX2())
      n = simd_nlcount_avx2(s, t);
    else
#elif defined(HAVE_AVX512BW) || defined(HAVE_AVX2)
    if (have_HW_AVX2())
      n = simd_nlcount_avx2(s, t);
    else
#endif
#if defined(HAVE_AVX512BW) || defined(HAVE_AVX2) || defined(HAVE_SSE2)
    {
      // SSE2: t - 64 is within the buffer here because the range holds at least 256 bytes
      const char *e = t - 64;
      // align on 16 bytes
      while ((reinterpret_cast<std::ptrdiff_t>(s) & 0x0f) != 0)
        n += (*s++ == '\n');
      __m128i vlcn = _mm_set1_epi8('\n');
      while (s <= e)
      {
        __m128i vlcm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
        __m128i vlcm2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 16));
        __m128i vlcm3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 32));
        __m128i vlcm4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 48));
        __m128i vlceq1 = _mm_cmpeq_epi8(vlcm1, vlcn);
        __m128i vlceq2 = _mm_cmpeq_epi8(vlcm2, vlcn);
        __m128i vlceq3 = _mm_cmpeq_epi8(vlcm3, vlcn);
        __m128i vlceq4 = _mm_cmpeq_epi8(vlcm4, vlcn);
        // one mask bit per matching byte, so the popcounts add up to the newlines in this 64-byte stride
        n += popcount(_mm_movemask_epi8(vlceq1))
          +  popcount(_mm_movemask_epi8(vlceq2))
          +  popcount(_mm_movemask_epi8(vlceq3))
          +  popcount(_mm_movemask_epi8(vlceq4));
        s += 64;
      }
    }
#elif defined(HAVE_NEON)
    const char *e = t - 64;
    uint8x16_t vlcn = vdupq_n_u8('\n');
    while (s <= e)
    {
      uint8x16_t vlcm0 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq0 = vceqq_u8(vlcm0, vlcn);
      s += 16;
      uint8x16_t vlcm1 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq1 = vceqq_u8(vlcm1, vlcn);
      s += 16;
      uint8x16_t vlcm2 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq2 = vceqq_u8(vlcm2, vlcn);
      s += 16;
      uint8x16_t vlcm3 = vld1q_u8(reinterpret_cast<const uint8_t*>(s));
      uint8x16_t vleq3 = vceqq_u8(vlcm3, vlcn);
      s += 16;
      // each match is 0xff (-1 as int8), so a lane of the four-way sum holds -4..0 and its absolute value 0..4
#if defined(__aarch64__)
      n += vaddvq_s8(vqabsq_s8(vreinterpretq_s8_u8(vaddq_u8(vleq0, vaddq_u8(vleq1, vaddq_u8(vleq2, vleq3))))));
#else
      // my homebrew horizontal sum (we have a very limited range 0..4 to sum to a total max 4x16=64 < 256)
      uint64x2_t vsum = vreinterpretq_u64_s8(vqabsq_s8(vreinterpretq_s8_u8(vaddq_u8(vleq0, vaddq_u8(vleq1, vaddq_u8(vleq2, vleq3))))));
      uint64_t sum0 = vgetq_lane_u64(vsum, 0) + vgetq_lane_u64(vsum, 1);
      uint32_t sum1 = static_cast<uint32_t>(sum0) + (sum0 >> 32);
      uint16_t sum2 = static_cast<uint16_t>(sum1) + (sum1 >> 16);
      n += static_cast<uint8_t>(sum2) + (sum2 >> 8);
#endif
    }
#endif
  }
  // 4-way auto-vectorizable loop
  uint32_t n0 = 0, n1 = 0, n2 = 0, n3 = 0;
  while (t - s >= 4)
  {
    n0 += s[0] == '\n';
    n1 += s[1] == '\n';
    n2 += s[2] == '\n';
    n3 += s[3] == '\n';
    s += 4;
  }
  n += n0 + n1 + n2 + n3;
  // epilogue: at most three characters remain
  while (s < t)
    n += *s++ == '\n';
  return n;
}
} // namespace reflex

View File

@ -34,15 +34,20 @@
@copyright (c) BSD-3 License - see LICENSE.txt
*/
#include <reflex/absmatcher.h>
#include <cstddef>
#if defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
# if !defined(__AVX2__) && !defined(__AVX512BW__)
# error simd_avx2.cpp must be compiled with -mavx2 or /arch:avx2.
# endif
#endif
#include <reflex/simd.h>
namespace reflex {
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
// Partially count newlines in string b up to e, updates b close to e with uncounted part
size_t simd_nlcount_avx2(const char*& b, const char *e)
{
#if defined(HAVE_AVX2)
#if defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
const char *s = b;
e -= 128;
if (s > e)
@ -73,42 +78,4 @@ size_t simd_nlcount_avx2(const char*& b, const char *e)
#endif
}
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
size_t simd_nlcount_sse2(const char*& b, const char *e)
{
#if defined(HAVE_SSE2)
const char *s = b;
e -= 64;
if (s > e)
return 0;
size_t n = 0;
// align on 16 bytes
while ((reinterpret_cast<std::ptrdiff_t>(s) & 0x0f) != 0)
n += (*s++ == '\n');
__m128i vlcn = _mm_set1_epi8('\n');
while (s <= e)
{
__m128i vlcm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
__m128i vlcm2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 16));
__m128i vlcm3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 32));
__m128i vlcm4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + 48));
__m128i vlceq1 = _mm_cmpeq_epi8(vlcm1, vlcn);
__m128i vlceq2 = _mm_cmpeq_epi8(vlcm2, vlcn);
__m128i vlceq3 = _mm_cmpeq_epi8(vlcm3, vlcn);
__m128i vlceq4 = _mm_cmpeq_epi8(vlcm4, vlcn);
n += popcount(_mm_movemask_epi8(vlceq1))
+ popcount(_mm_movemask_epi8(vlceq2))
+ popcount(_mm_movemask_epi8(vlceq3))
+ popcount(_mm_movemask_epi8(vlceq4));
s += 64;
}
b = s;
return n;
#else
(void)b;
(void)e;
return 0;
#endif
}
} // namespace reflex

View File

@ -34,12 +34,17 @@
@copyright (c) BSD-3 License - see LICENSE.txt
*/
#include <reflex/absmatcher.h>
#include <cstddef>
#if defined(HAVE_AVX512BW)
# if !defined(__AVX512BW__)
# error simd_avx512bw.cpp must be compiled with -mavx512bw or /arch:avx512.
# endif
#endif
#include <reflex/simd.h>
namespace reflex {
// Partially count newlines in string b up to and including position e in b, updates b close to e with uncounted part
// Partially count newlines in string b up to e, updates b close to e with uncounted part
size_t simd_nlcount_avx512bw(const char*& b, const char *e)
{
#if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))

View File

@ -83,9 +83,15 @@ Tables::Tables()
range["Control"] = range["Cc"];
range["Format"] = range["Cf"];
range["d"] = range["Decimal_Digit_Number"];
range["l"] = range["Lowercase_Letter"];
range["u"] = range["Uppercase_Letter"];
range["Cntrl"] = range["C"];
range["Digit"] = range["Nd"];
range["Lower"] = range["Ll"];
range["Punct"] = range["P"];
range["Upper"] = range["Lu"];
range["d"] = range["Digit"];
range["l"] = range["Lower"];
range["u"] = range["Upper"];
range["s"] = range["Space"];
range["w"] = range["Word"];
}

View File

@ -2107,6 +2107,36 @@ void reflex::Unicode::Tables::language_scripts(void)
0, 0
};
range["Grantha"] = Grantha;
static const int Graph[] = {
33, 126,
161, 172,
174, 1535,
1542, 1563,
1565, 1756,
1758, 1806,
1808, 2191,
2194, 2273,
2275, 5759,
5761, 6157,
6159, 8191,
8208, 8231,
8240, 8286,
8293, 8293,
8304, 12287,
12289, 55295,
57344, 65278,
65280, 65528,
65532, 69820,
69822, 69836,
69838, 78895,
78912, 113823,
113828, 119154,
119163, 917504,
917506, 917535,
917632, 1114111,
0, 0
};
range["Graph"] = Graph;
static const int Greek[] = {
880, 883,
885, 887,
@ -7430,6 +7460,34 @@ void reflex::Unicode::Tables::language_scripts(void)
0, 0
};
range["Po"] = Po;
static const int Print[] = {
32, 126,
160, 172,
174, 1535,
1542, 1563,
1565, 1756,
1758, 1806,
1808, 2191,
2194, 2273,
2275, 6157,
6159, 8202,
8208, 8233,
8239, 8287,
8293, 8293,
8304, 55295,
57344, 65278,
65280, 65528,
65532, 69820,
69822, 69836,
69838, 78895,
78912, 113823,
113828, 119154,
119163, 917504,
917506, 917535,
917632, 1114111,
0, 0
};
range["Print"] = Print;
static const int Ps[] = {
40, 40,
91, 91,

View File

@ -2,6 +2,377 @@
#include <reflex/unicode.h>
void reflex::Unicode::Tables::letter_scripts(void)
{
static const int Alnum[] = {
48, 57,
65, 90,
97, 122,
181, 181,
192, 214,
216, 246,
248, 442,
444, 447,
452, 452,
454, 455,
457, 458,
460, 497,
499, 659,
661, 687,
880, 883,
886, 887,
891, 893,
895, 895,
902, 902,
904, 906,
908, 908,
910, 929,
931, 1013,
1015, 1153,
1162, 1327,
1329, 1366,
1376, 1416,
1632, 1641,
1776, 1785,
1984, 1993,
2406, 2415,
2534, 2543,
2662, 2671,
2790, 2799,
2918, 2927,
3046, 3055,
3174, 3183,
3302, 3311,
3430, 3439,
3558, 3567,
3664, 3673,
3792, 3801,
3872, 3881,
4160, 4169,
4240, 4249,
4256, 4293,
4295, 4295,
4301, 4301,
4304, 4346,
4349, 4351,
5024, 5109,
5112, 5117,
6112, 6121,
6160, 6169,
6470, 6479,
6608, 6617,
6784, 6793,
6800, 6809,
6992, 7001,
7088, 7097,
7232, 7241,
7248, 7257,
7296, 7304,
7312, 7354,
7357, 7359,
7424, 7467,
7531, 7543,
7545, 7578,
7680, 7957,
7960, 7965,
7968, 8005,
8008, 8013,
8016, 8023,
8025, 8025,
8027, 8027,
8029, 8029,
8031, 8061,
8064, 8071,
8080, 8087,
8096, 8103,
8112, 8116,
8118, 8123,
8126, 8126,
8130, 8132,
8134, 8139,
8144, 8147,
8150, 8155,
8160, 8172,
8178, 8180,
8182, 8187,
8450, 8450,
8455, 8455,
8458, 8467,
8469, 8469,
8473, 8477,
8484, 8484,
8486, 8486,
8488, 8488,
8490, 8493,
8495, 8500,
8505, 8505,
8508, 8511,
8517, 8521,
8526, 8526,
8579, 8580,
11264, 11387,
11390, 11492,
11499, 11502,
11506, 11507,
11520, 11557,
11559, 11559,
11565, 11565,
42528, 42537,
42560, 42605,
42624, 42651,
42786, 42863,
42865, 42887,
42891, 42894,
42896, 42954,
42960, 42961,
42963, 42963,
42965, 42969,
42997, 42998,
43002, 43002,
43216, 43225,
43264, 43273,
43472, 43481,
43504, 43513,
43600, 43609,
43824, 43866,
43872, 43880,
43888, 43967,
44016, 44025,
64256, 64262,
64275, 64279,
65296, 65305,
65313, 65338,
65345, 65370,
66560, 66639,
66720, 66729,
66736, 66771,
66776, 66811,
66928, 66938,
66940, 66954,
66956, 66962,
66964, 66965,
66967, 66977,
66979, 66993,
66995, 67001,
67003, 67004,
68736, 68786,
68800, 68850,
68912, 68921,
69734, 69743,
69872, 69881,
69942, 69951,
70096, 70105,
70384, 70393,
70736, 70745,
70864, 70873,
71248, 71257,
71360, 71369,
71472, 71481,
71840, 71913,
72016, 72025,
72784, 72793,
73040, 73049,
73120, 73129,
73552, 73561,
92768, 92777,
92864, 92873,
93008, 93017,
93760, 93823,
119808, 119892,
119894, 119964,
119966, 119967,
119970, 119970,
119973, 119974,
119977, 119980,
119982, 119993,
119995, 119995,
119997, 120003,
120005, 120069,
120071, 120074,
120077, 120084,
120086, 120092,
120094, 120121,
120123, 120126,
120128, 120132,
120134, 120134,
120138, 120144,
120146, 120485,
120488, 120512,
120514, 120538,
120540, 120570,
120572, 120596,
120598, 120628,
120630, 120654,
120656, 120686,
120688, 120712,
120714, 120744,
120746, 120770,
120772, 120779,
120782, 120831,
122624, 122633,
122635, 122654,
122661, 122666,
123200, 123209,
123632, 123641,
124144, 124153,
125184, 125251,
125264, 125273,
130032, 130041,
0, 0
};
range["Alnum"] = Alnum;
static const int Alpha[] = {
65, 90,
97, 122,
181, 181,
192, 214,
216, 246,
248, 442,
444, 447,
452, 452,
454, 455,
457, 458,
460, 497,
499, 659,
661, 687,
880, 883,
886, 887,
891, 893,
895, 895,
902, 902,
904, 906,
908, 908,
910, 929,
931, 1013,
1015, 1153,
1162, 1327,
1329, 1366,
1376, 1416,
4256, 4293,
4295, 4295,
4301, 4301,
4304, 4346,
4349, 4351,
5024, 5109,
5112, 5117,
7296, 7304,
7312, 7354,
7357, 7359,
7424, 7467,
7531, 7543,
7545, 7578,
7680, 7957,
7960, 7965,
7968, 8005,
8008, 8013,
8016, 8023,
8025, 8025,
8027, 8027,
8029, 8029,
8031, 8061,
8064, 8071,
8080, 8087,
8096, 8103,
8112, 8116,
8118, 8123,
8126, 8126,
8130, 8132,
8134, 8139,
8144, 8147,
8150, 8155,
8160, 8172,
8178, 8180,
8182, 8187,
8450, 8450,
8455, 8455,
8458, 8467,
8469, 8469,
8473, 8477,
8484, 8484,
8486, 8486,
8488, 8488,
8490, 8493,
8495, 8500,
8505, 8505,
8508, 8511,
8517, 8521,
8526, 8526,
8579, 8580,
11264, 11387,
11390, 11492,
11499, 11502,
11506, 11507,
11520, 11557,
11559, 11559,
11565, 11565,
42560, 42605,
42624, 42651,
42786, 42863,
42865, 42887,
42891, 42894,
42896, 42954,
42960, 42961,
42963, 42963,
42965, 42969,
42997, 42998,
43002, 43002,
43824, 43866,
43872, 43880,
43888, 43967,
64256, 64262,
64275, 64279,
65313, 65338,
65345, 65370,
66560, 66639,
66736, 66771,
66776, 66811,
66928, 66938,
66940, 66954,
66956, 66962,
66964, 66965,
66967, 66977,
66979, 66993,
66995, 67001,
67003, 67004,
68736, 68786,
68800, 68850,
71840, 71903,
93760, 93823,
119808, 119892,
119894, 119964,
119966, 119967,
119970, 119970,
119973, 119974,
119977, 119980,
119982, 119993,
119995, 119995,
119997, 120003,
120005, 120069,
120071, 120074,
120077, 120084,
120086, 120092,
120094, 120121,
120123, 120126,
120128, 120132,
120134, 120134,
120138, 120144,
120146, 120485,
120488, 120512,
120514, 120538,
120540, 120570,
120572, 120596,
120598, 120628,
120630, 120654,
120656, 120686,
120688, 120712,
120714, 120744,
120746, 120770,
120772, 120779,
122624, 122633,
122635, 122654,
122661, 122666,
125184, 125251,
0, 0
};
range["Alpha"] = Alpha;
static const int Ll[] = {
97, 122,
181, 181,

View File

@ -31,6 +31,7 @@
#include "../lib/error.cpp"
#include "../lib/input.cpp"
#include "../lib/matcher.cpp"
#include "../lib/simd.cpp"
#undef min
#undef max

View File

@ -2,6 +2,11 @@
$workDir = Resolve-Path -Path "${PSScriptRoot}\..\ccl\rslang\src"
# Change default relative path according to your work directory setup
# Re-flex repository: https://github.com/Genivia/RE-flex
$reflexRelative = Resolve-Path -Path "${PSScriptRoot}\..\..\GH-RE-flex\bin\win64"
$Env:PATH += ";${reflexRelative}"
function BuildLexers {
Set-Location -Path ${workDir}
BuildSyntax('AsciiLexer')