Files
pip/libs/main/text/piregularexpression.cpp

323 lines
8.3 KiB
C++

#include "piregularexpression.h"
// clang-format off
#define PCRE2_CODE_UNIT_WIDTH 16
#include <pcre2.h>
#include <pistringlist.h>
// clang-format on
PRIVATE_DEFINITION_START(PIRegularExpression)
// Compiled PCRE2 pattern; nullptr until compile() succeeds, released by free().
pcre2_code * compiled = nullptr;
// Message of the last failed compilation; cleared by free() and after a successful compile.
PIString error_msg;
// Offset written by pcre2_compile() for the last compilation; reset to 0 by free().
PCRE2_SIZE error_offset = 0;
// Match data block created from the compiled pattern and reused by every match() call.
pcre2_match_data * match_data = nullptr;
// Capture group count reported by PCRE2, or -1 while no pattern is compiled.
int capture_count = -1;
// Named group lookup: group name -> group number.
PIMap<PIString, int> named_group_index;
// Named group lookup: group number -> group name.
PIMap<int, PIString> named_group_name;
// A pattern is usable only when both the compiled code and its match data block exist.
bool isCompiled() const {
	if (!compiled) return false;
	return match_data != nullptr;
}
// Releases the PCRE2 objects (if any) and puts every per-pattern field back to its initial value.
void free() {
	if (match_data) pcre2_match_data_free(match_data);
	match_data = nullptr;
	if (compiled) pcre2_code_free(compiled);
	compiled = nullptr;

	capture_count = -1;
	error_offset  = 0;
	error_msg.clear();
	named_group_name.clear();
	named_group_index.clear();
}
// Builds a string from a zero-terminated sequence of PIChar code units.
//   ptr      - start of the sequence (may be nullptr, which yields an empty string)
//   max_size - upper bound on the number of code units that may be read
// Reading stops at the first zero code unit or after max_size units, whichever
// comes first. The bound is tested BEFORE each read, so the function never
// touches memory beyond ptr[max_size - 1] and never returns more than
// max_size characters.
PIString getNEString(const void * ptr, uint32_t max_size) {
	PIString ret;
	if (!ptr) return ret;
	const auto * cptr = static_cast<const PIChar *>(ptr);
	for (uint32_t i = 0; i < max_size && cptr[i] != PIChar(); ++i) {
		ret.append(cptr[i]);
	}
	return ret;
}
// Translates the library option flags into a PCRE2 compile option mask.
// PCRE2_UTF | PCRE2_NO_UTF_CHECK is always part of the result.
// NOTE(review): Singleline is mapped to PCRE2_FIRSTLINE. In most regex APIs
// "single line" means "dot matches newline", which would be PCRE2_DOTALL.
// Confirm the intended meaning against the header before changing it.
uint32_t convertOptions(Options opt) {
	uint32_t flags = PCRE2_UTF | PCRE2_NO_UTF_CHECK;
	flags |= opt[CaseInsensitive] ? PCRE2_CASELESS : 0u;
	flags |= opt[Singleline] ? PCRE2_FIRSTLINE : 0u;
	flags |= opt[Multiline] ? PCRE2_MULTILINE : 0u;
	flags |= opt[InvertedGreediness] ? PCRE2_UNGREEDY : 0u;
	flags |= opt[Extended] ? PCRE2_EXTENDED : 0u;
	return flags;
}
// Compiles `pat` with the given options and rebuilds all per-pattern state.
//   pat - pattern text; its internal buffer is handed to PCRE2 as 16-bit code units
//   opt - library options, translated by convertOptions()
// Returns true when both the compiled pattern and its match data block exist.
// On failure error_msg / error_offset describe the problem (an empty pattern
// fails without a message). Any previously compiled pattern is always released.
bool compile(PIString & pat, Options opt) {
	free();
	if (pat.isEmpty()) return false;
	int error_number = 0;
	compiled = pcre2_compile((PCRE2_SPTR) & (pat[0]), pat.size(), convertOptions(opt), &error_number, &error_offset, nullptr);
	if (!compiled) {
		// pcre2_get_error_message() expects the buffer length in CODE UNITS, not bytes.
		constexpr uint32_t buffer_len = 256;
		PIChar buffer[buffer_len];
		const int sz = pcre2_get_error_message(error_number, (PCRE2_UCHAR16 *)buffer, buffer_len);
		if (sz >= 0) {
			error_msg = PIString(buffer, sz);
		} else if (sz == PCRE2_ERROR_NOMEMORY) {
			// Buffer too small: PCRE2 stored a truncated, zero-terminated message.
			error_msg = getNEString(buffer, buffer_len - 1);
		}
		// Any other negative result (unknown error number) leaves error_msg empty.
		return false;
	}
	match_data = pcre2_match_data_create_from_pattern(compiled, nullptr);

	uint32_t namecount = 0, name_entry_size = 0, cap_count = 0;
	PCRE2_SPTR name_table = nullptr;
	pcre2_pattern_info(compiled, PCRE2_INFO_CAPTURECOUNT, &cap_count);
	pcre2_pattern_info(compiled, PCRE2_INFO_NAMECOUNT, &namecount);
	pcre2_pattern_info(compiled, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size);
	pcre2_pattern_info(compiled, PCRE2_INFO_NAMETABLE, &name_table);
	capture_count = cap_count;

	// Each name table entry is name_entry_size code units long: one unit with
	// the group number followed by the zero-terminated group name.
	PCRE2_SPTR entry = name_table;
	for (uint32_t i = 0; i < namecount; ++i, entry += name_entry_size) {
		const int gnum        = entry[0];
		const PIString gname  = getNEString(entry + 1, name_entry_size - 1);
		named_group_index[gname] = gnum;
		named_group_name[gnum]   = gname;
	}
	return isCompiled();
}
// Runs the compiled pattern against ret's subject starting at ret.start_offset
// and stores the outcome in ret:
//   - has_match / is_error are reset first, groups are cleared;
//   - on success groups[i] holds offset and length of capture i (0 = whole match)
//     and start_offset is moved to the end of the match;
//   - "no match" leaves has_match == false, any other PCRE2 error sets is_error.
// A missing pattern or a null subject is reported through is_error.
//
// Empty matches: if the previous successful match on this matcher was empty and
// ended exactly at start_offset, repeating the same search would return the same
// empty match forever. As in the pcre2demo sample, a non-empty match anchored at
// that position is tried first; if there is none, the search resumes one
// character further (two code units when that would split a surrogate pair).
void match(Matcher & ret) {
	const bool after_empty = ret.groups.size_s() > 0 && ret.groups[0].size == 0
	                         && (PCRE2_SIZE)ret.groups[0].index == (PCRE2_SIZE)ret.start_offset;
	ret.has_match = ret.is_error = false;
	ret.groups.clear();
	if (!isCompiled() || !ret.subject) {
		ret.is_error = true;
		return;
	}
	const PCRE2_SPTR subject_ptr = (PCRE2_SPTR)ret.subjectPtr();
	const PCRE2_SIZE subject_len = ret.subject->size();
	PCRE2_SIZE offset            = ret.start_offset;
	int rc                       = PCRE2_ERROR_NOMATCH;
	bool need_search             = true;
	if (after_empty) {
		rc = pcre2_match(compiled,
		                 subject_ptr,
		                 subject_len,
		                 offset,
		                 PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED,
		                 match_data,
		                 nullptr);
		if (rc == PCRE2_ERROR_NOMATCH) {
			// Nothing but the empty match here: step over one character.
			if (offset >= subject_len) return;
			++offset;
			// Do not restart in the middle of a UTF-16 surrogate pair.
			if (offset < subject_len && (subject_ptr[offset] & 0xFC00) == 0xDC00) ++offset;
		} else {
			need_search = false;
		}
	}
	if (need_search) {
		rc = pcre2_match(compiled, subject_ptr, subject_len, offset, PCRE2_NO_UTF_CHECK, match_data, nullptr);
	}
	if (rc == PCRE2_ERROR_NOMATCH) return;
	if (rc < 0) {
		ret.is_error = true;
		return;
	}
	ret.has_match = true;
	auto ovector  = pcre2_get_ovector_pointer(match_data);
	for (int i = 0; i < rc; i++) {
		// Offsets are stored exactly as PCRE2 reports them.
		Matcher::Group g;
		g.index = ovector[2 * i];
		g.size  = ovector[2 * i + 1] - ovector[2 * i];
		ret.groups << g;
	}
	ret.start_offset = ovector[1];
}
PRIVATE_DEFINITION_END(PIRegularExpression)
// Constructs the expression by storing and compiling `pattern` with options `opt`
// (delegates to setPattern()).
PIRegularExpression::PIRegularExpression(const PIString & pattern, Options opt) {
setPattern(pattern, opt);
}
// Copy constructor: takes the other object's pattern and options and compiles
// them again through setPattern() instead of sharing the compiled PCRE2 objects.
PIRegularExpression::PIRegularExpression(const PIRegularExpression & o) {
setPattern(o.pat_, o.opt_);
}
// Copy assignment: re-runs setPattern() with the other object's pattern and
// options (the private compile() releases the previous state first).
PIRegularExpression & PIRegularExpression::operator=(const PIRegularExpression & o) {
setPattern(o.pat_, o.opt_);
return *this;
}
// Destructor: releases the compiled pattern and match data via the private free().
PIRegularExpression::~PIRegularExpression() {
PRIVATE->free();
}
// Stores `pattern` and compiles it with the currently stored options.
// The compile result is not returned; query isValid() / errorString() afterwards.
void PIRegularExpression::setPattern(const PIString & pattern) {
pat_ = pattern;
// compile() receives the stored copy, not the caller's string.
PRIVATE->compile(pat_, opt_);
}
// Stores the options first, then stores and compiles `pattern` with them.
void PIRegularExpression::setPattern(const PIString & pattern, Options opt) {
opt_ = opt;
setPattern(pattern);
}
// Returns true when a pattern is compiled and ready for matching
// (both the compiled code and the match data block exist).
bool PIRegularExpression::isValid() const {
return PRIVATE->isCompiled();
}
// Returns the message of the last failed compilation; empty when there is none.
PIString PIRegularExpression::errorString() const {
return PRIVATE->error_msg;
}
// Returns the offset stored by pcre2_compile() for the last compilation,
// converted to int (0 after a reset).
int PIRegularExpression::errorPosition() const {
return PRIVATE->error_offset;
}
// Returns the capture group count reported by PCRE2, or -1 when no pattern is compiled.
int PIRegularExpression::captureGroupsCount() const {
return PRIVATE->capture_count;
}
// Returns the names of all named capture groups (the values of the
// number -> name map; ordering follows that map).
PIStringList PIRegularExpression::captureGroupNames() const {
return PRIVATE->named_group_name.values();
}
// Returns the name registered for group number `index`, as looked up in the
// number -> name map (presumably an empty string for unnamed or unknown groups
// - depends on PIMap::value() defaults, TODO confirm).
PIString PIRegularExpression::captureGroupName(int index) const {
return PRIVATE->named_group_name.value(index);
}
// Returns the number of the capture group called `gname`, or -1 when the
// pattern has no such named group. (A plain map lookup would fall back to 0,
// which is indistinguishable from the whole-match group.)
int PIRegularExpression::captureGroupIndex(const PIString & gname) const {
	return PRIVATE->named_group_index.value(gname, -1);
}
// Creates a matcher bound to this expression that refers to the caller's string
// directly (no copy is made) and starts searching at `offset`.
// The caller's string must stay alive while the matcher is used.
PIRegularExpression::Matcher PIRegularExpression::makeMatcher(PIString & subject, size_t offset) {
	PIRegularExpression::Matcher matcher(this);
	matcher.subject      = &subject;
	matcher.start_offset = offset;
	return matcher;
}
// Creates a matcher bound to this expression that keeps its own copy of
// `subject` and starts searching at `offset`.
// NOTE(review): `subject` points at the matcher's own `subject_own` member, so
// a copied or moved Matcher must re-point it - confirm Matcher's copy/move
// operations in the header handle this.
PIRegularExpression::Matcher PIRegularExpression::makeMatcher(const PIString & subject, size_t offset) {
	PIRegularExpression::Matcher matcher(this);
	matcher.subject_own  = subject;
	matcher.subject      = &matcher.subject_own;
	matcher.start_offset = offset;
	return matcher;
}
// Performs one match attempt on the caller's string (not copied) starting at
// `offset` and returns the matcher holding the result.
PIRegularExpression::Matcher PIRegularExpression::match(PIString & subject, size_t offset) {
	PIRegularExpression::Matcher matcher = makeMatcher(subject, offset);
	PRIVATE->match(matcher);
	return matcher;
}
// Performs one match attempt on a private copy of `subject` starting at
// `offset` and returns the matcher holding the result.
PIRegularExpression::Matcher PIRegularExpression::match(const PIString & subject, size_t offset) {
	PIRegularExpression::Matcher matcher = makeMatcher(subject, offset);
	PRIVATE->match(matcher);
	return matcher;
}
// Creates a matcher that remembers the expression `p` it belongs to; the
// pointer is not owned and is dereferenced later by next() and the named-group accessors.
PIRegularExpression::Matcher::Matcher(PIRegularExpression * p): parent(p) {}
// Returns a pointer to the first character of the subject string, or nullptr
// when no subject is attached.
// NOTE(review): element 0 is taken even for an empty subject - confirm that
// PIString::operator[] tolerates this.
PIChar * PIRegularExpression::Matcher::subjectPtr() const {
	return subject ? &(*subject)[0] : nullptr;
}
// Returns the has_match flag set by the most recent match attempt.
bool PIRegularExpression::Matcher::hasMatch() const {
return has_match;
}
// Runs the next match attempt with the parent expression, starting at the
// current start_offset, and returns whether a match was found.
bool PIRegularExpression::Matcher::next() {
parent->PRIVATEWB->match(*this);
return hasMatch();
}
// Returns the text of every captured group of the last match, in group order
// (element 0 is the whole match). Empty list when there is no subject or no groups.
PIStringList PIRegularExpression::Matcher::matchedStrings() const {
	PIStringList captured;
	if (subject) {
		for (const auto & group: groups) {
			captured << subject->mid(group.index, group.size);
		}
	}
	return captured;
}
// Returns the text captured by group `index` (0 = whole match); an empty
// string when the index is out of range or no subject is attached.
PIString PIRegularExpression::Matcher::matchedString(int index) const {
	const bool in_range = index >= 0 && index < groups.size_s();
	if (in_range && subject) {
		const auto & group = groups[index];
		return subject->mid(group.index, group.size);
	}
	return {};
}
// Returns the start offset of group `index` within the subject, or -1 when
// the index is out of range.
int PIRegularExpression::Matcher::matchedStart(int index) const {
	if (index >= 0 && index < groups.size_s()) {
		return groups[index].index;
	}
	return -1;
}
// Returns the length of group `index`, or -1 when the index is out of range.
int PIRegularExpression::Matcher::matchedSize(int index) const {
	if (index >= 0 && index < groups.size_s()) {
		return groups[index].size;
	}
	return -1;
}
// Returns the text captured by the group called `gname`; an empty string when
// the pattern has no such named group. The lookup falls back to -1 so an
// unknown name is rejected by the index overload instead of silently
// resolving to group 0 (the whole match).
PIString PIRegularExpression::Matcher::matchedString(const PIString & gname) const {
	return matchedString(parent->PRIVATEWB->named_group_index.value(gname, -1));
}
// Returns the start offset of the group called `gname`, or -1 when the
// pattern has no such named group. The lookup falls back to -1 so an unknown
// name does not silently resolve to group 0 (the whole match).
int PIRegularExpression::Matcher::matchedStart(const PIString & gname) const {
	return matchedStart(parent->PRIVATEWB->named_group_index.value(gname, -1));
}
// Returns the length of the group called `gname`, or -1 when the pattern has
// no such named group. The lookup falls back to -1 so an unknown name does
// not silently resolve to group 0 (the whole match).
int PIRegularExpression::Matcher::matchedSize(const PIString & gname) const {
	return matchedSize(parent->PRIVATEWB->named_group_index.value(gname, -1));
}
// Builds an expression from a glob pattern by letting PCRE2 convert it
// (PCRE2_CONVERT_GLOB) and compiling the result with `opt`.
// If the conversion fails the returned object is left default-constructed.
PIRegularExpression PIRegularExpression::fromGlob(const PIString & pattern, Options opt) {
	PIRegularExpression regex;
	regex.convertFrom(pattern, PCRE2_CONVERT_GLOB, opt);
	return regex;
}
// Builds an expression from a POSIX basic regular expression by letting PCRE2
// convert it (PCRE2_CONVERT_POSIX_BASIC) and compiling the result with `opt`.
// If the conversion fails the returned object is left default-constructed.
PIRegularExpression PIRegularExpression::fromPOSIX(const PIString & pattern, Options opt) {
	PIRegularExpression regex;
	regex.convertFrom(pattern, PCRE2_CONVERT_POSIX_BASIC, opt);
	return regex;
}
void PIRegularExpression::convertFrom(const PIString & pattern, uint type, Options opt) {
if (pattern.isEmpty()) return;
PIChar * cptr = &((PIString &)pattern)[0];
PCRE2_UCHAR * out = nullptr;
PCRE2_SIZE out_size = 0;
int rc = pcre2_pattern_convert((PCRE2_SPTR)cptr,
pattern.size_s(),
type | PCRE2_CONVERT_UTF | PCRE2_CONVERT_NO_UTF_CHECK,
&out,
&out_size,
nullptr);
if (rc != 0) {
piCout << "PIRegularExpression::convertFrom error" << rc;
} else {
setPattern(PIString((PIChar *)out, out_size), opt);
}
pcre2_converted_pattern_free(out);
}