[Enhancement] Optimize the performance for regexp_replace (#16356)

performance case:
```
SELECT max(REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1')) AS k, COUNT(*) AS c FROM hits where Referer<>'';
```
baseline: 3.55
after upgrading re2: 2.75
after upgrading re2 and optimizing allocation: 2.12
This commit is contained in:
stdpain 2023-01-10 10:04:01 +08:00 committed by GitHub
parent 147b3360e0
commit 8c04b6c240
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 111 additions and 8 deletions

View File

@ -2818,6 +2818,7 @@ static ColumnPtr regexp_replace_const(re2::RE2* const_re, const Columns& columns
auto size = columns[0]->size();
ColumnBuilder<TYPE_VARCHAR> result(size);
std::string result_str;
for (int row = 0; row < size; ++row) {
if (str_viewer.is_null(row) || rpl_viewer.is_null(row)) {
result.append_null();
@ -2827,8 +2828,9 @@ static ColumnPtr regexp_replace_const(re2::RE2* const_re, const Columns& columns
auto rpl_value = rpl_viewer.value(row);
re2::StringPiece rpl_str = re2::StringPiece(rpl_value.get_data(), rpl_value.get_size());
auto str_value = str_viewer.value(row);
std::string result_str(str_value.get_data(), str_value.get_size());
re2::RE2::GlobalReplace(&result_str, *const_re, rpl_str);
re2::StringPiece str_str = re2::StringPiece(str_value.get_data(), str_value.get_size());
result_str.clear();
re2::RE2::GlobalReplace(str_str, *const_re, rpl_str, result_str);
result.append(Slice(result_str.data(), result_str.size()));
}

View File

@ -437,7 +437,8 @@ build_re2() {
check_if_source_exist $RE2_SOURCE
cd $TP_SOURCE_DIR/$RE2_SOURCE
$CMAKE_CMD -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=0 -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR
$CMAKE_CMD -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=0 -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR -DCMAKE_INSTALL_LIBDIR=lib
${BUILD_SYSTEM} -j$PARALLEL install
}

View File

@ -256,7 +256,7 @@ echo "Finished patching $GLOG_SOURCE"
# re2 patch
cd $TP_SOURCE_DIR/$RE2_SOURCE
if [ ! -f $PATCHED_MARK ]; then
patch -p0 < $TP_PATCH_DIR/re2-2017-05-01.patch
patch -p1 < $TP_PATCH_DIR/re2-2022-12-01.patch
touch $PATCHED_MARK
fi
cd -

100
thirdparty/patches/re2-2022-12-01.patch vendored Normal file
View File

@ -0,0 +1,100 @@
commit 6bfac7c766bddb2ac9eda2c2acd98009b9da95bd
Author: stdpain <drfeng08@gmail.com>
Date: Mon Jan 9 10:00:35 2023 +0800
add an interface to reuse memory for GlobalReplace
diff --git a/re2/re2.cc b/re2/re2.cc
index b24c6d6..9d4969e 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -461,6 +461,30 @@ bool RE2::Replace(std::string* str,
int RE2::GlobalReplace(std::string* str,
const RE2& re,
const StringPiece& rewrite) {
+ std::string out;
+ int count = _GlobalReplace(*str, re, rewrite, out);
+ if (count > 0) {
+ using std::swap;
+ swap(out, *str);
+ }
+ return count;
+}
+
+int RE2::GlobalReplace(const StringPiece& str,
+ const RE2& re,
+ const StringPiece& rewrite,
+ std::string& out) {
+ int count = _GlobalReplace(str, re, rewrite, out);
+ if (count == 0) {
+ out.append(str.data(), str.size());
+ }
+ return count;
+}
+
+int RE2::_GlobalReplace(const StringPiece& str,
+ const RE2& re,
+ const StringPiece& rewrite,
+ std::string& out) {
StringPiece vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
@@ -468,17 +492,16 @@ int RE2::GlobalReplace(std::string* str,
if (nvec > static_cast<int>(arraysize(vec)))
return false;
- const char* p = str->data();
- const char* ep = p + str->size();
+ const char* p = str.data();
+ const char* ep = p + str.size();
const char* lastend = NULL;
- std::string out;
int count = 0;
while (p <= ep) {
if (maximum_global_replace_count != -1 &&
count >= maximum_global_replace_count)
break;
- if (!re.Match(*str, static_cast<size_t>(p - str->data()),
- str->size(), UNANCHORED, vec, nvec))
+ if (!re.Match(str, static_cast<size_t>(p - str.data()),
+ str.size(), UNANCHORED, vec, nvec))
break;
if (p < vec[0].data())
out.append(p, vec[0].data() - p);
@@ -523,8 +546,6 @@ int RE2::GlobalReplace(std::string* str,
if (p < ep)
out.append(p, ep - p);
- using std::swap;
- swap(out, *str);
return count;
}
diff --git a/re2/re2.h b/re2/re2.h
index 1d82518..7da0922 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -485,6 +485,11 @@ class RE2 {
const RE2& re,
const StringPiece& rewrite);
+ static int GlobalReplace(const StringPiece& str,
+ const RE2& re,
+ const StringPiece& rewrite,
+ std::string& out);
+
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
@@ -767,6 +772,11 @@ class RE2 {
re2::Prog* ReverseProg() const;
+ static int _GlobalReplace(const StringPiece& str,
+ const RE2& re,
+ const StringPiece& rewrite,
+ std::string& out);
+
// First cache line is relatively cold fields.
const std::string* pattern_; // string regular expression
Options options_; // option flags

8
thirdparty/vars.sh vendored
View File

@ -165,10 +165,10 @@ CURL_SOURCE=curl-7.79.0
CURL_MD5SUM="b40e4dc4bbc9e109c330556cd58c8ec8"
# RE2
RE2_DOWNLOAD="https://github.com/google/re2/archive/2017-05-01.tar.gz"
RE2_NAME=re2-2017-05-01.tar.gz
RE2_SOURCE=re2-2017-05-01
RE2_MD5SUM="4aa65a0b22edacb7ddcd7e4aec038dcf"
RE2_DOWNLOAD="https://github.com/google/re2/archive/refs/tags/2022-12-01.tar.gz"
RE2_NAME=re2-2022-12-01.tar.gz
RE2_SOURCE=re2-2022-12-01
RE2_MD5SUM="f25d7b06a3e7747ecbb2f12d48be61cd"
# boost
BOOST_DOWNLOAD="https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.gz"