[Enhancement] Optimize the performance of regexp_replace (#16356)
Performance case: ``` SELECT max(REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1')) AS k, COUNT(*) AS c FROM hits where Referer<>''; ``` Results — baseline: 3.55; after upgrading re2: 2.75; after upgrading re2 and optimizing allocation: 2.12.
This commit is contained in:
parent
147b3360e0
commit
8c04b6c240
|
|
@ -2818,6 +2818,7 @@ static ColumnPtr regexp_replace_const(re2::RE2* const_re, const Columns& columns
|
|||
|
||||
auto size = columns[0]->size();
|
||||
ColumnBuilder<TYPE_VARCHAR> result(size);
|
||||
std::string result_str;
|
||||
for (int row = 0; row < size; ++row) {
|
||||
if (str_viewer.is_null(row) || rpl_viewer.is_null(row)) {
|
||||
result.append_null();
|
||||
|
|
@ -2827,8 +2828,9 @@ static ColumnPtr regexp_replace_const(re2::RE2* const_re, const Columns& columns
|
|||
auto rpl_value = rpl_viewer.value(row);
|
||||
re2::StringPiece rpl_str = re2::StringPiece(rpl_value.get_data(), rpl_value.get_size());
|
||||
auto str_value = str_viewer.value(row);
|
||||
std::string result_str(str_value.get_data(), str_value.get_size());
|
||||
re2::RE2::GlobalReplace(&result_str, *const_re, rpl_str);
|
||||
re2::StringPiece str_str = re2::StringPiece(str_value.get_data(), str_value.get_size());
|
||||
result_str.clear();
|
||||
re2::RE2::GlobalReplace(str_str, *const_re, rpl_str, result_str);
|
||||
result.append(Slice(result_str.data(), result_str.size()));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -437,7 +437,8 @@ build_re2() {
|
|||
check_if_source_exist $RE2_SOURCE
|
||||
cd $TP_SOURCE_DIR/$RE2_SOURCE
|
||||
|
||||
$CMAKE_CMD -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=0 -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR
|
||||
$CMAKE_CMD -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release \
|
||||
-DBUILD_SHARED_LIBS=0 -DCMAKE_INSTALL_PREFIX=$TP_INSTALL_DIR -DCMAKE_INSTALL_LIBDIR=lib
|
||||
${BUILD_SYSTEM} -j$PARALLEL install
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -256,7 +256,7 @@ echo "Finished patching $GLOG_SOURCE"
|
|||
# re2 patch
|
||||
cd $TP_SOURCE_DIR/$RE2_SOURCE
|
||||
if [ ! -f $PATCHED_MARK ]; then
|
||||
patch -p0 < $TP_PATCH_DIR/re2-2017-05-01.patch
|
||||
patch -p1 < $TP_PATCH_DIR/re2-2022-12-01.patch
|
||||
touch $PATCHED_MARK
|
||||
fi
|
||||
cd -
|
||||
|
|
|
|||
|
|
@ -0,0 +1,100 @@
|
|||
commit 6bfac7c766bddb2ac9eda2c2acd98009b9da95bd
|
||||
Author: stdpain <drfeng08@gmail.com>
|
||||
Date: Mon Jan 9 10:00:35 2023 +0800
|
||||
|
||||
Add an interface to reuse memory for GlobalReplace
|
||||
|
||||
diff --git a/re2/re2.cc b/re2/re2.cc
|
||||
index b24c6d6..9d4969e 100644
|
||||
--- a/re2/re2.cc
|
||||
+++ b/re2/re2.cc
|
||||
@@ -461,6 +461,30 @@ bool RE2::Replace(std::string* str,
|
||||
int RE2::GlobalReplace(std::string* str,
|
||||
const RE2& re,
|
||||
const StringPiece& rewrite) {
|
||||
+ std::string out;
|
||||
+ int count = _GlobalReplace(*str, re, rewrite, out);
|
||||
+ if (count > 0) {
|
||||
+ using std::swap;
|
||||
+ swap(out, *str);
|
||||
+ }
|
||||
+ return count;
|
||||
+}
|
||||
+
|
||||
+int RE2::GlobalReplace(const StringPiece& str,
|
||||
+ const RE2& re,
|
||||
+ const StringPiece& rewrite,
|
||||
+ std::string& out) {
|
||||
+ int count = _GlobalReplace(str, re, rewrite, out);
|
||||
+ if (count == 0) {
|
||||
+ out.append(str.data(), str.size());
|
||||
+ }
|
||||
+ return count;
|
||||
+}
|
||||
+
|
||||
+int RE2::_GlobalReplace(const StringPiece& str,
|
||||
+ const RE2& re,
|
||||
+ const StringPiece& rewrite,
|
||||
+ std::string& out) {
|
||||
StringPiece vec[kVecSize];
|
||||
int nvec = 1 + MaxSubmatch(rewrite);
|
||||
if (nvec > 1 + re.NumberOfCapturingGroups())
|
||||
@@ -468,17 +492,16 @@ int RE2::GlobalReplace(std::string* str,
|
||||
if (nvec > static_cast<int>(arraysize(vec)))
|
||||
return false;
|
||||
|
||||
- const char* p = str->data();
|
||||
- const char* ep = p + str->size();
|
||||
+ const char* p = str.data();
|
||||
+ const char* ep = p + str.size();
|
||||
const char* lastend = NULL;
|
||||
- std::string out;
|
||||
int count = 0;
|
||||
while (p <= ep) {
|
||||
if (maximum_global_replace_count != -1 &&
|
||||
count >= maximum_global_replace_count)
|
||||
break;
|
||||
- if (!re.Match(*str, static_cast<size_t>(p - str->data()),
|
||||
- str->size(), UNANCHORED, vec, nvec))
|
||||
+ if (!re.Match(str, static_cast<size_t>(p - str.data()),
|
||||
+ str.size(), UNANCHORED, vec, nvec))
|
||||
break;
|
||||
if (p < vec[0].data())
|
||||
out.append(p, vec[0].data() - p);
|
||||
@@ -523,8 +546,6 @@ int RE2::GlobalReplace(std::string* str,
|
||||
|
||||
if (p < ep)
|
||||
out.append(p, ep - p);
|
||||
- using std::swap;
|
||||
- swap(out, *str);
|
||||
return count;
|
||||
}
|
||||
|
||||
diff --git a/re2/re2.h b/re2/re2.h
|
||||
index 1d82518..7da0922 100644
|
||||
--- a/re2/re2.h
|
||||
+++ b/re2/re2.h
|
||||
@@ -485,6 +485,11 @@ class RE2 {
|
||||
const RE2& re,
|
||||
const StringPiece& rewrite);
|
||||
|
||||
+ static int GlobalReplace(const StringPiece& str,
|
||||
+ const RE2& re,
|
||||
+ const StringPiece& rewrite,
|
||||
+ std::string& out);
|
||||
+
|
||||
// Like Replace, except that if the pattern matches, "rewrite"
|
||||
// is copied into "out" with substitutions. The non-matching
|
||||
// portions of "text" are ignored.
|
||||
@@ -767,6 +772,11 @@ class RE2 {
|
||||
|
||||
re2::Prog* ReverseProg() const;
|
||||
|
||||
+ static int _GlobalReplace(const StringPiece& str,
|
||||
+ const RE2& re,
|
||||
+ const StringPiece& rewrite,
|
||||
+ std::string& out);
|
||||
+
|
||||
// First cache line is relatively cold fields.
|
||||
const std::string* pattern_; // string regular expression
|
||||
Options options_; // option flags
|
||||
|
|
@ -165,10 +165,10 @@ CURL_SOURCE=curl-7.79.0
|
|||
CURL_MD5SUM="b40e4dc4bbc9e109c330556cd58c8ec8"
|
||||
|
||||
# RE2
|
||||
RE2_DOWNLOAD="https://github.com/google/re2/archive/2017-05-01.tar.gz"
|
||||
RE2_NAME=re2-2017-05-01.tar.gz
|
||||
RE2_SOURCE=re2-2017-05-01
|
||||
RE2_MD5SUM="4aa65a0b22edacb7ddcd7e4aec038dcf"
|
||||
RE2_DOWNLOAD="https://github.com/google/re2/archive/refs/tags/2022-12-01.tar.gz"
|
||||
RE2_NAME=re2-2022-12-01.tar.gz
|
||||
RE2_SOURCE=re2-2022-12-01
|
||||
RE2_MD5SUM="f25d7b06a3e7747ecbb2f12d48be61cd"
|
||||
|
||||
# boost
|
||||
BOOST_DOWNLOAD="https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.gz"
|
||||
|
|
|
|||
Loading…
Reference in New Issue