[BugFix] Fix throw exception issue in low-cardinality optimization error in ALLOW_THROW_EXCEPTION mode (backport #62098) (#62144)

Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
This commit is contained in:
mergify[bot] 2025-08-20 10:07:30 +00:00 committed by GitHub
parent c42eaf88df
commit ed1d4cc111
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 393 additions and 24 deletions

View File

@ -45,6 +45,7 @@
#include "gutil/casts.h"
#include "gutil/strings/substitute.h"
#include "runtime/datetime_value.h"
#include "runtime/exception.h"
#include "runtime/runtime_state.h"
#include "runtime/types.h"
#include "types/hll.h"
@ -65,13 +66,13 @@ namespace starrocks {
#define THROW_RUNTIME_ERROR_WITH_TYPE(TYPE) \
std::stringstream ss; \
ss << "not supported type " << type_to_string(TYPE); \
throw std::runtime_error(ss.str())
throw RuntimeException(ss.str())
#define THROW_RUNTIME_ERROR_WITH_TYPES_AND_VALUE(FROMTYPE, TOTYPE, VALUE) \
std::stringstream ss; \
ss << "cast from " << type_to_string(FROMTYPE) << "(" << VALUE << ")" \
<< " to " << type_to_string(TOTYPE) << " failed"; \
throw std::runtime_error(ss.str())
throw RuntimeException(ss.str())
template <LogicalType FromType, LogicalType ToType, bool AllowThrowException = false>
struct CastFn {

View File

@ -39,6 +39,7 @@ StatusOr<ColumnPtr> DictMappingExpr::evaluate_checked(ExprContext* context, Chun
// do array-expression first, then string expression
if (_children.size() == 2) {
auto target_column = ptr->get_column_by_slot_id(slot_id());
DCHECK(!target_column->only_null());
auto data_column = ColumnHelper::get_data_column(target_column.get());
if (data_column->is_binary()) {

View File

@ -50,6 +50,7 @@
#include "gutil/strings/fastmem.h"
#include "gutil/strings/strip.h"
#include "gutil/strings/substitute.h"
#include "runtime/exception.h"
#include "runtime/runtime_state.h"
#include "storage/olap_define.h"
#include "types/large_int_value.h"
@ -62,10 +63,10 @@ namespace starrocks {
// A regex to match any regex pattern is equivalent to a substring search.
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]+)(?:\.\*)*)", re2::RE2::Quiet);
#define THROW_RUNTIME_ERROR_IF_EXCEED_LIMIT(col, func_name) \
if (UNLIKELY(!col->capacity_limit_reached().ok())) { \
col->reset_column(); \
throw std::runtime_error("binary column exceed 4G in function " #func_name); \
#define THROW_RUNTIME_ERROR_IF_EXCEED_LIMIT(col, func_name) \
if (UNLIKELY(!col->capacity_limit_reached().ok())) { \
col->reset_column(); \
throw RuntimeException("binary column exceed 4G in function " #func_name); \
}
#define RETURN_COLUMN(stmt, func_name) \

View File

@ -0,0 +1,29 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdexcept>
namespace starrocks {
// Exception type for runtime failures whose stack trace is of no interest.
//
// Demangling is a very time-consuming operation, and for certain
// runtime_errors the specific stack trace does not matter. Throw sites raise
// this dedicated subclass instead of a bare std::runtime_error so that such
// failures can be recognised by their type.
// NOTE(review): the code that actually skips stack-trace printing for this
// type is not part of this header -- confirm where it lives.
class RuntimeException final : public std::runtime_error {
public:
    // Both constructors are `explicit` so that a string can never be silently
    // converted into an exception object; `throw RuntimeException(msg)` is
    // direct-initialization and is unaffected.
    explicit RuntimeException(const std::string& cause) : std::runtime_error(cause) {}
    explicit RuntimeException(const char* cause) : std::runtime_error(cause) {}
};
} // namespace starrocks

View File

@ -108,7 +108,28 @@ public:
Expr* clone(ObjectPool* pool) const override { return pool->add(new DictFuncExpr(_origin_expr, _dict_opt_ctx)); }
private:
ColumnPtr _translate_string(ColumnPtr& input, size_t num_rows) {
StatusOr<ColumnPtr> _translate_string(ColumnPtr& input, size_t num_rows) {
if (_dict_opt_ctx->err_status.has_value()) {
if (input->only_null()) {
RETURN_IF_ERROR((*_dict_opt_ctx->err_status)[_dict_opt_ctx->code_convert_map[0]]);
} else if (input->is_constant()) {
auto data_column = ColumnHelper::get_data_column(input);
size_t idx = data_column->get(0).get_int32();
RETURN_IF_ERROR((*_dict_opt_ctx->err_status)[_dict_opt_ctx->code_convert_map[idx]]);
} else {
if (input->is_nullable()) {
auto* nullable_column = down_cast<NullableColumn*>(input.get());
nullable_column->fill_null_with_default();
}
const auto* data_column =
down_cast<const LowCardDictColumn*>(ColumnHelper::get_data_column(input.get()));
const auto& dicts_data = data_column->get_data();
for (size_t i = 0; i < num_rows; ++i) {
RETURN_IF_ERROR((*_dict_opt_ctx->err_status)[_dict_opt_ctx->code_convert_map[dicts_data[i]]]);
}
}
}
if (_always_null) {
return ColumnHelper::create_const_null_column(num_rows);
}
@ -121,13 +142,12 @@ private:
// is const column
if (input->only_null() || input->is_constant()) {
if (_null_column_ptr && _null_column_ptr.get()->is_null(0)) {
if (input->only_null() && _null_column_ptr && _null_column_ptr.get()->is_null(0)) {
return ColumnHelper::create_const_null_column(num_rows);
} else {
auto idx = input->get(0);
auto idx = input->only_null() ? 0 : input->get(0).get_int32();
auto res = _data_column_ptr->clone_empty();
res->append_datum(_data_column_ptr->get(idx.get_int32()));
res->append_datum(_data_column_ptr->get(_dict_opt_ctx->code_convert_map[idx]));
return ConstColumn::create(std::move(res));
}
} else if (input->is_nullable()) {
@ -166,7 +186,7 @@ private:
}
}
ColumnPtr _translate_array(ColumnPtr& array, size_t num_rows) {
StatusOr<ColumnPtr> _translate_array(ColumnPtr& array, size_t num_rows) {
if ((array->only_null())) {
return ColumnHelper::create_const_null_column(num_rows);
}
@ -181,7 +201,7 @@ private:
auto element = array_col->elements_column();
auto offsets = UInt32Column::create(array_col->offsets());
ColumnPtr string_col = _translate_string(element, element->size());
ASSIGN_OR_RETURN(ColumnPtr string_col, _translate_string(element, element->size()));
string_col = ColumnHelper::unfold_const_column(stringType, element->size(), string_col);
return ConstColumn::create(ArrayColumn::create(string_col, std::move(offsets)), num_rows);
} else if (array->is_nullable()) {
@ -192,7 +212,7 @@ private:
auto element = array_col->elements_column();
auto offsets = UInt32Column::create(array_col->offsets());
ColumnPtr string_col = _translate_string(element, element->size());
ASSIGN_OR_RETURN(ColumnPtr string_col, _translate_string(element, element->size()));
string_col = ColumnHelper::unfold_const_column(stringType, element->size(), string_col);
return NullableColumn::create(ArrayColumn::create(string_col, std::move(offsets)), array_null);
} else {
@ -200,7 +220,7 @@ private:
auto element = array_col->elements_column();
auto offsets = UInt32Column::create(array_col->offsets());
ColumnPtr string_col = _translate_string(element, element->size());
ASSIGN_OR_RETURN(ColumnPtr string_col, _translate_string(element, element->size()));
string_col = ColumnHelper::unfold_const_column(stringType, element->size(), string_col);
return ArrayColumn::create(string_col, std::move(offsets));
}
@ -272,9 +292,41 @@ Status DictOptimizeParser::_eval_and_rewrite(ExprContext* ctx, Expr* expr, DictO
ChunkPtr temp_chunk = std::make_shared<Chunk>();
temp_chunk->append_column(binary_column, expr_slot_id);
// call inner expr with input column
ASSIGN_OR_RETURN(auto result_column, ctx->evaluate(origin_expr, temp_chunk.get()));
// assign convert mapping column
dict_opt_ctx->convert_column = result_column;
auto result_column = ctx->evaluate(origin_expr, temp_chunk.get());
if (UNLIKELY(!result_column.ok())) {
// Certain string inputs cause the expression to generate an error status. This branch handles such cases.
auto result = ColumnHelper::create_column(origin_expr->type(), true);
size_t num_rows = codes.size();
// slow path
std::vector<Status> err_status;
err_status.resize(num_rows);
auto input_column = binary_column->clone_empty();
temp_chunk->update_column(input_column, expr_slot_id);
for (size_t i = 0; i < num_rows; ++i) {
input_column->reset_column();
input_column->append(*binary_column, i, 1);
auto row_result = ctx->evaluate(origin_expr, temp_chunk.get());
if (row_result.ok()) {
if (row_result.value()->only_null()) {
result->append_nulls(1);
} else if (row_result.value()->is_constant()) {
result->append(*ColumnHelper::get_data_column(row_result.value().get()), 0, 1);
} else {
result->append(*row_result.value(), 0, 1);
}
} else {
result->append_nulls(1);
err_status[i] = row_result.status();
}
}
dict_opt_ctx->convert_column = std::move(result);
dict_opt_ctx->err_status = std::move(err_status);
} else {
// assign convert mapping column
dict_opt_ctx->convert_column = std::move(result_column.value());
}
// build code convert map
dict_opt_ctx->code_convert_map.resize(codes.size() + 1);
for (int i = 0; i < codes.size(); ++i) {
@ -290,7 +342,7 @@ Status DictOptimizeParser::_eval_and_rewrite(ExprContext* ctx, Expr* expr, DictO
(origin_expr->type().is_array_type() && dict_mapping->type().is_array_type() &&
origin_expr->type().children[0].type != dict_mapping->type().children[0].type)) {
DCHECK_GE(targetSlotId, 0);
ColumnViewer<TYPE_VARCHAR> viewer(result_column);
ColumnViewer<TYPE_VARCHAR> viewer(dict_opt_ctx->convert_column);
int num_rows = codes.size();
GlobalDictMap result_map;
@ -312,8 +364,6 @@ Status DictOptimizeParser::_eval_and_rewrite(ExprContext* ctx, Expr* expr, DictO
ctor(slice, id_allocator);
values.emplace_back(slice);
});
} else {
dict_opt_ctx->result_nullable = true;
}
}

View File

@ -16,6 +16,7 @@
#include <cstdint>
#include <map>
#include <optional>
#include <vector>
#include "column/column.h"
@ -37,13 +38,13 @@ class DictMappingExpr;
struct DictOptimizeContext {
bool could_apply_dict_optimize = false;
SlotId slot_id;
// if input was not nullable but output was nullable this flag will set true
bool result_nullable = false;
// size: dict codes.size() + 1
std::vector<int16_t> code_convert_map;
Filter filter;
// for no-string column convert map
ColumnPtr convert_column;
// error status
std::optional<std::vector<Status>> err_status;
};
class DictOptimizeParser {

View File

@ -184,6 +184,7 @@ set(EXEC_FILES
./exprs/min_max_predicate_test.cpp
./exprs/in_const_predicate_test.cpp
./exprs/table_function/test_list_rowsets.cpp
./exprs/dict_expr_test.cpp
./formats/csv/array_converter_test.cpp
./formats/csv/boolean_converter_test.cpp
./formats/csv/csv_file_writer_test.cpp

View File

@ -0,0 +1,216 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include <string>
#include "column/chunk.h"
#include "column/column_builder.h"
#include "column/column_helper.h"
#include "column/column_viewer.h"
#include "column/const_column.h"
#include "column/vectorized_fwd.h"
#include "common/object_pool.h"
#include "exprs/column_ref.h"
#include "exprs/dictmapping_expr.h"
#include "exprs/expr_context.h"
#include "exprs/placeholder_ref.h"
#include "gen_cpp/Types_types.h"
#include "runtime/exception.h"
#include "runtime/global_dict/config.h"
#include "runtime/mem_pool.h"
#include "runtime/runtime_state.h"
#include "runtime/types.h"
#include "testutil/assert.h"
#include "types/logical_type.h"
namespace starrocks {
// Test-only expression that delegates evaluation to a user-supplied callable.
//
// The expression is constructed with a VARCHAR type descriptor. On evaluation
// the first column of the input chunk is handed to `Provider`, and whatever the
// callable returns is returned as the result.
template <class Provider>
class ProvideExpr final : public Expr {
public:
    // `explicit`: a callable must never convert into an expression implicitly.
    // Direct-initialization such as `new ProvideExpr(callable)` still works.
    explicit ProvideExpr(Provider provider) : Expr(TypeDescriptor(TYPE_VARCHAR), false), _provider(provider) {}

    // Allocates a copy of this expression (provider included) and adds it to `pool`.
    Expr* clone(ObjectPool* pool) const override { return pool->add(new ProvideExpr(*this)); }

    // Passes the first column of `ptr` to the provider; `context` is not used.
    StatusOr<ColumnPtr> evaluate_checked(ExprContext* context, Chunk* ptr) override {
        return _provider(ptr->columns()[0]);
    }

private:
    Provider _provider;
};
class DictMappingTest : public ::testing::Test {
public:
void SetUp() override {
node.__set_node_type(TExprNodeType::DICT_EXPR);
TTypeNode n_type;
TScalarType scalar_type;
scalar_type.__set_type(TPrimitiveType::VARCHAR);
n_type.__set_scalar_type(scalar_type);
node.type.types.emplace_back(n_type);
std::vector<TGlobalDict> list;
TGlobalDict dict;
dict.__set_columnId(1);
dict.__set_ids(std::vector<int32_t>{0, 1, 2, 3, 4});
dict.__set_strings(std::vector<std::string>{"", "1", "2", "3", "4"});
list.emplace_back(dict);
state._obj_pool = std::make_shared<ObjectPool>();
state._instance_mem_pool = std::make_unique<MemPool>();
ASSERT_OK(state.init_query_global_dict(list));
}
public:
TExprNode node;
ObjectPool pool;
RuntimeState state;
Expr* dict_expr;
Expr* origin;
ExprContext* context;
};
// The wrapped expression never fails here: checks the strings produced for a
// nullable dictionary-code column and for a constant dictionary-code column.
TEST_F(DictMappingTest, test1) {
    // Inner expression: NULL -> "k", "4" -> "k1", any other value v -> "k" + v.
    auto mapper = [](ColumnPtr column) {
        ColumnViewer<TYPE_VARCHAR> viewer(column);
        ColumnBuilder<TYPE_VARCHAR> builder(5);
        const size_t row_count = column->size();
        for (size_t row = 0; row < row_count; ++row) {
            auto value = viewer.value(row);
            if (viewer.is_null(row)) {
                builder.append("k");
                continue;
            }
            if (value == "4") {
                builder.append("k1");
                continue;
            }
            builder.append("k" + value.to_string());
        }
        return builder.build(false);
    };

    // Expression tree: DictMappingExpr(slot #1, ProvideExpr(placeholder)).
    auto origin_expr = pool.add(new ProvideExpr(mapper));
    origin_expr->add_child(pool.add(new PlaceHolderRef(node)));
    auto slot_ref = pool.add(new ColumnRef(TypeDescriptor(TYPE_INT), 1));
    dict_expr = pool.add(new DictMappingExpr(node));
    context = pool.add(new ExprContext(dict_expr));
    dict_expr->add_child(slot_ref);
    dict_expr->add_child(origin_expr);

    ASSERT_OK(context->prepare(&state));
    ASSERT_OK(context->open(&state));

    // Nullable code column {1, 2, 3, 4} whose first row is marked NULL.
    {
        auto codes = Int32Column::create();
        for (int code : {1, 2, 3, 4}) {
            codes->get_data().emplace_back(code);
        }
        auto nullable_codes = NullableColumn::wrap_if_necessary(codes);
        down_cast<NullableColumn*>(nullable_codes.get())->set_null(0);

        auto chunk = std::make_unique<Chunk>();
        chunk->append_column(nullable_codes, 1);

        auto result = context->evaluate(chunk.get());
        ASSERT_OK(result.status());
        const auto& translated = result.value();
        EXPECT_EQ(translated->get(0).get_slice(), "k");
        EXPECT_EQ(translated->get(1).get_slice(), "k2");
        EXPECT_EQ(translated->get(2).get_slice(), "k3");
        EXPECT_EQ(translated->get(3).get_slice(), "k1");
    }

    // Constant code column holding the single code 1.
    {
        auto codes = Int32Column::create();
        codes->get_data().emplace_back(1);

        auto chunk = std::make_unique<Chunk>();
        chunk->append_column(ConstColumn::create(codes, 1), 1);

        auto result = context->evaluate(chunk.get());
        ASSERT_OK(result.status());
        EXPECT_EQ(result.value()->get(0).get_slice(), "k1");
    }
}
// The wrapped expression throws for the input string "4": checks which input
// chunks make evaluation report an error and which still succeed.
TEST_F(DictMappingTest, test_function_return_exception) {
    // Inner expression: NULL -> "k", "4" -> throws, any other value v -> "k" + v.
    auto mapper = [](ColumnPtr column) {
        ColumnViewer<TYPE_VARCHAR> viewer(column);
        ColumnBuilder<TYPE_VARCHAR> builder(5);
        const size_t row_count = column->size();
        for (size_t row = 0; row < row_count; ++row) {
            auto value = viewer.value(row);
            if (viewer.is_null(row)) {
                builder.append("k");
                continue;
            }
            if (value == "4") {
                throw RuntimeException("test return exception function");
            }
            builder.append("k" + value.to_string());
        }
        return builder.build(false);
    };

    // Expression tree: DictMappingExpr(slot #1, ProvideExpr(placeholder)).
    auto origin_expr = pool.add(new ProvideExpr(mapper));
    origin_expr->add_child(pool.add(new PlaceHolderRef(node)));
    auto slot_ref = pool.add(new ColumnRef(TypeDescriptor(TYPE_INT), 1));
    dict_expr = pool.add(new DictMappingExpr(node));
    context = pool.add(new ExprContext(dict_expr));
    dict_expr->add_child(slot_ref);
    dict_expr->add_child(origin_expr);

    ASSERT_OK(context->prepare(&state));
    ASSERT_OK(context->open(&state));

    // Builds a chunk whose slot #1 is a constant column holding `code`.
    auto make_const_chunk = [](int code) {
        auto codes = Int32Column::create();
        codes->get_data().emplace_back(code);
        auto chunk = std::make_unique<Chunk>();
        chunk->append_column(ConstColumn::create(codes, 1), 1);
        return chunk;
    };

    // Nullable code column {1, 2, 3, 4, 5} with a NULL first row: an error is expected.
    {
        auto codes = Int32Column::create();
        for (int code : {1, 2, 3, 4, 5}) {
            codes->get_data().emplace_back(code);
        }
        auto nullable_codes = NullableColumn::wrap_if_necessary(codes);
        down_cast<NullableColumn*>(nullable_codes.get())->set_null(0);

        auto chunk = std::make_unique<Chunk>();
        chunk->append_column(nullable_codes, 1);

        auto result = context->evaluate(chunk.get());
        ASSERT_ERROR(result.status());
    }

    // Constant code 1: evaluation succeeds and yields "k1".
    {
        auto chunk = make_const_chunk(1);
        auto result = context->evaluate(chunk.get());
        ASSERT_OK(result.status());
        EXPECT_EQ(result.value()->get(0).get_slice(), "k1");
    }

    // Constant code 4: an error is expected.
    {
        auto chunk = make_const_chunk(4);
        auto result = context->evaluate(chunk.get());
        ASSERT_ERROR(result.status());
    }
}
} // namespace starrocks

View File

@ -0,0 +1,49 @@
-- name: test_dict_exception_expr
set sql_mode='ALLOW_THROW_EXCEPTION';
-- result:
-- !result
create table t_low(c1 int, c2 string);
-- result:
-- !result
insert into t_low values (1, null);
-- result:
-- !result
function: wait_global_dict_ready('c2', 't_low')
-- result:
-- !result
select c1, cast (c2 as int) as name from t_low;
-- result:
1 None
-- !result
insert into t_low values (2, "");
-- result:
-- !result
insert into t_low values (3, "1");
-- result:
-- !result
insert into t_low values (4, "2");
-- result:
-- !result
function: wait_global_dict_ready('c2', 't_low')
-- result:
-- !result
select c1, cast (c2 as int) as name from t_low where c1 = 3;
-- result:
3 1
-- !result
select c1, cast (c2 as int) as name from t_low where c1 = 4;
-- result:
4 2
-- !result
select c1, cast (c2 as int) as name from t_low where c1 != 2;
-- result:
1 None
3 1
4 2
-- !result
select c1, cast (c2 as int) as name from t_low where c1 = 2;
-- result:
[REGEX].*Expr evaluate meet error*
-- !result

View File

@ -0,0 +1,20 @@
-- name: test_dict_exception_expr
set sql_mode='ALLOW_THROW_EXCEPTION';
create table t_low(c1 int, c2 string);
insert into t_low values (1, null);
function: wait_global_dict_ready('c2', 't_low')
select c1, cast (c2 as int) as name from t_low;
insert into t_low values (2, "");
insert into t_low values (3, "1");
insert into t_low values (4, "2");
function: wait_global_dict_ready('c2', 't_low')
select c1, cast (c2 as int) as name from t_low where c1 = 3;
select c1, cast (c2 as int) as name from t_low where c1 = 4;
select c1, cast (c2 as int) as name from t_low where c1 != 2;
select c1, cast (c2 as int) as name from t_low where c1 = 2;