[BugFix] Fix the crash issue caused by using an uninitialized column evaluator in iceberg partition writer. (#63782)

Signed-off-by: GavinMar <yangguansuo@starrocks.com>
This commit is contained in:
Gavin 2025-10-05 20:15:06 -07:00 committed by GitHub
parent 647ed56800
commit 3b454d6ec9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 26 additions and 24 deletions

View File

@ -56,8 +56,9 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> FileChunkSinkProvider::create_chun
std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators), std::nullopt,
ctx->executor, runtime_state);
fs, ctx->compression_type, ctx->options, ctx->column_names,
std::make_shared<std::vector<std::unique_ptr<ColumnEvaluator>>>(std::move(column_evaluators)),
std::nullopt, ctx->executor, runtime_state);
} else if (boost::iequals(ctx->format, formats::ORC)) {
file_writer_factory = std::make_shared<formats::ORCFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators), ctx->executor,

View File

@ -66,7 +66,8 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> HiveChunkSinkProvider::create_chun
ctx->options[formats::ParquetWriterOptions::USE_LEGACY_DECIMAL_ENCODING] = "true";
ctx->options[formats::ParquetWriterOptions::USE_INT96_TIMESTAMP_ENCODING] = "true";
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->data_column_names, std::move(data_column_evaluators),
fs, ctx->compression_type, ctx->options, ctx->data_column_names,
std::make_shared<std::vector<std::unique_ptr<ColumnEvaluator>>>(std::move(data_column_evaluators)),
std::nullopt, ctx->executor, runtime_state);
} else if (boost::iequals(ctx->format, formats::ORC)) {
file_writer_factory = std::make_shared<formats::ORCFileWriterFactory>(

View File

@ -82,7 +82,8 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> IcebergChunkSinkProvider::create_c
auto ctx = std::dynamic_pointer_cast<IcebergChunkSinkContext>(context);
auto runtime_state = ctx->fragment_context->runtime_state();
std::shared_ptr<FileSystem> fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
auto column_evaluators = ColumnEvaluator::clone(ctx->column_evaluators);
auto column_evaluators = std::make_shared<std::vector<std::unique_ptr<ColumnEvaluator>>>(
ColumnEvaluator::clone(ctx->column_evaluators));
auto location_provider = std::make_shared<connector::LocationProvider>(
ctx->path, print_id(ctx->fragment_context->query_id()), runtime_state->be_number(), driver_id,
boost::to_lower_copy(ctx->format));
@ -93,8 +94,8 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> IcebergChunkSinkProvider::create_c
std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->parquet_field_ids, ctx->executor, runtime_state);
fs, ctx->compression_type, ctx->options, ctx->column_names, column_evaluators, ctx->parquet_field_ids,
ctx->executor, runtime_state);
} else {
file_writer_factory = std::make_shared<formats::UnknownFileWriterFactory>(ctx->format);
}
@ -107,7 +108,7 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> IcebergChunkSinkProvider::create_c
fs,
ctx->fragment_context,
runtime_state->desc_tbl().get_tuple_descriptor(ctx->tuple_desc_id),
&ctx->column_evaluators,
column_evaluators,
ctx->sort_ordering});
partition_chunk_writer_factory = std::make_unique<SpillPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
} else {

View File

@ -50,7 +50,7 @@ struct SpillPartitionChunkWriterContext : public PartitionChunkWriterContext {
std::shared_ptr<FileSystem> fs;
pipeline::FragmentContext* fragment_context = nullptr;
TupleDescriptor* tuple_desc = nullptr;
std::vector<std::unique_ptr<ColumnEvaluator>>* column_evaluators;
std::shared_ptr<std::vector<std::unique_ptr<ColumnEvaluator>>> column_evaluators;
std::shared_ptr<SortOrdering> sort_ordering;
};
@ -185,13 +185,12 @@ private:
std::shared_ptr<FileSystem> _fs = nullptr;
pipeline::FragmentContext* _fragment_context = nullptr;
TupleDescriptor* _tuple_desc = nullptr;
std::vector<std::unique_ptr<ColumnEvaluator>>* _column_evaluators;
std::shared_ptr<std::vector<std::unique_ptr<ColumnEvaluator>>> _column_evaluators;
std::shared_ptr<SortOrdering> _sort_ordering;
std::unique_ptr<ThreadPoolToken> _chunk_spill_token;
std::unique_ptr<ThreadPoolToken> _block_merge_token;
std::unique_ptr<LoadSpillBlockManager> _load_spill_block_mgr;
std::shared_ptr<LoadChunkSpiller> _load_chunk_spiller;
//std::function<StatusOr<ColumnPtr>(Chunk*, size_t)> _column_eval_func;
TUniqueId _writer_id;
std::list<ChunkPtr> _chunks;

View File

@ -455,13 +455,12 @@ Status ParquetFileWriter::init() {
ParquetFileWriter::~ParquetFileWriter() = default;
ParquetFileWriterFactory::ParquetFileWriterFactory(std::shared_ptr<FileSystem> fs,
TCompressionType::type compression_type,
std::map<std::string, std::string> options,
std::vector<std::string> column_names,
std::vector<std::unique_ptr<ColumnEvaluator>>&& column_evaluators,
std::optional<std::vector<formats::FileColumnId>> field_ids,
PriorityThreadPool* executors, RuntimeState* runtime_state)
ParquetFileWriterFactory::ParquetFileWriterFactory(
std::shared_ptr<FileSystem> fs, TCompressionType::type compression_type,
std::map<std::string, std::string> options, std::vector<std::string> column_names,
std::shared_ptr<std::vector<std::unique_ptr<ColumnEvaluator>>> column_evaluators,
std::optional<std::vector<formats::FileColumnId>> field_ids, PriorityThreadPool* executors,
RuntimeState* runtime_state)
: _fs(std::move(fs)),
_compression_type(compression_type),
_field_ids(std::move(field_ids)),
@ -472,7 +471,7 @@ ParquetFileWriterFactory::ParquetFileWriterFactory(std::shared_ptr<FileSystem> f
_runtime_state(runtime_state) {}
Status ParquetFileWriterFactory::init() {
RETURN_IF_ERROR(ColumnEvaluator::init(_column_evaluators));
RETURN_IF_ERROR(ColumnEvaluator::init(*_column_evaluators));
_parsed_options = std::make_shared<ParquetWriterOptions>();
_parsed_options->column_ids = _field_ids;
if (_options.contains(ParquetWriterOptions::USE_LEGACY_DECIMAL_ENCODING)) {
@ -506,8 +505,8 @@ StatusOr<WriterAndStream> ParquetFileWriterFactory::create(const std::string& pa
auto rollback_action = [fs = _fs, path = path]() {
WARN_IF_ERROR(ignore_not_found(fs->delete_file(path)), "fail to delete file");
};
auto column_evaluators = ColumnEvaluator::clone(_column_evaluators);
auto types = ColumnEvaluator::types(_column_evaluators);
auto column_evaluators = ColumnEvaluator::clone(*_column_evaluators);
auto types = ColumnEvaluator::types(*_column_evaluators);
auto async_output_stream =
std::make_unique<io::AsyncFlushOutputStream>(std::move(file), _executors, _runtime_state);
auto parquet_output_stream = std::make_shared<parquet::AsyncParquetOutputStream>(async_output_stream.get());

View File

@ -162,7 +162,7 @@ class ParquetFileWriterFactory : public FileWriterFactory {
public:
ParquetFileWriterFactory(std::shared_ptr<FileSystem> fs, TCompressionType::type compression_type,
std::map<std::string, std::string> options, std::vector<std::string> column_names,
std::vector<std::unique_ptr<ColumnEvaluator>>&& column_evaluators,
std::shared_ptr<std::vector<std::unique_ptr<ColumnEvaluator>>> column_evaluators,
std::optional<std::vector<formats::FileColumnId>> field_ids, PriorityThreadPool* executors,
RuntimeState* runtime_state);
@ -178,7 +178,7 @@ private:
std::shared_ptr<ParquetWriterOptions> _parsed_options;
std::vector<std::string> _column_names;
std::vector<std::unique_ptr<ColumnEvaluator>> _column_evaluators;
std::shared_ptr<std::vector<std::unique_ptr<ColumnEvaluator>>> _column_evaluators;
PriorityThreadPool* _executors = nullptr;
RuntimeState* _runtime_state = nullptr;
};

View File

@ -1123,10 +1123,11 @@ TEST_F(ParquetFileWriterTest, TestFactory) {
std::vector<TypeDescriptor> type_descs{type_bool};
auto column_names = _make_type_names(type_descs);
auto column_evaluators = ColumnSlotIdEvaluator::from_types(type_descs);
auto column_evaluators = std::make_shared<std::vector<std::unique_ptr<ColumnEvaluator>>>(
ColumnSlotIdEvaluator::from_types(type_descs));
auto fs = std::make_shared<MemoryFileSystem>();
auto factory = formats::ParquetFileWriterFactory(fs, TCompressionType::NO_COMPRESSION, {}, column_names,
std::move(column_evaluators), std::nullopt, nullptr, nullptr);
column_evaluators, std::nullopt, nullptr, nullptr);
ASSERT_OK(factory.init());
auto maybe_writer = factory.create(_file_path);
ASSERT_OK(maybe_writer.status());