Compare commits


149 Commits

Author SHA1 Message Date
yan.zhang 045d90a389 Remove iceberg column stats (#4699)
Setting ndv=1 is dangerous behaviour, and the optimizer may give you an unexpected plan.
2022-04-07 14:32:56 +08:00
stdpain 28233b0e2e support a faster unique id implement (#4701)
(cherry picked from commit 1b4c54a2d2)
2022-04-06 23:45:36 +08:00
liuyehcf 35573be541 Modify the statistical method of WaitTime of Exchange (#4733) 2022-04-06 16:26:15 +08:00
liuyehcf f0ef8923c3 Add ExecutionTotalTime (#4714) 2022-04-06 16:26:15 +08:00
liuyehcf 6aa5e81a59 Pipeline profile prune (#4695) 2022-04-06 16:26:15 +08:00
liuyehcf bae0287b4a Update profile merge strategy (#4604) 2022-04-06 16:26:15 +08:00
liuyehcf af0259fd88 Fix use destroyed thrift obj (#4641) 2022-04-06 16:26:15 +08:00
liuyehcf f5335c9bea Fix NPE of CTE (#4562) 2022-04-06 16:26:15 +08:00
liuyehcf b0905c7fc8 Fix uninitialized sql_sort_keys (#4631) 2022-04-06 16:26:15 +08:00
liuyehcf a06cae9553 Pipeline Profile add more metrics (#4443) 2022-04-06 16:26:15 +08:00
liuyehcf 62c2375434 Add round/decimal tests (#4561) 2022-04-06 16:26:15 +08:00
liuyehcf 939185b990 Profile add metrics (#4414) 2022-04-06 16:26:15 +08:00
mergify[bot] 8ee336b6c2
Fix meta fields duplication issue for hudi table (#4732) (#4750)
(cherry picked from commit b5d958a9cd)

Co-authored-by: miomiocat <284487410@qq.com>
2022-04-06 15:49:54 +08:00
stdpain 03813be0fd
[cherry-pick][branch-2.2] Add memory limit check when evaluate expression (#4368) (#4825) 2022-04-06 15:29:11 +08:00
mergify[bot] 2a8d9afbd9
fix #4818: getTypeSize of INVALID_TYPE (#4820) (#4822)
(cherry picked from commit 48fcb5290f)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-04-06 13:57:58 +08:00
ricky c8e32d164b
Be crash caused by invalid json root (#4726) (#4790)
The pull request fixes the crash caused by an invalid JSON root. Because the JsonPath is refactored in branch-2.2, this bug only happens in branch-2.1. Nevertheless, the fix is merged into branch-2.2 as well to make it more robust.
2022-04-06 10:35:15 +08:00
mergify[bot] 88eb86af21
Only TResultSinkType.MYSQL_PROTOCAL ResultSink can use pipeline (#4729) (#4752) 2022-04-03 00:15:18 +08:00
stdpain 3a891ce57d Fix get wrong function id in UDTF (#4706)
fn.fid for UDF/UDAF/UDTF was always 0; we should use fn.id to load or save jar libs.

(cherry picked from commit ad593d4d6c)
2022-04-02 15:47:21 +08:00
wyb 35e3df87b7
Bugfix: copy the last row in array column replace aggregator (#4709) (#4744)
Currently, the column in the replace aggregator's ArrayState is the same column held by the aggregate iterator.
When the iterator processes the next chunk, the previous column is overwritten or reset.
So copy the column when processing the last row in the array-column replace aggregator.
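The aliasing problem can be sketched in a few lines (ReplaceStateShared/ReplaceStateCopied are hypothetical types, not the actual aggregator classes):

```cpp
#include <cassert>
#include <vector>

// Minimal sketch: a "replace" aggregator keeps the last array value per
// group. If the state merely aliases the iterator's column, the saved value
// is silently rewritten when the next chunk reuses that column; a deep copy
// of the last row is what this fix performs.
using ArrayColumn = std::vector<int>;

struct ReplaceStateShared {
    const ArrayColumn* last = nullptr;          // aliases the live column
    void update(const ArrayColumn& col) { last = &col; }
};

struct ReplaceStateCopied {
    ArrayColumn last;                           // owns a copy of the last row
    void update(const ArrayColumn& col) { last = col; }
};
```

With the shared state, rewriting the column for the next chunk changes what the aggregator "remembers"; the copied state keeps the original last row.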
2022-04-02 15:03:46 +08:00
miomiocat 1b3d7a335f
Support cleaning meta cache and fix null partition value exception for hudi table (#4599) (#4731) 2022-04-02 14:55:58 +08:00
mergify[bot] 5b5819df96
Skip zero length file in hdfs scan node (#4700) (#4747)
* skip zero length file

* skip zero length file at frontend

(cherry picked from commit 43691b70ac)

Co-authored-by: yan.zhang <dirtysalt1987@gmail.com>
2022-04-02 14:44:21 +08:00
gengjun-git 7c3e4bebc1
BugFix: schema change will not store the tablet to TabletInvertedIndex (#4725) (#4739)
Put the tablet into TabletInvertedIndex in onFinished.
2022-04-02 13:52:09 +08:00
mergify[bot] 4ebd4f5a2f
Don't obtain hms event id when the hive_incremental_sync is false (#4190) (#4435)
(cherry picked from commit a9cdb810a2)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-04-02 11:54:12 +08:00
ricky 4844a5bf2a
Refactor: move JsonParser to separate file (#4586) (#4692) 2022-04-02 10:21:00 +08:00
mergify[bot] e3fa553ac1
fixup merge_partial_bloom_filters (#4685) (#4704) 2022-04-01 20:33:48 +08:00
miomiocat 66b5604d2a
Support create table like statement for iceberg and hudi table (#4607) (#4697) 2022-04-01 16:32:53 +08:00
zihe.liu 95199420c2
Rehash for local shuffle to avoid data skew (#4629) (#4684) 2022-04-01 10:05:52 +08:00
mergify[bot] 4f9bf40d15
Skip checksum for json (#4673) (#4679)
(cherry picked from commit e1b1d4481e)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-03-31 22:32:55 +08:00
stdpain 5b569fec0f Fix thirdparty compile error in docker (#4517)
(cherry picked from commit 57475c4a17)
2022-03-31 21:23:22 +08:00
stdpain dd3cc161f3 Add fast fail when local dictionary size exceed max size (#4652)
(cherry picked from commit e6f4333f4a)
2022-03-31 21:22:40 +08:00
mergify[bot] 4cc0174f71
Support sort_nulls_last mode (#4296) (#4653)
(cherry picked from commit 6a1f5a650d)

Co-authored-by: Seaven <seaven_7@qq.com>
2022-03-31 20:06:36 +08:00
Pslydhh 3ff6a5f847
Provide matched num_rows when array's columns is empty (#4442) (#4658)
Provide the right num_rows when the array's columns are empty.
2022-03-31 19:11:06 +08:00
stdpain 043e7a4a64 fix UDTF crash in pipeline engine (#4634)
TableFunction::open was not called in the pipeline engine, which caused a BE
crash in the pipeline engine.

(cherry picked from commit d338bbe0fc)
2022-03-31 19:08:34 +08:00
stdpain ce50b2f1fa Add fast fail when collecting dictionary (#4624)
(cherry picked from commit 00af1f2c9d)
2022-03-31 19:07:41 +08:00
kangkaisen 21c68147d7
cherry-pick to 2.2 branch (#4649)
* Return query status for statistic executor (#4636)

* All methods in CacheDictManager shouldn't be blocked (#4621)
2022-03-31 17:17:09 +08:00
ricky be8011705e Expand array in json root (#4302)
```
{
	"data": [{
		"key": 1
	}, {
		"key": 2
	}]
}
```
This PR adds support for parsing the JSON shown above, with json root `$.data` and `strip_outer_array=true`.

(cherry picked from commit a00c3b42a4)
2022-03-31 16:14:50 +08:00
satanson 00b8b04272
Branch 2.2 fixup local runtime bloom filter (#4617) 2022-03-31 10:06:21 +08:00
Pslydhh 509d797a12 Fix for onlynull input column (#4145) 2022-03-30 21:22:33 +08:00
xueyan.li 87131b4fd5
[cherry-pick]mv bugfixes to branch 2.2 (#4611)
* Forbid the "select * from" when create materialized view (#4143)

* Fix bug create materialized view cause double as key column (#4554)

We should prohibit creating a key column as double or float when creating the materialized view.
In order to prevent FE from failing, we do not check during replay.
2022-03-30 19:28:53 +08:00
mergify[bot] 17e117ec0c
fix wrong order of destructions in pipeline engine (#4558) (#4593)
(cherry picked from commit 0bbc79b044)

Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2022-03-30 15:46:00 +08:00
gengjun-git 856e4a4fcb
BugFix: loading to external olap table causes be crash (#4574) (#4591)
Throws exception when response status is not OK.
2022-03-30 15:37:16 +08:00
wulei 26bce27ab1
The fe node get crashed when receive beginRemoteTxn request if it is … (#4418) (#4585) (#4589)
* The FE node crashes when it receives a beginRemoteTxn request and it is not the leader.

===========cause=================
When an external table is created with its host property set to a
follower/observer node of the source cluster, inserting values into this
external table makes the target FE node receive the beginRemoteTxn RPC and
crash, because beginRemoteTxn requires WAL writing, which is forbidden on a
non-leader node.

==========solution===============
A non-leader FE node now forwards beginRemoteTxn/commitRemoteTxn/
abortRemoteTxn to the leader node and returns the leader's response.

* Return a failed response rather than throwing an exception if forwarding the request fails

Co-authored-by: wulei <87756338+wuleistarrocks@users.noreply.github.com>

Co-authored-by: gengjun-git <gengjun@dorisdb.com>
2022-03-30 15:36:45 +08:00
mergify[bot] 754ef5d80e
Fix missing gperftools/profile.h in thirdparty (#4539) (#4596)
If gperftools is compiled with the flags `-std=c11` or `-std=c99`, it
won't install profile.h.

(cherry picked from commit 1917f183b9)

Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2022-03-30 15:36:11 +08:00
sevev c3c59ac9b6 BugFix: Be crash using shared tablet schema 2022-03-30 14:16:41 +08:00
stdpain 002bbea831 Fix nested expression (#4423)
(cherry picked from commit e30e4c5c40)
2022-03-30 13:53:21 +08:00
laotan332 f6f6228644
Fix be crash when enable enable_filter_unused_columns_in_scan_stage in pipeline engine (#4563) (#4571) 2022-03-30 10:29:18 +08:00
eyes_on_me 20a823371e
Fix result column size error when handling const column in truncate_decimal128 (#4550) (#4577) 2022-03-30 09:58:31 +08:00
Pslydhh 5d92609b74
Remove checks for array_difference([]) and array_difference(null) (#4520) (#4549) 2022-03-30 09:45:48 +08:00
mergify[bot] b842d87fca
Fix: make json type support case-when expression (#4533) (#4564)
(cherry picked from commit ce032c6e21)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-03-30 09:38:46 +08:00
trueeyu c57793fa5f
Fix the mem statistics bug of column pool (#4499) (#4507)
* When releasing a large binary column, the bytes vector has already been swapped, but the mem statistics are not updated.
* column_pool_test.cpp is not from Apache Doris, so change the license.
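A minimal sketch of the accounting fix (Pool is a hypothetical stand-in for the column pool): the capacity must be read before the swap releases the memory, because afterwards it reads as zero:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical pool: returning a large binary column frees its bytes via
// swap-with-empty, so the tracker must be adjusted using the capacity
// captured *before* the swap -- afterwards capacity() is 0.
struct Pool {
    size_t tracked_bytes = 0;

    void release_large_column(std::vector<char>& bytes) {
        size_t freed = bytes.capacity();    // measure before freeing
        std::vector<char>().swap(bytes);    // actually releases the memory
        tracked_bytes -= freed;             // the fix: keep statistics in sync
    }
};
```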
2022-03-29 18:31:49 +08:00
Murphy 19bfddced9 Fix statistic of json type (#4513)
(cherry picked from commit d385444cc4)
2022-03-29 15:34:35 +08:00
Seaven 0939be9ad0
Fix date formatter different from be (#4136) (#4484) 2022-03-29 15:29:21 +08:00
miomiocat 61b1f1f717
Support refresh external command and getting metadata from hudi meta client (#4467) (#4537)
* Support refresh external command and getting metadata from hudi meta client
2022-03-29 14:54:24 +08:00
trueeyu 825b88a997 Fix the mem leak of table function node (#4450) 2022-03-29 13:34:46 +08:00
stdpain faef3ec1ff add UDF error check in pipeline engine (#4441)
(cherry picked from commit 259706cdb1)
2022-03-29 11:21:18 +08:00
ABing bae3581a36
fix the bug of mv creation failure (#4512)
Originally, when creating an mv with 'create materialized view lo_mv_1 as select lo_orderdate, count(lo_LINENUMBER)
from lineorder group by lo_orderdate' while the field in table lineorder is lo_linenumber, the creation
fails because of the difference in name case. Fix it by using the original name defined in the table schema and
making the match case-insensitive.
2022-03-29 10:24:38 +08:00
HangyuanLiu e20c7efdb3 user define table function analyze (#4455) 2022-03-29 10:22:08 +08:00
satanson 546abd9a24
Use unqualified user and role name to choose resource_groups (#4494) (#4519) 2022-03-29 08:29:15 +08:00
trueeyu 86bfebbcd6
Fix the bug of not init _table_function (#4187) (#4476) 2022-03-28 17:06:54 +08:00
xueyan.li 470b7d9424 Fix bug NULL_TYPE cause NPE in CTAS (#4487) 2022-03-28 16:51:59 +08:00
Murphy 7df2947e62 Add test case for CTAS decimal type (#4057) 2022-03-28 16:51:59 +08:00
Youngwb 1fbf56797e
[Cherry-pick][branch-2.2] fix bugs (#4478)
* Fix aggregate node not set local aggregation when global aggregate node and scan node is in the same plan fragment (#4382)

* Fix Fe UT (#4444)
2022-03-28 14:27:11 +08:00
ZiChen Qian 418eef7d24 [cherry-pick] [branch-2.2] BugFix: Use convert_schema_to_format_v2, not convert_schema, except the rowset writer adapter. 2022-03-28 14:07:42 +08:00
miomiocat 442094630a
Fix a data type error of creating hudi table due to typo mistake (#4473) 2022-03-28 13:10:59 +08:00
mergify[bot] 69f962e042
Fix: extract object field in array (#4403) (#4424)
(cherry picked from commit 28501c5637)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-03-25 12:34:45 +08:00
Pslydhh a174e4b2cf
Cherry exchange and array fix to2.2 (#4416)
* Avoid inconsistent when send eos quickly (#4360)

* Add more check for array_difference function (#4402)

Co-authored-by: kangkaisen <kangkaisen@apache.org>
2022-03-25 10:47:12 +08:00
Pslydhh 1798319cd4
Fix the return type of array_contains/array_position (#4222) (#4260) 2022-03-25 10:36:51 +08:00
stdpain 8becf94713
[cherry-pick][branch-2.2] Only apply limit prune tablet rule in duplicate keys (#4237) (#4272) 2022-03-25 10:32:02 +08:00
satanson a703e16ce5
Merge null column of result column with null columns of operands (#4406) (#4419) 2022-03-25 10:28:57 +08:00
stephen a1ae0dc173
IcebergScanNode support split files (#4411) 2022-03-24 21:05:44 +08:00
satanson 3f4d55f268
Fixup role match in classifier of resource_group (#4386) (#4407) 2022-03-24 18:39:04 +08:00
satanson 3ad75b3f6f
String-typed runtime in-filter referenced freed key column of hash table (#4274) (#4389) 2022-03-24 16:28:25 +08:00
stephen 624a8b9e13
cherry-pick:Add session variable enable_hive_column_stats (#4388) 2022-03-24 14:02:03 +08:00
zhangqiang 18ed45b700 BugFix: Rowset id cache leak (#4333)
The rowset id cache will leak in some scenarios:
  1. allocating the rowset id succeeds, but creating the rowset writer fails
  2. creating the rowset writer succeeds, but no data is written
In either case, if allocating the rowset id succeeds but no rowset_ptr is generated, we never release the rowset_id, resulting in a memory leak. In addition, when loading data we create a delta_writer for each tablet in advance, but only a few of the delta_writers may have data written to them. This speeds up the memory leak and outputs many useless rollback logs in be.INFO. This PR calls release_rowset_id in the destructor of delta_writer if the rowset was not built, and reduces the rollback log.
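The destructor-based release can be sketched like this (RowsetIdGuard is a hypothetical stand-in for delta_writer's cleanup path, not the actual class):

```cpp
#include <cassert>
#include <functional>

// Hypothetical guard: releases the allocated rowset id in its destructor
// unless a rowset was actually built, so ids never leak when the writer is
// torn down with no data written.
struct RowsetIdGuard {
    bool rowset_built = false;
    std::function<void()> release;              // e.g. release_rowset_id(id)

    ~RowsetIdGuard() {
        if (!rowset_built && release) release();  // the fix from this commit
    }
};
```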
2022-03-24 13:04:59 +08:00
zhangqiang 7958e33f14 BugFix: stack overflow in copy_file (#4347)
The stack memory limit set by the operating system can be changed with `ulimit -s`, and allocating a block greater than or equal to that limit crashes BE. If the stack limit is 1MB or less, BE crashes because copy_files tries to allocate a 1MB buffer on the stack. We should not allocate large buffers on the stack; allocating from the heap solves the problem. This code also duplicated copy_file in utils/file_utils.cpp, so the two copy_file functions in utils/file_utils.cpp and storage/utils.cpp are merged.
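A minimal sketch of the fix, with a hypothetical copy_file signature (not the actual StarRocks one): the 1MB buffer moves from the stack to the heap, so the copy works regardless of `ulimit -s`:

```cpp
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

// Copy a file through a heap-allocated buffer instead of a fixed 1MB stack
// array; a `char buf[1 << 20]` local would overflow a small thread stack.
bool copy_file(const std::string& src, const std::string& dst) {
    constexpr size_t kBufSize = 1 << 20;    // 1MB, but on the heap
    std::vector<char> buf(kBufSize);        // was: char buf[1 << 20]; on stack

    FILE* in = fopen(src.c_str(), "rb");
    if (!in) return false;
    FILE* out = fopen(dst.c_str(), "wb");
    if (!out) { fclose(in); return false; }

    size_t n;
    bool ok = true;
    while ((n = fread(buf.data(), 1, buf.size(), in)) > 0) {
        if (fwrite(buf.data(), 1, n, out) != n) { ok = false; break; }
    }
    fclose(in);
    fclose(out);
    return ok;
}
```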
2022-03-24 13:04:59 +08:00
liuyehcf e8e7b0d6cd Allow array type to perform IS NULL predicate (#4369) 2022-03-24 10:58:53 +08:00
liuyehcf fe6fd4e02e Predicate check array type (#4332) 2022-03-24 10:58:53 +08:00
liuyehcf 8017be1cdb Simplify profile of OLAP_SCAN (#4341) 2022-03-24 10:58:53 +08:00
liuyehcf a5f01533af Profile add some metrics (#4235) 2022-03-24 10:58:53 +08:00
Youngwb ac2c40e46c Fix canColocateJoin not check orders of predicate columns when tables are the same (#4362) 2022-03-23 19:53:38 +08:00
Youngwb f937da3cc5 Remove unused search bushy tree for greedy join reorder (#4303) 2022-03-23 19:53:38 +08:00
Youngwb fa5093c826 Fix aggregate cannot generate one stage plan with array type params (#4343) 2022-03-23 19:53:38 +08:00
satanson 6f7be1fde0 Fix up resource group (#4323) 2022-03-23 19:38:57 +08:00
HangyuanLiu 77811eb00c Fix the bug that the dp search space becomes larger by using BitSet as the key of dp reorder (#4338) 2022-03-23 19:34:44 +08:00
HangyuanLiu edefbefeac Support aggregate expression in select statement without from clause (#4132)
2022-03-23 19:34:44 +08:00
HangyuanLiu 7bc7c72724 Group by support array,json const expression (#4345) 2022-03-23 19:34:44 +08:00
satanson 73d574d55e
Fixup: decimal * decimal(0,0) report error (#4359) (#4370) 2022-03-23 19:33:23 +08:00
laotan332 65bc6485f7
resource_iso: more_accurate_cpu_use_for_wg (#4219) (#4354) 2022-03-23 17:13:15 +08:00
miomiocat 5849f223af
[cherry-pick][branch-2.2] cherry-pick hudi related PRs (#4312) 2022-03-23 15:51:31 +08:00
satanson c0de225b97
Refactor decimal multiplication (#4211) (#4331) 2022-03-22 19:05:20 +08:00
Youngwb 4976100f3b Fix type conversion error in PhysicalScanOperator (#4315)
(cherry picked from commit ae1fa12f7e)
2022-03-22 16:28:03 +08:00
mergify[bot] 15afb9c8fd
Support top-level array (#4283) (#4309)
(cherry picked from commit 202f0df122)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-03-22 10:46:03 +08:00
Youngwb c16172de6e
Disable colocate/bucket join when table with empty partition (#4266) (#4299) 2022-03-21 20:41:05 +08:00
HangyuanLiu 927e2d0d35 percentile_approx_raw support null (#4220) 2022-03-21 15:07:42 +08:00
HangyuanLiu d2d31fa43e Insert Statement support explain in new Parser (#4205) 2022-03-21 15:07:42 +08:00
HangyuanLiu 0228e8c3c5 Add invisible variable enableSQLDigest (#4188) 2022-03-21 15:07:42 +08:00
liuyehcf b61b70b766 Support decimal round (#4090) 2022-03-19 20:33:31 +08:00
liuyehcf b31303cc7c Fix type 'SHFFULE' to 'SHUFFLE' (#4191) 2022-03-19 20:33:31 +08:00
liuyehcf 49822f4ad9 Truncate support decimal (#3993) 2022-03-19 20:33:31 +08:00
liuyehcf ec83736b69 Rename local variables; (#3971)
2022-03-19 20:33:31 +08:00
liuyehcf f7bc22a0cf Fix the problem of truncate_decimal (#3920) 2022-03-19 20:33:31 +08:00
Pslydhh 09f39a668f Add TRY_CATCH_BAD_ALLOC to pipeline (#4194) 2022-03-19 17:02:55 +08:00
satanson 09fd68e9f7 Refactor Operator::close,set_finishing,set_finished,set_canceled (#4179) 2022-03-19 17:02:55 +08:00
Murphy 7e8e50f9a3
Disable join on json type (#4255) 2022-03-19 15:44:28 +08:00
stdpain 762e842da8 Fix BE start crash if BE couldn't found timezone (#4221) 2022-03-19 14:25:05 +08:00
stdpain 33e6c23d7a limit scanner concurrency when scan too many columns (#4140)
1. limit scan range for each tablet
    each tablet can only be split into `num_segments` scanners
2. limit scan concurrency to reduce memory usage
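The two limits can be sketched as simple clamps (hypothetical parameter names, not the actual config options):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Rule 1: a tablet is split into at most `num_segments` scanners.
int64_t scanners_for_tablet(int64_t num_segments, int64_t requested) {
    return std::min(requested, num_segments);
}

// Rule 2: overall scan concurrency is further capped to bound memory usage
// when many wide columns are scanned at once.
int64_t effective_concurrency(int64_t total_scanners, int64_t max_concurrency) {
    return std::min(total_scanners, max_concurrency);
}
```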
2022-03-19 14:25:05 +08:00
stdpain 4a48c0f81b Fix BE crash in graceful exit (#4209) 2022-03-19 14:25:05 +08:00
trueeyu 1af241b3d5
Add try catch bad alloc for hash join build and probe (#3981) (#4214)
2022-03-19 10:25:09 +08:00
Murphy e65c1cbb4d
[Cherrypick] Fix crash of sha2 function with null parameter (#4224) 2022-03-19 09:17:15 +08:00
zhangqiang d71f7c4684
BugFix: error memory limitation of schema change (#4115) (#4223)
The type of memory_limitation_per_thread_for_schema_change is int32, so its upper limit is 2GB.
When memory_limitation_per_thread_for_schema_change is set to more than 4GB, the memory limitation of schema change overflows and may end up as 0.
Change the type of the memory limitation to uint64.
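A minimal sketch of the truncation, assuming the config value is a byte count (limit_as_int32/limit_as_uint64 are illustrative names, not the real config code):

```cpp
#include <cassert>
#include <cstdint>

// Storing a byte count in the old int32 config type wraps modulo 2^32, so a
// 4GB setting silently becomes 0. Widening the type to uint64 preserves it.
int64_t limit_as_int32(uint64_t configured_bytes) {
    return static_cast<int32_t>(configured_bytes);  // old behaviour
}

uint64_t limit_as_uint64(uint64_t configured_bytes) {
    return configured_bytes;                        // fixed behaviour
}
```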
2022-03-18 20:01:55 +08:00
Youngwb 9cb7a89616 Fix enforcer use wrong group and child group when merge group (#4180) 2022-03-18 17:42:56 +08:00
Youngwb aaad10ac3f Fix join reorder projection not deal with columnOperator to columnOperator map (#4056) 2022-03-18 17:42:56 +08:00
Youngwb 861f7b1a9b Disable compute extra cost with unknown statistics (#3959) 2022-03-18 17:42:56 +08:00
satanson 016dae8496 Fixup: mistaken partition exprs of local shuffle interpolated into left side of bucket shuffle join when SlotRef one same bucket key appear more than once in join eq_conjunctions (#4204) 2022-03-18 14:57:05 +08:00
Murphy f0805b8e08 fix error message of json group by (#4200) 2022-03-17 20:31:48 +08:00
HangyuanLiu 8413c2bb10 fix ut 2022-03-16 17:26:21 +08:00
HangyuanLiu 9009350f08 Support PrivilegeChecker in new parser and fix create analyze fail bug #4098 2022-03-16 17:26:21 +08:00
HangyuanLiu fbc08e2ac7 Support parse special function which name is keyword in new Parser (#4105)
* Support parse special function which name is keyword in new Parser
2022-03-16 17:26:21 +08:00
HangyuanLiu d1de6422b2 Support select tablet syntax in new parser (#4061) 2022-03-16 17:26:21 +08:00
HangyuanLiu c7169a55d0 Fix union resolve bug (#4070) 2022-03-16 17:26:21 +08:00
HangyuanLiu 5be1ac411e Fix set miss sort operator bug in new parser(#4058) 2022-03-16 17:26:21 +08:00
HangyuanLiu f6f17ddf08 Fix compile about AnalyzeFunctionTest (#3967) 2022-03-16 17:26:21 +08:00
HangyuanLiu 3baa1a7577 Fix show stmt with predicate bug(#3957) 2022-03-16 17:26:21 +08:00
HangyuanLiu 8ac29d520d fix timestamp-expr (#3938) 2022-03-16 17:26:21 +08:00
HangyuanLiu d1ffbeed81 Refactor Parser code and remove unused code (#3881) 2022-03-16 17:26:21 +08:00
stdpain 2217eb0610 Fix low cardinality optimize crash in pipeline when enable cache table … (#3915)
When we rewrite the decoded slot descriptor, we add the new slot descriptor
to runtime_state.object_pool, but this slot descriptor may be accessed
in another thread, which causes a use-after-free.
2022-03-15 20:36:15 +08:00
stdpain d4b1ab9392 Downgrading libmariadb to avoid problems accessing other MYSQL protocol databases (#3966) (#3975)
Fix the problem `mysql set character set failed.` when accessing TiDB
2022-03-15 20:36:15 +08:00
stdpain 264f4c4654 fix NPE when collect global dictionary 2022-03-15 20:36:15 +08:00
stdpain 1518ba19ff avoid download other Dir files in get_log_file interface (#4006) 2022-03-15 20:36:15 +08:00
stdpain 8349597c60 fix thirdparty compile problems in docker (#3976) 2022-03-15 20:36:15 +08:00
stdpain 8d8ce962f8 fix compile problem for other platform (#3951) 2022-03-15 20:36:15 +08:00
Murphy 23b4eff36d
Fix ddl bugs of json type (#4144) 2022-03-15 20:35:55 +08:00
Murphy 3c7ac28b2d
fix null check in functions (#4123) (#4146) 2022-03-15 20:35:39 +08:00
xueyan.li 070b92b3a3
Do not send migrate task when BE only have 1 storage medium (#4002) (#4121)
When BE reports tablet info to FE, if we know that this BE has no other storage medium, only write a log as a reminder instead of sending a migration task.
2022-03-15 13:03:42 +08:00
zihe.liu 1ede12b724
Fix: finish analytic sink after reached limit (#4003) (#4113) 2022-03-15 12:46:32 +08:00
Seaven 6404372320 Forbidden non-number type cast reduce (#4055) 2022-03-14 21:06:27 +08:00
Seaven 030bc2b101 Fix date cast to datetime bug when datetime is null 2022-03-14 21:06:27 +08:00
Seaven d444f7af7a Fix sql split bug (#3956) 2022-03-14 21:06:27 +08:00
ricky 475351d825 Persist meta once in publish version group (#3841)
Flush the WAL of RocksDB after saving the meta. There are two flush operations during a load:
one at commit, and the other at publish. To alleviate the I/O operations, we batch the flushes.
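The batching idea can be sketched as follows (MetaStore is a hypothetical interface; the real code persists tablet meta through RocksDB):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Instead of flushing the WAL after every tablet's meta save, save all metas
// in the publish group first and flush once, trading per-tablet durability
// calls for a single batched sync.
struct MetaStore {
    size_t saves = 0;
    size_t flushes = 0;
    void save_meta(int /*tablet_id*/) { ++saves; }
    void flush_wal() { ++flushes; }
};

void publish_version_group(MetaStore& store, const std::vector<int>& tablets) {
    for (int t : tablets) store.save_meta(t);   // was: save + flush per tablet
    store.flush_wal();                          // now: one flush per group
}
```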
2022-03-14 21:05:04 +08:00
satanson f6d91a35c5 Analytic function needs func_version (#4078) 2022-03-14 20:48:29 +08:00
satanson 26d25bb373 fixup: QueryContext is removed accidentally (#4060) 2022-03-14 20:48:29 +08:00
satanson 15078f53af use ConcurrentNavigableMap instead of List to for Coordinator.backendExecStates (#3958) 2022-03-14 20:48:29 +08:00
satanson 0bb1de651f use decimal128 instead decimal64 for aggr sum (#3944) 2022-03-14 20:48:29 +08:00
ricky 76dcf80796 BugFix: fatal error when the delta writer is busy (#3878)
This PR retries when the AsyncDeltaWriter is busy, to avoid the bthread fatal error.
2022-03-14 20:16:36 +08:00
yan.zhang 8b69672b42
Optimize small read on ORC(small row group index/stripe/file) (#3962) (#4091)
This PR optimizes reads of small ORC files and small stripes. What's more, we can fit the previous row-group-index optimization into this framework.

Handle Small File
=======
I set up a test environment with a lot of small HDFS files, all of them under 2MB.

```
-rw-r-----   3 hadoop hadoop    1332877 2022-03-09 12:10 /user/zya/tpch_100g/lineitem_smallfile_zlib/012314_0
-rw-r-----   3 hadoop hadoop    1328418 2022-03-09 12:11 /user/zya/tpch_100g/lineitem_smallfile_zlib/012315_0
-rw-r-----   3 hadoop hadoop    1312973 2022-03-09 12:11 /user/zya/tpch_100g/lineitem_smallfile_zlib/012316_0
-rw-r-----   3 hadoop hadoop    1304832 2022-03-09 12:11 /user/zya/tpch_100g/lineitem_smallfile_zlib/012317_0
-rw-r-----   3 hadoop hadoop     878983 2022-03-09 12:10 /user/zya/tpch_100g/lineitem_smallfile_zlib/012318_0
```

Running query
> select count(*) from lineitem_smallfile;

You can see that although the bytes read increased, IoCounter decreased, and the average bytes per IO increased from (629.10 MB / 257.063K ~= 2.5KB) to (16.72 GB / 12.319K ~= 1.4MB).

Before optimization

```
         HDFS_SCAN_NODE (id=0):(Active: 8s861ms[8861568366ns], % non-child: 98.73%)
             - BytesRead: 0.00
             - BytesReadDataNodeCache: 0.00
             - BytesReadFromDisk: 629.10 MB
             - BytesReadLocal: 0.00
             - BytesReadRemote: 629.10 MB
             - BytesReadShortCircuit: 0.00
             - BytesTotalRead: 629.10 MB
             - IoCounter: 257.063K (257063)
             - IoTime: 6m10s
```

After optimization
```
          HDFS_SCAN_NODE (id=0):(Active: 4s322ms[4322893106ns], % non-child: 97.17%)
             - Table: lineitem_smallfile
             - Predicates:
             - PredicatesPartition:
             - BytesRead: 16.72 GB
             - ColumnConvertTime: 1s250ms
             - ColumnReadTime: 16s355ms
             - ExprFilterTime: 8.39ms
             - HdfsIO: 0ns
               - TotalBytesRead: 17.957812729B (17957812729)
               - TotalLocalBytesRead: 0
               - TotalShortCircuitBytesRead: 0
               - TotalZeroCopyBytesRead: 0
             - IoCounter: 12.319K (12319) // <=== exact file number
             - IoTime: 2m17s
```

Handle Stripe Size
=======
I set up a test environment with a single file that has a small stripe size. You can see a stripe is about 700KB, and there are 350 stripes in the single file.

> java -jar ~/installed/orc-tools-1.7.0-SNAPSHOT-uber.jar meta test_many_columns.orc

```
  Stripe 348:
  Stripe 349:
  Stripe 350:
Stripes:
  Stripe: offset: 3 data: 722325 rows: 2864 tail: 1861 index: 9095
  Stripe: offset: 733284 data: 763098 rows: 2864 tail: 1907 index: 9096
  Stripe: offset: 1507385 data: 776631 rows: 2864 tail: 1880 index: 9097
```

Normally we don't have such a small stripe size; usually a stripe is about 256MB. But some bugs in SparkSQL may produce small stripes:
- https://community.cloudera.com/t5/Support-Questions/Spark-ORC-Stripe-Size/td-p/189844
- https://issues.apache.org/jira/browse/HIVE-13232

Running query
> select max(c1) from test_many_columns;

You can see IOCounter is cut from 2451 down to 351.

Before optimization

```
          HDFS_SCAN_NODE (id=0):(Active: 2s771ms[2771719221ns], % non-child: 99.20%)
             - BytesRead: 0.00
             - BytesReadDataNodeCache: 0.00
             - BytesReadFromDisk: 1.92 MB
             - BytesReadLocal: 0.00
             - BytesReadRemote: 1.92 MB
             - BytesReadShortCircuit: 0.00
             - BytesTotalRead: 1.92 MB
             - ColumnConvertTime: 20.341ms
             - ColumnReadTime: 1s814ms
             - ExprFilterTime: 0ns
             - GroupChunkRead: 0ns
             - GroupDictDecode: 0ns
             - GroupDictFilter: 0ns
             - IoCounter: 2.451K (2451)
             - IoTime: 1s772ms
```

After optimization

```
          HDFS_SCAN_NODE (id=0):(Active: 697.474ms[697474842ns], % non-child: 98.24%)
             - Table: test_many_columns
             - Predicates:
             - PredicatesPartition:
             - BytesRead: 255.89 MB
             - ColumnConvertTime: 20.537ms
             - ColumnReadTime: 676.219ms
             - ExprFilterTime: 15.637us
             - HdfsIO: 0ns
               - TotalBytesRead: 268.315441M (268315441)
               - TotalLocalBytesRead: 0
               - TotalShortCircuitBytesRead: 0
               - TotalZeroCopyBytesRead: 0
             - IoCounter: 351 // <== exact stripe count + file footer read.
             - IoTime: 612.488ms
```

And if we combine reads of stripes, we can reduce IoCounter even more:

```
          HDFS_SCAN_NODE (id=0):(Active: 643.709ms[643709229ns], % non-child: 97.69%)
             - Table: test_many_columns
             - Predicates:
             - PredicatesPartition:
             - BytesRead: 349.31 MB
             - ColumnConvertTime: 22.263ms
             - ColumnReadTime: 623.502ms
             - ExprFilterTime: 11.235us
             - HdfsIO: 0ns
               - TotalBytesRead: 366.282951M (366282951)
               - TotalLocalBytesRead: 0
               - TotalShortCircuitBytesRead: 0
               - TotalZeroCopyBytesRead: 0
             - IoCounter: 176
             - IoTime: 564.540ms
 ```
2022-03-14 18:17:38 +08:00
Pslydhh 2dc8d3c37b
Fix crash of pipeline caused by QueryContext destruct early (#4085) 2022-03-14 16:38:52 +08:00
ZiChen Qian 6ce5b2b0ad PrimaryKey table is forbidden to backup/restore (#3985)
Backup/restore does not support the PrimaryKey table. So the two operations are forbidden.
2022-03-14 15:25:42 +08:00
xueyan.li 9f502d111c
Support multi-character row delimiter during load (#3900) (#3992)
The pull request supports multi-character row delimiter. After that, the '<br>' can be used as a row delimiter.
2022-03-11 15:48:39 +08:00
yan.zhang 88c4e0918f
Fix be crash caused by #3912 and #3825 (#3934) (#4054)
* BE will crash with #3912 since we should init the config only after
```
Aws::SDKOptions aws_sdk_options;  Aws::InitAPI(aws_sdk_options); 
```
has been called. We already call these two APIs in starrocks_main.cpp, so we use a lazy singleton instance for the AWS ClientConfiguration to avoid a BE crash during startup.
* An iceberg table may have no partitions; after #3825, BE will crash when we query such an iceberg table. We should check has_partition in _find_and_insert_hdfs_file and _init_partition_values_map.
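The lazy-singleton idea can be sketched generically, without the AWS SDK (ClientConfigLike stands in for Aws::Client::ClientConfiguration): a function-local static is constructed on first use, i.e. after process-level init such as Aws::InitAPI has already run:

```cpp
#include <cassert>
#include <string>

// Hypothetical configuration type standing in for the AWS client config.
struct ClientConfigLike {
    std::string region = "default-region";
    int connect_timeout_ms = 1000;
};

const ClientConfigLike& lazy_client_config() {
    // Meyers singleton: constructed thread-safely on the first call
    // (C++11 magic statics), not during static initialization before main().
    static const ClientConfigLike config;
    return config;
}
```

Because construction is deferred to the first call, any global setup the constructor depends on has already happened by the time the config is built.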

Co-authored-by: zhoukang <zhoukang199191@gmail.com>
2022-03-11 14:59:48 +08:00
525 changed files with 11712 additions and 8389 deletions


@@ -326,7 +326,9 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
endif()
if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86_64")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2")
if (USE_SSE4_2)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2")
endif()
if (USE_AVX2)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
endif()


@@ -64,6 +64,7 @@ namespace starrocks {
const uint32_t TASK_FINISH_MAX_RETRY = 3;
const uint32_t PUBLISH_VERSION_MAX_RETRY = 3;
const uint32_t PUBLISH_VERSION_SUBMIT_MAX_RETRY = 10;
const size_t PUBLISH_VERSION_BATCH_SIZE = 10;
std::atomic_ulong TaskWorkerPool::_s_report_version(time(nullptr) * 10000);
std::mutex TaskWorkerPool::_s_task_signatures_locks[TTaskType::type::NUM_TASK_TYPE];
@@ -707,7 +708,8 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) {
}
Status TaskWorkerPool::_publish_version_in_parallel(void* arg_this, std::unique_ptr<ThreadPool>& threadpool,
const TPublishVersionRequest publish_version_req, size_t* tablet_n,
const TPublishVersionRequest publish_version_req,
std::set<TTabletId>* tablet_ids, size_t* tablet_n,
std::vector<TTabletId>* error_tablet_ids) {
TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this;
int64_t transaction_id = publish_version_req.transaction_id;
@@ -797,8 +799,8 @@ Status TaskWorkerPool::_publish_version_in_parallel(void* arg_this, std::unique_
error_status = status;
}
}
tablet_ids->insert(tablet_info.tablet_id);
}
*tablet_n += tablet_infos.size();
}
return error_status;
}
@@ -814,12 +816,16 @@ void* TaskWorkerPool::_publish_version_worker_thread_callback(void* arg_this) {
// The ideal queue size of threadpool should be larger than the maximum number of tablet of a partition.
// But it seems that there's no limit for the number of tablets of a partition.
// Since a large queue size brings a little overhead, a big one is chosen here.
.set_max_queue_size(256)
.set_max_queue_size(2048)
.build(&threadpool);
assert(st.ok());
std::vector<TAgentTaskRequest> task_requests;
std::vector<TFinishTaskRequest> finish_task_requests;
std::set<TTabletId> tablet_ids;
std::vector<TabletSharedPtr> tablets;
while (true) {
TAgentTaskRequest agent_task_req;
TPublishVersionRequest publish_version_req;
{
std::unique_lock l(worker_pool_this->_worker_thread_lock);
while (worker_pool_this->_tasks.empty() && !(worker_pool_this->_stopped)) {
@@ -829,54 +835,86 @@ void* TaskWorkerPool::_publish_version_worker_thread_callback(void* arg_this) {
break;
}
agent_task_req = worker_pool_this->_tasks.front();
publish_version_req = agent_task_req.publish_version_req;
worker_pool_this->_tasks.pop_front();
}
StarRocksMetrics::instance()->publish_task_request_total.increment(1);
LOG(INFO) << "get publish version task, signature:" << agent_task_req.signature;
std::vector<TTabletId> error_tablet_ids;
uint32_t retry_time = 0;
Status status;
size_t tablet_n = 0;
while (retry_time < PUBLISH_VERSION_MAX_RETRY) {
error_tablet_ids.clear();
status = _publish_version_in_parallel(arg_this, threadpool, publish_version_req, &tablet_n,
&error_tablet_ids);
if (status.ok()) {
break;
} else {
LOG(WARNING) << "publish version error, retry. [transaction_id=" << publish_version_req.transaction_id
<< ", error_tablets_size=" << error_tablet_ids.size() << "]";
++retry_time;
SleepFor(MonoDelta::FromSeconds(1));
while (!worker_pool_this->_tasks.empty() && task_requests.size() < PUBLISH_VERSION_BATCH_SIZE) {
// collect some publish version tasks as a group.
task_requests.push_back(worker_pool_this->_tasks.front());
worker_pool_this->_tasks.pop_front();
}
}
TFinishTaskRequest finish_task_request;
if (!status.ok()) {
StarRocksMetrics::instance()->publish_task_failed_total.increment(1);
// if publish failed, return failed, FE will ignore this error and
// check error tablet ids and FE will also republish this task
LOG(WARNING) << "Fail to publish version. signature:" << agent_task_req.signature
<< " related tablet num: " << tablet_n;
finish_task_request.__set_error_tablet_ids(error_tablet_ids);
} else {
LOG(INFO) << "publish_version success. signature:" << agent_task_req.signature
<< " related tablet num: " << tablet_n;
for (size_t i = 0; i < task_requests.size(); ++i) {
auto& publish_version_task = task_requests[i];
StarRocksMetrics::instance()->publish_task_request_total.increment(1);
LOG(INFO) << "get publish version task, signature:" << publish_version_task.signature << " index: " << i
<< " group size: " << task_requests.size();
auto& publish_version_req = publish_version_task.publish_version_req;
std::vector<TTabletId> error_tablet_ids;
uint32_t retry_time = 0;
Status status;
size_t tablet_n = 0;
while (retry_time < PUBLISH_VERSION_MAX_RETRY) {
error_tablet_ids.clear();
status = _publish_version_in_parallel(arg_this, threadpool, publish_version_req, &tablet_ids, &tablet_n,
&error_tablet_ids);
if (status.ok()) {
break;
} else {
LOG(WARNING) << "publish version error, retry. [transaction_id="
<< publish_version_req.transaction_id
<< ", error_tablets_size=" << error_tablet_ids.size() << "]";
++retry_time;
SleepFor(MonoDelta::FromSeconds(1));
}
}
TFinishTaskRequest finish_task_request;
if (!status.ok()) {
StarRocksMetrics::instance()->publish_task_failed_total.increment(1);
// If publish fails, return failure; FE will ignore this error,
// check the error tablet ids, and republish this task.
LOG(WARNING) << "Fail to publish version. signature:" << publish_version_task.signature
<< " related tablet num: " << tablet_n;
finish_task_request.__set_error_tablet_ids(error_tablet_ids);
} else {
LOG(INFO) << "publish_version success. signature:" << publish_version_task.signature
<< " related tablet num: " << tablet_n;
}
status.to_thrift(&finish_task_request.task_status);
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(publish_version_task.task_type);
finish_task_request.__set_signature(publish_version_task.signature);
finish_task_request.__set_report_version(_s_report_version);
finish_task_requests.emplace_back(std::move(finish_task_request));
}
status.to_thrift(&finish_task_request.task_status);
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
finish_task_request.__set_report_version(_s_report_version);
// Persist all related meta once per group.
tablets.reserve(tablet_ids.size());
for (const auto tablet_id : tablet_ids) {
TabletSharedPtr tablet = StorageEngine::instance()->tablet_manager()->get_tablet(tablet_id);
if (tablet != nullptr) {
tablets.push_back(tablet);
}
}
worker_pool_this->_finish_task(finish_task_request);
worker_pool_this->_remove_task_info(agent_task_req.task_type, agent_task_req.signature);
auto st = StorageEngine::instance()->txn_manager()->persist_tablet_related_txns(tablets);
if (!st.ok()) {
LOG(WARNING) << "failed to persist transactions, tablets num: " << tablets.size() << " err: " << st;
}
tablet_ids.clear();
tablets.clear();
// Notify FE after all tasks in the group have finished.
for (auto& finish_task_request : finish_task_requests) {
worker_pool_this->_finish_task(finish_task_request);
worker_pool_this->_remove_task_info(finish_task_request.task_type, finish_task_request.signature);
}
task_requests.clear();
finish_task_requests.clear();
}
threadpool->shutdown();
return (void*)nullptr;
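The reworked worker loop above drains up to `PUBLISH_VERSION_BATCH_SIZE` tasks from the shared queue and processes them as one group. A minimal sketch of that grouping step, with generic names, assuming the caller already holds the queue's lock (as the worker holds `_worker_thread_lock` here):

```cpp
#include <cassert>
#include <cstddef>
#include <deque>
#include <vector>

// Drain up to batch_size items from the shared task queue into a local group.
// Assumes the caller holds the lock protecting `tasks`.
template <typename T>
std::vector<T> take_batch(std::deque<T>& tasks, std::size_t batch_size) {
    std::vector<T> group;
    while (!tasks.empty() && group.size() < batch_size) {
        group.push_back(tasks.front());
        tasks.pop_front();
    }
    return group;
}
```

Batching lets the worker persist tablet-related transactions once per group and report all finish requests afterwards, instead of paying those costs once per task.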


@ -99,7 +99,8 @@ private:
static void* _drop_tablet_worker_thread_callback(void* arg_this);
static void* _push_worker_thread_callback(void* arg_this);
static Status _publish_version_in_parallel(void* arg_this, std::unique_ptr<ThreadPool>& threadpool,
const TPublishVersionRequest publish_version_req, size_t* tablet_n,
const TPublishVersionRequest publish_version_req,
std::set<TTabletId>* tablet_ids, size_t* tablet_n,
std::vector<TTabletId>* error_tablet_ids);
static void* _publish_version_worker_thread_callback(void* arg_this);
static void* _clear_transaction_task_worker_thread_callback(void* arg_this);


@ -4,10 +4,15 @@
#include <cstdint>
#if defined(__x86_64__)
#ifdef __SSE4_2__
#include <nmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#include <xmmintrin.h>
#endif
#include "util/hash_util.hpp"
#include "util/slice.h"
#include "util/unaligned_access.h"
@ -103,6 +108,9 @@ public:
};
static uint32_t crc_hash_32(const void* data, int32_t bytes, uint32_t hash) {
#if defined(__x86_64__) && !defined(__SSE4_2__)
return crc32(hash, (const unsigned char*)data, bytes);
#else
uint32_t words = bytes / sizeof(uint32_t);
bytes = bytes % 4 /*sizeof(uint32_t)*/;
@ -134,9 +142,13 @@ static uint32_t crc_hash_32(const void* data, int32_t bytes, uint32_t hash) {
// for anyone who only uses the first several bits of the hash.
hash = (hash << 16u) | (hash >> 16u);
return hash;
#endif
}
static uint64_t crc_hash_64(const void* data, int32_t length, uint64_t hash) {
#if defined(__x86_64__) && !defined(__SSE4_2__)
return crc32(hash, (const unsigned char*)data, length);
#else
if (UNLIKELY(length < 8)) {
return crc_hash_32(data, length, hash);
}
@ -145,7 +157,7 @@ static uint64_t crc_hash_64(const void* data, int32_t length, uint64_t hash) {
auto* p = reinterpret_cast<const uint8_t*>(data);
auto* end = reinterpret_cast<const uint8_t*>(data) + length;
while (words--) {
#if defined(__x86_64__)
#if defined(__x86_64__) && defined(__SSE4_2__)
hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(p));
#elif defined(__aarch64__)
hash = __crc32cd(hash, unaligned_load<uint32_t>(p));
@ -165,6 +177,7 @@ static uint64_t crc_hash_64(const void* data, int32_t length, uint64_t hash) {
#endif
p += sizeof(uint64_t);
return hash;
#endif
}
// TODO: 0x811C9DC5 is not prime number
@ -255,8 +268,10 @@ public:
};
inline uint64_t crc_hash_uint64(uint64_t value, uint64_t seed) {
#if defined(__x86_64__)
#if defined(__x86_64__) && defined(__SSE4_2__)
return _mm_crc32_u64(seed, value);
#elif defined(__x86_64__)
return crc32(seed, (const unsigned char*)&value, sizeof(uint64_t));
#elif defined(__aarch64__)
return __crc32cd(seed, value);
#else
@ -265,9 +280,12 @@ inline uint64_t crc_hash_uint64(uint64_t value, uint64_t seed) {
}
inline uint64_t crc_hash_uint128(uint64_t value0, uint64_t value1, uint64_t seed) {
#if defined(__x86_64__)
#if defined(__x86_64__) && defined(__SSE4_2__)
uint64_t hash = _mm_crc32_u64(seed, value0);
hash = _mm_crc32_u64(hash, value1);
#elif defined(__x86_64__)
uint64_t hash = crc32(seed, (const unsigned char*)&value0, sizeof(uint64_t));
hash = crc32(hash, (const unsigned char*)&value1, sizeof(uint64_t));
#elif defined(__aarch64__)
uint64_t hash = __crc32cd(seed, value0);
hash = __crc32cd(hash, value1);


@ -172,7 +172,11 @@ class CACHELINE_ALIGNED ColumnPool {
freed_bytes += release_column_if_large(_curr_free.ptrs[i], limit);
ASAN_POISON_MEMORY_REGION(_curr_free.ptrs[i], sizeof(T));
}
_curr_free.bytes -= freed_bytes;
if (freed_bytes > 0) {
_curr_free.bytes -= freed_bytes;
tls_thread_status.mem_consume(freed_bytes);
_pool->mem_tracker()->release(freed_bytes);
}
}
static inline void delete_local_pool(void* arg) { delete (LocalPool*)arg; }


@ -36,7 +36,7 @@ public:
bool is_null(size_t index) const override { return _data->is_null(0); }
bool only_null() const override { return _data->is_nullable(); }
bool only_null() const override { return _data->is_null(0); }
bool has_null() const override { return _data->has_null(); }
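The one-line fix above changes `only_null()` from "the wrapped data is nullable" to "the single value actually is null". A hypothetical sketch of that distinction (illustrative names, not the real starrocks column API):

```cpp
#include <cassert>
#include <cstddef>
#include <optional>

// A const column logically repeats one value. Being *nullable* (the wrapper
// may hold a null) is weaker than being *only null* (the value it does hold
// is null) -- the old code conflated the two.
struct ConstColumnSketch {
    std::optional<int> value;  // std::nullopt models a null value

    bool is_nullable() const { return true; }  // nulls are allowed
    bool is_null(std::size_t) const { return !value.has_value(); }
    bool only_null() const { return is_null(0); }  // the corrected behaviour
};
```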


@ -175,6 +175,7 @@ CONF_String(local_library_dir, "${UDF_RUNTIME_DIR}");
CONF_mInt32(doris_scanner_thread_pool_thread_num, "48");
// Number of olap scanner thread pool size.
CONF_Int32(doris_scanner_thread_pool_queue_size, "102400");
CONF_mDouble(scan_use_query_mem_ratio, "0.25");
// Number of etl thread pool size.
CONF_Int32(etl_thread_pool_size, "8");
CONF_Int32(udf_thread_pool_size, "1");
@ -665,6 +666,7 @@ CONF_String(object_storage_endpoint, "");
CONF_Int64(object_storage_max_connection, "102400");
CONF_Bool(enable_orc_late_materialization, "true");
CONF_Int32(orc_file_cache_max_size, "2097152");
// default: 16MB
CONF_mInt64(experimental_s3_max_single_part_size, "16777216");


@ -113,6 +113,8 @@ public:
return Status(TStatusCode::DATA_QUALITY_ERROR, msg);
}
static Status GlobalDictError(const Slice& msg) { return Status(TStatusCode::GLOBAL_DICT_ERROR, msg); }
bool ok() const { return _state == nullptr; }
bool is_cancelled() const { return code() == TStatusCode::CANCELLED; }

be/src/env/env_s3.cpp vendored

@ -123,13 +123,6 @@ bool operator==(const Aws::Client::ClientConfiguration& lhs, const Aws::Client::
class S3ClientFactory {
public:
// We cached config here and make a deep copy each time.Since aws sdk has changed the
// Aws::Client::ClientConfiguration default constructor to search for the region
// (where as before 1.8 it has been hard coded default of "us-east-1").
// Part of that change is looking through the ec2 metadata, which can take a long time.
// For more details, please refer https://github.com/aws/aws-sdk-cpp/issues/1440
static Aws::Client::ClientConfiguration s_config;
using ClientConfiguration = Aws::Client::ClientConfiguration;
using S3Client = Aws::S3::S3Client;
using S3ClientPtr = std::shared_ptr<S3Client>;
@ -148,6 +141,16 @@ public:
S3ClientPtr new_client(const ClientConfiguration& config);
static ClientConfiguration& getClientConfig() {
// We cache the config here and make a deep copy each time. Since the aws sdk has changed the
// Aws::Client::ClientConfiguration default constructor to search for the region
// (whereas before 1.8 it was a hard-coded default of "us-east-1"),
// part of that change is looking through the ec2 metadata, which can take a long time.
// For more details, please refer to https://github.com/aws/aws-sdk-cpp/issues/1440
static ClientConfiguration instance;
return instance;
}
private:
S3ClientFactory();
@ -161,8 +164,6 @@ private:
Random _rand;
};
Aws::Client::ClientConfiguration S3ClientFactory::s_config;
S3ClientFactory::S3ClientFactory() : _items(0), _rand((int)::time(NULL)) {}
S3ClientFactory::S3ClientPtr S3ClientFactory::new_client(const ClientConfiguration& config) {
@ -196,7 +197,7 @@ S3ClientFactory::S3ClientPtr S3ClientFactory::new_client(const ClientConfigurati
}
static std::shared_ptr<Aws::S3::S3Client> new_s3client(const S3URI& uri) {
Aws::Client::ClientConfiguration config = S3ClientFactory::s_config;
Aws::Client::ClientConfiguration config = S3ClientFactory::getClientConfig();
config.scheme = Aws::Http::Scheme::HTTP; // TODO: use the scheme in uri
if (!uri.endpoint().empty()) {
config.endpointOverride = uri.endpoint();
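The refactor above replaces the class-level static `S3ClientFactory::s_config` with a function-local static returned by `getClientConfig()`. A minimal sketch of the pattern, using a stand-in struct instead of the real `Aws::Client::ClientConfiguration`:

```cpp
#include <cassert>
#include <string>

// Stand-in for Aws::Client::ClientConfiguration; the real type's default
// constructor may probe EC2 metadata for the region, which can be slow.
struct ClientConfiguration {
    std::string region;
};

// Function-local static ("Meyers singleton"): constructed lazily on first
// call and thread-safe since C++11, so the potentially expensive default
// construction no longer runs during static initialization of the binary.
static ClientConfiguration& getClientConfig() {
    static ClientConfiguration instance;
    return instance;
}
```

Each caller still takes a deep copy (`Aws::Client::ClientConfiguration config = S3ClientFactory::getClientConfig();`), so per-client mutations such as `endpointOverride` do not leak into the shared instance.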


@ -94,6 +94,7 @@ set(EXEC_FILES
vectorized/hdfs_scanner_parquet.cpp
vectorized/hdfs_scanner_text.cpp
vectorized/json_scanner.cpp
vectorized/json_parser.cpp
vectorized/project_node.cpp
vectorized/dict_decode_node.cpp
vectorized/repeat_node.cpp
@ -195,8 +196,8 @@ set(EXEC_FILES
# simdjson Runtime Implementation Dispatch: https://github.com/simdjson/simdjson/blob/master/doc/implementation-selection.md#runtime-cpu-detection
# Unset architecture-specific flags to avoid breaking the runtime implementation dispatch.
if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86_64")
set_source_files_properties(vectorized/json_scanner.cpp PROPERTIES COMPILE_FLAGS -mno-avx)
set_source_files_properties(vectorized/json_scanner.cpp PROPERTIES COMPILE_FLAGS -mno-avx2)
set_source_files_properties(vectorized/json_scanner.cpp PROPERTIES COMPILE_FLAGS "-mno-avx -mno-avx2")
set_source_files_properties(vectorized/json_parser.cpp PROPERTIES COMPILE_FLAGS "-mno-avx -mno-avx2")
endif()
set(EXEC_FILES


@ -311,12 +311,8 @@ public:
return _begin_scan_keys.size();
}
void set_begin_include(bool begin_include) { _begin_include = begin_include; }
bool begin_include() const { return _begin_include; }
void set_end_include(bool end_include) { _end_include = end_include; }
bool end_include() const { return _end_include; }
void set_is_convertible(bool is_convertible) { _is_convertible = is_convertible; }


@ -17,7 +17,7 @@ void AggregateBlockingSinkOperator::close(RuntimeState* state) {
Operator::close(state);
}
void AggregateBlockingSinkOperator::set_finishing(RuntimeState* state) {
Status AggregateBlockingSinkOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
if (!_aggregator->is_none_group_by_exprs()) {
@ -45,6 +45,7 @@ void AggregateBlockingSinkOperator::set_finishing(RuntimeState* state) {
COUNTER_SET(_aggregator->input_row_count(), _aggregator->num_input_rows());
_aggregator->sink_complete();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> AggregateBlockingSinkOperator::pull_chunk(RuntimeState* state) {
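This hunk is the first of many below converting operator lifecycle hooks (`set_finishing`/`set_finished`) from `void` to `Status`, so that failures inside them can propagate to the pipeline driver. A minimal sketch of the pattern with stand-in `Status` and `RETURN_IF_ERROR` (illustrative only, not the real starrocks definitions):

```cpp
#include <cassert>
#include <string>
#include <utility>

// Stand-in for the real Status class.
class Status {
public:
    static Status OK() { return Status(""); }
    static Status InternalError(std::string msg) { return Status(std::move(msg)); }
    bool ok() const { return _msg.empty(); }

private:
    explicit Status(std::string msg) : _msg(std::move(msg)) {}
    std::string _msg;
};

// Early-return on error, as the real macro does.
#define RETURN_IF_ERROR(stmt)        \
    do {                             \
        Status _s = (stmt);          \
        if (!_s.ok()) return _s;     \
    } while (false)

// With a void set_finishing(), this inner failure would be silently dropped;
// returning Status lets the caller observe and handle it.
Status process_by_partition() { return Status::InternalError("bad alloc"); }

Status set_finishing() {
    RETURN_IF_ERROR(process_by_partition());
    return Status::OK();
}
```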


@ -20,7 +20,7 @@ public:
bool has_output() const override { return false; }
bool need_input() const override { return !is_finished(); }
bool is_finished() const override { return _is_finished || _aggregator->is_finished(); }
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
Status prepare(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -14,8 +14,8 @@ bool AggregateBlockingSourceOperator::is_finished() const {
return _aggregator->is_sink_complete() && _aggregator->is_ht_eos();
}
void AggregateBlockingSourceOperator::set_finished(RuntimeState* state) {
_aggregator->set_finished();
Status AggregateBlockingSourceOperator::set_finished(RuntimeState* state) {
return _aggregator->set_finished();
}
void AggregateBlockingSourceOperator::close(RuntimeState* state) {


@ -22,7 +22,7 @@ public:
bool has_output() const override;
bool is_finished() const override;
void set_finished(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -17,7 +17,7 @@ void AggregateDistinctBlockingSinkOperator::close(RuntimeState* state) {
Operator::close(state);
}
void AggregateDistinctBlockingSinkOperator::set_finishing(RuntimeState* state) {
Status AggregateDistinctBlockingSinkOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_set_variant().size());
@ -38,6 +38,7 @@ void AggregateDistinctBlockingSinkOperator::set_finishing(RuntimeState* state) {
COUNTER_SET(_aggregator->input_row_count(), _aggregator->num_input_rows());
_aggregator->sink_complete();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> AggregateDistinctBlockingSinkOperator::pull_chunk(RuntimeState* state) {


@ -22,7 +22,7 @@ public:
bool has_output() const override { return false; }
bool need_input() const override { return !is_finished(); }
bool is_finished() const override { return _is_finished || _aggregator->is_finished(); }
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
Status prepare(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -14,8 +14,8 @@ bool AggregateDistinctBlockingSourceOperator::is_finished() const {
return _aggregator->is_sink_complete() && _aggregator->is_ht_eos();
}
void AggregateDistinctBlockingSourceOperator::set_finished(RuntimeState* state) {
_aggregator->set_finished();
Status AggregateDistinctBlockingSourceOperator::set_finished(RuntimeState* state) {
return _aggregator->set_finished();
}
void AggregateDistinctBlockingSourceOperator::close(RuntimeState* state) {


@ -22,7 +22,7 @@ public:
bool has_output() const override;
bool is_finished() const override;
void set_finished(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -17,7 +17,7 @@ void AggregateDistinctStreamingSinkOperator::close(RuntimeState* state) {
Operator::close(state);
}
void AggregateDistinctStreamingSinkOperator::set_finishing(RuntimeState* state) {
Status AggregateDistinctStreamingSinkOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
if (_aggregator->hash_set_variant().size() == 0) {
@ -25,6 +25,7 @@ void AggregateDistinctStreamingSinkOperator::set_finishing(RuntimeState* state)
}
_aggregator->sink_complete();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> AggregateDistinctStreamingSinkOperator::pull_chunk(RuntimeState* state) {


@ -22,7 +22,7 @@ public:
bool has_output() const override { return false; }
bool need_input() const override { return !is_finished(); }
bool is_finished() const override { return _is_finished || _aggregator->is_finished(); }
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
Status prepare(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -29,8 +29,8 @@ bool AggregateDistinctStreamingSourceOperator::is_finished() const {
return _aggregator->is_sink_complete() && _aggregator->is_chunk_buffer_empty() && _aggregator->is_ht_eos();
}
void AggregateDistinctStreamingSourceOperator::set_finished(RuntimeState* state) {
_aggregator->set_finished();
Status AggregateDistinctStreamingSourceOperator::set_finished(RuntimeState* state) {
return _aggregator->set_finished();
}
void AggregateDistinctStreamingSourceOperator::close(RuntimeState* state) {


@ -22,7 +22,7 @@ public:
bool has_output() const override;
bool is_finished() const override;
void set_finished(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -17,7 +17,7 @@ void AggregateStreamingSinkOperator::close(RuntimeState* state) {
Operator::close(state);
}
void AggregateStreamingSinkOperator::set_finishing(RuntimeState* state) {
Status AggregateStreamingSinkOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
if (_aggregator->hash_map_variant().size() == 0) {
@ -25,6 +25,7 @@ void AggregateStreamingSinkOperator::set_finishing(RuntimeState* state) {
}
_aggregator->sink_complete();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> AggregateStreamingSinkOperator::pull_chunk(RuntimeState* state) {


@ -20,7 +20,7 @@ public:
bool has_output() const override { return false; }
bool need_input() const override { return !is_finished(); }
bool is_finished() const override { return _is_finished || _aggregator->is_finished(); }
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
Status prepare(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -43,12 +43,13 @@ bool AggregateStreamingSourceOperator::is_finished() const {
return _is_finished;
}
void AggregateStreamingSourceOperator::set_finishing(RuntimeState* state) {
Status AggregateStreamingSourceOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
return Status::OK();
}
void AggregateStreamingSourceOperator::set_finished(RuntimeState* state) {
_aggregator->set_finished();
Status AggregateStreamingSourceOperator::set_finished(RuntimeState* state) {
return _aggregator->set_finished();
}
void AggregateStreamingSourceOperator::close(RuntimeState* state) {


@ -21,8 +21,8 @@ public:
bool has_output() const override;
bool is_finished() const override;
void set_finishing(RuntimeState* state) override;
void set_finished(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -18,8 +18,9 @@ bool RepeatOperator::is_finished() const {
return _is_finished && _repeat_times_last >= _repeat_times_required;
}
void RepeatOperator::set_finishing(RuntimeState* state) {
Status RepeatOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
return Status::OK();
}
bool RepeatOperator::has_output() const {


@ -43,7 +43,7 @@ public:
return _repeat_times_last >= _repeat_times_required;
}
bool is_finished() const override;
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
Status prepare(RuntimeState* state) override;


@ -2,6 +2,7 @@
#include "analytic_sink_operator.h"
#include "runtime/current_thread.h"
#include "runtime/runtime_state.h"
namespace starrocks::pipeline {
@ -43,11 +44,12 @@ void AnalyticSinkOperator::close(RuntimeState* state) {
Operator::close(state);
}
void AnalyticSinkOperator::set_finishing(RuntimeState* state) {
Status AnalyticSinkOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
_analytor->input_eos() = true;
_process_by_partition_if_necessary();
RETURN_IF_ERROR(_process_by_partition_if_necessary());
_analytor->sink_complete();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> AnalyticSinkOperator::pull_chunk(RuntimeState* state) {
@ -68,21 +70,22 @@ Status AnalyticSinkOperator::push_chunk(RuntimeState* state, const vectorized::C
// For performance, we handle this case specially.
// In the future, if needed, we could easily remove this if-else.
if (j == 0) {
_analytor->append_column(chunk_size, _analytor->agg_intput_columns()[i][j].get(), column);
TRY_CATCH_BAD_ALLOC(
_analytor->append_column(chunk_size, _analytor->agg_intput_columns()[i][j].get(), column));
} else {
_analytor->agg_intput_columns()[i][j]->append(*column, 0, column->size());
TRY_CATCH_BAD_ALLOC(_analytor->agg_intput_columns()[i][j]->append(*column, 0, column->size()));
}
}
}
for (size_t i = 0; i < _analytor->partition_ctxs().size(); i++) {
ColumnPtr column = _analytor->partition_ctxs()[i]->evaluate(chunk.get());
_analytor->append_column(chunk_size, _analytor->partition_columns()[i].get(), column);
TRY_CATCH_BAD_ALLOC(_analytor->append_column(chunk_size, _analytor->partition_columns()[i].get(), column));
}
for (size_t i = 0; i < _analytor->order_ctxs().size(); i++) {
ColumnPtr column = _analytor->order_ctxs()[i]->evaluate(chunk.get());
_analytor->append_column(chunk_size, _analytor->order_columns()[i].get(), column);
TRY_CATCH_BAD_ALLOC(_analytor->append_column(chunk_size, _analytor->order_columns()[i].get(), column));
}
_analytor->input_chunks().emplace_back(std::move(chunk));
@ -92,6 +95,10 @@ Status AnalyticSinkOperator::push_chunk(RuntimeState* state, const vectorized::C
Status AnalyticSinkOperator::_process_by_partition_if_necessary() {
while (_analytor->has_output()) {
if (_analytor->reached_limit()) {
return Status::OK();
}
int64_t found_partition_end = _analytor->find_partition_end();
// Only process after all the data in a partition is reached
if (!_analytor->is_partition_finished(found_partition_end)) {
@ -114,9 +121,6 @@ Status AnalyticSinkOperator::_process_by_partition_if_necessary() {
vectorized::ChunkPtr chunk;
RETURN_IF_ERROR(_analytor->output_result_chunk(&chunk));
_analytor->offer_chunk_to_buffer(chunk);
if (_analytor->reached_limit()) {
return Status::OK();
}
}
}
return Status::OK();
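The `TRY_CATCH_BAD_ALLOC` wrappers above guard allocation-heavy column appends now that `set_finishing()` can report errors. A sketch of the idea with stand-in names (the real macro and Status type are more elaborate):

```cpp
#include <cassert>
#include <new>
#include <string>

// Stand-in Status for illustration only.
struct Status {
    std::string msg;
    bool ok() const { return msg.empty(); }
};

// Convert a std::bad_alloc thrown by `expr` into an error Status that the
// enclosing Status-returning function propagates, instead of letting the
// exception escape and crash the worker.
#define TRY_CATCH_BAD_ALLOC(expr)                     \
    do {                                              \
        try {                                         \
            (expr);                                   \
        } catch (const std::bad_alloc&) {             \
            return Status{"Memory limit exceeded"};   \
        }                                             \
    } while (false)

Status append_column(bool simulate_oom) {
    TRY_CATCH_BAD_ALLOC([&] {
        if (simulate_oom) throw std::bad_alloc();  // models a failed append
    }());
    return Status{};
}
```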


@ -17,8 +17,8 @@ public:
bool has_output() const override { return false; }
bool need_input() const override { return !is_finished(); }
bool is_finished() const override { return _is_finished || _analytor->is_finished(); }
void set_finishing(RuntimeState* state) override;
bool is_finished() const override { return _is_finished || _analytor->reached_limit() || _analytor->is_finished(); }
Status set_finishing(RuntimeState* state) override;
Status prepare(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -12,8 +12,8 @@ bool AnalyticSourceOperator::is_finished() const {
return _analytor->is_sink_complete() && _analytor->is_chunk_buffer_empty();
}
void AnalyticSourceOperator::set_finished(RuntimeState* state) {
_analytor->set_finished();
Status AnalyticSourceOperator::set_finished(RuntimeState* state) {
return _analytor->set_finished();
}
void AnalyticSourceOperator::close(RuntimeState* state) {


@ -19,7 +19,7 @@ public:
bool has_output() const override;
bool is_finished() const override;
void set_finished(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
void close(RuntimeState* state) override;


@ -60,8 +60,9 @@ Status AssertNumRowsOperator::push_chunk(RuntimeState* state, const vectorized::
return Status::OK();
}
void AssertNumRowsOperator::set_finishing(RuntimeState* state) {
Status AssertNumRowsOperator::set_finishing(RuntimeState* state) {
_input_finished = true;
return Status::OK();
}
bool AssertNumRowsOperator::is_finished() const {


@ -39,7 +39,7 @@ public:
bool need_input() const override;
Status push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) override;
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
bool is_finished() const override;
private:


@ -12,12 +12,14 @@
namespace starrocks {
class RuntimeState;
class RuntimeProfile;
namespace pipeline {
class ChunkSource {
public:
ChunkSource(MorselPtr&& morsel) : _morsel(std::move(morsel)) {}
ChunkSource(RuntimeProfile* runtime_profile, MorselPtr&& morsel)
: _runtime_profile(runtime_profile), _morsel(std::move(morsel)){};
virtual ~ChunkSource() = default;
@ -40,8 +42,10 @@ public:
virtual Status buffer_next_batch_chunks_blocking_for_workgroup(size_t chunk_size, bool& can_finish,
size_t* num_read_chunks, int worker_id,
workgroup::WorkGroupPtr running_wg) = 0;
virtual int64_t last_spent_cpu_time_ns() { return 0; }
protected:
RuntimeProfile* _runtime_profile;
// The morsel will own by pipeline driver
MorselPtr _morsel;
};


@ -48,7 +48,10 @@ public:
}
// When the output operator is finished, the context can be finished regardless of other running operators.
void set_finished() { _is_finished.store(true, std::memory_order_release); }
Status set_finished() {
_is_finished.store(true, std::memory_order_release);
return Status::OK();
}
// Predicate whether the context is finished, which is used to notify
// non-output operators to be finished early.
@ -59,4 +62,4 @@ protected:
std::atomic<bool> _is_finished = false;
};
} // namespace starrocks::pipeline
} // namespace starrocks::pipeline


@ -67,9 +67,15 @@ public:
return _is_finished && _is_curr_probe_chunk_finished();
}
void set_finishing(RuntimeState* state) override { _is_finished = true; }
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
return Status::OK();
}
void set_finished(RuntimeState* state) override { _cross_join_context->set_finished(); }
Status set_finished(RuntimeState* state) override {
_cross_join_context->set_finished();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -4,6 +4,7 @@
#include "column/chunk.h"
#include "column/column_helper.h"
#include "runtime/current_thread.h"
using namespace starrocks::vectorized;
@ -22,9 +23,9 @@ Status CrossJoinRightSinkOperator::push_chunk(RuntimeState* state, const vectori
// merge chunks from right table.
size_t col_number = chunk->num_columns();
for (size_t col = 0; col < col_number; ++col) {
_cross_join_context->get_build_chunk(_driver_sequence)
->get_column_by_index(col)
->append(*(chunk->get_column_by_index(col).get()), 0, row_number);
TRY_CATCH_BAD_ALLOC(_cross_join_context->get_build_chunk(_driver_sequence)
->get_column_by_index(col)
->append(*(chunk->get_column_by_index(col).get()), 0, row_number));
}
}
}


@ -34,10 +34,11 @@ public:
bool is_finished() const override { return _is_finished || _cross_join_context->is_finished(); }
void set_finishing(RuntimeState* state) override {
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
// Used to notify cross_join_left_operator.
_cross_join_context->finish_one_right_sinker();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -8,7 +8,7 @@
namespace starrocks::pipeline {
Status DictDecodeOperator::prepare(RuntimeState* state) {
Operator::prepare(state);
RETURN_IF_ERROR(Operator::prepare(state));
return Status::OK();
}


@ -41,7 +41,10 @@ public:
bool is_finished() const override { return _is_finished && _cur_chunk == nullptr; }
void set_finishing(RuntimeState* state) override { _is_finished = true; }
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -38,9 +38,10 @@ bool ExchangeMergeSortSourceOperator::is_finished() const {
}
}
void ExchangeMergeSortSourceOperator::set_finishing(RuntimeState* state) {
Status ExchangeMergeSortSourceOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
return _stream_recvr->close();
_stream_recvr->close();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> ExchangeMergeSortSourceOperator::pull_chunk(RuntimeState* state) {


@ -36,7 +36,7 @@ public:
bool is_finished() const override;
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -41,13 +41,12 @@ public:
// how much tuple data is getting accumulated before being sent; it only applies
// when data is added via add_row() and not sent directly via send_batch().
Channel(ExchangeSinkOperator* parent, const TNetworkAddress& brpc_dest, const TUniqueId& fragment_instance_id,
PlanNodeId dest_node_id, int32_t num_shuffles, int32_t channel_id, bool enable_exchange_pass_through,
PlanNodeId dest_node_id, int32_t num_shuffles, bool enable_exchange_pass_through,
PassThroughChunkBuffer* pass_through_chunk_buffer)
: _parent(parent),
_brpc_dest_addr(brpc_dest),
_fragment_instance_id(fragment_instance_id),
_dest_node_id(dest_node_id),
_channel_id(channel_id),
_enable_exchange_pass_through(enable_exchange_pass_through),
_pass_through_context(pass_through_chunk_buffer, fragment_instance_id, dest_node_id),
_chunks(num_shuffles) {}
@ -102,7 +101,6 @@ private:
const TNetworkAddress _brpc_dest_addr;
const TUniqueId _fragment_instance_id;
const PlanNodeId _dest_node_id;
const int32_t _channel_id;
const bool _enable_exchange_pass_through;
PassThroughContext _pass_through_context;
@ -116,7 +114,7 @@ private:
// always be 1
std::vector<std::unique_ptr<vectorized::Chunk>> _chunks;
PTransmitChunkParamsPtr _chunk_request;
size_t _current_request_bytes;
size_t _current_request_bytes = 0;
bool _is_inited = false;
bool _use_pass_through = false;
@ -302,10 +300,8 @@ ExchangeSinkOperator::ExchangeSinkOperator(OperatorFactory* factory, int32_t id,
if (fragment_instance_id.lo == -1 && pseudo_channel.has_value()) {
_channels.emplace_back(pseudo_channel.value());
} else {
const auto channel_id = _channels.size();
_channels.emplace_back(new Channel(this, destination.brpc_server, fragment_instance_id, dest_node_id,
_num_shuffles, channel_id, enable_exchange_pass_through,
pass_through_chunk_buffer));
_num_shuffles, enable_exchange_pass_through, pass_through_chunk_buffer));
if (fragment_instance_id.lo == -1) {
pseudo_channel = _channels.back();
}
@ -342,7 +338,7 @@ Status ExchangeSinkOperator::prepare(RuntimeState* state) {
_unique_metrics->add_info_string("PartType", _TPartitionType_VALUES_TO_NAMES.at(_part_type));
if (_part_type == TPartitionType::HASH_PARTITIONED ||
_part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) {
_part_type == TPartitionType::BUCKET_SHUFFLE_HASH_PARTITIONED) {
_partitions_columns.resize(_partition_expr_ctxs.size());
}
@ -352,21 +348,12 @@ Status ExchangeSinkOperator::prepare(RuntimeState* state) {
srand(reinterpret_cast<uint64_t>(this));
std::shuffle(_channel_indices.begin(), _channel_indices.end(), std::mt19937(std::random_device()()));
_bytes_sent_counter = ADD_COUNTER(_unique_metrics, "BytesSent", TUnit::BYTES);
_bytes_pass_through_counter = ADD_COUNTER(_unique_metrics, "BytesPassThrough", TUnit::BYTES);
_uncompressed_bytes_counter = ADD_COUNTER(_unique_metrics, "UncompressedBytes", TUnit::BYTES);
_ignore_rows = ADD_COUNTER(_unique_metrics, "IgnoreRows", TUnit::UNIT);
_serialize_batch_timer = ADD_TIMER(_unique_metrics, "SerializeBatchTime");
_shuffle_hash_timer = ADD_TIMER(_unique_metrics, "ShuffleHashTimer");
_serialize_chunk_timer = ADD_TIMER(_unique_metrics, "SerializeChunkTime");
_shuffle_hash_timer = ADD_TIMER(_unique_metrics, "ShuffleHashTime");
_compress_timer = ADD_TIMER(_unique_metrics, "CompressTime");
_send_request_timer = ADD_TIMER(_unique_metrics, "SendRequestTime");
_wait_response_timer = ADD_TIMER(_unique_metrics, "WaitResponseTime");
_overall_throughput = _unique_metrics->add_derived_counter(
"OverallThroughput", TUnit::BYTES_PER_SECOND,
[capture0 = _bytes_sent_counter, capture1 = _total_timer] {
return RuntimeProfile::units_per_second(capture0, capture1);
},
"");
for (auto& _channel : _channels) {
RETURN_IF_ERROR(_channel->init(state));
}
@ -390,8 +377,9 @@ bool ExchangeSinkOperator::pending_finish() const {
return !_buffer->is_finished();
}
void ExchangeSinkOperator::set_cancelled(RuntimeState* state) {
Status ExchangeSinkOperator::set_cancelled(RuntimeState* state) {
_buffer->cancel_one_sinker();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> ExchangeSinkOperator::pull_chunk(RuntimeState* state) {
@ -452,7 +440,7 @@ Status ExchangeSinkOperator::push_chunk(RuntimeState* state, const vectorized::C
_curr_random_channel_idx = (_curr_random_channel_idx + 1) % _channels.size();
}
} else if (_part_type == TPartitionType::HASH_PARTITIONED ||
_part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) {
_part_type == TPartitionType::BUCKET_SHUFFLE_HASH_PARTITIONED) {
// hash-partition batch's rows across channels
const auto num_channels = _channels.size();
{
@ -482,7 +470,8 @@ Status ExchangeSinkOperator::push_chunk(RuntimeState* state, const vectorized::C
for (size_t i = 0; i < num_rows; ++i) {
auto channel_id = _hash_values[i] % num_channels;
auto driver_sequence = _hash_values[i] % _num_shuffles;
// Note that xorshift32 rehash must be applied for both local shuffle and exchange sink here.
auto driver_sequence = HashUtil::xorshift32(_hash_values[i]) % _num_shuffles;
_channel_ids[i] = channel_id;
_driver_sequences[i] = driver_sequence;
_channel_row_idx_start_points[channel_id * _num_shuffles + driver_sequence]++;
@ -522,7 +511,7 @@ Status ExchangeSinkOperator::push_chunk(RuntimeState* state, const vectorized::C
return Status::OK();
}
void ExchangeSinkOperator::set_finishing(RuntimeState* state) {
Status ExchangeSinkOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
if (_chunk_request != nullptr) {
@ -539,9 +528,13 @@ void ExchangeSinkOperator::set_finishing(RuntimeState* state) {
for (auto& _channel : _channels) {
_channel->close(state, _fragment_ctx);
}
_buffer->set_finishing();
return Status::OK();
}
void ExchangeSinkOperator::close(RuntimeState* state) {
_buffer->update_profile(_unique_metrics.get());
Operator::close(state);
}
@ -549,7 +542,7 @@ Status ExchangeSinkOperator::serialize_chunk(const vectorized::Chunk* src, Chunk
int num_receivers) {
VLOG_ROW << "[ExchangeSinkOperator] serializing " << src->num_rows() << " rows";
{
SCOPED_TIMER(_serialize_batch_timer);
SCOPED_TIMER(_serialize_chunk_timer);
// We only serialize the chunk meta for the first chunk
if (*is_first_chunk) {
StatusOr<ChunkPB> res = serde::ProtobufChunkSerde::serialize(*src);
@ -567,8 +560,8 @@ Status ExchangeSinkOperator::serialize_chunk(const vectorized::Chunk* src, Chunk
const size_t uncompressed_size = dst->uncompressed_size();
if (_compress_codec != nullptr && _compress_codec->exceed_max_input_size(uncompressed_size)) {
return Status::InternalError("The input size for compression should be less than " +
_compress_codec->max_input_size());
return Status::InternalError(strings::Substitute("The input size for compression should be less than $0",
_compress_codec->max_input_size()));
}
// try compress the ChunkPB data
@ -596,7 +589,6 @@ Status ExchangeSinkOperator::serialize_chunk(const vectorized::Chunk* src, Chunk
size_t chunk_size = dst->data().size();
VLOG_ROW << "chunk data size " << chunk_size;
COUNTER_UPDATE(_bytes_sent_counter, chunk_size * num_receivers);
COUNTER_UPDATE(_uncompressed_bytes_counter, uncompressed_size * num_receivers);
return Status::OK();
}
@ -637,7 +629,7 @@ Status ExchangeSinkOperatorFactory::prepare(RuntimeState* state) {
RETURN_IF_ERROR(OperatorFactory::prepare(state));
if (_part_type == TPartitionType::HASH_PARTITIONED ||
_part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) {
_part_type == TPartitionType::BUCKET_SHUFFLE_HASH_PARTITIONED) {
RETURN_IF_ERROR(Expr::prepare(_partition_expr_ctxs, state));
RETURN_IF_ERROR(Expr::open(_partition_expr_ctxs, state));
}


@ -51,9 +51,9 @@ public:
bool pending_finish() const override;
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
void set_cancelled(RuntimeState* state) override;
Status set_cancelled(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;
@ -115,18 +115,11 @@ private:
CompressionTypePB _compress_type = CompressionTypePB::NO_COMPRESSION;
const BlockCompressionCodec* _compress_codec = nullptr;
RuntimeProfile::Counter* _serialize_batch_timer = nullptr;
RuntimeProfile::Counter* _serialize_chunk_timer = nullptr;
RuntimeProfile::Counter* _shuffle_hash_timer = nullptr;
RuntimeProfile::Counter* _compress_timer = nullptr;
RuntimeProfile::Counter* _bytes_sent_counter = nullptr;
RuntimeProfile::Counter* _bytes_pass_through_counter = nullptr;
RuntimeProfile::Counter* _uncompressed_bytes_counter = nullptr;
RuntimeProfile::Counter* _ignore_rows = nullptr;
RuntimeProfile::Counter* _send_request_timer = nullptr;
RuntimeProfile::Counter* _wait_response_timer = nullptr;
// Throughput per total time spent in sender
RuntimeProfile::Counter* _overall_throughput = nullptr;
std::atomic<bool> _is_finished = false;
std::atomic<bool> _is_cancelled = false;


@ -24,10 +24,11 @@ bool ExchangeSourceOperator::is_finished() const {
return _stream_recvr->is_finished();
}
void ExchangeSourceOperator::set_finishing(RuntimeState* state) {
Status ExchangeSourceOperator::set_finishing(RuntimeState* state) {
_is_finishing = true;
_stream_recvr->short_circuit_for_pipeline(_driver_sequence);
static_cast<ExchangeSourceOperatorFactory*>(_factory)->close_stream_recvr();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> ExchangeSourceOperator::pull_chunk(RuntimeState* state) {


@ -23,7 +23,7 @@ public:
bool is_finished() const override;
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -32,6 +32,17 @@ Status PartitionExchanger::Partitioner::partition_chunk(const vectorized::ChunkP
}
}
// When the local exchange is the successor of ExchangeSourceOperator,
// the data flow is `exchange sink -> exchange source -> local exchange`.
// `exchange sink -> exchange source` (phase 1) determines which fragment instance to deliver each row,
// while `exchange source -> local exchange` (phase 2) determines which pipeline driver to deliver each row.
// To avoid hashing twice, and the data skew that would result from hashing only once,
// phase 1 hashes once and phase 2 applies xorshift32 to the hash value.
// Note that xorshift32 rehash must be applied for both local shuffle here and exchange sink.
for (int32_t i = 0; i < num_rows; ++i) {
_hash_values[i] = HashUtil::xorshift32(_hash_values[i]);
}
// Compute row indexes for each channel.
_partition_row_indexes_start_points.assign(num_partitions + 1, 0);
for (int32_t i = 0; i < num_rows; ++i) {


@ -20,9 +20,10 @@ StatusOr<vectorized::ChunkPtr> LocalExchangeSinkOperator::pull_chunk(RuntimeStat
return Status::InternalError("Shouldn't call pull_chunk from local exchange sink.");
}
void LocalExchangeSinkOperator::set_finishing(RuntimeState* state) {
Status LocalExchangeSinkOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
_exchanger->finish(state);
return Status::OK();
}
Status LocalExchangeSinkOperator::push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) {


@ -32,7 +32,7 @@ public:
// In either case, LocalExchangeSinkOperator is finished.
bool is_finished() const override { return _is_finished || _exchanger->is_all_sources_finished(); }
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -49,7 +49,7 @@ bool LocalExchangeSourceOperator::has_output() const {
(_is_finished && _partition_rows_num > 0);
}
void LocalExchangeSourceOperator::set_finished(RuntimeState* state) {
Status LocalExchangeSourceOperator::set_finished(RuntimeState* state) {
std::lock_guard<std::mutex> l(_chunk_lock);
_is_finished = true;
// Compute the number of rows in the _full_chunk_queue.
@ -66,6 +66,7 @@ void LocalExchangeSourceOperator::set_finished(RuntimeState* state) {
// Subtract the number of rows of buffered chunks from row_count of _memory_manager and make it unblocked.
_memory_manager->update_row_count(-(full_rows_num + _partition_rows_num));
_partition_rows_num = 0;
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> LocalExchangeSourceOperator::pull_chunk(RuntimeState* state) {


@ -41,10 +41,11 @@ public:
bool is_finished() const override;
void set_finished(RuntimeState* state) override;
void set_finishing(RuntimeState* state) override {
Status set_finished(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override {
std::lock_guard<std::mutex> l(_chunk_lock);
_is_finished = true;
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -182,11 +182,12 @@ Status MultiCastLocalExchangeSourceOperator::prepare(RuntimeState* state) {
return Status::OK();
}
void MultiCastLocalExchangeSourceOperator::set_finishing(RuntimeState* state) {
Status MultiCastLocalExchangeSourceOperator::set_finishing(RuntimeState* state) {
if (!_is_finished) {
_is_finished = true;
_exchanger->close_source_operator(_mcast_consumer_index);
}
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> MultiCastLocalExchangeSourceOperator::pull_chunk(RuntimeState* state) {
@ -215,11 +216,12 @@ bool MultiCastLocalExchangeSinkOperator::need_input() const {
return _exchanger->can_push_chunk();
}
void MultiCastLocalExchangeSinkOperator::set_finishing(RuntimeState* state) {
Status MultiCastLocalExchangeSinkOperator::set_finishing(RuntimeState* state) {
if (!_is_finished) {
_is_finished = true;
_exchanger->close_sink_operator();
}
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> MultiCastLocalExchangeSinkOperator::pull_chunk(RuntimeState* state) {


@ -90,7 +90,7 @@ public:
bool is_finished() const override { return _is_finished; }
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;
@ -139,7 +139,7 @@ public:
bool is_finished() const override { return _is_finished; }
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -2,6 +2,10 @@
#include "exec/pipeline/exchange/sink_buffer.h"
#include <chrono>
#include "util/time.h"
namespace starrocks::pipeline {
SinkBuffer::SinkBuffer(FragmentContext* fragment_ctx, const std::vector<TPlanFragmentDestination>& destinations,
@ -31,6 +35,7 @@ SinkBuffer::SinkBuffer(FragmentContext* fragment_ctx, const std::vector<TPlanFra
_buffers[instance_id.lo] = std::queue<TransmitChunkInfo, std::list<TransmitChunkInfo>>();
_num_finished_rpcs[instance_id.lo] = 0;
_num_in_flight_rpcs[instance_id.lo] = 0;
_network_times[instance_id.lo] = TimeTrace{};
_mutexes[instance_id.lo] = std::make_unique<std::mutex>();
PUniqueId finst_id;
@ -64,7 +69,7 @@ SinkBuffer::~SinkBuffer() {
}
}
void SinkBuffer::add_request(const TransmitChunkInfo& request) {
void SinkBuffer::add_request(TransmitChunkInfo& request) {
DCHECK(_num_remaining_eos > 0);
if (_is_finishing) {
// Once the request is added to SinkBuffer, its ownership will also be transferred,
@ -72,6 +77,10 @@ void SinkBuffer::add_request(const TransmitChunkInfo& request) {
request.params->release_finst_id();
return;
}
if (!request.attachment.empty()) {
_bytes_enqueued += request.attachment.size();
_request_enqueued++;
}
{
auto& instance_id = request.fragment_instance_id;
std::lock_guard<std::mutex> l(*_mutexes[instance_id.lo]);
@ -89,7 +98,21 @@ bool SinkBuffer::is_full() const {
for (auto& [_, buffer] : _buffers) {
buffer_size += buffer.size();
}
return buffer_size > max_buffer_size;
bool is_full = buffer_size > max_buffer_size;
if (is_full && _last_full_timestamp == -1) {
_last_full_timestamp = MonotonicNanos();
}
if (!is_full && _last_full_timestamp != -1) {
_full_time += (MonotonicNanos() - _last_full_timestamp);
_last_full_timestamp = -1;
}
return is_full;
}
void SinkBuffer::set_finishing() {
_pending_timestamp = MonotonicNanos();
}
bool SinkBuffer::is_finished() const {
@ -100,14 +123,68 @@ bool SinkBuffer::is_finished() const {
return _num_sending_rpc == 0 && _total_in_flight_rpc == 0;
}
// When all the ExchangeSinkOperators sharing this SinkBuffer are cancelled,
// the remaining chunk requests and EOS requests needn't be sent anymore.
void SinkBuffer::update_profile(RuntimeProfile* profile) {
bool flag = false;
if (!_is_profile_updated.compare_exchange_strong(flag, true)) {
return;
}
auto* network_timer = ADD_TIMER(profile, "NetworkTime");
auto* wait_timer = ADD_TIMER(profile, "WaitTime");
COUNTER_SET(network_timer, _network_time());
// WaitTime consists of two parts
// 1. buffer full time
// 2. pending finish time
COUNTER_UPDATE(wait_timer, _full_time);
COUNTER_UPDATE(wait_timer, MonotonicNanos() - _pending_timestamp);
auto* bytes_sent_counter = ADD_COUNTER(profile, "BytesSent", TUnit::BYTES);
auto* request_sent_counter = ADD_COUNTER(profile, "RequestSent", TUnit::UNIT);
COUNTER_SET(bytes_sent_counter, _bytes_sent);
COUNTER_SET(request_sent_counter, _request_sent);
if (_bytes_enqueued - _bytes_sent > 0) {
auto* bytes_unsent_counter = ADD_COUNTER(profile, "BytesUnsent", TUnit::BYTES);
auto* request_unsent_counter = ADD_COUNTER(profile, "RequestUnsent", TUnit::UNIT);
COUNTER_SET(bytes_unsent_counter, _bytes_enqueued - _bytes_sent);
COUNTER_SET(request_unsent_counter, _request_enqueued - _request_sent);
}
profile->add_derived_counter(
"OverallThroughput", TUnit::BYTES_PER_SECOND,
[bytes_sent_counter, network_timer] {
return RuntimeProfile::units_per_second(bytes_sent_counter, network_timer);
},
"");
}
int64_t SinkBuffer::_network_time() {
int64_t max = 0;
for (auto& [_, time_trace] : _network_times) {
double average_concurrency =
static_cast<double>(time_trace.accumulated_concurrency) / std::max(1, time_trace.times);
int64_t average_accumulated_time =
static_cast<int64_t>(time_trace.accumulated_time / std::max(1.0, average_concurrency));
if (average_accumulated_time > max) {
max = average_accumulated_time;
}
}
return max;
}
void SinkBuffer::cancel_one_sinker() {
if (--_num_uncancelled_sinkers == 0) {
_is_finishing = true;
}
}
void SinkBuffer::_update_network_time(const TUniqueId& instance_id, const int64_t send_timestamp,
const int64_t receive_timestamp) {
int32_t concurrency = _num_in_flight_rpcs[instance_id.lo];
_network_times[instance_id.lo].update(receive_timestamp - send_timestamp, concurrency);
}
void SinkBuffer::_process_send_window(const TUniqueId& instance_id, const int64_t sequence) {
// Both sender side and receiver side can tolerate disorder of transmission
// if receiver side is not ExchangeMergeSortSourceOperator
@ -202,8 +279,14 @@ void SinkBuffer::_try_to_send_rpc(const TUniqueId& instance_id) {
request.params->set_allocated_finst_id(&_instance_id2finst_id[instance_id.lo]);
request.params->set_sequence(_request_seqs[instance_id.lo]++);
auto* closure =
new DisposableClosure<PTransmitChunkResult, ClosureContext>({instance_id, request.params->sequence()});
if (!request.attachment.empty()) {
_bytes_sent += request.attachment.size();
_request_sent++;
}
auto* closure = new DisposableClosure<PTransmitChunkResult, ClosureContext>(
{instance_id, request.params->sequence(), GetCurrentTimeNanos()});
closure->addFailedHandler([this](const ClosureContext& ctx) noexcept {
_is_finishing = true;
{
@ -229,6 +312,7 @@ void SinkBuffer::_try_to_send_rpc(const TUniqueId& instance_id) {
} else {
std::lock_guard<std::mutex> l(*_mutexes[ctx.instance_id.lo]);
_process_send_window(ctx.instance_id, ctx.sequence);
_update_network_time(ctx.instance_id, ctx.send_timestamp, result.receive_timestamp());
_try_to_send_rpc(ctx.instance_id);
}
--_total_in_flight_rpc;


@ -24,7 +24,7 @@ namespace starrocks::pipeline {
using PTransmitChunkParamsPtr = std::shared_ptr<PTransmitChunkParams>;
struct TransmitChunkInfo {
// For BUCKET_SHFFULE_HASH_PARTITIONED, multiple channels may be related to
// For BUCKET_SHUFFLE_HASH_PARTITIONED, multiple channels may be related to
// the same exchange source fragment instance, so we should use the fragment_instance_id
// of the destination as the key instead of channel_id.
TUniqueId fragment_instance_id;
@ -36,6 +36,27 @@ struct TransmitChunkInfo {
struct ClosureContext {
TUniqueId instance_id;
int64_t sequence;
int64_t send_timestamp;
};
// TimeTrace is introduced to estimate time more accurately.
// For every update:
// 1. times is increased by 1.
// 2. the sample time is accumulated into accumulated_time.
// 3. the sample concurrency is accumulated into accumulated_concurrency.
// So we can get the average time of each destination by
// `average_concurrency = accumulated_concurrency / times`
// `average_time = accumulated_time / average_concurrency`
struct TimeTrace {
int32_t times = 0;
int64_t accumulated_time = 0;
int32_t accumulated_concurrency = 0;
void update(int64_t time, int32_t concurrency) {
times++;
accumulated_time += time;
accumulated_concurrency += concurrency;
}
};
// TODO(hcf) how to export brpc error
@ -45,15 +66,22 @@ public:
bool is_dest_merge, size_t num_sinkers);
~SinkBuffer();
void add_request(const TransmitChunkInfo& request);
void add_request(TransmitChunkInfo& request);
bool is_full() const;
void set_finishing();
bool is_finished() const;
// Add counters to the given profile
void update_profile(RuntimeProfile* profile);
// When all the ExchangeSinkOperators sharing this SinkBuffer are cancelled,
// the remaining chunk requests and EOS requests needn't be sent anymore.
void cancel_one_sinker();
private:
void _update_network_time(const TUniqueId& instance_id, const int64_t send_timestamp,
const int64_t receive_timestamp);
// Update the discontinuous acked window. Invariants:
// all acks with sequence in [0, _max_continuous_acked_seqs[x]] have been received;
// not all acks with sequence in [_max_continuous_acked_seqs[x]+1, _request_seqs[x]] have been received.
@ -61,6 +89,14 @@ private:
void _process_send_window(const TUniqueId& instance_id, const int64_t sequence);
void _try_to_send_rpc(const TUniqueId& instance_id);
// Roughly estimate the network time, defined as the time between sending and receiving a packet,
// with the processing time of both sides excluded.
// For each destination, we may send multiple packets at the same time, and the time is
// related to the degree of concurrency, so the network time is calculated as
// `accumulated_network_time / average_concurrency`,
// and we just pick the maximum among all destinations.
int64_t _network_time();
FragmentContext* _fragment_ctx;
const MemTracker* _mem_tracker;
const int32_t _brpc_timeout_ms;
@ -91,6 +127,7 @@ private:
phmap::flat_hash_map<int64_t, std::queue<TransmitChunkInfo, std::list<TransmitChunkInfo>>> _buffers;
phmap::flat_hash_map<int64_t, int32_t> _num_finished_rpcs;
phmap::flat_hash_map<int64_t, int32_t> _num_in_flight_rpcs;
phmap::flat_hash_map<int64_t, TimeTrace> _network_times;
phmap::flat_hash_map<int64_t, std::unique_ptr<std::mutex>> _mutexes;
// True means that SinkBuffer needn't input chunk and send chunk anymore,
@ -106,6 +143,16 @@ private:
std::atomic<bool> _is_finishing = false;
std::atomic<int32_t> _num_sending_rpc = 0;
// RuntimeProfile counters
std::atomic_bool _is_profile_updated = false;
std::atomic<int64_t> _bytes_enqueued = 0;
std::atomic<int64_t> _request_enqueued = 0;
std::atomic<int64_t> _bytes_sent = 0;
std::atomic<int64_t> _request_sent = 0;
int64_t _pending_timestamp = -1;
mutable int64_t _last_full_timestamp = -1;
mutable int64_t _full_time = 0;
};
} // namespace starrocks::pipeline


@ -53,8 +53,8 @@ public:
RuntimeState* runtime_state() const { return _runtime_state.get(); }
std::shared_ptr<RuntimeState> runtime_state_ptr() { return _runtime_state; }
void set_runtime_state(std::shared_ptr<RuntimeState>&& runtime_state) { _runtime_state = std::move(runtime_state); }
ExecNode* plan() const { return _plan; }
void set_plan(ExecNode* plan) { _plan = plan; }
ExecNode*& plan() { return _plan; }
Pipelines& pipelines() { return _pipelines; }
void set_pipelines(Pipelines&& pipelines) { _pipelines = std::move(pipelines); }
Drivers& drivers() { return _drivers; }


@ -41,8 +41,10 @@ static void setup_profile_hierarchy(RuntimeState* runtime_state, const PipelineP
static void setup_profile_hierarchy(const PipelinePtr& pipeline, const DriverPtr& driver) {
pipeline->runtime_profile()->add_child(driver->runtime_profile(), true, nullptr);
auto* counter = pipeline->runtime_profile()->add_counter("DegreeOfParallelism", TUnit::UNIT);
counter->set(static_cast<int64_t>(pipeline->source_operator_factory()->degree_of_parallelism()));
auto* dop_counter = ADD_COUNTER(pipeline->runtime_profile(), "DegreeOfParallelism", TUnit::UNIT);
COUNTER_SET(dop_counter, static_cast<int64_t>(pipeline->source_operator_factory()->degree_of_parallelism()));
auto* total_dop_counter = ADD_COUNTER(pipeline->runtime_profile(), "TotalDegreeOfParallelism", TUnit::UNIT);
COUNTER_SET(total_dop_counter, dop_counter->value());
auto& operators = driver->operators();
for (int32_t i = operators.size() - 1; i >= 0; --i) {
auto& curr_op = operators[i];
@ -145,8 +147,11 @@ Status FragmentExecutor::prepare(ExecEnv* exec_env, const TExecPlanFragmentParam
std::make_unique<RuntimeState>(query_id, fragment_instance_id, query_options, query_globals, exec_env));
}
auto* runtime_state = _fragment_ctx->runtime_state();
int func_version = request.__isset.func_version ? request.func_version : 2;
runtime_state->set_func_version(func_version);
runtime_state->init_mem_trackers(query_id, wg != nullptr ? wg->mem_tracker() : nullptr);
runtime_state->set_be_number(backend_num);
runtime_state->set_query_ctx(_query_ctx);
// RuntimeFilterWorker::open_query is idempotent
if (params.__isset.runtime_filter_params && params.runtime_filter_params.id_to_prober_params.size() != 0) {
@ -175,13 +180,12 @@ Status FragmentExecutor::prepare(ExecEnv* exec_env, const TExecPlanFragmentParam
}
runtime_state->set_desc_tbl(desc_tbl);
// Set up plan
ExecNode* plan = nullptr;
RETURN_IF_ERROR(ExecNode::create_tree(runtime_state, obj_pool, fragment.plan, *desc_tbl, &plan));
RETURN_IF_ERROR(ExecNode::create_tree(runtime_state, obj_pool, fragment.plan, *desc_tbl, &_fragment_ctx->plan()));
ExecNode* plan = _fragment_ctx->plan();
plan->push_down_join_runtime_filter_recursively(runtime_state);
std::vector<TupleSlotMapping> empty_mappings;
plan->push_down_tuple_slot_mappings(runtime_state, empty_mappings);
runtime_state->set_fragment_root_id(plan->id());
_fragment_ctx->set_plan(plan);
// Set up global dict
if (request.fragment.__isset.query_global_dicts) {
@ -236,8 +240,8 @@ Status FragmentExecutor::prepare(ExecEnv* exec_env, const TExecPlanFragmentParam
const auto& pipeline = pipelines[n];
// The DOP (degree of parallelism) of a Pipeline's SourceOperator determines the Pipeline's DOP.
const auto degree_of_parallelism = pipeline->source_operator_factory()->degree_of_parallelism();
LOG(INFO) << "Pipeline " << pipeline->to_readable_string() << " parallel=" << degree_of_parallelism
<< " fragment_instance_id=" << print_id(params.fragment_instance_id);
VLOG_ROW << "Pipeline " << pipeline->to_readable_string() << " parallel=" << degree_of_parallelism
<< " fragment_instance_id=" << print_id(params.fragment_instance_id);
const bool is_root = pipeline->is_root();
// If the pipeline's SourceOperator works with morsels, a MorselQueue is added to the SourceOperator.
// At present, only OlapScanOperator needs a MorselQueue attached.
@ -339,9 +343,14 @@ void FragmentExecutor::_decompose_data_sink_to_operator(RuntimeState* runtime_st
bool is_pipeline_level_shuffle = false;
int32_t dest_dop = -1;
if (sender->get_partition_type() == TPartitionType::HASH_PARTITIONED ||
sender->get_partition_type() == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) {
is_pipeline_level_shuffle = true;
sender->get_partition_type() == TPartitionType::BUCKET_SHUFFLE_HASH_PARTITIONED) {
dest_dop = t_stream_sink.dest_dop;
// UNPARTITIONED mode is used when both the number of destinations and dest dop are 1,
// so we only enable pipeline-level shuffle when either of them is greater than 1.
if (sender->destinations().size() > 1 || dest_dop > 1) {
is_pipeline_level_shuffle = true;
}
DCHECK_GT(dest_dop, 0);
}
@ -406,9 +415,10 @@ void FragmentExecutor::_decompose_data_sink_to_operator(RuntimeState* runtime_st
// sink op
auto sink_buffer = std::make_shared<SinkBuffer>(_fragment_ctx, sender->destinations(), is_dest_merge, dop);
auto sink_op = std::make_shared<ExchangeSinkOperatorFactory>(
context->next_operator_id(), -1, sink_buffer, sender->get_partition_type(), sender->destinations(),
is_pipeline_level_shuffle, dest_dop, sender->sender_id(), sender->get_dest_node_id(),
sender->get_partition_exprs(), sender->get_enable_exchange_pass_through(), _fragment_ctx);
context->next_operator_id(), t_stream_sink.dest_node_id, sink_buffer, sender->get_partition_type(),
sender->destinations(), is_pipeline_level_shuffle, dest_dop, sender->sender_id(),
sender->get_dest_node_id(), sender->get_partition_exprs(),
sender->get_enable_exchange_pass_through(), _fragment_ctx);
ops.emplace_back(source_op);
ops.emplace_back(sink_op);


@ -50,26 +50,32 @@ StatusOr<vectorized::ChunkPtr> HashJoinBuildOperator::pull_chunk(RuntimeState* s
return Status::NotSupported(msg);
}
void HashJoinBuildOperator::set_finishing(RuntimeState* state) {
Status HashJoinBuildOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
_join_builder->build_ht(state);
RETURN_IF_ERROR(_join_builder->build_ht(state));
size_t merger_index = _driver_sequence;
// Broadcast Join only has one build operator.
DCHECK(_distribution_mode != TJoinDistributionMode::BROADCAST || _driver_sequence == 0);
_join_builder->create_runtime_filters(state);
RETURN_IF_ERROR(_join_builder->create_runtime_filters(state));
auto ht_row_count = _join_builder->get_ht_row_count();
auto& partial_in_filters = _join_builder->get_runtime_in_filters();
auto& partial_bloom_filter_build_params = _join_builder->get_runtime_bloom_filter_build_params();
auto& partial_bloom_filters = _join_builder->get_runtime_bloom_filters();
// Retain string-typed key columns to avoid premature deallocation when both probe-side and build-side
// PipelineDrivers finalize before the in-filters are merged.
((HashJoinBuildOperatorFactory*)_factory)
->retain_string_key_columns(_driver_sequence, _join_builder->string_key_columns());
// add partial filters generated by this HashJoinBuildOperator to PartialRuntimeFilterMerger to merge into a
// total one.
auto status = _partial_rf_merger->add_partial_filters(merger_index, ht_row_count, std::move(partial_in_filters),
std::move(partial_bloom_filter_build_params),
std::move(partial_bloom_filters));
if (status.ok() && status.value()) {
if (!status.ok()) {
return status.status();
} else if (status.value()) {
auto&& in_filters = _partial_rf_merger->get_total_in_filters();
auto&& bloom_filters = _partial_rf_merger->get_total_bloom_filters();
@ -87,6 +93,7 @@ void HashJoinBuildOperator::set_finishing(RuntimeState* state) {
for (auto& read_only_join_prober : _read_only_join_probers) {
read_only_join_prober->enter_probe_phase();
}
return Status::OK();
}
HashJoinBuildOperatorFactory::HashJoinBuildOperatorFactory(
@ -109,11 +116,17 @@ void HashJoinBuildOperatorFactory::close(RuntimeState* state) {
}
OperatorPtr HashJoinBuildOperatorFactory::create(int32_t degree_of_parallelism, int32_t driver_sequence) {
if (_string_key_columns.empty()) {
_string_key_columns.resize(degree_of_parallelism);
}
return std::make_shared<HashJoinBuildOperator>(this, _id, _name, _plan_node_id,
_hash_joiner_factory->create_builder(driver_sequence),
_hash_joiner_factory->get_read_only_probers(), driver_sequence,
_partial_rf_merger.get(), _distribution_mode);
}
void HashJoinBuildOperatorFactory::retain_string_key_columns(int32_t driver_sequence, vectorized::Columns&& columns) {
_string_key_columns[driver_sequence] = std::move(columns);
}
} // namespace pipeline
} // namespace starrocks


@ -35,7 +35,7 @@ public:
}
bool need_input() const override { return !is_finished(); }
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
bool is_finished() const override { return _is_finished || _join_builder->is_finished(); }
Status push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) override;
@ -66,11 +66,12 @@ public:
Status prepare(RuntimeState* state) override;
void close(RuntimeState* state) override;
OperatorPtr create(int32_t degree_of_parallelism, int32_t driver_sequence) override;
void retain_string_key_columns(int32_t driver_sequence, vectorized::Columns&& columns);
private:
HashJoinerFactoryPtr _hash_joiner_factory;
std::unique_ptr<PartialRuntimeFilterMerger> _partial_rf_merger;
std::vector<vectorized::Columns> _string_key_columns;
const TJoinDistributionMode::type _distribution_mode;
};


@ -55,14 +55,16 @@ StatusOr<vectorized::ChunkPtr> HashJoinProbeOperator::pull_chunk(RuntimeState* s
return _join_prober->pull_chunk(state);
}
void HashJoinProbeOperator::set_finishing(RuntimeState* state) {
Status HashJoinProbeOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
_join_prober->enter_post_probe_phase();
return Status::OK();
}
void HashJoinProbeOperator::set_finished(RuntimeState* state) {
Status HashJoinProbeOperator::set_finished(RuntimeState* state) {
_join_prober->enter_eos_phase();
_join_builder->set_prober_finished();
return Status::OK();
}
bool HashJoinProbeOperator::is_ready() const {


@ -27,8 +27,8 @@ public:
bool need_input() const override;
bool is_finished() const override;
void set_finishing(RuntimeState* state) override;
void set_finished(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
bool is_ready() const override;
std::string get_name() const override {

View File

@ -19,7 +19,10 @@ public:
bool is_finished() const override { return (_is_finished || _limit == 0) && _cur_chunk == nullptr; }
void set_finishing(RuntimeState* state) override { _is_finished = true; }
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;

View File

@ -23,13 +23,13 @@
namespace starrocks::pipeline {
using namespace vectorized;
OlapChunkSource::OlapChunkSource(MorselPtr&& morsel, ScanOperator* op, vectorized::OlapScanNode* scan_node)
: ChunkSource(std::move(morsel)),
OlapChunkSource::OlapChunkSource(RuntimeProfile* runtime_profile, MorselPtr&& morsel, ScanOperator* op,
vectorized::OlapScanNode* scan_node)
: ChunkSource(runtime_profile, std::move(morsel)),
_scan_node(scan_node),
_limit(scan_node->limit()),
_runtime_in_filters(op->runtime_in_filters()),
_runtime_bloom_filters(op->runtime_bloom_filters()),
_runtime_profile(op->unique_metrics()) {
_runtime_bloom_filters(op->runtime_bloom_filters()) {
_conjunct_ctxs = scan_node->conjunct_ctxs();
_conjunct_ctxs.insert(_conjunct_ctxs.end(), _runtime_in_filters.begin(), _runtime_in_filters.end());
ScanMorsel* scan_morsel = (ScanMorsel*)_morsel.get();
@ -42,6 +42,14 @@ Status OlapChunkSource::prepare(RuntimeState* state) {
const TupleDescriptor* tuple_desc = state->desc_tbl().get_tuple_descriptor(thrift_olap_scan_node.tuple_id);
_slots = &tuple_desc->slots();
_runtime_profile->add_info_string("Table", tuple_desc->table_desc()->name());
if (thrift_olap_scan_node.__isset.rollup_name) {
_runtime_profile->add_info_string("Rollup", thrift_olap_scan_node.rollup_name);
}
if (thrift_olap_scan_node.__isset.sql_predicates) {
_runtime_profile->add_info_string("Predicates", thrift_olap_scan_node.sql_predicates);
}
_init_counter(state);
_dict_optimize_parser.set_mutable_dict_maps(state->mutable_query_global_dict_map());
@ -77,45 +85,43 @@ void OlapChunkSource::_init_counter(RuntimeState* state) {
_bytes_read_counter = ADD_COUNTER(_runtime_profile, "BytesRead", TUnit::BYTES);
_rows_read_counter = ADD_COUNTER(_runtime_profile, "RowsRead", TUnit::UNIT);
_scan_profile = _runtime_profile->create_child("SCAN", true, false);
_create_seg_iter_timer = ADD_TIMER(_runtime_profile, "CreateSegmentIter");
_create_seg_iter_timer = ADD_TIMER(_scan_profile, "CreateSegmentIter");
_read_compressed_counter = ADD_COUNTER(_runtime_profile, "CompressedBytesRead", TUnit::BYTES);
_read_uncompressed_counter = ADD_COUNTER(_runtime_profile, "UncompressedBytesRead", TUnit::BYTES);
_read_compressed_counter = ADD_COUNTER(_scan_profile, "CompressedBytesRead", TUnit::BYTES);
_read_uncompressed_counter = ADD_COUNTER(_scan_profile, "UncompressedBytesRead", TUnit::BYTES);
_raw_rows_counter = ADD_COUNTER(_scan_profile, "RawRowsRead", TUnit::UNIT);
_read_pages_num_counter = ADD_COUNTER(_scan_profile, "ReadPagesNum", TUnit::UNIT);
_cached_pages_num_counter = ADD_COUNTER(_scan_profile, "CachedPagesNum", TUnit::UNIT);
_pushdown_predicates_counter = ADD_COUNTER(_scan_profile, "PushdownPredicates", TUnit::UNIT);
_raw_rows_counter = ADD_COUNTER(_runtime_profile, "RawRowsRead", TUnit::UNIT);
_read_pages_num_counter = ADD_COUNTER(_runtime_profile, "ReadPagesNum", TUnit::UNIT);
_cached_pages_num_counter = ADD_COUNTER(_runtime_profile, "CachedPagesNum", TUnit::UNIT);
_pushdown_predicates_counter = ADD_COUNTER(_runtime_profile, "PushdownPredicates", TUnit::UNIT);
// SegmentInit
_seg_init_timer = ADD_TIMER(_scan_profile, "SegmentInit");
_bi_filter_timer = ADD_CHILD_TIMER(_scan_profile, "BitmapIndexFilter", "SegmentInit");
_bi_filtered_counter = ADD_CHILD_COUNTER(_scan_profile, "BitmapIndexFilterRows", TUnit::UNIT, "SegmentInit");
_bf_filtered_counter = ADD_CHILD_COUNTER(_scan_profile, "BloomFilterFilterRows", TUnit::UNIT, "SegmentInit");
_zm_filtered_counter = ADD_CHILD_COUNTER(_scan_profile, "ZoneMapIndexFilterRows", TUnit::UNIT, "SegmentInit");
_sk_filtered_counter = ADD_CHILD_COUNTER(_scan_profile, "ShortKeyFilterRows", TUnit::UNIT, "SegmentInit");
_seg_init_timer = ADD_TIMER(_runtime_profile, "SegmentInit");
_bi_filter_timer = ADD_CHILD_TIMER(_runtime_profile, "BitmapIndexFilter", "SegmentInit");
_bi_filtered_counter = ADD_CHILD_COUNTER(_runtime_profile, "BitmapIndexFilterRows", TUnit::UNIT, "SegmentInit");
_bf_filtered_counter = ADD_CHILD_COUNTER(_runtime_profile, "BloomFilterFilterRows", TUnit::UNIT, "SegmentInit");
_zm_filtered_counter = ADD_CHILD_COUNTER(_runtime_profile, "ZoneMapIndexFilterRows", TUnit::UNIT, "SegmentInit");
_sk_filtered_counter = ADD_CHILD_COUNTER(_runtime_profile, "ShortKeyFilterRows", TUnit::UNIT, "SegmentInit");
// SegmentRead
_block_load_timer = ADD_TIMER(_scan_profile, "SegmentRead");
_block_fetch_timer = ADD_CHILD_TIMER(_scan_profile, "BlockFetch", "SegmentRead");
_block_load_counter = ADD_CHILD_COUNTER(_scan_profile, "BlockFetchCount", TUnit::UNIT, "SegmentRead");
_block_seek_timer = ADD_CHILD_TIMER(_scan_profile, "BlockSeek", "SegmentRead");
_block_seek_counter = ADD_CHILD_COUNTER(_scan_profile, "BlockSeekCount", TUnit::UNIT, "SegmentRead");
_pred_filter_timer = ADD_CHILD_TIMER(_scan_profile, "PredFilter", "SegmentRead");
_pred_filter_counter = ADD_CHILD_COUNTER(_scan_profile, "PredFilterRows", TUnit::UNIT, "SegmentRead");
_del_vec_filter_counter = ADD_CHILD_COUNTER(_scan_profile, "DelVecFilterRows", TUnit::UNIT, "SegmentRead");
_chunk_copy_timer = ADD_CHILD_TIMER(_scan_profile, "ChunkCopy", "SegmentRead");
_decompress_timer = ADD_CHILD_TIMER(_scan_profile, "DecompressT", "SegmentRead");
_index_load_timer = ADD_CHILD_TIMER(_scan_profile, "IndexLoad", "SegmentRead");
_rowsets_read_count = ADD_CHILD_COUNTER(_scan_profile, "RowsetsReadCount", TUnit::UNIT, "SegmentRead");
_segments_read_count = ADD_CHILD_COUNTER(_scan_profile, "SegmentsReadCount", TUnit::UNIT, "SegmentRead");
_block_load_timer = ADD_TIMER(_runtime_profile, "SegmentRead");
_block_fetch_timer = ADD_CHILD_TIMER(_runtime_profile, "BlockFetch", "SegmentRead");
_block_load_counter = ADD_CHILD_COUNTER(_runtime_profile, "BlockFetchCount", TUnit::UNIT, "SegmentRead");
_block_seek_timer = ADD_CHILD_TIMER(_runtime_profile, "BlockSeek", "SegmentRead");
_block_seek_counter = ADD_CHILD_COUNTER(_runtime_profile, "BlockSeekCount", TUnit::UNIT, "SegmentRead");
_pred_filter_timer = ADD_CHILD_TIMER(_runtime_profile, "PredFilter", "SegmentRead");
_pred_filter_counter = ADD_CHILD_COUNTER(_runtime_profile, "PredFilterRows", TUnit::UNIT, "SegmentRead");
_del_vec_filter_counter = ADD_CHILD_COUNTER(_runtime_profile, "DelVecFilterRows", TUnit::UNIT, "SegmentRead");
_chunk_copy_timer = ADD_CHILD_TIMER(_runtime_profile, "ChunkCopy", "SegmentRead");
_decompress_timer = ADD_CHILD_TIMER(_runtime_profile, "DecompressT", "SegmentRead");
_index_load_timer = ADD_CHILD_TIMER(_runtime_profile, "IndexLoad", "SegmentRead");
_rowsets_read_count = ADD_CHILD_COUNTER(_runtime_profile, "RowsetsReadCount", TUnit::UNIT, "SegmentRead");
_segments_read_count = ADD_CHILD_COUNTER(_runtime_profile, "SegmentsReadCount", TUnit::UNIT, "SegmentRead");
_total_columns_data_page_count =
ADD_CHILD_COUNTER(_scan_profile, "TotalColumnsDataPageCount", TUnit::UNIT, "SegmentRead");
ADD_CHILD_COUNTER(_runtime_profile, "TotalColumnsDataPageCount", TUnit::UNIT, "SegmentRead");
// IOTime
_io_timer = ADD_TIMER(_scan_profile, "IOTime");
_io_timer = ADD_TIMER(_runtime_profile, "IOTime");
}
Status OlapChunkSource::_build_scan_range(RuntimeState* state) {
@ -163,7 +169,7 @@ Status OlapChunkSource::_init_reader_params(const std::vector<OlapScanRange*>& k
bool skip_aggregation = thrift_olap_scan_node.is_preaggregation;
_params.reader_type = READER_QUERY;
_params.skip_aggregation = skip_aggregation;
_params.profile = _scan_profile;
_params.profile = _runtime_profile;
_params.runtime_state = _runtime_state;
_params.use_page_cache = !config::disable_storage_page_cache;
// Optimization for `select * from table limit x` where x is small
@ -288,7 +294,7 @@ Status OlapChunkSource::_init_olap_reader(RuntimeState* runtime_state) {
}
if (!_not_push_down_conjuncts.empty() || !_not_push_down_predicates.empty()) {
_expr_filter_timer = ADD_TIMER(_scan_profile, "ExprFilterTime");
_expr_filter_timer = ADD_TIMER(_runtime_profile, "ExprFilterTime");
}
DCHECK(_params.global_dictmaps != nullptr);
@ -328,7 +334,7 @@ Status OlapChunkSource::buffer_next_batch_chunks_blocking(size_t batch_size, boo
for (size_t i = 0; i < batch_size && !can_finish; ++i) {
ChunkUniquePtr chunk(
ChunkHelper::new_chunk_pooled(_prj_iter->encoded_schema(), _runtime_state->chunk_size(), true));
ChunkHelper::new_chunk_pooled(_prj_iter->output_schema(), _runtime_state->chunk_size(), true));
_status = _read_chunk_from_storage(_runtime_state, chunk.get());
if (!_status.ok()) {
// End-of-file is a normal case; the chunk still needs to be processed
@ -356,7 +362,7 @@ Status OlapChunkSource::buffer_next_batch_chunks_blocking_for_workgroup(size_t b
SCOPED_RAW_TIMER(&time_spent);
ChunkUniquePtr chunk(
ChunkHelper::new_chunk_pooled(_prj_iter->encoded_schema(), _runtime_state->chunk_size(), true));
ChunkHelper::new_chunk_pooled(_prj_iter->output_schema(), _runtime_state->chunk_size(), true));
_status = _read_chunk_from_storage(_runtime_state, chunk.get());
if (!_status.ok()) {
// End-of-file is a normal case; the chunk still needs to be processed
@ -417,6 +423,7 @@ Status OlapChunkSource::_read_chunk_from_storage(RuntimeState* state, vectorized
if (Status status = _prj_iter->get_next(chunk); !status.ok()) {
return status;
}
TRY_CATCH_ALLOC_SCOPE_START()
for (auto slot : _query_slots) {
size_t column_index = chunk->schema()->get_field_index_by_name(slot->col_name());
@ -436,6 +443,8 @@ Status OlapChunkSource::_read_chunk_from_storage(RuntimeState* state, vectorized
ExecNode::eval_conjuncts(_not_push_down_conjuncts, chunk);
DCHECK_CHUNK(chunk);
}
TRY_CATCH_ALLOC_SCOPE_END()
} while (chunk->num_rows() == 0);
_update_realtime_counter(chunk);
// Optimization for `select * from table limit x` where x is small
@ -464,6 +473,14 @@ void OlapChunkSource::_update_realtime_counter(vectorized::Chunk* chunk) {
_num_rows_read += chunk->num_rows();
}
int64_t OlapChunkSource::last_spent_cpu_time_ns() {
int64_t time_ns = _last_spent_cpu_time_ns;
_last_spent_cpu_time_ns += _reader->stats().decompress_ns;
_last_spent_cpu_time_ns += _reader->stats().vec_cond_ns;
_last_spent_cpu_time_ns += _reader->stats().del_filter_ns;
return _last_spent_cpu_time_ns - time_ns;
}
void OlapChunkSource::_update_counter() {
COUNTER_UPDATE(_create_seg_iter_timer, _reader->stats().create_segment_iter_ns);
COUNTER_UPDATE(_rows_read_counter, _num_rows_read);
@ -512,16 +529,16 @@ void OlapChunkSource::_update_counter() {
StarRocksMetrics::instance()->query_scan_rows.increment(_raw_rows_read);
if (_reader->stats().decode_dict_ns > 0) {
RuntimeProfile::Counter* c = ADD_TIMER(_scan_profile, "DictDecode");
RuntimeProfile::Counter* c = ADD_TIMER(_runtime_profile, "DictDecode");
COUNTER_UPDATE(c, _reader->stats().decode_dict_ns);
}
if (_reader->stats().late_materialize_ns > 0) {
RuntimeProfile::Counter* c = ADD_TIMER(_scan_profile, "LateMaterialize");
RuntimeProfile::Counter* c = ADD_TIMER(_runtime_profile, "LateMaterialize");
COUNTER_UPDATE(c, _reader->stats().late_materialize_ns);
}
if (_reader->stats().del_filter_ns > 0) {
RuntimeProfile::Counter* c1 = ADD_TIMER(_scan_profile, "DeleteFilter");
RuntimeProfile::Counter* c2 = ADD_COUNTER(_scan_profile, "DeleteFilterRows", TUnit::UNIT);
RuntimeProfile::Counter* c1 = ADD_TIMER(_runtime_profile, "DeleteFilter");
RuntimeProfile::Counter* c2 = ADD_COUNTER(_runtime_profile, "DeleteFilterRows", TUnit::UNIT);
COUNTER_UPDATE(c1, _reader->stats().del_filter_ns);
COUNTER_UPDATE(c2, _reader->stats().rows_del_filtered);
}

View File

@ -30,7 +30,8 @@ namespace pipeline {
class ScanOperator;
class OlapChunkSource final : public ChunkSource {
public:
OlapChunkSource(MorselPtr&& morsel, ScanOperator* op, vectorized::OlapScanNode* scan_node);
OlapChunkSource(RuntimeProfile* runtime_profile, MorselPtr&& morsel, ScanOperator* op,
vectorized::OlapScanNode* scan_node);
~OlapChunkSource() override = default;
@ -49,6 +50,7 @@ public:
Status buffer_next_batch_chunks_blocking(size_t chunk_size, bool& can_finish) override;
Status buffer_next_batch_chunks_blocking_for_workgroup(size_t chunk_size, bool& can_finish, size_t* num_read_chunks,
int worker_id, workgroup::WorkGroupPtr running_wg) override;
int64_t last_spent_cpu_time_ns() override;
private:
// Yield the scan IO task when the maximum time (in nanoseconds) has been spent in the current execution round.
@ -115,11 +117,11 @@ private:
int64_t _raw_rows_read = 0;
int64_t _compressed_bytes_read = 0;
RuntimeProfile* _runtime_profile;
RuntimeProfile::Counter* _bytes_read_counter = nullptr;
RuntimeProfile::Counter* _rows_read_counter = nullptr;
RuntimeProfile* _scan_profile = nullptr;
int64_t _last_spent_cpu_time_ns = 0;
RuntimeProfile::Counter* _expr_filter_timer = nullptr;
RuntimeProfile::Counter* _scan_timer = nullptr;
RuntimeProfile::Counter* _create_seg_iter_timer = nullptr;

View File

@ -80,9 +80,10 @@ Status OlapScanOperator::_capture_tablet_rowsets() {
return Status::OK();
}
ChunkSourcePtr OlapScanOperator::create_chunk_source(MorselPtr morsel) {
ChunkSourcePtr OlapScanOperator::create_chunk_source(MorselPtr morsel, int32_t chunk_source_index) {
vectorized::OlapScanNode* olap_scan_node = down_cast<vectorized::OlapScanNode*>(_scan_node);
return std::make_shared<OlapChunkSource>(std::move(morsel), this, olap_scan_node);
return std::make_shared<OlapChunkSource>(_chunk_source_profiles[chunk_source_index].get(), std::move(morsel), this,
olap_scan_node);
}
} // namespace starrocks::pipeline

View File

@ -34,7 +34,7 @@ public:
Status do_prepare(RuntimeState* state) override;
void do_close(RuntimeState* state) override;
ChunkSourcePtr create_chunk_source(MorselPtr morsel) override;
ChunkSourcePtr create_chunk_source(MorselPtr morsel, int32_t chunk_source_index) override;
private:
Status _capture_tablet_rowsets();

View File

@ -46,7 +46,7 @@ public:
// data inside is processed.
// It's one of the stages of the operator life cycle (prepare -> finishing -> finished -> [cancelled] -> closed)
// This method will be invoked exactly once in the whole life cycle
virtual void set_finishing(RuntimeState* state) {}
virtual Status set_finishing(RuntimeState* state) { return Status::OK(); }
// set_finished is used to shut down both the input and output streams of an operator; after its invocation
// buffered data inside the operator is cleared.
@ -58,13 +58,13 @@ public:
// set_finishing function.
// It's one of the stages of the operator life cycle (prepare -> finishing -> finished -> [cancelled] -> closed)
// This method will be invoked exactly once in the whole life cycle
virtual void set_finished(RuntimeState* state) {}
virtual Status set_finished(RuntimeState* state) { return Status::OK(); }
// It's one of the stages of the operator life cycle (prepare -> finishing -> finished -> [cancelled] -> closed)
// - When the fragment exits abnormally, the operator's stage becomes CANCELLED between FINISHED and CLOSED.
// - When the fragment exits normally, there is no CANCELLED stage for the drivers.
// Sometimes, the operator needs to realize it is cancelled so it can stop earlier than normal, such as ExchangeSink.
virtual void set_cancelled(RuntimeState* state) {}
virtual Status set_cancelled(RuntimeState* state) { return Status::OK(); }
// When local runtime filters are ready, the operator should bind its corresponding runtime in-filters.
virtual void set_precondition_ready(RuntimeState* state);

View File

@ -34,12 +34,6 @@ Status PipelineDriver::prepare(RuntimeState* runtime_state) {
_first_input_empty_timer = ADD_CHILD_TIMER(_runtime_profile, "FirstInputEmptyTime", "InputEmptyTime");
_followup_input_empty_timer = ADD_CHILD_TIMER(_runtime_profile, "FollowupInputEmptyTime", "InputEmptyTime");
_output_full_timer = ADD_CHILD_TIMER(_runtime_profile, "OutputFullTime", "PendingTime");
_local_rf_waiting_set_counter = ADD_COUNTER(_runtime_profile, "LocalRfWaitingSet", TUnit::UNIT);
_schedule_counter = ADD_COUNTER(_runtime_profile, "ScheduleCounter", TUnit::UNIT);
_schedule_effective_counter = ADD_COUNTER(_runtime_profile, "ScheduleEffectiveCounter", TUnit::UNIT);
_schedule_rows_per_chunk = ADD_COUNTER(_runtime_profile, "ScheduleAccumulatedRowsPerChunk", TUnit::UNIT);
_schedule_accumulated_chunks_moved = ADD_COUNTER(_runtime_profile, "ScheduleAccumulatedChunkMoved", TUnit::UNIT);
DCHECK(_state == DriverState::NOT_READY);
// fill OperatorWithDependency instances into _dependencies from _operators.
@ -64,7 +58,9 @@ Status PipelineDriver::prepare(RuntimeState* runtime_state) {
std::max(_global_rf_wait_timeout_ns, global_rf_collector->wait_timeout_ms() * 1000L * 1000L);
}
}
_local_rf_waiting_set_counter->set((int64_t)all_local_rf_set.size());
if (!all_local_rf_set.empty()) {
_runtime_profile->add_info_string("LocalRfWaitingSet", strings::Substitute("$0", all_local_rf_set.size()));
}
_local_rf_holders = fragment_ctx()->runtime_filter_hub()->gather_holders(all_local_rf_set);
source_operator()->add_morsel_queue(_morsel_queue.get());
@ -118,9 +114,9 @@ StatusOr<DriverState> PipelineDriver::process(RuntimeState* runtime_state, int w
if (curr_op->is_finished()) {
if (i == 0) {
// For source operators
_mark_operator_finishing(curr_op, runtime_state);
RETURN_IF_ERROR(_mark_operator_finishing(curr_op, runtime_state));
}
_mark_operator_finishing(next_op, runtime_state);
RETURN_IF_ERROR(_mark_operator_finishing(next_op, runtime_state));
new_first_unfinished = i + 1;
continue;
}
@ -180,9 +176,9 @@ StatusOr<DriverState> PipelineDriver::process(RuntimeState* runtime_state, int w
if (curr_op->is_finished()) {
if (i == 0) {
// For source operators
_mark_operator_finishing(curr_op, runtime_state);
RETURN_IF_ERROR(_mark_operator_finishing(curr_op, runtime_state));
}
_mark_operator_finishing(next_op, runtime_state);
RETURN_IF_ERROR(_mark_operator_finishing(next_op, runtime_state));
new_first_unfinished = i + 1;
continue;
}
@ -202,7 +198,7 @@ StatusOr<DriverState> PipelineDriver::process(RuntimeState* runtime_state, int w
}
// close finished operators and update _first_unfinished index
for (auto i = _first_unfinished; i < new_first_unfinished; ++i) {
_mark_operator_finished(_operators[i], runtime_state);
RETURN_IF_ERROR(_mark_operator_finished(_operators[i], runtime_state));
}
_first_unfinished = new_first_unfinished;
@ -304,10 +300,6 @@ void PipelineDriver::finalize(RuntimeState* runtime_state, DriverState state) {
COUNTER_UPDATE(_total_timer, _total_timer_sw->elapsed_time());
COUNTER_UPDATE(_schedule_timer, _total_timer->value() - _active_timer->value() - _pending_timer->value());
COUNTER_UPDATE(_schedule_counter, driver_acct().get_schedule_times());
COUNTER_UPDATE(_schedule_effective_counter, driver_acct().get_schedule_effective_times());
COUNTER_UPDATE(_schedule_rows_per_chunk, driver_acct().get_rows_per_chunk());
COUNTER_UPDATE(_schedule_accumulated_chunks_moved, driver_acct().get_accumulated_chunks_moved());
_update_overhead_timer();
// The last root driver cancels all the drivers' execution and notifies FE the
@ -328,7 +320,11 @@ void PipelineDriver::finalize(RuntimeState* runtime_state, DriverState state) {
_fragment_ctx->destroy_pass_through_chunk_buffer();
auto status = _fragment_ctx->final_status();
auto fragment_id = _fragment_ctx->fragment_instance_id();
_query_ctx->count_down_fragments();
if (_query_ctx->count_down_fragments()) {
auto query_id = _query_ctx->query_id();
DCHECK(!this->is_still_pending_finish());
QueryContextManager::instance()->remove(query_id);
}
}
}
@ -388,72 +384,73 @@ bool PipelineDriver::_check_fragment_is_canceled(RuntimeState* runtime_state) {
return false;
}
void PipelineDriver::_mark_operator_finishing(OperatorPtr& op, RuntimeState* state) {
Status PipelineDriver::_mark_operator_finishing(OperatorPtr& op, RuntimeState* state) {
auto& op_state = _operator_stages[op->get_id()];
if (op_state >= OperatorStage::FINISHING) {
return;
return Status::OK();
}
VLOG_ROW << strings::Substitute("[Driver] finishing operator [driver=$0] [operator=$1]", to_readable_string(),
op->get_name());
{
SCOPED_TIMER(op->_finishing_timer);
op->set_finishing(state);
op_state = OperatorStage::FINISHING;
return op->set_finishing(state);
}
op_state = OperatorStage::FINISHING;
}
void PipelineDriver::_mark_operator_finished(OperatorPtr& op, RuntimeState* state) {
_mark_operator_finishing(op, state);
Status PipelineDriver::_mark_operator_finished(OperatorPtr& op, RuntimeState* state) {
RETURN_IF_ERROR(_mark_operator_finishing(op, state));
auto& op_state = _operator_stages[op->get_id()];
if (op_state >= OperatorStage::FINISHED) {
return;
return Status::OK();
}
VLOG_ROW << strings::Substitute("[Driver] finished operator [driver=$0] [operator=$1]", to_readable_string(),
op->get_name());
{
SCOPED_TIMER(op->_finished_timer);
op->set_finished(state);
op_state = OperatorStage::FINISHED;
return op->set_finished(state);
}
op_state = OperatorStage::FINISHED;
}
void PipelineDriver::_mark_operator_cancelled(OperatorPtr& op, RuntimeState* state) {
_mark_operator_finished(op, state);
Status PipelineDriver::_mark_operator_cancelled(OperatorPtr& op, RuntimeState* state) {
RETURN_IF_ERROR(_mark_operator_finished(op, state));
auto& op_state = _operator_stages[op->get_id()];
if (op_state >= OperatorStage::CANCELLED) {
return;
return Status::OK();
}
VLOG_ROW << strings::Substitute("[Driver] cancelled operator [driver=$0] [operator=$1]", to_readable_string(),
op->get_name());
op->set_cancelled(state);
op_state = OperatorStage::CANCELLED;
return op->set_cancelled(state);
}
void PipelineDriver::_mark_operator_closed(OperatorPtr& op, RuntimeState* state) {
Status PipelineDriver::_mark_operator_closed(OperatorPtr& op, RuntimeState* state) {
if (_fragment_ctx->is_canceled()) {
_mark_operator_cancelled(op, state);
RETURN_IF_ERROR(_mark_operator_cancelled(op, state));
} else {
_mark_operator_finished(op, state);
RETURN_IF_ERROR(_mark_operator_finished(op, state));
}
auto& op_state = _operator_stages[op->get_id()];
if (op_state >= OperatorStage::CLOSED) {
return;
return Status::OK();
}
VLOG_ROW << strings::Substitute("[Driver] close operator [driver=$0] [operator=$1]", to_readable_string(),
op->get_name());
{
SCOPED_TIMER(op->_close_timer);
op_state = OperatorStage::CLOSED;
op->close(state);
}
COUNTER_UPDATE(op->_total_timer, op->_pull_timer->value() + op->_push_timer->value() +
op->_finishing_timer->value() + op->_finished_timer->value() +
op->_close_timer->value());
op_state = OperatorStage::CLOSED;
return Status::OK();
}
} // namespace starrocks::pipeline

View File

@ -219,10 +219,10 @@ public:
Operators& operators() { return _operators; }
SourceOperator* source_operator() { return down_cast<SourceOperator*>(_operators.front().get()); }
RuntimeProfile* runtime_profile() { return _runtime_profile.get(); }
// Drivers that wait for runtime filters' readiness must be marked PRECONDITION_NOT_READY and put into
// PipelineDriverPoller.
void mark_precondition_not_ready();
// Drivers in the PRECONDITION_BLOCK state must be marked READY after their dependent runtime filters or hash tables
// are finished.
void mark_precondition_ready(RuntimeState* runtime_state);
@ -355,10 +355,10 @@ private:
// Check whether the fragment is cancelled. It is used before pull_chunk and push_chunk.
bool _check_fragment_is_canceled(RuntimeState* runtime_state);
void _mark_operator_finishing(OperatorPtr& op, RuntimeState* runtime_state);
void _mark_operator_finished(OperatorPtr& op, RuntimeState* runtime_state);
void _mark_operator_cancelled(OperatorPtr& op, RuntimeState* runtime_state);
void _mark_operator_closed(OperatorPtr& op, RuntimeState* runtime_state);
Status _mark_operator_finishing(OperatorPtr& op, RuntimeState* runtime_state);
Status _mark_operator_finished(OperatorPtr& op, RuntimeState* runtime_state);
Status _mark_operator_cancelled(OperatorPtr& op, RuntimeState* runtime_state);
Status _mark_operator_closed(OperatorPtr& op, RuntimeState* runtime_state);
void _close_operators(RuntimeState* runtime_state);
// Update metrics when the driver yields.
@ -416,12 +416,6 @@ private:
RuntimeProfile::Counter* _first_input_empty_timer = nullptr;
RuntimeProfile::Counter* _followup_input_empty_timer = nullptr;
RuntimeProfile::Counter* _output_full_timer = nullptr;
RuntimeProfile::Counter* _local_rf_waiting_set_counter = nullptr;
RuntimeProfile::Counter* _schedule_counter = nullptr;
RuntimeProfile::Counter* _schedule_effective_counter = nullptr;
RuntimeProfile::Counter* _schedule_rows_per_chunk = nullptr;
RuntimeProfile::Counter* _schedule_accumulated_chunks_moved = nullptr;
MonotonicStopWatch* _total_timer_sw = nullptr;
MonotonicStopWatch* _pending_timer_sw = nullptr;
@ -431,4 +425,4 @@ private:
};
} // namespace pipeline
} // namespace starrocks
} // namespace starrocks

View File

@ -25,7 +25,7 @@ void GlobalDriverExecutor::initialize(int num_threads) {
_blocked_driver_poller->start();
_num_threads_setter.set_actual_num(num_threads);
for (auto i = 0; i < num_threads; ++i) {
_thread_pool->submit_func([this]() { this->worker_thread(); });
_thread_pool->submit_func([this]() { this->_worker_thread(); });
}
}
@ -35,21 +35,16 @@ void GlobalDriverExecutor::change_num_threads(int32_t num_threads) {
return;
}
for (int i = old_num_threads; i < num_threads; ++i) {
_thread_pool->submit_func([this]() { this->worker_thread(); });
_thread_pool->submit_func([this]() { this->_worker_thread(); });
}
}
void GlobalDriverExecutor::finalize_driver(DriverRawPtr driver, RuntimeState* runtime_state, DriverState state) {
void GlobalDriverExecutor::_finalize_driver(DriverRawPtr driver, RuntimeState* runtime_state, DriverState state) {
DCHECK(driver);
driver->finalize(runtime_state, state);
if (driver->query_ctx()->is_finished()) {
auto query_id = driver->query_ctx()->query_id();
DCHECK(!driver->is_still_pending_finish());
QueryContextManager::instance()->remove(query_id);
}
}
void GlobalDriverExecutor::worker_thread() {
void GlobalDriverExecutor::_worker_thread() {
const int worker_id = _next_id++;
while (true) {
if (_num_threads_setter.should_shrink()) {
@ -78,13 +73,13 @@ void GlobalDriverExecutor::worker_thread() {
driver->set_driver_state(DriverState::PENDING_FINISH);
_blocked_driver_poller->add_blocked_driver(driver);
} else {
finalize_driver(driver, runtime_state, DriverState::CANCELED);
_finalize_driver(driver, runtime_state, DriverState::CANCELED);
}
continue;
}
// a blocked driver is canceled because of fragment cancellation or query expiration.
if (driver->is_finished()) {
finalize_driver(driver, runtime_state, driver->driver_state());
_finalize_driver(driver, runtime_state, driver->driver_state());
continue;
}
@ -103,7 +98,7 @@ void GlobalDriverExecutor::worker_thread() {
driver->set_driver_state(DriverState::PENDING_FINISH);
_blocked_driver_poller->add_blocked_driver(driver);
} else {
finalize_driver(driver, runtime_state, DriverState::INTERNAL_ERROR);
_finalize_driver(driver, runtime_state, DriverState::INTERNAL_ERROR);
}
continue;
}
@ -117,7 +112,7 @@ void GlobalDriverExecutor::worker_thread() {
case FINISH:
case CANCELED:
case INTERNAL_ERROR: {
finalize_driver(driver, runtime_state, driver_state);
_finalize_driver(driver, runtime_state, driver_state);
break;
}
case INPUT_EMPTY:
@ -153,7 +148,7 @@ void GlobalDriverExecutor::submit(DriverRawPtr driver) {
}
void GlobalDriverExecutor::report_exec_state(FragmentContext* fragment_ctx, const Status& status, bool done) {
update_profile_by_level(fragment_ctx, done);
_update_profile_by_level(fragment_ctx, done);
auto params = ExecStateReporter::create_report_exec_status_params(fragment_ctx, status, done);
auto fe_addr = fragment_ctx->fe_addr();
auto exec_env = fragment_ctx->runtime_state()->exec_env();
@ -177,7 +172,7 @@ void GlobalDriverExecutor::report_exec_state(FragmentContext* fragment_ctx, cons
this->_exec_state_reporter->submit(std::move(report_task));
}
void GlobalDriverExecutor::update_profile_by_level(FragmentContext* fragment_ctx, bool done) {
void GlobalDriverExecutor::_update_profile_by_level(FragmentContext* fragment_ctx, bool done) {
if (!done) {
return;
}
@ -204,7 +199,7 @@ void GlobalDriverExecutor::update_profile_by_level(FragmentContext* fragment_ctx
continue;
}
remove_non_core_metrics(fragment_ctx, driver_profiles);
_remove_non_core_metrics(fragment_ctx, driver_profiles);
RuntimeProfile::merge_isomorphic_profiles(driver_profiles);
// All the isomorphic profiles will be merged into the first profile
@ -212,12 +207,15 @@ void GlobalDriverExecutor::update_profile_by_level(FragmentContext* fragment_ctx
// use the name of pipeline' profile as pipeline driver's
merged_driver_profile->set_name(pipeline_profile->name());
merged_driver_profiles.push_back(merged_driver_profile);
// add all the info strings and counters of the pipeline's profile
// to the pipeline driver's profile
merged_driver_profile->copy_all_counters_from(pipeline_profile);
merged_driver_profile->copy_all_info_strings_from(pipeline_profile);
merged_driver_profile->copy_all_counters_from(pipeline_profile);
_simplify_common_metrics(merged_driver_profile);
merged_driver_profiles.push_back(merged_driver_profile);
}
// remove pipeline's profile from the hierarchy
@ -228,8 +226,8 @@ void GlobalDriverExecutor::update_profile_by_level(FragmentContext* fragment_ctx
}
}
void GlobalDriverExecutor::remove_non_core_metrics(FragmentContext* fragment_ctx,
std::vector<RuntimeProfile*>& driver_profiles) {
void GlobalDriverExecutor::_remove_non_core_metrics(FragmentContext* fragment_ctx,
std::vector<RuntimeProfile*>& driver_profiles) {
if (fragment_ctx->profile_level() > TPipelineProfileLevel::CORE_METRICS) {
return;
}
@ -242,17 +240,34 @@ void GlobalDriverExecutor::remove_non_core_metrics(FragmentContext* fragment_ctx
for (auto* operator_profile : operator_profiles) {
RuntimeProfile* common_metrics = operator_profile->get_child("CommonMetrics");
DCHECK(common_metrics != nullptr);
common_metrics->remove_counters(std::set<std::string>{"OperatorTotalTime"});
operator_profile->remove_childs();
if (common_metrics != nullptr) {
common_metrics->remove_counters(std::set<std::string>{"OperatorTotalTime"});
}
common_metrics->reset_parent();
operator_profile->add_child(common_metrics, true, nullptr);
RuntimeProfile* unique_metrics = operator_profile->get_child("UniqueMetrics");
DCHECK(unique_metrics != nullptr);
unique_metrics->remove_counters(std::set<std::string>{"ScanTime", "WaitTime"});
}
}
}
void GlobalDriverExecutor::_simplify_common_metrics(RuntimeProfile* driver_profile) {
std::vector<RuntimeProfile*> operator_profiles;
driver_profile->get_children(&operator_profiles);
for (auto* operator_profile : operator_profiles) {
RuntimeProfile* common_metrics = operator_profile->get_child("CommonMetrics");
DCHECK(common_metrics != nullptr);
// Remove runtime-filter-related counters if their value is 0
static std::string counter_names[] = {
"RuntimeInFilterNum", "RuntimeBloomFilterNum", "JoinRuntimeFilterInputRows",
"JoinRuntimeFilterOutputRows", "JoinRuntimeFilterEvaluate", "JoinRuntimeFilterTime",
"ConjunctsInputRows", "ConjunctsOutputRows", "ConjunctsEvaluate"};
for (auto& name : counter_names) {
auto* counter = common_metrics->get_counter(name);
if (counter != nullptr && counter->value() == 0) {
common_metrics->remove_counter(name);
}
}
}
}
} // namespace starrocks::pipeline

View File

@ -48,10 +48,11 @@ public:
void report_exec_state(FragmentContext* fragment_ctx, const Status& status, bool done) override;
private:
void worker_thread();
void finalize_driver(DriverRawPtr driver, RuntimeState* runtime_state, DriverState state);
void update_profile_by_level(FragmentContext* fragment_ctx, bool done);
void remove_non_core_metrics(FragmentContext* fragment_ctx, std::vector<RuntimeProfile*>& driver_profiles);
void _worker_thread();
void _finalize_driver(DriverRawPtr driver, RuntimeState* runtime_state, DriverState state);
void _update_profile_by_level(FragmentContext* fragment_ctx, bool done);
void _remove_non_core_metrics(FragmentContext* fragment_ctx, std::vector<RuntimeProfile*>& driver_profiles);
void _simplify_common_metrics(RuntimeProfile* driver_profile);
private:
LimitSetter _num_threads_setter;


@ -284,6 +284,7 @@ void DriverQueueWithWorkGroup::_put_back(const DriverRawPtr driver) {
int64_t new_vruntime_ns = std::min(min_wg->vruntime_ns() - _ideal_runtime_ns(wg) / 2,
min_wg->real_runtime_ns() / int64_t(wg->cpu_limit()));
wg->set_vruntime_ns(std::max(wg->vruntime_ns(), new_vruntime_ns));
wg->update_last_real_runtime_ns(wg->real_runtime_ns());
int64_t diff_real_runtime_ns = wg->real_runtime_ns() - origin_real_runtime_ns;
workgroup::WorkGroupManager::instance()->increment_cpu_runtime_ns(diff_real_runtime_ns);
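_put_back implements a CFS-style catch-up: a workgroup returning to the queue has its vruntime lifted toward the current minimum, but kept slightly behind so it is scheduled again soon without starving others. The clamp can be sketched standalone (function and parameter names are illustrative, not the WorkGroup API):

```cpp
#include <algorithm>
#include <cstdint>

// Lift a returning workgroup's vruntime toward the scheduler's current minimum.
// The candidate trails the minimum by half an ideal runtime slice, capped by the
// minimum group's real runtime scaled by this group's cpu limit; the result
// never moves a group's vruntime backwards.
inline int64_t catch_up_vruntime_ns(int64_t wg_vruntime_ns, int64_t min_wg_vruntime_ns,
                                    int64_t ideal_runtime_ns, int64_t min_wg_real_runtime_ns,
                                    int64_t cpu_limit) {
    int64_t candidate = std::min(min_wg_vruntime_ns - ideal_runtime_ns / 2,
                                 min_wg_real_runtime_ns / cpu_limit);
    return std::max(wg_vruntime_ns, candidate);
}
```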


@ -7,6 +7,7 @@
#include "column/nullable_column.h"
#include "exprs/expr.h"
#include "exprs/vectorized/column_ref.h"
#include "runtime/current_thread.h"
#include "runtime/runtime_state.h"
namespace starrocks::pipeline {
@ -23,8 +24,10 @@ StatusOr<vectorized::ChunkPtr> ProjectOperator::pull_chunk(RuntimeState* state)
}
Status ProjectOperator::push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) {
TRY_CATCH_ALLOC_SCOPE_START()
for (size_t i = 0; i < _common_sub_column_ids.size(); ++i) {
chunk->append_column(_common_sub_expr_ctxs[i]->evaluate(chunk.get()), _common_sub_column_ids[i]);
RETURN_IF_HAS_ERROR(_common_sub_expr_ctxs);
}
using namespace vectorized;
@ -52,6 +55,7 @@ Status ProjectOperator::push_chunk(RuntimeState* state, const vectorized::ChunkP
NullableColumn::create(result_columns[i], NullColumn::create(result_columns[i]->size(), 0));
}
}
RETURN_IF_HAS_ERROR(_expr_ctxs);
}
_cur_chunk = std::make_shared<vectorized::Chunk>();
@ -60,6 +64,7 @@ Status ProjectOperator::push_chunk(RuntimeState* state, const vectorized::ChunkP
}
eval_runtime_bloom_filters(_cur_chunk.get());
DCHECK_CHUNK(_cur_chunk);
TRY_CATCH_ALLOC_SCOPE_END()
return Status::OK();
}


@ -33,7 +33,10 @@ public:
bool is_finished() const override { return _is_finished && _cur_chunk == nullptr; }
void set_finishing(RuntimeState* state) override { _is_finished = true; }
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -30,6 +30,7 @@ void QueryContext::cancel(const Status& status) {
static constexpr size_t QUERY_CONTEXT_MAP_SLOT_NUM = 64;
static constexpr size_t QUERY_CONTEXT_MAP_SLOT_NUM_MASK = (1 << 6) - 1;
QueryContextManager::QueryContextManager()
: _mutexes(QUERY_CONTEXT_MAP_SLOT_NUM),
_context_maps(QUERY_CONTEXT_MAP_SLOT_NUM),
@ -132,7 +133,7 @@ void QueryContextManager::remove(const TUniqueId& query_id) {
// the query context is really dead, so just cleanup
if (it->second->is_dead()) {
context_map.erase(it);
} else {
} else if (it->second->is_finished()) {
// although all active fragments of the query context have terminated, some fragments may still arrive too late
// in the future, so extend the lifetime of the query context and wait for a while until fragments on the wire have
// vanished
@ -143,4 +144,14 @@ void QueryContextManager::remove(const TUniqueId& query_id) {
}
}
void QueryContextManager::clear() {
std::vector<std::unique_lock<std::shared_mutex>> locks;
locks.reserve(_mutexes.size());
for (int i = 0; i < _mutexes.size(); ++i) {
locks.emplace_back(_mutexes[i]);
}
_second_chance_maps.clear();
_context_maps.clear();
}
} // namespace starrocks::pipeline
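QueryContextManager shards its context maps across 64 mutex-protected slots, and the new clear() acquires every shard lock before wiping both maps. A reduced sketch of that layout, assuming string query ids instead of TUniqueId (the class and member names are illustrative):

```cpp
#include <cstddef>
#include <functional>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <vector>

// 64 shards selected by the low hash bits, mirroring
// QUERY_CONTEXT_MAP_SLOT_NUM and QUERY_CONTEXT_MAP_SLOT_NUM_MASK.
constexpr size_t kSlotNum = 64;
constexpr size_t kSlotMask = (1 << 6) - 1;

class ShardedContextMap {
public:
    ShardedContextMap() : _mutexes(kSlotNum), _maps(kSlotNum) {}

    static size_t slot_of(const std::string& query_id) {
        return std::hash<std::string>{}(query_id) & kSlotMask;
    }

    void put(const std::string& query_id, int ctx) {
        size_t i = slot_of(query_id);
        std::unique_lock<std::shared_mutex> lock(_mutexes[i]);  // lock one shard only
        _maps[i][query_id] = ctx;
    }

    // Graceful exit: acquire every shard lock, then wipe all maps at once.
    void clear() {
        std::vector<std::unique_lock<std::shared_mutex>> locks;
        locks.reserve(_mutexes.size());
        for (auto& m : _mutexes) locks.emplace_back(m);
        for (auto& mp : _maps) mp.clear();
    }

    size_t size() {
        size_t n = 0;
        for (size_t i = 0; i < kSlotNum; ++i) {
            std::shared_lock<std::shared_mutex> lock(_mutexes[i]);
            n += _maps[i].size();
        }
        return n;
    }

private:
    std::vector<std::shared_mutex> _mutexes;
    std::vector<std::map<std::string, int>> _maps;
};
```

Locking the shards in a fixed order (index 0 upward) keeps clear() deadlock-free against the per-shard operations.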


@ -45,7 +45,7 @@ public:
// whether the current time point has passed the deadline.
bool is_expired() {
auto now = duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
return now > _deadline;
return is_finished() && now > _deadline;
}
bool is_dead() { return _num_active_fragments == 0 && _num_fragments == _total_fragments; }
@ -95,6 +95,8 @@ public:
QueryContext* get_or_register(const TUniqueId& query_id);
QueryContextPtr get(const TUniqueId& query_id);
void remove(const TUniqueId& query_id);
// used for graceful exit
void clear();
private:
std::vector<std::shared_mutex> _mutexes;


@ -43,7 +43,10 @@ public:
bool is_finished() const override { return _is_finished && !_fetch_data_result; }
void set_finishing(RuntimeState* state) override { _is_finished = true; }
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -21,31 +21,31 @@ using RuntimeBloomFilterProbeDescriptorPtr = RuntimeBloomFilterProbeDescriptor*;
using RuntimeBloomFilterRunningContext = starrocks::vectorized::JoinRuntimeFilter::RunningContext;
using RuntimeInFilterPtr = RuntimeInFilter*;
using RuntimeBloomFilterPtr = RuntimeBloomFilter*;
using RuntimeInFilters = std::list<RuntimeInFilterPtr>;
using RuntimeBloomFilters = std::list<RuntimeBloomFilterPtr>;
using RuntimeInFilters = std::vector<RuntimeInFilterPtr>;
using RuntimeInFilterList = std::list<RuntimeInFilterPtr>;
using RuntimeBloomFilters = std::vector<RuntimeBloomFilterPtr>;
using RuntimeBloomFilterList = std::list<RuntimeBloomFilterPtr>;
struct RuntimeFilterCollector;
using RuntimeFilterCollectorPtr = std::unique_ptr<RuntimeFilterCollector>;
using RuntimeFilterProbeCollector = starrocks::vectorized::RuntimeFilterProbeCollector;
using Predicate = starrocks::Predicate;
struct RuntimeBloomFilterBuildParam;
using RuntimeBloomFilterBuildParams = std::list<RuntimeBloomFilterBuildParam>;
using OptRuntimeBloomFilterBuildParams = std::vector<std::optional<RuntimeBloomFilterBuildParam>>;
// Parameters used to build runtime bloom-filters.
struct RuntimeBloomFilterBuildParam {
RuntimeBloomFilterBuildParam(bool eq_null, const ColumnPtr& column, size_t ht_row_count)
: eq_null(eq_null), column(column), ht_row_count(ht_row_count) {}
RuntimeBloomFilterBuildParam(bool eq_null, const ColumnPtr& column) : eq_null(eq_null), column(column) {}
bool eq_null;
ColumnPtr column;
size_t ht_row_count;
};
// RuntimeFilterCollector contains runtime in-filters and bloom-filters; it is stored in RuntimeFilterHub
// and every HashJoinBuildOperatorFactory has its corresponding RuntimeFilterCollector.
struct RuntimeFilterCollector {
RuntimeFilterCollector(RuntimeInFilters&& in_filters, RuntimeBloomFilters&& bloom_filters)
RuntimeFilterCollector(RuntimeInFilterList&& in_filters, RuntimeBloomFilterList&& bloom_filters)
: _in_filters(std::move(in_filters)), _bloom_filters(std::move(bloom_filters)) {}
RuntimeBloomFilters& get_bloom_filters() { return _bloom_filters; }
RuntimeInFilters& get_in_filters() { return _in_filters; }
RuntimeBloomFilterList& get_bloom_filters() { return _bloom_filters; }
RuntimeInFilterList& get_in_filters() { return _in_filters; }
// In-filters are constructed by a node and may be pushed down to its descendant node.
// Different tuple ids and slot ids between descendant and ancestor nodes may refer to the same column,
@ -83,9 +83,9 @@ struct RuntimeFilterCollector {
private:
// local runtime in-filter
RuntimeInFilters _in_filters;
RuntimeInFilterList _in_filters;
// global/local runtime bloom-filter(including max-min filter)
RuntimeBloomFilters _bloom_filters;
RuntimeBloomFilterList _bloom_filters;
};
class RuntimeFilterHolder {
@ -113,7 +113,7 @@ public:
get_holder(id)->set_collector(std::move(collector));
}
RuntimeBloomFilters& get_bloom_filters(TPlanNodeId id) {
RuntimeBloomFilterList& get_bloom_filters(TPlanNodeId id) {
return get_holder(id)->get_collector()->get_bloom_filters();
}
@ -205,10 +205,9 @@ public:
// HashJoinBuildOperators call add_partial_filters to gather partial runtime filters. The last HashJoinBuildOperator
// finally merges the partial runtime filters into the total one.
StatusOr<bool> add_partial_filters(
size_t idx, size_t ht_row_count, std::list<ExprContext*>&& partial_in_filters,
std::list<RuntimeBloomFilterBuildParam>&& partial_bloom_filter_build_params,
std::list<vectorized::RuntimeFilterBuildDescriptor*>&& bloom_filter_descriptors) {
StatusOr<bool> add_partial_filters(size_t idx, size_t ht_row_count, RuntimeInFilters&& partial_in_filters,
OptRuntimeBloomFilterBuildParams&& partial_bloom_filter_build_params,
RuntimeBloomFilters&& bloom_filter_descriptors) {
DCHECK(idx < _partial_bloom_filter_build_params.size());
// _ht_row_counts, _partial_in_filters and _partial_bloom_filter_build_params are all reserved beforehand;
// each HashJoinBuildOperator mutates its corresponding slot indexed by driver_sequence, so concurrent
@ -219,18 +218,22 @@ public:
// merge
if (1 == _num_active_builders--) {
_bloom_filter_descriptors = std::move(bloom_filter_descriptors);
merge_in_filters();
merge_bloom_filters();
merge_local_in_filters();
merge_local_bloom_filters();
return true;
}
return false;
}
RuntimeInFilters get_total_in_filters() { return _partial_in_filters[0]; }
RuntimeInFilterList get_total_in_filters() {
return std::list(_partial_in_filters[0].begin(), _partial_in_filters[0].end());
}
RuntimeBloomFilters get_total_bloom_filters() { return _bloom_filter_descriptors; }
RuntimeBloomFilterList get_total_bloom_filters() {
return std::list(_bloom_filter_descriptors.begin(), _bloom_filter_descriptors.end());
}
Status merge_in_filters() {
Status merge_local_in_filters() {
bool can_merge_in_filters = true;
size_t num_rows = 0;
ssize_t k = -1;
@ -263,17 +266,21 @@ public:
_partial_in_filters.resize(k + 1);
auto& total_in_filters = _partial_in_filters[0];
for (auto i = 1; i < _partial_in_filters.size(); ++i) {
auto& in_filters = _partial_in_filters[i];
auto total_in_filter_it = total_in_filters.begin();
auto in_filter_it = in_filters.begin();
while (total_in_filter_it != total_in_filters.end()) {
auto& total_in_filter = *(total_in_filter_it++);
auto& in_filter = *(in_filter_it++);
if (total_in_filter == nullptr || in_filter == nullptr) {
total_in_filter = nullptr;
continue;
}
const auto num_in_filters = total_in_filters.size();
for (auto i = 0; i < num_in_filters; ++i) {
auto& total_in_filter = total_in_filters[i];
if (total_in_filter == nullptr) {
continue;
}
auto can_merge = std::all_of(_partial_in_filters.begin() + 1, _partial_in_filters.end(),
[i](auto& in_filters) { return in_filters[i] != nullptr; });
if (!can_merge) {
total_in_filter = nullptr;
continue;
}
for (int j = 1; j < _partial_in_filters.size(); ++j) {
auto& in_filter = _partial_in_filters[j][i];
DCHECK(in_filter != nullptr);
auto* total_in_filter_pred = down_cast<Predicate*>(total_in_filter->root());
auto* in_filter_pred = down_cast<Predicate*>(in_filter->root());
RETURN_IF_ERROR(total_in_filter_pred->merge(in_filter_pred));
@ -284,7 +291,7 @@ public:
return Status::OK();
}
Status merge_bloom_filters() {
Status merge_local_bloom_filters() {
if (_partial_bloom_filter_build_params.empty()) {
return Status::OK();
}
@ -307,13 +314,46 @@ public:
desc->set_runtime_filter(filter);
}
for (auto& params : _partial_bloom_filter_build_params) {
auto desc_it = _bloom_filter_descriptors.begin();
auto param_it = params.begin();
while (param_it != params.end()) {
auto& desc = *(desc_it++);
auto& param = *(param_it++);
if (desc->runtime_filter() == nullptr || param.column == nullptr) {
const auto& num_bloom_filters = _bloom_filter_descriptors.size();
// remove empty params, which are generated in two cases:
// 1. the corresponding HashJoinProbeOperator is finished in short-circuit style because HashJoinBuildOperator
// above this operator has constructed an empty hash table.
// 2. the HashJoinBuildOperator is finished in advance because the fragment instance is canceled
_partial_bloom_filter_build_params.erase(
std::remove_if(_partial_bloom_filter_build_params.begin(), _partial_bloom_filter_build_params.end(),
[](auto& opt_params) { return opt_params.empty(); }),
_partial_bloom_filter_build_params.end());
// there are no non-empty params left, so set all runtime filters to nullptr
if (_partial_bloom_filter_build_params.empty()) {
for (auto& desc : _bloom_filter_descriptors) {
desc->set_runtime_filter(nullptr);
}
return Status::OK();
}
// all params must have the same size as num_bloom_filters
DCHECK(std::all_of(_partial_bloom_filter_build_params.begin(), _partial_bloom_filter_build_params.end(),
[&num_bloom_filters](auto& opt_params) { return opt_params.size() == num_bloom_filters; }));
for (auto i = 0; i < num_bloom_filters; ++i) {
auto& desc = _bloom_filter_descriptors[i];
if (desc->runtime_filter() == nullptr) {
continue;
}
auto can_merge =
std::all_of(_partial_bloom_filter_build_params.begin(), _partial_bloom_filter_build_params.end(),
[i](auto& opt_params) { return opt_params[i].has_value(); });
if (!can_merge) {
desc->set_runtime_filter(nullptr);
continue;
}
for (auto& opt_params : _partial_bloom_filter_build_params) {
auto& opt_param = opt_params[i];
DCHECK(opt_param.has_value());
auto& param = opt_param.value();
if (param.column == nullptr || param.column->empty()) {
continue;
}
auto status = vectorized::RuntimeFilterHelper::fill_runtime_bloom_filter(
@ -321,6 +361,7 @@ public:
vectorized::kHashJoinKeyColumnOffset, param.eq_null);
if (!status.ok()) {
desc->set_runtime_filter(nullptr);
break;
}
}
}
@ -333,7 +374,7 @@ private:
std::atomic<size_t> _num_active_builders;
std::vector<size_t> _ht_row_counts;
std::vector<RuntimeInFilters> _partial_in_filters;
std::vector<RuntimeBloomFilterBuildParams> _partial_bloom_filter_build_params;
std::vector<OptRuntimeBloomFilterBuildParams> _partial_bloom_filter_build_params;
RuntimeBloomFilters _bloom_filter_descriptors;
};
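merge_local_bloom_filters only merges filter i when every surviving builder produced a param at index i; otherwise the filter is disabled by setting it to nullptr. That all-or-nothing-per-column rule can be sketched over rows of std::optional values (summing stands in for the actual bloom-filter fill; names are illustrative):

```cpp
#include <algorithm>
#include <cstddef>
#include <optional>
#include <vector>

// Merge column i only if every partial row has a value at i; otherwise the
// merged slot stays disabled (nullopt), mirroring desc->set_runtime_filter(nullptr).
inline std::vector<std::optional<long>> merge_columns(
        const std::vector<std::vector<std::optional<long>>>& partial_rows, size_t num_cols) {
    std::vector<std::optional<long>> merged(num_cols);
    for (size_t i = 0; i < num_cols; ++i) {
        bool can_merge = std::all_of(partial_rows.begin(), partial_rows.end(),
                                     [i](const auto& row) { return row[i].has_value(); });
        if (!can_merge) continue;  // leave merged[i] disabled
        long sum = 0;
        for (const auto& row : partial_rows) sum += *row[i];
        merged[i] = sum;
    }
    return merged;
}
```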


@ -13,15 +13,18 @@
namespace starrocks::pipeline {
using starrocks::workgroup::WorkGroupManager;
// ========== ScanOperator ==========
ScanOperator::ScanOperator(OperatorFactory* factory, int32_t id, ScanNode* scan_node)
: SourceOperator(factory, id, "olap_scan", scan_node->id()),
_scan_node(scan_node),
_chunk_source_profiles(MAX_IO_TASKS_PER_OP),
_is_io_task_running(MAX_IO_TASKS_PER_OP),
_chunk_sources(MAX_IO_TASKS_PER_OP) {}
_chunk_sources(MAX_IO_TASKS_PER_OP) {
for (auto i = 0; i < MAX_IO_TASKS_PER_OP; i++) {
_chunk_source_profiles[i] = std::make_shared<RuntimeProfile>(strings::Substitute("ChunkSource$0", i));
}
}
Status ScanOperator::prepare(RuntimeState* state) {
RETURN_IF_ERROR(SourceOperator::prepare(state));
@ -120,8 +123,10 @@ bool ScanOperator::is_finished() const {
return true;
}
void ScanOperator::set_finishing(RuntimeState* state) {
Status ScanOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
_merge_chunk_source_profiles();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> ScanOperator::pull_chunk(RuntimeState* state) {
@ -175,12 +180,12 @@ Status ScanOperator::_trigger_next_scan(RuntimeState* state, int chunk_source_in
workgroup::ScanTask task = workgroup::ScanTask(_workgroup, [this, state, chunk_source_index](int worker_id) {
{
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(state->instance_mem_tracker());
size_t num_read_chunks = 0;
_chunk_sources[chunk_source_index]->buffer_next_batch_chunks_blocking_for_workgroup(
_buffer_size, _is_finished, &num_read_chunks, worker_id, _workgroup);
// TODO (by laotan332): More detailed information is needed
_workgroup->incr_period_scaned_chunk_num(num_read_chunks);
_workgroup->increment_real_runtime_ns(_chunk_sources[chunk_source_index]->last_spent_cpu_time_ns());
}
_num_running_io_tasks--;
@ -236,7 +241,7 @@ Status ScanOperator::_pickup_morsel(RuntimeState* state, int chunk_source_index)
if (maybe_morsel.has_value()) {
auto morsel = std::move(maybe_morsel.value());
DCHECK(morsel);
_chunk_sources[chunk_source_index] = create_chunk_source(std::move(morsel));
_chunk_sources[chunk_source_index] = create_chunk_source(std::move(morsel), chunk_source_index);
auto status = _chunk_sources[chunk_source_index]->prepare(state);
if (!status.ok()) {
_chunk_sources[chunk_source_index] = nullptr;
@ -249,6 +254,19 @@ Status ScanOperator::_pickup_morsel(RuntimeState* state, int chunk_source_index)
return Status::OK();
}
void ScanOperator::_merge_chunk_source_profiles() {
std::vector<RuntimeProfile*> profiles(_chunk_source_profiles.size());
for (auto i = 0; i < _chunk_source_profiles.size(); i++) {
profiles[i] = _chunk_source_profiles[i].get();
}
RuntimeProfile::merge_isomorphic_profiles(profiles);
RuntimeProfile* merged_profile = profiles[0];
_unique_metrics->copy_all_info_strings_from(merged_profile);
_unique_metrics->copy_all_counters_from(merged_profile);
}
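_merge_chunk_source_profiles combines the per-chunk-source profiles into one, then copies the result into the operator's unique metrics. A stand-in for the counter part of merge_isomorphic_profiles, summing same-named counters across plain maps (the real merge also handles child profiles, timers, and info strings):

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Merge N isomorphic counter maps by summing same-named counters;
// a simplification of RuntimeProfile::merge_isomorphic_profiles.
inline std::map<std::string, int64_t> merge_counter_maps(
        const std::vector<std::map<std::string, int64_t>>& profiles) {
    std::map<std::string, int64_t> merged;
    for (const auto& profile : profiles) {
        for (const auto& [name, value] : profile) {
            merged[name] += value;  // absent keys start at 0
        }
    }
    return merged;
}
```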
// ========== ScanOperatorFactory ==========
ScanOperatorFactory::ScanOperatorFactory(int32_t id, ScanNode* scan_node)


@ -28,7 +28,7 @@ public:
bool is_finished() const override;
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;
@ -38,21 +38,29 @@ public:
// interface for different scan node
virtual Status do_prepare(RuntimeState* state) = 0;
virtual void do_close(RuntimeState* state) = 0;
virtual ChunkSourcePtr create_chunk_source(MorselPtr morsel) = 0;
protected:
ScanNode* _scan_node = nullptr;
virtual ChunkSourcePtr create_chunk_source(MorselPtr morsel, int32_t chunk_source_index) = 0;
private:
static constexpr int MAX_IO_TASKS_PER_OP = 4;
const size_t _buffer_size = config::pipeline_io_buffer_size;
// This method is only invoked when the current morsel has reached EOF
// and all cached chunks of this morsel have been read out
Status _pickup_morsel(RuntimeState* state, int chunk_source_index);
Status _trigger_next_scan(RuntimeState* state, int chunk_source_index);
Status _try_to_trigger_next_scan(RuntimeState* state);
void _merge_chunk_source_profiles();
protected:
ScanNode* _scan_node = nullptr;
// ScanOperator may do parallel scan, so each _chunk_sources[i] needs to hold
// a profile independently. To be more specific, _chunk_sources[i] will go through
// many ChunkSourcePtrs over its entire lifetime, and all these ChunkSources of _chunk_sources[i]
// should share one profile because they are serial in the timeline.
// All these parallel profiles will be merged into ScanOperator's profile at the end.
std::vector<std::shared_ptr<RuntimeProfile>> _chunk_source_profiles;
private:
static constexpr int MAX_IO_TASKS_PER_OP = 4;
const size_t _buffer_size = config::pipeline_io_buffer_size;
bool _is_finished = false;


@ -23,7 +23,10 @@ public:
bool need_input() const override;
bool is_finished() const override { return _is_finished && !_curr_chunk && !_pre_output_chunk; }
void set_finishing(RuntimeState* state) override { _is_finished = true; }
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -39,9 +39,10 @@ public:
bool is_finished() const override { return _is_finished || _except_ctx->is_finished(); }
void set_finishing(RuntimeState* state) override {
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
_except_ctx->finish_build_ht();
return Status::OK();
}
Status prepare(RuntimeState* state) override;


@ -28,7 +28,7 @@ public:
return _except_ctx->is_dependency_finished(_dependency_index) && _except_ctx->is_output_finished();
}
void set_finished(RuntimeState* state) override { _except_ctx->set_finished(); }
Status set_finished(RuntimeState* state) override { return _except_ctx->set_finished(); }
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -36,9 +36,10 @@ public:
return _except_ctx->is_dependency_finished(_dependency_index) && (_is_finished || _except_ctx->is_ht_empty());
}
void set_finishing(RuntimeState* state) override {
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
_except_ctx->finish_probe_ht();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -42,9 +42,10 @@ public:
bool is_finished() const override { return _is_finished || _intersect_ctx->is_finished(); }
void set_finishing(RuntimeState* state) override {
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
_intersect_ctx->finish_build_ht();
return Status::OK();
}
Status prepare(RuntimeState* state) override;


@ -17,4 +17,4 @@ void IntersectOutputSourceOperatorFactory::close(RuntimeState* state) {
SourceOperatorFactory::close(state);
}
} // namespace starrocks::pipeline
} // namespace starrocks::pipeline


@ -29,7 +29,10 @@ public:
return _intersect_ctx->is_dependency_finished(_dependency_index) && _intersect_ctx->is_output_finished();
}
void set_finished(RuntimeState* state) override { _intersect_ctx->set_finished(); }
Status set_finished(RuntimeState* state) override {
_intersect_ctx->set_finished();
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;


@ -28,8 +28,7 @@ Status IntersectProbeSinkOperatorFactory::prepare(RuntimeState* state) {
void IntersectProbeSinkOperatorFactory::close(RuntimeState* state) {
Expr::close(_dst_exprs, state);
OperatorFactory::close(state);
}
} // namespace starrocks::pipeline
} // namespace starrocks::pipeline


@ -38,9 +38,10 @@ public:
(_is_finished || _intersect_ctx->is_ht_empty());
}
void set_finishing(RuntimeState* state) override {
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
_intersect_ctx->finish_probe_ht();
return Status::OK();
}
void close(RuntimeState* state) override;


@ -26,6 +26,7 @@ StatusOr<vectorized::ChunkPtr> UnionConstSourceOperator::pull_chunk(starrocks::R
DCHECK_EQ(_const_expr_lists[_next_processed_row_index + row_i].size(), columns_count);
ColumnPtr src_column = _const_expr_lists[_next_processed_row_index + row_i][col_i]->evaluate(nullptr);
RETURN_IF_HAS_ERROR(_const_expr_lists[_next_processed_row_index + row_i]);
auto cur_row_dst_column =
vectorized::ColumnHelper::move_column(dst_slot->type(), dst_slot->is_nullable(), src_column, 1);
dst_column->append(*cur_row_dst_column, 0, 1);


@ -34,9 +34,6 @@ public:
bool is_finished() const override { return !has_output(); };
// finish is noop.
void set_finishing(RuntimeState* state) override{};
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;
private:


@ -43,7 +43,10 @@ public:
bool is_finished() const override { return _is_finished && _dst_chunk == nullptr; }
void set_finishing(RuntimeState* state) override { _is_finished = true; }
Status set_finishing(RuntimeState* state) override {
_is_finished = true;
return Status::OK();
}
Status push_chunk(RuntimeState* state, const ChunkPtr& src_chunk) override;


@ -22,12 +22,13 @@ StatusOr<vectorized::ChunkPtr> LocalMergeSortSourceOperator::pull_chunk(RuntimeS
return _sort_context->pull_chunk();
}
void LocalMergeSortSourceOperator::set_finishing(RuntimeState* state) {
Status LocalMergeSortSourceOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
return Status::OK();
}
void LocalMergeSortSourceOperator::set_finished(RuntimeState* state) {
_sort_context->set_finished();
Status LocalMergeSortSourceOperator::set_finished(RuntimeState* state) {
return _sort_context->set_finished();
}
bool LocalMergeSortSourceOperator::has_output() const {


@ -35,8 +35,8 @@ public:
void add_morsel(Morsel* morsel) {}
void set_finishing(RuntimeState* state) override;
void set_finished(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
private:
bool _is_finished = false;


@ -14,6 +14,7 @@
#include "exec/vectorized/chunks_sorter_full_sort.h"
#include "exec/vectorized/chunks_sorter_topn.h"
#include "exprs/expr.h"
#include "runtime/current_thread.h"
#include "runtime/exec_env.h"
#include "runtime/runtime_state.h"
@ -22,6 +23,7 @@ using namespace starrocks::vectorized;
namespace starrocks::pipeline {
Status PartitionSortSinkOperator::prepare(RuntimeState* state) {
RETURN_IF_ERROR(Operator::prepare(state));
_chunks_sorter->setup_runtime(_unique_metrics.get());
return Status::OK();
}
@ -37,17 +39,18 @@ StatusOr<vectorized::ChunkPtr> PartitionSortSinkOperator::pull_chunk(RuntimeStat
Status PartitionSortSinkOperator::push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) {
vectorized::ChunkPtr materialize_chunk = ChunksSorter::materialize_chunk_before_sort(
chunk.get(), _materialized_tuple_desc, _sort_exec_exprs, _order_by_types);
RETURN_IF_ERROR(_chunks_sorter->update(state, materialize_chunk));
TRY_CATCH_BAD_ALLOC(RETURN_IF_ERROR(_chunks_sorter->update(state, materialize_chunk)));
return Status::OK();
}
void PartitionSortSinkOperator::set_finishing(RuntimeState* state) {
_chunks_sorter->finish(state);
Status PartitionSortSinkOperator::set_finishing(RuntimeState* state) {
RETURN_IF_ERROR(_chunks_sorter->finish(state));
// Current partition sort is ended, and
// the last call will drive LocalMergeSortSourceOperator to work.
_sort_context->finish_partition(_chunks_sorter->get_partition_rows());
_is_finished = true;
return Status::OK();
}
Status PartitionSortSinkOperatorFactory::prepare(RuntimeState* state) {
@ -68,16 +71,16 @@ OperatorPtr PartitionSortSinkOperatorFactory::create(int32_t degree_of_paralleli
if (_limit <= ChunksSorter::USE_HEAP_SORTER_LIMIT_SZ) {
chunks_sorter = std::make_unique<HeapChunkSorter>(
runtime_state(), &(_sort_exec_exprs.lhs_ordering_expr_ctxs()), &_is_asc_order, &_is_null_first,
_offset, _limit, SIZE_OF_CHUNK_FOR_TOPN);
_sort_keys, _offset, _limit, SIZE_OF_CHUNK_FOR_TOPN);
} else {
chunks_sorter = std::make_unique<ChunksSorterTopn>(
runtime_state(), &(_sort_exec_exprs.lhs_ordering_expr_ctxs()), &_is_asc_order, &_is_null_first,
_offset, _limit, SIZE_OF_CHUNK_FOR_TOPN);
_sort_keys, _offset, _limit, SIZE_OF_CHUNK_FOR_TOPN);
}
} else {
chunks_sorter = std::make_unique<vectorized::ChunksSorterFullSort>(
runtime_state(), &(_sort_exec_exprs.lhs_ordering_expr_ctxs()), &_is_asc_order, &_is_null_first,
SIZE_OF_CHUNK_FOR_FULL_SORT);
_sort_keys, SIZE_OF_CHUNK_FOR_FULL_SORT);
}
auto sort_context = _sort_context_factory->create(driver_sequence);


@ -63,7 +63,7 @@ public:
Status push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) override;
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
private:
bool _is_finished = false;
@ -89,7 +89,7 @@ public:
PartitionSortSinkOperatorFactory(
int32_t id, int32_t plan_node_id, std::shared_ptr<SortContextFactory> sort_context_factory,
SortExecExprs& sort_exec_exprs, std::vector<bool> is_asc_order, std::vector<bool> is_null_first,
int64_t offset, int64_t limit, const std::vector<OrderByType>& order_by_types,
const std::string& sort_keys, int64_t offset, int64_t limit, const std::vector<OrderByType>& order_by_types,
TupleDescriptor* materialized_tuple_desc, const RowDescriptor& parent_node_row_desc,
const RowDescriptor& parent_node_child_row_desc, const std::vector<ExprContext*>& analytic_partition_exprs)
: OperatorFactory(id, "partition_sort_sink", plan_node_id),
@ -97,6 +97,7 @@ public:
_sort_exec_exprs(sort_exec_exprs),
_is_asc_order(is_asc_order),
_is_null_first(is_null_first),
_sort_keys(sort_keys),
_offset(offset),
_limit(limit),
_order_by_types(order_by_types),
@ -118,6 +119,7 @@ private:
SortExecExprs& _sort_exec_exprs;
std::vector<bool> _is_asc_order;
std::vector<bool> _is_null_first;
const std::string _sort_keys;
int64_t _offset;
int64_t _limit;
const std::vector<OrderByType>& _order_by_types;


@ -4,6 +4,13 @@
namespace starrocks::pipeline {
void TableFunctionOperator::close(RuntimeState* state) {
if (_table_function != nullptr && _table_function_state != nullptr) {
_table_function->close(state, _table_function_state);
}
Operator::close(state);
}
bool TableFunctionOperator::has_output() const {
return _input_chunk != nullptr && (_remain_repeat_times > 0 || _input_chunk_index < _input_chunk->num_rows());
}
@ -16,8 +23,9 @@ bool TableFunctionOperator::is_finished() const {
return _is_finished && !has_output();
}
void TableFunctionOperator::set_finishing(RuntimeState* state) {
Status TableFunctionOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
return Status::OK();
}
Status TableFunctionOperator::prepare(RuntimeState* state) {
@ -58,7 +66,8 @@ Status TableFunctionOperator::prepare(RuntimeState* state) {
return_types.emplace_back(return_type.type);
}
_table_function = vectorized::get_table_function(table_function_name, arg_types, return_types);
_table_function =
vectorized::get_table_function(table_function_name, arg_types, return_types, table_fn.binary_type);
if (_table_function == nullptr) {
return Status::InternalError("can't find table function " + table_function_name);
}
@ -69,7 +78,8 @@ Status TableFunctionOperator::prepare(RuntimeState* state) {
_remain_repeat_times = 0;
_table_function_exec_timer = ADD_TIMER(_unique_metrics, "TableFunctionTime");
return _table_function->prepare(_table_function_state);
RETURN_IF_ERROR(_table_function->prepare(_table_function_state));
return _table_function->open(state, _table_function_state);
}
StatusOr<vectorized::ChunkPtr> TableFunctionOperator::pull_chunk(RuntimeState* state) {


@ -2,8 +2,6 @@
#pragma once
#include "column/column_helper.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
#include "exec/pipeline/operator.h"
#include "exprs/expr.h"
@ -21,13 +19,15 @@ public:
Status prepare(RuntimeState* state) override;
void close(RuntimeState* state) override;
bool has_output() const override;
bool need_input() const override;
bool is_finished() const override;
void set_finishing(RuntimeState* state) override;
Status set_finishing(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;
@ -38,7 +38,7 @@ private:
void _process_table_function();
const TPlanNode& _tnode;
const vectorized::TableFunction* _table_function;
const vectorized::TableFunction* _table_function = nullptr;
// Slots output by the table function
std::vector<SlotId> _fn_result_slots;
@ -52,15 +52,15 @@ private:
// Input chunk currently being processed
vectorized::ChunkPtr _input_chunk;
// Index of the row being processed in the current chunk
size_t _input_chunk_index;
size_t _input_chunk_index = 0;
// How many more times the current outer row needs to be repeated
size_t _remain_repeat_times;
size_t _remain_repeat_times = 0;
// table function result
std::pair<vectorized::Columns, vectorized::ColumnPtr> _table_function_result;
// has the table function finished returning results?
bool _table_function_result_eos;
bool _table_function_result_eos = false;
// table function param and return offset
vectorized::TableFunctionState* _table_function_state;
vectorized::TableFunctionState* _table_function_state = nullptr;
//Profile
RuntimeProfile::Counter* _table_function_exec_timer = nullptr;

Some files were not shown because too many files have changed in this diff.