Compare commits

...

281 Commits

Author SHA1 Message Date
Gavin 1f6d8515f1
[Cherry-pick][Feature] Add a case sensitive flag to hdfs scan node to indicate whether (#9744) (#11275) 2022-09-16 15:52:01 +08:00
Smith Cruise 8fc334de2c
fix client (#10932)
[Enhancement] enable FE to list all files in HDFS recursively
2022-09-07 11:36:33 +08:00
Binglin Chang a9bdb093cd [Bugfix] clear rowsets before load (#9193)
SchemaChange calls _load_from_pb to reload metadata, but _load_from_pb forgets to reset the _rowsets vector, so some old rowsets may still be stored in _rowsets, causing a "no delvec found" error, because the delvec corresponding to the old rowset has already been removed. This PR clears the _rowsets vector before loading new rowsets from meta.
2022-07-26 19:55:45 +08:00
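A minimal sketch of the fix described above, using stand-in types (`Rowset`, the metadata vector) rather than the real StarRocks classes:

```
#include <cstdint>
#include <map>
#include <memory>
#include <vector>

struct Rowset {};  // stand-in for the real rowset type
using RowsetSharedPtr = std::shared_ptr<Rowset>;

class TabletUpdates {
public:
    void load_from_pb(const std::vector<RowsetSharedPtr>& meta_rowsets) {
        // The bug: _rowsets kept stale entries from before the schema change,
        // whose delete vectors had already been removed ("no delvec found").
        // The fix: reset the container before loading rowsets from meta.
        _rowsets.clear();
        for (uint32_t i = 0; i < meta_rowsets.size(); ++i) {
            _rowsets[i] = meta_rowsets[i];
        }
    }

private:
    std::map<uint32_t, RowsetSharedPtr> _rowsets;
};
```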
hellolilyliuyi fd0b30309f
Branch 2.3 (#9222)
* Update release-2.3.md

* [docs] pipeline-related-changes-in-2.3

* Update release-2.3.md

* Update release-2.3.md

* Update release-2.3.md
2022-07-26 19:49:15 +08:00
Youngwb d6f301b48d [Enhancement] improve CompoundPredicateOperator hashcode performance (#9186)
(cherry picked from commit 8e60a00034)
2022-07-26 15:13:00 +08:00
Sihui 9720ef25a5
Update External_table.md (#9197) 2022-07-26 13:41:14 +08:00
mergify[bot] 49a6d7e7c3
Fix the deadlock of resource group (#9147) (#9174) 2022-07-25 23:04:11 +08:00
rickif 93d614a886
[BugFix] Disable compression in sending chunk by default (#9161) 2022-07-25 21:34:33 +08:00
hellolilyliuyi abed992569
[Docs]modify navigation 2.3 (#9184)
* Update release-2.3.md

* [docs]modify navigation_2.3

* Update StarRocks_intro.md

* [docs]modify navigation_2.3
2022-07-25 21:32:06 +08:00
hellolilyliuyi 3c57b790f5
[Docs]modify navigation 2.3 (#9180)
* Update release-2.3.md

* [docs]modify navigation_2.3
2022-07-25 21:20:22 +08:00
zihe.liu b0f1c548d5
[Bugfix] Prefer pipeline parallel for non-local one-phase (#9162) 2022-07-25 20:42:50 +08:00
trueeyu 6de3061591
[Others] Add config for librdkafka debug (#8783) (#9167) 2022-07-25 20:23:41 +08:00
zhangqiang a98a99a333 [BugFix] Primary key length is inconsistent with be (#9148)
PersistentIndex is available in version 2.3, but there are many limitations on key columns. For example, varchar (char) key columns are not supported, and the total length of the key columns cannot exceed 64 bytes. However, the length calculation logic in FE is currently inconsistent with BE (for DATE/DATETIME), so we may reject some create requests that we could actually serve.
2022-07-25 18:58:47 +08:00
Pslydhh ba2b3fcd13
[Enhancement] support BIGINT/INT argument for window_funnel (#9032) (#9124) 2022-07-24 13:08:32 +08:00
Youngwb 03bc800be7
[Cherry-pick][Branch-2.3] Fix CompoundPredicateOperator not equals when same children with different order (#8810) (#9103) 2022-07-23 19:36:41 +08:00
zhangqiang 62460094a4 [Enhancement] Reject create table request if checkPersistentIndex failed (#9021)
PersistentIndex is available in branch-2.3, but there are many limitations on key columns. For example, varchar (char) key columns are not supported, and the total length of the key columns cannot exceed 64 bytes.

In the previous implementation, if we created a primary key table with PersistentIndex enabled but these restrictions were not met, we silently used an in-memory PrimaryIndex instead of PersistentIndex, and the table was still created successfully. This can be confusing to the user.

So we now reject the create table request directly if we find that the restrictions are not met.
2022-07-23 17:23:46 +08:00
Murphy 26f0d36d75
[WIP] [Enhance] make re2 driver-local to reduce contention (backport #8904) (#9042) 2022-07-23 16:11:20 +08:00
HangyuanLiu 3f974966a7
Fix password parse string bug (#9038) (#9072) 2022-07-23 14:18:38 +08:00
Sihui 6054da00b4
changes on sql statements (#9094) 2022-07-23 14:12:55 +08:00
stdpain 00af088650 [Bugfix] fix "unsupport decode_dict_codes" error in late_materized (#9046)
After PR #8869, GlobalDictCodeColumnIterator no longer supports `decode_dict_codes(const int32_t* codes, size_t size, vectorized::Column* words)`;
we need to call `decode_dict_codes(const vectorized::Column& codes, vectorized::Column* words)` instead.
2022-07-23 14:08:57 +08:00
stdpain 25d72b3de7 [Bugfix] Fix wrong result when process 'is null' in condition expr in dictionary optimization (#8869) 2022-07-23 14:08:57 +08:00
liuyehcf 4a5216d1ce
[Enhancement] Add invisible session variable 'profile_timeout' (#8999) (#9082) 2022-07-23 10:54:06 +08:00
amber-create cf285345c8
Branch 2.3 (#9056)
* Update Spark_connector.md

* Update Spark_connector.md
2022-07-22 16:29:17 +08:00
amber-create 59dc146a6a
Update Spark_connector.md (#9013) 2022-07-22 16:24:27 +08:00
mergify[bot] 3ac4a3434b
Update sonar4fe.yml (#9027) (#9033)
(cherry picked from commit ae8ac463e5)

Co-authored-by: Stephen <dulong41@gmail.com>
2022-07-22 10:57:10 +08:00
Stephen 345770cefc
Update pom.xml (#9023) 2022-07-22 10:11:05 +08:00
mergify[bot] bb6dca2b57
display data_type in information_schema.columns for (#8895) (#8900) 2022-07-22 07:30:27 +08:00
mergify[bot] 5a2951dbea
[Bug] fix call JNI function in bthread in UDAF when load class failed (#8970) (#9030) 2022-07-22 07:22:17 +08:00
Youngwb 1d98dcc4fd
[BugFix] Disable one stage aggregate with one distinct function (backport #8918) (#8986) 2022-07-21 22:03:51 +08:00
mergify[bot] 8740f3cbc0
[Bugfix] Cancel query when throwing any exception (backport #9005) (#9018) 2022-07-21 21:58:38 +08:00
mergify[bot] 50842a4161
[BugFix] BrokerLoad can't handle kerberos login with multiple keytabs (backport #8820) (#8836) 2022-07-21 21:58:06 +08:00
mergify[bot] 135188a591
[BugFix] Fix LocalTabletsChannel head-use-after-free (#8978) (#9001)
Fix #8906

(cherry picked from commit a43283ae75)

Co-authored-by: Alex Zhu <zhuming9011@gmail.com>
2022-07-21 19:07:53 +08:00
Stephen 837f38d6e8
add sonar cloud check for fe (#8992)
Co-authored-by: dulong <dulong@starrocks.com>
2022-07-21 16:31:58 +08:00
Stephen a07a97167e
add pr documentation label (#8988)
Co-authored-by: dulong <dulong@starrocks.com>
2022-07-21 15:35:04 +08:00
padmejin 85041f8d19 [BugFix] Persist `LoadStatistic` in `EtlStatus` (#8689)
The main reason is that LoadJob did not persist LoadStatistic during either replay or snapshot. Unfortunately, there is no obvious way to add new metadata, since we persist the LoadJob class in a hard-coded way. After due consideration, we serialize LoadStatistic in JSON, for the convenience of adding/deleting fields in the future, and hide this JSON in a deprecated map in EtlStatus.

(cherry picked from commit f931637a65)
2022-07-21 14:12:58 +08:00
rickif aae9599c6c [Enhancement] Add loading error to error_url (#7882) 2022-07-21 11:35:32 +08:00
mergify[bot] 56c72498e9
[BugFix] Fix OlapTableSink close accelerate release resource (#8893) (#8911)
(cherry picked from commit b310b4858a)

Co-authored-by: meegoo <hujie-dlut@qq.com>
2022-07-19 19:21:07 +08:00
hellolilyliuyi 110cb4da15
Update release-2.3.md (#8875) 2022-07-18 22:16:22 +08:00
hellolilyliuyi a4aa2f0a6c
docs add navigation (#8868) 2022-07-18 21:57:25 +08:00
sevev f716a49b73 Fix be crash in ASAN mode 2022-07-18 21:37:06 +08:00
hellolilyliuyi edd1e4c5a4
[docs] Update release-2.3.md (#8834)
* Update release-2.0.md

* Update release-2.3.md
2022-07-18 13:33:34 +08:00
xyz 39a4db537d [BugFix] memory-scratch-sink output order error (#8578)
In the current implementation, the output order of the memory scratch sink is based on the tuple_descriptors.
This is not correct; the right order should be based on the output_expr.
Conflicts:
	be/src/util/arrow/starrocks_column_to_arrow.cpp
2022-07-18 12:46:31 +08:00
Seaven d7fb0bc9b0
[BugFix] Fix complex exists/in subquery bug (#8687) (#8765) 2022-07-18 10:54:11 +08:00
hellolilyliuyi 43e09576b2
Update release-2.0.md (#8828) 2022-07-18 10:24:44 +08:00
mergify[bot] 04bc550db0
[Bugfix] fix insert overflow decimal value (backport #7280) (#8625) 2022-07-17 11:13:47 +08:00
mergify[bot] fffff1de1a
[BugFix] support reading viewfs:// (#8582) (#8789) 2022-07-16 09:38:50 +08:00
Li Jiao 61151b357f
fix the relative links under docs/ (#8739) (#8796) 2022-07-15 23:37:48 +08:00
絵空事スピリット a792b15c11
[Doc] Mask host name (#8784) 2022-07-15 19:53:56 +08:00
絵空事スピリット d65590ebce
Update Deployment.md (#8778) 2022-07-15 19:36:03 +08:00
絵空事スピリット bafc32801a
[Doc] Add Deployment (#8770) 2022-07-15 19:16:22 +08:00
mergify[bot] 8f79a8566a
[BugFix] fix unstable ut (#8759) (#8767)
com.starrocks.clone.TabletSchedulerTest#testSubmitBatchTaskIfNotExpired

(cherry picked from commit 9536d49537)

Co-authored-by: padmejin <89557328+padmejin@users.noreply.github.com>
2022-07-15 17:54:05 +08:00
Youngwb d6eaeb05b9
[Cherry-pick][branch-2.3] Fix unknown error when having clause has subquery (#8662) (#8763) 2022-07-15 17:30:41 +08:00
mergify[bot] af951c9762
[BugFix] validate target length when alter json column (#8725) (#8746)
(cherry picked from commit 308acf8c5e)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-07-15 16:03:03 +08:00
mergify[bot] b74d44a265
[BugFix] fix the case of dead workgroup (#8729) (#8736)
(cherry picked from commit b8dcc73588)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-07-15 16:02:43 +08:00
zihe.liu b370984f7c
[Enhancement] Add MorselsCount and TabletCount to profile of scan operator #8644 (#8681) 2022-07-15 13:57:46 +08:00
meegoo c7e5157611 [CherryPick] Support async olap table sink interface 2022-07-15 10:38:43 +08:00
Murphy c686ec164e
[BugFix] fix NPE of resource group (#8711) 2022-07-15 09:43:27 +08:00
Murphy 43290de111
[Enhance] use max chunk rows to estimate chunk memory (#8706) 2022-07-14 23:04:00 +08:00
mergify[bot] bfdc9434c8
[Enhance] turn on the enable_exchange_pass_through by default (#7906) (#8658)
(cherry picked from commit f65678fca6)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-07-14 21:12:29 +08:00
zihe.liu 6ffb863c2c
[Enhancement] Make error messages of workgroup clearer (#8663) (#8677) 2022-07-14 19:01:12 +08:00
絵空事スピリット f22cdeff91
Add Docs to 2.3 (#8669) 2022-07-14 14:13:59 +08:00
mergify[bot] 8cbb042100
[BugFix] Check invalid window argument (backport #8300) (#8641) 2022-07-13 22:37:30 +08:00
mergify[bot] 5ae3aa9768
[BugFix] Fix rate limit of tablet deletion (#8602) (#8654)
If FE finds that the tablet meta has already been deleted when handling a tablet report (i.e. tabletMeta == null), we should also rate-limit the tablet deletion tasks sent to BE.

(cherry picked from commit 5a7288df21)

Co-authored-by: yiming <107105845+nshangyiming@users.noreply.github.com>
2022-07-13 22:34:51 +08:00
Murphy 81e1b6274e
[Feature] display resource group info in explain verbose(backport #8481) (#8631) 2022-07-13 21:04:20 +08:00
satanson 01b6bff02a
fixup avg(distinct non-decimal-type) bugs (#8638)
select avg(distinct non-decimal-type (i.e. BIGINT)) from table gives an invalid multi_distinct_sum signature.
expected: multi_distinct_count[([multi_distinct_count, VARCHAR, false]); args: INT; result: BIGINT
unexpected: multi_distinct_count[([multi_distinct_count, VARCHAR, false]); args: INT; result: DECIMAL128(38,0).
2022-07-13 19:18:28 +08:00
padmejin d39f052173 [BugFix] add read lock to RoutineLoadManager to avoid log out of order (#8295)
1. A user creates a new routine job from a client.
2. The request is handled by a thread, transforming the SQL into a RoutineLoadJob and then putting it into a map of the RoutineLoadManager class in a function guarded by a write lock.
3. The RoutineLoadTaskScheduler thread happens to start a new loop just then, getting all the routine load jobs from the map in the RoutineLoadManager class. Since there's no read lock, it finds the newly added job and schedules it. Then it writes a journal on a state change of this new job.
4. The SQL executor thread is a bit slower in handling the new job. In the end, it writes a journal on creating the new job.
From the follower's view, the journal of creating the job comes after the journal of changing its state. As a result, it fails to replay the former journal, because it cannot change the state of a non-existent job.

The solution is simple: adding a read lock is enough. Since creating a routine job is not a frequent operation, the extra cost to the scheduler thread is negligible.

(cherry picked from commit b3a81f021b)
2022-07-13 15:01:45 +08:00
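The race is language-agnostic; a minimal C++ sketch with hypothetical class and member names illustrates the point — the scheduler must take the read lock so it cannot observe a job before its creation journal is written:

```
#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <vector>

struct RoutineLoadJob { std::string name; };

class RoutineLoadManager {
public:
    // Writer path (SQL executor thread): register the job and write the
    // creation journal under the same write lock.
    void add_job(const std::string& id, RoutineLoadJob job) {
        std::unique_lock<std::shared_mutex> guard(_lock);
        _jobs.emplace(id, std::move(job));
        write_creation_journal(id);  // hypothetical journal call
    }

    // Reader path (scheduler thread): without this shared lock, the
    // scheduler could pick up a job and journal a state change before
    // the creation journal exists, breaking replay on followers.
    std::vector<RoutineLoadJob> get_all_jobs() const {
        std::shared_lock<std::shared_mutex> guard(_lock);
        std::vector<RoutineLoadJob> result;
        for (const auto& entry : _jobs) result.push_back(entry.second);
        return result;
    }

private:
    void write_creation_journal(const std::string&) const { /* ... */ }

    mutable std::shared_mutex _lock;
    std::unordered_map<std::string, RoutineLoadJob> _jobs;
};
```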
zihe.liu 30be784dd3
[Enhancement] add chunk_accumulate_operator (backport #8535) (#8604) 2022-07-12 23:04:50 +08:00
HangyuanLiu 975d48517b Fix embedded quotation contains backslash bug (#8293) 2022-07-12 22:01:11 +08:00
HangyuanLiu 08b0d6cdd1 Make Parser parsing syntax errors compatible with mysql's error format (#8099) 2022-07-12 22:01:11 +08:00
HangyuanLiu 041ef32381 Fix bug QueryStatement return null when getRedirectStatus (#8087) 2022-07-12 22:01:11 +08:00
HangyuanLiu b5cee15aea Delete old parser used in ConnectProcessor (#7979) 2022-07-12 22:01:11 +08:00
HangyuanLiu 66c73bffc3 Add drop statistic table check (#4587) (#8416) 2022-07-12 22:01:11 +08:00
satanson 90572fc41d
rectify wildcard decimal types of multi_distinct_sum converted from sum(distinct) and avg(distinct) (#8425) (#8571)
In the optimize phase, sum(distinct c) is converted into multi_distinct_sum(c), and avg(distinct c) is converted into multi_distinct_sum(c) / multi_distinct_count(c). The resolved function signature for decimal types carries a wildcard decimal, which is illegal in BE, so we must rectify wildcard decimal types by replacing them with real decimal types, as we do in the analyze phase.
2022-07-12 15:34:52 +08:00
padmejin ad7bf9f583 [BugFix] Late recycle if repair tablet from recycle bin (#8254)
Submit repair tasks only if no expired db/table/partition is involved, to avoid an NPE when a tablet is added after its db/table/partition has been dropped.
2022-07-12 13:16:34 +08:00
mergify[bot] 312bbc32fe
[Enhancement] add log configuration for the jars called via JNI in BE (backport #8104) (#8563) 2022-07-12 10:17:21 +08:00
mergify[bot] 0571c2c42d
[Bugfix] Add hive external table counter (#8484) (#8529)
(cherry picked from commit 0c9573a552)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-07-11 16:14:04 +08:00
mergify[bot] e1ca6cd66b
[Bugfix] fix hive column stats cache bug (#8455) (#8530)
(cherry picked from commit 7393e8b77a)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-07-11 16:13:53 +08:00
mergify[bot] f1c783108b
[BugFix] Fix storage cooldown time ut (#8486) (#8502) 2022-07-10 08:07:33 +08:00
zihe.liu 1c373141bc
[Bugfix] release request memory before instance_mem_tracker is destructed (backport #8473) (#8496) 2022-07-09 22:59:04 +08:00
zihe.liu e3aa62c460
[Enhancement] use memchr instead of SIMD::count_nonzero to calculate has_null (#8450) (#8474) 2022-07-09 16:59:07 +08:00
Murphy 33aa4b003a
[Enhance] allow simple limit sql exceed bigquery_scan_rows_limit (backport #8380) (#8460) 2022-07-09 15:19:53 +08:00
kangkaisen d0aff53f0f
Remove meaningless DCHECK for create_varchar_type (#8107) (#8462) 2022-07-09 11:04:30 +08:00
zihe.liu 9a462aff9a
[Enhancement] limit buffer capacity instead of scan concurrency (#8427) 2022-07-09 11:03:30 +08:00
mergify[bot] de4c12a914
[Feature] set global_runtime_filter_build_max_size to 0 to force use filter (backport #8445) (#8448) 2022-07-09 10:26:10 +08:00
padmejin dada85ae2f [BugFix] Skip check partition when change kafka offset if not initialized (#8290)
(cherry picked from commit 99e5ff8d80)
2022-07-09 09:48:33 +08:00
mergify[bot] b22be30145
[BugFix] raise exception if image download fails (#8111) (#8440)
Fix these 2 cases:
1. No exception is thrown when loading an empty image file.
2. No exception is thrown when failing to download an image from the helper, for example, if the image dir is readable to
     the FE process but the image file is not.

(cherry picked from commit 8d5e750107)

Co-authored-by: padmejin <89557328+padmejin@users.noreply.github.com>
2022-07-08 22:35:59 +08:00
yan.zhang 9636116452
move `scan_operator->prepare` operation into IO thread (#8423) 2022-07-08 17:38:23 +08:00
zihe.liu 70f7ac1e79
[Bugfix] use correct mem_tracker for exchange/result sink (backport #8402) (#8414) 2022-07-08 14:26:06 +08:00
gengjun-git a7f7c627cb [Enhancement] Add some cleaner config param to bdb #7993
Add bdbje_cleaner_threads and bdbje_replay_cost_percent to speed up cleaning. If the bdb directory keeps expanding, set bdbje_cleaner_threads to a higher value (4, for example) and set bdbje_replay_cost_percent to 0.

(cherry picked from commit 1793478e2a)
2022-07-08 14:09:46 +08:00
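For reference, the settings suggested above would look like this in fe.conf (the parameter names and example values are the ones from the commit message):

```
# Speed up BDB JE log cleaning when the bdb directory keeps growing.
bdbje_cleaner_threads = 4
bdbje_replay_cost_percent = 0
```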
gengjun-git 1f8d9e66f6 [BugFix] Fix bug that thrift server daemon exits quietly (#7974)
Rewrite the execute() function of org.apache.thrift.server.TThreadPoolServer: we do not kill the server for any exception thrown by the ExecutorService; we just close the connection and print an error log. TThreadPoolServer, by contrast, kills the server, after which no new connection is processed and connections saturate the TCP backlog.

Because TThreadPoolServer has some private properties, we create a new implementation of TServer, copy the code of TThreadPoolServer, and modify only the error handling of the execute() function.

(cherry picked from commit 3e49ab438f)
2022-07-08 14:09:08 +08:00
zhangqiang bd22b41b2c [Enhancement] Retry building ImmutableIndexShard if move_bucket failed (#8305)
Currently, move_bucket may fail even when a move solution exists, and building an ImmutableIndex will fail if a shard is very imbalanced. We need to prevent index build failures: when move_bucket fails, we can increase the number of pages and retry until it succeeds.
2022-07-07 19:29:49 +08:00
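A minimal sketch of the retry loop described above; `build_shard` and the page counts are hypothetical stand-ins for the real ImmutableIndexShard construction:

```
#include <cstddef>

struct Status {
    bool ok;
    static Status OK() { return {true}; }
    static Status Error() { return {false}; }
};

// Hypothetical: try to place all buckets of the shard into `npage` pages
// via bucket moves; fails when the shard is too imbalanced to fit.
Status build_shard(std::size_t npage) {
    return npage >= 4 ? Status::OK() : Status::Error();  // dummy placement
}

// Retry with more pages until the placement succeeds: a shard that does
// not fit into npage pages may fit into npage + 1.
Status build_shard_with_retry(std::size_t npage, std::size_t max_npage) {
    for (; npage <= max_npage; ++npage) {
        Status st = build_shard(npage);
        if (st.ok) {
            return st;
        }
    }
    return Status::Error();
}
```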
mergify[bot] bc03bba2cb
[Enhancement] choose src replica with bigger version first when cloning (#8367) (#8382) 2022-07-07 12:40:52 +08:00
zhangqiang 403b87bb98
[Cherry-pick][Branch-2.3][BugFix] Fix some bug of branch-2.3 (#8353)
* BugFix: Fix error update of PersistentIndexMeta (#8288)

* BugFix: Fix potential inconsistency between persistent index file and PersistentIndexMeta (#8286)

The enable_persistent_index flag of the tablet meta can be changed by ALTER, so we can't guarantee that enable_persistent_index will not be changed during the apply process.

If we change enable_persistent_index during apply, it may cause inconsistency between persistent index file and PersistentIndexMeta.
2022-07-07 11:23:22 +08:00
mergify[bot] 09a260b215
change exception type in IntLiteral constructor so that LargeIntLiteral can handle error message correctly (#7281) (#8376) 2022-07-06 23:04:35 +08:00
mergify[bot] d816ea7f73
[BugFix] bug in Array column's `byte_size` (#8308) (#8336)
The arguments passed to _elements->byte_size were (from, from + size); they should be (from, size).

(cherry picked from commit cad8f445a1)

Co-authored-by: xyz <a997647204@gmail.com>
2022-07-06 19:07:13 +08:00
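A small sketch of the argument mix-up, assuming a `byte_size(from, size)` range overload as the message describes (the helper here is a stand-in, not the real Column API):

```
#include <cstddef>
#include <vector>

// Stand-in for Column::byte_size(from, size): bytes used by `size`
// elements starting at offset `from`.
std::size_t byte_size(const std::vector<int>& col, std::size_t from, std::size_t size) {
    (void)col;
    (void)from;
    return size * sizeof(int);
}

std::size_t array_byte_size(const std::vector<int>& elements,
                            std::size_t from, std::size_t size) {
    // Buggy version passed an end offset where a count was expected:
    //   return byte_size(elements, from, from + size);
    return byte_size(elements, from, size);  // fixed: second arg is a count
}
```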
mergify[bot] 8ab2bf2f25
[Enhance] add session variables in profile (backport #8156) (#8322) 2022-07-06 19:04:51 +08:00
mergify[bot] 6616f70e88
[Bugfix] disable lowcardinality optimize in join conjuncts (#8303) (#8349)
(cherry picked from commit 9322dc5423)

Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2022-07-06 18:47:08 +08:00
liuyehcf a97acaf1ea
[BugFix] Fix the problem of identical analytic expressions (#8268) (#8333) 2022-07-06 13:56:23 +08:00
mergify[bot] d8f726de26
[BugFix] heap-use-after-free in ExternalScanContextMgr (#8280) (#8299) 2022-07-05 23:30:05 +08:00
zhangqiang d0e828a144
[BugFix] Fix be crash because of NPE (backport #8242) (#8263)
When _chunk_pool is empty and global_status is not OK, BE will crash because of a null pointer access.
2022-07-05 22:34:04 +08:00
Youngwb 4acda6f4ea [BugFix] Derive group logical property after prune columns when need optimize CTE (#8289)
(cherry picked from commit 68cad8188c)
2022-07-05 21:06:39 +08:00
HangyuanLiu 6ce0ec81c5
[Cherry-pick branch-2.3] InsertTxnCommitAttachment read and write meta use GSON (#8210) (#8247) 2022-07-05 21:02:54 +08:00
nshangyiming cd7188c23c [cherry-pick][BugFix] FORCE_REDUNDANT should also check NEED_FURTHER_REPAIR (#7844)
FORCE_REDUNDANT should also check NEED_FURTHER_REPAIR before dropping a replica,
or else the newly cloned replica with a stale version could be
dropped (because the loading process keeps updating the tablet).

(cherry picked from 80eba32ed4)
2022-07-05 20:14:40 +08:00
mergify[bot] 7ef2250fb1
[BugFix] Fix nullable column update rows not set _has_null (#8139) (#8285)
(cherry picked from commit 64a9bb154e)

Co-authored-by: meegoo <hujie-dlut@qq.com>
2022-07-05 19:40:04 +08:00
mergify[bot] 7eeb47d56e
[BugFix] fixup negative from days (#7275) (#8281)
(cherry picked from commit fe451f0a4f)

Co-authored-by: satanson <ranpanf@gmail.com>
2022-07-05 17:27:38 +08:00
yiming 5c3bd5cc6d [BugFix] clean ghost tablets on BE when handling tablet report (#7989)
We need to clean these ghost tablets from the current backend, or else it will continue to report them to FE forever and add processing overhead (the tablet report process is protected by a database lock).

(cherry picked from commit 6fa1363b37)
2022-07-05 12:37:05 +08:00
nshangyiming c4da8bf78f [Enhance] Avoid meaningless tablet repair scheduling (#7660)
(cherry picked from commit 4781fda428)
2022-07-05 12:36:29 +08:00
yiming 5ca7fe82d6 [BugFix] fix replica meta falsely deleted right after clone finished (#7905)
The main cause of this problem is that FE handles tablet report tasks and clone
tasks concurrently; with specific timing, this will happen. So we add a new state
`deferReplicaDeleteToNextReport` for `Replica`, defaulting to true. When FE meets a replica
that exists only in FE's meta, not in the tablet report, it checks
`deferReplicaDeleteToNextReport` and defers the meta delete until the next report from the BE.
In the following situation, a normally cloned replica could be falsely deleted:
1. BE X generates a tablet report and sends it to FE.
2. FE creates a clone task (for balance or repair) and a new replica on BE X,
     so the corresponding tablet is not included in the report sent above.
3. BE X finishes the clone, then FE receives the message and sets the state
     of the new replica to NORMAL.
4. FE processes the tablet report from step 1 and finds that BE X didn't report
    the tablet info corresponding to the newly created replica, so it deletes
    the replica from its meta.
5. On the next tablet report of BE X, which will include the tablet info, FE finds
     that the tablet reported by BE X doesn't exist in its meta, so it sends a
     request asking BE X to delete the newly cloned replica physically.

(cherry picked from commit 03070130c8)
2022-07-05 12:36:14 +08:00
Binglin Chang d1ddf1f470
[Enhancement] Remove warn log in Replica.updateReplicaInfo (#7824) (#8218) 2022-07-04 17:01:35 +08:00
rickif e14245a7b3
[BugFix] Chunked json for txn stream load (#8046)
This PR adds support for chunked JSON in transaction stream load.
The total size of a transaction is not limited, but the data size of a single write within the transaction is limited to 4 GB.
2022-07-04 10:59:34 +08:00
stdpain f2a18e19a7
[Bugfix] Fix function signature error in array_to_bitmap (#7404) (#8157)
fix function signature for array_to_bitmap
2022-07-02 13:27:57 +08:00
xyz e61902f454
[BugFix] enable-check-string-length (backport #8095) #8167 2022-07-02 10:35:52 +08:00
Murphy 2e285d6ddd
[Refactor] refactor and fix the scan counters (backport #8088) (#8172) 2022-07-01 20:38:47 +08:00
zihe.liu d425214b9d
[Enhancement] Rename data_floor function to time_slice (#6951 #7033) (#8170)
* Rename data_floor function to time_slice (#6951)

* [Bug Fix] Fix TimestampArithmeticExpr analyze bug (#7033)

Co-authored-by: kangkaisen <kangkaisen@apache.org>
2022-07-01 19:41:05 +08:00
zhangqiang a10deee007
BugFix: remove PersistentIndexMeta in rocksdb (#8161) (#8171)
When we drop a primary key table using persistent index, we don't clear the persistent index meta in rocksdb.
2022-07-01 19:20:31 +08:00
HangyuanLiu dc32d820a0
Add InsertTxnCommitAttachment to support version rollback (#8145) 2022-07-01 19:16:27 +08:00
zhangqiang 657886b5bd [Bugfix] fix unstable be ut (#8082)
The reason is that when the persistent index is enabled for a primary key table, the apply process takes more time. So a rowset may have been committed but not yet applied when the compaction task is submitted, which causes the subsequent check to fail.
2022-07-01 16:49:53 +08:00
mergify[bot] b89ba7c9b1
[BugFix] parse json when load a parquet string (backport #8110) (#8126)
(cherry picked from commit 7e555b8997)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-06-30 22:02:22 +08:00
meegoo 95bb0b4a89
[BugFix] fix counter set in close before initialize (#8074) (#8109) 2022-06-30 21:23:01 +08:00
waittting 7cf70c3fde
Remove the helpers from bdb ReplicationGroupAdmin when dropped fe (#6773) (#7999)
2022-06-30 19:53:59 +08:00
stephen a1e060574c
Revert "hive external unsupport binary type (#8025) (#8026)" (#8084)
This reverts commit 0b501b12c3.
2022-06-30 17:08:30 +08:00
mergify[bot] 0166260004
[Enhance] avoid put too much chunk in ChunkSource::chunk_buffer (backport #8051) (#8069) 2022-06-30 12:38:01 +08:00
mergify[bot] 88af5d5b81
[Feature] support scan_rows limitation for external table (backport #8035) (#8044) 2022-06-30 09:51:27 +08:00
mergify[bot] 5969647c9d
[BugFix] Fix return non-existent codes in global dictionary optimization (#8034) (#8055)
After this change, when BE returns a dictionary of size 256, FE will not use this dictionary.

(cherry picked from commit 52bfe631cf)

Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2022-06-29 23:39:54 +08:00
zhangqiang e9bc0d71c5 BugFix: fix data lost after compaction using persistent index (#8002)
When PersistentIndex is turned on, we use phmap as l0 to save kv pairs.
However, the kv pair storage may be discontiguous because phmap aligns its entries, which causes wrong data to be written during flush, so the subsequent compaction may fail.
We use uint8_t[8] instead of uint64_t to avoid phmap's alignment and to save memory.
2022-06-29 19:11:33 +08:00
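A minimal sketch of the workaround, under the assumption stated above that the hash map may pad or align its stored pairs:

```
#include <cstdint>
#include <cstring>

// Storing the 8-byte value as uint64_t gives it 8-byte alignment, so the
// raw bytes of a (key, value) entry may contain padding, and a flush that
// copies entries byte-by-byte can write garbage where data was expected.
struct AlignedValue {
    uint64_t v;
};

// Storing the same 8 bytes as uint8_t[8] drops the alignment requirement:
// entries are packed, so flushing their raw bytes is safe and smaller.
struct PackedValue {
    uint8_t v[8];
    void set(uint64_t x) { std::memcpy(v, &x, sizeof(x)); }
    uint64_t get() const {
        uint64_t x;
        std::memcpy(&x, v, sizeof(x));
        return x;
    }
};

static_assert(alignof(PackedValue) == 1, "no padding inside map entries");
```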
mergify[bot] 86f40d22af
[BugFix] fix bunch of bugs of resource group (backport #7933) (#8015) 2022-06-29 17:02:51 +08:00
Binglin Chang 99a2060ada
[Bugfix] Ignore EAGAIN error for futex wait (#7779) (#8020) 2022-06-29 16:49:50 +08:00
mergify[bot] 0b501b12c3
hive external unsupport binary type (#8025) (#8026)
(cherry picked from commit bad5ed303e)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-06-29 16:48:22 +08:00
lichaoyong 3a3cbf3687 [BugFix] Fix the wrong parameter of log in Substitute (#7880) 2022-06-29 14:42:23 +08:00
lichaoyong 4e0e9ae653 [BugFix] Fix the data race of event_bases and event_https (#7798) 2022-06-29 14:42:23 +08:00
lichaoyong fbcaeeac65 [BugFix] Fix the HttpServer not wait the threads finished. (#6784)
Upon graceful exit, the HttpServer should guarantee that all threads
have finished; otherwise memory is leaked. This pull request fixes the bug.
2022-06-29 14:42:23 +08:00
mergify[bot] 4f85314821
[BugFix] fix resource group metrics (backport #6953) (#8006) 2022-06-29 13:55:50 +08:00
zhangqiang 85cab585c2
[Enhancement] Optimize `find_buckets_to_move` to make sure a move solution is found (#7685) (#8004)
Currently, find_buckets_to_move uses a brute-force search over the buckets to move, trying only movements of 1/2/3 buckets, which may fail to find a solution.
2022-06-29 12:48:21 +08:00
Binglin Chang d8f8c55d87
[Bugfix] Should check cast expr validity in update statement (#7885) (#7988) 2022-06-28 21:13:43 +08:00
zhangqiang 38b959246f
[BugFix]: Change `enable_persistent_index` of primary table may cause be crash in ASAN mode (#7762) (#7926)
This is caused by concurrency between apply and modifying enable_persistent_index. The reference count of the primary index may not equal 1, which causes a crash when calling the remove function in ASAN mode.
2022-06-28 14:53:30 +08:00
mergify[bot] fc2f26f443
[Bugfix] fix convert hive char type to sr type (#7912) (#7930)
Hive          StarRocks
string        varchar(65533)
varchar(len)  varchar(len)
char(len)     char(len)
binary        varchar(65533)

(cherry picked from commit 4eacc80194)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-06-28 14:44:40 +08:00
mergify[bot] 1bd500f014
[BugFix] try to parse it when casting string to json (backport #7835) (#7863) 2022-06-28 13:34:21 +08:00
mergify[bot] f721c4aac6
[BugFix] Routine load job running time (#7909) (#7922)
(cherry picked from commit b0523a3fc6)

Co-authored-by: rickif <rickif@qq.com>
2022-06-28 11:03:13 +08:00
yan.zhang 5805762ea6
Fix invalid ref to counter in HdfsParquetScanner (#7919) 2022-06-28 09:43:45 +08:00
mergify[bot] 44f271dd2b
[BugFix] ReorderJoinRule use error preconditions (#7831) (#7895)
(cherry picked from commit 0c18f3a294)

Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2022-06-27 19:13:10 +08:00
mergify[bot] 34eaaa63e1
[Bug]Fixed the inability to recognize non-UTF-8 encoded strings when collecting dictionary information (#7795) (#7875)
(cherry picked from commit 8cf944be88)

Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2022-06-27 19:06:21 +08:00
mergify[bot] 3ad411c66c
[BugFix] Fix empty result set (#7793) (#7888)
The chunk ck might be empty if the entire file is skipped

(cherry picked from commit 3979ebbe87)

Co-authored-by: dorianzheng <xingzhengde72@gmail.com>
2022-06-27 16:58:39 +08:00
zihe.liu d8817f1476
[Bugfix] Prevent JVM from handling signals (#6929) (#7841)
When the signal SIGINT or SIGTERM arrives, the BE process captures it and performs graceful exit operations in order. For example, when a SIGTERM arrives:

The process captures SIGTERM so that start_be() can return.
The thread TaskWorkerPool, which uses StarRocksMetrics::instance, is destructed when start_be() returns.
__run_exit_handlers destructs all the static variables, including StarRocksMetrics::instance.
However, the JVM overwrites the handlers of SIGINT and SIGTERM. The sequential graceful exit is then broken: start_be() has no chance to return, so StarRocksMetrics::instance is destructed while the TaskWorkerPool thread is still running.
2022-06-27 11:16:56 +08:00
satanson 89905cba06
fixup bug introduced by PR-7751 (#7832) (#7847) 2022-06-25 20:55:30 +08:00
Youngwb f4831d60e8
[Cherry-pick][branch-2.3][Enhancement] Support multi count distinct with different multi columns (#7850) 2022-06-25 19:48:12 +08:00
mergify[bot] 45e99569d2
[BugFix] Improve performance of selection intersection in the runtime-filter exploration phase. (#7751) (#7816)
(cherry picked from commit 3df468cc2b)

Co-authored-by: satanson <ranpanf@gmail.com>
2022-06-25 17:30:56 +08:00
mergify[bot] b883978af5
add SHUFFLE_HASH_BUCKET to isLocalApplicable (#7750) (#7815)
This issue is analyzed in https://starrocks.feishu.cn/docx/doxcnrOAE5F8G4RzBJzFkBhvt9b

```
+====================+========+===================+================+
| query              | 2.2.1  | 2.2.2(regression) | this PR(2.2.2) |
+====================+========+===================+================+
| hive_tpcds.query95 | 1.260s | 4.330s            | 1.283s         |
+--------------------+--------+-------------------+----------------+
```

(cherry picked from commit 65a1ee9906)

Co-authored-by: satanson <ranpanf@gmail.com>
2022-06-25 17:29:30 +08:00
Seaven d8c635c1b2 [BugFix] CTE rewrite count distinct override mv rewrite (#7702) 2022-06-25 16:52:36 +08:00
Youngwb 2d4dc463b3 [BugFix] Fix count distinct constant unknown error when use cte (#7646) 2022-06-25 16:52:36 +08:00
Youngwb 8d4dfd06cd [Enhancement] Optimize avg(distinct column) with CTE (#7264) 2022-06-25 16:52:36 +08:00
Seaven 66ff481666 [BugFix] Fix cte inline bug (#7662) 2022-06-25 16:52:36 +08:00
gengjun-git 648bbc518f
[Cherry-Pick-2.3][BugFix] Fix load jobs hang with error: current running txns on db xxx is 100, larger than limit 100 (#7569)
After transferring the master, the master address recorded in BE is still the address of the old master (until the new master's heartbeat reaches it). A txnCommit RPC executed on a non-master FE will cause metadata inconsistency issues (described in #7350), so we should reject such requests if the current node is not the master.
2022-06-25 12:50:57 +08:00
zhangqiang 01712c720b [BugFix]: compaction error when enable persistent index (#7654)
In PersistentIndex::try_replace(), the new value is not updated when the same key is found in the hash map, which causes data errors during compaction of the primary key table.
2022-06-25 09:44:05 +08:00
sduzh f2e71258df [BugFix] Fixed invalid init of ExecutionQueueId and std::mutex in bthread
Conflicts:
	be/src/runtime/load_channel.cpp
2022-06-25 09:28:54 +08:00
rickif d0840cc858 [BugFix] wrong result of load with condition `IS NULL`/`IS NOT NULL` (#7748)
Fix result of load with condition IS NULL/IS NOT NULL.

(cherry picked from commit e7abc9ed8e)
2022-06-25 09:26:39 +08:00
Murphy 46845282ad
[Feature] support extract fields from parquet struct (backport #7655) (#7810) 2022-06-25 09:20:12 +08:00
Seaven af27b0c7dc
[Enhancement] Optimizer plan timeout (backport #7542) (#7759) 2022-06-24 23:58:18 +08:00
mergify[bot] dfd7f38629
[BugFix] Fix alter routine load NPE (backport #7805) 2022-06-24 23:56:13 +08:00
Napoleon 9887c2389a [Enhancement] Support column groups for final merge (#7333)
Merging temporary segment files in the final merge procedure takes a lot of memory, which may cause OOM. Support column groups when merging segment files.
There is some repetitive code, but refactoring now would affect unrelated code, so we leave it and will refactor in a separate follow-up PR. This reduces memory usage by 13~16x and latency by 35% in a case like this one (5000 columns, 4.3 GB, 100000 rows); other data scales show almost the same ratio.
2022-06-24 22:08:53 +08:00
mergify[bot] 05e60c2c83
[Feature] Add unknown_catalog_and_db ERRORCODE (#7797) (#7800)
error catalog in query -> ERR_BAD_CATALOG_ERROR
error catalog.db in use statement -> ERR_BAD_CATALOG_AND_DB_ERROR
2022-06-24 21:37:39 +08:00
mergify[bot] 5bff38c262
[BugFix] Fix broker exception the fd is not owned by client null (#7773) (#7781)
Broker has a control-plane heartbeat service. It may time out when pressure is high; the client is then removed from clientContexts, making the data plane throw an "FD is not owned by client null" exception.
So we update the heartbeat timestamp in both the control plane and the data plane to avoid this problem.

(cherry picked from commit 6c56ddc508)

Co-authored-by: meegoo <hujie-dlut@qq.com>
2022-06-24 19:45:19 +08:00
trueeyu 98da56f1e6 [Enhancement] move the destructor of TabletsChannel out of lock (#7753)
We call bthread::execution_queue_join() in the destructor of AsyncDeltaWriter; this function blocks the bthread, so we move the destruction of TabletsChannel outside the lock.
2022-06-24 19:12:19 +08:00
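A minimal sketch of the pattern, with stand-in types: move the last reference out of the critical section so the blocking destructor runs after the lock is released.

```
#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>

struct TabletsChannel {
    // May call bthread::execution_queue_join() transitively and block.
    ~TabletsChannel() {}
};

class LoadChannel {
public:
    void remove_channel(int64_t index_id) {
        std::shared_ptr<TabletsChannel> doomed;  // outlives the lock scope
        {
            std::lock_guard<std::mutex> guard(_lock);
            auto it = _channels.find(index_id);
            if (it == _channels.end()) return;
            doomed = std::move(it->second);
            _channels.erase(it);
        }
        // `doomed` is destroyed here, outside the lock, so the blocking
        // destructor cannot stall other threads waiting on _lock.
    }

private:
    std::mutex _lock;
    std::unordered_map<int64_t, std::shared_ptr<TabletsChannel>> _channels;
};
```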
trueeyu 05ce6b0738
[BugFix] fix the bug of get_tablet with schema hash (#7736) (#7746)
Our usage converts `schema hash` to `include_deleted`, which was introduced in commit 3fd83add0a.
2022-06-24 16:34:35 +08:00
Binglin Chang 332a886afa
[Bugfix] Check if tablet deleted after acquiring tabletupdates' lock (#7304) (#7731)
When deleting a primary key tablet, clear_meta will be called and _edit_version_infos will be cleared; this makes all other concurrent operations invalid, so operations running in other threads should check state validity after acquiring the lock.
2022-06-24 11:00:03 +08:00
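A minimal sketch of the check-after-lock rule described above, with stand-in names for the tablet state:

```
#include <mutex>
#include <vector>

struct EditVersionInfo {};

class TabletUpdates {
public:
    bool get_latest_version(EditVersionInfo* out) {
        std::lock_guard<std::mutex> guard(_lock);
        // clear_meta() may have run between the caller's decision to read
        // and this lock acquisition, so validity must be re-checked here.
        if (_edit_version_infos.empty()) {
            return false;  // tablet was deleted concurrently
        }
        *out = _edit_version_infos.back();
        return true;
    }

private:
    std::mutex _lock;
    std::vector<EditVersionInfo> _edit_version_infos;
};
```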
mergify[bot] 5aa2f0ac00
[BugFix] use new parser in meta replay & restore (backport #7481) (#7700) 2022-06-24 09:53:32 +08:00
mergify[bot] 90a1b25ed8
[Bugfix] Throw exception when failed to get databases (#7710) (#7728)
(cherry picked from commit 747af2b726)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-06-23 23:37:56 +08:00
mergify[bot] a18d59d70c
[refactor] Adjust drop external catalog syntax (#7711) (#7723)
drop external catalog catalog_name -> drop catalog catalog_name

(cherry picked from commit ebf2110d17)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-06-23 22:33:48 +08:00
mergify[bot] 63c1346a0c
[Bugfix] fix init database failed in QueryDetail when executing multiple statements (#7715)
When we send multiple external-catalog queries via JDBC, initializing the database in QueryDetail fails.
2022-06-23 20:36:43 +08:00
mergify[bot] 41426cd2fe
[BugFix] make hdfs scan use individual WorkgroupOwner (backport #6864) (#7699) 2022-06-23 17:06:47 +08:00
stdpain a2f3714225 [Bugfix] keep the aggregate expr order when rewrite aggregate operator (#7657)
We should keep the aggregate expr order during rewrite, because we use
singleDistinctFunctionPos to decide whether to call update or merge.

(cherry picked from commit 3fadd41275)
2022-06-23 15:59:46 +08:00
HangyuanLiu d2306ea38a [BugFix] Fix parse comment incompatible bug (#7641) 2022-06-23 15:05:41 +08:00
HangyuanLiu 8c456a89a4 [BugFix] Fix parse comment incompatible bug (#7508) 2022-06-23 15:05:41 +08:00
HangyuanLiu cbcd34cc50 [BugFix] Fix set var select hint parse error bug #7540 2022-06-23 15:05:41 +08:00
zhuxt2015 593aca2d66 [Feature] Support create table in new Parser and new Analyzer (#6102)
2022-06-23 15:05:41 +08:00
mergify[bot] 26f88408c0
[BugFix] trim quote in get_json_string (backport #7472) (#7670) 2022-06-23 11:08:37 +08:00
mergify[bot] 5e2764485f
[BugFix] fix nullable column has_null check (backport #7617) (#7668) 2022-06-23 09:50:14 +08:00
mergify[bot] 7c02ffe078
[Bugfix] fix BE crash when destroy pass-through buffer (backport #7623) (#7631) 2022-06-22 23:41:07 +08:00
mergify[bot] 41f0e3f69f
[BugFix] allow set an empty resource_group through variable (backport #7633) (#7663) 2022-06-22 22:53:46 +08:00
Binglin Chang 2eba1449e2 [Enhancement] Check total row count consistency after compaction (#7287)
Check total row count consistency after compaction, so bugs can be detected early.
2022-06-22 19:04:37 +08:00
stephen 0fe06312b3
[Feature] UseStmt support catalog.database (#7585) (#7636)
'use xxx' from the mysql client is handled by the COM_INIT_DB protocol in FE and doesn't generate a UseStmt.
'use xxx' from JDBC is handled by COM_QUERY, so we need to adapt UseStmt to support 'use catalog.db'.
This PR ports UseStmt from the old parser and analyzer to the new parser and analyzer.
2022-06-22 17:22:02 +08:00
mergify[bot] 01b9054325
[Feature] Support any_value function for JSON type (backport #7560) (#7598) 2022-06-22 11:24:36 +08:00
zhangqiang 0960b1733d BugFix: data error in build persistent index from tablet (#7555) 2022-06-22 09:49:34 +08:00
zhangqiang 822bfe30bb [BugFix] Potential data errors in primary tables using persistent indexes (#7417)
In PrimaryIndex::_insert_into_persistent_index(), if the values in rowids are not contiguous, we get wrong values after calling PrimaryIndex::_build_persistent_values, which may cause data errors in primary key tables using a persistent index.
2022-06-22 09:49:34 +08:00
trueeyu 65c4f37b10 Fix the bug of create inital rowset (#7553) 2022-06-22 09:45:34 +08:00
mergify[bot] 48df20f7f7
[BugFix] rename JoinRuntimeFilterTime to ConjunctsTime (#7537) (#7579)
(cherry picked from commit 2dedd98515)

Co-authored-by: liuyehcf <1559500551@qq.com>
2022-06-21 21:15:08 +08:00
mergify[bot] 6255dff9ff
[Bugfix] Fix bug task process database (#7546) (#7591)
1. ConnectContext should use the database set by the task.
2. Should not fetch the database with getDb, because the db may have been dropped.
3. Should not expose the default cluster to the user; only show the db name.

(cherry picked from commit 43695ebcb6)

Co-authored-by: xueyan.li <astralidea@163.com>
2022-06-21 21:11:35 +08:00
mergify[bot] 0a81367a25
Routine load failed when fe restart (#7567) (#7587)
(cherry picked from commit 2c4f754c55)

Co-authored-by: qinmengna <86873587+goodqiang@users.noreply.github.com>
2022-06-21 20:33:19 +08:00
mergify[bot] 4e83cb1fb1
[BugFix] Fix the wrong type check of iceberg/hudi table (#7504) (#7531)
The BE will crash if an integer field of an iceberg/hudi table is defined as tinyint/smallint in the StarRocks external table, when the compile type is DEBUG or ASAN.

(cherry picked from commit 887832b381)

Co-authored-by: miomiocat <284487410@qq.com>
2022-06-21 17:46:52 +08:00
trueeyu 32daac249e [Enhancement] Don't output the unused log of RE2 (#7528)
```
      if (!prog_->SearchDFA(subtext, text, anchor, kind,
                            matchp, &dfa_failed, NULL)) {
        if (dfa_failed) {
          if (options_.log_errors())
            LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
                       << "bytemap range " << prog_->bytemap_range() << ", "
                       << "list count " << prog_->list_count();
          // Fall back to NFA below.
          skipped_test = true;
          break;
        }
        return false;
```

If the search fails with the `DFA`, RE2 falls back to the `NFA` algorithm, so this log is useless for us.
2022-06-21 17:37:20 +08:00
dorianzheng 891d5c49e5
[BugFix] Fix wrong column order (#7482) (#7554)
Since DataStreamRecvr::SenderQueue::_build_chunk_meta is only called for the first chunk received, subsequent chunks must keep the same column order as the first chunk; otherwise deserialization might get the wrong type from the chunk meta.
2022-06-21 16:41:27 +08:00
Napoleon 7e44ad87ca
[BugFix] Ignore update when no load data(#7430) (#7467) 2022-06-21 16:05:54 +08:00
mergify[bot] abcb0baf25
Revert "[BugFix]Fix wrong column order (#7413)" (#7440) (#7539)
(cherry picked from commit a50e52d7ce)

Co-authored-by: kangkaisen <kangkaisen@apache.org>
2022-06-21 15:38:21 +08:00
HangyuanLiu e0d20cf4dc [BugFix] Fix FieldReference equals has NPE bug (#7194)
[BugFix] Fix FieldReference equals has NPE bug (#7194)
2022-06-21 14:42:47 +08:00
HangyuanLiu 08417aa3c5 Modify sql_mode default value to only_full_group_by in version 2.3 (#7084) 2022-06-21 14:42:47 +08:00
HangyuanLiu 5124944c60 Fix bug getAssigmentCompatibleTypeOfDecimalV3 miss time type (#7124) 2022-06-21 14:42:47 +08:00
HangyuanLiu ede727db66 Fix sql_full_groupby mode mistake rewrite column in aggregation function bug (#7049) 2022-06-21 14:42:47 +08:00
HangyuanLiu a0953287e5 [Enhancement] Support parens relation syntax in new Parser #6894 2022-06-21 14:42:47 +08:00
HangyuanLiu 7ab4d8ab9a Remove `default_cluster` from ViewDefBuilder (#6907) 2022-06-21 14:42:47 +08:00
mergify[bot] 28ed52dd0e
[Bugfix] Fix bug defineExpr is not set use old Analyzer (#7403) (#7535)
This is because #6459 removed the old analyze method, but defineExpr needs the old Analyzer to be generated.

(cherry picked from commit 0e13d2ea0d)

Co-authored-by: xueyan.li <astralidea@163.com>
2022-06-21 14:13:23 +08:00
zhangqiang 9ad21f1318 BugFix: Fe restart failed (#7433) 2022-06-21 12:47:38 +08:00
zhangqiang d26a98cccd [BugFix]: Load txn commit failed while primary table using persistent index (#7450)
If a primary key table uses a persistent index, we create an `index.l0.x.x` file to save the snapshot or log. However, after the first data load succeeds, the second data load may fail.

The reason is that when we dump a snapshot of the in-memory `hash_map`, we use `dump_bound()` to estimate the snapshot file size. But when the map is empty, `dump_bound()` returns a value larger than sizeof(uint64_t), while the snapshot file only writes `size_ (uint64_t)`. This corrupts the metadata, which makes deserializing `index.l0.x.x` fail.

So the return value of `dump_bound()` is set to `sizeof(size_t)` when `_map` is empty.
2022-06-21 12:47:38 +08:00
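A minimal sketch of the sizing rule, assuming the snapshot layout described above (a size_ header followed by packed kv pairs); the map type is a stand-in for the real l0 structure:

```
#include <cstddef>
#include <cstdint>
#include <unordered_map>

using KvMap = std::unordered_map<uint64_t, uint64_t>;  // stand-in for l0

// Upper bound of the snapshot file size. The empty map writes only its
// size header, so the bound must match exactly; over-estimating here is
// what corrupted the index.l0.x.x metadata.
std::size_t dump_bound(const KvMap& map) {
    if (map.empty()) {
        return sizeof(std::size_t);  // the fix: only the size header
    }
    return sizeof(std::size_t) + map.size() * (2 * sizeof(uint64_t));
}
```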
xueyan.li 3c8cd6e813
[Cherry-pick] Optimize task observability and usability (#7514)
Add an expire_time field for task/taskrun to improve observability.
Add the config task_check_interval_second instead of label_clean_interval_second.
Fix a bug in the queue size.
2022-06-21 09:46:46 +08:00
mergify[bot] 59d25c9061
[BugFix] Malformed packet error when Result Package greater than 16M (backport #6843) (#7239) 2022-06-20 22:52:36 +08:00
Youngwb 0b93919d59
[BugFix] Remove wrong preconditions in join reorder (#7099) (#7497) 2022-06-20 18:37:43 +08:00
xyz f2708fae49 [BugFix] choose wrong compaction algorithm in some case (#7225)
When performing base compaction on the tablet with the following rowsets:

   "rowsets": [
        "[0-175183] 11 DATA NONOVERLAPPING",
        "[175184-175232] 0 DATA NONOVERLAPPING",
        "[175233-175278] 0 DATA NONOVERLAPPING",
        "[175279-175327] 0 DATA NONOVERLAPPING",
        "[175328-175369] 0 DATA NONOVERLAPPING"
    ]

Because there are 11 segments and the number of columns is more than 5, it chooses vertical compaction.
When creating a tablet reader, it merges the 11 segment iterators into one union iterator and creates a HeapMaskIterator with only that one union_iterator.
In `new_heap_merge_iterator`, the iterator is returned immediately if the number of input iterators is 1.
So we then call get_next(Chunk* chunk, std::vector<RowSourceMask>* source_masks) on the union_iterator.
However, the union_iterator doesn't implement get_next(Chunk* chunk, std::vector<RowSourceMask>* source_masks), and
we get the warning `get chunk with sources not supported`.

In fact, we should count the number of segment iterators after creating the tablet reader (only one union iterator in this case)
and choose horizontal compaction.

(cherry picked from commit cc75587f46)
2022-06-20 13:30:45 +08:00
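A sketch of the corrected decision, with hypothetical names and the thresholds quoted in the entry (vertical compaction needs per-source row masks, which a single union iterator cannot produce):

```
#include <cstddef>

enum class CompactionAlgorithm { kHorizontal, kVertical };

// Decide after the tablet reader is built, from the number of iterators
// that actually feed the merge, not from the raw segment count.
CompactionAlgorithm choose_algorithm(std::size_t num_input_iterators,
                                     std::size_t num_columns) {
    if (num_input_iterators <= 1) {
        // A lone union iterator cannot emit RowSourceMasks.
        return CompactionAlgorithm::kHorizontal;
    }
    return num_columns > 5 ? CompactionAlgorithm::kVertical
                           : CompactionAlgorithm::kHorizontal;
}
```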
mergify[bot] 77031a90dd
Global rf should not take the place of local rf when all the runtime-filters' selectivity is >0.01 and <0.5 (#7420) (#7470)
(cherry picked from commit 17b7227017)

Co-authored-by: satanson <ranpanf@gmail.com>
2022-06-20 13:24:43 +08:00
mergify[bot] b202d93601
[BugFix]Fix wrong column order (#7413) (#7436)
Since DataStreamRecvr::SenderQueue::_build_chunk_meta is only called for the first chunk received, subsequent chunks must keep the same column order as the first chunk; otherwise deserialization might get the wrong type from the chunk meta.
2022-06-17 23:15:21 +08:00
mergify[bot] 61c7817da9
[BugFix] fix window_funnel udf lost condition (#6812) (#7400)
(cherry picked from commit e14f411523)

Co-authored-by: hongli.chang <honglichang@tencent.com>
2022-06-17 22:36:25 +08:00
mergify[bot] dba5aa6d18
[BugFix] Fix memory leak when ReusableClosure delete itself (#7138) (#7150)
The ReusableClosure destructor calls brpc::Join(), which can deadlock and block the bthread, resulting in a memory leak.

(cherry picked from commit 1adfb16a87)

Co-authored-by: meegoo <hujie-dlut@qq.com>
2022-06-17 21:27:30 +08:00
mergify[bot] 62e7496bab
Fix bug insert failed when query empty (#7376) (#7429)
insert should not report an error in this scenario

(cherry picked from commit 4fdb62ee6c)

Co-authored-by: xueyan.li <astralidea@163.com>
2022-06-17 20:02:56 +08:00
mergify[bot] 1ca1830f77
[Enhance] Consider key container memory usages for streaming aggregate (#7402) (#7411)
(cherry picked from commit 681e783a12)

Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2022-06-17 19:37:48 +08:00
mergify[bot] edaaaad694
Fix bug in some case failed to set DefineExpr for materialized view. (#7371) (#7423)
If the user's table contains uppercase columns, defineExpr may be processed in two ways when building a materialized view. One is the Log path, processed through MVColumn, which is fine. The other is the Image path, which compares directly against Column; this is problematic and causes defineExpr to be set to null, which affects both load and query.

(cherry picked from commit cd58001e60)

Co-authored-by: xueyan.li <astralidea@163.com>
2022-06-17 17:53:58 +08:00
sevev f3d194a481 [BugFix]: Use PrimaryIndex in memory when key columns contains variable length column 2022-06-17 16:56:18 +08:00
zhangqiang 3b9f7922db BugFix: Unanticipated errors from primary table with enable_persistent_index (#7342) 2022-06-17 16:56:18 +08:00
mergify[bot] 472fc67fcf
disable checking tablets in fe plan test (#7398) (#7406)
(cherry picked from commit 3652a2febb)

Co-authored-by: eyes_on_me <3675229+silverbullet233@users.noreply.github.com>
2022-06-17 15:38:16 +08:00
Murphy c78fc02adf
fix: skip fill_null_with_default for ConstColumn (#7319) (#7401)
(cherry picked from commit 8e3889eaeb)
2022-06-17 14:58:22 +08:00
mergify[bot] 3f64790c97
[BugFix] fix inconsistent http method since BE had used POST for transaction action (#6990) (#7373)
(cherry picked from commit 7c46402a34)

Co-authored-by: meegoo <hujie-dlut@qq.com>
2022-06-17 14:57:49 +08:00
mergify[bot] 8970dbc66b
[Bugfix] Avoid using released resources for ContextWithDependency after closing (backport #7363) (#7387) 2022-06-17 13:32:13 +08:00
zihe.liu 38174446b0
[Enhance] Estimate scan row bytes dynamically in time (backport #7202) (#7390) 2022-06-17 13:29:31 +08:00
Youngwb d30ba96e2b [BugFix] Fix count distinct multi columns with avg distinct plan (#7370)
(cherry picked from commit 814666dc63)
2022-06-17 12:59:28 +08:00
mergify[bot] 2e0880ca82
[BugFix] only concern primitive type when comparing ConstantOperator (#7277) (#7366)
(cherry picked from commit 3252034d37)

Co-authored-by: eyes_on_me <3675229+silverbullet233@users.noreply.github.com>
2022-06-17 12:27:11 +08:00
mergify[bot] 2d2a7ae33b
[BugFix] fix dop estimation for one-phase aggregation (backport #7278) (#7365) 2022-06-17 11:02:19 +08:00
Seaven 8ea310d002 [Bug] Fix in-subquery cast to outer join bug (#7090)
When the left table column is null and the result of the subquery is empty, the CASE WHEN hits null, not false.

```
MySQL w2> select t0_27.c_0_2, ((t0_27.c_0_2) IN ((SELECT t0_27.c_0_2 FROM t0 AS t0_27 WHERE (t1_28.c_1_0) IN ('1970-01-18 15:14:39') ) ) ) from t0 AS t0_27, t1 AS t1_28;
+--------------------+-------------------------------------------------------------------------------------+
| c_0_2              | c_0_2 IN (((SELECT c_0_2 FROM t0 AS t0_27 WHERE c_1_0 IN ('1970-01-18 15:14:39')))) |
+--------------------+-------------------------------------------------------------------------------------+
| 0.9736537942295025 | 0                                                                                   |
| 0.5452490003832424 | 0                                                                                   |
| 0.5452490003832424 | 0                                                                                   |
| <null>             | <null>                                                                              |
| <null>             | <null>                                                                              |
| 0.9736537942295025 | 0                                                                                   |
| 0.5452490003832424 | 0                                                                                   |
| 0.5452490003832424 | 0                                                                                   |
+--------------------+-------------------------------------------------------------------------------------+
8 rows in set
Time: 0.077s
```

It should be:
```
MySQL w2> select t0_27.c_0_2, ((t0_27.c_0_2) IN ((SELECT t0_27.c_0_2 FROM t0 AS t0_27 WHERE (t1_28.c_1_0) IN ('1970-01-18 15:14:39') ) ) ) from t0 AS t0_27, t1 AS t1_28;
+--------------------+-------------------------------------------------------------------------------------+
| c_0_2              | c_0_2 IN (((SELECT c_0_2 FROM t0 AS t0_27 WHERE c_1_0 IN ('1970-01-18 15:14:39')))) |
+--------------------+-------------------------------------------------------------------------------------+
| 0.9736537942295025 | 0                                                                                   |
| 0.5452490003832424 | 0                                                                                   |
| 0.5452490003832424 | 0                                                                                   |
| <null>             | 0                                                                                   |
| <null>             | 0                                                                                   |
| 0.9736537942295025 | 0                                                                                   |
| 0.5452490003832424 | 0                                                                                   |
| 0.5452490003832424 | 0                                                                                   |
+--------------------+-------------------------------------------------------------------------------------+
8 rows in set
Time: 0.077s

```
2022-06-17 10:47:58 +08:00
Seaven 5727e5a5fb [BugFix] Fix throw ConcurrentModificationException use partition range (#7076) 2022-06-17 10:47:58 +08:00
Seaven 8c4873aab8 [BugFix] Fix join reorder push project error (#6866) 2022-06-17 10:47:58 +08:00
Seaven 2a5f1434d9 [BugFix] CTEAnchor must return child property (#6806) 2022-06-17 10:47:58 +08:00
trueeyu 5b21ff9450 Fix the problem of estimate memtable size (#7316)
The size calculation cost is relatively large for some types of `Column` such as `BitmapColumn`, so the estimation of the `MemTable` size is changed to incremental calculation.

When importing, a `Chunk` may involve multiple tablets. If row lengths differ greatly between tablets, for example the average string length in `Tablet` 1 is 100 bytes while strings in `Tablet` 2 are short or null, the estimated memtable size was `chunk_bytes_usage += chunk.bytes_usage() * size / chunk.num_rows()`.

In extreme cases, due to incorrect estimation, the `MemTable` becomes too large or too small, or even exceeds 4 GB, resulting in crashes or data corruption.

The new strategy is to calculate incrementally, directly from the chunks of the `MemTable`.

Before the modification, the MemTable sizes:

```
...
FLUSH:16949250
FLUSH:16983890
FLUSH:17078640
FLUSH:17270930
FLUSH:17073270
FLUSH:16934340
FLUSH:16890280
FLUSH:17088330
FLUSH:17014780
FLUSH:17002530
FLUSH:17299380
FLUSH:7911050
FLUSH:2497327728
FLUSH:2488632580
FLUSH:3305326048
FLUSH:3309594000
FLUSH:3303867992
FLUSH:1495277328
FLUSH:4116982500
...
```

After the modification, the MemTable sizes:

```
...
FLUSH:104910526
FLUSH:104868988
FLUSH:104885820
FLUSH:104871402
FLUSH:104864160
FLUSH:104893658
FLUSH:104868340
FLUSH:104864715
FLUSH:104868988
FLUSH:104890714
FLUSH:104861746
FLUSH:104868356
FLUSH:104860294
FLUSH:104874798
FLUSH:104893900
FLUSH:104876775
FLUSH:104872573
FLUSH:104861746
FLUSH:104871402
FLUSH:104888300
FLUSH:104876720
...
```
2022-06-17 09:45:28 +08:00
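A minimal sketch of the two strategies, with stand-in types; the formula in the old path is the one quoted above:

```
#include <cstddef>

struct Chunk {
    std::size_t bytes = 0;
    std::size_t rows = 0;
    std::size_t bytes_usage() const { return bytes; }
    std::size_t num_rows() const { return rows; }
};

// Old strategy: extrapolate from the average row size of the whole chunk.
// Wildly wrong when row sizes differ between the tablets the chunk feeds.
std::size_t estimate_tablet_bytes(const Chunk& chunk, std::size_t rows_for_tablet) {
    return chunk.bytes_usage() * rows_for_tablet / chunk.num_rows();
}

// New strategy: accumulate the actual bytes of each chunk appended to the
// memtable, so the flush threshold tracks real memory usage.
class MemTable {
public:
    void append(const Chunk& chunk) { _bytes += chunk.bytes_usage(); }
    std::size_t bytes_usage() const { return _bytes; }

private:
    std::size_t _bytes = 0;
};
```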
mergify[bot] 51f0861834
canonicalize decimal types when desc table (#7307) (#7328)
decimal32(p,s)/decimal64(p,s)/decimal128(p,s) should show as decimal(p,s) in statements as follows:
```
mysql> desc t0;
+-------+--------------+------+-------+---------+-------+
| Field | Type         | Null | Key   | Default | Extra |
+-------+--------------+------+-------+---------+-------+
| k0    | INT          | No   | true  | NULL    |       |
| c0    | DECIMAL(9,2) | No   | false | NULL    |       |
| c1    | DECIMAL(9,2) | No   | false | NULL    |       |
| c2    | DECIMAL(9,2) | No   | false | NULL    |       |
+-------+--------------+------+-------+---------+-------+
4 rows in set (0.00 sec)

mysql> desc t0 all;
+-----------+---------------+-------+--------------+------+-------+---------+-------+
| IndexName | IndexKeysType | Field | Type         | Null | Key   | Default | Extra |
+-----------+---------------+-------+--------------+------+-------+---------+-------+
| t0        | DUP_KEYS      | k0    | INT          | No   | true  | NULL    |       |
|           |               | c0    | DECIMAL(9,2) | No   | false | NULL    | NONE  |
|           |               | c1    | DECIMAL(9,2) | No   | false | NULL    | NONE  |
|           |               | c2    | DECIMAL(9,2) | No   | false | NULL    | NONE  |
+-----------+---------------+-------+--------------+------+-------+---------+-------+
4 rows in set (0.00 sec)

mysql> show columns from t0;
+-------+--------------+------+------+---------+-------+
| Field | Type         | Null | Key  | Default | Extra |
+-------+--------------+------+------+---------+-------+
| k0    | int          | NO   | YES  | NULL    |       |
| c0    | decimal(9,2) | NO   | NO   | NULL    |       |
| c1    | decimal(9,2) | NO   | NO   | NULL    |       |
| c2    | decimal(9,2) | NO   | NO   | NULL    |       |
+-------+--------------+------+------+---------+-------+
4 rows in set (0.00 sec)
```

(cherry picked from commit 4fb5cf4247)

Co-authored-by: satanson <ranpanf@gmail.com>
2022-06-16 14:56:39 +08:00
rickif 04a3d799e3 [BugFix] `get_json_string` returns NULL (#7243)
This PR fixes the problem that get_json_string returns NULL when a JSON array is the input. The previous implementation of get_json_xx used simdjson directly, which was confusing and hard to maintain. The new implementation unifies get_json_xx with the functions of JsonValue and simplifies get_json_xx.
2022-06-16 14:18:57 +08:00
Seaven 033f7e5285 [BugFix] window push down runtime filter (#7206) 2022-06-16 10:03:18 +08:00
Murphy d6eb26f707 [BugFix] fix #7107: throw exception when creating unsupported query_type (#7237)
(cherry picked from commit 7b7e97672e)
2022-06-16 10:02:20 +08:00
mergify[bot] 4468338b0f
set HikariCP log level to ERROR (#7306) (#7311)
(cherry picked from commit 6f3a93f083)

Co-authored-by: eyes_on_me <3675229+silverbullet233@users.noreply.github.com>
2022-06-16 09:16:46 +08:00
mergify[bot] 3e6c0446b2
[Bugfix]fix BE crash in hash join when probe stage set_finishing before build stage (backport #7155) (#7182)
(cherry picked from commit dc96d16506)

Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2022-06-15 23:36:06 +08:00
mergify[bot] 80d953aa1c
[BugFix] fix sorting of overflowed nullable column (#7231) (#7283) 2022-06-15 23:34:20 +08:00
mergify[bot] 258fb77296
[Bugfix] Avoid NPE for operator status method after closing (#7255) (#7291)
(cherry picked from commit d30ccbb7ef)

Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2022-06-15 23:32:21 +08:00
stdpain 0a04e897f7 [Enhancement] make log clearer when create Expr (#7133)
(cherry picked from commit 046b341755)
2022-06-15 19:43:22 +08:00
mergify[bot] 24d95208d5
[BugFix] Error when delete nothing (#7172) (#7267)
This PR fixes the problem that StarRocks returns the error `ERROR 1064 (HY000): all partitions have no load data` when a DELETE statement deletes nothing.
2022-06-15 14:53:14 +08:00
Murphy 0ad39232a4
[BugFix] fix memory statistic in local passthrough (#7183) (#7234)
(cherry picked from commit ba1b9acc06)
2022-06-15 13:41:35 +08:00
mergify[bot] f2d888d4d6
[BugFix] Fix Agg table use replace agg function when load_dop is not 1 will make data disorder (#7200) (#7221)
When an Aggregate table uses the REPLACE aggregate function and load_dop is not 1, rows can be applied out of order, leaving the data in a disordered state.
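A toy illustration (not StarRocks code) of why REPLACE is order-sensitive: the surviving value is whichever row is applied last, so splitting one load across parallel drivers can change the outcome.

```
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// REPLACE keeps the last value applied for each key.
std::unordered_map<int, std::string> apply_replace(
        const std::vector<std::pair<int, std::string>>& rows) {
    std::unordered_map<int, std::string> table;
    for (const auto& [key, value] : rows) {
        table[key] = value;  // last write wins
    }
    return table;
}

int main() {
    // The same two rows for key 1, delivered in two interleavings, as can
    // happen when load_dop > 1 splits the stream across drivers:
    auto a = apply_replace({{1, "v1"}, {1, "v2"}});
    auto b = apply_replace({{1, "v2"}, {1, "v1"}});
    std::cout << a[1] << " vs " << b[1] << "\n";  // prints "v2 vs v1"
}
```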

(cherry picked from commit efbdaff190)

Co-authored-by: meegoo <hujie-dlut@qq.com>
2022-06-15 12:20:52 +08:00
Youngwb 8a9d055286 Fix project node map error when group by constant (#7135)
(cherry picked from commit fff7bfc076)
2022-06-15 10:59:31 +08:00
mergify[bot] 93b7fe8da4
Use original sql for task definition (#7188) (#7250)
Using ViewDefBuilder or AST2SQL cannot convert the executed SQL correctly, and the user experience is poor.
A Task has its own db session, so it does not need cluster DB information or other redundant transformations; therefore we store only the raw SQL.

(cherry picked from commit 2a02375b6b)

Co-authored-by: xueyan.li <astralidea@163.com>
2022-06-15 09:51:42 +08:00
mergify[bot] aa71057c84
[BugFix] hive external table column order changes after refreshing (#7210) (#7226)
use a List to construct the schema instead of a Map
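A toy example of the underlying issue: an unordered map does not preserve insertion order, so rebuilding a schema from one can shuffle columns, while a list keeps the declared positions.

```
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
    // Map-based schema: iteration order is unspecified, so columns can
    // come back in a different order after a refresh.
    std::unordered_map<std::string, std::string> by_name = {
            {"c1", "INT"}, {"c2", "STRING"}, {"c3", "DOUBLE"}};
    for (const auto& [name, type] : by_name) std::cout << name << " ";
    std::cout << "<- order unspecified\n";

    // List-based schema: positions are explicit and stable.
    std::vector<std::pair<std::string, std::string>> by_position = {
            {"c1", "INT"}, {"c2", "STRING"}, {"c3", "DOUBLE"}};
    for (const auto& [name, type] : by_position) std::cout << name << " ";
    std::cout << "<- declared order\n";
}
```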
2022-06-14 19:37:44 +08:00
stdpain 44131aa84c
[cherry-pick][feature] Support the use of cross-database use of UDF (backport #6865) (#7212) 2022-06-14 19:10:27 +08:00
kangkaisen d8d14a8f19 Fix create MaterializedView with new Analyzer bug (#6900) 2022-06-13 20:02:43 +08:00
mergify[bot] 8d39b3720d
[BugFix] showing unknown resource group should throw Exception (backport #6888) (#6980) 2022-06-13 15:28:40 +08:00
Murphy 575e1a0254
[Refactor] change database classifier of resource group to db (#6896) (#7148)
(cherry picked from commit dc6e0adbe4)
2022-06-13 11:27:31 +08:00
mergify[bot] 62a0933b22
[Enhance] try to release chunks of operator when close (backport #7086) (#7137) 2022-06-12 22:41:10 +08:00
mergify[bot] 216e50101e
[BugFix] estimate output row bytes of OlapScanNode error (backport #7141) (#7142) 2022-06-12 22:39:54 +08:00
mergify[bot] 6437cf6c58
[Enhance] change strategy of pipeline_dop (backport #6950) (#7143) 2022-06-12 13:22:19 +08:00
mergify[bot] faa6521e24
[Enhance] replace query context expired timeout with query_delivery_timeout (backport #7085) (#7145) 2022-06-12 13:20:03 +08:00
mergify[bot] 496155966c
[StarRocks On ES] fix wrong value error when reading field (#7103) (#7126)
(cherry picked from commit b9443bb41f)

Co-authored-by: Rowen <105833710+RowenWoo@users.noreply.github.com>
2022-06-11 16:39:59 +08:00
Youngwb 14097008d1 [BugFix] Fix delete best expression not deal with enforcer when merge group (#6981)
(cherry picked from commit b3e2a9ba82)
2022-06-11 16:31:51 +08:00
liuyehcf 77e5cc103b TopDownRewriteTask add trace log 2022-06-11 15:41:54 +08:00
liuyehcf e77b62dbb7 [Enhance] add PENDING_FINISH_TIME in profile (#7074) 2022-06-11 15:41:54 +08:00
liuyehcf 926912d842 [BugFix] Remove duplicate limit operator (#6987) 2022-06-11 15:41:54 +08:00
liuyehcf e890ff90dc [BugFix] Update error msg when apply analytic function on array type (#6961) 2022-06-11 15:41:54 +08:00
liuyehcf c105df67ad [BugFix] Fixed inaccurate of WaitTime estimation (#6938) 2022-06-11 15:41:54 +08:00
liuyehcf 3d343bb80e [BugFix] Fix index out of bounds exception of profile merge mechanism (#6789) 2022-06-11 15:41:54 +08:00
rickif bba404ce01 [BugFix] Support transferring json using HTTP chunk (#6972)
This PR is the last part of the fix for #5515, including the following modifications:
1. building a JSON buffer in the stream load handler to support a chunked JSON HTTP body (see the sketch after this list)
2. removing `Status MessageBodySink::append(const ByteBufferPtr& buf)` to reduce memory copies
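A minimal sketch of the buffering idea behind item 1, with illustrative names rather than the real stream-load classes:

```
#include <cstddef>
#include <string>

// A chunked HTTP body arrives in pieces, and JSON cannot be parsed until the
// document is complete, so the handler appends every piece into one buffer
// and parses only at end-of-body.
class JsonBodyBuffer {
public:
    void append(const char* data, size_t len) { _buf.append(data, len); }
    // Call once the final HTTP chunk has been received.
    const std::string& complete_document() const { return _buf; }

private:
    std::string _buf;
};
```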
2022-06-11 15:22:18 +08:00
gengjun-git 7a587dfcf7 [BugFix] Fix loss of metadata after alter routine load (#6937)
When a routine load serializes its load properties (column/row separators, column list, partitions, where filter), it stores the SQL statement and recovers the properties by parsing that SQL during deserialization. But after an alter routine load completes, only the alter statement is kept, which loses metadata. We should merge the alter SQL with the original create SQL to retain all load properties.
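An illustrative merge (not the FE implementation) of the described fix:

```
#include <map>
#include <string>

// Start from the properties parsed out of the original CREATE ROUTINE LOAD
// statement, then overlay only the properties the ALTER statement changed,
// so nothing from the original definition is lost on deserialization.
std::map<std::string, std::string> merge_load_properties(
        std::map<std::string, std::string> create_props,
        const std::map<std::string, std::string>& alter_props) {
    for (const auto& [key, value] : alter_props) {
        create_props[key] = value;  // ALTER overrides; everything else survives
    }
    return create_props;
}
```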
2022-06-11 15:17:36 +08:00
mergify[bot] 394a8acc2a
Fix the default encoding of largeint (#7113) (#7116)
(cherry picked from commit dc22c8d0b7)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-06-11 14:06:37 +08:00
mergify[bot] 4554982b2b
[BugFix] fix Java UDF crash (backport #7110) (#7112) 2022-06-11 13:19:33 +08:00
mergify[bot] ab4273b71d
update name and type when showing catalogs (#7093) (#7101)
(cherry picked from commit 44c41e9c75)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-06-11 10:17:03 +08:00
stdpain 2f5e5c2387 [Bugfix] fix call JNI in bthread in Java UDF (#7092)
(cherry picked from commit 482f841409)
2022-06-10 22:35:11 +08:00
gengjun-git 63e3186123
[BugFix] fix the npe bug when replaying the tablet consistency check data (#6685) (#7067)
Only check OLAP tables that are in the NORMAL state. Some tablets of a non-NORMAL table may exist only temporarily in memory; if we check those tablets and log FinishConsistencyCheck to bdb, a NullPointerException is thrown when the log is replayed.
2022-06-10 20:46:10 +08:00
meegoo d8ce6334d0 [BugFix] Fix be crash when flush mem table is null (#7005)
(cherry picked from commit d323b2a246)
2022-06-10 16:46:04 +08:00
zihe.liu 3dbe447270
[Bugfix] process returned status correctly (#7040) (#7070) 2022-06-10 12:47:44 +08:00
mergify[bot] 3165526f9f
[Enhance] change max_transmit_batched_bytes from 64KB to 256KB (#6959) (#7029)
(cherry picked from commit cd84601267)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-06-09 13:22:49 +08:00
mergify[bot] 2d4aa27e19
Fix CTAS speculation replication_num is wrong (#6994) (#7031)
The old code apparently omitted this logic; a UT is added so the problem does not recur.

(cherry picked from commit 536b979fef)

Co-authored-by: xueyan.li <astralidea@163.com>
2022-06-09 12:57:29 +08:00
mergify[bot] 925929ad3d
[Refactor] remove JsonValue from datum (backport #6887) (#6921) 2022-06-09 10:02:15 +08:00
mergify[bot] 39f295cc9a
[Bugfix] fix NPE in sort (backport #6949) (#6964) 2022-06-09 10:01:20 +08:00
mergify[bot] 42a3ee01fb
[BugFix] Fix merge limit error when subquery has limit with offset (backport #6920) (#7010) 2022-06-09 09:37:25 +08:00
mergify[bot] 7a0d53dd66
[BugFix] Fix the bitmap index for bool (#7003) (#7023)
(cherry picked from commit 7dfda17667)

Co-authored-by: Murphy <96611012+mofeiatwork@users.noreply.github.com>
2022-06-08 22:58:49 +08:00
mergify[bot] f52504e251
[Enhancement] Optimize task query metadata when there are many dbs (#6893) (#7007)
We can first send an RPC to find which DBs contain any Task/TaskRun, and then issue the metadata query only for those DBs, reducing the number of per-DB RPCs.
Usually, users do not create many DBs and submit Tasks in many different DBs simultaneously.
It can be optimized further later by building an index of which DBs each Task touches in advance; at present, no deeper optimization is performed. A sketch of the idea follows.
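A minimal sketch of the batching idea, with hypothetical types:

```
#include <cstdint>
#include <unordered_set>
#include <vector>

struct Task {
    int64_t db_id;  // hypothetical: the database that owns this task
};

// Derive the set of databases that actually contain tasks first, then issue
// one metadata query per member of that (usually small) set instead of one
// RPC per database in the cluster.
std::unordered_set<int64_t> dbs_with_tasks(const std::vector<Task>& tasks) {
    std::unordered_set<int64_t> dbs;
    for (const auto& task : tasks) dbs.insert(task.db_id);
    return dbs;
}
```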

(cherry picked from commit 9e1272b79f)

Co-authored-by: xueyan.li <astralidea@163.com>
2022-06-08 20:21:39 +08:00
mergify[bot] db473aea53
[BugFix] Fix npe when executing queryDictSync (#6968) (#6995)
(cherry picked from commit ff3d5230e2)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-06-08 18:36:35 +08:00
mergify[bot] 908acd948d
Fix UT RoutineLoadManagerTest.testExpired 2 (#6908) (#6983)
This UT failed because of bad timing in the setup: the time was changed after the metadata was saved, and there is logic that re-reads the metadata, at which point the previously set time was no longer in effect, so the UT always failed. The fix is simply to set the time before saving the element.

(cherry picked from commit 269dede54f)

Co-authored-by: xueyan.li <astralidea@163.com>
2022-06-08 15:23:39 +08:00
mergify[bot] a0e51a8581
[Enhancement][Log] Add pending to running state change log for task (#6831) (#6977)
The RUNNING state is missing when querying a task's execution state from INFORMATION_SCHEMA through a FOLLOWER, so add it.
2022-06-08 14:33:02 +08:00
mergify[bot] b7c168058b
[Feature] Add id field to Catalog (#6933) (#6948)
use catalog_id as a reserved field to later support operations such as rename
2022-06-07 18:42:27 +08:00
waittting 6d7e275f9f
Revert "[Feature] Make StarRocks support FQDN (#5127)" (#6945) (#6946)
This reverts commit aed944f3fb.
2022-06-07 15:18:34 +08:00
mergify[bot] c297e1d266
[Bugfix]support query external_catalog table when current_catalog is default_catalog (#6903) (#6917)
TableName#toSql() adds the catalog field.
AstBuilder#visitColumnReference() adapts to column names that have four levels.

(cherry picked from commit 0e2b8a9faf)

Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2022-06-07 11:22:59 +08:00
stephen 01ee573f5f
[cherry-pick] Get hive column statistics downgrade policy (#4409) (#6899) (#6926) 2022-06-07 10:55:07 +08:00
1012 changed files with 48684 additions and 6342 deletions

18
.github/workflows/add-pr-label.yml vendored Normal file
View File

@ -0,0 +1,18 @@
name: Labels
on:
pull_request_target:
types:
- opened
paths:
- 'docs/**'
jobs:
pr-label:
runs-on: ubuntu-latest
steps:
- name: add document label
uses: actions-ecosystem/action-add-labels@v1
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
labels: documentation

54
.github/workflows/sonar4fe.yml vendored Normal file
View File

@ -0,0 +1,54 @@
name: FE Sonar Build
on:
push:
branches:
- branch-2.3
pull_request:
paths:
- 'fe/**.java'
- 'fe/**.xml'
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
- name: Set up JDK 11
uses: actions/setup-java@v3
with:
java-version: 11
distribution: 'adopt'
- name: Cache SonarCloud packages
uses: actions/cache@v3
with:
path: ~/.sonar/cache
key: ${{ runner.os }}-sonar
restore-keys: ${{ runner.os }}-sonar
- name: Cache Maven packages
uses: actions/cache@v3
with:
path: ~/.m2
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-maven
- name: Setup thrift
uses: dodopizza/setup-thrift@v1
with:
version: 0.13.0
- name: Analyze FE
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any
SONAR_TOKEN: 391d6539e2d09aed3d187353bfd85fefa7a4c281 # ${{ secrets.SONAR_TOKEN }}
run: |
thrift --version
whereis thrift
mkdir -p thirdparty/installed/bin/
cd thirdparty/installed/bin/ && ln -s /usr/local/bin/thrift thrift
cd ${{ github.workspace }}/fe
mvn -B -DskipTests verify org.sonarsource.scanner.maven:sonar-maven-plugin:sonar -Dsonar.projectKey=StarRocks_starrocks -Dsonar.pullrequest.key=${{ github.event.number }} -Dsonar.pullrequest.base=${{ github.base_ref }} -Dsonar.pullrequest.branch=${{ github.head_ref }}

View File

@ -740,6 +740,7 @@ install(FILES
${BASE_DIR}/../conf/be.conf
${BASE_DIR}/../conf/cn.conf
${BASE_DIR}/../conf/hadoop_env.sh
${BASE_DIR}/../conf/log4j.properties
DESTINATION ${OUTPUT_DIR}/conf)
install(DIRECTORY

View File

@ -34,7 +34,6 @@
#include "storage/storage_engine.h"
#include "storage/utils.h"
#include "util/debug_util.h"
#include "util/network_util.h"
#include "util/thrift_server.h"
using std::fstream;
@ -80,52 +79,11 @@ Status HeartbeatServer::_heartbeat(const TMasterInfo& master_info) {
if (master_info.__isset.backend_ip) {
if (master_info.backend_ip != BackendOptions::get_localhost()) {
LOG(INFO) << master_info.backend_ip << " not equal to to backend localhost "
<< BackendOptions::get_localhost();
if (is_valid_ip(master_info.backend_ip)) {
LOG(WARNING) << "backend ip saved in master does not equal to backend local ip"
<< master_info.backend_ip << " vs. " << BackendOptions::get_localhost();
std::stringstream ss;
ss << "actual backend local ip: " << BackendOptions::get_localhost();
return Status::InternalError(ss.str());
}
std::string ip = hostname_to_ip(master_info.backend_ip);
if (ip.empty()) {
std::stringstream ss;
ss << "can not get ip from fqdn: " << master_info.backend_ip;
LOG(WARNING) << ss.str();
return Status::InternalError(ss.str());
}
std::vector<InetAddress> hosts;
Status status = get_hosts_v4(&hosts);
if (!status.ok() || hosts.empty()) {
std::stringstream ss;
ss << "the status was not ok when get_hosts_v4, error is " << status.get_error_msg();
LOG(WARNING) << ss.str();
return Status::InternalError(ss.str());
}
bool set_new_localhost = false;
for (std::vector<InetAddress>::iterator addr_it = hosts.begin(); addr_it != hosts.end(); ++addr_it) {
if (addr_it->is_address_v4() && addr_it->get_host_address_v4() == ip) {
BackendOptions::set_localhost(master_info.backend_ip);
set_new_localhost = true;
break;
}
}
if (!set_new_localhost) {
std::stringstream ss;
ss << "the host recorded in master is " << master_info.backend_ip
<< ", but we cannot found the local ip that mapped to that host." << BackendOptions::get_localhost();
LOG(WARNING) << ss.str();
return Status::InternalError(ss.str());
}
LOG(INFO) << "update localhost done, the new localhost is " << BackendOptions::get_localhost();
LOG(WARNING) << "backend ip saved in master does not equal to backend local ip" << master_info.backend_ip
<< " vs. " << BackendOptions::get_localhost();
std::stringstream ss;
ss << "actual backend local ip: " << BackendOptions::get_localhost();
return Status::InternalError(ss.str());
}
}

View File

@ -390,7 +390,7 @@ void* TaskWorkerPool::_create_tablet_worker_thread_callback(void* arg_this) {
TFinishTaskRequest finish_task_request;
finish_task_request.__set_finish_tablet_infos(finish_tablet_infos);
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_report_version(_s_report_version);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
@ -445,7 +445,7 @@ void* TaskWorkerPool::_drop_tablet_worker_thread_callback(void* arg_this) {
task_status.__set_error_msgs(error_msgs);
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
finish_task_request.__set_task_status(task_status);
@ -547,7 +547,7 @@ void TaskWorkerPool::_alter_tablet(TaskWorkerPool* worker_pool_this, const TAgen
}
// Return result to fe
finish_task_request->__set_backend(BackendOptions::get_localBackend());
finish_task_request->__set_backend(_backend);
finish_task_request->__set_report_version(_s_report_version);
finish_task_request->__set_task_type(task_type);
finish_task_request->__set_signature(signature);
@ -684,7 +684,7 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) {
TStatus task_status;
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
if (push_req.push_type == TPushType::DELETE) {
@ -901,7 +901,7 @@ void* TaskWorkerPool::_publish_version_worker_thread_callback(void* arg_this) {
}
status.to_thrift(&finish_task_request.task_status);
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(publish_version_task->task_type);
finish_task_request.__set_signature(publish_version_task->signature);
finish_task_request.__set_report_version(_s_report_version);
@ -990,7 +990,7 @@ void* TaskWorkerPool::_clear_transaction_task_worker_thread_callback(void* arg_t
TFinishTaskRequest finish_task_request;
finish_task_request.__set_task_status(task_status);
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
@ -1053,7 +1053,7 @@ void* TaskWorkerPool::_update_tablet_meta_worker_thread_callback(void* arg_this)
// because the primary index is available in cache
// But it will be remove from index cache after apply is finished
auto manager = StorageEngine::instance()->update_manager();
manager->index_cache().remove_by_key(tablet->tablet_id());
manager->index_cache().try_remove_by_key(tablet->tablet_id());
break;
}
}
@ -1067,7 +1067,7 @@ void* TaskWorkerPool::_update_tablet_meta_worker_thread_callback(void* arg_this)
TFinishTaskRequest finish_task_request;
finish_task_request.__set_task_status(task_status);
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
@ -1105,7 +1105,7 @@ void* TaskWorkerPool::_clone_worker_thread_callback(void* arg_this) {
// Return result to fe
TStatus task_status;
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
@ -1199,7 +1199,7 @@ void* TaskWorkerPool::_storage_medium_migrate_worker_thread_callback(void* arg_t
std::vector<std::string> error_msgs;
TStatus task_status;
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
@ -1322,7 +1322,7 @@ void* TaskWorkerPool::_check_consistency_worker_thread_callback(void* arg_this)
task_status.__set_error_msgs(error_msgs);
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
finish_task_request.__set_task_status(task_status);
@ -1339,6 +1339,7 @@ void* TaskWorkerPool::_report_task_worker_thread_callback(void* arg_this) {
TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this;
TReportRequest request;
request.__set_backend(worker_pool_this->_backend);
while ((!worker_pool_this->_stopped)) {
if (worker_pool_this->_master_info.network_address.port == 0) {
@ -1356,7 +1357,6 @@ void* TaskWorkerPool::_report_task_worker_thread_callback(void* arg_this) {
}
}
request.__set_tasks(tasks);
request.__set_backend(BackendOptions::get_localBackend());
StarRocksMetrics::instance()->report_task_requests_total.increment(1);
TMasterResult result;
@ -1378,6 +1378,7 @@ void* TaskWorkerPool::_report_disk_state_worker_thread_callback(void* arg_this)
TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this;
TReportRequest request;
request.__set_backend(worker_pool_this->_backend);
while ((!worker_pool_this->_stopped)) {
if (worker_pool_this->_master_info.network_address.port == 0) {
@ -1411,7 +1412,6 @@ void* TaskWorkerPool::_report_disk_state_worker_thread_callback(void* arg_this)
StarRocksMetrics::instance()->disks_state.set_metric(root_path_info.path, root_path_info.is_used ? 1L : 0L);
}
request.__set_disks(disks);
request.__set_backend(BackendOptions::get_localBackend());
StarRocksMetrics::instance()->report_disk_requests_total.increment(1);
TMasterResult result;
@ -1434,6 +1434,7 @@ void* TaskWorkerPool::_report_tablet_worker_thread_callback(void* arg_this) {
TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this;
TReportRequest request;
request.__set_backend(worker_pool_this->_backend);
request.__isset.tablets = true;
AgentStatus status = STARROCKS_SUCCESS;
@ -1459,7 +1460,6 @@ void* TaskWorkerPool::_report_tablet_worker_thread_callback(void* arg_this) {
std::max(StarRocksMetrics::instance()->tablet_cumulative_max_compaction_score.value(),
StarRocksMetrics::instance()->tablet_base_max_compaction_score.value());
request.__set_tablet_max_compaction_score(max_compaction_score);
request.__set_backend(BackendOptions::get_localBackend());
TMasterResult result;
status = worker_pool_this->_master_client->report(request, &result);
@ -1482,6 +1482,7 @@ void* TaskWorkerPool::_report_workgroup_thread_callback(void* arg_this) {
TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this;
TReportRequest request;
request.__set_backend(worker_pool_this->_backend);
AgentStatus status = STARROCKS_SUCCESS;
while ((!worker_pool_this->_stopped)) {
@ -1497,7 +1498,6 @@ void* TaskWorkerPool::_report_workgroup_thread_callback(void* arg_this) {
request.__set_report_version(_s_report_version);
auto workgroups = workgroup::WorkGroupManager::instance()->list_workgroups();
request.__set_active_workgroups(std::move(workgroups));
request.__set_backend(BackendOptions::get_localBackend());
TMasterResult result;
status = worker_pool_this->_master_client->report(request, &result);
@ -1555,7 +1555,7 @@ void* TaskWorkerPool::_upload_worker_thread_callback(void* arg_this) {
task_status.__set_error_msgs(error_msgs);
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
finish_task_request.__set_task_status(task_status);
@ -1611,7 +1611,7 @@ void* TaskWorkerPool::_download_worker_thread_callback(void* arg_this) {
task_status.__set_error_msgs(error_msgs);
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
finish_task_request.__set_task_status(task_status);
@ -1685,7 +1685,7 @@ void* TaskWorkerPool::_make_snapshot_thread_callback(void* arg_this) {
task_status.__set_error_msgs(error_msgs);
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
finish_task_request.__set_snapshot_path(snapshot_path);
@ -1739,7 +1739,7 @@ void* TaskWorkerPool::_release_snapshot_thread_callback(void* arg_this) {
task_status.__set_error_msgs(error_msgs);
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
finish_task_request.__set_task_status(task_status);
@ -1807,7 +1807,7 @@ void* TaskWorkerPool::_move_dir_thread_callback(void* arg_this) {
task_status.__set_error_msgs(error_msgs);
TFinishTaskRequest finish_task_request;
finish_task_request.__set_backend(BackendOptions::get_localBackend());
finish_task_request.__set_backend(worker_pool_this->_backend);
finish_task_request.__set_task_type(agent_task_req.task_type);
finish_task_request.__set_signature(agent_task_req.signature);
finish_task_request.__set_task_status(task_status);

View File

@ -45,7 +45,8 @@ uint8_t* ArrayColumn::mutable_raw_data() {
size_t ArrayColumn::byte_size(size_t from, size_t size) const {
DCHECK_LE(from + size, this->size()) << "Range error";
return _elements->byte_size(_offsets->get_data()[from], _offsets->get_data()[from + size]) +
return _elements->byte_size(_offsets->get_data()[from],
_offsets->get_data()[from + size] - _offsets->get_data()[from]) +
_offsets->Column::byte_size(from, size);
}
@ -130,6 +131,18 @@ void ArrayColumn::append_default(size_t count) {
_offsets->append_value_multiple_times(&offset, count);
}
void ArrayColumn::fill_default(const Filter& filter) {
std::vector<uint32_t> indexes;
for (size_t i = 0; i < filter.size(); i++) {
if (filter[i] == 1 && get_element_size(i) > 0) {
indexes.push_back(i);
}
}
auto default_column = clone_empty();
default_column->append_default(indexes.size());
update_rows(*default_column, indexes.data());
}
Status ArrayColumn::update_rows(const Column& src, const uint32_t* indexes) {
const auto& array_column = down_cast<const ArrayColumn&>(src);
@ -427,6 +440,11 @@ Datum ArrayColumn::get(size_t idx) const {
return Datum(res);
}
size_t ArrayColumn::get_element_size(size_t idx) const {
DCHECK_LT(idx + 1, _offsets->size());
return _offsets->get_data()[idx + 1] - _offsets->get_data()[idx];
}
bool ArrayColumn::set_null(size_t idx) {
return false;
}

View File

@ -80,6 +80,8 @@ public:
void append_default(size_t count) override;
void fill_default(const Filter& filter) override;
Status update_rows(const Column& src, const uint32_t* indexes) override;
void remove_first_n_values(size_t count) override {}
@ -119,6 +121,8 @@ public:
Datum get(size_t idx) const override;
size_t get_element_size(size_t idx) const;
bool set_null(size_t idx) override;
size_t memory_usage() const override { return _elements->memory_usage() + _offsets->memory_usage(); }

View File

@ -209,6 +209,23 @@ void BinaryColumnBase<T>::_build_slices() const {
_slices_cache = true;
}
template <typename T>
void BinaryColumnBase<T>::fill_default(const Filter& filter) {
std::vector<uint32_t> indexes;
for (size_t i = 0; i < filter.size(); i++) {
size_t len = _offsets[i + 1] - _offsets[i];
if (filter[i] == 1 && len > 0) {
indexes.push_back(i);
}
}
if (indexes.empty()) {
return;
}
auto default_column = clone_empty();
default_column->append_default(indexes.size());
update_rows(*default_column, indexes.data());
}
template <typename T>
Status BinaryColumnBase<T>::update_rows(const Column& src, const uint32_t* indexes) {
const auto& src_column = down_cast<const BinaryColumnBase<T>&>(src);

View File

@ -181,6 +181,8 @@ public:
_slices_cache = false;
}
void fill_default(const Filter& filter) override;
Status update_rows(const Column& src, const uint32_t* indexes) override;
uint32_t max_one_element_serialize_size() const override;
@ -242,6 +244,8 @@ public:
const Bytes& get_bytes() const { return _bytes; }
const uint8_t* continuous_data() const override { return reinterpret_cast<const uint8_t*>(_bytes.data()); }
Offsets& get_offset() { return _offsets; }
const Offsets& get_offset() const { return _offsets; }

View File

@ -85,6 +85,8 @@ public:
virtual uint8_t* mutable_raw_data() = 0;
virtual const uint8_t* continuous_data() const { return raw_data(); }
// Return number of values in column.
virtual size_t size() const = 0;
@ -147,6 +149,9 @@ public:
virtual void append(const Column& src) { append(src, 0, src.size()); }
// Update elements to default value which hit by the filter
virtual void fill_default(const Filter& filter) = 0;
// This function will update data from src according to the input indexes. 'indexes' contains
// the row index will be update
// For example:

View File

@ -35,6 +35,10 @@ void ConstColumn::append_value_multiple_times(const Column& src, uint32_t index,
append(src, index, size);
}
void ConstColumn::fill_default(const Filter& filter) {
CHECK(false) << "ConstColumn does not support update";
}
Status ConstColumn::update_rows(const Column& src, const uint32_t* indexes) {
return Status::NotSupported("ConstColumn does not support update");
}

View File

@ -116,6 +116,8 @@ public:
void append_default(size_t count) override { _size += count; }
void fill_default(const Filter& filter) override;
Status update_rows(const Column& src, const uint32_t* indexes) override;
uint32_t serialize(size_t idx, uint8_t* pos) override { return _data->serialize(0, pos); }

View File

@ -10,7 +10,6 @@
#include "types/date_value.hpp"
#include "types/timestamp_value.h"
#include "util/int96.h"
#include "util/json.h"
#include "util/percentile_value.h"
#include "util/slice.h"
@ -105,11 +104,6 @@ public:
}
}
template <typename T>
std::add_pointer_t<std::add_const_t<T>> get_if() const {
return std::get_if<std::remove_const_t<T>>(&_value);
}
template <typename T>
void set(T value) {
if constexpr (std::is_same_v<DateValue, T>) {
@ -125,21 +119,6 @@ public:
}
}
template <typename T>
void move_in(T&& value) {
if constexpr (std::is_same_v<DateValue, T>) {
_value = value.julian();
} else if constexpr (std::is_same_v<TimestampValue, T>) {
_value = value.timestamp();
} else if constexpr (std::is_same_v<bool, T>) {
_value = (int8_t)value;
} else if constexpr (std::is_unsigned_v<T>) {
_value = (std::make_signed_t<T>)value;
} else {
_value = std::move(value);
}
}
bool is_null() const { return _value.index() == 0; }
void set_null() { _value = std::monostate(); }
@ -150,15 +129,9 @@ public:
}
private:
// NOTE
// Either JsonValue and JsonValue* could stored in datum.
// - Pointer type JsonValue* is used as view-type, to navigate datum in a column without copy data
// - Value type JsonValue is used to store real data and own the value itself, which is mostly used to hold a
// JsonValue as return value. Right now only schema-change procedure use it.
using Variant =
std::variant<std::monostate, int8_t, uint8_t, int16_t, uint16_t, uint24_t, int32_t, uint32_t, int64_t,
uint64_t, int96_t, int128_t, Slice, decimal12_t, DecimalV2Value, float, double, DatumArray,
HyperLogLog*, BitmapValue*, PercentileValue*, JsonValue*, JsonValue>;
using Variant = std::variant<std::monostate, int8_t, uint8_t, int16_t, uint16_t, uint24_t, int32_t, uint32_t,
int64_t, uint64_t, int96_t, int128_t, Slice, decimal12_t, DecimalV2Value, float,
double, DatumArray, HyperLogLog*, BitmapValue*, PercentileValue*, JsonValue*>;
Variant _value;
};

View File

@ -10,6 +10,7 @@
#include "storage/decimal12.h"
#include "util/hash_util.hpp"
#include "util/mysql_row_buffer.h"
#include "util/value_generator.h"
namespace starrocks::vectorized {
@ -49,6 +50,16 @@ void FixedLengthColumnBase<T>::append_value_multiple_times(const Column& src, ui
}
}
template <typename T>
void FixedLengthColumnBase<T>::fill_default(const Filter& filter) {
T val = DefaultValueGenerator<T>::next_value();
for (size_t i = 0; i < filter.size(); i++) {
if (filter[i] == 1) {
_data[i] = val;
}
}
}
template <typename T>
Status FixedLengthColumnBase<T>::update_rows(const Column& src, const uint32_t* indexes) {
const T* src_data = reinterpret_cast<const T*>(src.raw_data());

View File

@ -129,6 +129,8 @@ public:
_data.resize(_data.size() + count, DefaultValueGenerator<ValueType>::next_value());
}
void fill_default(const Filter& filter) override;
Status update_rows(const Column& src, const uint32_t* indexes) override;
// The `_data` support one size(> 2^32), but some interface such as update_rows() will use uint32_t to

View File

@ -9,14 +9,9 @@
namespace starrocks::vectorized {
void JsonColumn::append_datum(const Datum& datum) {
if (const JsonValue* json = datum.get_if<JsonValue>()) {
append(json);
} else if (JsonValue* const* json_p = datum.get_if<JsonValue*>()) {
append(*json_p);
} else {
CHECK(false) << "invalid datum type";
}
append(datum.get<JsonValue*>());
}
int JsonColumn::compare_at(size_t left_idx, size_t right_idx, const starrocks::vectorized::Column& rhs,
int nan_direction_hint) const {
JsonValue* x = get_object(left_idx);

View File

@ -18,7 +18,7 @@ NullableColumn::NullableColumn(MutableColumnPtr&& data_column, MutableColumnPtr&
<< "nullable column's data must be single column";
ColumnPtr ptr = std::move(null_column);
_null_column = std::static_pointer_cast<NullColumn>(ptr);
_has_null = SIMD::count_nonzero(_null_column->get_data());
_has_null = SIMD::contain_nonzero(_null_column->get_data(), 0);
}
NullableColumn::NullableColumn(ColumnPtr data_column, NullColumnPtr null_column)
@ -58,7 +58,7 @@ void NullableColumn::append(const Column& src, size_t offset, size_t count) {
_null_column->append(*c._null_column, offset, count);
_data_column->append(*c._data_column, offset, count);
_has_null = _has_null || SIMD::count_nonzero(&(c._null_column->get_data()[offset]), count);
_has_null = _has_null || SIMD::contain_nonzero(c._null_column->get_data(), offset, count);
} else {
_null_column->resize(_null_column->size() + count);
_data_column->append(src, offset, count);
@ -78,7 +78,7 @@ void NullableColumn::append_selective(const Column& src, const uint32_t* indexes
_null_column->append_selective(*src_column._null_column, indexes, from, size);
_data_column->append_selective(*src_column._data_column, indexes, from, size);
_has_null = _has_null || SIMD::count_nonzero(&_null_column->get_data()[orig_size], size);
_has_null = _has_null || SIMD::contain_nonzero(_null_column->get_data(), orig_size, size);
} else {
_null_column->resize(orig_size + size);
_data_column->append_selective(src, indexes, from, size);
@ -98,7 +98,7 @@ void NullableColumn::append_value_multiple_times(const Column& src, uint32_t ind
_null_column->append_value_multiple_times(*src_column._null_column, index, size);
_data_column->append_value_multiple_times(*src_column._data_column, index, size);
_has_null = _has_null || SIMD::count_nonzero(&_null_column->get_data()[orig_size], size);
_has_null = _has_null || SIMD::contain_nonzero(_null_column->get_data(), orig_size, size);
} else {
_null_column->resize(orig_size + size);
_data_column->append_value_multiple_times(src, index, size);
@ -157,6 +157,17 @@ void NullableColumn::append_value_multiple_times(const void* value, size_t count
null_column_data().insert(null_column_data().end(), count, 0);
}
void NullableColumn::fill_null_with_default() {
if (null_count() == 0) {
return;
}
_data_column->fill_default(_null_column->get_data());
}
void NullableColumn::update_has_null() {
_has_null = SIMD::contain_nonzero(_null_column->get_data(), 0);
}
Status NullableColumn::update_rows(const Column& src, const uint32_t* indexes) {
DCHECK_EQ(_null_column->size(), _data_column->size());
size_t replace_num = src.size();
@ -164,6 +175,8 @@ Status NullableColumn::update_rows(const Column& src, const uint32_t* indexes) {
const auto& c = down_cast<const NullableColumn&>(src);
RETURN_IF_ERROR(_null_column->update_rows(*c._null_column, indexes));
RETURN_IF_ERROR(_data_column->update_rows(*c._data_column, indexes));
// update rows may convert between null and not null, so we need count every times
update_has_null();
} else {
auto new_null_column = NullColumn::create();
new_null_column->get_data().insert(new_null_column->get_data().end(), replace_num, 0);
@ -345,7 +358,7 @@ void NullableColumn::check_or_die() const {
CHECK_EQ(_null_column->size(), _data_column->size());
// when _has_null=true, the column may have no null value, so don't check.
if (!_has_null) {
CHECK_EQ(SIMD::count_nonzero(_null_column->get_data()), 0);
CHECK(!SIMD::contain_nonzero(_null_column->get_data(), 0));
}
_data_column->check_or_die();
_null_column->check_or_die();

View File

@ -51,11 +51,12 @@ public:
void set_has_null(bool has_null) { _has_null = _has_null | has_null; }
void update_has_null() {
const NullColumn::Container& v = _null_column->get_data();
const auto* p = v.data();
_has_null = (p != nullptr) && (nullptr != memchr(p, 1, v.size() * sizeof(v[0])));
}
// Update null element to default value
void fill_null_with_default();
void fill_default(const Filter& filter) override {}
void update_has_null();
bool is_nullable() const override { return true; }

View File

@ -127,6 +127,16 @@ void ObjectColumn<T>::append_default(size_t count) {
}
}
template <typename T>
void ObjectColumn<T>::fill_default(const Filter& filter) {
for (size_t i = 0; i < filter.size(); i++) {
if (filter[i] == 1) {
_pool[i] = {};
}
}
_cache_ok = false;
}
template <typename T>
Status ObjectColumn<T>::update_rows(const Column& src, const uint32_t* indexes) {
const auto& obj_col = down_cast<const ObjectColumn<T>&>(src);

View File

@ -103,6 +103,8 @@ public:
void append_default(size_t count) override;
void fill_default(const Filter& filter) override;
Status update_rows(const Column& src, const uint32_t* indexes) override;
uint32_t serialize(size_t idx, uint8_t* pos) override;

View File

@ -23,8 +23,7 @@
#include "configbase.h"
namespace starrocks {
namespace config {
namespace starrocks::config {
// The cluster id.
CONF_Int32(cluster_id, "-1");
// The port on which ImpalaInternalService is exported.
@ -587,8 +586,8 @@ CONF_Int32(late_materialization_ratio, "10");
// `1000` will enable late materialization always select metric type.
CONF_Int32(metric_late_materialization_ratio, "1000");
// Max batched bytes for each transmit request.
CONF_Int64(max_transmit_batched_bytes, "65536");
// Max batched bytes for each transmit request. (256KB)
CONF_Int64(max_transmit_batched_bytes, "262144");
CONF_Int16(bitmap_max_filter_items, "30");
@ -743,6 +742,18 @@ CONF_String(starmgr_addr, "");
CONF_Int32(starlet_port, "9070");
#endif
} // namespace config
CONF_mBool(dependency_librdkafka_debug_enable, "false");
} // namespace starrocks
// A comma-separated list of debug contexts to enable.
// Producer debug context: broker, topic, msg
// Consumer debug context: consumer, cgrp, topic, fetch
// Other debug context: generic, metadata, feature, queue, protocol, security, interceptor, plugin
// admin, eos, mock, assigner, conf
CONF_String(dependency_librdkafka_debug, "all");
// Enable compression in table sink.
// The BE supports compression would get error when communicate with BE dose not support compression.
// For compatible consideration, we disable it by default.
CONF_Bool(table_sink_compression_enable, "false");
} // namespace starrocks::config

View File

@ -10,4 +10,6 @@ constexpr const int DEFAULT_CHUNK_SIZE = 4096;
// Chunk size for some huge type(HLL, JSON)
constexpr inline int CHUNK_SIZE_FOR_HUGE_TYPE = 4096;
constexpr inline int NUM_LOCK_SHARD_LOG = 5;
} // namespace starrocks

View File

@ -33,8 +33,12 @@ public:
// how many rows read from storage
virtual int64_t raw_rows_read() const = 0;
// how mnay rows returned after filtering.
// how many rows returned after filtering.
virtual int64_t num_rows_read() const = 0;
// how many bytes read from external
virtual int64_t num_bytes_read() const = 0;
// CPU time of this data source
virtual int64_t cpu_time_spent() const = 0;
// following fields are set by framework
// 1. runtime profile: any metrics you want to record

View File

@ -78,6 +78,12 @@ int64_t ESDataSource::raw_rows_read() const {
int64_t ESDataSource::num_rows_read() const {
return _rows_return_number;
}
int64_t ESDataSource::num_bytes_read() const {
return _bytes_read;
}
int64_t ESDataSource::cpu_time_spent() const {
return _cpu_time_ns;
}
Status ESDataSource::_build_conjuncts() {
Status status = Status::OK();
@ -218,6 +224,8 @@ Status ESDataSource::get_next(RuntimeState* state, vectorized::ChunkPtr* chunk)
return Status::EndOfFile("");
}
}
SCOPED_RAW_TIMER(&_cpu_time_ns);
{
SCOPED_TIMER(_materialize_timer);
RETURN_IF_ERROR(_es_scroll_parser->fill_chunk(state, chunk, &_line_eof));
@ -228,6 +236,7 @@ Status ESDataSource::get_next(RuntimeState* state, vectorized::ChunkPtr* chunk)
int64_t before = ck->num_rows();
COUNTER_UPDATE(_rows_read_counter, before);
_rows_read_number += before;
_bytes_read += ck->bytes_usage();
ExecNode::eval_conjuncts(_conjunct_ctxs, ck);

View File

@ -50,6 +50,8 @@ public:
int64_t raw_rows_read() const override;
int64_t num_rows_read() const override;
int64_t num_bytes_read() const override;
int64_t cpu_time_spent() const override;
private:
const ESDataSourceProvider* _provider;
@ -74,6 +76,8 @@ private:
bool _batch_eof = false;
int64_t _rows_read_number = 0;
int64_t _rows_return_number = 0;
int64_t _bytes_read = 0;
int64_t _cpu_time_ns = 0;
ESScanReader* _es_reader = nullptr;
std::unique_ptr<vectorized::ScrollParser> _es_scroll_parser;
@ -82,7 +86,6 @@ private:
RuntimeProfile::Counter* _read_timer = nullptr;
RuntimeProfile::Counter* _materialize_timer = nullptr;
RuntimeProfile::Counter* _rows_read_counter = nullptr;
// =========================
Status _build_conjuncts();

View File

@ -143,6 +143,9 @@ void HiveDataSource::_init_tuples_and_slots(RuntimeState* state) {
if (hdfs_scan_node.__isset.hive_column_names) {
_hive_column_names = hdfs_scan_node.hive_column_names;
}
if (hdfs_scan_node.__isset.case_sensitive) {
_case_sensitive = hdfs_scan_node.case_sensitive;
}
}
void HiveDataSource::_decompose_conjunct_ctxs() {
@ -177,7 +180,6 @@ void HiveDataSource::_init_counter(RuntimeState* state) {
const auto& hdfs_scan_node = _provider->_hdfs_scan_node;
_profile.runtime_profile = _runtime_profile;
_profile.pool = _pool;
_profile.rows_read_counter = ADD_COUNTER(_runtime_profile, "RowsRead", TUnit::UNIT);
_profile.bytes_read_counter = ADD_COUNTER(_runtime_profile, "BytesRead", TUnit::BYTES);
@ -242,6 +244,7 @@ Status HiveDataSource::_init_scanner(RuntimeState* state) {
scanner_params.min_max_conjunct_ctxs = _min_max_conjunct_ctxs;
scanner_params.min_max_tuple_desc = _min_max_tuple_desc;
scanner_params.hive_column_names = &_hive_column_names;
scanner_params.case_sensitive = _case_sensitive;
scanner_params.profile = &_profile;
scanner_params.open_limit = nullptr;
@ -292,6 +295,14 @@ int64_t HiveDataSource::num_rows_read() const {
if (_scanner == nullptr) return 0;
return _scanner->num_rows_read();
}
int64_t HiveDataSource::num_bytes_read() const {
if (_scanner == nullptr) return 0;
return _scanner->num_bytes_read();
}
int64_t HiveDataSource::cpu_time_spent() const {
if (_scanner == nullptr) return 0;
return _scanner->cpu_time_spent();
}
} // namespace connector
} // namespace starrocks

View File

@ -43,6 +43,8 @@ public:
int64_t raw_rows_read() const override;
int64_t num_rows_read() const override;
int64_t num_bytes_read() const override;
int64_t cpu_time_spent() const override;
private:
const HiveDataSourceProvider* _provider;
@ -98,6 +100,7 @@ private:
std::vector<std::string> _hive_column_names;
const LakeTableDescriptor* _lake_table = nullptr;
bool _case_sensitive = false;
// ======================================
// The following are profile metrics
@ -105,4 +108,4 @@ private:
};
} // namespace connector
} // namespace starrocks
} // namespace starrocks

View File

@ -83,6 +83,7 @@ Status JDBCDataSource::get_next(RuntimeState* state, vectorized::ChunkPtr* chunk
return Status::EndOfFile("");
}
_rows_read += (*chunk)->num_rows();
_bytes_read += (*chunk)->bytes_usage();
return Status::OK();
}
@ -92,6 +93,13 @@ int64_t JDBCDataSource::raw_rows_read() const {
int64_t JDBCDataSource::num_rows_read() const {
return _rows_read;
}
int64_t JDBCDataSource::num_bytes_read() const {
return _bytes_read;
}
int64_t JDBCDataSource::cpu_time_spent() const {
// TODO: calculte the real cputime
return 0;
}
Status JDBCDataSource::_create_scanner(RuntimeState* state) {
const TJDBCScanNode& jdbc_scan_node = _provider->_jdbc_scan_node;

View File

@ -49,6 +49,8 @@ public:
int64_t raw_rows_read() const override;
int64_t num_rows_read() const override;
int64_t num_bytes_read() const override;
int64_t cpu_time_spent() const override;
private:
Status _create_scanner(RuntimeState* state);
@ -60,6 +62,7 @@ private:
RuntimeState* _runtime_state = nullptr;
vectorized::JDBCScanner* _scanner = nullptr;
int64_t _rows_read = 0;
int64_t _bytes_read = 0;
};
} // namespace connector

View File

@ -230,6 +230,7 @@ Status MySQLDataSource::get_next(RuntimeState* state, vectorized::ChunkPtr* chun
++row_num;
RETURN_IF_ERROR(fill_chunk(chunk, data, length));
++_rows_read;
_bytes_read += (*chunk)->bytes_usage();
}
}
@ -241,11 +242,21 @@ int64_t MySQLDataSource::num_rows_read() const {
return _rows_read;
}
int64_t MySQLDataSource::num_bytes_read() const {
return _bytes_read;
}
int64_t MySQLDataSource::cpu_time_spent() const {
return _cpu_time_spent_ns;
}
void MySQLDataSource::close(RuntimeState* state) {
SCOPED_TIMER(_runtime_profile->total_time_counter());
}
Status MySQLDataSource::fill_chunk(vectorized::ChunkPtr* chunk, char** data, size_t* length) {
SCOPED_RAW_TIMER(&_cpu_time_spent_ns);
int materialized_col_idx = -1;
for (size_t col_idx = 0; col_idx < _slot_num; ++col_idx) {
SlotDescriptor* slot_desc = _tuple_desc->slots()[col_idx];

View File

@ -48,6 +48,8 @@ public:
int64_t raw_rows_read() const override;
int64_t num_rows_read() const override;
int64_t num_bytes_read() const override;
int64_t cpu_time_spent() const override;
private:
const MySQLDataSourceProvider* _provider;
@ -75,6 +77,8 @@ private:
std::unique_ptr<MysqlScanner> _mysql_scanner;
int64_t _rows_read = 0;
int64_t _bytes_read = 0;
int64_t _cpu_time_spent_ns = 0;
Status fill_chunk(vectorized::ChunkPtr* chunk, char** data, size_t* length);

View File

@ -146,6 +146,7 @@ set(EXEC_FILES
pipeline/scan/olap_scan_context.cpp
pipeline/scan/connector_scan_operator.cpp
pipeline/scan/morsel.cpp
pipeline/scan/chunk_buffer_limiter.cpp
pipeline/select_operator.cpp
pipeline/crossjoin/cross_join_context.cpp
pipeline/crossjoin/cross_join_right_sink_operator.cpp
@ -190,6 +191,7 @@ set(EXEC_FILES
pipeline/set/intersect_build_sink_operator.cpp
pipeline/set/intersect_probe_sink_operator.cpp
pipeline/set/intersect_output_source_operator.cpp
pipeline/chunk_accumulate_operator.cpp
workgroup/work_group.cpp
workgroup/scan_executor.cpp
workgroup/scan_task_queue.cpp

View File

@ -22,6 +22,7 @@
#include "exec/exchange_node.h"
#include "column/chunk.h"
#include "exec/pipeline/chunk_accumulate_operator.h"
#include "exec/pipeline/exchange/exchange_merge_sort_source_operator.h"
#include "exec/pipeline/exchange/exchange_source_operator.h"
#include "exec/pipeline/limit_operator.h"
@ -234,15 +235,13 @@ void ExchangeNode::debug_string(int indentation_level, std::stringstream* out) c
pipeline::OpFactories ExchangeNode::decompose_to_pipeline(pipeline::PipelineBuilderContext* context) {
using namespace pipeline;
OpFactories operators;
if (!_is_merging) {
auto exchange_source_op = std::make_shared<ExchangeSourceOperatorFactory>(
context->next_operator_id(), id(), _texchange_node, _num_senders, _input_row_desc);
exchange_source_op->set_degree_of_parallelism(context->degree_of_parallelism());
operators.emplace_back(exchange_source_op);
if (limit() != -1) {
operators.emplace_back(std::make_shared<LimitOperatorFactory>(context->next_operator_id(), id(), limit()));
}
} else {
auto exchange_merge_sort_source_operator = std::make_shared<ExchangeMergeSortSourceOperatorFactory>(
context->next_operator_id(), id(), _num_senders, _input_row_desc, &_sort_exec_exprs, _is_asc_order,
@ -250,10 +249,16 @@ pipeline::OpFactories ExchangeNode::decompose_to_pipeline(pipeline::PipelineBuil
exchange_merge_sort_source_operator->set_degree_of_parallelism(1);
operators.emplace_back(std::move(exchange_merge_sort_source_operator));
}
// Create a shared RefCountedRuntimeFilterCollector
auto&& rc_rf_probe_collector = std::make_shared<RcRfProbeCollector>(1, std::move(this->runtime_filter_collector()));
// Initialize OperatorFactory's fields involving runtime filters.
this->init_runtime_filter_for_operator(operators.back().get(), context, rc_rf_probe_collector);
if (operators.back()->has_runtime_filters()) {
operators.emplace_back(std::make_shared<ChunkAccumulateOperatorFactory>(context->next_operator_id(), id()));
}
if (limit() != -1) {
operators.emplace_back(std::make_shared<LimitOperatorFactory>(context->next_operator_id(), id(), limit()));
}

View File

@ -313,7 +313,7 @@ Status ExecNode::close(RuntimeState* state) {
return Status::OK();
}
_is_closed = true;
RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE));
exec_debug_action(TExecNodePhase::CLOSE);
if (_rows_returned_counter != nullptr) {
COUNTER_SET(_rows_returned_counter, _num_rows_returned);

View File

@ -30,7 +30,7 @@
namespace starrocks {
MysqlScanner::MysqlScanner(const MysqlScannerParam& param)
: _my_param(param), _my_conn(nullptr), _my_result(nullptr), _is_open(false), _field_num(0) {}
: _my_param(param), _my_conn(nullptr), _my_result(nullptr), _opened(false), _field_num(0) {}
MysqlScanner::~MysqlScanner() {
if (_my_result) {
@ -50,7 +50,7 @@ MysqlScanner::~MysqlScanner() {
}
Status MysqlScanner::open() {
if (_is_open) {
if (_opened) {
LOG(INFO) << "this scanner already opened";
return Status::OK();
}
@ -77,13 +77,13 @@ Status MysqlScanner::open() {
return Status::InternalError("mysql set character set failed.");
}
_is_open = true;
_opened = true;
return Status::OK();
}
Status MysqlScanner::query(const std::string& query) {
if (!_is_open) {
if (!_opened) {
return Status::InternalError("Query before open.");
}
@ -118,7 +118,7 @@ Status MysqlScanner::query(const std::string& table, const std::vector<std::stri
const std::vector<std::string>& filters,
const std::unordered_map<std::string, std::vector<std::string>>& filters_in,
std::unordered_map<std::string, bool>& filters_null_in_set, int64_t limit) {
if (!_is_open) {
if (!_opened) {
return Status::InternalError("Query before open.");
}
@ -191,7 +191,7 @@ Status MysqlScanner::query(const std::string& table, const std::vector<std::stri
}
Status MysqlScanner::get_next_row(char*** buf, unsigned long** lengths, bool* eos) {
if (!_is_open) {
if (!_opened) {
return Status::InternalError("GetNextRow before open.");
}

View File

@ -72,7 +72,7 @@ private:
__StarRocksMysql* _my_conn;
__StarRocksMysqlRes* _my_result;
std::string _sql_str;
bool _is_open;
bool _opened;
int _field_num;
};

View File

@ -53,7 +53,7 @@ StatusOr<vectorized::ChunkPtr> AggregateBlockingSinkOperator::pull_chunk(Runtime
}
Status AggregateBlockingSinkOperator::push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) {
_aggregator->evaluate_exprs(chunk.get());
RETURN_IF_ERROR(_aggregator->evaluate_exprs(chunk.get()));
bool agg_group_by_with_limit =
(!_aggregator->is_none_group_by_exprs() && // has group by
@ -75,8 +75,7 @@ Status AggregateBlockingSinkOperator::push_chunk(RuntimeState* state, const vect
APPLY_FOR_AGG_VARIANT_ALL(HASH_MAP_METHOD)
#undef HASH_MAP_METHOD
_mem_tracker->set(_aggregator->hash_map_variant().memory_usage() +
_aggregator->mem_pool()->total_reserved_bytes());
_mem_tracker->set(_aggregator->hash_map_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_map());
}
if (_aggregator->is_none_group_by_exprs()) {

View File

@ -47,7 +47,7 @@ StatusOr<vectorized::ChunkPtr> AggregateDistinctBlockingSinkOperator::pull_chunk
Status AggregateDistinctBlockingSinkOperator::push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) {
DCHECK_LE(chunk->num_rows(), state->chunk_size());
_aggregator->evaluate_exprs(chunk.get());
RETURN_IF_ERROR(_aggregator->evaluate_exprs(chunk.get()));
{
SCOPED_TIMER(_aggregator->agg_compute_timer());
@ -63,8 +63,7 @@ Status AggregateDistinctBlockingSinkOperator::push_chunk(RuntimeState* state, co
APPLY_FOR_AGG_VARIANT_ALL(HASH_SET_METHOD)
#undef HASH_SET_METHOD
_mem_tracker->set(_aggregator->hash_set_variant().memory_usage() +
_aggregator->mem_pool()->total_reserved_bytes());
_mem_tracker->set(_aggregator->hash_set_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_set());
_aggregator->update_num_input_rows(chunk->num_rows());

View File

@ -38,7 +38,7 @@ Status AggregateDistinctStreamingSinkOperator::push_chunk(RuntimeState* state, c
_aggregator->update_num_input_rows(chunk_size);
COUNTER_SET(_aggregator->input_row_count(), _aggregator->num_input_rows());
_aggregator->evaluate_exprs(chunk.get());
RETURN_IF_ERROR(_aggregator->evaluate_exprs(chunk.get()));
if (_aggregator->streaming_preaggregation_mode() == TStreamingPreaggregationMode::FORCE_STREAMING) {
return _push_chunk_by_force_streaming();
@ -74,7 +74,7 @@ Status AggregateDistinctStreamingSinkOperator::_push_chunk_by_force_preaggregati
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_set_variant().size());
_mem_tracker->set(_aggregator->hash_set_variant().memory_usage() + _aggregator->mem_pool()->total_reserved_bytes());
_mem_tracker->set(_aggregator->hash_set_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_set());
return Status::OK();
@ -85,9 +85,9 @@ Status AggregateDistinctStreamingSinkOperator::_push_chunk_by_auto(const size_t
size_t real_capacity = _aggregator->hash_set_variant().capacity() - _aggregator->hash_set_variant().capacity() / 8;
size_t remain_size = real_capacity - _aggregator->hash_set_variant().size();
bool ht_needs_expansion = remain_size < chunk_size;
size_t allocated_bytes = _aggregator->hash_set_variant().allocated_memory_usage(_aggregator->mem_pool());
if (!ht_needs_expansion ||
_aggregator->should_expand_preagg_hash_tables(_aggregator->num_input_rows(), chunk_size,
_aggregator->mem_pool()->total_allocated_bytes(),
_aggregator->should_expand_preagg_hash_tables(_aggregator->num_input_rows(), chunk_size, allocated_bytes,
_aggregator->hash_set_variant().size())) {
// hash table is not full or allow expand the hash table according reduction rate
SCOPED_TIMER(_aggregator->agg_compute_timer());
@ -106,8 +106,7 @@ Status AggregateDistinctStreamingSinkOperator::_push_chunk_by_auto(const size_t
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_set_variant().size());
_mem_tracker->set(_aggregator->hash_set_variant().memory_usage() +
_aggregator->mem_pool()->total_reserved_bytes());
_mem_tracker->set(_aggregator->hash_set_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_set());
} else {
{

View File

@ -38,7 +38,7 @@ Status AggregateStreamingSinkOperator::push_chunk(RuntimeState* state, const vec
_aggregator->update_num_input_rows(chunk_size);
COUNTER_SET(_aggregator->input_row_count(), _aggregator->num_input_rows());
_aggregator->evaluate_exprs(chunk.get());
RETURN_IF_ERROR(_aggregator->evaluate_exprs(chunk.get()));
if (_aggregator->streaming_preaggregation_mode() == TStreamingPreaggregationMode::FORCE_STREAMING) {
return _push_chunk_by_force_streaming();
@ -78,7 +78,7 @@ Status AggregateStreamingSinkOperator::_push_chunk_by_force_preaggregation(const
_aggregator->compute_batch_agg_states(chunk_size);
}
_mem_tracker->set(_aggregator->hash_map_variant().memory_usage() + _aggregator->mem_pool()->total_reserved_bytes());
_mem_tracker->set(_aggregator->hash_map_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_map());
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_map_variant().size());
@ -91,9 +91,9 @@ Status AggregateStreamingSinkOperator::_push_chunk_by_auto(const size_t chunk_si
size_t real_capacity = _aggregator->hash_map_variant().capacity() - _aggregator->hash_map_variant().capacity() / 8;
size_t remain_size = real_capacity - _aggregator->hash_map_variant().size();
bool ht_needs_expansion = remain_size < chunk_size;
size_t allocated_bytes = _aggregator->hash_map_variant().allocated_memory_usage(_aggregator->mem_pool());
if (!ht_needs_expansion ||
_aggregator->should_expand_preagg_hash_tables(_aggregator->num_input_rows(), chunk_size,
_aggregator->mem_pool()->total_allocated_bytes(),
_aggregator->should_expand_preagg_hash_tables(_aggregator->num_input_rows(), chunk_size, allocated_bytes,
_aggregator->hash_map_variant().size())) {
// hash table is not full, or expanding the hash table is allowed according to the reduction rate
SCOPED_TIMER(_aggregator->agg_compute_timer());
@ -116,8 +116,7 @@ Status AggregateStreamingSinkOperator::_push_chunk_by_auto(const size_t chunk_si
_aggregator->compute_batch_agg_states(chunk_size);
}
_mem_tracker->set(_aggregator->hash_map_variant().memory_usage() +
_aggregator->mem_pool()->total_reserved_bytes());
_mem_tracker->set(_aggregator->hash_map_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_map());
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_map_variant().size());

View File

@ -27,6 +27,7 @@ Status AssertNumRowsOperator::prepare(RuntimeState* state) {
}
void AssertNumRowsOperator::close(RuntimeState* state) {
_cur_chunk.reset();
Operator::close(state);
}

View File

@ -0,0 +1,55 @@
// This file is licensed under the Elastic License 2.0. Copyright 2021 StarRocks Limited.
#include "exec/pipeline/chunk_accumulate_operator.h"
#include "column/chunk.h"
#include "runtime/runtime_state.h"
namespace starrocks {
namespace pipeline {
Status ChunkAccumulateOperator::push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) {
DCHECK(_out_chunk == nullptr);
if (_in_chunk == nullptr) {
_in_chunk = chunk;
} else if (_in_chunk->num_rows() + chunk->num_rows() > state->chunk_size()) {
_out_chunk = std::move(_in_chunk);
_in_chunk = chunk;
} else {
_in_chunk->append(*chunk);
}
if (_out_chunk == nullptr && (_in_chunk->num_rows() >= state->chunk_size() * LOW_WATERMARK_ROWS_RATE ||
_in_chunk->memory_usage() >= LOW_WATERMARK_BYTES)) {
_out_chunk = std::move(_in_chunk);
}
return Status::OK();
}
StatusOr<vectorized::ChunkPtr> ChunkAccumulateOperator::pull_chunk(RuntimeState*) {
// If there is no more input chunk and _out_chunk has already been output, output _in_chunk this time.
if (_is_finished && _out_chunk == nullptr) {
return std::move(_in_chunk);
}
return std::move(_out_chunk);
}
Status ChunkAccumulateOperator::set_finishing(RuntimeState* state) {
_is_finished = true;
return Status::OK();
}
Status ChunkAccumulateOperator::set_finished(RuntimeState*) {
_is_finished = true;
_in_chunk.reset();
_out_chunk.reset();
return Status::OK();
}
} // namespace pipeline
} // namespace starrocks

View File

@ -0,0 +1,53 @@
// This file is licensed under the Elastic License 2.0. Copyright 2021-present, StarRocks Limited.
#pragma once
#include "exec/pipeline/operator.h"
namespace starrocks {
class RuntimeState;
namespace pipeline {
// Accumulate input chunks and output a merged chunk once the accumulated number of rows is large enough.
class ChunkAccumulateOperator final : public Operator {
public:
ChunkAccumulateOperator(OperatorFactory* factory, int32_t id, int32_t plan_node_id, int32_t driver_sequence)
: Operator(factory, id, "chunk_accumulate", plan_node_id, driver_sequence) {}
~ChunkAccumulateOperator() override = default;
Status push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;
bool has_output() const override { return _out_chunk != nullptr || (_is_finished && _in_chunk != nullptr); }
bool need_input() const override { return !_is_finished && _out_chunk == nullptr; }
bool is_finished() const override { return _is_finished && _in_chunk == nullptr && _out_chunk == nullptr; }
Status set_finishing(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
private:
static constexpr double LOW_WATERMARK_ROWS_RATE = 0.75; // 0.75 * chunk_size
static constexpr size_t LOW_WATERMARK_BYTES = 256 * 1024 * 1024; // 256MB.
bool _is_finished = false;
vectorized::ChunkPtr _in_chunk = nullptr;
vectorized::ChunkPtr _out_chunk = nullptr;
};
class ChunkAccumulateOperatorFactory final : public OperatorFactory {
public:
ChunkAccumulateOperatorFactory(int32_t id, int32_t plan_node_id)
: OperatorFactory(id, "chunk_accumulate", plan_node_id) {}
~ChunkAccumulateOperatorFactory() override = default;
OperatorPtr create(int32_t degree_of_parallelism, int32_t driver_sequence) override {
return std::make_shared<ChunkAccumulateOperator>(this, _id, _plan_node_id, driver_sequence);
}
};
} // namespace pipeline
} // namespace starrocks
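The operator defined above emits the accumulated chunk once either watermark is crossed. A minimal standalone sketch of that policy, with FakeChunk as a hypothetical stand-in for vectorized::Chunk:

#include <cstddef>

// Hypothetical stand-in for vectorized::Chunk, for illustration only.
struct FakeChunk {
    size_t rows = 0;
    size_t bytes = 0;
};

// Emit once rows reach 0.75 * chunk_size or memory reaches 256MB, mirroring
// LOW_WATERMARK_ROWS_RATE and LOW_WATERMARK_BYTES above.
bool reached_low_watermark(const FakeChunk& in, size_t chunk_size) {
    constexpr double kRowsRate = 0.75;
    constexpr size_t kBytes = 256UL * 1024 * 1024;
    return in.rows >= static_cast<size_t>(chunk_size * kRowsRate) || in.bytes >= kBytes;
}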

View File

@ -12,6 +12,11 @@
#include "runtime/runtime_state.h"
namespace starrocks::pipeline {
void CrossJoinContext::close(RuntimeState* state) {
_build_chunks.clear();
}
Status CrossJoinContext::_init_runtime_filter(RuntimeState* state) {
vectorized::ChunkPtr one_row_chunk = nullptr;
size_t num_rows = 0;

View File

@ -35,12 +35,9 @@ public:
_rf_hub(params.rf_hub),
_rf_descs(std::move(params.rf_descs)) {}
void close(RuntimeState* state) override {}
void close(RuntimeState* state) override;
bool is_build_chunk_empty() const {
return std::all_of(_build_chunks.begin(), _build_chunks.end(),
[](const vectorized::ChunkPtr& chunk) { return chunk == nullptr || chunk->is_empty(); });
}
bool is_build_chunk_empty() const { return _is_build_chunk_empty; }
int32_t num_build_chunks() const { return _num_right_sinkers; }
@ -53,6 +50,9 @@ public:
Status finish_one_right_sinker(RuntimeState* state) {
if (_num_right_sinkers - 1 == _num_finished_right_sinkers.fetch_add(1)) {
RETURN_IF_ERROR(_init_runtime_filter(state));
_is_build_chunk_empty = std::all_of(
_build_chunks.begin(), _build_chunks.end(),
[](const vectorized::ChunkPtr& chunk) { return chunk == nullptr || chunk->is_empty(); });
_all_right_finished.store(true, std::memory_order_release);
}
return Status::OK();
@ -75,6 +75,7 @@ private:
// _build_chunks[i] contains all the rows from the i-th CrossJoinRightSinkOperator.
std::vector<vectorized::ChunkPtr> _build_chunks;
bool _is_build_chunk_empty = false;
// finished flags
std::atomic_bool _all_right_finished = false;
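finish_one_right_sinker above follows a "last finisher does the work" pattern: every sinker bumps the atomic counter, and only the one observing the final count computes the shared state before publishing the release-ordered flag. A condensed sketch of the same idiom, under hypothetical names:

#include <atomic>

class BuildSideBarrier {
public:
    explicit BuildSideBarrier(int num_sinkers) : _num_sinkers(num_sinkers) {}

    // Called exactly once by each right sinker when it finishes.
    void finish_one() {
        if (_num_sinkers - 1 == _num_finished.fetch_add(1)) {
            // Only the last finisher computes derived state single-threaded...
            _is_empty = true; // placeholder: the real code scans the build chunks with std::all_of
            // ...then publishes it; readers pair this store with memory_order_acquire.
            _all_finished.store(true, std::memory_order_release);
        }
    }

    bool all_finished() const { return _all_finished.load(std::memory_order_acquire); }

private:
    const int _num_sinkers;
    std::atomic<int> _num_finished{0};
    bool _is_empty = false;
    std::atomic<bool> _all_finished{false};
};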

View File

@ -13,6 +13,7 @@ Status DictDecodeOperator::prepare(RuntimeState* state) {
}
void DictDecodeOperator::close(RuntimeState* state) {
_cur_chunk.reset();
Operator::close(state);
}

View File

@ -9,6 +9,7 @@
#include <memory>
#include <random>
#include "common/config.h"
#include "exec/pipeline/exchange/sink_buffer.h"
#include "exprs/expr.h"
#include "gen_cpp/Types_types.h"
@ -59,7 +60,8 @@ public:
// Channel will send the input request directly without batching it.
// This function is only used for broadcast, because the request can be reused
// by all the channels.
Status send_chunk_request(PTransmitChunkParamsPtr chunk_request, const butil::IOBuf& attachment);
Status send_chunk_request(PTransmitChunkParamsPtr chunk_request, const butil::IOBuf& attachment,
int64_t attachment_physical_bytes);
// Used when doing shuffle.
// This function will copy selective rows in chunks to batch.
@ -213,12 +215,13 @@ Status ExchangeSinkOperator::Channel::send_one_chunk(const vectorized::Chunk* ch
// Try to accumulate enough bytes before sending an RPC. When eos is true we should send
// the last packet
if (_current_request_bytes > _parent->_request_bytes_threshold || eos) {
if (_current_request_bytes > config::max_transmit_batched_bytes || eos) {
_chunk_request->set_eos(eos);
_chunk_request->set_use_pass_through(_use_pass_through);
butil::IOBuf attachment;
_parent->construct_brpc_attachment(_chunk_request, attachment);
TransmitChunkInfo info = {this->_fragment_instance_id, _brpc_stub, std::move(_chunk_request), attachment};
int64_t attachment_physical_bytes = _parent->construct_brpc_attachment(_chunk_request, attachment);
TransmitChunkInfo info = {this->_fragment_instance_id, _brpc_stub, std::move(_chunk_request), attachment,
attachment_physical_bytes};
_parent->_buffer->add_request(info);
_current_request_bytes = 0;
_chunk_request.reset();
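The send path above accumulates serialized chunks into one request until the byte threshold (or eos) is hit, then flushes it as a single RPC. A minimal sketch of that batching policy, with Batcher, request, and SendFn as hypothetical stand-ins:

#include <cstddef>
#include <string>

struct Batcher {
    size_t threshold;            // stands in for config::max_transmit_batched_bytes
    size_t current_bytes = 0;
    std::string request;         // stands in for the accumulated PTransmitChunkParams

    template <typename SendFn>
    void add(const std::string& serialized_chunk, bool eos, SendFn&& send) {
        request += serialized_chunk;
        current_bytes += serialized_chunk.size();
        if (current_bytes > threshold || eos) {
            send(request);       // flush the accumulated chunks as one RPC
            request.clear();
            current_bytes = 0;
        }
    }
};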
@ -229,14 +232,16 @@ Status ExchangeSinkOperator::Channel::send_one_chunk(const vectorized::Chunk* ch
}
Status ExchangeSinkOperator::Channel::send_chunk_request(PTransmitChunkParamsPtr chunk_request,
const butil::IOBuf& attachment) {
const butil::IOBuf& attachment,
int64_t attachment_physical_bytes) {
chunk_request->set_node_id(_dest_node_id);
chunk_request->set_sender_id(_parent->_sender_id);
chunk_request->set_be_number(_parent->_be_number);
chunk_request->set_eos(false);
chunk_request->set_use_pass_through(_use_pass_through);
TransmitChunkInfo info = {this->_fragment_instance_id, _brpc_stub, std::move(chunk_request), attachment};
TransmitChunkInfo info = {this->_fragment_instance_id, _brpc_stub, std::move(chunk_request), attachment,
attachment_physical_bytes};
_parent->_buffer->add_request(info);
return Status::OK();
@ -367,11 +372,11 @@ bool ExchangeSinkOperator::is_finished() const {
}
bool ExchangeSinkOperator::need_input() const {
return !is_finished() && !_buffer->is_full();
return !is_finished() && _buffer != nullptr && !_buffer->is_full();
}
bool ExchangeSinkOperator::pending_finish() const {
return !_buffer->is_finished();
return _buffer != nullptr && !_buffer->is_finished();
}
Status ExchangeSinkOperator::set_cancelled(RuntimeState* state) {
@ -425,13 +430,14 @@ Status ExchangeSinkOperator::push_chunk(RuntimeState* state, const vectorized::C
RETURN_IF_ERROR(serialize_chunk(send_chunk, pchunk, &_is_first_chunk, _channels.size())));
_current_request_bytes += pchunk->data().size();
// 3. if the request bytes exceed the threshold, send the current request
if (_current_request_bytes > _request_bytes_threshold) {
if (_current_request_bytes > config::max_transmit_batched_bytes) {
butil::IOBuf attachment;
construct_brpc_attachment(_chunk_request, attachment);
int64_t attachment_physical_bytes = construct_brpc_attachment(_chunk_request, attachment);
for (auto idx : _channel_indices) {
if (!_channels[idx]->use_pass_through()) {
PTransmitChunkParamsPtr copy = std::make_shared<PTransmitChunkParams>(*_chunk_request);
RETURN_IF_ERROR(_channels[idx]->send_chunk_request(copy, attachment));
RETURN_IF_ERROR(
_channels[idx]->send_chunk_request(copy, attachment, attachment_physical_bytes));
}
}
_current_request_bytes = 0;
@ -525,10 +531,10 @@ Status ExchangeSinkOperator::set_finishing(RuntimeState* state) {
if (_chunk_request != nullptr) {
butil::IOBuf attachment;
construct_brpc_attachment(_chunk_request, attachment);
int64_t attachment_physical_bytes = construct_brpc_attachment(_chunk_request, attachment);
for (const auto& channel : _channels) {
PTransmitChunkParamsPtr copy = std::make_shared<PTransmitChunkParams>(*_chunk_request);
channel->send_chunk_request(copy, attachment);
channel->send_chunk_request(copy, attachment, attachment_physical_bytes);
}
_current_request_bytes = 0;
_chunk_request.reset();
@ -602,17 +608,25 @@ Status ExchangeSinkOperator::serialize_chunk(const vectorized::Chunk* src, Chunk
return Status::OK();
}
void ExchangeSinkOperator::construct_brpc_attachment(PTransmitChunkParamsPtr chunk_request, butil::IOBuf& attachment) {
int64_t ExchangeSinkOperator::construct_brpc_attachment(PTransmitChunkParamsPtr chunk_request,
butil::IOBuf& attachment) {
int64_t attachment_physical_bytes = 0;
for (int i = 0; i < chunk_request->chunks().size(); ++i) {
auto chunk = chunk_request->mutable_chunks(i);
chunk->set_data_size(chunk->data().size());
int64_t before_bytes = CurrentThread::current().get_consumed_bytes();
attachment.append(chunk->data());
attachment_physical_bytes += CurrentThread::current().get_consumed_bytes() - before_bytes;
chunk->clear_data();
// If the request is too big, free the memory in order to avoid OOM
if (_is_large_chunk(chunk->data_size())) {
chunk->mutable_data()->shrink_to_fit();
}
}
return attachment_physical_bytes;
}
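construct_brpc_attachment now returns the physical bytes of the attachment by sampling the thread-local allocation counter before and after each append (CurrentThread::current().get_consumed_bytes() in the code above). A sketch of that measurement idiom, with g_consumed_bytes as a hypothetical counter that a real allocator hook would advance:

#include <cstdint>
#include <string>

// Hypothetical: a real implementation advances this from the allocator hooks.
thread_local int64_t g_consumed_bytes = 0;

int64_t append_and_measure(std::string& attachment, const std::string& data) {
    int64_t before = g_consumed_bytes;
    attachment.append(data);          // may allocate; the hook advances the counter
    return g_consumed_bytes - before; // physical bytes attributed to this append
}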
ExchangeSinkOperatorFactory::ExchangeSinkOperatorFactory(
@ -653,6 +667,7 @@ Status ExchangeSinkOperatorFactory::prepare(RuntimeState* state) {
}
void ExchangeSinkOperatorFactory::close(RuntimeState* state) {
_buffer.reset();
Expr::close(_partition_expr_ctxs, state);
OperatorFactory::close(state);
}

View File

@ -63,7 +63,8 @@ public:
// For other chunk, only serialize the chunk data to ChunkPB.
Status serialize_chunk(const vectorized::Chunk* chunk, ChunkPB* dst, bool* is_first_chunk, int num_receivers = 1);
void construct_brpc_attachment(PTransmitChunkParamsPtr _chunk_request, butil::IOBuf& attachment);
// Returns the physical bytes of the attachment.
int64_t construct_brpc_attachment(PTransmitChunkParamsPtr _chunk_request, butil::IOBuf& attachment);
private:
bool _is_large_chunk(size_t sz) const {
@ -107,7 +108,6 @@ private:
// Only used when broadcast
PTransmitChunkParamsPtr _chunk_request;
size_t _current_request_bytes = 0;
size_t _request_bytes_threshold = config::max_transmit_batched_bytes;
bool _is_first_chunk = true;

View File

@ -86,7 +86,7 @@ Status PartitionExchanger::accept(const vectorized::ChunkPtr& chunk, const int32
// and used later in pull_chunk() of the source operator. If we reused partition_row_indexes in the partitioner,
// it would be overwritten by the next call to partitioner.partition_chunk().
std::shared_ptr<std::vector<uint32_t>> partition_row_indexes = std::make_shared<std::vector<uint32_t>>(num_rows);
partitioner.partition_chunk(chunk, *partition_row_indexes);
RETURN_IF_ERROR(partitioner.partition_chunk(chunk, *partition_row_indexes));
for (size_t i = 0; i < _source->get_sources().size(); ++i) {
size_t from = partitioner.partition_begin_offset(i);

View File

@ -4,6 +4,11 @@
#include <chrono>
DIAGNOSTIC_PUSH
DIAGNOSTIC_IGNORE("-Wclass-memaccess")
#include <bthread/bthread.h>
DIAGNOSTIC_POP
#include "fmt/core.h"
#include "util/time.h"
#include "util/uid_util.h"
@ -86,14 +91,19 @@ bool SinkBuffer::is_full() const {
for (auto& [_, buffer] : _buffers) {
buffer_size += buffer.size();
}
bool is_full = buffer_size > max_buffer_size;
const bool is_full = buffer_size > max_buffer_size;
if (is_full && _last_full_timestamp == -1) {
_last_full_timestamp = MonotonicNanos();
int64_t last_full_timestamp = _last_full_timestamp;
int64_t full_time = _full_time;
if (is_full && last_full_timestamp == -1) {
_last_full_timestamp.compare_exchange_weak(last_full_timestamp, MonotonicNanos());
}
if (!is_full && _last_full_timestamp != -1) {
_full_time += (MonotonicNanos() - _last_full_timestamp);
_last_full_timestamp = -1;
if (!is_full && last_full_timestamp != -1) {
// The following two update operations cannot guarantee atomicity as a whole without a lock,
// but we can accept bias in the estimation
_full_time.compare_exchange_weak(full_time, full_time + (MonotonicNanos() - last_full_timestamp));
_last_full_timestamp.compare_exchange_weak(last_full_timestamp, -1);
}
return is_full;
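The timestamps above were turned into atomics so that concurrent is_full() callers cannot tear reads, and the CAS pairs deliberately tolerate the occasional lost update, as the comment notes. A condensed sketch of the same idiom:

#include <atomic>
#include <chrono>
#include <cstdint>

static int64_t monotonic_nanos() {
    using namespace std::chrono;
    return duration_cast<nanoseconds>(steady_clock::now().time_since_epoch()).count();
}

struct FullTimeTracker {
    mutable std::atomic<int64_t> last_full_ts{-1};
    mutable std::atomic<int64_t> full_time{0};

    void observe(bool is_full) const {
        int64_t ts = last_full_ts.load();
        int64_t total = full_time.load();
        if (is_full && ts == -1) {
            // Start timing; a losing CAS just means another thread already started it.
            last_full_ts.compare_exchange_weak(ts, monotonic_nanos());
        } else if (!is_full && ts != -1) {
            // The two updates are not atomic as a whole; a small bias is acceptable.
            full_time.compare_exchange_weak(total, total + (monotonic_nanos() - ts));
            last_full_ts.compare_exchange_weak(ts, -1);
        }
    }
};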
@ -218,12 +228,17 @@ void SinkBuffer::_try_to_send_rpc(const TUniqueId& instance_id, std::function<vo
return;
}
TransmitChunkInfo request = buffer.front();
TransmitChunkInfo& request = buffer.front();
bool need_wait = false;
DeferOp pop_defer([&need_wait, &buffer]() {
DeferOp pop_defer([&need_wait, &buffer, mem_tracker = _mem_tracker]() {
if (need_wait) {
return;
}
// The request memory is acquired by ExchangeSinkOperator,
// so use the instance_mem_tracker passed from ExchangeSinkOperator to release memory.
// This must be invoked before decrease_defer is destructed, to avoid sink_buffer and fragment_ctx being released.
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(mem_tracker);
buffer.pop();
});
@ -311,10 +326,23 @@ void SinkBuffer::_try_to_send_rpc(const TUniqueId& instance_id, std::function<vo
++_total_in_flight_rpc;
++_num_in_flight_rpcs[instance_id.lo];
// The attachment will be released by process_mem_tracker in closure->Run() in a bthread when the response is received,
// so decrease the memory usage of the attachment from instance_mem_tracker immediately before sending the request.
_mem_tracker->release(request.attachment_physical_bytes);
ExecEnv::GetInstance()->process_mem_tracker()->consume(request.attachment_physical_bytes);
closure->cntl.Reset();
closure->cntl.set_timeout_ms(_brpc_timeout_ms);
closure->cntl.request_attachment().append(request.attachment);
request.brpc_stub->transmit_chunk(&closure->cntl, request.params.get(), &closure->result, closure);
if (bthread_self()) {
request.brpc_stub->transmit_chunk(&closure->cntl, request.params.get(), &closure->result, closure);
} else {
// When the driver worker thread sends the request and creates the protobuf request,
// also use process_mem_tracker to record the memory of the protobuf request.
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(nullptr);
request.brpc_stub->transmit_chunk(&closure->cntl, request.params.get(), &closure->result, closure);
}
return;
}

View File

@ -32,6 +32,7 @@ struct TransmitChunkInfo {
doris::PBackendService_Stub* brpc_stub;
PTransmitChunkParamsPtr params;
butil::IOBuf attachment;
int64_t attachment_physical_bytes;
};
struct ClosureContext {
@ -104,7 +105,7 @@ private:
int64_t _network_time();
FragmentContext* _fragment_ctx;
const MemTracker* _mem_tracker;
MemTracker* const _mem_tracker;
const int32_t _brpc_timeout_ms;
const bool _is_dest_merge;
@ -157,8 +158,8 @@ private:
std::atomic<int64_t> _request_sent = 0;
int64_t _pending_timestamp = -1;
mutable int64_t _last_full_timestamp = -1;
mutable int64_t _full_time = 0;
mutable std::atomic<int64_t> _last_full_timestamp = -1;
mutable std::atomic<int64_t> _full_time = 0;
};
} // namespace starrocks::pipeline

View File

@ -80,7 +80,9 @@ void FragmentContext::prepare_pass_through_chunk_buffer() {
_runtime_state->exec_env()->stream_mgr()->prepare_pass_through_chunk_buffer(_query_id);
}
void FragmentContext::destroy_pass_through_chunk_buffer() {
_runtime_state->exec_env()->stream_mgr()->destroy_pass_through_chunk_buffer(_query_id);
if (_runtime_state) {
_runtime_state->exec_env()->stream_mgr()->destroy_pass_through_chunk_buffer(_query_id);
}
}
} // namespace starrocks::pipeline

View File

@ -60,7 +60,6 @@ Status FragmentExecutor::_prepare_query_ctx(ExecEnv* exec_env, const TExecPlanFr
const auto& params = request.params;
const auto& query_id = params.query_id;
const auto& fragment_instance_id = params.fragment_instance_id;
const auto& query_options = request.query_options;
auto&& existing_query_ctx = exec_env->query_context_mgr()->get(query_id);
if (existing_query_ctx) {
@ -75,14 +74,12 @@ Status FragmentExecutor::_prepare_query_ctx(ExecEnv* exec_env, const TExecPlanFr
if (params.__isset.instances_number) {
_query_ctx->set_total_fragments(params.instances_number);
}
if (query_options.__isset.query_timeout) {
_query_ctx->set_expire_seconds(std::max<int>(query_options.query_timeout, 1));
} else {
_query_ctx->set_expire_seconds(300);
}
_query_ctx->set_delivery_expire_seconds(_calc_delivery_expired_seconds(request));
_query_ctx->set_query_expire_seconds(_calc_query_expired_seconds(request));
// initialize query's deadline
_query_ctx->extend_lifetime();
_query_ctx->extend_delivery_lifetime();
_query_ctx->extend_query_lifetime();
return Status::OK();
}
@ -204,6 +201,33 @@ int32_t FragmentExecutor::_calc_dop(ExecEnv* exec_env, const TExecPlanFragmentPa
return exec_env->calc_pipeline_dop(degree_of_parallelism);
}
int FragmentExecutor::_calc_delivery_expired_seconds(const TExecPlanFragmentParams& request) const {
const auto& query_options = request.query_options;
int expired_seconds = QueryContext::DEFAULT_EXPIRE_SECONDS;
if (query_options.__isset.query_delivery_timeout) {
if (query_options.__isset.query_timeout) {
expired_seconds = std::min(query_options.query_timeout, query_options.query_delivery_timeout);
} else {
expired_seconds = query_options.query_delivery_timeout;
}
} else if (query_options.__isset.query_timeout) {
expired_seconds = query_options.query_timeout;
}
return std::max<int>(1, expired_seconds);
}
int FragmentExecutor::_calc_query_expired_seconds(const TExecPlanFragmentParams& request) const {
const auto& query_options = request.query_options;
if (query_options.__isset.query_timeout) {
return std::max<int>(1, query_options.query_timeout);
}
return QueryContext::DEFAULT_EXPIRE_SECONDS;
}
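A compact restatement of the two derivations above (hypothetical free function; values in seconds): the delivery timeout is capped by the query timeout when both are set, and both fall back to QueryContext::DEFAULT_EXPIRE_SECONDS with a floor of one second.

#include <algorithm>
#include <optional>

int calc_delivery_expire_seconds(std::optional<int> query_timeout, std::optional<int> delivery_timeout,
                                 int default_seconds = 300) {
    int seconds = default_seconds;
    if (delivery_timeout) {
        seconds = query_timeout ? std::min(*query_timeout, *delivery_timeout) : *delivery_timeout;
    } else if (query_timeout) {
        seconds = *query_timeout;
    }
    return std::max(1, seconds); // guard against non-positive timeouts
}
// e.g. query_timeout=300, query_delivery_timeout=60 -> delivery expires after 60s,
// while the query deadline itself stays at 300s.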
Status FragmentExecutor::_prepare_exec_plan(ExecEnv* exec_env, const TExecPlanFragmentParams& request) {
auto* runtime_state = _fragment_ctx->runtime_state();
auto* obj_pool = runtime_state->obj_pool();
@ -232,6 +256,7 @@ Status FragmentExecutor::_prepare_exec_plan(ExecEnv* exec_env, const TExecPlanFr
std::vector<TScanRangeParams> no_scan_ranges;
plan->collect_scan_nodes(&scan_nodes);
int64_t sum_scan_limit = 0;
MorselQueueMap& morsel_queues = _fragment_ctx->morsel_queues();
for (auto& i : scan_nodes) {
ScanNode* scan_node = down_cast<ScanNode*>(i);
@ -240,6 +265,22 @@ Status FragmentExecutor::_prepare_exec_plan(ExecEnv* exec_env, const TExecPlanFr
ASSIGN_OR_RETURN(MorselQueuePtr morsel_queue,
scan_node->convert_scan_range_to_morsel_queue(scan_ranges, scan_node->id(), request));
morsel_queues.emplace(scan_node->id(), std::move(morsel_queue));
if (scan_node->limit() > 0) {
sum_scan_limit += scan_node->limit();
}
}
int dop = exec_env->calc_pipeline_dop(request.pipeline_dop);
if (_wg && _wg->big_query_scan_rows_limit() > 0) {
// For SQL like: select * from xxx limit 5, the underlying scan_limit should be 5 * parallelism.
// Otherwise this SQL would exceed big_query_scan_rows_limit due to the underlying IO parallelization
if (sum_scan_limit <= _wg->big_query_scan_rows_limit()) {
int parallelism = dop * ScanOperator::MAX_IO_TASKS_PER_OP;
int64_t parallel_scan_limit = sum_scan_limit * parallelism;
_query_ctx->set_scan_limit(parallel_scan_limit);
} else {
_query_ctx->set_scan_limit(_wg->big_query_scan_rows_limit());
}
}
return Status::OK();
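As a worked example of the scan-limit adjustment above: for select * from t limit 5, assuming (hypothetically) dop = 8 and ScanOperator::MAX_IO_TASKS_PER_OP = 4, the parallelism is 32, and since the SQL-level limit of 5 does not exceed big_query_scan_rows_limit, the query-level scan limit becomes 5 * 32 = 160 rows rather than 5; only when the summed SQL-level limit already exceeds the workgroup limit is big_query_scan_rows_limit used directly.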
@ -291,7 +332,8 @@ Status FragmentExecutor::_prepare_pipeline_driver(ExecEnv* exec_env, const TExec
auto source_id = pipeline->get_op_factories()[0]->plan_node_id();
DCHECK(morsel_queues.count(source_id));
auto& morsel_queue = morsel_queues[source_id];
DCHECK(morsel_queue->num_morsels() == 0 || cur_pipeline_dop <= morsel_queue->num_morsels());
DCHECK(morsel_queue->max_degree_of_parallelism() == 0 ||
cur_pipeline_dop <= morsel_queue->max_degree_of_parallelism());
for (size_t i = 0; i < cur_pipeline_dop; ++i) {
auto&& operators = pipeline->create_operators(cur_pipeline_dop, i);
@ -411,6 +453,8 @@ void FragmentExecutor::_fail_cleanup() {
if (_query_ctx) {
if (_fragment_ctx) {
_query_ctx->fragment_mgr()->unregister(_fragment_ctx->fragment_instance_id());
_fragment_ctx->destroy_pass_through_chunk_buffer();
_fragment_ctx.reset();
}
if (_query_ctx->count_down_fragments()) {
auto query_id = _query_ctx->query_id();

View File

@ -27,6 +27,8 @@ public:
private:
void _fail_cleanup();
int32_t _calc_dop(ExecEnv* exec_env, const TExecPlanFragmentParams& request) const;
int _calc_delivery_expired_seconds(const TExecPlanFragmentParams& request) const;
int _calc_query_expired_seconds(const TExecPlanFragmentParams& request) const;
// Several steps to prepare a fragment
// 1. query context

View File

@ -13,6 +13,7 @@
namespace starrocks::pipeline {
/// Operator.
const int32_t Operator::s_pseudo_plan_node_id_for_result_sink = -99;
const int32_t Operator::s_pseudo_plan_node_id_upper_bound = -100;
@ -114,6 +115,9 @@ Status Operator::eval_conjuncts_and_in_filters(const std::vector<ExprContext*>&
in_filters.end());
_conjuncts_and_in_filters_is_cached = true;
}
if (_cached_conjuncts_and_in_filters.empty()) {
return Status::OK();
}
if (chunk == nullptr || chunk->is_empty()) {
return Status::OK();
}
@ -166,7 +170,7 @@ void Operator::_init_rf_counters(bool init_bloom) {
void Operator::_init_conjuct_counters() {
if (_conjuncts_timer == nullptr) {
_conjuncts_timer = ADD_TIMER(_common_metrics, "JoinRuntimeFilterTime");
_conjuncts_timer = ADD_TIMER(_common_metrics, "ConjunctsTime");
_conjuncts_input_counter = ADD_COUNTER(_common_metrics, "ConjunctsInputRows", TUnit::UNIT);
_conjuncts_output_counter = ADD_COUNTER(_common_metrics, "ConjunctsOutputRows", TUnit::UNIT);
_conjuncts_eval_counter = ADD_COUNTER(_common_metrics, "ConjunctsEvaluate", TUnit::UNIT);
@ -201,6 +205,7 @@ Status OperatorFactory::prepare(RuntimeState* state) {
return Status::OK();
}
/// OperatorFactory.
void OperatorFactory::close(RuntimeState* state) {
if (_runtime_filter_collector) {
_runtime_filter_collector->close(state);
@ -224,4 +229,18 @@ void OperatorFactory::_prepare_runtime_in_filters(RuntimeState* state) {
}
}
bool OperatorFactory::has_runtime_filters() const {
// Check runtime in-filters.
if (!_rf_waiting_set.empty()) {
return true;
}
// Check runtime bloom-filters.
if (_runtime_filter_collector == nullptr) {
return false;
}
auto* global_rf_collector = _runtime_filter_collector->get_rf_probe_collector();
return global_rf_collector != nullptr && !global_rf_collector->descriptors().empty();
}
} // namespace starrocks::pipeline

View File

@ -268,6 +268,10 @@ public:
RowDescriptor* row_desc() { return &_row_desc; }
// Whether it has any runtime in-filter or bloom-filter.
// MUST be invoked after init_runtime_filter.
bool has_runtime_filters() const;
protected:
void _prepare_runtime_in_filters(RuntimeState* state);

View File

@ -160,6 +160,22 @@ MorselQueue* PipelineBuilderContext::morsel_queue_of_source_operator(const Sourc
return morsel_queues[source_id].get();
}
size_t PipelineBuilderContext::degree_of_parallelism_of_source_operator(int32_t source_node_id) const {
auto& morsel_queues = _fragment_context->morsel_queues();
auto it = morsel_queues.find(source_node_id);
if (it == morsel_queues.end()) {
return _degree_of_parallelism;
}
// The degree of parallelism of a SourceOperator with morsels is not more than the number of morsels.
// If the table is empty, the morsel count is zero and we still set the degree of parallelism to 1
return std::min<size_t>(std::max<size_t>(1, it->second->max_degree_of_parallelism()), _degree_of_parallelism);
}
size_t PipelineBuilderContext::degree_of_parallelism_of_source_operator(const SourceOperatorFactory* source_op) const {
return degree_of_parallelism_of_source_operator(source_op->plan_node_id());
}
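A one-line sketch of the clamp above, plus its two edge cases:

#include <algorithm>
#include <cstddef>

// Never more drivers than the fragment DOP, never fewer than one.
size_t source_dop(size_t max_dop_of_morsel_queue, size_t fragment_dop) {
    return std::min(std::max<size_t>(1, max_dop_of_morsel_queue), fragment_dop);
}
// source_dop(3, 16) == 3; source_dop(0, 16) == 1 (an empty table still gets one driver).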
Pipelines PipelineBuilder::build(const FragmentContext& fragment, ExecNode* exec_node) {
pipeline::OpFactories operators = exec_node->decompose_to_pipeline(&_context);
_context.add_pipeline(operators);

View File

@ -59,6 +59,8 @@ public:
FragmentContext* fragment_context() { return _fragment_context; }
MorselQueue* morsel_queue_of_source_operator(const SourceOperatorFactory* source_op);
size_t degree_of_parallelism_of_source_operator(int32_t source_node_id) const;
size_t degree_of_parallelism_of_source_operator(const SourceOperatorFactory* source_op) const;
private:
static constexpr int kLocalExchangeBufferChunks = 8;

View File

@ -43,6 +43,7 @@ Status PipelineDriver::prepare(RuntimeState* runtime_state) {
_first_input_empty_timer = ADD_CHILD_TIMER(_runtime_profile, "FirstInputEmptyTime", "InputEmptyTime");
_followup_input_empty_timer = ADD_CHILD_TIMER(_runtime_profile, "FollowupInputEmptyTime", "InputEmptyTime");
_output_full_timer = ADD_CHILD_TIMER(_runtime_profile, "OutputFullTime", "PendingTime");
_pending_finish_timer = ADD_CHILD_TIMER(_runtime_profile, "PendingFinishTime", "PendingTime");
DCHECK(_state == DriverState::NOT_READY);
@ -92,11 +93,13 @@ Status PipelineDriver::prepare(RuntimeState* runtime_state) {
_precondition_block_timer_sw = runtime_state->obj_pool()->add(new MonotonicStopWatch());
_input_empty_timer_sw = runtime_state->obj_pool()->add(new MonotonicStopWatch());
_output_full_timer_sw = runtime_state->obj_pool()->add(new MonotonicStopWatch());
_pending_finish_timer_sw = runtime_state->obj_pool()->add(new MonotonicStopWatch());
_total_timer_sw->start();
_pending_timer_sw->start();
_precondition_block_timer_sw->start();
_input_empty_timer_sw->start();
_output_full_timer_sw->start();
_pending_finish_timer_sw->start();
return Status::OK();
}

View File

@ -190,10 +190,12 @@ public:
case DriverState::OUTPUT_FULL:
_output_full_timer->update(_output_full_timer_sw->elapsed_time());
break;
case DriverState::PRECONDITION_BLOCK: {
case DriverState::PRECONDITION_BLOCK:
_precondition_block_timer->update(_precondition_block_timer_sw->elapsed_time());
break;
}
case DriverState::PENDING_FINISH:
_pending_finish_timer->update(_pending_finish_timer_sw->elapsed_time());
break;
default:
break;
}
@ -208,6 +210,9 @@ public:
case DriverState::PRECONDITION_BLOCK:
_precondition_block_timer_sw->reset();
break;
case DriverState::PENDING_FINISH:
_pending_finish_timer_sw->reset();
break;
default:
break;
}
@ -420,12 +425,14 @@ private:
RuntimeProfile::Counter* _first_input_empty_timer = nullptr;
RuntimeProfile::Counter* _followup_input_empty_timer = nullptr;
RuntimeProfile::Counter* _output_full_timer = nullptr;
RuntimeProfile::Counter* _pending_finish_timer = nullptr;
MonotonicStopWatch* _total_timer_sw = nullptr;
MonotonicStopWatch* _pending_timer_sw = nullptr;
MonotonicStopWatch* _precondition_block_timer_sw = nullptr;
MonotonicStopWatch* _input_empty_timer_sw = nullptr;
MonotonicStopWatch* _output_full_timer_sw = nullptr;
MonotonicStopWatch* _pending_finish_timer_sw = nullptr;
};
} // namespace pipeline

View File

@ -5,6 +5,7 @@
#include <memory>
#include "exec/workgroup/work_group.h"
#include "gen_cpp/Types_types.h"
#include "gutil/strings/substitute.h"
#include "runtime/current_thread.h"
#include "util/defer_op.h"
@ -78,6 +79,10 @@ void GlobalDriverExecutor::_worker_thread() {
if (_num_threads_setter.should_shrink()) {
break;
}
// Reset TLS state
CurrentThread::current().set_query_id({});
CurrentThread::current().set_fragment_instance_id({});
CurrentThread::current().set_pipeline_driver_id(0);
auto maybe_driver = this->_driver_queue->take(worker_id);
if (maybe_driver.status().is_cancelled()) {
@ -88,9 +93,9 @@ void GlobalDriverExecutor::_worker_thread() {
auto* query_ctx = driver->query_ctx();
auto* fragment_ctx = driver->fragment_ctx();
tls_thread_status.set_query_id(query_ctx->query_id());
tls_thread_status.set_fragment_instance_id(fragment_ctx->fragment_instance_id());
tls_thread_status.set_pipeline_driver_id(driver->driver_id());
CurrentThread::current().set_query_id(query_ctx->query_id());
CurrentThread::current().set_fragment_instance_id(fragment_ctx->fragment_instance_id());
CurrentThread::current().set_pipeline_driver_id(driver->driver_id());
// TODO(trueeyu): This is written to ensure that the MemTracker will not be destructed before the thread ends.
// This approach is a bit tricky; replace it when there is a better way
@ -302,10 +307,11 @@ void GlobalDriverExecutor::_simplify_common_metrics(RuntimeProfile* driver_profi
DCHECK(common_metrics != nullptr);
// Remove runtime-filter-related counters if their value is 0
static std::string counter_names[] = {
"RuntimeInFilterNum", "RuntimeBloomFilterNum", "JoinRuntimeFilterInputRows",
"JoinRuntimeFilterOutputRows", "JoinRuntimeFilterEvaluate", "JoinRuntimeFilterTime",
"ConjunctsInputRows", "ConjunctsOutputRows", "ConjunctsEvaluate"};
static std::string counter_names[] = {"RuntimeInFilterNum", "RuntimeBloomFilterNum",
"JoinRuntimeFilterInputRows", "JoinRuntimeFilterOutputRows",
"JoinRuntimeFilterEvaluate", "JoinRuntimeFilterTime",
"ConjunctsInputRows", "ConjunctsOutputRows",
"ConjunctsEvaluate", "ConjunctsTime"};
for (auto& name : counter_names) {
auto* counter = common_metrics->get_counter(name);
if (counter != nullptr && counter->value() == 0) {

View File

@ -53,7 +53,7 @@ void PipelineDriverPoller::run_internal() {
while (driver_it != local_blocked_drivers.end()) {
auto* driver = *driver_it;
if (driver->query_ctx()->is_expired()) {
if (driver->query_ctx()->is_query_expired()) {
// if no driver belonging to a query context can make progress for an expiration period, it
// indicates that some fragments are missing because of a failed exec_plan_fragment invocation. in
// this situation, the query eventually fails, so the drivers are marked PENDING_FINISH/FINISH.
@ -63,7 +63,7 @@ void PipelineDriverPoller::run_internal() {
LOG(WARNING) << "[Driver] Timeout, query_id=" << print_id(driver->query_ctx()->query_id())
<< ", instance_id=" << print_id(driver->fragment_ctx()->fragment_instance_id());
driver->fragment_ctx()->cancel(Status::TimedOut(fmt::format(
"Query exceeded time limit of {} seconds", driver->query_ctx()->get_expire_seconds())));
"Query exceeded time limit of {} seconds", driver->query_ctx()->get_query_expire_seconds())));
driver->cancel_operators(driver->fragment_ctx()->runtime_state());
if (driver->is_still_pending_finish()) {
driver->set_driver_state(DriverState::PENDING_FINISH);
@ -137,10 +137,10 @@ void PipelineDriverPoller::run_internal() {
}
void PipelineDriverPoller::add_blocked_driver(const DriverRawPtr driver) {
std::unique_lock<std::mutex> lock(this->_mutex);
this->_blocked_drivers.push_back(driver);
std::unique_lock<std::mutex> lock(_mutex);
_blocked_drivers.push_back(driver);
driver->_pending_timer_sw->reset();
this->_cond.notify_one();
_cond.notify_one();
}
void PipelineDriverPoller::remove_blocked_driver(DriverList& local_blocked_drivers, DriverList::iterator& driver_it) {

View File

@ -15,6 +15,7 @@ Status ProjectOperator::prepare(RuntimeState* state) {
}
void ProjectOperator::close(RuntimeState* state) {
_cur_chunk.reset();
Operator::close(state);
}

View File

@ -15,8 +15,7 @@ QueryContext::QueryContext()
: _fragment_mgr(new FragmentContextManager()),
_total_fragments(0),
_num_fragments(0),
_num_active_fragments(0),
_deadline(0) {}
_num_active_fragments(0) {}
QueryContext::~QueryContext() {
// When destructing FragmentContextManager, we use the query-level MemTracker. since when PipelineDriver executor
@ -122,7 +121,7 @@ void QueryContextManager::_clean_slot_unlocked(size_t i) {
auto& sc_map = _second_chance_maps[i];
auto sc_it = sc_map.begin();
while (sc_it != sc_map.end()) {
if (sc_it->second->has_no_active_instances() && sc_it->second->is_expired()) {
if (sc_it->second->has_no_active_instances() && sc_it->second->is_delivery_expired()) {
sc_it = sc_map.erase(sc_it);
} else {
++sc_it;
@ -260,7 +259,7 @@ bool QueryContextManager::remove(const TUniqueId& query_id) {
// in the future, so extend the lifetime of query context and wait for some time till fragments on wire have
// vanished
auto ctx = std::move(it->second);
ctx->extend_lifetime();
ctx->extend_delivery_lifetime();
context_map.erase(it);
sc_map.emplace(query_id, std::move(ctx));
return false;

View File

@ -45,18 +45,28 @@ public:
int num_active_fragments() const { return _num_active_fragments.load(); }
bool has_no_active_instances() { return _num_active_fragments.load() == 0; }
void set_expire_seconds(int expire_seconds) { _expire_seconds = seconds(expire_seconds); }
inline int get_expire_seconds() { return _expire_seconds.count(); }
void set_delivery_expire_seconds(int expire_seconds) { _delivery_expire_seconds = seconds(expire_seconds); }
void set_query_expire_seconds(int expire_seconds) { _query_expire_seconds = seconds(expire_seconds); }
inline int get_query_expire_seconds() const { return _query_expire_seconds.count(); }
// Whether the current time point has passed the deadline.
bool is_expired() {
bool is_delivery_expired() const {
auto now = duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
return now > _deadline;
return now > _delivery_deadline;
}
bool is_query_expired() const {
auto now = duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
return now > _query_deadline;
}
bool is_dead() { return _num_active_fragments == 0 && _num_fragments == _total_fragments; }
// extend the deadline by the expiration seconds
void extend_lifetime() {
_deadline = duration_cast<milliseconds>(steady_clock::now().time_since_epoch() + _expire_seconds).count();
void extend_delivery_lifetime() {
_delivery_deadline =
duration_cast<milliseconds>(steady_clock::now().time_since_epoch() + _delivery_expire_seconds).count();
}
void extend_query_lifetime() {
_query_deadline =
duration_cast<milliseconds>(steady_clock::now().time_since_epoch() + _query_expire_seconds).count();
}
FragmentContextManager* fragment_mgr();
@ -103,6 +113,12 @@ public:
int64_t query_begin_time() const { return _query_begin_time; }
void init_query_begin_time() { _query_begin_time = MonotonicNanos(); }
void set_scan_limit(int64_t scan_limit) { _scan_limit = scan_limit; }
int64_t get_scan_limit() const { return _scan_limit; }
public:
static constexpr int DEFAULT_EXPIRE_SECONDS = 300;
private:
ExecEnv* _exec_env = nullptr;
TUniqueId _query_id;
@ -110,8 +126,10 @@ private:
size_t _total_fragments;
std::atomic<size_t> _num_fragments;
std::atomic<size_t> _num_active_fragments;
int64_t _deadline;
seconds _expire_seconds;
int64_t _delivery_deadline = 0;
int64_t _query_deadline = 0;
seconds _delivery_expire_seconds = seconds(DEFAULT_EXPIRE_SECONDS);
seconds _query_expire_seconds = seconds(DEFAULT_EXPIRE_SECONDS);
bool _is_runtime_filter_coordinator = false;
std::once_flag _init_mem_tracker_once;
std::shared_ptr<RuntimeProfile> _profile;
@ -125,6 +143,7 @@ private:
std::atomic<int64_t> _cur_scan_rows_num = 0;
std::atomic<int64_t> _cur_scan_bytes = 0;
int64_t _scan_limit = 0;
int64_t _init_wg_cpu_cost = 0;
};
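The net effect of the refactoring above is two independent clocks: the delivery deadline bounds how long the QueryContext waits for all fragment instances to arrive and is what the second-chance cleanup checks via is_delivery_expired(), while the query deadline bounds total execution time and is what the driver poller checks via is_query_expired(); both default to DEFAULT_EXPIRE_SECONDS (300 seconds) when the corresponding query option is unset.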

View File

@ -5,6 +5,7 @@
#include "column/chunk.h"
#include "exprs/expr.h"
#include "runtime/buffer_control_block.h"
#include "runtime/current_thread.h"
#include "runtime/exec_env.h"
#include "runtime/mysql_result_writer.h"
#include "runtime/query_statistics.h"
@ -72,13 +73,23 @@ StatusOr<vectorized::ChunkPtr> ResultSinkOperator::pull_chunk(RuntimeState* stat
CHECK(false) << "Shouldn't pull chunk from result sink operator";
}
Status ResultSinkOperator::set_cancelled(RuntimeState* state) {
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(nullptr);
_fetch_data_result.clear();
return Status::OK();
}
bool ResultSinkOperator::need_input() const {
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(nullptr);
if (is_finished()) {
return false;
}
if (_fetch_data_result.empty()) {
return true;
}
auto* mysql_writer = down_cast<MysqlResultWriter*>(_writer.get());
auto status = mysql_writer->try_add_batch(_fetch_data_result);
if (status.ok()) {
@ -90,10 +101,20 @@ bool ResultSinkOperator::need_input() const {
}
Status ResultSinkOperator::push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) {
// The ResultWriter memory that sends the results is no longer recorded against the query memory.
// There are two reasons:
// 1. The query result has already been produced; if the memory limit is triggered afterwards,
//    canceling the query is unnecessary.
// 2. If this memory were counted, the memory of the receiving thread would also need to be recorded,
//    and the life cycle of the MemTracker would need to be considered.
//
// All the places that acquire and release memory of _fetch_data_result must use process_mem_tracker.
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(nullptr);
if (!_last_error.ok()) {
return _last_error;
}
DCHECK(_fetch_data_result.empty());
auto* mysql_writer = down_cast<MysqlResultWriter*>(_writer.get());
auto status = mysql_writer->process_chunk_for_pipeline(chunk.get());
if (status.ok()) {
@ -120,4 +141,5 @@ void ResultSinkOperatorFactory::close(RuntimeState* state) {
Expr::close(_output_expr_ctxs, state);
OperatorFactory::close(state);
}
} // namespace starrocks::pipeline
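The scattered SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(nullptr) calls above all rely on the same RAII idiom: swap the thread-local tracker for the duration of the scope. A condensed sketch under hypothetical names (nullptr standing for the process-level tracker):

class MemTracker; // opaque for this sketch

class ScopedTrackerSetter {
public:
    explicit ScopedTrackerSetter(MemTracker* t) : _prev(tls_tracker()) { tls_tracker() = t; }
    ~ScopedTrackerSetter() { tls_tracker() = _prev; } // restore on scope exit

private:
    static MemTracker*& tls_tracker() {
        static thread_local MemTracker* tracker = nullptr;
        return tracker;
    }
    MemTracker* _prev;
};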

View File

@ -48,6 +48,8 @@ public:
return Status::OK();
}
Status set_cancelled(RuntimeState* state) override;
StatusOr<vectorized::ChunkPtr> pull_chunk(RuntimeState* state) override;
Status push_chunk(RuntimeState* state, const vectorized::ChunkPtr& chunk) override;

View File

@ -0,0 +1,42 @@
// This file is licensed under the Elastic License 2.0. Copyright 2021 StarRocks Limited.
#include "exec/pipeline/scan/chunk_buffer_limiter.h"
#include "glog/logging.h"
namespace starrocks::pipeline {
void DynamicChunkBufferLimiter::update_avg_row_bytes(size_t added_sum_row_bytes, size_t added_num_rows,
size_t max_chunk_rows) {
std::lock_guard<std::mutex> lock(_mutex);
_sum_row_bytes += added_sum_row_bytes;
_num_rows += added_num_rows;
size_t avg_row_bytes = 0;
if (_num_rows > 0) {
avg_row_bytes = _sum_row_bytes / _num_rows;
}
if (avg_row_bytes == 0) {
return;
}
size_t chunk_mem_usage = avg_row_bytes * max_chunk_rows;
size_t new_capacity = std::max<size_t>(_mem_limit / chunk_mem_usage, 1);
_capacity = std::min(new_capacity, _max_capacity);
}
ChunkBufferTokenPtr DynamicChunkBufferLimiter::pin(int num_chunks) {
size_t prev_value = _pinned_chunks_counter.fetch_add(num_chunks);
if (prev_value + num_chunks > _capacity) {
_unpin(num_chunks);
return nullptr;
}
return std::make_unique<DynamicChunkBufferLimiter::Token>(_pinned_chunks_counter, num_chunks);
}
void DynamicChunkBufferLimiter::_unpin(int num_chunks) {
int prev_value = _pinned_chunks_counter.fetch_sub(num_chunks);
DCHECK_GE(prev_value, 1);
}
} // namespace starrocks::pipeline
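Worked example for update_avg_row_bytes above: with _mem_limit = 1 GiB, an observed average row of 64 bytes, and max_chunk_rows = 4096, one chunk costs roughly 64 * 4096 = 256 KiB, so new_capacity = 1 GiB / 256 KiB = 4096 and the capacity becomes min(4096, _max_capacity); the std::max(..., 1) keeps at least one chunk pinnable even for very wide rows. Note that pin() increments the counter optimistically and rolls back on overshoot, so a failed pin returns nullptr instead of blocking the caller.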

View File

@ -0,0 +1,124 @@
// This file is licensed under the Elastic License 2.0. Copyright 2021-present, StarRocks Limited.
#pragma once
#include <atomic>
#include <memory>
#include <mutex>
namespace starrocks::pipeline {
class ChunkBufferToken;
using ChunkBufferTokenPtr = std::unique_ptr<ChunkBufferToken>;
class ChunkBufferLimiter;
using ChunkBufferLimiterPtr = std::unique_ptr<ChunkBufferLimiter>;
class ChunkBufferToken {
public:
virtual ~ChunkBufferToken() = default;
};
// Limits the capacity of a chunk buffer.
// - Before creating a new chunk, use `pin()` to pin a position in the buffer and obtain a token.
// - After a chunk is popped from the buffer, destruct the token to unpin the position.
// All the methods are thread-safe.
class ChunkBufferLimiter {
public:
virtual ~ChunkBufferLimiter() = default;
// Update the chunk memory usage statistics.
// `added_sum_row_bytes` is the total bytes of the newly read rows.
// `added_num_rows` is the number of newly read rows.
virtual void update_avg_row_bytes(size_t added_sum_row_bytes, size_t added_num_rows, size_t max_chunk_rows) {}
// Pin a position in the buffer and return a token.
// When the token is destructed, the position is unpinned.
virtual ChunkBufferTokenPtr pin(int num_chunks) = 0;
// Returns true when it cannot pin a position for now.
virtual bool is_full() const = 0;
// The number of already pinned positions.
virtual size_t size() const = 0;
// The max number of positions able to be pinned.
virtual size_t capacity() const = 0;
// The default capacity when there are no chunk memory usage statistics.
virtual size_t default_capacity() const = 0;
};
// The capacity of this limiter is unlimited.
class UnlimitedChunkBufferLimiter final : public ChunkBufferLimiter {
public:
class Token final : public ChunkBufferToken {
public:
~Token() override = default;
};
public:
~UnlimitedChunkBufferLimiter() override = default;
ChunkBufferTokenPtr pin(int num_chunks) override { return std::make_unique<Token>(); }
bool is_full() const override { return false; }
size_t size() const override { return 0; }
size_t capacity() const override { return 0; }
size_t default_capacity() const override { return 0; }
};
// Use the dynamic chunk memory usage statistics to compute the capacity.
class DynamicChunkBufferLimiter final : public ChunkBufferLimiter {
public:
class Token final : public ChunkBufferToken {
public:
Token(std::atomic<int>& acquired_tokens_counter, int num_tokens)
: _acquired_tokens_counter(acquired_tokens_counter), _num_tokens(num_tokens) {}
~Token() override { _acquired_tokens_counter.fetch_sub(_num_tokens); }
// Disable copy/move ctor and assignment.
Token(const Token&) = delete;
Token& operator=(const Token&) = delete;
Token(Token&&) = delete;
Token& operator=(Token&&) = delete;
private:
std::atomic<int>& _acquired_tokens_counter;
const int _num_tokens;
};
public:
DynamicChunkBufferLimiter(size_t max_capacity, size_t default_capacity, int64_t mem_limit, int chunk_size)
: _capacity(default_capacity),
_max_capacity(max_capacity),
_default_capacity(default_capacity),
_mem_limit(mem_limit),
_chunk_size(chunk_size) {}
~DynamicChunkBufferLimiter() override = default;
void update_avg_row_bytes(size_t added_sum_row_bytes, size_t added_num_rows, size_t max_chunk_rows) override;
ChunkBufferTokenPtr pin(int num_chunks) override;
bool is_full() const override { return _pinned_chunks_counter >= _capacity; }
size_t size() const override { return _pinned_chunks_counter; }
size_t capacity() const override { return _capacity; }
size_t default_capacity() const override { return _default_capacity; }
private:
void _unpin(int num_chunks);
private:
std::mutex _mutex;
size_t _sum_row_bytes = 0;
size_t _num_rows = 0;
size_t _capacity;
const size_t _max_capacity;
const size_t _default_capacity;
const int64_t _mem_limit;
const int _chunk_size;
std::atomic<int> _pinned_chunks_counter = 0;
};
} // namespace starrocks::pipeline
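A minimal usage sketch of the pin/token contract defined above; ChunkQueue and Chunk are hypothetical stand-ins for the operator's buffered queue and chunk type:

#include <memory>
#include <utility>

template <typename ChunkQueue, typename Chunk>
bool produce_one_chunk(starrocks::pipeline::ChunkBufferLimiter& limiter, ChunkQueue& queue, Chunk chunk) {
    starrocks::pipeline::ChunkBufferTokenPtr token = limiter.pin(1);
    if (token == nullptr) {
        return false; // over capacity: the caller yields instead of blocking
    }
    // Enqueue the chunk together with its token; when the consumer pops the pair
    // and the token is destructed, the pinned position is released automatically.
    queue.put(std::make_pair(std::move(chunk), std::move(token)));
    return true;
}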

View File

@ -6,11 +6,13 @@
#include "column/vectorized_fwd.h"
#include "common/statusor.h"
#include "exec/pipeline/scan/chunk_buffer_limiter.h"
#include "exec/pipeline/scan/morsel.h"
#include "exec/workgroup/work_group_fwd.h"
#include "util/exclusive_ptr.h"
namespace starrocks {
class RuntimeState;
class RuntimeProfile;
@ -19,12 +21,14 @@ namespace pipeline {
class ChunkSource {
public:
ChunkSource(RuntimeProfile* runtime_profile, MorselPtr&& morsel)
: _runtime_profile(runtime_profile), _morsel(std::move(morsel)){};
: _runtime_profile(runtime_profile), _morsel(std::move(morsel)) {}
virtual ~ChunkSource() = default;
virtual Status prepare(RuntimeState* state) = 0;
// Mark that it need not produce any more chunks.
virtual Status set_finished(RuntimeState* state) = 0;
virtual void close(RuntimeState* state) = 0;
// Return true if eos is not reached
@ -43,27 +47,25 @@ public:
size_t* num_read_chunks, int worker_id,
workgroup::WorkGroupPtr running_wg) = 0;
// Some statistic of chunk source
virtual int64_t last_spent_cpu_time_ns() { return 0; }
// Counters of scan
int64_t get_cpu_time_spent() { return _cpu_time_spent_ns; }
int64_t get_scan_rows() const { return _scan_rows_num; }
int64_t get_scan_bytes() const { return _scan_bytes; }
virtual int64_t last_scan_rows_num() {
int64_t res = _last_scan_rows_num;
_last_scan_rows_num = 0;
return res;
}
virtual int64_t last_scan_bytes() {
int64_t res = _last_scan_bytes;
_last_scan_bytes = 0;
return res;
}
void pin_chunk_token(ChunkBufferTokenPtr chunk_token) { _chunk_token = std::move(chunk_token); }
void unpin_chunk_token() { _chunk_token.reset(nullptr); }
protected:
RuntimeProfile* _runtime_profile;
// The morsel will be owned by the pipeline driver
MorselPtr _morsel;
int64_t _last_scan_rows_num = 0;
int64_t _last_scan_bytes = 0;
// NOTE: These counters need to be maintained by ChunkSource implementations, and updated in real time
int64_t _cpu_time_spent_ns = 0;
int64_t _scan_rows_num = 0;
int64_t _scan_bytes = 0;
ChunkBufferTokenPtr _chunk_token = nullptr;
};
using ChunkSourcePtr = std::shared_ptr<ChunkSource>;
@ -71,5 +73,6 @@ using ChunkSourcePromise = std::promise<ChunkSourcePtr>;
using ChunkSourceFromisePtr = starrocks::exclusive_ptr<ChunkSourcePromise>;
using ChunkSourceFuture = std::future<ChunkSourcePtr>;
using OptionalChunkSourceFuture = std::optional<ChunkSourceFuture>;
} // namespace pipeline
} // namespace starrocks

View File

@ -3,6 +3,7 @@
#include "exec/pipeline/scan/connector_scan_operator.h"
#include "column/chunk.h"
#include "exec/pipeline/scan/chunk_buffer_limiter.h"
#include "exec/vectorized/connector_scan_node.h"
#include "exec/workgroup/work_group.h"
#include "runtime/exec_env.h"
@ -12,8 +13,9 @@ namespace starrocks::pipeline {
// ==================== ConnectorScanOperatorFactory ====================
ConnectorScanOperatorFactory::ConnectorScanOperatorFactory(int32_t id, ScanNode* scan_node)
: ScanOperatorFactory(id, scan_node) {}
ConnectorScanOperatorFactory::ConnectorScanOperatorFactory(int32_t id, ScanNode* scan_node,
ChunkBufferLimiterPtr buffer_limiter)
: ScanOperatorFactory(id, scan_node, std::move(buffer_limiter)) {}
Status ConnectorScanOperatorFactory::do_prepare(RuntimeState* state) {
const auto& conjunct_ctxs = _scan_node->conjunct_ctxs();
@ -29,16 +31,14 @@ void ConnectorScanOperatorFactory::do_close(RuntimeState* state) {
}
OperatorPtr ConnectorScanOperatorFactory::do_create(int32_t dop, int32_t driver_sequence) {
return std::make_shared<ConnectorScanOperator>(this, _id, driver_sequence, _scan_node, _max_scan_concurrency,
_num_committed_scan_tasks);
return std::make_shared<ConnectorScanOperator>(this, _id, driver_sequence, _scan_node, _buffer_limiter.get());
}
// ==================== ConnectorScanOperator ====================
ConnectorScanOperator::ConnectorScanOperator(OperatorFactory* factory, int32_t id, int32_t driver_sequence,
ScanNode* scan_node, int max_scan_concurrency,
std::atomic<int>& num_committed_scan_tasks)
: ScanOperator(factory, id, driver_sequence, scan_node, max_scan_concurrency, num_committed_scan_tasks) {}
ScanNode* scan_node, ChunkBufferLimiter* buffer_limiter)
: ScanOperator(factory, id, driver_sequence, scan_node, buffer_limiter) {}
Status ConnectorScanOperator::do_prepare(RuntimeState* state) {
return Status::OK();
@ -47,19 +47,21 @@ Status ConnectorScanOperator::do_prepare(RuntimeState* state) {
void ConnectorScanOperator::do_close(RuntimeState* state) {}
ChunkSourcePtr ConnectorScanOperator::create_chunk_source(MorselPtr morsel, int32_t chunk_source_index) {
vectorized::ConnectorScanNode* scan_node = down_cast<vectorized::ConnectorScanNode*>(_scan_node);
auto* scan_node = down_cast<vectorized::ConnectorScanNode*>(_scan_node);
return std::make_shared<ConnectorChunkSource>(_chunk_source_profiles[chunk_source_index].get(), std::move(morsel),
this, scan_node);
this, scan_node, _buffer_limiter);
}
// ==================== ConnectorChunkSource ====================
ConnectorChunkSource::ConnectorChunkSource(RuntimeProfile* runtime_profile, MorselPtr&& morsel, ScanOperator* op,
vectorized::ConnectorScanNode* scan_node)
vectorized::ConnectorScanNode* scan_node,
ChunkBufferLimiter* const buffer_limiter)
: ChunkSource(runtime_profile, std::move(morsel)),
_scan_node(scan_node),
_limit(scan_node->limit()),
_runtime_in_filters(op->runtime_in_filters()),
_runtime_bloom_filters(op->runtime_bloom_filters()) {
_runtime_bloom_filters(op->runtime_bloom_filters()),
_buffer_limiter(buffer_limiter) {
_conjunct_ctxs = scan_node->conjunct_ctxs();
_conjunct_ctxs.insert(_conjunct_ctxs.end(), _runtime_in_filters.begin(), _runtime_in_filters.end());
ScanMorsel* scan_morsel = (ScanMorsel*)_morsel.get();
@ -78,9 +80,14 @@ ConnectorChunkSource::~ConnectorChunkSource() {
}
Status ConnectorChunkSource::prepare(RuntimeState* state) {
// The semantics of `prepare` in ChunkSource are identical to `open`
_runtime_state = state;
RETURN_IF_ERROR(_data_source->open(state));
return Status::OK();
}
Status ConnectorChunkSource::set_finished(RuntimeState* state) {
_chunk_buffer.shutdown();
_chunk_buffer.clear();
return Status::OK();
}
@ -88,6 +95,7 @@ void ConnectorChunkSource::close(RuntimeState* state) {
if (_closed) return;
_closed = true;
_data_source->close(state);
set_finished(state);
}
bool ConnectorChunkSource::has_next_chunk() const {
@ -105,9 +113,10 @@ size_t ConnectorChunkSource::get_buffer_size() const {
}
StatusOr<vectorized::ChunkPtr> ConnectorChunkSource::get_next_chunk_from_buffer() {
vectorized::ChunkPtr chunk = nullptr;
// Will release the token after exiting this scope.
ChunkWithToken chunk = std::make_pair(nullptr, nullptr);
_chunk_buffer.try_get(&chunk);
return chunk;
return std::move(chunk.first);
}
Status ConnectorChunkSource::buffer_next_batch_chunks_blocking(size_t batch_size, RuntimeState* state) {
@ -116,16 +125,22 @@ Status ConnectorChunkSource::buffer_next_batch_chunks_blocking(size_t batch_size
}
for (size_t i = 0; i < batch_size && !state->is_cancelled(); ++i) {
if (_chunk_token == nullptr && (_chunk_token = _buffer_limiter->pin(1)) == nullptr) {
return Status::OK();
}
vectorized::ChunkPtr chunk;
_status = _read_chunk(&chunk);
if (!_status.ok()) {
// end of file is a normal case; the chunk still needs to be processed
if (_status.is_end_of_file()) {
_chunk_buffer.put(std::move(chunk));
_chunk_buffer.put(std::make_pair(std::move(chunk), std::move(_chunk_token)));
}
break;
}
_chunk_buffer.put(std::move(chunk));
if (!_chunk_buffer.put(std::make_pair(std::move(chunk), std::move(_chunk_token)))) {
break;
}
}
return _status;
}
@ -139,6 +154,10 @@ Status ConnectorChunkSource::buffer_next_batch_chunks_blocking_for_workgroup(siz
int64_t time_spent = 0;
for (size_t i = 0; i < batch_size && !state->is_cancelled(); ++i) {
{
if (_chunk_token == nullptr && (_chunk_token = _buffer_limiter->pin(1)) == nullptr) {
return Status::OK();
}
SCOPED_RAW_TIMER(&time_spent);
vectorized::ChunkPtr chunk;
@ -147,13 +166,15 @@ Status ConnectorChunkSource::buffer_next_batch_chunks_blocking_for_workgroup(siz
// end of file is a normal case; the chunk still needs to be processed
if (_status.is_end_of_file()) {
++(*num_read_chunks);
_chunk_buffer.put(std::move(chunk));
_chunk_buffer.put(std::make_pair(std::move(chunk), std::move(_chunk_token)));
}
break;
}
++(*num_read_chunks);
_chunk_buffer.put(std::move(chunk));
if (!_chunk_buffer.put(std::make_pair(std::move(chunk), std::move(_chunk_token)))) {
break;
}
}
if (time_spent >= YIELD_MAX_TIME_SPENT) {
@ -161,7 +182,8 @@ Status ConnectorChunkSource::buffer_next_batch_chunks_blocking_for_workgroup(siz
}
if (time_spent >= YIELD_PREEMPT_MAX_TIME_SPENT &&
workgroup::WorkGroupManager::instance()->get_owners_of_scan_worker(worker_id, running_wg)) {
workgroup::WorkGroupManager::instance()->get_owners_of_scan_worker(workgroup::TypeHdfsScanExecutor,
worker_id, running_wg)) {
break;
}
}
@ -171,6 +193,11 @@ Status ConnectorChunkSource::buffer_next_batch_chunks_blocking_for_workgroup(siz
Status ConnectorChunkSource::_read_chunk(vectorized::ChunkPtr* chunk) {
RuntimeState* state = _runtime_state;
if (!_opened) {
RETURN_IF_ERROR(_data_source->open(state));
_opened = true;
}
if (state->is_cancelled()) {
return Status::Cancelled("canceled state");
}
@ -183,7 +210,10 @@ Status ConnectorChunkSource::_read_chunk(vectorized::ChunkPtr* chunk) {
do {
RETURN_IF_ERROR(_data_source->get_next(state, chunk));
} while ((*chunk)->num_rows() == 0);
_rows_read += (*chunk)->num_rows();
_scan_rows_num = _data_source->raw_rows_read();
_scan_bytes = _data_source->num_bytes_read();
return Status::OK();
}

View File

@ -13,9 +13,13 @@ class ScanNode;
namespace pipeline {
class ChunkBufferToken;
using ChunkBufferTokenPtr = std::unique_ptr<ChunkBufferToken>;
class ChunkBufferLimiter;
class ConnectorScanOperatorFactory final : public ScanOperatorFactory {
public:
ConnectorScanOperatorFactory(int32_t id, ScanNode* scan_node);
ConnectorScanOperatorFactory(int32_t id, ScanNode* scan_node, ChunkBufferLimiterPtr buffer_limiter);
~ConnectorScanOperatorFactory() override = default;
@ -27,7 +31,7 @@ public:
class ConnectorScanOperator final : public ScanOperator {
public:
ConnectorScanOperator(OperatorFactory* factory, int32_t id, int32_t driver_sequence, ScanNode* scan_node,
int max_scan_concurrency, std::atomic<int>& num_committed_scan_tasks);
ChunkBufferLimiter* buffer_limiter);
~ConnectorScanOperator() override = default;
@ -41,12 +45,13 @@ private:
class ConnectorChunkSource final : public ChunkSource {
public:
ConnectorChunkSource(RuntimeProfile* runtime_profile, MorselPtr&& morsel, ScanOperator* op,
vectorized::ConnectorScanNode* scan_node);
vectorized::ConnectorScanNode* scan_node, ChunkBufferLimiter* const buffer_limiter);
~ConnectorChunkSource() override;
Status prepare(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
void close(RuntimeState* state) override;
bool has_next_chunk() const override;
@ -63,6 +68,8 @@ public:
workgroup::WorkGroupPtr running_wg) override;
private:
using ChunkWithToken = std::pair<vectorized::ChunkPtr, ChunkBufferTokenPtr>;
Status _read_chunk(vectorized::ChunkPtr* chunk);
// Yield the scan IO task when the maximum time in nanoseconds has been spent in the current execution round.
@ -84,9 +91,13 @@ private:
// =========================
RuntimeState* _runtime_state = nullptr;
Status _status = Status::OK();
bool _opened = false;
bool _closed = false;
uint64_t _rows_read = 0;
UnboundedBlockingQueue<vectorized::ChunkPtr> _chunk_buffer;
uint64_t _bytes_read = 0;
UnboundedBlockingQueue<ChunkWithToken> _chunk_buffer;
ChunkBufferLimiter* const _buffer_limiter;
};
} // namespace pipeline

View File

@ -87,7 +87,11 @@ StatusOr<MorselPtr> PhysicalSplitMorselQueue::try_get() {
return nullptr;
}
RETURN_IF_ERROR(_init_segment());
if (auto status = _init_segment(); !status.ok()) {
// The morsel queue cannot generate morsels after an error occurs.
_tablet_idx = _tablets.size();
return status;
}
}
vectorized::SparseRange taken_range;

View File

@ -90,7 +90,8 @@ public:
virtual void set_tablets(const std::vector<TabletSharedPtr>& tablets) {}
virtual void set_tablet_rowsets(const std::vector<std::vector<RowsetSharedPtr>>& tablet_rowsets) {}
virtual size_t num_morsels() const = 0;
virtual size_t num_original_morsels() const = 0;
virtual size_t max_degree_of_parallelism() const = 0;
virtual bool empty() const = 0;
virtual StatusOr<MorselPtr> try_get() = 0;
@ -108,7 +109,8 @@ public:
std::vector<TInternalScanRange*> olap_scan_ranges() const override;
size_t num_morsels() const override { return _num_morsels; }
size_t num_original_morsels() const override { return _num_morsels; }
size_t max_degree_of_parallelism() const override { return _num_morsels; }
bool empty() const override { return _pop_index >= _num_morsels; }
StatusOr<MorselPtr> try_get() override;
@ -137,7 +139,8 @@ public:
_tablet_rowsets = tablet_rowsets;
}
size_t num_morsels() const override { return _degree_of_parallelism; }
size_t num_original_morsels() const override { return _morsels.size(); }
size_t max_degree_of_parallelism() const override { return _degree_of_parallelism; }
bool empty() const override { return _tablet_idx >= _tablets.size(); }
StatusOr<MorselPtr> try_get() override;


@ -4,6 +4,7 @@
#include "column/column_helper.h"
#include "common/constexpr.h"
#include "exec/pipeline/scan/chunk_buffer_limiter.h"
#include "exec/pipeline/scan/olap_scan_context.h"
#include "exec/pipeline/scan/scan_operator.h"
#include "exec/vectorized/olap_scan_node.h"
@ -25,23 +26,33 @@ namespace starrocks::pipeline {
using namespace vectorized;
OlapChunkSource::OlapChunkSource(RuntimeProfile* runtime_profile, MorselPtr&& morsel,
vectorized::OlapScanNode* scan_node, OlapScanContext* scan_ctx)
vectorized::OlapScanNode* scan_node, OlapScanContext* scan_ctx,
ChunkBufferLimiter* const buffer_limiter)
: ChunkSource(runtime_profile, std::move(morsel)),
_scan_node(scan_node),
_scan_ctx(scan_ctx),
_limit(scan_node->limit()),
_scan_range(down_cast<ScanMorsel*>(_morsel.get())->get_olap_scan_range()) {}
_scan_range(down_cast<ScanMorsel*>(_morsel.get())->get_olap_scan_range()),
_buffer_limiter(buffer_limiter) {}
OlapChunkSource::~OlapChunkSource() {
_reader.reset();
_predicate_free_pool.clear();
}
Status OlapChunkSource::set_finished(RuntimeState* state) {
_chunk_buffer.shutdown();
_chunk_buffer.clear();
return Status::OK();
}
void OlapChunkSource::close(RuntimeState* state) {
_update_counter();
_prj_iter->close();
_reader.reset();
_predicate_free_pool.clear();
set_finished(state);
}
Status OlapChunkSource::prepare(RuntimeState* state) {
@ -290,9 +301,10 @@ size_t OlapChunkSource::get_buffer_size() const {
}
StatusOr<vectorized::ChunkPtr> OlapChunkSource::get_next_chunk_from_buffer() {
vectorized::ChunkPtr chunk = nullptr;
// Will release the token after exiting this scope.
ChunkWithToken chunk = std::make_pair(nullptr, nullptr);
_chunk_buffer.try_get(&chunk);
return chunk;
return std::move(chunk.first);
}
Status OlapChunkSource::buffer_next_batch_chunks_blocking(size_t batch_size, RuntimeState* state) {
@ -302,18 +314,26 @@ Status OlapChunkSource::buffer_next_batch_chunks_blocking(size_t batch_size, Run
using namespace vectorized;
for (size_t i = 0; i < batch_size && !state->is_cancelled(); ++i) {
if (_chunk_token == nullptr && (_chunk_token = _buffer_limiter->pin(1)) == nullptr) {
return Status::OK();
}
ChunkUniquePtr chunk(
ChunkHelper::new_chunk_pooled(_prj_iter->output_schema(), _runtime_state->chunk_size(), true));
_status = _read_chunk_from_storage(_runtime_state, chunk.get());
if (!_status.ok()) {
// End-of-file is a normal case; the chunk still needs to be processed.
if (_status.is_end_of_file()) {
_chunk_buffer.put(std::move(chunk));
_chunk_buffer.put(std::make_pair(std::move(chunk), std::move(_chunk_token)));
}
break;
}
_chunk_buffer.put(std::move(chunk));
if (!_chunk_buffer.put(std::make_pair(std::move(chunk), std::move(_chunk_token)))) {
break;
}
}
return _status;
}
@ -323,11 +343,15 @@ Status OlapChunkSource::buffer_next_batch_chunks_blocking_for_workgroup(size_t b
if (!_status.ok()) {
return _status;
}
using namespace vectorized;
int64_t time_spent = 0;
for (size_t i = 0; i < batch_size && !state->is_cancelled(); ++i) {
{
if (_chunk_token == nullptr && (_chunk_token = _buffer_limiter->pin(1)) == nullptr) {
return Status::OK();
}
SCOPED_RAW_TIMER(&time_spent);
ChunkUniquePtr chunk(
@ -337,13 +361,15 @@ Status OlapChunkSource::buffer_next_batch_chunks_blocking_for_workgroup(size_t b
// End-of-file is a normal case; the chunk still needs to be processed.
if (_status.is_end_of_file()) {
++(*num_read_chunks);
_chunk_buffer.put(std::move(chunk));
_chunk_buffer.put(std::make_pair(std::move(chunk), std::move(_chunk_token)));
}
break;
}
++(*num_read_chunks);
_chunk_buffer.put(std::move(chunk));
if (!_chunk_buffer.put(std::make_pair(std::move(chunk), std::move(_chunk_token)))) {
break;
}
}
if (time_spent >= YIELD_MAX_TIME_SPENT) {
@ -351,7 +377,8 @@ Status OlapChunkSource::buffer_next_batch_chunks_blocking_for_workgroup(size_t b
}
if (time_spent >= YIELD_PREEMPT_MAX_TIME_SPENT &&
workgroup::WorkGroupManager::instance()->get_owners_of_scan_worker(worker_id, running_wg)) {
workgroup::WorkGroupManager::instance()->get_owners_of_scan_worker(workgroup::TypeOlapScanExecutor,
worker_id, running_wg)) {
break;
}
}
@ -416,6 +443,7 @@ Status OlapChunkSource::_read_chunk_from_storage(RuntimeState* state, vectorized
TRY_CATCH_ALLOC_SCOPE_END()
} while (chunk->num_rows() == 0);
_update_realtime_counter(chunk);
// Optimization for "select * from table limit x" where x is small.
if (_limit != -1 && _num_rows_read >= _limit) {
@ -424,26 +452,22 @@ Status OlapChunkSource::_read_chunk_from_storage(RuntimeState* state, vectorized
return Status::OK();
}
int64_t OlapChunkSource::last_spent_cpu_time_ns() {
int64_t time_ns = _last_spent_cpu_time_ns;
_last_spent_cpu_time_ns += _reader->stats().decompress_ns;
_last_spent_cpu_time_ns += _reader->stats().vec_cond_ns;
_last_spent_cpu_time_ns += _reader->stats().del_filter_ns;
return _last_spent_cpu_time_ns - time_ns;
}
void OlapChunkSource::_update_realtime_counter(vectorized::Chunk* chunk) {
COUNTER_UPDATE(_read_compressed_counter, _reader->stats().compressed_bytes_read);
_compressed_bytes_read += _reader->stats().compressed_bytes_read;
_reader->mutable_stats()->compressed_bytes_read = 0;
COUNTER_UPDATE(_raw_rows_counter, _reader->stats().raw_rows_read);
_raw_rows_read += _reader->stats().raw_rows_read;
_last_scan_rows_num += _reader->stats().raw_rows_read;
_last_scan_bytes += _reader->stats().bytes_read;
_reader->mutable_stats()->raw_rows_read = 0;
auto& stats = _reader->stats();
_num_rows_read += chunk->num_rows();
_scan_rows_num = stats.raw_rows_read;
_scan_bytes = stats.bytes_read;
_cpu_time_spent_ns = stats.decompress_ns + stats.vec_cond_ns + stats.del_filter_ns;
// Update local counters.
_local_sum_row_bytes += chunk->memory_usage();
_local_num_rows += chunk->num_rows();
_local_max_chunk_rows = std::max(_local_max_chunk_rows, chunk->num_rows());
if (_local_sum_chunks++ % UPDATE_AVG_ROW_BYTES_FREQUENCY == 0) {
_buffer_limiter->update_avg_row_bytes(_local_sum_row_bytes, _local_num_rows, _local_max_chunk_rows);
_local_sum_row_bytes = 0;
_local_num_rows = 0;
}
}
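
_update_realtime_counter above also feeds a rolling row-size estimate to the buffer limiter, flushing locally accumulated sums only every UPDATE_AVG_ROW_BYTES_FREQUENCY chunks so the shared limiter is not updated on every chunk. A self-contained sketch of that batched-estimate pattern, with illustrative names:

#include <algorithm>
#include <cstddef>

// Local sums are accumulated per chunk and flushed every N chunks, so
// the shared estimate is not recomputed on every chunk.
class RowBytesEstimator {
public:
    static constexpr size_t kFlushEvery = 8;  // UPDATE_AVG_ROW_BYTES_FREQUENCY

    // Called once per produced chunk.
    void on_chunk(size_t chunk_bytes, size_t chunk_rows) {
        _sum_bytes += chunk_bytes;
        _sum_rows += chunk_rows;
        _max_chunk_rows = std::max(_max_chunk_rows, chunk_rows);
        if (_num_chunks++ % kFlushEvery == 0) {
            flush();
        }
    }

    size_t avg_row_bytes() const { return _avg_row_bytes; }
    size_t max_chunk_rows() const { return _max_chunk_rows; }

private:
    void flush() {
        if (_sum_rows > 0) {
            _avg_row_bytes = _sum_bytes / _sum_rows;
        }
        _sum_bytes = 0;
        _sum_rows = 0;
    }

    size_t _sum_bytes = 0, _sum_rows = 0, _num_chunks = 0;
    size_t _max_chunk_rows = 0;
    size_t _avg_row_bytes = 0;
};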
void OlapChunkSource::_update_counter() {
@ -452,7 +476,6 @@ void OlapChunkSource::_update_counter() {
COUNTER_UPDATE(_io_timer, _reader->stats().io_ns);
COUNTER_UPDATE(_read_compressed_counter, _reader->stats().compressed_bytes_read);
_compressed_bytes_read += _reader->stats().compressed_bytes_read;
COUNTER_UPDATE(_decompress_timer, _reader->stats().decompress_ns);
COUNTER_UPDATE(_read_uncompressed_counter, _reader->stats().uncompressed_bytes_read);
COUNTER_UPDATE(_bytes_read_counter, _reader->stats().bytes_read);
@ -462,15 +485,11 @@ void OlapChunkSource::_update_counter() {
COUNTER_UPDATE(_block_fetch_timer, _reader->stats().block_fetch_ns);
COUNTER_UPDATE(_block_seek_timer, _reader->stats().block_seek_ns);
COUNTER_UPDATE(_raw_rows_counter, _reader->stats().raw_rows_read);
_raw_rows_read += _reader->mutable_stats()->raw_rows_read;
_last_scan_rows_num += _reader->mutable_stats()->raw_rows_read;
_last_scan_bytes += _reader->mutable_stats()->bytes_read;
COUNTER_UPDATE(_chunk_copy_timer, _reader->stats().vec_cond_chunk_copy_ns);
COUNTER_UPDATE(_seg_init_timer, _reader->stats().segment_init_ns);
COUNTER_UPDATE(_raw_rows_counter, _reader->stats().raw_rows_read);
int64_t cond_evaluate_ns = 0;
cond_evaluate_ns += _reader->stats().vec_cond_evaluate_ns;
cond_evaluate_ns += _reader->stats().branchless_cond_evaluate_ns;
@ -500,8 +519,8 @@ void OlapChunkSource::_update_counter() {
COUNTER_SET(_pushdown_predicates_counter, (int64_t)_params.predicates.size());
StarRocksMetrics::instance()->query_scan_bytes.increment(_compressed_bytes_read);
StarRocksMetrics::instance()->query_scan_rows.increment(_raw_rows_read);
StarRocksMetrics::instance()->query_scan_bytes.increment(_scan_bytes);
StarRocksMetrics::instance()->query_scan_rows.increment(_scan_rows_num);
if (_reader->stats().decode_dict_ns > 0) {
RuntimeProfile::Counter* c = ADD_TIMER(_runtime_profile, "DictDecode");


@ -29,16 +29,20 @@ namespace pipeline {
class ScanOperator;
class OlapScanContext;
class ChunkBufferToken;
using ChunkBufferTokenPtr = std::unique_ptr<ChunkBufferToken>;
class ChunkBufferLimiter;
class OlapChunkSource final : public ChunkSource {
public:
OlapChunkSource(RuntimeProfile* runtime_profile, MorselPtr&& morsel, vectorized::OlapScanNode* scan_node,
OlapScanContext* scan_ctx);
OlapScanContext* scan_ctx, ChunkBufferLimiter* const buffer_limiter);
~OlapChunkSource() override;
Status prepare(RuntimeState* state) override;
Status set_finished(RuntimeState* state) override;
void close(RuntimeState* state) override;
bool has_next_chunk() const override;
@ -54,8 +58,6 @@ public:
size_t* num_read_chunks, int worker_id,
workgroup::WorkGroupPtr running_wg) override;
int64_t last_spent_cpu_time_ns() override;
private:
// Yield the scan I/O task when the maximum time in nanoseconds has been spent in the current execution round.
static constexpr int64_t YIELD_MAX_TIME_SPENT = 100'000'000L;
@ -63,6 +65,8 @@ private:
// if it runs in the worker thread owned by other workgroup, which has running drivers.
static constexpr int64_t YIELD_PREEMPT_MAX_TIME_SPENT = 20'000'000L;
static constexpr int UPDATE_AVG_ROW_BYTES_FREQUENCY = 8;
Status _get_tablet(const TInternalScanRange* scan_range);
Status _init_reader_params(const std::vector<std::unique_ptr<OlapScanRange>>& key_ranges,
const std::vector<uint32_t>& scanner_columns, std::vector<uint32_t>& reader_columns);
@ -77,6 +81,8 @@ private:
void _decide_chunk_size();
private:
using ChunkWithToken = std::pair<vectorized::ChunkPtr, ChunkBufferTokenPtr>;
vectorized::TabletReaderParams _params{};
vectorized::OlapScanNode* _scan_node;
OlapScanContext* _scan_ctx;
@ -85,7 +91,8 @@ private:
TInternalScanRange* _scan_range;
Status _status = Status::OK();
UnboundedBlockingQueue<vectorized::ChunkPtr> _chunk_buffer;
UnboundedBlockingQueue<ChunkWithToken> _chunk_buffer;
ChunkBufferLimiter* const _buffer_limiter;
vectorized::ConjunctivePredicates _not_push_down_predicates;
std::vector<uint8_t> _selection;
@ -113,9 +120,12 @@ private:
// The following are profile measures
int64_t _num_rows_read = 0;
int64_t _raw_rows_read = 0;
int64_t _compressed_bytes_read = 0;
int64_t _last_spent_cpu_time_ns = 0;
// Local counters for row-size estimation; reset after each batch
size_t _local_sum_row_bytes = 0;
size_t _local_num_rows = 0;
size_t _local_sum_chunks = 0;
size_t _local_max_chunk_rows = 0;
RuntimeProfile::Counter* _bytes_read_counter = nullptr;
RuntimeProfile::Counter* _rows_read_counter = nullptr;


@ -3,6 +3,7 @@
#include "exec/pipeline/scan/olap_scan_operator.h"
#include "column/chunk.h"
#include "exec/pipeline/scan/chunk_buffer_limiter.h"
#include "exec/pipeline/scan/olap_chunk_source.h"
#include "exec/pipeline/scan/olap_scan_context.h"
#include "exec/vectorized/olap_scan_node.h"
@ -18,8 +19,9 @@ namespace starrocks::pipeline {
// ==================== OlapScanOperatorFactory ====================
OlapScanOperatorFactory::OlapScanOperatorFactory(int32_t id, ScanNode* scan_node, OlapScanContextPtr ctx)
: ScanOperatorFactory(id, scan_node), _ctx(std::move(ctx)) {}
OlapScanOperatorFactory::OlapScanOperatorFactory(int32_t id, ScanNode* scan_node, ChunkBufferLimiterPtr buffer_limiter,
OlapScanContextPtr ctx)
: ScanOperatorFactory(id, scan_node, std::move(buffer_limiter)), _ctx(std::move(ctx)) {}
Status OlapScanOperatorFactory::do_prepare(RuntimeState* state) {
return Status::OK();
@ -28,17 +30,14 @@ Status OlapScanOperatorFactory::do_prepare(RuntimeState* state) {
void OlapScanOperatorFactory::do_close(RuntimeState*) {}
OperatorPtr OlapScanOperatorFactory::do_create(int32_t dop, int32_t driver_sequence) {
return std::make_shared<OlapScanOperator>(this, _id, driver_sequence, _scan_node, _max_scan_concurrency,
_num_committed_scan_tasks, _ctx);
return std::make_shared<OlapScanOperator>(this, _id, driver_sequence, _scan_node, _buffer_limiter.get(), _ctx);
}
// ==================== OlapScanOperator ====================
OlapScanOperator::OlapScanOperator(OperatorFactory* factory, int32_t id, int32_t driver_sequence, ScanNode* scan_node,
int max_scan_concurrency, std::atomic<int>& num_committed_scan_tasks,
OlapScanContextPtr ctx)
: ScanOperator(factory, id, driver_sequence, scan_node, max_scan_concurrency, num_committed_scan_tasks),
_ctx(std::move(ctx)) {
ChunkBufferLimiter* buffer_limiter, OlapScanContextPtr ctx)
: ScanOperator(factory, id, driver_sequence, scan_node, buffer_limiter), _ctx(std::move(ctx)) {
_ctx->ref();
}
@ -83,7 +82,7 @@ void OlapScanOperator::do_close(RuntimeState* state) {}
ChunkSourcePtr OlapScanOperator::create_chunk_source(MorselPtr morsel, int32_t chunk_source_index) {
auto* olap_scan_node = down_cast<vectorized::OlapScanNode*>(_scan_node);
return std::make_shared<OlapChunkSource>(_chunk_source_profiles[chunk_source_index].get(), std::move(morsel),
olap_scan_node, _ctx.get());
olap_scan_node, _ctx.get(), _buffer_limiter);
}
} // namespace starrocks::pipeline


@ -18,7 +18,8 @@ using OlapScanContextPtr = std::shared_ptr<OlapScanContext>;
class OlapScanOperatorFactory final : public ScanOperatorFactory {
public:
OlapScanOperatorFactory(int32_t id, ScanNode* scan_node, OlapScanContextPtr ctx);
OlapScanOperatorFactory(int32_t id, ScanNode* scan_node, ChunkBufferLimiterPtr buffer_limiter,
OlapScanContextPtr ctx);
~OlapScanOperatorFactory() override = default;
@ -33,7 +34,7 @@ private:
class OlapScanOperator final : public ScanOperator {
public:
OlapScanOperator(OperatorFactory* factory, int32_t id, int32_t driver_sequence, ScanNode* scan_node,
int max_scan_concurrency, std::atomic<int>& num_committed_scan_tasks, OlapScanContextPtr ctx);
ChunkBufferLimiter* buffer_limiter, OlapScanContextPtr ctx);
~OlapScanOperator() override;


@ -3,8 +3,10 @@
#include "exec/pipeline/scan/scan_operator.h"
#include "column/chunk.h"
#include "exec/pipeline/chunk_accumulate_operator.h"
#include "exec/pipeline/limit_operator.h"
#include "exec/pipeline/pipeline_builder.h"
#include "exec/pipeline/scan/chunk_buffer_limiter.h"
#include "exec/pipeline/scan/connector_scan_operator.h"
#include "exec/vectorized/olap_scan_node.h"
#include "exec/workgroup/scan_executor.h"
@ -17,12 +19,11 @@ namespace starrocks::pipeline {
// ========== ScanOperator ==========
ScanOperator::ScanOperator(OperatorFactory* factory, int32_t id, int32_t driver_sequence, ScanNode* scan_node,
int max_scan_concurrency, std::atomic<int>& num_committed_scan_tasks)
ChunkBufferLimiter* buffer_limiter)
: SourceOperator(factory, id, scan_node->name(), scan_node->id(), driver_sequence),
_scan_node(scan_node),
_chunk_source_profiles(MAX_IO_TASKS_PER_OP),
_max_scan_concurrency(max_scan_concurrency),
_num_committed_scan_tasks(num_committed_scan_tasks),
_buffer_limiter(buffer_limiter),
_is_io_task_running(MAX_IO_TASKS_PER_OP),
_chunk_sources(MAX_IO_TASKS_PER_OP) {
for (auto i = 0; i < MAX_IO_TASKS_PER_OP; i++) {
@ -48,8 +49,8 @@ Status ScanOperator::prepare(RuntimeState* state) {
RETURN_IF_ERROR(SourceOperator::prepare(state));
_unique_metrics->add_info_string("MorselQueueType", _morsel_queue->name());
auto* max_scan_concurrency_counter = ADD_COUNTER(_unique_metrics, "MaxScanConcurrency", TUnit::UNIT);
COUNTER_SET(max_scan_concurrency_counter, static_cast<int64_t>(_max_scan_concurrency));
_peak_buffer_size_counter = _unique_metrics->AddHighWaterMarkCounter("PeakChunkBufferSize", TUnit::UNIT);
_morsels_counter = ADD_COUNTER(_unique_metrics, "MorselsCount", TUnit::UNIT);
if (_workgroup == nullptr) {
DCHECK(_io_threads != nullptr);
@ -73,13 +74,25 @@ void ScanOperator::close(RuntimeState* state) {
}
// For running I/O tasks, we close their chunk sources in ~ScanOperator, not in ScanOperator::close.
for (size_t i = 0; i < _chunk_sources.size(); i++) {
if (_chunk_sources[i] != nullptr && !_is_io_task_running[i]) {
_chunk_sources[i]->close(state);
_chunk_sources[i] = nullptr;
if (_chunk_sources[i] != nullptr) {
_chunk_sources[i]->set_finished(state);
if (!_is_io_task_running[i]) {
_chunk_sources[i]->close(state);
_chunk_sources[i] = nullptr;
}
}
}
_default_buffer_capacity_counter = ADD_COUNTER(_unique_metrics, "DefaultChunkBufferCapacity", TUnit::UNIT);
COUNTER_SET(_default_buffer_capacity_counter, static_cast<int64_t>(_buffer_limiter->default_capacity()));
_buffer_capacity_counter = ADD_COUNTER(_unique_metrics, "ChunkBufferCapacity", TUnit::UNIT);
COUNTER_SET(_buffer_capacity_counter, static_cast<int64_t>(_buffer_limiter->capacity()));
_tablets_counter = ADD_COUNTER(_unique_metrics, "TabletCount", TUnit::UNIT);
COUNTER_SET(_tablets_counter, static_cast<int64_t>(_morsel_queue->num_original_morsels()));
_merge_chunk_source_profiles();
do_close(state);
Operator::close(state);
}
@ -99,8 +112,7 @@ bool ScanOperator::has_output() const {
}
}
if (_num_running_io_tasks >= MAX_IO_TASKS_PER_OP ||
_exceed_max_scan_concurrency(_num_committed_scan_tasks.load())) {
if (_num_running_io_tasks >= MAX_IO_TASKS_PER_OP || _buffer_limiter->is_full()) {
return false;
}
@ -159,6 +171,9 @@ Status ScanOperator::set_finishing(RuntimeState* state) {
StatusOr<vectorized::ChunkPtr> ScanOperator::pull_chunk(RuntimeState* state) {
RETURN_IF_ERROR(_get_scan_status());
_peak_buffer_size_counter->set(_buffer_limiter->size());
RETURN_IF_ERROR(_try_to_trigger_next_scan(state));
if (_workgroup != nullptr) {
_workgroup->incr_period_ask_chunk_num(1);
@ -190,19 +205,17 @@ Status ScanOperator::_try_to_trigger_next_scan(RuntimeState* state) {
return Status::OK();
}
// First, find the picked-up morsels that can commit an I/O task.
for (int i = 0; i < MAX_IO_TASKS_PER_OP; ++i) {
if (_chunk_sources[i] != nullptr && !_is_io_task_running[i] && _chunk_sources[i]->has_next_chunk()) {
RETURN_IF_ERROR(_trigger_next_scan(state, i));
if (_is_io_task_running[i]) {
continue;
}
}
// Second, find an unused slot in _chunk_sources to pick up a new morsel.
if (!_morsel_queue->empty()) {
for (int i = 0; i < MAX_IO_TASKS_PER_OP; ++i) {
if (_chunk_sources[i] == nullptr || (!_is_io_task_running[i] && !_chunk_sources[i]->has_output())) {
RETURN_IF_ERROR(_pickup_morsel(state, i));
}
if (_chunk_sources[i] == nullptr) {
RETURN_IF_ERROR(_pickup_morsel(state, i));
} else if (_chunk_sources[i]->has_next_chunk()) {
RETURN_IF_ERROR(_trigger_next_scan(state, i));
} else if (!_chunk_sources[i]->has_output()) {
RETURN_IF_ERROR(_pickup_morsel(state, i));
}
}
@ -218,11 +231,22 @@ inline bool is_uninitialized(const std::weak_ptr<QueryContext>& ptr) {
return !ptr.owner_before(wp{}) && !wp{}.owner_before(ptr);
}
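
is_uninitialized() distinguishes a weak_ptr that was never assigned from one that was assigned and has merely expired: an expired weak_ptr still references its control block, so owner_before still orders it apart from the empty weak_ptr. A standalone demonstration of the trick:

#include <cassert>
#include <memory>

// A default-constructed weak_ptr shares ownership with nothing, so it is
// owner-equivalent to the empty weak_ptr{}; a weak_ptr that was ever
// assigned from a shared_ptr is not, even after it expires.
template <typename T>
bool never_assigned(const std::weak_ptr<T>& p) {
    using wp = std::weak_ptr<T>;
    return !p.owner_before(wp{}) && !wp{}.owner_before(p);
}

int main() {
    std::weak_ptr<int> w;
    assert(never_assigned(w));
    {
        auto s = std::make_shared<int>(42);
        w = s;
    }                            // s destroyed: w is expired...
    assert(w.expired());
    assert(!never_assigned(w));  // ...but it was assigned once
}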
void ScanOperator::_finish_chunk_source_task(RuntimeState* state, int chunk_source_index, int64_t cpu_time_ns,
int64_t scan_rows, int64_t scan_bytes) {
_last_growth_cpu_time_ns += cpu_time_ns;
_last_scan_rows_num += scan_rows;
_last_scan_bytes += scan_bytes;
_num_running_io_tasks--;
_is_io_task_running[chunk_source_index] = false;
}
Status ScanOperator::_trigger_next_scan(RuntimeState* state, int chunk_source_index) {
if (!_try_to_increase_committed_scan_tasks()) {
ChunkBufferTokenPtr buffer_token;
if (buffer_token = _buffer_limiter->pin(1); buffer_token == nullptr) {
return Status::OK();
}
_chunk_sources[chunk_source_index]->pin_chunk_token(std::move(buffer_token));
_num_running_io_tasks++;
_is_io_task_running[chunk_source_index] = true;
@ -231,33 +255,39 @@ Status ScanOperator::_trigger_next_scan(RuntimeState* state, int chunk_source_in
if (is_uninitialized(_query_ctx)) {
_query_ctx = state->exec_env()->query_context_mgr()->get(state->query_id());
}
int32_t driver_id = CurrentThread::current().get_driver_id();
if (_workgroup != nullptr) {
workgroup::ScanTask task = workgroup::ScanTask(_workgroup, [wp = _query_ctx, this, state,
chunk_source_index](int worker_id) {
if (auto sp = wp.lock()) {
{
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(state->instance_mem_tracker());
size_t num_read_chunks = 0;
Status status = _chunk_sources[chunk_source_index]->buffer_next_batch_chunks_blocking_for_workgroup(
_buffer_size, state, &num_read_chunks, worker_id, _workgroup);
if (!status.ok() && !status.is_end_of_file()) {
_set_scan_status(status);
workgroup::ScanTask task = workgroup::ScanTask(
_workgroup, [wp = _query_ctx, this, state, chunk_source_index, driver_id](int worker_id) {
if (auto sp = wp.lock()) {
// Set driver_id here to share some driver-local contents.
// Currently it is used by ExprContext's driver-local state.
CurrentThread::current().set_pipeline_driver_id(driver_id);
DeferOp defer([]() { CurrentThread::current().set_pipeline_driver_id(0); });
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(state->instance_mem_tracker());
auto& chunk_source = _chunk_sources[chunk_source_index];
size_t num_read_chunks = 0;
int64_t prev_cpu_time = chunk_source->get_cpu_time_spent();
int64_t prev_scan_rows = chunk_source->get_scan_rows();
int64_t prev_scan_bytes = chunk_source->get_scan_bytes();
// Read chunk
Status status = chunk_source->buffer_next_batch_chunks_blocking_for_workgroup(
_buffer_size, state, &num_read_chunks, worker_id, _workgroup);
if (!status.ok() && !status.is_end_of_file()) {
_set_scan_status(status);
}
int64_t delta_cpu_time = chunk_source->get_cpu_time_spent() - prev_cpu_time;
_workgroup->increment_real_runtime_ns(delta_cpu_time);
_workgroup->incr_period_scaned_chunk_num(num_read_chunks);
_finish_chunk_source_task(state, chunk_source_index, delta_cpu_time,
chunk_source->get_scan_rows() - prev_scan_rows,
chunk_source->get_scan_bytes() - prev_scan_bytes);
}
// TODO (by laotan332): More detailed information is needed
_workgroup->incr_period_scaned_chunk_num(num_read_chunks);
_workgroup->increment_real_runtime_ns(_chunk_sources[chunk_source_index]->last_spent_cpu_time_ns());
_last_growth_cpu_time_ns += _chunk_sources[chunk_source_index]->last_spent_cpu_time_ns();
_last_scan_rows_num += _chunk_sources[chunk_source_index]->last_scan_rows_num();
_last_scan_bytes += _chunk_sources[chunk_source_index]->last_scan_bytes();
}
_decrease_committed_scan_tasks();
_num_running_io_tasks--;
_is_io_task_running[chunk_source_index] = false;
}
});
});
if (dynamic_cast<ConnectorScanOperator*>(this) != nullptr) {
offer_task_success = ExecEnv::GetInstance()->hdfs_scan_executor()->submit(std::move(task));
} else {
@ -265,23 +295,30 @@ Status ScanOperator::_trigger_next_scan(RuntimeState* state, int chunk_source_in
}
} else {
PriorityThreadPool::Task task;
task.work_function = [wp = _query_ctx, this, state, chunk_source_index]() {
task.work_function = [wp = _query_ctx, this, state, chunk_source_index, driver_id]() {
if (auto sp = wp.lock()) {
{
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(state->instance_mem_tracker());
Status status =
_chunk_sources[chunk_source_index]->buffer_next_batch_chunks_blocking(_buffer_size, state);
if (!status.ok() && !status.is_end_of_file()) {
_set_scan_status(status);
}
_last_growth_cpu_time_ns += _chunk_sources[chunk_source_index]->last_spent_cpu_time_ns();
_last_scan_rows_num += _chunk_sources[chunk_source_index]->last_scan_rows_num();
_last_scan_bytes += _chunk_sources[chunk_source_index]->last_scan_bytes();
}
// Set driver_id here to share some driver-local contents.
// Currently it is used by ExprContext's driver-local state.
CurrentThread::current().set_pipeline_driver_id(driver_id);
DeferOp defer([]() { CurrentThread::current().set_pipeline_driver_id(0); });
_decrease_committed_scan_tasks();
_num_running_io_tasks--;
_is_io_task_running[chunk_source_index] = false;
SCOPED_THREAD_LOCAL_MEM_TRACKER_SETTER(state->instance_mem_tracker());
auto& chunk_source = _chunk_sources[chunk_source_index];
int64_t prev_cpu_time = chunk_source->get_cpu_time_spent();
int64_t prev_scan_rows = chunk_source->get_scan_rows();
int64_t prev_scan_bytes = chunk_source->get_scan_bytes();
Status status =
_chunk_sources[chunk_source_index]->buffer_next_batch_chunks_blocking(_buffer_size, state);
if (!status.ok() && !status.is_end_of_file()) {
_set_scan_status(status);
}
int64_t delta_cpu_time = chunk_source->get_cpu_time_spent() - prev_cpu_time;
_finish_chunk_source_task(state, chunk_source_index, delta_cpu_time,
chunk_source->get_scan_rows() - prev_scan_rows,
chunk_source->get_scan_bytes() - prev_scan_bytes);
}
};
// TODO(by satanson): set a proper priority
@ -293,6 +330,7 @@ Status ScanOperator::_trigger_next_scan(RuntimeState* state, int chunk_source_in
if (offer_task_success) {
_io_task_retry_cnt = 0;
} else {
_chunk_sources[chunk_source_index]->unpin_chunk_token();
_num_running_io_tasks--;
_is_io_task_running[chunk_source_index] = false;
// TODO(hcf) set a proper retry times
@ -315,6 +353,8 @@ Status ScanOperator::_pickup_morsel(RuntimeState* state, int chunk_source_index)
ASSIGN_OR_RETURN(auto morsel, _morsel_queue->try_get());
if (morsel != nullptr) {
COUNTER_UPDATE(_morsels_counter, 1);
_chunk_sources[chunk_source_index] = create_chunk_source(std::move(morsel), chunk_source_index);
auto status = _chunk_sources[chunk_source_index]->prepare(state);
if (!status.ok()) {
@ -341,26 +381,12 @@ void ScanOperator::_merge_chunk_source_profiles() {
_unique_metrics->copy_all_counters_from(merged_profile);
}
bool ScanOperator::_try_to_increase_committed_scan_tasks() {
int old_num = _num_committed_scan_tasks.fetch_add(1);
if (_exceed_max_scan_concurrency(old_num)) {
_decrease_committed_scan_tasks();
return false;
}
return true;
}
bool ScanOperator::_exceed_max_scan_concurrency(int num_committed_scan_tasks) const {
// _max_scan_concurrency takes effect, only when it is positive.
return _max_scan_concurrency > 0 && num_committed_scan_tasks >= _max_scan_concurrency;
}
// ========== ScanOperatorFactory ==========
ScanOperatorFactory::ScanOperatorFactory(int32_t id, ScanNode* scan_node)
ScanOperatorFactory::ScanOperatorFactory(int32_t id, ScanNode* scan_node, ChunkBufferLimiterPtr buffer_limiter)
: SourceOperatorFactory(id, scan_node->name(), scan_node->id()),
_scan_node(scan_node),
_max_scan_concurrency(scan_node->max_scan_concurrency()) {}
_buffer_limiter(std::move(buffer_limiter)) {}
Status ScanOperatorFactory::prepare(RuntimeState* state) {
RETURN_IF_ERROR(OperatorFactory::prepare(state));
@ -385,16 +411,16 @@ pipeline::OpFactories decompose_scan_node_to_pipeline(std::shared_ptr<ScanOperat
ScanNode* scan_node, pipeline::PipelineBuilderContext* context) {
OpFactories ops;
const auto* morsel_queue = context->morsel_queue_of_source_operator(scan_operator.get());
// ScanOperator's degree_of_parallelism is not more than the number of morsels
// If table is empty, then morsel size is zero and we still set degree of parallelism to 1
const auto degree_of_parallelism =
std::min<size_t>(std::max<size_t>(1, morsel_queue->num_morsels()), context->degree_of_parallelism());
scan_operator->set_degree_of_parallelism(degree_of_parallelism);
size_t scan_dop = context->degree_of_parallelism_of_source_operator(scan_operator.get());
scan_operator->set_degree_of_parallelism(scan_dop);
ops.emplace_back(std::move(scan_operator));
if (!scan_node->conjunct_ctxs().empty() || ops.back()->has_runtime_filters()) {
ops.emplace_back(
std::make_shared<ChunkAccumulateOperatorFactory>(context->next_operator_id(), scan_node->id()));
}
size_t limit = scan_node->limit();
if (limit != -1) {
ops.emplace_back(


@ -13,13 +13,20 @@ class ScanNode;
namespace pipeline {
class ChunkBufferLimiter;
using ChunkBufferLimiterPtr = std::unique_ptr<ChunkBufferLimiter>;
class ScanOperator : public SourceOperator {
public:
static constexpr int MAX_IO_TASKS_PER_OP = 4;
ScanOperator(OperatorFactory* factory, int32_t id, int32_t driver_sequence, ScanNode* scan_node,
int max_scan_concurrency, std::atomic<int>& num_committed_scan_tasks);
ChunkBufferLimiter* buffer_limiter);
~ScanOperator() override;
static size_t max_buffer_capacity() { return config::pipeline_io_buffer_size; }
Status prepare(RuntimeState* state) override;
// The running I/O task committed by ScanOperator holds the reference of query context,
@ -49,17 +56,8 @@ public:
virtual void do_close(RuntimeState* state) = 0;
virtual ChunkSourcePtr create_chunk_source(MorselPtr morsel, int32_t chunk_source_index) = 0;
virtual int64_t get_last_scan_rows_num() {
int64_t scan_rows_num = _last_scan_rows_num;
_last_scan_rows_num = 0;
return scan_rows_num;
}
virtual int64_t get_last_scan_bytes() {
int64_t res = _last_scan_bytes;
_last_scan_bytes = 0;
return res;
}
int64_t get_last_scan_rows_num() { return _last_scan_rows_num.exchange(0); }
int64_t get_last_scan_bytes() { return _last_scan_bytes.exchange(0); }
private:
// This method is only invoked when the current morsel has reached EOF.
@ -67,6 +65,8 @@ private:
Status _pickup_morsel(RuntimeState* state, int chunk_source_index);
Status _trigger_next_scan(RuntimeState* state, int chunk_source_index);
Status _try_to_trigger_next_scan(RuntimeState* state);
void _finish_chunk_source_task(RuntimeState* state, int chunk_source_index, int64_t cpu_time_ns, int64_t scan_rows,
int64_t scan_bytes);
void _merge_chunk_source_profiles();
inline void _set_scan_status(const Status& status) {
@ -81,10 +81,6 @@ private:
return _scan_status;
}
bool _try_to_increase_committed_scan_tasks();
void _decrease_committed_scan_tasks() { _num_committed_scan_tasks.fetch_sub(1); }
bool _exceed_max_scan_concurrency(int num_committed_scan_tasks) const;
protected:
ScanNode* _scan_node = nullptr;
// ScanOperator may do parallel scan, so each _chunk_sources[i] needs to hold
@ -95,14 +91,10 @@ protected:
std::vector<std::shared_ptr<RuntimeProfile>> _chunk_source_profiles;
bool _is_finished = false;
const int _max_scan_concurrency;
// Shared by all the ScanOperators created by the same ScanOperatorFactory.
std::atomic<int>& _num_committed_scan_tasks;
// Shared among scan operators decomposed from a scan node, and owned by ScanOperatorFactory.
ChunkBufferLimiter* _buffer_limiter;
private:
static constexpr int MAX_IO_TASKS_PER_OP = 4;
const size_t _buffer_size = config::pipeline_io_buffer_size;
int32_t _io_task_retry_cnt = 0;
@ -118,11 +110,20 @@ private:
workgroup::WorkGroupPtr _workgroup = nullptr;
std::atomic_int64_t _last_scan_rows_num = 0;
std::atomic_int64_t _last_scan_bytes = 0;
RuntimeProfile::Counter* _default_buffer_capacity_counter = nullptr;
RuntimeProfile::Counter* _buffer_capacity_counter = nullptr;
RuntimeProfile::HighWaterMarkCounter* _peak_buffer_size_counter = nullptr;
// The total number of the original tablets in this fragment instance.
RuntimeProfile::Counter* _tablets_counter = nullptr;
// The number of morsels picked up by this scan operator.
// A tablet may be divided into multiple morsels.
RuntimeProfile::Counter* _morsels_counter = nullptr;
};
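
The get_last_scan_rows_num()/get_last_scan_bytes() getters above replace a load-then-store pair with a single exchange(0): with concurrent I/O tasks incrementing the counters, an increment landing between the load and the store would be silently lost. A small sketch contrasting the two, with illustrative names:

#include <atomic>
#include <cstdint>

// Read-and-reset must be one atomic operation; exchange(0) returns the
// old value and zeroes the counter without losing concurrent increments.
struct ScanCounters {
    std::atomic<int64_t> rows{0};

    void add(int64_t n) { rows.fetch_add(n); }

    // Racy version: another thread can add() between the load and the
    // store, and that increment is silently dropped.
    int64_t take_racy() {
        int64_t v = rows.load();
        rows.store(0);
        return v;
    }

    // Atomic version, as in get_last_scan_rows_num() above.
    int64_t take() { return rows.exchange(0); }
};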
class ScanOperatorFactory : public SourceOperatorFactory {
public:
ScanOperatorFactory(int32_t id, ScanNode* scan_node);
ScanOperatorFactory(int32_t id, ScanNode* scan_node, ChunkBufferLimiterPtr buffer_limiter);
~ScanOperatorFactory() override = default;
@ -140,9 +141,7 @@ public:
protected:
ScanNode* const _scan_node;
const int _max_scan_concurrency;
std::atomic<int> _num_committed_scan_tasks{0};
ChunkBufferLimiterPtr _buffer_limiter;
};
pipeline::OpFactories decompose_scan_node_to_pipeline(std::shared_ptr<ScanOperatorFactory> factory, ScanNode* scan_node,


@ -12,6 +12,8 @@ Status SelectOperator::prepare(RuntimeState* state) {
}
void SelectOperator::close(RuntimeState* state) {
_curr_chunk.reset();
_pre_output_chunk.reset();
Operator::close(state);
}


@ -21,6 +21,7 @@ Status ExceptContext::prepare(RuntimeState* state, const std::vector<ExprContext
}
void ExceptContext::close(RuntimeState* state) {
_hash_set.reset();
if (_build_pool != nullptr) {
_build_pool->free_all();
}


@ -29,10 +29,12 @@ class ExceptContext final : public ContextWithDependency {
public:
explicit ExceptContext(const int dst_tuple_id) : _dst_tuple_id(dst_tuple_id) {}
bool is_ht_empty() const { return _hash_set->empty(); }
bool is_ht_empty() const { return _is_hash_set_empty; }
void finish_build_ht() {
_is_hash_set_empty = _hash_set->empty();
_next_processed_iter = _hash_set->begin();
_hash_set_end_iter = _hash_set->end();
_finished_dependency_index.fetch_add(1, std::memory_order_release);
}
@ -42,7 +44,7 @@ public:
return _finished_dependency_index.load(std::memory_order_acquire) == dependency_index;
}
bool is_output_finished() const { return _next_processed_iter == _hash_set->end(); }
bool is_output_finished() const { return _next_processed_iter == _hash_set_end_iter; }
// Called in the preparation phase of ExceptBuildSinkOperator.
Status prepare(RuntimeState* state, const std::vector<ExprContext*>& build_exprs);
@ -77,6 +79,8 @@ private:
// Used to traverse the hash set and emit the remaining (undeleted) keys to the dest chunk.
// Initialized when the hash set finishes building in finish_build_ht().
vectorized::ExceptHashSerializeSet::Iterator _next_processed_iter;
vectorized::ExceptHashSerializeSet::Iterator _hash_set_end_iter;
bool _is_hash_set_empty = false;
// The BUILD, PROBES, and OUTPUT operators execute sequentially.
// BUILD -> 1-th PROBE -> 2-th PROBE -> ... -> n-th PROBE -> OUTPUT.
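
finish_build_ht() now snapshots the hash set's emptiness and end iterator, so is_ht_empty() and is_output_finished() never call into the hash set itself, which close() may reset on another path (IntersectContext below gets the same treatment). A rough sketch of the snapshot, using std::unordered_set in place of ExceptHashSerializeSet:

#include <string>
#include <unordered_set>

// The output phase compares against a cached end iterator and emptiness
// flag instead of calling methods on the hash set, so resetting the set
// elsewhere cannot race with is_output_finished().
struct BuildOutputContext {
    std::unordered_set<std::string> hash_set;
    std::unordered_set<std::string>::iterator next_iter;
    std::unordered_set<std::string>::iterator end_iter;
    bool is_empty = false;

    void finish_build() {
        is_empty = hash_set.empty();
        next_iter = hash_set.begin();
        end_iter = hash_set.end();  // cached once, never re-fetched
    }

    bool ht_empty() const { return is_empty; }
    bool output_finished() const { return next_iter == end_iter; }
};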


@ -20,6 +20,7 @@ Status IntersectContext::prepare(RuntimeState* state, const std::vector<ExprCont
}
void IntersectContext::close(RuntimeState* state) {
_hash_set.reset();
if (_build_pool != nullptr) {
_build_pool->free_all();
}


@ -31,10 +31,12 @@ public:
IntersectContext(const int dst_tuple_id, const size_t intersect_times)
: _dst_tuple_id(dst_tuple_id), _intersect_times(intersect_times) {}
bool is_ht_empty() const { return _hash_set->empty(); }
bool is_ht_empty() const { return _is_hash_set_empty; }
void finish_build_ht() {
_is_hash_set_empty = _hash_set->empty();
_next_processed_iter = _hash_set->begin();
_hash_set_end_iter = _hash_set->end();
_finished_dependency_index.fetch_add(1, std::memory_order_release);
}
@ -44,7 +46,7 @@ public:
return _finished_dependency_index.load(std::memory_order_acquire) == dependency_index;
}
bool is_output_finished() const { return _next_processed_iter == _hash_set->end(); }
bool is_output_finished() const { return _next_processed_iter == _hash_set_end_iter; }
// Called in the preparation phase of IntersectBuildSinkOperator.
Status prepare(RuntimeState* state, const std::vector<ExprContext*>& build_exprs);
@ -80,6 +82,8 @@ private:
// Used to traverse the hash set and emit the remaining (undeleted) keys to the dest chunk.
// Initialized when the hash set finishes building in finish_build_ht().
vectorized::IntersectHashSerializeSet::Iterator _next_processed_iter;
vectorized::IntersectHashSerializeSet::Iterator _hash_set_end_iter;
bool _is_hash_set_empty = false;
// The BUILD, PROBES, and OUTPUT operators execute sequentially.
// BUILD -> 1-th PROBE -> 2-th PROBE -> ... -> n-th PROBE -> OUTPUT.


@ -25,6 +25,7 @@ Status PartitionSortSinkOperator::prepare(RuntimeState* state) {
void PartitionSortSinkOperator::close(RuntimeState* state) {
_sort_context->unref(state);
_chunks_sorter.reset();
Operator::close(state);
}


@ -13,6 +13,11 @@ using vectorized::Columns;
using vectorized::SortedRun;
using vectorized::SortedRuns;
void SortContext::close(RuntimeState* state) {
_chunks_sorter_partions.clear();
_merged_runs.clear();
}
StatusOr<ChunkPtr> SortContext::pull_chunk() {
if (!_is_merge_finish) {
_merge_inputs();
@ -29,7 +34,9 @@ StatusOr<ChunkPtr> SortContext::pull_chunk() {
SortedRun& run = _merged_runs.front();
ChunkPtr res = run.steal_chunk(required_rows);
RETURN_IF_ERROR(res->downgrade());
if (res != nullptr) {
RETURN_IF_ERROR(res->downgrade());
}
if (run.empty()) {
_merged_runs.pop_front();


@ -36,7 +36,7 @@ public:
_chunks_sorter_partions.reserve(num_right_sinkers);
}
void close(RuntimeState* state) override {}
void close(RuntimeState* state) override;
void add_partition_chunks_sorter(std::shared_ptr<ChunksSorter> chunks_sorter) {
_chunks_sorter_partions.push_back(chunks_sorter);


@ -92,9 +92,6 @@ public:
const std::string& name() const { return _name; }
// Used by pipeline, 0 means there is no limitation.
virtual int max_scan_concurrency() const { return 0; }
protected:
RuntimeProfile::Counter* _bytes_read_counter; // # bytes read from the scanner
// # rows/tuples read from the scanner (including those discarded by eval_conjucts())


@ -133,7 +133,7 @@ Status NodeChannel::init(RuntimeState* state) {
return Status::OK();
}
void NodeChannel::open() {
void NodeChannel::try_open() {
PTabletWriterOpenRequest request;
request.set_allocated_id(&_parent->_load_id);
request.set_index_id(_index_id);
@ -179,6 +179,15 @@ void NodeChannel::open() {
request.release_schema();
}
bool NodeChannel::is_open_done() {
if (_open_closure != nullptr) {
// open request already finished
return (_open_closure->count() != 2);
}
return true;
}
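
is_open_done() reads the closure's reference count: by convention one reference belongs to the caller and one to the in-flight RPC, so count() == 2 means the open request is still outstanding. A minimal sketch of that convention; RpcClosure here is hypothetical, not the actual ReusableClosure API:

#include <atomic>

// While the RPC is outstanding count() == 2 (caller ref + RPC ref);
// the completion callback drops it back to 1.
class RpcClosure {
public:
    void start() { _refs.store(2); }          // caller ref + RPC ref
    void on_rpc_done() { _refs.fetch_sub(1); }
    int count() const { return _refs.load(); }
    bool done() const { return count() != 2; }

private:
    std::atomic<int> _refs{1};
};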
Status NodeChannel::open_wait() {
_open_closure->join();
if (_open_closure->cntl.Failed()) {
@ -207,7 +216,11 @@ Status NodeChannel::_serialize_chunk(const vectorized::Chunk* src, ChunkPB* dst)
{
SCOPED_RAW_TIMER(&_serialize_batch_ns);
StatusOr<ChunkPB> res = serde::ProtobufChunkSerde::serialize(*src);
if (!res.ok()) return res.status();
if (!res.ok()) {
_cancelled = true;
_err_st = res.status();
return _err_st;
}
res->Swap(dst);
}
DCHECK(dst->has_uncompressed_size());
@ -216,12 +229,14 @@ Status NodeChannel::_serialize_chunk(const vectorized::Chunk* src, ChunkPB* dst)
size_t uncompressed_size = dst->uncompressed_size();
if (_compress_codec != nullptr && _compress_codec->exceed_max_input_size(uncompressed_size)) {
return Status::InternalError(fmt::format("The input size for compression should be less than {}",
_compress_codec->max_input_size()));
_cancelled = true;
_err_st = Status::InternalError(fmt::format("The input size for compression should be less than {}",
_compress_codec->max_input_size()));
return _err_st;
}
// try compress the ChunkPB data
if (_compress_codec != nullptr && uncompressed_size > 0) {
if (config::table_sink_compression_enable && _compress_codec != nullptr && uncompressed_size > 0) {
SCOPED_TIMER(_parent->_compress_timer);
// Try compressing data to _compression_scratch, swap if compressed data is smaller
@ -246,6 +261,15 @@ Status NodeChannel::_serialize_chunk(const vectorized::Chunk* src, ChunkPB* dst)
return Status::OK();
}
bool NodeChannel::is_full() {
if (_chunk_queue.size() >= _max_chunk_queue_size || _mem_tracker->limit()) {
if (!_check_prev_request_done()) {
return true;
}
}
return false;
}
Status NodeChannel::add_chunk(vectorized::Chunk* input, const int64_t* tablet_ids, const uint32_t* indexes,
uint32_t from, uint32_t size, bool eos) {
if (_cancelled || _send_finished) {
@ -258,6 +282,12 @@ Status NodeChannel::add_chunk(vectorized::Chunk* input, const int64_t* tablet_id
_cur_chunk = input->clone_empty_with_slot();
}
if (is_full()) {
// Wait until a previous request is done, so we can pop data from the queue to send a new request
// and make room to push more data.
RETURN_IF_ERROR(_wait_one_prev_request());
}
// 1. append data
_cur_chunk->append_selective(*input, indexes, from, size);
for (size_t i = 0; i < size; ++i) {
@ -280,13 +310,8 @@ Status NodeChannel::add_chunk(vectorized::Chunk* input, const int64_t* tablet_id
// 4. check last request
if (!_check_prev_request_done()) {
if (_chunk_queue.size() > _max_chunk_queue_size || _mem_tracker->limit()) {
// 4.1 wait if queue full
RETURN_IF_ERROR(_wait_one_prev_request());
} else {
// 4.2 noblock here so that channel cant send data
return Status::OK();
}
// 4.1 Do not block here, so that other node channels can send data.
return Status::OK();
}
} else {
@ -418,6 +443,20 @@ bool NodeChannel::_check_prev_request_done() {
return false;
}
bool NodeChannel::_check_all_prev_request_done() {
if (UNLIKELY(_next_packet_seq == 0)) {
return true;
}
for (size_t i = 0; i < _max_parallel_request_size; i++) {
if (_add_batch_closures[i]->count() != 1) {
return false;
}
}
return true;
}
Status NodeChannel::_wait_one_prev_request() {
SCOPED_TIMER(_parent->_wait_response_timer);
if (_next_packet_seq == 0) {
@ -448,6 +487,27 @@ Status NodeChannel::_wait_one_prev_request() {
return Status::OK();
}
Status NodeChannel::try_close() {
if (_cancelled || _send_finished) {
return _err_st;
}
if (_check_prev_request_done()) {
auto st = add_chunk(nullptr, nullptr, nullptr, 0, 0, true);
if (!st.ok()) {
_cancelled = true;
_err_st = st;
return _err_st;
}
}
return Status::OK();
}
bool NodeChannel::is_close_done() {
return (_send_finished && _check_all_prev_request_done()) || _cancelled;
}
Status NodeChannel::close_wait(RuntimeState* state) {
if (_cancelled) {
return _err_st;
@ -470,8 +530,11 @@ Status NodeChannel::close_wait(RuntimeState* state) {
}
void NodeChannel::cancel(const Status& err_st) {
// We don't need to wait for the last RPC to finish, because the closure's release/reset will join.
// But do we need brpc::StartCancel(call_id)?
// Cancel in-flight RPC requests to accelerate the release of related resources.
for (auto closure : _add_batch_closures) {
closure->cancel();
}
_cancelled = true;
_err_st = err_st;
@ -577,14 +640,28 @@ Status OlapTableSink::init(const TDataSink& t_sink) {
}
Status OlapTableSink::prepare(RuntimeState* state) {
// profile must add to state's object pool
_profile = state->obj_pool()->add(new RuntimeProfile("OlapTableSink"));
// add all counter
_input_rows_counter = ADD_COUNTER(_profile, "RowsRead", TUnit::UNIT);
_output_rows_counter = ADD_COUNTER(_profile, "RowsReturned", TUnit::UNIT);
_filtered_rows_counter = ADD_COUNTER(_profile, "RowsFiltered", TUnit::UNIT);
_send_data_timer = ADD_TIMER(_profile, "SendDataTime");
_convert_chunk_timer = ADD_TIMER(_profile, "ConvertChunkTime");
_validate_data_timer = ADD_TIMER(_profile, "ValidateDataTime");
_open_timer = ADD_TIMER(_profile, "OpenTime");
_close_timer = ADD_TIMER(_profile, "CloseWaitTime");
_serialize_chunk_timer = ADD_TIMER(_profile, "SerializeChunkTime");
_wait_response_timer = ADD_TIMER(_profile, "WaitResponseTime");
_compress_timer = ADD_TIMER(_profile, "CompressTime");
_pack_chunk_timer = ADD_TIMER(_profile, "PackChunkTime");
RETURN_IF_ERROR(DataSink::prepare(state));
_sender_id = state->per_fragment_instance_idx();
_num_senders = state->num_per_fragment_instances();
// profile must add to state's object pool
_profile = state->obj_pool()->add(new RuntimeProfile("OlapTableSink"));
SCOPED_TIMER(_profile->total_time_counter());
// Prepare the exprs to run.
@ -643,22 +720,6 @@ Status OlapTableSink::prepare(RuntimeState* state) {
}
}
// add all counter
_input_rows_counter = ADD_COUNTER(_profile, "RowsRead", TUnit::UNIT);
_output_rows_counter = ADD_COUNTER(_profile, "RowsReturned", TUnit::UNIT);
_filtered_rows_counter = ADD_COUNTER(_profile, "RowsFiltered", TUnit::UNIT);
_send_data_timer = ADD_TIMER(_profile, "SendDataTime");
_convert_chunk_timer = ADD_TIMER(_profile, "ConvertChunkTime");
_validate_data_timer = ADD_TIMER(_profile, "ValidateDataTime");
_open_timer = ADD_TIMER(_profile, "OpenTime");
_close_timer = ADD_TIMER(_profile, "CloseWaitTime");
_serialize_chunk_timer = ADD_TIMER(_profile, "SerializeChunkTime");
_wait_response_timer = ADD_TIMER(_profile, "WaitResponseTime");
_compress_timer = ADD_TIMER(_profile, "CompressTime");
_append_attachment_timer = ADD_TIMER(_profile, "AppendAttachmentTime");
_mark_tablet_timer = ADD_TIMER(_profile, "MarkTabletTime");
_pack_chunk_timer = ADD_TIMER(_profile, "PackChunkTime");
_load_mem_limit = state->get_load_mem_limit();
// open all channels
@ -686,13 +747,33 @@ Status OlapTableSink::prepare(RuntimeState* state) {
Status OlapTableSink::open(RuntimeState* state) {
SCOPED_TIMER(_profile->total_time_counter());
SCOPED_TIMER(_open_timer);
RETURN_IF_ERROR(try_open(state));
RETURN_IF_ERROR(open_wait());
return Status::OK();
}
Status OlapTableSink::try_open(RuntimeState* state) {
// Prepare the exprs to run.
RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state));
for (auto& index_channel : _channels) {
index_channel->for_each_node_channel([](NodeChannel* ch) { ch->open(); });
index_channel->for_each_node_channel([](NodeChannel* ch) { ch->try_open(); });
}
return Status::OK();
}
bool OlapTableSink::is_open_done() {
bool open_done = true;
for (auto& index_channel : _channels) {
index_channel->for_each_node_channel([&open_done](NodeChannel* ch) { open_done &= ch->is_open_done(); });
}
return open_done;
}
Status OlapTableSink::open_wait() {
Status err_st = Status::OK();
for (auto& index_channel : _channels) {
index_channel->for_each_node_channel([&index_channel, &err_st](NodeChannel* ch) {
@ -831,6 +912,15 @@ Status OlapTableSink::send_chunk(RuntimeState* state, vectorized::Chunk* chunk)
return Status::OK();
}
bool OlapTableSink::is_full() {
bool full = false;
for (auto& index_channel : _channels) {
index_channel->for_each_node_channel([&full](NodeChannel* ch) { full |= ch->is_full(); });
}
return full;
}
Status OlapTableSink::_send_chunk_by_node(vectorized::Chunk* chunk, IndexChannel* channel,
std::vector<uint16_t>& selection_idx) {
Status err_st = Status::OK();
@ -862,7 +952,52 @@ Status OlapTableSink::_send_chunk_by_node(vectorized::Chunk* chunk, IndexChannel
return Status::OK();
}
Status OlapTableSink::try_close(RuntimeState* state) {
Status err_st = Status::OK();
bool intolerable_failure = false;
for (auto& index_channel : _channels) {
index_channel->for_each_node_channel([&index_channel, &err_st, &intolerable_failure](NodeChannel* ch) {
auto st = ch->try_close();
if (!st.ok()) {
LOG(WARNING) << "close channel failed. channel_name=" << ch->name()
<< ", load_info=" << ch->print_load_info() << ", error_msg=" << st.get_error_msg();
err_st = st;
index_channel->mark_as_failed(ch);
}
if (index_channel->has_intolerable_failure()) {
intolerable_failure = true;
}
});
}
if (intolerable_failure) {
return err_st;
} else {
return Status::OK();
}
}
bool OlapTableSink::is_close_done() {
bool close_done = true;
for (auto& index_channel : _channels) {
index_channel->for_each_node_channel([&close_done](NodeChannel* ch) { close_done &= ch->is_close_done(); });
}
return close_done;
}
Status OlapTableSink::close(RuntimeState* state, Status close_status) {
if (close_status.ok()) {
do {
close_status = try_close(state);
if (!close_status.ok()) break;
SleepFor(MonoDelta::FromMilliseconds(5));
} while (!is_close_done());
}
return close_wait(state, close_status);
}
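
The synchronous close() above is now a thin polling loop over the async primitives: nudge try_close(), sleep 5 ms, and stop once is_close_done() reports every in-flight request finished. A self-contained sketch of that shape, with a stubbed Channel standing in for the index/node channels:

#include <chrono>
#include <thread>

// Stub channel: try_close() makes progress on pending requests and
// reports intolerable failure; is_close_done() checks completion.
struct Channel {
    bool try_close_ok = true;
    int pending = 3;

    bool try_close() {
        if (pending > 0) --pending;
        return try_close_ok;
    }
    bool is_close_done() const { return pending == 0; }
};

// Blocking close built from the non-blocking pieces, as in
// OlapTableSink::close() above.
bool close_blocking(Channel& ch) {
    do {
        if (!ch.try_close()) return false;
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
    } while (!ch.is_close_done());
    return true;
}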
Status OlapTableSink::close_wait(RuntimeState* state, Status close_status) {
Status status = close_status;
if (status.ok()) {
// Only if the status is OK can we call _profile->total_time_counter().
@ -983,7 +1118,6 @@ void _print_decimalv3_error_msg(RuntimeState* state, const CppType& decimal, con
if (state->has_reached_max_error_msg_num()) {
return;
}
std::stringstream ss;
auto decimal_str = DecimalV3Cast::to_string<CppType>(decimal, desc->type().precision, desc->type().scale);
std::string error_msg = strings::Substitute("Decimal '$0' is out of range. The type of '$1' is $2'", decimal_str,
desc->col_name(), desc->type().debug_string());
@ -1004,8 +1138,8 @@ void OlapTableSink::_validate_decimal(RuntimeState* state, vectorized::Column* c
auto* data = &data_column->get_data().front();
int precision = desc->type().precision;
const auto max_decimal = get_scale_factor<CppType>(precision);
const auto min_decimal = -max_decimal;
const auto max_decimal = get_max_decimal<CppType>(precision);
const auto min_decimal = get_min_decimal<CppType>(precision);
for (auto i = 0; i < num_rows; ++i) {
if ((*validate_selection)[i] == VALID_SEL_OK) {

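The decimal-validation fix replaces the scale factor 10^p with the true bounds of a p-digit decimal, ±(10^p - 1): the old maximum was itself out of range for precision p. A sketch of the arithmetic, assuming (as the names suggest) that get_max_decimal/get_min_decimal return those bounds for an integer-backed decimal:

#include <cassert>
#include <cstdint>

// For DECIMAL(p, s) stored as an integer, the largest legal unscaled
// value has p nines. The scale factor 10^p is already out of range,
// which is the bug the diff fixes.
constexpr int64_t scale_factor(int precision) {
    int64_t v = 1;
    for (int i = 0; i < precision; ++i) v *= 10;
    return v;
}

constexpr int64_t max_decimal(int precision) { return scale_factor(precision) - 1; }
constexpr int64_t min_decimal(int precision) { return -max_decimal(precision); }

int main() {
    static_assert(max_decimal(3) == 999, "3-digit max");
    static_assert(min_decimal(3) == -999, "3-digit min");
    assert(scale_factor(3) == 1000);  // 4 digits: must be rejected
}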

@ -81,7 +81,7 @@ template <typename T>
class ReusableClosure : public google::protobuf::Closure {
public:
ReusableClosure() : cid(INVALID_BTHREAD_ID), _refs(0) {}
~ReusableClosure() { join(); }
~ReusableClosure() {}
int count() { return _refs.load(); }
@ -106,6 +106,12 @@ public:
}
}
void cancel() {
if (cid != INVALID_BTHREAD_ID) {
brpc::StartCancel(cid);
}
}
void reset() {
cntl.Reset();
cid = cntl.call_id();
@ -129,13 +135,24 @@ public:
Status init(RuntimeState* state);
// we use open/open_wait to parallel
void open();
// Async open interface: try_open() -> [is_open_done()] -> open_wait()
// If is_open_done() returns true, open_wait() will not block;
// otherwise open_wait() will block.
void try_open();
bool is_open_done();
Status open_wait();
// Async add-chunk interface.
// If is_full() returns false, add_chunk() will not block.
Status add_chunk(vectorized::Chunk* chunk, const int64_t* tablet_ids, const uint32_t* indexes, uint32_t from,
uint32_t size, bool eos);
bool is_full();
// Async close interface: try_close() -> [is_close_done()] -> close_wait()
// If is_close_done() returns true, close_wait() will not block;
// otherwise close_wait() will block.
Status try_close();
bool is_close_done();
Status close_wait(RuntimeState* state);
void cancel(const Status& err_st);
@ -163,6 +180,7 @@ private:
Status _wait_all_prev_request();
Status _wait_one_prev_request();
bool _check_prev_request_done();
bool _check_all_prev_request_done();
Status _serialize_chunk(const vectorized::Chunk* src, ChunkPB* dst);
std::unique_ptr<MemTracker> _mem_tracker = nullptr;
@ -266,11 +284,34 @@ public:
Status prepare(RuntimeState* state) override;
// sync open interface
Status open(RuntimeState* state) override;
// Async open interface: try_open() -> [is_open_done()] -> open_wait()
// If is_open_done() returns true, open_wait() will not block;
// otherwise open_wait() will block.
Status try_open(RuntimeState* state);
bool is_open_done();
Status open_wait();
// Async add-chunk interface.
// If is_full() returns false, add_chunk() will not block.
Status send_chunk(RuntimeState* state, vectorized::Chunk* chunk) override;
// close() will send RPCs too. If RPCs failed, return error.
bool is_full();
// Async close interface: try_close() -> [is_close_done()] -> close_wait()
// If is_close_done() returns true, close_wait() will not block;
// otherwise close_wait() will block.
Status try_close(RuntimeState* state);
bool is_close_done();
Status close_wait(RuntimeState* state, Status close_status);
// sync close() interface
Status close(RuntimeState* state, Status close_status) override;
// Returns the runtime profile for the sink.


@ -312,6 +312,7 @@ struct AggHashMapWithOneNullableNumberKey {
template <typename HashMap>
struct AggHashMapWithOneStringKey {
using KeyType = typename HashMap::key_type;
using Iterator = typename HashMap::iterator;
using ResultVector = typename std::vector<Slice>;
HashMap hash_map;
@ -400,6 +401,7 @@ struct AggHashMapWithOneStringKey {
template <typename HashMap>
struct AggHashMapWithOneNullableStringKey {
using KeyType = typename HashMap::key_type;
using Iterator = typename HashMap::iterator;
using ResultVector = typename std::vector<Slice>;
HashMap hash_map;
@ -527,6 +529,7 @@ struct AggHashMapWithOneNullableStringKey {
template <typename HashMap>
struct AggHashMapWithSerializedKey {
using KeyType = typename HashMap::key_type;
using Iterator = typename HashMap::iterator;
using ResultVector = typename std::vector<Slice>;
HashMap hash_map;
@ -643,6 +646,7 @@ struct AggHashMapWithSerializedKey {
template <typename HashMap>
struct AggHashMapWithSerializedKeyFixedSize {
using KeyType = typename HashMap::key_type;
using Iterator = typename HashMap::iterator;
using FixedSizeSliceKey = typename HashMap::key_type;
using ResultVector = typename std::vector<FixedSizeSliceKey>;


@ -402,14 +402,29 @@ struct AggHashMapVariant {
return 0;
}
size_t memory_usage() const {
size_t reserved_memory_usage(const MemPool* pool) const {
switch (type) {
#define M(NAME) \
case Type::NAME: \
return NAME->hash_map.dump_bound();
return NAME->hash_map.dump_bound() + pool->total_reserved_bytes();
APPLY_FOR_AGG_VARIANT_ALL(M)
#undef M
}
return 0;
}
size_t allocated_memory_usage(const MemPool* pool) const {
switch (type) {
#define M(NAME) \
case Type::NAME: \
return sizeof(decltype(NAME)::element_type::KeyType) * NAME->hash_map.size() + pool->total_allocated_bytes();
APPLY_FOR_AGG_VARIANT_ALL(M)
#undef M
}
return 0;
}
};
@ -676,16 +691,29 @@ struct AggHashSetVariant {
return 0;
}
size_t memory_usage() const {
size_t reserved_memory_usage(const MemPool* pool) const {
switch (type) {
#define M(NAME) \
case Type::NAME: \
return NAME->hash_set.dump_bound();
return NAME->hash_set.dump_bound() + pool->total_reserved_bytes();
APPLY_FOR_AGG_VARIANT_ALL(M)
#undef M
}
return 0;
}
size_t allocated_memory_usage(const MemPool* pool) const {
switch (type) {
#define M(NAME) \
case Type::NAME: \
return sizeof(decltype(NAME)::element_type::KeyType) * NAME->hash_set.size() + pool->total_allocated_bytes();
APPLY_FOR_AGG_VARIANT_ALL(M)
#undef M
}
return 0;
}
};
} // namespace starrocks::vectorized
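
Splitting memory_usage() into reserved_memory_usage() and allocated_memory_usage() separates the upper bound held from the allocator (hash-table capacity plus the pool's reserved chunks) from what the live entries actually consume. A sketch of the two views, with stand-in types for the hash table and MemPool:

#include <cstddef>

// Illustrative stand-ins; these are not the StarRocks types.
struct PoolStats {
    size_t total_reserved_bytes;
    size_t total_allocated_bytes;
};

struct HashTableStats {
    size_t capacity_bytes;  // dump_bound(): grows with capacity
    size_t size;            // number of live entries
    size_t key_bytes;       // sizeof(KeyType)
};

// Upper bound actually held from the allocator.
size_t reserved_memory_usage(const HashTableStats& ht, const PoolStats& pool) {
    return ht.capacity_bytes + pool.total_reserved_bytes;
}

// What the current entries really use.
size_t allocated_memory_usage(const HashTableStats& ht, const PoolStats& pool) {
    return ht.key_bytes * ht.size + pool.total_allocated_bytes;
}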


@ -4,6 +4,7 @@
#include "exec/pipeline/aggregate/aggregate_blocking_sink_operator.h"
#include "exec/pipeline/aggregate/aggregate_blocking_source_operator.h"
#include "exec/pipeline/chunk_accumulate_operator.h"
#include "exec/pipeline/exchange/exchange_source_operator.h"
#include "exec/pipeline/limit_operator.h"
#include "exec/pipeline/operator.h"
@ -52,7 +53,7 @@ Status AggregateBlockingNode::open(RuntimeState* state) {
DCHECK_LE(chunk->num_rows(), runtime_state()->chunk_size());
_aggregator->evaluate_exprs(chunk.get());
RETURN_IF_ERROR(_aggregator->evaluate_exprs(chunk.get()));
size_t chunk_size = chunk->num_rows();
{
@ -68,8 +69,7 @@ Status AggregateBlockingNode::open(RuntimeState* state) {
APPLY_FOR_AGG_VARIANT_ALL(HASH_MAP_METHOD)
#undef HASH_MAP_METHOD
_mem_tracker->set(_aggregator->hash_map_variant().memory_usage() +
_aggregator->mem_pool()->total_reserved_bytes());
_mem_tracker->set(_aggregator->hash_map_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_map());
}
if (_aggregator->is_none_group_by_exprs()) {
@ -119,7 +119,7 @@ Status AggregateBlockingNode::open(RuntimeState* state) {
COUNTER_SET(_aggregator->input_row_count(), _aggregator->num_input_rows());
_mem_tracker->set(_aggregator->hash_map_variant().memory_usage() + _aggregator->mem_pool()->total_reserved_bytes());
_mem_tracker->set(_aggregator->hash_map_variant().reserved_memory_usage(_aggregator->mem_pool()));
return Status::OK();
}
@ -170,7 +170,8 @@ Status AggregateBlockingNode::get_next(RuntimeState* state, ChunkPtr* chunk, boo
std::vector<std::shared_ptr<pipeline::OperatorFactory> > AggregateBlockingNode::decompose_to_pipeline(
pipeline::PipelineBuilderContext* context) {
using namespace pipeline;
OpFactories operators_with_sink = _children[0]->decompose_to_pipeline(context);
OpFactories ops_with_sink = _children[0]->decompose_to_pipeline(context);
auto& agg_node = _tnode.agg_node;
if (agg_node.need_finalize) {
// If it is a finalizing aggregation with a GROUP BY clause, it can be parallelized
@ -182,7 +183,7 @@ std::vector<std::shared_ptr<pipeline::OperatorFactory> > AggregateBlockingNode::
// 2. Otherwise, add LocalExchangeOperator
// to shuffle multiple streams into #degree_of_parallelism# streams, each of which pipes into AggregateBlockingSinkOperator.
bool need_local_shuffle = true;
if (auto* exchange_op = dynamic_cast<ExchangeSourceOperatorFactory*>(operators_with_sink[0].get());
if (auto* exchange_op = dynamic_cast<ExchangeSourceOperatorFactory*>(ops_with_sink[0].get());
exchange_op != nullptr) {
auto& texchange_node = exchange_op->texchange_node();
DCHECK(texchange_node.__isset.partition_type);
@ -194,18 +195,16 @@ std::vector<std::shared_ptr<pipeline::OperatorFactory> > AggregateBlockingNode::
if (need_local_shuffle) {
std::vector<ExprContext*> group_by_expr_ctxs;
Expr::create_expr_trees(_pool, _tnode.agg_node.grouping_exprs, &group_by_expr_ctxs);
operators_with_sink = context->maybe_interpolate_local_shuffle_exchange(
runtime_state(), operators_with_sink, group_by_expr_ctxs);
ops_with_sink = context->maybe_interpolate_local_shuffle_exchange(runtime_state(), ops_with_sink,
group_by_expr_ctxs);
}
} else {
operators_with_sink =
context->maybe_interpolate_local_passthrough_exchange(runtime_state(), operators_with_sink);
ops_with_sink = context->maybe_interpolate_local_passthrough_exchange(runtime_state(), ops_with_sink);
}
}
// We cannot get the degree of parallelism from PipelineBuilderContext, which is only a suggested value,
// and we may set a different parallelism for the source operator in many special cases.
size_t degree_of_parallelism =
down_cast<SourceOperatorFactory*>(operators_with_sink[0].get())->degree_of_parallelism();
size_t degree_of_parallelism = down_cast<SourceOperatorFactory*>(ops_with_sink[0].get())->degree_of_parallelism();
// shared by sink operator and source operator
AggregatorFactoryPtr aggregator_factory = std::make_shared<AggregatorFactory>(_tnode);
@ -216,23 +215,29 @@ std::vector<std::shared_ptr<pipeline::OperatorFactory> > AggregateBlockingNode::
aggregator_factory);
// Initialize OperatorFactory's fields involving runtime filters.
this->init_runtime_filter_for_operator(sink_operator.get(), context, rc_rf_probe_collector);
- operators_with_sink.push_back(std::move(sink_operator));
- context->add_pipeline(operators_with_sink);
+ ops_with_sink.push_back(std::move(sink_operator));
+ context->add_pipeline(ops_with_sink);
- OpFactories operators_with_source;
+ OpFactories ops_with_source;
auto source_operator = std::make_shared<AggregateBlockingSourceOperatorFactory>(context->next_operator_id(), id(),
aggregator_factory);
// Initialize OperatorFactory's fields involving runtime filters.
this->init_runtime_filter_for_operator(source_operator.get(), context, rc_rf_probe_collector);
// Aggregator must be used by a pair of sink and source operators,
- // so operators_with_source's degree of parallelism must be equal with operators_with_sink's
+ // so ops_with_source's degree of parallelism must be equal with ops_with_sink's
source_operator->set_degree_of_parallelism(degree_of_parallelism);
- operators_with_source.push_back(std::move(source_operator));
+ ops_with_source.push_back(std::move(source_operator));
+ if (!_tnode.conjuncts.empty() || ops_with_source.back()->has_runtime_filters()) {
+ ops_with_source.emplace_back(
+ std::make_shared<ChunkAccumulateOperatorFactory>(context->next_operator_id(), id()));
+ }
if (limit() != -1) {
- operators_with_source.emplace_back(
+ ops_with_source.emplace_back(
std::make_shared<LimitOperatorFactory>(context->next_operator_id(), id(), limit()));
}
- return operators_with_source;
+ return ops_with_source;
}
} // namespace starrocks::vectorized
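Note: the recurring change in this file swaps the hand-written sum `hash_map_variant().memory_usage() + mem_pool()->total_reserved_bytes()` for a single `reserved_memory_usage(mem_pool())` call. The helper's body is not part of this diff, so the sketch below is only a plausible reading, with stand-in types: folding the same two terms into one place so the call sites cannot drift apart as they are edited.

```cpp
#include <cstddef>

// Stand-ins with only the members this sketch touches; everything here is an
// assumption about the real API, which this diff does not show.
struct MemPool {
    size_t reserved = 0;
    size_t total_reserved_bytes() const { return reserved; }
};

struct HashMapVariantSketch {
    size_t table_bytes = 0;
    size_t memory_usage() const { return table_bytes; }

    // Plausible body: the hash table's own footprint plus every byte reserved
    // in the pool that backs serialized keys and aggregate states.
    size_t reserved_memory_usage(const MemPool* pool) const {
        return memory_usage() + pool->total_reserved_bytes();
    }
};
```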


@@ -58,7 +58,7 @@ Status AggregateStreamingNode::get_next(RuntimeState* state, ChunkPtr* chunk, bo
size_t input_chunk_size = input_chunk->num_rows();
_aggregator->update_num_input_rows(input_chunk_size);
COUNTER_SET(_aggregator->input_row_count(), _aggregator->num_input_rows());
- _aggregator->evaluate_exprs(input_chunk.get());
+ RETURN_IF_ERROR(_aggregator->evaluate_exprs(input_chunk.get()));
if (_aggregator->streaming_preaggregation_mode() == TStreamingPreaggregationMode::FORCE_STREAMING) {
// force execute streaming
@@ -88,8 +88,7 @@ Status AggregateStreamingNode::get_next(RuntimeState* state, ChunkPtr* chunk, bo
_aggregator->compute_batch_agg_states(input_chunk_size);
}
- _mem_tracker->set(_aggregator->hash_map_variant().memory_usage() +
- _aggregator->mem_pool()->total_reserved_bytes());
+ _mem_tracker->set(_aggregator->hash_map_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_map());
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_map_variant().size());
@@ -136,8 +135,7 @@ Status AggregateStreamingNode::get_next(RuntimeState* state, ChunkPtr* chunk, bo
_aggregator->compute_batch_agg_states(input_chunk_size);
}
- _mem_tracker->set(_aggregator->hash_map_variant().memory_usage() +
- _aggregator->mem_pool()->total_reserved_bytes());
+ _mem_tracker->set(_aggregator->hash_map_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_map());
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_map_variant().size());
continue;
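Note: the other edit repeated across these files wraps `_aggregator->evaluate_exprs(...)` in `RETURN_IF_ERROR`, so an expression-evaluation failure now aborts the call chain instead of being silently ignored. The StarRocks definition of the macro is not shown in this diff; such status-propagation macros commonly look like the hedged sketch below.

```cpp
#include <string>

// Minimal stand-in; the real starrocks::Status carries codes and messages.
struct Status {
    std::string msg;
    bool ok() const { return msg.empty(); }
    static Status OK() { return {}; }
};

// The usual shape of such a macro: evaluate the statement once and return
// early from the enclosing function if it failed.
#define RETURN_IF_ERROR(stmt)      \
    do {                           \
        Status _st = (stmt);       \
        if (!_st.ok()) return _st; \
    } while (false)

// Hypothetical failing call, just to exercise the macro.
Status evaluate_exprs() { return Status{"expr evaluation failed"}; }

Status get_next_sketch() {
    RETURN_IF_ERROR(evaluate_exprs());  // before this diff, the Status was dropped here
    return Status::OK();
}
```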


@@ -4,6 +4,8 @@
#include "exec/pipeline/aggregate/aggregate_distinct_blocking_sink_operator.h"
#include "exec/pipeline/aggregate/aggregate_distinct_blocking_source_operator.h"
#include "exec/pipeline/chunk_accumulate_operator.h"
#include "exec/pipeline/exchange/exchange_source_operator.h"
#include "exec/pipeline/limit_operator.h"
#include "exec/pipeline/operator.h"
#include "exec/pipeline/pipeline_builder.h"
@@ -45,7 +47,7 @@ Status DistinctBlockingNode::open(RuntimeState* state) {
}
DCHECK_LE(chunk->num_rows(), runtime_state()->chunk_size());
- _aggregator->evaluate_exprs(chunk.get());
+ RETURN_IF_ERROR(_aggregator->evaluate_exprs(chunk.get()));
{
SCOPED_TIMER(_aggregator->agg_compute_timer());
@@ -59,8 +61,7 @@ Status DistinctBlockingNode::open(RuntimeState* state) {
APPLY_FOR_AGG_VARIANT_ALL(HASH_SET_METHOD)
#undef HASH_SET_METHOD
- _mem_tracker->set(_aggregator->hash_set_variant().memory_usage() +
- _aggregator->mem_pool()->total_reserved_bytes());
+ _mem_tracker->set(_aggregator->hash_set_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_set());
_aggregator->update_num_input_rows(chunk->num_rows());
@@ -90,7 +91,7 @@ Status DistinctBlockingNode::open(RuntimeState* state) {
COUNTER_SET(_aggregator->input_row_count(), _aggregator->num_input_rows());
- _mem_tracker->set(_aggregator->hash_set_variant().memory_usage() + _aggregator->mem_pool()->total_reserved_bytes());
+ _mem_tracker->set(_aggregator->hash_set_variant().reserved_memory_usage(_aggregator->mem_pool()));
return Status::OK();
}
@@ -133,7 +134,8 @@ Status DistinctBlockingNode::get_next(RuntimeState* state, ChunkPtr* chunk, bool
std::vector<std::shared_ptr<pipeline::OperatorFactory> > DistinctBlockingNode::decompose_to_pipeline(
pipeline::PipelineBuilderContext* context) {
using namespace pipeline;
- OpFactories operators_with_sink = _children[0]->decompose_to_pipeline(context);
+ OpFactories ops_with_sink = _children[0]->decompose_to_pipeline(context);
// Create a shared RefCountedRuntimeFilterCollector
auto&& rc_rf_probe_collector = std::make_shared<RcRfProbeCollector>(2, std::move(this->runtime_filter_collector()));
@@ -147,26 +149,43 @@ std::vector<std::shared_ptr<pipeline::OperatorFactory> > DistinctBlockingNode::d
// Initialize OperatorFactory's fields involving runtime filters.
this->init_runtime_filter_for_operator(sink_operator.get(), context, rc_rf_probe_collector);
- OpFactories operators_with_source;
+ OpFactories ops_with_source;
auto source_operator = std::make_shared<AggregateDistinctBlockingSourceOperatorFactory>(context->next_operator_id(),
id(), aggregator_factory);
// Initialize OperatorFactory's fields involving runtime filters.
this->init_runtime_filter_for_operator(source_operator.get(), context, rc_rf_probe_collector);
- operators_with_sink = context->maybe_interpolate_local_shuffle_exchange(runtime_state(), operators_with_sink,
- partition_expr_ctxs);
- operators_with_sink.push_back(std::move(sink_operator));
- context->add_pipeline(operators_with_sink);
+ bool need_local_shuffle = true;
+ if (auto* exchange_op = dynamic_cast<ExchangeSourceOperatorFactory*>(ops_with_sink[0].get());
+ exchange_op != nullptr) {
+ auto& texchange_node = exchange_op->texchange_node();
+ DCHECK(texchange_node.__isset.partition_type);
+ need_local_shuffle = texchange_node.partition_type != TPartitionType::HASH_PARTITIONED &&
+ texchange_node.partition_type != TPartitionType::BUCKET_SHUFFLE_HASH_PARTITIONED;
+ }
+ if (need_local_shuffle) {
+ ops_with_sink =
+ context->maybe_interpolate_local_shuffle_exchange(runtime_state(), ops_with_sink, partition_expr_ctxs);
+ }
+ ops_with_sink.push_back(std::move(sink_operator));
+ context->add_pipeline(ops_with_sink);
// Aggregator must be used by a pair of sink and source operators,
- // so operators_with_source's degree of parallelism must be equal with operators_with_sink's
- auto degree_of_parallelism = ((SourceOperatorFactory*)(operators_with_sink[0].get()))->degree_of_parallelism();
+ // so operators_with_source's degree of parallelism must be equal with ops_with_sink's
+ auto degree_of_parallelism = ((SourceOperatorFactory*)(ops_with_sink[0].get()))->degree_of_parallelism();
source_operator->set_degree_of_parallelism(degree_of_parallelism);
- operators_with_source.push_back(std::move(source_operator));
+ ops_with_source.push_back(std::move(source_operator));
+ if (!_tnode.conjuncts.empty() || ops_with_source.back()->has_runtime_filters()) {
+ ops_with_source.emplace_back(
+ std::make_shared<ChunkAccumulateOperatorFactory>(context->next_operator_id(), id()));
+ }
if (limit() != -1) {
- operators_with_source.emplace_back(
+ ops_with_source.emplace_back(
std::make_shared<LimitOperatorFactory>(context->next_operator_id(), id(), limit()));
}
- return operators_with_source;
+ return ops_with_source;
}
} // namespace starrocks::vectorized
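Note: this hunk ports two behaviors into DistinctBlockingNode that AggregateBlockingNode already gained above. First, the local shuffle is skipped when the upstream exchange is already hash-partitioned: rows of one group then arrive at the same driver, so re-partitioning locally would be a no-op. Second, a ChunkAccumulateOperatorFactory is appended when conjuncts or runtime filters are present. The accumulate operator's internals are not in this diff; the sketch below, with assumed names and stand-in types, illustrates the idea: filters can leave chunks far below the configured size, and merging them restores batch efficiency downstream.

```cpp
#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

// Stand-in chunk; the real vectorized::Chunk is columnar and far richer.
struct Chunk {
    std::vector<int64_t> rows;
    size_t num_rows() const { return rows.size(); }
    void append(const Chunk& other) {
        rows.insert(rows.end(), other.rows.begin(), other.rows.end());
    }
};
using ChunkPtr = std::shared_ptr<Chunk>;

// Hypothetical accumulator: buffer undersized chunks, emit one once enough
// rows have piled up, and flush the remainder at end of stream.
class ChunkAccumulatorSketch {
public:
    explicit ChunkAccumulatorSketch(size_t desired_rows) : _desired_rows(desired_rows) {}

    // Returns a merged chunk when it is reasonably full, else nullptr
    // to signal "keep feeding".
    ChunkPtr push(ChunkPtr in) {
        if (_pending == nullptr) {
            _pending = std::move(in);
        } else {
            _pending->append(*in);
        }
        return _pending->num_rows() >= _desired_rows ? std::exchange(_pending, nullptr) : nullptr;
    }

    ChunkPtr finalize() { return std::exchange(_pending, nullptr); }

private:
    size_t _desired_rows;
    ChunkPtr _pending;
};
```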


@@ -53,7 +53,7 @@ Status DistinctStreamingNode::get_next(RuntimeState* state, ChunkPtr* chunk, boo
size_t input_chunk_size = input_chunk->num_rows();
_aggregator->update_num_input_rows(input_chunk_size);
COUNTER_SET(_aggregator->input_row_count(), _aggregator->num_input_rows());
- _aggregator->evaluate_exprs(input_chunk.get());
+ RETURN_IF_ERROR(_aggregator->evaluate_exprs(input_chunk.get()));
if (_aggregator->streaming_preaggregation_mode() == TStreamingPreaggregationMode::FORCE_STREAMING) {
// force execute streaming
@@ -80,8 +80,7 @@ Status DistinctStreamingNode::get_next(RuntimeState* state, ChunkPtr* chunk, boo
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_set_variant().size());
- _mem_tracker->set(_aggregator->hash_set_variant().memory_usage() +
- _aggregator->mem_pool()->total_reserved_bytes());
+ _mem_tracker->set(_aggregator->hash_set_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_set());
continue;
@@ -114,8 +113,7 @@ Status DistinctStreamingNode::get_next(RuntimeState* state, ChunkPtr* chunk, boo
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_set_variant().size());
- _mem_tracker->set(_aggregator->hash_set_variant().memory_usage() +
- _aggregator->mem_pool()->total_reserved_bytes());
+ _mem_tracker->set(_aggregator->hash_set_variant().reserved_memory_usage(_aggregator->mem_pool()));
TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_set());
continue;
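Note: each memory-tracker hunk in these streaming nodes sits next to a `TRY_CATCH_BAD_ALLOC(_aggregator->try_convert_to_two_level_map/set())` call. The conversion itself is not shown in this diff; "two-level" hash tables conventionally shard entries by the high bits of the hash so each sub-table stays small, which bounds rehash cost once the aggregation state grows large, and the conversion can allocate heavily, hence the bad-alloc guard. A hedged illustration of that layout follows; it assumes a well-mixed 64-bit hash, and the real StarRocks variant types differ.

```cpp
#include <cstddef>
#include <functional>
#include <unordered_map>
#include <vector>

// Illustrative two-level map: route each key to one of 256 shards by its top
// hash bits. Growth and rehashing then happen per shard instead of globally.
template <typename K, typename V>
class TwoLevelMapSketch {
    static constexpr size_t kShards = 256;
    std::vector<std::unordered_map<K, V>> _shards;

    static size_t shard_of(size_t hash) {
        return (hash >> (sizeof(size_t) * 8 - 8)) % kShards;  // top 8 bits
    }

public:
    TwoLevelMapSketch() : _shards(kShards) {}

    V& operator[](const K& key) {
        return _shards[shard_of(std::hash<K>{}(key))][key];
    }

    size_t size() const {
        size_t n = 0;
        for (const auto& s : _shards) n += s.size();
        return n;
    }
};
```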


@@ -274,6 +274,10 @@ void Aggregator::close(RuntimeState* state) {
}
_is_closed = true;
+ // Clear the buffer
+ while (!_buffer.empty()) {
+ _buffer.pop();
+ }
auto agg_close = [this, state]() {
// _mem_pool being nullptr means the prepare phase failed
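Note: the four added lines drain `_buffer` before the close lambda runs; Analytor::close below gets the same fix and additionally clears `_input_chunks`. Assuming `_buffer` queues shared-ptr chunks waiting for get_next(), a close() triggered early (cancellation, or a LIMIT satisfied upstream) would otherwise keep those chunks alive for as long as the aggregator object itself; each pop drops what is typically the last reference.

```cpp
#include <memory>
#include <queue>
#include <vector>

using ChunkPtr = std::shared_ptr<std::vector<int64_t>>;  // assumed buffered type

// Drop every queued chunk; each pop releases a reference, so the chunk memory
// is freed now rather than whenever the owning object is finally destroyed.
void drain(std::queue<ChunkPtr>& buffer) {
    while (!buffer.empty()) {
        buffer.pop();
    }
}
```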


@@ -70,7 +70,7 @@ public:
void close(RuntimeState* state) override;
- std::unique_ptr<MemPool>& mem_pool() { return _mem_pool; };
+ const MemPool* mem_pool() const { return _mem_pool.get(); }
bool is_none_group_by_exprs() { return _group_by_expr_ctxs.empty(); }
const std::vector<ExprContext*>& conjunct_ctxs() { return _conjunct_ctxs; }
const std::vector<ExprContext*>& group_by_expr_ctxs() { return _group_by_expr_ctxs; }
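Note: the header change tightens the pool accessor from a mutable `std::unique_ptr<MemPool>&` to a `const MemPool*`, which is all the new `reserved_memory_usage(_aggregator->mem_pool())` call sites need. The pattern, sketched with a stand-in type:

```cpp
#include <memory>

struct MemPool {};  // stand-in for the real allocator

class AggregatorSketch {
    std::unique_ptr<MemPool> _mem_pool = std::make_unique<MemPool>();

public:
    // Old accessor: exposed the owning pointer, letting any caller reset()
    // or move the pool out from under the aggregator.
    // std::unique_ptr<MemPool>& mem_pool() { return _mem_pool; }

    // New accessor: a read-only observer pointer.
    const MemPool* mem_pool() const { return _mem_pool.get(); }
};
```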


@@ -307,6 +307,10 @@ void Analytor::close(RuntimeState* state) {
return;
}
+ while (!_buffer.empty()) {
+ _buffer.pop();
+ }
+ _input_chunks.clear();
_is_closed = true;
auto agg_close = [this, state]() {

Some files were not shown because too many files have changed in this diff.