Compare commits

...

334 Commits

Author SHA1 Message Date
mergify[bot] b95e90b091
[BugFix] Fix shutdown tablet can not gc (backport #63595) (#63624)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-26 10:33:19 +00:00
mergify[bot] 308a567473
[BugFix] change CHECK to DCHECK in nullablecolumn to prevent the crash (backport #63553) (backport #63565) (#63606)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-09-26 08:01:25 +00:00
mergify[bot] a17c83e003
[BugFix] Fix query detail lost audit items (backport #63237) (#63469) 2025-09-24 19:59:58 +08:00
mergify[bot] c36f909425
[BugFix] Remove the deregister logic from container (backport #63085) (#63514)
Signed-off-by: yandongxiao <yandongxiao@starrocks.com>
Co-authored-by: yandongxiao <yandongxiao@starrocks.com>
2025-09-24 19:16:11 +08:00
mergify[bot] c5ebf3e063
[BugFix] Fix array type analyze (backport #63371) (#63506)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-09-24 18:31:12 +08:00
mergify[bot] 5ce9b16626
[UT] fix unstable distance function cases because of precision (backport #63502) (#63511)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-24 17:26:17 +08:00
mergify[bot] d54987f1f2
[BugFix] update staros to v3.5-rc4 (backport #63398) (#63492)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-09-24 15:52:45 +08:00
SevenJ 0f43bd91ae
[BugFix] Fix bug where reserved words in iceberg partitions break toThrift (backport #63243) (#63476)
Co-authored-by: kyle-goodale-klaviyo <kyle.goodale@klaviyo.com>
2025-09-24 12:09:01 +08:00
mergify[bot] 418b332b49
[Doc] Update SQL Blacklist Doc (backport #63457) (#63489)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-24 03:52:56 +00:00
mergify[bot] bd224880d9
[Doc]Update iceberg_catalog.md (backport #63317) (#63487)
Signed-off-by: chelsea <48942089+wangsimo0@users.noreply.github.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: chelsea <48942089+wangsimo0@users.noreply.github.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-24 03:52:06 +00:00
mergify[bot] 10f01058d2
[Doc] Remove SELECT INTO OUTFILE from Doc (backport #63478) (#63483)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-24 03:35:39 +00:00
mergify[bot] 2ffe600c22
[Enhancement] Cache parquet column batch for delta lake metadata (backport #63441) (#63472)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-24 03:28:45 +00:00
mergify[bot] 03470d1c32
[Doc] Add document for iceberg table sorting function. (backport #63392) (#63479)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-24 02:57:22 +00:00
mergify[bot] d28f28448d
[BugFix] Fix zone map incorrect filtering after CHAR to VARCHAR fast schema evolution in shared-data (backport #63377) (#63474)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-24 10:27:44 +08:00
mergify[bot] ee1a9a5df9
[BugFix] fix iceberg read null partition bug (backport #62934) (#63040)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-09-24 09:55:35 +08:00
mergify[bot] b491a3f7c0
[Enhancement] Pass stream load label directly to TransactionStmtExecutor (backport #63334) (#63463)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-23 10:41:06 -07:00
mergify[bot] ce2f48c0f8
[BugFix] remove the annoying warning log of json (backport #63414) (#63458)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-23 11:33:53 +00:00
mergify[bot] 95f15ea692
[BugFix] Forget left join flag of table function when applying array low cardinality optimization (backport #63419) (#63450)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-09-23 19:07:51 +08:00
mergify[bot] 7f0a2acad5
[Enhancement] Optimize logging error message for delta lake (backport #63389) (#63431)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-23 09:20:15 +00:00
mergify[bot] f6ab8e8b94
[Enhancement] Implement SQL standard JOIN USING with MySQL compatibility (backport #63312) (#63395)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
Co-authored-by: stephen <stephen5217@163.com>
2025-09-23 08:37:21 +00:00
mergify[bot] e84b027d86
[UT] Fix floating-point precision test failures on ARM platforms (backport #63427) (#63437)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-23 07:50:44 +00:00
mergify[bot] bd6623501f
[BugFix] Fix infinite loop when inserting decimal256 data on ARM platforms (backport #63406) (#63434)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-23 07:47:31 +00:00
mergify[bot] c8582133dd
[Enhancement] [BugFix] Ensure fast fail if cngroup is not available (backport #63314) (#63428)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-23 15:35:53 +08:00
mergify[bot] 079adbaa8f
[BugFix] Fix incompatible bitmap index reuse for fast schema evolution in shared-data (backport #63315) (#63415)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-23 15:35:05 +08:00
mergify[bot] 79a69a8b67
[Enhancement] Add be jvm memory metrics (backport #62210) (#63413)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: nillin <linwb@zju.edu.cn>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-23 07:29:18 +00:00
mergify[bot] a12237e952
[BugFix] Fix dcg meta inconsistency when partial update with auto increment column in column upsert mode. (backport #63370) (#63423)
Signed-off-by: srlch <linzichao@starrocks.com>
Co-authored-by: srlch <111035020+srlch@users.noreply.github.com>
2025-09-23 07:25:57 +00:00
mergify[bot] f3c0f6898b
[Enhancement] QueryDetailActionV2 and QueryProfileActionV2 APIs return json result (backport #63235) (#63411)
Signed-off-by: zhaohehuhu <luoyedeyi@163.com>
Co-authored-by: He Zhao <luoyedeyi@163.com>
2025-09-23 06:36:49 +00:00
mergify[bot] fb74cebe3e
[Tool] add JVM options for JDK-17 (backport #63120) (#63129)
Signed-off-by: zhaohehuhu <luoyedeyi@163.com>
Co-authored-by: He Zhao <luoyedeyi@163.com>
2025-09-23 13:42:01 +08:00
mergify[bot] e5b9ac8c92
[Enhancement] Support revoke external group and add more test case (backport #63385) (#63397)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-23 13:30:34 +08:00
mergify[bot] 3d2a0d5301
[Doc] Flink Connector 1.2.12 Doc (backport #63386) (#63407)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-23 03:47:35 +00:00
mergify[bot] fadb8061f3
[Refactor] SegmentIterator::ScanContext (backport #63333) (#63401)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-09-23 03:31:18 +00:00
mergify[bot] 7d30a58f88
[Enhancement] Enforce request consistency for channel stream load (backport #63347) (#63404)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-23 03:28:23 +00:00
mergify[bot] 62118d7396
[Doc] Update Feature Support for Iceberg (backport #63288) (#63399)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-23 11:16:49 +08:00
mergify[bot] 6fcc5672fe
[Tool] meta_tool: dump_zonemap (backport #63292) (#63402)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-23 03:14:34 +00:00
mergify[bot] cc892a7196
[Enhancement] optimize analyze profile format (backport #63326) (#63396)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-23 03:14:16 +00:00
mergify[bot] 449a3ab2a6
[Enhancement] support expr reuse in outer join where predicates (backport #62139) (#62625)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
2025-09-23 11:13:51 +08:00
mergify[bot] 0aa7cfb99e
[BugFix] Fix the issue that create spill directory failed when writing data to iceberg table. (backport #63278) (#63393)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
2025-09-23 10:58:33 +08:00
mergify[bot] 953555c49c
[Tool] update default configuration of allin1 docker (backport #63133) (#63403)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-23 02:53:08 +00:00
mergify[bot] a34b2a699d
[BugFix] fix non-deterministic predicate push down problem (backport #62827) (#63353)
Signed-off-by: before-Sunrise <unclejyj@gmail.com>
Co-authored-by: before-Sunrise <71162020+before-Sunrise@users.noreply.github.com>
2025-09-23 10:45:00 +08:00
mergify[bot] 4d6242ccf3
[UT] Refactor some ut (backport #63095) (#63159)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-23 10:19:54 +08:00
mergify[bot] e7459daeac
[Enhancement] Implement grant and revoke role functionality for external groups (backport #63258) (#63374)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-22 21:43:47 +08:00
mergify[bot] 8f3acf1d0c
[BugFix] fix delete predicate edge case (backport #63339) (#63365)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-22 13:02:08 +00:00
mergify[bot] 5f1aa6a870
[Enhancement] reduce trace rule log (backport #62834) (#62904)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-09-22 16:02:04 +08:00
mergify[bot] bf1bd75627
[BugFix] Fix pk index cumulative compaction strategy when max_rss_rowid is same (backport #63277) (#63359)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-09-22 14:49:20 +08:00
mergify[bot] e82a6491a5
[BugFix] fix duplicate key table delete issue in shared-data mode (backport #63296) (#63352)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-22 03:26:07 +00:00
mergify[bot] 50e3b621c4
[Enhancement] remove the adaptive zonemap creation (backport #63297) (#63340)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-22 10:24:07 +08:00
mergify[bot] 81685137b1
[Enhancement] Support for `IF NOT EXISTS` and `IF EXISTS` clauses to the GROUP PROVIDER (backport #63248) (#63304)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-22 10:03:34 +08:00
mergify[bot] f102d804b6
[Doc] Add tip about TPC-DS benchmark query example (backport #63203) (#63320)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-20 21:56:45 +08:00
mergify[bot] 76d03c639e
[Doc] add video (backport #63330) (#63336)
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-20 21:55:11 +08:00
PengFei Li 060b5f60a4
[BugFix] Fix incompatible zonemap reuse for fast schema evolution in shared-data (backport #63143) (#63318)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-20 21:53:43 +08:00
mergify[bot] f338bc6ad8
[Enhancement] Make BE reject multi-statement transaction stream load (backport #63242) (#63329)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-19 11:07:25 -07:00
mergify[bot] 470c45b1b8
[BugFix] fix delvec no found issue when drop tablet and queries run concurrently (backport #63291) (#63308)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-09-19 06:50:00 +00:00
SevenJ 157b0085b5
[BugFix] Iceberg/fix case backport (backport #63194) (#63289)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
2025-09-19 11:50:14 +08:00
mergify[bot] 2aedd17e2f
[BugFix] Fix BE crash in tracer when jaeger_endpoint is invalid (backport #63257) (#63283)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-09-19 03:25:16 +00:00
mergify[bot] 5be0b3f880
[BugFix] Fix ApplyCommitTask loss in concurrent scenarios. (backport #60633) (#63287)
Signed-off-by: edwinhzhang <edwinhzhang@tencent.com>
Signed-off-by: sevev <qiangzh95@gmail.com>
Signed-off-by: zhangqiang <qiangzh95@gmail.com>
Co-authored-by: zhanghe <edwinhzhang@tencent.com>
Co-authored-by: sevev <qiangzh95@gmail.com>
2025-09-19 02:47:02 +00:00
mergify[bot] 5df5201c3f
[BugFix] Fix get max compaction score NullPointerException bug (backport #63268) (#63273)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-18 14:20:04 +00:00
mergify[bot] f0a8047878
[BugFix] Remove unused codes (backport #63261) (#63269)
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-18 12:13:25 +00:00
mergify[bot] cf8109bd11
[BugFix] Fix ranger hive service alter privilege not work (backport #63251) (#63264)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-18 12:00:14 +00:00
mergify[bot] cb35a1c0d8
[UT] Fix timeout SQL Test (backport #63246) (#63254)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-09-18 15:18:59 +08:00
mergify[bot] 749529a24b
[Enhancement] Update default bucket size of random distribution (backport #63168) (#63244)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-18 13:12:16 +08:00
Shawn a999634dc7
[Enhancement] Emit routine load lag time (backport #62048) (#63239) 2025-09-17 16:37:20 +00:00
mergify[bot] 7a765d9f1a
[Enhancement] Optimize parsing predicates with large number of CompoundPredicates (backport #63139) (#63234)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-17 11:54:40 +00:00
Drake Wang b464c2e4a5
[BugFix] Fix version check failed while applying replication txn with compaction enabled (backport #62663) (#63228)
Signed-off-by: Drake Wang <wxl24life@gmail.com>
2025-09-17 18:26:25 +08:00
mergify[bot] 03157ef98d
[BugFix] Fix the bug where UserProperty priority is lower than Session Variable (backport #63173) (#63214)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-17 16:51:38 +08:00
mergify[bot] dccca50e33
[Doc] Add references for sys.policy_references (backport #63183) (#63211)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-17 01:57:34 +00:00
mergify[bot] fe9067d503
[BugFix] Fix cte reuse plan extract error (backport #62784) (#63187)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-09-17 09:37:19 +08:00
mergify[bot] 71023642f7
[Doc] Add keywords for query tuning introduction (backport #63204) (#63207)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-17 08:39:30 +08:00
mergify[bot] 0d8ffc659e
[BugFix] Fix multi statement stream load due to invalid source type (backport #63044) (#63154)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-16 11:24:43 -07:00
mergify[bot] d675248863
[BugFix] Fix transaction write deadlock due to set error message (backport #62961) (#63199)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-16 10:13:22 -07:00
mergify[bot] 9d3a2e50df
[BugFix] Fix null exception during remove expired load job (backport #63042) (#63179)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-16 10:20:50 +00:00
mergify[bot] 00d76ad660
[Enhancement] Improve fragment instance exec state report (backport #63132) (#63190)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-16 10:17:10 +00:00
mergify[bot] c462f0779d
[Enhancement] Implement ProfileActionV2 and QueryDetailActionV2 to obtain query information across all FEs (backport #61345) (#63174)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: He Zhao <luoyedeyi@163.com>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-16 07:43:49 +00:00
mergify[bot] 675fdd1197
[UT] Optimize FE tests' logging output (backport #62985) (#63111)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-16 15:00:46 +08:00
mergify[bot] 4211f7222d
[Doc] Fix Stream Load Param Desc (backport #63172) (#63178)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-16 06:14:19 +00:00
mergify[bot] d95b1a7d75
[BugFix] Fix mv repair hive base table bug (backport #63072) (#63123)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-16 05:42:21 +00:00
mergify[bot] 55667efd01
[UT] [BugFix] Fix FineGrainedRangePredicateRule rule bug (backport #63148) (#63169)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-16 12:41:21 +08:00
mergify[bot] af71661762
[BugFix] Fix secondary replicas continue waiting because of wrong timestamp (backport #62805) (#63162)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-16 11:27:38 +08:00
mergify[bot] e33b9bc64d
[Enhancement] Loose check mv's schema for better compatibilities (backport #63114) (#63160)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-16 11:02:12 +08:00
mergify[bot] 631312127b
[Enhancement] Force drop decommissioned backend if all the tablets in recycle bin (backport #62781) (#63156)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
2025-09-16 02:39:21 +00:00
mergify[bot] c0e4a1337b
[Doc] Update Colocate Join Principles (backport #63153) (#63165)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-16 02:13:50 +00:00
mergify[bot] 53754c69a8
[Enhancement] Update vacuum metric when vacuum success (backport #62540) (#63099)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-16 09:56:32 +08:00
mergify[bot] 5670387325
[Doc] Add DN matching mechanism for LDAP Group Provider and update documentation (backport #63115) (#63158)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Harbor Liu <460660596@qq.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-16 09:54:54 +08:00
mergify[bot] a8c769ab8c
[Enhancement] Enhance cluster snapshot restore to support warehouse (backport #63023) (#63118)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-09-15 10:27:37 +00:00
mergify[bot] c6e4057dff
[Enhancement] Choose best candidate mv with considering input query data layout (backport #62830) (#63025)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-15 17:14:25 +08:00
mergify[bot] 12f39a524d
[Doc] Metrics for Fragment Instance State Report (backport #63112) (#63122)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-15 05:10:01 +00:00
mergify[bot] d894c1cc98
[BugFix] Fix mv agg pushdown rewrite bugs (backport #63060) (#63107)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-15 12:49:56 +08:00
mergify[bot] 6027fa3b52
[Tool] add healthcheck in allin1-ubuntu (backport #62998) (#63113)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-15 03:06:30 +00:00
mergify[bot] 885389fe1f
[UT] Fix optimize job test (backport #63096) (#63104)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-15 10:56:06 +08:00
mergify[bot] ded88787e5
[Doc] Add clone metrics doc (backport #63073) (#63110)
Signed-off-by: wyb <wybb86@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: wyb <wybb86@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-15 02:15:06 +00:00
mergify[bot] 8861a1027e
[Enhancement] support s3 path style in shared-data cluster (backport #62591) (#63082)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-09-14 16:50:15 +08:00
mergify[bot] 4a3309228a
[Doc] Add 'nei cun' (memory) keyword to best practice overview (backport #63028) (#63050)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-12 09:02:26 -04:00
mergify[bot] 8322e224e4
[Enhancement] Add fragment instance exec state report thread pool metrics (backport #63067) (#63092)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-12 11:12:57 +00:00
mergify[bot] b4522a7da5
[Enhancement] Optimize removeDuplicateField performance (backport #62938) (#63063)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-12 18:08:50 +08:00
mergify[bot] b557885b73
[BugFix] change tuning guide format (backport #63024) (#63079)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-12 18:01:58 +08:00
mergify[bot] 13cd888d9d
[BugFix] Fix JSON extraction null column consistency and add validation checks (backport #63054) (#63078)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-12 08:47:52 +00:00
mergify[bot] e2a0aac85d
[BugFix] Fix mv rewriter binder bugs (backport #62919) (#63057)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-12 15:21:42 +08:00
mergify[bot] 25adbac4e0
[BugFix] fix hour_from_unixtime rule (backport #63006) (#63056)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-09-12 07:04:51 +00:00
mergify[bot] c8e85a33ae
[BugFix] fix shared-data cluster MV does not support colocation (backport #62941) (#63033)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-09-12 14:37:12 +08:00
mergify[bot] 70280d3da6
[BugFix] fix iceberg manifest cache npe in data race condition (backport #63043) (#63051)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-12 06:09:33 +00:00
mergify[bot] fcb46895e3
[BugFix] Fix view based rewrite bugs (backport #62918) (#63013)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-12 14:03:41 +08:00
mergify[bot] 055bf5a488
[BugFix] fix bugs of FlatJSON with lake table (backport #62706) (#63041)
Signed-off-by: Murphy <mofei@starrocks.com>
Signed-off-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Kevin Cai <caixh.kevin@gmail.com>
2025-09-12 13:49:09 +08:00
mergify[bot] 2fe00e59f3
[Enhancement] Make some fe metrics leader awareness (backport #63004) (#63038)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-12 04:01:17 +00:00
mergify[bot] df795c442f
[Enhancement] add catalog and queryId info in ShowProcessList (backport #62552) (#63022)
Signed-off-by: zhaohehuhu <luoyedeyi@163.com>
Co-authored-by: He Zhao <luoyedeyi@163.com>
2025-09-11 12:11:21 +00:00
mergify[bot] b6c2478a24
[BugFix] fix iceberg table scan exception during scan range deploy (backport #62994) (#63018)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-11 11:05:18 +00:00
mergify[bot] 2ac795296e
[BugFix] Revert #62916 (backport #63007) (#63009)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-11 16:59:19 +08:00
mergify[bot] 2568feffa6
[BugFix] Gracefully Shutdown Compute Node on Exit (backport #62916) (#63000)
Signed-off-by: Claire Fei <cfei@atlassian.com>
Co-authored-by: Claire <30540604+Tenaria@users.noreply.github.com>
2025-09-11 08:01:59 +00:00
mergify[bot] 395e12e3ba
[UT] fix ut ASAN leak, wait for all rpc requests done before exit (backport #62986) (#62990)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-11 06:39:55 +00:00
srlch 7bfb6c6076
[Enhancement] provide alter table xxx set auto_increment (backport #62767) (#62876)
Signed-off-by: srlch <linzichao@starrocks.com>
Co-authored-by: Evgeniy Shishkin <eshishki@gmail.com>
2025-09-11 13:54:57 +08:00
mergify[bot] 5969a5e8e3
[BugFix] Fix stream load exec status update NPE (backport #62921) (#62981)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-11 04:03:22 +00:00
mergify[bot] a507915f0a
[Doc] Add keywords for resource group documentation (backport #62959) (#62977)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-11 10:38:34 +08:00
mergify[bot] 0d5c7a1f8b
[Doc] Update JDBC connector download link for Tableau (backport #62960) (#62972)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-11 10:37:44 +08:00
mergify[bot] 360aedf922
[BugFix] Fix delta lake table can not find partition column (backport #62953) (#62969)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-11 02:37:08 +00:00
mergify[bot] 853cdaeedf
[Doc]Fixed syntax error of pre-create partition example (backport #62726) (#62965)
Signed-off-by: megao <jetgm@163.com>
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: megao <jetgm@163.com>
Co-authored-by: DanRoscigno <dan@roscigno.com>
2025-09-11 00:53:35 +00:00
mergify[bot] 15827a307c
[Doc] Doc for Modifying Column Comment (backport #62951) (#62955)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 16:45:06 -04:00
mergify[bot] 35ae03962e
[BugFix] Fix invalid ProjectOperator above table-pruning frontier CTEConsumperOperator (backport #62914) (#62936) 2025-09-10 19:52:19 +08:00
mergify[bot] 9fba692e37
[Doc] Fix DROP ROLE Description (backport #62946) (#62948)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 18:11:17 +08:00
mergify[bot] 0e50611222
[BugFix] mutate input columns in functions' returning value (backport #62826) (#62943)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
2025-09-10 09:56:19 +00:00
mergify[bot] 47f53b9891
[BugFix] Revert Add transaction error message to loads internal table (#61364) (backport #62928) (#62930)
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-10 16:25:07 +08:00
mergify[bot] ff2850a094
[Doc] Add recyclebin_catalogs system table doc (backport #62878) (#62925)
Signed-off-by: wyb <wybb86@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: wyb <wybb86@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 07:28:10 +00:00
mergify[bot] 3e8c877412
[BugFix] Fix redundant replica handling after clone (backport #62542) (#62896)
Signed-off-by: Hongkun Xu <xuhongkun666@163.com>
Co-authored-by: Hongkun Xu <xuhongkun666@163.com>
2025-09-10 06:51:47 +00:00
mergify[bot] 8aeabdf45d
[BugFix] Fix collecting stream load profile failed (backport #62802) (#62907)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-10 13:52:41 +08:00
mergify[bot] 2484b49178
[Enhancement] Support use DN to match group in group provider (backport #62711) (#62885)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-10 13:38:46 +08:00
mergify[bot] 94342fc8e6
[Doc] Update Delta Lake Feature Support (backport #62906) (#62910)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 02:54:15 +00:00
Cosmin Lazar 41e54825ee
[Enhancement] Support Azure Workload Identity authentication for Azure Data Lake Storage Gen2 (backport #62754) (#62887)
Signed-off-by: Cosmin Constantin Lazar <cosminconstantinlazar@gmail.com>
2025-09-10 10:36:35 +08:00
mergify[bot] a791d22b21
[BugFix] Fix improper BE selection causing ineffective rebalance (backport #62776) (#62892)
Signed-off-by: Hongkun Xu <xuhongkun666@163.com>
Co-authored-by: Hongkun Xu <xuhongkun666@163.com>
2025-09-10 02:24:40 +00:00
mergify[bot] 3635c50e13
[Doc]Update views.md (backport #62898) (#62899)
Signed-off-by: chelsea <48942089+wangsimo0@users.noreply.github.com>
Co-authored-by: chelsea <48942089+wangsimo0@users.noreply.github.com>
2025-09-10 02:16:25 +00:00
mergify[bot] 4df9563dce
[Doc] fixed hardcoded outdated JDBC link to general Tableau JDBC marketplace link (backport #62858) (#62890)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Ron Kapoor <ronkapoor2017@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 00:38:44 +00:00
mergify[bot] f2b1b99859
[BugFix] fix missing compaction profile when file bundling is on (backport #62638) (#62864)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-09-09 16:34:07 +08:00
mergify[bot] bafb065189
[BugFix] Fix SQL syntax error in histogram statistics when MCV contains single quotes (backport #62853) (#62865)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-09 16:22:33 +08:00
絵空事スピリット a55d7a614a
[Doc] Release Prep in branch-4.0 (#62814)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 15:49:25 +08:00
mergify[bot] a0ec39230a
[BugFix] fix kill analyze command (backport #62842) (#62869)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-09 15:37:48 +08:00
mergify[bot] 3be4653b77
[Feature] Support multi statement transaction (part1) - stream load (backport #61362) (#61917)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-09 00:33:25 -07:00
mergify[bot] 573454f4f8
[BugFix] fix nullptr delta writer in local tablet channel (backport #62861) (#62875)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-09 07:26:54 +00:00
mergify[bot] 5c87a1899f
[BugFix] Fix colocate group execution not found exec group (backport #62465) (#62548)
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-09-09 14:48:54 +08:00
mergify[bot] 1538f9d741
[Doc] Iceberg REST Catalog supports Vended Credentials (backport #62576) (#62859)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 14:41:58 +08:00
mergify[bot] 183bae2ff1
[Enhancement] Add transaction error message to loads internal table (backport #61364) (#62851)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-09 04:31:21 +00:00
mergify[bot] 1cd78def58
[Doc] Doc Update for Indexes (backport #62042) (#62857)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 03:21:51 +00:00
mergify[bot] c29dec34bd
[Doc] Format Deltalake Catalog for 4.0 (backport #62599) (#62856)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 03:16:11 +00:00
mergify[bot] b38111c4bc
[Doc] V4.0 Iceberg Compaction (backport #62641) (#62855)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 03:15:37 +00:00
mergify[bot] 4193ea9824
[Doc] Optimized Function Docs for v4.0 (backport #62158) (#62854)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 03:14:41 +00:00
mergify[bot] fb75a71742
[BugFix] Fix metrics of Prometheus format bug (backport #62742) (#62838)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
2025-09-09 02:43:54 +00:00
mergify[bot] 47386399b5
[Doc] Rebuild Data Distribution (backport #61889) (#62845)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 02:35:14 +00:00
mergify[bot] 5e34559192
[Enhancement] support common expr reuse in complex case-when expr in scan predicates (backport #62779) (#62824)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
Co-authored-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
2025-09-09 02:19:14 +00:00
mergify[bot] 08e4100bd6
[Enhancement] Health Action need to be handled synchronously (backport #62490) (#62762)
Signed-off-by: crossoverJie <crossoverJie@gmail.com>
Co-authored-by: crossoverJie <crossoverJie@gmail.com>
2025-09-09 09:20:53 +08:00
mergify[bot] 1710f4d901
[Doc] Revert enable_ssl property in Storage Volume (backport #62811) (#62817)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-08 03:32:28 +00:00
mergify[bot] d82390e5a2
[BugFix] fix NPE of information_schema.analyze_status when db is dropped (backport #62796) (#62813)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-08 03:18:01 +00:00
mergify[bot] 0649cfb7a8
[Doc] Docs for CN Blacklist (backport #62018) (#62769)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-08 09:28:08 +08:00
mergify[bot] c5fc434994
[Doc] Update Partitioned Materialized View (backport #62759) (#62774)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-08 09:25:30 +08:00
mergify[bot] 860aa78a39
[BugFix] storage volume delete should not be allowed if active snapshot present (backport #62246) (#62777)
Signed-off-by: Rohit Satardekar <rohitrs1983@gmail.com>
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Rohit Satardekar <rohitrs1983@gmail.com>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-08 09:18:55 +08:00
mergify[bot] ccb3e787bd
[BugFix][CVE-2025-58056] bump io.netty version to 4.1.125.Final (backport #62801) (#62807)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-07 15:14:35 +08:00
mergify[bot] 98cbd48b98
[UT] fix sql test for low card on lake (backport #62771) (#62804)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-06 21:34:00 +08:00
mergify[bot] 77bf0cbc1c
[Refactor] Split data cache engine into disk cache engine and memory cache engine. (backport #62760) (#62798)
Signed-off-by: trueeyu <lxhhust350@qq.com>
Co-authored-by: trueeyu <lxhhust350@qq.com>
2025-09-05 23:34:41 +08:00
mergify[bot] 6f48e4686e
[BugFix] Use session default db in show create routine load if unspecified (backport #62745) (#62791)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-05 11:51:05 +00:00
mergify[bot] 939b72c40a
[BugFix] Fix csv header skip causing data loss in files() (backport #62719) (#62786)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-05 11:04:59 +00:00
mergify[bot] 2bee442be7
[Enhancement] Use privilege REFRESH instead of ALTER to execute refresh statement (backport #62636) (#62782)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-05 09:43:19 +00:00
mergify[bot] dedb56f72e
[Enhancement] add command to show the dropped meta information that can be recovered (backport #51007) (#62765)
Signed-off-by: Rohit Satardekar <rohitrs1983@gmail.com>
Co-authored-by: Rohit Satardekar <rohitrs1983@gmail.com>
2025-09-05 06:22:31 +00:00
mergify[bot] 14573ee728
[BugFix] skip combine txnlog when handle non-pk table deletion (backport #62735) (#62755)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-09-04 21:38:39 +08:00
mergify[bot] d66412ce10
[Doc] Add enable_group_by_compressed_key (backport #61986) (#62734)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-04 19:26:15 +08:00
mergify[bot] 419b803d98
[Doc] Separator and Delimiter with Multiple Non-printable Characters (backport #62744) (#62747)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-04 19:08:14 +08:00
mergify[bot] 96b335c9ad
[BugFix] Fix db is null when replaying batch transactions upsert (backport #62715) (#62736)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-04 17:07:36 +08:00
mergify[bot] 8bf9df6f65
[UT] Fix timeout for SQL test case `test_partition_hash_join` (backport #62722) (#62739)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-09-04 16:16:13 +08:00
mergify[bot] 68ea8d4fde
[Doc] fix typo (backport #62729) (#62731)
Signed-off-by: qingzhongli <qingzhongli2018@gmail.com>
Co-authored-by: qingzhongli <qingzhongli2018@gmail.com>
2025-09-04 07:33:29 +00:00
mergify[bot] 84921c4361
[Tool] simplify meta_tool help message (backport #62703) (#62724)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-04 03:43:28 +00:00
mergify[bot] 09b4fda4d5
[BugFix] fix iceberg transform compact bug (backport #62697) (#62721)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-09-04 11:09:41 +08:00
mergify[bot] 13755f4f16
[BugFix] Fix cache env init order (backport #62700) (#62712)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-03 12:22:08 +00:00
mergify[bot] 7640fb0302
[Enhancement] Disable ARRAY function for DECIMAL256 type (backport #62670) (#62709)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-03 19:18:53 +08:00
mergify[bot] ba258aee4c
[UT] add more decimal256 sql test cases (backport #62664) (#62710)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-03 18:54:37 +08:00
mergify[bot] c0fa924fe2
[Enhancement] Fast schema evolution supports adding key columns for shared-data (backport #62253) (#62489)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-03 17:19:44 +08:00
mergify[bot] cd8a9c3599
[UT] add more sql test about agg filter (backport #62640) (#62701)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-03 08:27:32 +00:00
mergify[bot] 9d837fc359
[Enhancement] disable low card on lake by default because of potential bugs (backport #62586) (#62691)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-03 05:51:44 +00:00
mergify[bot] 8d1700a966
[BugFix] Fix publish incorrectly reported as successful during graceful shutdown in shared-nothing (backport #62417) (#62684)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-03 13:41:47 +08:00
mergify[bot] 47a74eca99
[Enhancement] turn on tablet balance between workers by default in shared-data mode (backport #62661) (#62676)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-03 03:22:33 +00:00
mergify[bot] 98477be39a
[Enhancement] introduce a function to obtain the column size (backport #62481) (#62674)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-09-03 03:19:21 +00:00
mergify[bot] 5cfe433d5b
[Enhancement] replace memcompare with memequal for SortedAgg (backport #62585) (#62672)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-03 10:34:43 +08:00
mergify[bot] 9b21d191af
[Enhancement] Support complex expressions in FILTER clause and add boolean type validation for aggregate functions (backport #62637) (#62665)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-03 10:27:31 +08:00
mergify[bot] ff547d6a45
[Doc] Doc for pipeline_sink_dop (backport #62618) (#62668)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-03 01:44:40 +00:00
mergify[bot] 3c9d5cb89c
[BugFix] Clear mv's version map if restore job failed (backport #62634) (#62644)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-02 18:46:22 +08:00
mergify[bot] ebf06bbe4e
[BugFix] Fix async delta writer crash due to null pointer (backport #62626) (#62651)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-09-02 10:28:34 +00:00
mergify[bot] 4a333abbd2
[BugFix] Fix case-sensitive partition column validation in materialized view analyzer (backport #62598) (#62622)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-02 13:37:18 +08:00
mergify[bot] 8dfe7b0eb3
[BugFix] Set executionId before parser to avoid duplicate executionId for syntax error sql. (backport #62258) (#62612)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
2025-09-02 10:37:47 +08:00
mergify[bot] fd9b3e3c2f
[BugFix] Fix optimize table task submit reject (backport #62300) (#62555)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-01 19:16:33 -07:00
mergify[bot] d3dedf1051
[Enhancement] Add clone metrics in backend (backport #62479) (#62607)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-02 02:15:40 +00:00
mergify[bot] 4d0d380972
[Enhancement] Support multi statements transaction (part2) (backport #62019) (#62606)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-01 19:15:16 -07:00
mergify[bot] 9714a299a7
[BugFix] Change the `tuple_id` field in `TIcebergTableSink` to optional to fix the compatibility issues with historical versions. (backport #62593) (#62604)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
2025-09-02 02:10:15 +00:00
mergify[bot] a3553f320d
[BugFix] Fix two minus signs when the result is INT256_MIN (backport #62512) (#62528)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-02 10:02:19 +08:00
mergify[bot] 23079e240e
[Doc] Remove the custom JDBC plugin from ldap authentication (backport #62584) (#62615)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-02 10:00:57 +08:00
mergify[bot] 678a880fe3
[BugFix] Disable tablet creation optimization when partitions have multiple indexes (backport #62595) (#62600)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-01 22:18:03 +08:00
mergify[bot] ff3e3b86bd
[Enhancement] Change transform type prefer string for fixed length varchar (backport #62476) (#62587)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-01 17:04:44 +08:00
mergify[bot] 016d806bdd
[BugFix] Fix int256_t negation undefined behavior causing memory allocation failure (backport #62510) (#62578)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-01 07:37:32 +00:00
mergify[bot] dacc9100f5
[Enhancement] Add config to disable statistics cache lazy refresh by default (backport #62518) (#62573)
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-01 15:17:38 +08:00
mergify[bot] 95b4145579
[BugFix] Remove redundant status setting in CancelableAnalyzeTask to avoid overriding StatisticsExecutor status (backport #62538) (#62571)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-01 15:01:14 +08:00
mergify[bot] 5aba68ee1f
[BugFix] Fix possible NPE in mv backup restore (backport #62514) (#62560)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-01 06:56:11 +00:00
mergify[bot] 4c14589c24
[BugFix] fix statistics collection error message (backport #62533) (#62568)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-01 14:55:53 +08:00
mergify[bot] 82b8a41e2f
[BugFix] Increase default max connection limit for external users (backport #62523) (#62564)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-01 14:40:33 +08:00
mergify[bot] 46cd1a0986
[BugFix] fix the http_workers_num metric (backport #62457) (#62546)
Signed-off-by: crossoverJie <crossoverJie@gmail.com>
Co-authored-by: crossoverJie <crossoverJie@gmail.com>
2025-09-01 04:32:31 +00:00
starrocks-xupeng 67f90d7979
[Enhancement] mask credential info in submit task (backport #62311) (#62554)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
2025-09-01 04:30:57 +00:00
mergify[bot] 148829d642
[BugFix] Fix integer overflow caused by integer left shift in compression key (backport #62366) (#62451)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-31 13:16:05 +08:00
mergify[bot] 6029763c83
[BugFix] Do not simplify case-when with complex functions to avoid yielding very tedious result on scan node because of lack of CSE extraction (backport #62505) (#62519)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-29 19:51:57 +08:00
mergify[bot] 2fe7265367
[Doc] Categorize enable_auth_check (backport #62517) (#62525)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-29 08:55:59 +00:00
mergify[bot] b1ec299c5d
[Enhancement] Make json_extract return json type in trino dialect (backport #59718) (#62503)
Signed-off-by: ‘duanyyyyyyy’ <yan.duan9759@gmail.com>
Co-authored-by: duanyyyyyyy <139062392+duanyyyyyyy@users.noreply.github.com>
2025-08-29 08:12:25 +00:00
mergify[bot] ede876e277
[Enhancement] Add clone metrics in frontend (backport #62421) (#62515)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-29 08:08:05 +00:00
mergify[bot] ea63796289
[BugFix] fix table function use low cardinality error (backport #62292) (#62385)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-08-29 15:39:32 +08:00
mergify[bot] fa61498350
[Enhancement] add remote file cache limit for hive (backport #62288) (#62353)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-08-29 14:17:20 +08:00
mergify[bot] 575b52389b
[Enhancement] revise cache mem size limit for iceberg (backport #61966) (#62357)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-08-29 14:16:42 +08:00
mergify[bot] 9dd848f23c
[Enhancement] reduce unnecessary storage related logs (backport #62121) (#62470)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-29 14:07:43 +08:00
mergify[bot] d2e01def4b
[BugFix] TableFunction not use low cardinality optimization (backport #62466) (#62495)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-29 11:26:39 +08:00
mergify[bot] 71e93ec62b
[Enhancement] improve lake information schema performance (backport #62404) (#62441)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-08-29 11:24:00 +08:00
mergify[bot] bf3c758a1d
[BugFix] Fix compile memcpy_inlined_overflow16 on ARM (backport #62478) (#62493)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-29 03:11:42 +00:00
mergify[bot] 76225c9a6f
[BugFix] Fix division by zero for partition hash join (backport #62474) (#62486)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-29 02:47:32 +00:00
mergify[bot] a2b49dcc85
[BugFix] Fix table with gin index use replicated_storage (backport #62480) (#62487)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-08-29 02:44:28 +00:00
mergify[bot] 640bbe10e9
[BugFix] fix the execState of multiple fe (backport #62376) (#62382)
Signed-off-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-08-28 22:55:48 +08:00
mergify[bot] 5f192a6f28
[Enhancement] add ExecState into /current_queries cmd result to distinguish running/pending query (backport #62261) (#62372)
Signed-off-by: MatthewH00 <1639097204@qq.com>
Co-authored-by: hmx <1639097204@qq.com>
2025-08-28 21:30:49 +08:00
mergify[bot] 77291b7c49
[BugFix] Fix UAF when FixedLengthColumn append self (backport #62375) (#62393)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-28 11:11:25 +00:00
mergify[bot] 89cd08dd15
[BugFix] fix the privilege issue of refresh mv (backport #62396) (#62464)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-08-28 10:24:38 +00:00
mergify[bot] b045d6efbd
[BugFix] Fix mv refresh bug with case-insensitive partition names (backport #62389) (#62444)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-28 16:24:19 +08:00
mergify[bot] c03f27bca3
[BugFix] Fix UAF for BinaryColumn::append_selective (backport #62410) (#62458)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-28 15:57:15 +08:00
mergify[bot] e7b6d57f23
[Doc] Update auditloader download url (backport #62452) (#62456)
Signed-off-by: Dan Jing <jingdan@starrocks.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Dan Jing <jingdan@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-28 07:27:32 +00:00
mergify[bot] b5805bfe29
[Doc] Updated documentation to align with Auditloader 5.0 (backport #62419) (#62449)
Signed-off-by: 吴梦龙 <1849777679@qq.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 吴梦龙 <1849777679@qq.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-28 14:52:24 +08:00
mergify[bot] abc050347c
[UT] Fix ut append_chunk_safe (backport #62413) (#62418)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-28 14:50:45 +08:00
mergify[bot] df8b4f31f3
[BugFix] fix lambda common expr slot id conflicts in array_map (backport #62414) (#62428)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
2025-08-28 06:44:24 +00:00
mergify[bot] 6ada019be0
[BugFix] fix the overlap check of zonemap (backport #62369) (#62411)
Signed-off-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-28 14:06:06 +08:00
mergify[bot] b228bb67cc
[Enhancement] Extend hour_from_unixtime optimization (backport #62338) (#62397)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-28 14:04:37 +08:00
mergify[bot] 658d5c1b9d
[Doc] Add zonemap and json configuration parameters (backport #62368) (#62398)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-28 14:02:16 +08:00
mergify[bot] 02e81c8b8f
[Doc] Document session variables and update Flat_json.md (backport #62367) (#62399)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-28 14:01:15 +08:00
mergify[bot] dc9598f87f
[Feature] Support automatic creating split tablet job (backport #61650) (#62420)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-08-28 13:37:16 +08:00
mergify[bot] e6d48b7a86
[BugFix] adjust star mgr journal replay log (backport #62374) (#62401)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-28 03:11:44 +00:00
mergify[bot] f4ee640fc7
[Enhancement] create adaptive zonemap index for strings (backport #61965) (#62361)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-28 10:23:33 +08:00
mergify[bot] 512567775d
[BugFix] use fe id instead to replace the fe name (backport #62378) (#62394)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-28 10:09:00 +08:00
mergify[bot] ad022e686f
[BugFix] fix combine txnlog vacuum issue when delete tablets (backport #62363) (#62390)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-27 19:45:36 +08:00
mergify[bot] 56c73c2443
[Enhancement] Add more failure reasons for tablet clone (backport #62293) (#62380)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-27 09:09:53 +00:00
mergify[bot] d4d0774dda
[Enhancement] enable lake tablet internal parallel scan by default (backport #62159) (#62359)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-08-27 03:50:13 +00:00
mergify[bot] 0eb1a93ca0
[Enhancement] Adjust partition hash join strategy (backport #61405) (#62355)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-27 11:25:14 +08:00
mergify[bot] 7d45558eb5
[Enhancement] enable flat json by default (backport #62097) (#62354)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-27 03:12:18 +00:00
mergify[bot] b3fa782058
[BugFix] Fix too many disk io when check consistency (backport #61745) (#62348)
Co-authored-by: kisshot288 <59246842+kisshot288@users.noreply.github.com>
2025-08-26 15:09:14 +00:00
mergify[bot] e4e493480b
[BugFix] Fix possible NPE in alter table (backport #62321) (#62340)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-26 21:18:52 +08:00
mergify[bot] b69e6bd00f
[BugFix] Align enable_merge_commit setting for FE/BE (backport #62310) (#62322)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-26 15:18:54 +08:00
mergify[bot] ee253d78df
[Enhancement] add a session variable to deploy scan ranges back/foreground (backport #62291) (#62320)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-26 06:59:50 +00:00
mergify[bot] 4efe5dbf02
[Doc] enable_ssl Property in Storage Volume (backport #62323) (#62328)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-26 06:59:29 +00:00
mergify[bot] 98032f2b0a
[Enhancement] create string column zonemap with prefix truncation (backport #61975) (#62317)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-26 06:33:09 +00:00
mergify[bot] 6ca0ee936b
[UT] Fix join test timeout (backport #62298) (#62319)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-26 06:24:49 +00:00
mergify[bot] c844fce1fb
[Enhancement] Rewrite MIN(f(col)) to f(MIN(col)) for Monotonic Functions (backport #62225) (#62315)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-08-26 05:59:55 +00:00
mergify[bot] 3b1e377c6e
[Enhancement] Extend MinMaxStats optimization to support DictMappingExpr (backport #62212) (#62316)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-26 05:58:15 +00:00
mergify[bot] afcbb584cd
[BugFix] mask credential info when query execution error (backport #62283) (#62313)
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-26 13:51:13 +08:00
mergify[bot] 38e8956901
[Doc] correct parameter for enabling JWT auth (backport #62242) (#62306)
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-08-26 02:39:15 +00:00
mergify[bot] de98d5a362
[Doc] add pinyin term (backport #62299) (#62303)
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-08-26 02:25:41 +00:00
mergify[bot] 946a05b429
[BugFix] avoid BE crash when LakePersistentIndex init fail (backport #62279) (#62296)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-26 10:21:54 +08:00
mergify[bot] b647155d20
[BugFix]Fix the error of using GTID to handle dirty tablet metadata (backport #62275) (#62284)
Signed-off-by: edwinhzhang <edwinhzhang@tencent.com>
Co-authored-by: zhanghe <edwinhzhang@tencent.com>
2025-08-25 10:11:11 +00:00
mergify[bot] 08af8a12be
[Enhancement] Use linear-chained to optimize hash join (backport #61429) (#62281)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-25 09:44:21 +00:00
mergify[bot] cc4228fb6d
[BugFix] Add lock when get max version from tablet in replication txn manager (backport #62238) (#62277)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-08-25 09:09:35 +00:00
mergify[bot] e3faf9c570
[Doc] Fix snippet in deployment (backport #62269) (#62272)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-25 07:10:35 +00:00
mergify[bot] 64db225e92
[Doc] Add balance statistics doc (backport #62170) (#62265)
Signed-off-by: wyb <wybb86@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: wyb <wybb86@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-25 06:35:21 +00:00
mergify[bot] b1e0f0cca9
[Doc] Update fe tablet schdules system table doc (backport #62180) (#62267)
Signed-off-by: wyb <wybb86@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: wyb <wybb86@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-25 06:34:55 +00:00
mergify[bot] 1ec6cf224f
[Enhancement] fix profile when deploying scan ranges in background (backport #62223) (#62264)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-25 06:12:02 +00:00
Gavin 66b0f412c4
[Enhancement] Optimize the iceberg sink local sorting based on the spill partition writer (backport #62096) (#62252)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
2025-08-25 13:57:40 +08:00
mergify[bot] bdb0b0e467
[Enhancement] Optimize append_selective for binary column (backport #62165) (#62259)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-25 12:32:17 +08:00
mergify[bot] bd48411e3e
[Enhancement] short circuit optimization on select limit case (on Scan Node) (backport #62188) (#62257)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-25 04:07:07 +00:00
mergify[bot] 3a7e014ba6
[BugFix] fix cn crash when cache is turned off (backport #62174) (#62256)
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-25 10:53:58 +08:00
mergify[bot] c0ec75f889
[Doc] Add Variables (backport #62171) (#62237)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-25 10:02:30 +08:00
mergify[bot] d14a733a70
[BugFix] support lazy delta column compact for size tiered compaction in pk table to reduce cost (backport #61930) (#62244)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-23 09:36:42 +08:00
mergify[bot] cf71a82f85
[BugFix] check if it's flatjson before non-existent field optimization (backport #62227) (#62241)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-23 09:04:53 +08:00
mergify[bot] 34e3f8ca7b
[BugFix] Fix view based mv rewrite bug (backport #62198) (#62228)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-22 13:13:10 +00:00
mergify[bot] 484afe81e1
[Enhancement] Support trace optimizer logs even if throw exceptions (backport #62192) (#62205)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-22 16:35:05 +08:00
mergify[bot] 22bd12a69f
[Feature] Implement dynamic tablet job in FE for dynamic tablet splitting and merging (backport #61349) (#62213)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-08-22 08:13:38 +00:00
mergify[bot] 6af0301a52
[Enhancement] fix query profile when deploying more tasks (backport #62186) (#62218)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-22 07:32:43 +00:00
mergify[bot] 6d4fdae2f1
[BugFix] Fix trace times merge bug (backport #62126) (#62216)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-08-22 15:29:37 +08:00
mergify[bot] b44663ddfc
[BugFix] runtime filter partitions are bucket aware (backport #62191) (#62208)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
2025-08-22 14:03:02 +08:00
mergify[bot] 4fe0b95635
[Enhancement] Improve the iceberg table sink memory and small file problems by implementing a partition based global shuffle (backport #62123) (#62190)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
2025-08-22 10:24:41 +08:00
mergify[bot] 5be180a827
[Doc] add doc for bucket-aware execution (backport #62164) (#62194)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-22 09:23:54 +08:00
mergify[bot] c7be66352b
[Enhancement] update persistent index size statistic when do major compaction (backport #62195) (#62201)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-21 13:54:47 +00:00
mergify[bot] 677023a9f1
[BugFix] Fix partitioned hash join crash when enable query cache (backport #62146) (#62183)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-21 17:17:30 +08:00
mergify[bot] 72c4842aa6
[BugFix] Fixed phased scheduler always waiting for profile collection in sync profile collection (backport #62140) (#62177)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-21 16:43:48 +08:00
mergify[bot] 9db589ba27
[Enhancement] Refactor balance type (backport #62163) (#62185)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-21 08:43:32 +00:00
mergify[bot] 0c3cdc90ca
[Enhancement] Support configuring the bucket assign mode for bucket-aware execution (backport #62135) (#62168)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
2025-08-21 07:28:22 +00:00
mergify[bot] f5f20afaf2
[BugFix] Fix FE restart problem when enable query queue v2 (backport #62161) (#62167)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-21 14:59:27 +08:00
mergify[bot] e98f32834b
[UT] fix unstable case (backport #62157) (#62160)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
2025-08-21 11:30:00 +08:00
mergify[bot] 0e39d339cb
[UT] Fix the test case: test_files_sink (backport #62138) (#62154)
Signed-off-by: trueeyu <lxhhust350@qq.com>
Co-authored-by: trueeyu <lxhhust350@qq.com>
2025-08-21 09:07:18 +08:00
mergify[bot] 94bd28bbd8
[Doc] datacache_mem_size and datacache_disk_size are mutable now (backport #62111) (#62151)
Signed-off-by: Pei Yu <125331682@qq.com>
Co-authored-by: Pei Yu <125331682@qq.com>
2025-08-20 12:49:09 +00:00
mergify[bot] ec4c0ecd2b
[Doc] Update the default value of cbo_eq_base_type. (backport #62084) (#62086)
Signed-off-by: edwinhzhang <edwinhzhang@tencent.com>
Co-authored-by: zhanghe <edwinhzhang@tencent.com>
2025-08-20 08:44:52 -04:00
mergify[bot] d90d3bc5b6
[BugFix] Fix group by compressed key cause wrong result on decimal (backport #62022) (#62147)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-20 12:17:12 +00:00
mergify[bot] 8d11089dcb
[Enhancement] support group by compressed key (backport #61632) (#62145)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-20 10:37:08 +00:00
mergify[bot] ed1d4cc111
[BugFix] Fix throw exception issue in low-cardinality optimization error in ALLOW_THROW_EXCEPTION mode (backport #62098) (#62144)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-20 10:07:30 +00:00
mergify[bot] c42eaf88df
[Enhancement] Optimize accessing non-existent JSON field (backport #62003) (#62133)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-20 16:05:19 +08:00
zombee0 843806e61e
[BugFix] fix some minor bugs and add comment (backport #61902) (#62125)
Signed-off-by: zombee0 <ewang2027@gmail.com>
2025-08-20 15:16:01 +08:00
mergify[bot] 85b141ca97
[BugFix] fix json global dict with heterogeneous schema (backport #62001) (#62119)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-20 03:56:35 +00:00
mergify[bot] f93eadcee6
[BugFix] fix insert into select's audit log (backport #61381) (#62104)
Co-authored-by: before-Sunrise <71162020+before-Sunrise@users.noreply.github.com>
2025-08-20 11:26:37 +08:00
mergify[bot] 7fb868e211
[BugFix] Fix set_tablet_schema for partition_morsel_queue(split_morsel_queue) (backport #62034) (#62118)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-20 11:15:48 +08:00
mergify[bot] b68721abdc
[Enhancement] revise iceberg rewrite data (backport #61851) (#61913)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-08-20 10:18:55 +08:00
mergify[bot] 4217260158
[Enhancement] Improve fe tablet schedules system table (backport #62073) (#62112)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-19 16:31:40 +00:00
mergify[bot] 945d51a80b
[BugFix] Fix missing clone copy size and duration (backport #62074) (#62108)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-19 13:42:17 +00:00
mergify[bot] 7b3f2789b9
[BugFix] fail to calculate nested cte's statistics outside memo in table pruning (backport #62070) (#62094)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-19 19:35:48 +08:00
mergify[bot] 5b41a92084
[BugFix] Fix error base version in schema change job with lake rollup (backport #62046) (#62089)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-08-19 11:11:50 +00:00
mergify[bot] 0df4fb0522
[BugFix] avoid get file size in report tablet stat thread (backport #61901) (#62028)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-19 18:45:36 +08:00
mergify[bot] ae28c45368
[Enhancement] support encode_sort_key function (backport #61781) (#61976)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-19 18:36:47 +08:00
mergify[bot] 90f1f3be58
[Enhancement] optimize GlobalDictCodeColumnIterator::decode_string_dict_codes (backport #62002) (#62015)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-19 18:36:08 +08:00
mergify[bot] cf4a3df21c
[Refactor] Change some vlog level (backport #61995) (#62083)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-08-19 10:12:34 +00:00
mergify[bot] 7e26ff974e
[Doc] Fix SQL Digest (backport #62075) (#62080)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-19 08:46:16 +00:00
mergify[bot] 24d26c33ac
[Enhancement] Introduce a connector partition chunk writer to support spilling chunk data for iceberg table sink. (backport #61963) (#62062)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
2025-08-19 14:41:42 +08:00
mergify[bot] 3e09498f8f
[BugFix][CVE] CVE-2025-55163 fix, bump io.netty version (backport #62041) (#62057)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-08-19 04:51:47 +00:00
mergify[bot] 89bc4ff068
[Enhancement] integrate global dict with flatjson (backport #61681) (#62055)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-19 11:30:16 +08:00
mergify[bot] 04bb4e3f1b
[UT] move test_groupby_array_agg to another test file (backport #61971) (#62054)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
2025-08-19 01:54:57 +00:00
mergify[bot] e0fe6d4e72
[BugFix] arrow build respect avx2 settings (backport #62006) (#62045)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-08-18 14:24:32 +00:00
mergify[bot] 8413284035
[Enhancement] add new column cngroupname in `show nodes` sql (backport #62020) (#62040)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-08-18 09:57:56 +00:00
mergify[bot] 8dd56fd7ad
[BugFix] Fix create mv with case-when incompatible varchar type (backport #61996) (#62036)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-18 17:19:45 +08:00
mergify[bot] d989b56d51
[Enhancement] dump distro and arch info in crash log (backport #62017) (#62032)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-18 09:09:59 +00:00
mergify[bot] ea8c32a0d8
[Enhancement] VacuumFull Implementation (backport #61602) (#62016)
Signed-off-by: srlch <linzichao@starrocks.com>
Co-authored-by: srlch <111035020+srlch@users.noreply.github.com>
Co-authored-by: Connor Brennan <cbrennan@pinterest.com>
2025-08-18 03:51:37 +00:00
mergify[bot] dbb3e1d5f8
[BugFix] fix parquet array write when split null string (backport #61999) (#62012)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-18 03:37:13 +00:00
mergify[bot] 982f2ebd3e
[UT] disable lake compaction scheduler in unit test (backport #61968) (#61987)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-15 18:58:03 +08:00
mergify[bot] f5fac98bdb
[BugFix] Fix NullPointerException when column partition statistics are not found (backport #61935) (#61979)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-08-15 18:47:18 +08:00
mergify[bot] 960c351557
[Doc] Add Mapping for Packages (backport #61898) (#61983)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-15 08:27:52 +00:00
mergify[bot] 17f92859be
[Doc] add new fe config for controlling array ndv colleciton to cbo section (backport #61921) (#61982)
Signed-off-by: stephen <stephen5217@163.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-15 08:25:53 +00:00
mergify[bot] a670068304
[Enhancement] Add logs for the reason why tablet cannot be repaired (backport #61959) (#61969)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-15 07:16:12 +00:00
mergify[bot] 492586e993
[Doc] add table name insensitive doc (backport #61923) (#61974)
Signed-off-by: stephen <stephen5217@163.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-15 15:02:20 +08:00
Gavin 9df260eee1
[Refactor] Introduce a load chunk spiller and refactor the load spill memtable sink based on it. (backport #61867) (#61964)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
2025-08-15 14:15:31 +08:00
mergify[bot] f5a74aa16d
[Enhancement] enhance drop partition log information (backport #61787) (#61962)
Signed-off-by: crossoverJie <crossoverJie@gmail.com>
Co-authored-by: crossoverJie <crossoverJie@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-15 14:04:15 +08:00
mergify[bot] 3708c97461
[BugFix] Correct add query context to context conditions (backport #61929) (#61945)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-15 14:00:14 +08:00
mergify[bot] f571bb1ac0
[Enhancement] assign a large but configurable row count to unknown stats table (backport #61332) (#61953)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-15 04:33:37 +00:00
mergify[bot] 3635b317d8
[BugFix] Disable sync_publish for shadow tablet (backport #61887) (#61941)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-08-15 09:51:39 +08:00
mergify[bot] f96b93e208
[BugFix] fix dict version of random distribution table (backport #61933) (#61948)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-14 12:18:03 +00:00
mergify[bot] 7082f55ab0
[Enhancement] Support bucket-aware execution for Iceberg (backport #61756) (#61931)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
2025-08-14 17:04:11 +08:00
mergify[bot] c7f97d8f46
[Enhancement] Remove compatible meta_dir code (backport #61924) (#61934)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
2025-08-14 09:03:25 +00:00
mergify[bot] a65a4e2eb9
[BugFix] CBO Table Pruning misses other predicates (backport #61881) (#61910)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-14 15:51:48 +08:00
mergify[bot] 6b4f0cbef5
[Enhancement] Add label location balance statistic (backport #61905) (#61927)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-14 06:54:41 +00:00
mergify[bot] c6da99c2bb
[BugFix] avoid hold tablet shard lock to get compaction score (backport #61899) (#61919)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-14 04:18:26 +00:00
mergify[bot] 6bebdbac4d
[BugFix] Fix NPE for JoinHashTable::mem_usage (backport #61872) (#61915)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-14 03:20:04 +00:00
mergify[bot] 1cf54d7670
[BugFix] Fix QueryContext cancel may cause use-after-free (backport #61897) (#61907)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-14 02:52:23 +00:00
mergify[bot] 9837153661
[Enhancement] Separate path id from physical partition id (backport #61854) (#61894)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-08-13 12:21:16 +00:00
andyziye 109deb7a80
[Tool] Disable codeowner check (#61903)
Signed-off-by: andyziye <108652123+andyziye@users.noreply.github.com>
2025-08-13 19:10:17 +08:00
mergify[bot] c8e77680d7
[UT] Fix ShowDataDistributionStmtTest (backport #61880) (#61895)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-13 10:11:36 +00:00
mergify[bot] e371915c8c
[BugFix] Fix the problem with the number of rebuild file counted. (backport #61859) (#61890)
Signed-off-by: edwinhzhang <edwinhzhang@tencent.com>
Co-authored-by: zhanghe <edwinhzhang@tencent.com>
2025-08-13 09:50:43 +00:00
mergify[bot] 459a5fc3f0
[BugFix] Other predicates of Join contains non-push-down subfield should not be rewritten (backport #61868) (#61883)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-13 16:55:10 +08:00
mergify[bot] 288b12572d
[BugFix] Forbid statistics collection on generated expression columns (backport #61829) (#61870)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-08-13 16:01:33 +08:00
mergify[bot] 898d7a400e
[BugFix] fix min/max optimization on iceberg on partition columns (backport #61858) (#61878)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-13 07:23:53 +00:00
mergify[bot] e70b5139dd
[Enhancement] Implement function json_contains (backport #61403) (#61867)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-13 07:10:39 +00:00
mergify[bot] 1c0ffd7f4c
[Enhancement] Add colocate group balance statistic (backport #61736) (#61876)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-13 06:13:14 +00:00
1352 changed files with 57815 additions and 8556 deletions

135
.github/CODEOWNERS vendored
View File

@ -1,137 +1,2 @@
# committer will be the owner of all codes
* @StarRocks/starrocks-committer
# cpp miscellaneous
/be/src/common/ @StarRocks/cpp-misc-maintainer
/be/src/gen_cpp/ @StarRocks/cpp-misc-maintainer
/be/src/gutil/ @StarRocks/cpp-misc-maintainer
/be/src/simd/ @StarRocks/cpp-misc-maintainer
/be/src/testutil/ @StarRocks/cpp-misc-maintainer
/be/src/util/ @StarRocks/cpp-misc-maintainer
# execution engine
/be/src/column/ @StarRocks/execution-maintainer
/be/src/exec/ @StarRocks/execution-maintainer
/be/src/exprs/ @StarRocks/execution-maintainer
/be/src/runtime/ @StarRocks/execution-maintainer
/be/src/types/ @StarRocks/execution-maintainer
/be/src/udf/ @StarRocks/execution-maintainer
# open formats
/be/src/formats/ @StarRocks/open-format-maintainer
# storage engine
/be/src/fs/ @StarRocks/storage-maintainer
/be/src/io/ @StarRocks/storage-maintainer
/be/src/storage/ @StarRocks/storage-maintainer
# /docs/ belong to docs-maintainer
/docs/ @StarRocks/docs-maintainer
# /docker
/docker/ @StarRocks/docker-maintainer
# metadata
/fe/fe-core/src/main/java/com/starrocks/authentication/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/privilege/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/common/util/concurrent/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/mysql/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/healthchecker/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/clone/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/consistency/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/ha/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/journal/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/leader/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/meta/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/persist/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/alter/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/backup/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/catalog/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/metric/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/system/ @StarRocks/metadata-maintainer
# connector
/fe/fe-core/src/main/java/com/starrocks/connector/ @StarRocks/connector-maintainer
/fe/fe-core/src/main/java/com/starrocks/credential/ @StarRocks/connector-maintainer
# parser
/fe/fe-core/src/main/java/com/starrocks/sql/ast/ @StarRocks/parser
/fe/fe-core/src/main/java/com/starrocks/sql/parser/ @StarRocks/parser
# analyzer
/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/ @StarRocks/analyzer
/fe/fe-core/src/main/java/com/starrocks/analysis/ @StarRocks/analyzer
# optimizer
/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/ @StarRocks/optimizer
/fe/fe-core/src/main/java/com/starrocks/statistic/ @StarRocks/optimizer
# scheduler
/fe/fe-core/src/main/java/com/starrocks/qe/scheduler/ @StarRocks/scheduler-maintainer
# sql/parser/StarRocksLex.g4 sql/parser/StarRocks.g4 belong to syntax-committer
/fe/fe-core/src/main/java/com/starrocks/sql/parser/StarRocksLex.g4 @StarRocks/syntax-committer
/fe/fe-core/src/main/java/com/starrocks/sql/parser/StarRocks.g4 @StarRocks/syntax-committer
/gensrc/script/functions.py @StarRocks/syntax-committer
# /thirdparty/ /docker/dockerfiles/dev-env/dev-env.Dockerfile belong to thirdparty-maintainer
/be/src/thirdparty/ @StarRocks/thirdparty-maintainer
/thirdparty/ @StarRocks/thirdparty-maintainer
/docker/dockerfiles/dev-env/dev-env.Dockerfile @StarRocks/thirdparty-maintainer
# cloud native
/be/src/storage/lake/ @StarRocks/cloud-native-maintainer
/be/src/runtime/lake_tablets_channel.h @StarRocks/cloud-native-maintainer
/be/src/runtime/lake_tablets_channel.cpp @StarRocks/cloud-native-maintainer
# error message
/fe/fe-core/src/main/java/com/starrocks/common/ErrorCode.java @StarRocks/msg-reviewer
# StorageEngine/ExecEnv/GlobalEnv
/be/src/runtime/exec_env.h @StarRocks/thread-committer
/be/src/runtime/exec_env.cpp @StarRocks/thread-committer
/be/src/storage/olap_server.cpp @StarRocks/thread-committer
/be/src/storage/storage_engine.h @StarRocks/thread-committer
/be/src/storage/storage_engine.cpp @StarRocks/thread-committer
/be/src/service/starrocks_main.cpp @StarRocks/thread-committer
/be/src/service/service_be/starrocks_be.cpp @StarRocks/thread-committer
# restful
/fe/fe-core/src/main/java/com/starrocks/http @StarRocks/restful-maintainer
/be/src/http @StarRocks/restful-maintainer
# load and unload
/fe/fe-core/src/main/java/com/starrocks/load/* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/plan/StreamLoad* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/plan/*Sink.java @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/sql/InsertPlanner.java @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/sql/LoadPlanner.java @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/backup/* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/alter/Optimize* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/alter/Compaction* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/catalog/*Partition* @StarRocks/load-unload-maintainer
/be/src/storage/* @StarRocks/load-unload-maintainer
/be/src/exec/tablet_sink* @StarRocks/load-unload-maintainer
/be/src/exec/csv_scanner.cpp @StarRocks/load-unload-maintainer
/be/src/exec/json_scanner.cpp @StarRocks/load-unload-maintainer
/be/src/exec/pipeline/olap_table_sink_operator.cpp @StarRocks/load-unload-maintainer
/be/src/formats/avro/* @StarRocks/load-unload-maintainer
/be/src/formats/csv/* @StarRocks/load-unload-maintainer
/be/src/formats/json/* @StarRocks/load-unload-maintainer
/be/src/http/action/compaction_action.cpp @StarRocks/load-unload-maintainer
/be/src/http/action/*stream_load.cpp @StarRocks/load-unload-maintainer
/be/src/http/action/restore* @StarRocks/load-unload-maintainer
/be/src/runtime/batch_write/* @StarRocks/load-unload-maintainer
/be/src/runtime/routine_load/* @StarRocks/load-unload-maintainer
/be/src/runtime/stream_load/* @StarRocks/load-unload-maintainer
/be/src/runtime/load* @StarRocks/load-unload-maintainer
/be/src/runtime/tablets_channel.cpp @StarRocks/load-unload-maintainer
/be/src/runtime/local_tablets_channel* @StarRocks/load-unload-maintainer
/be/src/runtime/export_sink.cpp @StarRocks/load-unload-maintainer
# meta upgrade/downgrade compatibility
/fe/fe-core/src/main/java/com/starrocks/persist/gson/GsonUtils.java @StarRocks/meta-compatibility-maintainer

View File

@ -451,22 +451,35 @@ void AgentServer::Impl::submit_tasks(TAgentResult& agent_result, const std::vect
}
}
#define HANDLE_TASK(t_task_type, all_tasks, do_func, AGENT_REQ, request, env) \
for (auto* task : all_tasks) { \
auto pool = get_thread_pool(t_task_type); \
auto signature = task->signature; \
std::pair<bool, size_t> register_pair = register_task_info(task_type, signature); \
if (register_pair.first) { \
LOG(INFO) << "Submit task success. type=" << t_task_type << ", signature=" << signature \
<< ", task_count_in_queue=" << register_pair.second; \
ret_st = pool->submit_func( \
std::bind(do_func, std::make_shared<AGENT_REQ>(*task, task->request, time(nullptr)), env)); \
if (!ret_st.ok()) { \
LOG(WARNING) << "fail to submit task. reason: " << ret_st.message() << ", task: " << task; \
} \
} else { \
LOG(INFO) << "Submit task failed, already exists type=" << t_task_type << ", signature=" << signature; \
} \
#define HANDLE_TASK(t_task_type, all_tasks, do_func, AGENT_REQ, request, env) \
{ \
std::string submit_log = "Submit task success. type=" + to_string(t_task_type) + ", signatures="; \
size_t log_count = 0; \
size_t queue_len = 0; \
for (auto* task : all_tasks) { \
auto pool = get_thread_pool(t_task_type); \
auto signature = task->signature; \
std::pair<bool, size_t> register_pair = register_task_info(task_type, signature); \
if (register_pair.first) { \
if (log_count++ < 100) { \
submit_log += std::to_string(signature) + ","; \
} \
queue_len = register_pair.second; \
ret_st = pool->submit_func( \
std::bind(do_func, std::make_shared<AGENT_REQ>(*task, task->request, time(nullptr)), env)); \
if (!ret_st.ok()) { \
LOG(WARNING) << "fail to submit task. reason: " << ret_st.message() << ", task: " << task; \
} \
} else { \
LOG(INFO) << "Submit task failed, already exists type=" << t_task_type << ", signature=" << signature; \
} \
} \
if (queue_len > 0) { \
if (log_count >= 100) { \
submit_log += "...,"; \
} \
LOG(INFO) << submit_log << " task_count_in_queue=" << queue_len; \
} \
}
// batch submit tasks

View File

@ -119,7 +119,7 @@ static void alter_tablet(const TAlterTabletReqV2& agent_task_req, int64_t signat
if (status == STARROCKS_SUCCESS) {
swap(finish_tablet_infos, finish_task_request->finish_tablet_infos);
finish_task_request->__isset.finish_tablet_infos = true;
LOG(INFO) << alter_msg_head << "alter success. signature: " << signature;
VLOG(2) << alter_msg_head << "alter success. signature: " << signature;
error_msgs.emplace_back("alter success");
task_status.__set_status_code(TStatusCode::OK);
} else if (status == STARROCKS_TASK_REQUEST_ERROR) {
@ -156,13 +156,11 @@ static void unify_finish_agent_task(TStatusCode::type status_code, const std::ve
finish_task(finish_task_request);
size_t task_queue_size = remove_task_info(task_type, signature);
LOG(INFO) << "Remove task success. type=" << task_type << ", signature=" << signature
<< ", task_count_in_queue=" << task_queue_size;
VLOG(1) << "Remove task success. type=" << task_type << ", signature=" << signature
<< ", task_count_in_queue=" << task_queue_size;
}
void run_drop_tablet_task(const std::shared_ptr<DropTabletAgentTaskRequest>& agent_task_req, ExecEnv* exec_env) {
StarRocksMetrics::instance()->clone_requests_total.increment(1);
const TDropTabletReq& drop_tablet_req = agent_task_req->task_req;
bool force_drop = drop_tablet_req.__isset.force && drop_tablet_req.force;
@ -348,6 +346,7 @@ void run_clear_transaction_task(const std::shared_ptr<ClearTransactionAgentTaskR
}
void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req, ExecEnv* exec_env) {
StarRocksMetrics::instance()->clone_requests_total.increment(1);
const TCloneReq& clone_req = agent_task_req->task_req;
AgentStatus status = STARROCKS_SUCCESS;
@ -366,6 +365,7 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
if (clone_req.__isset.is_local && clone_req.is_local) {
DataDir* dest_store = StorageEngine::instance()->get_store(clone_req.dest_path_hash);
if (dest_store == nullptr) {
StarRocksMetrics::instance()->clone_requests_failed.increment(1);
LOG(WARNING) << "fail to get dest store. path_hash:" << clone_req.dest_path_hash;
status_code = TStatusCode::RUNTIME_ERROR;
} else {
@ -374,6 +374,7 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
need_rebuild_pk_index);
Status res = StorageEngine::instance()->execute_task(&engine_task);
if (!res.ok()) {
StarRocksMetrics::instance()->clone_requests_failed.increment(1);
status_code = TStatusCode::RUNTIME_ERROR;
LOG(WARNING) << "local tablet migration failed. status: " << res
<< ", signature: " << agent_task_req->signature;
@ -392,6 +393,14 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
tablet_infos.push_back(tablet_info);
}
finish_task_request.__set_finish_tablet_infos(tablet_infos);
int64_t copy_size = engine_task.get_copy_size();
finish_task_request.__set_copy_size(copy_size);
StarRocksMetrics::instance()->clone_task_intra_node_copy_bytes.increment(copy_size);
int64_t copy_time_ms = engine_task.get_copy_time_ms();
finish_task_request.__set_copy_time_ms(copy_time_ms);
StarRocksMetrics::instance()->clone_task_intra_node_copy_duration_ms.increment(copy_time_ms);
}
}
} else {
@ -399,6 +408,7 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
&error_msgs, &tablet_infos, &status);
Status res = StorageEngine::instance()->execute_task(&engine_task);
if (!res.ok()) {
StarRocksMetrics::instance()->clone_requests_failed.increment(1);
status_code = TStatusCode::RUNTIME_ERROR;
LOG(WARNING) << "clone failed. status:" << res << ", signature:" << agent_task_req->signature;
error_msgs.emplace_back("clone failed.");
@ -412,6 +422,14 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
LOG(INFO) << "clone success, set tablet infos. status:" << status
<< ", signature:" << agent_task_req->signature;
finish_task_request.__set_finish_tablet_infos(tablet_infos);
int64_t copy_size = engine_task.get_copy_size();
finish_task_request.__set_copy_size(copy_size);
StarRocksMetrics::instance()->clone_task_inter_node_copy_bytes.increment(copy_size);
int64_t copy_time_ms = engine_task.get_copy_time_ms();
finish_task_request.__set_copy_time_ms(copy_time_ms);
StarRocksMetrics::instance()->clone_task_inter_node_copy_duration_ms.increment(copy_time_ms);
}
}
}
@ -708,8 +726,7 @@ void run_upload_task(const std::shared_ptr<UploadAgentTaskRequest>& agent_task_r
finish_task(finish_task_request);
remove_task_info(agent_task_req->task_type, agent_task_req->signature);
LOG(INFO) << "Finished uploaded task signature=" << agent_task_req->signature
<< " job id=" << upload_request.job_id;
VLOG(1) << "Finished uploaded task signature=" << agent_task_req->signature << " job id=" << upload_request.job_id;
}
void run_download_task(const std::shared_ptr<DownloadAgentTaskRequest>& agent_task_req, ExecEnv* exec_env) {
@ -744,8 +761,8 @@ void run_download_task(const std::shared_ptr<DownloadAgentTaskRequest>& agent_ta
finish_task(finish_task_request);
remove_task_info(agent_task_req->task_type, agent_task_req->signature);
LOG(INFO) << "Finished downloaded task signature=" << agent_task_req->signature
<< " job id=" << download_request.job_id;
VLOG(1) << "Finished downloaded task signature=" << agent_task_req->signature
<< " job id=" << download_request.job_id;
}
void run_make_snapshot_task(const std::shared_ptr<SnapshotAgentTaskRequest>& agent_task_req, ExecEnv* exec_env) {
@ -766,9 +783,9 @@ void run_make_snapshot_task(const std::shared_ptr<SnapshotAgentTaskRequest>& age
<< " status=" << st.to_string();
error_msgs.push_back("make_snapshot failed. status: " + st.to_string());
} else {
LOG(INFO) << "Created snapshot tablet_id=" << snapshot_request.tablet_id
<< " schema_hash=" << snapshot_request.schema_hash << " version=" << snapshot_request.version
<< " snapshot_path=" << snapshot_path;
VLOG(1) << "Created snapshot tablet_id=" << snapshot_request.tablet_id
<< " schema_hash=" << snapshot_request.schema_hash << " version=" << snapshot_request.version
<< " snapshot_path=" << snapshot_path;
if (snapshot_request.__isset.list_files) {
// list and save all snapshot files
// snapshot_path like: data/snapshot/20180417205230.1.86400
@ -818,7 +835,7 @@ void run_release_snapshot_task(const std::shared_ptr<ReleaseSnapshotAgentTaskReq
error_msgs.push_back("release_snapshot failed. status: " +
boost::lexical_cast<std::string>(release_snapshot_status));
} else {
LOG(INFO) << "Released snapshot path=" << snapshot_path << " status=" << release_snapshot_status;
VLOG(1) << "Released snapshot path=" << snapshot_path << " status=" << release_snapshot_status;
}
unify_finish_agent_task(status_code, error_msgs, agent_task_req->task_type, agent_task_req->signature);
@ -1045,8 +1062,8 @@ void run_remote_snapshot_task(const std::shared_ptr<RemoteSnapshotAgentTaskReque
finish_task(finish_task_request);
#endif
auto task_queue_size = remove_task_info(agent_task_req->task_type, agent_task_req->signature);
LOG(INFO) << "Remove task success. type=" << agent_task_req->task_type
<< ", signature=" << agent_task_req->signature << ", task_count_in_queue=" << task_queue_size;
VLOG(1) << "Remove task success. type=" << agent_task_req->task_type << ", signature=" << agent_task_req->signature
<< ", task_count_in_queue=" << task_queue_size;
}
void run_replicate_snapshot_task(const std::shared_ptr<ReplicateSnapshotAgentTaskRequest>& agent_task_req,
@ -1090,8 +1107,8 @@ void run_replicate_snapshot_task(const std::shared_ptr<ReplicateSnapshotAgentTas
finish_task(finish_task_request);
#endif
auto task_queue_size = remove_task_info(agent_task_req->task_type, agent_task_req->signature);
LOG(INFO) << "Remove task success. type=" << agent_task_req->task_type
<< ", signature=" << agent_task_req->signature << ", task_count_in_queue=" << task_queue_size;
VLOG(1) << "Remove task success. type=" << agent_task_req->task_type << ", signature=" << agent_task_req->signature
<< ", task_count_in_queue=" << task_queue_size;
}
} // namespace starrocks

View File

@ -27,6 +27,8 @@
#include "storage/tablet.h"
#include "storage/tablet_manager.h"
#include "storage/txn_manager.h"
#include "util/countdown_latch.h"
#include "util/defer_op.h"
#include "util/starrocks_metrics.h"
#include "util/threadpool.h"
#include "util/time.h"
@ -49,6 +51,7 @@ struct TabletPublishVersionTask {
// or 0 which means tablet not found or publish task cannot be submitted
int64_t max_continuous_version{0};
bool is_double_write{false};
bool is_shadow{false};
};
void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionRequest& publish_version_req,
@ -91,7 +94,7 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
}
}
} else {
std::vector<std::map<TabletInfo, RowsetSharedPtr>> partitions(num_partition);
std::vector<std::map<TabletInfo, std::pair<RowsetSharedPtr, bool>>> partitions(num_partition);
for (size_t i = 0; i < publish_version_req.partition_version_infos.size(); i++) {
StorageEngine::instance()->txn_manager()->get_txn_related_tablets(
transaction_id, publish_version_req.partition_version_infos[i].partition_id, &partitions[i]);
@ -108,7 +111,8 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
task.partition_id = publish_version_req.partition_version_infos[i].partition_id;
task.tablet_id = itr.first.tablet_id;
task.version = publish_version_req.partition_version_infos[i].version;
task.rowset = std::move(itr.second);
task.rowset = std::move(itr.second.first);
task.is_shadow = itr.second.second;
// rowset can be nullptr if it just prepared but not committed
if (task.rowset != nullptr) {
task.rowset->rowset_meta()->set_gtid(publish_version_req.gtid);
@ -122,87 +126,101 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
span->SetAttribute("num_tablet", num_active_tablet);
std::mutex affected_dirs_lock;
CountDownLatch latch(static_cast<int>(tablet_tasks.size()));
for (auto& tablet_task : tablet_tasks) {
uint32_t retry_time = 0;
Status st;
while (retry_time++ < PUBLISH_VERSION_SUBMIT_MAX_RETRY) {
st = token->submit_func([&]() {
auto& task = tablet_task;
auto tablet_span = Tracer::Instance().add_span("tablet_publish_txn", span);
auto scoped_tablet_span = trace::Scope(tablet_span);
tablet_span->SetAttribute("txn_id", transaction_id);
tablet_span->SetAttribute("tablet_id", task.tablet_id);
tablet_span->SetAttribute("version", task.version);
if (!is_replication_txn && !task.rowset) {
task.st = Status::NotFound(
fmt::format("rowset not found of tablet: {}, txn_id: {}", task.tablet_id, task.txn_id));
LOG(WARNING) << task.st;
return;
}
TabletSharedPtr tablet = StorageEngine::instance()->tablet_manager()->get_tablet(task.tablet_id);
if (!tablet) {
// tablet may get dropped, it's ok to ignore this situation
LOG(WARNING) << fmt::format(
"publish_version tablet not found tablet_id: {}, version: {} txn_id: {}", task.tablet_id,
task.version, task.txn_id);
return;
}
{
std::lock_guard lg(affected_dirs_lock);
affected_dirs.insert(tablet->data_dir());
}
if (is_replication_txn) {
task.st = StorageEngine::instance()->replication_txn_manager()->publish_txn(
task.txn_id, task.partition_id, tablet, task.version);
if (!task.st.ok()) {
LOG(WARNING) << "Publish txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id;
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
VLOG(2) << "Publish txn success tablet:" << tablet->tablet_id() << " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id;
}
} else if (is_version_overwrite) {
task.st = StorageEngine::instance()->txn_manager()->publish_overwrite_txn(
task.partition_id, tablet, task.txn_id, task.version, task.rowset, wait_time);
if (!task.st.ok()) {
LOG(WARNING) << "Publish overwrite txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id << " rowset:" << task.rowset->rowset_id();
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
LOG(INFO) << "Publish overwrite txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id
<< " rowset:" << task.rowset->rowset_id();
}
} else {
task.st = StorageEngine::instance()->txn_manager()->publish_txn(
task.partition_id, tablet, task.txn_id, task.version, task.rowset, wait_time,
task.is_double_write);
if (!task.st.ok()) {
LOG(WARNING) << "Publish txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id << " rowset:" << task.rowset->rowset_id();
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
if (task.is_double_write || VLOG_ROW_IS_ON) {
LOG(INFO) << "Publish txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " is_double_write:" << task.is_double_write
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id
<< " rowset:" << task.rowset->rowset_id();
auto task = std::make_shared<CancellableRunnable>(
[&]() {
DeferOp defer([&] { latch.count_down(); });
auto& task = tablet_task;
auto tablet_span = Tracer::Instance().add_span("tablet_publish_txn", span);
auto scoped_tablet_span = trace::Scope(tablet_span);
tablet_span->SetAttribute("txn_id", transaction_id);
tablet_span->SetAttribute("tablet_id", task.tablet_id);
tablet_span->SetAttribute("version", task.version);
if (!is_replication_txn && !task.rowset) {
task.st = Status::NotFound(fmt::format("rowset not found of tablet: {}, txn_id: {}",
task.tablet_id, task.txn_id));
LOG(WARNING) << task.st;
return;
}
}
}
});
TabletSharedPtr tablet =
StorageEngine::instance()->tablet_manager()->get_tablet(task.tablet_id);
if (!tablet) {
// tablet may get dropped, it's ok to ignore this situation
LOG(WARNING) << fmt::format(
"publish_version tablet not found tablet_id: {}, version: {} txn_id: {}",
task.tablet_id, task.version, task.txn_id);
return;
}
{
std::lock_guard lg(affected_dirs_lock);
affected_dirs.insert(tablet->data_dir());
}
if (is_replication_txn) {
task.st = StorageEngine::instance()->replication_txn_manager()->publish_txn(
task.txn_id, task.partition_id, tablet, task.version);
if (!task.st.ok()) {
LOG(WARNING) << "Publish txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id;
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
VLOG(2) << "Publish txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id;
}
} else if (is_version_overwrite) {
task.st = StorageEngine::instance()->txn_manager()->publish_overwrite_txn(
task.partition_id, tablet, task.txn_id, task.version, task.rowset, wait_time);
if (!task.st.ok()) {
LOG(WARNING) << "Publish overwrite txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id << " rowset:" << task.rowset->rowset_id();
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
LOG(INFO) << "Publish overwrite txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id
<< " rowset:" << task.rowset->rowset_id();
}
} else {
task.st = StorageEngine::instance()->txn_manager()->publish_txn(
task.partition_id, tablet, task.txn_id, task.version, task.rowset, wait_time,
task.is_double_write);
if (!task.st.ok()) {
LOG(WARNING) << "Publish txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id << " rowset:" << task.rowset->rowset_id();
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
if (task.is_double_write || VLOG_ROW_IS_ON) {
LOG(INFO) << "Publish txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " is_double_write:" << task.is_double_write
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id
<< " rowset:" << task.rowset->rowset_id();
}
}
}
},
[&]() {
tablet_task.st = Status::Cancelled(
fmt::format("publish version task has been cancelled, tablet_id={}, version={}",
tablet_task.tablet_id, tablet_task.version));
VLOG(1) << tablet_task.st;
latch.count_down();
});
st = token->submit(std::move(task));
if (st.is_service_unavailable()) {
int64_t retry_sleep_ms = 50 * retry_time;
LOG(WARNING) << "publish version threadpool is busy, retry in " << retry_sleep_ms
@ -217,10 +235,11 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
}
if (!st.ok()) {
tablet_task.st = std::move(st);
latch.count_down();
}
}
span->AddEvent("all_task_submitted");
token->wait();
latch.wait();
span->AddEvent("all_task_finished");
Status st;
@ -235,10 +254,13 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
if (st.ok()) {
st = task.st;
}
} else {
} else if (!task.is_shadow) {
auto& pair = tablet_publish_versions.emplace_back();
pair.__set_tablet_id(task.tablet_id);
pair.__set_version(task.version);
} else {
VLOG(1) << "publish_version success tablet:" << task.tablet_id << " version:" << task.version
<< " is_shadow:" << task.is_shadow;
}
}
// return tablet and its version which has already finished.

View File

@ -401,8 +401,8 @@ void* DeleteTaskWorkerPool::_worker_thread_callback(void* arg_this) {
int num_of_remove_task = 0;
if (push_req.push_type == TPushType::CANCEL_DELETE) {
LOG(INFO) << "get delete push task. remove delete task txn_id: " << push_req.transaction_id
<< " priority: " << priority << " push_type: " << push_req.push_type;
VLOG(3) << "get delete push task. remove delete task txn_id: " << push_req.transaction_id
<< " priority: " << priority << " push_type: " << push_req.push_type;
std::lock_guard l(worker_pool_this->_worker_thread_lock);
auto& tasks = worker_pool_this->_tasks;
@ -435,8 +435,8 @@ void* DeleteTaskWorkerPool::_worker_thread_callback(void* arg_this) {
}
auto& push_req = agent_task_req->task_req;
LOG(INFO) << "get delete push task. signature: " << agent_task_req->signature << " priority: " << priority
<< " push_type: " << push_req.push_type;
VLOG(3) << "get delete push task. signature: " << agent_task_req->signature << " priority: " << priority
<< " push_type: " << push_req.push_type;
std::vector<TTabletInfo> tablet_infos;
EngineBatchLoadTask engine_task(push_req, &tablet_infos, agent_task_req->signature, &status,
GlobalEnv::GetInstance()->load_mem_tracker());
@ -848,7 +848,8 @@ void* ReportDataCacheMetricsTaskWorkerPool::_worker_thread_callback(void* arg_th
request.__set_report_version(g_report_version.load(std::memory_order_relaxed));
TDataCacheMetrics t_metrics{};
const LocalCacheEngine* cache = DataCache::GetInstance()->local_cache();
// TODO: mem_metrics + disk_metrics
const LocalCacheEngine* cache = DataCache::GetInstance()->local_disk_cache();
if (cache != nullptr && cache->is_initialized()) {
const auto metrics = cache->cache_metrics();
DataCacheUtils::set_metrics_from_thrift(t_metrics, metrics);

View File

@ -129,8 +129,6 @@ void ObjectCacheBench::init_cache(CacheType cache_type) {
_page_cache = std::make_shared<StoragePageCache>();
_page_cache->init(_lru_cache.get());
} else {
opt.engine = "starcache";
_star_cache = std::make_shared<StarCacheEngine>();
Status st = _star_cache->init(opt);
if (!st.ok()) {

View File

@ -36,7 +36,7 @@ BlockCache::~BlockCache() {
(void)shutdown();
}
Status BlockCache::init(const CacheOptions& options, std::shared_ptr<LocalCacheEngine> local_cache,
Status BlockCache::init(const BlockCacheOptions& options, std::shared_ptr<LocalCacheEngine> local_cache,
std::shared_ptr<RemoteCacheEngine> remote_cache) {
_block_size = std::min(options.block_size, MAX_BLOCK_SIZE);
_local_cache = std::move(local_cache);

View File

@ -33,7 +33,7 @@ public:
~BlockCache();
// Init the block cache instance
Status init(const CacheOptions& options, std::shared_ptr<LocalCacheEngine> local_cache,
Status init(const BlockCacheOptions& options, std::shared_ptr<LocalCacheEngine> local_cache,
std::shared_ptr<RemoteCacheEngine> remote_cache);
// Write data buffer to cache, the `offset` must be aligned by block size

View File

@ -42,7 +42,15 @@ struct DirSpace {
size_t size;
};
struct CacheOptions {
struct RemoteCacheOptions {
double skip_read_factor = 0;
};
struct MemCacheOptions {
size_t mem_space_size = 0;
};
struct DiskCacheOptions {
// basic
size_t mem_space_size = 0;
std::vector<DirSpace> dir_spaces;
@ -54,7 +62,6 @@ struct CacheOptions {
bool enable_direct_io = false;
bool enable_tiered_cache = true;
bool enable_datacache_persistence = false;
std::string engine;
size_t max_concurrent_inserts = 0;
size_t max_flying_memory_mb = 0;
double scheduler_threads_per_cpu = 0;
@ -63,6 +70,10 @@ struct CacheOptions {
std::string eviction_policy;
};
struct BlockCacheOptions {
size_t block_size = 0;
};
struct WriteCacheOptions {
int8_t priority = 0;
// If ttl_seconds=0 (default), no ttl restriction will be set. If an old one exists, remove it.

View File

@ -44,14 +44,9 @@ Status DataCache::init(const std::vector<StorePath>& store_paths) {
_page_cache = std::make_shared<StoragePageCache>();
#if defined(WITH_STARCACHE)
if (config::datacache_engine == "" || config::datacache_engine == "starcache") {
config::datacache_engine = "starcache";
} else {
config::datacache_engine = "lrucache";
}
#else
config::datacache_engine = "lrucache";
_local_disk_cache_engine = "starcache";
#endif
_local_mem_cache_engine = "lrucache";
if (!config::datacache_enable) {
config::disable_storage_page_cache = true;
@ -59,22 +54,22 @@ Status DataCache::init(const std::vector<StorePath>& store_paths) {
return Status::OK();
}
ASSIGN_OR_RETURN(auto cache_options, _init_cache_options());
ASSIGN_OR_RETURN(auto mem_cache_options, _init_mem_cache_options());
if (config::datacache_engine == "starcache") {
#if defined(WITH_STARCACHE)
RETURN_IF_ERROR(_init_starcache_engine(&cache_options));
RETURN_IF_ERROR(_init_peer_cache(cache_options));
ASSIGN_OR_RETURN(auto disk_cache_options, _init_disk_cache_options());
RETURN_IF_ERROR(_init_starcache_engine(&disk_cache_options));
if (config::block_cache_enable) {
RETURN_IF_ERROR(_block_cache->init(cache_options, _local_cache, _remote_cache));
}
#else
return Status::InternalError("starcache engine is not supported");
#endif
} else {
RETURN_IF_ERROR(_init_lrucache_engine(cache_options));
auto remote_cache_options = _init_remote_cache_options();
RETURN_IF_ERROR(_init_peer_cache(remote_cache_options));
if (config::block_cache_enable) {
auto block_cache_options = _init_block_cache_options();
RETURN_IF_ERROR(_block_cache->init(block_cache_options, _local_disk_cache, _remote_cache));
}
#endif
RETURN_IF_ERROR(_init_lrucache_engine(mem_cache_options));
RETURN_IF_ERROR(_init_page_cache());
@ -100,14 +95,15 @@ void DataCache::destroy() {
LOG(INFO) << "pagecache shutdown successfully";
_block_cache.reset();
_local_cache.reset();
_local_mem_cache.reset();
_local_disk_cache.reset();
_remote_cache.reset();
LOG(INFO) << "datacache shutdown successfully";
}
bool DataCache::adjust_mem_capacity(int64_t delta, size_t min_capacity) {
if (_local_cache != nullptr) {
Status st = _local_cache->adjust_mem_quota(delta, min_capacity);
if (_local_mem_cache != nullptr) {
Status st = _local_mem_cache->adjust_mem_quota(delta, min_capacity);
if (st.ok()) {
return true;
} else {
@ -119,52 +115,67 @@ bool DataCache::adjust_mem_capacity(int64_t delta, size_t min_capacity) {
}
size_t DataCache::get_mem_capacity() const {
if (_local_cache != nullptr) {
return _local_cache->mem_quota();
if (_local_mem_cache != nullptr) {
return _local_mem_cache->mem_quota();
} else {
return 0;
}
}
Status DataCache::_init_lrucache_engine(const CacheOptions& cache_options) {
_local_cache = std::make_shared<LRUCacheEngine>();
RETURN_IF_ERROR(_local_cache->init(cache_options));
Status DataCache::_init_lrucache_engine(const MemCacheOptions& cache_options) {
_local_mem_cache = std::make_shared<LRUCacheEngine>();
RETURN_IF_ERROR(reinterpret_cast<LRUCacheEngine*>(_local_mem_cache.get())->init(cache_options));
LOG(INFO) << "lrucache engine init successfully";
return Status::OK();
}
Status DataCache::_init_page_cache() {
_page_cache->init(_local_cache.get());
_page_cache->init(_local_mem_cache.get());
_page_cache->init_metrics();
LOG(INFO) << "storage page cache init successfully";
return Status::OK();
}
#if defined(WITH_STARCACHE)
Status DataCache::_init_starcache_engine(CacheOptions* cache_options) {
Status DataCache::_init_starcache_engine(DiskCacheOptions* cache_options) {
// init starcache & disk monitor
// TODO: DiskSpaceMonitor needs to be decoupled from StarCacheEngine.
_local_cache = std::make_shared<StarCacheEngine>();
_disk_space_monitor = std::make_shared<DiskSpaceMonitor>(_local_cache.get());
_local_disk_cache = std::make_shared<StarCacheEngine>();
_disk_space_monitor = std::make_shared<DiskSpaceMonitor>(_local_disk_cache.get());
RETURN_IF_ERROR(_disk_space_monitor->init(&cache_options->dir_spaces));
RETURN_IF_ERROR(_local_cache->init(*cache_options));
RETURN_IF_ERROR(reinterpret_cast<StarCacheEngine*>(_local_disk_cache.get())->init(*cache_options));
_disk_space_monitor->start();
return Status::OK();
}
Status DataCache::_init_peer_cache(const CacheOptions& cache_options) {
Status DataCache::_init_peer_cache(const RemoteCacheOptions& cache_options) {
_remote_cache = std::make_shared<PeerCacheEngine>();
return _remote_cache->init(cache_options);
}
#endif
StatusOr<CacheOptions> DataCache::_init_cache_options() {
CacheOptions cache_options;
RemoteCacheOptions DataCache::_init_remote_cache_options() {
RemoteCacheOptions cache_options{.skip_read_factor = config::datacache_skip_read_factor};
return cache_options;
}
StatusOr<MemCacheOptions> DataCache::_init_mem_cache_options() {
MemCacheOptions cache_options;
RETURN_IF_ERROR(DataCacheUtils::parse_conf_datacache_mem_size(
config::datacache_mem_size, _global_env->process_mem_limit(), &cache_options.mem_space_size));
cache_options.engine = config::datacache_engine;
return cache_options;
}
if (config::datacache_engine == "starcache") {
BlockCacheOptions DataCache::_init_block_cache_options() {
BlockCacheOptions cache_options;
cache_options.block_size = config::datacache_block_size;
return cache_options;
}
StatusOr<DiskCacheOptions> DataCache::_init_disk_cache_options() {
DiskCacheOptions cache_options;
if (_local_disk_cache_engine == "starcache") {
#ifdef USE_STAROS
std::vector<string> corresponding_starlet_dirs;
if (config::datacache_unified_instance_enable && !config::starlet_cache_dir.empty()) {
@ -276,8 +287,8 @@ void DataCache::try_release_resource_before_core_dump() {
return release_all || modules.contains(name);
};
if (_local_cache != nullptr && need_release("data_cache")) {
(void)_local_cache->update_mem_quota(0, false);
if (_local_mem_cache != nullptr && need_release("data_cache")) {
(void)_local_mem_cache->update_mem_quota(0, false);
}
}

View File

@ -23,7 +23,7 @@ namespace starrocks {
class Status;
class StorePath;
class RemoteCacheEngine;
class CacheOptions;
class DiskCacheOptions;
class GlobalEnv;
class DiskSpaceMonitor;
class MemSpaceMonitor;
@ -39,10 +39,16 @@ public:
void try_release_resource_before_core_dump();
void set_local_cache(std::shared_ptr<LocalCacheEngine> local_cache) { _local_cache = std::move(local_cache); }
void set_local_mem_cache(std::shared_ptr<LocalCacheEngine> local_mem_cache) {
_local_mem_cache = std::move(local_mem_cache);
}
void set_local_disk_cache(std::shared_ptr<LocalCacheEngine> local_disk_cache) {
_local_disk_cache = std::move(local_disk_cache);
}
void set_page_cache(std::shared_ptr<StoragePageCache> page_cache) { _page_cache = std::move(page_cache); }
LocalCacheEngine* local_cache() { return _local_cache.get(); }
LocalCacheEngine* local_mem_cache() { return _local_mem_cache.get(); }
LocalCacheEngine* local_disk_cache() { return _local_disk_cache.get(); }
BlockCache* block_cache() const { return _block_cache.get(); }
void set_block_cache(std::shared_ptr<BlockCache> block_cache) { _block_cache = std::move(block_cache); }
StoragePageCache* page_cache() const { return _page_cache.get(); }
@ -56,19 +62,26 @@ public:
size_t get_mem_capacity() const;
private:
StatusOr<CacheOptions> _init_cache_options();
StatusOr<MemCacheOptions> _init_mem_cache_options();
StatusOr<DiskCacheOptions> _init_disk_cache_options();
RemoteCacheOptions _init_remote_cache_options();
BlockCacheOptions _init_block_cache_options();
#if defined(WITH_STARCACHE)
Status _init_starcache_engine(CacheOptions* cache_options);
Status _init_peer_cache(const CacheOptions& cache_options);
Status _init_starcache_engine(DiskCacheOptions* cache_options);
Status _init_peer_cache(const RemoteCacheOptions& cache_options);
#endif
Status _init_lrucache_engine(const CacheOptions& cache_options);
Status _init_lrucache_engine(const MemCacheOptions& cache_options);
Status _init_page_cache();
GlobalEnv* _global_env;
std::vector<StorePath> _store_paths;
// cache engine
std::shared_ptr<LocalCacheEngine> _local_cache;
std::string _local_mem_cache_engine;
std::string _local_disk_cache_engine;
std::shared_ptr<LocalCacheEngine> _local_mem_cache;
std::shared_ptr<LocalCacheEngine> _local_disk_cache;
std::shared_ptr<RemoteCacheEngine> _remote_cache;
std::shared_ptr<BlockCache> _block_cache;

View File

@ -27,7 +27,6 @@ class LocalCacheEngine {
public:
virtual ~LocalCacheEngine() = default;
virtual Status init(const CacheOptions& options) = 0;
virtual bool is_initialized() const = 0;
// Write data to cache

View File

@ -17,7 +17,7 @@
#include <butil/fast_rand.h>
namespace starrocks {
Status LRUCacheEngine::init(const CacheOptions& options) {
Status LRUCacheEngine::init(const MemCacheOptions& options) {
_cache = std::make_unique<ShardedLRUCache>(options.mem_space_size);
_initialized.store(true, std::memory_order_relaxed);
return Status::OK();

View File

@ -25,7 +25,7 @@ public:
LRUCacheEngine() = default;
virtual ~LRUCacheEngine() override = default;
Status init(const CacheOptions& options) override;
Status init(const MemCacheOptions& options);
bool is_initialized() const override { return _initialized.load(std::memory_order_relaxed); }
Status write(const std::string& key, const IOBuffer& buffer, WriteCacheOptions* options) override;

View File

@ -23,7 +23,7 @@
namespace starrocks {
Status PeerCacheEngine::init(const CacheOptions& options) {
Status PeerCacheEngine::init(const RemoteCacheOptions& options) {
_cache_adaptor.reset(starcache::create_default_adaptor(options.skip_read_factor));
return Status::OK();
}

View File

@ -24,7 +24,7 @@ public:
PeerCacheEngine() = default;
~PeerCacheEngine() override = default;
Status init(const CacheOptions& options) override;
Status init(const RemoteCacheOptions& options) override;
Status read(const std::string& key, size_t off, size_t size, IOBuffer* buffer, ReadCacheOptions* options) override;

View File

@ -25,7 +25,7 @@ public:
virtual ~RemoteCacheEngine() = default;
// Init remote cache
virtual Status init(const CacheOptions& options) = 0;
virtual Status init(const RemoteCacheOptions& options) = 0;
// Write data to remote cache
virtual Status write(const std::string& key, const IOBuffer& buffer, WriteCacheOptions* options) = 0;

View File

@ -27,7 +27,7 @@
namespace starrocks {
Status StarCacheEngine::init(const CacheOptions& options) {
Status StarCacheEngine::init(const DiskCacheOptions& options) {
starcache::CacheOptions opt;
opt.mem_quota_bytes = options.mem_space_size;
for (auto& dir : options.dir_spaces) {

View File

@ -26,7 +26,7 @@ public:
StarCacheEngine() = default;
virtual ~StarCacheEngine() override = default;
Status init(const CacheOptions& options) override;
Status init(const DiskCacheOptions& options);
bool is_initialized() const override { return _initialized.load(std::memory_order_relaxed); }
Status write(const std::string& key, const IOBuffer& buffer, WriteCacheOptions* options) override;

View File

@ -37,12 +37,12 @@ void BinaryColumnBase<T>::check_or_die() const {
CHECK_EQ(_bytes.size(), _offsets.back());
size_t size = this->size();
for (size_t i = 0; i < size; i++) {
CHECK_GE(_offsets[i + 1], _offsets[i]);
DCHECK_GE(_offsets[i + 1], _offsets[i]);
}
if (_slices_cache) {
for (size_t i = 0; i < size; i++) {
CHECK_EQ(_slices[i].data, get_slice(i).data);
CHECK_EQ(_slices[i].size, get_slice(i).size);
DCHECK_EQ(_slices[i].data, get_slice(i).data);
DCHECK_EQ(_slices[i].size, get_slice(i).size);
}
}
}
@ -83,35 +83,69 @@ void BinaryColumnBase<T>::append(const Column& src, size_t offset, size_t count)
}
template <typename T>
void BinaryColumnBase<T>::append_selective(const Column& src, const uint32_t* indexes, uint32_t from, uint32_t size) {
void BinaryColumnBase<T>::append_selective(const Column& src, const uint32_t* indexes, uint32_t from,
const uint32_t size) {
if (src.is_binary_view()) {
down_cast<const ColumnView*>(&src)->append_to(*this, indexes, from, size);
return;
}
indexes += from;
const auto& src_column = down_cast<const BinaryColumnBase<T>&>(src);
const auto& src_offsets = src_column.get_offset();
const auto& src_bytes = src_column.get_bytes();
size_t cur_row_count = _offsets.size() - 1;
size_t cur_byte_size = _bytes.size();
const size_t prev_num_offsets = _offsets.size();
const size_t prev_num_rows = prev_num_offsets - 1;
_offsets.resize(cur_row_count + size + 1);
_offsets.resize(prev_num_offsets + size * 2);
auto* __restrict new_offsets = _offsets.data() + prev_num_offsets;
const auto* __restrict src_offsets = src_column.get_offset().data();
// Buffer i-th start offset and end offset in new_offsets[i * 2] and new_offsets[i * 2 + 1].
for (size_t i = 0; i < size; i++) {
uint32_t row_idx = indexes[from + i];
T str_size = src_offsets[row_idx + 1] - src_offsets[row_idx];
_offsets[cur_row_count + i + 1] = _offsets[cur_row_count + i] + str_size;
cur_byte_size += str_size;
const uint32_t src_idx = indexes[i];
new_offsets[i * 2] = src_offsets[src_idx];
new_offsets[i * 2 + 1] = src_offsets[src_idx + 1];
}
_bytes.resize(cur_byte_size);
auto* dest_bytes = _bytes.data();
for (size_t i = 0; i < size; i++) {
uint32_t row_idx = indexes[from + i];
T str_size = src_offsets[row_idx + 1] - src_offsets[row_idx];
strings::memcpy_inlined(dest_bytes + _offsets[cur_row_count + i], src_bytes.data() + src_offsets[row_idx],
str_size);
// Write bytes
{
size_t num_bytes = _bytes.size();
for (size_t i = 0; i < size; i++) {
num_bytes += new_offsets[i * 2 + 1] - new_offsets[i * 2];
}
_bytes.resize(num_bytes);
const auto* __restrict src_bytes = src_column.get_bytes().data();
auto* __restrict dest_bytes = _bytes.data();
size_t cur_offset = _offsets[prev_num_rows];
if (src_column.get_bytes().size() > 32 * 1024 * 1024ull) {
for (size_t i = 0; i < size; i++) {
if (i + 16 < size) {
// If the source column is large enough, use prefetch to speed up copying.
__builtin_prefetch(src_bytes + new_offsets[i * 2 + 32]);
}
const T str_size = new_offsets[i * 2 + 1] - new_offsets[i * 2];
strings::memcpy_inlined(dest_bytes + cur_offset, src_bytes + new_offsets[i * 2], str_size);
cur_offset += str_size;
}
} else {
for (size_t i = 0; i < size; i++) {
const T str_size = new_offsets[i * 2 + 1] - new_offsets[i * 2];
// Only copy 16 bytes extra when src_column is small enough, because the overhead of copying 16 bytes
// will be large when src_column is large enough.
strings::memcpy_inlined_overflow16(dest_bytes + cur_offset, src_bytes + new_offsets[i * 2], str_size);
cur_offset += str_size;
}
}
}
// Write offsets.
for (int64_t i = 0; i < size; i++) {
new_offsets[i] = new_offsets[i - 1] + (new_offsets[i * 2 + 1] - new_offsets[i * 2]);
}
_offsets.resize(prev_num_offsets + size);
_slices_cache = false;
}

View File

@ -255,7 +255,7 @@ std::unique_ptr<Chunk> Chunk::clone_empty_with_slot(size_t size) const {
columns[i] = _columns[i]->clone_empty();
columns[i]->reserve(size);
}
return std::make_unique<Chunk>(columns, _slot_id_to_index);
return std::make_unique<Chunk>(std::move(columns), _slot_id_to_index);
}
std::unique_ptr<Chunk> Chunk::clone_empty_with_schema() const {

View File

@ -74,6 +74,8 @@ public:
bool is_index() const { return _type == TAccessPathType::type::INDEX; }
bool is_root() const { return _type == TAccessPathType::type::ROOT; }
bool is_from_predicate() const { return _from_predicate; }
bool is_extended() const { return _extended; }

View File

@ -18,6 +18,7 @@
#include "runtime/mem_pool.h"
#include "storage/olap_type_infra.h"
#include "storage/type_traits.h"
#include "types/logical_type.h"
namespace starrocks {
@ -51,6 +52,7 @@ Status datum_from_string(TypeInfo* type_info, Datum* dst, const std::string& str
return Status::OK();
}
/* Type need memory allocated */
case TYPE_VARBINARY:
case TYPE_CHAR:
case TYPE_VARCHAR: {
/* Type need memory allocated */
@ -92,6 +94,7 @@ std::string datum_to_string(TypeInfo* type_info, const Datum& datum) {
switch (type) {
case TYPE_BOOLEAN:
return datum_to_string<TYPE_TINYINT>(type_info, datum);
case TYPE_VARBINARY:
case TYPE_CHAR:
case TYPE_VARCHAR:
return datum_to_string<TYPE_VARCHAR>(type_info, datum);

View File

@ -37,28 +37,36 @@ StatusOr<ColumnPtr> FixedLengthColumnBase<T>::upgrade_if_overflow() {
template <typename T>
void FixedLengthColumnBase<T>::append(const Column& src, size_t offset, size_t count) {
const auto& num_src = down_cast<const FixedLengthColumnBase<T>&>(src);
_data.insert(_data.end(), num_src._data.begin() + offset, num_src._data.begin() + offset + count);
DCHECK(this != &src);
const size_t orig_size = _data.size();
raw::stl_vector_resize_uninitialized(&_data, orig_size + count);
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
strings::memcpy_inlined(_data.data() + orig_size, src_data + offset, count * sizeof(T));
}
template <typename T>
void FixedLengthColumnBase<T>::append_selective(const Column& src, const uint32_t* indexes, uint32_t from,
uint32_t size) {
DCHECK(this != &src);
indexes += from;
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
const size_t orig_size = _data.size();
_data.resize(orig_size + size);
raw::stl_vector_resize_uninitialized(&_data, orig_size + size);
auto* dest_data = _data.data() + orig_size;
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
SIMDGather::gather(dest_data, src_data, indexes, size);
}
template <typename T>
void FixedLengthColumnBase<T>::append_value_multiple_times(const Column& src, uint32_t index, uint32_t size) {
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
DCHECK(this != &src);
size_t orig_size = _data.size();
_data.resize(orig_size + size);
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
for (size_t i = 0; i < size; ++i) {
_data[orig_size + i] = src_data[index];
}

View File

@ -523,10 +523,10 @@ void NullableColumn::put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool
}
void NullableColumn::check_or_die() const {
CHECK_EQ(_null_column->size(), _data_column->size());
DCHECK_EQ(_null_column->size(), _data_column->size());
// when _has_null=true, the column may have no null value, so don't check.
if (!_has_null) {
CHECK(!SIMD::contain_nonzero(_null_column->get_data(), 0));
DCHECK(!SIMD::contain_nonzero(_null_column->get_data(), 0));
}
_data_column->check_or_die();
_null_column->check_or_die();

View File

@ -17,6 +17,8 @@
#include <algorithm>
#include <utility>
#include "exec/sorting/sorting.h"
namespace starrocks {
#ifdef BE_TEST
@ -28,8 +30,13 @@ Schema::Schema(Fields fields) : Schema(fields, KeysType::DUP_KEYS, {}) {
#endif
Schema::Schema(Fields fields, KeysType keys_type, std::vector<ColumnId> sort_key_idxes)
: Schema(std::move(fields), keys_type, std::move(sort_key_idxes), nullptr) {}
Schema::Schema(Fields fields, KeysType keys_type, std::vector<ColumnId> sort_key_idxes,
std::shared_ptr<SortDescs> sort_descs)
: _fields(std::move(fields)),
_sort_key_idxes(std::move(sort_key_idxes)),
_sort_descs(std::move(sort_descs)),
_name_to_index_append_buffer(nullptr),
_keys_type(static_cast<uint8_t>(keys_type)) {
@ -52,9 +59,16 @@ Schema::Schema(Schema* schema, const std::vector<ColumnId>& cids)
_fields[i] = schema->_fields[cids[i]];
cids_to_field_id[cids[i]] = i;
}
for (auto idx : ori_sort_idxes) {
if (schema->sort_descs()) {
_sort_descs = std::make_shared<SortDescs>();
}
for (size_t pos = 0; pos < ori_sort_idxes.size(); ++pos) {
auto idx = ori_sort_idxes[pos];
if (cids_to_field_id.count(idx) > 0) {
_sort_key_idxes.emplace_back(cids_to_field_id[idx]);
if (_sort_descs && pos < schema->sort_descs()->descs.size()) {
_sort_descs->descs.emplace_back(schema->sort_descs()->descs[pos]);
}
}
}
auto is_key = [](const FieldPtr& f) { return f->is_key(); };
@ -88,6 +102,7 @@ Schema::Schema(Schema* schema)
_fields[i] = schema->_fields[i];
}
_sort_key_idxes = schema->sort_key_idxes();
_sort_descs = schema->sort_descs();
if (schema->_name_to_index_append_buffer == nullptr) {
// share the name_to_index with schema, later append fields will be added to _name_to_index_append_buffer
schema->_share_name_to_index = true;
@ -109,6 +124,7 @@ Schema::Schema(const Schema& schema)
_fields[i] = schema._fields[i];
}
_sort_key_idxes = schema.sort_key_idxes();
_sort_descs = schema.sort_descs();
if (schema._name_to_index_append_buffer == nullptr) {
// share the name_to_index with schema&, later append fields will be added to _name_to_index_append_buffer
schema._share_name_to_index = true;
@ -132,6 +148,7 @@ Schema& Schema::operator=(const Schema& other) {
this->_fields[i] = other._fields[i];
}
this->_sort_key_idxes = other.sort_key_idxes();
this->_sort_descs = other.sort_descs();
if (other._name_to_index_append_buffer == nullptr) {
// share the name_to_index with schema&, later append fields will be added to _name_to_index_append_buffer
other._share_name_to_index = true;

View File

@ -24,6 +24,8 @@
namespace starrocks {
struct SortDescs;
// TODO: move constructor and move assignment
class Schema {
public:
@ -39,6 +41,9 @@ public:
explicit Schema(Fields fields, KeysType keys_type, std::vector<ColumnId> sort_key_idxes);
explicit Schema(Fields fields, KeysType keys_type, std::vector<ColumnId> sort_key_idxes,
std::shared_ptr<SortDescs> sort_descs);
// if we use this constructor and share the name_to_index with another schema,
// we must make sure another shema is read only!!!
explicit Schema(Schema* schema);
@ -61,6 +66,10 @@ public:
const std::vector<ColumnId> sort_key_idxes() const { return _sort_key_idxes; }
void append_sort_key_idx(ColumnId idx) { _sort_key_idxes.emplace_back(idx); }
void set_sort_key_idxes(const std::vector<ColumnId>& sort_key_idxes) { _sort_key_idxes = sort_key_idxes; }
std::shared_ptr<SortDescs> sort_descs() const { return _sort_descs; }
void set_sort_descs(const std::shared_ptr<SortDescs>& sort_descs) { _sort_descs = sort_descs; }
void reserve(size_t size) { _fields.reserve(size); }
@ -133,6 +142,7 @@ private:
Fields _fields;
size_t _num_keys = 0;
std::vector<ColumnId> _sort_key_idxes;
std::shared_ptr<SortDescs> _sort_descs;
std::shared_ptr<std::unordered_map<std::string_view, size_t>> _name_to_index;
// If we share the same _name_to_index with another vectorized schema,

View File

@ -323,6 +323,14 @@ CONF_mBool(enable_zonemap_index_memory_page_cache, "true");
// whether to enable the ordinal index memory cache
CONF_mBool(enable_ordinal_index_memory_page_cache, "true");
// ========================== ZONEMAP BEGIN ===================================
// Enable ZoneMap for string (CHAR/VARCHAR) columns using prefix-based min/max
CONF_mBool(enable_string_prefix_zonemap, "true");
// Prefix length used for string ZoneMap min/max when enabled
CONF_mInt32(string_prefix_zonemap_prefix_len, "16");
// ========================== ZONEMAP END ===================================
CONF_mInt32(base_compaction_check_interval_seconds, "60");
CONF_mInt64(min_base_compaction_num_singleton_deltas, "5");
CONF_mInt64(max_base_compaction_num_singleton_deltas, "100");
@ -568,6 +576,8 @@ CONF_mBool(enable_token_check, "true");
// to open/close system metrics
CONF_Bool(enable_system_metrics, "true");
CONF_Bool(enable_jvm_metrics, "false");
CONF_mBool(enable_prefetch, "true");
// Number of cores StarRocks will used, this will effect only when it's greater than 0.
@ -915,6 +925,9 @@ CONF_mInt64(tablet_internal_parallel_min_scan_dop, "4");
// Only the num rows of lake tablet less than lake_tablet_rows_splitted_ratio * splitted_scan_rows, than the lake tablet can be splitted.
CONF_mDouble(lake_tablet_rows_splitted_ratio, "1.5");
// Allow skipping invalid delete_predicate in order to get the segment data back, and do manual correction.
CONF_mBool(lake_tablet_ignore_invalid_delete_predicate, "false");
// The bitmap serialize version.
CONF_Int16(bitmap_serialize_version, "1");
// The max hdfs file handle.
@ -1073,6 +1086,8 @@ CONF_Int64(rpc_connect_timeout_ms, "30000");
CONF_Int32(max_batch_publish_latency_ms, "100");
// Config for opentelemetry tracing.
// Valid example: jaeger_endpoint = localhost:14268
// Invalid example: jaeger_endpoint = http://localhost:14268
CONF_String(jaeger_endpoint, "");
// Config for query debug trace
@ -1511,8 +1526,10 @@ CONF_mBool(lake_enable_vertical_compaction_fill_data_cache, "true");
CONF_mInt32(dictionary_cache_refresh_timeout_ms, "60000"); // 1 min
CONF_mInt32(dictionary_cache_refresh_threadpool_size, "8");
// ======================= FLAT JSON start ==============================================
// json flat flag
CONF_mBool(enable_json_flat, "false");
CONF_mBool(enable_json_flat, "true");
// enable compaction is base on flat json, not whole json
CONF_mBool(enable_compaction_flat_json, "true");
@ -1546,6 +1563,7 @@ CONF_mInt32(json_flat_column_max, "100");
// for whitelist on flat json remain data, max set 1kb
CONF_mInt32(json_flat_remain_filter_max_bytes, "1024");
// ======================= FLAT JSON end ==============================================
// Allowable intervals for continuous generation of pk dumps
// Disable when pk_dump_interval_seconds <= 0
@ -1589,6 +1607,8 @@ CONF_mBool(apply_del_vec_after_all_index_filter, "true");
CONF_mDouble(connector_sink_mem_high_watermark_ratio, "0.3");
CONF_mDouble(connector_sink_mem_low_watermark_ratio, "0.1");
CONF_mDouble(connector_sink_mem_urgent_space_ratio, "0.1");
// Whether enable spill intermediate data for connector sink.
CONF_mBool(enable_connector_sink_spill, "true");
// .crm file can be removed after 1day.
CONF_mInt32(unused_crm_file_threshold_second, "86400" /** 1day **/);
@ -1729,4 +1749,5 @@ CONF_mInt64(split_exchanger_buffer_chunk_num, "1000");
// when to split hashmap/hashset into two level hashmap/hashset, negative number means use default value
CONF_mInt64(two_level_memory_threshold, "-1");
} // namespace starrocks::config

View File

@ -210,6 +210,7 @@ void jemalloc_tracker_daemon(void* arg_this) {
static void init_starrocks_metrics(const std::vector<StorePath>& store_paths) {
bool init_system_metrics = config::enable_system_metrics;
bool init_jvm_metrics = config::enable_jvm_metrics;
std::set<std::string> disk_devices;
std::vector<std::string> network_interfaces;
std::vector<std::string> paths;
@ -229,7 +230,8 @@ static void init_starrocks_metrics(const std::vector<StorePath>& store_paths) {
return;
}
}
StarRocksMetrics::instance()->initialize(paths, init_system_metrics, disk_devices, network_interfaces);
StarRocksMetrics::instance()->initialize(paths, init_system_metrics, init_jvm_metrics, disk_devices,
network_interfaces);
}
void sigterm_handler(int signo, siginfo_t* info, void* context) {

View File

@ -78,11 +78,11 @@
#define VLOG_OPERATOR VLOG(3)
#define VLOG_ROW VLOG(10)
#define VLOG_PROGRESS VLOG(2)
#define VLOG_CACHE VLOG(1)
#define VLOG_CACHE VLOG(3)
#define VLOG_CONNECTION_IS_ON VLOG_IS_ON(1)
#define VLOG_CONNECTION_IS_ON VLOG_IS_ON(2)
#define VLOG_RPC_IS_ON VLOG_IS_ON(2)
#define VLOG_QUERY_IS_ON VLOG_IS_ON(1)
#define VLOG_QUERY_IS_ON VLOG_IS_ON(2)
#define VLOG_FILE_IS_ON VLOG_IS_ON(2)
#define VLOG_OPERATOR_IS_ON VLOG_IS_ON(3)
#define VLOG_ROW_IS_ON VLOG_IS_ON(10)

View File

@ -40,12 +40,17 @@ void Tracer::release_instance() {
Instance().shutdown();
}
static inline opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> create_no_op_tracer() {
return opentelemetry::trace::Provider::GetTracerProvider()->GetTracer("no-op", OPENTELEMETRY_SDK_VERSION);
}
void Tracer::init(const std::string& service_name) {
if (!config::jaeger_endpoint.empty()) {
opentelemetry::exporter::jaeger::JaegerExporterOptions opts;
vector<string> host_port = strings::Split(config::jaeger_endpoint, ":");
if (host_port.size() != 2) {
LOG(WARNING) << "bad jaeger_endpoint " << config::jaeger_endpoint;
_tracer = create_no_op_tracer();
return;
}
opts.endpoint = host_port[0];
@ -63,7 +68,7 @@ void Tracer::init(const std::string& service_name) {
new opentelemetry::sdk::trace::TracerProvider(std::move(processor), jaeger_resource));
_tracer = provider->GetTracer(service_name, OPENTELEMETRY_SDK_VERSION);
} else {
_tracer = opentelemetry::trace::Provider::GetTracerProvider()->GetTracer("no-op", OPENTELEMETRY_SDK_VERSION);
_tracer = create_no_op_tracer();
}
}

View File

@ -31,6 +31,8 @@ add_library(Connector STATIC
utils.cpp
async_flush_stream_poller.cpp
sink_memory_manager.cpp
partition_chunk_writer.cpp
connector_sink_executor.cpp
deletion_vector/deletion_vector.cpp
deletion_vector/deletion_bitmap.cpp
)

View File

@ -16,7 +16,7 @@
namespace starrocks::connector {
void AsyncFlushStreamPoller::enqueue(std::unique_ptr<Stream> stream) {
void AsyncFlushStreamPoller::enqueue(std::shared_ptr<Stream> stream) {
auto async_status = stream->io_status();
_queue.push_back({
.stream = std::move(stream),

View File

@ -34,7 +34,7 @@ public:
virtual ~AsyncFlushStreamPoller() = default;
virtual void enqueue(std::unique_ptr<Stream> stream);
virtual void enqueue(std::shared_ptr<Stream> stream);
// return a pair of
// 1. io status
@ -45,7 +45,7 @@ public:
private:
struct StreamWithStatus {
std::unique_ptr<Stream> stream;
std::shared_ptr<Stream> stream;
std::future<Status> async_status;
};

View File

@ -24,21 +24,18 @@ namespace starrocks::connector {
ConnectorChunkSink::ConnectorChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory,
int64_t max_file_size, RuntimeState* state, bool support_null_partition)
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory,
RuntimeState* state, bool support_null_partition)
: _partition_column_names(std::move(partition_columns)),
_partition_column_evaluators(std::move(partition_column_evaluators)),
_location_provider(std::move(location_provider)),
_file_writer_factory(std::move(file_writer_factory)),
_max_file_size(max_file_size),
_partition_chunk_writer_factory(std::move(partition_chunk_writer_factory)),
_state(state),
_support_null_partition(support_null_partition) {}
Status ConnectorChunkSink::init() {
RETURN_IF_ERROR(ColumnEvaluator::init(_partition_column_evaluators));
RETURN_IF_ERROR(_file_writer_factory->init());
_op_mem_mgr->init(&_writer_stream_pairs, _io_poller,
RETURN_IF_ERROR(_partition_chunk_writer_factory->init());
_op_mem_mgr->init(&_partition_chunk_writers, _io_poller,
[this](const CommitResult& r) { this->callback_on_commit(r); });
return Status::OK();
}
@ -49,38 +46,20 @@ Status ConnectorChunkSink::write_partition_chunk(const std::string& partition,
// They are under the same dir path, but should not in the same data file.
// We should record them in different files so that each data file could has its own meta info.
// otherwise, the scanFileTask may filter data incorrectly.
auto it = _writer_stream_pairs.find(std::make_pair(partition, partition_field_null_list));
if (it != _writer_stream_pairs.end()) {
Writer* writer = it->second.first.get();
if (writer->get_written_bytes() >= _max_file_size) {
string null_fingerprint(partition_field_null_list.size(), '0');
std::transform(partition_field_null_list.begin(), partition_field_null_list.end(), null_fingerprint.begin(),
[](int8_t b) { return b + '0'; });
callback_on_commit(writer->commit().set_extra_data(null_fingerprint));
_writer_stream_pairs.erase(it);
auto path =
!_partition_column_names.empty() ? _location_provider->get(partition) : _location_provider->get();
ASSIGN_OR_RETURN(auto new_writer_and_stream, _file_writer_factory->create(path));
std::unique_ptr<Writer> new_writer = std::move(new_writer_and_stream.writer);
std::unique_ptr<Stream> new_stream = std::move(new_writer_and_stream.stream);
RETURN_IF_ERROR(new_writer->init());
RETURN_IF_ERROR(new_writer->write(chunk));
_writer_stream_pairs[std::make_pair(partition, partition_field_null_list)] =
std::make_pair(std::move(new_writer), new_stream.get());
_io_poller->enqueue(std::move(new_stream));
} else {
RETURN_IF_ERROR(writer->write(chunk));
}
PartitionKey partition_key = std::make_pair(partition, partition_field_null_list);
auto it = _partition_chunk_writers.find(partition_key);
if (it != _partition_chunk_writers.end()) {
return it->second->write(chunk);
} else {
auto path = !_partition_column_names.empty() ? _location_provider->get(partition) : _location_provider->get();
ASSIGN_OR_RETURN(auto new_writer_and_stream, _file_writer_factory->create(path));
std::unique_ptr<Writer> new_writer = std::move(new_writer_and_stream.writer);
std::unique_ptr<Stream> new_stream = std::move(new_writer_and_stream.stream);
RETURN_IF_ERROR(new_writer->init());
RETURN_IF_ERROR(new_writer->write(chunk));
_writer_stream_pairs[std::make_pair(partition, partition_field_null_list)] =
std::make_pair(std::move(new_writer), new_stream.get());
_io_poller->enqueue(std::move(new_stream));
auto writer = _partition_chunk_writer_factory->create(partition, partition_field_null_list);
auto commit_callback = [this](const CommitResult& r) { this->callback_on_commit(r); };
auto error_handler = [this](const Status& s) { this->set_status(s); };
writer->set_commit_callback(commit_callback);
writer->set_error_handler(error_handler);
writer->set_io_poller(_io_poller);
RETURN_IF_ERROR(writer->init());
RETURN_IF_ERROR(writer->write(chunk));
_partition_chunk_writers[partition_key] = writer;
}
return Status::OK();
}
@ -100,19 +79,42 @@ Status ConnectorChunkSink::add(Chunk* chunk) {
}
Status ConnectorChunkSink::finish() {
for (auto& [partition_key, writer_and_stream] : _writer_stream_pairs) {
string extra_data(partition_key.second.size(), '0');
std::transform(partition_key.second.begin(), partition_key.second.end(), extra_data.begin(),
[](int8_t b) { return b + '0'; });
callback_on_commit(writer_and_stream.first->commit().set_extra_data(extra_data));
for (auto& [partition_key, writer] : _partition_chunk_writers) {
RETURN_IF_ERROR(writer->finish());
}
return Status::OK();
}
void ConnectorChunkSink::push_rollback_action(const std::function<void()>& action) {
// Not a very frequent operation, so use unique_lock here is ok.
std::unique_lock<std::shared_mutex> wlck(_mutex);
_rollback_actions.push_back(std::move(action));
}
void ConnectorChunkSink::rollback() {
std::shared_lock<std::shared_mutex> rlck(_mutex);
for (auto& action : _rollback_actions) {
action();
}
}
void ConnectorChunkSink::set_status(const Status& status) {
std::unique_lock<std::shared_mutex> wlck(_mutex);
_status = status;
}
Status ConnectorChunkSink::status() {
std::shared_lock<std::shared_mutex> rlck(_mutex);
return _status;
}
bool ConnectorChunkSink::is_finished() {
for (auto& [partition_key, writer] : _partition_chunk_writers) {
if (!writer->is_finished()) {
return false;
}
}
return true;
}
} // namespace starrocks::connector

View File

@ -20,8 +20,8 @@
#include "column/chunk.h"
#include "common/status.h"
#include "connector/partition_chunk_writer.h"
#include "connector/utils.h"
#include "formats/file_writer.h"
#include "fs/fs.h"
#include "runtime/runtime_state.h"
@ -30,20 +30,14 @@ namespace starrocks::connector {
class AsyncFlushStreamPoller;
class SinkOperatorMemoryManager;
using Writer = formats::FileWriter;
using Stream = io::AsyncFlushOutputStream;
using WriterStreamPair = std::pair<std::unique_ptr<Writer>, Stream*>;
using PartitionKey = std::pair<std::string, std::vector<int8_t>>;
using CommitResult = formats::FileWriter::CommitResult;
using CommitFunc = std::function<void(const CommitResult& result)>;
class ConnectorChunkSink {
public:
ConnectorChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
RuntimeState* state, bool support_null_partition);
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory, RuntimeState* state,
bool support_null_partition);
void set_io_poller(AsyncFlushStreamPoller* poller) { _io_poller = poller; }
@ -59,26 +53,35 @@ public:
void rollback();
bool is_finished();
virtual void callback_on_commit(const CommitResult& result) = 0;
Status write_partition_chunk(const std::string& partition, const vector<int8_t>& partition_field_null_list,
Chunk* chunk);
Status status();
void set_status(const Status& status);
protected:
void push_rollback_action(const std::function<void()>& action);
AsyncFlushStreamPoller* _io_poller = nullptr;
SinkOperatorMemoryManager* _op_mem_mgr = nullptr;
std::vector<std::string> _partition_column_names;
std::vector<std::unique_ptr<ColumnEvaluator>> _partition_column_evaluators;
std::unique_ptr<LocationProvider> _location_provider;
std::unique_ptr<formats::FileWriterFactory> _file_writer_factory;
int64_t _max_file_size = 1024L * 1024 * 1024;
std::unique_ptr<PartitionChunkWriterFactory> _partition_chunk_writer_factory;
RuntimeState* _state = nullptr;
bool _support_null_partition{false};
std::vector<std::function<void()>> _rollback_actions;
std::map<PartitionKey, WriterStreamPair> _writer_stream_pairs;
std::map<PartitionKey, PartitionChunkWriterPtr> _partition_chunk_writers;
inline static std::string DEFAULT_PARTITION = "__DEFAULT_PARTITION__";
std::shared_mutex _mutex;
Status _status;
};
struct ConnectorChunkSinkContext {

View File

@ -0,0 +1,66 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "connector/connector_sink_executor.h"
#include "column/chunk.h"
#include "common/status.h"
#include "connector/partition_chunk_writer.h"
#include "storage/load_chunk_spiller.h"
namespace starrocks::connector {
Status ConnectorSinkSpillExecutor::init() {
return ThreadPoolBuilder(_executor_name)
.set_min_threads(0)
.set_max_threads(calc_max_thread_num())
.build(&_thread_pool);
}
int ConnectorSinkSpillExecutor::calc_max_thread_num() {
int dir_count = 0;
std::vector<starrocks::StorePath> spill_local_storage_paths;
Status st = parse_conf_store_paths(config::spill_local_storage_dir, &spill_local_storage_paths);
if (st.ok()) {
dir_count = spill_local_storage_paths.size();
}
int threads = config::lake_flush_thread_num_per_store;
if (threads == 0) {
threads = -2;
}
if (threads <= 0) {
threads = -threads;
threads *= CpuInfo::num_cores();
}
dir_count = std::max(1, dir_count);
dir_count = std::min(8, dir_count);
return dir_count * threads;
}
void ChunkSpillTask::run() {
auto res = _load_chunk_spiller->spill(*_chunk);
if (_cb) {
_cb(_chunk, res);
}
}
void MergeBlockTask::run() {
auto st = _writer->merge_blocks();
if (_cb) {
_cb(st);
}
}
} // namespace starrocks::connector

View File

@ -0,0 +1,100 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fmt/format.h>
#include <map>
#include "column/chunk.h"
#include "common/status.h"
#include "connector/utils.h"
#include "util/threadpool.h"
namespace starrocks {
class LoadChunkSpiller;
}
namespace starrocks::connector {
class SpillPartitionChunkWriter;
class ConnectorSinkExecutor {
public:
ConnectorSinkExecutor(const std::string& executor_name) : _executor_name(executor_name) {}
virtual ~ConnectorSinkExecutor() {}
virtual Status init() = 0;
ThreadPool* get_thread_pool() { return _thread_pool.get(); }
std::unique_ptr<ThreadPoolToken> create_token() {
return _thread_pool->new_token(ThreadPool::ExecutionMode::SERIAL);
}
Status refresh_max_thread_num() {
if (_thread_pool != nullptr) {
return _thread_pool->update_max_threads(calc_max_thread_num());
}
return Status::OK();
}
protected:
virtual int calc_max_thread_num() = 0;
protected:
std::string _executor_name;
std::unique_ptr<ThreadPool> _thread_pool;
};
class ConnectorSinkSpillExecutor : public ConnectorSinkExecutor {
public:
ConnectorSinkSpillExecutor() : ConnectorSinkExecutor("conn_sink_spill") {}
Status init() override;
protected:
int calc_max_thread_num() override;
};
class ChunkSpillTask final : public Runnable {
public:
ChunkSpillTask(LoadChunkSpiller* load_chunk_spiller, ChunkPtr chunk,
std::function<void(ChunkPtr chunk, const StatusOr<size_t>&)> cb)
: _load_chunk_spiller(load_chunk_spiller), _chunk(chunk), _cb(std::move(cb)) {}
~ChunkSpillTask() override = default;
void run() override;
private:
LoadChunkSpiller* _load_chunk_spiller;
ChunkPtr _chunk;
std::function<void(ChunkPtr, const StatusOr<size_t>&)> _cb;
};
class MergeBlockTask : public Runnable {
public:
MergeBlockTask(SpillPartitionChunkWriter* writer, std::function<void(const Status&)> cb)
: _writer(writer), _cb(std::move(cb)) {}
void run() override;
private:
SpillPartitionChunkWriter* _writer;
std::function<void(const Status&)> _cb;
};
} // namespace starrocks::connector

View File

@ -14,7 +14,6 @@
#include "connector/es_connector.h"
#include "common/logging.h"
#include "exec/es/es_predicate.h"
#include "exec/es/es_query_builder.h"
#include "exec/es/es_scan_reader.h"
@ -22,6 +21,7 @@
#include "exec/es/es_scroll_query.h"
#include "exec/exec_node.h"
#include "exprs/expr.h"
#include "service/backend_options.h"
#include "storage/chunk_helper.h"
namespace starrocks::connector {

View File

@ -31,12 +31,10 @@ namespace starrocks::connector {
FileChunkSink::FileChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory,
RuntimeState* state)
: ConnectorChunkSink(std::move(partition_columns), std::move(partition_column_evaluators),
std::move(location_provider), std::move(file_writer_factory), max_file_size, state, true) {
}
std::move(partition_chunk_writer_factory), state, true) {}
void FileChunkSink::callback_on_commit(const CommitResult& result) {
_rollback_actions.push_back(std::move(result.rollback_action));
@ -49,27 +47,27 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> FileChunkSinkProvider::create_chun
std::shared_ptr<ConnectorChunkSinkContext> context, int32_t driver_id) {
auto ctx = std::dynamic_pointer_cast<FileChunkSinkContext>(context);
auto runtime_state = ctx->fragment_context->runtime_state();
auto fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
std::shared_ptr<FileSystem> fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
auto column_evaluators = ColumnEvaluator::clone(ctx->column_evaluators);
auto location_provider = std::make_unique<connector::LocationProvider>(
auto location_provider = std::make_shared<connector::LocationProvider>(
ctx->path, print_id(ctx->fragment_context->query_id()), runtime_state->be_number(), driver_id,
boost::to_lower_copy(ctx->format));
std::unique_ptr<formats::FileWriterFactory> file_writer_factory;
std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
file_writer_factory = std::make_unique<formats::ParquetFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
std::nullopt, ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators), std::nullopt,
ctx->executor, runtime_state);
} else if (boost::iequals(ctx->format, formats::ORC)) {
file_writer_factory = std::make_unique<formats::ORCFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::ORCFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators), ctx->executor,
runtime_state);
} else if (boost::iequals(ctx->format, formats::CSV)) {
file_writer_factory = std::make_unique<formats::CSVFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::CSVFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators), ctx->executor,
runtime_state);
} else {
file_writer_factory = std::make_unique<formats::UnknownFileWriterFactory>(ctx->format);
file_writer_factory = std::make_shared<formats::UnknownFileWriterFactory>(ctx->format);
}
std::vector<std::string> partition_columns;
@ -78,9 +76,28 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> FileChunkSinkProvider::create_chun
partition_columns.push_back(ctx->column_names[idx]);
partition_column_evaluators.push_back(ctx->column_evaluators[idx]->clone());
}
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory;
// Disable the load spill for file sink temperarily
if (/* config::enable_connector_sink_spill */ false) {
auto partition_chunk_writer_ctx =
std::make_shared<SpillPartitionChunkWriterContext>(SpillPartitionChunkWriterContext{
{file_writer_factory, location_provider, ctx->max_file_size, partition_columns.empty()},
fs,
ctx->fragment_context,
nullptr,
nullptr});
partition_chunk_writer_factory = std::make_unique<SpillPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
} else {
auto partition_chunk_writer_ctx =
std::make_shared<BufferPartitionChunkWriterContext>(BufferPartitionChunkWriterContext{
{file_writer_factory, location_provider, ctx->max_file_size, partition_columns.empty()}});
partition_chunk_writer_factory =
std::make_unique<BufferPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
}
return std::make_unique<connector::FileChunkSink>(partition_columns, std::move(partition_column_evaluators),
std::move(location_provider), std::move(file_writer_factory),
ctx->max_file_size, runtime_state);
std::move(partition_chunk_writer_factory), runtime_state);
}
} // namespace starrocks::connector

View File

@ -36,9 +36,7 @@ class FileChunkSink : public ConnectorChunkSink {
public:
FileChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
RuntimeState* state);
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory, RuntimeState* state);
~FileChunkSink() override = default;

View File

@ -29,12 +29,10 @@ namespace starrocks::connector {
HiveChunkSink::HiveChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory,
RuntimeState* state)
: ConnectorChunkSink(std::move(partition_columns), std::move(partition_column_evaluators),
std::move(location_provider), std::move(file_writer_factory), max_file_size, state,
false) {}
std::move(partition_chunk_writer_factory), state, false) {}
void HiveChunkSink::callback_on_commit(const CommitResult& result) {
_rollback_actions.push_back(std::move(result.rollback_action));
@ -55,36 +53,56 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> HiveChunkSinkProvider::create_chun
std::shared_ptr<ConnectorChunkSinkContext> context, int32_t driver_id) {
auto ctx = std::dynamic_pointer_cast<HiveChunkSinkContext>(context);
auto runtime_state = ctx->fragment_context->runtime_state();
auto fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value(); // must succeed
std::shared_ptr<FileSystem> fs =
FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value(); // must succeed
auto data_column_evaluators = ColumnEvaluator::clone(ctx->data_column_evaluators);
auto location_provider = std::make_unique<connector::LocationProvider>(
auto location_provider = std::make_shared<connector::LocationProvider>(
ctx->path, print_id(ctx->fragment_context->query_id()), runtime_state->be_number(), driver_id,
boost::to_lower_copy(ctx->format));
std::unique_ptr<formats::FileWriterFactory> file_writer_factory;
std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
// ensure hive compatibility since hive 3 and lower version accepts specific encoding
ctx->options[formats::ParquetWriterOptions::USE_LEGACY_DECIMAL_ENCODING] = "true";
ctx->options[formats::ParquetWriterOptions::USE_INT96_TIMESTAMP_ENCODING] = "true";
file_writer_factory = std::make_unique<formats::ParquetFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->data_column_names,
std::move(data_column_evaluators), std::nullopt, ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->data_column_names, std::move(data_column_evaluators),
std::nullopt, ctx->executor, runtime_state);
} else if (boost::iequals(ctx->format, formats::ORC)) {
file_writer_factory = std::make_unique<formats::ORCFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->data_column_names,
std::move(data_column_evaluators), ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::ORCFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->data_column_names, std::move(data_column_evaluators),
ctx->executor, runtime_state);
} else if (boost::iequals(ctx->format, formats::TEXTFILE)) {
file_writer_factory = std::make_unique<formats::CSVFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->data_column_names,
std::move(data_column_evaluators), ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::CSVFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->data_column_names, std::move(data_column_evaluators),
ctx->executor, runtime_state);
} else {
file_writer_factory = std::make_unique<formats::UnknownFileWriterFactory>(ctx->format);
file_writer_factory = std::make_shared<formats::UnknownFileWriterFactory>(ctx->format);
}
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory;
// Disable the load spill for hive sink temperarily
if (/* config::enable_connector_sink_spill */ false) {
auto partition_chunk_writer_ctx = std::make_shared<SpillPartitionChunkWriterContext>(
SpillPartitionChunkWriterContext{{file_writer_factory, location_provider, ctx->max_file_size,
ctx->partition_column_names.empty()},
fs,
ctx->fragment_context,
nullptr,
nullptr});
partition_chunk_writer_factory = std::make_unique<SpillPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
} else {
auto partition_chunk_writer_ctx = std::make_shared<BufferPartitionChunkWriterContext>(
BufferPartitionChunkWriterContext{{file_writer_factory, location_provider, ctx->max_file_size,
ctx->partition_column_names.empty()}});
partition_chunk_writer_factory =
std::make_unique<BufferPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
}
auto partition_column_evaluators = ColumnEvaluator::clone(ctx->partition_column_evaluators);
return std::make_unique<connector::HiveChunkSink>(
ctx->partition_column_names, std::move(partition_column_evaluators), std::move(location_provider),
std::move(file_writer_factory), ctx->max_file_size, runtime_state);
return std::make_unique<connector::HiveChunkSink>(ctx->partition_column_names,
std::move(partition_column_evaluators),
std::move(partition_chunk_writer_factory), runtime_state);
}
} // namespace starrocks::connector

View File

@ -38,9 +38,7 @@ class HiveChunkSink : public ConnectorChunkSink {
public:
HiveChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
RuntimeState* state);
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory, RuntimeState* state);
~HiveChunkSink() override = default;

View File

@ -30,15 +30,14 @@ namespace starrocks::connector {
IcebergChunkSink::IcebergChunkSink(std::vector<std::string> partition_columns, std::vector<std::string> transform_exprs,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory,
int64_t max_file_size, RuntimeState* state)
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory,
RuntimeState* state)
: ConnectorChunkSink(std::move(partition_columns), std::move(partition_column_evaluators),
std::move(location_provider), std::move(file_writer_factory), max_file_size, state, true),
std::move(partition_chunk_writer_factory), state, true),
_transform_exprs(std::move(transform_exprs)) {}
void IcebergChunkSink::callback_on_commit(const CommitResult& result) {
_rollback_actions.push_back(std::move(result.rollback_action));
push_rollback_action(std::move(result.rollback_action));
if (result.io_status.ok()) {
_state->update_num_rows_load_sink(result.file_statistics.record_count);
@ -82,27 +81,46 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> IcebergChunkSinkProvider::create_c
std::shared_ptr<ConnectorChunkSinkContext> context, int32_t driver_id) {
auto ctx = std::dynamic_pointer_cast<IcebergChunkSinkContext>(context);
auto runtime_state = ctx->fragment_context->runtime_state();
auto fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
std::shared_ptr<FileSystem> fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
auto column_evaluators = ColumnEvaluator::clone(ctx->column_evaluators);
auto location_provider = std::make_unique<connector::LocationProvider>(
auto location_provider = std::make_shared<connector::LocationProvider>(
ctx->path, print_id(ctx->fragment_context->query_id()), runtime_state->be_number(), driver_id,
boost::to_lower_copy(ctx->format));
std::unique_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
file_writer_factory = std::make_unique<formats::ParquetFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->parquet_field_ids, ctx->executor, runtime_state);
} else {
file_writer_factory = std::make_unique<formats::UnknownFileWriterFactory>(ctx->format);
}
std::vector<std::string>& partition_columns = ctx->partition_column_names;
std::vector<std::string>& transform_exprs = ctx->transform_exprs;
auto partition_evaluators = ColumnEvaluator::clone(ctx->partition_evaluators);
return std::make_unique<connector::IcebergChunkSink>(
partition_columns, transform_exprs, std::move(partition_evaluators), std::move(location_provider),
std::move(file_writer_factory), ctx->max_file_size, runtime_state);
std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->parquet_field_ids, ctx->executor, runtime_state);
} else {
file_writer_factory = std::make_shared<formats::UnknownFileWriterFactory>(ctx->format);
}
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory;
if (config::enable_connector_sink_spill) {
auto partition_chunk_writer_ctx =
std::make_shared<SpillPartitionChunkWriterContext>(SpillPartitionChunkWriterContext{
{file_writer_factory, location_provider, ctx->max_file_size, partition_columns.empty()},
fs,
ctx->fragment_context,
runtime_state->desc_tbl().get_tuple_descriptor(ctx->tuple_desc_id),
&ctx->column_evaluators,
ctx->sort_ordering});
partition_chunk_writer_factory = std::make_unique<SpillPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
} else {
auto partition_chunk_writer_ctx =
std::make_shared<BufferPartitionChunkWriterContext>(BufferPartitionChunkWriterContext{
{file_writer_factory, location_provider, ctx->max_file_size, partition_columns.empty()}});
partition_chunk_writer_factory =
std::make_unique<BufferPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
}
return std::make_unique<connector::IcebergChunkSink>(partition_columns, transform_exprs,
std::move(partition_evaluators),
std::move(partition_chunk_writer_factory), runtime_state);
}
Status IcebergChunkSink::add(Chunk* chunk) {

View File

@ -37,9 +37,7 @@ class IcebergChunkSink : public ConnectorChunkSink {
public:
IcebergChunkSink(std::vector<std::string> partition_columns, std::vector<std::string> transform_exprs,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
RuntimeState* state);
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory, RuntimeState* state);
~IcebergChunkSink() override = default;
@ -70,6 +68,8 @@ struct IcebergChunkSinkContext : public ConnectorChunkSinkContext {
PriorityThreadPool* executor = nullptr;
TCloudConfiguration cloud_conf;
pipeline::FragmentContext* fragment_context = nullptr;
int tuple_desc_id = -1;
std::shared_ptr<SortOrdering> sort_ordering;
};
class IcebergChunkSinkProvider : public ConnectorChunkSinkProvider {

View File

@ -374,8 +374,9 @@ Status LakeDataSource::init_tablet_reader(RuntimeState* runtime_state) {
_params.plan_node_id = _morsel->get_plan_node_id();
_params.scan_range = _morsel->get_scan_range();
}
ASSIGN_OR_RETURN(_reader, _tablet.new_reader(std::move(child_schema), need_split,
_provider->could_split_physically(), _morsel->rowsets()));
ASSIGN_OR_RETURN(_reader,
_tablet.new_reader(std::move(child_schema), need_split, _provider->could_split_physically(),
_morsel->rowsets(), _tablet_schema));
if (reader_columns.size() == scanner_columns.size()) {
_prj_iter = _reader;
} else {
@ -434,7 +435,15 @@ Status LakeDataSource::_extend_schema_by_access_paths() {
column.set_type(value_type);
column.set_length(path->value_type().len);
column.set_is_nullable(true);
column.set_extended_info(std::make_unique<ExtendedColumnInfo>(path.get(), root_column_index));
int32_t root_uid = _tablet_schema->column(static_cast<size_t>(root_column_index)).unique_id();
column.set_extended_info(std::make_unique<ExtendedColumnInfo>(path.get(), root_uid));
// For UNIQUE/AGG tables, extended flat JSON subcolumns behave like value columns
// and must carry a valid aggregation for pre-aggregation. Use REPLACE.
auto keys_type = _tablet_schema->keys_type();
if (keys_type == KeysType::UNIQUE_KEYS || keys_type == KeysType::AGG_KEYS) {
column.set_aggregation(StorageAggregateType::STORAGE_AGGREGATE_REPLACE);
}
tmp_schema->append_column(column);
VLOG(2) << "extend the access path column: " << path->linear_path();
@ -464,6 +473,28 @@ Status LakeDataSource::init_column_access_paths(Schema* schema) {
LOG(WARNING) << "failed to find column in schema: " << root;
}
}
// Preserve access paths referenced by extended columns even if not selected by pushdown
{
std::unordered_set<const ColumnAccessPath*> kept;
kept.reserve(new_one.size());
for (const auto& p : new_one) kept.insert(p.get());
for (size_t i = 0; i < _tablet_schema->num_columns(); ++i) {
const auto& col = _tablet_schema->column(i);
if (!col.is_extended() || col.extended_info() == nullptr || col.extended_info()->access_path == nullptr) {
continue;
}
const ColumnAccessPath* needed = col.extended_info()->access_path;
if (kept.find(needed) != kept.end()) continue;
for (auto& owned : _column_access_paths) {
if (owned.get() == needed) {
new_one.emplace_back(std::move(owned));
kept.insert(needed);
break;
}
}
}
}
_column_access_paths = std::move(new_one);
_params.column_access_paths = &_column_access_paths;

View File

@ -0,0 +1,388 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "connector/partition_chunk_writer.h"
#include "column/chunk.h"
#include "common/status.h"
#include "connector/async_flush_stream_poller.h"
#include "connector/connector_sink_executor.h"
#include "connector/sink_memory_manager.h"
#include "exec/pipeline/fragment_context.h"
#include "formats/file_writer.h"
#include "runtime/runtime_state.h"
#include "storage/chunk_helper.h"
#include "storage/convert_helper.h"
#include "storage/load_spill_block_manager.h"
#include "storage/storage_engine.h"
#include "storage/types.h"
#include "util/monotime.h"
namespace starrocks::connector {
// Base partition writer: owns at most one active file writer per partition and
// rolls files via commit_file(). `partition` is the partition path fragment;
// `partition_field_null_list` records, per partition field, whether the value
// was NULL (non-zero) or not (0).
PartitionChunkWriter::PartitionChunkWriter(std::string partition, std::vector<int8_t> partition_field_null_list,
                                           const std::shared_ptr<PartitionChunkWriterContext>& ctx)
        : _partition(std::move(partition)),
          _partition_field_null_list(std::move(partition_field_null_list)),
          _file_writer_factory(ctx->file_writer_factory),
          _location_provider(ctx->location_provider),
          _max_file_size(ctx->max_file_size),
          _is_default_partition(ctx->is_default_partition) {
    // Encode the null flags as a printable '0'/'1' string; it is attached as
    // extra data to every commit result (see commit_file()).
    _commit_extra_data.resize(_partition_field_null_list.size(), '0');
    std::transform(_partition_field_null_list.begin(), _partition_field_null_list.end(), _commit_extra_data.begin(),
                   [](int8_t b) { return b + '0'; });
}

// Lazily create the file writer (and its async output stream) for this
// partition. The stream is handed to the IO poller so flushing proceeds
// asynchronously. No-op when a writer already exists.
// NOTE(review): assumes set_io_poller() was called before the first write —
// _io_poller is dereferenced unconditionally; confirm against the operator setup.
Status PartitionChunkWriter::create_file_writer_if_needed() {
    if (!_file_writer) {
        // The default (unpartitioned) sink writes to the root location.
        std::string path = _is_default_partition ? _location_provider->get() : _location_provider->get(_partition);
        ASSIGN_OR_RETURN(auto new_writer_and_stream, _file_writer_factory->create(path));
        _file_writer = std::move(new_writer_and_stream.writer);
        _out_stream = std::move(new_writer_and_stream.stream);
        RETURN_IF_ERROR(_file_writer->init());
        _io_poller->enqueue(_out_stream);
    }
    return Status::OK();
}

// Seal the current file and report the result through the commit callback.
// After this call both _file_writer and _out_stream are reset to nullptr, so
// the next write must call create_file_writer_if_needed() again.
void PartitionChunkWriter::commit_file() {
    if (!_file_writer) {
        return;
    }
    auto result = _file_writer->commit();
    _commit_callback(result.set_extra_data(_commit_extra_data));
    _file_writer = nullptr;
    VLOG(3) << "commit to remote file, filename: " << _out_stream->filename()
            << ", size: " << result.file_statistics.file_size;
    _out_stream = nullptr;
}
// Nothing to prepare for the buffered writer; files are created lazily on write.
Status BufferPartitionChunkWriter::init() {
    return Status::OK();
}
// Append `chunk` to the current file, rolling over to a new file once the
// current one reaches _max_file_size.
Status BufferPartitionChunkWriter::write(Chunk* chunk) {
    RETURN_IF_ERROR(create_file_writer_if_needed());
    if (_file_writer->get_written_bytes() >= _max_file_size) {
        // Seal the full file; this resets _file_writer to nullptr.
        commit_file();
        // BUGFIX: a new writer must be created after commit_file(), otherwise the
        // write below dereferences a null _file_writer. This mirrors the
        // commit-then-create ordering used by SpillPartitionChunkWriter::_write_chunk.
        RETURN_IF_ERROR(create_file_writer_if_needed());
    }
    return _file_writer->write(chunk);
}
// Flush simply seals the current file; the buffered writer keeps no other state.
Status BufferPartitionChunkWriter::flush() {
    commit_file();
    return Status::OK();
}

// Finish is identical to flush for the buffered writer: commit whatever file
// is open and return.
Status BufferPartitionChunkWriter::finish() {
    commit_file();
    return Status::OK();
}
SpillPartitionChunkWriter::SpillPartitionChunkWriter(std::string partition,
                                                     std::vector<int8_t> partition_field_null_list,
                                                     const std::shared_ptr<SpillPartitionChunkWriterContext>& ctx)
        : PartitionChunkWriter(std::move(partition), std::move(partition_field_null_list), ctx),
          _fs(ctx->fs),
          _fragment_context(ctx->fragment_context),
          _column_evaluators(ctx->column_evaluators),
          _sort_ordering(ctx->sort_ordering) {
    // Two independent task tokens: one for async chunk-spill tasks, one for
    // spill-block merge tasks (see finish()/merge_blocks()).
    _chunk_spill_token = ExecEnv::GetInstance()->connector_sink_spill_executor()->create_token();
    _block_merge_token = StorageEngine::instance()->load_spill_block_merge_executor()->create_token();
    _tuple_desc = ctx->tuple_desc;
    // Unique id isolating this writer's spill blocks; also used in logs.
    _writer_id = generate_uuid();
}

SpillPartitionChunkWriter::~SpillPartitionChunkWriter() {
    // Shut the tokens down so no queued spill/merge task outlives this writer.
    if (_chunk_spill_token) {
        _chunk_spill_token->shutdown();
    }
    if (_block_merge_token) {
        _block_merge_token->shutdown();
    }
}

// Set up the spill-block manager/spiller rooted at the sink's output location
// and initialize the optional column evaluators.
Status SpillPartitionChunkWriter::init() {
    std::string root_location = _location_provider->root_location();
    _load_spill_block_mgr =
            std::make_unique<LoadSpillBlockManager>(_fragment_context->query_id(), _writer_id, root_location, _fs);
    RETURN_IF_ERROR(_load_spill_block_mgr->init());
    _load_chunk_spiller = std::make_unique<LoadChunkSpiller>(_load_spill_block_mgr.get(),
                                                             _fragment_context->runtime_state()->runtime_profile());
    if (_column_evaluators) {
        RETURN_IF_ERROR(ColumnEvaluator::init(*_column_evaluators));
    }
    return Status::OK();
}
// Buffer a copy of the incoming chunk. Once enough bytes accumulate the buffer
// is flushed straight to the output file; if memory is insufficient it is
// spilled instead.
Status SpillPartitionChunkWriter::write(Chunk* chunk) {
    RETURN_IF_ERROR(create_file_writer_if_needed());
    _chunks.push_back(chunk->clone_unique());
    _chunk_bytes_usage += chunk->bytes_usage();
    if (!_base_chunk) {
        // Remember the first buffered chunk: its slot-id map is the reference
        // used later by _flush_chunk to rebuild slot mappings.
        _base_chunk = _chunks.back();
    }
    int64_t max_flush_batch_size = _file_writer->get_flush_batch_size();
    if (_sort_ordering || max_flush_batch_size == 0) {
        // Sorted output (or writers without a batch-size hint) batches up to a
        // whole file before flushing.
        max_flush_batch_size = _max_file_size;
    }
    if (_chunk_bytes_usage >= max_flush_batch_size) {
        return _flush_to_file();
    } else if (_mem_insufficent()) {
        return _spill();
    }
    return Status::OK();
}

// Spill whatever is buffered; invoked by the sink memory manager under memory
// pressure. A writer that never opened a file has nothing buffered to spill.
Status SpillPartitionChunkWriter::flush() {
    RETURN_IF(!_file_writer, Status::OK());
    return _spill();
}

Status SpillPartitionChunkWriter::finish() {
    // Wait for all in-flight spill tasks before deciding how to finalize.
    _chunk_spill_token->wait();
    // If no chunks have been spilled, flush data to remote file directly.
    if (_load_chunk_spiller->empty()) {
        VLOG(2) << "flush to remote directly when finish, query_id: " << print_id(_fragment_context->query_id())
                << ", writer_id: " << print_id(_writer_id);
        RETURN_IF_ERROR(_flush_to_file());
        commit_file();
        return Status::OK();
    }
    // Otherwise merge the spilled blocks asynchronously; the callback commits
    // the final file and routes any failure through the error handler.
    auto cb = [this](const Status& st) {
        LOG_IF(ERROR, !st.ok()) << "fail to merge spill blocks, query_id: " << print_id(_fragment_context->query_id())
                                << ", writer_id: " << print_id(_writer_id);
        _handle_err(st);
        commit_file();
    };
    auto merge_task = std::make_shared<MergeBlockTask>(this, cb);
    return _block_merge_token->submit(merge_task);
}

// Poll interval used by is_finished() while waiting for async tasks to drain.
const int64_t SpillPartitionChunkWriter::kWaitMilliseconds = 10;

// Finished only when both the spill-task queue and the merge-task queue have
// drained within the short wait window.
bool SpillPartitionChunkWriter::is_finished() {
    bool finished = _chunk_spill_token->wait_for(MonoDelta::FromMilliseconds(kWaitMilliseconds)) &&
                    _block_merge_token->wait_for(MonoDelta::FromMilliseconds(kWaitMilliseconds));
    return finished;
}
// Merge all spilled blocks into the final output file(s). Runs as a
// MergeBlockTask submitted from finish().
Status SpillPartitionChunkWriter::merge_blocks() {
    // Push any remaining buffered chunks to spill first, then wait for them.
    RETURN_IF_ERROR(flush());
    _chunk_spill_token->wait();
    auto write_func = [this](Chunk* chunk) { return _flush_chunk(chunk, false); };
    auto flush_func = [this]() {
        // Commit file after each merge function to ensure the data written to one file is ordered,
        // because data generated by different merge function may be unordered.
        if (_sort_ordering) {
            commit_file();
        }
        return Status::OK();
    };
    Status st = _load_chunk_spiller->merge_write(_max_file_size, _sort_ordering != nullptr, false /* do_agg */,
                                                 write_func, flush_func);
    // NOTE(review): other log lines format the query id via print_id() — consider
    // doing the same here for consistent log output.
    VLOG(2) << "finish merge blocks, query_id: " << _fragment_context->query_id() << ", status: " << st.message();
    return st;
}

// Stable-sort _result_chunk in place by the configured sort-key columns.
Status SpillPartitionChunkWriter::_sort() {
    RETURN_IF(!_result_chunk, Status::OK());
    // Swap the rows out of _result_chunk into `chunk` (leaving _result_chunk
    // empty), then append them back in sorted order.
    auto chunk = _result_chunk->clone_empty_with_schema(0);
    _result_chunk->swap_chunk(*chunk);
    SmallPermutation perm = create_small_permutation(static_cast<uint32_t>(chunk->num_rows()));
    Columns columns;
    for (auto sort_key_idx : _sort_ordering->sort_key_idxes) {
        columns.push_back(chunk->get_column_by_index(sort_key_idx));
    }
    RETURN_IF_ERROR(stable_sort_and_tie_columns(false, columns, _sort_ordering->sort_descs, &perm));
    std::vector<uint32_t> selective;
    permutate_to_selective(perm, &selective);
    _result_chunk->rolling_append_selective(*chunk, selective.data(), 0, chunk->num_rows());
    return Status::OK();
}
// Merge the buffered chunks into one result chunk (sorting it if required) and
// submit it as an async spill task. If spilling fails, the callback falls back
// to writing the chunk directly to the remote file.
Status SpillPartitionChunkWriter::_spill() {
    RETURN_IF(_chunks.empty(), Status::OK());
    RETURN_IF_ERROR(_merge_chunks());
    if (_sort_ordering) {
        RETURN_IF_ERROR(_sort());
    }
    auto callback = [this](const ChunkPtr& chunk, const StatusOr<size_t>& res) {
        if (!res.ok()) {
            LOG(ERROR) << "fail to spill connector partition chunk sink, write it to remote file directly. msg: "
                       << res.status().message();
            Status st = _flush_chunk(chunk.get(), true);
            _handle_err(st);
        } else {
            VLOG(3) << "spill chunk data, filename: " << out_stream()->filename() << ", size: " << chunk->bytes_usage()
                    << ", rows: " << chunk->num_rows() << ", partition: " << _partition
                    << ", writer_id: " << _writer_id;
        }
        // Spill (or fallback write) of this chunk is complete.
        _spilling_bytes_usage.fetch_sub(chunk->bytes_usage(), std::memory_order_relaxed);
    };
    auto spill_task = std::make_shared<ChunkSpillTask>(_load_chunk_spiller.get(), _result_chunk, callback);
    RETURN_IF_ERROR(_chunk_spill_token->submit(spill_task));
    // NOTE(review): the add happens after submit, so the callback's fetch_sub may
    // run first and make the counter transiently negative — confirm readers of
    // _spilling_bytes_usage tolerate that.
    _spilling_bytes_usage.fetch_add(_result_chunk->bytes_usage(), std::memory_order_relaxed);
    _chunk_bytes_usage = 0;
    return Status::OK();
}

// Flush buffered chunks directly to the output file. Unsorted chunks are
// written one by one; sorted output is merged + sorted first, and the file is
// committed immediately so each file stays internally ordered.
Status SpillPartitionChunkWriter::_flush_to_file() {
    RETURN_IF(_chunks.empty(), Status::OK());
    if (!_sort_ordering) {
        for (auto& chunk : _chunks) {
            RETURN_IF_ERROR(_flush_chunk(chunk.get(), false));
        }
    } else {
        RETURN_IF_ERROR(_merge_chunks());
        RETURN_IF_ERROR(_sort());
        RETURN_IF_ERROR(_flush_chunk(_result_chunk.get(), true));
        commit_file();
    }
    _chunks.clear();
    _chunk_bytes_usage = 0;
    return Status::OK();
};
// Write `chunk` to the current file. Chunks reloaded from spill lose their
// slot-id map, so it is rebuilt from the base chunk's map translated through
// _col_index_map. With `split` set, the chunk is written in
// config::vector_chunk_size pieces to bound per-write memory.
Status SpillPartitionChunkWriter::_flush_chunk(Chunk* chunk, bool split) {
    if (chunk->get_slot_id_to_index_map().empty()) {
        auto& slot_map = _base_chunk->get_slot_id_to_index_map();
        for (auto& it : slot_map) {
            chunk->set_slot_id_to_index(it.first, _col_index_map[it.second]);
        }
    }
    if (!split) {
        return _write_chunk(chunk);
    }
    size_t chunk_size = config::vector_chunk_size;
    for (size_t offset = 0; offset < chunk->num_rows(); offset += chunk_size) {
        auto sub_chunk = chunk->clone_empty(chunk_size);
        size_t num_rows = std::min(chunk_size, chunk->num_rows() - offset);
        sub_chunk->append(*chunk, offset, num_rows);
        RETURN_IF_ERROR(_write_chunk(sub_chunk.get()));
    }
    return Status::OK();
}

// Low-level write: roll the file when it exceeds _max_file_size (unsorted
// output only — sorted files are rolled by the merge flush callback instead).
Status SpillPartitionChunkWriter::_write_chunk(Chunk* chunk) {
    // BUGFIX: guard on _file_writer before reading its size. After a roll,
    // commit_file() leaves _file_writer == nullptr, and the next call used to
    // dereference it here before create_file_writer_if_needed() ran.
    if (!_sort_ordering && _file_writer && _file_writer->get_written_bytes() >= _max_file_size) {
        commit_file();
    }
    RETURN_IF_ERROR(create_file_writer_if_needed());
    RETURN_IF_ERROR(_file_writer->write(chunk));
    return Status::OK();
}
// Concatenate all buffered chunks into a single schema-carrying result chunk
// (so native-table helper routines can be reused). On the first chunk, also
// build _col_index_map (source column index -> result column index), used by
// _flush_chunk to restore slot-id mappings after spill round-trips.
Status SpillPartitionChunkWriter::_merge_chunks() {
    if (_chunks.empty()) {
        return Status::OK();
    }
    // Create a target chunk with schema to make it can use some
    // module functions of native table directly.
    // BUGFIX: accumulate in size_t — the previous `int` init value made the
    // whole accumulation int-typed, which can overflow on large row counts.
    size_t num_rows = std::accumulate(_chunks.begin(), _chunks.end(), static_cast<size_t>(0),
                                      [](size_t sum, const ChunkPtr& chunk) { return sum + chunk->num_rows(); });
    _result_chunk = _create_schema_chunk(_chunks.front(), num_rows);
    // Map each source column pointer of the first chunk to its index so the
    // evaluated column can be traced back to its source position below.
    std::unordered_map<Column*, size_t> col_ptr_index_map;
    auto& columns = _chunks.front()->columns();
    for (size_t i = 0; i < columns.size(); ++i) {
        col_ptr_index_map[columns[i]->get_ptr()] = i;
    }
    for (auto& chunk : _chunks) {
        for (size_t i = 0; i < _result_chunk->num_columns(); ++i) {
            auto* dst_col = _result_chunk->get_column_by_index(i).get();
            ColumnPtr src_col;
            if (_column_evaluators) {
                ASSIGN_OR_RETURN(src_col, (*_column_evaluators)[i]->evaluate(chunk.get()));
            } else {
                src_col = chunk->get_column_by_index(i);
            }
            dst_col->append(*src_col);
            if (chunk == _chunks.front()) {
                auto it = col_ptr_index_map.find(src_col.get());
                if (it != col_ptr_index_map.end()) {
                    _col_index_map[it->second] = i;
                } else {
                    return Status::InternalError("unknown column index: " + std::to_string(i));
                }
            }
        }
        // Release each source chunk as soon as it has been merged.
        chunk.reset();
    }
    _chunks.clear();
    return Status::OK();
}
// Writer-local memory-pressure check. Always false: spilling is driven
// externally by the sink memory manager (via flush()), not by the writer.
bool SpillPartitionChunkWriter::_mem_insufficent() {
    // Return false because we will trigger spill by sink memory manager.
    return false;
}

// Forward a failure to the registered error handler; OK statuses are ignored.
void SpillPartitionChunkWriter::_handle_err(const Status& st) {
    if (!st.ok()) {
        _error_handler(st);
    }
}

// Build a DUP_KEYS schema from the tuple descriptor, attaching sort keys when
// a sort ordering is configured.
SchemaPtr SpillPartitionChunkWriter::_make_schema() {
    Fields fields;
    for (auto& slot : _tuple_desc->slots()) {
        TypeDescriptor type_desc = slot->type();
        TypeInfoPtr type_info = get_type_info(type_desc.type, type_desc.precision, type_desc.scale);
        auto field = std::make_shared<Field>(slot->id(), slot->col_name(), type_info, slot->is_nullable());
        fields.push_back(field);
    }
    SchemaPtr schema;
    if (_sort_ordering) {
        schema = std::make_shared<Schema>(std::move(fields), KeysType::DUP_KEYS, _sort_ordering->sort_key_idxes,
                                          std::make_shared<SortDescs>(_sort_ordering->sort_descs));
    } else {
        schema = std::make_shared<Schema>(std::move(fields), KeysType::DUP_KEYS, std::vector<uint32_t>(), nullptr);
    }
    return schema;
}

// Create a chunk with capacity `num_rows` that carries a schema. The schema is
// taken from the base chunk when present, otherwise built from the tuple
// descriptor; it is cached in _schema after the first call.
ChunkPtr SpillPartitionChunkWriter::_create_schema_chunk(const ChunkPtr& base_chunk, size_t num_rows) {
    if (!_schema) {
        const SchemaPtr& schema = base_chunk->schema();
        if (schema) {
            _schema = schema;
            if (_sort_ordering) {
                // NOTE(review): this mutates a schema shared with the source
                // chunk — confirm no other reader depends on its original sort
                // metadata.
                _schema->set_sort_key_idxes(_sort_ordering->sort_key_idxes);
                _schema->set_sort_descs(std::make_shared<SortDescs>(_sort_ordering->sort_descs));
            }
        } else {
            _schema = _make_schema();
        }
    }
    auto chunk = ChunkHelper::new_chunk(*_schema, num_rows);
    return chunk;
}
} // namespace starrocks::connector

View File

@ -0,0 +1,256 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "column/chunk.h"
#include "common/status.h"
#include "connector/utils.h"
#include "formats/file_writer.h"
#include "fs/fs.h"
#include "runtime/exec_env.h"
#include "storage/load_chunk_spiller.h"
#include "util/threadpool.h"
#include "util/uid_util.h"
namespace starrocks::connector {
using CommitResult = formats::FileWriter::CommitResult;
using CommitFunc = std::function<void(const CommitResult& result)>;
using ErrorHandleFunc = std::function<void(const Status& status)>;
class AsyncFlushStreamPoller;
// Sort specification for writer output: indexes of the sort-key columns plus
// their sort directions.
struct SortOrdering {
    std::vector<uint32_t> sort_key_idxes;
    SortDescs sort_descs;
};

// Construction parameters shared by all partition chunk writers.
struct PartitionChunkWriterContext {
    std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
    std::shared_ptr<LocationProvider> location_provider;
    // Roll to a new output file once the current one reaches this size.
    int64_t max_file_size = 0;
    // True when the sink writes a single unpartitioned location.
    bool is_default_partition = false;
};

// The buffered (non-spilling) writer needs nothing beyond the base context.
struct BufferPartitionChunkWriterContext : public PartitionChunkWriterContext {};
// Extra parameters needed by the spill-capable writer: the filesystem backing
// the spill blocks, the owning fragment, and (optionally) the tuple layout,
// column evaluators and sort ordering used when merging spilled data.
struct SpillPartitionChunkWriterContext : public PartitionChunkWriterContext {
    std::shared_ptr<FileSystem> fs;
    pipeline::FragmentContext* fragment_context = nullptr;
    TupleDescriptor* tuple_desc = nullptr;
    // BUGFIX: default to nullptr — this raw pointer was previously left
    // uninitialized, so contexts built without it held an indeterminate value.
    std::vector<std::unique_ptr<ColumnEvaluator>>* column_evaluators = nullptr;
    std::shared_ptr<SortOrdering> sort_ordering;
};
// Abstract per-partition writer. Concrete implementations either buffer rows
// directly in a file writer (BufferPartitionChunkWriter) or spill them first
// (SpillPartitionChunkWriter). File commit results and errors are reported
// through the callbacks set via set_commit_callback()/set_error_handler().
class PartitionChunkWriter {
public:
    PartitionChunkWriter(std::string partition, std::vector<int8_t> partition_field_null_list,
                         const std::shared_ptr<PartitionChunkWriterContext>& ctx);

    virtual ~PartitionChunkWriter() = default;

    virtual Status init() = 0;
    // Accept one chunk of rows belonging to this partition.
    virtual Status write(Chunk* chunk) = 0;
    // Force buffered data out (called e.g. under memory pressure).
    virtual Status flush() = 0;
    // Finalize: no more writes will follow.
    virtual Status finish() = 0;
    virtual bool is_finished() = 0;
    virtual int64_t get_written_bytes() = 0;
    // Bytes that could be freed by flushing/spilling right now.
    virtual int64_t get_flushable_bytes() = 0;

    const std::string& partition() const { return _partition; }
    const std::vector<int8_t>& partition_field_null_list() const { return _partition_field_null_list; }
    std::shared_ptr<formats::FileWriter> file_writer() { return _file_writer; }
    std::shared_ptr<io::AsyncFlushOutputStream> out_stream() { return _out_stream; }

    // Must be wired up before the first write; see create_file_writer_if_needed().
    void set_io_poller(AsyncFlushStreamPoller* io_poller) { _io_poller = io_poller; }
    void set_commit_callback(const CommitFunc& commit_callback) { _commit_callback = commit_callback; }
    void set_error_handler(const ErrorHandleFunc& error_handler) { _error_handler = error_handler; }

protected:
    // Lazily create the file writer + async stream for the current file.
    Status create_file_writer_if_needed();
    // Seal the current file and report it via _commit_callback; resets the writer.
    void commit_file();

protected:
    std::string _partition;
    std::vector<int8_t> _partition_field_null_list;
    std::shared_ptr<formats::FileWriterFactory> _file_writer_factory;
    std::shared_ptr<LocationProvider> _location_provider;
    int64_t _max_file_size = 0;
    bool _is_default_partition = false;

    AsyncFlushStreamPoller* _io_poller = nullptr;
    std::shared_ptr<formats::FileWriter> _file_writer;
    std::shared_ptr<io::AsyncFlushOutputStream> _out_stream;
    CommitFunc _commit_callback;
    // '0'/'1' encoding of _partition_field_null_list, attached to commit results.
    std::string _commit_extra_data;
    ErrorHandleFunc _error_handler = nullptr;
};
// Writer that streams rows straight into the underlying file writer and rolls
// files when they reach the configured max size. No spilling.
class BufferPartitionChunkWriter : public PartitionChunkWriter {
public:
    BufferPartitionChunkWriter(std::string partition, std::vector<int8_t> partition_field_null_list,
                               const std::shared_ptr<BufferPartitionChunkWriterContext>& ctx)
            : PartitionChunkWriter(std::move(partition), std::move(partition_field_null_list), ctx) {}

    Status init() override;
    Status write(Chunk* chunk) override;
    Status flush() override;
    Status finish() override;
    // All work happens synchronously in write/flush, so always finished.
    bool is_finished() override { return true; }
    int64_t get_written_bytes() override { return _file_writer ? _file_writer->get_written_bytes() : 0; }
    int64_t get_flushable_bytes() override { return _file_writer ? _file_writer->get_written_bytes() : 0; }
};
class SpillPartitionChunkWriter : public PartitionChunkWriter {
public:
SpillPartitionChunkWriter(std::string partition, std::vector<int8_t> partition_field_null_list,
const std::shared_ptr<SpillPartitionChunkWriterContext>& ctx);
~SpillPartitionChunkWriter();
Status init() override;
Status write(Chunk* chunk) override;
Status flush() override;
Status finish() override;
bool is_finished() override;
int64_t get_written_bytes() override {
if (!_file_writer) {
return 0;
}
return _chunk_bytes_usage + _spilling_bytes_usage.load(std::memory_order_relaxed) +
_file_writer->get_written_bytes();
}
int64_t get_flushable_bytes() override { return _chunk_bytes_usage; }
Status merge_blocks();
private:
Status _sort();
Status _spill();
Status _flush_to_file();
Status _flush_chunk(Chunk* chunk, bool split);
Status _write_chunk(Chunk* chunk);
Status _merge_chunks();
SchemaPtr _make_schema();
ChunkPtr _create_schema_chunk(const ChunkPtr& base_chunk, size_t row_nums);
bool _mem_insufficent();
void _handle_err(const Status& st);
private:
std::shared_ptr<FileSystem> _fs = nullptr;
pipeline::FragmentContext* _fragment_context = nullptr;
TupleDescriptor* _tuple_desc = nullptr;
std::vector<std::unique_ptr<ColumnEvaluator>>* _column_evaluators;
std::shared_ptr<SortOrdering> _sort_ordering;
std::unique_ptr<ThreadPoolToken> _chunk_spill_token;
std::unique_ptr<ThreadPoolToken> _block_merge_token;
std::unique_ptr<LoadSpillBlockManager> _load_spill_block_mgr;
std::shared_ptr<LoadChunkSpiller> _load_chunk_spiller;
//std::function<StatusOr<ColumnPtr>(Chunk*, size_t)> _column_eval_func;
TUniqueId _writer_id;
std::list<ChunkPtr> _chunks;
int64_t _chunk_bytes_usage = 0;
std::atomic<int64_t> _spilling_bytes_usage = 0;
ChunkPtr _result_chunk;
ChunkPtr _base_chunk;
SchemaPtr _schema;
std::unordered_map<int, int> _col_index_map; // result chunk index -> chunk index
static const int64_t kWaitMilliseconds;
};
using PartitionChunkWriterPtr = std::shared_ptr<PartitionChunkWriter>;
// Abstract factory producing one PartitionChunkWriter per partition key.
class PartitionChunkWriterFactory {
public:
    virtual ~PartitionChunkWriterFactory() = default;
    virtual Status init() = 0;
    virtual PartitionChunkWriterPtr create(std::string partition,
                                           std::vector<int8_t> partition_field_null_list) const = 0;
};

// Factory for the buffered (non-spilling) writer.
class BufferPartitionChunkWriterFactory : public PartitionChunkWriterFactory {
public:
    // `explicit` prevents accidental implicit conversion from the context ptr.
    explicit BufferPartitionChunkWriterFactory(std::shared_ptr<BufferPartitionChunkWriterContext> ctx)
            : _ctx(std::move(ctx)) {}
    // BUGFIX: marked `override` to keep the virtual-destructor chain checked.
    ~BufferPartitionChunkWriterFactory() override = default;

    Status init() override { return _ctx->file_writer_factory->init(); }

    PartitionChunkWriterPtr create(std::string partition,
                                   std::vector<int8_t> partition_field_null_list) const override {
        return std::make_shared<BufferPartitionChunkWriter>(std::move(partition), std::move(partition_field_null_list),
                                                            _ctx);
    }

private:
    std::shared_ptr<BufferPartitionChunkWriterContext> _ctx;
};

// Factory for the spill-capable writer.
class SpillPartitionChunkWriterFactory : public PartitionChunkWriterFactory {
public:
    explicit SpillPartitionChunkWriterFactory(std::shared_ptr<SpillPartitionChunkWriterContext> ctx)
            : _ctx(std::move(ctx)) {}
    ~SpillPartitionChunkWriterFactory() override = default;

    Status init() override { return _ctx->file_writer_factory->init(); }

    PartitionChunkWriterPtr create(std::string partition,
                                   std::vector<int8_t> partition_field_null_list) const override {
        return std::make_shared<SpillPartitionChunkWriter>(std::move(partition), std::move(partition_field_null_list),
                                                           _ctx);
    }

private:
    std::shared_ptr<SpillPartitionChunkWriterContext> _ctx;
};
} // namespace starrocks::connector

View File

@ -18,9 +18,9 @@
namespace starrocks::connector {
void SinkOperatorMemoryManager::init(std::map<PartitionKey, WriterStreamPair>* writer_stream_pairs,
void SinkOperatorMemoryManager::init(std::map<PartitionKey, PartitionChunkWriterPtr>* partition_chunk_writers,
AsyncFlushStreamPoller* io_poller, CommitFunc commit_func) {
_candidates = writer_stream_pairs;
_candidates = partition_chunk_writers;
_commit_func = std::move(commit_func);
_io_poller = io_poller;
}
@ -30,24 +30,29 @@ bool SinkOperatorMemoryManager::kill_victim() {
return false;
}
// find file writer with the largest file size
PartitionKey partition;
WriterStreamPair* victim = nullptr;
for (auto& [key, writer_and_stream] : *_candidates) {
if (victim && victim->first->get_written_bytes() > writer_and_stream.first->get_written_bytes()) {
// Find a target file writer to flush.
// For a buffered partition writer, choose the writer with the largest file size.
// For a spillable partition writer, choose the writer with the largest memory size that can be spilled.
PartitionChunkWriterPtr victim = nullptr;
for (auto& [key, writer] : *_candidates) {
int64_t flushable_bytes = writer->get_flushable_bytes();
if (flushable_bytes == 0) {
continue;
}
partition = key;
victim = &writer_and_stream;
if (victim && flushable_bytes < victim->get_flushable_bytes()) {
continue;
}
victim = writer;
}
if (victim == nullptr) {
return false;
}
auto result = victim->first->commit();
_commit_func(result);
LOG(INFO) << "kill victim: " << victim->second->filename() << " size: " << result.file_statistics.file_size;
_candidates->erase(partition);
// Flushing decreases the writer's flushable memory bytes, so this writer
// will usually not be chosen again for a short time.
const auto filename = victim->out_stream()->filename();
const auto result = victim->flush();
LOG(INFO) << "kill victim: " << filename << ", result: " << result;
return true;
}
@ -59,8 +64,8 @@ int64_t SinkOperatorMemoryManager::update_releasable_memory() {
int64_t SinkOperatorMemoryManager::update_writer_occupied_memory() {
int64_t writer_occupied_memory = 0;
for (auto& [_, writer_and_stream] : *_candidates) {
writer_occupied_memory += writer_and_stream.first->get_written_bytes();
for (auto& [_, writer] : *_candidates) {
writer_occupied_memory += writer->get_flushable_bytes();
}
_writer_occupied_memory.store(writer_occupied_memory);
return _writer_occupied_memory;
@ -113,33 +118,29 @@ bool SinkMemoryManager::_apply_on_mem_tracker(SinkOperatorMemoryManager* child_m
auto available_memory = [&]() { return mem_tracker->limit() - mem_tracker->consumption(); };
auto low_watermark = static_cast<int64_t>(mem_tracker->limit() * _low_watermark_ratio);
auto high_watermark = static_cast<int64_t>(mem_tracker->limit() * _high_watermark_ratio);
auto exceed_urgent_space = [&]() {
return _total_writer_occupied_memory() > _query_tracker->limit() * _urgent_space_ratio;
};
if (available_memory() <= low_watermark) {
child_manager->update_releasable_memory();
int64_t flush_watermark = _query_tracker->limit() * _urgent_space_ratio;
while (available_memory() <= low_watermark) {
child_manager->update_writer_occupied_memory();
LOG_EVERY_SECOND(WARNING) << "consumption: " << mem_tracker->consumption()
<< " releasable_memory: " << _total_releasable_memory()
<< " writer_allocated_memory: " << _total_writer_occupied_memory();
// trigger early close
while (exceed_urgent_space() && available_memory() + _total_releasable_memory() < high_watermark) {
bool found = child_manager->kill_victim();
if (!found) {
break;
}
child_manager->update_releasable_memory();
child_manager->update_writer_occupied_memory();
int64_t total_occupied_memory = _total_writer_occupied_memory();
LOG_EVERY_SECOND(INFO) << "consumption: " << mem_tracker->consumption()
<< ", total_occupied_memory: " << total_occupied_memory
<< ", flush_watermark: " << flush_watermark;
if (total_occupied_memory < flush_watermark) {
break;
}
bool found = child_manager->kill_victim();
if (!found) {
break;
}
}
child_manager->update_releasable_memory();
if (available_memory() <= low_watermark && _total_releasable_memory() > 0) {
LOG_EVERY_SECOND(WARNING) << "memory usage is still high after flush, : available_memory" << available_memory()
<< ", memory_low_watermark: " << low_watermark
<< ", total_releasable_memory: " << _total_releasable_memory();
return false;
}
return true;
}

View File

@ -28,8 +28,8 @@ class SinkOperatorMemoryManager {
public:
SinkOperatorMemoryManager() = default;
void init(std::map<PartitionKey, WriterStreamPair>* writer_stream_pairs, AsyncFlushStreamPoller* io_poller,
CommitFunc commit_func);
void init(std::map<PartitionKey, PartitionChunkWriterPtr>* partition_chunk_writers,
AsyncFlushStreamPoller* io_poller, CommitFunc commit_func);
// return true if a victim is found and killed, otherwise return false
bool kill_victim();
@ -45,7 +45,7 @@ public:
int64_t writer_occupied_memory() { return _writer_occupied_memory.load(); }
private:
std::map<PartitionKey, WriterStreamPair>* _candidates = nullptr; // reference, owned by sink operator
std::map<PartitionKey, PartitionChunkWriterPtr>* _candidates = nullptr; // reference, owned by sink operator
CommitFunc _commit_func;
AsyncFlushStreamPoller* _io_poller;
std::atomic_int64_t _releasable_memory{0};

View File

@ -104,6 +104,12 @@ public:
// location = base_path/{query_id}_{be_number}_{driver_id}_index.file_suffix
std::string get() { return fmt::format("{}/{}_{}.{}", _base_path, _file_name_prefix, _index++, _file_name_suffix); }
std::string root_location(const std::string& partition) {
return fmt::format("{}/{}", _base_path, PathUtils::remove_trailing_slash(partition));
}
std::string root_location() { return fmt::format("{}", PathUtils::remove_trailing_slash(_base_path)); }
private:
const std::string _base_path;
const std::string _file_name_prefix;

View File

@ -51,11 +51,13 @@ set(EXEC_FILES
aggregator.cpp
sorted_streaming_aggregator.cpp
aggregate/agg_hash_variant.cpp
aggregate/compress_serializer.cpp
aggregate/aggregate_base_node.cpp
aggregate/aggregate_blocking_node.cpp
aggregate/distinct_blocking_node.cpp
aggregate/aggregate_streaming_node.cpp
aggregate/distinct_streaming_node.cpp
partition/bucket_aware_partition.cpp
partition/chunks_partitioner.cpp
partition/partition_hash_variant.cpp
analytic_node.cpp
@ -156,6 +158,7 @@ set(EXEC_FILES
schema_scanner/schema_be_cloud_native_compactions_scanner.cpp
schema_scanner/schema_pipe_files.cpp
schema_scanner/schema_pipes.cpp
schema_scanner/schema_recyclebin_catalogs.cpp
schema_scanner/starrocks_role_edges_scanner.cpp
schema_scanner/starrocks_grants_to_scanner.cpp
schema_scanner/schema_helper.cpp

View File

@ -0,0 +1,28 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#pragma once

#include <cstdint>

namespace starrocks {
// Raw pointer to an aggregate state buffer (ownership is managed elsewhere --
// presumably by the aggregator's memory pool; confirm at the use sites).
using AggDataPtr = uint8_t*;
// 128-bit integer alias used for wide fixed-size keys/values.
using int128_t = __int128;

// Forward declarations of slice-key hashing helpers defined elsewhere.
class SliceWithHash;
class HashOnSliceWithHash;
class EqualOnSliceWithHash;
} // namespace starrocks

View File

@ -14,27 +14,25 @@
#pragma once
#include <any>
#include <cstdint>
#include <limits>
#include <type_traits>
#include <utility>
#include "column/column.h"
#include "column/column_hash.h"
#include "column/column_helper.h"
#include "column/hash_set.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
#include "common/compiler_util.h"
#include "exec/aggregate/agg_hash_set.h"
#include "exec/aggregate/agg_profile.h"
#include "exec/aggregate/compress_serializer.h"
#include "gutil/casts.h"
#include "gutil/strings/fastmem.h"
#include "runtime/mem_pool.h"
#include "util/fixed_hash_map.h"
#include "util/hash_util.hpp"
#include "util/phmap/phmap.h"
#include "util/phmap/phmap_dump.h"
namespace starrocks {
@ -245,9 +243,10 @@ struct AggHashMapWithOneNumberKeyWithNullable
DCHECK(!key_column->is_nullable());
const auto column = down_cast<const ColumnType*>(key_column);
size_t bucket_count = this->hash_map.bucket_count();
if (bucket_count < prefetch_threhold) {
if constexpr (is_no_prefetch_map<HashMap>) {
this->template compute_agg_noprefetch<Func, HTBuildOp>(column, agg_states,
std::forward<Func>(allocate_func), extra);
} else if (this->hash_map.bucket_count() < prefetch_threhold) {
this->template compute_agg_noprefetch<Func, HTBuildOp>(column, agg_states,
std::forward<Func>(allocate_func), extra);
} else {
@ -1091,4 +1090,151 @@ struct AggHashMapWithSerializedKeyFixedSize
int32_t _chunk_size;
};
// Aggregation hash map whose group-by key is a bit-compressed fixed-size
// integer: the raw key columns are packed into HashMap::key_type (an
// 1/4/8/16-byte integer) by bitcompress_serialize() and unpacked again by
// bitcompress_deserialize() when results are emitted.
template <typename HashMap>
struct AggHashMapWithCompressedKeyFixedSize
        : public AggHashMapWithKey<HashMap, AggHashMapWithCompressedKeyFixedSize<HashMap>> {
    using Self = AggHashMapWithCompressedKeyFixedSize<HashMap>;
    using Base = AggHashMapWithKey<HashMap, AggHashMapWithCompressedKeyFixedSize<HashMap>>;
    using KeyType = typename HashMap::key_type;
    using Iterator = typename HashMap::iterator;
    using FixedSizeSliceKey = typename HashMap::key_type;
    using ResultVector = typename std::vector<FixedSizeSliceKey>;

    template <class... Args>
    AggHashMapWithCompressedKeyFixedSize(int chunk_size, Args&&... args)
            : Base(chunk_size, std::forward<Args>(args)...),
              mem_pool(std::make_unique<MemPool>()),
              _chunk_size(chunk_size) {
        // FIX: was reserve(). fixed_keys is later written through data() and read
        // via operator[] for up to chunk_size entries; accessing elements beyond
        // size() is undefined behavior, so the vector must actually be sized.
        fixed_keys.resize(chunk_size);
    }

    // Compressed fixed-size keys have no dedicated NULL-key slot.
    AggDataPtr get_null_key_data() { return nullptr; }
    void set_null_key_data(AggDataPtr data) {}

    // Build/probe without software prefetching (used for small tables or
    // maps flagged is_no_prefetch_map).
    template <AllocFunc<Self> Func, typename HTBuildOp>
    ALWAYS_NOINLINE void compute_agg_noprefetch(size_t chunk_size, const Columns& key_columns, MemPool* pool,
                                                Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
                                                ExtraAggParam* extra) {
        [[maybe_unused]] size_t hash_table_size = this->hash_map.size();
        auto* __restrict not_founds = extra->not_founds;
        // Pack the key columns into fixed-size compressed keys.
        bitcompress_serialize(key_columns, bases, offsets, chunk_size, sizeof(FixedSizeSliceKey), fixed_keys.data());
        for (size_t i = 0; i < chunk_size; ++i) {
            if constexpr (HTBuildOp::process_limit) {
                // Stop allocating new groups once the limit is reached; later
                // keys only probe and mark misses.
                if (hash_table_size < extra->limits) {
                    _emplace_key(fixed_keys[i], (*agg_states)[i], allocate_func, [&] { hash_table_size++; });
                } else {
                    _find_key((*agg_states)[i], (*not_founds)[i], fixed_keys[i]);
                }
            } else if constexpr (HTBuildOp::allocate) {
                _emplace_key(fixed_keys[i], (*agg_states)[i], allocate_func,
                             FillNotFounds<HTBuildOp::fill_not_found>(not_founds, i));
            } else if constexpr (HTBuildOp::fill_not_found) {
                _find_key((*agg_states)[i], (*not_founds)[i], fixed_keys[i]);
            }
        }
    }

    // Build/probe with precomputed hashes and software prefetching of buckets.
    template <AllocFunc<Self> Func, typename HTBuildOp>
    ALWAYS_NOINLINE void compute_agg_prefetch(size_t chunk_size, const Columns& key_columns, MemPool* pool,
                                              Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
                                              ExtraAggParam* extra) {
        [[maybe_unused]] size_t hash_table_size = this->hash_map.size();
        auto* __restrict not_founds = extra->not_founds;
        // Pack the key columns into fixed-size compressed keys.
        bitcompress_serialize(key_columns, bases, offsets, chunk_size, sizeof(FixedSizeSliceKey), fixed_keys.data());
        // FIX: was reserve(); hashs[i] below indexes past size(), which is UB.
        hashs.resize(chunk_size);
        for (size_t i = 0; i < chunk_size; ++i) {
            hashs[i] = this->hash_map.hash_function()(fixed_keys[i]);
        }
        size_t prefetch_index = AGG_HASH_MAP_DEFAULT_PREFETCH_DIST;
        for (size_t i = 0; i < chunk_size; ++i) {
            if (prefetch_index < chunk_size) {
                this->hash_map.prefetch_hash(hashs[prefetch_index++]);
            }
            if constexpr (HTBuildOp::process_limit) {
                if (hash_table_size < extra->limits) {
                    _emplace_key_with_hash(fixed_keys[i], hashs[i], (*agg_states)[i], allocate_func,
                                           [&] { hash_table_size++; });
                } else {
                    _find_key((*agg_states)[i], (*not_founds)[i], fixed_keys[i]);
                }
            } else if constexpr (HTBuildOp::allocate) {
                _emplace_key_with_hash(fixed_keys[i], hashs[i], (*agg_states)[i], allocate_func,
                                       FillNotFounds<HTBuildOp::fill_not_found>(not_founds, i));
            } else if constexpr (HTBuildOp::fill_not_found) {
                _find_key((*agg_states)[i], (*not_founds)[i], fixed_keys[i]);
            }
        }
    }

    // Entry point: zero the key buffer, then dispatch to the prefetching or
    // non-prefetching build depending on map type and current bucket count.
    template <AllocFunc<Self> Func, typename HTBuildOp>
    void compute_agg_states(size_t chunk_size, const Columns& key_columns, MemPool* pool, Func&& allocate_func,
                            Buffer<AggDataPtr>* agg_states, ExtraAggParam* extra) {
        // Guard against chunks larger than the size used at construction.
        if (chunk_size > fixed_keys.size()) {
            fixed_keys.resize(chunk_size);
        }
        auto* buffer = reinterpret_cast<uint8_t*>(fixed_keys.data());
        memset(buffer, 0x0, sizeof(FixedSizeSliceKey) * chunk_size);
        if constexpr (is_no_prefetch_map<HashMap>) {
            this->template compute_agg_noprefetch<Func, HTBuildOp>(
                    chunk_size, key_columns, pool, std::forward<Func>(allocate_func), agg_states, extra);
        } else if (this->hash_map.bucket_count() < prefetch_threhold) {
            this->template compute_agg_noprefetch<Func, HTBuildOp>(
                    chunk_size, key_columns, pool, std::forward<Func>(allocate_func), agg_states, extra);
        } else {
            this->template compute_agg_prefetch<Func, HTBuildOp>(chunk_size, key_columns, pool,
                                                                 std::forward<Func>(allocate_func), agg_states, extra);
        }
    }

    // Insert-or-find; allocates a new agg state via allocate_func on first
    // sight of a key and invokes callback once per newly created group.
    template <AllocFunc<Self> Func, typename EmplaceCallBack>
    ALWAYS_INLINE void _emplace_key(KeyType key, AggDataPtr& target_state, Func&& allocate_func,
                                    EmplaceCallBack&& callback) {
        auto iter = this->hash_map.lazy_emplace(key, [&](const auto& ctor) {
            callback();
            AggDataPtr pv = allocate_func(key);
            ctor(key, pv);
        });
        target_state = iter->second;
    }

    // Same as _emplace_key but reuses a precomputed hash value.
    template <AllocFunc<Self> Func, typename EmplaceCallBack>
    ALWAYS_INLINE void _emplace_key_with_hash(KeyType key, size_t hash, AggDataPtr& target_state, Func&& allocate_func,
                                              EmplaceCallBack&& callback) {
        auto iter = this->hash_map.lazy_emplace_with_hash(key, hash, [&](const auto& ctor) {
            callback();
            AggDataPtr pv = allocate_func(key);
            ctor(key, pv);
        });
        target_state = iter->second;
    }

    // Probe-only lookup; sets not_found=1 when the key is absent.
    template <typename... Args>
    ALWAYS_INLINE void _find_key(AggDataPtr& target_state, uint8_t& not_found, Args&&... args) {
        if (auto iter = this->hash_map.find(std::forward<Args>(args)...); iter != this->hash_map.end()) {
            target_state = iter->second;
        } else {
            not_found = 1;
        }
    }

    // Unpack compressed keys back into the output key columns.
    void insert_keys_to_columns(ResultVector& keys, Columns& key_columns, int32_t chunk_size) {
        bitcompress_deserialize(key_columns, bases, offsets, used_bits, chunk_size, sizeof(FixedSizeSliceKey),
                                keys.data());
    }

    static constexpr bool has_single_null_key = false;

    std::vector<int> used_bits;                // bit widths used per key column (for deserialize)
    std::vector<int> offsets;                  // bit offsets of each column inside the packed key
    std::vector<std::any> bases;               // per-column base values for the bit compression
    std::vector<FixedSizeSliceKey> fixed_keys; // scratch buffer of packed keys, sized to chunk_size
    std::vector<size_t> hashs;                 // scratch buffer of precomputed hashes
    std::unique_ptr<MemPool> mem_pool;
    ResultVector results;
    int32_t _chunk_size;
};
} // namespace starrocks

View File

@ -14,19 +14,17 @@
#pragma once
#include <any>
#include "column/column_hash.h"
#include "column/column_helper.h"
#include "column/hash_set.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
#include "exec/aggregate/agg_profile.h"
#include "gutil/casts.h"
#include "runtime/mem_pool.h"
#include "runtime/runtime_state.h"
#include "util/fixed_hash_map.h"
#include "util/hash_util.hpp"
#include "util/phmap/phmap.h"
#include "util/runtime_profile.h"
namespace starrocks {
@ -111,14 +109,6 @@ struct AggHashSet {
}
};
template <typename T>
struct no_prefetch_set : std::false_type {};
template <PhmapSeed seed>
struct no_prefetch_set<Int8AggHashSet<seed>> : std::true_type {};
template <class T>
constexpr bool is_no_prefetch_set = no_prefetch_set<T>::value;
// handle one number hash key
template <LogicalType logical_type, typename HashSet>
struct AggHashSetOfOneNumberKey : public AggHashSet<HashSet, AggHashSetOfOneNumberKey<logical_type, HashSet>> {
@ -147,12 +137,10 @@ struct AggHashSetOfOneNumberKey : public AggHashSet<HashSet, AggHashSetOfOneNumb
if constexpr (is_no_prefetch_set<HashSet>) {
this->template build_set_noprefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
} else if (this->hash_set.bucket_count() < prefetch_threhold) {
this->template build_set_noprefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
} else {
if (this->hash_set.bucket_count() < prefetch_threhold) {
this->template build_set_noprefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
} else {
this->template build_set_prefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
}
this->template build_set_prefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
}
}
@ -754,10 +742,94 @@ struct AggHashSetOfSerializedKeyFixedSize : public AggHashSet<HashSet, AggHashSe
uint8_t* buffer;
ResultVector results;
Buffer<Slice> tmp_slices;
// std::vector<Slice> tmp_slices;
int32_t _chunk_size;
std::vector<size_t> hashes;
};
// DISTINCT hash set counterpart of AggHashMapWithCompressedKeyFixedSize: the
// group-by key columns are bit-packed into HashSet::key_type (a fixed-size
// integer) by bitcompress_serialize() and unpacked by bitcompress_deserialize().
template <typename HashSet>
struct AggHashSetCompressedFixedSize : public AggHashSet<HashSet, AggHashSetCompressedFixedSize<HashSet>> {
    using Base = AggHashSet<HashSet, AggHashSetCompressedFixedSize<HashSet>>;
    using Iterator = typename HashSet::iterator;
    using KeyType = typename HashSet::key_type;
    using FixedSizeSliceKey = typename HashSet::key_type;
    using ResultVector = typename std::vector<FixedSizeSliceKey>;
    bool has_null_column = false;
    static constexpr size_t max_fixed_size = sizeof(FixedSizeSliceKey);

    template <class... Args>
    AggHashSetCompressedFixedSize(int32_t chunk_size, Args&&... args)
            : Base(chunk_size, std::forward<Args>(args)...), _chunk_size(chunk_size) {
        // FIX: was reserve(). fixed_keys is later written through data() and read
        // via operator[] for up to chunk_size entries; accessing elements beyond
        // size() is undefined behavior, so the vector must actually be sized.
        fixed_keys.resize(chunk_size);
    }

    // When compute_and_allocate=false:
    // Elements queried in HashSet will be added to HashSet
    // elements that cannot be queried are not processed,
    // and are mainly used in the first stage of two-stage aggregation when aggr reduction is low
    template <bool compute_and_allocate>
    void build_set(size_t chunk_size, const Columns& key_columns, MemPool* pool, Filter* not_founds) {
        if constexpr (!compute_and_allocate) {
            DCHECK(not_founds);
            not_founds->assign(chunk_size, 0);
        }
        // Guard against chunks larger than the size used at construction.
        if (chunk_size > fixed_keys.size()) {
            fixed_keys.resize(chunk_size);
        }
        auto* buffer = reinterpret_cast<uint8_t*>(fixed_keys.data());
        memset(buffer, 0x0, sizeof(FixedSizeSliceKey) * chunk_size);
        // Pack the key columns into fixed-size compressed keys.
        bitcompress_serialize(key_columns, bases, offsets, chunk_size, sizeof(FixedSizeSliceKey), fixed_keys.data());
        if constexpr (is_no_prefetch_set<HashSet>) {
            this->template build_set_noprefetch<compute_and_allocate>(chunk_size, pool, not_founds);
        } else if (this->hash_set.bucket_count() < prefetch_threhold) {
            this->template build_set_noprefetch<compute_and_allocate>(chunk_size, pool, not_founds);
        } else {
            this->template build_set_prefetch<compute_and_allocate>(chunk_size, pool, not_founds);
        }
    }

    // Build/probe with precomputed hashes and software prefetching of buckets.
    template <bool compute_and_allocate>
    ALWAYS_NOINLINE void build_set_prefetch(size_t chunk_size, MemPool* pool, Filter* not_founds) {
        auto* keys = reinterpret_cast<FixedSizeSliceKey*>(fixed_keys.data());
        AGG_HASH_SET_PRECOMPUTE_HASH_VALS();
        for (size_t i = 0; i < chunk_size; ++i) {
            AGG_HASH_SET_PREFETCH_HASH_VAL();
            if constexpr (compute_and_allocate) {
                this->hash_set.emplace_with_hash(hashes[i], keys[i]);
            } else {
                (*not_founds)[i] = this->hash_set.find(keys[i], hashes[i]) == this->hash_set.end();
            }
        }
    }

    // Build/probe without prefetching (small tables or no-prefetch sets).
    template <bool compute_and_allocate>
    ALWAYS_NOINLINE void build_set_noprefetch(size_t chunk_size, MemPool* pool, Filter* not_founds) {
        for (size_t i = 0; i < chunk_size; ++i) {
            if constexpr (compute_and_allocate) {
                this->hash_set.insert(fixed_keys[i]);
            } else {
                (*not_founds)[i] = !this->hash_set.contains(fixed_keys[i]);
            }
        }
    }

    // Unpack compressed keys back into the output key columns.
    void insert_keys_to_columns(ResultVector& keys, Columns& key_columns, int32_t chunk_size) {
        bitcompress_deserialize(key_columns, bases, offsets, used_bits, chunk_size, sizeof(FixedSizeSliceKey),
                                keys.data());
    }

    static constexpr bool has_single_null_key = false;
    bool has_null_key = false;

    std::vector<int> used_bits;                // bit widths used per key column (for deserialize)
    std::vector<int> offsets;                  // bit offsets of each column inside the packed key
    std::vector<std::any> bases;               // per-column base values for the bit compression
    std::vector<FixedSizeSliceKey> fixed_keys; // scratch buffer of packed keys, sized to chunk_size
    std::vector<size_t> hashes;                // scratch buffer of precomputed hashes
    ResultVector results;
    int32_t _chunk_size;
};
} // namespace starrocks

View File

@ -15,11 +15,91 @@
#include "exec/aggregate/agg_hash_variant.h"
#include <tuple>
#include <type_traits>
#include <variant>
#include "runtime/runtime_state.h"
#include "util/phmap/phmap.h"
#define APPLY_FOR_AGG_VARIANT_ALL(M) \
M(phase1_uint8) \
M(phase1_int8) \
M(phase1_int16) \
M(phase1_int32) \
M(phase1_int64) \
M(phase1_int128) \
M(phase1_decimal32) \
M(phase1_decimal64) \
M(phase1_decimal128) \
M(phase1_decimal256) \
M(phase1_date) \
M(phase1_timestamp) \
M(phase1_string) \
M(phase1_slice) \
M(phase1_null_uint8) \
M(phase1_null_int8) \
M(phase1_null_int16) \
M(phase1_null_int32) \
M(phase1_null_int64) \
M(phase1_null_int128) \
M(phase1_null_decimal32) \
M(phase1_null_decimal64) \
M(phase1_null_decimal128) \
M(phase1_null_decimal256) \
M(phase1_null_date) \
M(phase1_null_timestamp) \
M(phase1_null_string) \
M(phase1_slice_two_level) \
M(phase1_int32_two_level) \
M(phase1_null_string_two_level) \
M(phase1_string_two_level) \
\
M(phase2_uint8) \
M(phase2_int8) \
M(phase2_int16) \
M(phase2_int32) \
M(phase2_int64) \
M(phase2_int128) \
M(phase2_decimal32) \
M(phase2_decimal64) \
M(phase2_decimal128) \
M(phase2_decimal256) \
M(phase2_date) \
M(phase2_timestamp) \
M(phase2_string) \
M(phase2_slice) \
M(phase2_null_uint8) \
M(phase2_null_int8) \
M(phase2_null_int16) \
M(phase2_null_int32) \
M(phase2_null_int64) \
M(phase2_null_int128) \
M(phase2_null_decimal32) \
M(phase2_null_decimal64) \
M(phase2_null_decimal128) \
M(phase2_null_decimal256) \
M(phase2_null_date) \
M(phase2_null_timestamp) \
M(phase2_null_string) \
M(phase2_slice_two_level) \
M(phase2_int32_two_level) \
M(phase2_null_string_two_level) \
M(phase2_string_two_level) \
\
M(phase1_slice_fx4) \
M(phase1_slice_fx8) \
M(phase1_slice_fx16) \
M(phase2_slice_fx4) \
M(phase2_slice_fx8) \
M(phase2_slice_fx16) \
M(phase1_slice_cx1) \
M(phase1_slice_cx4) \
M(phase1_slice_cx8) \
M(phase1_slice_cx16) \
M(phase2_slice_cx1) \
M(phase2_slice_cx4) \
M(phase2_slice_cx8) \
M(phase2_slice_cx16)
namespace starrocks {
namespace detail {
template <AggHashMapVariant::Type>
@ -65,6 +145,10 @@ DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_string_two_level, OneStringTwoLe
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_fx4, SerializedKeyFixedSize4AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_fx8, SerializedKeyFixedSize8AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_fx16, SerializedKeyFixedSize16AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_cx1, CompressedFixedSize1AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_cx4, CompressedFixedSize4AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_cx8, CompressedFixedSize8AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_cx16, CompressedFixedSize16AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_uint8, UInt8AggHashMapWithOneNumberKey<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_int8, Int8AggHashMapWithOneNumberKey<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_int16, Int16AggHashMapWithOneNumberKey<PhmapSeed2>);
@ -99,6 +183,10 @@ DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_string_two_level, OneStringTwoLe
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_fx4, SerializedKeyFixedSize4AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_fx8, SerializedKeyFixedSize8AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_fx16, SerializedKeyFixedSize16AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_cx1, CompressedFixedSize1AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_cx4, CompressedFixedSize4AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_cx8, CompressedFixedSize8AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_cx16, CompressedFixedSize16AggHashMap<PhmapSeed2>);
template <AggHashSetVariant::Type>
struct AggHashSetVariantTypeTraits;
@ -180,6 +268,15 @@ DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_fx4, SerializedKeyAggHashS
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_fx8, SerializedKeyAggHashSetFixedSize8<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_fx16, SerializedKeyAggHashSetFixedSize16<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase1_slice_cx1, CompressedAggHashSetFixedSize1<PhmapSeed1>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase1_slice_cx4, CompressedAggHashSetFixedSize4<PhmapSeed1>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase1_slice_cx8, CompressedAggHashSetFixedSize8<PhmapSeed1>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase1_slice_cx16, CompressedAggHashSetFixedSize16<PhmapSeed1>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_cx1, CompressedAggHashSetFixedSize1<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_cx4, CompressedAggHashSetFixedSize4<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_cx8, CompressedAggHashSetFixedSize8<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_cx16, CompressedAggHashSetFixedSize16<PhmapSeed2>);
} // namespace detail
void AggHashMapVariant::init(RuntimeState* state, Type type, AggStatistics* agg_stat) {
_type = type;

View File

@ -17,93 +17,15 @@
#pragma once
#include <type_traits>
#include <utility>
#include <variant>
#include "column/hash_set.h"
#include "exec/aggregate/agg_hash_map.h"
#include "exec/aggregate/agg_hash_set.h"
#include "exec/aggregate/agg_profile.h"
#include "types/logical_type.h"
#include "util/phmap/phmap.h"
namespace starrocks {
enum AggrPhase { AggrPhase1, AggrPhase2 };
#define APPLY_FOR_AGG_VARIANT_ALL(M) \
M(phase1_uint8) \
M(phase1_int8) \
M(phase1_int16) \
M(phase1_int32) \
M(phase1_int64) \
M(phase1_int128) \
M(phase1_decimal32) \
M(phase1_decimal64) \
M(phase1_decimal128) \
M(phase1_decimal256) \
M(phase1_date) \
M(phase1_timestamp) \
M(phase1_string) \
M(phase1_slice) \
M(phase1_null_uint8) \
M(phase1_null_int8) \
M(phase1_null_int16) \
M(phase1_null_int32) \
M(phase1_null_int64) \
M(phase1_null_int128) \
M(phase1_null_decimal32) \
M(phase1_null_decimal64) \
M(phase1_null_decimal128) \
M(phase1_null_decimal256) \
M(phase1_null_date) \
M(phase1_null_timestamp) \
M(phase1_null_string) \
M(phase1_slice_two_level) \
M(phase1_int32_two_level) \
M(phase1_null_string_two_level) \
M(phase1_string_two_level) \
\
M(phase2_uint8) \
M(phase2_int8) \
M(phase2_int16) \
M(phase2_int32) \
M(phase2_int64) \
M(phase2_int128) \
M(phase2_decimal32) \
M(phase2_decimal64) \
M(phase2_decimal128) \
M(phase2_decimal256) \
M(phase2_date) \
M(phase2_timestamp) \
M(phase2_string) \
M(phase2_slice) \
M(phase2_null_uint8) \
M(phase2_null_int8) \
M(phase2_null_int16) \
M(phase2_null_int32) \
M(phase2_null_int64) \
M(phase2_null_int128) \
M(phase2_null_decimal32) \
M(phase2_null_decimal64) \
M(phase2_null_decimal128) \
M(phase2_null_decimal256) \
M(phase2_null_date) \
M(phase2_null_timestamp) \
M(phase2_null_string) \
M(phase2_slice_two_level) \
M(phase2_int32_two_level) \
M(phase2_null_string_two_level) \
M(phase2_string_two_level) \
\
M(phase1_slice_fx4) \
M(phase1_slice_fx8) \
M(phase1_slice_fx16) \
M(phase2_slice_fx4) \
M(phase2_slice_fx8) \
M(phase2_slice_fx16)
// Aggregate Hash maps
// no-nullable single key maps:
@ -187,6 +109,16 @@ using SerializedKeyFixedSize8AggHashMap = AggHashMapWithSerializedKeyFixedSize<F
template <PhmapSeed seed>
using SerializedKeyFixedSize16AggHashMap = AggHashMapWithSerializedKeyFixedSize<FixedSize16SliceAggHashMap<seed>>;
// fixed compress key
template <PhmapSeed seed>
using CompressedFixedSize1AggHashMap = AggHashMapWithCompressedKeyFixedSize<Int8AggHashMap<seed>>;
template <PhmapSeed seed>
using CompressedFixedSize4AggHashMap = AggHashMapWithCompressedKeyFixedSize<Int32AggHashMap<seed>>;
template <PhmapSeed seed>
using CompressedFixedSize8AggHashMap = AggHashMapWithCompressedKeyFixedSize<Int64AggHashMap<seed>>;
template <PhmapSeed seed>
using CompressedFixedSize16AggHashMap = AggHashMapWithCompressedKeyFixedSize<Int128AggHashMap<seed>>;
// Hash sets
//
template <PhmapSeed seed>
@ -270,6 +202,15 @@ using SerializedKeyAggHashSetFixedSize8 = AggHashSetOfSerializedKeyFixedSize<Fix
template <PhmapSeed seed>
using SerializedKeyAggHashSetFixedSize16 = AggHashSetOfSerializedKeyFixedSize<FixedSize16SliceAggHashSet<seed>>;
template <PhmapSeed seed>
using CompressedAggHashSetFixedSize1 = AggHashSetCompressedFixedSize<Int8AggHashSet<seed>>;
template <PhmapSeed seed>
using CompressedAggHashSetFixedSize4 = AggHashSetCompressedFixedSize<Int32AggHashSet<seed>>;
template <PhmapSeed seed>
using CompressedAggHashSetFixedSize8 = AggHashSetCompressedFixedSize<Int64AggHashSet<seed>>;
template <PhmapSeed seed>
using CompressedAggHashSetFixedSize16 = AggHashSetCompressedFixedSize<Int128AggHashSet<seed>>;
// aggregate key
template <class HashMapWithKey>
struct CombinedFixedSizeKey {
@ -294,6 +235,24 @@ static_assert(!is_combined_fixed_size_key<Int32TwoLevelAggHashSetOfOneNumberKey<
static_assert(is_combined_fixed_size_key<SerializedKeyAggHashSetFixedSize4<PhmapSeed1>>);
static_assert(!is_combined_fixed_size_key<Int32TwoLevelAggHashMapWithOneNumberKey<PhmapSeed1>>);
// Type trait: true only for the bit-compressed fixed-size key containers
// (AggHashMapWithCompressedKeyFixedSize / AggHashSetCompressedFixedSize),
// false for every other hash map/set.
template <class HashMapWithKey>
struct CompressedFixedSizeKey {
    static auto constexpr value = false;
};

// Specialization: compressed-key aggregation hash map.
template <typename HashMap>
struct CompressedFixedSizeKey<AggHashMapWithCompressedKeyFixedSize<HashMap>> {
    static auto constexpr value = true;
};

// Specialization: compressed-key aggregation hash set.
template <typename HashSet>
struct CompressedFixedSizeKey<AggHashSetCompressedFixedSize<HashSet>> {
    static auto constexpr value = true;
};

// Convenience variable template for use in if constexpr / static_assert.
template <typename HashMapOrSetWithKey>
inline constexpr bool is_compressed_fixed_size_key = CompressedFixedSizeKey<HashMapOrSetWithKey>::value;
// 1) For different group by columns type, size, cardinality, volume, we should choose different
// hash functions and different hashmaps.
// When runtime, we will only have one hashmap.
@ -341,6 +300,10 @@ using AggHashMapWithKeyPtr = std::variant<
std::unique_ptr<SerializedKeyFixedSize4AggHashMap<PhmapSeed1>>,
std::unique_ptr<SerializedKeyFixedSize8AggHashMap<PhmapSeed1>>,
std::unique_ptr<SerializedKeyFixedSize16AggHashMap<PhmapSeed1>>,
std::unique_ptr<CompressedFixedSize1AggHashMap<PhmapSeed1>>,
std::unique_ptr<CompressedFixedSize4AggHashMap<PhmapSeed1>>,
std::unique_ptr<CompressedFixedSize8AggHashMap<PhmapSeed1>>,
std::unique_ptr<CompressedFixedSize16AggHashMap<PhmapSeed1>>,
std::unique_ptr<UInt8AggHashMapWithOneNumberKey<PhmapSeed2>>,
std::unique_ptr<Int8AggHashMapWithOneNumberKey<PhmapSeed2>>,
std::unique_ptr<Int16AggHashMapWithOneNumberKey<PhmapSeed2>>,
@ -373,7 +336,11 @@ using AggHashMapWithKeyPtr = std::variant<
std::unique_ptr<NullOneStringTwoLevelAggHashMap<PhmapSeed2>>,
std::unique_ptr<SerializedKeyFixedSize4AggHashMap<PhmapSeed2>>,
std::unique_ptr<SerializedKeyFixedSize8AggHashMap<PhmapSeed2>>,
std::unique_ptr<SerializedKeyFixedSize16AggHashMap<PhmapSeed2>>>;
std::unique_ptr<SerializedKeyFixedSize16AggHashMap<PhmapSeed2>>,
std::unique_ptr<CompressedFixedSize1AggHashMap<PhmapSeed2>>,
std::unique_ptr<CompressedFixedSize4AggHashMap<PhmapSeed2>>,
std::unique_ptr<CompressedFixedSize8AggHashMap<PhmapSeed2>>,
std::unique_ptr<CompressedFixedSize16AggHashMap<PhmapSeed2>>>;
using AggHashSetWithKeyPtr = std::variant<
std::unique_ptr<UInt8AggHashSetOfOneNumberKey<PhmapSeed1>>,
@ -441,7 +408,16 @@ using AggHashSetWithKeyPtr = std::variant<
std::unique_ptr<SerializedKeyAggHashSetFixedSize16<PhmapSeed1>>,
std::unique_ptr<SerializedKeyAggHashSetFixedSize4<PhmapSeed2>>,
std::unique_ptr<SerializedKeyAggHashSetFixedSize8<PhmapSeed2>>,
std::unique_ptr<SerializedKeyAggHashSetFixedSize16<PhmapSeed2>>>;
std::unique_ptr<SerializedKeyAggHashSetFixedSize16<PhmapSeed2>>,
std::unique_ptr<CompressedAggHashSetFixedSize1<PhmapSeed1>>,
std::unique_ptr<CompressedAggHashSetFixedSize4<PhmapSeed1>>,
std::unique_ptr<CompressedAggHashSetFixedSize8<PhmapSeed1>>,
std::unique_ptr<CompressedAggHashSetFixedSize16<PhmapSeed1>>,
std::unique_ptr<CompressedAggHashSetFixedSize1<PhmapSeed2>>,
std::unique_ptr<CompressedAggHashSetFixedSize4<PhmapSeed2>>,
std::unique_ptr<CompressedAggHashSetFixedSize8<PhmapSeed2>>,
std::unique_ptr<CompressedAggHashSetFixedSize16<PhmapSeed2>>>;
} // namespace detail
struct AggHashMapVariant {
enum class Type {
@ -481,6 +457,11 @@ struct AggHashMapVariant {
phase1_slice_fx8,
phase1_slice_fx16,
phase1_slice_cx1,
phase1_slice_cx4,
phase1_slice_cx8,
phase1_slice_cx16,
phase2_uint8,
phase2_int8,
phase2_int16,
@ -517,6 +498,10 @@ struct AggHashMapVariant {
phase2_slice_fx8,
phase2_slice_fx16,
phase2_slice_cx1,
phase2_slice_cx4,
phase2_slice_cx8,
phase2_slice_cx16,
};
detail::AggHashMapWithKeyPtr hash_map_with_key;
@ -630,6 +615,14 @@ struct AggHashSetVariant {
phase2_slice_fx8,
phase2_slice_fx16,
phase1_slice_cx1,
phase1_slice_cx4,
phase1_slice_cx8,
phase1_slice_cx16,
phase2_slice_cx1,
phase2_slice_cx4,
phase2_slice_cx8,
phase2_slice_cx16,
};
detail::AggHashSetWithKeyPtr hash_set_with_key;

View File

@ -14,7 +14,7 @@
#include "exec/aggregate/aggregate_base_node.h"
#include "gutil/strings/substitute.h"
#include "exec/aggregator.h"
namespace starrocks {

View File

@ -14,9 +14,7 @@
#pragma once
#include <any>
#include "exec/aggregator.h"
#include "exec/aggregator_fwd.h"
#include "exec/exec_node.h"
namespace starrocks {

View File

@ -16,13 +16,10 @@
#include <memory>
#include <type_traits>
#include <variant>
#include "exec/aggregator.h"
#include "exec/pipeline/aggregate/aggregate_blocking_sink_operator.h"
#include "exec/pipeline/aggregate/aggregate_blocking_source_operator.h"
#include "exec/pipeline/aggregate/aggregate_streaming_sink_operator.h"
#include "exec/pipeline/aggregate/aggregate_streaming_source_operator.h"
#include "exec/pipeline/aggregate/sorted_aggregate_streaming_sink_operator.h"
#include "exec/pipeline/aggregate/sorted_aggregate_streaming_source_operator.h"
#include "exec/pipeline/aggregate/spillable_aggregate_blocking_sink_operator.h"
@ -32,12 +29,8 @@
#include "exec/pipeline/chunk_accumulate_operator.h"
#include "exec/pipeline/exchange/local_exchange_source_operator.h"
#include "exec/pipeline/limit_operator.h"
#include "exec/pipeline/noop_sink_operator.h"
#include "exec/pipeline/operator.h"
#include "exec/pipeline/pipeline_builder.h"
#include "exec/pipeline/spill_process_operator.h"
#include "exec/sorted_streaming_aggregator.h"
#include "gutil/casts.h"
#include "runtime/current_thread.h"
#include "simd/simd.h"
@ -121,8 +114,7 @@ Status AggregateBlockingNode::open(RuntimeState* state) {
if (_aggregator->hash_map_variant().size() == 0) {
_aggregator->set_ht_eos();
}
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
} else if (_aggregator->is_none_group_by_exprs()) {
// for aggregate no group by, if _num_input_rows is 0,
// In update phase, we directly return empty chunk.

View File

@ -204,7 +204,7 @@ Status AggregateStreamingNode::get_next(RuntimeState* state, ChunkPtr* chunk, bo
Status AggregateStreamingNode::_output_chunk_from_hash_map(ChunkPtr* chunk) {
if (!_aggregator->it_hash().has_value()) {
_aggregator->it_hash() = _aggregator->_state_allocator.begin();
_aggregator->it_hash() = _aggregator->state_allocator().begin();
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_map_variant().size());
}

View File

@ -0,0 +1,296 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <any>
#include <optional>
#include "column/column_helper.h"
#include "column/column_visitor_adapter.h"
#include "column/decimalv3_column.h"
#include "column/nullable_column.h"
#include "common/status.h"
#include "exprs/literal.h"
#include "types/logical_type_infra.h"
#include "util/unaligned_access.h"
namespace starrocks {
// Maps a byte width N to the signed integer type of exactly that width.
// Group-by key bytes (integers, decimals, dates, ...) are reinterpreted
// through these types so the bit-compression arithmetic applies uniformly,
// regardless of the column's original C++ type.
template <size_t N>
struct int_type {};
template <>
struct int_type<1> {
    using type = int8_t;
};
template <>
struct int_type<2> {
    using type = int16_t;
};
template <>
struct int_type<4> {
    using type = int32_t;
};
template <>
struct int_type<8> {
    using type = int64_t;
};
template <>
struct int_type<16> {
    using type = __int128;
};
// Counts the leading zero bits of `v`, interpreting it as an unsigned value
// of T's width. Returns the full bit width for v == 0 (__builtin_clzll is
// undefined for a zero operand, so that case is handled up front).
//
// Fix: the original widened through `static_cast<size_t>` and subtracted
// `sizeof(size_t) * 8`, which conflates the platform size_t width with
// __builtin_clzll's fixed 64-bit operand (and would truncate a 64-bit value
// on an ILP32 target). Widen directly to unsigned long long instead.
template <class T>
int leading_zeros(T v) {
    constexpr int kWidth = static_cast<int>(sizeof(T) * 8);
    if (v == 0) {
        return kWidth;
    }
    // Convert through the unsigned counterpart first so negative values do
    // not sign-extend past T's width when widened.
    const auto widened = static_cast<unsigned long long>(static_cast<typename std::make_unsigned<T>::type>(v));
    return __builtin_clzll(widened) - (64 - kWidth);
}
// Specialization for 128-bit keys: split into two 64-bit halves and delegate
// to the 64-bit implementation.
template <>
int leading_zeros<int128_t>(int128_t v) {
    const auto high_part = static_cast<uint64_t>(v >> 64);
    if (high_part != 0) {
        return leading_zeros(high_part);
    }
    return 64 + leading_zeros(static_cast<uint64_t>(v));
}
template <class T>
int get_used_bits(T min, T max) {
using IntType = typename int_type<sizeof(T)>::type;
auto vmin = unaligned_load<IntType>(&min);
auto vmax = unaligned_load<IntType>(&max);
IntType delta = vmax - vmin;
return sizeof(T) * 8 - (leading_zeros<IntType>(delta));
}
// Computes the number of bits needed to encode a group-by key of logical
// type `ltype` whose value range is the constant literals [begin, end].
// On success, stores the range minimum into `base` (the subtraction base
// used later when compressing) and returns the bit count. Returns an empty
// optional for types that cannot be bit-compressed: anything that is not an
// integer / decimal / date type, or whose runtime value is wider than
// 16 bytes.
std::optional<int> get_used_bits(LogicalType ltype, const VectorizedLiteral& begin, const VectorizedLiteral& end,
                                 std::any& base) {
    size_t used_bits = 0;
    bool applied = scalar_type_dispatch(ltype, [&]<LogicalType Type>() {
        if constexpr ((lt_is_integer<Type> || lt_is_decimal<Type> ||
                       lt_is_date<Type>)&&(sizeof(RunTimeCppType<Type>) <= 16)) {
            // both bounds are constant columns produced by the planner
            RunTimeCppType<Type> cs_min = ColumnHelper::get_const_value<Type>(begin.value().get());
            RunTimeCppType<Type> cs_max = ColumnHelper::get_const_value<Type>(end.value().get());
            base = cs_min;
            used_bits = get_used_bits(cs_min, cs_max);
            return true;
        }
        return false;
    });
    if (applied) {
        return used_bits;
    }
    return {};
}
// Packs (val[i] - base) into dst[i] starting at bit `offset`.
// Non-nullable layout: [value bits] beginning at `offset`.
// Nullable layout:     [null flag (1 bit)][value bits] beginning at `offset`;
// a null row stores flag = 1 and its value bits are forced to zero.
// Bits are OR-ed in, so callers must pass a zero-initialized dst.
template <class TSrc, class TDst>
void bitcompress_serialize(const TSrc* __restrict val, const uint8_t* __restrict nulls, TSrc base, size_t n, int offset,
                           TDst* __restrict dst) {
    using UTSrc = typename std::make_unsigned<TSrc>::type;
    if (nulls != nullptr) {
        for (size_t row = 0; row < n; ++row) {
            // all-ones when the row is null, all-zeros otherwise
            const TSrc null_mask = -static_cast<TSrc>(nulls[row]);
            const TDst packed = UTSrc(val[row] - base) & ~null_mask;
            dst[row] |= TDst(nulls[row]) << offset;
            dst[row] |= packed << (offset + 1);
        }
    } else {
        for (size_t row = 0; row < n; ++row) {
            const TDst packed = UTSrc(val[row] - base);
            dst[row] |= packed << offset;
        }
    }
}
// Column visitor that bit-packs one group-by column into the shared
// fixed-size compressed key buffer `dst` (one Dst word per row). Each value
// is stored as (value - base) at bit `offset`; for nullable columns one
// extra bit holds the null flag (see bitcompress_serialize above).
template <class Dst>
class CompressSerializer : public ColumnVisitorAdapter<CompressSerializer<Dst>> {
public:
    using Base = ColumnVisitorAdapter<CompressSerializer<Dst>>;

    // `base` is the column's range minimum, type-erased (filled by
    // get_used_bits); `offset` is this column's starting bit in the packed key.
    CompressSerializer(Dst* dst, const std::any& base, int offset)
            : Base(this), _dst(dst), _base(base), _offset(offset) {}

    // Remember the null flags, then visit the wrapped data column.
    Status do_visit(const NullableColumn& column) {
        _null_data = column.null_column_data().data();
        return column.data_column()->accept(this);
    }

    // Packs a fixed-length column whose element width is 1/2/4/8/16 bytes.
    // The raw bytes are reinterpreted as a same-width integer (int_type) so
    // the subtraction/shift arithmetic is type-agnostic.
    template <typename Column, typename T>
    void bit_compress(const Column& column) {
        if constexpr (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16) {
            using SrcType = typename int_type<sizeof(T)>::type;
            const auto& container = column.get_data();
            const auto& raw_data = container.data();
            size_t n = container.size();
            auto base = std::any_cast<T>(_base);
            auto tbase = unaligned_load<SrcType>(&base);
            bitcompress_serialize((SrcType*)raw_data, _null_data, tbase, n, _offset, _dst);
        } else {
            CHECK(false) << "unreachable";
        }
    }

    template <typename T>
    Status do_visit(const FixedLengthColumn<T>& column) {
        bit_compress<FixedLengthColumn<T>, T>(column);
        return Status::OK();
    }

    template <typename T>
    Status do_visit(const DecimalV3Column<T>& column) {
        bit_compress<DecimalV3Column<T>, T>(column);
        return Status::OK();
    }

    // Fallback for unsupported column kinds; callers are expected to have
    // pre-filtered the types (could_apply_bitcompress_opt).
    template <typename T>
    Status do_visit(const T& column) {
        CHECK(false) << "unreachable";
        return Status::NotSupported("unsupported type");
    }

private:
    Dst* _dst;
    const std::any& _base;
    int _offset;
    const uint8_t* _null_data = nullptr;
};
// Returns a value with the low `bits` bits set. `bits` may equal T's full
// width, in which case all bits are set (shifting by the full width would be
// undefined behavior, so that case is special-cased).
template <class T>
T mask(T bits) {
    constexpr T kFullWidth = static_cast<T>(sizeof(T) * 8);
    if (bits == kFullWidth) {
        return static_cast<T>(~T(0));
    }
    return static_cast<T>((T(1) << bits) - 1);
}
// Inverse of bitcompress_serialize: extracts (packed value + base) rows from
// `src` into `dst`, and the per-row null flag into `nulls` when present.
// `used_bits` is the cumulative bit count through this column and `offset`
// its starting bit (see the offsets/used_bits bookkeeping in
// could_apply_bitcompress_opt), so this column's value width is
// used_bits - offset, minus one null-flag bit for nullable columns.
//
// Fixes: `n` was `int`, which silently narrowed the caller's size_t row
// count and made the `size_t i < n` comparison mixed-sign; the source
// pointer cast also stripped `const` via a C-style cast.
template <class TSrc, class TDst>
void bitcompress_deserialize(const TSrc* __restrict src, uint8_t* __restrict nulls, TDst base, size_t n, int used_bits,
                             int offset, TDst* __restrict dst) {
    using UTSrc = typename std::make_unsigned<TSrc>::type;
    // Read through the unsigned counterpart so right shifts are logical,
    // not arithmetic.
    const auto* usrc = reinterpret_cast<const UTSrc*>(src);
    const uint8_t mask1 = mask<uint8_t>(1);
    const TSrc mask2 = mask<TSrc>(used_bits - offset - (nulls != nullptr));
    if (nulls == nullptr) {
        for (size_t i = 0; i < n; ++i) {
            dst[i] = ((usrc[i] >> (offset)) & mask2) + base;
        }
    } else {
        for (size_t i = 0; i < n; ++i) {
            nulls[i] = (usrc[i] >> offset) & mask1;
            dst[i] = ((usrc[i] >> (offset + 1)) & mask2) + base;
        }
    }
}
// Column visitor that unpacks one group-by column from the fixed-size
// compressed key buffer `src` (one Src word per row), restoring
// value = packed + base. Mirror of CompressSerializer.
template <class Src>
class CompressDeserializer final : public ColumnVisitorMutableAdapter<CompressDeserializer<Src>> {
public:
    using Base = ColumnVisitorMutableAdapter<CompressDeserializer<Src>>;

    // `used_bits` is the cumulative bit count through this column and
    // `offset` its starting bit, matching the layout chosen at serialize time.
    explicit CompressDeserializer(size_t num_rows, Src* src, const std::any& base, int offset, int used_bits)
            : Base(this), _num_rows(num_rows), _src(src), _base(base), _offset(offset), _used_bits(used_bits) {}

    // Materialize the null flag column first, then fill the data column.
    Status do_visit(NullableColumn* column) {
        // TODO: opt me
        column->null_column_data().resize(_num_rows);
        _null_data = column->null_column_data().data();
        RETURN_IF_ERROR(column->data_column()->accept_mutable(this));
        // null flags were written directly into the raw buffer above, so the
        // cached has_null state must be recomputed
        column->update_has_null();
        return Status::OK();
    }

    // Decompresses into a fixed-length column of 1/2/4/8/16-byte elements;
    // raw bytes are reinterpreted as same-width integers (int_type).
    template <typename Column, typename T>
    void bit_decompress(Column* column) {
        if constexpr (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16) {
            using DstType = typename int_type<sizeof(T)>::type;
            column->resize(_num_rows);
            auto& container = column->get_data();
            auto* raw_data = container.data();
            auto base = std::any_cast<T>(_base);
            auto tbase = unaligned_load<DstType>(&base);
            bitcompress_deserialize(_src, _null_data, tbase, _num_rows, _used_bits, _offset, (DstType*)raw_data);
        } else {
            CHECK(false) << "unreachable";
        }
    }

    template <typename T>
    Status do_visit(FixedLengthColumn<T>* column) {
        bit_decompress<FixedLengthColumn<T>, T>(column);
        return Status::OK();
    }

    template <typename T>
    Status do_visit(DecimalV3Column<T>* column) {
        bit_decompress<DecimalV3Column<T>, T>(column);
        return Status::OK();
    }

    // Fallback for unsupported column kinds; callers pre-filter types.
    template <typename T>
    Status do_visit(const T& column) {
        DCHECK(false) << "unreachable";
        return Status::NotSupported("unsupported type");
    }

private:
    size_t _num_rows;
    const Src* _src;
    const std::any& _base;
    int _offset;
    int _used_bits;
    uint8_t* _null_data = nullptr;
};
// Serializes every group-by column into `buffer` in bit-compressed form.
// `fixed_key_size` selects the packed word width (1/4/8/16 bytes per row);
// each column is OR-ed into the shared buffer at its precomputed bit offset.
// NOTE(review): num_rows is not read here — each column carries its own row
// count; the parameter is kept for symmetry with bitcompress_deserialize.
void bitcompress_serialize(const Columns& columns, const std::vector<std::any>& bases, const std::vector<int>& offsets,
                           size_t num_rows, size_t fixed_key_size, void* buffer) {
    for (size_t i = 0; i < columns.size(); ++i) {
        if (fixed_key_size == 1) {
            CompressSerializer<uint8_t> serializer((uint8_t*)buffer, bases[i], offsets[i]);
            (void)columns[i]->accept(&serializer);
        } else if (fixed_key_size == 4) {
            CompressSerializer<int> serializer((int*)buffer, bases[i], offsets[i]);
            (void)columns[i]->accept(&serializer);
        } else if (fixed_key_size == 8) {
            CompressSerializer<int64_t> serializer((int64_t*)buffer, bases[i], offsets[i]);
            (void)columns[i]->accept(&serializer);
        } else if (fixed_key_size == 16) {
            CompressSerializer<int128_t> serializer((int128_t*)buffer, bases[i], offsets[i]);
            (void)columns[i]->accept(&serializer);
        } else {
            DCHECK(false) << "unreachable path";
        }
    }
}
// Deserializes every group-by column from the bit-compressed key buffer,
// resizing each output column to `num_rows`. `fixed_key_size` must match the
// word width used at serialize time (1/4/8/16 bytes per row).
void bitcompress_deserialize(Columns& columns, const std::vector<std::any>& bases, const std::vector<int>& offsets,
                             const std::vector<int>& used_bits, size_t num_rows, size_t fixed_key_size, void* buffer) {
    for (size_t i = 0; i < columns.size(); ++i) {
        if (fixed_key_size == 1) {
            CompressDeserializer<uint8_t> deserializer(num_rows, (uint8_t*)buffer, bases[i], offsets[i], used_bits[i]);
            (void)columns[i]->accept_mutable(&deserializer);
        } else if (fixed_key_size == 4) {
            CompressDeserializer<int> deserializer(num_rows, (int*)buffer, bases[i], offsets[i], used_bits[i]);
            (void)columns[i]->accept_mutable(&deserializer);
        } else if (fixed_key_size == 8) {
            CompressDeserializer<int64_t> deserializer(num_rows, (int64_t*)buffer, bases[i], offsets[i], used_bits[i]);
            (void)columns[i]->accept_mutable(&deserializer);
        } else if (fixed_key_size == 16) {
            CompressDeserializer<int128_t> deserializer(num_rows, (int128_t*)buffer, bases[i], offsets[i],
                                                        used_bits[i]);
            (void)columns[i]->accept_mutable(&deserializer);
        } else {
            DCHECK(false) << "unreachable path";
        }
    }
}
} // namespace starrocks

View File

@ -0,0 +1,48 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

// Fix: the declarations below use std::optional, std::vector and size_t but
// the header only included <any>, relying on transitive includes from
// "column/column.h". Include what we use.
#include <any>
#include <cstddef>
#include <optional>
#include <vector>

#include "column/column.h"
#include "types/logical_type.h"

namespace starrocks {

class VectorizedLiteral;

/**
 * Calculates the number of bits used between a given range for a specified logical type.
 *
 * This function calculates the number of bits required for a given logical type and a specified range
 * of start and end values. The result is an optional integer representing the calculated number of bits.
 * On success, `base` receives the range minimum, used later as the subtraction base when compressing.
 *
 * If we input a column that does not support bit compress, we will return an empty optional.
 */
std::optional<int> get_used_bits(LogicalType ltype, const VectorizedLiteral& begin, const VectorizedLiteral& end,
                                 std::any& base);

/**
 * serialize column data into a bit-compressed format.
 */
void bitcompress_serialize(const Columns& columns, const std::vector<std::any>& bases, const std::vector<int>& offsets,
                           size_t num_rows, size_t fixed_key_size, void* buffer);

/**
 * deserialize column data from a bit-compressed format.
 */
void bitcompress_deserialize(Columns& columns, const std::vector<std::any>& bases, const std::vector<int>& offsets,
                             const std::vector<int>& used_bits, size_t num_rows, size_t fixed_key_size, void* buffer);

} // namespace starrocks

View File

@ -17,27 +17,27 @@
#include <algorithm>
#include <memory>
#include <type_traits>
#include <variant>
#include <utility>
#include "column/chunk.h"
#include "column/column_helper.h"
#include "column/vectorized_fwd.h"
#include "common/config.h"
#include "common/logging.h"
#include "common/status.h"
#include "exec/agg_runtime_filter_builder.h"
#include "exec/aggregate/agg_hash_variant.h"
#include "exec/aggregate/agg_profile.h"
#include "exec/exec_node.h"
#include "exec/limited_pipeline_chunk_buffer.h"
#include "exec/pipeline/operator.h"
#include "exec/spill/spiller.hpp"
#include "exprs/agg/agg_state_if.h"
#include "exprs/agg/agg_state_merge.h"
#include "exprs/agg/agg_state_union.h"
#include "exprs/agg/aggregate_factory.h"
#include "exprs/agg/aggregate_state_allocator.h"
#include "exprs/literal.h"
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/current_thread.h"
#include "runtime/descriptors.h"
#include "runtime/memory/roaring_hook.h"
#include "types/logical_type.h"
#include "udf/java/utils.h"
#include "util/runtime_profile.h"
@ -52,6 +52,60 @@ static const std::string AGG_STATE_MERGE_SUFFIX = "_merge";
static const std::string AGG_STATE_IF_SUFFIX = "_if";
static const std::string FUNCTION_COUNT = "count";
// Functor handed to the hash table: allocates and initializes the per-group
// aggregate state block for a newly inserted key, or for the dedicated
// null-key slot (the nullptr_t overload).
template <class HashMapWithKey>
struct AllocateState {
    AllocateState(Aggregator* aggregator_) : aggregator(aggregator_) {}
    inline AggDataPtr operator()(const typename HashMapWithKey::KeyType& key);
    inline AggDataPtr operator()(std::nullptr_t);

private:
    Aggregator* aggregator;
};
// Allocates one aggregate-state block, writes the key at its head, then
// constructs every aggregate function's state in place. On bad_alloc the
// states created so far are destroyed, the allocation is rolled back, and
// the exception is rethrown so the caller sees a clean allocator.
template <class HashMapWithKey>
inline AggDataPtr AllocateState<HashMapWithKey>::operator()(const typename HashMapWithKey::KeyType& key) {
    AggDataPtr agg_state = aggregator->_state_allocator.allocate();
    // the key is stored inline at the start of the state block
    *reinterpret_cast<typename HashMapWithKey::KeyType*>(agg_state) = key;
    size_t created = 0;
    size_t aggregate_function_sz = aggregator->_agg_fn_ctxs.size();
    try {
        for (int i = 0; i < aggregate_function_sz; i++) {
            aggregator->_agg_functions[i]->create(aggregator->_agg_fn_ctxs[i],
                                                  agg_state + aggregator->_agg_states_offsets[i]);
            created++;
        }
        return agg_state;
    } catch (std::bad_alloc& e) {
        // destroy only the states whose create() completed
        for (size_t i = 0; i < created; ++i) {
            aggregator->_agg_functions[i]->destroy(aggregator->_agg_fn_ctxs[i],
                                                   agg_state + aggregator->_agg_states_offsets[i]);
        }
        aggregator->_state_allocator.rollback();
        throw;
    }
}
// Same as the keyed overload, but for the dedicated null-key state block
// (no key is written into the block).
// NOTE(review): unlike the keyed overload, the bad_alloc path does not call
// _state_allocator.rollback() — presumably allocate_null_key_data() needs no
// rollback; confirm against HashTableKeyAllocator.
template <class HashMapWithKey>
inline AggDataPtr AllocateState<HashMapWithKey>::operator()(std::nullptr_t) {
    AggDataPtr agg_state = aggregator->_state_allocator.allocate_null_key_data();
    size_t created = 0;
    size_t aggregate_function_sz = aggregator->_agg_fn_ctxs.size();
    try {
        for (int i = 0; i < aggregate_function_sz; i++) {
            aggregator->_agg_functions[i]->create(aggregator->_agg_fn_ctxs[i],
                                                  agg_state + aggregator->_agg_states_offsets[i]);
            created++;
        }
        return agg_state;
    } catch (std::bad_alloc& e) {
        // destroy only the states whose create() completed
        for (int i = 0; i < created; i++) {
            aggregator->_agg_functions[i]->destroy(aggregator->_agg_fn_ctxs[i],
                                                   agg_state + aggregator->_agg_states_offsets[i]);
        }
        throw;
    }
}
template <bool UseIntermediateAsOutput>
bool AggFunctionTypes::is_result_nullable() const {
if constexpr (UseIntermediateAsOutput) {
@ -143,6 +197,9 @@ AggregatorParamsPtr convert_to_aggregator_params(const TPlanNode& tnode) {
params->intermediate_aggr_exprs = tnode.agg_node.intermediate_aggr_exprs;
params->enable_pipeline_share_limit =
tnode.agg_node.__isset.enable_pipeline_share_limit ? tnode.agg_node.enable_pipeline_share_limit : false;
params->grouping_min_max =
tnode.agg_node.__isset.group_by_min_max ? tnode.agg_node.group_by_min_max : std::vector<TExpr>{};
break;
}
default:
@ -358,6 +415,16 @@ Status Aggregator::prepare(RuntimeState* state, ObjectPool* pool, RuntimeProfile
RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _params->conjuncts, &_conjunct_ctxs, state, true));
RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _params->grouping_exprs, &_group_by_expr_ctxs, state, true));
RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _params->grouping_min_max, &_group_by_min_max, state, true));
_ranges.resize(_group_by_expr_ctxs.size());
if (_group_by_min_max.size() == _group_by_expr_ctxs.size() * 2) {
for (size_t i = 0; i < _group_by_expr_ctxs.size(); ++i) {
std::pair<VectorizedLiteral*, VectorizedLiteral*> range;
range.first = down_cast<VectorizedLiteral*>(_group_by_min_max[i * 2]->root());
range.second = down_cast<VectorizedLiteral*>(_group_by_min_max[i * 2 + 1]->root());
_ranges[i] = range;
}
}
// add profile attributes
if (!_params->sql_grouping_keys.empty()) {
@ -582,7 +649,7 @@ Status Aggregator::_create_aggregate_function(starrocks::RuntimeState* state, co
TypeDescriptor return_type = TypeDescriptor::from_thrift(fn.ret_type);
TypeDescriptor serde_type = TypeDescriptor::from_thrift(fn.aggregate_fn.intermediate_type);
DCHECK_LE(1, fn.arg_types.size());
TypeDescriptor arg_type = arg_types[0];
const TypeDescriptor& arg_type = arg_types[0];
auto* func = get_aggregate_function(func_name, return_type, arg_types, is_result_nullable, fn.binary_type,
state->func_version());
if (func == nullptr) {
@ -1287,19 +1354,76 @@ Status Aggregator::evaluate_agg_fn_exprs(Chunk* chunk, bool use_intermediate) {
return Status::OK();
}
bool is_group_columns_fixed_size(std::vector<ExprContext*>& group_by_expr_ctxs, std::vector<ColumnType>& group_by_types,
size_t* max_size, bool* has_null) {
bool could_apply_bitcompress_opt(
const std::vector<ColumnType>& group_by_types,
const std::vector<std::optional<std::pair<VectorizedLiteral*, VectorizedLiteral*>>>& ranges,
std::vector<std::any>& base, std::vector<int>& used_bytes, size_t* max_size, bool* has_null) {
size_t accumulated = 0;
size_t accumulated_fixed_length_bits = 0;
for (size_t i = 0; i < group_by_types.size(); i++) {
size_t size = 0;
// 1 bytes for null flag.
if (group_by_types[i].is_nullable) {
*has_null = true;
size += 1;
}
if (group_by_types[i].result_type.is_complex_type()) {
return false;
}
LogicalType ltype = group_by_types[i].result_type.type;
size_t fixed_base_size = get_size_of_fixed_length_type(ltype);
if (fixed_base_size == 0) return false;
accumulated_fixed_length_bits += fixed_base_size * 8;
if (!ranges[i].has_value()) {
return false;
}
auto used_bits = get_used_bits(ltype, *ranges[i]->first, *ranges[i]->second, base[i]);
if (!used_bits.has_value()) {
return false;
}
size += used_bits.value();
accumulated += size;
used_bytes[i] = accumulated;
}
auto get_level = [](size_t used_bits) {
if (used_bits <= sizeof(uint8_t) * 8)
return 1;
else if (used_bits <= sizeof(uint16_t) * 8)
return 2;
else if (used_bits <= sizeof(uint32_t) * 8)
return 3;
else if (used_bits <= sizeof(uint64_t) * 8)
return 4;
else if (used_bits <= sizeof(int128_t) * 8)
return 5;
else
return 6;
};
// If they are at the same level, grouping by compressed key will not optimize performance, so we disable it.
// eg: For example, two int32 values both have a threshold of 0-2^32, so they need to use group by int64.
// In this case, there will be no optimization effect. We disable this situation.
if (get_level(accumulated_fixed_length_bits) > get_level(accumulated)) {
*max_size = accumulated;
return true;
}
return false;
}
bool is_group_columns_fixed_size(std::vector<ColumnType>& group_by_types, size_t* max_size, bool* has_null) {
size_t size = 0;
*has_null = false;
for (size_t i = 0; i < group_by_expr_ctxs.size(); i++) {
ExprContext* ctx = group_by_expr_ctxs[i];
for (size_t i = 0; i < group_by_types.size(); i++) {
// 1 bytes for null flag.
if (group_by_types[i].is_nullable) {
*has_null = true;
size += 1; // 1 bytes for null flag.
size += 1;
}
LogicalType ltype = ctx->root()->type().type;
if (ctx->root()->type().is_complex_type()) {
LogicalType ltype = group_by_types[i].result_type.type;
if (group_by_types[i].result_type.is_complex_type()) {
return false;
}
size_t byte_size = get_size_of_fixed_length_type(ltype);
@ -1311,20 +1435,30 @@ bool is_group_columns_fixed_size(std::vector<ExprContext*>& group_by_expr_ctxs,
}
template <typename HashVariantType>
void Aggregator::_init_agg_hash_variant(HashVariantType& hash_variant) {
typename HashVariantType::Type Aggregator::_get_hash_table_type() {
auto type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice : HashVariantType::Type::phase2_slice;
if (_group_by_expr_ctxs.size() == 1) {
type = HashVariantResolver<HashVariantType>::instance().get_unary_type(
_aggr_phase, _group_by_types[0].result_type.type, _has_nullable_key);
if (_group_by_types.empty()) {
return type;
}
// using one key hash table
if (_group_by_types.size() == 1) {
bool nullable = _group_by_types[0].is_nullable;
LogicalType type = _group_by_types[0].result_type.type;
return HashVariantResolver<HashVariantType>::instance().get_unary_type(_aggr_phase, type, nullable);
}
return type;
}
template <typename HashVariantType>
typename HashVariantType::Type Aggregator::_try_to_apply_fixed_size_opt(typename HashVariantType::Type type,
bool* has_null, int* fixed_size) {
bool has_null_column = false;
int fixed_byte_size = 0;
// this optimization don't need to be limited to multi-column group by.
// single column like float/double/decimal/largeint could also be applied to.
if (type == HashVariantType::Type::phase1_slice || type == HashVariantType::Type::phase2_slice) {
size_t max_size = 0;
if (is_group_columns_fixed_size(_group_by_expr_ctxs, _group_by_types, &max_size, &has_null_column)) {
if (is_group_columns_fixed_size(_group_by_types, &max_size, &has_null_column)) {
// we need reserve a byte for serialization length for nullable columns
if (max_size < 4 || (!has_null_column && max_size == 4)) {
type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_fx4
@ -1341,6 +1475,99 @@ void Aggregator::_init_agg_hash_variant(HashVariantType& hash_variant) {
}
}
}
*has_null = has_null_column;
*fixed_size = fixed_byte_size;
return type;
}
// Tries to upgrade `input_type` to a compressed-key hash table type.
// Requires a [min, max] literal range for every group-by column (collected
// in _ranges from the planner's group_by_min_max exprs). On success, returns
// a phase1/phase2 slice_cx{1,4,8,16} type that packs all keys into a single
// 1/4/8/16-byte word, and `ctx` receives the per-column subtraction bases,
// cumulative bit counts and bit offsets needed by the compressed containers.
// Returns `input_type` unchanged when the optimization does not apply.
template <typename HashVariantType>
typename HashVariantType::Type Aggregator::_try_to_apply_compressed_key_opt(typename HashVariantType::Type input_type,
                                                                            CompressKeyContext* ctx) {
    typename HashVariantType::Type type = input_type;
    if (_group_by_types.empty()) {
        return type;
    }
    // every group-by column must come with a min/max range
    for (size_t i = 0; i < _ranges.size(); ++i) {
        if (!_ranges[i].has_value()) {
            return type;
        }
    }
    // check apply bit compress opt
    {
        // NOTE(review): has_null_column is only conditionally written by
        // could_apply_bitcompress_opt and is not read here.
        bool has_null_column;
        size_t new_max_bit_size = 0;
        std::vector<int>& offsets = ctx->offsets;
        std::vector<int>& used_bits = ctx->used_bits;
        std::vector<std::any>& bases = ctx->bases;
        size_t group_by_keys = _group_by_types.size();
        used_bits.resize(group_by_keys);
        offsets.resize(group_by_keys);
        bases.resize(group_by_keys);
        if (could_apply_bitcompress_opt(_group_by_types, _ranges, bases, used_bits, &new_max_bit_size,
                                        &has_null_column)) {
            if (_group_by_types.size() > 0) {
                // choose the narrowest packed word that fits all key bits
                if (new_max_bit_size <= 8) {
                    type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_cx1
                                                    : HashVariantType::Type::phase2_slice_cx1;
                } else if (new_max_bit_size <= 4 * 8) {
                    type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_cx4
                                                    : HashVariantType::Type::phase2_slice_cx4;
                } else if (new_max_bit_size <= 8 * 8) {
                    type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_cx8
                                                    : HashVariantType::Type::phase2_slice_cx8;
                } else if (new_max_bit_size <= 16 * 8) {
                    type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_cx16
                                                    : HashVariantType::Type::phase2_slice_cx16;
                }
            }
        }
        // used_bits holds cumulative bit counts, so column i starts right
        // after the bits consumed by columns [0, i)
        offsets[0] = 0;
        for (size_t i = 1; i < group_by_keys; ++i) {
            offsets[i] = used_bits[i - 1];
        }
    }
    return type;
}
// Initializes `hash_variant` with the chosen container type and, when that
// container is keyed by bit-compressed fixed-size keys, moves the
// compression metadata (offsets / used_bits / bases) into it.
template <typename HashVariantType>
void Aggregator::_build_hash_variant(HashVariantType& hash_variant, typename HashVariantType::Type type,
                                     CompressKeyContext&& context) {
    hash_variant.init(_state, type, _agg_stat);
    hash_variant.visit([&](auto& variant) {
        // visit dispatches to the single active alternative, so `context`
        // is moved from at most once
        if constexpr (is_compressed_fixed_size_key<std::decay_t<decltype(*variant)>>) {
            variant->offsets = std::move(context.offsets);
            variant->used_bits = std::move(context.used_bits);
            variant->bases = std::move(context.bases);
        }
    });
}
template <typename HashVariantType>
void Aggregator::_init_agg_hash_variant(HashVariantType& hash_variant) {
auto type = _get_hash_table_type<HashVariantType>();
CompressKeyContext compress_key_ctx;
bool apply_compress_key_opt = false;
typename HashVariantType::Type prev_type = type;
type = _try_to_apply_compressed_key_opt<HashVariantType>(type, &compress_key_ctx);
apply_compress_key_opt = prev_type != type;
if (apply_compress_key_opt) {
// build with compressed key
VLOG_ROW << "apply compressed key";
_build_hash_variant<HashVariantType>(hash_variant, type, std::move(compress_key_ctx));
return;
}
bool has_null_column = false;
int fixed_byte_size = 0;
if (_group_by_types.size() > 1) {
type = _try_to_apply_fixed_size_opt<HashVariantType>(type, &has_null_column, &fixed_byte_size);
}
VLOG_ROW << "hash type is "
<< static_cast<typename std::underlying_type<typename HashVariantType::Type>::type>(type);

View File

@ -19,40 +19,34 @@
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <new>
#include <queue>
#include <utility>
#include "column/chunk.h"
#include "column/column_helper.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
#include "common/object_pool.h"
#include "common/statusor.h"
#include "exec/aggregate/agg_hash_variant.h"
#include "exec/aggregate/agg_profile.h"
#include "exec/chunk_buffer_memory_manager.h"
#include "exec/aggregator_fwd.h"
#include "exec/limited_pipeline_chunk_buffer.h"
#include "exec/pipeline/context_with_dependency.h"
#include "exec/pipeline/schedule/observer.h"
#include "exec/pipeline/spill_process_channel.h"
#include "exprs/agg/aggregate_factory.h"
#include "exprs/agg/aggregate.h"
#include "exprs/expr.h"
#include "gen_cpp/QueryPlanExtra_types.h"
#include "gutil/strings/substitute.h"
#include "runtime/current_thread.h"
#include "runtime/descriptors.h"
#include "runtime/mem_pool.h"
#include "runtime/memory/counting_allocator.h"
#include "runtime/runtime_state.h"
#include "runtime/types.h"
#include "util/defer_op.h"
namespace starrocks {
class RuntimeFilter;
class AggInRuntimeFilterMerger;
struct HashTableKeyAllocator;
class VectorizedLiteral;
struct RawHashTableIterator {
RawHashTableIterator(HashTableKeyAllocator* alloc_, size_t x_, int y_) : alloc(alloc_), x(x_), y(y_) {}
@ -117,19 +111,6 @@ inline uint8_t* RawHashTableIterator::value() {
return static_cast<uint8_t*>(alloc->vecs[x].first) + alloc->aggregate_key_size * y;
}
class Aggregator;
class SortedStreamingAggregator;
template <class HashMapWithKey>
struct AllocateState {
AllocateState(Aggregator* aggregator_) : aggregator(aggregator_) {}
inline AggDataPtr operator()(const typename HashMapWithKey::KeyType& key);
inline AggDataPtr operator()(std::nullptr_t);
private:
Aggregator* aggregator;
};
struct AggFunctionTypes {
TypeDescriptor result_type;
TypeDescriptor serde_type; // for serialize
@ -227,6 +208,7 @@ struct AggregatorParams {
std::vector<TExpr> grouping_exprs;
std::vector<TExpr> aggregate_functions;
std::vector<TExpr> intermediate_aggr_exprs;
std::vector<TExpr> grouping_min_max;
// Incremental MV
// Whether it's testing, use MemStateTable in testing, instead use IMTStateTable.
@ -255,12 +237,6 @@ AggregatorParamsPtr convert_to_aggregator_params(const TPlanNode& tnode);
// it contains common data struct and algorithm of aggregation
class Aggregator : public pipeline::ContextWithDependency {
public:
#ifdef NDEBUG
static constexpr size_t two_level_memory_threshold = 33554432; // 32M, L3 Cache
#else
static constexpr size_t two_level_memory_threshold = 64;
#endif
Aggregator(AggregatorParamsPtr params);
~Aggregator() noexcept override {
@ -414,7 +390,7 @@ public:
bool is_streaming_all_states() const { return _streaming_all_states; }
HashTableKeyAllocator _state_allocator;
HashTableKeyAllocator& state_allocator() { return _state_allocator; }
void attach_sink_observer(RuntimeState* state, pipeline::PipelineObserver* observer) {
_pip_observable.attach_sink_observer(state, observer);
@ -435,6 +411,8 @@ protected:
std::unique_ptr<MemPool> _mem_pool;
// used to count heap memory usage of agg states
std::unique_ptr<CountingAllocatorWithHook> _allocator;
HashTableKeyAllocator _state_allocator;
// The open phase still relies on the TFunction object for some initialization operations
std::vector<TFunction> _fns;
@ -501,6 +479,8 @@ protected:
// Exprs used to evaluate group by column
std::vector<ExprContext*> _group_by_expr_ctxs;
std::vector<ExprContext*> _group_by_min_max;
std::vector<std::optional<std::pair<VectorizedLiteral*, VectorizedLiteral*>>> _ranges;
Columns _group_by_columns;
std::vector<ColumnType> _group_by_types;
@ -598,6 +578,24 @@ protected:
// Choose different agg hash map/set by different group by column's count, type, nullable
template <typename HashVariantType>
void _init_agg_hash_variant(HashVariantType& hash_variant);
// get spec hash table/set type
template <typename HashVariantType>
typename HashVariantType::Type _get_hash_table_type();
template <typename HashVariantType>
typename HashVariantType::Type _try_to_apply_fixed_size_opt(typename HashVariantType::Type type,
bool* has_null_column, int* fixed_byte_size);
struct CompressKeyContext {
std::vector<int> offsets;
std::vector<int> used_bits;
std::vector<std::any> bases;
};
template <typename HashVariantType>
typename HashVariantType::Type _try_to_apply_compressed_key_opt(typename HashVariantType::Type input_type,
CompressKeyContext* ctx);
template <typename HashVariantType>
void _build_hash_variant(HashVariantType& hash_variant, typename HashVariantType::Type type,
CompressKeyContext&& context);
void _release_agg_memory();
@ -608,7 +606,7 @@ protected:
int64_t get_two_level_threahold() {
if (config::two_level_memory_threshold < 0) {
return two_level_memory_threshold;
return agg::two_level_memory_threshold;
}
return config::two_level_memory_threshold;
}
@ -617,50 +615,6 @@ protected:
friend struct AllocateState;
};
// Allocates and initializes the per-group aggregation state for a newly inserted
// hash-table key. Layout: the key is stored at the head of the state buffer, and
// each aggregate function's state lives at its precomputed offset
// (_agg_states_offsets[i]).
// Exception safety: if any create() throws std::bad_alloc, the states created so
// far are destroyed, the key-slot allocation is rolled back, and the exception is
// rethrown, so the caller never observes a partially initialized state.
template <class HashMapWithKey>
inline AggDataPtr AllocateState<HashMapWithKey>::operator()(const typename HashMapWithKey::KeyType& key) {
    AggDataPtr agg_state = aggregator->_state_allocator.allocate();
    // Copy the key into the head of the state buffer.
    *reinterpret_cast<typename HashMapWithKey::KeyType*>(agg_state) = key;
    size_t created = 0;
    size_t aggregate_function_sz = aggregator->_agg_fn_ctxs.size();
    try {
        for (int i = 0; i < aggregate_function_sz; i++) {
            aggregator->_agg_functions[i]->create(aggregator->_agg_fn_ctxs[i],
                                                  agg_state + aggregator->_agg_states_offsets[i]);
            created++;
        }
        return agg_state;
    } catch (std::bad_alloc& e) {
        // Destroy only the states that were successfully created before the failure.
        for (size_t i = 0; i < created; ++i) {
            aggregator->_agg_functions[i]->destroy(aggregator->_agg_fn_ctxs[i],
                                                   agg_state + aggregator->_agg_states_offsets[i]);
        }
        aggregator->_state_allocator.rollback();
        throw;
    }
}
// Allocates and initializes the aggregation state for the dedicated null-key slot
// (group-by key is NULL). Mirrors the keyed overload above, but no key is stored.
// NOTE(review): unlike the keyed overload, this path does not call
// _state_allocator.rollback() on failure — presumably allocate_null_key_data()
// needs no rollback; confirm against HashTableKeyAllocator.
template <class HashMapWithKey>
inline AggDataPtr AllocateState<HashMapWithKey>::operator()(std::nullptr_t) {
    AggDataPtr agg_state = aggregator->_state_allocator.allocate_null_key_data();
    size_t created = 0;
    size_t aggregate_function_sz = aggregator->_agg_fn_ctxs.size();
    try {
        for (int i = 0; i < aggregate_function_sz; i++) {
            aggregator->_agg_functions[i]->create(aggregator->_agg_fn_ctxs[i],
                                                  agg_state + aggregator->_agg_states_offsets[i]);
            created++;
        }
        return agg_state;
    } catch (std::bad_alloc& e) {
        // Destroy only the states that were successfully created before rethrowing.
        for (int i = 0; i < created; i++) {
            aggregator->_agg_functions[i]->destroy(aggregator->_agg_fn_ctxs[i],
                                                   agg_state + aggregator->_agg_states_offsets[i]);
        }
        throw;
    }
}
// Returns true when a memory limit is configured and the aggregator's current
// memory usage has reached or exceeded that limit.
inline bool LimitedMemAggState::has_limited(const Aggregator& aggregator) const {
    if (limited_memory_size <= 0) {
        return false; // No limit configured.
    }
    return aggregator.memory_usage() >= limited_memory_size;
}
@ -702,11 +656,4 @@ private:
std::atomic<int64_t> _shared_limit_countdown;
};
using AggregatorFactory = AggregatorFactoryBase<Aggregator>;
using AggregatorFactoryPtr = std::shared_ptr<AggregatorFactory>;
using SortedStreamingAggregatorPtr = std::shared_ptr<SortedStreamingAggregator>;
using StreamingAggregatorFactory = AggregatorFactoryBase<SortedStreamingAggregator>;
using StreamingAggregatorFactoryPtr = std::shared_ptr<StreamingAggregatorFactory>;
} // namespace starrocks

View File

@ -0,0 +1,32 @@
#pragma once

#include <cstddef>
#include <memory>

namespace starrocks {

namespace agg {
// Memory threshold (bytes) above which aggregation switches to a two-level hash table.
#ifdef NDEBUG
constexpr size_t two_level_memory_threshold = 33554432; // 32M, L3 Cache
#else
// Tiny threshold in debug builds so the two-level code path gets exercised by tests.
constexpr size_t two_level_memory_threshold = 64;
#endif
} // namespace agg

// Forward declarations and shared aliases for the aggregator types, so other
// headers can refer to them without including the full aggregator definitions.
class Aggregator;
class SortedStreamingAggregator;

using AggregatorPtr = std::shared_ptr<Aggregator>;
using SortedStreamingAggregatorPtr = std::shared_ptr<SortedStreamingAggregator>;

template <class HashMapWithKey>
struct AllocateState;

template <class T>
class AggregatorFactoryBase;

using AggregatorFactory = AggregatorFactoryBase<Aggregator>;
using AggregatorFactoryPtr = std::shared_ptr<AggregatorFactory>;
using StreamingAggregatorFactory = AggregatorFactoryBase<SortedStreamingAggregator>;
using StreamingAggregatorFactoryPtr = std::shared_ptr<StreamingAggregatorFactory>;

} // namespace starrocks

View File

@ -88,6 +88,13 @@ Status CrossJoinNode::init(const TPlanNode& tnode, RuntimeState* state) {
_build_runtime_filters.emplace_back(rf_desc);
}
}
if (tnode.nestloop_join_node.__isset.common_slot_map) {
for (const auto& [key, val] : tnode.nestloop_join_node.common_slot_map) {
ExprContext* context;
RETURN_IF_ERROR(Expr::create_expr_tree(_pool, val, &context, state, true));
_common_expr_ctxs.insert({key, context});
}
}
return Status::OK();
}
@ -608,10 +615,10 @@ std::vector<std::shared_ptr<pipeline::OperatorFactory>> CrossJoinNode::_decompos
OpFactories left_ops = _children[0]->decompose_to_pipeline(context);
// communication with CrossJoinRight through shared_data.
auto left_factory =
std::make_shared<ProbeFactory>(context->next_operator_id(), id(), _row_descriptor, child(0)->row_desc(),
child(1)->row_desc(), _sql_join_conjuncts, std::move(_join_conjuncts),
std::move(_conjunct_ctxs), std::move(cross_join_context), _join_op);
auto left_factory = std::make_shared<ProbeFactory>(
context->next_operator_id(), id(), _row_descriptor, child(0)->row_desc(), child(1)->row_desc(),
_sql_join_conjuncts, std::move(_join_conjuncts), std::move(_conjunct_ctxs), std::move(_common_expr_ctxs),
std::move(cross_join_context), _join_op);
// Initialize OperatorFactory's fields involving runtime filters.
this->init_runtime_filter_for_operator(left_factory.get(), context, rc_rf_probe_collector);
if (!context->is_colocate_group()) {

View File

@ -128,6 +128,8 @@ private:
std::vector<RuntimeFilterBuildDescriptor*> _build_runtime_filters;
bool _interpolate_passthrough = false;
std::map<SlotId, ExprContext*> _common_expr_ctxs;
};
} // namespace starrocks

View File

@ -253,14 +253,13 @@ Status CSVScanner::_init_reader() {
_curr_reader = std::make_unique<ScannerCSVReader>(file, _state, _parse_options);
_curr_reader->set_counter(_counter);
if (_scan_range.ranges[_curr_file_index].size > 0 &&
_scan_range.ranges[_curr_file_index].format_type == TFileFormatType::FORMAT_CSV_PLAIN) {
if (range_desc.size > 0 && range_desc.format_type == TFileFormatType::FORMAT_CSV_PLAIN) {
// Does not set limit for compressed file.
_curr_reader->set_limit(_scan_range.ranges[_curr_file_index].size);
_curr_reader->set_limit(range_desc.size);
}
if (_scan_range.ranges[_curr_file_index].start_offset > 0) {
if (range_desc.start_offset > 0) {
// Skip the first record started from |start_offset|.
auto status = file->skip(_scan_range.ranges[_curr_file_index].start_offset);
auto status = file->skip(range_desc.start_offset);
if (status.is_time_out()) {
// open this file next time
--_curr_file_index;
@ -271,7 +270,8 @@ Status CSVScanner::_init_reader() {
RETURN_IF_ERROR(_curr_reader->next_record(&dummy));
}
if (_parse_options.skip_header) {
// only the first range needs to skip header
if (_parse_options.skip_header && range_desc.start_offset == 0) {
for (int64_t i = 0; i < _parse_options.skip_header; i++) {
CSVReader::Record dummy;
auto st = _curr_reader->next_record(&dummy);

View File

@ -35,7 +35,6 @@
#pragma once
#include <functional>
#include <mutex>
#include <sstream>
#include <vector>
@ -48,10 +47,7 @@
#include "runtime/descriptors.h"
#include "runtime/mem_pool.h"
#include "runtime/query_statistics.h"
#include "service/backend_options.h"
#include "util/blocking_queue.hpp"
#include "util/runtime_profile.h"
#include "util/uid_util.h" // for print_id
namespace starrocks {

View File

@ -120,6 +120,16 @@ public:
const ChunkPtr& back() { return _chunks.back(); }
void append_selective_to_back(const Chunk& src, const uint32_t* indexes, uint32_t from, uint32_t size) {
auto& chunk = _chunks.back();
const size_t prev_bytes = chunk->memory_usage();
chunk->append_selective(src, indexes, from, size);
const size_t new_bytes = chunk->memory_usage();
_tracker->consume(new_bytes - prev_bytes);
}
// The channel is considered full once it buffers 4 chunks or its tracked memory
// exceeds config::partition_hash_join_probe_limit_size.
bool is_full() const {
    return _chunks.size() >= 4 || _tracker->consumption() > config::partition_hash_join_probe_limit_size;
}
@ -213,10 +223,10 @@ Status PartitionedHashJoinProberImpl::push_probe_chunk(RuntimeState* state, Chun
}
std::vector<uint32_t> hash_values;
{
hash_values.assign(num_rows, HashUtil::FNV_SEED);
hash_values.assign(num_rows, 0);
for (const ColumnPtr& column : partition_columns) {
column->fnv_hash(hash_values.data(), 0, num_rows);
column->crc32_hash(hash_values.data(), 0, num_rows);
}
// find partition id
for (size_t i = 0; i < hash_values.size(); ++i) {
@ -329,7 +339,9 @@ StatusOr<ChunkPtr> PartitionedHashJoinProberImpl::probe_remain(RuntimeState* sta
}
void PartitionedHashJoinProberImpl::reset(RuntimeState* runtime_state) {
_probers.clear();
for (auto& prober : _probers) {
prober->reset(runtime_state);
}
_partition_input_channels.clear();
_all_input_finished = false;
_remain_partition_idx = 0;
@ -362,7 +374,7 @@ bool SingleHashJoinBuilder::anti_join_key_column_has_null() const {
return false;
}
Status SingleHashJoinBuilder::do_append_chunk(const ChunkPtr& chunk) {
Status SingleHashJoinBuilder::do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) {
if (UNLIKELY(_ht.get_row_count() + chunk->num_rows() >= max_hash_table_element_size)) {
return Status::NotSupported(strings::Substitute("row count of right table in hash join > $0", UINT32_MAX));
}
@ -404,7 +416,7 @@ enum class CacheLevel { L2, L3, MEMORY };
class AdaptivePartitionHashJoinBuilder final : public HashJoinBuilder {
public:
AdaptivePartitionHashJoinBuilder(HashJoiner& hash_joiner);
explicit AdaptivePartitionHashJoinBuilder(HashJoiner& hash_joiner);
~AdaptivePartitionHashJoinBuilder() override = default;
void create(const HashTableParam& param) override;
@ -413,7 +425,7 @@ public:
void reset(const HashTableParam& param) override;
Status do_append_chunk(const ChunkPtr& chunk) override;
Status do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) override;
Status build(RuntimeState* state) override;
@ -432,27 +444,53 @@ public:
void clone_readable(HashJoinBuilder* builder) override;
Status prepare_for_spill_start(RuntimeState* state) override;
ChunkPtr convert_to_spill_schema(const ChunkPtr& chunk) const override;
private:
size_t _estimated_row_size(const HashTableParam& param) const;
size_t _estimated_probe_cost(const HashTableParam& param) const;
static double _calculate_cache_miss_factor(const HashJoiner& hash_joiner);
size_t _estimate_hash_table_probing_bytes_per_row(const HashTableParam& param) const;
size_t _estimate_probe_row_bytes(const HashTableParam& param) const;
template <CacheLevel T>
size_t _estimated_build_cost(size_t build_row_size) const;
void _adjust_partition_rows(size_t build_row_size);
size_t _estimate_cost_by_bytes(size_t row_bytes) const;
void _init_partition_nums(const HashTableParam& param);
Status _convert_to_single_partition();
Status _append_chunk_to_partitions(const ChunkPtr& chunk);
void _adjust_partition_rows(size_t hash_table_bytes_per_row, size_t hash_table_probing_bytes_per_row);
Status _do_append_chunk(RuntimeState* state, const ChunkPtr& chunk);
Status _append_chunk_to_partitions(RuntimeState* state, const ChunkPtr& chunk);
Status _transfer_to_appending_stage(RuntimeState* state);
Status _convert_to_single_partition(RuntimeState* state);
Status _flush_buffer_chunks(RuntimeState* state);
bool _need_partition_join_for_build(size_t ht_num_rows) const;
bool _need_partition_join_for_append(size_t ht_num_rows) const;
private:
std::vector<std::unique_ptr<SingleHashJoinBuilder>> _builders;
size_t _partition_num = 0;
size_t _partition_join_min_rows = 0;
size_t _partition_join_max_rows = 0;
// Split append chunk into two stages:
// - BUFFERING: buffers chunks without partitioning until the number of rows exceeds _partition_join_l2_max_rows or _partition_join_l3_max_rows.
// - APPENDING: partitions all incoming chunks.
enum class Stage { BUFFERING, APPENDING };
Stage _stage = Stage::BUFFERING;
MemTracker _mem_tracker;
std::vector<PartitionChunkChannel> _partition_input_channels;
std::vector<ChunkPtr> _unpartition_chunks;
size_t _probe_estimated_costs = 0;
size_t _partition_num = 0;
size_t _hash_table_probing_bytes_per_row = 0;
size_t _hash_table_bytes_per_row = 0;
size_t _partition_join_l2_min_rows = 0;
size_t _partition_join_l2_max_rows = 0;
size_t _partition_join_l3_min_rows = 0;
size_t _partition_join_l3_max_rows = 0;
size_t _probe_row_shuffle_cost = 0;
size_t _l2_benefit = 0;
size_t _l3_benefit = 0;
size_t _fit_L2_cache_max_rows = 0;
size_t _fit_L3_cache_max_rows = 0;
@ -461,10 +499,15 @@ private:
size_t _L3_cache_size = 0;
size_t _pushed_chunks = 0;
// Shared read-only data accessed concurrently by threads can lead to better cache performance.
// Therefore, for broadcast joins, this parameter is used to reduce benefit of partitioned hash joins as the number
// of prober threads (DOP) increases.
const double _cache_miss_factor;
};
AdaptivePartitionHashJoinBuilder::AdaptivePartitionHashJoinBuilder(HashJoiner& hash_joiner)
: HashJoinBuilder(hash_joiner) {
: HashJoinBuilder(hash_joiner), _cache_miss_factor(_calculate_cache_miss_factor(hash_joiner)) {
static constexpr size_t DEFAULT_L2_CACHE_SIZE = 1 * 1024 * 1024;
static constexpr size_t DEFAULT_L3_CACHE_SIZE = 32 * 1024 * 1024;
const auto& cache_sizes = CpuInfo::get_cache_sizes();
@ -474,100 +517,173 @@ AdaptivePartitionHashJoinBuilder::AdaptivePartitionHashJoinBuilder(HashJoiner& h
_L3_cache_size = _L3_cache_size ? _L3_cache_size : DEFAULT_L3_CACHE_SIZE;
}
size_t AdaptivePartitionHashJoinBuilder::_estimated_row_size(const HashTableParam& param) const {
// Estimates how much of the partitioned-join cache benefit remains when several
// probers share one broadcast hash table: concurrent read-only access already
// reuses cache lines, so the per-prober benefit shrinks as prober DOP grows.
// Returns a factor in (0, 1]: 1.0 for non-broadcast joins or DOP <= 1, decreasing
// by 0.1 per extra prober, and a flat 0.1 once DOP exceeds 8.
// NOTE(review): the factor jumps from 0.3 (DOP=8) to 0.1 (DOP=9) instead of 0.2 —
// confirm the discontinuity is intended.
double AdaptivePartitionHashJoinBuilder::_calculate_cache_miss_factor(const HashJoiner& hash_joiner) {
    if (hash_joiner.distribution_mode() != TJoinDistributionMode::BROADCAST) {
        return 1.0; // No broadcast join, no cache reuse between different probers.
    }
    const size_t max_prober_dop = hash_joiner.max_dop();
    if (max_prober_dop <= 1) {
        return 1.0;
    }
    if (max_prober_dop > 8) {
        return 0.1;
    }
    return 1 - (max_prober_dop - 1) * 0.1;
}
size_t AdaptivePartitionHashJoinBuilder::_estimate_hash_table_probing_bytes_per_row(const HashTableParam& param) const {
size_t estimated_each_row = 0;
// Probing a row needs to:
// 1. touch the `first` and `next` vectors,
// 2. compare join keys between builder and prober,
// 3. output columns from the build side.
// 1. `first` and `next` bytes
estimated_each_row += 8;
// 2. key bytes
for (const auto& join_key : param.join_keys) {
if (join_key.type != nullptr) {
estimated_each_row += get_size_of_fixed_length_type(join_key.type->type);
// The benefit from non-fixed key columns is less than those from fixed key columns, so the penalty (/4) is applied here.
estimated_each_row += type_estimated_overhead_bytes(join_key.type->type) / 4;
}
}
// 3. output bytes
for (auto* tuple : param.build_row_desc->tuple_descriptors()) {
for (auto slot : tuple->slots()) {
if (param.build_output_slots.contains(slot->id())) {
for (const auto* slot : tuple->slots()) {
if (param.build_output_slots.empty() || param.build_output_slots.contains(slot->id())) {
estimated_each_row += get_size_of_fixed_length_type(slot->type().type);
estimated_each_row += type_estimated_overhead_bytes(slot->type().type);
}
}
}
// for hash table bucket
estimated_each_row += 4;
return estimated_each_row;
return std::max<size_t>(estimated_each_row * _cache_miss_factor, 1);
}
// We could use a better estimation model.
size_t AdaptivePartitionHashJoinBuilder::_estimated_probe_cost(const HashTableParam& param) const {
size_t AdaptivePartitionHashJoinBuilder::_estimate_probe_row_bytes(const HashTableParam& param) const {
size_t size = 0;
// shuffling probe bytes
for (auto* tuple : param.probe_row_desc->tuple_descriptors()) {
for (auto slot : tuple->slots()) {
if (param.probe_output_slots.contains(slot->id())) {
size += get_size_of_fixed_length_type(slot->type().type);
size += type_estimated_overhead_bytes(slot->type().type);
}
for (const auto* slot : tuple->slots()) {
size += get_size_of_fixed_length_type(slot->type().type);
size += type_estimated_overhead_bytes(slot->type().type);
}
}
// we define probe cost is bytes size * 6
return size * 6;
return std::max<size_t>(size, 1);
}
template <>
size_t AdaptivePartitionHashJoinBuilder::_estimated_build_cost<CacheLevel::L2>(size_t build_row_size) const {
return build_row_size / 2;
size_t AdaptivePartitionHashJoinBuilder::_estimate_cost_by_bytes<CacheLevel::L2>(size_t row_bytes) const {
return row_bytes / 2;
}
template <>
size_t AdaptivePartitionHashJoinBuilder::_estimated_build_cost<CacheLevel::L3>(size_t build_row_size) const {
return build_row_size;
size_t AdaptivePartitionHashJoinBuilder::_estimate_cost_by_bytes<CacheLevel::L3>(size_t row_bytes) const {
return row_bytes;
}
template <>
size_t AdaptivePartitionHashJoinBuilder::_estimated_build_cost<CacheLevel::MEMORY>(size_t build_row_size) const {
return build_row_size * 2;
size_t AdaptivePartitionHashJoinBuilder::_estimate_cost_by_bytes<CacheLevel::MEMORY>(size_t row_bytes) const {
return row_bytes * 2;
}
void AdaptivePartitionHashJoinBuilder::_adjust_partition_rows(size_t build_row_size) {
build_row_size = std::max(build_row_size, 4UL);
_fit_L2_cache_max_rows = _L2_cache_size / build_row_size;
_fit_L3_cache_max_rows = _L3_cache_size / build_row_size;
bool AdaptivePartitionHashJoinBuilder::_need_partition_join_for_build(size_t ht_num_rows) const {
return (_partition_join_l2_min_rows < ht_num_rows && ht_num_rows <= _partition_join_l2_max_rows) ||
(_partition_join_l3_min_rows < ht_num_rows && ht_num_rows <= _partition_join_l3_max_rows);
}
// If the hash table is smaller than the L2 cache, we don't think partitioned hash join is needed.
_partition_join_min_rows = _fit_L2_cache_max_rows;
// If the hash table after partitioning still can't fit into L3, we don't think partitioned hash join is needed.
_partition_join_max_rows = _fit_L3_cache_max_rows * _partition_num;
bool AdaptivePartitionHashJoinBuilder::_need_partition_join_for_append(size_t ht_num_rows) const {
return ht_num_rows <= _partition_join_l2_max_rows || ht_num_rows <= _partition_join_l3_max_rows;
}
if (_probe_estimated_costs + _estimated_build_cost<CacheLevel::L2>(build_row_size) <
_estimated_build_cost<CacheLevel::L3>(build_row_size)) {
// overhead after hash table partitioning + probe extra cost < cost before partitioning
// nothing to do
} else if (_probe_estimated_costs + _estimated_build_cost<CacheLevel::L3>(build_row_size) <
_estimated_build_cost<CacheLevel::MEMORY>(build_row_size)) {
// It is only after this that performance gains can be realized beyond the L3 cache.
_partition_join_min_rows = _fit_L3_cache_max_rows;
void AdaptivePartitionHashJoinBuilder::_adjust_partition_rows(size_t hash_table_bytes_per_row,
size_t hash_table_probing_bytes_per_row) {
if (hash_table_bytes_per_row == _hash_table_bytes_per_row &&
hash_table_probing_bytes_per_row == _hash_table_probing_bytes_per_row) {
return; // No need to adjust partition rows.
}
_hash_table_bytes_per_row = hash_table_bytes_per_row;
_hash_table_probing_bytes_per_row = hash_table_probing_bytes_per_row;
hash_table_bytes_per_row = std::max<size_t>(hash_table_bytes_per_row, 1);
_fit_L2_cache_max_rows = _L2_cache_size / hash_table_bytes_per_row;
_fit_L3_cache_max_rows = _L3_cache_size / hash_table_bytes_per_row;
_partition_join_l2_min_rows = -1;
_partition_join_l2_max_rows = 0;
_partition_join_l3_min_rows = -1;
_partition_join_l3_max_rows = 0;
const auto l2_benefit = _estimate_cost_by_bytes<CacheLevel::L3>(hash_table_probing_bytes_per_row) -
_estimate_cost_by_bytes<CacheLevel::L2>(hash_table_probing_bytes_per_row);
const auto l3_benefit = _estimate_cost_by_bytes<CacheLevel::MEMORY>(hash_table_probing_bytes_per_row) -
_estimate_cost_by_bytes<CacheLevel::L3>(hash_table_probing_bytes_per_row);
if (_probe_row_shuffle_cost < l3_benefit) { // Partitioned joins benefit from L3 cache.
// Partitioned joins benefit from L3 cache when probing a row has cache miss in non-partitioned join but not in partitioned join.
// 1. min_rows > (l3_cache_size/hash_table_bytes_per_row)*(l3_benefit/(l3_benefit-_probe_row_shuffle_cost)), because:
// - l3_benefit * non_partition_cache_miss_rate > _probe_row_shuffle_cost
// - non_partition_cache_miss_rate = 1 - l3_cache_size/(min_rows*hash_table_bytes_per_row)
// 2. max_rows < (l3_cache_size/hash_table_bytes_per_row)*(l3_benefit/_probe_row_shuffle_cost)*num_partitions, because:
// - l3_benefit * partition_cache_hit_rate > _probe_row_shuffle_cost
// - partition_cache_hit_rate = l3_cache_size/(max_rows_per_partition*hash_table_bytes_per_row)
_partition_join_l3_min_rows = _fit_L3_cache_max_rows * l3_benefit / (l3_benefit - _probe_row_shuffle_cost);
_partition_join_l3_max_rows = _fit_L3_cache_max_rows * _partition_num * l3_benefit / _probe_row_shuffle_cost;
_partition_join_l3_max_rows *= 2; // relax the restriction
if (_probe_row_shuffle_cost < l2_benefit) { // Partitioned joins benefit from L2 cache.
_partition_join_l2_min_rows = _fit_L2_cache_max_rows * l2_benefit / (l2_benefit - _probe_row_shuffle_cost);
_partition_join_l2_min_rows *= 2; // Make the restriction more stringent
_partition_join_l2_max_rows =
(_fit_L2_cache_max_rows * _partition_num) * l2_benefit / _probe_row_shuffle_cost;
}
} else {
// Partitioned joins don't have performance gains. Not using partition hash join.
_partition_num = 1;
}
VLOG_OPERATOR << "TRACE:"
<< "partition_num=" << _partition_num << " partition_join_min_rows=" << _partition_join_min_rows
<< " partition_join_max_rows=" << _partition_join_max_rows << " probe cost=" << _probe_estimated_costs
<< " build cost L2=" << _estimated_build_cost<CacheLevel::L2>(build_row_size)
<< " build cost L3=" << _estimated_build_cost<CacheLevel::L3>(build_row_size)
<< " build cost Mem=" << _estimated_build_cost<CacheLevel::MEMORY>(build_row_size);
_l2_benefit = l2_benefit;
_l3_benefit = l3_benefit;
VLOG_OPERATOR << "TRACE: _adjust_partition_rows "
<< "[partition_num=" << _partition_num << "] "
<< "[partition_join_l2_min_rows=" << _partition_join_l2_min_rows << "] "
<< "[partition_join_l2_max_rows=" << _partition_join_l2_max_rows << "] "
<< "[partition_join_l3_min_rows=" << _partition_join_l3_min_rows << "] "
<< "[partition_join_l3_max_rows=" << _partition_join_l3_max_rows << "] "
<< "[hash_table_probing_bytes_per_row=" << hash_table_probing_bytes_per_row << "] "
<< "[hash_table_bytes_per_row=" << hash_table_bytes_per_row << "] "
<< "[l2_benefit=" << l2_benefit << "] "
<< "[l3_benefit=" << l3_benefit << "] "
<< "[probe_shuffle_cost=" << _probe_row_shuffle_cost << "] ";
}
void AdaptivePartitionHashJoinBuilder::_init_partition_nums(const HashTableParam& param) {
_partition_num = 16;
size_t estimated_bytes_each_row = _estimated_row_size(param);
_probe_row_shuffle_cost =
std::max<size_t>(_estimate_cost_by_bytes<CacheLevel::L3>(_estimate_probe_row_bytes(param)), 1);
_probe_estimated_costs = _estimated_probe_cost(param);
const size_t hash_table_probing_bytes_per_row = _estimate_hash_table_probing_bytes_per_row(param);
_adjust_partition_rows(1, hash_table_probing_bytes_per_row);
_adjust_partition_rows(estimated_bytes_each_row);
COUNTER_SET(_hash_joiner.build_metrics().partition_nums, (int64_t)_partition_num);
COUNTER_SET(_hash_joiner.build_metrics().partition_nums, static_cast<int64_t>(_partition_num));
}
void AdaptivePartitionHashJoinBuilder::create(const HashTableParam& param) {
_init_partition_nums(param);
if (_partition_num > 1) {
_partition_input_channels.resize(_partition_num, PartitionChunkChannel(&_mem_tracker));
}
for (size_t i = 0; i < _partition_num; ++i) {
_builders.emplace_back(std::make_unique<SingleHashJoinBuilder>(_hash_joiner));
_builders.back()->create(param);
@ -579,10 +695,14 @@ void AdaptivePartitionHashJoinBuilder::close() {
builder->close();
}
_builders.clear();
_partition_input_channels.clear();
_partition_num = 0;
_partition_join_min_rows = 0;
_partition_join_max_rows = 0;
_probe_estimated_costs = 0;
_partition_join_l2_min_rows = 0;
_partition_join_l2_max_rows = 0;
_partition_join_l3_min_rows = 0;
_partition_join_l3_max_rows = 0;
_probe_row_shuffle_cost = 0;
_hash_table_probing_bytes_per_row = 0;
_fit_L2_cache_max_rows = 0;
_fit_L3_cache_max_rows = 0;
_pushed_chunks = 0;
@ -637,17 +757,70 @@ int64_t AdaptivePartitionHashJoinBuilder::ht_mem_usage() const {
[](int64_t sum, const auto& builder) { return sum + builder->ht_mem_usage(); });
}
Status AdaptivePartitionHashJoinBuilder::_convert_to_single_partition() {
Status AdaptivePartitionHashJoinBuilder::_convert_to_single_partition(RuntimeState* state) {
VLOG_OPERATOR << "TRACE: convert_to_single_partition "
<< "[partition_num=" << _partition_num << "] "
<< "[partition_join_l2_min_rows=" << _partition_join_l2_min_rows << "] "
<< "[partition_join_l2_max_rows=" << _partition_join_l2_max_rows << "] "
<< "[partition_join_l3_min_rows=" << _partition_join_l3_min_rows << "] "
<< "[partition_join_l3_max_rows=" << _partition_join_l3_max_rows << "] "
<< "[hash_table_row_count=" << hash_table_row_count() << "] ";
// merge all partition data to the first partition
for (size_t i = 1; i < _builders.size(); ++i) {
_builders[0]->hash_table().merge_ht(_builders[i]->hash_table());
if (_stage == Stage::BUFFERING) {
_mem_tracker.set(0);
for (const auto& unpartition_chunk : _unpartition_chunks) {
RETURN_IF_ERROR(_builders[0]->do_append_chunk(state, unpartition_chunk));
}
_unpartition_chunks.clear();
} else {
for (size_t i = 0; i < _builders.size(); ++i) {
if (i != 0) {
_builders[0]->hash_table().merge_ht(_builders[i]->hash_table());
}
auto& channel = _partition_input_channels[i];
while (!channel.is_empty()) {
RETURN_IF_ERROR(_builders[0]->do_append_chunk(state, channel.pull()));
}
}
_partition_input_channels.clear();
}
_builders.resize(1);
_partition_num = 1;
COUNTER_SET(_hash_joiner.build_metrics().partition_nums, static_cast<int64_t>(1));
return Status::OK();
}
Status AdaptivePartitionHashJoinBuilder::_append_chunk_to_partitions(const ChunkPtr& chunk) {
// Switches from the BUFFERING stage to APPENDING: replays every chunk that was
// buffered unpartitioned through the partitioning path, then releases the buffer.
Status AdaptivePartitionHashJoinBuilder::_transfer_to_appending_stage(RuntimeState* state) {
    _stage = Stage::APPENDING;
    _mem_tracker.set(0); // All the buffered chunks are moved to the partition builders, so clear the memory tracker.
    for (const auto& unpartition_chunk : _unpartition_chunks) {
        RETURN_IF_ERROR(_append_chunk_to_partitions(state, unpartition_chunk));
    }
    _unpartition_chunks.clear();
    return Status::OK();
}
// Routes an input chunk according to the current stage:
// - BUFFERING: holds the chunk unpartitioned; once the total row count reaches a
//   partition-join lower bound, replays everything via the APPENDING path.
// - APPENDING: partitions the chunk across the per-partition builders directly.
Status AdaptivePartitionHashJoinBuilder::_do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) {
    if (_stage == Stage::BUFFERING) {
        _mem_tracker.consume(chunk->memory_usage());
        _unpartition_chunks.push_back(chunk);
        const size_t num_rows = hash_table_row_count();
        // The min-rows thresholds are initialized to size_t(-1) in
        // _adjust_partition_rows, so this only fires once partitioning is deemed beneficial.
        if (num_rows >= _partition_join_l2_min_rows || num_rows >= _partition_join_l3_min_rows) {
            RETURN_IF_ERROR(_transfer_to_appending_stage(state));
        }
        return Status::OK();
    } else {
        return _append_chunk_to_partitions(state, chunk);
    }
}
Status AdaptivePartitionHashJoinBuilder::_append_chunk_to_partitions(RuntimeState* state, const ChunkPtr& chunk) {
const std::vector<ExprContext*>& build_partition_keys = _hash_joiner.build_expr_ctxs();
size_t num_rows = chunk->num_rows();
@ -660,10 +833,10 @@ Status AdaptivePartitionHashJoinBuilder::_append_chunk_to_partitions(const Chunk
}
std::vector<uint32_t> hash_values;
{
hash_values.assign(num_rows, HashUtil::FNV_SEED);
hash_values.assign(num_rows, 0);
for (const ColumnPtr& column : partition_columns) {
column->fnv_hash(hash_values.data(), 0, num_rows);
column->crc32_hash(hash_values.data(), 0, num_rows);
}
// find partition id
for (size_t i = 0; i < hash_values.size(); ++i) {
@ -698,45 +871,83 @@ Status AdaptivePartitionHashJoinBuilder::_append_chunk_to_partitions(const Chunk
if (size == 0) {
continue;
}
// TODO: make builder implements append with selective
auto partition_chunk = chunk->clone_empty();
partition_chunk->append_selective(*chunk, selection.data(), from, size);
RETURN_IF_ERROR(_builders[i]->append_chunk(std::move(partition_chunk)));
auto& channel = _partition_input_channels[i];
if (channel.is_empty()) {
channel.push(chunk->clone_empty());
}
if (channel.back()->num_rows() + size <= state->chunk_size()) {
channel.append_selective_to_back(*chunk, selection.data(), from, size);
} else {
channel.push(chunk->clone_empty());
channel.append_selective_to_back(*chunk, selection.data(), from, size);
}
while (channel.is_full()) {
RETURN_IF_ERROR(_builders[i]->append_chunk(state, channel.pull()));
}
}
return Status::OK();
}
Status AdaptivePartitionHashJoinBuilder::do_append_chunk(const ChunkPtr& chunk) {
if (_partition_num > 1 && hash_table_row_count() > _partition_join_max_rows) {
RETURN_IF_ERROR(_convert_to_single_partition());
Status AdaptivePartitionHashJoinBuilder::do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) {
if (_partition_num > 1 && !_need_partition_join_for_append(hash_table_row_count())) {
RETURN_IF_ERROR(_convert_to_single_partition(state));
}
if (_partition_num > 1 && ++_pushed_chunks % 8 == 0) {
size_t build_row_size = ht_mem_usage() / hash_table_row_count();
_adjust_partition_rows(build_row_size);
const size_t build_row_size = (ht_mem_usage() + _mem_tracker.consumption()) / hash_table_row_count();
_adjust_partition_rows(build_row_size, _hash_table_probing_bytes_per_row);
if (_partition_num == 1) {
RETURN_IF_ERROR(_convert_to_single_partition());
RETURN_IF_ERROR(_convert_to_single_partition(state));
}
}
if (_partition_num > 1) {
RETURN_IF_ERROR(_append_chunk_to_partitions(chunk));
RETURN_IF_ERROR(_do_append_chunk(state, chunk));
} else {
RETURN_IF_ERROR(_builders[0]->do_append_chunk(chunk));
RETURN_IF_ERROR(_builders[0]->do_append_chunk(state, chunk));
}
return Status::OK();
}
// Before spilling begins, drain any chunks still buffered in the partition
// channels into their hash-table builders. A no-op for single-partition joins.
Status AdaptivePartitionHashJoinBuilder::prepare_for_spill_start(RuntimeState* state) {
    if (_partition_num <= 1) {
        return Status::OK();
    }
    return _flush_buffer_chunks(state);
}
// Delegates spill-schema conversion to the first partition's builder; every
// partition builder is created from the same HashTableParam, so schemas match.
ChunkPtr AdaptivePartitionHashJoinBuilder::convert_to_spill_schema(const ChunkPtr& chunk) const {
    return _builders[0]->convert_to_spill_schema(chunk);
}
// Pushes every chunk still sitting in the per-partition input channels into its
// builder. If we are still BUFFERING, first partition the unpartitioned buffer.
Status AdaptivePartitionHashJoinBuilder::_flush_buffer_chunks(RuntimeState* state) {
    if (_stage == Stage::BUFFERING) {
        RETURN_IF_ERROR(_transfer_to_appending_stage(state));
    }
    for (size_t i = 0; i < _partition_input_channels.size(); ++i) {
        auto& channel = _partition_input_channels[i];
        while (!channel.is_empty()) {
            RETURN_IF_ERROR(_builders[i]->do_append_chunk(state, channel.pull()));
        }
    }
    return Status::OK();
}
Status AdaptivePartitionHashJoinBuilder::build(RuntimeState* state) {
DCHECK_EQ(_partition_num, _builders.size());
if (_partition_num > 1 && hash_table_row_count() < _partition_join_min_rows) {
RETURN_IF_ERROR(_convert_to_single_partition());
if (_partition_num > 1) {
if (!_need_partition_join_for_build(hash_table_row_count())) {
RETURN_IF_ERROR(_convert_to_single_partition(state));
} else {
RETURN_IF_ERROR(_flush_buffer_chunks(state));
}
}
for (auto& builder : _builders) {
@ -769,17 +980,20 @@ std::unique_ptr<HashJoinProberImpl> AdaptivePartitionHashJoinBuilder::create_pro
}
}
void AdaptivePartitionHashJoinBuilder::clone_readable(HashJoinBuilder* builder) {
void AdaptivePartitionHashJoinBuilder::clone_readable(HashJoinBuilder* other_builder) {
for (auto& builder : _builders) {
DCHECK(builder->ready());
}
DCHECK(_ready);
DCHECK_EQ(_partition_num, _builders.size());
auto other = down_cast<AdaptivePartitionHashJoinBuilder*>(builder);
auto other = down_cast<AdaptivePartitionHashJoinBuilder*>(other_builder);
other->_builders.clear();
other->_partition_num = _partition_num;
other->_partition_join_max_rows = _partition_join_max_rows;
other->_partition_join_min_rows = _partition_join_min_rows;
other->_partition_join_l2_min_rows = _partition_join_l2_min_rows;
other->_partition_join_l2_max_rows = _partition_join_l2_max_rows;
other->_partition_join_l3_min_rows = _partition_join_l3_min_rows;
other->_partition_join_l3_max_rows = _partition_join_l3_max_rows;
other->_partition_join_l3_max_rows = _partition_join_l3_max_rows;
other->_ready = _ready;
for (size_t i = 0; i < _partition_num; ++i) {
other->_builders.emplace_back(std::make_unique<SingleHashJoinBuilder>(_hash_joiner));

View File

@ -92,11 +92,11 @@ public:
virtual void create(const HashTableParam& param) = 0;
// append chunk to hash table
Status append_chunk(const ChunkPtr& chunk) {
Status append_chunk(RuntimeState* state, const ChunkPtr& chunk) {
_inc_row_count(chunk->num_rows());
return do_append_chunk(chunk);
return do_append_chunk(state, chunk);
}
virtual Status do_append_chunk(const ChunkPtr& chunk) = 0;
virtual Status do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) = 0;
virtual Status build(RuntimeState* state) = 0;
@ -125,6 +125,7 @@ public:
// clone readable to to builder
virtual void clone_readable(HashJoinBuilder* builder) = 0;
virtual Status prepare_for_spill_start(RuntimeState* state) { return Status::OK(); }
virtual ChunkPtr convert_to_spill_schema(const ChunkPtr& chunk) const = 0;
protected:
@ -149,7 +150,7 @@ public:
void reset(const HashTableParam& param) override;
Status do_append_chunk(const ChunkPtr& chunk) override;
Status do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) override;
Status build(RuntimeState* state) override;

View File

@ -126,6 +126,14 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) {
_build_equivalence_partition_expr_ctxs = _build_expr_ctxs;
}
if (tnode.__isset.hash_join_node && tnode.hash_join_node.__isset.common_slot_map) {
for (const auto& [key, val] : tnode.hash_join_node.common_slot_map) {
ExprContext* context;
RETURN_IF_ERROR(Expr::create_expr_tree(_pool, val, &context, state, true));
_common_expr_ctxs.insert({key, context});
}
}
RETURN_IF_ERROR(Expr::create_expr_trees(_pool, tnode.hash_join_node.other_join_conjuncts,
&_other_join_conjunct_ctxs, state));
@ -483,8 +491,8 @@ pipeline::OpFactories HashJoinNode::_decompose_to_pipeline(pipeline::PipelineBui
HashJoinerParam param(pool, _hash_join_node, _is_null_safes, _build_expr_ctxs, _probe_expr_ctxs,
_other_join_conjunct_ctxs, _conjunct_ctxs, child(1)->row_desc(), child(0)->row_desc(),
child(1)->type(), child(0)->type(), child(1)->conjunct_ctxs().empty(), _build_runtime_filters,
_output_slots, _output_slots, _distribution_mode, _enable_late_materialization,
_enable_partition_hash_join, _is_skew_join);
_output_slots, _output_slots, context->degree_of_parallelism(), _distribution_mode,
_enable_late_materialization, _enable_partition_hash_join, _is_skew_join, _common_expr_ctxs);
auto hash_joiner_factory = std::make_shared<starrocks::pipeline::HashJoinerFactory>(param);
// Create a shared RefCountedRuntimeFilterCollector

View File

@ -140,6 +140,8 @@ private:
bool _probe_eos = false; // probe table scan finished;
size_t _runtime_join_filter_pushdown_limit = 1024000;
std::map<SlotId, ExprContext*> _common_expr_ctxs;
RuntimeProfile::Counter* _build_timer = nullptr;
RuntimeProfile::Counter* _build_ht_timer = nullptr;
RuntimeProfile::Counter* _copy_right_table_chunk_timer = nullptr;

View File

@ -33,6 +33,7 @@
#include "pipeline/hashjoin/hash_joiner_fwd.h"
#include "runtime/current_thread.h"
#include "simd/simd.h"
#include "storage/chunk_helper.h"
#include "util/runtime_profile.h"
namespace starrocks {
@ -73,6 +74,7 @@ HashJoiner::HashJoiner(const HashJoinerParam& param)
_probe_expr_ctxs(param._probe_expr_ctxs),
_other_join_conjunct_ctxs(param._other_join_conjunct_ctxs),
_conjunct_ctxs(param._conjunct_ctxs),
_common_expr_ctxs(param._common_expr_ctxs),
_build_row_descriptor(param._build_row_descriptor),
_probe_row_descriptor(param._probe_row_descriptor),
_build_node_type(param._build_node_type),
@ -82,6 +84,7 @@ HashJoiner::HashJoiner(const HashJoinerParam& param)
_probe_output_slots(param._probe_output_slots),
_build_runtime_filters(param._build_runtime_filters.begin(), param._build_runtime_filters.end()),
_enable_late_materialization(param._enable_late_materialization),
_max_dop(param._max_dop),
_is_skew_join(param._is_skew_join) {
_is_push_down = param._hash_join_node.is_push_down;
if (_join_type == TJoinOp::LEFT_ANTI_JOIN && param._hash_join_node.is_rewritten_from_not_in) {
@ -157,6 +160,11 @@ void HashJoiner::_init_hash_table_param(HashTableParam* param, RuntimeState* sta
param->column_view_concat_rows_limit = state->column_view_concat_rows_limit();
param->column_view_concat_bytes_limit = state->column_view_concat_bytes_limit();
std::set<SlotId> predicate_slots;
for (const auto& [slot_id, ctx] : _common_expr_ctxs) {
std::vector<SlotId> expr_slots;
ctx->root()->get_slot_ids(&expr_slots);
predicate_slots.insert(expr_slots.begin(), expr_slots.end());
}
for (ExprContext* expr_context : _conjunct_ctxs) {
std::vector<SlotId> expr_slots;
expr_context->root()->get_slot_ids(&expr_slots);
@ -178,7 +186,7 @@ void HashJoiner::_init_hash_table_param(HashTableParam* param, RuntimeState* sta
}
}
}
Status HashJoiner::append_chunk_to_ht(const ChunkPtr& chunk) {
Status HashJoiner::append_chunk_to_ht(RuntimeState* state, const ChunkPtr& chunk) {
if (_phase != HashJoinPhase::BUILD) {
return Status::OK();
}
@ -187,7 +195,7 @@ Status HashJoiner::append_chunk_to_ht(const ChunkPtr& chunk) {
}
update_build_rows(chunk->num_rows());
return _hash_join_builder->append_chunk(chunk);
return _hash_join_builder->append_chunk(state, chunk);
}
Status HashJoiner::append_chunk_to_spill_buffer(RuntimeState* state, const ChunkPtr& chunk) {
@ -387,6 +395,9 @@ Status HashJoiner::_calc_filter_for_other_conjunct(ChunkPtr* chunk, Filter& filt
hit_all = false;
filter.assign((*chunk)->num_rows(), 1);
CommonExprEvalScopeGuard guard(*chunk, _common_expr_ctxs);
RETURN_IF_ERROR(guard.evaluate());
for (auto* ctx : _other_join_conjunct_ctxs) {
ASSIGN_OR_RETURN(ColumnPtr column, ctx->evaluate((*chunk).get()))
size_t true_count = ColumnHelper::count_true_with_notnull(column);
@ -515,6 +526,8 @@ Status HashJoiner::_process_other_conjunct(ChunkPtr* chunk, JoinHashTable& hash_
Status HashJoiner::_process_where_conjunct(ChunkPtr* chunk) {
SCOPED_TIMER(probe_metrics().where_conjunct_evaluate_timer);
CommonExprEvalScopeGuard guard(*chunk, _common_expr_ctxs);
RETURN_IF_ERROR(guard.evaluate());
return ExecNode::eval_conjuncts(_conjunct_ctxs, (*chunk).get());
}

View File

@ -70,9 +70,10 @@ struct HashJoinerParam {
const RowDescriptor& build_row_descriptor, const RowDescriptor& probe_row_descriptor,
TPlanNodeType::type build_node_type, TPlanNodeType::type probe_node_type,
bool build_conjunct_ctxs_is_empty, std::list<RuntimeFilterBuildDescriptor*> build_runtime_filters,
std::set<SlotId> build_output_slots, std::set<SlotId> probe_output_slots,
std::set<SlotId> build_output_slots, std::set<SlotId> probe_output_slots, size_t max_dop,
const TJoinDistributionMode::type distribution_mode, bool enable_late_materialization,
bool enable_partition_hash_join, bool is_skew_join)
bool enable_partition_hash_join, bool is_skew_join,
const std::map<SlotId, ExprContext*>& common_expr_ctxs)
: _pool(pool),
_hash_join_node(hash_join_node),
_is_null_safes(std::move(is_null_safes)),
@ -88,10 +89,12 @@ struct HashJoinerParam {
_build_runtime_filters(std::move(build_runtime_filters)),
_build_output_slots(std::move(build_output_slots)),
_probe_output_slots(std::move(probe_output_slots)),
_max_dop(max_dop),
_distribution_mode(distribution_mode),
_enable_late_materialization(enable_late_materialization),
_enable_partition_hash_join(enable_partition_hash_join),
_is_skew_join(is_skew_join) {}
_is_skew_join(is_skew_join),
_common_expr_ctxs(common_expr_ctxs) {}
HashJoinerParam(HashJoinerParam&&) = default;
HashJoinerParam(HashJoinerParam&) = default;
@ -113,10 +116,13 @@ struct HashJoinerParam {
std::set<SlotId> _build_output_slots;
std::set<SlotId> _probe_output_slots;
size_t _max_dop;
const TJoinDistributionMode::type _distribution_mode;
const bool _enable_late_materialization;
const bool _enable_partition_hash_join;
const bool _is_skew_join;
const std::map<SlotId, ExprContext*> _common_expr_ctxs;
};
inline bool could_short_circuit(TJoinOp::type join_type) {
@ -205,7 +211,7 @@ public:
void enter_eos_phase() { _phase = HashJoinPhase::EOS; }
// build phase
Status append_chunk_to_ht(const ChunkPtr& chunk);
Status append_chunk_to_ht(RuntimeState* state, const ChunkPtr& chunk);
Status append_chunk_to_spill_buffer(RuntimeState* state, const ChunkPtr& chunk);
@ -343,6 +349,9 @@ public:
return DeferOp([this]() { _probe_observable.notify_source_observers(); });
}
size_t max_dop() const { return _max_dop; }
TJoinDistributionMode::type distribution_mode() const { return _hash_join_node.distribution_mode; }
private:
static bool _has_null(const ColumnPtr& column);
@ -361,7 +370,7 @@ private:
const_column->data_column()->assign(chunk->num_rows(), 0);
key_columns.emplace_back(const_column->data_column());
} else {
key_columns.emplace_back(column_ptr);
key_columns.emplace_back(std::move(column_ptr));
}
}
return Status::OK();
@ -433,6 +442,7 @@ private:
const std::vector<ExprContext*>& _other_join_conjunct_ctxs;
// Conjuncts in Join followed by a filter predicate, usually in Where and Having.
const std::vector<ExprContext*>& _conjunct_ctxs;
const std::map<SlotId, ExprContext*>& _common_expr_ctxs;
const RowDescriptor& _build_row_descriptor;
const RowDescriptor& _probe_row_descriptor;
const TPlanNodeType::type _build_node_type;
@ -483,6 +493,8 @@ private:
pipeline::Observable _builder_observable;
pipeline::Observable _probe_observable;
size_t _max_dop = 0;
bool _is_skew_join = false;
};

View File

@ -237,8 +237,9 @@ Status HdfsScanner::get_next(RuntimeState* runtime_state, ChunkPtr* chunk) {
// short circuit for min/max optimization.
if (_scanner_ctx.can_use_min_max_optimization()) {
// 3 means we output 3 values: min, max, and null
_scanner_ctx.append_or_update_min_max_column_to_chunk(chunk, 3);
size_t row_count = (*chunk)->num_rows();
const size_t row_count = 3;
(*chunk)->set_num_rows(row_count);
_scanner_ctx.append_or_update_min_max_column_to_chunk(chunk, row_count);
_scanner_ctx.append_or_update_partition_column_to_chunk(chunk, row_count);
_scanner_ctx.append_or_update_extended_column_to_chunk(chunk, row_count);
_scanner_ctx.no_more_chunks = true;

View File

@ -26,6 +26,7 @@
#include "simd/simd.h"
#include "types/logical_type_infra.h"
#include "util/runtime_profile.h"
#include "util/stack_util.h"
namespace starrocks {
@ -47,6 +48,10 @@ private:
template <LogicalType LT>
static std::pair<bool, JoinHashMapMethodUnaryType> _try_use_range_direct_mapping(RuntimeState* state,
JoinHashTableItems* table_items);
// @return: <can_use, JoinHashMapMethodUnaryType>, where `JoinHashMapMethodUnaryType` is effective only when `can_use` is true.
template <LogicalType LT>
static std::pair<bool, JoinHashMapMethodUnaryType> _try_use_linear_chained(RuntimeState* state,
JoinHashTableItems* table_items);
};
std::tuple<JoinKeyConstructorUnaryType, JoinHashMapMethodUnaryType>
@ -152,6 +157,10 @@ JoinHashMapMethodUnaryType JoinHashMapSelector::_determine_hash_map_method(
}
}
if (const auto [can_use, hash_map_type] = _try_use_linear_chained<LT>(state, table_items); can_use) {
return hash_map_type;
}
return JoinHashMapMethodTypeTraits<JoinHashMapMethodType::BUCKET_CHAINED, LT>::unary_type;
}
});
@ -220,6 +229,28 @@ std::pair<bool, JoinHashMapMethodUnaryType> JoinHashMapSelector::_try_use_range_
return {false, JoinHashMapMethodUnaryType::BUCKET_CHAINED_INT};
}
template <LogicalType LT>
std::pair<bool, JoinHashMapMethodUnaryType> JoinHashMapSelector::_try_use_linear_chained(
RuntimeState* state, JoinHashTableItems* table_items) {
if (!state->enable_hash_join_linear_chained_opt()) {
return {false, JoinHashMapMethodTypeTraits<JoinHashMapMethodType::BUCKET_CHAINED, LT>::unary_type};
}
const uint64_t bucket_size = JoinHashMapHelper::calc_bucket_size(table_items->row_count + 1);
if (bucket_size > LinearChainedJoinHashMap<LT>::max_supported_bucket_size()) {
return {false, JoinHashMapMethodTypeTraits<JoinHashMapMethodType::BUCKET_CHAINED, LT>::unary_type};
}
const bool is_left_anti_join_without_other_conjunct =
(table_items->join_type == TJoinOp::LEFT_ANTI_JOIN || table_items->join_type == TJoinOp::LEFT_SEMI_JOIN) &&
!table_items->with_other_conjunct;
if (is_left_anti_join_without_other_conjunct) {
return {true, JoinHashMapMethodTypeTraits<JoinHashMapMethodType::LINEAR_CHAINED_SET, LT>::unary_type};
} else {
return {true, JoinHashMapMethodTypeTraits<JoinHashMapMethodType::LINEAR_CHAINED, LT>::unary_type};
}
}
// ------------------------------------------------------------------------------------
// JoinHashMap
// ------------------------------------------------------------------------------------
@ -483,6 +514,15 @@ void JoinHashTable::_init_join_keys() {
}
int64_t JoinHashTable::mem_usage() const {
// Theoretically, `_table_items` may be a nullptr after a cancel, even though in practice we havent observed any
// cases where `_table_items` was unexpectedly cleared or left uninitialized.
// To prevent potential null pointer exceptions, we add a defensive check here.
if (_table_items == nullptr) {
LOG(WARNING) << "table_items is nullptr in mem_usage, stack:" << get_stack_trace();
DCHECK(false);
return 0;
}
int64_t usage = 0;
if (_table_items->build_chunk != nullptr) {
usage += _table_items->build_chunk->memory_usage();
@ -617,6 +657,21 @@ void JoinHashTable::merge_ht(const JoinHashTable& ht) {
}
columns[i]->append(*other_columns[i], 1, other_columns[i]->size() - 1);
}
auto& key_columns = _table_items->key_columns;
auto& other_key_columns = ht._table_items->key_columns;
for (size_t i = 0; i < key_columns.size(); i++) {
// If the join key is slot ref, will get from build chunk directly,
// otherwise will append from key_column of input
if (_table_items->join_keys[i].col_ref == nullptr) {
// upgrade to nullable column
if (!key_columns[i]->is_nullable() && other_key_columns[i]->is_nullable()) {
const size_t row_count = key_columns[i]->size();
key_columns[i] = NullableColumn::create(key_columns[i], NullColumn::create(row_count, 0));
}
key_columns[i]->append(*other_key_columns[i]);
}
}
}
ChunkPtr JoinHashTable::convert_to_spill_schema(const ChunkPtr& chunk) const {

View File

@ -327,26 +327,6 @@ private:
HashTableProbeState* _probe_state = nullptr;
};
#define JoinHashMapForOneKey(LT) JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::BUCKET_CHAINED>
#define JoinHashMapForDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::DIRECT_MAPPING>
#define JoinHashMapForFixedSizeKey(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::BUCKET_CHAINED>
#define JoinHashMapForSerializedKey(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED, JoinHashMapMethodType::BUCKET_CHAINED>
#define JoinHashMapForOneKeyRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::RANGE_DIRECT_MAPPING>
#define JoinHashSetForOneKeyRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::RANGE_DIRECT_MAPPING_SET>
#define JoinHashMapForOneKeyDenseRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::DENSE_RANGE_DIRECT_MAPPING>
#define JoinHashMapForFixedSizeKeyRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::RANGE_DIRECT_MAPPING>
#define JoinHashSetForFixedSizeKeyRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::RANGE_DIRECT_MAPPING_SET>
#define JoinHashMapForFixedSizeKeyDenseRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::DENSE_RANGE_DIRECT_MAPPING>
// ------------------------------------------------------------------------------------
// JoinHashTable
// ------------------------------------------------------------------------------------
@ -420,42 +400,55 @@ private:
void _remove_duplicate_index_for_right_anti_join(Filter* filter);
void _remove_duplicate_index_for_full_outer_join(Filter* filter);
using JoinHashMapVariant =
std::variant<std::unique_ptr<JoinHashMapForEmpty>, //
std::unique_ptr<JoinHashMapForDirectMapping(TYPE_BOOLEAN)>,
std::unique_ptr<JoinHashMapForDirectMapping(TYPE_TINYINT)>,
std::unique_ptr<JoinHashMapForDirectMapping(TYPE_SMALLINT)>,
#define JoinHashMapForIntBigintKey(MT) \
std::unique_ptr<JoinHashMap<TYPE_INT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_BIGINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr< \
JoinHashMap<TYPE_INT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_BIGINT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, \
JoinHashMapMethodType::MT>>
std::unique_ptr<JoinHashMapForOneKey(TYPE_INT)>, //
std::unique_ptr<JoinHashMapForOneKey(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_LARGEINT)>, //
std::unique_ptr<JoinHashMapForOneKey(TYPE_FLOAT)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DOUBLE)>, //
std::unique_ptr<JoinHashMapForOneKey(TYPE_VARCHAR)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DATE)>, //
std::unique_ptr<JoinHashMapForOneKey(TYPE_DATETIME)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DECIMALV2)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DECIMAL32)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DECIMAL64)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DECIMAL128)>,
#define JoinHashMapForSmallKey(MT) \
std::unique_ptr<JoinHashMap<TYPE_BOOLEAN, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_TINYINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_SMALLINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>
std::unique_ptr<JoinHashMapForSerializedKey(TYPE_VARCHAR)>,
std::unique_ptr<JoinHashMapForFixedSizeKey(TYPE_INT)>,
std::unique_ptr<JoinHashMapForFixedSizeKey(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForFixedSizeKey(TYPE_LARGEINT)>,
#define JoinHashMapForNonSmallKey(MT) \
std::unique_ptr<JoinHashMap<TYPE_INT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_BIGINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_LARGEINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_FLOAT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DOUBLE, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DATE, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DATETIME, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DECIMALV2, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DECIMAL32, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DECIMAL64, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DECIMAL128, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_VARCHAR, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
\
std::unique_ptr< \
JoinHashMap<TYPE_INT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_BIGINT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, \
JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_LARGEINT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, \
JoinHashMapMethodType::MT>>, \
\
std::unique_ptr<JoinHashMap<TYPE_VARCHAR, JoinKeyConstructorType::SERIALIZED, JoinHashMapMethodType::MT>>
std::unique_ptr<JoinHashMapForOneKeyRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashMapForOneKeyRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashSetForOneKeyRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashSetForOneKeyRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForOneKeyDenseRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashMapForOneKeyDenseRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForFixedSizeKeyRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashMapForFixedSizeKeyRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashSetForFixedSizeKeyRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashSetForFixedSizeKeyRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForFixedSizeKeyDenseRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashMapForFixedSizeKeyDenseRangeDirectMapping(TYPE_BIGINT)>>;
using JoinHashMapVariant = std::variant<std::unique_ptr<JoinHashMapForEmpty>,
JoinHashMapForSmallKey(DIRECT_MAPPING), //
JoinHashMapForNonSmallKey(BUCKET_CHAINED), //
JoinHashMapForNonSmallKey(LINEAR_CHAINED), //
JoinHashMapForNonSmallKey(LINEAR_CHAINED_SET), //
JoinHashMapForIntBigintKey(RANGE_DIRECT_MAPPING), //
JoinHashMapForIntBigintKey(RANGE_DIRECT_MAPPING_SET), //
JoinHashMapForIntBigintKey(DENSE_RANGE_DIRECT_MAPPING) //
>;
#undef JoinHashMapForNonSmallKey
#undef JoinHashMapForSmallKey
#undef JoinHashMapForIntBigintKey
bool _is_empty_map = true;
JoinKeyConstructorUnaryType _key_constructor_type;

View File

@ -400,7 +400,7 @@ void JoinHashMap<LT, CT, MT>::_search_ht(RuntimeState* state, ChunkPtr* probe_ch
auto& build_data = BuildKeyConstructor().get_key_data(*_table_items);
auto& probe_data = ProbeKeyConstructor().get_key_data(*_probe_state);
HashMapMethod().lookup_init(*_table_items, _probe_state, probe_data, _probe_state->null_array);
HashMapMethod().lookup_init(*_table_items, _probe_state, build_data, probe_data, _probe_state->null_array);
_probe_state->consider_probe_time_locality();
if (_table_items->is_collision_free_and_unique) {
@ -629,9 +629,11 @@ void JoinHashMap<LT, CT, MT>::_search_ht_impl(RuntimeState* state, const Buffer<
#define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#endif
#define PREFETCH_AND_COWAIT(x, y) \
XXH_PREFETCH(x); \
XXH_PREFETCH(y); \
#define PREFETCH_AND_COWAIT(cur_data, next_index) \
if constexpr (!HashMapMethod::AreKeysInChainIdentical) { \
XXH_PREFETCH(cur_data); \
} \
XXH_PREFETCH(next_index); \
co_await std::suspend_always{};
// When a probe row corresponds to multiple Build rows,
@ -994,6 +996,19 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_semi_join(RuntimeState* st
}
}
if (match_count == probe_row_count) {
_probe_state->match_flag = JoinMatchFlag::ALL_MATCH_ONE;
} else if (match_count * 2 >= probe_row_count) {
_probe_state->match_flag = JoinMatchFlag::MOST_MATCH_ONE;
uint8_t* match_filter_data = _probe_state->probe_match_filter.data();
memset(match_filter_data, 0, sizeof(uint8_t) * probe_row_count);
for (uint32_t i = 0; i < match_count; i++) {
match_filter_data[_probe_state->probe_index[i]] = 1;
}
} else {
_probe_state->match_flag = JoinMatchFlag::NORMAL;
}
PROBE_OVER()
}
@ -1001,10 +1016,10 @@ template <LogicalType LT, JoinKeyConstructorType CT, JoinHashMapMethodType MT>
template <bool first_probe, bool is_collision_free_and_unique>
void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_anti_join(RuntimeState* state, const Buffer<CppType>& build_data,
const Buffer<CppType>& probe_data) {
size_t match_count = 0;
size_t probe_row_count = _probe_state->probe_row_count;
DCHECK_LT(0, _table_items->row_count);
size_t match_count = 0;
const size_t probe_row_count = _probe_state->probe_row_count;
if (_table_items->join_type == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && _probe_state->null_array != nullptr) {
// process left anti join from not in
for (size_t i = 0; i < probe_row_count; i++) {
@ -1022,6 +1037,19 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_anti_join(RuntimeState* st
}
}
if (match_count == probe_row_count) {
_probe_state->match_flag = JoinMatchFlag::ALL_MATCH_ONE;
} else if (match_count * 2 >= probe_row_count) {
_probe_state->match_flag = JoinMatchFlag::MOST_MATCH_ONE;
uint8_t* match_filter_data = _probe_state->probe_match_filter.data();
memset(match_filter_data, 0, sizeof(uint8_t) * probe_row_count);
for (uint32_t i = 0; i < match_count; i++) {
match_filter_data[_probe_state->probe_index[i]] = 1;
}
} else {
_probe_state->match_flag = JoinMatchFlag::NORMAL;
}
PROBE_OVER()
}
@ -1122,7 +1150,13 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_right_outer_join(RuntimeState*
_probe_state->build_match_index[build_index] = 1;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
@ -1190,9 +1224,15 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_right_semi_join(RuntimeState* s
_probe_state->build_match_index[build_index] = 1;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
}
@ -1247,6 +1287,10 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_right_anti_join(RuntimeState* s
if (HashMapMethod().equal(build_data[index], probe_data[i])) {
_probe_state->build_match_index[index] = 1;
}
if constexpr (is_collision_free_and_unique) {
break;
}
index = _table_items->next[index];
}
}
@ -1311,7 +1355,13 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_full_outer_join(RuntimeState* s
_probe_state->cur_row_match_count++;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
@ -1399,8 +1449,15 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_semi_join_with_other_conju
_probe_state->build_index[match_count] = build_index;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
}
@ -1463,8 +1520,15 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_null_aware_anti_join_with_other
match_count++;
_probe_state->cur_row_match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
@ -1503,7 +1567,13 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_right_outer_right_semi_right_an
_probe_state->build_index[match_count] = build_index;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
@ -1552,7 +1622,13 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_outer_left_anti_full_outer
_probe_state->cur_row_match_count++;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}

View File

@ -56,16 +56,101 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = false;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return x == y; }
};
// The `LinearChainedJoinHashMap` uses linear probing to store distinct keys and chained to storage for linked lists of
// identical keys.
// - `first` stores the build index of the header for the linked list for each distinct key.
// - `next` maintains the linked list structure for each distinct key.
//
// Fingerprint
// - Each `first` entry uses the highest 1 byte to store the fingerprint and the lower 3 bytes for the build index,
// thus supporting up to 0xFFFFFF buckets.
// - The fingerprint is generated via hashing.
// During hashing, `bucket_num_with_fp = hash % (bucket_size * 8)` is computed instead of `hash % bucket_size`.
// - The lower 8 bits of `bucket_num_with_fp` represent the fingerprint (`fp`),
// - while `bucket_num_with_fp >> 8` yields the bucket number.
//
// Insert and probe
// - During insertion, linear probing is used in `first` to locate either the first empty bucket or an existing matching key.
// The new build index is then inserted into the corresponding linked list in `next`.
// - During probing, linear probing is used in `first` to locate either an empty bucket or the bucket_num for a matching key.
// - If an empty bucket is found, it indicates no matching key exists.
// - If a matching key exists, the entire linked list (with `first[bucket_num]` as its header) in `next` stores build
// indexes for all the same keys.
//
// The following diagram illustrates the structure of `LinearChainedJoinHashMap`:
//
// build keys first next
// ┌──────────────┐ ┌───┐
// │FP|build_index│ │ │◄───┐
// │1B 3B │ │ │◄┐ │
// ├──────────────┤ ├───┤ │ │
// ┌───────►│ │ │ │ │ │
// ┌────┐ │ ┌──┤ │ │ │ │ │
// ┌──────┐ │ │ │ │ ├──────────────┤ ├───┤ │ │
// │ key ├─►│hash├───┘ └─►│ │ │ ├─┘ │
// └──────┘ │ │ ┌──┤ │ │ │◄─┐ │
// └────┘ │ ├──────────────┤ ├───┤ │ │
// │ │ │ │ │ │ │
// │ │ │ │ │ │ │
// │ ├──────────────┤ ├───┤ │ │
// └─►│ ├──►│ │ │ │
// │ │ │ ├──┘ │
// ├──────────────┤ ├───┤ │
// │ │ │ │ │
// │ │ │ │ │
// ├──────────────┤ ├───┤ │
// │ │ │ │ │
// │ ├──►│ ├────┘
// └──────────────┘ └───┘
template <LogicalType LT, bool NeedBuildChained = true>
class LinearChainedJoinHashMap {
public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
static uint32_t max_supported_bucket_size() { return DATA_MASK; }
private:
static constexpr uint32_t FP_BITS = 8;
static constexpr uint32_t FP_MASK = 0xFF00'0000ul;
static constexpr uint32_t DATA_MASK = 0x00FF'FFFFul;
static uint32_t _combine_data_fp(const uint32_t data, const uint32_t fp) { return fp | data; }
static uint32_t _extract_data(const uint32_t v) { return v & DATA_MASK; }
static uint32_t _extract_fp(const uint32_t v) { return v & FP_MASK; }
static uint32_t _get_bucket_num_from_hash(const uint32_t hash) { return hash >> FP_BITS; }
static uint32_t _get_fp_from_hash(const uint32_t hash) { return hash << (32 - FP_BITS); }
};
template <LogicalType LT>
using LinearChainedJoinHashSet = LinearChainedJoinHashMap<LT, false>;
// The bucket-chained linked list formed by first` and `next` is the same as that of `BucketChainedJoinHashMap`.
//
// `DirectMappingJoinHashMap` maps to a position in `first` using `key-MIN_VALUE`.
@ -101,12 +186,15 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
};
@ -149,12 +237,15 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
};
@ -168,12 +259,15 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
};
@ -221,12 +315,15 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
};

View File

@ -84,7 +84,8 @@ void BucketChainedJoinHashMap<LT>::construct_hash_table(JoinHashTableItems* tabl
template <LogicalType LT>
void BucketChainedJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls) {
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
const uint32_t row_count = probe_state->probe_row_count;
const auto* firsts = table_items.first.data();
const auto* buckets = probe_state->buckets.data();
@ -92,8 +93,8 @@ void BucketChainedJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_i
if (is_nulls == nullptr) {
for (uint32_t i = 0; i < row_count; i++) {
probe_state->buckets[i] = JoinHashMapHelper::calc_bucket_num<CppType>(keys[i], table_items.bucket_size,
table_items.log_bucket_size);
probe_state->buckets[i] = JoinHashMapHelper::calc_bucket_num<CppType>(
probe_keys[i], table_items.bucket_size, table_items.log_bucket_size);
}
SIMDGather::gather(nexts, firsts, buckets, row_count);
} else {
@ -107,14 +108,197 @@ void BucketChainedJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_i
};
for (uint32_t i = 0; i < row_count; i++) {
if (need_calc_bucket_num(i)) {
probe_state->buckets[i] = JoinHashMapHelper::calc_bucket_num<CppType>(keys[i], table_items.bucket_size,
table_items.log_bucket_size);
probe_state->buckets[i] = JoinHashMapHelper::calc_bucket_num<CppType>(
probe_keys[i], table_items.bucket_size, table_items.log_bucket_size);
}
}
SIMDGather::gather(nexts, firsts, buckets, is_nulls_data, row_count);
}
}
// ------------------------------------------------------------------------------------
// LinearChainedJoinHashMap
// ------------------------------------------------------------------------------------
template <LogicalType LT, bool NeedBuildChained>
void LinearChainedJoinHashMap<LT, NeedBuildChained>::build_prepare(RuntimeState* state,
                                                                   JoinHashTableItems* table_items) {
    // Size the table for all build rows plus one: row index 0 acts as the
    // "empty" sentinel, so usable rows are 1..row_count.
    const auto num_slots = table_items->row_count + 1;
    const auto num_buckets = JoinHashMapHelper::calc_bucket_size(num_slots);

    table_items->bucket_size = num_buckets;
    // `calc_bucket_size` is expected to return a power of two, so ctz acts as log2.
    table_items->log_bucket_size = __builtin_ctz(num_buckets);

    // Zero-fill: 0 marks an empty bucket in `first` and end-of-chain in `next`.
    table_items->first.resize(num_buckets, 0);
    table_items->next.resize(num_slots, 0);
}
template <LogicalType LT, bool NeedBuildChained>
void LinearChainedJoinHashMap<LT, NeedBuildChained>::construct_hash_table(JoinHashTableItems* table_items,
                                                                          const Buffer<CppType>& keys,
                                                                          const Buffer<uint8_t>* is_nulls) {
    // Builds an open-addressing table over build rows 1..row_count (row 0 is the
    // "empty" sentinel). Each occupied `first[bucket]` packs a row index in the low
    // DATA_MASK bits and an 8-bit fingerprint in the high FP_BITS bits. Collisions
    // are resolved by probing with increasing step sizes (1, 2, 3, ... — triangular
    // probing on a power-of-two table).
    // When `NeedBuildChained` is true, rows with equal keys are additionally linked
    // through `next` (the newest row becomes the chain head); otherwise only the
    // first inserted row of each distinct key is kept and `next` is discarded.
    auto process = [&]<bool IsNullable>() {
        const auto num_rows = 1 + table_items->row_count;
        const uint32_t bucket_size_mask = table_items->bucket_size - 1;
        auto* __restrict next = table_items->next.data();
        auto* __restrict first = table_items->first.data();
        const uint8_t* __restrict is_nulls_data = IsNullable ? is_nulls->data() : nullptr;

        auto need_calc_bucket_num = [&](const uint32_t index) {
            // Only check `is_nulls_data[i]` for the nullable slice type. The hash calculation overhead for
            // fixed-size types is small, and thus we do not check it to allow vectorization of the hash calculation.
            if constexpr (!IsNullable || !std::is_same_v<CppType, Slice>) {
                return true;
            } else {
                return is_nulls_data[index] == 0;
            }
        };
        auto is_null = [&](const uint32_t index) {
            if constexpr (!IsNullable) {
                return false;
            } else {
                return is_nulls_data[index] != 0;
            }
        };

        // Pass 1: reuse `next[i]` as scratch storage for row i's hash value so the
        // (vectorizable) hash computation is separated from the probing pass.
        for (uint32_t i = 1; i < num_rows; i++) {
            if (need_calc_bucket_num(i)) {
                next[i] = JoinHashMapHelper::calc_bucket_num<CppType>(keys[i], table_items->bucket_size << FP_BITS,
                                                                     table_items->log_bucket_size + FP_BITS);
            }
        }

        // Pass 2: insert each row, prefetching the bucket 16 rows ahead to hide
        // cache misses on `first`.
        for (uint32_t i = 1; i < num_rows; i++) {
            if (i + 16 < num_rows && !is_null(i + 16)) {
                __builtin_prefetch(first + _get_bucket_num_from_hash(next[i + 16]));
            }

            if (is_null(i)) {
                next[i] = 0; // NULL rows never match; terminate their chain.
                continue;
            }

            const uint32_t hash = next[i];
            const uint32_t fp = _get_fp_from_hash(hash);
            uint32_t bucket_num = _get_bucket_num_from_hash(hash);
            uint32_t probe_times = 1;
            while (true) {
                if (first[bucket_num] == 0) {
                    // Empty bucket: claim it for row i (chain of length 1).
                    if constexpr (NeedBuildChained) {
                        next[i] = 0;
                    }
                    first[bucket_num] = _combine_data_fp(i, fp);
                    break;
                }
                // Fingerprint pre-filter before the (potentially expensive) key compare.
                if (fp == _extract_fp(first[bucket_num]) && keys[i] == keys[_extract_data(first[bucket_num])]) {
                    // Key already present: push row i onto the chain head,
                    // or drop the duplicate entirely in set mode.
                    if constexpr (NeedBuildChained) {
                        next[i] = _extract_data(first[bucket_num]);
                        first[bucket_num] = _combine_data_fp(i, fp);
                    }
                    break;
                }
                bucket_num = (bucket_num + probe_times) & bucket_size_mask;
                probe_times++;
            }
        }

        if constexpr (!NeedBuildChained) {
            // Set mode: chains are not needed for probing, so release the memory.
            table_items->next.clear();
        }
    };

    if (is_nulls == nullptr) {
        process.template operator()<false>();
    } else {
        process.template operator()<true>();
    }
}
template <LogicalType LT, bool NeedBuildChained>
void LinearChainedJoinHashMap<LT, NeedBuildChained>::lookup_init(const JoinHashTableItems& table_items,
                                                                 HashTableProbeState* probe_state,
                                                                 const Buffer<CppType>& build_keys,
                                                                 const Buffer<CppType>& probe_keys,
                                                                 const Buffer<uint8_t>* is_nulls) {
    // For every probe row i, resolves its build-side match into probe_state->next[i]:
    //   - chained mode (NeedBuildChained): build row index of the chain head,
    //     or 0 when no key matches (row 0 is the sentinel);
    //   - set mode: 1 when any build row has an equal key, else 0.
    // Probing mirrors construct_hash_table: same hash layout (bucket bits +
    // FP_BITS fingerprint bits) and the same increasing-step probe sequence.
    auto process = [&]<bool IsNullable>() {
        const uint32_t bucket_size_mask = table_items.bucket_size - 1;
        const uint32_t row_count = probe_state->probe_row_count;
        const auto* firsts = table_items.first.data();
        auto* hashes = probe_state->buckets.data(); // reused as per-row hash scratch
        auto* nexts = probe_state->next.data();
        const uint8_t* is_nulls_data = IsNullable ? is_nulls->data() : nullptr;

        auto need_calc_bucket_num = [&](const uint32_t index) {
            if constexpr (!IsNullable || !std::is_same_v<CppType, Slice>) {
                // Only check `is_nulls_data[i]` for the nullable slice type. The hash calculation overhead for
                // fixed-size types is small, and thus we do not check it to allow vectorization of the hash calculation.
                return true;
            } else {
                return is_nulls_data[index] == 0;
            }
        };
        auto is_null = [&](const uint32_t index) {
            if constexpr (!IsNullable) {
                return false;
            } else {
                return is_nulls_data[index] != 0;
            }
        };

        // Pass 1: compute every probe row's hash first (vectorizable), stored in
        // the `buckets` scratch array.
        for (uint32_t i = 0; i < row_count; i++) {
            if (need_calc_bucket_num(i)) {
                hashes[i] = JoinHashMapHelper::calc_bucket_num<CppType>(
                        probe_keys[i], table_items.bucket_size << FP_BITS, table_items.log_bucket_size + FP_BITS);
            }
        }

        // Pass 2: probe, prefetching the bucket 16 rows ahead to hide cache misses.
        for (uint32_t i = 0; i < row_count; i++) {
            if (i + 16 < row_count && !is_null(i + 16)) {
                __builtin_prefetch(firsts + _get_bucket_num_from_hash(hashes[i + 16]));
            }

            if (is_null(i)) {
                nexts[i] = 0; // NULL probe keys never match.
                continue;
            }

            const uint32_t hash = hashes[i];
            const uint32_t fp = _get_fp_from_hash(hash);
            uint32_t bucket_num = _get_bucket_num_from_hash(hash);
            uint32_t probe_times = 1;
            while (true) {
                if (firsts[bucket_num] == 0) {
                    // Empty bucket ends the probe sequence: no build row has this key.
                    nexts[i] = 0;
                    break;
                }
                const uint32_t cur_fp = _extract_fp(firsts[bucket_num]);
                const uint32_t cur_index = _extract_data(firsts[bucket_num]);
                // Fingerprint pre-filter before the full key comparison.
                if (fp == cur_fp && probe_keys[i] == build_keys[cur_index]) {
                    if constexpr (NeedBuildChained) {
                        nexts[i] = cur_index; // head of the matching build chain
                    } else {
                        nexts[i] = 1; // set semantics: existence flag only
                    }
                    break;
                }
                bucket_num = (bucket_num + probe_times) & bucket_size_mask;
                probe_times++;
            }
        }
    };

    if (is_nulls == nullptr) {
        process.template operator()<false>();
    } else {
        process.template operator()<true>();
    }
}
// ------------------------------------------------------------------------------------
// DirectMappingJoinHashMap
// ------------------------------------------------------------------------------------
@ -155,7 +339,8 @@ void DirectMappingJoinHashMap<LT>::construct_hash_table(JoinHashTableItems* tabl
template <LogicalType LT>
void DirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls) {
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
probe_state->active_coroutines = 0; // the ht data is not large, so disable it always.
static constexpr CppType MIN_VALUE = RunTimeTypeLimits<LT>::min_value();
@ -163,13 +348,13 @@ void DirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_i
if (is_nulls == nullptr) {
for (size_t i = 0; i < probe_row_count; i++) {
probe_state->next[i] = table_items.first[keys[i] - MIN_VALUE];
probe_state->next[i] = table_items.first[probe_keys[i] - MIN_VALUE];
}
} else {
const auto* is_nulls_data = is_nulls->data();
for (size_t i = 0; i < probe_row_count; i++) {
if (is_nulls_data[i] == 0) {
probe_state->next[i] = table_items.first[keys[i] - MIN_VALUE];
probe_state->next[i] = table_items.first[probe_keys[i] - MIN_VALUE];
} else {
probe_state->next[i] = 0;
}
@ -215,7 +400,8 @@ void RangeDirectMappingJoinHashMap<LT>::construct_hash_table(JoinHashTableItems*
template <LogicalType LT>
void RangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_items,
HashTableProbeState* probe_state, const Buffer<CppType>& keys,
HashTableProbeState* probe_state, const Buffer<CppType>& build_keys,
const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
probe_state->active_coroutines = 0; // the ht data is not large, so disable it always.
@ -224,8 +410,8 @@ void RangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& ta
const size_t num_rows = probe_state->probe_row_count;
if (is_nulls == nullptr) {
for (size_t i = 0; i < num_rows; i++) {
if ((keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t index = keys[i] - min_value;
if ((probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t index = probe_keys[i] - min_value;
probe_state->next[i] = table_items.first[index];
} else {
probe_state->next[i] = 0;
@ -234,8 +420,8 @@ void RangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& ta
} else {
const auto* is_nulls_data = is_nulls->data();
for (size_t i = 0; i < num_rows; i++) {
if ((is_nulls_data[i] == 0) & (keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t index = keys[i] - min_value;
if ((is_nulls_data[i] == 0) & (probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t index = probe_keys[i] - min_value;
probe_state->next[i] = table_items.first[index];
} else {
probe_state->next[i] = 0;
@ -281,7 +467,8 @@ void RangeDirectMappingJoinHashSet<LT>::construct_hash_table(JoinHashTableItems*
template <LogicalType LT>
void RangeDirectMappingJoinHashSet<LT>::lookup_init(const JoinHashTableItems& table_items,
HashTableProbeState* probe_state, const Buffer<CppType>& keys,
HashTableProbeState* probe_state, const Buffer<CppType>& build_keys,
const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
probe_state->active_coroutines = 0; // the ht data is not large, so disable it always.
@ -290,8 +477,8 @@ void RangeDirectMappingJoinHashSet<LT>::lookup_init(const JoinHashTableItems& ta
const size_t num_rows = probe_state->probe_row_count;
if (is_nulls == nullptr) {
for (size_t i = 0; i < num_rows; i++) {
if ((keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t index = keys[i] - min_value;
if ((probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t index = probe_keys[i] - min_value;
const uint32_t group = index / 8;
const uint32_t offset = index % 8;
probe_state->next[i] = (table_items.key_bitset[group] & (1 << offset)) != 0;
@ -302,8 +489,8 @@ void RangeDirectMappingJoinHashSet<LT>::lookup_init(const JoinHashTableItems& ta
} else {
const auto* is_nulls_data = is_nulls->data();
for (size_t i = 0; i < num_rows; i++) {
if ((is_nulls_data[i] == 0) & (keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t index = keys[i] - min_value;
if ((is_nulls_data[i] == 0) & (probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t index = probe_keys[i] - min_value;
const uint32_t group = index / 8;
const uint32_t offset = index % 8;
probe_state->next[i] = (table_items.key_bitset[group] & (1 << offset)) != 0;
@ -387,7 +574,9 @@ void DenseRangeDirectMappingJoinHashMap<LT>::construct_hash_table(JoinHashTableI
template <LogicalType LT>
void DenseRangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_items,
HashTableProbeState* probe_state, const Buffer<CppType>& keys,
HashTableProbeState* probe_state,
const Buffer<CppType>& build_keys,
const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
probe_state->active_coroutines = 0; // the ht data is not large, so disable it always.
@ -415,8 +604,8 @@ void DenseRangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItem
const size_t num_rows = probe_state->probe_row_count;
if (is_nulls == nullptr) {
for (size_t i = 0; i < num_rows; i++) {
if ((keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t bucket_num = keys[i] - min_value;
if ((probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t bucket_num = probe_keys[i] - min_value;
probe_state->next[i] = get_dense_first(bucket_num);
} else {
probe_state->next[i] = 0;
@ -425,8 +614,8 @@ void DenseRangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItem
} else {
const auto* is_nulls_data = is_nulls->data();
for (size_t i = 0; i < num_rows; i++) {
if ((is_nulls_data[i] == 0) & (keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t bucket_num = keys[i] - min_value;
if ((is_nulls_data[i] == 0) & (probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t bucket_num = probe_keys[i] - min_value;
probe_state->next[i] = get_dense_first(bucket_num);
} else {
probe_state->next[i] = 0;

View File

@ -114,7 +114,8 @@ struct JoinHashTableItems {
// 1) the ht's size is enough large, for example, larger than (1UL << 27) bytes.
// 2) smaller ht but most buckets have more than one keys
cache_miss_serious = row_count > (1UL << 18) &&
((probe_bytes > (1UL << 25) && keys_per_bucket > 2) ||
((probe_bytes > (1UL << 24) && keys_per_bucket >= 10) ||
(probe_bytes > (1UL << 25) && keys_per_bucket > 2) ||
(probe_bytes > (1UL << 26) && keys_per_bucket > 1.5) || probe_bytes > (1UL << 27));
VLOG_QUERY << "ht cache miss serious = " << cache_miss_serious << " row# = " << row_count
<< " , bytes = " << probe_bytes << " , depth = " << keys_per_bucket;

View File

@ -43,7 +43,9 @@ namespace starrocks {
M(DIRECT_MAPPING) \
M(RANGE_DIRECT_MAPPING) \
M(RANGE_DIRECT_MAPPING_SET) \
M(DENSE_RANGE_DIRECT_MAPPING)
M(DENSE_RANGE_DIRECT_MAPPING) \
M(LINEAR_CHAINED) \
M(LINEAR_CHAINED_SET)
#define APPLY_JOIN_KEY_CONSTRUCTOR_UNARY_TYPE(M) \
M(ONE_KEY_BOOLEAN) \
@ -89,7 +91,33 @@ namespace starrocks {
M(RANGE_DIRECT_MAPPING_SET_INT) \
M(RANGE_DIRECT_MAPPING_SET_BIGINT) \
M(DENSE_RANGE_DIRECT_MAPPING_INT) \
M(DENSE_RANGE_DIRECT_MAPPING_BIGINT)
M(DENSE_RANGE_DIRECT_MAPPING_BIGINT) \
\
M(LINEAR_CHAINED_INT) \
M(LINEAR_CHAINED_BIGINT) \
M(LINEAR_CHAINED_LARGEINT) \
M(LINEAR_CHAINED_FLOAT) \
M(LINEAR_CHAINED_DOUBLE) \
M(LINEAR_CHAINED_DATE) \
M(LINEAR_CHAINED_DATETIME) \
M(LINEAR_CHAINED_DECIMALV2) \
M(LINEAR_CHAINED_DECIMAL32) \
M(LINEAR_CHAINED_DECIMAL64) \
M(LINEAR_CHAINED_DECIMAL128) \
M(LINEAR_CHAINED_VARCHAR) \
\
M(LINEAR_CHAINED_SET_INT) \
M(LINEAR_CHAINED_SET_BIGINT) \
M(LINEAR_CHAINED_SET_LARGEINT) \
M(LINEAR_CHAINED_SET_FLOAT) \
M(LINEAR_CHAINED_SET_DOUBLE) \
M(LINEAR_CHAINED_SET_DATE) \
M(LINEAR_CHAINED_SET_DATETIME) \
M(LINEAR_CHAINED_SET_DECIMALV2) \
M(LINEAR_CHAINED_SET_DECIMAL32) \
M(LINEAR_CHAINED_SET_DECIMAL64) \
M(LINEAR_CHAINED_SET_DECIMAL128) \
M(LINEAR_CHAINED_SET_VARCHAR)
enum class JoinKeyConstructorType {
#define NAME_TO_ENUM(NAME) NAME,
@ -237,6 +265,36 @@ REGISTER_JOIN_MAP_METHOD_TYPE(DENSE_RANGE_DIRECT_MAPPING, TYPE_INT, DenseRangeDi
REGISTER_JOIN_MAP_METHOD_TYPE(DENSE_RANGE_DIRECT_MAPPING, TYPE_BIGINT, DenseRangeDirectMappingJoinHashMap,
DENSE_RANGE_DIRECT_MAPPING_BIGINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_INT, LinearChainedJoinHashMap, LINEAR_CHAINED_INT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_BIGINT, LinearChainedJoinHashMap, LINEAR_CHAINED_BIGINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_LARGEINT, LinearChainedJoinHashMap, LINEAR_CHAINED_LARGEINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_FLOAT, LinearChainedJoinHashMap, LINEAR_CHAINED_FLOAT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DOUBLE, LinearChainedJoinHashMap, LINEAR_CHAINED_DOUBLE);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DATE, LinearChainedJoinHashMap, LINEAR_CHAINED_DATE);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DATETIME, LinearChainedJoinHashMap, LINEAR_CHAINED_DATETIME);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DECIMALV2, LinearChainedJoinHashMap, LINEAR_CHAINED_DECIMALV2);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DECIMAL32, LinearChainedJoinHashMap, LINEAR_CHAINED_DECIMAL32);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DECIMAL64, LinearChainedJoinHashMap, LINEAR_CHAINED_DECIMAL64);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DECIMAL128, LinearChainedJoinHashMap, LINEAR_CHAINED_DECIMAL128);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_VARCHAR, LinearChainedJoinHashMap, LINEAR_CHAINED_VARCHAR);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_INT, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_INT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_BIGINT, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_BIGINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_LARGEINT, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_LARGEINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_FLOAT, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_FLOAT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DOUBLE, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_DOUBLE);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DATE, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_DATE);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DATETIME, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_DATETIME);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DECIMALV2, LinearChainedJoinHashSet,
LINEAR_CHAINED_SET_DECIMALV2);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DECIMAL32, LinearChainedJoinHashSet,
LINEAR_CHAINED_SET_DECIMAL32);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DECIMAL64, LinearChainedJoinHashSet,
LINEAR_CHAINED_SET_DECIMAL64);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DECIMAL128, LinearChainedJoinHashSet,
LINEAR_CHAINED_SET_DECIMAL128);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_VARCHAR, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_VARCHAR);
#undef REGISTER_JOIN_MAP_TYPE
// ------------------------------------------------------------------------------------

View File

@ -44,6 +44,10 @@ Status LakeMetaScanner::_real_init() {
reader_params.low_card_threshold = _parent->_meta_scan_node.__isset.low_cardinality_threshold
? _parent->_meta_scan_node.low_cardinality_threshold
: DICT_DECODE_MAX_SIZE;
// Pass column access paths and extend schema similar to OLAP path
if (_parent->_meta_scan_node.__isset.column_access_paths && !_parent->_column_access_paths.empty()) {
reader_params.column_access_paths = &_parent->_column_access_paths;
}
_reader = std::make_unique<LakeMetaReader>();
TEST_SYNC_POINT_CALLBACK("lake_meta_scanner:open_mock_reader", &_reader);

View File

@ -72,7 +72,8 @@ Status OlapMetaScanner::_init_meta_reader_params() {
column.set_type(path->value_type().type);
column.set_length(path->value_type().len);
column.set_is_nullable(true);
column.set_extended_info(std::make_unique<ExtendedColumnInfo>(path.get(), root_column_index));
int32_t root_uid = tmp_schema->column(static_cast<size_t>(root_column_index)).unique_id();
column.set_extended_info(std::make_unique<ExtendedColumnInfo>(path.get(), root_uid));
tmp_schema->append_column(column);
VLOG(2) << "extend the tablet-schema: " << column.debug_string();

View File

@ -0,0 +1,63 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "exec/partition/bucket_aware_partition.h"
#include "column/nullable_column.h"
#include "gutil/casts.h"
namespace starrocks {
// Computes, for each input row, a combined murmur hash (XOR across all partition
// columns) and a composed bucket id that matches the FE's bucket-mapping rule.
//
// `partitions_columns` must be non-empty and all columns equally sized.
// Results are written through the reference members of `ctx`:
//   - ctx.hash_values[j]: XOR of the per-column murmur3_x86_32 hashes of row j.
//   - ctx.bucket_ids[j]:  mixed-radix combination of per-column bucket ids; a NULL
//     cell maps to the column's extra overflow bucket `bucket_num`.
// `ctx.round_hashes` / `ctx.round_ids` are per-column scratch buffers.
void calc_hash_values_and_bucket_ids(const std::vector<const Column*>& partitions_columns,
                                     BucketAwarePartitionCtx ctx) {
    const size_t num_rows = partitions_columns[0]->size();
    const auto& bucket_properties = ctx.bucket_properties;
    auto& hash_values = ctx.hash_values;
    auto& bucket_ids = ctx.bucket_ids;
    auto& round_hashes = ctx.round_hashes;
    auto& round_ids = ctx.round_ids;
    hash_values.assign(num_rows, 0);
    bucket_ids.assign(num_rows, 0);
    // Early out on an empty chunk: outputs are already sized, and taking the
    // address of element 0 of an empty scratch vector below would be UB.
    if (num_rows == 0) {
        return;
    }
    for (size_t i = 0; i < partitions_columns.size(); ++i) {
        // TODO, enhance it if we try to support more bucket functions.
        DCHECK(bucket_properties[i].bucket_func == TBucketFunction::MURMUR3_X86_32);
        round_hashes.assign(num_rows, 0);
        round_ids.assign(num_rows, 0);
        partitions_columns[i]->murmur_hash3_x86_32(round_hashes.data(), 0, num_rows);
        for (size_t j = 0; j < num_rows; j++) {
            hash_values[j] ^= round_hashes[j];
            // Mask off the sign bit so the modulo result is non-negative.
            round_ids[j] = (round_hashes[j] & std::numeric_limits<int>::max()) % bucket_properties[i].bucket_num;
        }
        if (partitions_columns[i]->has_null()) {
            // NULL cells go to the dedicated overflow bucket `bucket_num`.
            const auto& null_data = down_cast<const NullableColumn*>(partitions_columns[i])->null_column()->get_data();
            for (size_t j = 0; j < num_rows; j++) {
                round_ids[j] = null_data[j] ? bucket_properties[i].bucket_num : round_ids[j];
            }
        }
        if (i == partitions_columns.size() - 1) {
            for (size_t j = 0; j < num_rows; j++) {
                bucket_ids[j] += round_ids[j];
            }
        } else {
            for (size_t j = 0; j < num_rows; j++) {
                // bucket mapping, same behavior as FE
                bucket_ids[j] = (round_ids[j] + bucket_ids[j]) * (bucket_properties[i + 1].bucket_num + 1);
            }
        }
    }
}
} // namespace starrocks

View File

@ -0,0 +1,42 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "column/column.h"
#include "gen_cpp/Partitions_types.h"
namespace starrocks {
// Bundles the output and scratch buffers used by calc_hash_values_and_bucket_ids().
// All members are references: the context is a cheap-to-copy view over
// caller-owned vectors, which must outlive every copy of the context.
struct BucketAwarePartitionCtx {
    BucketAwarePartitionCtx(const std::vector<TBucketProperty>& bucket_properties, std::vector<uint32_t>& hash_values,
                            std::vector<uint32_t>& round_hashes, std::vector<uint32_t>& bucket_ids,
                            std::vector<uint32_t>& round_ids)
            : bucket_properties(bucket_properties),
              hash_values(hash_values),
              round_hashes(round_hashes),
              bucket_ids(bucket_ids),
              round_ids(round_ids) {}

    const std::vector<TBucketProperty>& bucket_properties; // one entry per bucketed partition column
    std::vector<uint32_t>& hash_values;  // out: XOR-combined murmur hash per row
    std::vector<uint32_t>& round_hashes; // scratch: per-column hashes of the current round
    std::vector<uint32_t>& bucket_ids;   // out: final composed bucket id per row
    std::vector<uint32_t>& round_ids;    // scratch: per-column bucket ids of the current round
};
void calc_hash_values_and_bucket_ids(const std::vector<const Column*>& partitions_columns, BucketAwarePartitionCtx ctx);
} // namespace starrocks

View File

@ -69,8 +69,7 @@ Status AggregateBlockingSinkOperator::set_finishing(RuntimeState* state) {
if (_aggregator->hash_map_variant().size() == 0) {
_aggregator->set_ht_eos();
}
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
} else if (_aggregator->is_none_group_by_exprs()) {
// for aggregate no group by, if _num_input_rows is 0,

View File

@ -89,8 +89,7 @@ DEFINE_FAIL_POINT(force_reset_aggregator_after_agg_streaming_sink_finish);
Status AggregateStreamingSourceOperator::_output_chunk_from_hash_map(ChunkPtr* chunk, RuntimeState* state) {
if (!_aggregator->it_hash().has_value()) {
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_map_variant().size());
}

View File

@ -14,7 +14,7 @@
#pragma once
#include "exec/aggregator.h"
#include "exec/aggregator_fwd.h"
#include "exec/pipeline/source_operator.h"
namespace starrocks::pipeline {

View File

@ -66,8 +66,7 @@ Status SpillableAggregateBlockingSinkOperator::set_finishing(RuntimeState* state
}
if (!_aggregator->spill_channel()->has_task()) {
if (_aggregator->hash_map_variant().size() > 0 || !_streaming_chunks.empty()) {
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
_aggregator->spill_channel()->add_spill_task(_build_spill_task(state));
}
}
@ -270,8 +269,7 @@ Status SpillableAggregateBlockingSinkOperator::_try_to_spill_by_auto(RuntimeStat
Status SpillableAggregateBlockingSinkOperator::_spill_all_data(RuntimeState* state, bool should_spill_hash_table) {
RETURN_IF(_aggregator->hash_map_variant().size() == 0, Status::OK());
if (should_spill_hash_table) {
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
}
CHECK(!_aggregator->spill_channel()->has_task());
RETURN_IF_ERROR(_aggregator->spill_aggregate_data(state, _build_spill_task(state, should_spill_hash_table)));

View File

@ -16,9 +16,8 @@
#include <utility>
#include "exec/aggregator.h"
#include "exec/aggregator_fwd.h"
#include "exec/pipeline/aggregate/aggregate_blocking_source_operator.h"
#include "exec/sorted_streaming_aggregator.h"
#include "runtime/runtime_state.h"
#include "storage/chunk_helper.h"

View File

@ -16,7 +16,7 @@
#include <utility>
#include "exec/aggregator.h"
#include "exec/aggregator_fwd.h"
#include "exec/pipeline/aggregate/aggregate_distinct_blocking_sink_operator.h"
#include "exec/pipeline/aggregate/aggregate_distinct_blocking_source_operator.h"
#include "exec/pipeline/operator.h"

View File

@ -49,9 +49,7 @@ Status SpillablePartitionWiseAggregateSinkOperator::set_finishing(RuntimeState*
}
if (!_agg_op->aggregator()->spill_channel()->has_task()) {
if (_agg_op->aggregator()->hash_map_variant().size() > 0 || !_streaming_chunks.empty()) {
_agg_op->aggregator()->hash_map_variant().visit([&](auto& hash_map_with_key) {
_agg_op->aggregator()->it_hash() = _agg_op->aggregator()->_state_allocator.begin();
});
_agg_op->aggregator()->it_hash() = _agg_op->aggregator()->state_allocator().begin();
_agg_op->aggregator()->spill_channel()->add_spill_task(_build_spill_task(state));
}
}
@ -279,9 +277,7 @@ ChunkPtr& SpillablePartitionWiseAggregateSinkOperator::_append_hash_column(Chunk
Status SpillablePartitionWiseAggregateSinkOperator::_spill_all_data(RuntimeState* state, bool should_spill_hash_table) {
RETURN_IF(_agg_op->aggregator()->hash_map_variant().size() == 0, Status::OK());
if (should_spill_hash_table) {
_agg_op->aggregator()->hash_map_variant().visit([&](auto& hash_map_with_key) {
_agg_op->aggregator()->it_hash() = _agg_op->aggregator()->_state_allocator.begin();
});
_agg_op->aggregator()->it_hash() = _agg_op->aggregator()->state_allocator().begin();
}
CHECK(!_agg_op->aggregator()->spill_channel()->has_task());
RETURN_IF_ERROR(

View File

@ -23,6 +23,7 @@
#include <utility>
#include "common/config.h"
#include "exec/partition/bucket_aware_partition.h"
#include "exec/pipeline/exchange/shuffler.h"
#include "exec/pipeline/exchange/sink_buffer.h"
#include "exprs/expr.h"
@ -640,38 +641,13 @@ Status ExchangeSinkOperator::push_chunk(RuntimeState* state, const ChunkPtr& chu
}
void ExchangeSinkOperator::_calc_hash_values_and_bucket_ids() {
size_t num_rows = _partitions_columns[0]->size();
_hash_values.assign(num_rows, 0);
_bucket_ids.assign(num_rows, 0);
for (int i = 0; i < _partitions_columns.size(); ++i) {
// TODO, enhance it if we try to support more bucket functions.
DCHECK(_bucket_properties[i].bucket_func == TBucketFunction::MURMUR3_X86_32);
_round_hashes.assign(num_rows, 0);
_round_ids.assign(num_rows, 0);
_partitions_columns[i]->murmur_hash3_x86_32(&_round_hashes[0], 0, num_rows);
for (int j = 0; j < num_rows; j++) {
_hash_values[j] ^= _round_hashes[j];
_round_ids[j] = (_round_hashes[j] & std::numeric_limits<int>::max()) % _bucket_properties[i].bucket_num;
}
if (_partitions_columns[i]->has_null()) {
const auto& null_data =
down_cast<const NullableColumn*>(_partitions_columns[i].get())->null_column()->get_data();
for (int j = 0; j < num_rows; j++) {
_round_ids[j] = null_data[j] ? _bucket_properties[i].bucket_num : _round_ids[j];
}
}
if (i == _partitions_columns.size() - 1) {
for (int j = 0; j < num_rows; j++) {
_bucket_ids[j] += _round_ids[j];
}
} else {
for (int j = 0; j < num_rows; j++) {
// bucket mapping, same behavior as FE
_bucket_ids[j] = (_round_ids[j] + _bucket_ids[j]) * (_bucket_properties[i + 1].bucket_num + 1);
}
}
std::vector<const Column*> partitions_columns;
for (size_t i = 0; i < _partitions_columns.size(); i++) {
partitions_columns.emplace_back(_partitions_columns[i].get());
}
BucketAwarePartitionCtx bctx(_bucket_properties, _hash_values, _round_hashes, _bucket_ids, _round_ids);
calc_hash_values_and_bucket_ids(partitions_columns, bctx);
}
void ExchangeSinkOperator::update_metrics(RuntimeState* state) {

View File

@ -24,6 +24,7 @@
#include "runtime/exec_env.h"
#include "service/backend_options.h"
#include "util/network_util.h"
#include "util/starrocks_metrics.h"
#include "util/thrift_rpc_helper.h"
namespace starrocks::pipeline {
@ -247,7 +248,7 @@ Status ExecStateReporter::report_epoch(const TMVMaintenanceTasks& params, ExecEn
}
ExecStateReporter::ExecStateReporter(const CpuUtil::CpuIds& cpuids) {
auto status = ThreadPoolBuilder("ex_state_report") // exec state reporter
auto status = ThreadPoolBuilder("exec_state_report") // exec state reporter
.set_min_threads(1)
.set_max_threads(2)
.set_max_queue_size(1000)
@ -257,8 +258,9 @@ ExecStateReporter::ExecStateReporter(const CpuUtil::CpuIds& cpuids) {
if (!status.ok()) {
LOG(FATAL) << "Cannot create thread pool for ExecStateReport: error=" << status.to_string();
}
REGISTER_THREAD_POOL_METRICS(exec_state_report, _thread_pool);
status = ThreadPoolBuilder("priority_ex_state_report") // priority exec state reporter with infinite queue
status = ThreadPoolBuilder("priority_exec_state_report") // priority exec state reporter with infinite queue
.set_min_threads(1)
.set_max_threads(2)
.set_idle_timeout(MonoDelta::FromMilliseconds(2000))
@ -267,6 +269,7 @@ ExecStateReporter::ExecStateReporter(const CpuUtil::CpuIds& cpuids) {
if (!status.ok()) {
LOG(FATAL) << "Cannot create thread pool for priority ExecStateReport: error=" << status.to_string();
}
REGISTER_THREAD_POOL_METRICS(priority_exec_state_report, _priority_thread_pool);
}
void ExecStateReporter::submit(std::function<void()>&& report_task, bool priority) {

View File

@ -676,7 +676,8 @@ Status FragmentExecutor::_prepare_stream_load_pipe(ExecEnv* exec_env, const Unif
delete ctx;
}
});
RETURN_IF_ERROR(exec_env->stream_context_mgr()->put_channel_context(label, channel_id, ctx));
RETURN_IF_ERROR(
exec_env->stream_context_mgr()->put_channel_context(label, table_name, channel_id, ctx));
}
stream_load_contexts.push_back(ctx);
}
@ -975,7 +976,8 @@ void FragmentExecutor::_fail_cleanup(bool fragment_has_registed) {
}
}
Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const TExecPlanFragmentParams& request) {
Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const TExecPlanFragmentParams& request,
TExecPlanFragmentResult* response) {
DCHECK(!request.__isset.fragment);
DCHECK(request.__isset.params);
const TPlanFragmentExecParams& params = request.params;
@ -996,6 +998,7 @@ Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const
RuntimeState* runtime_state = fragment_ctx->runtime_state();
std::unordered_set<int> notify_ids;
std::vector<int32_t> closed_scan_nodes;
for (const auto& [node_id, scan_ranges] : params.per_node_scan_ranges) {
if (scan_ranges.size() == 0) continue;
@ -1019,6 +1022,10 @@ Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const
RETURN_IF_ERROR(morsel_queue_factory->append_morsels(0, std::move(morsels)));
morsel_queue_factory->set_has_more(has_more_morsel);
notify_ids.insert(node_id);
if (morsel_queue_factory->reach_limit()) {
closed_scan_nodes.push_back(node_id);
}
}
if (params.__isset.node_to_per_driver_seq_scan_ranges) {
@ -1047,6 +1054,14 @@ Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const
}
morsel_queue_factory->set_has_more(has_more_morsel);
notify_ids.insert(node_id);
if (morsel_queue_factory->reach_limit()) {
closed_scan_nodes.push_back(node_id);
}
}
if (closed_scan_nodes.size() > 0) {
response->__set_closed_scan_nodes(closed_scan_nodes);
}
}

View File

@ -107,7 +107,8 @@ public:
const TExecPlanFragmentParams& unique_request);
Status execute(ExecEnv* exec_env);
static Status append_incremental_scan_ranges(ExecEnv* exec_env, const TExecPlanFragmentParams& request);
static Status append_incremental_scan_ranges(ExecEnv* exec_env, const TExecPlanFragmentParams& request,
TExecPlanFragmentResult* response);
private:
void _fail_cleanup(bool fragment_has_registed);

View File

@ -38,7 +38,7 @@ HashJoinBuildOperator::HashJoinBuildOperator(OperatorFactory* factory, int32_t i
_distribution_mode(distribution_mode) {}
Status HashJoinBuildOperator::push_chunk(RuntimeState* state, const ChunkPtr& chunk) {
return _join_builder->append_chunk_to_ht(chunk);
return _join_builder->append_chunk_to_ht(state, chunk);
}
Status HashJoinBuildOperator::prepare(RuntimeState* state) {

Some files were not shown because too many files have changed in this diff Show More