[Enhancement] support encode_sort_key function (backport #61781) (#61976)

Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
mergify[bot] 2025-08-19 18:36:47 +08:00 committed by GitHub
parent 90f1f3be58
commit ae28c45368
20 changed files with 784 additions and 119 deletions

View File

@@ -18,6 +18,7 @@
#include "runtime/mem_pool.h"
#include "storage/olap_type_infra.h"
#include "storage/type_traits.h"
#include "types/logical_type.h"
namespace starrocks {
@@ -51,6 +52,7 @@ Status datum_from_string(TypeInfo* type_info, Datum* dst, const std::string& str
return Status::OK();
}
/* Types that need memory allocated */
case TYPE_VARBINARY:
case TYPE_CHAR:
case TYPE_VARCHAR: {
/* Types that need memory allocated */
@@ -92,6 +94,7 @@ std::string datum_to_string(TypeInfo* type_info, const Datum& datum) {
switch (type) {
case TYPE_BOOLEAN:
return datum_to_string<TYPE_TINYINT>(type_info, datum);
case TYPE_VARBINARY:
case TYPE_CHAR:
case TYPE_VARCHAR:
return datum_to_string<TYPE_VARCHAR>(type_info, datum);

View File

@@ -14,6 +14,7 @@
#include "exprs/utility_functions.h"
#include "column/column_visitor_adapter.h"
#include "gen_cpp/FrontendService_types.h"
#include "runtime/client_cache.h"
@@ -29,7 +30,9 @@
#include <limits>
#include <random>
#include "column/binary_column.h"
#include "column/column_builder.h"
#include "column/column_helper.h"
#include "column/column_viewer.h"
#include "column/vectorized_fwd.h"
#include "common/config.h"
@@ -39,6 +42,8 @@
#include "gutil/casts.h"
#include "runtime/runtime_state.h"
#include "service/backend_options.h"
#include "storage/key_coder.h"
#include "storage/primary_key_encoder.h"
#include "types/logical_type.h"
#include "util/cidr.h"
#include "util/monotime.h"
@@ -366,6 +371,188 @@ StatusOr<ColumnPtr> UtilityFunctions::equiwidth_bucket(FunctionContext* context,
return builder.build(false);
}
// Helpers copied and adapted from storage/primary_key_encoder.cpp for order-preserving encoding
namespace detail {
inline void encode_float32(float v, std::string* dest) {
uint32_t u;
static_assert(sizeof(u) == sizeof(v), "size mismatch");
memcpy(&u, &v, sizeof(u));
// If negative, flip all bits; else flip sign bit only.
u ^= (u & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
u = starrocks::encoding_utils::to_bigendian(u);
dest->append(reinterpret_cast<const char*>(&u), sizeof(u));
}
inline void encode_float64(double v, std::string* dest) {
uint64_t u;
static_assert(sizeof(u) == sizeof(v), "size mismatch");
memcpy(&u, &v, sizeof(u));
u ^= (u & 0x8000000000000000ull) ? 0xFFFFFFFFFFFFFFFFull : 0x8000000000000000ull;
u = starrocks::encoding_utils::to_bigendian(u);
dest->append(reinterpret_cast<char*>(&u), sizeof(u));
}
struct EncoderVisitor : public ColumnVisitorAdapter<EncoderVisitor> {
bool is_last_field = false;
std::vector<std::string>* buffs;
const Buffer<uint8_t>* null_mask = nullptr; // Track null rows to skip processing
explicit EncoderVisitor() : ColumnVisitorAdapter(this) {}
// Nullable wrapper: handle null values and data together
Status do_visit(const NullableColumn& column) {
auto& nulls = column.immutable_null_column_data();
for (size_t i = 0; i < column.size(); i++) {
if (nulls[i]) {
(*buffs)[i].append("\0", 1);
} else {
(*buffs)[i].append("\1", 1);
}
}
// Set the null mask so that subsequent visitor methods can skip null rows
const Buffer<uint8_t>* original_null_mask = null_mask;
null_mask = &nulls;
// Process the underlying data column
Status status = column.data_column()->accept(this);
// Restore the original null mask
null_mask = original_null_mask;
return status;
}
// Const: forward to data
Status do_visit(const ConstColumn& column) {
for (size_t i = 0; i < column.size(); i++) {
RETURN_IF_ERROR(column.data_column()->accept(this));
}
return Status::OK();
}
// Strings/binary
template <typename T>
Status do_visit(const BinaryColumnBase<T>& column) {
for (size_t i = 0; i < column.size(); i++) {
// Skip processing for null rows
if (null_mask && (*null_mask)[i]) {
continue;
}
Slice s = column.get_slice(i);
encoding_utils::encode_slice(s, &(*buffs)[i], is_last_field);
}
return Status::OK();
}
// Fixed-length numerics
template <typename T>
Status do_visit(const FixedLengthColumn<T>& column) {
const auto& data = column.get_data();
for (size_t i = 0; i < column.size(); i++) {
// Skip processing for null rows
if (null_mask && (*null_mask)[i]) {
continue;
}
if constexpr (std::is_same_v<T, float>) {
encode_float32(data[i], &(*buffs)[i]);
} else if constexpr (std::is_same_v<T, double>) {
encode_float64(data[i], &(*buffs)[i]);
} else if constexpr (std::is_integral_v<T> && sizeof(T) <= 8) {
encoding_utils::encode_integral<T>(data[i], &(*buffs)[i]);
} else if constexpr (std::is_same_v<T, DateValue>) {
encoding_utils::encode_integral<int32_t>(data[i].julian(), &(*buffs)[i]);
} else if constexpr (std::is_same_v<T, DateTimeValue>) {
encoding_utils::encode_integral<int64_t>(data[i].to_int64(), &(*buffs)[i]);
} else if constexpr (std::is_same_v<T, TimestampValue>) {
encoding_utils::encode_integral<int64_t>(data[i].timestamp(), &(*buffs)[i]);
} else {
return Status::NotSupported(
fmt::format("encode_sort_key: unsupported argument type: {}", typeid(T).name()));
}
}
return Status::OK();
}
// Unsupported complex types: map/array/struct/json/hll/bitmap/percentile
template <typename T>
Status do_visit(const T& column) {
// Even for unsupported types we would normally honor the null mask,
// but since we return an error anyway, the null check can be skipped.
return Status::NotSupported("encode_sort_key: unsupported argument type");
}
};
} // namespace detail
// The encoding strategy of encode_sort_key is as follows:
//
// - For each input column, each row's value is encoded into a binary buffer
// using an order-preserving encoding. The encoding method depends on the
// column's type:
// * For integral types, values are converted to big-endian and, for signed
// types, the sign bit is flipped to preserve order.
// * For floating-point types, a custom encoding is used to ensure correct
// sort order.
// * For string types (Slice), a special encoding is used for non-last fields
// to escape 0x00 bytes and append a 0x00 0x00 terminator, while the last
// field is appended directly.
// * For date/time types, the internal integer representation is encoded as
// an integral value.
// * Unsupported types return an error.
//
// - For each column, a NULL marker (byte value 0x00 or 0x01) is appended for every
// row, regardless of the column's nullability, to ensure consistent encoding
// even if the column's nullability metadata is lost during processing.
//
// - After encoding each column, a separator byte (0x00) is appended between
// columns (except after the last column) to delimit fields in the composite
// key.
//
// - The result is a composite binary key for each row, which preserves the
// original sort order of the input columns when compared lexicographically.
StatusOr<ColumnPtr> UtilityFunctions::encode_sort_key(FunctionContext* context, const Columns& columns) {
const char* NOT_NULL_MARKER = "\1";
const char* COLUMN_SEPARATOR = "\0";
int num_args = columns.size();
RETURN_IF(num_args < 1, Status::InvalidArgument("encode_sort_key requires at least 1 argument"));
size_t num_rows = columns[0]->size();
auto result = ColumnBuilder<TYPE_VARBINARY>(num_rows);
std::vector<std::string> buffs(num_rows);
detail::EncoderVisitor visitor;
visitor.buffs = &buffs;
for (int j = 0; j < num_args; ++j) {
// Insert NOT_NULL markers for all rows.
// This is necessary because the function may receive columns whose nullability
// does not accurately reflect the original data source.
// For example, a nullable column that contains no null values may be converted
// to a non-nullable column during processing. To ensure consistency in the sort key
// encoding, we explicitly add NOT_NULL markers for every row.
if (!columns[j]->is_nullable()) {
for (auto& buff : buffs) {
buff.append(NOT_NULL_MARKER, 1);
}
}
visitor.is_last_field = (j + 1 == num_args);
RETURN_IF_ERROR(columns[j]->accept(&visitor));
// Add a separator between each column
if (j < num_args - 1) {
for (auto& buff : buffs) {
buff.append(COLUMN_SEPARATOR, 1);
}
}
}
for (size_t i = 0; i < num_rows; i++) {
result.append(std::move(buffs[i]));
}
return result.build(ColumnHelper::is_all_const(columns));
}
} // namespace starrocks
#include "gen_cpp/opcode/UtilityFunctions.inc"

View File

@@ -67,6 +67,9 @@ public:
// Build an equi-width histogram
DEFINE_VECTORIZED_FN(equiwidth_bucket);
// Build an order-preserving composite binary key from heterogeneous arguments
DEFINE_VECTORIZED_FN(encode_sort_key);
};
} // namespace starrocks

View File

@@ -62,6 +62,7 @@ private:
add_mapping<TYPE_DECIMALV2>();
add_mapping<TYPE_CHAR>();
add_mapping<TYPE_VARCHAR>();
add_mapping<TYPE_VARBINARY>();
add_mapping<TYPE_BOOLEAN>();
}

View File

@@ -409,4 +409,8 @@ public:
}
};
// Reuse VARCHAR's key coder behavior for VARBINARY
template <>
class KeyCoderTraits<TYPE_VARBINARY> : public KeyCoderTraits<TYPE_VARCHAR> {};
} // namespace starrocks

View File

@@ -55,58 +55,7 @@ namespace starrocks {
constexpr uint8_t SORT_KEY_NULL_FIRST_MARKER = 0x00;
constexpr uint8_t SORT_KEY_NORMAL_MARKER = 0x01;
template <class UT>
UT to_bigendian(UT v);
template <>
uint8_t to_bigendian(uint8_t v) {
return v;
}
template <>
uint16_t to_bigendian(uint16_t v) {
return BigEndian::FromHost16(v);
}
template <>
uint32_t to_bigendian(uint32_t v) {
return BigEndian::FromHost32(v);
}
template <>
uint64_t to_bigendian(uint64_t v) {
return BigEndian::FromHost64(v);
}
template <>
uint128_t to_bigendian(uint128_t v) {
return BigEndian::FromHost128(v);
}
template <class T>
void encode_integral(const T& v, std::string* dest) {
if constexpr (std::is_signed<T>::value) {
typedef typename std::make_unsigned<T>::type UT;
UT uv = v;
uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
uv = to_bigendian(uv);
dest->append(reinterpret_cast<const char*>(&uv), sizeof(uv));
} else {
T nv = to_bigendian(v);
dest->append(reinterpret_cast<const char*>(&nv), sizeof(nv));
}
}
template <class T>
void decode_integral(Slice* src, T* v) {
if constexpr (std::is_signed<T>::value) {
typedef typename std::make_unsigned<T>::type UT;
UT uv = *(UT*)(src->data);
uv = to_bigendian(uv);
uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
*v = uv;
} else {
T nv = *(T*)(src->data);
*v = to_bigendian(nv);
}
src->remove_prefix(sizeof(T));
}
using namespace encoding_utils;
template <int LEN>
static bool SSEEncodeChunk(const uint8_t** srcp, uint8_t** dstp) {
@@ -164,7 +113,8 @@ static inline void EncodeChunkLoop(const uint8_t** srcp, uint8_t** dstp, int len
}
}
inline void encode_slice(const Slice& s, std::string* dst, bool is_last) {
// Implementation of encoding_utils::encode_slice
void encoding_utils::encode_slice(const Slice& s, std::string* dst, bool is_last) {
if (is_last) {
dst->append(s.data, s.size);
} else {

View File

@@ -17,9 +17,80 @@
#include "column/field.h"
#include "column/vectorized_fwd.h"
#include "common/status.h"
#include "gutil/endian.h"
namespace starrocks {
// Utility functions for encoding various data types with order-preserving encoding
namespace encoding_utils {
// Endian conversion utilities
template <class UT>
inline UT to_bigendian(UT v) {
return v;
}
template <>
inline uint8_t to_bigendian(uint8_t v) {
return v;
}
template <>
inline uint16_t to_bigendian(uint16_t v) {
return BigEndian::FromHost16(v);
}
template <>
inline uint32_t to_bigendian(uint32_t v) {
return BigEndian::FromHost32(v);
}
template <>
inline uint64_t to_bigendian(uint64_t v) {
return BigEndian::FromHost64(v);
}
template <>
inline uint128_t to_bigendian(uint128_t v) {
return BigEndian::FromHost128(v);
}
// Integral encoding with order-preserving transformation
template <class T>
inline void encode_integral(const T& v, std::string* dest) {
if constexpr (std::is_signed<T>::value) {
typedef typename std::make_unsigned<T>::type UT;
UT uv = v;
uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
uv = to_bigendian(uv);
dest->append(reinterpret_cast<const char*>(&uv), sizeof(uv));
} else {
T nv = to_bigendian(v);
dest->append(reinterpret_cast<const char*>(&nv), sizeof(nv));
}
}
// Integral decoding with order-preserving transformation
template <class T>
inline void decode_integral(Slice* src, T* v) {
if constexpr (std::is_signed<T>::value) {
typedef typename std::make_unsigned<T>::type UT;
UT uv = *(UT*)(src->data);
uv = to_bigendian(uv);
uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
*v = uv;
} else {
T nv = *(T*)(src->data);
*v = to_bigendian(nv);
}
src->remove_prefix(sizeof(T));
}
// Slice encoding with SSE optimizations for middle fields
void encode_slice(const Slice& s, std::string* dst, bool is_last);
} // namespace encoding_utils
// Utility methods to encode composite primary key into single binary key, while
// preserving original sort order.
// Currently only bool, integral types(tinyint, smallint, int, bigint, largeint)
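
For reference, a minimal round-trip sketch using the relocated helpers. It assumes this header is included as storage/primary_key_encoder.h and that Slice (from util/slice.h) exposes a (data, size) constructor and a size member, as elsewhere in the codebase; it is a usage sketch, not part of the commit.

#include <cassert>
#include <string>
#include "storage/primary_key_encoder.h"
#include "util/slice.h"

void encode_decode_roundtrip() {
    std::string buf;
    // Sign bit is flipped and bytes stored big-endian, per encode_integral above.
    starrocks::encoding_utils::encode_integral<int64_t>(-12345, &buf);
    assert(buf.size() == sizeof(int64_t));

    starrocks::Slice s(buf.data(), buf.size());
    int64_t decoded = 0;
    starrocks::encoding_utils::decode_integral<int64_t>(&s, &decoded);
    assert(decoded == -12345);
    assert(s.size == 0); // decode_integral strips the consumed prefix
}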

View File

@@ -30,6 +30,9 @@
namespace starrocks {
// Import encoding utilities from primary_key_encoder.h
using encoding_utils::encode_integral;
Status RowStoreEncoderSimple::encode_columns_to_full_row_column(const Schema& schema, const Columns& columns,
BinaryColumn& dest) {
RETURN_IF_ERROR(is_supported(schema));

View File

@@ -19,62 +19,26 @@
#include "common/status.h"
#include "gutil/endian.h"
#include "gutil/stringprintf.h"
#include "storage/primary_key_encoder.h"
#include "types/bitmap_value.h"
namespace starrocks {
// Declarations and definitions of a template class must live in a single file, so we extract them here
template <class UT>
inline UT to_bigendian(UT v);
template <>
inline uint8_t to_bigendian(uint8_t v) {
return v;
}
template <>
inline uint16_t to_bigendian(uint16_t v) {
return BigEndian::FromHost16(v);
}
template <>
inline uint32_t to_bigendian(uint32_t v) {
return BigEndian::FromHost32(v);
}
template <>
inline uint64_t to_bigendian(uint64_t v) {
return BigEndian::FromHost64(v);
}
template <>
inline uint128_t to_bigendian(uint128_t v) {
return BigEndian::FromHost128(v);
}
template <class T>
inline size_t encode_integral(const T& v, std::string* dest) {
if constexpr (std::is_signed<T>::value) {
typedef typename std::make_unsigned<T>::type UT;
UT uv = v;
uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
uv = to_bigendian(uv);
dest->append(reinterpret_cast<const char*>(&uv), sizeof(uv));
return sizeof(uv);
} else {
T nv = to_bigendian(v);
dest->append(reinterpret_cast<const char*>(&nv), sizeof(nv));
return sizeof(nv);
}
}
// Note: to_bigendian and encode_integral functions are now available in
// storage/primary_key_encoder.h under the encoding_utils namespace
template <class T>
inline void decode_integral(Slice* src, T* v) {
if constexpr (std::is_signed<T>::value) {
typedef typename std::make_unsigned<T>::type UT;
UT uv = *(UT*)(src->data);
uv = to_bigendian(uv);
uv = encoding_utils::to_bigendian(uv);
uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
*v = uv;
} else {
T nv = *(T*)(src->data);
*v = to_bigendian(nv);
*v = encoding_utils::to_bigendian(nv);
}
src->remove_prefix(sizeof(T));
}

View File

@@ -119,6 +119,25 @@ struct ZoneMapDatum<TYPE_CHAR> : public ZoneMapDatumBase<TYPE_CHAR> {
template <>
struct ZoneMapDatum<TYPE_VARCHAR> final : public ZoneMapDatum<TYPE_CHAR> {};
template <>
struct ZoneMapDatum<TYPE_VARBINARY> final : public ZoneMapDatum<TYPE_CHAR> {
void resize_container_for_fit(TypeInfo* type_info, const void* v) override {
static const int INIT_SIZE = 64;
const Slice* slice = reinterpret_cast<const Slice*>(v);
if (slice->size > _length) {
_length = std::max<int>(BitUtil::next_power_of_two(slice->size), INIT_SIZE);
raw::stl_string_resize_uninitialized(&_value_container, _length);
value.data = _value_container.data();
value.size = 0;
}
}
void reset(TypeInfo* type_info) override {
value.data = _value_container.data();
value.size = 0;
}
};
template <LogicalType type>
struct ZoneMap {
ZoneMapDatum<type> min_value;

View File

@@ -20,8 +20,10 @@
#include "column/column_helper.h"
#include "column/column_viewer.h"
#include "common/statusor.h"
#include "exprs/function_context.h"
#include "runtime/runtime_state.h"
#include "testutil/assert.h"
#include "types/logical_type.h"
#include "util/random.h"
#include "util/time.h"
@@ -127,4 +129,168 @@ TEST_F(UtilityFunctionsTest, uuidTest) {
}
}
TEST_F(UtilityFunctionsTest, encodeSortKeyBasicOrdering) {
FunctionContext* ctx = FunctionContext::create_test_context();
auto ptr = std::unique_ptr<FunctionContext>(ctx);
// Prepare columns of different types
// int32
auto c_int = Int32Column::create();
c_int->append(1);
c_int->append(2);
c_int->append(10);
// binary
auto c_str = BinaryColumn::create();
c_str->append(Slice("a"));
c_str->append(Slice("b"));
c_str->append(Slice("c"));
// date
auto c_date = DateColumn::create();
c_date->append(DateValue::create(2021, 1, 1));
c_date->append(DateValue::create(2021, 1, 2));
c_date->append(DateValue::create(2021, 1, 10));
// timestamp
auto c_timestamp = TimestampColumn::create();
c_timestamp->append(TimestampValue::create(2021, 1, 1, 0, 0, 0, 0));
c_timestamp->append(TimestampValue::create(2021, 1, 2, 0, 0, 0, 0));
c_timestamp->append(TimestampValue::create(2021, 1, 10, 0, 0, 0, 0));
// float32
auto c_float32 = FloatColumn::create();
c_float32->append(1.0f);
c_float32->append(2.0f);
c_float32->append(10.0f);
// float64
auto c_float64 = DoubleColumn::create();
c_float64->append(1.0);
c_float64->append(2.0);
c_float64->append(10.0);
Columns cols;
cols.emplace_back(c_int);
cols.emplace_back(c_str);
cols.emplace_back(c_date);
cols.emplace_back(c_timestamp);
cols.emplace_back(c_float32);
cols.emplace_back(c_float64);
// verify each column
for (auto& col : cols) {
ASSIGN_OR_ASSERT_FAIL(ColumnPtr out, UtilityFunctions::encode_sort_key(ctx, {col}));
auto* bin = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out);
ASSERT_EQ(3, bin->size());
std::vector<Slice> keys = {bin->get_slice(0), bin->get_slice(1), bin->get_slice(2)};
ASSERT_LT(keys[0].compare(keys[1]), 0) << col->get_name();
ASSERT_LT(keys[1].compare(keys[2]), 0) << col->get_name();
}
// verify all columns
ASSIGN_OR_ASSERT_FAIL(ColumnPtr out, UtilityFunctions::encode_sort_key(ctx, cols));
auto* bin = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out);
ASSERT_EQ(3, bin->size());
// Verify lexicographic order aligns with tuple order
std::vector<Slice> keys = {bin->get_slice(0), bin->get_slice(1), bin->get_slice(2)};
ASSERT_LT(keys[0].compare(keys[1]), 0);
ASSERT_LT(keys[1].compare(keys[2]), 0);
}
TEST_F(UtilityFunctionsTest, encodeSortKeyNullHandling) {
FunctionContext* ctx = FunctionContext::create_test_context();
auto ptr = std::unique_ptr<FunctionContext>(ctx);
// Create nullable columns with mixed null and non-null values
auto c_int = NullableColumn::create(Int32Column::create(), NullColumn::create());
c_int->append_datum(Datum(int32_t(1))); // non-null
c_int->append_nulls(1); // null
c_int->append_datum(Datum(int32_t(2))); // non-null
c_int->append_nulls(1); // null
auto c_str = NullableColumn::create(BinaryColumn::create(), NullColumn::create());
c_str->append_nulls(1); // null
c_str->append_datum(Datum(Slice("z"))); // non-null
c_str->append_nulls(1); // null
c_str->append_datum(Datum(Slice("a"))); // non-null
Columns cols;
cols.emplace_back(c_int);
cols.emplace_back(c_str);
ASSIGN_OR_ASSERT_FAIL(ColumnPtr out, UtilityFunctions::encode_sort_key(ctx, cols));
auto* bin = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out);
ASSERT_EQ(4, bin->size());
// Get the sort keys for each row
Slice k0 = bin->get_slice(0); // (1,NULL) - first non-null, second null
Slice k1 = bin->get_slice(1); // (NULL,"z") - first null, second non-null
Slice k2 = bin->get_slice(2); // (2,NULL) - first non-null, second null
Slice k3 = bin->get_slice(3); // (NULL,"a") - first null, second non-null
// Verify that null values sort before non-null values
// Row 1 (NULL,"z") should sort before Row 0 (1,NULL) - null in first column takes precedence
ASSERT_LT(k1.compare(k0), 0) << "NULL should sort before non-NULL in first column";
// Row 3 (NULL,"a") should sort before Row 2 (2,NULL) - null in first column takes precedence
ASSERT_LT(k3.compare(k2), 0) << "NULL should sort before non-NULL in first column";
// Row 3 (NULL,"a") should sort before Row 0 (1,NULL) - null in first column takes precedence
ASSERT_LT(k3.compare(k0), 0) << "NULL in first column should sort before any non-NULL";
// Verify that null values only contain null markers and no underlying data encoding
// This tests the fix where null rows don't process underlying data
// The first byte should be the null marker (\0 for null, \1 for non-null)
ASSERT_EQ('\0', k1.data[0]) << "First column null marker should be \\0";
ASSERT_EQ('\1', k0.data[0]) << "First column non-null marker should be \\1";
ASSERT_EQ('\0', k3.data[0]) << "First column null marker should be \\0";
ASSERT_EQ('\1', k2.data[0]) << "First column non-null marker should be \\1";
// Verify that rows with null in the second column also carry the expected bytes.
// For rows 0 and 2 the first column is non-null, so the layout is: non-null
// marker (1 byte) + int32 encoding (4 bytes) at indices 0-4, the column
// separator (\0) at index 5, then the second column's null marker (\0) at index 6.
ASSERT_EQ('\0', k0.data[5]) << "Column separator should be \\0 for row 0";
ASSERT_EQ('\0', k0.data[6]) << "Second column null marker should be \\0 for row 0";
ASSERT_EQ('\0', k2.data[5]) << "Column separator should be \\0 for row 2";
ASSERT_EQ('\0', k2.data[6]) << "Second column null marker should be \\0 for row 2";
// Verify that the sort keys are deterministic and consistent
// Running the same operation should produce identical results
ASSIGN_OR_ASSERT_FAIL(ColumnPtr out2, UtilityFunctions::encode_sort_key(ctx, cols));
auto* bin2 = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out2);
ASSERT_EQ(4, bin2->size());
for (size_t i = 0; i < 4; i++) {
ASSERT_EQ(bin->get_slice(i).to_string(), bin2->get_slice(i).to_string())
<< "Sort keys should be deterministic for row " << i;
}
}
TEST_F(UtilityFunctionsTest, encodeSortKeyStringEscaping) {
FunctionContext* ctx = FunctionContext::create_test_context();
auto ptr = std::unique_ptr<FunctionContext>(ctx);
auto c1 = BinaryColumn::create();
auto c2 = BinaryColumn::create();
c1->append(Slice("a\0b", 3));
c1->append(Slice("a", 1));
c2->append(Slice("x", 1));
c2->append(Slice("\0", 1));
Columns cols;
cols.emplace_back(c1);
cols.emplace_back(c2);
ASSIGN_OR_ASSERT_FAIL(ColumnPtr out, UtilityFunctions::encode_sort_key(ctx, cols));
auto* bin = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out);
ASSERT_EQ(2, bin->size());
// Ensure keys are comparable and not identical
Slice k0 = bin->get_slice(0);
Slice k1 = bin->get_slice(1);
ASSERT_NE(k0.to_string(), k1.to_string());
}
} // namespace starrocks

View File

@@ -359,4 +359,83 @@ TEST_F(ColumnZoneMapTest, NormalTestCharPage) {
test_string("NormalTestCharPage", type_info);
}
// Test for varbinary
TEST_F(ColumnZoneMapTest, NormalTestVarbinaryPage) {
TabletColumn varbinary_column = create_varbinary_key(0);
TypeInfoPtr type_info = get_type_info(varbinary_column);
test_string("NormalTestVarbinaryPage", type_info);
}
// Test for varbinary with binary data
TEST_F(ColumnZoneMapTest, VarbinaryWithBinaryData) {
std::string filename = kTestDir + "/VarbinaryWithBinaryData";
TabletColumn varbinary_column = create_varbinary_key(0);
TypeInfoPtr type_info = get_type_info(varbinary_column);
auto writer = ZoneMapIndexWriter::create(type_info.get());
// Add binary data with various patterns
std::vector<std::string> binary_values1 = {
std::string("\x00\x01\x02\x03", 4), // Binary data starting with null bytes
std::string("\xFF\xFE\xFD\xFC", 4), // Binary data with high bytes
std::string("ABCD", 4), // ASCII data
std::string("\x00\x00\x00\x00", 4), // All null bytes
};
for (auto& value : binary_values1) {
Slice slice(value);
writer->add_values((const uint8_t*)&slice, 1);
}
writer->flush();
// Add more binary data with different patterns
std::vector<std::string> binary_values2 = {
std::string("\x01\x02\x03\x04", 4), std::string("\xFE\xFD\xFC\xFB", 4), std::string("EFGH", 4),
std::string("\xFF\xFF\xFF\xFF", 4), // All high bytes
};
for (auto& value : binary_values2) {
Slice slice(value);
writer->add_values((const uint8_t*)&slice, 1);
}
writer->add_nulls(1);
writer->flush();
// Add null values
writer->add_nulls(3);
writer->flush();
// Write out zone map index
ColumnIndexMetaPB index_meta;
write_file(*writer, index_meta, filename);
// Read and verify
ZoneMapIndexReader column_zone_map;
load_zone_map(column_zone_map, index_meta, filename);
ASSERT_EQ(3, column_zone_map.num_pages());
const std::vector<ZoneMapPB>& zone_maps = column_zone_map.page_zone_maps();
ASSERT_EQ(3, zone_maps.size());
// Check first page - should have min/max from binary_values1
// For binary data, comparison is byte-by-byte, so "\x00\x00\x00\x00" is min and "\xFF\xFE\xFD\xFC" is max
check_result(zone_maps[0], true, true, std::string("\x00\x00\x00\x00", 4), std::string("\xFF\xFE\xFD\xFC", 4),
false, true);
// Check second page - should have min/max from binary_values2 plus null
// "\x01\x02\x03\x04" is min and "\xFF\xFF\xFF\xFF" is max
check_result(zone_maps[1], true, true, std::string("\x01\x02\x03\x04", 4), std::string("\xFF\xFF\xFF\xFF", 4), true,
true);
// Check third page - should be all nulls
check_result(zone_maps[2], false, false, "", "", true, false);
// Check segment zonemap - should cover all data
// The segment zonemap should have the overall min/max across all pages
const auto& segment_zonemap = index_meta.zone_map_index().segment_zone_map();
check_result(segment_zonemap, true, true, std::string("\x00\x00\x00\x00", 4), std::string("\xFF\xFF\xFF\xFF", 4),
true, true);
}
} // namespace starrocks

View File

@@ -164,6 +164,18 @@ inline TabletColumn create_varchar_key(int32_t id, bool is_nullable = true, int
return column;
}
inline TabletColumn create_varbinary_key(int32_t id, bool is_nullable = true, int length = 8) {
TabletColumn column;
column.set_unique_id(id);
column.set_name(std::to_string(id));
column.set_type(TYPE_VARBINARY);
column.set_is_key(true);
column.set_is_nullable(is_nullable);
column.set_length(length);
column.set_index_length(4);
return column;
}
inline TabletColumn create_array(int32_t id, bool is_nullable = true, int length = 24) {
TabletColumn column;
column.set_unique_id(id);

View File

@@ -254,6 +254,7 @@ public class FunctionSet {
public static final String ISNOTNULL = "isnotnull";
public static final String ASSERT_TRUE = "assert_true";
public static final String HOST_NAME = "host_name";
public static final String ENCODE_SORT_KEY = "encode_sort_key";
// Aggregate functions:
public static final String APPROX_COUNT_DISTINCT = "approx_count_distinct";
public static final String APPROX_COUNT_DISTINCT_HLL_SKETCH = "approx_count_distinct_hll_sketch";

View File

@@ -821,7 +821,7 @@ public abstract class Type implements Cloneable {
return true;
}
return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType();
return !isOnlyMetricType() && !isJsonType() && !isFunctionType();
}
public boolean canGroupBy() {
@@ -839,7 +839,7 @@
}
return true;
}
return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType();
return !isOnlyMetricType() && !isJsonType() && !isFunctionType();
}
public boolean canOrderBy() {
@@ -847,8 +847,7 @@
if (isArrayType()) {
return ((ArrayType) this).getItemType().canOrderBy();
}
return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType() && !isStructType() &&
!isMapType();
return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isStructType() && !isMapType();
}
public boolean canPartitionBy() {
@@ -883,8 +882,9 @@
public boolean canDistributedBy() {
// TODO(mofei) support distributed by for JSON
// Allow VARBINARY as distribution key
return !isComplexType() && !isFloatingPointType() && !isOnlyMetricType() && !isJsonType()
&& !isFunctionType() && !isBinaryType();
&& !isFunctionType();
}
public boolean canBeWindowFunctionArgumentTypes() {

View File

@@ -435,8 +435,8 @@ public class CreateTableAnalyzer {
}
ColumnDef cd = columnDefs.get(idx);
Type t = cd.getType();
if (!(t.isBoolean() || t.isIntegerType() || t.isLargeint() || t.isVarchar() || t.isDate() ||
t.isDatetime())) {
if (!(t.isBoolean() || t.isIntegerType() || t.isLargeint() || t.isVarchar() || t.isBinaryType() ||
t.isDate() || t.isDatetime())) {
throw new SemanticException("sort key column[" + cd.getName() + "] type not supported: " + t.toSql());
}
}

View File

@@ -1131,7 +1131,7 @@ public class CreateTableTest {
}
@Test
public void testCreateVarBinaryTable() {
public void testCreateVarBinaryTable() throws Exception {
// duplicate table
ExceptionChecker.expectThrowsNoException(() -> createTable(
"create table test.varbinary_tbl\n" +
@@ -1174,20 +1174,16 @@
"distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1');"));
// failed
ExceptionChecker.expectThrowsWithMsg(AnalysisException.class,
"Invalid data type of key column 'k2': 'VARBINARY'",
() -> createTable("create table test.varbinary_tbl0\n"
createTable("create table test.varbinary_tbl00\n"
+ "(k1 int, k2 varbinary)\n"
+ "duplicate key(k1, k2)\n"
+ "distributed by hash(k1) buckets 1\n"
+ "properties('replication_num' = '1');"));
ExceptionChecker.expectThrowsWithMsg(DdlException.class,
"VARBINARY(10) column can not be distribution column",
() -> createTable("create table test.varbinary_tbl0 \n"
+ "properties('replication_num' = '1');");
createTable("create table test.varbinary_tbl01 \n"
+ "(k1 int, k2 varbinary(10) )\n"
+ "duplicate key(k1)\n"
+ "distributed by hash(k2) buckets 1\n"
+ "properties('replication_num' = '1');"));
+ "properties('replication_num' = '1');");
ExceptionChecker.expectThrowsWithMsg(DdlException.class,
"Column[j] type[VARBINARY] cannot be a range partition key",
() -> createTable("create table test.varbinary_tbl0 \n" +
@@ -1199,7 +1195,7 @@
}
@Test
public void testCreateBinaryTable() {
public void testCreateBinaryTable() throws Exception {
// duplicate table
ExceptionChecker.expectThrowsNoException(() -> createTable(
"create table test.binary_tbl\n" +
@@ -1242,20 +1238,16 @@
"distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1');"));
// failed
ExceptionChecker.expectThrowsWithMsg(AnalysisException.class,
"Invalid data type of key column 'k2': 'VARBINARY'",
() -> createTable("create table test.binary_tbl0\n"
createTable("create table test.binary_tbl01\n"
+ "(k1 int, k2 binary)\n"
+ "duplicate key(k1, k2)\n"
+ "distributed by hash(k1) buckets 1\n"
+ "properties('replication_num' = '1');"));
ExceptionChecker.expectThrowsWithMsg(DdlException.class,
"VARBINARY(10) column can not be distribution column",
() -> createTable("create table test.binary_tbl0 \n"
+ "properties('replication_num' = '1');");
createTable("create table test.binary_tbl11 \n"
+ "(k1 int, k2 binary(10) )\n"
+ "duplicate key(k1)\n"
+ "distributed by hash(k2) buckets 1\n"
+ "properties('replication_num' = '1');"));
+ "properties('replication_num' = '1');");
ExceptionChecker.expectThrowsWithMsg(DdlException.class,
"Column[j] type[VARBINARY] cannot be a range partition key",
() -> createTable("create table test.binary_tbl0 \n" +

View File

@@ -825,6 +825,7 @@ vectorized_functions = [
[100019, 'assert_true', True, False, 'BOOLEAN', ['BOOLEAN', "VARCHAR"], 'UtilityFunctions::assert_true'],
[100018, 'host_name', True, False, 'VARCHAR', [], "UtilityFunctions::host_name"],
[100020, 'get_query_profile', True, False, 'VARCHAR', ['VARCHAR'], "UtilityFunctions::get_query_profile"],
[100024, 'encode_sort_key', True, False, 'VARBINARY', ['ANY_ELEMENT', '...'], 'UtilityFunctions::encode_sort_key'],
# json string function
[110022, "get_json_int", False, False, "BIGINT", ["VARCHAR", "VARCHAR"], "JsonFunctions::get_json_bigint",

View File

@@ -0,0 +1,123 @@
-- name: test_encode_sort_key_json
CREATE DATABASE test_encode_sort_key_json;
-- result:
-- !result
USE test_encode_sort_key_json;
-- result:
-- !result
CREATE TABLE `json_test_table` (
`id` int(11) NOT NULL COMMENT "",
`json_data` json ,
`json_array` json ,
`json_nested` json,
`sort_key` varbinary(1024) AS (
encode_sort_key(
get_json_int(json_data, '$.age'),
get_json_string(json_data, '$.name'),
get_json_string(json_data, '$.city'),
get_json_string(json_array, '$[0]'),
get_json_double(json_nested, '$.user.profile.score')
)
) COMMENT "Auto-generated sort key from extracted JSON fields"
) ENGINE=OLAP
DISTRIBUTED BY HASH(sort_key) BUCKETS 2
ORDER BY (sort_key)
PROPERTIES ( "replication_num" = "1");
-- result:
-- !result
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(1, parse_json('{"name": "Alice", "age": 25, "city": "New York"}'),
parse_json('["apple", "banana", "cherry"]'),
parse_json('{"user": {"id": 101, "profile": {"verified": true, "score": 95.5}}}')),
(2, parse_json('{"name": "Bob", "age": 30, "city": "Los Angeles"}'),
parse_json('["orange", "grape"]'),
parse_json('{"user": {"id": 102, "profile": {"verified": false, "score": 87.2}}}')),
(3, parse_json('{"name": "Charlie", "age": 28, "city": "Chicago"}'),
parse_json('["mango", "pineapple", "kiwi", "strawberry"]'),
parse_json('{"user": {"id": 103, "profile": {"verified": true, "score": 92.8}}}')),
(4, parse_json('{"name": "Diana", "age": 22, "city": "Miami"}'),
parse_json('["pear"]'),
parse_json('{"user": {"id": 104, "profile": {"verified": true, "score": 89.1}}}')),
(5, parse_json('{"name": "Eve", "age": 35, "city": "Seattle"}'),
parse_json('["blueberry", "raspberry", "blackberry"]'),
parse_json('{"user": {"id": 105, "profile": {"verified": false, "score": 78.9}}}'));
-- result:
-- !result
SELECT id,
json_data,
get_json_int(json_data, '$.age') as age,
get_json_string(json_data, '$.name') as name,
get_json_string(json_data, '$.city') as city,
get_json_string(json_array, '$[0]') as first_fruit,
get_json_double(json_nested, '$.user.profile.score') as score,
sort_key,
length(sort_key) as sort_key_length
FROM json_test_table
ORDER BY id;
-- result:
1 {"age": 25, "city": "New York", "name": "Alice"} 25 Alice New York apple 95.5 b'\x01\x80\x00\x00\x00\x00\x00\x00\x19\x00\x01Alice\x00\x00\x00\x01New York\x00\x00\x00\x01apple\x00\x00\x00\x01\xc0W\xe0\x00\x00\x00\x00\x00' 49
2 {"age": 30, "city": "Los Angeles", "name": "Bob"} 30 Bob Los Angeles orange 87.2 b'\x01\x80\x00\x00\x00\x00\x00\x00\x1e\x00\x01Bob\x00\x00\x00\x01Los Angeles\x00\x00\x00\x01orange\x00\x00\x00\x01\xc0U\xcc\xcc\xcc\xcc\xcc\xcd' 51
3 {"age": 28, "city": "Chicago", "name": "Charlie"} 28 Charlie Chicago mango 92.8 b'\x01\x80\x00\x00\x00\x00\x00\x00\x1c\x00\x01Charlie\x00\x00\x00\x01Chicago\x00\x00\x00\x01mango\x00\x00\x00\x01\xc0W333333' 50
4 {"age": 22, "city": "Miami", "name": "Diana"} 22 Diana Miami pear 89.1 b'\x01\x80\x00\x00\x00\x00\x00\x00\x16\x00\x01Diana\x00\x00\x00\x01Miami\x00\x00\x00\x01pear\x00\x00\x00\x01\xc0VFfffff' 45
5 {"age": 35, "city": "Seattle", "name": "Eve"} 35 Eve Seattle blueberry 78.9 b'\x01\x80\x00\x00\x00\x00\x00\x00#\x00\x01Eve\x00\x00\x00\x01Seattle\x00\x00\x00\x01blueberry\x00\x00\x00\x01\xc0S\xb9\x99\x99\x99\x99\x9a' 50
-- !result
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;
-- result:
4 {"age": 22, "city": "Miami", "name": "Diana"} ["pear"]
1 {"age": 25, "city": "New York", "name": "Alice"} ["apple", "banana", "cherry"]
3 {"age": 28, "city": "Chicago", "name": "Charlie"} ["mango", "pineapple", "kiwi", "strawberry"]
2 {"age": 30, "city": "Los Angeles", "name": "Bob"} ["orange", "grape"]
5 {"age": 35, "city": "Seattle", "name": "Eve"} ["blueberry", "raspberry", "blackberry"]
-- !result
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key DESC;
-- result:
5 {"age": 35, "city": "Seattle", "name": "Eve"} ["blueberry", "raspberry", "blackberry"]
2 {"age": 30, "city": "Los Angeles", "name": "Bob"} ["orange", "grape"]
3 {"age": 28, "city": "Chicago", "name": "Charlie"} ["mango", "pineapple", "kiwi", "strawberry"]
1 {"age": 25, "city": "New York", "name": "Alice"} ["apple", "banana", "cherry"]
4 {"age": 22, "city": "Miami", "name": "Diana"} ["pear"]
-- !result
SELECT id, json_data, json_array
FROM json_test_table
WHERE sort_key > (SELECT sort_key FROM json_test_table WHERE id = 2)
ORDER BY id;
-- result:
5 {"age": 35, "city": "Seattle", "name": "Eve"} ["blueberry", "raspberry", "blackberry"]
-- !result
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(6, NULL, parse_json('["test"]'), parse_json('{"test": null}')),
(7, parse_json('{"age": 40}'), parse_json('[]'), parse_json('{"user": {"id": 106}}'));
-- result:
-- !result
SELECT id,
get_json_int(json_data, '$.age') as age
FROM json_test_table
ORDER BY id;
-- result:
1 25
2 30
3 28
4 22
5 35
6 None
7 40
-- !result
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;
-- result:
6 None ["test"]
4 {"age": 22, "city": "Miami", "name": "Diana"} ["pear"]
1 {"age": 25, "city": "New York", "name": "Alice"} ["apple", "banana", "cherry"]
3 {"age": 28, "city": "Chicago", "name": "Charlie"} ["mango", "pineapple", "kiwi", "strawberry"]
2 {"age": 30, "city": "Los Angeles", "name": "Bob"} ["orange", "grape"]
5 {"age": 35, "city": "Seattle", "name": "Eve"} ["blueberry", "raspberry", "blackberry"]
7 {"age": 40} []
-- !result
DROP DATABASE test_encode_sort_key_json;
-- result:
-- !result

View File

@@ -0,0 +1,86 @@
-- name: test_encode_sort_key_json
CREATE DATABASE test_encode_sort_key_json;
USE test_encode_sort_key_json;
-- Create a table with JSON data types and a single generated sort key column
-- The sort key extracts specific fields from JSON and combines them for efficient sorting
CREATE TABLE `json_test_table` (
`id` int(11) NOT NULL COMMENT "",
`json_data` json ,
`json_array` json ,
`json_nested` json,
`sort_key` varbinary(1024) AS (
encode_sort_key(
get_json_int(json_data, '$.age'),
get_json_string(json_data, '$.name'),
get_json_string(json_data, '$.city'),
get_json_string(json_array, '$[0]'),
get_json_double(json_nested, '$.user.profile.score')
)
) COMMENT "Auto-generated sort key from extracted JSON fields"
) ENGINE=OLAP
DISTRIBUTED BY HASH(sort_key) BUCKETS 2
ORDER BY (sort_key)
PROPERTIES ( "replication_num" = "1");
-- Insert test data with various JSON structures
-- The sort key will be automatically generated from extracted JSON fields
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(1, parse_json('{"name": "Alice", "age": 25, "city": "New York"}'),
parse_json('["apple", "banana", "cherry"]'),
parse_json('{"user": {"id": 101, "profile": {"verified": true, "score": 95.5}}}')),
(2, parse_json('{"name": "Bob", "age": 30, "city": "Los Angeles"}'),
parse_json('["orange", "grape"]'),
parse_json('{"user": {"id": 102, "profile": {"verified": false, "score": 87.2}}}')),
(3, parse_json('{"name": "Charlie", "age": 28, "city": "Chicago"}'),
parse_json('["mango", "pineapple", "kiwi", "strawberry"]'),
parse_json('{"user": {"id": 103, "profile": {"verified": true, "score": 92.8}}}')),
(4, parse_json('{"name": "Diana", "age": 22, "city": "Miami"}'),
parse_json('["pear"]'),
parse_json('{"user": {"id": 104, "profile": {"verified": true, "score": 89.1}}}')),
(5, parse_json('{"name": "Eve", "age": 35, "city": "Seattle"}'),
parse_json('["blueberry", "raspberry", "blackberry"]'),
parse_json('{"user": {"id": 105, "profile": {"verified": false, "score": 78.9}}}'));
-- Test 1: Verify that the generated sort key column is automatically populated
-- This shows how encode_sort_key extracts and combines JSON fields
SELECT id,
json_data,
get_json_int(json_data, '$.age') as age,
get_json_string(json_data, '$.name') as name,
get_json_string(json_data, '$.city') as city,
get_json_string(json_array, '$[0]') as first_fruit,
get_json_double(json_nested, '$.user.profile.score') as score,
sort_key,
length(sort_key) as sort_key_length
FROM json_test_table
ORDER BY id;
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key DESC;
SELECT id, json_data, json_array
FROM json_test_table
WHERE sort_key > (SELECT sort_key FROM json_test_table WHERE id = 2)
ORDER BY id;
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(6, NULL, parse_json('["test"]'), parse_json('{"test": null}')),
(7, parse_json('{"age": 40}'), parse_json('[]'), parse_json('{"user": {"id": 106}}'));
SELECT id,
get_json_int(json_data, '$.age') as age
FROM json_test_table
ORDER BY id;
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;
-- Clean up
DROP DATABASE test_encode_sort_key_json;