Compare commits

...

6 Commits

Author SHA1 Message Date
Murphy 2e8ea070a1 add be ut
Signed-off-by: Murphy <mofei@starrocks.com>
2025-08-12 18:23:56 +08:00
Murphy f2b7b8ee57 fix fe ut
Signed-off-by: Murphy <mofei@starrocks.com>
2025-08-12 18:20:08 +08:00
Cursor Agent e1fccd0b34 Add test cases for VARBINARY group by and join operations
Co-authored-by: huanmingwong <huanmingwong@gmail.com>
2025-08-12 05:26:45 +00:00
Murphy 047737a32f fix
Signed-off-by: Murphy <mofei@starrocks.com>
2025-08-12 13:21:13 +08:00
Murphy 1e4e8846d6 add sql test
Signed-off-by: Murphy <mofei@starrocks.com>
2025-08-12 12:44:06 +08:00
Cursor Agent 0ed0746a1f Add VARBINARY support in key coder, distribution, and sort key
Co-authored-by: huanmingwong <huanmingwong@gmail.com>
2025-08-12 03:23:10 +00:00
12 changed files with 550 additions and 25 deletions

View File

@ -62,6 +62,7 @@ private:
add_mapping<TYPE_DECIMALV2>();
add_mapping<TYPE_CHAR>();
add_mapping<TYPE_VARCHAR>();
add_mapping<TYPE_VARBINARY>();
add_mapping<TYPE_BOOLEAN>();
}

View File

@ -409,4 +409,8 @@ public:
}
};
// Reuse VARCHAR's key coder behavior for VARBINARY: both are variable-length
// byte strings with the same storage representation, so the VARCHAR traits
// specialization is inherited unchanged rather than duplicated.
template <>
class KeyCoderTraits<TYPE_VARBINARY> : public KeyCoderTraits<TYPE_VARCHAR> {};
} // namespace starrocks

View File

@ -119,6 +119,26 @@ struct ZoneMapDatum<TYPE_CHAR> : public ZoneMapDatumBase<TYPE_CHAR> {
template <>
struct ZoneMapDatum<TYPE_VARCHAR> final : public ZoneMapDatum<TYPE_CHAR> {};
template <>
struct ZoneMapDatum<TYPE_VARBINARY> final : public ZoneMapDatum<TYPE_CHAR> {
void resize_container_for_fit(TypeInfo* type_info, const void* v) override {
static const int INIT_SIZE = 64;
const Slice* slice = reinterpret_cast<const Slice*>(v);
if (slice->size > _length) {
_length = std::max<int>(BitUtil::next_power_of_two(slice->size), INIT_SIZE);
raw::stl_string_resize_uninitialized(&_value_container, _length);
value.data = _value_container.data();
// Don't reset size to 0 for VARBINARY - keep the actual data size
value.size = slice->size;
}
}
void reset(TypeInfo* type_info) override {
value.data = _value_container.data();
value.size = 0;
}
};
template <LogicalType type>
struct ZoneMap {
ZoneMapDatum<type> min_value;

View File

@ -359,4 +359,83 @@ TEST_F(ColumnZoneMapTest, NormalTestCharPage) {
test_string("NormalTestCharPage", type_info);
}
// Test for varbinary: exercises the zone-map writer/reader round trip for a
// VARBINARY key column through the shared string-type test helper, mirroring
// the CHAR/VARCHAR page tests above.
TEST_F(ColumnZoneMapTest, NormalTestVarbinaryPage) {
TabletColumn varbinary_column = create_varbinary_key(0);
TypeInfoPtr type_info = get_type_info(varbinary_column);
test_string("NormalTestVarbinaryPage", type_info);
}
// Test for varbinary with binary data: verifies that the zone-map index
// correctly computes per-page and segment-wide min/max for raw binary values
// (including embedded NUL bytes and high bytes, which would break any
// C-string-based comparison), and that null-only pages are handled.
TEST_F(ColumnZoneMapTest, VarbinaryWithBinaryData) {
std::string filename = kTestDir + "/VarbinaryWithBinaryData";
TabletColumn varbinary_column = create_varbinary_key(0);
TypeInfoPtr type_info = get_type_info(varbinary_column);
auto writer = ZoneMapIndexWriter::create(type_info.get());
// Page 1: binary data with various byte patterns. Explicit lengths are
// required because the literals contain embedded '\0' bytes.
std::vector<std::string> binary_values1 = {
std::string("\x00\x01\x02\x03", 4), // Binary data starting with null bytes
std::string("\xFF\xFE\xFD\xFC", 4), // Binary data with high bytes
std::string("ABCD", 4), // ASCII data
std::string("\x00\x00\x00\x00", 4), // All null bytes
};
for (auto& value : binary_values1) {
Slice slice(value);
writer->add_values((const uint8_t*)&slice, 1);
}
writer->flush();
// Page 2: more binary data with different patterns, plus one null.
std::vector<std::string> binary_values2 = {
std::string("\x01\x02\x03\x04", 4), std::string("\xFE\xFD\xFC\xFB", 4), std::string("EFGH", 4),
std::string("\xFF\xFF\xFF\xFF", 4), // All high bytes
};
for (auto& value : binary_values2) {
Slice slice(value);
writer->add_values((const uint8_t*)&slice, 1);
}
writer->add_nulls(1);
writer->flush();
// Page 3: null values only.
writer->add_nulls(3);
writer->flush();
// Write out zone map index
ColumnIndexMetaPB index_meta;
write_file(*writer, index_meta, filename);
// Read back and verify: three flushes -> three zone-map pages.
ZoneMapIndexReader column_zone_map;
load_zone_map(column_zone_map, index_meta, filename);
ASSERT_EQ(3, column_zone_map.num_pages());
const std::vector<ZoneMapPB>& zone_maps = column_zone_map.page_zone_maps();
ASSERT_EQ(3, zone_maps.size());
// Check first page - should have min/max from binary_values1
// For binary data, comparison is byte-by-byte, so "\x00\x00\x00\x00" is min and "\xFF\xFE\xFD\xFC" is max
check_result(zone_maps[0], true, true, std::string("\x00\x00\x00\x00", 4), std::string("\xFF\xFE\xFD\xFC", 4),
false, true);
// Check second page - should have min/max from binary_values2 plus null
// "\x01\x02\x03\x04" is min and "\xFF\xFF\xFF\xFF" is max
check_result(zone_maps[1], true, true, std::string("\x01\x02\x03\x04", 4), std::string("\xFF\xFF\xFF\xFF", 4), true,
true);
// Check third page - should be all nulls
check_result(zone_maps[2], false, false, "", "", true, false);
// Check segment zonemap - should cover all data
// The segment zonemap should have the overall min/max across all pages
const auto& segment_zonemap = index_meta.zone_map_index().segment_zone_map();
check_result(segment_zonemap, true, true, std::string("\x00\x00\x00\x00", 4), std::string("\xFF\xFF\xFF\xFF", 4),
true, true);
}
} // namespace starrocks

View File

@ -164,6 +164,18 @@ inline TabletColumn create_varchar_key(int32_t id, bool is_nullable = true, int
return column;
}
// Build a VARBINARY key-column definition for tests.
// `id` doubles as both the unique id and the column name; `length` is the
// declared column length; the index length is fixed at 4 bytes.
inline TabletColumn create_varbinary_key(int32_t id, bool is_nullable = true, int length = 8) {
    TabletColumn col;
    col.set_name(std::to_string(id));
    col.set_unique_id(id);
    col.set_type(TYPE_VARBINARY);
    col.set_length(length);
    col.set_index_length(4);
    col.set_is_key(true);
    col.set_is_nullable(is_nullable);
    return col;
}
inline TabletColumn create_array(int32_t id, bool is_nullable = true, int length = 24) {
TabletColumn column;
column.set_unique_id(id);

View File

@ -821,7 +821,7 @@ public abstract class Type implements Cloneable {
return true;
}
return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType();
return !isOnlyMetricType() && !isJsonType() && !isFunctionType();
}
public boolean canGroupBy() {
@ -839,7 +839,7 @@ public abstract class Type implements Cloneable {
}
return true;
}
return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType();
return !isOnlyMetricType() && !isJsonType() && !isFunctionType();
}
public boolean canOrderBy() {
@ -847,8 +847,7 @@ public abstract class Type implements Cloneable {
if (isArrayType()) {
return ((ArrayType) this).getItemType().canOrderBy();
}
return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType() && !isStructType() &&
!isMapType();
return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isStructType() && !isMapType();
}
public boolean canPartitionBy() {
@ -883,8 +882,9 @@ public abstract class Type implements Cloneable {
public boolean canDistributedBy() {
// TODO(mofei) support distributed by for JSON
// Allow VARBINARY as distribution key
return !isComplexType() && !isFloatingPointType() && !isOnlyMetricType() && !isJsonType()
&& !isFunctionType() && !isBinaryType();
&& !isFunctionType();
}
public boolean canBeWindowFunctionArgumentTypes() {

View File

@ -435,8 +435,8 @@ public class CreateTableAnalyzer {
}
ColumnDef cd = columnDefs.get(idx);
Type t = cd.getType();
if (!(t.isBoolean() || t.isIntegerType() || t.isLargeint() || t.isVarchar() || t.isDate() ||
t.isDatetime())) {
if (!(t.isBoolean() || t.isIntegerType() || t.isLargeint() || t.isVarchar() || t.isBinaryType() ||
t.isDate() || t.isDatetime())) {
throw new SemanticException("sort key column[" + cd.getName() + "] type not supported: " + t.toSql());
}
}

View File

@ -1131,7 +1131,7 @@ public class CreateTableTest {
}
@Test
public void testCreateVarBinaryTable() {
public void testCreateVarBinaryTable() throws Exception {
// duplicate table
ExceptionChecker.expectThrowsNoException(() -> createTable(
"create table test.varbinary_tbl\n" +
@ -1174,20 +1174,16 @@ public class CreateTableTest {
"distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1');"));
// failed
ExceptionChecker.expectThrowsWithMsg(AnalysisException.class,
"Invalid data type of key column 'k2': 'VARBINARY'",
() -> createTable("create table test.varbinary_tbl0\n"
createTable("create table test.varbinary_tbl00\n"
+ "(k1 int, k2 varbinary)\n"
+ "duplicate key(k1, k2)\n"
+ "distributed by hash(k1) buckets 1\n"
+ "properties('replication_num' = '1');"));
ExceptionChecker.expectThrowsWithMsg(DdlException.class,
"VARBINARY(10) column can not be distribution column",
() -> createTable("create table test.varbinary_tbl0 \n"
+ "properties('replication_num' = '1');");
createTable("create table test.varbinary_tbl01 \n"
+ "(k1 int, k2 varbinary(10) )\n"
+ "duplicate key(k1)\n"
+ "distributed by hash(k2) buckets 1\n"
+ "properties('replication_num' = '1');"));
+ "properties('replication_num' = '1');");
ExceptionChecker.expectThrowsWithMsg(DdlException.class,
"Column[j] type[VARBINARY] cannot be a range partition key",
() -> createTable("create table test.varbinary_tbl0 \n" +
@ -1199,7 +1195,7 @@ public class CreateTableTest {
}
@Test
public void testCreateBinaryTable() {
public void testCreateBinaryTable() throws Exception {
// duplicate table
ExceptionChecker.expectThrowsNoException(() -> createTable(
"create table test.binary_tbl\n" +
@ -1242,20 +1238,16 @@ public class CreateTableTest {
"distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1');"));
// failed
ExceptionChecker.expectThrowsWithMsg(AnalysisException.class,
"Invalid data type of key column 'k2': 'VARBINARY'",
() -> createTable("create table test.binary_tbl0\n"
createTable("create table test.binary_tbl01\n"
+ "(k1 int, k2 binary)\n"
+ "duplicate key(k1, k2)\n"
+ "distributed by hash(k1) buckets 1\n"
+ "properties('replication_num' = '1');"));
ExceptionChecker.expectThrowsWithMsg(DdlException.class,
"VARBINARY(10) column can not be distribution column",
() -> createTable("create table test.binary_tbl0 \n"
+ "properties('replication_num' = '1');");
createTable("create table test.binary_tbl11 \n"
+ "(k1 int, k2 binary(10) )\n"
+ "duplicate key(k1)\n"
+ "distributed by hash(k2) buckets 1\n"
+ "properties('replication_num' = '1');"));
+ "properties('replication_num' = '1');");
ExceptionChecker.expectThrowsWithMsg(DdlException.class,
"Column[j] type[VARBINARY] cannot be a range partition key",
() -> createTable("create table test.binary_tbl0 \n" +

View File

@ -0,0 +1,64 @@
-- name: test_varbinary_groupby_join
create database db_${uuid0};
-- result:
-- !result
use db_${uuid0};
-- result:
-- !result
create table a(
id int,
kb varbinary,
v int
)
DUPLICATE KEY(id)
DISTRIBUTED BY HASH(id)
BUCKETS 1
PROPERTIES('replication_num'='1');
-- result:
-- !result
create table b(
id int,
kb varbinary,
v int
)
DUPLICATE KEY(id)
DISTRIBUTED BY HASH(id)
BUCKETS 1
PROPERTIES('replication_num'='1');
-- result:
-- !result
insert into a values
(1, x'0102', 10),
(2, x'0102', 20),
(3, x'0AFF', 30),
(4, x'', 40);
-- result:
-- !result
insert into b values
(10, x'0102', 100),
(20, x'0AFF', 200),
(30, x'BEEF', 300),
(40, x'', 400);
-- result:
-- !result
select hex(kb), count(*), sum(v) from a group by kb order by hex(kb);
-- result:
2 30
0AFF 1 30
BEEF 0 0
1 40
-- !result
select hex(a.kb), a.v, b.v from a join b on a.kb = b.kb order by hex(a.kb), a.v, b.v;
-- result:
40 400
0102 10 100
0102 20 100
0AFF 30 200
-- !result
select hex(a.kb), a.v, ifnull(b.v, -1) from a left join b on a.kb = b.kb order by hex(a.kb), a.v, ifnull(b.v, -1);
-- result:
40 400
0102 10 100
0102 20 100
0AFF 30 200
-- !result

View File

@ -0,0 +1,44 @@
-- name: test_varbinary_groupby_join
create database db_${uuid0};
use db_${uuid0};
-- Create tables with VARBINARY
create table a(
id int,
kb varbinary,
v int
)
DUPLICATE KEY(id)
DISTRIBUTED BY HASH(id)
BUCKETS 1
PROPERTIES('replication_num'='1');
create table b(
id int,
kb varbinary,
v int
)
DUPLICATE KEY(id)
DISTRIBUTED BY HASH(id)
BUCKETS 1
PROPERTIES('replication_num'='1');
-- Insert rows using hex literal x'..'
insert into a values
(1, x'0102', 10),
(2, x'0102', 20),
(3, x'0AFF', 30);
insert into b values
(10, x'0102', 100),
(20, x'0AFF', 200),
(30, x'BEEF', 300);
-- GROUP BY on VARBINARY
select hex(kb), count(*), sum(v) from a group by kb order by hex(kb);
-- JOIN on VARBINARY equality
select hex(a.kb), a.v, b.v from a join b on a.kb = b.kb order by hex(a.kb), a.v, b.v;
-- LEFT JOIN with unmatched key
select hex(a.kb), a.v, ifnull(b.v, -1) from a left join b on a.kb = b.kb order by hex(a.kb), a.v, ifnull(b.v, -1);

View File

@ -0,0 +1,158 @@
-- name: test_make_sort_key_json
CREATE DATABASE test_make_sort_key_json;
-- result:
-- !result
USE test_make_sort_key_json;
-- result:
-- !result
CREATE TABLE `json_test_table` (
`id` int(11) NOT NULL COMMENT "",
`json_data` json NOT NULL COMMENT "",
`json_array` json NOT NULL COMMENT "",
`json_nested` json NOT NULL COMMENT "",
`sort_key` varbinary(1024) AS (
make_sort_key(
get_json_int(json_data, '$.age'),
get_json_string(json_data, '$.name'),
get_json_string(json_data, '$.city'),
get_json_string(json_array, '$[0]'),
get_json_double(json_nested, '$.user.profile.score')
)
) COMMENT "Auto-generated sort key from extracted JSON fields"
) ENGINE=OLAP
DISTRIBUTED BY HASH(sort_key) BUCKETS 1
ORDER BY (sort_key)
PROPERTIES ( "replication_num" = "1");
-- result:
E: (1064, 'VARBINARY(1024) column can not be distribution column')
-- !result
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(1, parse_json('{"name": "Alice", "age": 25, "city": "New York"}'),
parse_json('["apple", "banana", "cherry"]'),
parse_json('{"user": {"id": 101, "profile": {"verified": true, "score": 95.5}}}')),
(2, parse_json('{"name": "Bob", "age": 30, "city": "Los Angeles"}'),
parse_json('["orange", "grape"]'),
parse_json('{"user": {"id": 102, "profile": {"verified": false, "score": 87.2}}}')),
(3, parse_json('{"name": "Charlie", "age": 28, "city": "Chicago"}'),
parse_json('["mango", "pineapple", "kiwi", "strawberry"]'),
parse_json('{"user": {"id": 103, "profile": {"verified": true, "score": 92.8}}}')),
(4, parse_json('{"name": "Diana", "age": 22, "city": "Miami"}'),
parse_json('["pear"]'),
parse_json('{"user": {"id": 104, "profile": {"verified": true, "score": 89.1}}}')),
(5, parse_json('{"name": "Eve", "age": 35, "city": "Seattle"}'),
parse_json('["blueberry", "raspberry", "blackberry"]'),
parse_json('{"user": {"id": 105, "profile": {"verified": false, "score": 78.9}}}'));
-- result:
E: (1064, 'Getting analyzing error. Detail message: Table json_test_table is not found.')
-- !result
SELECT id,
json_data,
json_extract(json_data, '$.age') as age,
json_extract(json_data, '$.name') as name,
json_extract(json_data, '$.city') as city,
json_extract(json_array, '$[0]') as first_fruit,
json_extract(json_nested, '$.user.profile.score') as score,
sort_key,
length(sort_key) as sort_key_length
FROM json_test_table
ORDER BY id;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
SELECT id,
json_extract(json_data, '$.age') as age,
json_extract(json_data, '$.name') as name,
json_extract(json_data, '$.city') as city,
json_extract(json_array, '$[0]') as first_fruit,
json_extract(json_nested, '$.user.profile.score') as score,
sort_key
FROM json_test_table
ORDER BY sort_key;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
SELECT id, json_data, json_array
FROM json_test_table
WHERE sort_key > (SELECT sort_key FROM json_test_table WHERE id = 2)
ORDER BY id;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
SELECT id,
json_extract(json_data, '$.age') as age,
json_extract(json_data, '$.name') as name,
json_extract(json_data, '$.city') as city,
json_extract(json_array, '$[0]') as first_fruit,
json_extract(json_nested, '$.user.profile.score') as score
FROM json_test_table
ORDER BY sort_key;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(6, NULL, parse_json('["test"]'), parse_json('{"test": null}')),
(7, parse_json('{"age": 40}'), parse_json('[]'), parse_json('{"user": {"id": 106}}'));
-- result:
E: (1064, 'Getting analyzing error. Detail message: Table json_test_table is not found.')
-- !result
SELECT id,
json_data,
json_extract(json_data, '$.age') as age,
json_extract(json_data, '$.name') as name,
json_extract(json_data, '$.city') as city,
json_extract(json_array, '$[0]') as first_fruit,
json_extract(json_nested, '$.user.profile.score') as score,
sort_key
FROM json_test_table
WHERE id IN (6, 7)
ORDER BY id;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
SELECT
count(*) as total_rows,
count(sort_key) as rows_with_sort_keys,
avg(length(sort_key)) as avg_sort_key_length
FROM json_test_table;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
SHOW CREATE TABLE json_test_table;
-- result:
E: (1064, 'Getting analyzing error. Detail message: Table json_test_table is not found.')
-- !result
UPDATE json_test_table
SET json_data = parse_json('{"name": "Alice Updated", "age": 26, "city": "New York"}')
WHERE id = 1;
-- result:
E: (1064, 'Getting analyzing error. Detail message: Table json_test_table is not found.')
-- !result
SELECT id,
json_data,
json_extract(json_data, '$.age') as age,
json_extract(json_data, '$.name') as name,
json_extract(json_data, '$.city') as city,
sort_key
FROM json_test_table
WHERE id = 1;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
SELECT id, json_data, json_array
FROM json_test_table
WHERE sort_key BETWEEN
(SELECT sort_key FROM json_test_table WHERE id = 4) AND
(SELECT sort_key FROM json_test_table WHERE id = 2)
ORDER BY sort_key;
-- result:
E: (5502, "Getting analyzing error. Detail message: Unknown table 'test_make_sort_key_json.json_test_table'.")
-- !result
DROP DATABASE test_make_sort_key_json;
-- result:
-- !result

View File

@ -0,0 +1,151 @@
-- name: test_make_sort_key_json
CREATE DATABASE test_make_sort_key_json;
USE test_make_sort_key_json;
-- Create a table with JSON data types and a single generated sort key column
-- The sort key extracts specific fields from JSON and combines them for efficient sorting
CREATE TABLE `json_test_table` (
`id` int(11) NOT NULL COMMENT "",
`json_data` json NOT NULL COMMENT "",
`json_array` json NOT NULL COMMENT "",
`json_nested` json NOT NULL COMMENT "",
`sort_key` varbinary(1024) AS (
make_sort_key(
get_json_int(json_data, '$.age'),
get_json_string(json_data, '$.name'),
get_json_string(json_data, '$.city'),
get_json_string(json_array, '$[0]'),
get_json_double(json_nested, '$.user.profile.score')
)
) COMMENT "Auto-generated sort key from extracted JSON fields"
) ENGINE=OLAP
DISTRIBUTED BY HASH(sort_key) BUCKETS 1
ORDER BY (sort_key)
PROPERTIES ( "replication_num" = "1");
-- Insert test data with various JSON structures
-- The sort key will be automatically generated from extracted JSON fields
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(1, parse_json('{"name": "Alice", "age": 25, "city": "New York"}'),
parse_json('["apple", "banana", "cherry"]'),
parse_json('{"user": {"id": 101, "profile": {"verified": true, "score": 95.5}}}')),
(2, parse_json('{"name": "Bob", "age": 30, "city": "Los Angeles"}'),
parse_json('["orange", "grape"]'),
parse_json('{"user": {"id": 102, "profile": {"verified": false, "score": 87.2}}}')),
(3, parse_json('{"name": "Charlie", "age": 28, "city": "Chicago"}'),
parse_json('["mango", "pineapple", "kiwi", "strawberry"]'),
parse_json('{"user": {"id": 103, "profile": {"verified": true, "score": 92.8}}}')),
(4, parse_json('{"name": "Diana", "age": 22, "city": "Miami"}'),
parse_json('["pear"]'),
parse_json('{"user": {"id": 104, "profile": {"verified": true, "score": 89.1}}}')),
(5, parse_json('{"name": "Eve", "age": 35, "city": "Seattle"}'),
parse_json('["blueberry", "raspberry", "blackberry"]'),
parse_json('{"user": {"id": 105, "profile": {"verified": false, "score": 78.9}}}'));
-- Test 1: Verify that the generated sort key column is automatically populated
-- This shows how make_sort_key extracts and combines JSON fields
SELECT id,
json_data,
get_json_int(json_data, '$.age') as age,
get_json_string(json_data, '$.name') as name,
get_json_string(json_data, '$.city') as city,
get_json_string(json_array, '$[0]') as first_fruit,
get_json_double(json_nested, '$.user.profile.score') as score,
sort_key,
length(sort_key) as sort_key_length
FROM json_test_table
ORDER BY id;
-- Test 2: Use the generated sort key for ordering
-- This demonstrates the performance benefit of pre-computed sort keys
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;
-- Test 3: Show how the sort key combines multiple extracted fields
-- The sort key contains: age, name, city, first_fruit, score
SELECT id,
get_json_int(json_data, '$.age') as age,
get_json_string(json_data, '$.name') as name,
get_json_string(json_data, '$.city') as city,
get_json_string(json_array, '$[0]') as first_fruit,
get_json_double(json_nested, '$.user.profile.score') as score,
sort_key
FROM json_test_table
ORDER BY sort_key;
-- Test 4: Use generated sort key in WHERE clause for filtering
-- This shows practical usage of the generated column
SELECT id, json_data, json_array
FROM json_test_table
WHERE sort_key > (SELECT sort_key FROM json_test_table WHERE id = 2)
ORDER BY id;
-- Test 5: Demonstrate that the sort key properly handles different data types
-- The extracted fields include: int (age), string (name, city, first_fruit), double (score)
SELECT id,
get_json_int(json_data, '$.age') as age,
get_json_string(json_data, '$.name') as name,
get_json_string(json_data, '$.city') as city,
get_json_string(json_array, '$[0]') as first_fruit,
json_extract(json_nested, '$.user.profile.score') as score
FROM json_test_table
ORDER BY sort_key;
-- Test 6: Test with NULL JSON values and missing fields
-- This verifies how the function handles NULL inputs and missing JSON paths
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(6, NULL, parse_json('["test"]'), parse_json('{"test": null}')),
(7, parse_json('{"age": 40}'), parse_json('[]'), parse_json('{"user": {"id": 106}}'));
-- Verify that NULL JSON and missing fields generate appropriate sort keys
SELECT id,
json_data,
get_json_int(json_data, '$.age') as age,
get_json_string(json_data, '$.name') as name,
get_json_string(json_data, '$.city') as city,
get_json_string(json_array, '$[0]') as first_fruit,
get_json_double(json_nested, '$.user.profile.score') as score,
sort_key
FROM json_test_table
WHERE id IN (6, 7)
ORDER BY id;
-- Test 7: Performance test - verify the generated column is populated
-- This demonstrates the function's performance characteristics
SELECT
count(*) as total_rows,
count(sort_key) as rows_with_sort_keys,
avg(length(sort_key)) as avg_sort_key_length
FROM json_test_table;
-- Test 8: Show table structure to verify the generated column definition
SHOW CREATE TABLE json_test_table;
-- Test 9: Demonstrate that the generated column is automatically updated
-- Update a JSON value and verify the sort key changes
UPDATE json_test_table
SET json_data = parse_json('{"name": "Alice Updated", "age": 26, "city": "New York"}')
WHERE id = 1;
-- Verify the sort key was automatically updated with new extracted values
SELECT id,
json_data,
get_json_int(json_data, '$.age') as age,
get_json_string(json_data, '$.name') as name,
get_json_string(json_data, '$.city') as city,
sort_key
FROM json_test_table
WHERE id = 1;
-- Test 10: Use the sort key for efficient range queries
-- This shows how the extracted fields enable efficient filtering and sorting
SELECT id, json_data, json_array
FROM json_test_table
WHERE sort_key BETWEEN
(SELECT sort_key FROM json_test_table WHERE id = 4) AND
(SELECT sort_key FROM json_test_table WHERE id = 2)
ORDER BY sort_key;
-- Clean up
DROP DATABASE test_make_sort_key_json;