Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
This commit is contained in:
parent
90f1f3be58
commit
ae28c45368
@@ -18,6 +18,7 @@
#include "runtime/mem_pool.h"
#include "storage/olap_type_infra.h"
#include "storage/type_traits.h"
#include "types/logical_type.h"

namespace starrocks {
@@ -51,6 +52,7 @@ Status datum_from_string(TypeInfo* type_info, Datum* dst, const std::string& str
        return Status::OK();
    }
    /* Types that need memory allocated */
+   case TYPE_VARBINARY:
    case TYPE_CHAR:
    case TYPE_VARCHAR: {
        /* Types that need memory allocated */
@@ -92,6 +94,7 @@ std::string datum_to_string(TypeInfo* type_info, const Datum& datum) {
    switch (type) {
    case TYPE_BOOLEAN:
        return datum_to_string<TYPE_TINYINT>(type_info, datum);
+   case TYPE_VARBINARY:
    case TYPE_CHAR:
    case TYPE_VARCHAR:
        return datum_to_string<TYPE_VARCHAR>(type_info, datum);
@@ -14,6 +14,7 @@

#include "exprs/utility_functions.h"

#include "column/column_visitor_adapter.h"
#include "gen_cpp/FrontendService_types.h"
#include "runtime/client_cache.h"
@@ -29,7 +30,9 @@
#include <limits>
#include <random>

#include "column/binary_column.h"
#include "column/column_builder.h"
#include "column/column_helper.h"
#include "column/column_viewer.h"
#include "column/vectorized_fwd.h"
#include "common/config.h"
@@ -39,6 +42,8 @@
#include "gutil/casts.h"
#include "runtime/runtime_state.h"
#include "service/backend_options.h"
#include "storage/key_coder.h"
#include "storage/primary_key_encoder.h"
#include "types/logical_type.h"
#include "util/cidr.h"
#include "util/monotime.h"
@@ -366,6 +371,188 @@ StatusOr<ColumnPtr> UtilityFunctions::equiwidth_bucket(FunctionContext* context,
    return builder.build(false);
}

// Helpers copied and adapted from storage/primary_key_encoder.cpp for order-preserving encoding
namespace detail {

inline void encode_float32(float v, std::string* dest) {
    uint32_t u;
    static_assert(sizeof(u) == sizeof(v), "size mismatch");
    memcpy(&u, &v, sizeof(u));
    // If negative, flip all bits; else flip the sign bit only.
    u ^= (u & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
    u = starrocks::encoding_utils::to_bigendian(u);
    dest->append(reinterpret_cast<const char*>(&u), sizeof(u));
}

inline void encode_float64(double v, std::string* dest) {
    uint64_t u;
    static_assert(sizeof(u) == sizeof(v), "size mismatch");
    memcpy(&u, &v, sizeof(u));
    u ^= (u & 0x8000000000000000ull) ? 0xFFFFFFFFFFFFFFFFull : 0x8000000000000000ull;
    u = starrocks::encoding_utils::to_bigendian(u);
    dest->append(reinterpret_cast<const char*>(&u), sizeof(u));
}
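For intuition, here is a minimal standalone sketch (not part of this commit; it assumes IEEE-754 floats and compares the transformed words as plain unsigned integers) showing why the sign-flip transform above preserves order:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Same transform as encode_float32, without the big-endian append:
// negative floats get all bits flipped, non-negative ones only the sign bit.
static uint32_t float_key(float v) {
    uint32_t u;
    std::memcpy(&u, &v, sizeof(u));
    u ^= (u & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
    return u;
}

int main() {
    assert(float_key(-2.5f) < float_key(-1.0f)); // negatives ordered correctly
    assert(float_key(-1.0f) < float_key(0.0f));  // negatives sort below zero
    assert(float_key(1.0f) < float_key(10.0f));  // positives keep their order
    return 0;
}
```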

struct EncoderVisitor : public ColumnVisitorAdapter<EncoderVisitor> {
    bool is_last_field = false;
    std::vector<std::string>* buffs;
    const Buffer<uint8_t>* null_mask = nullptr; // Track null rows so they can be skipped

    explicit EncoderVisitor() : ColumnVisitorAdapter(this) {}

    // Nullable wrapper: handle null markers and data together
    Status do_visit(const NullableColumn& column) {
        auto& nulls = column.immutable_null_column_data();

        for (size_t i = 0; i < column.size(); i++) {
            if (nulls[i]) {
                (*buffs)[i].append("\0", 1);
            } else {
                (*buffs)[i].append("\1", 1);
            }
        }

        // Set the null mask so that subsequent visitor methods can skip null rows
        const Buffer<uint8_t>* original_null_mask = null_mask;
        null_mask = &nulls;

        // Process the underlying data column
        Status status = column.data_column()->accept(this);

        // Restore the original null mask
        null_mask = original_null_mask;

        return status;
    }

    // Const: forward to the underlying data column
    Status do_visit(const ConstColumn& column) {
        for (size_t i = 0; i < column.size(); i++) {
            RETURN_IF_ERROR(column.data_column()->accept(this));
        }
        return Status::OK();
    }

    // Strings/binary
    template <typename T>
    Status do_visit(const BinaryColumnBase<T>& column) {
        for (size_t i = 0; i < column.size(); i++) {
            // Skip processing for null rows
            if (null_mask && (*null_mask)[i]) {
                continue;
            }
            Slice s = column.get_slice(i);
            encoding_utils::encode_slice(s, &(*buffs)[i], is_last_field);
        }
        return Status::OK();
    }

    // Fixed-length numerics
    template <typename T>
    Status do_visit(const FixedLengthColumn<T>& column) {
        const auto& data = column.get_data();
        for (size_t i = 0; i < column.size(); i++) {
            // Skip processing for null rows
            if (null_mask && (*null_mask)[i]) {
                continue;
            }
            if constexpr (std::is_same_v<T, float>) {
                encode_float32(data[i], &(*buffs)[i]);
            } else if constexpr (std::is_same_v<T, double>) {
                encode_float64(data[i], &(*buffs)[i]);
            } else if constexpr (std::is_integral_v<T> && sizeof(T) <= 8) {
                encoding_utils::encode_integral<T>(data[i], &(*buffs)[i]);
            } else if constexpr (std::is_same_v<T, DateValue>) {
                encoding_utils::encode_integral<int32_t>(data[i].julian(), &(*buffs)[i]);
            } else if constexpr (std::is_same_v<T, DateTimeValue>) {
                encoding_utils::encode_integral<int64_t>(data[i].to_int64(), &(*buffs)[i]);
            } else if constexpr (std::is_same_v<T, TimestampValue>) {
                encoding_utils::encode_integral<int64_t>(data[i].timestamp(), &(*buffs)[i]);
            } else {
                return Status::NotSupported(
                        fmt::format("encode_sort_key: unsupported argument type: {}", typeid(T).name()));
            }
        }
        return Status::OK();
    }

    // Unsupported complex types: map/array/struct/json/hll/bitmap/percentile
    template <typename T>
    Status do_visit(const T& column) {
        // Even for unsupported types the null mask could be consulted for consistency,
        // but since an error is returned anyway, the null check is skipped.
        return Status::NotSupported("encode_sort_key: unsupported argument type");
    }
};
} // namespace detail

// The encoding strategy of encode_sort_key is as follows:
//
// - For each input column, each row's value is encoded into a binary buffer
//   using an order-preserving encoding. The encoding method depends on the
//   column's type:
//   * For integral types, values are converted to big-endian and, for signed
//     types, the sign bit is flipped to preserve order.
//   * For floating-point types, a custom encoding is used to ensure correct
//     sort order.
//   * For string types (Slice), a special encoding is used for non-last fields
//     to escape 0x00 bytes and append a 0x00 0x00 terminator, while the last
//     field is appended directly.
//   * For date/time types, the internal integer representation is encoded as
//     an integral value.
//   * Unsupported types return an error.
//
// - For each column, a NULL marker (byte value 0x00 or 0x01) is appended for every
//   row, regardless of the column's nullability, to ensure consistent encoding
//   even if the column's nullability metadata is lost during processing.
//
// - After encoding each column, a separator byte (0x00) is appended between
//   columns (except after the last column) to delimit fields in the composite
//   key.
//
// - The result is a composite binary key for each row, which preserves the
//   original sort order of the input columns when compared lexicographically.
StatusOr<ColumnPtr> UtilityFunctions::encode_sort_key(FunctionContext* context, const Columns& columns) {
    const char* NOT_NULL_MARKER = "\1";
    const char* COLUMN_SEPARATOR = "\0";
    int num_args = columns.size();
    RETURN_IF(num_args < 1, Status::InvalidArgument("encode_sort_key requires at least 1 argument"));

    size_t num_rows = columns[0]->size();
    auto result = ColumnBuilder<TYPE_VARBINARY>(num_rows);

    std::vector<std::string> buffs(num_rows);
    detail::EncoderVisitor visitor;
    visitor.buffs = &buffs;
    for (int j = 0; j < num_args; ++j) {
        // Insert NOT_NULL markers for all rows.
        // This is necessary because the function may receive columns whose nullability
        // does not accurately reflect the original data source.
        // For example, a nullable column that contains no null values may be converted
        // to a non-nullable column during processing. To ensure consistency in the sort key
        // encoding, we explicitly add NOT_NULL markers for every row.
        if (!columns[j]->is_nullable()) {
            for (auto& buff : buffs) {
                buff.append(NOT_NULL_MARKER, 1);
            }
        }
        visitor.is_last_field = (j + 1 == num_args);
        RETURN_IF_ERROR(columns[j]->accept(&visitor));

        // Add a separator between each pair of adjacent columns
        if (j < num_args - 1) {
            for (auto& buff : buffs) {
                buff.append(COLUMN_SEPARATOR, 1);
            }
        }
    }

    for (size_t i = 0; i < num_rows; i++) {
        result.append(std::move(buffs[i]));
    }
    return result.build(ColumnHelper::is_all_const(columns));
}
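To make the key layout concrete, the following standalone sketch (not from the diff; the helper names are hypothetical, and it assumes a two-argument call over a non-null INT column and a last-field VARCHAR column) builds one row's key exactly as the comment above describes: NOT_NULL marker, sign-flipped big-endian int32, 0x00 separator, marker, then the raw last field.

```cpp
#include <cassert>
#include <cstdint>
#include <string>

// Hypothetical helpers (illustration only), mirroring the documented layout.
static void put_int32_key(std::string* key, int32_t v) {
    uint32_t u = static_cast<uint32_t>(v) ^ 0x80000000u; // flip sign bit
    for (int shift = 24; shift >= 0; shift -= 8) {       // big-endian bytes
        key->push_back(static_cast<char>((u >> shift) & 0xFF));
    }
}

static std::string make_key(int32_t a, const std::string& b) {
    std::string key;
    key.append("\1", 1); // NOT_NULL marker, column 0
    put_int32_key(&key, a);
    key.append("\0", 1); // column separator
    key.append("\1", 1); // NOT_NULL marker, column 1
    key += b;            // last field appended verbatim
    return key;
}

int main() {
    assert(make_key(1, "b") < make_key(2, "a")); // first column dominates
    assert(make_key(1, "a") < make_key(1, "b")); // tie broken by second column
    return 0;
}
```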

} // namespace starrocks

#include "gen_cpp/opcode/UtilityFunctions.inc"
@@ -67,6 +67,9 @@ public:

    // Build an equi-width histogram
    DEFINE_VECTORIZED_FN(equiwidth_bucket);

    // Build an order-preserving composite binary key from heterogeneous arguments
    DEFINE_VECTORIZED_FN(encode_sort_key);
};

} // namespace starrocks
@@ -62,6 +62,7 @@ private:
        add_mapping<TYPE_DECIMALV2>();
        add_mapping<TYPE_CHAR>();
        add_mapping<TYPE_VARCHAR>();
+       add_mapping<TYPE_VARBINARY>();
        add_mapping<TYPE_BOOLEAN>();
    }
@@ -409,4 +409,8 @@ public:
    }
};

// Reuse VARCHAR's key coder behavior for VARBINARY
template <>
class KeyCoderTraits<TYPE_VARBINARY> : public KeyCoderTraits<TYPE_VARCHAR> {};

} // namespace starrocks
@@ -55,58 +55,7 @@ namespace starrocks {
constexpr uint8_t SORT_KEY_NULL_FIRST_MARKER = 0x00;
constexpr uint8_t SORT_KEY_NORMAL_MARKER = 0x01;

-template <class UT>
-UT to_bigendian(UT v);
-
-template <>
-uint8_t to_bigendian(uint8_t v) {
-    return v;
-}
-template <>
-uint16_t to_bigendian(uint16_t v) {
-    return BigEndian::FromHost16(v);
-}
-template <>
-uint32_t to_bigendian(uint32_t v) {
-    return BigEndian::FromHost32(v);
-}
-template <>
-uint64_t to_bigendian(uint64_t v) {
-    return BigEndian::FromHost64(v);
-}
-template <>
-uint128_t to_bigendian(uint128_t v) {
-    return BigEndian::FromHost128(v);
-}
-
-template <class T>
-void encode_integral(const T& v, std::string* dest) {
-    if constexpr (std::is_signed<T>::value) {
-        typedef typename std::make_unsigned<T>::type UT;
-        UT uv = v;
-        uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
-        uv = to_bigendian(uv);
-        dest->append(reinterpret_cast<const char*>(&uv), sizeof(uv));
-    } else {
-        T nv = to_bigendian(v);
-        dest->append(reinterpret_cast<const char*>(&nv), sizeof(nv));
-    }
-}
-
-template <class T>
-void decode_integral(Slice* src, T* v) {
-    if constexpr (std::is_signed<T>::value) {
-        typedef typename std::make_unsigned<T>::type UT;
-        UT uv = *(UT*)(src->data);
-        uv = to_bigendian(uv);
-        uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
-        *v = uv;
-    } else {
-        T nv = *(T*)(src->data);
-        *v = to_bigendian(nv);
-    }
-    src->remove_prefix(sizeof(T));
-}
+using namespace encoding_utils;

template <int LEN>
static bool SSEEncodeChunk(const uint8_t** srcp, uint8_t** dstp) {
@@ -164,7 +113,8 @@ static inline void EncodeChunkLoop(const uint8_t** srcp, uint8_t** dstp, int len
    }
}

-inline void encode_slice(const Slice& s, std::string* dst, bool is_last) {
+// Implementation of encoding_utils::encode_slice
+void encoding_utils::encode_slice(const Slice& s, std::string* dst, bool is_last) {
    if (is_last) {
        dst->append(s.data, s.size);
    } else {
@@ -17,9 +17,80 @@
#include "column/field.h"
#include "column/vectorized_fwd.h"
#include "common/status.h"
#include "gutil/endian.h"

namespace starrocks {

// Utility functions for encoding various data types with an order-preserving encoding
namespace encoding_utils {

// Endian conversion utilities
template <class UT>
inline UT to_bigendian(UT v) {
    return v;
}

template <>
inline uint8_t to_bigendian(uint8_t v) {
    return v;
}

template <>
inline uint16_t to_bigendian(uint16_t v) {
    return BigEndian::FromHost16(v);
}

template <>
inline uint32_t to_bigendian(uint32_t v) {
    return BigEndian::FromHost32(v);
}

template <>
inline uint64_t to_bigendian(uint64_t v) {
    return BigEndian::FromHost64(v);
}

template <>
inline uint128_t to_bigendian(uint128_t v) {
    return BigEndian::FromHost128(v);
}

// Integral encoding with an order-preserving transformation
template <class T>
inline void encode_integral(const T& v, std::string* dest) {
    if constexpr (std::is_signed<T>::value) {
        typedef typename std::make_unsigned<T>::type UT;
        UT uv = v;
        uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
        uv = to_bigendian(uv);
        dest->append(reinterpret_cast<const char*>(&uv), sizeof(uv));
    } else {
        T nv = to_bigendian(v);
        dest->append(reinterpret_cast<const char*>(&nv), sizeof(nv));
    }
}

// Integral decoding: the inverse of the order-preserving transformation
template <class T>
inline void decode_integral(Slice* src, T* v) {
    if constexpr (std::is_signed<T>::value) {
        typedef typename std::make_unsigned<T>::type UT;
        UT uv = *(UT*)(src->data);
        uv = to_bigendian(uv);
        uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
        *v = uv;
    } else {
        T nv = *(T*)(src->data);
        *v = to_bigendian(nv);
    }
    src->remove_prefix(sizeof(T));
}
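A usage sketch of the relocated API (an illustration, not part of the diff; it assumes the surrounding starrocks headers and the starrocks::Slice type are available): encoding two int32 values yields byte strings that sort in numeric order, and decode_integral restores the original value while consuming the slice.

```cpp
#include <cassert>
#include <string>
// assumes: #include "storage/primary_key_encoder.h"

void encoding_utils_roundtrip_example() {
    std::string ka, kb;
    starrocks::encoding_utils::encode_integral<int32_t>(-5, &ka);
    starrocks::encoding_utils::encode_integral<int32_t>(7, &kb);
    assert(ka < kb); // encoded bytes compare in the same order as the values

    starrocks::Slice s(ka);
    int32_t decoded = 0;
    starrocks::encoding_utils::decode_integral<int32_t>(&s, &decoded);
    assert(decoded == -5); // byte swap and sign flip are undone exactly
}
```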

// Slice encoding with SSE optimizations for middle (non-last) fields
void encode_slice(const Slice& s, std::string* dst, bool is_last);

} // namespace encoding_utils
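Only the declaration lives in this header; the definition (with its SSE fast path) stays in primary_key_encoder.cpp. As a reference for the contract described in the encode_sort_key strategy comment, a minimal scalar sketch of the escaping could look like this (an illustration under that stated contract, not the committed implementation):

```cpp
inline void encode_slice_scalar_sketch(const Slice& s, std::string* dst, bool is_last) {
    if (is_last) {
        dst->append(s.data, s.size); // last field: raw bytes, no terminator
        return;
    }
    for (size_t i = 0; i < s.size; ++i) {
        if (s.data[i] == '\0') {
            dst->append("\x00\x01", 2); // escape embedded 0x00 as 0x00 0x01
        } else {
            dst->push_back(s.data[i]);
        }
    }
    dst->append("\x00\x00", 2); // terminator: sorts below any escaped byte
}
```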

// Utility methods to encode a composite primary key into a single binary key, while
// preserving the original sort order.
// Currently only bool, integral types (tinyint, smallint, int, bigint, largeint)
@@ -30,6 +30,9 @@

namespace starrocks {

// Import encoding utilities from primary_key_encoder.h
using encoding_utils::encode_integral;

Status RowStoreEncoderSimple::encode_columns_to_full_row_column(const Schema& schema, const Columns& columns,
                                                                BinaryColumn& dest) {
    RETURN_IF_ERROR(is_supported(schema));
@@ -19,62 +19,26 @@
#include "common/status.h"
#include "gutil/endian.h"
#include "gutil/stringprintf.h"
#include "storage/primary_key_encoder.h"
#include "types/bitmap_value.h"

namespace starrocks {
// Declarations and definitions of a template class must live in one file, so we extract them here

-template <class UT>
-inline UT to_bigendian(UT v);
-
-template <>
-inline uint8_t to_bigendian(uint8_t v) {
-    return v;
-}
-template <>
-inline uint16_t to_bigendian(uint16_t v) {
-    return BigEndian::FromHost16(v);
-}
-template <>
-inline uint32_t to_bigendian(uint32_t v) {
-    return BigEndian::FromHost32(v);
-}
-template <>
-inline uint64_t to_bigendian(uint64_t v) {
-    return BigEndian::FromHost64(v);
-}
-template <>
-inline uint128_t to_bigendian(uint128_t v) {
-    return BigEndian::FromHost128(v);
-}
-
-template <class T>
-inline size_t encode_integral(const T& v, std::string* dest) {
-    if constexpr (std::is_signed<T>::value) {
-        typedef typename std::make_unsigned<T>::type UT;
-        UT uv = v;
-        uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
-        uv = to_bigendian(uv);
-        dest->append(reinterpret_cast<const char*>(&uv), sizeof(uv));
-        return sizeof(uv);
-    } else {
-        T nv = to_bigendian(v);
-        dest->append(reinterpret_cast<const char*>(&nv), sizeof(nv));
-        return sizeof(nv);
-    }
-}
+// Note: the to_bigendian and encode_integral functions are now provided by
+// storage/primary_key_encoder.h under the encoding_utils namespace

template <class T>
inline void decode_integral(Slice* src, T* v) {
    if constexpr (std::is_signed<T>::value) {
        typedef typename std::make_unsigned<T>::type UT;
        UT uv = *(UT*)(src->data);
-       uv = to_bigendian(uv);
+       uv = encoding_utils::to_bigendian(uv);
        uv ^= static_cast<UT>(1) << (sizeof(UT) * 8 - 1);
        *v = uv;
    } else {
        T nv = *(T*)(src->data);
-       *v = to_bigendian(nv);
+       *v = encoding_utils::to_bigendian(nv);
    }
    src->remove_prefix(sizeof(T));
}
@@ -119,6 +119,25 @@ struct ZoneMapDatum<TYPE_CHAR> : public ZoneMapDatumBase<TYPE_CHAR> {
template <>
struct ZoneMapDatum<TYPE_VARCHAR> final : public ZoneMapDatum<TYPE_CHAR> {};

template <>
struct ZoneMapDatum<TYPE_VARBINARY> final : public ZoneMapDatum<TYPE_CHAR> {
    void resize_container_for_fit(TypeInfo* type_info, const void* v) override {
        static const int INIT_SIZE = 64;
        const Slice* slice = reinterpret_cast<const Slice*>(v);
        if (slice->size > _length) {
            _length = std::max<int>(BitUtil::next_power_of_two(slice->size), INIT_SIZE);
            raw::stl_string_resize_uninitialized(&_value_container, _length);
            value.data = _value_container.data();
            value.size = 0;
        }
    }

    void reset(TypeInfo* type_info) override {
        value.data = _value_container.data();
        value.size = 0;
    }
};

template <LogicalType type>
struct ZoneMap {
    ZoneMapDatum<type> min_value;
@@ -20,8 +20,10 @@

#include "column/column_helper.h"
#include "column/column_viewer.h"
#include "common/statusor.h"
#include "exprs/function_context.h"
#include "runtime/runtime_state.h"
#include "testutil/assert.h"
#include "types/logical_type.h"
#include "util/random.h"
#include "util/time.h"
@@ -127,4 +129,168 @@ TEST_F(UtilityFunctionsTest, uuidTest) {
    }
}

TEST_F(UtilityFunctionsTest, encodeSortKeyBasicOrdering) {
    FunctionContext* ctx = FunctionContext::create_test_context();
    auto ptr = std::unique_ptr<FunctionContext>(ctx);

    // Prepare columns of different types
    // int32
    auto c_int = Int32Column::create();
    c_int->append(1);
    c_int->append(2);
    c_int->append(10);

    // binary
    auto c_str = BinaryColumn::create();
    c_str->append(Slice("a"));
    c_str->append(Slice("b"));
    c_str->append(Slice("c"));

    // date
    auto c_date = DateColumn::create();
    c_date->append(DateValue::create(2021, 1, 1));
    c_date->append(DateValue::create(2021, 1, 2));
    c_date->append(DateValue::create(2021, 1, 10));

    // timestamp
    auto c_timestamp = TimestampColumn::create();
    c_timestamp->append(TimestampValue::create(2021, 1, 1, 0, 0, 0, 0));
    c_timestamp->append(TimestampValue::create(2021, 1, 2, 0, 0, 0, 0));
    c_timestamp->append(TimestampValue::create(2021, 1, 10, 0, 0, 0, 0));

    // float32
    auto c_float32 = FloatColumn::create();
    c_float32->append(1.0f);
    c_float32->append(2.0f);
    c_float32->append(10.0f);

    // float64
    auto c_float64 = DoubleColumn::create();
    c_float64->append(1.0);
    c_float64->append(2.0);
    c_float64->append(10.0);

    Columns cols;
    cols.emplace_back(c_int);
    cols.emplace_back(c_str);
    cols.emplace_back(c_date);
    cols.emplace_back(c_timestamp);
    cols.emplace_back(c_float32);
    cols.emplace_back(c_float64);

    // verify each column individually
    for (auto& col : cols) {
        ASSIGN_OR_ASSERT_FAIL(ColumnPtr out, UtilityFunctions::encode_sort_key(ctx, {col}));
        auto* bin = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out);
        ASSERT_EQ(3, bin->size());

        std::vector<Slice> keys = {bin->get_slice(0), bin->get_slice(1), bin->get_slice(2)};
        ASSERT_LT(keys[0].compare(keys[1]), 0) << col->get_name();
        ASSERT_LT(keys[1].compare(keys[2]), 0) << col->get_name();
    }

    // verify all columns combined
    ASSIGN_OR_ASSERT_FAIL(ColumnPtr out, UtilityFunctions::encode_sort_key(ctx, cols));
    auto* bin = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out);
    ASSERT_EQ(3, bin->size());

    // Verify lexicographic order aligns with tuple order
    std::vector<Slice> keys = {bin->get_slice(0), bin->get_slice(1), bin->get_slice(2)};
    ASSERT_LT(keys[0].compare(keys[1]), 0);
    ASSERT_LT(keys[1].compare(keys[2]), 0);
}

TEST_F(UtilityFunctionsTest, encodeSortKeyNullHandling) {
    FunctionContext* ctx = FunctionContext::create_test_context();
    auto ptr = std::unique_ptr<FunctionContext>(ctx);

    // Create nullable columns with mixed null and non-null values
    auto c_int = NullableColumn::create(Int32Column::create(), NullColumn::create());
    c_int->append_datum(Datum(int32_t(1))); // non-null
    c_int->append_nulls(1);                 // null
    c_int->append_datum(Datum(int32_t(2))); // non-null
    c_int->append_nulls(1);                 // null

    auto c_str = NullableColumn::create(BinaryColumn::create(), NullColumn::create());
    c_str->append_nulls(1);                 // null
    c_str->append_datum(Datum(Slice("z"))); // non-null
    c_str->append_nulls(1);                 // null
    c_str->append_datum(Datum(Slice("a"))); // non-null

    Columns cols;
    cols.emplace_back(c_int);
    cols.emplace_back(c_str);

    ASSIGN_OR_ASSERT_FAIL(ColumnPtr out, UtilityFunctions::encode_sort_key(ctx, cols));
    auto* bin = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out);
    ASSERT_EQ(4, bin->size());

    // Get the sort key for each row
    Slice k0 = bin->get_slice(0); // (1, NULL)    - first non-null, second null
    Slice k1 = bin->get_slice(1); // (NULL, "z")  - first null, second non-null
    Slice k2 = bin->get_slice(2); // (2, NULL)    - first non-null, second null
    Slice k3 = bin->get_slice(3); // (NULL, "a")  - first null, second non-null

    // Verify that null values sort before non-null values:
    // a NULL in the first column takes precedence over any non-NULL value
    ASSERT_LT(k1.compare(k0), 0) << "NULL should sort before non-NULL in first column";
    ASSERT_LT(k3.compare(k2), 0) << "NULL should sort before non-NULL in first column";
    ASSERT_LT(k3.compare(k0), 0) << "NULL in first column should sort before any non-NULL";

    // Verify that null rows contain only the null marker and no data encoding.
    // This exercises the fix where null rows skip processing the underlying data.
    // The first byte is the null marker (\0 for null, \1 for non-null).
    ASSERT_EQ('\0', k1.data[0]) << "First column null marker should be \\0";
    ASSERT_EQ('\1', k0.data[0]) << "First column non-null marker should be \\1";
    ASSERT_EQ('\0', k3.data[0]) << "First column null marker should be \\0";
    ASSERT_EQ('\1', k2.data[0]) << "First column non-null marker should be \\1";

    // Verify rows with a null in the second column. For rows 0 and 2 the first
    // column is non-null, so its encoding occupies the marker (1 byte) plus the
    // int32 value (4 bytes); byte 5 is the \0 column separator, immediately
    // followed by the second column's \0 null marker.
    ASSERT_EQ('\0', k0.data[5]) << "Second column null marker should be \\0 for row 0";
    ASSERT_EQ('\0', k2.data[5]) << "Second column null marker should be \\0 for row 2";

    // Verify that the sort keys are deterministic and consistent:
    // running the same operation must produce identical results
    ASSIGN_OR_ASSERT_FAIL(ColumnPtr out2, UtilityFunctions::encode_sort_key(ctx, cols));
    auto* bin2 = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out2);
    ASSERT_EQ(4, bin2->size());

    for (size_t i = 0; i < 4; i++) {
        ASSERT_EQ(bin->get_slice(i).to_string(), bin2->get_slice(i).to_string())
                << "Sort keys should be deterministic for row " << i;
    }
}

TEST_F(UtilityFunctionsTest, encodeSortKeyStringEscaping) {
    FunctionContext* ctx = FunctionContext::create_test_context();
    auto ptr = std::unique_ptr<FunctionContext>(ctx);

    auto c1 = BinaryColumn::create();
    auto c2 = BinaryColumn::create();
    c1->append(Slice("a\0b", 3));
    c1->append(Slice("a", 1));
    c2->append(Slice("x", 1));
    c2->append(Slice("\0", 1));

    Columns cols;
    cols.emplace_back(c1);
    cols.emplace_back(c2);

    ASSIGN_OR_ASSERT_FAIL(ColumnPtr out, UtilityFunctions::encode_sort_key(ctx, cols));
    auto* bin = ColumnHelper::cast_to_raw<TYPE_VARBINARY>(out);
    ASSERT_EQ(2, bin->size());
    // Ensure keys are comparable and not identical
    Slice k0 = bin->get_slice(0);
    Slice k1 = bin->get_slice(1);
    ASSERT_NE(k0.to_string(), k1.to_string());
}

} // namespace starrocks
@@ -359,4 +359,83 @@ TEST_F(ColumnZoneMapTest, NormalTestCharPage) {
    test_string("NormalTestCharPage", type_info);
}

// Test for varbinary
TEST_F(ColumnZoneMapTest, NormalTestVarbinaryPage) {
    TabletColumn varbinary_column = create_varbinary_key(0);
    TypeInfoPtr type_info = get_type_info(varbinary_column);
    test_string("NormalTestVarbinaryPage", type_info);
}

// Test for varbinary with binary data
TEST_F(ColumnZoneMapTest, VarbinaryWithBinaryData) {
    std::string filename = kTestDir + "/VarbinaryWithBinaryData";

    TabletColumn varbinary_column = create_varbinary_key(0);
    TypeInfoPtr type_info = get_type_info(varbinary_column);

    auto writer = ZoneMapIndexWriter::create(type_info.get());

    // Add binary data with various patterns
    std::vector<std::string> binary_values1 = {
            std::string("\x00\x01\x02\x03", 4), // Binary data starting with null bytes
            std::string("\xFF\xFE\xFD\xFC", 4), // Binary data with high bytes
            std::string("ABCD", 4),             // ASCII data
            std::string("\x00\x00\x00\x00", 4), // All null bytes
    };

    for (auto& value : binary_values1) {
        Slice slice(value);
        writer->add_values((const uint8_t*)&slice, 1);
    }
    writer->flush();

    // Add more binary data with different patterns
    std::vector<std::string> binary_values2 = {
            std::string("\x01\x02\x03\x04", 4),
            std::string("\xFE\xFD\xFC\xFB", 4),
            std::string("EFGH", 4),
            std::string("\xFF\xFF\xFF\xFF", 4), // All high bytes
    };

    for (auto& value : binary_values2) {
        Slice slice(value);
        writer->add_values((const uint8_t*)&slice, 1);
    }
    writer->add_nulls(1);
    writer->flush();

    // Add null values
    writer->add_nulls(3);
    writer->flush();

    // Write out the zone map index
    ColumnIndexMetaPB index_meta;
    write_file(*writer, index_meta, filename);

    // Read and verify
    ZoneMapIndexReader column_zone_map;
    load_zone_map(column_zone_map, index_meta, filename);

    ASSERT_EQ(3, column_zone_map.num_pages());
    const std::vector<ZoneMapPB>& zone_maps = column_zone_map.page_zone_maps();
    ASSERT_EQ(3, zone_maps.size());

    // Check first page - should have min/max from binary_values1.
    // For binary data, comparison is byte-by-byte, so "\x00\x00\x00\x00" is min and "\xFF\xFE\xFD\xFC" is max.
    check_result(zone_maps[0], true, true, std::string("\x00\x00\x00\x00", 4), std::string("\xFF\xFE\xFD\xFC", 4),
                 false, true);

    // Check second page - should have min/max from binary_values2 plus one null:
    // "\x01\x02\x03\x04" is min and "\xFF\xFF\xFF\xFF" is max.
    check_result(zone_maps[1], true, true, std::string("\x01\x02\x03\x04", 4), std::string("\xFF\xFF\xFF\xFF", 4), true,
                 true);

    // Check third page - should be all nulls
    check_result(zone_maps[2], false, false, "", "", true, false);

    // Check segment zone map - it should cover the overall min/max across all pages
    const auto& segment_zonemap = index_meta.zone_map_index().segment_zone_map();
    check_result(segment_zonemap, true, true, std::string("\x00\x00\x00\x00", 4), std::string("\xFF\xFF\xFF\xFF", 4),
                 true, true);
}

} // namespace starrocks
@@ -164,6 +164,18 @@ inline TabletColumn create_varchar_key(int32_t id, bool is_nullable = true, int
    return column;
}

inline TabletColumn create_varbinary_key(int32_t id, bool is_nullable = true, int length = 8) {
    TabletColumn column;
    column.set_unique_id(id);
    column.set_name(std::to_string(id));
    column.set_type(TYPE_VARBINARY);
    column.set_is_key(true);
    column.set_is_nullable(is_nullable);
    column.set_length(length);
    column.set_index_length(4);
    return column;
}

inline TabletColumn create_array(int32_t id, bool is_nullable = true, int length = 24) {
    TabletColumn column;
    column.set_unique_id(id);
@@ -254,6 +254,7 @@ public class FunctionSet {
    public static final String ISNOTNULL = "isnotnull";
    public static final String ASSERT_TRUE = "assert_true";
    public static final String HOST_NAME = "host_name";
+   public static final String ENCODE_SORT_KEY = "encode_sort_key";
    // Aggregate functions:
    public static final String APPROX_COUNT_DISTINCT = "approx_count_distinct";
    public static final String APPROX_COUNT_DISTINCT_HLL_SKETCH = "approx_count_distinct_hll_sketch";
@@ -821,7 +821,7 @@ public abstract class Type implements Cloneable {
            return true;
        }

-       return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType();
+       return !isOnlyMetricType() && !isJsonType() && !isFunctionType();
    }

    public boolean canGroupBy() {
@@ -839,7 +839,7 @@ public abstract class Type implements Cloneable {
            }
            return true;
        }
-       return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType();
+       return !isOnlyMetricType() && !isJsonType() && !isFunctionType();
    }

    public boolean canOrderBy() {
@@ -847,8 +847,7 @@ public abstract class Type implements Cloneable {
        if (isArrayType()) {
            return ((ArrayType) this).getItemType().canOrderBy();
        }
-       return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isBinaryType() && !isStructType() &&
-               !isMapType();
+       return !isOnlyMetricType() && !isJsonType() && !isFunctionType() && !isStructType() && !isMapType();
    }

    public boolean canPartitionBy() {
@@ -883,8 +882,9 @@ public abstract class Type implements Cloneable {

    public boolean canDistributedBy() {
        // TODO(mofei) support distributed by for JSON
+       // Allow VARBINARY as a distribution key
        return !isComplexType() && !isFloatingPointType() && !isOnlyMetricType() && !isJsonType()
-               && !isFunctionType() && !isBinaryType();
+               && !isFunctionType();
    }

    public boolean canBeWindowFunctionArgumentTypes() {
@@ -435,8 +435,8 @@ public class CreateTableAnalyzer {
            }
            ColumnDef cd = columnDefs.get(idx);
            Type t = cd.getType();
-           if (!(t.isBoolean() || t.isIntegerType() || t.isLargeint() || t.isVarchar() || t.isDate() ||
-                   t.isDatetime())) {
+           if (!(t.isBoolean() || t.isIntegerType() || t.isLargeint() || t.isVarchar() || t.isBinaryType() ||
+                   t.isDate() || t.isDatetime())) {
                throw new SemanticException("sort key column[" + cd.getName() + "] type not supported: " + t.toSql());
            }
        }
@@ -1131,7 +1131,7 @@ public class CreateTableTest {
    }

    @Test
-   public void testCreateVarBinaryTable() {
+   public void testCreateVarBinaryTable() throws Exception {
        // duplicate table
        ExceptionChecker.expectThrowsNoException(() -> createTable(
                "create table test.varbinary_tbl\n" +
@@ -1174,20 +1174,16 @@ public class CreateTableTest {
                "distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1');"));

-       // failed
-       ExceptionChecker.expectThrowsWithMsg(AnalysisException.class,
-               "Invalid data type of key column 'k2': 'VARBINARY'",
-               () -> createTable("create table test.varbinary_tbl0\n"
+       createTable("create table test.varbinary_tbl00\n"
                + "(k1 int, k2 varbinary)\n"
                + "duplicate key(k1, k2)\n"
                + "distributed by hash(k1) buckets 1\n"
-               + "properties('replication_num' = '1');"));
-       ExceptionChecker.expectThrowsWithMsg(DdlException.class,
-               "VARBINARY(10) column can not be distribution column",
-               () -> createTable("create table test.varbinary_tbl0 \n"
+               + "properties('replication_num' = '1');");
+       createTable("create table test.varbinary_tbl01 \n"
                + "(k1 int, k2 varbinary(10) )\n"
                + "duplicate key(k1)\n"
                + "distributed by hash(k2) buckets 1\n"
-               + "properties('replication_num' = '1');"));
+               + "properties('replication_num' = '1');");
        ExceptionChecker.expectThrowsWithMsg(DdlException.class,
                "Column[j] type[VARBINARY] cannot be a range partition key",
                () -> createTable("create table test.varbinary_tbl0 \n" +
@@ -1199,7 +1195,7 @@ public class CreateTableTest {
    }

    @Test
-   public void testCreateBinaryTable() {
+   public void testCreateBinaryTable() throws Exception {
        // duplicate table
        ExceptionChecker.expectThrowsNoException(() -> createTable(
                "create table test.binary_tbl\n" +
@@ -1242,20 +1238,16 @@ public class CreateTableTest {
                "distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1');"));

-       // failed
-       ExceptionChecker.expectThrowsWithMsg(AnalysisException.class,
-               "Invalid data type of key column 'k2': 'VARBINARY'",
-               () -> createTable("create table test.binary_tbl0\n"
+       createTable("create table test.binary_tbl01\n"
                + "(k1 int, k2 binary)\n"
                + "duplicate key(k1, k2)\n"
                + "distributed by hash(k1) buckets 1\n"
-               + "properties('replication_num' = '1');"));
-       ExceptionChecker.expectThrowsWithMsg(DdlException.class,
-               "VARBINARY(10) column can not be distribution column",
-               () -> createTable("create table test.binary_tbl0 \n"
+               + "properties('replication_num' = '1');");
+       createTable("create table test.binary_tbl11 \n"
                + "(k1 int, k2 binary(10) )\n"
                + "duplicate key(k1)\n"
                + "distributed by hash(k2) buckets 1\n"
-               + "properties('replication_num' = '1');"));
+               + "properties('replication_num' = '1');");
        ExceptionChecker.expectThrowsWithMsg(DdlException.class,
                "Column[j] type[VARBINARY] cannot be a range partition key",
                () -> createTable("create table test.binary_tbl0 \n" +
@@ -825,6 +825,7 @@ vectorized_functions = [
    [100019, 'assert_true', True, False, 'BOOLEAN', ['BOOLEAN', "VARCHAR"], 'UtilityFunctions::assert_true'],
    [100018, 'host_name', True, False, 'VARCHAR', [], "UtilityFunctions::host_name"],
    [100020, 'get_query_profile', True, False, 'VARCHAR', ['VARCHAR'], "UtilityFunctions::get_query_profile"],
+   [100024, 'encode_sort_key', True, False, 'VARBINARY', ['ANY_ELEMENT', '...'], 'UtilityFunctions::encode_sort_key'],

    # json string function
    [110022, "get_json_int", False, False, "BIGINT", ["VARCHAR", "VARCHAR"], "JsonFunctions::get_json_bigint",
@@ -0,0 +1,123 @@
-- name: test_encode_sort_key_json
CREATE DATABASE test_encode_sort_key_json;
-- result:
-- !result
USE test_encode_sort_key_json;
-- result:
-- !result
CREATE TABLE `json_test_table` (
    `id` int(11) NOT NULL COMMENT "",
    `json_data` json,
    `json_array` json,
    `json_nested` json,
    `sort_key` varbinary(1024) AS (
        encode_sort_key(
            get_json_int(json_data, '$.age'),
            get_json_string(json_data, '$.name'),
            get_json_string(json_data, '$.city'),
            get_json_string(json_array, '$[0]'),
            get_json_double(json_nested, '$.user.profile.score')
        )
    ) COMMENT "Auto-generated sort key from extracted JSON fields"
) ENGINE=OLAP
DISTRIBUTED BY HASH(sort_key) BUCKETS 2
ORDER BY (sort_key)
PROPERTIES ( "replication_num" = "1");
-- result:
-- !result
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(1, parse_json('{"name": "Alice", "age": 25, "city": "New York"}'),
    parse_json('["apple", "banana", "cherry"]'),
    parse_json('{"user": {"id": 101, "profile": {"verified": true, "score": 95.5}}}')),
(2, parse_json('{"name": "Bob", "age": 30, "city": "Los Angeles"}'),
    parse_json('["orange", "grape"]'),
    parse_json('{"user": {"id": 102, "profile": {"verified": false, "score": 87.2}}}')),
(3, parse_json('{"name": "Charlie", "age": 28, "city": "Chicago"}'),
    parse_json('["mango", "pineapple", "kiwi", "strawberry"]'),
    parse_json('{"user": {"id": 103, "profile": {"verified": true, "score": 92.8}}}')),
(4, parse_json('{"name": "Diana", "age": 22, "city": "Miami"}'),
    parse_json('["pear"]'),
    parse_json('{"user": {"id": 104, "profile": {"verified": true, "score": 89.1}}}')),
(5, parse_json('{"name": "Eve", "age": 35, "city": "Seattle"}'),
    parse_json('["blueberry", "raspberry", "blackberry"]'),
    parse_json('{"user": {"id": 105, "profile": {"verified": false, "score": 78.9}}}'));
-- result:
-- !result
SELECT id,
       json_data,
       get_json_int(json_data, '$.age') as age,
       get_json_string(json_data, '$.name') as name,
       get_json_string(json_data, '$.city') as city,
       get_json_string(json_array, '$[0]') as first_fruit,
       get_json_double(json_nested, '$.user.profile.score') as score,
       sort_key,
       length(sort_key) as sort_key_length
FROM json_test_table
ORDER BY id;
-- result:
1 {"age": 25, "city": "New York", "name": "Alice"} 25 Alice New York apple 95.5 b'\x01\x80\x00\x00\x00\x00\x00\x00\x19\x00\x01Alice\x00\x00\x00\x01New York\x00\x00\x00\x01apple\x00\x00\x00\x01\xc0W\xe0\x00\x00\x00\x00\x00' 49
2 {"age": 30, "city": "Los Angeles", "name": "Bob"} 30 Bob Los Angeles orange 87.2 b'\x01\x80\x00\x00\x00\x00\x00\x00\x1e\x00\x01Bob\x00\x00\x00\x01Los Angeles\x00\x00\x00\x01orange\x00\x00\x00\x01\xc0U\xcc\xcc\xcc\xcc\xcc\xcd' 51
3 {"age": 28, "city": "Chicago", "name": "Charlie"} 28 Charlie Chicago mango 92.8 b'\x01\x80\x00\x00\x00\x00\x00\x00\x1c\x00\x01Charlie\x00\x00\x00\x01Chicago\x00\x00\x00\x01mango\x00\x00\x00\x01\xc0W333333' 50
4 {"age": 22, "city": "Miami", "name": "Diana"} 22 Diana Miami pear 89.1 b'\x01\x80\x00\x00\x00\x00\x00\x00\x16\x00\x01Diana\x00\x00\x00\x01Miami\x00\x00\x00\x01pear\x00\x00\x00\x01\xc0VFfffff' 45
5 {"age": 35, "city": "Seattle", "name": "Eve"} 35 Eve Seattle blueberry 78.9 b'\x01\x80\x00\x00\x00\x00\x00\x00#\x00\x01Eve\x00\x00\x00\x01Seattle\x00\x00\x00\x01blueberry\x00\x00\x00\x01\xc0S\xb9\x99\x99\x99\x99\x9a' 50
-- !result
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;
-- result:
4 {"age": 22, "city": "Miami", "name": "Diana"} ["pear"]
1 {"age": 25, "city": "New York", "name": "Alice"} ["apple", "banana", "cherry"]
3 {"age": 28, "city": "Chicago", "name": "Charlie"} ["mango", "pineapple", "kiwi", "strawberry"]
2 {"age": 30, "city": "Los Angeles", "name": "Bob"} ["orange", "grape"]
5 {"age": 35, "city": "Seattle", "name": "Eve"} ["blueberry", "raspberry", "blackberry"]
-- !result
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key DESC;
-- result:
5 {"age": 35, "city": "Seattle", "name": "Eve"} ["blueberry", "raspberry", "blackberry"]
2 {"age": 30, "city": "Los Angeles", "name": "Bob"} ["orange", "grape"]
3 {"age": 28, "city": "Chicago", "name": "Charlie"} ["mango", "pineapple", "kiwi", "strawberry"]
1 {"age": 25, "city": "New York", "name": "Alice"} ["apple", "banana", "cherry"]
4 {"age": 22, "city": "Miami", "name": "Diana"} ["pear"]
-- !result
SELECT id, json_data, json_array
FROM json_test_table
WHERE sort_key > (SELECT sort_key FROM json_test_table WHERE id = 2)
ORDER BY id;
-- result:
5 {"age": 35, "city": "Seattle", "name": "Eve"} ["blueberry", "raspberry", "blackberry"]
-- !result
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(6, NULL, parse_json('["test"]'), parse_json('{"test": null}')),
(7, parse_json('{"age": 40}'), parse_json('[]'), parse_json('{"user": {"id": 106}}'));
-- result:
-- !result
SELECT id,
       get_json_int(json_data, '$.age') as age
FROM json_test_table
ORDER BY id;
-- result:
1 25
2 30
3 28
4 22
5 35
6 None
7 40
-- !result
SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;
-- result:
6 None ["test"]
4 {"age": 22, "city": "Miami", "name": "Diana"} ["pear"]
1 {"age": 25, "city": "New York", "name": "Alice"} ["apple", "banana", "cherry"]
3 {"age": 28, "city": "Chicago", "name": "Charlie"} ["mango", "pineapple", "kiwi", "strawberry"]
2 {"age": 30, "city": "Los Angeles", "name": "Bob"} ["orange", "grape"]
5 {"age": 35, "city": "Seattle", "name": "Eve"} ["blueberry", "raspberry", "blackberry"]
7 {"age": 40} []
-- !result
DROP DATABASE test_encode_sort_key_json;
-- result:
-- !result
@@ -0,0 +1,86 @@
-- name: test_encode_sort_key_json
CREATE DATABASE test_encode_sort_key_json;
USE test_encode_sort_key_json;

-- Create a table with JSON data types and a single generated sort key column.
-- The sort key extracts specific fields from JSON and combines them for efficient sorting.
CREATE TABLE `json_test_table` (
    `id` int(11) NOT NULL COMMENT "",
    `json_data` json,
    `json_array` json,
    `json_nested` json,
    `sort_key` varbinary(1024) AS (
        encode_sort_key(
            get_json_int(json_data, '$.age'),
            get_json_string(json_data, '$.name'),
            get_json_string(json_data, '$.city'),
            get_json_string(json_array, '$[0]'),
            get_json_double(json_nested, '$.user.profile.score')
        )
    ) COMMENT "Auto-generated sort key from extracted JSON fields"
) ENGINE=OLAP
DISTRIBUTED BY HASH(sort_key) BUCKETS 2
ORDER BY (sort_key)
PROPERTIES ( "replication_num" = "1");

-- Insert test data with various JSON structures.
-- The sort key will be automatically generated from the extracted JSON fields.
INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(1, parse_json('{"name": "Alice", "age": 25, "city": "New York"}'),
    parse_json('["apple", "banana", "cherry"]'),
    parse_json('{"user": {"id": 101, "profile": {"verified": true, "score": 95.5}}}')),
(2, parse_json('{"name": "Bob", "age": 30, "city": "Los Angeles"}'),
    parse_json('["orange", "grape"]'),
    parse_json('{"user": {"id": 102, "profile": {"verified": false, "score": 87.2}}}')),
(3, parse_json('{"name": "Charlie", "age": 28, "city": "Chicago"}'),
    parse_json('["mango", "pineapple", "kiwi", "strawberry"]'),
    parse_json('{"user": {"id": 103, "profile": {"verified": true, "score": 92.8}}}')),
(4, parse_json('{"name": "Diana", "age": 22, "city": "Miami"}'),
    parse_json('["pear"]'),
    parse_json('{"user": {"id": 104, "profile": {"verified": true, "score": 89.1}}}')),
(5, parse_json('{"name": "Eve", "age": 35, "city": "Seattle"}'),
    parse_json('["blueberry", "raspberry", "blackberry"]'),
    parse_json('{"user": {"id": 105, "profile": {"verified": false, "score": 78.9}}}'));

-- Test 1: Verify that the generated sort key column is automatically populated.
-- This shows how encode_sort_key extracts and combines JSON fields.
SELECT id,
       json_data,
       get_json_int(json_data, '$.age') as age,
       get_json_string(json_data, '$.name') as name,
       get_json_string(json_data, '$.city') as city,
       get_json_string(json_array, '$[0]') as first_fruit,
       get_json_double(json_nested, '$.user.profile.score') as score,
       sort_key,
       length(sort_key) as sort_key_length
FROM json_test_table
ORDER BY id;

SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;

SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key DESC;

SELECT id, json_data, json_array
FROM json_test_table
WHERE sort_key > (SELECT sort_key FROM json_test_table WHERE id = 2)
ORDER BY id;

INSERT INTO json_test_table (id, json_data, json_array, json_nested) VALUES
(6, NULL, parse_json('["test"]'), parse_json('{"test": null}')),
(7, parse_json('{"age": 40}'), parse_json('[]'), parse_json('{"user": {"id": 106}}'));

SELECT id,
       get_json_int(json_data, '$.age') as age
FROM json_test_table
ORDER BY id;

SELECT id, json_data, json_array
FROM json_test_table
ORDER BY sort_key;

-- Clean up
DROP DATABASE test_encode_sort_key_json;