[Feature] Introduce Variant column definition (#61099)

Signed-off-by: Xu Bai <xuba@cisco.com>
This commit is contained in:
Xu Bai 2025-09-03 13:22:19 +08:00 committed by GitHub
parent bfd8a0cfcd
commit e970bfd5de
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
50 changed files with 1848 additions and 23 deletions

View File

@ -44,4 +44,5 @@ add_library(Column STATIC
stream_chunk.cpp
column_view/column_view_base.cpp
column_view/column_view_helper.cpp
variant_column.cpp
)

View File

@ -93,6 +93,7 @@ public:
virtual bool is_object() const { return false; }
virtual bool is_json() const { return false; }
virtual bool is_variant() const { return false; }
virtual bool is_array() const { return false; }
@ -100,6 +101,7 @@ public:
virtual bool is_nullable_view() const { return false; }
virtual bool is_array_view() const { return false; }
virtual bool is_json_view() const { return false; }
virtual bool is_variant_view() const { return false; }
virtual bool is_binary_view() const { return false; }
virtual bool is_struct_view() const { return false; }
virtual bool is_map_view() const { return false; }

View File

@ -29,6 +29,7 @@ public:
ColumnView(ColumnView&& column_view) = delete;
bool is_view() const override { return true; }
bool is_json_view() const override { return ColumnHelper::get_data_column(_default_column.get())->is_json(); }
bool is_variant_view() const override { return ColumnHelper::get_data_column(_default_column.get())->is_variant(); }
bool is_map_view() const override { return ColumnHelper::get_data_column(_default_column.get())->is_map(); }
bool is_array_view() const override { return ColumnHelper::get_data_column(_default_column.get())->is_array(); }
bool is_binary_view() const override { return ColumnHelper::get_data_column(_default_column.get())->is_binary(); }

View File

@ -26,6 +26,7 @@ static bool should_use_view(LogicalType ltype) {
case TYPE_STRUCT:
case TYPE_MAP:
case TYPE_JSON:
case TYPE_VARIANT:
case TYPE_VARCHAR:
case TYPE_CHAR:
case TYPE_BINARY:

View File

@ -48,6 +48,7 @@ VISIT_IMPL(HyperLogLogColumn)
VISIT_IMPL(BitmapColumn)
VISIT_IMPL(PercentileColumn)
VISIT_IMPL(JsonColumn)
VISIT_IMPL(VariantColumn)
VISIT_IMPL(FixedLengthColumn<int96_t>)
VISIT_IMPL(FixedLengthColumn<uint24_t>)
VISIT_IMPL(FixedLengthColumn<decimal12_t>)
@ -71,5 +72,6 @@ VISIT_IMPL(FixedLengthColumnBase<uint24_t>)
VISIT_IMPL(FixedLengthColumnBase<int96_t>)
VISIT_IMPL(FixedLengthColumnBase<decimal12_t>)
VISIT_IMPL(ObjectColumn<JsonValue>)
VISIT_IMPL(ObjectColumn<VariantValue>)
} // namespace starrocks

View File

@ -57,6 +57,7 @@ public:
virtual Status visit(const BitmapColumn& column);
virtual Status visit(const PercentileColumn& column);
virtual Status visit(const JsonColumn& column);
virtual Status visit(const VariantColumn& column);
virtual Status visit(const FixedLengthColumn<int96_t>& column);
virtual Status visit(const FixedLengthColumn<uint24_t>& column);
virtual Status visit(const FixedLengthColumn<decimal12_t>& column);
@ -82,6 +83,7 @@ public:
virtual Status visit(const FixedLengthColumnBase<uint24_t>& column);
virtual Status visit(const FixedLengthColumnBase<decimal12_t>& column);
virtual Status visit(const ObjectColumn<JsonValue>& column);
virtual Status visit(const ObjectColumn<VariantValue>& column);
virtual Status visit(const ArrayViewColumn& column) {
return Status::NotSupported("ArrayViewColumn is not supported");
}

View File

@ -95,6 +95,8 @@ public:
Status visit(const LargeBinaryColumn& column) override { return _impl->do_visit(column); }
Status visit(const VariantColumn& column) override { return _impl->do_visit(column); }
private:
Impl* _impl;
};
@ -170,6 +172,8 @@ public:
Status visit(LargeBinaryColumn* column) override { return _impl->do_visit(column); }
Status visit(VariantColumn* column) override { return _impl->do_visit(column); }
private:
Impl* _impl;
};

View File

@ -49,6 +49,8 @@ VISIT_IMPL(BitmapColumn)
VISIT_IMPL(PercentileColumn)
VISIT_IMPL(JsonColumn)
VISIT_IMPL(ObjectColumn<JsonValue>)
VISIT_IMPL(VariantColumn)
VISIT_IMPL(ObjectColumn<VariantValue>)
VISIT_IMPL(FixedLengthColumn<int96_t>)
VISIT_IMPL(FixedLengthColumn<uint24_t>)
VISIT_IMPL(FixedLengthColumn<decimal12_t>)

View File

@ -56,6 +56,7 @@ public:
virtual Status visit(BitmapColumn* column);
virtual Status visit(PercentileColumn* column);
virtual Status visit(JsonColumn* column);
virtual Status visit(VariantColumn* column);
virtual Status visit(FixedLengthColumn<int96_t>* column);
virtual Status visit(FixedLengthColumn<uint24_t>* column);
virtual Status visit(FixedLengthColumn<decimal12_t>* column);
@ -81,6 +82,7 @@ public:
virtual Status visit(FixedLengthColumnBase<uint24_t>* column);
virtual Status visit(FixedLengthColumnBase<decimal12_t>* column);
virtual Status visit(ObjectColumn<JsonValue>* column);
virtual Status visit(ObjectColumn<VariantValue>* column);
virtual Status visit(ArrayViewColumn* column) { return Status::NotSupported("ArrayViewColumn is not supported"); }
virtual Status visit(ColumnView* column) { return Status::NotSupported("ColumnView is not supported"); }
};

View File

@ -51,6 +51,7 @@ public:
bool is_nullable() const override { return _data->is_nullable(); }
bool is_json() const override { return _data->is_json(); }
bool is_variant() const override { return _data->is_variant(); }
bool is_array() const override { return _data->is_array(); }
bool is_null(size_t index) const override { return _data->is_null(0); }

View File

@ -34,6 +34,7 @@ class BitmapValue;
class HyperLogLog;
class PercentileValue;
class JsonValue;
class VariantValue;
} // namespace starrocks
namespace starrocks {
@ -89,6 +90,7 @@ public:
const BitmapValue* get_bitmap() const { return get<BitmapValue*>(); }
const PercentileValue* get_percentile() const { return get<PercentileValue*>(); }
const JsonValue* get_json() const { return get<JsonValue*>(); }
const VariantValue* get_variant() const { return get<VariantValue*>(); }
void set_int8(int8_t v) { set<decltype(v)>(v); }
void set_uint8(uint8_t v) { set<decltype(v)>(v); }
@ -114,6 +116,7 @@ public:
void set_bitmap(BitmapValue* v) { set<decltype(v)>(v); }
void set_percentile(PercentileValue* v) { set<decltype(v)>(v); }
void set_json(JsonValue* v) { set<decltype(v)>(v); }
void set_variant(VariantValue* v) { set<decltype(v)>(v); }
template <typename T>
const T& get() const {
@ -198,7 +201,7 @@ private:
using Variant =
std::variant<std::monostate, int8_t, uint8_t, int16_t, uint16_t, uint24_t, int32_t, uint32_t, int64_t,
uint64_t, int96_t, int128_t, int256_t, Slice, decimal12_t, DecimalV2Value, float, double,
DatumArray, DatumMap, HyperLogLog*, BitmapValue*, PercentileValue*, JsonValue*>;
DatumArray, DatumMap, HyperLogLog*, BitmapValue*, PercentileValue*, JsonValue*, VariantValue*>;
Variant _value;
};

View File

@ -92,6 +92,7 @@ public:
bool is_nullable() const override { return true; }
bool is_json() const override { return _data_column->is_json(); }
bool is_variant() const override { return _data_column->is_variant(); }
bool is_array() const override { return _data_column->is_array(); }
bool is_array_view() const override { return _data_column->is_array_view(); }

View File

@ -18,6 +18,7 @@
#include "gutil/casts.h"
#include "types/bitmap_value.h"
#include "types/hll.h"
#include "types/variant_value.h"
#include "util/json.h"
#include "util/mysql_row_buffer.h"
#include "util/percentile_value.h"
@ -115,7 +116,16 @@ bool ObjectColumn<T>::append_strings(const Slice* data, size_t size) {
_pool.reserve(_pool.size() + size);
for (size_t i = 0; i < size; i++) {
const auto& s = data[i];
_pool.emplace_back(s);
if constexpr (std::is_same_v<T, VariantValue>) {
auto variant_result = T::create(s);
if (!variant_result.ok()) {
LOG(WARNING) << "Failed to create VariantValue from Slice: " << variant_result.status().to_string();
return false;
}
_pool.emplace_back(std::move(*variant_result));
} else {
_pool.emplace_back(s);
}
}
_cache_ok = false;
@ -339,5 +349,6 @@ template class ObjectColumn<HyperLogLog>;
template class ObjectColumn<BitmapValue>;
template class ObjectColumn<PercentileValue>;
template class ObjectColumn<JsonValue>;
template class ObjectColumn<VariantValue>;
} // namespace starrocks

View File

@ -23,6 +23,7 @@
#include "gutil/strings/substitute.h"
#include "types/bitmap_value.h"
#include "types/hll.h"
#include "types/variant_value.h"
#include "util/json.h"
#include "util/percentile_value.h"

View File

@ -22,10 +22,12 @@
#include "column/nullable_column.h"
#include "column/object_column.h"
#include "column/struct_column.h"
#include "column/variant_column.h"
#include "column/vectorized_fwd.h"
#include "types/constexpr.h"
#include "types/int256.h"
#include "types/logical_type.h"
#include "types/variant_value.h"
#include "util/json.h"
namespace starrocks {
@ -84,6 +86,8 @@ template <>
inline constexpr bool isArithmeticLT<TYPE_JSON> = false;
template <>
inline constexpr bool isArithmeticLT<TYPE_VARBINARY> = false;
template <>
inline constexpr bool isArithmeticLT<TYPE_VARIANT> = false;
template <LogicalType logical_type>
constexpr bool isSliceLT = false;
@ -296,6 +300,13 @@ struct RunTimeTypeTraits<TYPE_JSON> {
using ProxyContainerType = ColumnType::Container;
};
template <>
struct RunTimeTypeTraits<TYPE_VARIANT> {
using CppType = VariantValue*;
using ColumnType = VariantColumn;
using ProxyContainerType = ColumnType::Container;
};
template <>
struct RunTimeTypeTraits<TYPE_BINARY> {
using CppType = Slice;
@ -510,4 +521,12 @@ struct RunTimeTypeLimits<TYPE_JSON> {
static value_type max_value() { return JsonValue{vpack::Slice::maxKeySlice()}; }
};
template <>
struct RunTimeTypeLimits<TYPE_VARIANT> {
using value_type = VariantValue;
static value_type min_value() { return VariantValue::of_null(); }
static value_type max_value() { return VariantValue::create(Slice::max_value()).value(); }
};
} // namespace starrocks

View File

@ -0,0 +1,84 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "variant_column.h"
#include "types/variant_value.h"
#include "util/mysql_row_buffer.h"
namespace starrocks {
uint32_t VariantColumn::serialize(size_t idx, uint8_t* pos) const {
return static_cast<uint32_t>(get_object(idx)->serialize(pos));
}
void VariantColumn::serialize_batch(uint8_t* dst, Buffer<uint32_t>& slice_sizes, size_t chunk_size,
uint32_t max_one_row_size) const {
for (size_t i = 0; i < chunk_size; ++i) {
slice_sizes[i] += serialize(i, dst + i * max_one_row_size + slice_sizes[i]);
}
}
uint32_t VariantColumn::serialize_size(size_t idx) const {
return static_cast<uint32_t>(get_object(idx)->serialize_size());
}
const uint8_t* VariantColumn::deserialize_and_append(const uint8_t* pos) {
// Read the first 4 bytes to get the size of the variant
uint32_t variant_length = *reinterpret_cast<const uint32_t*>(pos);
auto variant_result = VariantValue::create(Slice(pos, variant_length + sizeof(uint32_t)));
if (!variant_result.ok()) {
throw std::runtime_error("Failed to deserialize variant: " + variant_result.status().to_string());
}
uint32_t serialize_size = variant_result->serialize_size();
append(std::move(*variant_result));
return pos + serialize_size;
}
void VariantColumn::put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol) const {
VariantValue* variant = get_object(idx);
DCHECK(variant != nullptr) << "Variant value is null at index " << idx;
auto json = variant->to_json();
if (!json.ok()) {
buf->push_null();
} else {
buf->push_string(json->data(), json->size(), '\'');
}
}
void VariantColumn::append_datum(const Datum& datum) {
BaseClass::append(datum.get<VariantValue*>());
}
void VariantColumn::append(const Column& src, size_t offset, size_t count) {
BaseClass::append(src, offset, count);
}
void VariantColumn::append(const VariantValue& object) {
BaseClass::append(object);
}
void VariantColumn::append(const VariantValue* object) {
BaseClass::append(object);
}
void VariantColumn::append(VariantValue&& object) {
BaseClass::append(std::move(object));
}
} // namespace starrocks

View File

@ -0,0 +1,64 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <utility>
#include "column/column.h"
#include "column/object_column.h"
#include "column/vectorized_fwd.h"
#include "types/variant_value.h"
namespace starrocks {
class VariantColumn final
: public CowFactory<ColumnFactory<ObjectColumn<VariantValue>, VariantColumn>, VariantColumn, Column> {
public:
using ValueType = VariantValue;
using SuperClass = CowFactory<ColumnFactory<ObjectColumn<VariantValue>, VariantColumn>, VariantColumn, Column>;
using BaseClass = VariantColumnBase;
VariantColumn() = default;
explicit VariantColumn(size_t size) : SuperClass(size) {}
VariantColumn(const VariantColumn& rhs) : SuperClass(rhs) {}
VariantColumn(VariantColumn&& rhs) noexcept : SuperClass(std::move(rhs)) {}
MutableColumnPtr clone() const override { return BaseClass::clone(); }
MutableColumnPtr clone_empty() const override { return this->create(); }
uint32_t serialize(size_t idx, uint8_t* pos) const override;
uint32_t serialize_size(size_t idx) const override;
void serialize_batch(uint8_t* dst, Buffer<uint32_t>& slice_sizes, size_t chunk_size,
uint32_t max_one_row_size) const override;
const uint8_t* deserialize_and_append(const uint8_t* pos) override;
void append_datum(const Datum& datum) override;
void append(const Column& src, size_t offset, size_t count) override;
// Add a forwarding function to expose the base class append function
void append(const Column& src) { append(src, 0, src.size()); }
void append(const VariantValue* object);
void append(VariantValue&& object);
void append(const VariantValue& object);
bool is_variant() const override { return true; }
std::string get_name() const override { return "variant"; }
void put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol = false) const override;
};
} // namespace starrocks

View File

@ -27,6 +27,7 @@ class HyperLogLog;
class BitmapValue;
class PercentileValue;
class JsonValue;
class VariantValue;
class DateValue;
class TimestampValue;
@ -104,6 +105,9 @@ using PercentileColumn = ObjectColumn<PercentileValue>;
using JsonColumnBase = ObjectColumn<JsonValue>;
class JsonColumn;
using VariantColumnBase = ObjectColumn<VariantValue>;
class VariantColumn;
class MapColumn;
class StructColumn;

View File

@ -184,6 +184,7 @@ Status MySQLDataSource::open(RuntimeState* state) {
case TYPE_DOUBLE:
case TYPE_FLOAT:
case TYPE_JSON:
case TYPE_VARIANT:
case TYPE_FUNCTION:
case TYPE_VARBINARY:
case TYPE_UNSIGNED_TINYINT:

View File

@ -121,7 +121,8 @@ std::string VExtLiteral::_value_to_string(ColumnPtr& column) {
[&](auto&& arg) {
using T = std::decay_t<decltype(arg)>;
if constexpr (is_type_in<T, std::monostate, int96_t, decimal12_t, DecimalV2Value, DatumArray,
DatumMap, HyperLogLog*, BitmapValue*, PercentileValue*, JsonValue*>()) {
DatumMap, HyperLogLog*, BitmapValue*, PercentileValue*, JsonValue*,
VariantValue*>()) {
// ignore these types
} else if constexpr (std::is_same_v<T, Slice>) {
res = std::string(arg.data, arg.size);

View File

@ -957,8 +957,8 @@ struct ColumnRangeBuilder {
template <LogicalType ltype, typename E, CompoundNodeType Type>
Status operator()(ChunkPredicateBuilder<E, Type>* parent, const SlotDescriptor* slot,
std::map<std::string, ColumnValueRangeType>* column_value_ranges) {
if constexpr (ltype == TYPE_TIME || ltype == TYPE_NULL || ltype == TYPE_JSON || lt_is_float<ltype> ||
lt_is_binary<ltype>) {
if constexpr (ltype == TYPE_TIME || ltype == TYPE_NULL || ltype == TYPE_JSON || ltype == TYPE_VARIANT ||
lt_is_float<ltype> || lt_is_binary<ltype>) {
return Status::OK();
} else {
// Treat tinyint and boolean as int

View File

@ -676,7 +676,7 @@ public:
return _min = TimestampValue::MIN_TIMESTAMP_VALUE && _max == TimestampValue::MAX_TIMESTAMP_VALUE;
} else if constexpr (IsDecimal<CppType>) {
return _min == DecimalV2Value::get_min_decimal() && _max == DecimalV2Value::get_max_decimal();
} else if constexpr (Type != TYPE_JSON) {
} else if constexpr (Type != TYPE_JSON && Type != TYPE_VARIANT) {
return _min == RunTimeTypeLimits<Type>::min_value() && _max == RunTimeTypeLimits<Type>::max_value();
} else {
return false;
@ -1078,7 +1078,7 @@ private:
} else if constexpr (IsDecimal<CppType>) {
_min = DecimalV2Value::get_max_decimal();
_max = DecimalV2Value::get_min_decimal();
} else if constexpr (Type != TYPE_JSON) {
} else if constexpr (Type != TYPE_JSON && Type != TYPE_VARIANT) {
// for json vaue, cpp type is JsonValue*
// but min/max value type is JsonValue
// and JsonValue needs special serialization handling.
@ -1105,7 +1105,7 @@ private:
} else if constexpr (IsDecimal<CppType>) {
_max = DecimalV2Value::get_max_decimal();
_min = DecimalV2Value::get_min_decimal();
} else if constexpr (Type != TYPE_JSON) {
} else if constexpr (Type != TYPE_JSON && Type != TYPE_VARIANT) {
// for json vaue, cpp type is JsonValue*
// but min/max value type is JsonValue
// and JsonValue needs special serialization handling.

View File

@ -176,7 +176,7 @@ Status MysqlTableWriter::_build_insert_sql(int from, int to, std::string_view* s
_stmt_buffer.push_back('"');
} else if constexpr (type == TYPE_TINYINT || type == TYPE_BOOLEAN) {
fmt::format_to(std::back_inserter(_stmt_buffer), "{}", (int32_t)viewer.value(i));
} else if constexpr (type == TYPE_JSON) {
} else if constexpr (type == TYPE_JSON || type == TYPE_VARIANT) {
fmt::format_to(std::back_inserter(_stmt_buffer), "{}", *viewer.value(i));
} else {
fmt::format_to(std::back_inserter(_stmt_buffer), "{}", viewer.value(i));

View File

@ -23,7 +23,6 @@
#include "util/raw_container.h"
namespace starrocks {
// Date: Julian Date -2000-01-01 ~ 9999-01-01
// MAX USE 22 bits
typedef int32_t JulianDate;
@ -234,6 +233,10 @@ public:
template <bool use_iso8601_format = false, bool igonre_microsecond = false>
static int to_string(Timestamp timestamp, char* s, size_t n);
// Convert timestamp to string with timezone format (e.g., "2025-04-16 12:34:56.78-04:00")
template <bool use_iso8601_format = false, bool ignore_microsecond = false>
static std::string to_string_with_timezone(Timestamp timestamp, const cctz::time_zone& timezone);
inline static double time_to_literal(double time);
inline static Timestamp of_epoch_second(int64_t seconds, int64_t microseconds);
@ -566,4 +569,27 @@ int timestamp::to_string(Timestamp timestamp, char* to, size_t n) {
return 19;
}
template <bool use_iso8601_format, bool ignore_microsecond>
std::string timestamp::to_string_with_timezone(Timestamp timestamp, const cctz::time_zone& timezone) {
// Convert StarRocks timestamp to unix seconds
int64_t julian = to_julian(timestamp);
int64_t time_microseconds = to_time(timestamp);
// Calculate unix seconds from julian date
int64_t unix_seconds = (julian - date::UNIX_EPOCH_JULIAN) * SECS_PER_DAY + time_microseconds / USECS_PER_SEC;
int64_t microseconds = time_microseconds % USECS_PER_SEC;
// Convert to time_point
auto time_point = std::chrono::system_clock::from_time_t(unix_seconds);
time_point += std::chrono::microseconds(microseconds);
// Format the timestamp with timezone using cctz
constexpr const char* base_format = use_iso8601_format ? "%Y-%m-%dT%H:%M:" : "%Y-%m-%d %H:%M:";
constexpr const char* time_format = ignore_microsecond ? "%S" : "%E*S";
constexpr auto tz_format = "%Ez";
const std::string format_str = std::string(base_format) + time_format + tz_format;
return cctz::format(format_str, time_point, timezone);
}
} // namespace starrocks

View File

@ -268,14 +268,16 @@ bool TypeDescriptor::support_join() const {
if (type == TYPE_ARRAY || type == TYPE_MAP || type == TYPE_STRUCT) {
return std::all_of(children.begin(), children.end(), [](const TypeDescriptor& t) { return t.support_join(); });
}
return type != TYPE_JSON && type != TYPE_OBJECT && type != TYPE_PERCENTILE && type != TYPE_HLL;
return type != TYPE_JSON && type != TYPE_OBJECT && type != TYPE_PERCENTILE && type != TYPE_HLL &&
type != TYPE_VARIANT;
}
bool TypeDescriptor::support_orderby() const {
if (type == TYPE_ARRAY) {
return children[0].support_orderby();
}
return type != TYPE_JSON && type != TYPE_OBJECT && type != TYPE_PERCENTILE && type != TYPE_HLL && type != TYPE_MAP;
return type != TYPE_JSON && type != TYPE_OBJECT && type != TYPE_PERCENTILE && type != TYPE_HLL &&
type != TYPE_MAP && type != TYPE_VARIANT;
}
bool TypeDescriptor::support_groupby() const {
@ -283,7 +285,8 @@ bool TypeDescriptor::support_groupby() const {
return std::all_of(children.begin(), children.end(),
[](const TypeDescriptor& t) { return t.support_groupby(); });
}
return type != TYPE_JSON && type != TYPE_OBJECT && type != TYPE_PERCENTILE && type != TYPE_HLL;
return type != TYPE_JSON && type != TYPE_OBJECT && type != TYPE_PERCENTILE && type != TYPE_HLL &&
type != TYPE_VARIANT;
}
TypeDescriptor TypeDescriptor::from_storage_type_info(TypeInfo* type_info) {
@ -321,6 +324,7 @@ int TypeDescriptor::get_slot_size() const {
case TYPE_OBJECT:
case TYPE_PERCENTILE:
case TYPE_JSON:
case TYPE_VARIANT:
case TYPE_VARBINARY:
return sizeof(Slice);

View File

@ -113,6 +113,13 @@ struct TypeDescriptor {
return res;
}
static TypeDescriptor create_variant_type() {
TypeDescriptor res;
res.type = TYPE_VARIANT;
res.len = 128; // Default length for variant type
return res;
}
static TypeDescriptor create_array_type(const TypeDescriptor& children) {
TypeDescriptor res;
res.type = TYPE_ARRAY;
@ -211,6 +218,8 @@ struct TypeDescriptor {
return TypeDescriptor::create_decimalv3_type(TYPE_DECIMAL256, precision, scale);
case TYPE_JSON:
return TypeDescriptor::create_json_type();
case TYPE_VARIANT:
return TypeDescriptor::create_variant_type();
case TYPE_OBJECT:
return TypeDescriptor::create_bitmap_type();
default:
@ -325,7 +334,9 @@ struct TypeDescriptor {
// For some types with potential huge length, whose memory consumption is far more than normal types,
// they need a different chunk_size setting
bool is_huge_type() const { return type == TYPE_JSON || type == TYPE_OBJECT || type == TYPE_HLL; }
bool is_huge_type() const {
return type == TYPE_JSON || type == TYPE_OBJECT || type == TYPE_HLL || type == TYPE_VARIANT;
}
/// Returns the size of a slot for this type.
int get_slot_size() const;

View File

@ -31,10 +31,12 @@
#include "column/nullable_column.h"
#include "column/object_column.h"
#include "column/struct_column.h"
#include "column/variant_column.h"
#include "gutil/strings/substitute.h"
#include "runtime/descriptors.h"
#include "serde/protobuf_serde.h"
#include "types/hll.h"
#include "types/variant_value.h"
#include "util/coding.h"
#include "util/json.h"
#include "util/percentile_value.h"
@ -379,6 +381,55 @@ public:
}
};
class VariantColumnSerde {
public:
static int64_t max_serialized_size(const VariantColumn& column) {
const auto& pool = column.get_pool();
int64_t size = 0;
size += sizeof(uint32_t); // num_objects
for (const auto& obj : pool) {
size += sizeof(uint64_t);
size += obj.serialize_size();
}
return size;
}
static uint8_t* serialize(const VariantColumn& column, uint8_t* buff) {
buff = write_little_endian_32(column.get_pool().size(), buff);
for (const auto& v : column.get_pool()) {
constexpr uint64_t size_field_length = sizeof(uint64_t);
uint64_t actual = v.serialize(buff + size_field_length);
buff = write_little_endian_64(actual, buff);
buff += actual;
}
return buff;
}
static StatusOr<const uint8_t*> deserialize(const uint8_t* buff, VariantColumn* column) {
uint32_t num_objects = 0;
buff = read_little_endian_32(buff, &num_objects);
column->reset_column();
auto& pool = column->get_pool();
pool.reserve(num_objects);
for (int i = 0; i < num_objects; ++i) {
uint64_t serialized_size = 0;
buff = read_little_endian_64(buff, &serialized_size);
auto variant = VariantValue::create(Slice(buff, serialized_size));
if (!variant.ok()) {
return Status::Corruption(fmt::format("Failed to deserialize VariantValue at index {}: {}", i,
variant.status().to_string()));
}
pool.emplace_back(std::move(variant.value()));
buff += serialized_size;
}
return buff;
}
};
class NullableColumnSerde {
public:
static int64_t max_serialized_size(const NullableColumn& column, const int encode_level) {
@ -543,6 +594,11 @@ public:
return Status::OK();
}
Status do_visit(const VariantColumn& column) {
_size += VariantColumnSerde::max_serialized_size(column);
return Status::OK();
}
int64_t size() const { return _size; }
private:
@ -607,6 +663,11 @@ public:
return Status::OK();
}
Status do_visit(const VariantColumn& column) {
_cur = VariantColumnSerde::serialize(column, _cur);
return Status::OK();
}
uint8_t* cur() const { return _cur; }
int64_t bytes() const { return _cur - _buff; }
@ -679,6 +740,16 @@ public:
return Status::OK();
}
Status do_visit(VariantColumn* column) {
auto v = VariantColumnSerde::deserialize(_cur, column);
if (!v.ok()) {
return v.status();
}
_cur = v.value();
return Status::OK();
}
const uint8_t* cur() const { return _cur; }
int64_t bytes() const { return _cur - _buff; }

View File

@ -625,9 +625,10 @@ bool ChunkPipelineAccumulator::is_finished() const {
}
template <class ColumnT>
inline constexpr bool is_object = std::is_same_v<ColumnT, ArrayColumn> || std::is_same_v<ColumnT, StructColumn> ||
std::is_same_v<ColumnT, MapColumn> || std::is_same_v<ColumnT, JsonColumn> ||
std::is_same_v<ObjectColumn<typename ColumnT::ValueType>, ColumnT>;
inline constexpr bool is_object =
std::is_same_v<ColumnT, ArrayColumn> || std::is_same_v<ColumnT, StructColumn> ||
std::is_same_v<ColumnT, MapColumn> || std::is_same_v<ColumnT, JsonColumn> ||
std::is_same_v<ColumnT, VariantColumn> || std::is_same_v<ObjectColumn<typename ColumnT::ValueType>, ColumnT>;
// Selective-copy data from SegmentedColumn according to provided index
class SegmentedColumnSelectiveCopy final : public ColumnVisitorAdapter<SegmentedColumnSelectiveCopy> {
@ -737,7 +738,7 @@ public:
return {};
}
// Inefficient fallback implementation, it's usually used for Array/Struct/Map/Json
// Inefficient fallback implementation, it's usually used for Array/Struct/Map/Json/Variant
template <class ColumnT>
typename std::enable_if_t<is_object<ColumnT>, Status> do_visit(const ColumnT& column) {
_result = column.clone_empty();

View File

@ -698,6 +698,7 @@ ColumnPredicate* new_column_in_predicate_generic(const TypeInfoPtr& type_info, C
case TYPE_OBJECT:
case TYPE_PERCENTILE:
case TYPE_JSON:
case TYPE_VARIANT:
case TYPE_NULL:
case TYPE_FUNCTION:
case TYPE_TIME:

View File

@ -405,6 +405,7 @@ ColumnPredicate* new_column_not_in_predicate(const TypeInfoPtr& type_info, Colum
case TYPE_OBJECT:
case TYPE_PERCENTILE:
case TYPE_JSON:
case TYPE_VARIANT:
case TYPE_NULL:
case TYPE_FUNCTION:
case TYPE_TIME:

View File

@ -182,6 +182,7 @@ static ColumnPredicate* new_column_predicate(const TypeInfoPtr& type_info, Colum
case TYPE_OBJECT:
case TYPE_PERCENTILE:
case TYPE_JSON:
case TYPE_VARIANT:
case TYPE_NULL:
case TYPE_FUNCTION:
case TYPE_TIME:

View File

@ -1638,6 +1638,7 @@ const FieldConverter* get_field_converter(LogicalType from_type, LogicalType to_
TYPE_CASE_CLAUSE(TYPE_OBJECT)
TYPE_CASE_CLAUSE(TYPE_PERCENTILE)
TYPE_CASE_CLAUSE(TYPE_JSON)
TYPE_CASE_CLAUSE(TYPE_VARIANT)
TYPE_CASE_CLAUSE(TYPE_VARBINARY)
case TYPE_DECIMAL32:
case TYPE_DECIMAL64:

View File

@ -39,8 +39,8 @@ struct RuntimeColumnPredicateBuilder {
const SlotDescriptor* slot, int32_t driver_sequence,
ObjectPool* pool) {
// keep consistent with ColumnRangeBuilder
if constexpr (ltype == TYPE_TIME || ltype == TYPE_NULL || ltype == TYPE_JSON || lt_is_float<ltype> ||
lt_is_binary<ltype>) {
if constexpr (ltype == TYPE_TIME || ltype == TYPE_NULL || ltype == TYPE_JSON || ltype == TYPE_VARIANT ||
lt_is_float<ltype> || lt_is_binary<ltype>) {
DCHECK(false) << "unreachable path";
return Status::NotSupported("unreachable path");
} else {

View File

@ -103,6 +103,7 @@ uint32_t TabletColumn::get_field_length_by_type(LogicalType type, uint32_t strin
case TYPE_HLL:
case TYPE_PERCENTILE:
case TYPE_JSON:
case TYPE_VARIANT:
case TYPE_VARBINARY:
return string_length + sizeof(get_olap_string_max_length());
case TYPE_ARRAY:

View File

@ -28,4 +28,5 @@ add_library(Types STATIC
checker/type_checker.cpp
type_checker_manager.cpp
int256.cpp
variant_value.cpp
)

View File

@ -60,10 +60,12 @@ LogicalType string_to_logical_type(const std::string& type_str) {
if (upper_type_str == "DECIMAL256") return TYPE_DECIMAL256;
if (upper_type_str == "INT256") return TYPE_INT256;
if (upper_type_str == "JSON") return TYPE_JSON;
if (upper_type_str == "BINARY") return TYPE_BINARY;
if (upper_type_str == "VARBINARY") return TYPE_VARBINARY;
if (upper_type_str == "ANY_ARRAY") return TYPE_ARRAY;
if (upper_type_str == "ANY_STRUCT") return TYPE_STRUCT;
if (upper_type_str == "ANY_MAP") return TYPE_MAP;
if (upper_type_str == "VARIANT") return TYPE_VARIANT;
LOG(WARNING) << "invalid type string. [type='" << type_str << "']";
return TYPE_UNKNOWN;
}
@ -152,6 +154,8 @@ const char* logical_type_to_string(LogicalType type) {
return "MAX_VALUE";
case TYPE_VARBINARY:
return "VARBINARY";
case TYPE_VARIANT:
return "VARIANT";
}
return "";
}
@ -285,6 +289,7 @@ public:
_data[TYPE_VARBINARY] = TYPE_VARBINARY;
_data[TYPE_DECIMAL256] = TYPE_DECIMAL256;
_data[TYPE_INT256] = TYPE_INT256;
_data[TYPE_VARIANT] = TYPE_VARIANT;
}
LogicalType get_logical_type(LogicalType field_type) { return _data[field_type]; }

View File

@ -72,10 +72,11 @@ enum LogicalType {
TYPE_PERCENTILE = 53,
TYPE_JSON = 54,
TYPE_VARIANT = 55,
// max value of LogicalType, newly-added type should not exceed this value.
// used to create a fixed-size hash map.
TYPE_MAX_VALUE = 55
TYPE_MAX_VALUE = 56
};
// TODO(lism): support varbinary for zone map.
@ -168,6 +169,7 @@ inline bool is_semi_type(LogicalType type) {
case TYPE_ARRAY:
case TYPE_MAP:
case TYPE_JSON:
case TYPE_VARIANT:
return true;
default:
return false;
@ -244,6 +246,7 @@ constexpr bool is_scalar_logical_type(LogicalType ltype) {
case TYPE_DECIMAL128: /* 26 */
case TYPE_DECIMAL256: /* 27 */
case TYPE_JSON:
case TYPE_VARIANT:
return true;
default:
return false;
@ -322,6 +325,7 @@ VALUE_GUARD(LogicalType, ObjectLTGuard, lt_is_object, TYPE_OBJECT)
VALUE_GUARD(LogicalType, StringLTGuard, lt_is_string, TYPE_CHAR, TYPE_VARCHAR)
VALUE_GUARD(LogicalType, BinaryLTGuard, lt_is_binary, TYPE_BINARY, TYPE_VARBINARY)
VALUE_GUARD(LogicalType, JsonGuard, lt_is_json, TYPE_JSON)
VALUE_GUARD(LogicalType, VariantGuard, lt_is_variant, TYPE_VARIANT)
VALUE_GUARD(LogicalType, FunctionGuard, lt_is_function, TYPE_FUNCTION)
VALUE_GUARD(LogicalType, ObjectFamilyLTGuard, lt_is_object_family, TYPE_JSON, TYPE_HLL, TYPE_OBJECT, TYPE_PERCENTILE)
VALUE_GUARD(LogicalType, ArrayGuard, lt_is_array, TYPE_ARRAY)

View File

@ -55,6 +55,7 @@ namespace starrocks {
M(TYPE_TIME) \
M(TYPE_JSON) \
M(TYPE_VARBINARY) \
M(TYPE_VARIANT) \
M(TYPE_BOOLEAN)
#define APPLY_FOR_COMPLEX_TYPE(M) \
@ -104,7 +105,8 @@ namespace starrocks {
M(FUNCTION) \
M(BINARY) \
M(VARBINARY) \
M(JSON)
M(JSON) \
M(VARIANT)
#define APPLY_FOR_MIN_MAX_COMPRESSABLE_TYPE(M) \
APPLY_FOR_ALL_INT_TYPE(M) \

View File

@ -0,0 +1,191 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "variant_value.h"
#include <arrow/util/endian.h>
#include <boost/uuid/uuid_io.hpp>
#include <cstring>
#include "util/url_coding.h"
#include "util/variant_util.h"
namespace starrocks {
StatusOr<VariantValue> VariantValue::create(const Slice& slice) {
// Validate slice first
if (slice.get_data() == nullptr) {
return Status::InvalidArgument("Invalid variant slice: null data pointer");
}
if (slice.get_size() < sizeof(uint32_t)) {
return Status::InvalidArgument("Invalid variant slice: too small to contain size header");
}
const char* variant_raw = slice.get_data();
// The first 4 bytes are the size of the variant
uint32_t variant_size;
std::memcpy(&variant_size, variant_raw, sizeof(uint32_t));
// Check variant size limit (16MB)
if (variant_size > kMaxVariantSize) {
return Status::InvalidArgument("Variant size exceeds maximum limit: " + std::to_string(variant_size) + " > " +
std::to_string(kMaxVariantSize));
}
if (variant_size > slice.get_size() - sizeof(uint32_t)) {
return Status::InvalidArgument(
"Invalid variant size: " + std::to_string(variant_size) +
" exceeds available data: " + std::to_string(slice.get_size() - sizeof(uint32_t)));
}
const auto variant = std::string_view(variant_raw + sizeof(uint32_t), variant_size);
auto metadata_status = load_metadata(variant);
if (!metadata_status.ok()) {
return metadata_status.status();
}
const auto& metadata_view = metadata_status.value();
if (metadata_view.size() > variant_size) {
return Status::InvalidArgument("Metadata size exceeds variant size");
}
std::string metadata(metadata_view);
RETURN_IF_ERROR(validate_metadata(metadata));
std::string value(variant_raw + sizeof(uint32_t) + metadata_view.size(), variant_size - metadata_view.size());
return VariantValue(std::move(metadata), std::move(value));
}
Status VariantValue::validate_metadata(const std::string_view metadata) {
// metadata at least 3 bytes: version, dictionarySize and at least one offset.
if (metadata.size() < kMinMetadataSize) {
return Status::InternalError("Variant metadata is too short");
}
const uint8_t header = static_cast<uint8_t>(metadata[0]);
if (const uint8_t version = header & kVersionMask; version != 1) {
return Status::NotSupported("Unsupported variant version: " + std::to_string(version));
}
return Status::OK();
}
VariantValue VariantValue::of_null() {
static constexpr uint8_t header = static_cast<uint8_t>(VariantPrimitiveType::NULL_TYPE) << 2;
static constexpr uint8_t null_chars[] = {header};
return VariantValue(VariantMetadata::kEmptyMetadata,
std::string_view{reinterpret_cast<const char*>(null_chars), 1});
}
StatusOr<std::string_view> VariantValue::load_metadata(const std::string_view variant) {
if (variant.empty()) {
return Status::InvalidArgument("Variant is empty");
}
// Check variant size limit (16MB)
if (variant.size() > kMaxVariantSize) {
return Status::InvalidArgument("Variant size exceeds maximum limit: " + std::to_string(variant.size()) + " > " +
std::to_string(kMaxVariantSize));
}
const uint8_t header = static_cast<uint8_t>(variant[0]);
if (const uint8_t version = header & kVersionMask; version != 1) {
return Status::NotSupported("Unsupported variant version: " + std::to_string(version));
}
const uint8_t offset_size = 1 + ((header & kOffsetSizeMask) >> kOffsetSizeShift);
if (offset_size < 1 || offset_size > 4) {
return Status::InvalidArgument("Invalid offset size in variant metadata: " + std::to_string(offset_size) +
", expected 1, 2, 3 or 4 bytes");
}
if (variant.size() < kHeaderSize + offset_size) {
return Status::InvalidArgument("Variant too short to contain dict_size");
}
uint32_t dict_size = VariantUtil::readLittleEndianUnsigned(variant.data() + 1, offset_size);
uint32_t offset_list_offset = kHeaderSize + offset_size;
// Check for potential overflow in offset list size calculation
if (dict_size > (kMaxVariantSize - offset_list_offset) / offset_size - 1) {
return Status::InvalidArgument("Dict size too large: " + std::to_string(dict_size));
}
uint32_t required_offset_list_size = (1 + dict_size) * offset_size;
uint32_t data_offset = offset_list_offset + required_offset_list_size;
uint32_t last_offset_pos = offset_list_offset + dict_size * offset_size;
if (last_offset_pos + offset_size > variant.size()) {
return Status::InvalidArgument("Variant too short to contain all offsets");
}
uint32_t last_data_size = VariantUtil::readLittleEndianUnsigned(variant.data() + last_offset_pos, offset_size);
uint32_t end_offset = data_offset + last_data_size;
if (end_offset > variant.size()) {
return Status::CapacityLimitExceed("Variant metadata end offset exceeds variant size: " +
std::to_string(end_offset) + " > " + std::to_string(variant.size()));
}
return std::string_view(variant.data(), end_offset);
}
size_t VariantValue::serialize(uint8_t* dst) const {
size_t offset = 0;
// The first 4 bytes are the total size of the variant
uint32_t total_size = static_cast<uint32_t>(_metadata.size() + _value.size());
memcpy(dst + offset, &total_size, sizeof(uint32_t));
offset += sizeof(uint32_t);
// metadata
memcpy(dst + offset, _metadata.data(), _metadata.size());
offset += _metadata.size();
// value
memcpy(dst + offset, _value.data(), _value.size());
offset += _value.size();
return offset;
}
uint32_t VariantValue::serialize_size() const {
return sizeof(uint32_t) + _metadata.size() + _value.size();
}
StatusOr<std::string> VariantValue::to_json(cctz::time_zone timezone) const {
std::stringstream json_str;
auto status = VariantUtil::variant_to_json(_metadata, _value, json_str, timezone);
if (!status.ok()) {
return status;
}
return json_str.str();
}
std::string VariantValue::to_string() const {
auto json_result = to_json();
if (!json_result.ok()) {
return "";
}
return json_result.value();
}
std::ostream& operator<<(std::ostream& os, const VariantValue& value) {
return os << value.to_string();
}
} // namespace starrocks

View File

@ -0,0 +1,103 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cctz/time_zone.h>
#include <string_view>
#include "common/statusor.h"
#include "fmt/format.h"
#include "util/slice.h"
namespace starrocks {
class VariantValue {
public:
VariantValue(const std::string_view metadata, const std::string_view value) : _metadata(metadata), _value(value) {}
VariantValue(std::string metadata, std::string value) : _metadata(std::move(metadata)), _value(std::move(value)) {}
VariantValue() = default;
/**
* Static factory method to create a VariantValue from a Slice.
* @param slice The Slice must contain the full variant binary including size header.
* The first 4 bytes of the Slice are expected to be the size of the variant.
* The memory layout is: [total size (4 bytes)][metadata][value].
* @return The created VariantValue or an error status.
*/
static StatusOr<VariantValue> create(const Slice& slice);
VariantValue(const VariantValue& rhs) = default;
VariantValue(VariantValue&& rhs) noexcept = default;
static Status validate_metadata(const std::string_view metadata);
VariantValue& operator=(const VariantValue& rhs) = default;
VariantValue& operator=(VariantValue&& rhs) noexcept = default;
static VariantValue of_null();
// Load metadata from the variant binary.
// will slice the variant binary to extract metadata
static StatusOr<std::string_view> load_metadata(std::string_view variant);
// Serialize the VariantValue to a byte array.
// return the number of bytes written
size_t serialize(uint8_t* dst) const;
// Calculate the size of the serialized VariantValue.
// 4 bytes for value size + metadata size + value size
uint32_t serialize_size() const;
uint64_t mem_usage() const { return serialize_size(); }
// Convert to a JSON string
StatusOr<std::string> to_json(cctz::time_zone timezone = cctz::local_time_zone()) const;
std::string to_string() const;
std::string get_metadata() const { return _metadata; }
std::string get_value() const { return _value; }
// Variant value has a maximum size limit of 16MB to prevent excessive memory usage.
static constexpr uint32_t kMaxVariantSize = 16 * 1024 * 1024;
private:
static constexpr uint8_t kVersionMask = 0b1111;
static constexpr uint8_t kSortedStrings = 0b10000;
static constexpr uint8_t kOffsetSizeMask = 0b11000000;
static constexpr uint8_t kOffsetSizeShift = 6;
static constexpr uint8_t kHeaderSize = 1;
static constexpr size_t kMinMetadataSize = 3;
// Now directly store strings instead of string_views
std::string _metadata;
std::string _value;
};
// append json string to the stream
std::ostream& operator<<(std::ostream& os, const VariantValue& json);
} // namespace starrocks
// fmt::format
template <>
struct fmt::formatter<starrocks::VariantValue> : formatter<std::string> {
template <typename FormatContext>
auto format(const starrocks::VariantValue& p, FormatContext& ctx) -> decltype(ctx.out()) {
return formatter<std::string>::format(p.to_string(), ctx);
}
}; // namespace fmt

View File

@ -114,6 +114,7 @@ set(UTIL_FILES
internal_service_recoverable_stub.cpp
lake_service_recoverable_stub.cpp
byte_buffer.cpp
variant_util.cpp
)
add_library(Util STATIC

View File

@ -0,0 +1,288 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "variant_util.h"
#include <arrow/util/endian.h>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_io.hpp>
#include "exprs/cast_expr.h"
#include "formats/parquet/variant.h"
#include "runtime/decimalv3.h"
#include "types/timestamp_value.h"
#include "url_coding.h"
namespace starrocks {
uint32_t VariantUtil::readLittleEndianUnsigned(const void* from, uint8_t size) {
uint32_t result = 0;
memcpy(&result, from, size);
return arrow::bit_util::FromLittleEndian(result);
}
std::string VariantUtil::type_to_string(VariantType type) {
switch (type) {
case VariantType::OBJECT:
return "Object";
case VariantType::ARRAY:
return "Array";
case VariantType::NULL_TYPE:
return "Null";
case VariantType::BOOLEAN:
return "Boolean";
case VariantType::INT8:
return "Int8";
case VariantType::INT16:
return "Int16";
case VariantType::INT32:
return "Int32";
case VariantType::INT64:
return "Int64";
case VariantType::DOUBLE:
return "Double";
case VariantType::DECIMAL4:
return "Decimal4";
case VariantType::DECIMAL8:
return "Decimal8";
case VariantType::DECIMAL16:
return "Decimal16";
case VariantType::DATE:
return "Date";
case VariantType::TIMESTAMP_TZ:
return "TimestampTZ";
case VariantType::TIMESTAMP_NTZ:
return "TimestampNTZ";
case VariantType::FLOAT:
return "Float";
case VariantType::BINARY:
return "Binary";
case VariantType::STRING:
return "String";
case VariantType::TIME_NTZ:
return "TimeNTZ";
case VariantType::TIMESTAMP_TZ_NANOS:
return "TimestampTZNanos";
case VariantType::TIMESTAMP_NTZ_NANOS:
return "TimestampNTZNanos";
case VariantType::UUID:
return "UUID";
default:
return "Unknown";
}
}
std::string epoch_day_to_date(int32_t epoch_days) {
std::time_t raw_time = epoch_days * 86400; // to seconds
std::tm* ptm = std::gmtime(&raw_time); // to UTC
char buffer[11];
std::strftime(buffer, sizeof(buffer), "%Y-%m-%d", ptm);
return buffer;
}
std::string VariantUtil::decimal4_to_string(DecimalValue<int32_t> decimal) {
return DecimalV3Cast::to_string<int32_t>(decimal.value, decimal_precision_limit<int32_t>, decimal.scale);
}
std::string VariantUtil::decimal8_to_string(DecimalValue<int64_t> decimal) {
return DecimalV3Cast::to_string<int64_t>(decimal.value, decimal_precision_limit<int64_t>, decimal.scale);
}
std::string VariantUtil::decimal16_to_string(DecimalValue<int128_t> decimal) {
return DecimalV3Cast::to_string<int128_t>(decimal.value, decimal_precision_limit<int128_t>, decimal.scale);
}
void append_quoted_string(std::stringstream& ss, const std::string& str) {
ss << '"' << str << '"';
}
Status VariantUtil::variant_to_json(std::string_view metadata, std::string_view value, std::stringstream& json_str,
cctz::time_zone timezone) {
Variant variant{metadata, value};
switch (variant.type()) {
case VariantType::NULL_TYPE:
json_str << "null";
break;
case VariantType::BOOLEAN: {
bool res = *variant.get_bool();
json_str << (res ? "true" : "false");
break;
}
case VariantType::INT8:
json_str << std::to_string(*variant.get_int8());
break;
case VariantType::INT16:
json_str << std::to_string(*variant.get_int16());
break;
case VariantType::INT32:
json_str << std::to_string(*variant.get_int32());
break;
case VariantType::INT64:
json_str << std::to_string(*variant.get_int64());
break;
case VariantType::FLOAT: {
const float f = *variant.get_float();
if (std::isfinite(f)) {
json_str << std::to_string(f);
} else {
append_quoted_string(json_str, std::to_string(f));
}
break;
}
case VariantType::DOUBLE: {
const double d = *variant.get_double();
if (std::isfinite(d)) {
json_str << std::to_string(d);
} else {
append_quoted_string(json_str, std::to_string(d));
}
break;
}
case VariantType::DECIMAL4: {
DecimalValue<int32_t> decimal = *variant.get_decimal4();
json_str << decimal4_to_string(decimal);
break;
}
case VariantType::DECIMAL8: {
DecimalValue<int64_t> decimal = *variant.get_decimal8();
json_str << decimal8_to_string(decimal);
break;
}
case VariantType::DECIMAL16: {
DecimalValue<int128_t> decimal = *variant.get_decimal16();
json_str << decimal16_to_string(decimal);
break;
}
case VariantType::STRING: {
json_str << *variant.get_string();
break;
}
case VariantType::BINARY: {
const std::string_view binary = *variant.get_binary();
const std::string binary_str(binary.data(), binary.size());
std::string encoded;
base64_encode(binary_str, &encoded);
append_quoted_string(json_str, encoded);
break;
}
case VariantType::UUID: {
const auto uuid_arr = *variant.get_uuid();
boost::uuids::uuid uuid{};
for (size_t i = 0; i < uuid.size(); ++i) {
uuid.data[i] = uuid_arr[i];
}
append_quoted_string(json_str, boost::uuids::to_string(uuid));
break;
}
case VariantType::DATE: {
int32_t date = *variant.get_date();
std::string date_str = epoch_day_to_date(date);
append_quoted_string(json_str, date_str);
break;
}
case VariantType::TIMESTAMP_TZ: {
const int64_t timestamp_micros = *variant.get_timestamp_micros();
TimestampValue tsv{};
tsv.from_unix_second(timestamp_micros / 1000000, timestamp_micros % 1000000);
std::string timestamp_str = timestamp::to_string_with_timezone<false, false>(tsv.timestamp(), timezone);
append_quoted_string(json_str, timestamp_str);
break;
}
case VariantType::TIMESTAMP_NTZ: {
const int64_t timestamp_micros = *variant.get_timestamp_micros_ntz();
TimestampValue tsv{};
tsv.from_unix_second(timestamp_micros / 1000000, timestamp_micros % 1000000);
std::string timestamp_str = tsv.to_string(false);
append_quoted_string(json_str, timestamp_str);
break;
}
case VariantType::OBJECT: {
auto info = get_object_info(value);
if (!info.ok()) {
return info.status();
}
const auto& [num_elements, id_start_offset, id_size, offset_start_offset, offset_size, data_start_offset] =
info.value();
json_str << "{";
for (size_t i = 0; i < num_elements; ++i) {
if (i > 0) {
json_str << ",";
}
uint32_t id = readLittleEndianUnsigned(value.data() + id_start_offset + i * id_size, id_size);
uint32_t offset =
readLittleEndianUnsigned(value.data() + offset_start_offset + i * offset_size, offset_size);
auto key = variant.metadata().get_key(id);
if (!key.ok()) {
return key.status();
}
json_str << *key << ":";
if (uint32_t next_pos = data_start_offset + offset; next_pos < value.size()) {
std::string_view next_value = value.substr(next_pos, value.size() - next_pos);
// Recursively convert the next value to JSON
auto status = variant_to_json(metadata, next_value, json_str, timezone);
if (!status.ok()) {
return status;
}
} else {
return Status::InternalError("Invalid offset in object: " + std::to_string(offset));
}
}
json_str << "}";
break;
}
case VariantType::ARRAY: {
auto info = get_array_info(value);
if (!info.ok()) {
return info.status();
}
const auto& [num_elements, offset_size, offset_start_offset, data_start_offset] = info.value();
json_str << "[";
for (size_t i = 0; i < num_elements; ++i) {
if (i > 0) {
json_str << ",";
}
uint32_t offset =
readLittleEndianUnsigned(value.data() + offset_start_offset + i * offset_size, offset_size);
if (uint32_t next_pos = data_start_offset + offset; next_pos < value.size()) {
std::string_view next_value = value.substr(next_pos, value.size() - next_pos);
// Recursively convert the next value to JSON
auto status = variant_to_json(metadata, next_value, json_str, timezone);
if (!status.ok()) {
return status;
}
} else {
return Status::InternalError("Invalid offset in array: " + std::to_string(offset));
}
}
json_str << "]";
break;
}
default:
return Status::NotSupported("Unsupported variant type: " + type_to_string(variant.type()));
}
return Status::OK();
}
uint8_t VariantUtil::primitiveHeader(VariantPrimitiveType primitive) {
return static_cast<uint8_t>(primitive) << 2;
}
} // namespace starrocks

View File

@ -0,0 +1,36 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cctz/time_zone.h>
#include "formats/parquet/variant.h"
#include "types/variant_value.h"
namespace starrocks {
struct VariantUtil {
static std::string type_to_string(VariantType type);
static Status variant_to_json(std::string_view metadata, std::string_view value, std::stringstream& json_str,
cctz::time_zone timezone = cctz::local_time_zone());
static uint32_t readLittleEndianUnsigned(const void* from, uint8_t size);
static std::string decimal4_to_string(DecimalValue<int32_t> decimal);
static std::string decimal8_to_string(DecimalValue<int64_t> decimal);
static std::string decimal16_to_string(DecimalValue<int128_t> decimal);
static uint8_t primitiveHeader(VariantPrimitiveType primitive);
};
} // namespace starrocks

View File

@ -15,6 +15,7 @@ set(EXEC_FILES
./column/field_test.cpp
./column/fixed_length_column_test.cpp
./column/json_column_test.cpp
./column/variant_column_test.cpp
./column/map_column_test.cpp
./column/struct_column_test.cpp
./column/nullable_column_test.cpp
@ -460,6 +461,7 @@ set(EXEC_FILES
./types/int256_test.cpp
./types/int256_arithmetic_test.cpp
./types/type_checker_test.cpp
./types/variant_value_test.cpp
./simd/batch_run_counter_test.cpp
./simd/simd_test.cpp
./simd/simd_selector_test.cpp

View File

@ -139,6 +139,16 @@ PARALLEL_TEST(ColumnViewTest, test_create_json_column_view) {
DCHECK(json_column_view->is_json_view());
}
}
PARALLEL_TEST(ColumnViewTest, test_create_variant_column_view) {
TypeDescriptor type_desc = TypeDescriptor(LogicalType::TYPE_VARIANT);
for (auto nullable : {true, false}) {
auto opt_variant_column_view = ColumnViewHelper::create_column_view(type_desc, nullable, 0, 0);
DCHECK(opt_variant_column_view.has_value());
auto variant_column_view = std::move(opt_variant_column_view.value());
DCHECK(variant_column_view->is_variant_view());
EXPECT_TRUE(variant_column_view->is_variant_view());
}
}
PARALLEL_TEST(ColumnViewTest, test_create_binary_column_view) {
for (auto ltype : {LogicalType::TYPE_VARBINARY, LogicalType::TYPE_VARCHAR, LogicalType::TYPE_CHAR}) {
for (auto nullable : {true, false}) {

View File

@ -0,0 +1,241 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "column/column_builder.h"
#include "column/type_traits.h"
#include "formats/parquet/variant.h"
#include "testutil/parallel_test.h"
#include "types/logical_type.h"
#include "types/variant_value.h"
namespace starrocks {
uint8_t primitiveHeader(VariantPrimitiveType primitive) {
return static_cast<uint8_t>(primitive) << 2;
}
// NOLINTNEXTLINE
PARALLEL_TEST(VariantColumnTest, test_build_column) {
// create from type traits
{
const uint8_t int_chars[] = {primitiveHeader(VariantPrimitiveType::INT32), 0xD2, 0x02, 0x96, 0x49};
std::string_view int_string(reinterpret_cast<const char*>(int_chars), sizeof(int_chars));
VariantValue variant{VariantMetadata::kEmptyMetadata, int_string};
auto column = RunTimeColumnType<TYPE_VARIANT>::create();
EXPECT_EQ("variant", column->get_name());
EXPECT_TRUE(column->is_variant());
column->append(&variant);
auto res = column->get_object(0);
ASSERT_EQ(res->serialize_size(), variant.serialize_size());
ASSERT_EQ(res->get_metadata(), variant.get_metadata());
ASSERT_EQ(res->get_value(), variant.get_value());
EXPECT_EQ(res->to_string(), variant.to_string());
EXPECT_EQ("1234567890", res->to_string());
}
// create from builder
{
ColumnBuilder<TYPE_VARIANT> builder(1);
const uint8_t int_chars[] = {primitiveHeader(VariantPrimitiveType::INT32), 0xD2, 0x02, 0x96, 0x49};
std::string_view int_string(reinterpret_cast<const char*>(int_chars), sizeof(int_chars));
VariantValue variant{VariantMetadata::kEmptyMetadata, int_string};
builder.append(&variant);
auto column = builder.build(false);
EXPECT_EQ("variant", column->get_name());
EXPECT_TRUE(column->is_variant());
VariantColumn::Ptr column_ptr = ColumnHelper::cast_to<TYPE_VARIANT>(column);
ASSERT_EQ(column_ptr->size(), 1);
auto res = column_ptr->get_object(0);
ASSERT_EQ(res->serialize_size(), variant.serialize_size());
ASSERT_EQ(res->get_metadata(), variant.get_metadata());
ASSERT_EQ(res->get_value(), variant.get_value());
EXPECT_EQ(res->to_string(), variant.to_string());
EXPECT_EQ("1234567890", res->to_string());
}
// clone
{
VariantColumn::Ptr column = VariantColumn::create();
const uint8_t int_chars[] = {primitiveHeader(VariantPrimitiveType::INT32), 0xD2, 0x02, 0x96, 0x49};
std::string_view int_string(reinterpret_cast<const char*>(int_chars), sizeof(int_chars));
VariantValue variant{VariantMetadata::kEmptyMetadata, int_string};
column->append(&variant);
{
auto copy = column->clone();
ASSERT_EQ(copy->size(), 1);
auto res = copy->get(0).get_variant();
ASSERT_EQ(res->serialize_size(), variant.serialize_size());
ASSERT_EQ(res->get_metadata(), variant.get_metadata());
ASSERT_EQ(res->get_value(), variant.get_value());
EXPECT_EQ(res->to_string(), variant.to_string());
EXPECT_EQ("1234567890", res->to_string());
}
// clone nullable by helper
{
TypeDescriptor desc = TypeDescriptor::create_variant_type();
auto copy = ColumnHelper::clone_column(desc, true, column, column->size());
ASSERT_EQ(copy->size(), 1);
ASSERT_TRUE(copy->is_nullable());
// unwrap nullable column
Column* unwrapped = ColumnHelper::get_data_column(copy.get());
VariantColumn* variant_column_ptr = down_cast<VariantColumn*>(unwrapped);
ASSERT_EQ(variant_column_ptr->size(), 1);
auto res = variant_column_ptr->get(0).get_variant();
ASSERT_EQ(res->serialize_size(), variant.serialize_size());
ASSERT_EQ(res->get_metadata(), variant.get_metadata());
ASSERT_EQ(res->get_value(), variant.get_value());
EXPECT_EQ(res->to_string(), variant.to_string());
EXPECT_EQ("1234567890", res->to_string());
}
// clone variant_column by helper
{
TypeDescriptor desc = TypeDescriptor::create_variant_type();
ColumnPtr copy = ColumnHelper::clone_column(desc, false, column, column->size());
ASSERT_EQ(copy->size(), 1);
ASSERT_FALSE(copy->is_nullable());
VariantColumn::Ptr variant_column_ptr = ColumnHelper::cast_to<TYPE_VARIANT>(copy);
ASSERT_EQ(variant_column_ptr->size(), 1);
auto res = variant_column_ptr->get(0).get_variant();
ASSERT_EQ(res->serialize_size(), variant.serialize_size());
ASSERT_EQ(res->get_metadata(), variant.get_metadata());
ASSERT_EQ(res->get_value(), variant.get_value());
EXPECT_EQ(res->to_string(), variant.to_string());
EXPECT_EQ("1234567890", res->to_string());
VariantColumn* variant_column = ColumnHelper::cast_to_raw<TYPE_VARIANT>(copy.get());
ASSERT_EQ(variant_column->size(), 1);
auto raw_res = variant_column->get(0).get_variant();
ASSERT_EQ(res->serialize_size(), variant.serialize_size());
ASSERT_EQ(raw_res->get_metadata(), variant.get_metadata());
ASSERT_EQ(raw_res->get_value(), variant.get_value());
EXPECT_EQ(raw_res->to_string(), variant.to_string());
EXPECT_EQ("1234567890", raw_res->to_string());
}
}
}
// NOLINTNEXTLINE
PARALLEL_TEST(VariantColumnTest, test_serialize) {
std::string_view empty_metadata = VariantMetadata::kEmptyMetadata;
const uint8_t uuid_chars[] = {primitiveHeader(VariantPrimitiveType::UUID),
0xf2,
0x4f,
0x9b,
0x64,
0x81,
0xfa,
0x49,
0xd1,
0xb7,
0x4e,
0x8c,
0x09,
0xa6,
0xe3,
0x1c,
0x56};
std::string_view uuid_string(reinterpret_cast<const char*>(uuid_chars), sizeof(uuid_chars));
VariantValue variant{empty_metadata, uuid_string};
auto column = RunTimeColumnType<TYPE_VARIANT>::create();
EXPECT_EQ("variant", column->get_name());
EXPECT_TRUE(column->is_variant());
column->append(&variant);
EXPECT_EQ(variant.serialize_size(), column->serialize_size(0));
// deserialize
std::vector<uint8_t> buffer;
buffer.resize(variant.serialize_size());
column->serialize(0, buffer.data());
auto new_column = column->clone_empty();
new_column->deserialize_and_append(buffer.data());
const VariantValue* deserialized_variant = new_column->get(0).get_variant();
ASSERT_TRUE(deserialized_variant != nullptr);
EXPECT_EQ(variant.serialize_size(), deserialized_variant->serialize_size());
EXPECT_EQ(variant.to_string(), deserialized_variant->to_string());
EXPECT_EQ("\"f24f9b64-81fa-49d1-b74e-8c09a6e31c56\"", deserialized_variant->to_json().value());
}
// NOLINTNEXTLINE
PARALLEL_TEST(VariantColumnTest, put_mysql_row_buffer) {
const uint8_t int_chars[] = {primitiveHeader(VariantPrimitiveType::INT32), 0xD2, 0x02, 0x96, 0x49};
std::string_view int_string(reinterpret_cast<const char*>(int_chars), sizeof(int_chars));
VariantValue variant{VariantMetadata::kEmptyMetadata, int_string};
auto column = VariantColumn::create();
column->append(&variant);
MysqlRowBuffer buf;
column->put_mysql_row_buffer(&buf, 0);
EXPECT_EQ("\n1234567890", buf.data());
}
// NOLINTNEXTLINE
PARALLEL_TEST(VariantColumnTest, test_create_variant_column) {
auto variant_column = VariantColumn::create();
// Test basic column operations that exercise visitor patterns
EXPECT_EQ(0, variant_column->size());
EXPECT_TRUE(variant_column->empty());
EXPECT_FALSE(variant_column->is_nullable());
EXPECT_FALSE(variant_column->is_constant());
// Test column cloning which uses visitor patterns internally
auto cloned = variant_column->clone();
EXPECT_EQ(0, cloned->size());
// Test memory operations
size_t memory_usage = variant_column->memory_usage();
EXPECT_GE(memory_usage, 0);
}
// NOLINTNEXTLINE
PARALLEL_TEST(VariantColumnTest, test_append_strings) {
const auto variant_column = VariantColumn::create();
const uint8_t int1_value[] = {primitiveHeader(VariantPrimitiveType::INT8), 0x01};
const std::string_view int1_value_str(reinterpret_cast<const char*>(int1_value), sizeof(int1_value));
constexpr uint32_t int1_total_size = sizeof(int1_value) + VariantMetadata::kEmptyMetadata.size();
std::string variant_string;
variant_string.resize(int1_total_size + sizeof(uint32_t));
memcpy(variant_string.data(), &int1_total_size, sizeof(uint32_t));
memcpy(variant_string.data() + sizeof(uint32_t), VariantMetadata::kEmptyMetadata.data(),
VariantMetadata::kEmptyMetadata.size());
memcpy(variant_string.data() + sizeof(uint32_t) + VariantMetadata::kEmptyMetadata.size(), int1_value_str.data(),
int1_value_str.size());
const Slice slice(variant_string.data(), variant_string.size());
variant_column->append_strings(&slice, 1);
ASSERT_EQ(1, variant_column->size());
auto expected = VariantValue::create(slice);
ASSERT_TRUE(expected.ok());
const VariantValue* actual = variant_column->get_object(0);
ASSERT_EQ(expected->serialize_size(), actual->serialize_size());
ASSERT_EQ(expected->get_metadata(), actual->get_metadata());
ASSERT_EQ(expected->get_value(), actual->get_value());
EXPECT_EQ(expected->to_string(), actual->to_string());
EXPECT_EQ("1", actual->to_string());
// Append bad data
const Slice bad_slice("");
const bool result = variant_column->append_strings(&bad_slice, 1);
ASSERT_FALSE(result) << "Appending empty slice should fail";
}
} // namespace starrocks

View File

@ -747,4 +747,98 @@ TEST_F(TypeDescriptorTest, test_promote_types) {
}
}
// params: [LogicalType, expected length, logical_type_string, is_scalar_logical_type, is_huge_type]
class FromLogicalTypeTest : public ::testing::TestWithParam<std::tuple<LogicalType, int, std::string, bool, bool>> {
public:
};
// NOLINTNEXTLINE
TEST_P(FromLogicalTypeTest, test_from_logical_type) {
auto param = GetParam();
LogicalType logical_type = std::get<0>(param);
int expected_len = std::get<1>(param);
std::string expected_lts = std::get<2>(param);
bool is_slt = std::get<3>(param);
bool is_ht = std::get<4>(param);
ASSERT_EQ(expected_lts, logical_type_to_string(logical_type));
ASSERT_EQ(is_slt, is_scalar_logical_type(logical_type));
ASSERT_EQ(logical_type, string_to_logical_type(expected_lts));
auto type_desc = TypeDescriptor::from_logical_type(logical_type);
ASSERT_EQ(logical_type, type_desc.type);
ASSERT_EQ(expected_len, type_desc.len);
if (logical_type == TYPE_DECIMAL || logical_type == TYPE_DECIMALV2 || logical_type == TYPE_DECIMAL32 ||
logical_type == TYPE_DECIMAL64 || logical_type == TYPE_DECIMAL128) {
ASSERT_EQ(type_desc.precision, 27); // default precision for decimal types
ASSERT_EQ(type_desc.scale, 9); // default scale for decimal types
} else {
ASSERT_EQ(-1, type_desc.precision);
ASSERT_EQ(-1, type_desc.scale);
}
ASSERT_EQ(is_ht, type_desc.is_huge_type());
}
// clang-format off
INSTANTIATE_TEST_SUITE_P(
LogicalTypes,
FromLogicalTypeTest,
::testing::Values(
// Basic types
std::make_tuple(LogicalType::TYPE_BOOLEAN, -1, "BOOLEAN", true, false),
std::make_tuple(LogicalType::TYPE_TINYINT, -1, "TINYINT", true, false),
std::make_tuple(LogicalType::TYPE_SMALLINT, -1, "SMALLINT", true, false),
std::make_tuple(LogicalType::TYPE_INT, -1, "INT", true, false),
std::make_tuple(LogicalType::TYPE_BIGINT, -1, "BIGINT", true, false),
std::make_tuple(LogicalType::TYPE_LARGEINT, -1, "LARGEINT", true, false),
std::make_tuple(LogicalType::TYPE_FLOAT, -1, "FLOAT", true, false),
std::make_tuple(LogicalType::TYPE_DOUBLE, -1, "DOUBLE", true, false),
// string/binary types
std::make_tuple(LogicalType::TYPE_CHAR, TypeDescriptor::MAX_CHAR_LENGTH, "CHAR", true, false),
std::make_tuple(LogicalType::TYPE_VARCHAR, TypeDescriptor::MAX_VARCHAR_LENGTH, "VARCHAR", true, false),
std::make_tuple(LogicalType::TYPE_BINARY, -1, "BINARY", true, false),
std::make_tuple(LogicalType::TYPE_VARBINARY, -1, "VARBINARY", true, false),
// decimal types
std::make_tuple(LogicalType::TYPE_DECIMAL, -1, "DECIMAL", true, false),
std::make_tuple(LogicalType::TYPE_DECIMALV2, -1, "DECIMAL_V2", true, false),
std::make_tuple(LogicalType::TYPE_DECIMAL32, -1, "DECIMAL32", true, false),
std::make_tuple(LogicalType::TYPE_DECIMAL64, -1, "DECIMAL64", true, false),
std::make_tuple(LogicalType::TYPE_DECIMAL128, -1, "DECIMAL128", true, false),
std::make_tuple(LogicalType::TYPE_HLL, HLL_COLUMN_DEFAULT_LEN, "HLL", false, true),
std::make_tuple(LogicalType::TYPE_JSON, kJsonDefaultSize, "JSON", true, true),
std::make_tuple(LogicalType::TYPE_VARIANT, 128, "VARIANT", true, true),
std::make_tuple(LogicalType::TYPE_OBJECT, TypeDescriptor::DEFAULT_BITMAP_LENGTH, "OBJECT", false, true)
)
);
// clang-format on
TEST_F(TypeDescriptorTest, test_create_variant_type) {
// Test create_variant_type() static method
TypeDescriptor variant_desc = TypeDescriptor::create_variant_type();
ASSERT_EQ(LogicalType::TYPE_VARIANT, variant_desc.type);
ASSERT_EQ(128, variant_desc.len);
// Verify other default fields are properly initialized
ASSERT_EQ(-1, variant_desc.precision);
ASSERT_EQ(-1, variant_desc.scale);
ASSERT_TRUE(variant_desc.children.empty());
// Verify the type descriptor is valid
ASSERT_TRUE(variant_desc.is_huge_type());
ASSERT_FALSE(variant_desc.is_complex_type());
ASSERT_EQ("VARIANT", variant_desc.debug_string());
// Test that multiple calls return equivalent objects
TypeDescriptor variant_desc2 = TypeDescriptor::create_variant_type();
ASSERT_EQ(variant_desc.type, variant_desc2.type);
ASSERT_EQ(variant_desc.len, variant_desc2.len);
}
} // namespace starrocks

View File

@ -24,9 +24,11 @@
#include "column/fixed_length_column.h"
#include "column/json_column.h"
#include "column/nullable_column.h"
#include "column/variant_column.h"
#include "gutil/strings/substitute.h"
#include "testutil/parallel_test.h"
#include "util/json.h"
#include "util/variant_util.h"
namespace starrocks::serde {
@ -84,6 +86,88 @@ PARALLEL_TEST(ColumnArraySerdeTest, json_column) {
}
}
// NOLINTNEXTLINE
PARALLEL_TEST(ColumnArraySerdeTest, variant_column) {
auto c1 = VariantColumn::create();
ASSERT_EQ(4, ColumnArraySerde::max_serialized_size(*c1));
// Prepare 5 int8 variant values
const uint8_t int8_values[][2] = {
{VariantUtil::primitiveHeader(VariantPrimitiveType::INT8), 0x01}, // 1
{VariantUtil::primitiveHeader(VariantPrimitiveType::INT8), 0x02}, // 2
{VariantUtil::primitiveHeader(VariantPrimitiveType::INT8), 0x03}, // 3
{VariantUtil::primitiveHeader(VariantPrimitiveType::INT8), 0x04}, // 4
{VariantUtil::primitiveHeader(VariantPrimitiveType::INT8), 0x05}, // 5
};
size_t expected_max_size = sizeof(uint32_t);
for (size_t i = 0; i < std::size(int8_values); ++i) {
std::string_view value(reinterpret_cast<const char*>(int8_values[i]), sizeof(int8_values[i]));
VariantValue variant(VariantMetadata::kEmptyMetadata, value);
c1->append(&variant);
expected_max_size += sizeof(uint64_t) + variant.serialize_size();
}
ASSERT_EQ(expected_max_size, ColumnArraySerde::max_serialized_size(*c1));
auto c2 = VariantColumn::create();
std::vector<uint8_t> buffer;
buffer.resize(ColumnArraySerde::max_serialized_size(*c1));
auto p1 = ColumnArraySerde::serialize(*c1, buffer.data());
auto p2 = ColumnArraySerde::deserialize(buffer.data(), c2.get());
ASSERT_EQ(buffer.data() + buffer.size(), p1);
ASSERT_EQ(buffer.data() + buffer.size(), p2);
ASSERT_EQ(5, c2->size());
for (size_t i = 0; i < c1->size(); i++) {
const VariantValue* datum1 = c1->get(i).get_variant();
const VariantValue* datum2 = c2->get(i).get_variant();
ASSERT_EQ(datum1->serialize_size(), datum2->serialize_size());
ASSERT_EQ(datum1->get_metadata(), datum2->get_metadata());
ASSERT_EQ(datum1->get_value(), datum2->get_value());
EXPECT_EQ(datum1->to_string(), datum2->to_string());
}
// no effect
for (auto level = -1; level < 8; ++level) {
buffer.resize(ColumnArraySerde::max_serialized_size(*c1), level);
p1 = ColumnArraySerde::serialize(*c1, buffer.data(), false, level);
p2 = ColumnArraySerde::deserialize(buffer.data(), c2.get(), false, level);
ASSERT_EQ(buffer.data() + buffer.size(), p1);
ASSERT_EQ(buffer.data() + buffer.size(), p2);
ASSERT_EQ(5, c2->size());
for (size_t i = 0; i < c1->size(); i++) {
const VariantValue* datum1 = c1->get(i).get_variant();
const VariantValue* datum2 = c2->get(i).get_variant();
ASSERT_EQ(datum1->serialize_size(), datum2->serialize_size());
ASSERT_EQ(datum1->get_metadata(), datum2->get_metadata());
ASSERT_EQ(datum1->get_value(), datum2->get_value());
EXPECT_EQ(datum1->to_string(), datum2->to_string());
}
}
}
// NOLINTNEXTLINE
PARALLEL_TEST(ColumnArraySerdeTest, variant_column_failed_deserialize) {
auto c1 = VariantColumn::create();
ASSERT_EQ(4, ColumnArraySerde::max_serialized_size(*c1));
// Prepare a variant value with an unsupported version
constexpr uint8_t v2_metadata_charts[] = {0x02, 0x00, 0x00};
const std::string_view v2_metadata(reinterpret_cast<const char*>(v2_metadata_charts), sizeof(v2_metadata_charts));
const VariantValue variant(v2_metadata, "");
c1->append(&variant);
ASSERT_EQ(1, c1->size());
std::vector<uint8_t> buffer;
buffer.resize(ColumnArraySerde::max_serialized_size(*c1));
ColumnArraySerde::serialize(*c1, buffer.data());
auto c2 = VariantColumn::create();
auto p2 = ColumnArraySerde::deserialize(buffer.data(), c2.get());
ASSERT_TRUE(p2 == nullptr); // Deserialization should fail due to empty value
ASSERT_EQ(0, c2->size()); // Deserialization should fail, resulting in an empty column
}
// NOLINTNEXTLINE
PARALLEL_TEST(ColumnArraySerdeTest, decimal_column) {
auto c1 = DecimalColumn::create();

View File

@ -0,0 +1,437 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "types/variant_value.h"
#include <fs/fs.h>
#include <gtest/gtest.h>
#include <boost/uuid/uuid_io.hpp>
#include "cctz/time_zone.h"
#include "formats/parquet/variant.h"
#include "runtime/decimalv3.h"
#include "types/timestamp_value.h"
#include "util/timezone_utils.h"
#include "util/variant_util.h"
namespace starrocks {
class VariantValueTest : public testing::Test {
public:
VariantValueTest() = default;
~VariantValueTest() override = default;
protected:
void SetUp() override {
std::string starrocks_home = getenv("STARROCKS_HOME");
test_exec_dir = starrocks_home + "/be/test/formats/parquet/test_data/variant";
_primitive_metadata_file_names = {
"primitive_null.metadata", "primitive_boolean_true.metadata", "primitive_boolean_false.metadata",
"primitive_date.metadata", "primitive_decimal4.metadata", "primitive_decimal8.metadata",
"primitive_decimal16.metadata", "primitive_float.metadata", "primitive_double.metadata",
"primitive_int8.metadata", "primitive_int16.metadata", "primitive_int32.metadata",
"primitive_int64.metadata", "primitive_binary.metadata", "primitive_string.metadata",
"array_primitive.metadata",
};
_boolean_file_names = {
{"primitive_boolean_true.metadata", "primitive_boolean_true.value"},
{"primitive_boolean_false.metadata", "primitive_boolean_false.value"},
};
}
std::string read_file_content(const std::string& file_path) {
FileSystem* fs = FileSystem::Default();
auto random_access_file = *fs->new_random_access_file(file_path);
return *random_access_file->read_all();
}
std::pair<std::string, std::string> load_variant_data(const std::string& metadata_file,
const std::string& value_file) {
std::string metadata_content = read_file_content(test_exec_dir + "/" + metadata_file);
std::string value_content = read_file_content(test_exec_dir + "/" + value_file);
return {std::move(metadata_content), std::move(value_content)};
}
protected:
std::string test_exec_dir;
std::vector<std::string> _primitive_metadata_file_names;
std::vector<std::pair<std::string, std::string>> _boolean_file_names;
};
TEST_F(VariantValueTest, NullToJson) {
uint8_t null_chars[] = {static_cast<uint8_t>(VariantPrimitiveType::NULL_TYPE) << 2};
std::string_view null_value(reinterpret_cast<const char*>(null_chars), 1);
VariantValue v(VariantMetadata::kEmptyMetadata, null_value);
auto json = v.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("null", *json);
}
TEST_F(VariantValueTest, BooleanToJson) {
auto [t_fst, t_snd] = _boolean_file_names[0];
auto [t_metadata, t_value] = load_variant_data(t_fst, t_snd);
VariantValue variant_true{std::string_view(t_metadata), std::string_view(t_value)};
auto json_true = variant_true.to_json();
ASSERT_TRUE(json_true.ok());
EXPECT_EQ("true", *json_true);
auto [f_fst, f_snd] = _boolean_file_names[1];
auto [f_metadata, f_value] = load_variant_data(f_fst, f_snd);
VariantValue variant_false{std::string_view(f_metadata), std::string_view(f_value)};
auto json_false = variant_false.to_json();
ASSERT_TRUE(json_false.ok());
EXPECT_EQ("false", *json_false);
}
TEST_F(VariantValueTest, IntegerToJson) {
// Test int8
{
auto [int8_metadata, int8_value] = load_variant_data("primitive_int8.metadata", "primitive_int8.value");
VariantValue variant{std::string_view(int8_metadata), std::string_view(int8_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("42", *json);
}
// Test int16
{
auto [int16_metadata, int16_value] = load_variant_data("primitive_int16.metadata", "primitive_int16.value");
VariantValue variant{std::string_view(int16_metadata), std::string_view(int16_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("1234", *json);
}
// Test int32
{
auto [int32_metadata, int32_value] = load_variant_data("primitive_int32.metadata", "primitive_int32.value");
VariantValue variant{std::string_view(int32_metadata), std::string_view(int32_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("123456", *json);
}
// Test int64
{
auto [int64_metadata, int64_value] = load_variant_data("primitive_int64.metadata", "primitive_int64.value");
VariantValue variant{std::string_view(int64_metadata), std::string_view(int64_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("1234567890123456789", *json);
}
}
TEST_F(VariantValueTest, FloatToJson) {
// Test float
{
auto [float_metadata, float_value] = load_variant_data("primitive_float.metadata", "primitive_float.value");
VariantValue variant{std::string_view(float_metadata), std::string_view(float_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("1234567936.000000", *json);
}
// Test double
{
auto [double_metadata, double_value] = load_variant_data("primitive_double.metadata", "primitive_double.value");
VariantValue variant{std::string_view(double_metadata), std::string_view(double_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("1234567890.123400", *json);
}
}
TEST_F(VariantValueTest, StringToJson) {
// Test long string
{
auto [string_metadata, string_value] = load_variant_data("primitive_string.metadata", "primitive_string.value");
VariantValue variant{std::string_view(string_metadata), std::string_view(string_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ(
"This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes "
"several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!",
*json);
}
// Test short string
{
auto [short_string_metadata, short_string_value] =
load_variant_data("short_string.metadata", "short_string.value");
VariantValue variant{std::string_view(short_string_metadata), std::string_view(short_string_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("Less than 64 bytes (❤️ with utf8)", *json);
}
}
TEST_F(VariantValueTest, BinaryToJson) {
auto [binary_metadata, binary_value] = load_variant_data("primitive_binary.metadata", "primitive_binary.value");
VariantValue variant{std::string_view(binary_metadata), std::string_view(binary_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
ASSERT_EQ("\"AxM33q2+78r+\"", *json);
}
TEST_F(VariantValueTest, DecimalToJson) {
// Test decimal4
{
auto [decimal4_metadata, decimal4_value] =
load_variant_data("primitive_decimal4.metadata", "primitive_decimal4.value");
VariantValue variant{std::string_view(decimal4_metadata), std::string_view(decimal4_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("12.34", *json);
}
// Test decimal8
{
auto [decimal8_metadata, decimal8_value] =
load_variant_data("primitive_decimal8.metadata", "primitive_decimal8.value");
VariantValue variant{std::string_view(decimal8_metadata), std::string_view(decimal8_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("12345678.90", *json);
}
// Test decimal16
{
auto [decimal16_metadata, decimal16_value] =
load_variant_data("primitive_decimal16.metadata", "primitive_decimal16.value");
VariantValue variant{std::string_view(decimal16_metadata), std::string_view(decimal16_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("12345678912345678.90", *json);
}
}
TEST_F(VariantValueTest, UUIDToJson) {
std::string_view empty_metadata = VariantMetadata::kEmptyMetadata;
const uint8_t uuid_chars[] = {VariantUtil::primitiveHeader(VariantPrimitiveType::UUID),
0xf2,
0x4f,
0x9b,
0x64,
0x81,
0xfa,
0x49,
0xd1,
0xb7,
0x4e,
0x8c,
0x09,
0xa6,
0xe3,
0x1c,
0x56};
std::string_view uuid_string(reinterpret_cast<const char*>(uuid_chars), sizeof(uuid_chars));
VariantValue variant{empty_metadata, uuid_string};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("\"f24f9b64-81fa-49d1-b74e-8c09a6e31c56\"", *json);
}
TEST_F(VariantValueTest, TimestampToJson) {
// Test timestamp with timezone
{
auto [ts_metadata, ts_value] = load_variant_data("primitive_timestamp.metadata", "primitive_timestamp.value");
VariantValue variant{std::string_view(ts_metadata), std::string_view(ts_value)};
// Set timezone to -04:00
cctz::time_zone tz;
ASSERT_TRUE(TimezoneUtils::find_cctz_time_zone("-04:00", tz));
auto json = variant.to_json(tz);
ASSERT_TRUE(json.ok());
EXPECT_EQ("\"2025-04-16 12:34:56.78-04:00\"", *json);
}
// Test timestamp without timezone
{
auto [ts_ntz_metadata, ts_ntz_value] =
load_variant_data("primitive_timestampntz.metadata", "primitive_timestampntz.value");
VariantValue variant{std::string_view(ts_ntz_metadata), std::string_view(ts_ntz_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("\"2025-04-16 12:34:56.780000\"", *json);
}
}
TEST_F(VariantValueTest, DateToJson) {
auto [date_metadata, date_value] = load_variant_data("primitive_date.metadata", "primitive_date.value");
VariantValue variant{std::string_view(date_metadata), std::string_view(date_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("\"2025-04-16\"", *json);
}
TEST_F(VariantValueTest, ObjectToJson) {
// Test simple object
{
auto [object_metadata, object_value] = load_variant_data("object_primitive.metadata", "object_primitive.value");
VariantValue variant{std::string_view(object_metadata), std::string_view(object_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
// Should be a valid JSON object
EXPECT_TRUE(json->front() == '{');
EXPECT_TRUE(json->back() == '}');
EXPECT_EQ(
"{boolean_false_field:false,boolean_true_field:true,double_field:1.23456789,int_field:1,null_field:"
"null,string_field:Apache Parquet,timestamp_field:2025-04-16T12:34:56.78}",
*json);
}
// Test empty object
{
auto [object_empty_metadata, object_empty_value] =
load_variant_data("object_empty.metadata", "object_empty.value");
VariantValue variant{std::string_view(object_empty_metadata), std::string_view(object_empty_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("{}", *json);
}
// Test nested object
{
auto [object_nested_metadata, object_nested_value] =
load_variant_data("object_nested.metadata", "object_nested.value");
VariantValue variant{std::string_view(object_nested_metadata), std::string_view(object_nested_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
// Should be a valid JSON object
EXPECT_TRUE(json->front() == '{');
EXPECT_TRUE(json->back() == '}');
EXPECT_EQ(
"{id:1,observation:{location:In the "
"Volcano,time:12:34:56,value:{humidity:456,temperature:123}},species:{name:lava "
"monster,population:6789}}",
*json);
}
}
TEST_F(VariantValueTest, ArrayToJson) {
// Test primitive array
{
auto [array_metadata, array_value] = load_variant_data("array_primitive.metadata", "array_primitive.value");
VariantValue variant{std::string_view(array_metadata), std::string_view(array_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
// Should be a valid JSON array
EXPECT_TRUE(json->front() == '[');
EXPECT_TRUE(json->back() == ']');
EXPECT_EQ("[2,1,5,9]", *json);
}
// Test empty array
{
auto [array_empty_metadata, array_empty_value] = load_variant_data("array_empty.metadata", "array_empty.value");
VariantValue variant{std::string_view(array_empty_metadata), std::string_view(array_empty_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
EXPECT_EQ("[]", *json);
}
// Test nested array
{
auto [array_nested_metadata, array_nested_value] =
load_variant_data("array_nested.metadata", "array_nested.value");
VariantValue variant{std::string_view(array_nested_metadata), std::string_view(array_nested_value)};
auto json = variant.to_json();
ASSERT_TRUE(json.ok());
// Should be a valid JSON array
EXPECT_TRUE(json->front() == '[');
EXPECT_TRUE(json->back() == ']');
EXPECT_EQ("[{id:1,thing:{names:[Contrarian,Spider]}},null,{id:2,names:[Apple,Ray,null],type:if}]", *json);
}
}
TEST_F(VariantValueTest, InvalidVariant) {
// Test invalid variant with empty metadata
{
Slice empty_slice(""); // Empty slice
auto empty_variant = VariantValue::create(empty_slice);
ASSERT_FALSE(empty_variant.ok());
EXPECT_EQ("Invalid argument: Invalid variant slice: too small to contain size header",
empty_variant.status().to_string());
}
// Test invalid variant with unsupported version
{
// Create proper format: 4-byte size header + variant data
constexpr char v2_metadata_char[] = {0x2, 0x0, 0x0}; // version 2 metadata
constexpr uint32_t data_size = sizeof(v2_metadata_char);
// Construct proper slice with size header
std::string variant_data;
variant_data.resize(sizeof(uint32_t) + data_size);
// Write size header (little endian)
std::memcpy(variant_data.data(), &data_size, sizeof(uint32_t));
// Write variant data
std::memcpy(variant_data.data() + sizeof(uint32_t), v2_metadata_char, data_size);
Slice variant_slice(variant_data);
auto unsupported_variant = VariantValue::create(variant_slice);
ASSERT_FALSE(unsupported_variant.ok());
EXPECT_EQ("Not supported: Unsupported variant version: 2", unsupported_variant.status().to_string());
}
// Test exceed maximum variant size
{
// Create size header that indicates data larger than max size
constexpr uint32_t oversized_data_size = VariantValue::kMaxVariantSize + 1;
std::string variant_data;
variant_data.resize(sizeof(uint32_t));
// Write oversized size in header
std::memcpy(variant_data.data(), &oversized_data_size, sizeof(uint32_t));
Slice variant_slice(variant_data);
auto large_variant = VariantValue::create(variant_slice);
ASSERT_FALSE(large_variant.ok());
EXPECT_EQ("Invalid argument: Variant size exceeds maximum limit: " +
std::to_string(VariantValue::kMaxVariantSize + 1) + " > " +
std::to_string(VariantValue::kMaxVariantSize),
large_variant.status().to_string());
}
// Test variant with size header but insufficient actual data
{
constexpr uint32_t claimed_size = 100; // Claim 100 bytes
constexpr uint32_t actual_data_size = 10; // But only provide 10 bytes
std::string variant_data;
variant_data.resize(sizeof(uint32_t) + actual_data_size);
// Write size header claiming more data than available
std::memcpy(variant_data.data(), &claimed_size, sizeof(uint32_t));
// Fill with some dummy data
std::memset(variant_data.data() + sizeof(uint32_t), 0x42, actual_data_size);
Slice variant_slice(variant_data);
auto insufficient_variant = VariantValue::create(variant_slice);
ASSERT_FALSE(insufficient_variant.ok());
EXPECT_EQ("Invalid argument: Invalid variant size: 100 exceeds available data: 10",
insufficient_variant.status().to_string());
}
}
} // namespace starrocks

View File

@ -102,7 +102,8 @@ enum TPrimitiveType {
FUNCTION,
VARBINARY,
DECIMAL256,
INT256
INT256,
VARIANT
}
enum TTypeNodeType {