Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com> Co-authored-by: Cursor Agent <cursoragent@cursor.com>
This commit is contained in:
parent
6ca0ee936b
commit
98032f2b0a
|
|
@ -323,6 +323,11 @@ CONF_mBool(enable_zonemap_index_memory_page_cache, "true");
|
|||
// whether to enable the ordinal index memory cache
|
||||
CONF_mBool(enable_ordinal_index_memory_page_cache, "true");
|
||||
|
||||
// Enable ZoneMap for string (CHAR/VARCHAR) columns using prefix-based min/max
|
||||
CONF_mBool(enable_string_prefix_zonemap, "true");
|
||||
// Prefix length in bytes used for string ZoneMap min/max when enabled; values below 8 are treated as 8
|
||||
CONF_mInt32(string_prefix_zonemap_prefix_len, "16");
|
||||
|
||||
CONF_mInt32(base_compaction_check_interval_seconds, "60");
|
||||
CONF_mInt64(min_base_compaction_num_singleton_deltas, "5");
|
||||
CONF_mInt64(max_base_compaction_num_singleton_deltas, "100");
|
||||
|
|
|
|||
|
|
@ -394,6 +394,9 @@ Status ScalarColumnWriter::init() {
|
|||
if (_opts.need_zone_map) {
|
||||
_has_index_builder = true;
|
||||
_zone_map_index_builder = ZoneMapIndexWriter::create(type_info());
|
||||
if (_opts.zone_map_truncate_string) {
|
||||
_zone_map_index_builder->enable_truncate_string();
|
||||
}
|
||||
}
|
||||
if (_opts.need_bitmap_index) {
|
||||
_has_index_builder = true;
|
||||
|
|
|
|||
|
|
@ -73,6 +73,7 @@ struct ColumnWriterOptions {
|
|||
// space saving = 1 - compressed_size / uncompressed_size
|
||||
double compression_min_space_saving = 0.1;
|
||||
bool need_zone_map = false;
|
||||
bool zone_map_truncate_string = false; // truncate string at write time to reduce comparison/metadata overhead.
|
||||
bool need_bitmap_index = false;
|
||||
bool need_bloom_filter = false;
|
||||
bool need_vector_index = false;
|
||||
|
|
|
|||
|
|
@ -194,6 +194,8 @@ Status FlatJsonColumnWriter::_init_flat_writers() {
|
|||
opts.meta->set_name(_flat_paths[i]);
|
||||
opts.need_flat = false;
|
||||
opts.need_zone_map = config::json_flat_create_zonemap && is_zone_map_key_type(_flat_types[i]);
|
||||
opts.need_zone_map |= config::enable_string_prefix_zonemap && is_string_type(_flat_types[i]);
|
||||
opts.zone_map_truncate_string = config::enable_string_prefix_zonemap && is_string_type(_flat_types[i]);
|
||||
|
||||
// Set global dict for sub-columns that support it
|
||||
if (is_string_type(_flat_types[i])) {
|
||||
|
|
|
|||
|
|
@ -169,6 +169,10 @@ Status SegmentWriter::init(const std::vector<uint32_t>& column_indexes, bool has
|
|||
const bool enable_dup_zone_map =
|
||||
_tablet_schema->keys_type() == KeysType::DUP_KEYS && is_zone_map_key_type(column.type());
|
||||
opts.need_zone_map = column.is_key() || enable_pk_zone_map || enable_dup_zone_map || column.is_sort_key();
|
||||
// Create prefix zonemap for string type, but only truncate it for non-key columns
|
||||
opts.need_zone_map |= config::enable_string_prefix_zonemap && is_string_type(column.type());
|
||||
opts.zone_map_truncate_string =
|
||||
config::enable_string_prefix_zonemap && is_string_type(column.type()) && !column.is_key();
|
||||
if (column.type() == LogicalType::TYPE_ARRAY) {
|
||||
opts.need_zone_map = false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@
|
|||
|
||||
#include "column/column_helper.h"
|
||||
#include "column/column_viewer.h"
|
||||
#include "common/config.h"
|
||||
#include "storage/chunk_helper.h"
|
||||
#include "storage/decimal_type_info.h"
|
||||
#include "storage/olap_define.h"
|
||||
|
|
@ -169,6 +170,8 @@ public:
|
|||
// length is only used for CHAR/VARCHAR, and used to allocate enough memory for min/max value.
|
||||
explicit ZoneMapIndexWriterImpl(TypeInfo* type_info);
|
||||
|
||||
// Switches this writer into truncation mode by setting `_truncate_string`.
void enable_truncate_string() override {
    _truncate_string = true;
}
|
||||
|
||||
void add_values(const void* values, size_t count) override;
|
||||
|
||||
// Marks the current page zone map as containing NULLs when `count` is non-zero;
// a zero `count` leaves the `has_null` flag as it was.
void add_nulls(uint32_t count) override { _page_zone_map.has_null |= count > 0; }
|
||||
|
|
@ -181,6 +184,8 @@ public:
|
|||
// Reports the current value of `_estimated_size`.
uint64_t size() const override {
    return _estimated_size;
}
|
||||
|
||||
private:
|
||||
void _truncate_string_minmax_if_needed(ZoneMap<type>* zm);
|
||||
|
||||
void _reset_zone_map(ZoneMap<type>* zone_map) {
|
||||
// we should allocate max varchar length and set to max for min value
|
||||
zone_map->min_value.reset(_type_info);
|
||||
|
|
@ -197,6 +202,9 @@ private:
|
|||
// serialized ZoneMapPB for each data page
|
||||
std::vector<std::string> _values;
|
||||
uint64_t _estimated_size = 0;
|
||||
|
||||
// Whether to truncate string min/max values to `string_prefix_zonemap_prefix_len` length
|
||||
bool _truncate_string = false;
|
||||
};
|
||||
|
||||
template <LogicalType type>
|
||||
|
|
@ -205,6 +213,28 @@ ZoneMapIndexWriterImpl<type>::ZoneMapIndexWriterImpl(TypeInfo* type_info) : _typ
|
|||
_reset_zone_map(&_segment_zone_map);
|
||||
}
|
||||
|
||||
// Truncate string min/max values at write time to reduce comparison/metadata overhead.
|
||||
// For max values that are truncated, append 0xFF to preserve an upper bound.
|
||||
template <LogicalType LT>
|
||||
void ZoneMapIndexWriterImpl<LT>::_truncate_string_minmax_if_needed(ZoneMap<LT>* zm) {
|
||||
if (!_truncate_string) {
|
||||
return;
|
||||
}
|
||||
const size_t kPrefixLen = std::max<int32_t>(8, config::string_prefix_zonemap_prefix_len);
|
||||
if constexpr (is_string_type(LT) || is_binary_type(LT)) {
|
||||
auto& min_slice = zm->min_value.value;
|
||||
auto& max_slice = zm->max_value.value;
|
||||
if (min_slice.size > kPrefixLen) {
|
||||
min_slice.size = kPrefixLen;
|
||||
}
|
||||
if (max_slice.size > kPrefixLen) {
|
||||
// Safe, original buffer has length > kPrefixLen, ensure buffer has room for 0xFF
|
||||
max_slice.data[kPrefixLen] = static_cast<char>(0xFF);
|
||||
max_slice.size = kPrefixLen + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <LogicalType type>
|
||||
void ZoneMapIndexWriterImpl<type>::add_values(const void* values, size_t count) {
|
||||
if (count > 0) {
|
||||
|
|
@ -215,10 +245,12 @@ void ZoneMapIndexWriterImpl<type>::add_values(const void* values, size_t count)
|
|||
if (unaligned_load<CppType>(pmin) < _page_zone_map.min_value.value) {
|
||||
_page_zone_map.min_value.resize_container_for_fit(_type_info, pmin);
|
||||
_type_info->direct_copy(&_page_zone_map.min_value.value, pmin);
|
||||
_truncate_string_minmax_if_needed(&_page_zone_map);
|
||||
}
|
||||
if (unaligned_load<CppType>(pmax) > _page_zone_map.max_value.value) {
|
||||
_page_zone_map.max_value.resize_container_for_fit(_type_info, pmax);
|
||||
_type_info->direct_copy(&_page_zone_map.max_value.value, pmax);
|
||||
_truncate_string_minmax_if_needed(&_page_zone_map);
|
||||
}
|
||||
} else {
|
||||
_page_zone_map.min_value.resize_container_for_fit(_type_info, pmin);
|
||||
|
|
@ -226,6 +258,7 @@ void ZoneMapIndexWriterImpl<type>::add_values(const void* values, size_t count)
|
|||
|
||||
_page_zone_map.max_value.resize_container_for_fit(_type_info, pmax);
|
||||
_type_info->direct_copy(&_page_zone_map.max_value.value, pmax);
|
||||
_truncate_string_minmax_if_needed(&_page_zone_map);
|
||||
}
|
||||
_page_zone_map.has_not_null = true;
|
||||
}
|
||||
|
|
@ -239,10 +272,12 @@ Status ZoneMapIndexWriterImpl<type>::flush() {
|
|||
if (_page_zone_map.min_value.value < _segment_zone_map.min_value.value) {
|
||||
_segment_zone_map.min_value.resize_container_for_fit(_type_info, &_page_zone_map.min_value.value);
|
||||
_type_info->direct_copy(&_segment_zone_map.min_value.value, &_page_zone_map.min_value.value);
|
||||
_truncate_string_minmax_if_needed(&_segment_zone_map);
|
||||
}
|
||||
if (_page_zone_map.max_value.value > _segment_zone_map.max_value.value) {
|
||||
_segment_zone_map.max_value.resize_container_for_fit(_type_info, &_page_zone_map.max_value.value);
|
||||
_type_info->direct_copy(&_segment_zone_map.max_value.value, &_page_zone_map.max_value.value);
|
||||
_truncate_string_minmax_if_needed(&_segment_zone_map);
|
||||
}
|
||||
} else {
|
||||
_segment_zone_map.min_value.resize_container_for_fit(_type_info, &_page_zone_map.min_value.value);
|
||||
|
|
@ -250,6 +285,7 @@ Status ZoneMapIndexWriterImpl<type>::flush() {
|
|||
|
||||
_segment_zone_map.max_value.resize_container_for_fit(_type_info, &_page_zone_map.max_value.value);
|
||||
_type_info->direct_copy(&_segment_zone_map.max_value.value, &_page_zone_map.max_value.value);
|
||||
_truncate_string_minmax_if_needed(&_segment_zone_map);
|
||||
}
|
||||
_segment_zone_map.has_not_null = true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -62,6 +62,8 @@ public:
|
|||
|
||||
virtual ~ZoneMapIndexWriter() = default;
|
||||
|
||||
virtual void enable_truncate_string() = 0;
|
||||
|
||||
virtual void add_values(const void* values, size_t count) = 0;
|
||||
|
||||
virtual void add_nulls(uint32_t count) = 0;
|
||||
|
|
|
|||
|
|
@ -143,14 +143,8 @@ inline bool is_decimalv3_field_type(LogicalType type) {
|
|||
LogicalType string_to_logical_type(const std::string& type_str);
|
||||
const char* logical_type_to_string(LogicalType type);
|
||||
|
||||
inline bool is_binary_type(LogicalType type) {
|
||||
switch (type) {
|
||||
case TYPE_BINARY:
|
||||
case TYPE_VARBINARY:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
// Returns true for the binary logical types (TYPE_BINARY, TYPE_VARBINARY).
// Declared constexpr so the result can be used in constant expressions.
constexpr bool is_binary_type(LogicalType type) {
    switch (type) {
    case TYPE_BINARY:
    case TYPE_VARBINARY:
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
inline bool is_scalar_field_type(LogicalType type) {
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@
|
|||
#include <string>
|
||||
|
||||
#include "cache/object_cache/page_cache.h"
|
||||
#include "common/config.h"
|
||||
#include "fs/fs_memory.h"
|
||||
#include "storage/tablet_schema_helper.h"
|
||||
#include "testutil/assert.h"
|
||||
|
|
@ -100,13 +101,12 @@ protected:
|
|||
ASSERT_EQ(3, column_zone_map.num_pages());
|
||||
const std::vector<ZoneMapPB>& zone_maps = column_zone_map.page_zone_maps();
|
||||
ASSERT_EQ(3, zone_maps.size());
|
||||
ASSERT_EQ("aaaa", zone_maps[0].min());
|
||||
ASSERT_EQ("ffff", zone_maps[0].max());
|
||||
size_t pfx = config::enable_string_prefix_zonemap ? (size_t)config::string_prefix_zonemap_prefix_len : 64;
|
||||
check_result_prefix(zone_maps[0], true, true, "aaaa", "ffff", false, true, pfx);
|
||||
ASSERT_EQ(false, zone_maps[0].has_null());
|
||||
ASSERT_EQ(true, zone_maps[0].has_not_null());
|
||||
|
||||
ASSERT_EQ("aaaaa", zone_maps[1].min());
|
||||
ASSERT_EQ("fffff", zone_maps[1].max());
|
||||
check_result_prefix(zone_maps[1], true, true, "aaaaa", "fffff", true, true, pfx);
|
||||
ASSERT_EQ(true, zone_maps[1].has_null());
|
||||
ASSERT_EQ(true, zone_maps[1].has_not_null());
|
||||
|
||||
|
|
@ -119,6 +119,23 @@ protected:
|
|||
void check_result(const ZoneMapPB& zone_map, bool has_min, bool has_max, const std::string& min,
|
||||
const std::string& max, bool has_null, bool has_not_null);
|
||||
|
||||
// Check with prefix truncation semantics for string zonemap entries: min is prefix; max is prefix possibly with 0xFF.
|
||||
void check_result_prefix(const ZoneMapPB& zone_map, bool has_min, bool has_max, const std::string& min,
|
||||
const std::string& max, bool has_null, bool has_not_null, size_t prefix_len = 64) {
|
||||
ASSERT_EQ(has_min, zone_map.has_min());
|
||||
ASSERT_EQ(has_max, zone_map.has_max());
|
||||
if (has_min) {
|
||||
const auto& zmin = zone_map.min();
|
||||
ASSERT_TRUE(min.rfind(zmin, 0) == 0 || zmin == min.substr(0, std::min(prefix_len, min.size())));
|
||||
ASSERT_TRUE(zmin <= min);
|
||||
}
|
||||
if (has_max) {
|
||||
ASSERT_TRUE(zone_map.max() >= max);
|
||||
}
|
||||
ASSERT_EQ(has_null, zone_map.has_null());
|
||||
ASSERT_EQ(has_not_null, zone_map.has_not_null());
|
||||
}
|
||||
|
||||
std::shared_ptr<MemoryFileSystem> _fs = nullptr;
|
||||
std::unique_ptr<MemTracker> _mem_tracker = nullptr;
|
||||
};
|
||||
|
|
@ -268,12 +285,13 @@ TEST_F(ColumnZoneMapTest, StringResize) {
|
|||
const auto& zone_maps = reader.page_zone_maps();
|
||||
ASSERT_EQ(2, zone_maps.size());
|
||||
|
||||
check_result(zone_maps[0], true, true, str1, str2, false, true);
|
||||
check_result(zone_maps[1], true, true, str3, str4, false, true);
|
||||
size_t pfx = config::enable_string_prefix_zonemap ? (size_t)config::string_prefix_zonemap_prefix_len : 64;
|
||||
check_result_prefix(zone_maps[0], true, true, str1, str2, false, true, pfx);
|
||||
check_result_prefix(zone_maps[1], true, true, str3, str4, false, true, pfx);
|
||||
|
||||
// segment zonemap
|
||||
const auto& segment_zonemap = index_meta.zone_map_index().segment_zone_map();
|
||||
check_result(segment_zonemap, true, true, str1, str4, false, true);
|
||||
check_result_prefix(segment_zonemap, true, true, str1, str4, false, true, pfx);
|
||||
}
|
||||
|
||||
TEST_F(ColumnZoneMapTest, AllNullPage) {
|
||||
|
|
@ -349,6 +367,7 @@ TEST_F(ColumnZoneMapTest, NormalTestIntPage) {
|
|||
// Runs the shared string zone map scenario (test_string) against a VARCHAR key column.
TEST_F(ColumnZoneMapTest, NormalTestVarcharPage) {
    TabletColumn key_column = create_varchar_key(0);
    TypeInfoPtr varchar_type = get_type_info(key_column);
    // The per-page expectations, including the prefix-aware checks, live in test_string.
    test_string("NormalTestVarcharPage", varchar_type);
}
|
||||
|
||||
|
|
@ -438,4 +457,62 @@ TEST_F(ColumnZoneMapTest, VarbinaryWithBinaryData) {
|
|||
true, true);
|
||||
}
|
||||
|
||||
// Exercises write-time prefix truncation of string zone maps with three pages:
// short values, values sharing a common prefix, and values longer than the prefix length.
// Values that fit in the prefix must be stored verbatim; longer ones must be truncated
// (min to the prefix, max to the prefix followed by a 0xFF byte).
TEST_F(ColumnZoneMapTest, StringPrefixZonemapVariants) {
    // Restore the mutable configs on every exit path. A fatal ASSERT_* returns from the test
    // body immediately, so a restore placed at the end of the function could be skipped and
    // leak the overrides into later tests.
    struct ConfigRestorer {
        bool old_switch;
        int old_len;
        ~ConfigRestorer() {
            config::enable_string_prefix_zonemap = old_switch;
            config::string_prefix_zonemap_prefix_len = old_len;
        }
    } config_restorer{config::enable_string_prefix_zonemap, config::string_prefix_zonemap_prefix_len};

    config::enable_string_prefix_zonemap = true;
    config::string_prefix_zonemap_prefix_len = 16;
    const size_t pfx = static_cast<size_t>(config::string_prefix_zonemap_prefix_len);

    // Build a segment with various string lengths and patterns
    std::string filename = kTestDir + "/StringPrefixZonemapVariants";

    TabletColumn varchar_column = create_varchar_key(0);
    TypeInfoPtr type_info = get_type_info(varchar_column);

    auto writer = ZoneMapIndexWriter::create(type_info.get());
    // Truncation is opt-in per writer and off by default; without this call the writer keeps
    // full values and the truncation path would never run in this test.
    writer->enable_truncate_string();

    // Short strings
    std::vector<Slice> shorts = {{"a", 1}, {"b", 1}, {"c", 1}};
    writer->add_values(shorts.data(), shorts.size());
    ASSERT_TRUE(writer->flush().ok());

    // Common prefix strings (still shorter than the prefix length)
    std::vector<std::string> cp = {"prefix_0001", "prefix_0002", "prefix_9999"};
    std::vector<Slice> cp_slices;
    cp_slices.reserve(cp.size());
    for (auto& s : cp) cp_slices.push_back({s.data(), s.size()});
    writer->add_values(cp_slices.data(), cp_slices.size());
    ASSERT_TRUE(writer->flush().ok());

    // Long strings (> 64 to ensure truncation even if config changes)
    std::string long1(80, 'X');
    std::string long2(120, 'Y');
    std::vector<Slice> longs = {{long1.data(), long1.size()}, {long2.data(), long2.size()}};
    writer->add_values(longs.data(), longs.size());
    ASSERT_TRUE(writer->flush().ok());

    // Write index out
    ColumnIndexMetaPB index_meta;
    write_file(*writer, index_meta, filename);

    // Read back
    ZoneMapIndexReader reader;
    load_zone_map(reader, index_meta, filename);

    ASSERT_EQ(3, reader.num_pages());
    const auto& zone_maps = reader.page_zone_maps();
    ASSERT_EQ(3, zone_maps.size());

    // Page 0: shorts, stored verbatim
    ASSERT_NO_FATAL_FAILURE(check_result_prefix(zone_maps[0], true, true, "a", "c", false, true, pfx));
    ASSERT_EQ("a", zone_maps[0].min());
    ASSERT_EQ("c", zone_maps[0].max());

    // Page 1: common prefix, shorter than the prefix length so stored verbatim
    ASSERT_NO_FATAL_FAILURE(check_result_prefix(zone_maps[1], true, true, cp.front(), cp.back(), false, true, pfx));
    ASSERT_EQ(cp.front(), zone_maps[1].min());
    ASSERT_EQ(cp.back(), zone_maps[1].max());

    // Page 2: long strings, min cut to the prefix and max cut to the prefix plus a 0xFF byte
    ASSERT_NO_FATAL_FAILURE(check_result_prefix(zone_maps[2], true, true, long1, long2, false, true, pfx));
    ASSERT_EQ(std::string(pfx, 'X'), zone_maps[2].min());
    ASSERT_EQ(std::string(pfx, 'Y') + static_cast<char>(0xFF), zone_maps[2].max());
}
|
||||
|
||||
} // namespace starrocks
|
||||
|
|
|
|||
Loading…
Reference in New Issue