[Enhancement] create adaptive zonemap index for strings (backport #61965) (#62361)

Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
This commit is contained in:
mergify[bot] 2025-08-28 10:23:33 +08:00 committed by GitHub
parent 512567775d
commit f4ee640fc7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 588 additions and 15 deletions

View File

@ -323,10 +323,19 @@ CONF_mBool(enable_zonemap_index_memory_page_cache, "true");
// whether to enable the ordinal index memory cache
CONF_mBool(enable_ordinal_index_memory_page_cache, "true");
// ========================== ZONEMAP BEGIN ===================================
// Enable ZoneMap for string (CHAR/VARCHAR) columns using prefix-based min/max
CONF_mBool(enable_string_prefix_zonemap, "true");
// Prefix length used for string ZoneMap min/max when enabled
CONF_mInt32(string_prefix_zonemap_prefix_len, "16");
// Adaptive creation of string zonemap index based on page overlap quality.
// The overlap ratio is the fraction of sampled page pairs (all pairs, not only
// consecutive pages) whose zonemaps overlap. If that ratio is greater than this
// threshold, the page-level string zonemap index is not written. Range: [0.0, 1.0].
CONF_mDouble(string_zonemap_overlap_threshold, "0.8");
// Number of page zonemaps that must be sampled before the adaptive check makes
// its decision; until then the index is kept and the decision stays undetermined.
CONF_mInt32(string_zonemap_min_pages_for_adaptive_check, "16");
// ========================== ZONEMAP END ===================================
CONF_mInt32(base_compaction_check_interval_seconds, "60");
CONF_mInt64(min_base_compaction_num_singleton_deltas, "5");
@ -1739,4 +1748,5 @@ CONF_mInt64(split_exchanger_buffer_chunk_num, "1000");
// when to split hashmap/hashset into two level hashmap/hashset, negative number means use default value
CONF_mInt64(two_level_memory_threshold, "-1");
} // namespace starrocks::config

View File

@ -37,11 +37,7 @@
#include <cstddef>
#include <memory>
#include "column/array_column.h"
#include "column/column_helper.h"
#include "column/hash_set.h"
#include "column/nullable_column.h"
#include "common/logging.h"
#include "fs/fs.h"
#include "gutil/strings/substitute.h"
#include "simd/simd.h"
@ -397,6 +393,11 @@ Status ScalarColumnWriter::init() {
if (_opts.zone_map_truncate_string) {
_zone_map_index_builder->enable_truncate_string();
}
if (is_string_type(_type_info->type())) {
_zone_map_index_quality_judger =
ZoneMapIndexQualityJudger::create(_type_info.get(), config::string_zonemap_overlap_threshold,
config::string_zonemap_min_pages_for_adaptive_check);
}
}
if (_opts.need_bitmap_index) {
_has_index_builder = true;
@ -594,6 +595,22 @@ Status ScalarColumnWriter::_write_data_page(Page* page) {
Status ScalarColumnWriter::finish_current_page() {
if (_zone_map_index_builder != nullptr) {
RETURN_IF_ERROR(_zone_map_index_builder->flush());
if (_zone_map_index_quality_judger != nullptr) {
std::optional<ZoneMapPB> last_zonemap = _zone_map_index_builder->get_last_zonemap();
if (last_zonemap.has_value()) {
_zone_map_index_quality_judger->feed(last_zonemap.value());
}
CreateIndexDecision decision = _zone_map_index_quality_judger->make_decision();
if (decision == CreateIndexDecision::Bad) {
_zone_map_index_builder.reset();
_zone_map_index_quality_judger.reset();
VLOG(2) << "ZoneMapIndexQualityJudger decided to not create the index for this column";
} else if (decision == CreateIndexDecision::Good) {
// Stop judging
_zone_map_index_quality_judger.reset();
VLOG(2) << "ZoneMapIndexQualityJudger decided to create the index for this column";
}
}
}
if (_bloom_filter_index_builder != nullptr) {

View File

@ -138,6 +138,7 @@ class OrdinalIndexWriter;
class PageBuilder;
class BloomFilterIndexWriter;
class ZoneMapIndexWriter;
class ZoneMapIndexQualityJudger;
class ColumnWriter {
public:
@ -299,6 +300,7 @@ private:
std::unique_ptr<OrdinalIndexWriter> _ordinal_index_builder;
std::unique_ptr<ZoneMapIndexWriter> _zone_map_index_builder;
std::unique_ptr<ZoneMapIndexQualityJudger> _zone_map_index_quality_judger;
std::unique_ptr<BitmapIndexWriter> _bitmap_index_builder;
std::unique_ptr<BloomFilterIndexWriter> _bloom_filter_index_builder;
std::unique_ptr<InvertedWriter> _inverted_index_builder;

View File

@ -159,6 +159,19 @@ struct ZoneMap {
dst->set_has_null(has_null);
dst->set_has_not_null(has_not_null);
}
void from_proto(const ZoneMapPB& src, TypeInfo* type_info) {
Slice min_slice(src.min());
min_value.resize_container_for_fit(type_info, &min_slice);
type_info->direct_copy(&min_value.value, &min_slice);
Slice max_slice(src.max());
max_value.resize_container_for_fit(type_info, &max_slice);
type_info->direct_copy(&max_value.value, &max_slice);
has_null = src.has_null();
has_not_null = src.has_not_null();
}
};
template <LogicalType type>
@ -179,6 +192,8 @@ public:
// mark the end of one data page so that we can finalize the corresponding zone map
Status flush() override;
std::optional<ZoneMapPB> get_last_zonemap() override;
Status finish(WritableFile* wfile, ColumnIndexMetaPB* index_meta) override;
uint64_t size() const override { return _estimated_size; }
@ -308,6 +323,18 @@ Status ZoneMapIndexWriterImpl<type>::flush() {
return Status::OK();
}
// Returns the most recently recorded page zone map, decoded from its serialized
// form, or std::nullopt when nothing has been recorded yet or decoding fails.
template <LogicalType type>
std::optional<ZoneMapPB> ZoneMapIndexWriterImpl<type>::get_last_zonemap() {
    std::optional<ZoneMapPB> latest;
    if (!_values.empty()) {
        ZoneMapPB decoded;
        if (decoded.ParseFromString(_values.back())) {
            latest = std::move(decoded);
        }
    }
    return latest;
}
struct ZoneMapIndexWriterBuilder {
template <LogicalType ftype>
std::unique_ptr<ZoneMapIndexWriter> operator()(TypeInfo* type_info) {
@ -411,4 +438,95 @@ size_t ZoneMapIndexReader::mem_usage() const {
return size;
}
// Type-specialized implementation of ZoneMapIndexQualityJudger.
// It buffers up to `sample_pages` page zone maps (see feed()) and, once enough are
// buffered, compares their pairwise overlap against `overlap_threshold`
// (see make_decision()).
template <LogicalType type>
class ZoneMapIndexQualityJudgerImpl final : public ZoneMapIndexQualityJudger {
public:
// `type_info` is stored as a raw pointer and is not owned by the judger.
ZoneMapIndexQualityJudgerImpl(TypeInfo* type_info, double overlap_threshold, int32_t sample_pages)
: _type_info(type_info), _overlap_threshold(overlap_threshold), _sample_pages(sample_pages) {}
~ZoneMapIndexQualityJudgerImpl() override = default;
void feed(const ZoneMapPB& page_zone_map) override;
CreateIndexDecision make_decision() const override;
private:
// Used to decode the min/max values of incoming ZoneMapPB messages.
TypeInfo* _type_info;
// Overlap ratios above this value make the decision Bad.
const double _overlap_threshold;
// Number of page zone maps to collect before a decision is made.
const int32_t _sample_pages;
// Decoded zone maps of the sampled pages, in feed order.
std::vector<ZoneMap<type>> _page_zone_maps;
};
// Functor handed to field_type_dispatch_zonemap_index: it is invoked with the
// LogicalType as a template argument and builds the matching
// ZoneMapIndexQualityJudgerImpl instantiation.
struct ZoneMapIndexQualityJudgerBuilder {
template <LogicalType ftype>
std::unique_ptr<ZoneMapIndexQualityJudger> operator()(TypeInfo* type_info, double overlap_threshold,
int32_t sample_pages) {
return std::make_unique<ZoneMapIndexQualityJudgerImpl<ftype>>(type_info, overlap_threshold, sample_pages);
}
};
// Factory: picks the judger implementation matching `type_info`'s logical type and
// forwards the overlap threshold and sample size to its constructor.
std::unique_ptr<ZoneMapIndexQualityJudger> ZoneMapIndexQualityJudger::create(TypeInfo* type_info,
                                                                             double overlap_threshold,
                                                                             int32_t sample_pages) {
    const auto logical_type = type_info->type();
    return field_type_dispatch_zonemap_index(logical_type, ZoneMapIndexQualityJudgerBuilder(), type_info,
                                             overlap_threshold, sample_pages);
}
// Buffers one page zone map for the later overlap evaluation.
//
// At most `_sample_pages` zone maps are kept; calls beyond that are ignored.
// A non-positive `_sample_pages` buffers nothing. Comparing `size()` directly with
// the signed `_sample_pages` converted a negative value into a huge unsigned
// limit, so every page of the column was buffered without bound.
template <LogicalType type>
void ZoneMapIndexQualityJudgerImpl<type>::feed(const ZoneMapPB& proto_zone_map) {
    if (_sample_pages <= 0) {
        return;
    }
    if (_page_zone_maps.size() >= static_cast<size_t>(_sample_pages)) {
        return;
    }
    ZoneMap<type> zone_map;
    zone_map.from_proto(proto_zone_map, _type_info);
    _page_zone_maps.push_back(std::move(zone_map));
}
// Non-owning view over a ZoneMap that adds an overlap predicate.
// The wrapped zone map must outlive the wrapper.
template <LogicalType type>
struct ZoneMapWrapper {
    const ZoneMap<type>& zone_map;

    ZoneMapWrapper(const ZoneMap<type>& zone_map) : zone_map(zone_map) {}

    // Returns true when the two pages cannot be told apart by their zone maps.
    //
    // Pages containing nulls are always treated as overlapping. Otherwise the
    // closed ranges [min, max] are compared with the symmetric interval test
    // `a.min <= b.max && b.min <= a.max`. The previous endpoint-containment test
    // returned false when `other` strictly contained `this` (e.g. this = [c, x],
    // other = [a, z]), so the result depended on the order of the operands.
    bool is_overlap_with(const ZoneMapWrapper& other) const {
        if (zone_map.has_null || other.zone_map.has_null) {
            return true;
        }
        const auto& this_min = zone_map.min_value.value;
        const auto& this_max = zone_map.max_value.value;
        const auto& other_min = other.zone_map.min_value.value;
        const auto& other_max = other.zone_map.max_value.value;
        return this_min <= other_max && other_min <= this_max;
    }
};
// Decides whether the sampled page zone maps are clustered well enough to keep
// the index.
//
// Returns Unknown until `_sample_pages` zone maps have been buffered. After that:
//   overlap_ratio = (#unordered page pairs that overlap) / (n * (n - 1) / 2)
// and the result is Good when overlap_ratio <= `_overlap_threshold`, Bad otherwise.
template <LogicalType type>
CreateIndexDecision ZoneMapIndexQualityJudgerImpl<type>::make_decision() const {
    const size_t num_pages = _page_zone_maps.size();
    // If not enough sampled pages, return Unknown
    if (num_pages < static_cast<size_t>(_sample_pages)) {
        return CreateIndexDecision::Unknown;
    }

    size_t overlapping_pairs = 0;
    for (size_t i = 0; i < num_pages; ++i) {
        const ZoneMapWrapper<type> current(_page_zone_maps[i]);
        for (size_t j = i + 1; j < num_pages; ++j) {
            if (current.is_overlap_with(ZoneMapWrapper<type>(_page_zone_maps[j]))) {
                ++overlapping_pairs;
            }
        }
    }

    // With fewer than two pages there is no pair to compare. Define the ratio as 0
    // instead of evaluating 0/0: the resulting NaN compared false against the
    // threshold and the index was always judged Bad.
    const size_t total_pairs = num_pages < 2 ? 0 : num_pages * (num_pages - 1) / 2;
    const double overlap_ratio =
            total_pairs == 0 ? 0.0 : static_cast<double>(overlapping_pairs) / static_cast<double>(total_pairs);

    // If overlap ratio is less than or equal to threshold, it's a good index
    if (overlap_ratio <= _overlap_threshold) {
        return CreateIndexDecision::Good;
    }
    return CreateIndexDecision::Bad;
}
} // namespace starrocks

View File

@ -71,6 +71,9 @@ public:
// mark the end of one data page so that we can finalize the corresponding zone map
virtual Status flush() = 0;
// Return the zonemap of last page
virtual std::optional<ZoneMapPB> get_last_zonemap() = 0;
virtual Status finish(WritableFile* wfile, ColumnIndexMetaPB* index_meta) = 0;
virtual uint64_t size() const = 0;
@ -110,4 +113,36 @@ private:
std::vector<ZoneMapPB> _page_zone_maps;
};
// Outcome of ZoneMapIndexQualityJudger::make_decision().
enum CreateIndexDecision {
Unknown, // Not enough pages have been sampled to decide yet
Good, // It deserves to create the index
Bad, // It's a bad index
};
// ZoneMapIndexQualityJudger is used to judge whether to write index for string types based on overlap quality.
//
// Greater overlap implies reduced clustering. Therefore, clustering can be quantified using the overlap ratio of zonemaps.
// To calculate:
// 1. Sample the zonemaps of the first `sample_pages` pages fed to the judger.
// 2. Count the unordered page pairs (i, j), i < j, whose zonemaps overlap.
// 3. overlap_ratio: that count divided by the number of pairs: overlapping_pairs / (n * (n - 1) / 2).
// 4. If pages are perfectly separated: overlap_ratio = 0.
// 5. If all pages overlap each other: overlap_ratio = 1.
// 6. The index is Good when overlap_ratio <= overlap_threshold, otherwise Bad.
class ZoneMapIndexQualityJudger {
public:
// Create the judger implementation matching the logical type of `type_info`.
static std::unique_ptr<ZoneMapIndexQualityJudger> create(TypeInfo* type_info, double overlap_threshold,
int32_t sample_pages);
virtual ~ZoneMapIndexQualityJudger() = default;
// Feed the zonemap into this judger, it will be buffered until it can make a decision
virtual void feed(const ZoneMapPB& page_zone_map) = 0;
// Make a decision based on the overlap quality.
// If the overlap quality is good, return Good.
// If the overlap quality is bad, return Bad.
// If the sampled pages are not enough, return Unknown.
virtual CreateIndexDecision make_decision() const = 0;
};
} // namespace starrocks

View File

@ -141,7 +141,7 @@ public:
.format = formats::PARQUET,
.file_statistics =
{
.record_count = num_rows,
.record_count = static_cast<int64_t>(num_rows),
},
.location = "path/to/directory/data.parquet",
};

View File

@ -40,10 +40,13 @@
#include <string>
#include "cache/object_cache/page_cache.h"
#include "column/binary_column.h"
#include "common/config.h"
#include "fs/fs_memory.h"
#include "storage/rowset/column_writer.h"
#include "storage/tablet_schema_helper.h"
#include "testutil/assert.h"
#include "util/slice.h"
namespace starrocks {
@ -68,18 +71,18 @@ protected:
Slice slice(value);
builder->add_values((const uint8_t*)&slice, 1);
}
builder->flush();
ASSERT_OK(builder->flush());
std::vector<std::string> values2 = {"aaaaa", "bbbbb", "ccccc", "ddddd", "eeeee", "fffff"};
for (auto& value : values2) {
Slice slice(value);
builder->add_values((const uint8_t*)&slice, 1);
}
builder->add_nulls(1);
builder->flush();
ASSERT_OK(builder->flush());
for (int i = 0; i < 6; ++i) {
builder->add_nulls(1);
}
builder->flush();
ASSERT_OK(builder->flush());
// write out zone map index
ColumnIndexMetaPB index_meta;
{
@ -338,15 +341,15 @@ TEST_F(ColumnZoneMapTest, NormalTestIntPage) {
for (auto value : values1) {
builder->add_values((const uint8_t*)&value, 1);
}
builder->flush();
ASSERT_OK(builder->flush());
std::vector<int> values2 = {2, 12, 31, 23, 21, 22};
for (auto value : values2) {
builder->add_values((const uint8_t*)&value, 1);
}
builder->add_nulls(1);
builder->flush();
ASSERT_OK(builder->flush());
builder->add_nulls(6);
builder->flush();
ASSERT_OK(builder->flush());
// write out zone map index
ColumnIndexMetaPB index_meta;
write_file(*builder, index_meta, filename);
@ -406,7 +409,7 @@ TEST_F(ColumnZoneMapTest, VarbinaryWithBinaryData) {
Slice slice(value);
writer->add_values((const uint8_t*)&slice, 1);
}
writer->flush();
ASSERT_OK(writer->flush());
// Add more binary data with different patterns
std::vector<std::string> binary_values2 = {
@ -419,11 +422,11 @@ TEST_F(ColumnZoneMapTest, VarbinaryWithBinaryData) {
writer->add_values((const uint8_t*)&slice, 1);
}
writer->add_nulls(1);
writer->flush();
ASSERT_OK(writer->flush());
// Add null values
writer->add_nulls(3);
writer->flush();
ASSERT_OK(writer->flush());
// Write out zone map index
ColumnIndexMetaPB index_meta;
@ -515,4 +518,392 @@ TEST_F(ColumnZoneMapTest, StringPrefixZonemapVariants) {
config::string_prefix_zonemap_prefix_len = old_len;
}
} // namespace starrocks
// Fixture for ZoneMapIndexQualityJudger unit tests; provides helpers for building
// page zone maps and a VARCHAR TypeInfo.
class ZoneMapIndexQualityJudgerTest : public testing::Test {
protected:
    void SetUp() override {}

    // Builds a ZoneMapPB with the given bounds and null flags.
    // By default the page holds only non-null values.
    ZoneMapPB create_zone_map_pb(const std::string& min_val, const std::string& max_val, bool has_null = false,
                                 bool has_not_null = true) {
        ZoneMapPB page;
        page.set_has_null(has_null);
        page.set_has_not_null(has_not_null);
        page.set_min(min_val);
        page.set_max(max_val);
        return page;
    }

    // Returns the TypeInfo of a VARCHAR key column.
    TypeInfoPtr create_string_type_info() {
        TabletColumn column = create_varchar_key(0);
        return get_type_info(column);
    }
};
// Fewer pages than the required sample size must leave the decision Unknown.
TEST_F(ZoneMapIndexQualityJudgerTest, InsufficientSamplePages) {
    auto string_type = create_string_type_info();
    // The judger needs 5 pages before it decides.
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 5);

    // Only 3 pages are supplied.
    const std::vector<std::pair<std::string, std::string>> ranges = {{"a", "c"}, {"d", "f"}, {"g", "i"}};
    for (const auto& [lower, upper] : ranges) {
        judger->feed(create_zone_map_pb(lower, upper));
    }

    const CreateIndexDecision decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Unknown, decision);
}
// Disjoint page ranges give an overlap ratio of 0, which is within a 30% threshold.
TEST_F(ZoneMapIndexQualityJudgerTest, GoodIndexLowOverlap) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.3, 3);

    // a-c, d-f, g-i: no two ranges share a value.
    const std::vector<std::pair<std::string, std::string>> disjoint_ranges = {{"a", "c"}, {"d", "f"}, {"g", "i"}};
    for (const auto& [lower, upper] : disjoint_ranges) {
        judger->feed(create_zone_map_pb(lower, upper));
    }

    // ratio = 0 <= 0.3 -> Good
    const CreateIndexDecision decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Good, decision);
}
// Nested page ranges overlap in every pair, which exceeds a 20% threshold.
TEST_F(ZoneMapIndexQualityJudgerTest, BadIndexHighOverlap) {
auto type_info = create_string_type_info();
auto judger = ZoneMapIndexQualityJudger::create(type_info.get(), 0.2, 3); // 20% overlap threshold
// Create highly overlapping zones
judger->feed(create_zone_map_pb("a", "z")); // a-z (covers everything)
judger->feed(create_zone_map_pb("b", "y")); // b-y (contained in a-z)
judger->feed(create_zone_map_pb("c", "x")); // c-x (contained in b-y and a-z)
// Every unique pair overlaps:
// (zone 1, zone 2), (zone 1, zone 3), (zone 2, zone 3)
// Total overlaps: 3 (each unique pair counted once)
// Overlap ratio: 3 / (3 * 2 / 2) = 3/3 = 1.0
// Since 1.0 > 0.2, this should be a bad index
ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
// A chain of partially overlapping ranges: 2 of 3 pairs overlap, above a 50% threshold.
TEST_F(ZoneMapIndexQualityJudgerTest, HighOverlapThreshold) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 3);

    // a-e and c-g share c..e; c-g and f-j share f..g; a-e and f-j are disjoint.
    const std::vector<std::pair<std::string, std::string>> chained_ranges = {{"a", "e"}, {"c", "g"}, {"f", "j"}};
    for (const auto& [lower, upper] : chained_ranges) {
        judger->feed(create_zone_map_pb(lower, upper));
    }

    // ratio = 2 / (3 * 2 / 2) = 0.67 > 0.5 -> Bad
    const CreateIndexDecision decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Bad, decision);
}
// The threshold comparison is inclusive: a ratio exactly equal to the threshold is Good.
// The previous data produced a ratio of 0.67 against a threshold of 0.33, so the
// boundary itself was never exercised.
TEST_F(ZoneMapIndexQualityJudgerTest, EdgeCaseExactThreshold) {
    auto type_info = create_string_type_info();
    // 0.5 is exactly representable as a double, so "ratio == threshold" can be
    // pinned without rounding noise. 4 pages give 6 unique pairs.
    auto judger = ZoneMapIndexQualityJudger::create(type_info.get(), 0.5, 4);

    judger->feed(create_zone_map_pb("a", "c")); // a-c
    judger->feed(create_zone_map_pb("b", "e")); // b-e (overlaps a-c; touches e-g at "e")
    judger->feed(create_zone_map_pb("e", "g")); // e-g
    judger->feed(create_zone_map_pb("f", "h")); // f-h (overlaps e-g)

    // Overlapping pairs: (1,2), (2,3), (3,4); pairs (1,3), (1,4), (2,4) are disjoint.
    // Overlap ratio: 3 / (4 * 3 / 2) = 3/6 = 0.5
    // Since 0.5 <= 0.5, this should be a good index
    ASSERT_EQ(CreateIndexDecision::Good, judger->make_decision());
}
// Pages that contain nulls are treated as overlapping with every other page.
TEST_F(ZoneMapIndexQualityJudgerTest, NullValueOverlapBehavior) {
auto type_info = create_string_type_info();
auto judger = ZoneMapIndexQualityJudger::create(type_info.get(), 0.5, 4);
// Test case 1: Zones with no nulls - should not overlap
judger->feed(create_zone_map_pb("a", "c", false, true)); // a-c, no nulls
judger->feed(create_zone_map_pb("d", "f", false, true)); // d-f, no nulls
judger->feed(create_zone_map_pb("g", "i", false, true)); // g-i, no nulls
judger->feed(create_zone_map_pb("j", "l", false, true)); // j-l, no nulls
// These zones have no value overlap and no nulls
// Total overlaps: 0
// Overlap ratio: 0 / (4 * 3 / 2) = 0 / 6 = 0
// Since 0 <= 0.5, this should be a good index
ASSERT_EQ(CreateIndexDecision::Good, judger->make_decision());
// Test case 2: Mix of zones with and without nulls
auto judger2 = ZoneMapIndexQualityJudger::create(type_info.get(), 0.5, 3);
judger2->feed(create_zone_map_pb("a", "c", false, true)); // a-c, no nulls
judger2->feed(create_zone_map_pb("d", "f", true, true));  // d-f, has nulls
judger2->feed(create_zone_map_pb("g", "i", false, true)); // g-i, no nulls
// Pair (zone 1, zone 2) overlaps (zone 2 has nulls)
// Pair (zone 2, zone 3) overlaps (zone 2 has nulls)
// Pair (zone 1, zone 3) does not overlap (no nulls, disjoint ranges)
// Total overlaps: 2 (each unique pair counted once)
// Overlap ratio: 2 / (3 * 2 / 2) = 2/3 = 0.67
// Since 0.67 > 0.5, this should be a bad index
ASSERT_EQ(CreateIndexDecision::Bad, judger2->make_decision());
// Test case 3: All zones have nulls - should all overlap
auto judger3 = ZoneMapIndexQualityJudger::create(type_info.get(), 0.5, 3);
judger3->feed(create_zone_map_pb("a", "c", true, true)); // a-c, has nulls
judger3->feed(create_zone_map_pb("d", "f", true, true)); // d-f, has nulls
judger3->feed(create_zone_map_pb("g", "i", true, true)); // g-i, has nulls
// All zones have nulls, so they all overlap with each other
// Total overlaps: 3 (each unique pair counted once)
// Overlap ratio: 3 / (3 * 2 / 2) = 3/3 = 1.0
// Since 1.0 > 0.5, this should be a bad index
ASSERT_EQ(CreateIndexDecision::Bad, judger3->make_decision());
}
// An all-null page overlaps with every other page even when the value ranges are disjoint.
TEST_F(ZoneMapIndexQualityJudgerTest, NullValueEdgeCases) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.3, 3);

    const ZoneMapPB first_values = create_zone_map_pb("a", "c", false, true);  // a-c, no nulls
    const ZoneMapPB second_values = create_zone_map_pb("d", "f", false, true); // d-f, no nulls
    const ZoneMapPB only_nulls = create_zone_map_pb("", "", true, false);      // all nulls

    judger->feed(first_values);
    judger->feed(second_values);
    judger->feed(only_nulls);

    // The all-null page overlaps with both value pages; the value pages are disjoint.
    // ratio = 2 / (3 * 2 / 2) = 0.67 > 0.3 -> Bad
    const CreateIndexDecision decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Bad, decision);
}
// Ten sliding three-letter ranges: neighbouring ranges overlap enough to exceed a 10% threshold.
TEST_F(ZoneMapIndexQualityJudgerTest, LargeSampleSize) {
auto type_info = create_string_type_info();
auto judger = ZoneMapIndexQualityJudger::create(type_info.get(), 0.1, 10); // 10% overlap threshold, 10 pages
// Create 10 zones [c, c + 2] for c = 'a'..'j': a-c, b-d, c-e, ..., j-l
for (char c = 'a'; c <= 'j'; ++c) {
std::string min_val(1, c);
std::string max_val(1, c + 2);
judger->feed(create_zone_map_pb(min_val, max_val));
}
// Two zones overlap when their start letters differ by 1 or 2,
// so each zone overlaps with at most 4 others (two on each side)
// Total overlaps: 9 (distance 1) + 8 (distance 2) = 17 unique pairs
// Overlap ratio: 17 / (10 * 9 / 2) = 17/45 = 0.38
// Since 0.38 > 0.1, this should be a bad index
ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
// With a 0% threshold a single overlapping pair is enough to reject the index.
TEST_F(ZoneMapIndexQualityJudgerTest, ZeroOverlapThreshold) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.0, 3);

    // a-c and b-d overlap; e-g is disjoint from both.
    const std::vector<std::pair<std::string, std::string>> ranges = {{"a", "c"}, {"b", "d"}, {"e", "g"}};
    for (const auto& [lower, upper] : ranges) {
        judger->feed(create_zone_map_pb(lower, upper));
    }

    // ratio = 1 / (3 * 2 / 2) = 0.33 > 0.0 -> Bad
    const CreateIndexDecision decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Bad, decision);
}
// With a 100% threshold a partially overlapping sample is still accepted.
TEST_F(ZoneMapIndexQualityJudgerTest, OneHundredPercentOverlapThreshold) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 1.0, 3);

    // a-c and b-d overlap; e-g is disjoint from both.
    const std::vector<std::pair<std::string, std::string>> ranges = {{"a", "c"}, {"b", "d"}, {"e", "g"}};
    for (const auto& [lower, upper] : ranges) {
        judger->feed(create_zone_map_pb(lower, upper));
    }

    // ratio = 1 / (3 * 2 / 2) = 0.33 <= 1.0 -> Good
    const CreateIndexDecision decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Good, decision);
}
// Identical page ranges overlap in every pair, giving the maximum ratio of 1.0.
TEST_F(ZoneMapIndexQualityJudgerTest, CompleteOverlap) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 3);

    // Three pages that all span a-z.
    for (int page = 0; page < 3; ++page) {
        judger->feed(create_zone_map_pb("a", "z"));
    }

    // ratio = 3 / (3 * 2 / 2) = 1.0 > 0.5 -> Bad
    const CreateIndexDecision decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Bad, decision);
}
// Exercises the judger with an INT column instead of a string column.
// NOTE(review): ZoneMap::from_proto wraps the serialized min/max in a Slice and
// passes it to TypeInfo::direct_copy. For INT this presumably does not parse the
// decimal text below, so the compared values may not be 1..30 -- confirm that this
// test checks what it intends to and is not dependent on memory layout.
TEST_F(ZoneMapIndexQualityJudgerTest, DifferentDataTypes) {
// Test with different data types to ensure the judger works correctly
// Test with INT type
TabletColumn int_column = create_int_key(0);
TypeInfoPtr int_type_info = get_type_info(int_column);
auto int_judger = ZoneMapIndexQualityJudger::create(int_type_info.get(), 0.3, 3);
// Create int zones with no overlap
ZoneMapPB int_zone1;
int_zone1.set_min("1");
int_zone1.set_max("10");
int_zone1.set_has_null(false);
int_zone1.set_has_not_null(true);
ZoneMapPB int_zone2;
int_zone2.set_min("11");
int_zone2.set_max("20");
int_zone2.set_has_null(false);
int_zone2.set_has_not_null(true);
ZoneMapPB int_zone3;
int_zone3.set_min("21");
int_zone3.set_max("30");
int_zone3.set_has_null(false);
int_zone3.set_has_not_null(true);
int_judger->feed(int_zone1);
int_judger->feed(int_zone2);
int_judger->feed(int_zone3);
// The intended int zones (1-10, 11-20, 21-30) have no overlap, so overlap ratio = 0
// Since 0 <= 0.3, this should be a good index
ASSERT_EQ(CreateIndexDecision::Good, int_judger->make_decision());
}
// Feeding exactly the required number of pages is enough to reach a decision.
TEST_F(ZoneMapIndexQualityJudgerTest, BoundaryConditions) {
    auto string_type = create_string_type_info();
    // Two pages is the configured minimum here.
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 2);

    const std::vector<std::pair<std::string, std::string>> ranges = {{"a", "c"}, {"d", "f"}};
    for (const auto& [lower, upper] : ranges) {
        judger->feed(create_zone_map_pb(lower, upper));
    }

    // A decision must be available once the minimum is reached.
    const CreateIndexDecision first_decision = judger->make_decision();
    ASSERT_NE(CreateIndexDecision::Unknown, first_decision);

    // The two ranges are disjoint: ratio = 0 <= 0.5 -> Good
    const CreateIndexDecision second_decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Good, second_decision);
}
// Two overlapping pairs out of six possible pairs give a ratio of 1/3, above a 25% threshold.
TEST_F(ZoneMapIndexQualityJudgerTest, OverlapCalculationAccuracy) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.25, 4);

    // a-c overlaps b-d, and e-g overlaps f-h; the two groups are disjoint.
    const std::vector<std::pair<std::string, std::string>> ranges = {
            {"a", "c"}, {"b", "d"}, {"e", "g"}, {"f", "h"}};
    for (const auto& [lower, upper] : ranges) {
        judger->feed(create_zone_map_pb(lower, upper));
    }

    // ratio = 2 / (4 * 3 / 2) = 2/6 = 0.33 > 0.25 -> Bad
    const CreateIndexDecision decision = judger->make_decision();
    ASSERT_EQ(CreateIndexDecision::Bad, decision);
}
// Test class for ZoneMapIndexBuilder integration with ZoneMapIndexQualityJudger
class ZoneMapIndexBuilderIntegrationTest : public testing::Test {
protected:
// Directory created inside the in-memory file system during SetUp().
const std::string kTestDir = "/zone_map_index_builder_test";
// Creates a fresh memory tracker and an in-memory file system containing kTestDir.
void SetUp() override {
_mem_tracker = std::make_unique<MemTracker>();
_fs = std::make_shared<MemoryFileSystem>();
ASSERT_TRUE(_fs->create_dir(kTestDir).ok());
}
// Nothing to clean up explicitly; the members release their resources on destruction.
void TearDown() override {}
std::unique_ptr<MemTracker> _mem_tracker;
std::shared_ptr<MemoryFileSystem> _fs;
};
// Writes three well-separated string pages through ScalarColumnWriter with the
// adaptive zone map check enabled and verifies that writing succeeds end to end.
TEST_F(ZoneMapIndexBuilderIntegrationTest, AdaptiveIndexCreation) {
    // The judger settings are process-wide mutable configs. Restore them when the
    // test body exits, including the early return taken by a failed ASSERT_*, so
    // that tests running afterwards do not inherit the values set here.
    struct AdaptiveConfigGuard {
        decltype(config::string_zonemap_min_pages_for_adaptive_check) min_pages =
                config::string_zonemap_min_pages_for_adaptive_check;
        decltype(config::string_zonemap_overlap_threshold) overlap_threshold =
                config::string_zonemap_overlap_threshold;
        ~AdaptiveConfigGuard() {
            config::string_zonemap_min_pages_for_adaptive_check = min_pages;
            config::string_zonemap_overlap_threshold = overlap_threshold;
        }
    } config_guard;

    // Set configuration for adaptive behavior
    config::string_zonemap_min_pages_for_adaptive_check = 3;
    config::string_zonemap_overlap_threshold = 0.5;

    // Create a string column writer with zone map enabled
    TabletColumn varchar_column = create_varchar_key(0);
    ColumnWriterOptions opts;
    ColumnMetaPB meta;
    meta.set_column_id(0);
    meta.set_unique_id(0);
    meta.set_type(varchar_column.type());
    meta.set_length(varchar_column.length());
    meta.set_encoding(DEFAULT_ENCODING);
    meta.set_compression(NO_COMPRESSION);
    meta.set_is_nullable(false);
    opts.meta = &meta;
    opts.need_zone_map = true;
    TypeInfoPtr type_info = get_type_info(varchar_column);

    // Create in-memory file system for testing
    auto fs = std::make_shared<MemoryFileSystem>();
    ASSERT_TRUE(fs->create_dir("/tmp").ok());
    ASSIGN_OR_ABORT(auto wfile, fs->new_writable_file("/tmp/zonemap_adaptive_test"));
    auto writer = std::make_unique<ScalarColumnWriter>(opts, type_info, wfile.get());
    ASSERT_TRUE(writer->init().ok());

    // Add 3 pages with low overlap (should result in "Good" decision).
    // The judger is consulted each time a page is finished, so the decision is
    // expected once the third page has been finished.
    // Page 1: a-c
    BinaryColumn col1;
    col1.append(Slice("a"));
    col1.append(Slice("b"));
    col1.append(Slice("c"));
    ASSERT_TRUE(writer->append(col1).ok());
    ASSERT_TRUE(writer->finish_current_page().ok());

    // Page 2: d-f
    BinaryColumn col2;
    col2.append(Slice("d"));
    col2.append(Slice("e"));
    col2.append(Slice("f"));
    ASSERT_TRUE(writer->append(col2).ok());
    ASSERT_TRUE(writer->finish_current_page().ok());

    // Page 3: g-i
    BinaryColumn col3;
    col3.append(Slice("g"));
    col3.append(Slice("h"));
    col3.append(Slice("i"));
    ASSERT_TRUE(writer->append(col3).ok());
    ASSERT_TRUE(writer->finish_current_page().ok());

    // Finish writing the column
    ASSERT_TRUE(writer->finish().ok());

    // Write the zone map index - this is required to actually write the index to the file
    ASSERT_TRUE(writer->write_zone_map().ok());

    // Close the file
    ASSERT_OK(wfile->close());

    // Verify that the file was created and has content.
    // NOTE(review): a non-empty file is only a weak signal that the zone map index
    // was kept by the quality judger; consider asserting on the index meta instead.
    ASSERT_TRUE(fs->path_exists("/tmp/zonemap_adaptive_test").ok());

    // Check that the file size is greater than 0
    ASSIGN_OR_ABORT(auto file_size, fs->get_file_size("/tmp/zonemap_adaptive_test"));
    ASSERT_GT(file_size, 0);
}
} // namespace starrocks