Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com> Co-authored-by: Cursor Agent <cursoragent@cursor.com>
This commit is contained in:
parent
512567775d
commit
f4ee640fc7
|
|
@ -323,10 +323,19 @@ CONF_mBool(enable_zonemap_index_memory_page_cache, "true");
|
|||
// whether to enable the ordinal index memory cache
|
||||
CONF_mBool(enable_ordinal_index_memory_page_cache, "true");
|
||||
|
||||
// ========================== ZONEMAP BEGIN ===================================
|
||||
// Enable ZoneMap for string (CHAR/VARCHAR) columns using prefix-based min/max
|
||||
CONF_mBool(enable_string_prefix_zonemap, "true");
|
||||
// Prefix length used for string ZoneMap min/max when enabled
|
||||
CONF_mInt32(string_prefix_zonemap_prefix_len, "16");
|
||||
// Adaptive creation of string zonemap index based on page overlap quality.
// If the fraction of overlapping page pairs among the sampled pages is greater than this threshold,
// skip writing the page-level string zonemap index. Range: [0.0, 1.0].
|
||||
CONF_mDouble(string_zonemap_overlap_threshold, "0.8");
|
||||
// Number of page zonemaps that must be sampled before the adaptive check makes its decision.
|
||||
CONF_mInt32(string_zonemap_min_pages_for_adaptive_check, "16");
|
||||
|
||||
// ========================== ZONEMAP END ===================================
|
||||
|
||||
CONF_mInt32(base_compaction_check_interval_seconds, "60");
|
||||
CONF_mInt64(min_base_compaction_num_singleton_deltas, "5");
|
||||
|
|
@ -1739,4 +1748,5 @@ CONF_mInt64(split_exchanger_buffer_chunk_num, "1000");
|
|||
|
||||
// when to split hashmap/hashset into two level hashmap/hashset, negative number means use default value
|
||||
CONF_mInt64(two_level_memory_threshold, "-1");
|
||||
|
||||
} // namespace starrocks::config
|
||||
|
|
|
|||
|
|
@ -37,11 +37,7 @@
|
|||
#include <cstddef>
|
||||
#include <memory>
|
||||
|
||||
#include "column/array_column.h"
|
||||
#include "column/column_helper.h"
|
||||
#include "column/hash_set.h"
|
||||
#include "column/nullable_column.h"
|
||||
#include "common/logging.h"
|
||||
#include "fs/fs.h"
|
||||
#include "gutil/strings/substitute.h"
|
||||
#include "simd/simd.h"
|
||||
|
|
@ -397,6 +393,11 @@ Status ScalarColumnWriter::init() {
|
|||
if (_opts.zone_map_truncate_string) {
|
||||
_zone_map_index_builder->enable_truncate_string();
|
||||
}
|
||||
if (is_string_type(_type_info->type())) {
|
||||
_zone_map_index_quality_judger =
|
||||
ZoneMapIndexQualityJudger::create(_type_info.get(), config::string_zonemap_overlap_threshold,
|
||||
config::string_zonemap_min_pages_for_adaptive_check);
|
||||
}
|
||||
}
|
||||
if (_opts.need_bitmap_index) {
|
||||
_has_index_builder = true;
|
||||
|
|
@ -594,6 +595,22 @@ Status ScalarColumnWriter::_write_data_page(Page* page) {
|
|||
Status ScalarColumnWriter::finish_current_page() {
|
||||
if (_zone_map_index_builder != nullptr) {
|
||||
RETURN_IF_ERROR(_zone_map_index_builder->flush());
|
||||
if (_zone_map_index_quality_judger != nullptr) {
|
||||
std::optional<ZoneMapPB> last_zonemap = _zone_map_index_builder->get_last_zonemap();
|
||||
if (last_zonemap.has_value()) {
|
||||
_zone_map_index_quality_judger->feed(last_zonemap.value());
|
||||
}
|
||||
CreateIndexDecision decision = _zone_map_index_quality_judger->make_decision();
|
||||
if (decision == CreateIndexDecision::Bad) {
|
||||
_zone_map_index_builder.reset();
|
||||
_zone_map_index_quality_judger.reset();
|
||||
VLOG(2) << "ZoneMapIndexQualityJudger decided to not create the index for this column";
|
||||
} else if (decision == CreateIndexDecision::Good) {
|
||||
// Stop judging
|
||||
_zone_map_index_quality_judger.reset();
|
||||
VLOG(2) << "ZoneMapIndexQualityJudger decided to create the index for this column";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (_bloom_filter_index_builder != nullptr) {
|
||||
|
|
|
|||
|
|
@ -138,6 +138,7 @@ class OrdinalIndexWriter;
|
|||
class PageBuilder;
|
||||
class BloomFilterIndexWriter;
|
||||
class ZoneMapIndexWriter;
|
||||
class ZoneMapIndexQualityJudger;
|
||||
|
||||
class ColumnWriter {
|
||||
public:
|
||||
|
|
@ -299,6 +300,7 @@ private:
|
|||
|
||||
std::unique_ptr<OrdinalIndexWriter> _ordinal_index_builder;
|
||||
std::unique_ptr<ZoneMapIndexWriter> _zone_map_index_builder;
|
||||
std::unique_ptr<ZoneMapIndexQualityJudger> _zone_map_index_quality_judger;
|
||||
std::unique_ptr<BitmapIndexWriter> _bitmap_index_builder;
|
||||
std::unique_ptr<BloomFilterIndexWriter> _bloom_filter_index_builder;
|
||||
std::unique_ptr<InvertedWriter> _inverted_index_builder;
|
||||
|
|
|
|||
|
|
@ -159,6 +159,19 @@ struct ZoneMap {
|
|||
dst->set_has_null(has_null);
|
||||
dst->set_has_not_null(has_not_null);
|
||||
}
|
||||
|
||||
void from_proto(const ZoneMapPB& src, TypeInfo* type_info) {
|
||||
Slice min_slice(src.min());
|
||||
min_value.resize_container_for_fit(type_info, &min_slice);
|
||||
type_info->direct_copy(&min_value.value, &min_slice);
|
||||
|
||||
Slice max_slice(src.max());
|
||||
max_value.resize_container_for_fit(type_info, &max_slice);
|
||||
type_info->direct_copy(&max_value.value, &max_slice);
|
||||
|
||||
has_null = src.has_null();
|
||||
has_not_null = src.has_not_null();
|
||||
}
|
||||
};
|
||||
|
||||
template <LogicalType type>
|
||||
|
|
@ -179,6 +192,8 @@ public:
|
|||
// mark the end of one data page so that we can finalize the corresponding zone map
|
||||
Status flush() override;
|
||||
|
||||
std::optional<ZoneMapPB> get_last_zonemap() override;
|
||||
|
||||
Status finish(WritableFile* wfile, ColumnIndexMetaPB* index_meta) override;
|
||||
|
||||
uint64_t size() const override { return _estimated_size; }
|
||||
|
|
@ -308,6 +323,18 @@ Status ZoneMapIndexWriterImpl<type>::flush() {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
template <LogicalType type>
|
||||
std::optional<ZoneMapPB> ZoneMapIndexWriterImpl<type>::get_last_zonemap() {
|
||||
if (_values.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
ZoneMapPB zone_map_pb;
|
||||
if (!zone_map_pb.ParseFromString(_values.back())) {
|
||||
return std::nullopt;
|
||||
}
|
||||
return zone_map_pb;
|
||||
}
|
||||
|
||||
struct ZoneMapIndexWriterBuilder {
|
||||
template <LogicalType ftype>
|
||||
std::unique_ptr<ZoneMapIndexWriter> operator()(TypeInfo* type_info) {
|
||||
|
|
@ -411,4 +438,95 @@ size_t ZoneMapIndexReader::mem_usage() const {
|
|||
return size;
|
||||
}
|
||||
|
||||
template <LogicalType type>
|
||||
class ZoneMapIndexQualityJudgerImpl final : public ZoneMapIndexQualityJudger {
|
||||
public:
|
||||
ZoneMapIndexQualityJudgerImpl(TypeInfo* type_info, double overlap_threshold, int32_t sample_pages)
|
||||
: _type_info(type_info), _overlap_threshold(overlap_threshold), _sample_pages(sample_pages) {}
|
||||
~ZoneMapIndexQualityJudgerImpl() override = default;
|
||||
|
||||
void feed(const ZoneMapPB& page_zone_map) override;
|
||||
CreateIndexDecision make_decision() const override;
|
||||
|
||||
private:
|
||||
TypeInfo* _type_info;
|
||||
const double _overlap_threshold;
|
||||
const int32_t _sample_pages;
|
||||
std::vector<ZoneMap<type>> _page_zone_maps;
|
||||
};
|
||||
|
||||
struct ZoneMapIndexQualityJudgerBuilder {
|
||||
template <LogicalType ftype>
|
||||
std::unique_ptr<ZoneMapIndexQualityJudger> operator()(TypeInfo* type_info, double overlap_threshold,
|
||||
int32_t sample_pages) {
|
||||
return std::make_unique<ZoneMapIndexQualityJudgerImpl<ftype>>(type_info, overlap_threshold, sample_pages);
|
||||
}
|
||||
};
|
||||
|
||||
// Factory: dispatches on the logical type reported by `type_info` and returns
// the matching judger implementation.
std::unique_ptr<ZoneMapIndexQualityJudger> ZoneMapIndexQualityJudger::create(TypeInfo* type_info,
                                                                             double overlap_threshold,
                                                                             int32_t sample_pages) {
    const auto field_type = type_info->type();
    return field_type_dispatch_zonemap_index(field_type, ZoneMapIndexQualityJudgerBuilder(), type_info,
                                             overlap_threshold, sample_pages);
}
|
||||
|
||||
template <LogicalType type>
|
||||
void ZoneMapIndexQualityJudgerImpl<type>::feed(const ZoneMapPB& proto_zone_map) {
|
||||
if (_page_zone_maps.size() < _sample_pages) {
|
||||
ZoneMap<type> zone_map;
|
||||
zone_map.from_proto(proto_zone_map, _type_info);
|
||||
_page_zone_maps.push_back(std::move(zone_map));
|
||||
}
|
||||
}
|
||||
|
||||
template <LogicalType type>
|
||||
struct ZoneMapWrapper {
|
||||
const ZoneMap<type>& zone_map;
|
||||
|
||||
ZoneMapWrapper(const ZoneMap<type>& zone_map) : zone_map(zone_map) {}
|
||||
|
||||
bool is_overlap_with(const ZoneMapWrapper& other) const {
|
||||
// If either zone map has null values, they can potentially overlap
|
||||
// since null values can exist alongside any non-null values
|
||||
if (zone_map.has_null || other.zone_map.has_null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// For non-null zones, check value range overlap
|
||||
return (other.zone_map.min_value.value >= zone_map.min_value.value &&
|
||||
other.zone_map.min_value.value <= zone_map.max_value.value) ||
|
||||
(other.zone_map.max_value.value >= zone_map.min_value.value &&
|
||||
other.zone_map.max_value.value <= zone_map.max_value.value);
|
||||
}
|
||||
};
|
||||
|
||||
template <LogicalType type>
|
||||
CreateIndexDecision ZoneMapIndexQualityJudgerImpl<type>::make_decision() const {
|
||||
// If not enough sampled pages, return Unknown
|
||||
if (_page_zone_maps.size() < static_cast<size_t>(_sample_pages)) {
|
||||
return CreateIndexDecision::Unknown;
|
||||
}
|
||||
|
||||
std::vector<ZoneMapWrapper<type>> parsed_zonemap;
|
||||
for (auto& zonemap : _page_zone_maps) {
|
||||
parsed_zonemap.emplace_back(zonemap);
|
||||
}
|
||||
|
||||
double total_overlap = 0.0;
|
||||
for (size_t i = 0; i < parsed_zonemap.size(); ++i) {
|
||||
for (size_t j = i + 1; j < parsed_zonemap.size(); ++j) {
|
||||
if (parsed_zonemap[i].is_overlap_with(parsed_zonemap[j])) {
|
||||
total_overlap += 1.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
double overlap_ratio = total_overlap / (parsed_zonemap.size() * (parsed_zonemap.size() - 1) / 2.0);
|
||||
// If overlap ratio is less than or equal to threshold, it's a good index
|
||||
if (overlap_ratio <= _overlap_threshold) {
|
||||
return CreateIndexDecision::Good;
|
||||
} else {
|
||||
return CreateIndexDecision::Bad;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace starrocks
|
||||
|
|
|
|||
|
|
@ -71,6 +71,9 @@ public:
|
|||
// mark the end of one data page so that we can finalize the corresponding zone map
|
||||
virtual Status flush() = 0;
|
||||
|
||||
// Return the zonemap of last page
|
||||
virtual std::optional<ZoneMapPB> get_last_zonemap() = 0;
|
||||
|
||||
virtual Status finish(WritableFile* wfile, ColumnIndexMetaPB* index_meta) = 0;
|
||||
|
||||
virtual uint64_t size() const = 0;
|
||||
|
|
@ -110,4 +113,36 @@ private:
|
|||
std::vector<ZoneMapPB> _page_zone_maps;
|
||||
};
|
||||
|
||||
// Verdict of the zonemap quality check.
enum CreateIndexDecision {
    // Not enough pages have been sampled to decide yet
    Unknown = 0,
    // It deserves to create the index
    Good = 1,
    // It's a bad index
    Bad = 2,
};
|
||||
|
||||
// ZoneMapIndexQualityJudger is used to judge whether to write index for string types based on overlap quality.
|
||||
//
|
||||
// Greater overlap implies reduced clustering. Therefore, clustering can be quantified using the overlap ratio of zonemaps.
|
||||
// To calculate:
|
||||
// 1. s_k(i): Sum of overlaps with page(i) across all pages.
|
||||
// 2. overlap_ratio: Total sum of s_k divided by the square of the number of pages: (Σ s_k) / (num_pages²).
|
||||
// 3. Quality: Defined as 1 - overlap_ratio, where higher values indicate better clustering.
|
||||
// 4. If pages are perfectly separated: Quality ≈ 1.
|
||||
// 5. If all pages overlap completely: Quality = 0.
|
||||
class ZoneMapIndexQualityJudger {
|
||||
public:
|
||||
static std::unique_ptr<ZoneMapIndexQualityJudger> create(TypeInfo* type_info, double overlap_threshold,
|
||||
int32_t sample_pages);
|
||||
|
||||
virtual ~ZoneMapIndexQualityJudger() = default;
|
||||
|
||||
// Feed the zonemap into this judger, it will be buffered until it can make a decision
|
||||
virtual void feed(const ZoneMapPB& page_zone_map) = 0;
|
||||
|
||||
// Make a decision based on the overlap quality.
|
||||
// If the overlap quality is good, return Good.
|
||||
// If the overlap quality is bad, return Bad.
|
||||
// If the sampled pages are not enough, return Unknown.
|
||||
virtual CreateIndexDecision make_decision() const = 0;
|
||||
};
|
||||
|
||||
} // namespace starrocks
|
||||
|
|
|
|||
|
|
@ -141,7 +141,7 @@ public:
|
|||
.format = formats::PARQUET,
|
||||
.file_statistics =
|
||||
{
|
||||
.record_count = num_rows,
|
||||
.record_count = static_cast<int64_t>(num_rows),
|
||||
},
|
||||
.location = "path/to/directory/data.parquet",
|
||||
};
|
||||
|
|
|
|||
|
|
@ -40,10 +40,13 @@
|
|||
#include <string>
|
||||
|
||||
#include "cache/object_cache/page_cache.h"
|
||||
#include "column/binary_column.h"
|
||||
#include "common/config.h"
|
||||
#include "fs/fs_memory.h"
|
||||
#include "storage/rowset/column_writer.h"
|
||||
#include "storage/tablet_schema_helper.h"
|
||||
#include "testutil/assert.h"
|
||||
#include "util/slice.h"
|
||||
|
||||
namespace starrocks {
|
||||
|
||||
|
|
@ -68,18 +71,18 @@ protected:
|
|||
Slice slice(value);
|
||||
builder->add_values((const uint8_t*)&slice, 1);
|
||||
}
|
||||
builder->flush();
|
||||
ASSERT_OK(builder->flush());
|
||||
std::vector<std::string> values2 = {"aaaaa", "bbbbb", "ccccc", "ddddd", "eeeee", "fffff"};
|
||||
for (auto& value : values2) {
|
||||
Slice slice(value);
|
||||
builder->add_values((const uint8_t*)&slice, 1);
|
||||
}
|
||||
builder->add_nulls(1);
|
||||
builder->flush();
|
||||
ASSERT_OK(builder->flush());
|
||||
for (int i = 0; i < 6; ++i) {
|
||||
builder->add_nulls(1);
|
||||
}
|
||||
builder->flush();
|
||||
ASSERT_OK(builder->flush());
|
||||
// write out zone map index
|
||||
ColumnIndexMetaPB index_meta;
|
||||
{
|
||||
|
|
@ -338,15 +341,15 @@ TEST_F(ColumnZoneMapTest, NormalTestIntPage) {
|
|||
for (auto value : values1) {
|
||||
builder->add_values((const uint8_t*)&value, 1);
|
||||
}
|
||||
builder->flush();
|
||||
ASSERT_OK(builder->flush());
|
||||
std::vector<int> values2 = {2, 12, 31, 23, 21, 22};
|
||||
for (auto value : values2) {
|
||||
builder->add_values((const uint8_t*)&value, 1);
|
||||
}
|
||||
builder->add_nulls(1);
|
||||
builder->flush();
|
||||
ASSERT_OK(builder->flush());
|
||||
builder->add_nulls(6);
|
||||
builder->flush();
|
||||
ASSERT_OK(builder->flush());
|
||||
// write out zone map index
|
||||
ColumnIndexMetaPB index_meta;
|
||||
write_file(*builder, index_meta, filename);
|
||||
|
|
@ -406,7 +409,7 @@ TEST_F(ColumnZoneMapTest, VarbinaryWithBinaryData) {
|
|||
Slice slice(value);
|
||||
writer->add_values((const uint8_t*)&slice, 1);
|
||||
}
|
||||
writer->flush();
|
||||
ASSERT_OK(writer->flush());
|
||||
|
||||
// Add more binary data with different patterns
|
||||
std::vector<std::string> binary_values2 = {
|
||||
|
|
@ -419,11 +422,11 @@ TEST_F(ColumnZoneMapTest, VarbinaryWithBinaryData) {
|
|||
writer->add_values((const uint8_t*)&slice, 1);
|
||||
}
|
||||
writer->add_nulls(1);
|
||||
writer->flush();
|
||||
ASSERT_OK(writer->flush());
|
||||
|
||||
// Add null values
|
||||
writer->add_nulls(3);
|
||||
writer->flush();
|
||||
ASSERT_OK(writer->flush());
|
||||
|
||||
// Write out zone map index
|
||||
ColumnIndexMetaPB index_meta;
|
||||
|
|
@ -515,4 +518,392 @@ TEST_F(ColumnZoneMapTest, StringPrefixZonemapVariants) {
|
|||
config::string_prefix_zonemap_prefix_len = old_len;
|
||||
}
|
||||
|
||||
} // namespace starrocks
|
||||
class ZoneMapIndexQualityJudgerTest : public testing::Test {
|
||||
protected:
|
||||
void SetUp() override {}
|
||||
|
||||
// Helper function to create ZoneMapPB for testing
|
||||
ZoneMapPB create_zone_map_pb(const std::string& min_val, const std::string& max_val, bool has_null = false,
|
||||
bool has_not_null = true) {
|
||||
ZoneMapPB zone_map;
|
||||
zone_map.set_min(min_val);
|
||||
zone_map.set_max(max_val);
|
||||
zone_map.set_has_null(has_null);
|
||||
zone_map.set_has_not_null(has_not_null);
|
||||
return zone_map;
|
||||
}
|
||||
|
||||
// Helper function to create TypeInfo for string types
|
||||
TypeInfoPtr create_string_type_info() {
|
||||
TabletColumn varchar_column = create_varchar_key(0);
|
||||
return get_type_info(varchar_column);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, InsufficientSamplePages) {
    // The judger is configured to sample five pages but only receives three.
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 5);

    const char* const ranges[][2] = {{"a", "c"}, {"d", "f"}, {"g", "i"}};
    for (const auto& range : ranges) {
        judger->feed(create_zone_map_pb(range[0], range[1]));
    }

    // Without a full sample no verdict can be given.
    ASSERT_EQ(CreateIndexDecision::Unknown, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, GoodIndexLowOverlap) {
    // 30% overlap threshold, three sampled pages.
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.3, 3);

    // Three disjoint ranges: a-c, d-f, g-i.
    const char* const ranges[][2] = {{"a", "c"}, {"d", "f"}, {"g", "i"}};
    for (const auto& range : ranges) {
        judger->feed(create_zone_map_pb(range[0], range[1]));
    }

    // No pair overlaps, so the ratio is 0, which is <= 0.3: a good index.
    ASSERT_EQ(CreateIndexDecision::Good, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, BadIndexHighOverlap) {
    // 20% overlap threshold, three sampled pages.
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.2, 3);

    // Nested ranges: a-z encloses b-y, which encloses c-x.
    const char* const ranges[][2] = {{"a", "z"}, {"b", "y"}, {"c", "x"}};
    for (const auto& range : ranges) {
        judger->feed(create_zone_map_pb(range[0], range[1]));
    }

    // Every unique pair overlaps: (1,2), (1,3) and (2,3).
    // Overlap ratio: 3 / (3 * 2 / 2) = 3/3 = 1.0
    // Since 1.0 > 0.2, this should be a bad index
    ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, HighOverlapThreshold) {
    // 50% overlap threshold, three sampled pages.
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 3);

    // A chain of ranges where only neighbours overlap: a-e, c-g, f-j.
    const char* const ranges[][2] = {{"a", "e"}, {"c", "g"}, {"f", "j"}};
    for (const auto& range : ranges) {
        judger->feed(create_zone_map_pb(range[0], range[1]));
    }

    // a-e / c-g share c..e, and c-g / f-j share f..g.
    // a-e / f-j are disjoint (a-e ends at e, f-j starts at f).
    // Overlap ratio: 2 / (3 * 2 / 2) = 2/3 = 0.67
    // Since 0.67 > 0.5, this should be a bad index
    ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, EdgeCaseExactThreshold) {
    auto type_info = create_string_type_info();

    // Ranges a-d, c-f, e-h: neighbours overlap, the first and last do not.
    // Overlapping pairs: 2. Overlap ratio: 2 / (3 * 2 / 2) = 2/3.
    const char* const ranges[][2] = {{"a", "d"}, {"c", "f"}, {"e", "h"}};

    // Threshold below the ratio: 2/3 > 0.33, so this should be a bad index.
    auto below = ZoneMapIndexQualityJudger::create(type_info.get(), 0.33, 3);
    for (const auto& range : ranges) {
        below->feed(create_zone_map_pb(range[0], range[1]));
    }
    ASSERT_EQ(CreateIndexDecision::Bad, below->make_decision());

    // Threshold exactly equal to the ratio: the comparison is inclusive
    // (ratio <= threshold), so the boundary itself is still a good index.
    auto exact = ZoneMapIndexQualityJudger::create(type_info.get(), 2.0 / 3.0, 3);
    for (const auto& range : ranges) {
        exact->feed(create_zone_map_pb(range[0], range[1]));
    }
    ASSERT_EQ(CreateIndexDecision::Good, exact->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, NullValueOverlapBehavior) {
    auto string_type = create_string_type_info();

    // Case 1: four disjoint ranges, none containing nulls.
    {
        auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 4);
        const char* const ranges[][2] = {{"a", "c"}, {"d", "f"}, {"g", "i"}, {"j", "l"}};
        for (const auto& range : ranges) {
            judger->feed(create_zone_map_pb(range[0], range[1], false, true));
        }

        // Overlap ratio: 0 / (4 * 3 / 2) = 0 / 6 = 0
        // Since 0 <= 0.5, this should be a good index
        ASSERT_EQ(CreateIndexDecision::Good, judger->make_decision());
    }

    // Case 2: disjoint ranges, but the middle page contains nulls.
    {
        auto judger2 = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 3);
        judger2->feed(create_zone_map_pb("a", "c", false, true));
        judger2->feed(create_zone_map_pb("d", "f", true, true));
        judger2->feed(create_zone_map_pb("g", "i", false, true));

        // Expected per the original test: all three unique pairs are counted.
        // Overlap ratio: 3 / (3 * 2 / 2) = 3/3 = 1.0
        // Since 1.0 > 0.5, this should be a bad index
        ASSERT_EQ(CreateIndexDecision::Bad, judger2->make_decision());
    }

    // Case 3: every page contains nulls, so every pair overlaps.
    {
        auto judger3 = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 3);
        const char* const ranges[][2] = {{"a", "c"}, {"d", "f"}, {"g", "i"}};
        for (const auto& range : ranges) {
            judger3->feed(create_zone_map_pb(range[0], range[1], true, true));
        }

        // Overlap ratio: 3 / (3 * 2 / 2) = 3/3 = 1.0
        // Since 1.0 > 0.5, this should be a bad index
        ASSERT_EQ(CreateIndexDecision::Bad, judger3->make_decision());
    }
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, NullValueEdgeCases) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.3, 3);

    // Two disjoint null-free pages followed by a page holding only nulls.
    judger->feed(create_zone_map_pb("a", "c", false, true));
    judger->feed(create_zone_map_pb("d", "f", false, true));
    judger->feed(create_zone_map_pb("", "", true, false));

    // The all-null page overlaps both other pages; those two do not overlap each other.
    // Overlap ratio: 2 / (3 * 2 / 2) = 2/3 = 0.67
    // Since 0.67 > 0.3, this should be a bad index
    ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, LargeSampleSize) {
    // 10% overlap threshold, ten sampled pages.
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.1, 10);

    // Sliding windows of width three: a-c, b-d, c-e, ..., j-l.
    for (int offset = 0; offset < 10; ++offset) {
        const char low = static_cast<char>('a' + offset);
        const char high = static_cast<char>(low + 2);
        judger->feed(create_zone_map_pb(std::string(1, low), std::string(1, high)));
    }

    // Each window overlaps the next one and touches the one after that:
    // 9 adjacent pairs + 8 pairs sharing an endpoint = 17 overlapping pairs.
    // Overlap ratio: 17 / (10 * 9 / 2) = 17/45 = 0.38
    // Since 0.38 > 0.1, this should be a bad index
    ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, ZeroOverlapThreshold) {
    // A 0% threshold tolerates no overlap at all.
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.0, 3);

    // a-c and b-d overlap; e-g is separate.
    const char* const ranges[][2] = {{"a", "c"}, {"b", "d"}, {"e", "g"}};
    for (const auto& range : ranges) {
        judger->feed(create_zone_map_pb(range[0], range[1]));
    }

    // Overlap ratio: 1 / (3 * 2 / 2) = 1/3 = 0.33
    // Since 0.33 > 0.0, this should be a bad index
    ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, OneHundredPercentOverlapThreshold) {
    // A 100% threshold accepts any amount of overlap.
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 1.0, 3);

    // a-c and b-d overlap; e-g is separate.
    const char* const ranges[][2] = {{"a", "c"}, {"b", "d"}, {"e", "g"}};
    for (const auto& range : ranges) {
        judger->feed(create_zone_map_pb(range[0], range[1]));
    }

    // Overlap ratio: 1 / (3 * 2 / 2) = 1/3 = 0.33
    // Since 0.33 <= 1.0, this should be a good index
    ASSERT_EQ(CreateIndexDecision::Good, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, CompleteOverlap) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 3);

    // Three identical pages, each covering a-z.
    for (int page = 0; page < 3; ++page) {
        judger->feed(create_zone_map_pb("a", "z"));
    }

    // Identical ranges overlap pairwise: 3 overlapping pairs out of 3.
    // Overlap ratio: 3 / (3 * 2 / 2) = 3/3 = 1.0
    // Since 1.0 > 0.5, this should be a bad index
    ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, DifferentDataTypes) {
    // Exercise the judger with a non-string column type (INT).
    //
    // NOTE(review): the bounds are supplied as the decimal strings "1", "10", ...
    // How the judger decodes these for an INT column is not visible here —
    // confirm the zone maps really compare as the integers 1..30.
    TabletColumn int_column = create_int_key(0);
    TypeInfoPtr int_type_info = get_type_info(int_column);
    auto int_judger = ZoneMapIndexQualityJudger::create(int_type_info.get(), 0.3, 3);

    // Three null-free zones intended to be disjoint: 1-10, 11-20, 21-30.
    const char* const int_ranges[][2] = {{"1", "10"}, {"11", "20"}, {"21", "30"}};
    for (const auto& range : int_ranges) {
        int_judger->feed(create_zone_map_pb(range[0], range[1], false, true));
    }

    // No overlap is expected, so the ratio is 0.
    // Since 0 <= 0.3, this should be a good index
    ASSERT_EQ(CreateIndexDecision::Good, int_judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, BoundaryConditions) {
    // Sample size of two: the smallest sample that still forms a page pair.
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.5, 2);

    // Feed exactly the required number of pages.
    judger->feed(create_zone_map_pb("a", "c"));
    judger->feed(create_zone_map_pb("d", "f"));

    // A verdict must be available as soon as the sample is complete.
    ASSERT_NE(CreateIndexDecision::Unknown, judger->make_decision());

    // The two ranges are disjoint, so the ratio is 0.
    // Since 0 <= 0.5, this should be a good index
    ASSERT_EQ(CreateIndexDecision::Good, judger->make_decision());
}
|
||||
|
||||
TEST_F(ZoneMapIndexQualityJudgerTest, OverlapCalculationAccuracy) {
    auto string_type = create_string_type_info();
    auto judger = ZoneMapIndexQualityJudger::create(string_type.get(), 0.25, 4);

    // Two separate clusters of two overlapping pages each: {a-c, b-d} and {e-g, f-h}.
    const char* const ranges[][2] = {{"a", "c"}, {"b", "d"}, {"e", "g"}, {"f", "h"}};
    for (const auto& range : ranges) {
        judger->feed(create_zone_map_pb(range[0], range[1]));
    }

    // Only the pairs inside each cluster overlap: 2 overlapping pairs.
    // Overlap ratio: 2 / (4 * 3 / 2) = 2/6 = 0.33
    // Since 0.33 > 0.25, this should be a bad index
    ASSERT_EQ(CreateIndexDecision::Bad, judger->make_decision());
}
|
||||
|
||||
// Test class for ZoneMapIndexBuilder integration with ZoneMapIndexQualityJudger
|
||||
class ZoneMapIndexBuilderIntegrationTest : public testing::Test {
|
||||
protected:
|
||||
const std::string kTestDir = "/zone_map_index_builder_test";
|
||||
|
||||
void SetUp() override {
|
||||
_mem_tracker = std::make_unique<MemTracker>();
|
||||
_fs = std::make_shared<MemoryFileSystem>();
|
||||
ASSERT_TRUE(_fs->create_dir(kTestDir).ok());
|
||||
}
|
||||
|
||||
void TearDown() override {}
|
||||
|
||||
std::unique_ptr<MemTracker> _mem_tracker;
|
||||
std::shared_ptr<MemoryFileSystem> _fs;
|
||||
};
|
||||
|
||||
TEST_F(ZoneMapIndexBuilderIntegrationTest, AdaptiveIndexCreation) {
    // The adaptive-check settings are process-wide. Save them and restore them
    // when the test exits, including early exits from a failed ASSERT_*, so
    // that other tests are not affected.
    struct ConfigRestorer {
        const int32_t saved_min_pages = config::string_zonemap_min_pages_for_adaptive_check;
        const double saved_threshold = config::string_zonemap_overlap_threshold;
        ~ConfigRestorer() {
            config::string_zonemap_min_pages_for_adaptive_check = saved_min_pages;
            config::string_zonemap_overlap_threshold = saved_threshold;
        }
    } config_restorer;

    // Set configuration for adaptive behavior
    config::string_zonemap_min_pages_for_adaptive_check = 3;
    config::string_zonemap_overlap_threshold = 0.5;

    // Create a string column writer with zone map enabled
    TabletColumn varchar_column = create_varchar_key(0);
    ColumnWriterOptions opts;
    ColumnMetaPB meta;
    meta.set_column_id(0);
    meta.set_unique_id(0);
    meta.set_type(varchar_column.type());
    meta.set_length(varchar_column.length());
    meta.set_encoding(DEFAULT_ENCODING);
    meta.set_compression(NO_COMPRESSION);
    meta.set_is_nullable(false);
    opts.meta = &meta;
    opts.need_zone_map = true;
    TypeInfoPtr type_info = get_type_info(varchar_column);

    // Create in-memory file system for testing
    auto fs = std::make_shared<MemoryFileSystem>();
    ASSERT_TRUE(fs->create_dir("/tmp").ok());
    ASSIGN_OR_ABORT(auto wfile, fs->new_writable_file("/tmp/zonemap_adaptive_test"));

    auto writer = std::make_unique<ScalarColumnWriter>(opts, type_info, wfile.get());
    ASSERT_TRUE(writer->init().ok());

    // Add 3 pages with low overlap (should result in "Good" decision):
    // page 1 covers a-c, page 2 covers d-f, page 3 covers g-i.
    // The judger is consulted each time a page is finished and can decide once
    // the third page has been fed.
    const char* const pages[][3] = {{"a", "b", "c"}, {"d", "e", "f"}, {"g", "h", "i"}};
    for (const auto& page_values : pages) {
        BinaryColumn page;
        for (const char* value : page_values) {
            page.append(Slice(value));
        }
        ASSERT_TRUE(writer->append(page).ok());
        ASSERT_TRUE(writer->finish_current_page().ok());
    }

    // Finish writing the column
    ASSERT_TRUE(writer->finish().ok());

    // Write the zone map index - this is required to actually write the index to the file
    ASSERT_TRUE(writer->write_zone_map().ok());

    // Close the file
    ASSERT_OK(wfile->close());

    // Verify that the file was created and has content
    // This indicates that the zone map index was created (not skipped by quality judger)
    ASSERT_TRUE(fs->path_exists("/tmp/zonemap_adaptive_test").ok());
    // Check that the file size is greater than 0
    ASSIGN_OR_ABORT(auto file_size, fs->get_file_size("/tmp/zonemap_adaptive_test"));
    ASSERT_GT(file_size, 0);
}
|
||||
|
||||
} // namespace starrocks
|
||||
Loading…
Reference in New Issue