[Enhancement] Full sort use german string for comparison (#62929)

Signed-off-by: satanson <ranpanf@gmail.com>
This commit is contained in:
satanson 2025-09-15 20:23:40 +08:00 committed by GitHub
parent 1ed65e2a6d
commit b956f11544
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 733 additions and 36 deletions

View File

@ -45,4 +45,5 @@ add_library(Column STATIC
column_view/column_view_base.cpp
column_view/column_view_helper.cpp
variant_column.cpp
german_string.cpp
)

View File

@ -51,7 +51,7 @@ template <typename T>
void BinaryColumnBase<T>::append(const Slice& str) {
_bytes.insert(_bytes.end(), str.data, str.data + str.size);
_offsets.emplace_back(_bytes.size());
_slices_cache = false;
invalidate_slice_cache();
}
template <typename T>
@ -79,7 +79,7 @@ void BinaryColumnBase<T>::append(const Column& src, size_t offset, size_t count)
new_offsets[i] += delta;
}
_slices_cache = false;
invalidate_slice_cache();
}
template <typename T>
@ -146,7 +146,7 @@ void BinaryColumnBase<T>::append_selective(const Column& src, const uint32_t* in
}
_offsets.resize(prev_num_offsets + size);
_slices_cache = false;
invalidate_slice_cache();
}
template <typename T>
@ -175,7 +175,7 @@ void BinaryColumnBase<T>::append_value_multiple_times(const Column& src, uint32_
str_size);
}
_slices_cache = false;
invalidate_slice_cache();
}
//TODO(fzh): optimize copy using SIMD
@ -229,7 +229,7 @@ bool BinaryColumnBase<T>::append_strings(const Slice* data, size_t size) {
strings::memcpy_inlined(bytes + offsets[i], p, data[i].size);
}
_slices_cache = false;
invalidate_slice_cache();
return true;
}
@ -284,7 +284,7 @@ bool BinaryColumnBase<T>::append_strings_overflow(const Slice* data, size_t size
_offsets.emplace_back(_bytes.size());
}
}
_slices_cache = false;
invalidate_slice_cache();
return true;
}
@ -305,7 +305,7 @@ bool BinaryColumnBase<T>::append_continuous_strings(const Slice* data, size_t si
_offsets.emplace_back(new_size);
}
DCHECK_EQ(_bytes.size(), new_size);
_slices_cache = false;
invalidate_slice_cache();
return true;
}
@ -359,7 +359,7 @@ void BinaryColumnBase<T>::append_bytes(char* const* data, uint32_t* length, size
for (size_t i = 0; i < size; i++) {
_bytes.insert(_bytes.end(), data[i], data[i] + length[i]);
}
_slices_cache = false;
invalidate_slice_cache();
}
template <typename T, size_t copy_length>
@ -399,7 +399,7 @@ void BinaryColumnBase<T>::append_bytes_overflow(char* const* data, uint32_t* len
} else {
append_bytes(data, lengths, size);
}
_slices_cache = false;
invalidate_slice_cache();
}
template <typename T>
@ -414,7 +414,7 @@ void BinaryColumnBase<T>::append_value_multiple_times(const void* value, size_t
_bytes.insert(_bytes.end(), p, pend);
_offsets.emplace_back(_bytes.size());
}
_slices_cache = false;
invalidate_slice_cache();
}
template <typename T>
@ -436,6 +436,22 @@ void BinaryColumnBase<T>::_build_slices() const {
_slices_cache = true;
}
template <typename T>
void BinaryColumnBase<T>::_build_german_strings() const {
DCHECK(_offsets.size() > 0);
_german_strings_cache = false;
_german_strings.clear();
const auto num_rows = _offsets.size() - 1;
_german_strings.resize(num_rows);
const auto* base = _bytes.data();
for (auto i = 0; i < num_rows; ++i) {
_german_strings[i] = GermanString(base + _offsets[i], _offsets[i + 1] - _offsets[i]);
}
_german_strings_cache = true;
}
template <typename T>
void BinaryColumnBase<T>::fill_default(const Filter& filter) {
std::vector<uint32_t> indexes;
@ -506,7 +522,7 @@ void BinaryColumnBase<T>::assign(size_t n, size_t idx) {
_bytes.insert(_bytes.end(), start, end);
_offsets.emplace_back(_bytes.size());
}
_slices_cache = false;
invalidate_slice_cache();
}
//TODO(kks): improve this
@ -519,7 +535,7 @@ void BinaryColumnBase<T>::remove_first_n_values(size_t count) {
auto* binary_column = down_cast<const BinaryColumnBase<T>*>(column.get());
_offsets = std::move(binary_column->_offsets);
_bytes = std::move(binary_column->_bytes);
_slices_cache = false;
invalidate_slice_cache();
}
template <typename T>

View File

@ -17,6 +17,7 @@
#include "column/bytes.h"
#include "column/column.h"
#include "column/datum.h"
#include "column/german_string.h"
#include "column/vectorized_fwd.h"
#include "common/statusor.h"
#include "gutil/strings/fastmem.h"
@ -48,6 +49,7 @@ public:
};
using Container = Buffer<Slice>;
using GermanStringContainer = Buffer<GermanString>;
using ProxyContainer = BinaryDataProxyContainer;
using ImmContainer = BinaryDataProxyContainer;
@ -304,6 +306,20 @@ public:
return _slices;
}
GermanStringContainer& get_german_strings() {
if (!_german_strings_cache) {
_build_german_strings();
}
return _german_strings;
}
const GermanStringContainer& get_german_strings() const {
if (!_german_strings_cache) {
_build_german_strings();
}
return _german_strings;
}
const BinaryDataProxyContainer& get_proxy_data() const { return _immuable_container; }
Bytes& get_bytes() { return _bytes; }
@ -342,7 +358,10 @@ public:
_slices_cache = false;
}
void invalidate_slice_cache() { _slices_cache = false; }
void invalidate_slice_cache() {
_slices_cache = false;
_german_strings_cache = false;
}
std::string debug_item(size_t idx) const override;
@ -366,12 +385,16 @@ public:
private:
void _build_slices() const;
void _build_german_strings() const;
Bytes _bytes;
Offsets _offsets;
mutable Container _slices;
mutable bool _slices_cache = false;
mutable GermanStringContainer _german_strings;
mutable bool _german_strings_cache = false;
BinaryDataProxyContainer _immuable_container = BinaryDataProxyContainer(*this);
};

View File

@ -0,0 +1,103 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "column/german_string.h"
#include "gutil/strings/fastmem.h"
#include "util/hash_util.hpp"
#include "util/misc.h"
#include "util/raw_container.h"
#include "util/slice.h"
namespace starrocks {
GermanString::GermanString() {
auto* p = reinterpret_cast<char*>(this);
std::fill(p, p + sizeof(GermanString), 0);
}
GermanString::GermanString(const starrocks::GermanString& rhs) {
strings::memcpy_inlined(this, &rhs, sizeof(GermanString));
}
GermanString::GermanString(const char* str, size_t len, void* ptr) {
if (len <= INLINE_MAX_LENGTH) {
auto* p = reinterpret_cast<char*>(this);
std::fill(p, p + sizeof(GermanString), 0);
strings::memcpy_inlined(short_rep.str, str, len);
} else {
strings::memcpy_inlined(long_rep.prefix, str, PREFIX_LENGTH);
strings::memcpy_inlined(ptr, str, len);
long_rep.ptr = reinterpret_cast<uintptr_t>(ptr);
}
this->len = len;
}
GermanString::GermanString(const void* str, size_t len) {
if (len <= INLINE_MAX_LENGTH) {
auto* p = reinterpret_cast<char*>(this);
std::fill(p, p + sizeof(GermanString), 0);
strings::memcpy_inlined(short_rep.str, str, len);
} else {
strings::memcpy_inlined(long_rep.prefix, str, PREFIX_LENGTH);
long_rep.ptr = reinterpret_cast<uintptr_t>(str);
}
this->len = len;
}
GermanString& GermanString::operator=(const Slice& slice) {
*this = GermanString(slice);
return *this;
}
GermanString::GermanString(const GermanString& rhs, void* ptr) {
strings::memcpy_inlined(this, &rhs, sizeof(GermanString));
if (!rhs.is_inline()) {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
const auto* rhs_ptr = reinterpret_cast<const char*>(rhs.long_rep.ptr);
strings::memcpy_inlined(ptr, rhs_ptr, rhs.len);
long_rep.ptr = reinterpret_cast<uintptr_t>(ptr);
}
}
GermanString::operator std::string() const {
if (len <= INLINE_MAX_LENGTH) {
return std::string(short_rep.str, len);
} else {
std::string s;
raw::make_room(&s, len);
char* data = s.data();
// NOLINTNEXTLINE(performance-no-int-to-ptr)
strings::memcpy_inlined(data, reinterpret_cast<const char*>(long_rep.ptr), len);
return s;
}
}
uint32_t GermanString::fnv_hash(uint32_t seed) const {
if (is_inline()) {
return HashUtil::fnv_hash(short_rep.str, len, seed);
} else {
// // NOLINTNEXTLINE(performance-no-int-to-ptr)
return HashUtil::fnv_hash(reinterpret_cast<const char*>(long_rep.ptr), len, seed);
}
}
uint32_t GermanString::crc32_hash(uint32_t seed) const {
if (is_inline()) {
return HashUtil::zlib_crc_hash(short_rep.str, len, seed);
} else {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
return HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(long_rep.ptr), len, seed);
}
}
} // namespace starrocks

View File

@ -0,0 +1,139 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
#include <cstdint>
#include <string>
#include <string_view>
#include <vector>
#include "runtime/mem_pool.h"
#include "runtime/memory/column_allocator.h"
#include "util/memcmp.h"
#include "util/raw_container.h"
#include "util/slice.h"
namespace starrocks {
class Slice;
// A GermanString is a string that can be stored inline or as a pointer to a larger buffer.
class GermanString {
public:
static constexpr uint32_t INLINE_MAX_LENGTH = 12;
static constexpr uint32_t PREFIX_LENGTH = 4;
GermanString();
GermanString(const char* str, size_t len, void* ptr);
GermanString(const void* str, size_t len);
GermanString(const GermanString& rhs, void* ptr);
GermanString(const GermanString& rhs);
explicit GermanString(const Slice& slice) : GermanString(slice.data, slice.size){};
GermanString& operator=(const Slice& slice);
GermanString(const std::string_view& str, void* ptr) : GermanString(str.data(), str.size(), ptr){};
GermanString(const std::string& str);
GermanString& operator=(const std::string& str);
explicit operator std::string() const;
bool is_inline() const { return len <= INLINE_MAX_LENGTH; }
int compare(const GermanString& rhs) const {
auto byte_swap = [](uint32_t v) -> uint32_t {
uint32_t t1 = (v >> 16u) | (v << 16u);
uint32_t t2 = t1 & 0x00ff00ff;
uint32_t t3 = t1 & 0xff00ff00;
return (t2 << 8u) | (t3 >> 8u);
};
auto lhs_prefix = *reinterpret_cast<const uint32_t*>(this->long_rep.prefix);
auto rhs_prefix = *reinterpret_cast<const uint32_t*>(rhs.long_rep.prefix);
if (lhs_prefix != rhs_prefix) {
// if prefix not equal, we can determine the result by prefix
lhs_prefix = byte_swap(lhs_prefix);
rhs_prefix = byte_swap(rhs_prefix);
return (lhs_prefix > rhs_prefix) - (lhs_prefix < rhs_prefix);
}
auto min_len = std::min<uint32_t>(this->len, rhs.len);
auto r = memcompare(this->get_data(), min_len, rhs.get_data(), min_len);
return r != 0 ? r : ((this->len > rhs.len) - (this->len < rhs.len));
}
uint32_t fnv_hash(uint32_t seed) const;
uint32_t crc32_hash(uint32_t seed) const;
std::string to_string() const { return static_cast<std::string>(*this); }
const char* get_data() const {
if (is_inline()) {
return short_rep.str;
} else {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
return reinterpret_cast<const char*>(long_rep.ptr);
}
}
inline bool operator==(const GermanString& rhs) const {
// compare first 8 bytes;
const auto lhs_h0 = *reinterpret_cast<const uint64_t*>(this);
const auto rhs_h0 = *reinterpret_cast<const uint64_t*>(&rhs);
if (lhs_h0 != rhs_h0) {
return false;
}
// compare second 8 bytes;
const auto lhs_h1 = *(reinterpret_cast<const uint64_t*>(this) + 1);
const auto rhs_h1 = *(reinterpret_cast<const uint64_t*>(&rhs) + 1);
if (lhs_h1 == rhs_h1) {
return true;
}
return memcompare(get_data(), this->len, rhs.get_data(), rhs.len) == 0;
}
inline bool operator!=(const GermanString& rhs) const { return !(*this == rhs); }
inline bool operator<(const GermanString& rhs) const { return compare(rhs) < 0; }
inline bool operator<=(const GermanString& rhs) const { return compare(rhs) <= 0; }
inline bool operator>(const GermanString& rhs) const { return compare(rhs) > 0; }
inline bool operator>=(const GermanString& rhs) const { return compare(rhs) >= 0; }
inline bool operator<=>(const GermanString& rhs) const { return compare(rhs); }
friend std::ostream& operator<<(std::ostream& os, const GermanString& gs) {
os << gs.to_string();
return os;
}
alignas(8) union {
uint32_t len;
struct {
uint32_t len;
char str[INLINE_MAX_LENGTH];
} short_rep;
struct {
uint32_t len;
char prefix[PREFIX_LENGTH];
uintptr_t ptr;
} long_rep;
};
};
} // namespace starrocks
namespace std {
static inline std::string to_string(const starrocks::GermanString& gs) {
return static_cast<std::string>(gs);
}
} // namespace std

View File

@ -45,6 +45,7 @@ ChunksSorter::ChunksSorter(RuntimeState* state, const std::vector<ExprContext*>*
_sort_desc(*is_asc, *is_null_first),
_sort_keys(std::move(sort_keys)),
_is_topn(is_topn) {
set_use_german_string(state->enable_full_sort_use_german_string());
DCHECK(_sort_exprs != nullptr);
DCHECK(is_asc != nullptr);
DCHECK(is_null_first != nullptr);
@ -62,6 +63,7 @@ void ChunksSorter::setup_runtime(RuntimeState* state, RuntimeProfile* profile, M
_sort_cnt = ADD_COUNTER(profile, "SortingCnt", TUnit::UNIT);
profile->add_info_string("SortKeys", _sort_keys);
profile->add_info_string("SortType", _is_topn ? "TopN" : "All");
profile->add_info_string("UseGermanString", is_use_german_string() ? "True" : "False");
}
StatusOr<ChunkPtr> ChunksSorter::materialize_chunk_before_sort(Chunk* chunk, TupleDescriptor* materialized_tuple_desc,

View File

@ -138,6 +138,8 @@ public:
virtual size_t reserved_bytes(const ChunkPtr& chunk) { return chunk != nullptr ? chunk->memory_usage() : 0; }
virtual void cancel() {}
bool is_use_german_string() const { return _use_german_string; }
void set_use_german_string(bool value) { this->_use_german_string = value; }
protected:
size_t _get_number_of_order_by_columns() const { return _sort_exprs->size(); }
@ -149,6 +151,7 @@ protected:
const SortDescs _sort_desc;
const std::string _sort_keys;
const bool _is_topn;
bool _use_german_string = false;
RuntimeProfile::Counter* _build_timer = nullptr;
RuntimeProfile::Counter* _sort_timer = nullptr;

View File

@ -21,6 +21,7 @@
#include "column/column_visitor_adapter.h"
#include "column/const_column.h"
#include "column/datum.h"
#include "column/german_string.h"
#include "column/json_column.h"
#include "column/nullable_column.h"
#include "column/vectorized_fwd.h"
@ -172,10 +173,17 @@ public:
template <typename SizeT>
Status do_visit(const BinaryColumnBase<SizeT>& _) {
using ColumnType = const BinaryColumnBase<SizeT>;
using Container = typename BinaryColumnBase<SizeT>::BinaryDataProxyContainer;
auto& left_data = down_cast<const ColumnType*>(_left_col)->get_proxy_data();
auto& right_data = down_cast<const ColumnType*>(_right_col)->get_proxy_data();
return merge_ordinary_column<Container, Slice>(left_data, right_data);
if (use_german_string) {
using Container = typename BinaryColumnBase<SizeT>::GermanStringContainer;
auto& left_data = down_cast<const ColumnType*>(_left_col)->get_german_strings();
auto& right_data = down_cast<const ColumnType*>(_right_col)->get_german_strings();
return merge_ordinary_column<Container, GermanString>(left_data, right_data);
} else {
using Container = typename BinaryColumnBase<SizeT>::BinaryDataProxyContainer;
auto& left_data = down_cast<const ColumnType*>(_left_col)->get_proxy_data();
auto& right_data = down_cast<const ColumnType*>(_right_col)->get_proxy_data();
return merge_ordinary_column<Container, Slice>(left_data, right_data);
}
}
Status do_visit(const NullableColumn& _) {
@ -185,6 +193,7 @@ public:
const auto* lhs_data = down_cast<const NullableColumn*>(_left_col)->data_column().get();
const auto* rhs_data = down_cast<const NullableColumn*>(_right_col)->data_column().get();
MergeTwoColumn merge2({_sort_order, _null_first}, lhs_data, rhs_data, _equal_ranges, _perm);
merge2.set_use_german_string(is_use_german_string());
return lhs_data->accept(&merge2);
}
@ -192,6 +201,9 @@ public:
return do_visit_slow(_);
}
bool is_use_german_string() const { return use_german_string; }
void set_use_german_string(bool value) { use_german_string = value; }
private:
constexpr static uint32_t kLeftIndex = 0;
constexpr static uint32_t kRightIndex = 1;
@ -202,6 +214,7 @@ private:
const Column* _right_col;
std::vector<EqualRange>* _equal_ranges;
Permutation* _perm;
bool use_german_string = false;
};
// MergeTwoChunk merge two chunk in column-wise
@ -270,6 +283,7 @@ public:
const Column* left_col = left_run.get_column(col);
const Column* right_col = right_run.get_column(col);
MergeTwoColumn merge2(sort_desc.get_column_desc(col), left_col, right_col, &equal_ranges, output);
merge2.set_use_german_string(sort_desc.is_use_german_string());
Status st = left_col->accept(&merge2);
CHECK(st.ok());
if (equal_ranges.size() == 0) {

View File

@ -80,7 +80,8 @@ static Status sort_and_tie_helper(const std::atomic<bool>& cancel, const Column*
}
static Status sort_and_tie_column(const std::atomic<bool>& cancel, const Column* column, const SortDesc& sort_desc,
SmallPermutation& permutation, Tie& tie, Ranges&& ranges, bool build_tie);
SmallPermutation& permutation, Tie& tie, Ranges&& ranges, bool build_tie,
const SortDescs* sort_descs = nullptr);
template <class NullPred>
static Status sort_and_tie_helper_nullable(const std::atomic<bool>& cancel, const NullableColumn* column,
@ -170,16 +171,29 @@ public:
DCHECK_GE(column.size(), _permutation.size());
}
using ItemType = InlinePermuteItem<Slice>;
auto cmp = [&](const ItemType& lhs, const ItemType& rhs) -> int {
return lhs.inline_value.compare(rhs.inline_value);
};
if (_use_german_string) {
using ItemType = InlinePermuteItem<GermanString>;
auto cmp = [&](const ItemType& lhs, const ItemType& rhs) -> int {
return lhs.inline_value.compare(rhs.inline_value);
};
auto inlined = create_inline_permutation<Slice, IS_RANGES>(_permutation, column.get_proxy_data());
RETURN_IF_ERROR(sort_and_tie_helper(_cancel, &column, _sort_desc.asc_order(), inlined, _tie, cmp,
_range_or_ranges, _build_tie));
restore_inline_permutation(inlined, _permutation);
auto inlined =
create_inline_permutation<GermanString, IS_RANGES>(_permutation, column.get_german_strings());
RETURN_IF_ERROR(sort_and_tie_helper(_cancel, &column, _sort_desc.asc_order(), inlined, _tie, cmp,
_range_or_ranges, _build_tie));
restore_inline_permutation(inlined, _permutation);
} else {
using ItemType = InlinePermuteItem<Slice>;
auto cmp = [&](const ItemType& lhs, const ItemType& rhs) -> int {
return lhs.inline_value.compare(rhs.inline_value);
};
auto inlined = create_inline_permutation<Slice, IS_RANGES>(_permutation, column.get_proxy_data());
RETURN_IF_ERROR(sort_and_tie_helper(_cancel, &column, _sort_desc.asc_order(), inlined, _tie, cmp,
_range_or_ranges, _build_tie));
restore_inline_permutation(inlined, _permutation);
}
return Status::OK();
}
@ -218,6 +232,8 @@ public:
_build_tie);
}
void use_german_string(bool flag) { _use_german_string = flag; }
private:
const std::atomic<bool>& _cancel;
const SortDesc& _sort_desc;
@ -225,6 +241,7 @@ private:
Tie& _tie;
R _range_or_ranges;
bool _build_tie;
bool _use_german_string = false;
};
// Sort multiple a column from multiple chunks(vertical column)
@ -469,24 +486,33 @@ private:
};
Status sort_and_tie_column(const std::atomic<bool>& cancel, const ColumnPtr& column, const SortDesc& sort_desc,
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, bool build_tie) {
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, bool build_tie,
const SortDescs* sort_descs) {
ColumnSorter column_sorter(cancel, sort_desc, permutation, tie, range, build_tie);
if (sort_descs != nullptr) {
column_sorter.use_german_string(sort_descs->is_use_german_string());
}
return column->accept(&column_sorter);
}
Status sort_and_tie_column(const std::atomic<bool>& cancel, ColumnPtr& column, const SortDesc& sort_desc,
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, bool build_tie) {
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, bool build_tie,
const SortDescs* sort_descs) {
// Nullable column need set all the null rows to default values,
// see the comment of the declaration of `partition_null_and_nonnull_helper` for details.
if (column->is_nullable() && !column->is_constant()) {
ColumnHelper::as_column<NullableColumn>(column)->fill_null_with_default();
}
ColumnSorter column_sorter(cancel, sort_desc, permutation, tie, range, build_tie);
if (sort_descs != nullptr) {
column_sorter.use_german_string(sort_descs->is_use_german_string());
}
return column->accept(&column_sorter);
}
static Status sort_and_tie_column(const std::atomic<bool>& cancel, const Column* column, const SortDesc& sort_desc,
SmallPermutation& permutation, Tie& tie, Ranges&& ranges, bool build_tie) {
SmallPermutation& permutation, Tie& tie, Ranges&& ranges, bool build_tie,
const SortDescs* sort_descs) {
// Nullable column need set all the null rows to default values,
// see the comment of the declaration of `partition_null_and_nonnull_helper` for details.
if (column->is_nullable() && !column->is_constant()) {
@ -494,6 +520,9 @@ static Status sort_and_tie_column(const std::atomic<bool>& cancel, const Column*
down_cast<NullableColumn*>(mutable_col)->fill_null_with_default();
}
ColumnSorter column_sorter(cancel, sort_desc, permutation, tie, std::move(ranges), build_tie);
if (sort_descs != nullptr) {
column_sorter.use_german_string(sort_descs->is_use_german_string());
}
return column->accept(&column_sorter);
}
@ -525,7 +554,7 @@ Status sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& colu
ColumnPtr column = columns[col_index];
bool build_tie = col_index != columns.size() - 1;
RETURN_IF_ERROR(sort_and_tie_column(cancel, column, sort_desc.get_column_desc(col_index), permutation, tie,
range, build_tie));
range, build_tie, &sort_desc));
}
return Status::OK();
@ -534,7 +563,8 @@ Status sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& colu
Status sort_and_tie_columns(const std::atomic<bool>& cancel, const std::vector<const Column*>& columns,
const SortDescs& sort_desc, SmallPermutation& perm,
const std::span<const uint32_t> src_offsets,
const std::vector<std::span<const uint32_t>>& offsets_per_key) {
const std::vector<std::span<const uint32_t>>& offsets_per_key,
const SortDescs* sort_descs) {
if (src_offsets.empty()) {
return Status::OK();
}
@ -584,7 +614,7 @@ Status sort_and_tie_columns(const std::atomic<bool>& cancel, const std::vector<c
const bool build_tie = (col_i != (columns.size() - 1));
shift_perm(offsets_per_key[col_i].data());
RETURN_IF_ERROR(sort_and_tie_column(cancel, column, sort_desc.get_column_desc(col_i), perm, tie,
Ranges(src_offsets, offsets_per_key[col_i]), build_tie));
Ranges(src_offsets, offsets_per_key[col_i]), build_tie, sort_descs));
}
shift_perm(src_offsets.data());

View File

@ -17,6 +17,7 @@
#include <algorithm>
#include <concepts>
#include "column/german_string.h"
#include "column/nullable_column.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
@ -52,6 +53,16 @@ struct SorterComparator<Slice> {
}
};
template <>
struct SorterComparator<GermanString> {
static int compare(const GermanString& lhs, const GermanString& rhs) {
int x = lhs.compare(rhs);
if (x > 0) return 1;
if (x < 0) return -1;
return x;
}
};
template <>
struct SorterComparator<DateValue> {
static int compare(const DateValue& lhs, const DateValue& rhs) {

View File

@ -35,9 +35,11 @@ struct SortDescs;
// @param tie input and output tie
// @param range sort range, {0, 0} means not build tie but sort data
Status sort_and_tie_column(const std::atomic<bool>& cancel, ColumnPtr& column, const SortDesc& sort_desc,
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, const bool build_tie);
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, const bool build_tie,
const SortDescs* sort_descs = nullptr);
Status sort_and_tie_column(const std::atomic<bool>& cancel, const ColumnPtr& column, const SortDesc& sort_desc,
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, const bool build_tie);
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, const bool build_tie,
const SortDescs* sort_descs = nullptr);
// Sort multiple columns using column-wise algorithm, the final order is stored in the Permutation argument
Status sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& columns, const SortDescs& sort_desc,
@ -60,7 +62,8 @@ Status sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& colu
Status sort_and_tie_columns(const std::atomic<bool>& cancel, const std::vector<const Column*>& columns,
const SortDescs& sort_desc, SmallPermutation& perm,
const std::span<const uint32_t> src_offsets,
const std::vector<std::span<const uint32_t>>& offsets_per_key);
const std::vector<std::span<const uint32_t>>& offsets_per_key,
const SortDescs* sort_descs = nullptr);
// Sort multiple columns, and stable
Status stable_sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& columns, const SortDescs& sort_desc,
@ -105,6 +108,7 @@ struct SortDesc {
};
struct SortDescs {
std::vector<SortDesc> descs;
mutable bool use_german_string = false;
SortDescs() = default;
~SortDescs() = default;
@ -136,6 +140,8 @@ struct SortDescs {
size_t num_columns() const { return descs.size(); }
SortDesc get_column_desc(int col) const { return descs[col]; }
bool is_use_german_string() const { return use_german_string; }
void set_use_german_string(bool value) const { use_german_string = value; }
};
} // namespace starrocks

View File

@ -362,6 +362,11 @@ public:
return _spill_options->spill_partitionwise_agg_skew_elimination;
}
bool enable_full_sort_use_german_string() const {
return _query_options.__isset.enable_full_sort_use_german_string &&
_query_options.enable_full_sort_use_german_string;
}
int32_t spill_mem_table_size() const {
return EXTRACE_SPILL_PARAM(_query_options, _spill_options, spill_mem_table_size);
}

View File

@ -14,6 +14,8 @@
#pragma once
#include <util/stack_util.h>
#include <cstdint>
#include <functional>
@ -22,4 +24,21 @@ namespace starrocks {
// take a sleep with small intervals until time out by `sleep_secs` or the `stop_condition()` is true
void nap_sleep(int32_t sleep_secs, const std::function<bool()>& stop_condition);
#if defined(__GNUC__) || defined(__clang__)
#define NOT_SUPPORT() \
do { \
throw std::runtime_error(std::string("Not support method '") + __PRETTY_FUNCTION__ + \
"': " + get_stack_trace()); \
} while (0);
#elif defined(_MSC_VER)
#define NOT_SUPPORT() \
do { \
throw std::runtime_error(std::string("Not support method '") + __FUNCSIG__ + "': " + get_stack_trace()); \
} while (0);
#else
#define NOT_SUPPORT() \
do { \
throw std::runtime_error(std::string("Not support method '") + __func__ + "': " + get_stack_trace()); \
} while (0);
#endif
} // namespace starrocks

View File

@ -7,6 +7,7 @@ set(EXEC_FILES
./column/column_view_test.cpp
./column/array_view_column_test.cpp
./column/binary_column_test.cpp
./column/german_string_test.cpp
./column/chunk_test.cpp
./column/column_helper_test.cpp
./column/const_column_test.cpp

View File

@ -0,0 +1,212 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "column/german_string.h"
#include <gtest/gtest.h>
#include <iostream>
#include <random>
#include <string>
#include <tuple>
#include "column/binary_column.h"
#include "column/column_helper.h"
#include "column/const_column.h"
#include "column/fixed_length_column.h"
#include "column/nullable_column.h"
#include "column/vectorized_fwd.h"
#include "testutil/parallel_test.h"
namespace starrocks {
class GermanStringExternalAllocator {
public:
static constexpr auto PAGE_SIZE = 4096;
static constexpr auto MEDIUM_STRING_MAX_SIZE = 512;
virtual ~GermanStringExternalAllocator() = default;
size_t size() const;
void clear();
char* allocate(size_t n);
virtual void incorporate(std::shared_ptr<GermanStringExternalAllocator>&& other);
bool is_from_binary() const { return false; }
private:
std::vector<std::vector<char>> medium_string_pages;
std::vector<std::string> large_strings;
};
size_t GermanStringExternalAllocator::size() const {
size_t total_size = 0;
for (const auto& page : medium_string_pages) {
total_size += page.size();
}
for (const auto& str : large_strings) {
total_size += str.size();
}
return total_size;
}
void GermanStringExternalAllocator::clear() {
large_strings.clear();
medium_string_pages.clear();
}
char* GermanStringExternalAllocator::allocate(size_t n) {
if (n <= GermanString::INLINE_MAX_LENGTH) {
return nullptr;
}
if (n <= MEDIUM_STRING_MAX_SIZE) {
if (medium_string_pages.empty() || medium_string_pages.back().size() + n > PAGE_SIZE) {
medium_string_pages.emplace_back();
medium_string_pages.back().reserve(PAGE_SIZE);
}
auto& page = medium_string_pages.back();
raw::stl_vector_resize_uninitialized(&page, page.size() + n);
return page.data() + page.size() - n;
} else {
large_strings.emplace_back();
auto& str = large_strings.back();
raw::make_room(&str, n);
return large_strings.back().data();
}
}
void GermanStringExternalAllocator::incorporate(std::shared_ptr<GermanStringExternalAllocator>&& other) {
DCHECK(!other->is_from_binary());
large_strings.insert(large_strings.end(), std::make_move_iterator(other->large_strings.begin()),
std::make_move_iterator(other->large_strings.end()));
medium_string_pages.insert(medium_string_pages.end(), std::make_move_iterator(other->medium_string_pages.begin()),
std::make_move_iterator(other->medium_string_pages.end()));
other->clear();
}
static inline std::string gen_string(std::size_t length) {
const std::string chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789";
std::random_device rd; // For seeding
std::mt19937 gen(rd()); // Mersenne Twister RNG
std::uniform_int_distribution<> dist(0, chars.size() - 1);
std::string result;
result.reserve(length);
for (std::size_t i = 0; i < length; ++i) {
result += chars[dist(gen)];
}
return result;
}
TEST(GermanStringTest, test_german_string) {
GermanStringExternalAllocator allocator;
std::vector<std::string> strings{gen_string(0), gen_string(1), gen_string(3), gen_string(4),
gen_string(5), gen_string(11), gen_string(12), gen_string(13),
gen_string(15), gen_string(4096), gen_string(4097)};
for (const auto& s : strings) {
GermanString gs1(s.data(), s.size(), allocator.allocate(s.size()));
auto s1 = static_cast<std::string>(gs1);
ASSERT_EQ(s, s1);
GermanString gs2(gs1, allocator.allocate(gs1.len));
auto s2 = static_cast<std::string>(gs2);
ASSERT_EQ(s, s2);
}
}
TEST(GermanStringTest, test_german_string_compare) {
GermanStringExternalAllocator allocator;
// short vs short
{
GermanString gs1("Hello", 5, allocator.allocate(5));
GermanString gs2("Hello", 5, allocator.allocate(5));
GermanString gs3("Hello!", 6, allocator.allocate(6));
GermanString gs4("Helloi", 6, allocator.allocate(6));
ASSERT_TRUE(gs1 == gs2);
ASSERT_FALSE(gs1 == gs3);
ASSERT_FALSE(gs1 == gs4);
ASSERT_FALSE(gs3 == gs4);
ASSERT_FALSE(gs3 == gs1);
ASSERT_FALSE(gs4 == gs1);
ASSERT_FALSE(gs4 == gs3);
ASSERT_TRUE(gs1.compare(gs2) == 0);
ASSERT_TRUE(gs1 < gs3);
ASSERT_TRUE(gs3 < gs4);
ASSERT_TRUE(gs4 > gs1);
}
// long vs long
{
GermanString gs1("Hello GermanString", 18, allocator.allocate(18));
GermanString gs2("Hello GermanString", 18, allocator.allocate(18));
GermanString gs3("Hello GermanString!", 19, allocator.allocate(19));
GermanString gs4("Hello GermanStringi", 19, allocator.allocate(19));
ASSERT_TRUE(gs1 == gs2);
ASSERT_FALSE(gs1 == gs3);
ASSERT_FALSE(gs1 == gs4);
ASSERT_FALSE(gs3 == gs4);
ASSERT_FALSE(gs3 == gs1);
ASSERT_FALSE(gs4 == gs1);
ASSERT_FALSE(gs4 == gs3);
ASSERT_TRUE(gs1.compare(gs2) == 0);
ASSERT_TRUE(gs1 < gs3);
ASSERT_TRUE(gs3 < gs4);
ASSERT_TRUE(gs4 > gs1);
}
// short vs long
{
GermanString gs1("Hello", 5, allocator.allocate(5));
GermanString gs2("Hello GermanString", 18, allocator.allocate(18));
GermanString gs3("Hello GermanString!", 19, allocator.allocate(19));
GermanString gs4("Hello GermanStringi", 19, allocator.allocate(19));
ASSERT_TRUE(gs1 < gs2);
ASSERT_TRUE(gs1 < gs3);
ASSERT_TRUE(gs1 < gs4);
ASSERT_FALSE(gs2 < gs1);
ASSERT_FALSE(gs3 < gs1);
ASSERT_FALSE(gs4 < gs1);
ASSERT_TRUE(gs2 > gs1);
ASSERT_TRUE(gs3 > gs1);
ASSERT_TRUE(gs4 > gs1);
}
{
GermanString gs0("", 0, allocator.allocate(0));
GermanString gs4("abcd", 4, allocator.allocate(4));
GermanString gs12("abcdabcdabcd", 12, allocator.allocate(12));
GermanString gs16("abcdabcdabcd", 16, allocator.allocate(16));
ASSERT_TRUE(gs0 < gs4);
ASSERT_TRUE(gs0 < gs12);
ASSERT_TRUE(gs0 < gs16);
ASSERT_TRUE(gs4 < gs12);
ASSERT_TRUE(gs4 < gs16);
ASSERT_TRUE(gs12 < gs16);
ASSERT_TRUE(gs0 == gs0);
ASSERT_TRUE(gs4 == gs4);
ASSERT_TRUE(gs12 == gs12);
ASSERT_TRUE(gs16 == gs16);
ASSERT_TRUE(gs0 != gs4);
ASSERT_TRUE(gs0 != gs12);
ASSERT_TRUE(gs0 != gs16);
ASSERT_TRUE(gs4 != gs12);
ASSERT_TRUE(gs4 != gs16);
ASSERT_TRUE(gs12 != gs16);
}
}
} // namespace starrocks

View File

@ -957,6 +957,8 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
public static final String ENABLE_DESENSITIZE_EXPLAIN = "enable_desensitize_explain";
public static final String ENABLE_FULL_SORT_USE_GERMAN_STRING = "enable_full_sort_use_german_string";
public static final List<String> DEPRECATED_VARIABLES = ImmutableList.<String>builder()
.add(CODEGEN_LEVEL)
.add(MAX_EXECUTION_TIME)
@ -1960,6 +1962,9 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
this.enableDesensitizeExplain = enableDesensitizeExplain;
}
@VarAttr(name = ENABLE_FULL_SORT_USE_GERMAN_STRING)
private boolean enableFullSortUseGermanString = true;
public int getCboPruneJsonSubfieldDepth() {
return cboPruneJsonSubfieldDepth;
}
@ -5317,6 +5322,13 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
this.cboJSONV2DictOpt = value;
}
public void setEnableFullSortUseGermanString(boolean value) {
this.enableFullSortUseGermanString = value;
}
public boolean isEnableFullSortUseGermanString() {
return this.enableFullSortUseGermanString;
}
// Serialize to thrift object
// used for rest api
public TQueryOptions toThrift() {
@ -5339,7 +5351,7 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
tResult.setRuntime_profile_report_interval(runtimeProfileReportInterval);
tResult.setBatch_size(chunkSize);
tResult.setLoad_mem_limit(loadMemLimit);
tResult.setEnable_full_sort_use_german_string(enableFullSortUseGermanString);
if (maxScanKeyNum > -1) {
tResult.setMax_scan_key_num(maxScanKeyNum);
}

View File

@ -347,6 +347,8 @@ struct TQueryOptions {
190: optional i64 column_view_concat_rows_limit;
191: optional i64 column_view_concat_bytes_limit;
200: optional bool enable_full_sort_use_german_string;
}
// A scan range plus the parameters needed to execute that scan.
@ -410,6 +412,7 @@ struct TPlanFragmentExecParams {
// Debug options: perform some action in a particular phase of a particular node
74: optional list<TExecDebugOption> exec_debug_options
}
// Global query parameters assigned by the coordinator.

View File

@ -0,0 +1,60 @@
-- name: test_full_sort_use_german_string
DROP TABLE if exists t0;
-- result:
-- !result
CREATE TABLE if not exists t0
(
c0 VARCHAR(255) NOT NULL,
c1 VARCHAR(255) NOT NULL,
c2 VARCHAR(255) NOT NULL,
c3 VARCHAR(255) NOT NULL
) ENGINE=OLAP
DUPLICATE KEY(`c0` )
COMMENT "OLAP"
DISTRIBUTED BY HASH(`c0` ) BUCKETS 1
PROPERTIES(
"replication_num" = "1",
"in_memory" = "false",
"storage_format" = "default"
);
-- result:
-- !result
insert into t0
select
concat("",i,"_abc") as c0,
concat("foo_", 100000 - i) as c1,
concat("bar_", ((50000 - i)/10)*((50000 - i)/10)) as c2,
concat("", ((50000 - i)/10)*((50000 - i)/10),"_bar") as c3
from table(generate_series(1,100000)) gs(i);
-- result:
-- !result
with cte as(select c0, count(1) over(partition by c0) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0, cnt)) as fingerprint from cte;
-- result:
380598722181
-- !result
with cte as(select c0, count(1) over(partition by c0) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0, cnt)) as fingerprint from cte;
-- result:
380598722181
-- !result
with cte as(select c0,c1, count(1) over(partition by c0,c1) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0,c1, cnt)) as fingerprint from cte;
-- result:
188259655263
-- !result
with cte as(select c0,c1, count(1) over(partition by c0,c1) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0,c1, cnt)) as fingerprint from cte;
-- result:
188259655263
-- !result
with cte as(select c0,c1,c2, count(1) over(partition by c0,c1,c2) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0,c1,c2, cnt)) as fingerprint from cte;
-- result:
-418370384866
-- !result
with cte as(select c0,c1,c2, count(1) over(partition by c0,c1,c2) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0,c1,c2, cnt)) as fingerprint from cte;
-- result:
-418370384866
-- !result

View File

@ -0,0 +1,37 @@
-- name: test_full_sort_use_german_string
DROP TABLE if exists t0;
CREATE TABLE if not exists t0
(
c0 VARCHAR(255) NOT NULL,
c1 VARCHAR(255) NOT NULL,
c2 VARCHAR(255) NOT NULL,
c3 VARCHAR(255) NOT NULL
) ENGINE=OLAP
DUPLICATE KEY(`c0` )
COMMENT "OLAP"
DISTRIBUTED BY HASH(`c0` ) BUCKETS 1
PROPERTIES(
"replication_num" = "1",
"in_memory" = "false",
"storage_format" = "default"
);
insert into t0
select
concat("",i,"_abc") as c0,
concat("foo_", 100000 - i) as c1,
concat("bar_", ((50000 - i)/10)*((50000 - i)/10)) as c2,
concat("", ((50000 - i)/10)*((50000 - i)/10),"_bar") as c3
from table(generate_series(1,100000)) gs(i);
with cte as(select c0, count(1) over(partition by c0) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0, cnt)) as fingerprint from cte;
with cte as(select c0, count(1) over(partition by c0) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0, cnt)) as fingerprint from cte;
with cte as(select c0,c1, count(1) over(partition by c0,c1) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0,c1, cnt)) as fingerprint from cte;
with cte as(select c0,c1, count(1) over(partition by c0,c1) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0,c1, cnt)) as fingerprint from cte;
with cte as(select c0,c1,c2, count(1) over(partition by c0,c1,c2) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0,c1,c2, cnt)) as fingerprint from cte;
with cte as(select c0,c1,c2, count(1) over(partition by c0,c1,c2) as cnt from t0)
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0,c1,c2, cnt)) as fingerprint from cte;