[Enhancement] Full sort use german string for comparison (#62929)
Signed-off-by: satanson <ranpanf@gmail.com>
This commit is contained in:
parent
1ed65e2a6d
commit
b956f11544
|
|
@ -45,4 +45,5 @@ add_library(Column STATIC
|
|||
column_view/column_view_base.cpp
|
||||
column_view/column_view_helper.cpp
|
||||
variant_column.cpp
|
||||
german_string.cpp
|
||||
)
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ template <typename T>
|
|||
void BinaryColumnBase<T>::append(const Slice& str) {
|
||||
_bytes.insert(_bytes.end(), str.data, str.data + str.size);
|
||||
_offsets.emplace_back(_bytes.size());
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -79,7 +79,7 @@ void BinaryColumnBase<T>::append(const Column& src, size_t offset, size_t count)
|
|||
new_offsets[i] += delta;
|
||||
}
|
||||
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -146,7 +146,7 @@ void BinaryColumnBase<T>::append_selective(const Column& src, const uint32_t* in
|
|||
}
|
||||
_offsets.resize(prev_num_offsets + size);
|
||||
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -175,7 +175,7 @@ void BinaryColumnBase<T>::append_value_multiple_times(const Column& src, uint32_
|
|||
str_size);
|
||||
}
|
||||
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
//TODO(fzh): optimize copy using SIMD
|
||||
|
|
@ -229,7 +229,7 @@ bool BinaryColumnBase<T>::append_strings(const Slice* data, size_t size) {
|
|||
strings::memcpy_inlined(bytes + offsets[i], p, data[i].size);
|
||||
}
|
||||
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -284,7 +284,7 @@ bool BinaryColumnBase<T>::append_strings_overflow(const Slice* data, size_t size
|
|||
_offsets.emplace_back(_bytes.size());
|
||||
}
|
||||
}
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -305,7 +305,7 @@ bool BinaryColumnBase<T>::append_continuous_strings(const Slice* data, size_t si
|
|||
_offsets.emplace_back(new_size);
|
||||
}
|
||||
DCHECK_EQ(_bytes.size(), new_size);
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -359,7 +359,7 @@ void BinaryColumnBase<T>::append_bytes(char* const* data, uint32_t* length, size
|
|||
for (size_t i = 0; i < size; i++) {
|
||||
_bytes.insert(_bytes.end(), data[i], data[i] + length[i]);
|
||||
}
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
template <typename T, size_t copy_length>
|
||||
|
|
@ -399,7 +399,7 @@ void BinaryColumnBase<T>::append_bytes_overflow(char* const* data, uint32_t* len
|
|||
} else {
|
||||
append_bytes(data, lengths, size);
|
||||
}
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -414,7 +414,7 @@ void BinaryColumnBase<T>::append_value_multiple_times(const void* value, size_t
|
|||
_bytes.insert(_bytes.end(), p, pend);
|
||||
_offsets.emplace_back(_bytes.size());
|
||||
}
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -436,6 +436,22 @@ void BinaryColumnBase<T>::_build_slices() const {
|
|||
_slices_cache = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void BinaryColumnBase<T>::_build_german_strings() const {
|
||||
DCHECK(_offsets.size() > 0);
|
||||
_german_strings_cache = false;
|
||||
_german_strings.clear();
|
||||
|
||||
const auto num_rows = _offsets.size() - 1;
|
||||
_german_strings.resize(num_rows);
|
||||
|
||||
const auto* base = _bytes.data();
|
||||
for (auto i = 0; i < num_rows; ++i) {
|
||||
_german_strings[i] = GermanString(base + _offsets[i], _offsets[i + 1] - _offsets[i]);
|
||||
}
|
||||
_german_strings_cache = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void BinaryColumnBase<T>::fill_default(const Filter& filter) {
|
||||
std::vector<uint32_t> indexes;
|
||||
|
|
@ -506,7 +522,7 @@ void BinaryColumnBase<T>::assign(size_t n, size_t idx) {
|
|||
_bytes.insert(_bytes.end(), start, end);
|
||||
_offsets.emplace_back(_bytes.size());
|
||||
}
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
//TODO(kks): improve this
|
||||
|
|
@ -519,7 +535,7 @@ void BinaryColumnBase<T>::remove_first_n_values(size_t count) {
|
|||
auto* binary_column = down_cast<const BinaryColumnBase<T>*>(column.get());
|
||||
_offsets = std::move(binary_column->_offsets);
|
||||
_bytes = std::move(binary_column->_bytes);
|
||||
_slices_cache = false;
|
||||
invalidate_slice_cache();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
#include "column/bytes.h"
|
||||
#include "column/column.h"
|
||||
#include "column/datum.h"
|
||||
#include "column/german_string.h"
|
||||
#include "column/vectorized_fwd.h"
|
||||
#include "common/statusor.h"
|
||||
#include "gutil/strings/fastmem.h"
|
||||
|
|
@ -48,6 +49,7 @@ public:
|
|||
};
|
||||
|
||||
using Container = Buffer<Slice>;
|
||||
using GermanStringContainer = Buffer<GermanString>;
|
||||
using ProxyContainer = BinaryDataProxyContainer;
|
||||
using ImmContainer = BinaryDataProxyContainer;
|
||||
|
||||
|
|
@ -304,6 +306,20 @@ public:
|
|||
return _slices;
|
||||
}
|
||||
|
||||
GermanStringContainer& get_german_strings() {
|
||||
if (!_german_strings_cache) {
|
||||
_build_german_strings();
|
||||
}
|
||||
return _german_strings;
|
||||
}
|
||||
|
||||
const GermanStringContainer& get_german_strings() const {
|
||||
if (!_german_strings_cache) {
|
||||
_build_german_strings();
|
||||
}
|
||||
return _german_strings;
|
||||
}
|
||||
|
||||
const BinaryDataProxyContainer& get_proxy_data() const { return _immuable_container; }
|
||||
|
||||
Bytes& get_bytes() { return _bytes; }
|
||||
|
|
@ -342,7 +358,10 @@ public:
|
|||
_slices_cache = false;
|
||||
}
|
||||
|
||||
void invalidate_slice_cache() { _slices_cache = false; }
|
||||
void invalidate_slice_cache() {
|
||||
_slices_cache = false;
|
||||
_german_strings_cache = false;
|
||||
}
|
||||
|
||||
std::string debug_item(size_t idx) const override;
|
||||
|
||||
|
|
@ -366,12 +385,16 @@ public:
|
|||
|
||||
private:
|
||||
void _build_slices() const;
|
||||
void _build_german_strings() const;
|
||||
|
||||
Bytes _bytes;
|
||||
Offsets _offsets;
|
||||
|
||||
mutable Container _slices;
|
||||
mutable bool _slices_cache = false;
|
||||
mutable GermanStringContainer _german_strings;
|
||||
mutable bool _german_strings_cache = false;
|
||||
|
||||
BinaryDataProxyContainer _immuable_container = BinaryDataProxyContainer(*this);
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,103 @@
|
|||
// Copyright 2021-present StarRocks, Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "column/german_string.h"
|
||||
|
||||
#include "gutil/strings/fastmem.h"
|
||||
#include "util/hash_util.hpp"
|
||||
#include "util/misc.h"
|
||||
#include "util/raw_container.h"
|
||||
#include "util/slice.h"
|
||||
|
||||
namespace starrocks {
|
||||
|
||||
GermanString::GermanString() {
|
||||
auto* p = reinterpret_cast<char*>(this);
|
||||
std::fill(p, p + sizeof(GermanString), 0);
|
||||
}
|
||||
|
||||
GermanString::GermanString(const starrocks::GermanString& rhs) {
|
||||
strings::memcpy_inlined(this, &rhs, sizeof(GermanString));
|
||||
}
|
||||
|
||||
GermanString::GermanString(const char* str, size_t len, void* ptr) {
|
||||
if (len <= INLINE_MAX_LENGTH) {
|
||||
auto* p = reinterpret_cast<char*>(this);
|
||||
std::fill(p, p + sizeof(GermanString), 0);
|
||||
strings::memcpy_inlined(short_rep.str, str, len);
|
||||
} else {
|
||||
strings::memcpy_inlined(long_rep.prefix, str, PREFIX_LENGTH);
|
||||
strings::memcpy_inlined(ptr, str, len);
|
||||
long_rep.ptr = reinterpret_cast<uintptr_t>(ptr);
|
||||
}
|
||||
this->len = len;
|
||||
}
|
||||
|
||||
GermanString::GermanString(const void* str, size_t len) {
|
||||
if (len <= INLINE_MAX_LENGTH) {
|
||||
auto* p = reinterpret_cast<char*>(this);
|
||||
std::fill(p, p + sizeof(GermanString), 0);
|
||||
strings::memcpy_inlined(short_rep.str, str, len);
|
||||
} else {
|
||||
strings::memcpy_inlined(long_rep.prefix, str, PREFIX_LENGTH);
|
||||
long_rep.ptr = reinterpret_cast<uintptr_t>(str);
|
||||
}
|
||||
this->len = len;
|
||||
}
|
||||
GermanString& GermanString::operator=(const Slice& slice) {
|
||||
*this = GermanString(slice);
|
||||
return *this;
|
||||
}
|
||||
|
||||
GermanString::GermanString(const GermanString& rhs, void* ptr) {
|
||||
strings::memcpy_inlined(this, &rhs, sizeof(GermanString));
|
||||
if (!rhs.is_inline()) {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
const auto* rhs_ptr = reinterpret_cast<const char*>(rhs.long_rep.ptr);
|
||||
strings::memcpy_inlined(ptr, rhs_ptr, rhs.len);
|
||||
long_rep.ptr = reinterpret_cast<uintptr_t>(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
GermanString::operator std::string() const {
|
||||
if (len <= INLINE_MAX_LENGTH) {
|
||||
return std::string(short_rep.str, len);
|
||||
} else {
|
||||
std::string s;
|
||||
raw::make_room(&s, len);
|
||||
char* data = s.data();
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
strings::memcpy_inlined(data, reinterpret_cast<const char*>(long_rep.ptr), len);
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t GermanString::fnv_hash(uint32_t seed) const {
|
||||
if (is_inline()) {
|
||||
return HashUtil::fnv_hash(short_rep.str, len, seed);
|
||||
} else {
|
||||
// // NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
return HashUtil::fnv_hash(reinterpret_cast<const char*>(long_rep.ptr), len, seed);
|
||||
}
|
||||
}
|
||||
uint32_t GermanString::crc32_hash(uint32_t seed) const {
|
||||
if (is_inline()) {
|
||||
return HashUtil::zlib_crc_hash(short_rep.str, len, seed);
|
||||
} else {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
return HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(long_rep.ptr), len, seed);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace starrocks
|
||||
|
|
@ -0,0 +1,139 @@
|
|||
// Copyright 2021-present StarRocks, Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "runtime/mem_pool.h"
|
||||
#include "runtime/memory/column_allocator.h"
|
||||
#include "util/memcmp.h"
|
||||
#include "util/raw_container.h"
|
||||
#include "util/slice.h"
|
||||
|
||||
namespace starrocks {
|
||||
class Slice;
|
||||
|
||||
// A GermanString is a string that can be stored inline or as a pointer to a larger buffer.
|
||||
|
||||
class GermanString {
|
||||
public:
|
||||
static constexpr uint32_t INLINE_MAX_LENGTH = 12;
|
||||
static constexpr uint32_t PREFIX_LENGTH = 4;
|
||||
GermanString();
|
||||
GermanString(const char* str, size_t len, void* ptr);
|
||||
GermanString(const void* str, size_t len);
|
||||
GermanString(const GermanString& rhs, void* ptr);
|
||||
GermanString(const GermanString& rhs);
|
||||
|
||||
explicit GermanString(const Slice& slice) : GermanString(slice.data, slice.size){};
|
||||
GermanString& operator=(const Slice& slice);
|
||||
|
||||
GermanString(const std::string_view& str, void* ptr) : GermanString(str.data(), str.size(), ptr){};
|
||||
GermanString(const std::string& str);
|
||||
GermanString& operator=(const std::string& str);
|
||||
|
||||
explicit operator std::string() const;
|
||||
|
||||
bool is_inline() const { return len <= INLINE_MAX_LENGTH; }
|
||||
int compare(const GermanString& rhs) const {
|
||||
auto byte_swap = [](uint32_t v) -> uint32_t {
|
||||
uint32_t t1 = (v >> 16u) | (v << 16u);
|
||||
uint32_t t2 = t1 & 0x00ff00ff;
|
||||
uint32_t t3 = t1 & 0xff00ff00;
|
||||
return (t2 << 8u) | (t3 >> 8u);
|
||||
};
|
||||
|
||||
auto lhs_prefix = *reinterpret_cast<const uint32_t*>(this->long_rep.prefix);
|
||||
auto rhs_prefix = *reinterpret_cast<const uint32_t*>(rhs.long_rep.prefix);
|
||||
if (lhs_prefix != rhs_prefix) {
|
||||
// if prefix not equal, we can determine the result by prefix
|
||||
lhs_prefix = byte_swap(lhs_prefix);
|
||||
rhs_prefix = byte_swap(rhs_prefix);
|
||||
return (lhs_prefix > rhs_prefix) - (lhs_prefix < rhs_prefix);
|
||||
}
|
||||
|
||||
auto min_len = std::min<uint32_t>(this->len, rhs.len);
|
||||
auto r = memcompare(this->get_data(), min_len, rhs.get_data(), min_len);
|
||||
return r != 0 ? r : ((this->len > rhs.len) - (this->len < rhs.len));
|
||||
}
|
||||
uint32_t fnv_hash(uint32_t seed) const;
|
||||
uint32_t crc32_hash(uint32_t seed) const;
|
||||
|
||||
std::string to_string() const { return static_cast<std::string>(*this); }
|
||||
|
||||
const char* get_data() const {
|
||||
if (is_inline()) {
|
||||
return short_rep.str;
|
||||
} else {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
return reinterpret_cast<const char*>(long_rep.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
inline bool operator==(const GermanString& rhs) const {
|
||||
// compare first 8 bytes;
|
||||
const auto lhs_h0 = *reinterpret_cast<const uint64_t*>(this);
|
||||
const auto rhs_h0 = *reinterpret_cast<const uint64_t*>(&rhs);
|
||||
if (lhs_h0 != rhs_h0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// compare second 8 bytes;
|
||||
const auto lhs_h1 = *(reinterpret_cast<const uint64_t*>(this) + 1);
|
||||
const auto rhs_h1 = *(reinterpret_cast<const uint64_t*>(&rhs) + 1);
|
||||
if (lhs_h1 == rhs_h1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return memcompare(get_data(), this->len, rhs.get_data(), rhs.len) == 0;
|
||||
}
|
||||
|
||||
inline bool operator!=(const GermanString& rhs) const { return !(*this == rhs); }
|
||||
inline bool operator<(const GermanString& rhs) const { return compare(rhs) < 0; }
|
||||
inline bool operator<=(const GermanString& rhs) const { return compare(rhs) <= 0; }
|
||||
inline bool operator>(const GermanString& rhs) const { return compare(rhs) > 0; }
|
||||
inline bool operator>=(const GermanString& rhs) const { return compare(rhs) >= 0; }
|
||||
inline bool operator<=>(const GermanString& rhs) const { return compare(rhs); }
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const GermanString& gs) {
|
||||
os << gs.to_string();
|
||||
return os;
|
||||
}
|
||||
alignas(8) union {
|
||||
uint32_t len;
|
||||
struct {
|
||||
uint32_t len;
|
||||
char str[INLINE_MAX_LENGTH];
|
||||
} short_rep;
|
||||
struct {
|
||||
uint32_t len;
|
||||
char prefix[PREFIX_LENGTH];
|
||||
uintptr_t ptr;
|
||||
} long_rep;
|
||||
};
|
||||
};
|
||||
} // namespace starrocks
|
||||
|
||||
namespace std {
|
||||
static inline std::string to_string(const starrocks::GermanString& gs) {
|
||||
return static_cast<std::string>(gs);
|
||||
}
|
||||
|
||||
} // namespace std
|
||||
|
|
@ -45,6 +45,7 @@ ChunksSorter::ChunksSorter(RuntimeState* state, const std::vector<ExprContext*>*
|
|||
_sort_desc(*is_asc, *is_null_first),
|
||||
_sort_keys(std::move(sort_keys)),
|
||||
_is_topn(is_topn) {
|
||||
set_use_german_string(state->enable_full_sort_use_german_string());
|
||||
DCHECK(_sort_exprs != nullptr);
|
||||
DCHECK(is_asc != nullptr);
|
||||
DCHECK(is_null_first != nullptr);
|
||||
|
|
@ -62,6 +63,7 @@ void ChunksSorter::setup_runtime(RuntimeState* state, RuntimeProfile* profile, M
|
|||
_sort_cnt = ADD_COUNTER(profile, "SortingCnt", TUnit::UNIT);
|
||||
profile->add_info_string("SortKeys", _sort_keys);
|
||||
profile->add_info_string("SortType", _is_topn ? "TopN" : "All");
|
||||
profile->add_info_string("UseGermanString", is_use_german_string() ? "True" : "False");
|
||||
}
|
||||
|
||||
StatusOr<ChunkPtr> ChunksSorter::materialize_chunk_before_sort(Chunk* chunk, TupleDescriptor* materialized_tuple_desc,
|
||||
|
|
|
|||
|
|
@ -138,6 +138,8 @@ public:
|
|||
virtual size_t reserved_bytes(const ChunkPtr& chunk) { return chunk != nullptr ? chunk->memory_usage() : 0; }
|
||||
|
||||
virtual void cancel() {}
|
||||
bool is_use_german_string() const { return _use_german_string; }
|
||||
void set_use_german_string(bool value) { this->_use_german_string = value; }
|
||||
|
||||
protected:
|
||||
size_t _get_number_of_order_by_columns() const { return _sort_exprs->size(); }
|
||||
|
|
@ -149,6 +151,7 @@ protected:
|
|||
const SortDescs _sort_desc;
|
||||
const std::string _sort_keys;
|
||||
const bool _is_topn;
|
||||
bool _use_german_string = false;
|
||||
|
||||
RuntimeProfile::Counter* _build_timer = nullptr;
|
||||
RuntimeProfile::Counter* _sort_timer = nullptr;
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@
|
|||
#include "column/column_visitor_adapter.h"
|
||||
#include "column/const_column.h"
|
||||
#include "column/datum.h"
|
||||
#include "column/german_string.h"
|
||||
#include "column/json_column.h"
|
||||
#include "column/nullable_column.h"
|
||||
#include "column/vectorized_fwd.h"
|
||||
|
|
@ -172,10 +173,17 @@ public:
|
|||
template <typename SizeT>
|
||||
Status do_visit(const BinaryColumnBase<SizeT>& _) {
|
||||
using ColumnType = const BinaryColumnBase<SizeT>;
|
||||
using Container = typename BinaryColumnBase<SizeT>::BinaryDataProxyContainer;
|
||||
auto& left_data = down_cast<const ColumnType*>(_left_col)->get_proxy_data();
|
||||
auto& right_data = down_cast<const ColumnType*>(_right_col)->get_proxy_data();
|
||||
return merge_ordinary_column<Container, Slice>(left_data, right_data);
|
||||
if (use_german_string) {
|
||||
using Container = typename BinaryColumnBase<SizeT>::GermanStringContainer;
|
||||
auto& left_data = down_cast<const ColumnType*>(_left_col)->get_german_strings();
|
||||
auto& right_data = down_cast<const ColumnType*>(_right_col)->get_german_strings();
|
||||
return merge_ordinary_column<Container, GermanString>(left_data, right_data);
|
||||
} else {
|
||||
using Container = typename BinaryColumnBase<SizeT>::BinaryDataProxyContainer;
|
||||
auto& left_data = down_cast<const ColumnType*>(_left_col)->get_proxy_data();
|
||||
auto& right_data = down_cast<const ColumnType*>(_right_col)->get_proxy_data();
|
||||
return merge_ordinary_column<Container, Slice>(left_data, right_data);
|
||||
}
|
||||
}
|
||||
|
||||
Status do_visit(const NullableColumn& _) {
|
||||
|
|
@ -185,6 +193,7 @@ public:
|
|||
const auto* lhs_data = down_cast<const NullableColumn*>(_left_col)->data_column().get();
|
||||
const auto* rhs_data = down_cast<const NullableColumn*>(_right_col)->data_column().get();
|
||||
MergeTwoColumn merge2({_sort_order, _null_first}, lhs_data, rhs_data, _equal_ranges, _perm);
|
||||
merge2.set_use_german_string(is_use_german_string());
|
||||
return lhs_data->accept(&merge2);
|
||||
}
|
||||
|
||||
|
|
@ -192,6 +201,9 @@ public:
|
|||
return do_visit_slow(_);
|
||||
}
|
||||
|
||||
bool is_use_german_string() const { return use_german_string; }
|
||||
void set_use_german_string(bool value) { use_german_string = value; }
|
||||
|
||||
private:
|
||||
constexpr static uint32_t kLeftIndex = 0;
|
||||
constexpr static uint32_t kRightIndex = 1;
|
||||
|
|
@ -202,6 +214,7 @@ private:
|
|||
const Column* _right_col;
|
||||
std::vector<EqualRange>* _equal_ranges;
|
||||
Permutation* _perm;
|
||||
bool use_german_string = false;
|
||||
};
|
||||
|
||||
// MergeTwoChunk merge two chunk in column-wise
|
||||
|
|
@ -270,6 +283,7 @@ public:
|
|||
const Column* left_col = left_run.get_column(col);
|
||||
const Column* right_col = right_run.get_column(col);
|
||||
MergeTwoColumn merge2(sort_desc.get_column_desc(col), left_col, right_col, &equal_ranges, output);
|
||||
merge2.set_use_german_string(sort_desc.is_use_german_string());
|
||||
Status st = left_col->accept(&merge2);
|
||||
CHECK(st.ok());
|
||||
if (equal_ranges.size() == 0) {
|
||||
|
|
|
|||
|
|
@ -80,7 +80,8 @@ static Status sort_and_tie_helper(const std::atomic<bool>& cancel, const Column*
|
|||
}
|
||||
|
||||
static Status sort_and_tie_column(const std::atomic<bool>& cancel, const Column* column, const SortDesc& sort_desc,
|
||||
SmallPermutation& permutation, Tie& tie, Ranges&& ranges, bool build_tie);
|
||||
SmallPermutation& permutation, Tie& tie, Ranges&& ranges, bool build_tie,
|
||||
const SortDescs* sort_descs = nullptr);
|
||||
|
||||
template <class NullPred>
|
||||
static Status sort_and_tie_helper_nullable(const std::atomic<bool>& cancel, const NullableColumn* column,
|
||||
|
|
@ -170,16 +171,29 @@ public:
|
|||
DCHECK_GE(column.size(), _permutation.size());
|
||||
}
|
||||
|
||||
using ItemType = InlinePermuteItem<Slice>;
|
||||
auto cmp = [&](const ItemType& lhs, const ItemType& rhs) -> int {
|
||||
return lhs.inline_value.compare(rhs.inline_value);
|
||||
};
|
||||
if (_use_german_string) {
|
||||
using ItemType = InlinePermuteItem<GermanString>;
|
||||
auto cmp = [&](const ItemType& lhs, const ItemType& rhs) -> int {
|
||||
return lhs.inline_value.compare(rhs.inline_value);
|
||||
};
|
||||
|
||||
auto inlined = create_inline_permutation<Slice, IS_RANGES>(_permutation, column.get_proxy_data());
|
||||
RETURN_IF_ERROR(sort_and_tie_helper(_cancel, &column, _sort_desc.asc_order(), inlined, _tie, cmp,
|
||||
_range_or_ranges, _build_tie));
|
||||
restore_inline_permutation(inlined, _permutation);
|
||||
auto inlined =
|
||||
create_inline_permutation<GermanString, IS_RANGES>(_permutation, column.get_german_strings());
|
||||
|
||||
RETURN_IF_ERROR(sort_and_tie_helper(_cancel, &column, _sort_desc.asc_order(), inlined, _tie, cmp,
|
||||
_range_or_ranges, _build_tie));
|
||||
restore_inline_permutation(inlined, _permutation);
|
||||
} else {
|
||||
using ItemType = InlinePermuteItem<Slice>;
|
||||
auto cmp = [&](const ItemType& lhs, const ItemType& rhs) -> int {
|
||||
return lhs.inline_value.compare(rhs.inline_value);
|
||||
};
|
||||
|
||||
auto inlined = create_inline_permutation<Slice, IS_RANGES>(_permutation, column.get_proxy_data());
|
||||
RETURN_IF_ERROR(sort_and_tie_helper(_cancel, &column, _sort_desc.asc_order(), inlined, _tie, cmp,
|
||||
_range_or_ranges, _build_tie));
|
||||
restore_inline_permutation(inlined, _permutation);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
@ -218,6 +232,8 @@ public:
|
|||
_build_tie);
|
||||
}
|
||||
|
||||
void use_german_string(bool flag) { _use_german_string = flag; }
|
||||
|
||||
private:
|
||||
const std::atomic<bool>& _cancel;
|
||||
const SortDesc& _sort_desc;
|
||||
|
|
@ -225,6 +241,7 @@ private:
|
|||
Tie& _tie;
|
||||
R _range_or_ranges;
|
||||
bool _build_tie;
|
||||
bool _use_german_string = false;
|
||||
};
|
||||
|
||||
// Sort multiple a column from multiple chunks(vertical column)
|
||||
|
|
@ -469,24 +486,33 @@ private:
|
|||
};
|
||||
|
||||
Status sort_and_tie_column(const std::atomic<bool>& cancel, const ColumnPtr& column, const SortDesc& sort_desc,
|
||||
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, bool build_tie) {
|
||||
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, bool build_tie,
|
||||
const SortDescs* sort_descs) {
|
||||
ColumnSorter column_sorter(cancel, sort_desc, permutation, tie, range, build_tie);
|
||||
if (sort_descs != nullptr) {
|
||||
column_sorter.use_german_string(sort_descs->is_use_german_string());
|
||||
}
|
||||
return column->accept(&column_sorter);
|
||||
}
|
||||
|
||||
Status sort_and_tie_column(const std::atomic<bool>& cancel, ColumnPtr& column, const SortDesc& sort_desc,
|
||||
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, bool build_tie) {
|
||||
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, bool build_tie,
|
||||
const SortDescs* sort_descs) {
|
||||
// Nullable column need set all the null rows to default values,
|
||||
// see the comment of the declaration of `partition_null_and_nonnull_helper` for details.
|
||||
if (column->is_nullable() && !column->is_constant()) {
|
||||
ColumnHelper::as_column<NullableColumn>(column)->fill_null_with_default();
|
||||
}
|
||||
ColumnSorter column_sorter(cancel, sort_desc, permutation, tie, range, build_tie);
|
||||
if (sort_descs != nullptr) {
|
||||
column_sorter.use_german_string(sort_descs->is_use_german_string());
|
||||
}
|
||||
return column->accept(&column_sorter);
|
||||
}
|
||||
|
||||
static Status sort_and_tie_column(const std::atomic<bool>& cancel, const Column* column, const SortDesc& sort_desc,
|
||||
SmallPermutation& permutation, Tie& tie, Ranges&& ranges, bool build_tie) {
|
||||
SmallPermutation& permutation, Tie& tie, Ranges&& ranges, bool build_tie,
|
||||
const SortDescs* sort_descs) {
|
||||
// Nullable column need set all the null rows to default values,
|
||||
// see the comment of the declaration of `partition_null_and_nonnull_helper` for details.
|
||||
if (column->is_nullable() && !column->is_constant()) {
|
||||
|
|
@ -494,6 +520,9 @@ static Status sort_and_tie_column(const std::atomic<bool>& cancel, const Column*
|
|||
down_cast<NullableColumn*>(mutable_col)->fill_null_with_default();
|
||||
}
|
||||
ColumnSorter column_sorter(cancel, sort_desc, permutation, tie, std::move(ranges), build_tie);
|
||||
if (sort_descs != nullptr) {
|
||||
column_sorter.use_german_string(sort_descs->is_use_german_string());
|
||||
}
|
||||
return column->accept(&column_sorter);
|
||||
}
|
||||
|
||||
|
|
@ -525,7 +554,7 @@ Status sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& colu
|
|||
ColumnPtr column = columns[col_index];
|
||||
bool build_tie = col_index != columns.size() - 1;
|
||||
RETURN_IF_ERROR(sort_and_tie_column(cancel, column, sort_desc.get_column_desc(col_index), permutation, tie,
|
||||
range, build_tie));
|
||||
range, build_tie, &sort_desc));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
|
@ -534,7 +563,8 @@ Status sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& colu
|
|||
Status sort_and_tie_columns(const std::atomic<bool>& cancel, const std::vector<const Column*>& columns,
|
||||
const SortDescs& sort_desc, SmallPermutation& perm,
|
||||
const std::span<const uint32_t> src_offsets,
|
||||
const std::vector<std::span<const uint32_t>>& offsets_per_key) {
|
||||
const std::vector<std::span<const uint32_t>>& offsets_per_key,
|
||||
const SortDescs* sort_descs) {
|
||||
if (src_offsets.empty()) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
@ -584,7 +614,7 @@ Status sort_and_tie_columns(const std::atomic<bool>& cancel, const std::vector<c
|
|||
const bool build_tie = (col_i != (columns.size() - 1));
|
||||
shift_perm(offsets_per_key[col_i].data());
|
||||
RETURN_IF_ERROR(sort_and_tie_column(cancel, column, sort_desc.get_column_desc(col_i), perm, tie,
|
||||
Ranges(src_offsets, offsets_per_key[col_i]), build_tie));
|
||||
Ranges(src_offsets, offsets_per_key[col_i]), build_tie, sort_descs));
|
||||
}
|
||||
shift_perm(src_offsets.data());
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
#include <algorithm>
|
||||
#include <concepts>
|
||||
|
||||
#include "column/german_string.h"
|
||||
#include "column/nullable_column.h"
|
||||
#include "column/type_traits.h"
|
||||
#include "column/vectorized_fwd.h"
|
||||
|
|
@ -52,6 +53,16 @@ struct SorterComparator<Slice> {
|
|||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct SorterComparator<GermanString> {
|
||||
static int compare(const GermanString& lhs, const GermanString& rhs) {
|
||||
int x = lhs.compare(rhs);
|
||||
if (x > 0) return 1;
|
||||
if (x < 0) return -1;
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct SorterComparator<DateValue> {
|
||||
static int compare(const DateValue& lhs, const DateValue& rhs) {
|
||||
|
|
|
|||
|
|
@ -35,9 +35,11 @@ struct SortDescs;
|
|||
// @param tie input and output tie
|
||||
// @param range sort range, {0, 0} means not build tie but sort data
|
||||
Status sort_and_tie_column(const std::atomic<bool>& cancel, ColumnPtr& column, const SortDesc& sort_desc,
|
||||
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, const bool build_tie);
|
||||
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, const bool build_tie,
|
||||
const SortDescs* sort_descs = nullptr);
|
||||
Status sort_and_tie_column(const std::atomic<bool>& cancel, const ColumnPtr& column, const SortDesc& sort_desc,
|
||||
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, const bool build_tie);
|
||||
SmallPermutation& permutation, Tie& tie, std::pair<int, int> range, const bool build_tie,
|
||||
const SortDescs* sort_descs = nullptr);
|
||||
|
||||
// Sort multiple columns using column-wise algorithm, the final order is stored in the Permutation argument
|
||||
Status sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& columns, const SortDescs& sort_desc,
|
||||
|
|
@ -60,7 +62,8 @@ Status sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& colu
|
|||
Status sort_and_tie_columns(const std::atomic<bool>& cancel, const std::vector<const Column*>& columns,
|
||||
const SortDescs& sort_desc, SmallPermutation& perm,
|
||||
const std::span<const uint32_t> src_offsets,
|
||||
const std::vector<std::span<const uint32_t>>& offsets_per_key);
|
||||
const std::vector<std::span<const uint32_t>>& offsets_per_key,
|
||||
const SortDescs* sort_descs = nullptr);
|
||||
|
||||
// Sort multiple columns, and stable
|
||||
Status stable_sort_and_tie_columns(const std::atomic<bool>& cancel, const Columns& columns, const SortDescs& sort_desc,
|
||||
|
|
@ -105,6 +108,7 @@ struct SortDesc {
|
|||
};
|
||||
struct SortDescs {
|
||||
std::vector<SortDesc> descs;
|
||||
mutable bool use_german_string = false;
|
||||
|
||||
SortDescs() = default;
|
||||
~SortDescs() = default;
|
||||
|
|
@ -136,6 +140,8 @@ struct SortDescs {
|
|||
size_t num_columns() const { return descs.size(); }
|
||||
|
||||
SortDesc get_column_desc(int col) const { return descs[col]; }
|
||||
bool is_use_german_string() const { return use_german_string; }
|
||||
void set_use_german_string(bool value) const { use_german_string = value; }
|
||||
};
|
||||
|
||||
} // namespace starrocks
|
||||
|
|
|
|||
|
|
@ -362,6 +362,11 @@ public:
|
|||
return _spill_options->spill_partitionwise_agg_skew_elimination;
|
||||
}
|
||||
|
||||
bool enable_full_sort_use_german_string() const {
|
||||
return _query_options.__isset.enable_full_sort_use_german_string &&
|
||||
_query_options.enable_full_sort_use_german_string;
|
||||
}
|
||||
|
||||
int32_t spill_mem_table_size() const {
|
||||
return EXTRACE_SPILL_PARAM(_query_options, _spill_options, spill_mem_table_size);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,8 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <util/stack_util.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
|
||||
|
|
@ -22,4 +24,21 @@ namespace starrocks {
|
|||
// take a sleep with small intervals until time out by `sleep_secs` or the `stop_condition()` is true
|
||||
void nap_sleep(int32_t sleep_secs, const std::function<bool()>& stop_condition);
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
#define NOT_SUPPORT() \
|
||||
do { \
|
||||
throw std::runtime_error(std::string("Not support method '") + __PRETTY_FUNCTION__ + \
|
||||
"': " + get_stack_trace()); \
|
||||
} while (0);
|
||||
#elif defined(_MSC_VER)
|
||||
#define NOT_SUPPORT() \
|
||||
do { \
|
||||
throw std::runtime_error(std::string("Not support method '") + __FUNCSIG__ + "': " + get_stack_trace()); \
|
||||
} while (0);
|
||||
#else
|
||||
#define NOT_SUPPORT() \
|
||||
do { \
|
||||
throw std::runtime_error(std::string("Not support method '") + __func__ + "': " + get_stack_trace()); \
|
||||
} while (0);
|
||||
#endif
|
||||
} // namespace starrocks
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ set(EXEC_FILES
|
|||
./column/column_view_test.cpp
|
||||
./column/array_view_column_test.cpp
|
||||
./column/binary_column_test.cpp
|
||||
./column/german_string_test.cpp
|
||||
./column/chunk_test.cpp
|
||||
./column/column_helper_test.cpp
|
||||
./column/const_column_test.cpp
|
||||
|
|
|
|||
|
|
@ -0,0 +1,212 @@
|
|||
// Copyright 2021-present StarRocks, Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "column/german_string.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
|
||||
#include "column/binary_column.h"
|
||||
#include "column/column_helper.h"
|
||||
#include "column/const_column.h"
|
||||
#include "column/fixed_length_column.h"
|
||||
#include "column/nullable_column.h"
|
||||
#include "column/vectorized_fwd.h"
|
||||
#include "testutil/parallel_test.h"
|
||||
|
||||
namespace starrocks {
|
||||
class GermanStringExternalAllocator {
|
||||
public:
|
||||
static constexpr auto PAGE_SIZE = 4096;
|
||||
static constexpr auto MEDIUM_STRING_MAX_SIZE = 512;
|
||||
virtual ~GermanStringExternalAllocator() = default;
|
||||
size_t size() const;
|
||||
|
||||
void clear();
|
||||
|
||||
char* allocate(size_t n);
|
||||
virtual void incorporate(std::shared_ptr<GermanStringExternalAllocator>&& other);
|
||||
bool is_from_binary() const { return false; }
|
||||
|
||||
private:
|
||||
std::vector<std::vector<char>> medium_string_pages;
|
||||
std::vector<std::string> large_strings;
|
||||
};
|
||||
|
||||
size_t GermanStringExternalAllocator::size() const {
|
||||
size_t total_size = 0;
|
||||
for (const auto& page : medium_string_pages) {
|
||||
total_size += page.size();
|
||||
}
|
||||
for (const auto& str : large_strings) {
|
||||
total_size += str.size();
|
||||
}
|
||||
return total_size;
|
||||
}
|
||||
|
||||
void GermanStringExternalAllocator::clear() {
|
||||
large_strings.clear();
|
||||
medium_string_pages.clear();
|
||||
}
|
||||
|
||||
char* GermanStringExternalAllocator::allocate(size_t n) {
|
||||
if (n <= GermanString::INLINE_MAX_LENGTH) {
|
||||
return nullptr;
|
||||
}
|
||||
if (n <= MEDIUM_STRING_MAX_SIZE) {
|
||||
if (medium_string_pages.empty() || medium_string_pages.back().size() + n > PAGE_SIZE) {
|
||||
medium_string_pages.emplace_back();
|
||||
medium_string_pages.back().reserve(PAGE_SIZE);
|
||||
}
|
||||
auto& page = medium_string_pages.back();
|
||||
raw::stl_vector_resize_uninitialized(&page, page.size() + n);
|
||||
return page.data() + page.size() - n;
|
||||
} else {
|
||||
large_strings.emplace_back();
|
||||
auto& str = large_strings.back();
|
||||
raw::make_room(&str, n);
|
||||
return large_strings.back().data();
|
||||
}
|
||||
}
|
||||
|
||||
void GermanStringExternalAllocator::incorporate(std::shared_ptr<GermanStringExternalAllocator>&& other) {
|
||||
DCHECK(!other->is_from_binary());
|
||||
large_strings.insert(large_strings.end(), std::make_move_iterator(other->large_strings.begin()),
|
||||
std::make_move_iterator(other->large_strings.end()));
|
||||
medium_string_pages.insert(medium_string_pages.end(), std::make_move_iterator(other->medium_string_pages.begin()),
|
||||
std::make_move_iterator(other->medium_string_pages.end()));
|
||||
other->clear();
|
||||
}
|
||||
|
||||
static inline std::string gen_string(std::size_t length) {
|
||||
const std::string chars =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"0123456789";
|
||||
|
||||
std::random_device rd; // For seeding
|
||||
std::mt19937 gen(rd()); // Mersenne Twister RNG
|
||||
std::uniform_int_distribution<> dist(0, chars.size() - 1);
|
||||
|
||||
std::string result;
|
||||
result.reserve(length);
|
||||
for (std::size_t i = 0; i < length; ++i) {
|
||||
result += chars[dist(gen)];
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
TEST(GermanStringTest, test_german_string) {
|
||||
GermanStringExternalAllocator allocator;
|
||||
std::vector<std::string> strings{gen_string(0), gen_string(1), gen_string(3), gen_string(4),
|
||||
gen_string(5), gen_string(11), gen_string(12), gen_string(13),
|
||||
gen_string(15), gen_string(4096), gen_string(4097)};
|
||||
for (const auto& s : strings) {
|
||||
GermanString gs1(s.data(), s.size(), allocator.allocate(s.size()));
|
||||
auto s1 = static_cast<std::string>(gs1);
|
||||
ASSERT_EQ(s, s1);
|
||||
GermanString gs2(gs1, allocator.allocate(gs1.len));
|
||||
auto s2 = static_cast<std::string>(gs2);
|
||||
ASSERT_EQ(s, s2);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(GermanStringTest, test_german_string_compare) {
|
||||
GermanStringExternalAllocator allocator;
|
||||
// short vs short
|
||||
{
|
||||
GermanString gs1("Hello", 5, allocator.allocate(5));
|
||||
GermanString gs2("Hello", 5, allocator.allocate(5));
|
||||
GermanString gs3("Hello!", 6, allocator.allocate(6));
|
||||
GermanString gs4("Helloi", 6, allocator.allocate(6));
|
||||
ASSERT_TRUE(gs1 == gs2);
|
||||
ASSERT_FALSE(gs1 == gs3);
|
||||
ASSERT_FALSE(gs1 == gs4);
|
||||
ASSERT_FALSE(gs3 == gs4);
|
||||
ASSERT_FALSE(gs3 == gs1);
|
||||
ASSERT_FALSE(gs4 == gs1);
|
||||
ASSERT_FALSE(gs4 == gs3);
|
||||
|
||||
ASSERT_TRUE(gs1.compare(gs2) == 0);
|
||||
ASSERT_TRUE(gs1 < gs3);
|
||||
ASSERT_TRUE(gs3 < gs4);
|
||||
ASSERT_TRUE(gs4 > gs1);
|
||||
}
|
||||
// long vs long
|
||||
{
|
||||
GermanString gs1("Hello GermanString", 18, allocator.allocate(18));
|
||||
GermanString gs2("Hello GermanString", 18, allocator.allocate(18));
|
||||
GermanString gs3("Hello GermanString!", 19, allocator.allocate(19));
|
||||
GermanString gs4("Hello GermanStringi", 19, allocator.allocate(19));
|
||||
ASSERT_TRUE(gs1 == gs2);
|
||||
ASSERT_FALSE(gs1 == gs3);
|
||||
ASSERT_FALSE(gs1 == gs4);
|
||||
ASSERT_FALSE(gs3 == gs4);
|
||||
ASSERT_FALSE(gs3 == gs1);
|
||||
ASSERT_FALSE(gs4 == gs1);
|
||||
ASSERT_FALSE(gs4 == gs3);
|
||||
|
||||
ASSERT_TRUE(gs1.compare(gs2) == 0);
|
||||
ASSERT_TRUE(gs1 < gs3);
|
||||
ASSERT_TRUE(gs3 < gs4);
|
||||
ASSERT_TRUE(gs4 > gs1);
|
||||
}
|
||||
// short vs long
|
||||
{
|
||||
GermanString gs1("Hello", 5, allocator.allocate(5));
|
||||
GermanString gs2("Hello GermanString", 18, allocator.allocate(18));
|
||||
GermanString gs3("Hello GermanString!", 19, allocator.allocate(19));
|
||||
GermanString gs4("Hello GermanStringi", 19, allocator.allocate(19));
|
||||
ASSERT_TRUE(gs1 < gs2);
|
||||
ASSERT_TRUE(gs1 < gs3);
|
||||
ASSERT_TRUE(gs1 < gs4);
|
||||
ASSERT_FALSE(gs2 < gs1);
|
||||
ASSERT_FALSE(gs3 < gs1);
|
||||
ASSERT_FALSE(gs4 < gs1);
|
||||
|
||||
ASSERT_TRUE(gs2 > gs1);
|
||||
ASSERT_TRUE(gs3 > gs1);
|
||||
ASSERT_TRUE(gs4 > gs1);
|
||||
}
|
||||
{
|
||||
GermanString gs0("", 0, allocator.allocate(0));
|
||||
GermanString gs4("abcd", 4, allocator.allocate(4));
|
||||
GermanString gs12("abcdabcdabcd", 12, allocator.allocate(12));
|
||||
GermanString gs16("abcdabcdabcd", 16, allocator.allocate(16));
|
||||
ASSERT_TRUE(gs0 < gs4);
|
||||
ASSERT_TRUE(gs0 < gs12);
|
||||
ASSERT_TRUE(gs0 < gs16);
|
||||
ASSERT_TRUE(gs4 < gs12);
|
||||
ASSERT_TRUE(gs4 < gs16);
|
||||
ASSERT_TRUE(gs12 < gs16);
|
||||
|
||||
ASSERT_TRUE(gs0 == gs0);
|
||||
ASSERT_TRUE(gs4 == gs4);
|
||||
ASSERT_TRUE(gs12 == gs12);
|
||||
ASSERT_TRUE(gs16 == gs16);
|
||||
|
||||
ASSERT_TRUE(gs0 != gs4);
|
||||
ASSERT_TRUE(gs0 != gs12);
|
||||
ASSERT_TRUE(gs0 != gs16);
|
||||
ASSERT_TRUE(gs4 != gs12);
|
||||
ASSERT_TRUE(gs4 != gs16);
|
||||
ASSERT_TRUE(gs12 != gs16);
|
||||
}
|
||||
}
|
||||
} // namespace starrocks
|
||||
|
|
@ -957,6 +957,8 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
|
|||
|
||||
public static final String ENABLE_DESENSITIZE_EXPLAIN = "enable_desensitize_explain";
|
||||
|
||||
public static final String ENABLE_FULL_SORT_USE_GERMAN_STRING = "enable_full_sort_use_german_string";
|
||||
|
||||
public static final List<String> DEPRECATED_VARIABLES = ImmutableList.<String>builder()
|
||||
.add(CODEGEN_LEVEL)
|
||||
.add(MAX_EXECUTION_TIME)
|
||||
|
|
@ -1960,6 +1962,9 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
|
|||
this.enableDesensitizeExplain = enableDesensitizeExplain;
|
||||
}
|
||||
|
||||
@VarAttr(name = ENABLE_FULL_SORT_USE_GERMAN_STRING)
|
||||
private boolean enableFullSortUseGermanString = true;
|
||||
|
||||
public int getCboPruneJsonSubfieldDepth() {
|
||||
return cboPruneJsonSubfieldDepth;
|
||||
}
|
||||
|
|
@ -5317,6 +5322,13 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
|
|||
this.cboJSONV2DictOpt = value;
|
||||
}
|
||||
|
||||
public void setEnableFullSortUseGermanString(boolean value) {
|
||||
this.enableFullSortUseGermanString = value;
|
||||
}
|
||||
|
||||
public boolean isEnableFullSortUseGermanString() {
|
||||
return this.enableFullSortUseGermanString;
|
||||
}
|
||||
// Serialize to thrift object
|
||||
// used for rest api
|
||||
public TQueryOptions toThrift() {
|
||||
|
|
@ -5339,7 +5351,7 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
|
|||
tResult.setRuntime_profile_report_interval(runtimeProfileReportInterval);
|
||||
tResult.setBatch_size(chunkSize);
|
||||
tResult.setLoad_mem_limit(loadMemLimit);
|
||||
|
||||
tResult.setEnable_full_sort_use_german_string(enableFullSortUseGermanString);
|
||||
if (maxScanKeyNum > -1) {
|
||||
tResult.setMax_scan_key_num(maxScanKeyNum);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -347,6 +347,8 @@ struct TQueryOptions {
|
|||
|
||||
190: optional i64 column_view_concat_rows_limit;
|
||||
191: optional i64 column_view_concat_bytes_limit;
|
||||
|
||||
200: optional bool enable_full_sort_use_german_string;
|
||||
}
|
||||
|
||||
// A scan range plus the parameters needed to execute that scan.
|
||||
|
|
@ -410,6 +412,7 @@ struct TPlanFragmentExecParams {
|
|||
|
||||
// Debug options: perform some action in a particular phase of a particular node
|
||||
74: optional list<TExecDebugOption> exec_debug_options
|
||||
|
||||
}
|
||||
|
||||
// Global query parameters assigned by the coordinator.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,60 @@
|
|||
-- name: test_full_sort_use_german_string
|
||||
DROP TABLE if exists t0;
|
||||
-- result:
|
||||
-- !result
|
||||
CREATE TABLE if not exists t0
|
||||
(
|
||||
c0 VARCHAR(255) NOT NULL,
|
||||
c1 VARCHAR(255) NOT NULL,
|
||||
c2 VARCHAR(255) NOT NULL,
|
||||
c3 VARCHAR(255) NOT NULL
|
||||
) ENGINE=OLAP
|
||||
DUPLICATE KEY(`c0` )
|
||||
COMMENT "OLAP"
|
||||
DISTRIBUTED BY HASH(`c0` ) BUCKETS 1
|
||||
PROPERTIES(
|
||||
"replication_num" = "1",
|
||||
"in_memory" = "false",
|
||||
"storage_format" = "default"
|
||||
);
|
||||
-- result:
|
||||
-- !result
|
||||
insert into t0
|
||||
select
|
||||
concat("",i,"_abc") as c0,
|
||||
concat("foo_", 100000 - i) as c1,
|
||||
concat("bar_", ((50000 - i)/10)*((50000 - i)/10)) as c2,
|
||||
concat("", ((50000 - i)/10)*((50000 - i)/10),"_bar") as c3
|
||||
from table(generate_series(1,100000)) gs(i);
|
||||
-- result:
|
||||
-- !result
|
||||
with cte as(select c0, count(1) over(partition by c0) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0, cnt)) as fingerprint from cte;
|
||||
-- result:
|
||||
380598722181
|
||||
-- !result
|
||||
with cte as(select c0, count(1) over(partition by c0) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0, cnt)) as fingerprint from cte;
|
||||
-- result:
|
||||
380598722181
|
||||
-- !result
|
||||
with cte as(select c0,c1, count(1) over(partition by c0,c1) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0,c1, cnt)) as fingerprint from cte;
|
||||
-- result:
|
||||
188259655263
|
||||
-- !result
|
||||
with cte as(select c0,c1, count(1) over(partition by c0,c1) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0,c1, cnt)) as fingerprint from cte;
|
||||
-- result:
|
||||
188259655263
|
||||
-- !result
|
||||
with cte as(select c0,c1,c2, count(1) over(partition by c0,c1,c2) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0,c1,c2, cnt)) as fingerprint from cte;
|
||||
-- result:
|
||||
-418370384866
|
||||
-- !result
|
||||
with cte as(select c0,c1,c2, count(1) over(partition by c0,c1,c2) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0,c1,c2, cnt)) as fingerprint from cte;
|
||||
-- result:
|
||||
-418370384866
|
||||
-- !result
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
-- name: test_full_sort_use_german_string
|
||||
DROP TABLE if exists t0;
|
||||
|
||||
CREATE TABLE if not exists t0
|
||||
(
|
||||
c0 VARCHAR(255) NOT NULL,
|
||||
c1 VARCHAR(255) NOT NULL,
|
||||
c2 VARCHAR(255) NOT NULL,
|
||||
c3 VARCHAR(255) NOT NULL
|
||||
) ENGINE=OLAP
|
||||
DUPLICATE KEY(`c0` )
|
||||
COMMENT "OLAP"
|
||||
DISTRIBUTED BY HASH(`c0` ) BUCKETS 1
|
||||
PROPERTIES(
|
||||
"replication_num" = "1",
|
||||
"in_memory" = "false",
|
||||
"storage_format" = "default"
|
||||
);
|
||||
insert into t0
|
||||
select
|
||||
concat("",i,"_abc") as c0,
|
||||
concat("foo_", 100000 - i) as c1,
|
||||
concat("bar_", ((50000 - i)/10)*((50000 - i)/10)) as c2,
|
||||
concat("", ((50000 - i)/10)*((50000 - i)/10),"_bar") as c3
|
||||
from table(generate_series(1,100000)) gs(i);
|
||||
with cte as(select c0, count(1) over(partition by c0) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0, cnt)) as fingerprint from cte;
|
||||
with cte as(select c0, count(1) over(partition by c0) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0, cnt)) as fingerprint from cte;
|
||||
with cte as(select c0,c1, count(1) over(partition by c0,c1) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0,c1, cnt)) as fingerprint from cte;
|
||||
with cte as(select c0,c1, count(1) over(partition by c0,c1) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0,c1, cnt)) as fingerprint from cte;
|
||||
with cte as(select c0,c1,c2, count(1) over(partition by c0,c1,c2) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=true)*/sum(murmur_hash3_32(c0,c1,c2, cnt)) as fingerprint from cte;
|
||||
with cte as(select c0,c1,c2, count(1) over(partition by c0,c1,c2) as cnt from t0)
|
||||
select /*+SET_VAR(enable_full_sort_use_german_string=false)*/sum(murmur_hash3_32(c0,c1,c2, cnt)) as fingerprint from cte;
|
||||
Loading…
Reference in New Issue