// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
#include "formats/parquet/level_codec.h"
|
|
|
|
#include "util/bit_stream_utils.inline.h"
|
|
#include "util/bit_util.h"
|
|
#include "util/coding.h"
|
|
#include "util/slice.h"
|
|
|
|
namespace starrocks::parquet {
|
|
|
|
// Prepares this decoder to read the level section found at the front of `slice`.
//
// Records the encoding, the bit width derived from `max_level`, and the level count,
// invalidates any cached decoded levels, and then binds the decoder that matches
// `encoding` to the level bytes. On success the consumed bytes are removed from the
// front of `slice`.
//
// @param encoding   Level encoding; RLE and BIT_PACKED are handled.
// @param max_level  Maximum level value; the bit width is BitUtil::log2(max_level + 1).
// @param num_levels Number of levels stored in this section.
// @param slice      In/out buffer positioned at the start of the level section.
// @return OK on success; Corruption when `slice` is too short for the section it
//         announces; InternalError for any other encoding.
//
// Note: the member fields set at the top are updated even when an error is returned.
Status LevelDecoder::parse(tparquet::Encoding::type encoding, level_t max_level, uint32_t num_levels, Slice* slice) {
    _encoding = encoding;
    _bit_width = BitUtil::log2(max_level + 1);
    _num_levels = num_levels;
    // New page: invalidate the cached decoded levels.
    _levels_decoded = _levels_parsed;

    switch (encoding) {
    case tparquet::Encoding::RLE: {
        // Layout: a 4-byte little-endian payload length followed by the payload itself.
        constexpr size_t kLengthPrefixSize = sizeof(uint32_t);
        if (slice->size < kLengthPrefixSize) {
            return Status::Corruption("RLE level data is too short to hold its 4-byte length prefix");
        }

        auto* data = reinterpret_cast<uint8_t*>(slice->data);
        const uint32_t num_bytes = decode_fixed32_le(data);
        // The subtraction cannot underflow: size >= kLengthPrefixSize was checked above.
        if (num_bytes > slice->size - kLengthPrefixSize) {
            return Status::Corruption("RLE level data length exceeds the remaining page data");
        }
        _rle_decoder = RleDecoder<level_t>(data + kLengthPrefixSize, num_bytes, _bit_width);
        slice->remove_prefix(kLengthPrefixSize + num_bytes);
        break;
    }
    case tparquet::Encoding::BIT_PACKED: {
        // Size the section in 64 bits: num_levels * _bit_width can exceed 32 bits when the
        // page header is corrupt, and a wrapped value would slip past the bounds check.
        const uint64_t total_bits = static_cast<uint64_t>(num_levels) * static_cast<uint64_t>(_bit_width);
        const uint64_t total_bytes = (total_bits + 7) / 8;
        const auto num_bytes = static_cast<uint32_t>(total_bytes);
        // Reject sizes that do not fit the 32-bit length handed to the reader, as well as
        // sizes larger than what is left in the buffer.
        if (num_bytes != total_bytes || num_bytes > slice->size) {
            return Status::Corruption("bit-packed level data exceeds the remaining page data");
        }
        _bit_packed_decoder = BitReader(reinterpret_cast<uint8_t*>(slice->data), num_bytes);
        slice->remove_prefix(num_bytes);
        break;
    }
    default:
        return Status::InternalError("not supported encoding");
    }
    return Status::OK();
}
|
|
|
|
// Prepares this decoder to read an RLE level section whose byte length is supplied by
// the caller (no length prefix is read from the buffer) -- presumably the Parquet
// data-page-v2 layout; confirm against the caller.
//
// Records the bit width derived from `max_level` and the level count, invalidates any
// cached decoded levels, binds the RLE decoder to the first `num_bytes` of `slice`, and
// removes those bytes from the front of `slice`.
//
// @param num_bytes  Length in bytes of the level section at the front of `slice`.
// @param max_level  Maximum level value; the bit width is BitUtil::log2(max_level + 1).
// @param num_levels Number of levels stored in this section.
// @param slice      In/out buffer positioned at the start of the level section.
// @return OK on success; Corruption when `num_bytes` exceeds the size of `slice`.
//
// Note: the member fields set at the top are updated even when an error is returned.
Status LevelDecoder::parse_v2(uint32_t num_bytes, level_t max_level, uint32_t num_levels, Slice* slice) {
    _encoding = tparquet::Encoding::RLE;
    _bit_width = BitUtil::log2(max_level + 1);
    _num_levels = num_levels;
    // New page: invalidate the cached decoded levels.
    _levels_decoded = _levels_parsed;

    if (num_bytes > slice->size) {
        return Status::Corruption("RLE level data length exceeds the remaining page data");
    }

    // Only take the pointer once the section is known to fit in the buffer.
    auto* data = reinterpret_cast<uint8_t*>(slice->data);
    _rle_decoder = RleDecoder<level_t>(data, num_bytes, _bit_width);
    slice->remove_prefix(num_bytes);
    return Status::OK();
}
|
|
|
|
size_t LevelDecoder::_get_level_to_decode_batch_size(size_t row_num) {
|
|
constexpr size_t min_level_batch_size = 4096;
|
|
size_t levels_remaining = _levels_decoded - _levels_parsed;
|
|
if (row_num <= levels_remaining) {
|
|
return 0;
|
|
}
|
|
|
|
size_t levels_to_decode = std::max(min_level_batch_size, row_num - levels_remaining);
|
|
levels_to_decode = std::min(levels_to_decode, static_cast<size_t>(_num_levels));
|
|
return levels_to_decode;
|
|
}
|
|
|
|
} // namespace starrocks::parquet
|