[Enhancement] Optimize accessing non-existent JSON field (backport #62003) (#62133)

Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
This commit is contained in:
mergify[bot] 2025-08-20 16:05:19 +08:00 committed by GitHub
parent 843806e61e
commit c42eaf88df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 27 additions and 26 deletions

View File

@ -199,6 +199,9 @@ public:
// Reports the current value of the `_has_remain` flag.
bool has_remain_json() const {
    return _has_remain;
}
// Returns a raw pointer to the remain filter, or nullptr when `_remain_filter` is not set.
const BloomFilter* get_remain_filter() const {
    if (!_remain_filter) {
        return nullptr;
    }
    return _remain_filter.get();
}
private:
StatusOr<std::unique_ptr<ColumnIterator>> _new_json_iterator(ColumnAccessPath* path = nullptr,
const TabletColumn* column = nullptr);

View File

@ -44,6 +44,7 @@
#include "column/schema.h"
#include "common/logging.h"
#include "fs/key_cache.h"
#include "gutil/strings/split.h"
#include "gutil/strings/substitute.h"
#include "segment_iterator.h"
#include "segment_options.h"
@ -524,7 +525,25 @@ StatusOr<ColumnIteratorUPtr> Segment::_new_extended_column_iterator(const Tablet
}
}
// Build a regular ColumnIterator to read it
// case 3: check if this segment contains the specific field
auto& column_reader = _column_readers[source_id];
bool may_contains = column_reader->has_remain_json();
if (may_contains && column_reader->get_remain_filter() != nullptr) {
std::vector<std::string> paths = strings::Split(full_path, ".");
std::string_view leaf = paths.back();
may_contains = column_reader->get_remain_filter()->test_bytes(leaf.data(), leaf.size());
}
if (!may_contains) {
// Create an iterator that always returns NULL for fields that don't exist in this segment
auto default_null_iter = std::make_unique<DefaultValueColumnIterator>(false, "", true, get_type_info(column),
column.length(), num_rows());
ColumnIteratorOptions iter_opts;
RETURN_IF_ERROR(default_null_iter->init(iter_opts));
VLOG(2) << "json field " << full_path << " not found in segment, return NULL directly";
return default_null_iter;
}
// Build a regular JsonExtractIterator to read it
auto& source_reader = _column_readers[source_id];
ASSIGN_OR_RETURN(auto source_iter, source_reader->new_iterator(path, &column));
return create_json_extract_iterator(std::move(source_iter), source_reader->is_nullable(), std::string(field_name),

View File

@ -45,6 +45,7 @@
#include "exprs/column_ref.h"
#include "exprs/expr_context.h"
#include "gutil/casts.h"
#include "gutil/strings/split.h"
#include "runtime/types.h"
#include "storage/rowset/column_reader.h"
#include "types/logical_type.h"

View File

@ -2491,7 +2491,7 @@ TEST_F(FlatJsonColumnRWTest, testSegmentWriterIteratorWithMixedDataTypes) {
ASSIGN_OR_ABORT(auto column_iter, segment->new_column_iterator_or_default(col, path.get()));
ASSERT_OK(column_iter->init(column_opts));
ASSERT_OK(column_iter->seek_to_first());
size_t count = 4096;
size_t count = 3;
auto column = ColumnHelper::create_column(TypeDescriptor(field_type), true);
ASSERT_OK(column_iter->next_batch(&count, column.get()));
ASSERT_EQ(column->size(), json_strings.size());

View File

@ -167,8 +167,6 @@ select get_json_int(j1, '$.f1') from js2 where get_json_int(j1, '$.f1') = 1;
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 1
- AccessPathHits: 1
- PushdownAccessPaths: 0
-- !result
select get_json_int(j1, '$.f2') from js2 where get_json_int(j1, '$.f1') = 1;
@ -177,8 +175,6 @@ None
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 2
- AccessPathHits: 2
- PushdownAccessPaths: 0
-- !result
select get_json_int(j1, '$.f1') from js2 where get_json_int(j1, '$.f2') = 1;
@ -187,8 +183,6 @@ None
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 2
- AccessPathHits: 2
- PushdownAccessPaths: 0
-- !result
select get_json_int(j1, '$.f2'), j1 from js2 where get_json_int(j1, '$.f1') = 1;
@ -197,8 +191,7 @@ None {"f1": 1}
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 2
- AccessPathHits: 4
- AccessPathHits: 2
- PushdownAccessPaths: 2
-- !result
select get_json_int(j1, '$.f2'), j1 from js2 where get_json_int(j1, '$.f3') = 1;
@ -206,8 +199,7 @@ select get_json_int(j1, '$.f2'), j1 from js2 where get_json_int(j1, '$.f3') = 1;
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 3
- AccessPathHits: 5
- AccessPathHits: 2
- PushdownAccessPaths: 2
-- !result
select get_json_int(j1, '$.f2'), get_json_int(j1, '$.f2') from js2 where get_json_int(j1, '$.f1') = 1;
@ -216,8 +208,6 @@ None None
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 2
- AccessPathHits: 2
- PushdownAccessPaths: 0
-- !result
select count(get_json_int(j1, '$.f2')) from js2 ;
@ -226,8 +216,6 @@ select count(get_json_int(j1, '$.f2')) from js2 ;
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 1
- AccessPathHits: 1
- PushdownAccessPaths: 0
-- !result
select count(get_json_int(j1, '$.f3')) from js2 ;
@ -236,8 +224,6 @@ select count(get_json_int(j1, '$.f3')) from js2 ;
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 2
- AccessPathHits: 2
- PushdownAccessPaths: 0
-- !result
select * from js2 where get_json_int(j1, '$.f1') = -1;
@ -507,8 +493,6 @@ select count(*) from js3 where get_json_double(j1, 'f_bool') = 1.0;
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 1
- AccessPathHits: 1
- PushdownAccessPaths: 0
-- !result
select count(*) from js3 where get_json_int(j1, 'f_int') < 500;
@ -529,8 +513,6 @@ select count(*) from js3 where get_json_double(j1, 'f_int') < 500;
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 1
- AccessPathHits: 1
- PushdownAccessPaths: 0
-- !result
select count(*) from js3 where get_json_double(j1, 'f_double') < 500.0;
@ -551,8 +533,6 @@ select count(*) from js3 where get_json_string(j1, 'f_double') < '500';
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 1
- AccessPathHits: 1
- PushdownAccessPaths: 0
-- !result
select count(*) from js3 where get_json_int(j1, 'f_none') IS NULL;
@ -581,7 +561,5 @@ select count(*) from js3 where get_json_double(j1, 'f_none') < 500;
-- !result
select * from profile_access_path;
-- result:
- AccessPathExtract: 2
- AccessPathHits: 2
- PushdownAccessPaths: 0
-- !result