starrocks/be/test/exec/file_scanner/csv_scanner_test.cpp

1380 lines
50 KiB
C++

// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "exec/file_scanner/csv_scanner.h"
#include <gtest/gtest.h>
#include <iostream>
#include "column/chunk.h"
#include "column/datum_tuple.h"
#include "fs/fs_memory.h"
#include "fs/fs_util.h"
#include "gen_cpp/Descriptors_types.h"
#include "runtime/descriptor_helper.h"
#include "runtime/descriptors.h"
#include "runtime/mem_tracker.h"
#include "runtime/runtime_state.h"
#include "testutil/assert.h"
namespace starrocks {
using ::testing::TestWithParam;
using ::testing::Values;
/// Value-parameterized fixture for CSVScanner tests.
///
/// SetUp() copies the bool test parameter into `_use_v2`; tests forward it to
/// `CSVScanner::use_v2()`. The `create_csv_scanner` helpers build a scanner whose
/// supporting objects (runtime state, profile, counter, scan range) are registered
/// with the fixture's ObjectPool.
class CSVScannerTest : public TestWithParam<bool> {
protected:
    void SetUp() override { _use_v2 = GetParam(); }
    void TearDown() override {}

    /// Builds a CSVScanner over `ranges` with one nullable slot per entry of `types`.
    ///
    /// @param types  Slot types. For every index i, `expr_of_dest_slot[i]` becomes a nullable
    ///               SLOT_REF expression on slot id i and i is appended to `src_slot_ids`.
    /// @param ranges File ranges copied into the scan range handed to the scanner.
    /// @param params Scan parameters. This function modifies them (strict_mode, tuple ids,
    ///               dest-slot expressions, source slot ids) and then copies `*params` into
    ///               the scan range.
    /// @return The constructed scanner; `open()` has not been called on it.
    std::unique_ptr<CSVScanner> create_csv_scanner(const std::vector<TypeDescriptor>& types,
                                                   const std::vector<TBrokerRangeDesc>& ranges,
                                                   TBrokerScanRangeParams* params) {
        /// Init DescriptorTable
        TDescriptorTableBuilder desc_tbl_builder;
        TTupleDescriptorBuilder tuple_desc_builder;
        for (const auto& t : types) {
            TSlotDescriptorBuilder slot_desc_builder;
            slot_desc_builder.type(t).length(t.len).precision(t.precision).scale(t.scale).nullable(true);
            tuple_desc_builder.add_slot(slot_desc_builder.build());
        }
        tuple_desc_builder.build(&desc_tbl_builder);

        DescriptorTbl* desc_tbl = nullptr;
        Status st = DescriptorTbl::create(&_runtime_state, &_obj_pool, desc_tbl_builder.desc_tbl(), &desc_tbl,
                                          config::vector_chunk_size);
        CHECK(st.ok()) << st.to_string();

        /// Init RuntimeState
        RuntimeState* state = _obj_pool.add(new RuntimeState(TUniqueId(), TQueryOptions(), TQueryGlobals(), nullptr));
        state->set_desc_tbl(desc_tbl);
        state->init_instance_mem_tracker();
        state->_query_options.query_type = TQueryType::LOAD;

        params->strict_mode = true;
        params->dest_tuple_id = 0;
        params->src_tuple_id = 0;

        // One pass per slot: register the destination expression and the source slot id.
        // The count is converted once so the loop does not compare signed with unsigned.
        const int num_slots = static_cast<int>(types.size());
        for (int i = 0; i < num_slots; i++) {
            TExpr& expr = params->expr_of_dest_slot[i];
            expr = TExpr();
            expr.nodes.emplace_back(TExprNode());

            TExprNode& node = expr.nodes[0];
            node.__set_type(types[i].to_thrift());
            node.__set_node_type(TExprNodeType::SLOT_REF);
            node.__set_is_nullable(true);
            node.__set_slot_ref(TSlotRef());
            node.slot_ref.__set_slot_id(i);

            params->src_slot_ids.emplace_back(i);
        }

        RuntimeProfile* profile = _obj_pool.add(new RuntimeProfile("test_prof", true));
        ScannerCounter* counter = _obj_pool.add(new ScannerCounter());

        TBrokerScanRange* broker_scan_range = _obj_pool.add(new TBrokerScanRange());
        broker_scan_range->params = *params;
        broker_scan_range->ranges = ranges;
        return std::make_unique<CSVScanner>(state, profile, *broker_scan_range, counter);
    }

    /// Convenience overload: creates a fresh TBrokerScanRangeParams from the given CSV
    /// options and forwards to the three-argument overload.
    ///
    /// @param multi_row_delimiter    Row delimiter string.
    /// @param multi_column_separator Column separator string.
    /// @param skip_header            Value passed to `__set_skip_header`.
    /// @param trim_space             Value passed to `__set_trim_space`.
    /// @param enclose                Value passed to `__set_enclose` (0 by default).
    /// @param escape                 Value passed to `__set_escape` (0 by default).
    std::unique_ptr<CSVScanner> create_csv_scanner(const std::vector<TypeDescriptor>& types,
                                                   const std::vector<TBrokerRangeDesc>& ranges,
                                                   const std::string& multi_row_delimiter = "\n",
                                                   const std::string& multi_column_separator = "|",
                                                   const int64_t skip_header = 0, const bool trim_space = false,
                                                   const char enclose = 0, const char escape = 0) {
        /// TBrokerScanRangeParams
        TBrokerScanRangeParams* params = _obj_pool.add(new TBrokerScanRangeParams());
        params->__set_multi_row_delimiter(multi_row_delimiter);
        params->__set_multi_column_separator(multi_column_separator);
        params->__set_skip_header(skip_header);
        params->__set_trim_space(trim_space);
        params->__set_enclose(enclose);
        params->__set_escape(escape);
        return create_csv_scanner(types, ranges, params);
    }

    // Test parameter captured in SetUp(); initialised so it never holds an indeterminate value.
    bool _use_v2 = false;

private:
    RuntimeState _runtime_state;
    ObjectPool _obj_pool;
};
// Fixture for the trim-space test cases. It adds nothing to CSVScannerTest; presumably it
// exists so these cases can be instantiated separately (the instantiation is not visible
// in this part of the file).
class CSVScannerTrimSpaceTest : public CSVScannerTest {};
// Scans csv_file1 and csv_file2 with the default delimiters and checks INT, DOUBLE, VARCHAR
// and DATE columns, plus a VARCHAR(6) column whose over-long values are expected to be NULL.
TEST_P(CSVScannerTest, test_scalar_types) {
    std::vector<TypeDescriptor> types;
    types.emplace_back(TYPE_INT);
    types.emplace_back(TYPE_DOUBLE);
    types.emplace_back(TYPE_VARCHAR);
    types.emplace_back(TYPE_DATE);
    types.emplace_back(TYPE_VARCHAR);
    types[2].len = 10;
    types[4].len = 6;

    std::vector<TBrokerRangeDesc> ranges;

    TBrokerRangeDesc range_one;
    range_one.__set_path("./be/test/exec/test_data/csv_scanner/csv_file1");
    range_one.__set_start_offset(0);
    range_one.__set_num_of_columns_from_file(types.size());
    ranges.push_back(range_one);

    TBrokerRangeDesc range_second;
    range_second.__set_path("./be/test/exec/test_data/csv_scanner/csv_file2");
    range_second.__set_start_offset(0);
    range_second.__set_num_of_columns_from_file(types.size());
    ranges.push_back(range_second);

    auto scanner = create_csv_scanner(types, ranges);
    // Fatal assertion: the scanner is dereferenced on the next line.
    ASSERT_NE(scanner, nullptr);

    auto st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    auto chunk = res.value();

    // Check the second read too, so a scan failure is reported with its status message.
    auto res2 = scanner->get_next();
    ASSERT_TRUE(res2.ok()) << res2.status().to_string();
    auto chunk2 = res2.value();
    chunk->append(*chunk2);

    EXPECT_EQ(5, chunk->num_columns());
    // Rows 0..3 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(4, chunk->num_rows());

    // int column
    EXPECT_EQ(1, chunk->get(0)[0].get_int32());
    EXPECT_EQ(-1, chunk->get(1)[0].get_int32());
    EXPECT_EQ(10, chunk->get(2)[0].get_int32());
    EXPECT_EQ(10, chunk->get(3)[0].get_int32());

    // double column
    EXPECT_FLOAT_EQ(1.1, chunk->get(0)[1].get_double());
    EXPECT_FLOAT_EQ(-0.1, chunk->get(1)[1].get_double());
    EXPECT_TRUE(chunk->get(2)[1].is_null());
    EXPECT_FLOAT_EQ(10.1, chunk->get(3)[1].get_double());

    // string column
    EXPECT_EQ("apple", chunk->get(0)[2].get_slice());
    EXPECT_EQ("banana", chunk->get(1)[2].get_slice());
    EXPECT_EQ("grapefruit", chunk->get(2)[2].get_slice());
    EXPECT_EQ("orange", chunk->get(3)[2].get_slice());

    // date column
    EXPECT_EQ("2020-01-01", chunk->get(0)[3].get_date().to_string());
    EXPECT_EQ("1998-09-01", chunk->get(1)[3].get_date().to_string());
    EXPECT_EQ("2021-02-19", chunk->get(2)[3].get_date().to_string());
    EXPECT_EQ("2021-01-01", chunk->get(3)[3].get_date().to_string());

    // string column with size limit.
    // len(apple) == 5 < 6
    EXPECT_FALSE(chunk->get(0)[4].is_null());
    EXPECT_EQ("apple", chunk->get(0)[4].get_slice());
    // len(banana) == 6
    EXPECT_FALSE(chunk->get(1)[4].is_null());
    EXPECT_EQ("banana", chunk->get(1)[4].get_slice());
    // len(grapefruit) == 10 > 6
    EXPECT_TRUE(chunk->get(2)[4].is_null());
    // len(oranges) == 7 > 6
    EXPECT_TRUE(chunk->get(3)[4].is_null());

    // The scanner must have recorded file reads and the time spent on them.
    ASSERT_GT(scanner->TEST_scanner_counter()->file_read_count, 0);
    ASSERT_GT(scanner->TEST_scanner_counter()->file_read_ns, 0);
}
// Scans csv_file20 (3 columns; "," separator, trim_space on, enclose '\'', escape '\\') and
// checks which fields are NULL and which carry values.
TEST_P(CSVScannerTest, test_adaptive_nullable_column1) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_VARCHAR), TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(3);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file20");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", ",", 0, true, '\'', '\\');
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    // NOTE(review): use_v2(_use_v2) is not called in this test — confirm that is intended.

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();
    // Rows 0..3 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(4, chunk->num_rows());

    // first column
    EXPECT_TRUE(chunk->get(0)[0].is_null());
    EXPECT_EQ(2, chunk->get(1)[0].get_int32());
    EXPECT_TRUE(chunk->get(2)[0].is_null());
    EXPECT_EQ(4, chunk->get(3)[0].get_int32());

    // second column
    EXPECT_TRUE(chunk->get(0)[1].is_null());
    EXPECT_TRUE(chunk->get(1)[1].is_null());
    EXPECT_TRUE(chunk->get(2)[1].is_null());
    EXPECT_EQ("Julia", chunk->get(3)[1].get_slice());

    // third column
    EXPECT_TRUE(chunk->get(0)[2].is_null());
    EXPECT_TRUE(chunk->get(1)[2].is_null());
    EXPECT_EQ(25, chunk->get(2)[2].get_int32());
    EXPECT_EQ(25, chunk->get(3)[2].get_int32());
}
// Scans csv_file21 (3 columns; "," separator, trim_space on, enclose '\'', escape '\\').
// The first three rows are expected to be fully populated; the last row has NULLs in the
// first two columns.
TEST_P(CSVScannerTest, test_adaptive_nullable_column2) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_VARCHAR), TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(3);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file21");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", ",", 0, true, '\'', '\\');
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    // NOTE(review): use_v2(_use_v2) is not called in this test — confirm that is intended.

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();
    // Rows 0..3 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(4, chunk->num_rows());

    // first column
    EXPECT_EQ(1, chunk->get(0)[0].get_int32());
    EXPECT_EQ(2, chunk->get(1)[0].get_int32());
    EXPECT_EQ(3, chunk->get(2)[0].get_int32());
    EXPECT_TRUE(chunk->get(3)[0].is_null());

    // second column
    EXPECT_EQ("Julia", chunk->get(0)[1].get_slice());
    EXPECT_EQ("Andy", chunk->get(1)[1].get_slice());
    EXPECT_EQ("Joke", chunk->get(2)[1].get_slice());
    EXPECT_TRUE(chunk->get(3)[1].is_null());

    // third column
    EXPECT_EQ(20, chunk->get(0)[2].get_int32());
    EXPECT_EQ(21, chunk->get(1)[2].get_int32());
    EXPECT_EQ(22, chunk->get(2)[2].get_int32());
    EXPECT_EQ(25, chunk->get(3)[2].get_int32());
}
// Scans csv_file20 followed by csv_file21 with one scanner and checks the chunk returned
// for each file ("," separator, trim_space on, enclose '\'', escape '\\').
TEST_P(CSVScannerTest, test_adaptive_nullable_column3) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_VARCHAR), TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;

    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(3);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file20");
    ranges.push_back(range);

    TBrokerRangeDesc range2;
    range2.__set_num_of_columns_from_file(3);
    range2.__set_path("./be/test/exec/test_data/csv_scanner/csv_file21");
    ranges.push_back(range2);

    auto scanner = create_csv_scanner(types, ranges, "\n", ",", 0, true, '\'', '\\');
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    // NOTE(review): use_v2(_use_v2) is not called in this test — confirm that is intended.

    // ---- first chunk ----
    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto first = scanner->get_next();
    ASSERT_TRUE(first.ok()) << first.status().to_string();
    ChunkPtr chunk = first.value();
    // Rows 0..3 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(4, chunk->num_rows());

    EXPECT_TRUE(chunk->get(0)[0].is_null());
    EXPECT_EQ(2, chunk->get(1)[0].get_int32());
    EXPECT_TRUE(chunk->get(2)[0].is_null());
    EXPECT_EQ(4, chunk->get(3)[0].get_int32());

    EXPECT_TRUE(chunk->get(0)[1].is_null());
    EXPECT_TRUE(chunk->get(1)[1].is_null());
    EXPECT_TRUE(chunk->get(2)[1].is_null());
    EXPECT_EQ("Julia", chunk->get(3)[1].get_slice());

    EXPECT_TRUE(chunk->get(0)[2].is_null());
    EXPECT_TRUE(chunk->get(1)[2].is_null());
    EXPECT_EQ(25, chunk->get(2)[2].get_int32());
    EXPECT_EQ(25, chunk->get(3)[2].get_int32());

    // ---- second chunk ----
    auto second = scanner->get_next();
    ASSERT_TRUE(second.ok()) << second.status().to_string();
    chunk = second.value();
    ASSERT_EQ(4, chunk->num_rows());

    EXPECT_EQ(1, chunk->get(0)[0].get_int32());
    EXPECT_EQ(2, chunk->get(1)[0].get_int32());
    EXPECT_EQ(3, chunk->get(2)[0].get_int32());
    EXPECT_TRUE(chunk->get(3)[0].is_null());

    EXPECT_EQ("Julia", chunk->get(0)[1].get_slice());
    EXPECT_EQ("Andy", chunk->get(1)[1].get_slice());
    EXPECT_EQ("Joke", chunk->get(2)[1].get_slice());
    EXPECT_TRUE(chunk->get(3)[1].is_null());

    EXPECT_EQ(20, chunk->get(0)[2].get_int32());
    EXPECT_EQ(21, chunk->get(1)[2].get_int32());
    EXPECT_EQ(22, chunk->get(2)[2].get_int32());
    EXPECT_EQ(25, chunk->get(3)[2].get_int32());
}
// Scans csv_file14 using the multi-character row delimiter "<br>" and column separator "^^".
TEST_P(CSVScannerTest, test_multi_seprator) {
    std::vector<TypeDescriptor> types;
    types.emplace_back(TYPE_INT);
    types.emplace_back(TYPE_DOUBLE);
    types.emplace_back(TYPE_VARCHAR);
    types.emplace_back(TYPE_DATE);
    types.emplace_back(TYPE_VARCHAR);
    types[2].len = 10;
    types[4].len = 6;

    TBrokerRangeDesc file_range;
    file_range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file14");
    file_range.__set_start_offset(0);
    file_range.__set_num_of_columns_from_file(types.size());
    std::vector<TBrokerRangeDesc> ranges;
    ranges.push_back(file_range);

    auto scanner = create_csv_scanner(types, ranges, "<br>", "^^");
    EXPECT_NE(scanner, nullptr);

    auto open_status = scanner->open();
    ASSERT_TRUE(open_status.ok()) << open_status.to_string();
    scanner->use_v2(_use_v2);

    auto next = scanner->get_next();
    ASSERT_TRUE(next.ok()) << next.status().to_string();
    auto chunk = next.value();

    EXPECT_EQ(5, chunk->num_columns());
    EXPECT_EQ(2, chunk->num_rows());

    // Columns are verified in order: int, double, string, date.
    EXPECT_EQ(1, chunk->get(0)[0].get_int32());
    EXPECT_EQ(-1, chunk->get(1)[0].get_int32());

    EXPECT_FLOAT_EQ(1.1, chunk->get(0)[1].get_double());
    EXPECT_FLOAT_EQ(-0.1, chunk->get(1)[1].get_double());

    EXPECT_EQ("ap", chunk->get(0)[2].get_slice());
    EXPECT_EQ("br", chunk->get(1)[2].get_slice());

    EXPECT_EQ("2020-01-01", chunk->get(0)[3].get_date().to_string());
    EXPECT_EQ("1998-09-01", chunk->get(1)[3].get_date().to_string());
}
// Scans csv_file3 as a single ARRAY<INT> column and checks empty arrays, NULL elements and
// a NULL array.
TEST_P(CSVScannerTest, test_array_of_int) {
    TypeDescriptor t;
    t.type = TYPE_ARRAY;
    t.children.emplace_back(TYPE_INT);

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file3");
    range.__set_start_offset(0);
    range.__set_num_of_columns_from_file(1);
    ranges.push_back(range);

    auto scanner = create_csv_scanner({t}, ranges);
    // Fatal assertion: the scanner is dereferenced on the next line.
    ASSERT_NE(scanner, nullptr);

    auto st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    auto chunk = res.value();

    EXPECT_EQ(1, chunk->num_columns());
    // Rows 0..4 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(5, chunk->num_rows());

    // 1st row
    EXPECT_EQ(0, chunk->get(0)[0].get_array().size());

    // 2nd row
    EXPECT_EQ(1, chunk->get(1)[0].get_array().size());
    EXPECT_TRUE(chunk->get(1)[0].get_array()[0].is_null());

    // 3rd row
    EXPECT_TRUE(chunk->get(2)[0].is_null());

    // 4th row
    EXPECT_EQ(2, chunk->get(3)[0].get_array().size());
    EXPECT_EQ(1, chunk->get(3)[0].get_array()[0].get_int32());
    EXPECT_EQ(2, chunk->get(3)[0].get_array()[1].get_int32());

    // 5th row
    EXPECT_EQ(2, chunk->get(4)[0].get_array().size());
    EXPECT_EQ(1, chunk->get(4)[0].get_array()[0].get_int32());
    EXPECT_TRUE(chunk->get(4)[0].get_array()[1].is_null());
}
// Scans csv_file4 as a single ARRAY<VARCHAR(10)> column. The comment above each group of
// checks shows the array value that row is expected to hold.
TEST_P(CSVScannerTest, test_array_of_string) {
    // ARRAY<VARCHAR(10)>
    TypeDescriptor array_type;
    array_type.type = TYPE_ARRAY;
    array_type.children.emplace_back(TYPE_VARCHAR);
    array_type.children.back().len = 10;

    std::vector<TypeDescriptor> types;
    types.emplace_back(array_type);

    TBrokerRangeDesc file_range;
    file_range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file4");
    file_range.__set_start_offset(0);
    file_range.__set_num_of_columns_from_file(types.size());
    std::vector<TBrokerRangeDesc> ranges;
    ranges.push_back(file_range);

    auto scanner = create_csv_scanner(types, ranges);
    EXPECT_NE(scanner, nullptr);

    Status open_status = scanner->open();
    ASSERT_TRUE(open_status.ok()) << open_status.to_string();
    scanner->use_v2(_use_v2);

    auto next = scanner->get_next();
    ASSERT_TRUE(next.ok()) << next.status().to_string();
    auto chunk = next.value();

    EXPECT_EQ(1, chunk->num_columns());
    EXPECT_EQ(6, chunk->num_rows());

    // row 0: []
    EXPECT_EQ(0, chunk->get(0)[0].get_array().size());

    // row 1: [null]
    EXPECT_EQ(1, chunk->get(1)[0].get_array().size());
    EXPECT_TRUE(chunk->get(1)[0].get_array()[0].is_null());

    // row 2: \N
    EXPECT_TRUE(chunk->get(2)[0].is_null());

    // row 3: ["apple",null,"pear"]
    EXPECT_EQ(3, chunk->get(3)[0].get_array().size());
    EXPECT_EQ("apple", chunk->get(3)[0].get_array()[0].get_slice());
    EXPECT_TRUE(chunk->get(3)[0].get_array()[1].is_null());
    EXPECT_EQ("pear", chunk->get(3)[0].get_array()[2].get_slice());

    // row 4: ["str with left bracket([)","str with dot(,)"]
    EXPECT_EQ(2, chunk->get(4)[0].get_array().size());
    EXPECT_EQ("str with left bracket([)", chunk->get(4)[0].get_array()[0].get_slice());
    EXPECT_EQ("str with dot(,)", chunk->get(4)[0].get_array()[1].get_slice());

    // row 5: ["I""m hungry!",""]
    EXPECT_EQ(2, chunk->get(5)[0].get_array().size());
    EXPECT_EQ("I\"m hungry!", chunk->get(5)[0].get_array()[0].get_slice());
    EXPECT_EQ("", chunk->get(5)[0].get_array()[1].get_slice());
}
// Scans csv_file5 as a single ARRAY<DATE> column. The comment above each group of checks
// shows the array value that row is expected to hold.
TEST_P(CSVScannerTest, test_array_of_date) {
    TypeDescriptor array_type;
    array_type.type = TYPE_ARRAY;
    array_type.children.emplace_back(TYPE_DATE);

    TBrokerRangeDesc file_range;
    file_range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file5");
    file_range.__set_start_offset(0);
    file_range.__set_num_of_columns_from_file(1);
    std::vector<TBrokerRangeDesc> ranges;
    ranges.push_back(file_range);

    auto scanner = create_csv_scanner({array_type}, ranges);
    EXPECT_NE(scanner, nullptr);

    Status open_status;
    open_status = scanner->open();
    ASSERT_TRUE(open_status.ok()) << open_status.to_string();
    scanner->use_v2(_use_v2);

    auto next = scanner->get_next();
    ASSERT_TRUE(next.ok()) << next.status().to_string();
    auto chunk = next.value();

    EXPECT_EQ(1, chunk->num_columns());
    EXPECT_EQ(5, chunk->num_rows());

    // row 0: []
    EXPECT_EQ(0, chunk->get(0)[0].get_array().size());

    // row 1: [null]
    EXPECT_EQ(1, chunk->get(1)[0].get_array().size());
    EXPECT_TRUE(chunk->get(1)[0].get_array()[0].is_null());

    // row 2: \N
    EXPECT_TRUE(chunk->get(2)[0].is_null());

    // row 3: ["2020-01-01","2021-01-01"]
    EXPECT_EQ(2, chunk->get(3)[0].get_array().size());
    EXPECT_EQ("2020-01-01", chunk->get(3)[0].get_array()[0].get_date().to_string());
    EXPECT_EQ("2021-01-01", chunk->get(3)[0].get_array()[1].get_date().to_string());

    // row 4: ["2022-01-01",null]
    EXPECT_EQ(2, chunk->get(4)[0].get_array().size());
    EXPECT_EQ("2022-01-01", chunk->get(4)[0].get_array()[0].get_date().to_string());
    EXPECT_TRUE(chunk->get(4)[0].get_array()[1].is_null());
}
// Scans csv_file6 as a single ARRAY<ARRAY<INT>> column. The comment above each group of
// checks shows the value that row is expected to hold.
TEST_P(CSVScannerTest, test_nested_array_of_int) {
    TypeDescriptor t(TYPE_ARRAY);
    t.children.emplace_back(TYPE_ARRAY);
    t.children.back().children.emplace_back(TYPE_INT);

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file6");
    range.__set_start_offset(0);
    range.__set_num_of_columns_from_file(1);
    ranges.push_back(range);

    auto scanner = create_csv_scanner({t}, ranges);
    // Fatal assertion: the scanner is dereferenced on the next line.
    ASSERT_NE(scanner, nullptr);

    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();
    // Rows 0..7 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(8, chunk->num_rows());

    // []
    EXPECT_EQ(0, chunk->get(0)[0].get_array().size());

    // [[]]
    EXPECT_EQ(1, chunk->get(1)[0].get_array().size());
    EXPECT_EQ(0, chunk->get(1)[0].get_array()[0].get_array().size());

    // \N
    EXPECT_TRUE(chunk->get(2)[0].is_null());

    // [[1,2,3]]
    EXPECT_EQ(1, chunk->get(3)[0].get_array().size());
    // -> [1,2,3]
    EXPECT_EQ(3, chunk->get(3)[0].get_array()[0].get_array().size());
    EXPECT_EQ(1, chunk->get(3)[0].get_array()[0].get_array()[0].get_int32());
    EXPECT_EQ(2, chunk->get(3)[0].get_array()[0].get_array()[1].get_int32());
    EXPECT_EQ(3, chunk->get(3)[0].get_array()[0].get_array()[2].get_int32());

    // [[1],[2],[3]]
    EXPECT_EQ(3, chunk->get(4)[0].get_array().size());
    // -> [1]
    EXPECT_EQ(1, chunk->get(4)[0].get_array()[0].get_array().size());
    EXPECT_EQ(1, chunk->get(4)[0].get_array()[0].get_array()[0].get_int32());
    // -> [2]
    EXPECT_EQ(1, chunk->get(4)[0].get_array()[1].get_array().size());
    EXPECT_EQ(2, chunk->get(4)[0].get_array()[1].get_array()[0].get_int32());
    // -> [3]
    EXPECT_EQ(1, chunk->get(4)[0].get_array()[2].get_array().size());
    EXPECT_EQ(3, chunk->get(4)[0].get_array()[2].get_array()[0].get_int32());

    // [[1,2],[3]]
    EXPECT_EQ(2, chunk->get(5)[0].get_array().size());
    // -> [1,2]
    EXPECT_EQ(2, chunk->get(5)[0].get_array()[0].get_array().size());
    EXPECT_EQ(1, chunk->get(5)[0].get_array()[0].get_array()[0].get_int32());
    EXPECT_EQ(2, chunk->get(5)[0].get_array()[0].get_array()[1].get_int32());
    // -> [3]
    EXPECT_EQ(1, chunk->get(5)[0].get_array()[1].get_array().size());
    EXPECT_EQ(3, chunk->get(5)[0].get_array()[1].get_array()[0].get_int32());

    // [null]
    EXPECT_EQ(1, chunk->get(6)[0].get_array().size());
    EXPECT_TRUE(chunk->get(6)[0].get_array()[0].is_null());

    // [[null]]
    EXPECT_EQ(1, chunk->get(7)[0].get_array().size());
    EXPECT_EQ(1, chunk->get(7)[0].get_array()[0].get_array().size());
    EXPECT_TRUE(chunk->get(7)[0].get_array()[0].get_array()[0].is_null());
}
// Scans csv_file7 as a single INT column and expects all three rows to come back as NULL.
TEST_P(CSVScannerTest, test_invalid_field_as_null) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file7");
    range.__set_start_offset(0);
    range.__set_num_of_columns_from_file(types.size());
    ranges.push_back(range);

    // Pass the vector directly; wrapping it in braces only created an extra copy.
    auto scanner = create_csv_scanner(types, ranges);
    // Fatal assertion: the scanner is dereferenced on the next line.
    ASSERT_NE(scanner, nullptr);

    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Assert on the get_next() result itself. The previous check re-tested the status
    // returned by open(), so it could never detect a failed read.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..2 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(3, chunk->num_rows());
    EXPECT_TRUE(chunk->get(0)[0].is_null());
    EXPECT_TRUE(chunk->get(1)[0].is_null());
    EXPECT_TRUE(chunk->get(2)[0].is_null());
}
// Scans csv_file8 as a single ARRAY<INT> column and expects all three rows to come back as
// NULL.
TEST_P(CSVScannerTest, test_invalid_field_of_array_as_null) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_ARRAY)};
    types[0].children.emplace_back(TYPE_INT);

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file8");
    range.__set_start_offset(0);
    range.__set_num_of_columns_from_file(types.size());
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges);
    // Fatal assertion: the scanner is dereferenced on the next line.
    ASSERT_NE(scanner, nullptr);

    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..2 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(3, chunk->num_rows());
    // Boolean assertions instead of comparing the bool result with the integer 1.
    EXPECT_TRUE(chunk->get(0)[0].is_null());
    EXPECT_TRUE(chunk->get(1)[0].is_null());
    EXPECT_TRUE(chunk->get(2)[0].is_null());
}
// Scans csv_file9 with a range starting at byte offset 4 and 10 bytes long, and expects
// the rows "5|6" and "7|8".
TEST_P(CSVScannerTest, test_start_offset) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(2);
    range.__set_start_offset(4);
    range.__set_size(10);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file9");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges);
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Assert on the get_next() result itself. The previous check re-tested the status
    // returned by open(), so it could never detect a failed read.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..1 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(2, chunk->num_rows());

    EXPECT_EQ(5, chunk->get(0)[0].get_int32());
    EXPECT_EQ(7, chunk->get(1)[0].get_int32());

    EXPECT_EQ(6, chunk->get(0)[1].get_int32());
    EXPECT_EQ(8, chunk->get(1)[1].get_int32());
}
// Scans different byte ranges of csv_file9 and checks which rows each range yields,
// depending on where the range boundaries fall relative to the row delimiter.
TEST_P(CSVScannerTest, test_split_multi_scan_ranges) {
    // ----- csv_file9 -----
    // 1|2\n
    // 3|4\n
    // 5|6\n
    // 7|8\n
    // 9|0\n
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_INT)};

    // Builds a one-element range list covering [offset, offset + size) of csv_file9.
    auto make_ranges = [](int64_t offset, int64_t size) {
        TBrokerRangeDesc range;
        range.__set_num_of_columns_from_file(2);
        range.__set_start_offset(offset);
        range.__set_size(size);
        range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file9");
        return std::vector<TBrokerRangeDesc>{range};
    };

    {
        // split at row delimiter
        // 1|2\n
        //    |<- split point is '\n'
        // 3|4\n
        std::vector<TBrokerRangeDesc> ranges = make_ranges(0, 4);
        auto scanner = create_csv_scanner(types, ranges);
        Status st = scanner->open();
        ASSERT_TRUE(st.ok()) << st.to_string();
        scanner->use_v2(_use_v2);

        // Assert on the get_next() result; the old check re-tested the open() status.
        auto res = scanner->get_next();
        ASSERT_TRUE(res.ok()) << res.status().to_string();
        ChunkPtr chunk = res.value();

        ASSERT_EQ(2, chunk->num_rows());
        EXPECT_EQ("[1, 2]", chunk->debug_row(0));
        EXPECT_EQ("[3, 4]", chunk->debug_row(1));
    }
    {
        // split before row delimiter
        // 3|4\n
        // 5|6\n
        //   |<- split point is '6'
        std::vector<TBrokerRangeDesc> ranges = make_ranges(4, 7);
        auto scanner = create_csv_scanner(types, ranges);
        Status st = scanner->open();
        ASSERT_TRUE(st.ok()) << st.to_string();
        scanner->use_v2(_use_v2);

        auto res = scanner->get_next();
        ASSERT_TRUE(res.ok()) << res.status().to_string();
        ChunkPtr chunk = res.value();

        ASSERT_EQ(1, chunk->num_rows());
        EXPECT_EQ("[5, 6]", chunk->debug_row(0));
    }
    {
        // split after row delimiter
        // 7|8\n
        // 9|0\n
        // |<- split point is '9'
        std::vector<TBrokerRangeDesc> ranges = make_ranges(11, 6);
        auto scanner = create_csv_scanner(types, ranges);
        Status st = scanner->open();
        ASSERT_TRUE(st.ok()) << st.to_string();
        scanner->use_v2(_use_v2);

        auto res = scanner->get_next();
        ASSERT_TRUE(res.ok()) << res.status().to_string();
        ChunkPtr chunk = res.value();

        ASSERT_EQ(2, chunk->num_rows());
        EXPECT_EQ("[7, 8]", chunk->debug_row(0));
        EXPECT_EQ("[9, 0]", chunk->debug_row(1));
    }
    {
        // The remaining range (offset 17, 3 bytes) must yield no rows: the scanner
        // reports end-of-file.
        std::vector<TBrokerRangeDesc> ranges = make_ranges(17, 3);
        auto scanner = create_csv_scanner(types, ranges);
        Status st = scanner->open();
        ASSERT_TRUE(st.ok()) << st.to_string();
        scanner->use_v2(_use_v2);

        auto st2 = scanner->get_next();
        ASSERT_TRUE(st2.status().is_end_of_file());
    }
}
// Scans csv_file15 from offset 0 with skip_header = 4 and expects five data rows.
TEST_P(CSVScannerTest, test_skip_header) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_start_offset(0);
    range.__set_num_of_columns_from_file(2);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file15");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", "|", 4);
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..4 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(5, chunk->num_rows());

    const int expected_col0[] = {1, 3, 5, 7, 9};
    const int expected_col1[] = {2, 4, 6, 8, 0};
    for (size_t row = 0; row < 5; ++row) {
        EXPECT_EQ(expected_col0[row], chunk->get(row)[0].get_int32()) << "row " << row;
        EXPECT_EQ(expected_col1[row], chunk->get(row)[1].get_int32()) << "row " << row;
    }
}
// Scans csv_file15 with skip_header = 4 but a range that starts at byte offset 1, and
// expects eight rows to be returned.
TEST_P(CSVScannerTest, test_skip_header_start_offset_not_0) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    // the first line is not included
    range.__set_start_offset(1);
    range.__set_num_of_columns_from_file(2);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file15");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", "|", 4);
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..7 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(8, chunk->num_rows());

    const int expected_col0[] = {33, 55, 77, 1, 3, 5, 7, 9};
    const int expected_col1[] = {44, 66, 88, 2, 4, 6, 8, 0};
    for (size_t row = 0; row < 8; ++row) {
        EXPECT_EQ(expected_col0[row], chunk->get(row)[0].get_int32()) << "row " << row;
        EXPECT_EQ(expected_col1[row], chunk->get(row)[1].get_int32()) << "row " << row;
    }
}
// Scans csv_file16 with trim_space on and enclose '"'. The expected VARCHAR values still
// contain a trailing ("aa ") or leading (" bb") space.
TEST_P(CSVScannerTrimSpaceTest, test_trim_space) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_VARCHAR)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(2);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file16");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", "|", 0, true, '"');
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..1 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(2, chunk->num_rows());

    EXPECT_EQ(1, chunk->get(0)[0].get_int32());
    EXPECT_EQ(3, chunk->get(1)[0].get_int32());

    EXPECT_EQ("aa ", chunk->get(0)[1].get_slice());
    EXPECT_EQ(" bb", chunk->get(1)[1].get_slice());
}
// Scans csv_file19 with "," separator, trim_space on, enclose '\'' and escape '\\', and
// checks the three columns of the four expected rows.
TEST_P(CSVScannerTrimSpaceTest, test_trim_space_with_ENCLOSE) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_VARCHAR), TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(3);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file19");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", ",", 0, true, '\'', '\\');
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    // NOTE(review): use_v2(_use_v2) is not called in this test — confirm that is intended.

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..3 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(4, chunk->num_rows());

    // first column
    EXPECT_EQ(1, chunk->get(0)[0].get_int32());
    EXPECT_EQ(2, chunk->get(1)[0].get_int32());
    EXPECT_EQ(3, chunk->get(2)[0].get_int32());
    EXPECT_EQ(4, chunk->get(3)[0].get_int32());

    // second column
    EXPECT_EQ(" Lily, asdf\n\n\nsafsdfaasfsdfa23'1111111 ", chunk->get(0)[1].get_slice());
    EXPECT_EQ(" Ro 'se", chunk->get(1)[1].get_slice());
    EXPECT_EQ("Al i ce", chunk->get(2)[1].get_slice());
    EXPECT_EQ("Julia", chunk->get(3)[1].get_slice());

    // third column
    EXPECT_EQ(24, chunk->get(0)[2].get_int32());
    EXPECT_EQ(23, chunk->get(1)[2].get_int32());
    EXPECT_EQ(24, chunk->get(2)[2].get_int32());
    EXPECT_EQ(25, chunk->get(3)[2].get_int32());
}
// Scans csv_file17 with "|" separator, trim_space on, enclose '"' and escape '\\', and
// checks the three columns of the seven expected rows.
TEST_P(CSVScannerTest, test_ENCLOSE) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_VARCHAR),
                                      TypeDescriptor(TYPE_VARCHAR)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(3);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file17");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", "|", 0, true, '"', '\\');
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    // NOTE(review): use_v2(_use_v2) is not called in this test — confirm that is intended.

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..6 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(7, chunk->num_rows());

    // first column
    EXPECT_EQ(1, chunk->get(0)[0].get_int32());
    EXPECT_EQ(3, chunk->get(1)[0].get_int32());
    EXPECT_EQ(5, chunk->get(2)[0].get_int32());
    EXPECT_EQ(7, chunk->get(3)[0].get_int32());
    EXPECT_EQ(9, chunk->get(4)[0].get_int32());
    EXPECT_EQ(11, chunk->get(5)[0].get_int32());
    EXPECT_EQ(13, chunk->get(6)[0].get_int32());

    // second column
    EXPECT_EQ("aa", chunk->get(0)[1].get_slice());
    EXPECT_EQ("bb|BB", chunk->get(1)[1].get_slice());
    EXPECT_EQ("cc\nadf,1,3455", chunk->get(2)[1].get_slice());
    EXPECT_EQ("dd", chunk->get(3)[1].get_slice());
    EXPECT_EQ("\"ee\"", chunk->get(4)[1].get_slice());
    EXPECT_EQ("", chunk->get(5)[1].get_slice());
    EXPECT_EQ("\"cd\"", chunk->get(6)[1].get_slice());

    // third column
    EXPECT_EQ("abc", chunk->get(0)[2].get_slice());
    EXPECT_EQ("", chunk->get(1)[2].get_slice());
    EXPECT_EQ("e", chunk->get(2)[2].get_slice());
    EXPECT_EQ("abc|ef\ngh", chunk->get(3)[2].get_slice());
    EXPECT_EQ("", chunk->get(4)[2].get_slice());
    EXPECT_EQ("ab", chunk->get(5)[2].get_slice());
    EXPECT_EQ("ab\"c", chunk->get(6)[2].get_slice());
}
// Scans csv_file18 with "|" separator, trim_space on, enclose '"' and escape '\\', and
// checks the three columns of the five expected rows.
TEST_P(CSVScannerTest, test_ESCAPE) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_VARCHAR),
                                      TypeDescriptor(TYPE_VARCHAR)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(3);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file18");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", "|", 0, true, '"', '\\');
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    // NOTE(review): use_v2(_use_v2) is not called in this test — confirm that is intended.

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..4 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(5, chunk->num_rows());

    // first column
    EXPECT_EQ(1, chunk->get(0)[0].get_int32());
    EXPECT_EQ(3, chunk->get(1)[0].get_int32());
    EXPECT_EQ(5, chunk->get(2)[0].get_int32());
    EXPECT_EQ(7, chunk->get(3)[0].get_int32());
    EXPECT_EQ(9, chunk->get(4)[0].get_int32());

    // second column
    EXPECT_EQ("\"aa\"", chunk->get(0)[1].get_slice());
    EXPECT_EQ("bb|BB", chunk->get(1)[1].get_slice());
    EXPECT_EQ("cc\nadf,1,3455", chunk->get(2)[1].get_slice());
    EXPECT_EQ("dd", chunk->get(3)[1].get_slice());
    EXPECT_EQ("\\ee", chunk->get(4)[1].get_slice());

    // third column
    EXPECT_EQ("abc\"", chunk->get(0)[2].get_slice());
    EXPECT_EQ("", chunk->get(1)[2].get_slice());
    EXPECT_EQ("\"e", chunk->get(2)[2].get_slice());
    EXPECT_EQ("abc|ef\ngh", chunk->get(3)[2].get_slice());
    EXPECT_EQ("", chunk->get(4)[2].get_slice());
}
// Scans csv_file10 (presumably a file whose last record has no trailing row delimiter, as
// the test name suggests) and expects all five rows to be returned.
// NOTE(review): the test name looks like a search/replace accident ("test_file_..."); it is
// left unchanged because it is the identifier used by test filters.
TEST_P(CSVScannerTest, TEST_Pile_not_ended_with_record_delimiter) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT), TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_start_offset(0);
    range.__set_num_of_columns_from_file(types.size());
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file10");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges);
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Check the read before unwrapping it, so a scan failure is reported with its status.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Rows 0..4 are indexed below, so a wrong row count must stop the test here.
    ASSERT_EQ(5, chunk->num_rows());

    const int expected_col0[] = {1, 3, 5, 7, 9};
    const int expected_col1[] = {2, 4, 6, 8, 0};
    for (size_t row = 0; row < 5; ++row) {
        EXPECT_EQ(expected_col0[row], chunk->get(row)[0].get_int32()) << "row " << row;
        EXPECT_EQ(expected_col1[row], chunk->get(row)[1].get_int32()) << "row " << row;
    }
}
// Scans csv_file11, whose records are made of large VARCHAR fields.
//
// The assertions require the fixture to contain kNumRecords records of
// |field_count| fields each; every field must be |field_length| bytes long,
// with its second-to-last character equal to the row index digit and its last
// character equal to the column index digit.
//
// The fixture is read from disk, so no in-memory copy of its content is built
// here.
TEST_P(CSVScannerTest, test_large_record_size) {
    constexpr size_t record_length = 65533 * 5;
    constexpr size_t field_length = 65533;
    constexpr size_t field_count = (record_length + field_length - 1) / field_length;
    constexpr int kNumRecords = 5;

    TypeDescriptor large_varchar_type;
    large_varchar_type.type = TYPE_VARCHAR;
    large_varchar_type.len = field_length;
    std::vector<TypeDescriptor> types(field_count, large_varchar_type);

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file11");
    range.__set_start_offset(0);
    range.__set_num_of_columns_from_file(types.size());
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges);
    // Fatal on purpose: the scanner is dereferenced right below.
    ASSERT_NE(scanner, nullptr);

    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Fail the test (instead of crashing on value()) when the scan itself fails.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    EXPECT_EQ(kNumRecords, chunk->num_rows());
    EXPECT_EQ(field_count, chunk->num_columns());

    for (int col = 0; col < chunk->num_columns(); col++) {
        const auto& column = chunk->get_column_by_index(col);
        for (int row = 0; row < chunk->num_rows(); row++) {
            auto datum = column->get(row);
            auto s = datum.get_slice();
            ASSERT_EQ(field_length, s.size);
            // The two-character suffix encodes (row digit, column digit).
            ASSERT_EQ(row % 10, s[s.size - 2] - '0');
            ASSERT_EQ(col % 10, s[s.size - 1] - '0');
        }
    }
}
// Scans csv_file12 with VARCHAR columns sized at TypeDescriptor::MAX_VARCHAR_LENGTH
// and expects get_next() to report an error.
// NOTE(review): the fixture presumably holds a single record whose total
// length exceeds TypeDescriptor::MAX_VARCHAR_LENGTH — confirm against the
// fixture file. It is read from disk, so no in-memory copy is built here.
TEST_P(CSVScannerTest, test_record_length_exceed_limit) {
    constexpr size_t record_length = TypeDescriptor::MAX_VARCHAR_LENGTH;
    constexpr size_t field_length = TypeDescriptor::MAX_VARCHAR_LENGTH;
    constexpr size_t field_count = (record_length + field_length - 1) / field_length;

    TypeDescriptor large_varchar_type;
    large_varchar_type.type = TYPE_VARCHAR;
    large_varchar_type.len = field_length;
    std::vector<TypeDescriptor> types(field_count, large_varchar_type);

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file12");
    range.__set_start_offset(0);
    range.__set_num_of_columns_from_file(types.size());
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges);
    // Fatal on purpose: the scanner is dereferenced right below.
    ASSERT_NE(scanner, nullptr);

    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Opening succeeds; the failure is expected when the record is read.
    auto res = scanner->get_next();
    EXPECT_FALSE(res.ok());
}
// Scans csv_file13 with a single column of several different types and expects
// the very first get_next() to report end-of-file for each of them.
TEST_P(CSVScannerTest, test_empty) {
    // Performs one scan of csv_file13 using a single column of |column_type|.
    const auto expect_eof_for = [this](LogicalType column_type) {
        std::vector<TypeDescriptor> types;
        types.emplace_back(column_type);
        const bool is_string_type = (column_type == TYPE_VARCHAR) || (column_type == TYPE_CHAR);
        if (is_string_type) {
            types.front().len = 10;
        }

        TBrokerRangeDesc range;
        range.__set_start_offset(0);
        range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file13");
        range.__set_num_of_columns_from_file(types.size());
        std::vector<TBrokerRangeDesc> ranges;
        ranges.push_back(range);

        auto scanner = create_csv_scanner(types, ranges);
        ASSERT_TRUE(scanner->open().ok());
        scanner->use_v2(_use_v2);

        auto result = scanner->get_next();
        ASSERT_TRUE(result.status().is_end_of_file());
    };

    // A fatal failure inside the helper only ends that one invocation, so the
    // remaining types are still exercised.
    for (LogicalType column_type : {TYPE_VARCHAR, TYPE_CHAR, TYPE_INT, TYPE_DATE, TYPE_DATETIME}) {
        expect_eof_for(column_type);
    }
}
// 21431,"Rowdy" Roddy Piper, Superstar,,,,1,-999
// Scans the single record of csv_file22 with ',' as the column separator,
// '"' as the enclose character and '\\' as the escape character. The second
// field starts with an enclosed word followed by more text, and the test
// expects the quotes to be kept verbatim in the parsed value.
TEST_P(CSVScannerTest, test_enclose_fanatics) {
    std::vector<TypeDescriptor> types{TypeDescriptor(TYPE_INT),     TypeDescriptor(TYPE_VARCHAR),
                                      TypeDescriptor(TYPE_VARCHAR), TypeDescriptor(TYPE_INT),
                                      TypeDescriptor(TYPE_INT),     TypeDescriptor(TYPE_INT),
                                      TypeDescriptor(TYPE_INT),     TypeDescriptor(TYPE_INT)};

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_num_of_columns_from_file(types.size());
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file22");
    ranges.push_back(range);

    auto scanner = create_csv_scanner(types, ranges, "\n", ",", 0, true, '"', '\\');
    Status st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();

    // Fail the test (instead of crashing on value()) when the scan itself fails.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    // Fatal on purpose: row 0 and columns 0..7 are indexed below.
    ASSERT_EQ(1, chunk->num_rows());
    ASSERT_EQ(8, chunk->num_columns());

    EXPECT_EQ(21431, chunk->get(0)[0].get_int32());
    EXPECT_EQ("\"Rowdy\" Roddy Piper", chunk->get(0)[1].get_slice());
    EXPECT_EQ("Superstar", chunk->get(0)[2].get_slice());
    // The three empty INT fields are expected to come back as NULL.
    EXPECT_TRUE(chunk->get(0)[3].is_null());
    EXPECT_TRUE(chunk->get(0)[4].is_null());
    EXPECT_TRUE(chunk->get(0)[5].is_null());
    EXPECT_EQ(1, chunk->get(0)[6].get_int32());
    EXPECT_EQ(-999, chunk->get(0)[7].get_int32());
}
// Scans csv_file1 with four target columns and expects every row to be
// rejected: get_next() must report end-of-file and the first line of the error
// log must mention the column count mismatch.
TEST_P(CSVScannerTest, test_column_count_inconsistent) {
    std::vector<TypeDescriptor> types;
    types.emplace_back(TYPE_INT);
    types.emplace_back(TYPE_DOUBLE);
    types.emplace_back(TYPE_VARCHAR);
    types.emplace_back(TYPE_DATE);
    types[2].len = 10;

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range_one;
    range_one.__set_path("./be/test/exec/test_data/csv_scanner/csv_file1");
    range_one.__set_start_offset(0);
    range_one.__set_num_of_columns_from_file(types.size());
    ranges.push_back(range_one);

    auto scanner = create_csv_scanner(types, ranges);
    // Fatal on purpose: the scanner is dereferenced right below.
    ASSERT_NE(scanner, nullptr);

    auto st = scanner->open();
    ASSERT_TRUE(st.ok()) << st.to_string();
    scanner->use_v2(_use_v2);

    // Redirect the runtime state's error log into a local file for the scan.
    auto log_file_path = "test_column_count_inconsistent_error_log_file";
    std::ofstream wfile(log_file_path, std::ofstream::out);
    scanner->TEST_runtime_state()->_error_log_file = &wfile;

    auto res = scanner->get_next();

    // Detach the stack-allocated stream before any fatal assertion can return
    // from this function; otherwise the runtime state would keep pointing at
    // |wfile| after it has been destroyed.
    wfile.close();
    scanner->TEST_runtime_state()->_error_log_file = nullptr;

    // Read the first logged line, then remove the log file, so nothing is left
    // on disk even when an assertion below fails.
    std::string line;
    {
        std::ifstream rfile(log_file_path, std::ifstream::in);
        std::getline(rfile, line);
    }
    (void)fs::remove(log_file_path);

    ASSERT_TRUE(res.status().is_end_of_file()) << res.status().to_string();
    auto found = line.find("Target column count: 4 doesn't match source value column count: 5");
    ASSERT_TRUE(found != std::string::npos) << line;
}
// Checks the schema derived by get_schema() from csv_file23 under different
// sampling / header / enclose settings.
TEST_P(CSVScannerTest, test_get_schema) {
    using ExpectedSchema = std::vector<std::pair<std::string, LogicalType>>;

    // Allocates scan params (owned by _obj_pool) with the settings shared by
    // every case: '\n' row delimiter, ',' column separator and the given
    // number of sampled rows.
    auto make_params = [this](int sample_row_count) {
        TBrokerScanRangeParams* params = _obj_pool.add(new TBrokerScanRangeParams());
        params->__set_row_delimiter('\n');
        params->__set_column_separator(',');
        params->__set_schema_sample_file_row_count(sample_row_count);
        return params;
    };

    // Opens a scanner over csv_file23 with |params| and compares the derived
    // schema (column names and types) with |expected_schema|.
    auto verify_schema = [this](TBrokerScanRangeParams* params, const ExpectedSchema& expected_schema) {
        std::vector<TBrokerRangeDesc> ranges;
        TBrokerRangeDesc range;
        range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file23");
        range.__set_num_of_columns_from_file(0);
        ranges.push_back(range);

        auto scanner = create_csv_scanner({}, ranges, params);
        EXPECT_OK(scanner->open());

        std::vector<SlotDescriptor> schema;
        EXPECT_OK(scanner->get_schema(&schema));

        // Fatal (for this case only): the loop indexes both vectors by the
        // same position, so a size mismatch must not reach it.
        ASSERT_EQ(expected_schema.size(), schema.size());
        for (size_t i = 0; i < schema.size(); i++) {
            EXPECT_EQ(expected_schema[i].first, schema[i].col_name());
            EXPECT_EQ(expected_schema[i].second, schema[i].type().type) << schema[i].col_name();
        }
    };

    {
        // sample 1 row
        verify_schema(make_params(1),
                      {{"$1", TYPE_BIGINT}, {"$2", TYPE_DOUBLE}, {"$3", TYPE_DOUBLE}, {"$4", TYPE_BOOLEAN}});
    }

    {
        // sample 2 row
        verify_schema(make_params(2), {{"$1", TYPE_BIGINT},
                                       {"$2", TYPE_VARCHAR},
                                       {"$3", TYPE_VARCHAR},
                                       {"$4", TYPE_VARCHAR},
                                       {"$5", TYPE_BOOLEAN}});
    }

    {
        // sample 1 row, skip header 1, enclose ", escape "\"
        TBrokerScanRangeParams* params = make_params(1);
        params->__set_skip_header(1);
        params->__set_enclose('"');
        params->__set_escape('\\');
        verify_schema(params,
                      {{"$1", TYPE_BIGINT}, {"$2", TYPE_VARCHAR}, {"$3", TYPE_DOUBLE}, {"$4", TYPE_BOOLEAN}});
    }
}
// Scans csv_file1 with flexible column mapping enabled and six target columns,
// the last of which has no counterpart in the file. The test expects three
// rows in which that last column is NULL.
TEST_P(CSVScannerTest, test_flexible_column_mapping) {
    std::vector<TypeDescriptor> types;
    types.emplace_back(TYPE_BIGINT);
    types.emplace_back(TYPE_DOUBLE);
    types.emplace_back(TYPE_VARCHAR);
    types.emplace_back(TYPE_VARCHAR);
    types.emplace_back(TYPE_VARCHAR);
    // not existing column
    types.emplace_back(TYPE_INT);

    std::vector<TBrokerRangeDesc> ranges;
    TBrokerRangeDesc range;
    range.__set_start_offset(0);
    range.__set_path("./be/test/exec/test_data/csv_scanner/csv_file1");
    range.__set_num_of_columns_from_file(types.size());
    ranges.push_back(range);

    TBrokerScanRangeParams* params = _obj_pool.add(new TBrokerScanRangeParams());
    params->__set_row_delimiter('\n');
    params->__set_column_separator('|');
    params->__set_flexible_column_mapping(true);

    auto scanner = create_csv_scanner(types, ranges, params);
    scanner->use_v2(_use_v2);
    EXPECT_OK(scanner->open());

    // Fail the test (instead of crashing on value()) when the scan itself fails.
    auto res = scanner->get_next();
    ASSERT_TRUE(res.ok()) << res.status().to_string();
    ChunkPtr chunk = res.value();

    EXPECT_EQ(6, chunk->num_columns());
    // Fatal on purpose: rows 0..2 are rendered below.
    ASSERT_EQ(3, chunk->num_rows());

    EXPECT_EQ("[1, 1.1, 'apple', '2020-01-01', 'apple', NULL]", chunk->debug_row(0));
    EXPECT_EQ("[-1, -0.1, 'banana', '1998-09-01', 'banana', NULL]", chunk->debug_row(1));
    EXPECT_EQ("[10, NULL, 'grapefruit', '2021-02-19', 'grapefruit', NULL]", chunk->debug_row(2));
}
// Asks get_schema() to skip more header lines than small.csv contains and
// checks that it fails with a descriptive end-of-file message and produces no
// schema.
TEST_P(CSVScannerTest, test_skip_headers) {
    TBrokerScanRangeParams* scan_params = _obj_pool.add(new TBrokerScanRangeParams());
    scan_params->__set_row_delimiter('\n');
    // there are only 2 rows within file small.csv
    // if we set skip first 3 line, we expect to get a clear error message
    scan_params->__set_skip_header(3);
    scan_params->__set_column_separator(',');
    scan_params->__set_enclose('"');
    scan_params->__set_escape('\\');

    TBrokerRangeDesc file_range;
    file_range.__set_path("./be/test/exec/test_data/csv_scanner/small.csv");
    file_range.__set_num_of_columns_from_file(0);
    std::vector<TBrokerRangeDesc> ranges;
    ranges.push_back(file_range);

    auto scanner = create_csv_scanner({}, ranges, scan_params);
    EXPECT_OK(scanner->open());

    std::vector<SlotDescriptor> slots;
    auto status = scanner->get_schema(&slots);

    EXPECT_FALSE(status.ok());
    EXPECT_EQ(0, slots.size());
    EXPECT_EQ(status.to_string(false),
              "End of file: The parameter 'skip_header' is set to 3, but there are only 2 rows in the csv file");
}
// Instantiates every CSVScannerTest case with the parameter values true and
// false; the fixture's SetUp() stores the parameter in _use_v2.
INSTANTIATE_TEST_CASE_P(CSVScannerTestParams, CSVScannerTest, Values(true, false));
// CSVScannerTrimSpaceTest cases are instantiated with the single value true.
INSTANTIATE_TEST_CASE_P(CSVScannerTestParams, CSVScannerTrimSpaceTest, Values(true));
} // namespace starrocks