[BugFix] optional map key for parquet (#28296)
Fixes #32772. If a Parquet file is generated by Hive, the map key may be declared optional, which Arrow does not allow. This PR adds a patch to Arrow that removes this restriction.
This commit is contained in:
parent
600a65ceb3
commit
74f3311fa9
|
|
@ -90,9 +90,10 @@ Status ParquetReaderWrap::_init_parquet_reader() {
|
|||
parquet::ParquetFileReader::Open(_parquet, _properties),
|
||||
arrow_reader_properties, &_reader);
|
||||
if (!st.ok()) {
|
||||
LOG(WARNING) << "Failed to create parquet file reader. error: " << st.ToString()
|
||||
<< ", filename: " << _filename;
|
||||
return Status::InternalError(fmt::format("Failed to create file reader. filename: {}", _filename));
|
||||
std::ostringstream oss;
|
||||
oss << "Failed to create parquet file reader. error: " << st.ToString() << ", filename: " << _filename;
|
||||
LOG(INFO) << oss.str();
|
||||
return Status::InternalError(oss.str());
|
||||
}
|
||||
|
||||
if (!_reader || !_reader->parquet_reader()) {
|
||||
|
|
|
|||
|
|
@ -224,7 +224,9 @@ class ParquetScannerTest : public ::testing::Test {
|
|||
{"issue_17693_c0", TypeDescriptor::create_array_type(TypeDescriptor::from_logical_type(TYPE_VARCHAR))},
|
||||
{"issue_17822_c0", TypeDescriptor::create_array_type(TypeDescriptor::from_logical_type(TYPE_VARCHAR))},
|
||||
{"nested_array_c0", TypeDescriptor::create_array_type(TypeDescriptor::create_array_type(
|
||||
TypeDescriptor::from_logical_type(TYPE_VARCHAR)))}};
|
||||
TypeDescriptor::from_logical_type(TYPE_VARCHAR)))},
|
||||
{"col_map", TypeDescriptor::create_map_type(TypeDescriptor::create_varchar_type(1048576),
|
||||
TypeDescriptor::create_varchar_type(1048576))}};
|
||||
SlotTypeDescInfoArray slot_infos;
|
||||
slot_infos.reserve(column_names.size());
|
||||
for (auto& name : column_names) {
|
||||
|
|
@ -764,4 +766,47 @@ TEST_F(ParquetScannerTest, datetime) {
|
|||
}
|
||||
}
|
||||
|
||||
// Reads optional_map_key.parquet one column at a time and compares the debug
// rendering of every row with a hard-coded expectation list.
TEST_F(ParquetScannerTest, optional_map_key) {
    const std::string parquet_file_name = test_exec_dir + "/test_data/parquet_data/optional_map_key.parquet";

    // Each case is (column name, expected debug_item() string for every row).
    // Both expectation lists below hold 18 entries.
    const std::vector<std::tuple<std::string, std::vector<std::string>>> test_cases = {
            {"col_int", {"1", "2", "6", "3", "4", "5", "7", "8", "9", "1", "2", "3", "4", "5", "7", "8", "9", "6"}},
            {"col_map",
             {R"({" ":" "})",
              R"({" aAbBcC":" aAbBcC"})",
              R"("你好,中国!":null})",
              R"({"aAbBcC ":"aAbBcC "})",
              R"({" aAbBcCdDeE ":" aAbBcCdDeE "})",
              R"({"null":null})",
              R"({" ":" "})",
              R"({"Hello, world!你好":"Hello, world!你好"})",
              R"({"Total MapReduce CPU Time Spent: 2 seconds 120 msec":"Total MapReduce CPU Time Spent: 2 seconds 120 msec"})",
              R"({" ":" "})",
              R"({" aAbBcC":" aAbBcC"})",
              R"({"aAbBcC ":"aAbBcC "})",
              R"({" aAbBcCdDeE ":" aAbBcCdDeE "})",
              R"({"null":null})",
              R"({" ":" "})",
              R"({"Hello, world!你好":"Hello, world!你好"})",
              R"({"Total MapReduce CPU Time Spent: 2 seconds 120 msec":"Total MapReduce CPU Time Spent: 2 seconds 120 msec"})",
              R"("你好,中国!":null})"}}};

    // No slot expressions are registered; the map is handed to get_chunk empty.
    std::unordered_map<size_t, TExpr> slot_map;

    for (const auto& [column_name, expected] : test_cases) {
        std::vector<std::string> column_names{column_name};

        // NOTE(review): the trailing 18 presumably is the expected row count
        // checked inside get_chunk -- confirm against the helper's signature.
        ChunkPtr chunk = get_chunk<true>(column_names, slot_map, parquet_file_name, 18);
        ASSERT_EQ(1, chunk->num_columns());

        auto col = chunk->columns()[0];
        // Abort this test before indexing `expected` if the row count differs:
        // this prevents an out-of-bounds read when the column is longer and a
        // silently truncated comparison when it is shorter.
        ASSERT_EQ(expected.size(), col->size());

        for (size_t i = 0; i < col->size(); ++i) {
            const std::string result = col->debug_item(i);
            const std::string& expect = expected[i];
            EXPECT_EQ(expect, result);
        }
    }
}
|
||||
|
||||
} // namespace starrocks
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -485,6 +485,7 @@ if [[ -d $TP_SOURCE_DIR/$ARROW_SOURCE ]] ; then
|
|||
patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-force-use-external-jemalloc.patch
|
||||
# fix exception handling
|
||||
patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-fix-exception-handling.patch
|
||||
patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-parquet-map-key.patch
|
||||
touch $PATCHED_MARK
|
||||
fi
|
||||
cd -
|
||||
|
|
|
|||
|
|
@ -0,0 +1,32 @@
|
|||
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
|
||||
index eb7fd628d..e61d5ac3d 100644
|
||||
--- a/cpp/src/parquet/arrow/schema.cc
|
||||
+++ b/cpp/src/parquet/arrow/schema.cc
|
||||
@@ -542,10 +542,24 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
|
||||
return Status::Invalid("Key-value map node must have 1 or 2 child elements. Found: ",
|
||||
key_value.field_count());
|
||||
}
|
||||
- const Node& key_node = *key_value.field(0);
|
||||
- if (!key_node.is_required()) {
|
||||
- return Status::Invalid("Map keys must be annotated as required.");
|
||||
+ /*
|
||||
+The map key generated by hive may be optional.
|
||||
+
|
||||
+required group field_id=-1 hive_schema {
|
||||
+ optional int32 field_id=-1 col_int;
|
||||
+ optional group field_id=-1 col_map (Map) {
|
||||
+ repeated group field_id=-1 map (Map) {
|
||||
+ optional byte_array field_id=-1 key (String);
|
||||
+ optional byte_array field_id=-1 value (String);
|
||||
+ }
|
||||
}
|
||||
+}
|
||||
+
|
||||
+const Node& key_node = *key_value.field(0);
|
||||
+if (!key_node.is_required()) {
|
||||
+ return Status::Invalid("Map keys must be annotated as required.");
|
||||
+}
|
||||
+*/
|
||||
// Arrow doesn't support 1 column maps (i.e. Sets). The options are to either
|
||||
// make the values column nullable, or process the map as a list. We choose the latter
|
||||
// as it is simpler.
|
||||
Loading…
Reference in New Issue