[BugFix] optional map key for parquet (#28296)

Fixes #32772
If a Parquet file is generated by Hive, the map key may be declared optional, which Arrow does not allow.
This PR adds a patch to Arrow that removes this restriction.
This commit is contained in:
ricky 2023-10-17 17:26:53 +08:00 committed by GitHub
parent 600a65ceb3
commit 74f3311fa9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 83 additions and 4 deletions

View File

@ -90,9 +90,10 @@ Status ParquetReaderWrap::_init_parquet_reader() {
parquet::ParquetFileReader::Open(_parquet, _properties),
arrow_reader_properties, &_reader);
if (!st.ok()) {
LOG(WARNING) << "Failed to create parquet file reader. error: " << st.ToString()
<< ", filename: " << _filename;
return Status::InternalError(fmt::format("Failed to create file reader. filename: {}", _filename));
std::ostringstream oss;
oss << "Failed to create parquet file reader. error: " << st.ToString() << ", filename: " << _filename;
LOG(INFO) << oss.str();
return Status::InternalError(oss.str());
}
if (!_reader || !_reader->parquet_reader()) {

View File

@ -224,7 +224,9 @@ class ParquetScannerTest : public ::testing::Test {
{"issue_17693_c0", TypeDescriptor::create_array_type(TypeDescriptor::from_logical_type(TYPE_VARCHAR))},
{"issue_17822_c0", TypeDescriptor::create_array_type(TypeDescriptor::from_logical_type(TYPE_VARCHAR))},
{"nested_array_c0", TypeDescriptor::create_array_type(TypeDescriptor::create_array_type(
TypeDescriptor::from_logical_type(TYPE_VARCHAR)))}};
TypeDescriptor::from_logical_type(TYPE_VARCHAR)))},
{"col_map", TypeDescriptor::create_map_type(TypeDescriptor::create_varchar_type(1048576),
TypeDescriptor::create_varchar_type(1048576))}};
SlotTypeDescInfoArray slot_infos;
slot_infos.reserve(column_names.size());
for (auto& name : column_names) {
@ -764,4 +766,47 @@ TEST_F(ParquetScannerTest, datetime) {
}
}
// Scans optional_map_key.parquet one column at a time and checks that every row
// of the resulting column renders (via debug_item) to the expected text.
TEST_F(ParquetScannerTest, optional_map_key) {
    const std::string parquet_file_name = test_exec_dir + "/test_data/parquet_data/optional_map_key.parquet";
    // Each case pairs a column name with the expected debug_item() text of each row.
    const std::vector<std::tuple<std::string, std::vector<std::string>>> test_cases = {
            {"col_int", {"1", "2", "6", "3", "4", "5", "7", "8", "9", "1", "2", "3", "4", "5", "7", "8", "9", "6"}},
            {"col_map",
             {R"({" ":" "})",
              R"({" aAbBcC":" aAbBcC"})",
              // NOTE(review): this expectation (and the last one below) has no opening '{',
              // unlike every other row -- confirm it matches the real debug_item() output.
              R"("":null})",
              R"({"aAbBcC ":"aAbBcC "})",
              R"({" aAbBcCdDeE ":" aAbBcCdDeE "})",
              R"({"null":null})",
              R"({" ":" "})",
              R"({"Hello, world!":"Hello, world!"})",
              R"({"Total MapReduce CPU Time Spent: 2 seconds 120 msec":"Total MapReduce CPU Time Spent: 2 seconds 120 msec"})",
              R"({" ":" "})",
              R"({" aAbBcC":" aAbBcC"})",
              R"({"aAbBcC ":"aAbBcC "})",
              R"({" aAbBcCdDeE ":" aAbBcCdDeE "})",
              R"({"null":null})",
              R"({" ":" "})",
              R"({"Hello, world!":"Hello, world!"})",
              R"({"Total MapReduce CPU Time Spent: 2 seconds 120 msec":"Total MapReduce CPU Time Spent: 2 seconds 120 msec"})",
              R"("":null})"}}};
    std::unordered_map<size_t, TExpr> slot_map;
    for (const auto& [column_name, expected] : test_cases) {
        std::vector<std::string> column_names{column_name};
        ChunkPtr chunk = get_chunk<true>(column_names, slot_map, parquet_file_name, 18);
        ASSERT_EQ(1, chunk->num_columns());
        auto col = chunk->columns()[0];
        // Stop here on a row-count mismatch: indexing `expected` by the column's
        // size would otherwise read past the end of the vector.
        ASSERT_EQ(expected.size(), col->size());
        for (size_t i = 0; i < expected.size(); ++i) {
            EXPECT_EQ(expected[i], col->debug_item(i));
        }
    }
}
} // namespace starrocks

View File

@ -485,6 +485,7 @@ if [[ -d $TP_SOURCE_DIR/$ARROW_SOURCE ]] ; then
patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-force-use-external-jemalloc.patch
# fix exception handling
patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-fix-exception-handling.patch
# allow optional map keys in parquet files (e.g. files generated by hive)
patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-parquet-map-key.patch
touch $PATCHED_MARK
fi
cd -

View File

@ -0,0 +1,32 @@
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index eb7fd628d..e61d5ac3d 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -542,10 +542,24 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
return Status::Invalid("Key-value map node must have 1 or 2 child elements. Found: ",
key_value.field_count());
}
- const Node& key_node = *key_value.field(0);
- if (!key_node.is_required()) {
- return Status::Invalid("Map keys must be annotated as required.");
+ /*
+The map key generated by hive may be optional.
+
+required group field_id=-1 hive_schema {
+ optional int32 field_id=-1 col_int;
+ optional group field_id=-1 col_map (Map) {
+ repeated group field_id=-1 map (Map) {
+ optional byte_array field_id=-1 key (String);
+ optional byte_array field_id=-1 value (String);
+ }
}
+}
+
+const Node& key_node = *key_value.field(0);
+if (!key_node.is_required()) {
+ return Status::Invalid("Map keys must be annotated as required.");
+}
+*/
// Arrow doesn't support 1 column maps (i.e. Sets). The options are to either
// make the values column nullable, or process the map as a list. We choose the latter
// as it is simpler.