[BugFix] optional map key for parquet (#28296)

Fixes #32772. If a Parquet file is generated by Hive, the map key may be declared optional, which Arrow does not allow. This PR adds a patch to Arrow that removes this restriction.
2023-10-17 17:26:53 +08:00 · 2023-10-17 17:26:53 +08:00 · 74f3311fa9
parent 600a65ceb3
commit 74f3311fa9
5 changed files with 83 additions and 4 deletions
--- a/be/src/exec/parquet_reader.cpp
+++ b/be/src/exec/parquet_reader.cpp
@ -90,9 +90,10 @@ Status ParquetReaderWrap::_init_parquet_reader() {
                                                   parquet::ParquetFileReader::Open(_parquet, _properties),
                                                   arrow_reader_properties, &_reader);
        if (!st.ok()) {
-            LOG(WARNING) << "Failed to create parquet file reader. error: " << st.ToString()
-                         << ", filename: " << _filename;
-            return Status::InternalError(fmt::format("Failed to create file reader. filename: {}", _filename));
+            std::ostringstream oss;
+            oss << "Failed to create parquet file reader. error: " << st.ToString() << ", filename: " << _filename;
+            LOG(INFO) << oss.str();
+            return Status::InternalError(oss.str());
        }

        if (!_reader || !_reader->parquet_reader()) {
--- a/be/test/exec/parquet_scanner_test.cpp
+++ b/be/test/exec/parquet_scanner_test.cpp
@ -224,7 +224,9 @@ class ParquetScannerTest : public ::testing::Test {
                {"issue_17693_c0", TypeDescriptor::create_array_type(TypeDescriptor::from_logical_type(TYPE_VARCHAR))},
                {"issue_17822_c0", TypeDescriptor::create_array_type(TypeDescriptor::from_logical_type(TYPE_VARCHAR))},
                {"nested_array_c0", TypeDescriptor::create_array_type(TypeDescriptor::create_array_type(
-                                            TypeDescriptor::from_logical_type(TYPE_VARCHAR)))}};
+                                            TypeDescriptor::from_logical_type(TYPE_VARCHAR)))},
+                {"col_map", TypeDescriptor::create_map_type(TypeDescriptor::create_varchar_type(1048576),
+                                                            TypeDescriptor::create_varchar_type(1048576))}};
        SlotTypeDescInfoArray slot_infos;
        slot_infos.reserve(column_names.size());
        for (auto& name : column_names) {
@ -764,4 +766,47 @@ TEST_F(ParquetScannerTest, datetime) {
    }
 }

+// Parquet files written by Hive may declare map keys optional; check such a file loads with the expected values.
+TEST_F(ParquetScannerTest, optional_map_key) {
+    const std::string parquet_file_name = test_exec_dir + "/test_data/parquet_data/optional_map_key.parquet";
+    std::vector<std::tuple<std::string, std::vector<std::string>>> test_cases = {
+            {"col_int", {"1", "2", "6", "3", "4", "5", "7", "8", "9", "1", "2", "3", "4", "5", "7", "8", "9", "6"}},
+            {"col_map",
+             {R"({" ":" "})",
+              R"({"                                            aAbBcC":"                                            aAbBcC"})",
+              R"({"你好，中国！":null})",
+              R"({"aAbBcC                                            ":"aAbBcC                                            "})",
+              R"({"                    aAbBcCdDeE                    ":"                    aAbBcCdDeE                    "})",
+              R"({"null":null})",
+              R"({"                                                  ":"                                                  "})",
+              R"({"Hello, world!你好":"Hello, world!你好"})",
+              R"({"Total MapReduce CPU Time Spent: 2 seconds 120 msec":"Total MapReduce CPU Time Spent: 2 seconds 120 msec"})",
+              R"({" ":" "})",
+              R"({"                                            aAbBcC":"                                            aAbBcC"})",
+              R"({"aAbBcC                                            ":"aAbBcC                                            "})",
+              R"({"                    aAbBcCdDeE                    ":"                    aAbBcCdDeE                    "})",
+              R"({"null":null})",
+              R"({"                                                  ":"                                                  "})",
+              R"({"Hello, world!你好":"Hello, world!你好"})",
+              R"({"Total MapReduce CPU Time Spent: 2 seconds 120 msec":"Total MapReduce CPU Time Spent: 2 seconds 120 msec"})",
+              R"({"你好，中国！":null})"}}};
+
+    std::unordered_map<size_t, TExpr> slot_map;
+
+    // Load each column on its own and compare the debug rendering of every row.
+    for (const auto& [column_name, expected] : test_cases) {
+        std::vector<std::string> column_names{column_name};
+
+        ChunkPtr chunk = get_chunk<true>(column_names, slot_map, parquet_file_name, 18);
+        ASSERT_EQ(1, chunk->num_columns());
+
+        auto col = chunk->columns()[0];
+        // Row counts must agree; otherwise the loop would skip expected rows or index out of range.
+        ASSERT_EQ(expected.size(), col->size());
+        for (size_t i = 0; i < col->size(); i++) {
+            EXPECT_EQ(expected[i], col->debug_item(i)) << column_name << " row " << i;
+        }
+    }
+}
+
 } // namespace starrocks
--- a/be/test/exec/test_data/parquet_data/optional_map_key.parquet
+++ b/be/test/exec/test_data/parquet_data/optional_map_key.parquet
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@ -485,6 +485,7 @@ if [[ -d $TP_SOURCE_DIR/$ARROW_SOURCE ]] ; then
        patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-force-use-external-jemalloc.patch
        # fix exception handling
        patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-fix-exception-handling.patch
+        patch -p1 < $TP_PATCH_DIR/arrow-5.0.0-parquet-map-key.patch
        touch $PATCHED_MARK
    fi
    cd -
--- a/thirdparty/patches/arrow-5.0.0-parquet-map-key.patch
+++ b/thirdparty/patches/arrow-5.0.0-parquet-map-key.patch
@ -0,0 +1,32 @@
+diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
+index eb7fd628d..e61d5ac3d 100644
+--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
+@@ -542,10 +542,24 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
+     return Status::Invalid("Key-value map node must have 1 or 2 child elements. Found: ",
+                            key_value.field_count());
+   }
+-  const Node& key_node = *key_value.field(0);
+-  if (!key_node.is_required()) {
+-    return Status::Invalid("Map keys must be annotated as required.");
+  /*
+The map key generated by hive may be optional.
+
+required group field_id=-1 hive_schema {
+  optional int32 field_id=-1 col_int;
+  optional group field_id=-1 col_map (Map) {
+    repeated group field_id=-1 map (Map) {
+      optional byte_array field_id=-1 key (String);
+      optional byte_array field_id=-1 value (String);
+    }
+   }
+}
+
+const Node& key_node = *key_value.field(0);
+if (!key_node.is_required()) {
+  return Status::Invalid("Map keys must be annotated as required.");
+}
+*/
+   // Arrow doesn't support 1 column maps (i.e. Sets).  The options are to either
+   // make the values column nullable, or process the map as a list.  We choose the latter
+   // as it is simpler.