[Enhancement] Optimize code in arm (#55072)

## Why I'm doing: arm is slower than x86 in some cases ## What I'm doing: 1. vectorize rf's insert_hash using Neon intrinsics 2. streamvbyte's cmakelist is wrong, which cause performance downgrade in arm because vectorization cannot work properly 3. arm's int128_mul_overflow is super slow becase of divide operation, __builtin_mul_overflow(int128_t a, int128_t b, int128_t* c) is fast enough when compile with gcc. But gcc's __builtin_mul_overflow is at least 5 times faster then clang in arm, we already reported it to the community: https://github.com/llvm/llvm-project/issues/123262. So we still use gcc as default compiler and use __builtin_mul_overflow to replace original int128_mul_overflow implementation 4. arm's cast int128 to double is super slow in arm with gcc because the bad implementation of __floattidf, clang runtime-rt's implementation is 20 times faster then gcc, so I used clang compiler-rt's implementation to replace gcc's version after this pr, arm is faster then gcc in the most of cases. ``` | Query | arm-opt | x86 | |---------|--------|--------| | QUERY01 | 36 | 61 | | QUERY02 | 39 | 62 | | QUERY14 | 1510 | 1514 | | QUERY15 | 1407 | 1496 | | QUERY17 | 21 | 88 | | QUERY20 | 151 | 279 | | QUERY21 | 1526 | 1529 | | QUERY24 | 1399 | 1504 | | QUERY26 | 32 | 122 | | QUERY27 | 1493 | 1519 | | QUERY90 | 3399 | 4030 | | QUERY97 | 3859 | 4776 | | QUERY98 | 2763 | 3208 | | QUERY99 | 868 | 1259 | ``` Signed-off-by: before-Sunrise <unclejyj@gmail.com>
2025-01-27 11:18:10 +08:00 · 2025-01-27 11:18:10 +08:00 · e88bb85360
parent ec7e5e30ae
commit e88bb85360
7 changed files with 159 additions and 21 deletions
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@ -1010,7 +1010,7 @@ endif()

 set(STARROCKS_LINK_LIBS ${STARROCKS_LINK_LIBS}
    ${WL_LINK_STATIC} -lbfd
-    ${WL_LINK_DYNAMIC} -lresolv -liberty -lc -lm -ldl -rdynamic -pthread -Wl,-wrap=__cxa_throw
+    ${WL_LINK_DYNAMIC} -lresolv -liberty -lc -lm -ldl -rdynamic -pthread -Wl,-wrap,__cxa_throw -Wl,-wrap,__floattidf
 )

 # link gcov if WITH_GCOV is on
--- a/be/src/exprs/runtime_filter.h
+++ b/be/src/exprs/runtime_filter.h
@ -69,6 +69,15 @@ public:
        const __m256i mask = make_mask(hash >> _log_num_buckets);
        __m256i* const bucket = &reinterpret_cast<__m256i*>(_directory)[bucket_idx];
        _mm256_store_si256(bucket, _mm256_or_si256(*bucket, mask));
+#elif defined(__ARM_NEON)
+        uint32x4_t masks[2];
+        make_mask(hash >> _log_num_buckets, masks);
+        uint32x4_t directory_1 = vld1q_u32(&_directory[bucket_idx][0]);
+        uint32x4_t directory_2 = vld1q_u32(&_directory[bucket_idx][4]);
+        directory_1 = vorrq_u32(directory_1, masks[0]);
+        directory_2 = vorrq_u32(directory_2, masks[1]);
+        vst1q_u32(&_directory[bucket_idx][0], directory_1);
+        vst1q_u32(&_directory[bucket_idx][4], directory_2);
 #else
        uint32_t masks[BITS_SET_PER_BLOCK];
        make_mask(hash >> _log_num_buckets, masks);
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@ -106,6 +106,7 @@ set(RUNTIME_FILES
    dictionary_cache_sink.cpp
    type_pack.cpp
    customized_result_writer.cpp
+    int128_to_double.cpp
 )

 set(RUNTIME_FILES ${RUNTIME_FILES}
--- a/be/src/runtime/int128_to_double.cpp
+++ b/be/src/runtime/int128_to_double.cpp
@ -0,0 +1,94 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "runtime/int128_to_double.h"
+
+#include <glog/logging.h>
+
+#include <climits>
+#include <cstdint>
+
+#include "integer_overflow_arithmetics.h"
+namespace starrocks {
+double __wrap___floattidf(__int128 a) {
+    typedef double dst_t;
+    typedef uint64_t dst_rep_t;
+    typedef __uint128_t usrc_t;
+#define DST_REP_C UINT64_C
+
+    enum {
+        dstSigBits = 52,
+    };
+
+    if (a == 0) return 0.0;
+
+    enum {
+        dstMantDig = dstSigBits + 1,
+        srcBits = sizeof(__int128) * CHAR_BIT,
+        srcIsSigned = ((__int128)-1) < 0,
+    };
+
+    const __int128 s = srcIsSigned ? a >> (srcBits - 1) : 0;
+
+    a = (usrc_t)(a ^ s) - s;
+    int sd = srcBits - clz128(a); // number of significant digits
+    int e = sd - 1;               // exponent
+    if (sd > dstMantDig) {
+        //  start:  0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
+        //  finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
+        //                                                12345678901234567890123456
+        //  1 = msb 1 bit
+        //  P = bit dstMantDig-1 bits to the right of 1
+        //  Q = bit dstMantDig bits to the right of 1
+        //  R = "or" of all bits to the right of Q
+        if (sd == dstMantDig + 1) {
+            a <<= 1;
+        } else if (sd == dstMantDig + 2) {
+            // Do nothing.
+        } else {
+            a = ((usrc_t)a >> (sd - (dstMantDig + 2))) |
+                ((a & ((usrc_t)(-1) >> ((srcBits + dstMantDig + 2) - sd))) != 0);
+        }
+        // finish:
+        a |= (a & 4) != 0; // Or P into R
+        ++a;               // round - this step may add a significant bit
+        a >>= 2;           // dump Q and R
+        // a is now rounded to dstMantDig or dstMantDig+1 bits
+        if (a & ((usrc_t)1 << dstMantDig)) {
+            a >>= 1;
+            ++e;
+        }
+        // a is now rounded to dstMantDig bits
+    } else {
+        a <<= (dstMantDig - sd);
+        // a is now rounded to dstMantDig bits
+    }
+    const int dstBits = sizeof(dst_t) * CHAR_BIT;
+    const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1);
+    const int dstExpBits = dstBits - dstSigBits - 1;
+    const int dstExpBias = (1 << (dstExpBits - 1)) - 1;
+    const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1;
+    // Combine sign, exponent, and mantissa.
+    const dst_rep_t result = ((dst_rep_t)s & dstSignMask) | ((dst_rep_t)(e + dstExpBias) << dstSigBits) |
+                             ((dst_rep_t)(a)&dstSignificandMask);
+
+    const union {
+        dst_t f;
+        dst_rep_t i;
+    } rep = {.i = result};
+
+    DCHECK(std::abs(rep.f - __real___floattidf(a)) < 0.001);
+    return rep.f;
+}
+} // namespace starrocks
--- a/be/src/runtime/int128_to_double.h
+++ b/be/src/runtime/int128_to_double.h
@ -0,0 +1,22 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+namespace starrocks {
+extern "C" {
+// origin from llvm-project
+// https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/int_to_fp_impl.inc
+// this implementation is 20x faster than gcc
+double __wrap___floattidf(__int128 a);
+double __real___floattidf(__int128 a);
+}
+} // namespace starrocks
--- a/be/src/runtime/integer_overflow_arithmetics.h
+++ b/be/src/runtime/integer_overflow_arithmetics.h
@ -113,25 +113,7 @@ inline int clz128(unsigned __int128 v) {
 }

 inline bool int128_mul_overflow(int128_t a, int128_t b, int128_t* c) {
-    if (a == 0 || b == 0) {
-        *c = 0;
-        return false;
-    }
-
-    // sgn(x)
-    auto sa = a >> 127;
-    // sgn(y)
-    auto sb = b >> 127;
-    // abx(x), abs(y)
-    a = (a ^ sa) - sa;
-    b = (b ^ sb) - sb;
-    // sgn(x * y)
-    sa ^= sb;
-    *c = a * b;
-    // sgn(x * y) and abs(x) * abs(y) produces x * y;
-    *c = (*c ^ sa) - sa;
-    static constexpr auto int128_max = get_max<int128_t>();
-    return clz128(a) + clz128(b) < sizeof(int128_t) || int128_max / a < b;
+    return __builtin_mul_overflow(a, b, c);
 }

 template <>
--- a/thirdparty/patches/streamvbyte.patch
+++ b/thirdparty/patches/streamvbyte.patch
@ -1,9 +1,36 @@
+From 676d0175085a7996f909d9d2e63ab7b4683ef475 Mon Sep 17 00:00:00 2001
+From: before-Sunrise <unclejyj@gmail.com>
+Date: Tue, 14 Jan 2025 18:41:46 +0800
+Subject: [PATCH] patch
+
+Signed-off-by: before-Sunrise <unclejyj@gmail.com>
+---
+ CMakeLists.txt        | 5 +----
+ include/streamvbyte.h | 2 +-
+ 2 files changed, 2 insertions(+), 5 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 39df85d..1e32b0c 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -32,10 +32,7 @@ if (MSVC)
+ endif()
+ # test for arm
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
+-   set(BASE_FLAGS
+-    ${BASE_FLAGS}
+-    "-D__ARM_NEON__"
+-    )
+    add_compile_options(-D__ARM_NEON__)
+ endif()
+ set(STREAMVBYTE_SRCS
+   ${PROJECT_SOURCE_DIR}/src/streamvbyte_encode.c
 diff --git a/include/streamvbyte.h b/include/streamvbyte.h
 index bc9533c..a6cbb1a 100644
 --- a/include/streamvbyte.h
 +++ b/include/streamvbyte.h
@@ -1,7 +1,7 @@
-
+ 
 #ifndef INCLUDE_STREAMVBYTE_H_
 #define INCLUDE_STREAMVBYTE_H_
 -#define __STDC_FORMAT_MACROS
@ -11,3 +38,6 @@ index bc9533c..a6cbb1a 100644
 #include <inttypes.h>
 #include <stdint.h>// please use a C99-compatible compiler
 #include <stddef.h>
+-- 
+2.34.1
+