[Enhancement] Optimize code in arm (#55072)
## Why I'm doing: arm is slower than x86 in some cases ## What I'm doing: 1. vectorize rf's insert_hash using Neon intrinsics 2. streamvbyte's cmakelist is wrong, which cause performance downgrade in arm because vectorization cannot work properly 3. arm's int128_mul_overflow is super slow becase of divide operation, __builtin_mul_overflow(int128_t a, int128_t b, int128_t* c) is fast enough when compile with gcc. But gcc's __builtin_mul_overflow is at least 5 times faster then clang in arm, we already reported it to the community: https://github.com/llvm/llvm-project/issues/123262. So we still use gcc as default compiler and use __builtin_mul_overflow to replace original int128_mul_overflow implementation 4. arm's cast int128 to double is super slow in arm with gcc because the bad implementation of __floattidf, clang runtime-rt's implementation is 20 times faster then gcc, so I used clang compiler-rt's implementation to replace gcc's version after this pr, arm is faster then gcc in the most of cases. ``` | Query | arm-opt | x86 | |---------|--------|--------| | QUERY01 | 36 | 61 | | QUERY02 | 39 | 62 | | QUERY14 | 1510 | 1514 | | QUERY15 | 1407 | 1496 | | QUERY17 | 21 | 88 | | QUERY20 | 151 | 279 | | QUERY21 | 1526 | 1529 | | QUERY24 | 1399 | 1504 | | QUERY26 | 32 | 122 | | QUERY27 | 1493 | 1519 | | QUERY90 | 3399 | 4030 | | QUERY97 | 3859 | 4776 | | QUERY98 | 2763 | 3208 | | QUERY99 | 868 | 1259 | ``` Signed-off-by: before-Sunrise <unclejyj@gmail.com>
This commit is contained in:
parent
ec7e5e30ae
commit
e88bb85360
|
|
@ -1010,7 +1010,7 @@ endif()
|
|||
|
||||
set(STARROCKS_LINK_LIBS ${STARROCKS_LINK_LIBS}
|
||||
${WL_LINK_STATIC} -lbfd
|
||||
${WL_LINK_DYNAMIC} -lresolv -liberty -lc -lm -ldl -rdynamic -pthread -Wl,-wrap=__cxa_throw
|
||||
${WL_LINK_DYNAMIC} -lresolv -liberty -lc -lm -ldl -rdynamic -pthread -Wl,-wrap,__cxa_throw -Wl,-wrap,__floattidf
|
||||
)
|
||||
|
||||
# link gcov if WITH_GCOV is on
|
||||
|
|
|
|||
|
|
@ -69,6 +69,15 @@ public:
|
|||
const __m256i mask = make_mask(hash >> _log_num_buckets);
|
||||
__m256i* const bucket = &reinterpret_cast<__m256i*>(_directory)[bucket_idx];
|
||||
_mm256_store_si256(bucket, _mm256_or_si256(*bucket, mask));
|
||||
#elif defined(__ARM_NEON)
|
||||
uint32x4_t masks[2];
|
||||
make_mask(hash >> _log_num_buckets, masks);
|
||||
uint32x4_t directory_1 = vld1q_u32(&_directory[bucket_idx][0]);
|
||||
uint32x4_t directory_2 = vld1q_u32(&_directory[bucket_idx][4]);
|
||||
directory_1 = vorrq_u32(directory_1, masks[0]);
|
||||
directory_2 = vorrq_u32(directory_2, masks[1]);
|
||||
vst1q_u32(&_directory[bucket_idx][0], directory_1);
|
||||
vst1q_u32(&_directory[bucket_idx][4], directory_2);
|
||||
#else
|
||||
uint32_t masks[BITS_SET_PER_BLOCK];
|
||||
make_mask(hash >> _log_num_buckets, masks);
|
||||
|
|
|
|||
|
|
@ -106,6 +106,7 @@ set(RUNTIME_FILES
|
|||
dictionary_cache_sink.cpp
|
||||
type_pack.cpp
|
||||
customized_result_writer.cpp
|
||||
int128_to_double.cpp
|
||||
)
|
||||
|
||||
set(RUNTIME_FILES ${RUNTIME_FILES}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,94 @@
|
|||
// Copyright 2021-present StarRocks, Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "runtime/int128_to_double.h"
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
|
||||
#include "integer_overflow_arithmetics.h"
|
||||
namespace starrocks {
|
||||
double __wrap___floattidf(__int128 a) {
|
||||
typedef double dst_t;
|
||||
typedef uint64_t dst_rep_t;
|
||||
typedef __uint128_t usrc_t;
|
||||
#define DST_REP_C UINT64_C
|
||||
|
||||
enum {
|
||||
dstSigBits = 52,
|
||||
};
|
||||
|
||||
if (a == 0) return 0.0;
|
||||
|
||||
enum {
|
||||
dstMantDig = dstSigBits + 1,
|
||||
srcBits = sizeof(__int128) * CHAR_BIT,
|
||||
srcIsSigned = ((__int128)-1) < 0,
|
||||
};
|
||||
|
||||
const __int128 s = srcIsSigned ? a >> (srcBits - 1) : 0;
|
||||
|
||||
a = (usrc_t)(a ^ s) - s;
|
||||
int sd = srcBits - clz128(a); // number of significant digits
|
||||
int e = sd - 1; // exponent
|
||||
if (sd > dstMantDig) {
|
||||
// start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
|
||||
// finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
|
||||
// 12345678901234567890123456
|
||||
// 1 = msb 1 bit
|
||||
// P = bit dstMantDig-1 bits to the right of 1
|
||||
// Q = bit dstMantDig bits to the right of 1
|
||||
// R = "or" of all bits to the right of Q
|
||||
if (sd == dstMantDig + 1) {
|
||||
a <<= 1;
|
||||
} else if (sd == dstMantDig + 2) {
|
||||
// Do nothing.
|
||||
} else {
|
||||
a = ((usrc_t)a >> (sd - (dstMantDig + 2))) |
|
||||
((a & ((usrc_t)(-1) >> ((srcBits + dstMantDig + 2) - sd))) != 0);
|
||||
}
|
||||
// finish:
|
||||
a |= (a & 4) != 0; // Or P into R
|
||||
++a; // round - this step may add a significant bit
|
||||
a >>= 2; // dump Q and R
|
||||
// a is now rounded to dstMantDig or dstMantDig+1 bits
|
||||
if (a & ((usrc_t)1 << dstMantDig)) {
|
||||
a >>= 1;
|
||||
++e;
|
||||
}
|
||||
// a is now rounded to dstMantDig bits
|
||||
} else {
|
||||
a <<= (dstMantDig - sd);
|
||||
// a is now rounded to dstMantDig bits
|
||||
}
|
||||
const int dstBits = sizeof(dst_t) * CHAR_BIT;
|
||||
const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1);
|
||||
const int dstExpBits = dstBits - dstSigBits - 1;
|
||||
const int dstExpBias = (1 << (dstExpBits - 1)) - 1;
|
||||
const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1;
|
||||
// Combine sign, exponent, and mantissa.
|
||||
const dst_rep_t result = ((dst_rep_t)s & dstSignMask) | ((dst_rep_t)(e + dstExpBias) << dstSigBits) |
|
||||
((dst_rep_t)(a)&dstSignificandMask);
|
||||
|
||||
const union {
|
||||
dst_t f;
|
||||
dst_rep_t i;
|
||||
} rep = {.i = result};
|
||||
|
||||
DCHECK(std::abs(rep.f - __real___floattidf(a)) < 0.001);
|
||||
return rep.f;
|
||||
}
|
||||
} // namespace starrocks
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
// Copyright 2021-present StarRocks, Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
namespace starrocks {
|
||||
extern "C" {
|
||||
// origin from llvm-project
|
||||
// https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/int_to_fp_impl.inc
|
||||
// this implementation is 20x faster than gcc
|
||||
double __wrap___floattidf(__int128 a);
|
||||
double __real___floattidf(__int128 a);
|
||||
}
|
||||
} // namespace starrocks
|
||||
|
|
@ -113,25 +113,7 @@ inline int clz128(unsigned __int128 v) {
|
|||
}
|
||||
|
||||
inline bool int128_mul_overflow(int128_t a, int128_t b, int128_t* c) {
|
||||
if (a == 0 || b == 0) {
|
||||
*c = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
// sgn(x)
|
||||
auto sa = a >> 127;
|
||||
// sgn(y)
|
||||
auto sb = b >> 127;
|
||||
// abx(x), abs(y)
|
||||
a = (a ^ sa) - sa;
|
||||
b = (b ^ sb) - sb;
|
||||
// sgn(x * y)
|
||||
sa ^= sb;
|
||||
*c = a * b;
|
||||
// sgn(x * y) and abs(x) * abs(y) produces x * y;
|
||||
*c = (*c ^ sa) - sa;
|
||||
static constexpr auto int128_max = get_max<int128_t>();
|
||||
return clz128(a) + clz128(b) < sizeof(int128_t) || int128_max / a < b;
|
||||
return __builtin_mul_overflow(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
|
|
|||
|
|
@ -1,9 +1,36 @@
|
|||
From 676d0175085a7996f909d9d2e63ab7b4683ef475 Mon Sep 17 00:00:00 2001
|
||||
From: before-Sunrise <unclejyj@gmail.com>
|
||||
Date: Tue, 14 Jan 2025 18:41:46 +0800
|
||||
Subject: [PATCH] patch
|
||||
|
||||
Signed-off-by: before-Sunrise <unclejyj@gmail.com>
|
||||
---
|
||||
CMakeLists.txt | 5 +----
|
||||
include/streamvbyte.h | 2 +-
|
||||
2 files changed, 2 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 39df85d..1e32b0c 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -32,10 +32,7 @@ if (MSVC)
|
||||
endif()
|
||||
# test for arm
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
||||
- set(BASE_FLAGS
|
||||
- ${BASE_FLAGS}
|
||||
- "-D__ARM_NEON__"
|
||||
- )
|
||||
+ add_compile_options(-D__ARM_NEON__)
|
||||
endif()
|
||||
set(STREAMVBYTE_SRCS
|
||||
${PROJECT_SOURCE_DIR}/src/streamvbyte_encode.c
|
||||
diff --git a/include/streamvbyte.h b/include/streamvbyte.h
|
||||
index bc9533c..a6cbb1a 100644
|
||||
--- a/include/streamvbyte.h
|
||||
+++ b/include/streamvbyte.h
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
|
||||
#ifndef INCLUDE_STREAMVBYTE_H_
|
||||
#define INCLUDE_STREAMVBYTE_H_
|
||||
-#define __STDC_FORMAT_MACROS
|
||||
|
|
@ -11,3 +38,6 @@ index bc9533c..a6cbb1a 100644
|
|||
#include <inttypes.h>
|
||||
#include <stdint.h>// please use a C99-compatible compiler
|
||||
#include <stddef.h>
|
||||
--
|
||||
2.34.1
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue