[Enhancement] Optimize bitshuffle and crc for ARM (#44607)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Parent: 4584a54ad2
Commit: 92e6bf6d53
@@ -20,7 +20,7 @@
 #ifdef __x86_64__
 #include <immintrin.h>
 #endif
-#if defined(__ARM_NEON__) || defined(__aarch64__)
+#if defined(__ARM_NEON) && defined(__aarch64__)
 #include <arm_acle.h>
 #include <arm_neon.h>
 #endif

@@ -433,7 +433,7 @@ public:
         start_offset += kBatchNums;
     }
-#elif defined(__ARM_NEON__) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__)
     const uint8_t* filter_data = filter.data() + from;
     constexpr size_t data_type_size = sizeof(T);

@@ -19,7 +19,7 @@
 #include <vector>
 #ifdef __SSE2__
 #include <emmintrin.h>
-#elif defined(__ARM_NEON__) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__)
 #include <arm_acle.h>
 #include <arm_neon.h>
 #endif

@@ -193,7 +193,7 @@ inline bool contain_nonzero(const std::vector<uint8_t>& list, size_t start, size
     return pos < list.size() && pos < start + count;
 }
 
-#if defined(__ARM_NEON__) && defined(__aarch64__)
+#if defined(__ARM_NEON) && defined(__aarch64__)
 
 /// Returns a 64-bit mask in which each 4-bit nibble represents one byte of the input.
 /// The input contains 16 bytes, each expected to be either 0x00 or 0xff.

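The doc comment above describes the classic aarch64 substitute for SSE2's _mm_movemask_epi8: NEON has no single-instruction byte movemask, but it can cheaply produce one nibble per byte with a shift-narrow. A minimal sketch of that technique (an assumed shape for the helper, not necessarily the committed implementation):

    #include <arm_neon.h>
    #include <cstdint>

    // Each input byte must be 0x00 or 0xff. vshrn_n_u16 shifts every 16-bit
    // lane right by 4 and narrows it to 8 bits, so byte i of the input ends
    // up as nibble i of the returned 64-bit mask.
    static inline uint64_t neon_nibble_mask(uint8x16_t v) {
        const uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
        return vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
    }
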
@@ -42,6 +42,7 @@
 // Include the bitshuffle header again, but this time importing the
 // AVX2-compiled symbols by defining some macros.
+// See `build_bitshuffle` in `build-thirdparty.sh` for details.
 #undef BITSHUFFLE_H
 #define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx512
 #define bshuf_compress_lz4 bshuf_compress_lz4_avx512

@@ -60,6 +61,15 @@
 #undef bshuf_compress_lz4
 #undef bshuf_decompress_lz4
 
+#undef BITSHUFFLE_H
+#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_neon
+#define bshuf_compress_lz4 bshuf_compress_lz4_neon
+#define bshuf_decompress_lz4 bshuf_decompress_lz4_neon
+#include <bitshuffle/bitshuffle.h> // NOLINT(*)
+#undef bshuf_compress_lz4_bound
+#undef bshuf_compress_lz4
+#undef bshuf_decompress_lz4
+
 using base::CPU;
 
 namespace starrocks::bitshuffle {

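The block added above re-includes the same public header once more after defeating its include guard, so every bshuf_* declaration materializes again under a _neon suffix that matches the objcopy-renamed symbols in the NEON object file. A toy illustration of the pattern, using a hypothetical header "api.h" guarded by API_H and declaring do_work():

    #undef API_H                  // defeat the include guard
    #define do_work do_work_neon  // every `do_work` in the header now expands to `do_work_neon`
    #include "api.h"              // re-read declarations: do_work_neon(...) is now declared
    #undef do_work                // restore the plain name for the rest of this file
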
@@ -92,6 +102,10 @@ __attribute__((constructor)) void SelectBitshuffleFunctions() {
         g_bshuf_compress_lz4 = bshuf_compress_lz4;
         g_bshuf_decompress_lz4 = bshuf_decompress_lz4;
     }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_neon;
+    g_bshuf_compress_lz4 = bshuf_compress_lz4_neon;
+    g_bshuf_decompress_lz4 = bshuf_decompress_lz4_neon;
 #else
     g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound;
     g_bshuf_compress_lz4 = bshuf_compress_lz4;

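SelectBitshuffleFunctions() runs as a load-time constructor and binds the g_bshuf_* function pointers once, so per-call dispatch is a plain indirect call. On x86 the choice is made at runtime via base::CPU; on aarch64 the NEON variant is selected by the preprocessor, since NEON is architecturally guaranteed there. A condensed sketch of the pattern (hypothetical pointer and function names; the real signatures come from <bitshuffle/bitshuffle.h>):

    #include <cstddef>
    #include <cstdint>

    using compress_fn = int64_t (*)(const void* in, void* out, size_t size,
                                    size_t elem_size, size_t block_size);

    int64_t compress_generic(const void*, void*, size_t, size_t, size_t);
    int64_t compress_neon(const void*, void*, size_t, size_t, size_t);

    static compress_fn g_compress = nullptr;

    // Runs before main(): pick the widest implementation this build supports.
    __attribute__((constructor)) static void select_impl() {
    #if defined(__ARM_NEON) && defined(__aarch64__)
        g_compress = compress_neon;
    #else
        g_compress = compress_generic;
    #endif
    }
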
@@ -20,7 +20,7 @@
 #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 /* GCC-compatible compiler, targeting x86/x86-64 */
 #include <x86intrin.h>
-#elif defined(__GNUC__) && defined(__ARM_NEON__)
+#elif defined(__GNUC__) && defined(__ARM_NEON)
 /* GCC-compatible compiler, targeting ARM with NEON */
 #include <arm_neon.h>
 #elif defined(__GNUC__) && defined(__IWMMXT__)

@@ -22,6 +22,10 @@
 #ifdef __SSE4_2__
 #include <nmmintrin.h>
 #endif
+#if defined(__ARM_NEON) && defined(__aarch64__)
+#include <arm_acle.h>
+#include <arm_neon.h>
+#endif
 #include "util/coding.h"
 
 namespace starrocks::crc32c {

@@ -170,7 +174,14 @@ static inline uint64_t LE_LOAD64(const uint8_t* p) {
 
 static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
 #ifndef __SSE4_2__
+#if defined(__ARM_NEON) && defined(__aarch64__)
+    *l = __crc32cw(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+    *p += 4;
+    *l = __crc32cw(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+    *p += 4;
+#else
     Slow_CRC32(l, p);
+#endif // defined(__ARM_NEON) && defined(__aarch64__)
 #elif defined(__LP64__) || defined(_WIN64)
     *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
     *p += 8;

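__crc32cw comes from <arm_acle.h> and folds one 32-bit word into a CRC-32C (Castagnoli) state, mirroring SSE4.2's _mm_crc32_u32; two calls per iteration keep the same 8-bytes-per-step shape as the x86 path. A standalone sketch of the same idea (an illustrative helper, not this file's API; compile with -march=armv8-a+crc):

    #include <arm_acle.h>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    uint32_t crc32c_buffer(uint32_t crc, const uint8_t* p, size_t n) {
        while (n >= 4) {
            uint32_t w;
            std::memcpy(&w, p, sizeof(w));  // alignment-safe load
            crc = __crc32cw(crc, w);        // fold 4 bytes per instruction
            p += 4;
            n -= 4;
        }
        while (n--) crc = __crc32cb(crc, *p++);  // byte-at-a-time tail
        return crc;
    }
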
@@ -226,11 +237,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
 }
 
 uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
-#ifdef __SSE4_2__
     return ExtendImpl<Fast_CRC32>(crc, buf, size);
-#else
-    return ExtendImpl<Slow_CRC32>(crc, buf, size);
-#endif
 }
 
 } // namespace starrocks::crc32c

@@ -754,7 +754,7 @@ build_bitshuffle() {
     arches="default avx2 avx512"
     # Because aarch64 doesn't support avx2, disable it.
     if [[ "${MACHINE_TYPE}" == "aarch64" ]]; then
-        arches="default"
+        arches="default neon"
     fi
 
     to_link=""

@@ -764,6 +764,8 @@ build_bitshuffle() {
         arch_flag="-mavx2"
     elif [ "$arch" == "avx512" ]; then
         arch_flag="-march=icelake-server"
+    elif [ "$arch" == "neon" ]; then
+        arch_flag="-march=armv8-a+crc"
     fi
     tmp_obj=bitshuffle_${arch}_tmp.o
     dst_obj=bitshuffle_${arch}.o

@@ -774,13 +776,7 @@ build_bitshuffle() {
     # Merge the object files together to produce a combined .o file.
     ld -r -o $tmp_obj bitshuffle_core.o bitshuffle.o iochain.o
-    # For the AVX2 symbols, suffix them.
-    if [ "$arch" == "avx2" ]; then
-        # Create a mapping file with '<old_sym> <suffixed_sym>' on each line.
-        nm --defined-only --extern-only $tmp_obj | while read addr type sym ; do
-            echo ${sym} ${sym}_${arch}
-        done > renames.txt
-        objcopy --redefine-syms=renames.txt $tmp_obj $dst_obj
-    elif [ "$arch" == "avx512" ]; then
+    if [[ "$arch" == "avx2" || "$arch" == "avx512" || "$arch" == "neon" ]]; then
         # Create a mapping file with '<old_sym> <suffixed_sym>' on each line.
         nm --defined-only --extern-only $tmp_obj | while read addr type sym ; do
             echo ${sym} ${sym}_${arch}

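Design note: compiling bitshuffle once per arch flag and suffixing every exported symbol with nm + objcopy lets the default, AVX2, AVX-512, and now NEON builds coexist in one binary without duplicate-symbol conflicts; the constructor shown earlier then binds one set of suffixed symbols at load time.
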
@@ -491,3 +491,14 @@ if [[ -d $TP_SOURCE_DIR/$BZIP_SOURCE ]] ; then
     cd -
     echo "Finished patching $BZIP_SOURCE"
 fi
+
+# patch bitshuffle
+if [[ -d $TP_SOURCE_DIR/$BITSHUFFLE_SOURCE ]] ; then
+    cd $TP_SOURCE_DIR/$BITSHUFFLE_SOURCE
+    if [ ! -f "$PATCHED_MARK" ] && [[ $BITSHUFFLE_SOURCE == "bitshuffle-0.5.1" ]] ; then
+        patch -p1 < "$TP_PATCH_DIR/bitshuffle-0.5.1.patch"
+        touch "$PATCHED_MARK"
+    fi
+    cd -
+    echo "Finished patching $BITSHUFFLE_SOURCE"
+fi

@@ -0,0 +1,188 @@
From 88001dfe180522b16629cd641605f6135a8e407c Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@amazon.com>
Date: Fri, 7 Apr 2023 21:26:54 +0000
Subject: [PATCH] [arm64] use a better translation for move_mask

                                  No changes | With the patch       | Speedup
$ python3 ./tests/test_ext.py                |                      |
.bitshuffle 64      : 4.94 s/GB, 0.20 GB/s   | 1.53 s/GB, 0.65 GB/s | 3.25x
.bitunshuffle 64    : 5.09 s/GB, 0.20 GB/s   | 1.53 s/GB, 0.65 GB/s | 3.25x
.compress 64        : 5.26 s/GB, 0.19 GB/s   | 1.80 s/GB, 0.55 GB/s | 2.89x
.compress zstd 64   : 8.02 s/GB, 0.12 GB/s   | 4.80 s/GB, 0.21 GB/s | 1.75x
.decompress 64      : 5.72 s/GB, 0.17 GB/s   | 2.21 s/GB, 0.45 GB/s | 2.64x
.decompress zstd 64 : 5.71 s/GB, 0.18 GB/s   | 2.18 s/GB, 0.46 GB/s | 2.55x

fix aliasing bug

Patch from Andrew Pinski <pinskia@gcc.gnu.org>.
---
 src/bitshuffle_core.c | 97 +++++++++++++++++++++++++++++--------------
 1 file changed, 65 insertions(+), 32 deletions(-)

diff --git a/src/bitshuffle_core.c b/src/bitshuffle_core.c
index ba41473..f3b6ca0 100644
--- a/src/bitshuffle_core.c
+++ b/src/bitshuffle_core.c
@@ -49,6 +49,8 @@ typedef int64_t omp_size_t;
 typedef size_t omp_size_t;
 #endif
 
+typedef uint16_t alias_uint16_t __attribute__((may_alias));
+
 // Macros.
 #define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
@@ -605,44 +607,59 @@ int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
     }
 }
 
-
-/* Creates a mask made up of the most significant
- * bit of each byte of 'input'
- */
-int32_t move_byte_mask_neon(uint8x16_t input) {
-
-    return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3)
-           | (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7)
-           | (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11)
-           | (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15)
-           );
+uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
+    const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+    uint8x16_t t0 = vandq_u8(p0, bitmask);
+    uint8x16_t t1 = vandq_u8(p1, bitmask);
+    uint8x16_t t2 = vandq_u8(p2, bitmask);
+    uint8x16_t t3 = vandq_u8(p3, bitmask);
+    uint8x16_t sum0 = vpaddq_u8(t0, t1);
+    uint8x16_t sum1 = vpaddq_u8(t2, t3);
+    sum0 = vpaddq_u8(sum0, sum1);
+    sum0 = vpaddq_u8(sum0, sum0);
+    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
 }
 
 /* Transpose bits within bytes. */
 int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
                                   const size_t elem_size) {
 
-    size_t ii, kk;
+    size_t ii;
     const char* in_b = (const char*) in;
     char* out_b = (char*) out;
-    uint16_t* out_ui16;
-
     int64_t count;
-
     size_t nbyte = elem_size * size;
 
     CHECK_MULT_EIGHT(nbyte);
 
-    int16x8_t xmm;
-    int32_t bt;
+    const uint8x16_t a0 = vdupq_n_u8(0x80);
+    const uint8x16_t a1 = vdupq_n_u8(0x40);
+    const uint8x16_t a2 = vdupq_n_u8(0x20);
+    const uint8x16_t a3 = vdupq_n_u8(0x10);
+    const uint8x16_t a4 = vdupq_n_u8(0x8);
+    const uint8x16_t a5 = vdupq_n_u8(0x4);
+    const uint8x16_t a6 = vdupq_n_u8(0x2);
+    const uint8x16_t a7 = vdupq_n_u8(0x1);
 
     for (ii = 0; ii + 15 < nbyte; ii += 16) {
-        xmm = vld1q_s16((int16_t *) (in_b + ii));
+        uint8x16_t x = vld1q_u8((uint8_t *) (in_b + ii));
+        uint8x16_t x0 = vceqq_u8(a0, vandq_u8(x, a0));
+        uint8x16_t x1 = vceqq_u8(a1, vandq_u8(x, a1));
+        uint8x16_t x2 = vceqq_u8(a2, vandq_u8(x, a2));
+        uint8x16_t x3 = vceqq_u8(a3, vandq_u8(x, a3));
+        uint8x16_t x4 = vceqq_u8(a4, vandq_u8(x, a4));
+        uint8x16_t x5 = vceqq_u8(a5, vandq_u8(x, a5));
+        uint8x16_t x6 = vceqq_u8(a6, vandq_u8(x, a6));
+        uint8x16_t x7 = vceqq_u8(a7, vandq_u8(x, a7));
+
+        uint64_t out[2];
+        out[0] = neonmovemask_bulk(x0, x1, x2, x3);
+        out[1] = neonmovemask_bulk(x4, x5, x6, x7);
+        int kk;
         for (kk = 0; kk < 8; kk++) {
-            bt = move_byte_mask_neon((uint8x16_t) xmm);
-            xmm = vshlq_n_s16(xmm, 1);
-            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
-            *out_ui16 = bt;
+            alias_uint16_t *out_ui16 = (alias_uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+            *out_ui16 = ((alias_uint16_t*)out)[kk];
         }
     }
     count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
@@ -780,26 +797,42 @@ int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t
     // With a bit of care, this could be written such that such that it is
     // in_buf = out_buf safe.
     const char* in_b = (const char*) in;
-    uint16_t* out_ui16 = (uint16_t*) out;
+    alias_uint16_t* out_ui16 = (alias_uint16_t*) out;
 
     size_t ii, jj, kk;
     size_t nbyte = elem_size * size;
 
-    int16x8_t xmm;
-    int32_t bt;
-
     if (elem_size % 2) {
         bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
     } else {
+        const uint8x16_t a0 = vdupq_n_u8(0x80);
+        const uint8x16_t a1 = vdupq_n_u8(0x40);
+        const uint8x16_t a2 = vdupq_n_u8(0x20);
+        const uint8x16_t a3 = vdupq_n_u8(0x10);
+        const uint8x16_t a4 = vdupq_n_u8(0x8);
+        const uint8x16_t a5 = vdupq_n_u8(0x4);
+        const uint8x16_t a6 = vdupq_n_u8(0x2);
+        const uint8x16_t a7 = vdupq_n_u8(0x1);
         for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
              ii += 8 * elem_size) {
             for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
-                xmm = vld1q_s16((int16_t *) &in_b[ii + jj]);
+                uint8x16_t x = vld1q_u8((uint8_t *) &in_b[ii + jj]);
+                uint8x16_t x0 = vceqq_u8(a0, vandq_u8(x, a0));
+                uint8x16_t x1 = vceqq_u8(a1, vandq_u8(x, a1));
+                uint8x16_t x2 = vceqq_u8(a2, vandq_u8(x, a2));
+                uint8x16_t x3 = vceqq_u8(a3, vandq_u8(x, a3));
+                uint8x16_t x4 = vceqq_u8(a4, vandq_u8(x, a4));
+                uint8x16_t x5 = vceqq_u8(a5, vandq_u8(x, a5));
+                uint8x16_t x6 = vceqq_u8(a6, vandq_u8(x, a6));
+                uint8x16_t x7 = vceqq_u8(a7, vandq_u8(x, a7));
+
+                uint64_t out[2];
+                out[0] = neonmovemask_bulk(x0, x1, x2, x3);
+                out[1] = neonmovemask_bulk(x4, x5, x6, x7);
+
                 for (kk = 0; kk < 8; kk++) {
-                    bt = move_byte_mask_neon((uint8x16_t) xmm);
-                    xmm = vshlq_n_s16(xmm, 1);
                     size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
-                    out_ui16[ind / 2] = bt;
+                    out_ui16[ind / 2] = ((alias_uint16_t *)out)[kk];
                 }
             }
         }
@@ -1114,7 +1147,7 @@ int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
     size_t ii, kk;
     const char* in_b = (const char*) in;
     char* out_b = (char*) out;
-    uint16_t* out_ui16;
+    alias_uint16_t* out_ui16;
 
     int64_t count;
 
@@ -1130,7 +1163,7 @@ int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
         for (kk = 0; kk < 8; kk++) {
             bt = _mm_movemask_epi8(xmm);
             xmm = _mm_slli_epi16(xmm, 1);
-            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+            out_ui16 = (alias_uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
             *out_ui16 = bt;
         }
     }
-- 
2.32.0 (Apple Git-132)

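A note on the "fix aliasing bug" part of the patch above: the NEON kernels build each pair of 64-bit masks into a local uint64_t out[2] and then read it back in 16-bit halves. Reading uint64_t storage through a plain uint16_t* violates C's strict aliasing rule, so the compiler may reorder the stores past the loads; the may_alias typedef exempts that pointer type from type-based alias analysis, much like unsigned char*. A minimal sketch of the idea (hypothetical names):

    #include <cstdint>

    typedef uint16_t alias_uint16_t __attribute__((may_alias));

    // Safe under GCC/Clang: the may_alias attribute lets this pointer
    // type legally read storage that was written as uint64_t.
    static inline uint16_t read_half(const uint64_t* words, int k) {
        return ((const alias_uint16_t*)words)[k];
    }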