[Enhancement] Add statistic for xx_hash3_64 function (#63791)

Why I'm doing:
The function XX_HASH3_64 does not have any statistics, so its statistics are set to UNKNOWN. Even though we cannot set good statistics for this function, having even trivial statistics is better than having none, as they can be helpful for follow-up functions like MOD.
For example, for the expression ABS(MOD(XX_HASH3_64(a),100)), without statistics for XX_HASH3_64 the statistics of the whole expression will be unknown, but with trivial XX_HASH3_64 statistics StarRocks can set the MIN to 0 and the MAX to 100 for the whole expression.
This could be the difference between a correct and a wrong join order.

(there are probably other cases like this one)

What I'm doing:
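Adding statistics for the function XX_HASH3_64. The MIN and MAX are set to the lowest and highest values of its int64 result type (Long.MIN_VALUE and Long.MAX_VALUE), which play the role of negative infinity and positive infinity. The DISTINCT COUNT is set to the number of rows. XX_HASH3_128 is handled the same way, using the LARGEINT bounds for MIN and MAX.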

Signed-off-by: m-selmi <m.selmi@celonis.com>
This commit is contained in:
Hechem Selmi 2025-10-11 03:07:02 +02:00 committed by GitHub
parent af76406358
commit f754c243b4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 26 additions and 0 deletions

View File

@ -393,6 +393,8 @@ public class FunctionSet {
// Hash functions:
public static final String MURMUR_HASH3_32 = "murmur_hash3_32";
public static final String CRC32_HASH = "crc32_hash";
public static final String XX_HASH3_64 = "xx_hash3_64";
public static final String XX_HASH3_128 = "xx_hash3_128";
// Percentile functions:
public static final String PERCENTILE_APPROX_RAW = "percentile_approx_raw";

View File

@ -18,6 +18,7 @@ package com.starrocks.sql.optimizer.statistics;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.starrocks.catalog.FunctionSet;
import com.starrocks.sql.ast.expression.LargeIntLiteral;
import com.starrocks.sql.optimizer.ConstantOperatorUtils;
import com.starrocks.sql.optimizer.Utils;
import com.starrocks.sql.optimizer.operator.scalar.CallOperator;
@ -470,6 +471,18 @@ public class ExpressionStatisticCalculator {
maxValue = 4294967295.0;
distinctValue = rowCount;
break;
case FunctionSet.XX_HASH3_64:
// xx_hash3_64's range is int64_t
minValue = Long.MIN_VALUE;
maxValue = Long.MAX_VALUE;
distinctValue = rowCount;
break;
case FunctionSet.XX_HASH3_128:
// xx_hash3_128's range is LARGE_INT
minValue = LargeIntLiteral.LARGE_INT_MIN.doubleValue();
maxValue = LargeIntLiteral.LARGE_INT_MAX.doubleValue();
distinctValue = rowCount;
break;
case FunctionSet.POSITIVE:
case FunctionSet.FLOOR:
case FunctionSet.DFLOOR:

View File

@ -21,6 +21,7 @@ import com.starrocks.catalog.FunctionSet;
import com.starrocks.catalog.Type;
import com.starrocks.common.util.DateUtils;
import com.starrocks.sql.ast.expression.BinaryType;
import com.starrocks.sql.ast.expression.LargeIntLiteral;
import com.starrocks.sql.optimizer.Utils;
import com.starrocks.sql.optimizer.operator.scalar.BinaryPredicateOperator;
import com.starrocks.sql.optimizer.operator.scalar.CallOperator;
@ -393,6 +394,16 @@ public class ExpressionStatisticsCalculatorTest {
columnStatistic = ExpressionStatisticCalculator.calculate(callOperator, statistics);
Assertions.assertEquals(columnStatistic.getMaxValue(), 100, 0.001);
Assertions.assertEquals(columnStatistic.getMinValue(), 0, 0.001);
// test xx_hash3_64 function
callOperator = new CallOperator(FunctionSet.XX_HASH3_64, Type.BIGINT, Lists.newArrayList(columnRefOperator));
columnStatistic = ExpressionStatisticCalculator.calculate(callOperator, statistics);
Assertions.assertEquals(columnStatistic.getMaxValue(), Long.MAX_VALUE, 0.001);
Assertions.assertEquals(columnStatistic.getMinValue(), Long.MIN_VALUE, 0.001);
// test xx_hash3_128 function
callOperator = new CallOperator(FunctionSet.XX_HASH3_128, Type.LARGEINT, Lists.newArrayList(columnRefOperator));
columnStatistic = ExpressionStatisticCalculator.calculate(callOperator, statistics);
Assertions.assertEquals(columnStatistic.getMaxValue(), LargeIntLiteral.LARGE_INT_MAX.doubleValue(), 0.001);
Assertions.assertEquals(columnStatistic.getMinValue(), LargeIntLiteral.LARGE_INT_MIN.doubleValue(), 0.001);
}
@Test