starrocks/tools/benchmark/table_sample.sh

84 lines
2.8 KiB
Bash

#!/bin/bash
# It's used to generate sqls for benchmarking TABLE SAMPLE functionality
# Define mysql connection parameters
mysql_user=${MYSQL_USER:-"root"}
mysql_host=${MYSQL_HOST:-"127.0.0.1"}
mysql_port=${MYSQL_PORT:-"9030"}
mysql_database=tpcds_100g
query_columns="select table_name, column_name from information_schema.columns where table_schema='tpcds_100g' \
and table_name in ('catalog_sales', 'store_returns', 'store_sales', 'web_sales') "
percents=(
1
4
16
32
)
# select count(lo_orderkey), count(distinct lo_orderkey) from lineorder
# --------------
# +--------------------+-----------------------------+
# | count(lo_orderkey) | count(DISTINCT lo_orderkey) |
# +--------------------+-----------------------------+
# | 600037902 | 150000000 |
# +--------------------+-----------------------------+
# 1 row in set (2.06 sec)
# Bye"
execute_sql() {
local variables=$1
local query="$2"
result=$(mysql -u ${mysql_user} -P ${mysql_port} -h ${mysql_host} ${mysql_database} -vvv -e "$query" 2>&1)
if [ $? -ne 0 ]; then
echo "MySQL execution failed: ${result}"
exit 1;
fi
execution_time=$(echo "$result" | grep -oP '\(\K[0-9]+\.[0-9]+(?= sec\))')
count=$(echo "$result" | grep -A2 "count" | tail -n1 | awk '{print $2}')
ndv=$(echo "$result" | grep -A2 "count" | tail -n1 | awk '{print $4}')
echo "$variables,count=${count},ndv=${ndv},elapsed=${execution_time},query=${query}"
}
while read table col; do
full_column="column=$table.$col"
method="method=full"
percent="percent=100"
prefix="${full_column},$method,$percent"
execute_sql "$prefix" "select count($col), ndv($col) from $table; "
execute_sql "$prefix" "select count($col), count(distinct $col) from $table; "
for sample_percent in ${percents[@]}; do
sample_ratio=$(echo "scale=2; $sample_percent / 100" | bc)
percent="percent=${sample_percent}"
method="method=rand"
prefix="${full_column},$method,$percent"
execute_sql $prefix "select count($col), ndv($col) from $table where rand() <= ${sample_ratio}; "
method="method=rand_limit"
prefix="${full_column},$method,$percent"
execute_sql $prefix "select count($col), ndv($col) from $table where rand() <= ${sample_ratio} limit 10000000; "
method="method=by_block"
prefix="${full_column},$method,$percent"
execute_sql $prefix "select count($col), ndv($col) from $table sample('method'='by_block', 'percent'='${sample_percent}', 'seed'='1');"
method="method=by_page"
prefix="${full_column},$method,$percent"
execute_sql $prefix "select count($col), ndv($col) from $table sample('method'='by_page', 'percent'='${sample_percent}', 'seed'='1');"
done
done < <(mysql -u "$mysql_user" -h "$mysql_host" -P"$mysql_port" -e "${query_columns}" -N)
exit 0