[BugFix] Fix mv rewriter binder bugs (backport #62919) (#63057)

Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
This commit is contained in:
mergify[bot] 2025-09-12 15:21:42 +08:00 committed by GitHub
parent 25adbac4e0
commit e2a0aac85d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 580 additions and 15 deletions

View File

@ -47,7 +47,8 @@ public class Binder {
// `nextIdx` marks the current index as `next()` is called repeatedly; it is used by the MULTI_JOIN pattern
// to optimize iteration expansions.
private int nextIdx = 0;
// Set once the binder is exhausted, so that exhaustion does not need to be checked again.
private boolean isExhausted = false;
/**
* Extract an expression from GroupExpression which matches the given pattern
*
@ -101,7 +102,7 @@ public class Binder {
this.groupExpressionIndex.set(lastNode, lastNodeIndex + 1);
expression = match(pattern, groupExpression);
} while (expression == null && this.groupExpressionIndex.size() != 1);
} while (!isExhausted && expression == null && this.groupExpressionIndex.size() != 1);
nextIdx++;
return expression;
@ -206,7 +207,7 @@ public class Binder {
public OptExpression match(GroupExpression ge) {
// 1. Check if the entire tree is MULTI_JOIN
// 2. Enumerate GE
if (ge == null || !isMultiJoin(ge)) {
if (ge == null || isExhausted || !isMultiJoin(ge)) {
return null;
}
@ -217,14 +218,20 @@ public class Binder {
* Check whether the binder is exhausted.
*/
private boolean exhausted() {
if (loopCount++ % CHECK_EXHAUSTED_INTERVAL == 0) {
final long elapsed = watch.elapsed(TimeUnit.MILLISECONDS);
final boolean exhausted = elapsed > timeLimit;
if (exhausted) {
if (isExhausted) {
return true;
}
// Only check elapsed time every CHECK_EXHAUSTED_INTERVAL iterations
if ((++loopCount % CHECK_EXHAUSTED_INTERVAL) == 0) {
long elapsed = watch.elapsed(TimeUnit.MILLISECONDS);
if (elapsed > timeLimit) {
isExhausted = true;
// Log only once to avoid log flooding
Tracers.log(Tracers.Module.MV, args ->
String.format("[MV TRACE] MultiJoinBinder %s exhausted(loop:%s)\n", this, loopCount));
String.format("[MV TRACE] MultiJoinBinder exhausted after %d loops (elapsed: %d ms, limit: %d ms)%n",
loopCount, elapsed, timeLimit));
return true;
}
return exhausted;
}
return false;
}
@ -279,6 +286,7 @@ public class Binder {
}
// directly return if next has already been rewritten by mv
if (next.hasAppliedMVRules()) {
groupExpressionIndex.remove(groupTraceKey);
return next;
}
@ -294,7 +302,7 @@ public class Binder {
next = group.getLogicalExpressions().get(valueIndex);
if (next.hasAppliedMVRules()) {
groupExpressionIndex.set(groupTraceKey, valueIndex);
groupExpressionIndex.remove(groupTraceKey);
return next;
}
}

View File

@ -28,6 +28,7 @@ import com.starrocks.catalog.Type;
import com.starrocks.server.GlobalStateMgr;
import com.starrocks.sql.optimizer.OptExpression;
import com.starrocks.sql.optimizer.OptimizerContext;
import com.starrocks.sql.optimizer.OptimizerTraceUtil;
import com.starrocks.sql.optimizer.Utils;
import com.starrocks.sql.optimizer.operator.AggType;
import com.starrocks.sql.optimizer.operator.ColumnOutputInfo;
@ -213,6 +214,7 @@ public class FineGrainedRangePredicateRule extends TransformationRule {
LogicalAggregationOperator newAggOp = rewriteAggOperator(aggColInfoList, unionOutputCols);
OptimizerTraceUtil.logMVRewrite(context, this, "FineGrainedRangePredicateRule applied");
return Lists.newArrayList(OptExpression.create(newAggOp, unionOpt));
}

View File

@ -15,14 +15,15 @@
package com.starrocks.sql.optimizer.rule.transformation.materialization.rule;
import com.starrocks.sql.optimizer.OptExpression;
import com.starrocks.sql.optimizer.OptimizerContext;
import com.starrocks.sql.optimizer.operator.OperatorType;
import com.starrocks.sql.optimizer.operator.pattern.Pattern;
import com.starrocks.sql.optimizer.rule.RuleType;
import com.starrocks.sql.optimizer.rule.transformation.materialization.MvUtils;
/*
*
* Here is the rule for pattern Join
*
/**
* OnlyJoinRule matches SPJ query patterns and rewrites them with materialized views.
*/
public class OnlyJoinRule extends BaseMaterializedViewRewriteRule {
private static final OnlyJoinRule INSTANCE = new OnlyJoinRule();
@ -35,4 +36,16 @@ public class OnlyJoinRule extends BaseMaterializedViewRewriteRule {
return INSTANCE;
}
/**
 * Decides whether this rule may be applied to {@code input}.
 *
 * <p>The input must be a logical SPJ plan according to {@link MvUtils#isLogicalSPJ}; only then is the
 * decision delegated to the parent rule's {@code check}.
 *
 * @param input   the expression the rule is about to be applied to
 * @param context the optimizer context, forwarded unchanged to the parent check
 * @return {@code false} when {@code input} is not a logical SPJ plan, otherwise the parent's result
 */
@Override
public boolean check(OptExpression input, OptimizerContext context) {
    // Only SPJ plans are supported by the only-join rule.
    // A join is intentionally NOT required here: a multi-table plan may have been turned into a
    // single-table plan by earlier rules (e.g. FineGrainedRangePredicateRule), and such a plan can
    // still be rewritten by this rule.
    return MvUtils.isLogicalSPJ(input) && super.check(input, context);
}
}

View File

@ -32,11 +32,11 @@ public class MvRewriteNestedMVTest extends MVTestBase {
starRocksAssert.withTable(cluster, "depts");
starRocksAssert.withTable(cluster, "emps");
starRocksAssert.withTable(cluster, "t1");
}
@Test
public void testNestedMv() throws Exception {
starRocksAssert.withTable(cluster, "t1");
starRocksAssert.withTable("CREATE TABLE nest_base_table_1 (\n" +
" k1 INT,\n" +
" v1 INT,\n" +
@ -114,4 +114,77 @@ public class MvRewriteNestedMVTest extends MVTestBase {
dropMv("test", "hive_nested_mv_2");
dropMv("test", "hive_nested_mv_3");
}
/**
 * Verifies that an aggregate query with a date range predicate over {@code t0 join t1} is planned
 * against the nested materialized views {@code date_mv}, {@code month_mv} and {@code year_mv}.
 *
 * <p>The test builds two base tables, inserts one row into each, and then creates a chain of
 * refreshed materialized views: {@code flat_mv} (join of t0 and t1), {@code join_filter_mv}
 * (filter on flat_mv), {@code date_mv} (aggregate on join_filter_mv), and {@code month_mv} /
 * {@code year_mv} (roll-ups of date_mv by month and year).
 *
 * <p>NOTE(review): the tables, materialized views and session variables set here are not dropped or
 * restored inside this method - confirm the test base takes care of cleanup/isolation.
 *
 * @throws Exception if table/MV creation, the inserts, or plan generation fails
 */
@Test
public void testRangePredicateRewrite() throws Exception {
// Base fact table.
starRocksAssert.withTable("CREATE TABLE `t0` (\n" +
" `date_col` date,\n" +
" `id` int(11),\n" +
" `int_col` int(11),\n" +
" `float_col_1` float,\n" +
" `float_col_2` float,\n" +
" `varchar_col` varchar(255),\n" +
" `tinyint_col` tinyint(4)\n" +
") ENGINE=OLAP\n" +
"DUPLICATE KEY(`date_col`, `id`)\n" +
"DISTRIBUTED BY HASH(`id`)\n" +
"PROPERTIES (\n" +
"\"replication_num\" = \"1\"\n" +
");\n");
// Second base table, joined to t0 on tinyint_col = tinyint_col_1 below.
starRocksAssert.withTable("CREATE TABLE `t1` (\n" +
" `id_1` int(11),\n" +
" `varchar_col_1` varchar(255),\n" +
" `varchar_col_2` varchar(255),\n" +
" `int_col_1` int(11),\n" +
" `tinyint_col_1` tinyint(4)\n" +
") ENGINE=OLAP\n" +
"DUPLICATE KEY(`id_1`, `varchar_col_1`)\n" +
"DISTRIBUTED BY HASH(`id_1`)\n" +
"PROPERTIES (\n" +
"\"replication_num\" = \"1\"\n" +
");");
// One row per base table; the rows are inserted before the MVs are created and refreshed.
executeInsertSql("INSERT INTO `t0` VALUES ('2024-02-01', 1, 100, 10.5, 20.5, 'varchar_value_1', 1);");
executeInsertSql("INSERT INTO `t1` VALUES (1, 'varchar_value_1', 'varchar_value_21', 100, 1);");
// Level 1: flattened join of t0 and t1.
starRocksAssert.withRefreshedMaterializedView("create MATERIALIZED VIEW flat_mv\n" +
"REFRESH DEFERRED MANUAL\n" +
"PROPERTIES (\n" +
"\"replication_num\" = \"1\"\n" +
") as select t0.id, t0.date_col, t0.float_col_1, t0.float_col_2, t0.varchar_col, " +
"t0.tinyint_col, t1.varchar_col_1, t1.varchar_col_2, t1.int_col_1, t1.tinyint_col_1 " +
"from t0 join t1 on t0.tinyint_col = t1.tinyint_col_1;\n");
// Level 2: filter on top of flat_mv. The IN list is the same one used by the query below.
starRocksAssert.withRefreshedMaterializedView("create MATERIALIZED VIEW join_filter_mv\n" +
"REFRESH DEFERRED MANUAL\n" +
"PROPERTIES (\n" +
"\"replication_num\" = \"1\"\n" +
") as select id, date_col, float_col_1, int_col_1, tinyint_col, " +
"tinyint_col_1 from flat_mv where id in (1, 2, 3, 4, 5, 6, 6, 7, 9, 10);\n");
// Level 3: aggregate on join_filter_mv grouped by tinyint_col and date_col.
starRocksAssert.withRefreshedMaterializedView("create MATERIALIZED VIEW date_mv\n" +
"REFRESH DEFERRED MANUAL\n" +
"PROPERTIES (\n" +
"\"replication_num\" = \"1\"\n" +
") as select tinyint_col, date_col , sum(float_col_1 * int_col_1) as sum_value " +
"from join_filter_mv group by tinyint_col, date_col;\n");
// Level 4a: roll-up of date_mv by month.
starRocksAssert.withRefreshedMaterializedView("create MATERIALIZED VIEW month_mv\n" +
"REFRESH DEFERRED MANUAL\n" +
"PROPERTIES (\n" +
"\"replication_num\" = \"1\"\n" +
") as select tinyint_col, date_trunc('month', date_col) as date_col, sum(sum_value) as sum_value " +
"from date_mv group by tinyint_col, date_trunc('month', date_col);\n");
// Level 4b: roll-up of date_mv by year.
starRocksAssert.withRefreshedMaterializedView("create MATERIALIZED VIEW year_mv\n" +
"REFRESH DEFERRED MANUAL\n" +
"PROPERTIES (\n" +
"\"replication_num\" = \"1\"\n" +
") as select tinyint_col, date_trunc('year', date_col) as date_col, sum(sum_value) " +
"as sum_value from date_mv group by tinyint_col, date_trunc('year', date_col);\n");
// Query under test: same join and IN filter as the MVs, plus an open date range on date_col.
// NOTE(review): the range bounds fall mid-month - presumably chosen so the rewrite needs day,
// month and year granularities at once; confirm against FineGrainedRangePredicateRule.
String sql = "select sum(t0.float_col_1 * t1.int_col_1), t0.tinyint_col " +
"from t0 join t1 on t0.tinyint_col = t1.tinyint_col_1 where t0.id in (1, 2, 3, 4, 5, 6, 6, 7, 9, 10)\n" +
"and date_col > '2024-02-11' and date_col < '2028-05-14' group by tinyint_col order by 1;";
// Session settings for this scenario: nested MV rewrite level 10, rewrite mode "force",
// fine-grained range predicate enabled, both push-down rewrites disabled.
connectContext.getSessionVariable().setNestedMvRewriteMaxLevel(10);
connectContext.getSessionVariable().setMaterializedViewRewriteMode("force");
connectContext.getSessionVariable().setEnableFineGrainedRangePredicate(true);
connectContext.getSessionVariable().setEnableMaterializedViewTimeSeriesPushDownRewrite(false);
connectContext.getSessionVariable().setEnableMaterializedViewPushDownRewrite(false);
String plan = getFragmentPlan(sql);
// The plan is expected to mention all three aggregate MVs.
PlanTestBase.assertContains(plan, "date_mv", "month_mv", "year_mv");
}
}

View File

@ -62,6 +62,7 @@ public class MvTransparentUnionRewriteOlapTest extends MVTestBase {
)
);
connectContext.getSessionVariable().setEnableMaterializedViewTransparentUnionRewrite(true);
connectContext.getSessionVariable().setMaterializedViewRewriteMode("force");
}
private void withPartialScanMv(StarRocksAssert.ExceptionRunnable runner) {

View File

@ -0,0 +1,239 @@
-- name: test_mv_rewrite_bugs1
create database db_${uuid0};
-- result:
-- !result
use db_${uuid0};
-- result:
-- !result
CREATE TABLE `sales_data` (
`customer_id` bigint(20) NOT NULL,
`order_id` bigint(20) NOT NULL,
`line_item` smallint(6) NOT NULL,
`order_date` datetime NOT NULL,
`original_customer_id` varchar(1048576) NOT NULL,
`original_order_id` varchar(1048576) NOT NULL,
`month_order` datetime NULL,
`week_order` datetime NULL,
`quarter_order` datetime NULL,
`year_order` datetime NULL,
`store_name` varchar(1048576) NULL,
`retailer_brand` varchar(1048576) NULL,
`sales_channel` varchar(1048576) NULL,
`parent_company` varchar(1048576) NULL,
`store_postal_code` varchar(1048576) NULL,
`state` varchar(1048576) NULL,
`region` varchar(18) NULL,
`age` decimal(16, 0) NULL,
`generation` varchar(17) NULL,
`gender` varchar(11) NULL,
`is_adult` boolean NULL,
`is_hispanic` boolean NULL,
`recent_login` boolean NULL,
`customer_postal_code` varchar(1048576) NULL,
`loyalty_id` varchar(1048576) NULL,
`product_code` varchar(1048576) NULL,
`brand` varchar(1048576) NULL,
`manufacturer` varchar(1048576) NULL,
`category_l1` varchar(1048576) NULL,
`category_l2` varchar(1048576) NULL,
`category_l3` varchar(1048576) NULL,
`category_l4` varchar(1048576) NULL,
`product_description` varchar(1048576) NULL,
`private_label` boolean NULL,
`digital_receipt` boolean NULL,
`quantity` decimal(10, 2) NULL,
`sales_amount` decimal(10, 2) NULL,
`loyalty_program_a` boolean NULL,
`loyalty_program_b` boolean NULL,
`loyalty_program_c` boolean NULL,
`category_partition_key` varchar(1048576) NULL,
`load_timestamp` datetime NULL
) ENGINE=OLAP
PRIMARY KEY(`customer_id`, `order_id`, `line_item`, `order_date`)
PARTITION BY date_trunc('day', order_date)
DISTRIBUTED BY HASH(`customer_id`) BUCKETS 12
ORDER BY(`brand`, `category_l1`)
PROPERTIES (
"colocate_with" = "sales_data_orders",
"compression" = "LZ4",
"enable_persistent_index" = "true",
"replication_num" = "1"
);
-- result:
-- !result
INSERT INTO `sales_data` (
`customer_id`, `order_id`, `line_item`, `order_date`,
`original_customer_id`, `original_order_id`,
`month_order`, `week_order`, `quarter_order`, `year_order`,
`store_name`, `retailer_brand`, `sales_channel`, `parent_company`,
`store_postal_code`, `state`, `region`, `age`, `generation`, `gender`,
`is_adult`, `is_hispanic`, `recent_login`,
`customer_postal_code`, `loyalty_id`, `product_code`, `brand`, `manufacturer`,
`category_l1`, `category_l2`, `category_l3`, `category_l4`, `product_description`,
`private_label`, `digital_receipt`, `quantity`, `sales_amount`,
`loyalty_program_a`, `loyalty_program_b`, `loyalty_program_c`,
`category_partition_key`, `load_timestamp`
) VALUES
(10001, 500001, 1, '2025-09-01 10:15:00',
'10001', '500001',
'2025-09-01', '2025-09-01', '2025-07-01', '2025-01-01',
'SuperMart Downtown', 'SuperMart', 'Offline', 'SuperMart Inc.',
'10001', 'NY', 'Northeast', 32, 'Millennial', 'F',
TRUE, FALSE, TRUE,
'10001', 'LOYAL123', '1234567890123', 'Store Brand', 'SuperMart',
'Dairy', 'Milk', 'Whole Milk', NULL, '1 Gallon Whole Milk',
FALSE, TRUE, 1.00, 3.49,
FALSE, TRUE, FALSE,
'Dairy', '2025-09-01 12:00:00'),
(10002, 500002, 1, '2025-09-02 15:30:00',
'10002', '500002',
'2025-09-01', '2025-09-01', '2025-07-01', '2025-01-01',
'MegaStore', 'MegaStore', 'Offline', 'MegaStore Corp.',
'90001', 'CA', 'West', 25, 'Gen Z', 'M',
TRUE, TRUE, FALSE,
'90001', 'LOYAL456', '9876543210987', 'CrunchyChips', 'SnackCorp',
'Snacks', 'Chips', NULL, NULL, 'Classic Potato Chips',
FALSE, FALSE, 2.00, 5.98,
FALSE, TRUE, TRUE,
'Snacks', '2025-09-02 16:00:00'),
(10003, 500003, 1, '2025-09-03 09:45:00',
'10003', '500003',
'2025-09-01', '2025-09-01', '2025-07-01', '2025-01-01',
'BulkMart', 'BulkMart', 'Offline', 'BulkMart Wholesale',
'77001', 'TX', 'South', 29, 'Millennial', 'F',
TRUE, FALSE, TRUE,
'77001', 'LOYAL789', '5555555555555', 'BabyComfort', 'BabyCorp',
'Baby', 'Diapers', NULL, NULL, 'Comfort Diapers Size 2',
FALSE, TRUE, 1.00, 39.99,
TRUE, FALSE, FALSE,
'Baby', '2025-09-03 10:00:00');
-- result:
-- !result
CREATE MATERIALIZED VIEW `sales_data_mv2`
PARTITION BY (date_trunc('day', `order_date`))
DISTRIBUTED BY HASH(`customer_id`) BUCKETS 12
ORDER BY (category_l1, category_l2, category_l3)
REFRESH MANUAL
PROPERTIES (
"replicated_storage" = "true",
"replication_num" = "1",
"partition_refresh_number" = "60",
"bloom_filter_columns" = "category_l1, category_l2, category_l3, product_description, manufacturer, retailer_brand, store_name",
"colocate_with" = "sales_data_orders"
)
AS SELECT
`sales_data`.`customer_id`,
`sales_data`.`order_id`,
`sales_data`.`line_item`,
`sales_data`.`order_date`,
`sales_data`.`month_order`,
`sales_data`.`week_order`,
`sales_data`.`quarter_order`,
`sales_data`.`year_order`,
`sales_data`.`store_name`,
`sales_data`.`retailer_brand`,
`sales_data`.`sales_channel`,
`sales_data`.`parent_company`,
`sales_data`.`store_postal_code`,
`sales_data`.`state`,
`sales_data`.`region`,
`sales_data`.`age`,
`sales_data`.`generation`,
`sales_data`.`gender`,
`sales_data`.`is_adult`,
`sales_data`.`is_hispanic`,
`sales_data`.`recent_login`,
`sales_data`.`customer_postal_code`,
`sales_data`.`loyalty_id`,
`sales_data`.`product_code`,
`sales_data`.`brand`,
`sales_data`.`manufacturer`,
`sales_data`.`category_l1`,
`sales_data`.`category_l2`,
`sales_data`.`category_l3`,
`sales_data`.`category_l4`,
`sales_data`.`product_description`,
`sales_data`.`private_label`,
`sales_data`.`digital_receipt`,
`sales_data`.`quantity`,
`sales_data`.`sales_amount`,
`sales_data`.`loyalty_program_a`,
`sales_data`.`loyalty_program_b`,
`sales_data`.`loyalty_program_c`,
`sales_data`.`category_partition_key`,
`sales_data`.`load_timestamp`
FROM `sales_data`;
-- result:
-- !result
CREATE MATERIALIZED VIEW `sales_data_mv1`
PARTITION BY (date_trunc('day', `order_date`))
DISTRIBUTED BY HASH(`order_id`) BUCKETS 12
ORDER BY (brand, category_l1)
REFRESH MANUAL
PROPERTIES (
"replicated_storage" = "true",
"replication_num" = "1",
"partition_refresh_number" = "60",
"bloom_filter_columns" = "brand, category_l1, category_l2, category_l3, product_description, manufacturer, retailer_brand, store_name",
"colocate_with" = "sales_data_mv1_orders_mv1"
)
AS SELECT
`sales_data`.`customer_id`,
`sales_data`.`order_id`,
`sales_data`.`line_item`,
`sales_data`.`order_date`,
`sales_data`.`month_order`,
`sales_data`.`week_order`,
`sales_data`.`quarter_order`,
`sales_data`.`year_order`,
`sales_data`.`store_name`,
`sales_data`.`retailer_brand`,
`sales_data`.`sales_channel`,
`sales_data`.`parent_company`,
`sales_data`.`store_postal_code`,
`sales_data`.`state`,
`sales_data`.`region`,
`sales_data`.`age`,
`sales_data`.`generation`,
`sales_data`.`gender`,
`sales_data`.`is_adult`,
`sales_data`.`is_hispanic`,
`sales_data`.`recent_login`,
`sales_data`.`customer_postal_code`,
`sales_data`.`loyalty_id`,
`sales_data`.`product_code`,
`sales_data`.`brand`,
`sales_data`.`manufacturer`,
`sales_data`.`category_l1`,
`sales_data`.`category_l2`,
`sales_data`.`category_l3`,
`sales_data`.`category_l4`,
`sales_data`.`product_description`,
`sales_data`.`private_label`,
`sales_data`.`digital_receipt`,
`sales_data`.`quantity`,
`sales_data`.`sales_amount`,
`sales_data`.`loyalty_program_a`,
`sales_data`.`loyalty_program_b`,
`sales_data`.`loyalty_program_c`,
`sales_data`.`category_partition_key`,
`sales_data`.`load_timestamp`
FROM `sales_data`;
-- result:
-- !result
WITH snack_buyers AS (
SELECT DISTINCT customer_id
FROM sales_data
WHERE category_l1 = 'Snacks'
),
dairy_buyers AS (
SELECT DISTINCT customer_id
FROM sales_data
WHERE category_l1 = 'Dairy'
)
SELECT
(SELECT APPROX_COUNT_DISTINCT(customer_id) FROM dairy_buyers) AS dairy_customers,
(SELECT APPROX_COUNT_DISTINCT(customer_id) FROM snack_buyers) AS snack_customers;
-- result:
1 1
-- !result

View File

@ -0,0 +1,229 @@
-- name: test_mv_rewrite_bugs1
create database db_${uuid0};
use db_${uuid0};
CREATE TABLE `sales_data` (
`customer_id` bigint(20) NOT NULL,
`order_id` bigint(20) NOT NULL,
`line_item` smallint(6) NOT NULL,
`order_date` datetime NOT NULL,
`original_customer_id` varchar(1048576) NOT NULL,
`original_order_id` varchar(1048576) NOT NULL,
`month_order` datetime NULL,
`week_order` datetime NULL,
`quarter_order` datetime NULL,
`year_order` datetime NULL,
`store_name` varchar(1048576) NULL,
`retailer_brand` varchar(1048576) NULL,
`sales_channel` varchar(1048576) NULL,
`parent_company` varchar(1048576) NULL,
`store_postal_code` varchar(1048576) NULL,
`state` varchar(1048576) NULL,
`region` varchar(18) NULL,
`age` decimal(16, 0) NULL,
`generation` varchar(17) NULL,
`gender` varchar(11) NULL,
`is_adult` boolean NULL,
`is_hispanic` boolean NULL,
`recent_login` boolean NULL,
`customer_postal_code` varchar(1048576) NULL,
`loyalty_id` varchar(1048576) NULL,
`product_code` varchar(1048576) NULL,
`brand` varchar(1048576) NULL,
`manufacturer` varchar(1048576) NULL,
`category_l1` varchar(1048576) NULL,
`category_l2` varchar(1048576) NULL,
`category_l3` varchar(1048576) NULL,
`category_l4` varchar(1048576) NULL,
`product_description` varchar(1048576) NULL,
`private_label` boolean NULL,
`digital_receipt` boolean NULL,
`quantity` decimal(10, 2) NULL,
`sales_amount` decimal(10, 2) NULL,
`loyalty_program_a` boolean NULL,
`loyalty_program_b` boolean NULL,
`loyalty_program_c` boolean NULL,
`category_partition_key` varchar(1048576) NULL,
`load_timestamp` datetime NULL
) ENGINE=OLAP
PRIMARY KEY(`customer_id`, `order_id`, `line_item`, `order_date`)
PARTITION BY date_trunc('day', order_date)
DISTRIBUTED BY HASH(`customer_id`) BUCKETS 12
ORDER BY(`brand`, `category_l1`)
PROPERTIES (
"colocate_with" = "sales_data_orders",
"compression" = "LZ4",
"enable_persistent_index" = "true",
"replication_num" = "1"
);
INSERT INTO `sales_data` (
`customer_id`, `order_id`, `line_item`, `order_date`,
`original_customer_id`, `original_order_id`,
`month_order`, `week_order`, `quarter_order`, `year_order`,
`store_name`, `retailer_brand`, `sales_channel`, `parent_company`,
`store_postal_code`, `state`, `region`, `age`, `generation`, `gender`,
`is_adult`, `is_hispanic`, `recent_login`,
`customer_postal_code`, `loyalty_id`, `product_code`, `brand`, `manufacturer`,
`category_l1`, `category_l2`, `category_l3`, `category_l4`, `product_description`,
`private_label`, `digital_receipt`, `quantity`, `sales_amount`,
`loyalty_program_a`, `loyalty_program_b`, `loyalty_program_c`,
`category_partition_key`, `load_timestamp`
) VALUES
(10001, 500001, 1, '2025-09-01 10:15:00',
'10001', '500001',
'2025-09-01', '2025-09-01', '2025-07-01', '2025-01-01',
'SuperMart Downtown', 'SuperMart', 'Offline', 'SuperMart Inc.',
'10001', 'NY', 'Northeast', 32, 'Millennial', 'F',
TRUE, FALSE, TRUE,
'10001', 'LOYAL123', '1234567890123', 'Store Brand', 'SuperMart',
'Dairy', 'Milk', 'Whole Milk', NULL, '1 Gallon Whole Milk',
FALSE, TRUE, 1.00, 3.49,
FALSE, TRUE, FALSE,
'Dairy', '2025-09-01 12:00:00'),
(10002, 500002, 1, '2025-09-02 15:30:00',
'10002', '500002',
'2025-09-01', '2025-09-01', '2025-07-01', '2025-01-01',
'MegaStore', 'MegaStore', 'Offline', 'MegaStore Corp.',
'90001', 'CA', 'West', 25, 'Gen Z', 'M',
TRUE, TRUE, FALSE,
'90001', 'LOYAL456', '9876543210987', 'CrunchyChips', 'SnackCorp',
'Snacks', 'Chips', NULL, NULL, 'Classic Potato Chips',
FALSE, FALSE, 2.00, 5.98,
FALSE, TRUE, TRUE,
'Snacks', '2025-09-02 16:00:00'),
(10003, 500003, 1, '2025-09-03 09:45:00',
'10003', '500003',
'2025-09-01', '2025-09-01', '2025-07-01', '2025-01-01',
'BulkMart', 'BulkMart', 'Offline', 'BulkMart Wholesale',
'77001', 'TX', 'South', 29, 'Millennial', 'F',
TRUE, FALSE, TRUE,
'77001', 'LOYAL789', '5555555555555', 'BabyComfort', 'BabyCorp',
'Baby', 'Diapers', NULL, NULL, 'Comfort Diapers Size 2',
FALSE, TRUE, 1.00, 39.99,
TRUE, FALSE, FALSE,
'Baby', '2025-09-03 10:00:00');
CREATE MATERIALIZED VIEW `sales_data_mv2`
PARTITION BY (date_trunc('day', `order_date`))
DISTRIBUTED BY HASH(`customer_id`) BUCKETS 12
ORDER BY (category_l1, category_l2, category_l3)
REFRESH MANUAL
PROPERTIES (
"replicated_storage" = "true",
"replication_num" = "1",
"partition_refresh_number" = "60",
"bloom_filter_columns" = "category_l1, category_l2, category_l3, product_description, manufacturer, retailer_brand, store_name",
"colocate_with" = "sales_data_orders"
)
AS SELECT
`sales_data`.`customer_id`,
`sales_data`.`order_id`,
`sales_data`.`line_item`,
`sales_data`.`order_date`,
`sales_data`.`month_order`,
`sales_data`.`week_order`,
`sales_data`.`quarter_order`,
`sales_data`.`year_order`,
`sales_data`.`store_name`,
`sales_data`.`retailer_brand`,
`sales_data`.`sales_channel`,
`sales_data`.`parent_company`,
`sales_data`.`store_postal_code`,
`sales_data`.`state`,
`sales_data`.`region`,
`sales_data`.`age`,
`sales_data`.`generation`,
`sales_data`.`gender`,
`sales_data`.`is_adult`,
`sales_data`.`is_hispanic`,
`sales_data`.`recent_login`,
`sales_data`.`customer_postal_code`,
`sales_data`.`loyalty_id`,
`sales_data`.`product_code`,
`sales_data`.`brand`,
`sales_data`.`manufacturer`,
`sales_data`.`category_l1`,
`sales_data`.`category_l2`,
`sales_data`.`category_l3`,
`sales_data`.`category_l4`,
`sales_data`.`product_description`,
`sales_data`.`private_label`,
`sales_data`.`digital_receipt`,
`sales_data`.`quantity`,
`sales_data`.`sales_amount`,
`sales_data`.`loyalty_program_a`,
`sales_data`.`loyalty_program_b`,
`sales_data`.`loyalty_program_c`,
`sales_data`.`category_partition_key`,
`sales_data`.`load_timestamp`
FROM `sales_data`;
CREATE MATERIALIZED VIEW `sales_data_mv1`
PARTITION BY (date_trunc('day', `order_date`))
DISTRIBUTED BY HASH(`order_id`) BUCKETS 12
ORDER BY (brand, category_l1)
REFRESH MANUAL
PROPERTIES (
"replicated_storage" = "true",
"replication_num" = "1",
"partition_refresh_number" = "60",
"bloom_filter_columns" = "brand, category_l1, category_l2, category_l3, product_description, manufacturer, retailer_brand, store_name",
"colocate_with" = "sales_data_mv1_orders_mv1"
)
AS SELECT
`sales_data`.`customer_id`,
`sales_data`.`order_id`,
`sales_data`.`line_item`,
`sales_data`.`order_date`,
`sales_data`.`month_order`,
`sales_data`.`week_order`,
`sales_data`.`quarter_order`,
`sales_data`.`year_order`,
`sales_data`.`store_name`,
`sales_data`.`retailer_brand`,
`sales_data`.`sales_channel`,
`sales_data`.`parent_company`,
`sales_data`.`store_postal_code`,
`sales_data`.`state`,
`sales_data`.`region`,
`sales_data`.`age`,
`sales_data`.`generation`,
`sales_data`.`gender`,
`sales_data`.`is_adult`,
`sales_data`.`is_hispanic`,
`sales_data`.`recent_login`,
`sales_data`.`customer_postal_code`,
`sales_data`.`loyalty_id`,
`sales_data`.`product_code`,
`sales_data`.`brand`,
`sales_data`.`manufacturer`,
`sales_data`.`category_l1`,
`sales_data`.`category_l2`,
`sales_data`.`category_l3`,
`sales_data`.`category_l4`,
`sales_data`.`product_description`,
`sales_data`.`private_label`,
`sales_data`.`digital_receipt`,
`sales_data`.`quantity`,
`sales_data`.`sales_amount`,
`sales_data`.`loyalty_program_a`,
`sales_data`.`loyalty_program_b`,
`sales_data`.`loyalty_program_c`,
`sales_data`.`category_partition_key`,
`sales_data`.`load_timestamp`
FROM `sales_data`;
WITH snack_buyers AS (
SELECT DISTINCT customer_id
FROM sales_data
WHERE category_l1 = 'Snacks'
),
dairy_buyers AS (
SELECT DISTINCT customer_id
FROM sales_data
WHERE category_l1 = 'Dairy'
)
SELECT
(SELECT APPROX_COUNT_DISTINCT(customer_id) FROM dairy_buyers) AS dairy_customers,
(SELECT APPROX_COUNT_DISTINCT(customer_id) FROM snack_buyers) AS snack_customers;