Compare commits

...

334 Commits

Author SHA1 Message Date
mergify[bot] b95e90b091
[BugFix] Fix shutdown tablet can not gc (backport #63595) (#63624)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-26 10:33:19 +00:00
mergify[bot] 308a567473
[BugFix] change CHECK to DCHECK in nullablecolumn to prevent the crash (backport #63553) (backport #63565) (#63606)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-09-26 08:01:25 +00:00
mergify[bot] a17c83e003
[BugFix] Fix query detail lost audit items (backport #63237) (#63469) 2025-09-24 19:59:58 +08:00
mergify[bot] c36f909425
[BugFix] Remove the deregister logic from container (backport #63085) (#63514)
Signed-off-by: yandongxiao <yandongxiao@starrocks.com>
Co-authored-by: yandongxiao <yandongxiao@starrocks.com>
2025-09-24 19:16:11 +08:00
mergify[bot] c5ebf3e063
[BugFix] Fix array type analyze (backport #63371) (#63506)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-09-24 18:31:12 +08:00
mergify[bot] 5ce9b16626
[UT] fix unstable distance function cases because of precision (backport #63502) (#63511)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-24 17:26:17 +08:00
mergify[bot] d54987f1f2
[BugFix] update staros to v3.5-rc4 (backport #63398) (#63492)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-09-24 15:52:45 +08:00
SevenJ 0f43bd91ae
[BugFix] Fix bug where reserved words in iceberg partitions break toThrift (backport #63243) (#63476)
Co-authored-by: kyle-goodale-klaviyo <kyle.goodale@klaviyo.com>
2025-09-24 12:09:01 +08:00
mergify[bot] 418b332b49
[Doc] Update SQL Blacklist Doc (backport #63457) (#63489)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-24 03:52:56 +00:00
mergify[bot] bd224880d9
[Doc]Update iceberg_catalog.md (backport #63317) (#63487)
Signed-off-by: chelsea <48942089+wangsimo0@users.noreply.github.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: chelsea <48942089+wangsimo0@users.noreply.github.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-24 03:52:06 +00:00
mergify[bot] 10f01058d2
[Doc] Remove SELECT INTO OUTFILE from Doc (backport #63478) (#63483)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-24 03:35:39 +00:00
mergify[bot] 2ffe600c22
[Enhancement] Cache parquet column batch for delta lake metadata (backport #63441) (#63472)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-24 03:28:45 +00:00
mergify[bot] 03470d1c32
[Doc] Add document for iceberg table sorting function. (backport #63392) (#63479)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-24 02:57:22 +00:00
mergify[bot] d28f28448d
[BugFix] Fix zone map incorrect filtering after CHAR to VARCHAR fast schema evolution in shared-data (backport #63377) (#63474)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-24 10:27:44 +08:00
mergify[bot] ee1a9a5df9
[BugFix] fix iceberg read null partition bug (backport #62934) (#63040)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-09-24 09:55:35 +08:00
mergify[bot] b491a3f7c0
[Enhancement] Pass stream load label directly to TransactionStmtExecutor (backport #63334) (#63463)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-23 10:41:06 -07:00
mergify[bot] ce2f48c0f8
[BugFix] remove the annoying warning log of json (backport #63414) (#63458)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-23 11:33:53 +00:00
mergify[bot] 95f15ea692
[BugFix] Forget left join flag of table function when applying array low cardinality optimization (backport #63419) (#63450)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-09-23 19:07:51 +08:00
mergify[bot] 7f0a2acad5
[Enhancement] Optimize logging error message for delta lake (backport #63389) (#63431)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-23 09:20:15 +00:00
mergify[bot] f6ab8e8b94
[Enhancement] Implement SQL standard JOIN USING with MySQL compatibility (backport #63312) (#63395)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
Co-authored-by: stephen <stephen5217@163.com>
2025-09-23 08:37:21 +00:00
mergify[bot] e84b027d86
[UT] Fix floating-point precision test failures on ARM platforms (backport #63427) (#63437)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-23 07:50:44 +00:00
mergify[bot] bd6623501f
[BugFix] Fix infinite loop when inserting decimal256 data on ARM platforms (backport #63406) (#63434)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-23 07:47:31 +00:00
mergify[bot] c8582133dd
[Enhancement] [BugFix] Ensure fast fail if cngroup is not available (backport #63314) (#63428)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-23 15:35:53 +08:00
mergify[bot] 079adbaa8f
[BugFix] Fix incompatible bitmap index reuse for fast schema evolution in shared-data (backport #63315) (#63415)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-23 15:35:05 +08:00
mergify[bot] 79a69a8b67
[Enhancement] Add be jvm memory metrics (backport #62210) (#63413)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: nillin <linwb@zju.edu.cn>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-23 07:29:18 +00:00
mergify[bot] a12237e952
[BugFix] Fix dcg meta inconsistency when partial update with auto increment column in column upsert mode. (backport #63370) (#63423)
Signed-off-by: srlch <linzichao@starrocks.com>
Co-authored-by: srlch <111035020+srlch@users.noreply.github.com>
2025-09-23 07:25:57 +00:00
mergify[bot] f3c0f6898b
[Enhancement] QueryDetailActionV2 and QueryProfileActionV2 APIs return json result (backport #63235) (#63411)
Signed-off-by: zhaohehuhu <luoyedeyi@163.com>
Co-authored-by: He Zhao <luoyedeyi@163.com>
2025-09-23 06:36:49 +00:00
mergify[bot] fb74cebe3e
[Tool] add JVM options for JDK-17 (backport #63120) (#63129)
Signed-off-by: zhaohehuhu <luoyedeyi@163.com>
Co-authored-by: He Zhao <luoyedeyi@163.com>
2025-09-23 13:42:01 +08:00
mergify[bot] e5b9ac8c92
[Enhancement] Support revoke external group and add more test case (backport #63385) (#63397)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-23 13:30:34 +08:00
mergify[bot] 3d2a0d5301
[Doc] Flink Connector 1.2.12 Doc (backport #63386) (#63407)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-23 03:47:35 +00:00
mergify[bot] fadb8061f3
[Refactor] SegmentIterator::ScanContext (backport #63333) (#63401)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-09-23 03:31:18 +00:00
mergify[bot] 7d30a58f88
[Enhancement] Enforce request consistency for channel stream load (backport #63347) (#63404)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-23 03:28:23 +00:00
mergify[bot] 62118d7396
[Doc] Update Feature Support for Iceberg (backport #63288) (#63399)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-23 11:16:49 +08:00
mergify[bot] 6fcc5672fe
[Tool] meta_tool: dump_zonemap (backport #63292) (#63402)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-23 03:14:34 +00:00
mergify[bot] cc892a7196
[Enhancement] optimize analyze profile format (backport #63326) (#63396)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-23 03:14:16 +00:00
mergify[bot] 449a3ab2a6
[Enhancement] support expr reuse in outer join where predicates (backport #62139) (#62625)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
2025-09-23 11:13:51 +08:00
mergify[bot] 0aa7cfb99e
[BugFix] Fix the issue that create spill directory failed when writing data to iceberg table. (backport #63278) (#63393)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
2025-09-23 10:58:33 +08:00
mergify[bot] 953555c49c
[Tool] update default configuration of allin1 docker (backport #63133) (#63403)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-23 02:53:08 +00:00
mergify[bot] a34b2a699d
[BugFix] fix non-deterministic predicate push down problem (backport #62827) (#63353)
Signed-off-by: before-Sunrise <unclejyj@gmail.com>
Co-authored-by: before-Sunrise <71162020+before-Sunrise@users.noreply.github.com>
2025-09-23 10:45:00 +08:00
mergify[bot] 4d6242ccf3
[UT] Refactor some ut (backport #63095) (#63159)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-23 10:19:54 +08:00
mergify[bot] e7459daeac
[Enhancement] Implement grant and revoke role functionality for external groups (backport #63258) (#63374)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-22 21:43:47 +08:00
mergify[bot] 8f3acf1d0c
[BugFix] fix delete predicate edge case (backport #63339) (#63365)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-22 13:02:08 +00:00
mergify[bot] 5f1aa6a870
[Enhancement] reduce trace rule log (backport #62834) (#62904)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-09-22 16:02:04 +08:00
mergify[bot] bf1bd75627
[BugFix] Fix pk index cumulative compaction strategy when max_rss_rowid is same (backport #63277) (#63359)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-09-22 14:49:20 +08:00
mergify[bot] e82a6491a5
[BugFix] fix duplicate key table delete issue in shared-data mode (backport #63296) (#63352)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-22 03:26:07 +00:00
mergify[bot] 50e3b621c4
[Enhancement] remove the adaptive zonemap creation (backport #63297) (#63340)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-22 10:24:07 +08:00
mergify[bot] 81685137b1
[Enhancement] Support for `IF NOT EXISTS` and `IF EXISTS` clauses to the GROUP PROVIDER (backport #63248) (#63304)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-22 10:03:34 +08:00
mergify[bot] f102d804b6
[Doc] Add tip about TPC-DS benchmark query example (backport #63203) (#63320)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-20 21:56:45 +08:00
mergify[bot] 76d03c639e
[Doc] add video (backport #63330) (#63336)
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-20 21:55:11 +08:00
PengFei Li 060b5f60a4
[BugFix] Fix incompatible zonemap reuse for fast schema evolution in shared-data (backport #63143) (#63318)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-20 21:53:43 +08:00
mergify[bot] f338bc6ad8
[Enhancement] Make BE reject multi-statement transaction stream load (backport #63242) (#63329)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-19 11:07:25 -07:00
mergify[bot] 470c45b1b8
[BugFix] fix delvec no found issue when drop tablet and queries run concurrently (backport #63291) (#63308)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-09-19 06:50:00 +00:00
SevenJ 157b0085b5
[BugFix] Iceberg/fix case backport (backport #63194) (#63289)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
2025-09-19 11:50:14 +08:00
mergify[bot] 2aedd17e2f
[BugFix] Fix BE crash in tracer when jaeger_endpoint is invalid (backport #63257) (#63283)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-09-19 03:25:16 +00:00
mergify[bot] 5be0b3f880
[BugFix] Fix ApplyCommitTask loss in concurrent scenarios. (backport #60633) (#63287)
Signed-off-by: edwinhzhang <edwinhzhang@tencent.com>
Signed-off-by: sevev <qiangzh95@gmail.com>
Signed-off-by: zhangqiang <qiangzh95@gmail.com>
Co-authored-by: zhanghe <edwinhzhang@tencent.com>
Co-authored-by: sevev <qiangzh95@gmail.com>
2025-09-19 02:47:02 +00:00
mergify[bot] 5df5201c3f
[BugFix] Fix get max compaction score NullPointerException bug (backport #63268) (#63273)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-18 14:20:04 +00:00
mergify[bot] f0a8047878
[BugFix] Remove unused codes (backport #63261) (#63269)
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-18 12:13:25 +00:00
mergify[bot] cf8109bd11
[BugFix] Fix ranger hive service alter privilege not work (backport #63251) (#63264)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-18 12:00:14 +00:00
mergify[bot] cb35a1c0d8
[UT] Fix timeout SQL Test (backport #63246) (#63254)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-09-18 15:18:59 +08:00
mergify[bot] 749529a24b
[Enhancement] Update default bucket size of random distribution (backport #63168) (#63244)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-18 13:12:16 +08:00
Shawn a999634dc7
[Enhancement] Emit routine load lag time (backport #62048) (#63239) 2025-09-17 16:37:20 +00:00
mergify[bot] 7a765d9f1a
[Enhancement] Optimize parsing predicates with large number of CompoundPredicates (backport #63139) (#63234)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-17 11:54:40 +00:00
Drake Wang b464c2e4a5
[BugFix] Fix version check failed while applying replication txn with compaction enabled (backport #62663) (#63228)
Signed-off-by: Drake Wang <wxl24life@gmail.com>
2025-09-17 18:26:25 +08:00
mergify[bot] 03157ef98d
[BugFix] Fix the bug where UserProperty priority is lower than Session Variable (backport #63173) (#63214)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-17 16:51:38 +08:00
mergify[bot] dccca50e33
[Doc] Add references for sys.policy_references (backport #63183) (#63211)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-17 01:57:34 +00:00
mergify[bot] fe9067d503
[BugFix] Fix cte reuse plan extract error (backport #62784) (#63187)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-09-17 09:37:19 +08:00
mergify[bot] 71023642f7
[Doc] Add keywords for query tuning introduction (backport #63204) (#63207)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-17 08:39:30 +08:00
mergify[bot] 0d8ffc659e
[BugFix] Fix multi statement stream load due to invalid source type (backport #63044) (#63154)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-16 11:24:43 -07:00
mergify[bot] d675248863
[BugFix] Fix transaction write deadlock due to set error message (backport #62961) (#63199)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-16 10:13:22 -07:00
mergify[bot] 9d3a2e50df
[BugFix] Fix null exception during remove expired load job (backport #63042) (#63179)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-16 10:20:50 +00:00
mergify[bot] 00d76ad660
[Enhancement] Improve fragment instance exec state report (backport #63132) (#63190)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-16 10:17:10 +00:00
mergify[bot] c462f0779d
[Enhancement] Implement ProfileActionV2 and QueryDetailActionV2 to obtain query information across all FEs (backport #61345) (#63174)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: He Zhao <luoyedeyi@163.com>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-16 07:43:49 +00:00
mergify[bot] 675fdd1197
[UT] Optimize FE tests' logging output (backport #62985) (#63111)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-16 15:00:46 +08:00
mergify[bot] 4211f7222d
[Doc] Fix Stream Load Param Desc (backport #63172) (#63178)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-16 06:14:19 +00:00
mergify[bot] d95b1a7d75
[BugFix] Fix mv repair hive base table bug (backport #63072) (#63123)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-16 05:42:21 +00:00
mergify[bot] 55667efd01
[UT] [BugFix] Fix FineGrainedRangePredicateRule rule bug (backport #63148) (#63169)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-16 12:41:21 +08:00
mergify[bot] af71661762
[BugFix] Fix secondary replicas continue waiting because of wrong timestamp (backport #62805) (#63162)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-16 11:27:38 +08:00
mergify[bot] e33b9bc64d
[Enhancement] Loose check mv's schema for better compatibilities (backport #63114) (#63160)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-16 11:02:12 +08:00
mergify[bot] 631312127b
[Enhancement] Force drop decommissioned backend if all the tablets in recycle bin (backport #62781) (#63156)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
2025-09-16 02:39:21 +00:00
mergify[bot] c0e4a1337b
[Doc] Update Colocate Join Principles (backport #63153) (#63165)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-16 02:13:50 +00:00
mergify[bot] 53754c69a8
[Enhancement] Update vacuum metric when vacuum success (backport #62540) (#63099)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-16 09:56:32 +08:00
mergify[bot] 5670387325
[Doc] Add DN matching mechanism for LDAP Group Provider and update documentation (backport #63115) (#63158)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Harbor Liu <460660596@qq.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-16 09:54:54 +08:00
mergify[bot] a8c769ab8c
[Enhancement] Enhance cluster snapshot restore to support warehouse (backport #63023) (#63118)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-09-15 10:27:37 +00:00
mergify[bot] c6e4057dff
[Enhancement] Choose best candidate mv with considering input query data layout (backport #62830) (#63025)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-15 17:14:25 +08:00
mergify[bot] 12f39a524d
[Doc] Metrics for Fragment Instance State Report (backport #63112) (#63122)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-15 05:10:01 +00:00
mergify[bot] d894c1cc98
[BugFix] Fix mv agg pushdown rewrite bugs (backport #63060) (#63107)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-15 12:49:56 +08:00
mergify[bot] 6027fa3b52
[Tool] add healthcheck in allin1-ubuntu (backport #62998) (#63113)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-15 03:06:30 +00:00
mergify[bot] 885389fe1f
[UT] Fix optimize job test (backport #63096) (#63104)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-15 10:56:06 +08:00
mergify[bot] ded88787e5
[Doc] Add clone metrics doc (backport #63073) (#63110)
Signed-off-by: wyb <wybb86@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: wyb <wybb86@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-15 02:15:06 +00:00
mergify[bot] 8861a1027e
[Enhancement] support s3 path style in shared-data cluster (backport #62591) (#63082)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-09-14 16:50:15 +08:00
mergify[bot] 4a3309228a
[Doc] Add 'nei cun' (memory) keyword to best practice overview (backport #63028) (#63050)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-12 09:02:26 -04:00
mergify[bot] 8322e224e4
[Enhancement] Add fragment instance exec state report thread pool metrics (backport #63067) (#63092)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-12 11:12:57 +00:00
mergify[bot] b4522a7da5
[Enhancement] Optimize removeDuplicateField performance (backport #62938) (#63063)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-12 18:08:50 +08:00
mergify[bot] b557885b73
[BugFix] change tuning guide format (backport #63024) (#63079)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-12 18:01:58 +08:00
mergify[bot] 13cd888d9d
[BugFix] Fix JSON extraction null column consistency and add validation checks (backport #63054) (#63078)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-12 08:47:52 +00:00
mergify[bot] e2a0aac85d
[BugFix] Fix mv rewriter binder bugs (backport #62919) (#63057)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-12 15:21:42 +08:00
mergify[bot] 25adbac4e0
[BugFix] fix hour_from_unixtime rule (backport #63006) (#63056)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-09-12 07:04:51 +00:00
mergify[bot] c8e85a33ae
[BugFix] fix shared-data cluster MV does not support colocation (backport #62941) (#63033)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-09-12 14:37:12 +08:00
mergify[bot] 70280d3da6
[BugFix] fix iceberg manifest cache npe in data race condition (backport #63043) (#63051)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-12 06:09:33 +00:00
mergify[bot] fcb46895e3
[BugFix] Fix view based rewrite bugs (backport #62918) (#63013)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-12 14:03:41 +08:00
mergify[bot] 055bf5a488
[BugFix] fix bugs of FlatJSON with lake table (backport #62706) (#63041)
Signed-off-by: Murphy <mofei@starrocks.com>
Signed-off-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Kevin Cai <caixh.kevin@gmail.com>
2025-09-12 13:49:09 +08:00
mergify[bot] 2fe00e59f3
[Enhancement] Make some fe metrics leader awareness (backport #63004) (#63038)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-12 04:01:17 +00:00
mergify[bot] df795c442f
[Enhancement] add catalog and queryId info in ShowProcessList (backport #62552) (#63022)
Signed-off-by: zhaohehuhu <luoyedeyi@163.com>
Co-authored-by: He Zhao <luoyedeyi@163.com>
2025-09-11 12:11:21 +00:00
mergify[bot] b6c2478a24
[BugFix] fix iceberg table scan exception during scan range deploy (backport #62994) (#63018)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-11 11:05:18 +00:00
mergify[bot] 2ac795296e
[BugFix] Revert #62916 (backport #63007) (#63009)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-11 16:59:19 +08:00
mergify[bot] 2568feffa6
[BugFix] Gracefully Shutdown Compute Node on Exit (backport #62916) (#63000)
Signed-off-by: Claire Fei <cfei@atlassian.com>
Co-authored-by: Claire <30540604+Tenaria@users.noreply.github.com>
2025-09-11 08:01:59 +00:00
mergify[bot] 395e12e3ba
[UT] fix ut ASAN leak, wait for all rpc requests done before exit (backport #62986) (#62990)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-11 06:39:55 +00:00
srlch 7bfb6c6076
[Enhancement] provide alter table xxx set auto_increment (backport #62767) (#62876)
Signed-off-by: srlch <linzichao@starrocks.com>
Co-authored-by: Evgeniy Shishkin <eshishki@gmail.com>
2025-09-11 13:54:57 +08:00
mergify[bot] 5969a5e8e3
[BugFix] Fix stream load exec status update NPE (backport #62921) (#62981)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-11 04:03:22 +00:00
mergify[bot] a507915f0a
[Doc] Add keywords for resource group documentation (backport #62959) (#62977)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-09-11 10:38:34 +08:00
mergify[bot] 0d5c7a1f8b
[Doc] Update JDBC connector download link for Tableau (backport #62960) (#62972)
Signed-off-by: Dan Roscigno <dan@roscigno.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-11 10:37:44 +08:00
mergify[bot] 360aedf922
[BugFix] Fix delta lake table can not find partition column (backport #62953) (#62969)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-11 02:37:08 +00:00
mergify[bot] 853cdaeedf
[Doc]Fixed syntax error of pre-create partition example (backport #62726) (#62965)
Signed-off-by: megao <jetgm@163.com>
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: megao <jetgm@163.com>
Co-authored-by: DanRoscigno <dan@roscigno.com>
2025-09-11 00:53:35 +00:00
mergify[bot] 15827a307c
[Doc] Doc for Modifying Column Comment (backport #62951) (#62955)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 16:45:06 -04:00
mergify[bot] 35ae03962e
[BugFix] Fix invalid ProjectOperator above table-pruning frontier CTEConsumperOperator (backport #62914) (#62936) 2025-09-10 19:52:19 +08:00
mergify[bot] 9fba692e37
[Doc] Fix DROP ROLE Description (backport #62946) (#62948)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 18:11:17 +08:00
mergify[bot] 0e50611222
[BugFix] mutate input columns in functions' returning value (backport #62826) (#62943)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
2025-09-10 09:56:19 +00:00
mergify[bot] 47f53b9891
[BugFix] Revert Add transaction error message to loads internal table (#61364) (backport #62928) (#62930)
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-10 16:25:07 +08:00
mergify[bot] ff2850a094
[Doc] Add recyclebin_catalogs system table doc (backport #62878) (#62925)
Signed-off-by: wyb <wybb86@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: wyb <wybb86@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 07:28:10 +00:00
mergify[bot] 3e8c877412
[BugFix] Fix redundant replica handling after clone (backport #62542) (#62896)
Signed-off-by: Hongkun Xu <xuhongkun666@163.com>
Co-authored-by: Hongkun Xu <xuhongkun666@163.com>
2025-09-10 06:51:47 +00:00
mergify[bot] 8aeabdf45d
[BugFix] Fix collecting stream load profile failed (backport #62802) (#62907)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-10 13:52:41 +08:00
mergify[bot] 2484b49178
[Enhancement] Support use DN to match group in group provider (backport #62711) (#62885)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-10 13:38:46 +08:00
mergify[bot] 94342fc8e6
[Doc] Update Delta Lake Feature Support (backport #62906) (#62910)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 02:54:15 +00:00
Cosmin Lazar 41e54825ee
[Enhancement] Support Azure Workload Identity authentication for Azure Data Lake Storage Gen2 (backport #62754) (#62887)
Signed-off-by: Cosmin Constantin Lazar <cosminconstantinlazar@gmail.com>
2025-09-10 10:36:35 +08:00
mergify[bot] a791d22b21
[BugFix] Fix improper BE selection causing ineffective rebalance (backport #62776) (#62892)
Signed-off-by: Hongkun Xu <xuhongkun666@163.com>
Co-authored-by: Hongkun Xu <xuhongkun666@163.com>
2025-09-10 02:24:40 +00:00
mergify[bot] 3635c50e13
[Doc]Update views.md (backport #62898) (#62899)
Signed-off-by: chelsea <48942089+wangsimo0@users.noreply.github.com>
Co-authored-by: chelsea <48942089+wangsimo0@users.noreply.github.com>
2025-09-10 02:16:25 +00:00
mergify[bot] 4df9563dce
[Doc] fixed hardcoded outdated JDBC link to general Tableau JDBC marketplace link (backport #62858) (#62890)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Ron Kapoor <ronkapoor2017@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-10 00:38:44 +00:00
mergify[bot] f2b1b99859
[BugFix] fix missing compaction profile when file bundling is on (backport #62638) (#62864)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-09-09 16:34:07 +08:00
mergify[bot] bafb065189
[BugFix] Fix SQL syntax error in histogram statistics when MCV contains single quotes (backport #62853) (#62865)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-09 16:22:33 +08:00
絵空事スピリット a55d7a614a
[Doc] Release Prep in branch-4.0 (#62814)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 15:49:25 +08:00
mergify[bot] a0ec39230a
[BugFix] fix kill analyze command (backport #62842) (#62869)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-09 15:37:48 +08:00
mergify[bot] 3be4653b77
[Feature] Support multi statement transaction (part1) - stream load (backport #61362) (#61917)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-09 00:33:25 -07:00
mergify[bot] 573454f4f8
[BugFix] fix nullptr delta writer in local tablet channel (backport #62861) (#62875)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-09 07:26:54 +00:00
mergify[bot] 5c87a1899f
[BugFix] Fix colocate group execution not found exec group (backport #62465) (#62548)
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-09-09 14:48:54 +08:00
mergify[bot] 1538f9d741
[Doc] Iceberg REST Catalog supports Vended Credentials (backport #62576) (#62859)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 14:41:58 +08:00
mergify[bot] 183bae2ff1
[Enhancement] Add transaction error message to loads internal table (backport #61364) (#62851)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-09 04:31:21 +00:00
mergify[bot] 1cd78def58
[Doc] Doc Update for Indexes (backport #62042) (#62857)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 03:21:51 +00:00
mergify[bot] c29dec34bd
[Doc] Format Deltalake Catalog for 4.0 (backport #62599) (#62856)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 03:16:11 +00:00
mergify[bot] b38111c4bc
[Doc] V4.0 Iceberg Compaction (backport #62641) (#62855)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 03:15:37 +00:00
mergify[bot] 4193ea9824
[Doc] Optimized Function Docs for v4.0 (backport #62158) (#62854)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 03:14:41 +00:00
mergify[bot] fb75a71742
[BugFix] Fix metrics of Prometheus format bug (backport #62742) (#62838)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
2025-09-09 02:43:54 +00:00
mergify[bot] 47386399b5
[Doc] Rebuild Data Distribution (backport #61889) (#62845)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-09 02:35:14 +00:00
mergify[bot] 5e34559192
[Enhancement] support common expr reuse in complex case-when expr in scan predicates (backport #62779) (#62824)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
Co-authored-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
2025-09-09 02:19:14 +00:00
mergify[bot] 08e4100bd6
[Enhancement] Health Action need to be handled synchronously (backport #62490) (#62762)
Signed-off-by: crossoverJie <crossoverJie@gmail.com>
Co-authored-by: crossoverJie <crossoverJie@gmail.com>
2025-09-09 09:20:53 +08:00
mergify[bot] 1710f4d901
[Doc] Revert enable_ssl property in Storage Volume (backport #62811) (#62817)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-08 03:32:28 +00:00
mergify[bot] d82390e5a2
[BugFix] fix NPE of information_schema.analyze_status when db is dropped (backport #62796) (#62813)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-08 03:18:01 +00:00
mergify[bot] 0649cfb7a8
[Doc] Docs for CN Blacklist (backport #62018) (#62769)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-08 09:28:08 +08:00
mergify[bot] c5fc434994
[Doc] Update Partitioned Materialized View (backport #62759) (#62774)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-08 09:25:30 +08:00
mergify[bot] 860aa78a39
[BugFix] storage volume delete should not be allowed if active snapshot present (backport #62246) (#62777)
Signed-off-by: Rohit Satardekar <rohitrs1983@gmail.com>
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Rohit Satardekar <rohitrs1983@gmail.com>
Co-authored-by: Kevin Cai <kevin.cai@celerdata.com>
2025-09-08 09:18:55 +08:00
mergify[bot] ccb3e787bd
[BugFix][CVE-2025-58056] bump io.netty version to 4.1.125.Final (backport #62801) (#62807)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-07 15:14:35 +08:00
mergify[bot] 98cbd48b98
[UT] fix sql test for low card on lake (backport #62771) (#62804)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-06 21:34:00 +08:00
mergify[bot] 77bf0cbc1c
[Refactor] Split data cache engine into disk cache engine and memory cache engine. (backport #62760) (#62798)
Signed-off-by: trueeyu <lxhhust350@qq.com>
Co-authored-by: trueeyu <lxhhust350@qq.com>
2025-09-05 23:34:41 +08:00
mergify[bot] 6f48e4686e
[BugFix] Use session default db in show create routine load if unspecified (backport #62745) (#62791)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-05 11:51:05 +00:00
mergify[bot] 939b72c40a
[BugFix] Fix csv header skip causing data loss in files() (backport #62719) (#62786)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-05 11:04:59 +00:00
mergify[bot] 2bee442be7
[Enhancement] Use privilege REFRESH instead of ALTER to execute refresh statement (backport #62636) (#62782)
Co-authored-by: Youngwb <yangwenbo_mailbox@163.com>
2025-09-05 09:43:19 +00:00
mergify[bot] dedb56f72e
[Enhancement] add command to show the dropped meta information that can be recovered (backport #51007) (#62765)
Signed-off-by: Rohit Satardekar <rohitrs1983@gmail.com>
Co-authored-by: Rohit Satardekar <rohitrs1983@gmail.com>
2025-09-05 06:22:31 +00:00
mergify[bot] 14573ee728
[BugFix] skip combine txnlog when handle non-pk table deletion (backport #62735) (#62755)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-09-04 21:38:39 +08:00
mergify[bot] d66412ce10
[Doc] Add enable_group_by_compressed_key (backport #61986) (#62734)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-04 19:26:15 +08:00
mergify[bot] 419b803d98
[Doc] Separator and Delimiter with Multiple Non-printable Characters (backport #62744) (#62747)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-04 19:08:14 +08:00
mergify[bot] 96b335c9ad
[BugFix] Fix db is null when replaying batch transactions upsert (backport #62715) (#62736)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-04 17:07:36 +08:00
mergify[bot] 8bf9df6f65
[UT] Fix timeout for SQL test case `test_partition_hash_join` (backport #62722) (#62739)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-09-04 16:16:13 +08:00
mergify[bot] 68ea8d4fde
[Doc] fix typo (backport #62729) (#62731)
Signed-off-by: qingzhongli <qingzhongli2018@gmail.com>
Co-authored-by: qingzhongli <qingzhongli2018@gmail.com>
2025-09-04 07:33:29 +00:00
mergify[bot] 84921c4361
[Tool] simplify meta_tool help message (backport #62703) (#62724)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-04 03:43:28 +00:00
mergify[bot] 09b4fda4d5
[BugFix] fix iceberg transform compact bug (backport #62697) (#62721)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-09-04 11:09:41 +08:00
mergify[bot] 13755f4f16
[BugFix] Fix cache env init order (backport #62700) (#62712)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-03 12:22:08 +00:00
mergify[bot] 7640fb0302
[Enhancement] Disable ARRAY function for DECIMAL256 type (backport #62670) (#62709)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-03 19:18:53 +08:00
mergify[bot] ba258aee4c
[UT] add more decimal256 sql test cases (backport #62664) (#62710)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-03 18:54:37 +08:00
mergify[bot] c0fa924fe2
[Enhancement] Fast schema evolution supports adding key columns for shared-data (backport #62253) (#62489)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-03 17:19:44 +08:00
mergify[bot] cd8a9c3599
[UT] add more sql test about agg filter (backport #62640) (#62701)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-03 08:27:32 +00:00
mergify[bot] 9d837fc359
[Enhancement] disable low card on lake by default because of potential bugs (backport #62586) (#62691)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-09-03 05:51:44 +00:00
mergify[bot] 8d1700a966
[BugFix] Fix publish incorrectly reported as successful during graceful shutdown in shared-nothing (backport #62417) (#62684)
Signed-off-by: PengFei Li <lpengfei2016@gmail.com>
Co-authored-by: PengFei Li <lpengfei2016@gmail.com>
2025-09-03 13:41:47 +08:00
mergify[bot] 47a74eca99
[Enhancement] turn on tablet balance between workers by default in shared-data mode (backport #62661) (#62676)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-09-03 03:22:33 +00:00
mergify[bot] 98477be39a
[Enhancement] introduce a function to obtain the column size (backport #62481) (#62674)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-09-03 03:19:21 +00:00
mergify[bot] 5cfe433d5b
[Enhancement] replace memcompare with memequal for SortedAgg (backport #62585) (#62672)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-09-03 10:34:43 +08:00
mergify[bot] 9b21d191af
[Enhancement] Support complex expressions in FILTER clause and add boolean type validation for aggregate functions (backport #62637) (#62665)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-03 10:27:31 +08:00
mergify[bot] ff547d6a45
[Doc] Doc for pipeline_sink_dop (backport #62618) (#62668)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-03 01:44:40 +00:00
mergify[bot] 3c9d5cb89c
[BugFix] Clear mv's version map if restore job failed (backport #62634) (#62644)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-02 18:46:22 +08:00
mergify[bot] ebf06bbe4e
[BugFix] Fix async delta writer crash due to null pointer (backport #62626) (#62651)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-09-02 10:28:34 +00:00
mergify[bot] 4a333abbd2
[BugFix] Fix case-sensitive partition column validation in materialized view analyzer (backport #62598) (#62622)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-02 13:37:18 +08:00
mergify[bot] 8dfe7b0eb3
[BugFix] Set executionId before parser to avoid duplicate executionId for syntax error sql. (backport #62258) (#62612)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
2025-09-02 10:37:47 +08:00
mergify[bot] fd9b3e3c2f
[BugFix] Fix optimize table task submit reject (backport #62300) (#62555)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-01 19:16:33 -07:00
mergify[bot] d3dedf1051
[Enhancement] Add clone metrics in backend (backport #62479) (#62607)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-09-02 02:15:40 +00:00
mergify[bot] 4d0d380972
[Enhancement] Support multi statements transaction (part2) (backport #62019) (#62606)
Signed-off-by: meegoo <meegoo.sr@gmail.com>
Co-authored-by: meegoo <meegoo.sr@gmail.com>
2025-09-01 19:15:16 -07:00
mergify[bot] 9714a299a7
[BugFix] Change the `tuple_id` field in `TIcebergTableSink` to optional to fix the compatibility issues with historical versions. (backport #62593) (#62604)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
2025-09-02 02:10:15 +00:00
mergify[bot] a3553f320d
[BugFix] Fix two minus signs when the result is INT256_MIN (backport #62512) (#62528)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-02 10:02:19 +08:00
mergify[bot] 23079e240e
[Doc] Remove the custom JDBC plugin from ldap authentication (backport #62584) (#62615)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-09-02 10:00:57 +08:00
mergify[bot] 678a880fe3
[BugFix] Disable tablet creation optimization when partitions have multiple indexes (backport #62595) (#62600)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-09-01 22:18:03 +08:00
mergify[bot] ff3e3b86bd
[Enhancement] Change transform type prefer string for fixed length varchar (backport #62476) (#62587)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-01 17:04:44 +08:00
mergify[bot] 016d806bdd
[BugFix] Fix int256_t negation undefined behavior causing memory allocation failure (backport #62510) (#62578)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-01 07:37:32 +00:00
mergify[bot] dacc9100f5
[Enhancement] Add config to disable statistics cache lazy refresh by default (backport #62518) (#62573)
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-01 15:17:38 +08:00
mergify[bot] 95b4145579
[BugFix] Remove redundant status setting in CancelableAnalyzeTask to avoid overriding StatisticsExecutor status (backport #62538) (#62571)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-01 15:01:14 +08:00
mergify[bot] 5aba68ee1f
[BugFix] Fix possible NPE in mv backup restore (backport #62514) (#62560)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-09-01 06:56:11 +00:00
mergify[bot] 4c14589c24
[BugFix] fix statistics collection error message (backport #62533) (#62568)
Signed-off-by: stephen <stephen5217@163.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
2025-09-01 14:55:53 +08:00
mergify[bot] 82b8a41e2f
[BugFix] Increase default max connection limit for external users (backport #62523) (#62564)
Co-authored-by: Harbor Liu <460660596@qq.com>
2025-09-01 14:40:33 +08:00
mergify[bot] 46cd1a0986
[BugFix] fix the http_workers_num metric (backport #62457) (#62546)
Signed-off-by: crossoverJie <crossoverJie@gmail.com>
Co-authored-by: crossoverJie <crossoverJie@gmail.com>
2025-09-01 04:32:31 +00:00
starrocks-xupeng 67f90d7979
[Enhancement] mask credential info in submit task (backport #62311) (#62554)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
2025-09-01 04:30:57 +00:00
mergify[bot] 148829d642
[BugFix] Fix integer overflow caused by integer left shift in compression key (backport #62366) (#62451)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-31 13:16:05 +08:00
mergify[bot] 6029763c83
[BugFix] Do not simplify case-when with complex functions to avoid yielding very tedious result on scan node because of lack of CSE extraction (backport #62505) (#62519)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-29 19:51:57 +08:00
mergify[bot] 2fe7265367
[Doc] Categorize enable_auth_check (backport #62517) (#62525)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-29 08:55:59 +00:00
mergify[bot] b1ec299c5d
[Enhancement] Make json_extract return json type in trino dialect (backport #59718) (#62503)
Signed-off-by: ‘duanyyyyyyy’ <yan.duan9759@gmail.com>
Co-authored-by: duanyyyyyyy <139062392+duanyyyyyyy@users.noreply.github.com>
2025-08-29 08:12:25 +00:00
mergify[bot] ede876e277
[Enhancement] Add clone metrics in frontend (backport #62421) (#62515)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-29 08:08:05 +00:00
mergify[bot] ea63796289
[BugFix] fix table function use low cardinality error (backport #62292) (#62385)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-08-29 15:39:32 +08:00
mergify[bot] fa61498350
[Enhancement] add remote file cache limit for hive (backport #62288) (#62353)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-08-29 14:17:20 +08:00
mergify[bot] 575b52389b
[Enhancement] revise cache mem size limit for iceberg (backport #61966) (#62357)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-08-29 14:16:42 +08:00
mergify[bot] 9dd848f23c
[Enhancement] reduce unnecessary storage related logs (backport #62121) (#62470)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-29 14:07:43 +08:00
mergify[bot] d2e01def4b
[BugFix] TableFunction not use low cardinality optimization (backport #62466) (#62495)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-29 11:26:39 +08:00
mergify[bot] 71e93ec62b
[Enhancement] improve lake information schema performance (backport #62404) (#62441)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-08-29 11:24:00 +08:00
mergify[bot] bf3c758a1d
[BugFix] Fix compile memcpy_inlined_overflow16 on ARM (backport #62478) (#62493)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-29 03:11:42 +00:00
mergify[bot] 76225c9a6f
[BugFix] Fix division by zero for partition hash join (backport #62474) (#62486)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-29 02:47:32 +00:00
mergify[bot] a2b49dcc85
[BugFix] Fix table with gin index use replicated_storage (backport #62480) (#62487)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-08-29 02:44:28 +00:00
mergify[bot] 640bbe10e9
[BugFix] fix the execState of multiple fe (backport #62376) (#62382)
Signed-off-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-08-28 22:55:48 +08:00
mergify[bot] 5f192a6f28
[Enhancement] add ExecState into /current_queries cmd result to distinguish running/pending query (backport #62261) (#62372)
Signed-off-by: MatthewH00 <1639097204@qq.com>
Co-authored-by: hmx <1639097204@qq.com>
2025-08-28 21:30:49 +08:00
mergify[bot] 77291b7c49
[BugFix] Fix UAF when FixedLengthColumn append self (backport #62375) (#62393)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-28 11:11:25 +00:00
mergify[bot] 89cd08dd15
[BugFix] fix the privilege issue of refresh mv (backport #62396) (#62464)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-08-28 10:24:38 +00:00
mergify[bot] b045d6efbd
[BugFix] Fix mv refresh bug with case-insensitive partition names (backport #62389) (#62444)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-28 16:24:19 +08:00
mergify[bot] c03f27bca3
[BugFix] Fix UAF for BinaryColumn::append_selective (backport #62410) (#62458)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-28 15:57:15 +08:00
mergify[bot] e7b6d57f23
[Doc] Update auditloader download url (backport #62452) (#62456)
Signed-off-by: Dan Jing <jingdan@starrocks.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: Dan Jing <jingdan@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-28 07:27:32 +00:00
mergify[bot] b5805bfe29
[Doc] Updated documentation to align with Auditloader 5.0 (backport #62419) (#62449)
Signed-off-by: 吴梦龙 <1849777679@qq.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 吴梦龙 <1849777679@qq.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-28 14:52:24 +08:00
mergify[bot] abc050347c
[UT] Fix ut append_chunk_safe (backport #62413) (#62418)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-28 14:50:45 +08:00
mergify[bot] df8b4f31f3
[BugFix] fix lambda common expr slot id conflicts in array_map (backport #62414) (#62428)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
2025-08-28 06:44:24 +00:00
mergify[bot] 6ada019be0
[BugFix] fix the overlap check of zonemap (backport #62369) (#62411)
Signed-off-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-28 14:06:06 +08:00
mergify[bot] b228bb67cc
[Enhancement] Extend hour_from_unixtime optimization (backport #62338) (#62397)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-28 14:04:37 +08:00
mergify[bot] 658d5c1b9d
[Doc] Add zonemap and json configuration parameters (backport #62368) (#62398)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-28 14:02:16 +08:00
mergify[bot] 02e81c8b8f
[Doc] Document session variables and update Flat_json.md (backport #62367) (#62399)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-28 14:01:15 +08:00
mergify[bot] dc9598f87f
[Feature] Support automatic creating split tablet job (backport #61650) (#62420)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-08-28 13:37:16 +08:00
mergify[bot] e6d48b7a86
[BugFix] adjust star mgr journal replay log (backport #62374) (#62401)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-28 03:11:44 +00:00
mergify[bot] f4ee640fc7
[Enhancement] create adaptive zonemap index for strings (backport #61965) (#62361)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-28 10:23:33 +08:00
mergify[bot] 512567775d
[BugFix] use fe id instead to replace the fe name (backport #62378) (#62394)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-28 10:09:00 +08:00
mergify[bot] ad022e686f
[BugFix] fix combine txnlog vacuum issue when delete tablets (backport #62363) (#62390)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-27 19:45:36 +08:00
mergify[bot] 56c73c2443
[Enhancement] Add more failure reasons for tablet clone (backport #62293) (#62380)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-27 09:09:53 +00:00
mergify[bot] d4d0774dda
[Enhancement] enable lake tablet internal parallel scan by default (backport #62159) (#62359)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-08-27 03:50:13 +00:00
mergify[bot] 0eb1a93ca0
[Enhancement] Adjust partition hash join strategy (backport #61405) (#62355)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-27 11:25:14 +08:00
mergify[bot] 7d45558eb5
[Enhancement] enable flat json by default (backport #62097) (#62354)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-27 03:12:18 +00:00
mergify[bot] b3fa782058
[BugFix] Fix too many disk io when check consistency (backport #61745) (#62348)
Co-authored-by: kisshot288 <59246842+kisshot288@users.noreply.github.com>
2025-08-26 15:09:14 +00:00
mergify[bot] e4e493480b
[BugFix] Fix possible NPE in alter table (backport #62321) (#62340)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-26 21:18:52 +08:00
mergify[bot] b69e6bd00f
[BugFix] Align enable_merge_commit setting for FE/BE (backport #62310) (#62322)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-26 15:18:54 +08:00
mergify[bot] ee253d78df
[Enhancement] add a session variable to deploy scan ranges back/foreground (backport #62291) (#62320)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-26 06:59:50 +00:00
mergify[bot] 4efe5dbf02
[Doc] enable_ssl Property in Storage Volume (backport #62323) (#62328)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-26 06:59:29 +00:00
mergify[bot] 98032f2b0a
[Enhancement] create string column zonemap with prefix truncation (backport #61975) (#62317)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-26 06:33:09 +00:00
mergify[bot] 6ca0ee936b
[UT] Fix join test timeout (backport #62298) (#62319)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-26 06:24:49 +00:00
mergify[bot] c844fce1fb
[Enhancement] Rewrite MIN(f(col)) to f(MIN(col)) for Monotonic Functions (backport #62225) (#62315)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Murphy <mofei@starrocks.com>
2025-08-26 05:59:55 +00:00
mergify[bot] 3b1e377c6e
[Enhancement] Extend MinMaxStats optimization to support DictMappingExpr (backport #62212) (#62316)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-26 05:58:15 +00:00
mergify[bot] afcbb584cd
[BugFix] mask credential info when query execution error (backport #62283) (#62313)
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-26 13:51:13 +08:00
mergify[bot] 38e8956901
[Doc] correct parameter for enabling JWT auth (backport #62242) (#62306)
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-08-26 02:39:15 +00:00
mergify[bot] de98d5a362
[Doc] add pinyin term (backport #62299) (#62303)
Signed-off-by: DanRoscigno <dan@roscigno.com>
Co-authored-by: Dan Roscigno <dan@roscigno.com>
2025-08-26 02:25:41 +00:00
mergify[bot] 946a05b429
[BugFix] avoid BE crash when LakePersistentIndex init fail (backport #62279) (#62296)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-26 10:21:54 +08:00
mergify[bot] b647155d20
[BugFix]Fix the error of using GTID to handle dirty tablet metadata (backport #62275) (#62284)
Signed-off-by: edwinhzhang <edwinhzhang@tencent.com>
Co-authored-by: zhanghe <edwinhzhang@tencent.com>
2025-08-25 10:11:11 +00:00
mergify[bot] 08af8a12be
[Enhancement] Use linear-chained to optimize hash join (backport #61429) (#62281)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-25 09:44:21 +00:00
mergify[bot] cc4228fb6d
[BugFix] Add lock when get max version from tablet in replication txn manager (backport #62238) (#62277)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-08-25 09:09:35 +00:00
mergify[bot] e3faf9c570
[Doc] Fix snippet in deployment (backport #62269) (#62272)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-25 07:10:35 +00:00
mergify[bot] 64db225e92
[Doc] Add balance statistics doc (backport #62170) (#62265)
Signed-off-by: wyb <wybb86@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: wyb <wybb86@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-25 06:35:21 +00:00
mergify[bot] b1e0f0cca9
[Doc] Update fe tablet schdules system table doc (backport #62180) (#62267)
Signed-off-by: wyb <wybb86@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: wyb <wybb86@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-25 06:34:55 +00:00
mergify[bot] 1ec6cf224f
[Enhancement] fix profile when deploying scan ranges in background (backport #62223) (#62264)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-25 06:12:02 +00:00
Gavin 66b0f412c4
[Enhancement] Optimize the iceberg sink local sorting based on the spill partition writer (backport #62096) (#62252)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
2025-08-25 13:57:40 +08:00
mergify[bot] bdb0b0e467
[Enhancement] Optimize append_selective for binary column (backport #62165) (#62259)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-25 12:32:17 +08:00
mergify[bot] bd48411e3e
[Enhancement] short circuit optimization on select limit case (on Scan Node) (backport #62188) (#62257)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-25 04:07:07 +00:00
mergify[bot] 3a7e014ba6
[BugFix] fix cn crash when cache is turned off (backport #62174) (#62256)
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-25 10:53:58 +08:00
mergify[bot] c0ec75f889
[Doc] Add Variables (backport #62171) (#62237)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-25 10:02:30 +08:00
mergify[bot] d14a733a70
[BugFix] support lazy delta column compact for size tiered compaction in pk table to reduce cost (backport #61930) (#62244)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-23 09:36:42 +08:00
mergify[bot] cf71a82f85
[BugFix] check if it's flatjson before non-existent field optimization (backport #62227) (#62241)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-23 09:04:53 +08:00
mergify[bot] 34e3f8ca7b
[BugFix] Fix view based mv rewrite bug (backport #62198) (#62228)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-22 13:13:10 +00:00
mergify[bot] 484afe81e1
[Enhancement] Support trace optimizer logs even if throw exceptions (backport #62192) (#62205)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-22 16:35:05 +08:00
mergify[bot] 22bd12a69f
[Feature] Implement dynamic tablet job in FE for dynamic tablet splitting and merging (backport #61349) (#62213)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-08-22 08:13:38 +00:00
mergify[bot] 6af0301a52
[Enhancement] fix query profile when deploying more tasks (backport #62186) (#62218)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-22 07:32:43 +00:00
mergify[bot] 6d4fdae2f1
[BugFix] Fix trace times merge bug (backport #62126) (#62216)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-08-22 15:29:37 +08:00
mergify[bot] b44663ddfc
[BugFix] runtime filter partitions are bucket aware (backport #62191) (#62208)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
2025-08-22 14:03:02 +08:00
mergify[bot] 4fe0b95635
[Enhancement] Improve the iceberg table sink memory and small file problems by implementing a partition based global shuffle (backport #62123) (#62190)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
2025-08-22 10:24:41 +08:00
mergify[bot] 5be180a827
[Doc] add doc for bucket-aware execution (backport #62164) (#62194)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-22 09:23:54 +08:00
mergify[bot] c7be66352b
[Enhancement] update persistent index size statistic when do major compaction (backport #62195) (#62201)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-21 13:54:47 +00:00
mergify[bot] 677023a9f1
[BugFix] Fix partitioned hash join crash when enable query cache (backport #62146) (#62183)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-21 17:17:30 +08:00
mergify[bot] 72c4842aa6
[BugFix] Fixed phased scheduler always waiting for profile collection in sync profile collection (backport #62140) (#62177)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-21 16:43:48 +08:00
mergify[bot] 9db589ba27
[Enhancement] Refactor balance type (backport #62163) (#62185)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-21 08:43:32 +00:00
mergify[bot] 0c3cdc90ca
[Enhancement] Support configuring the bucket assign mode for bucket-aware execution (backport #62135) (#62168)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
2025-08-21 07:28:22 +00:00
mergify[bot] f5f20afaf2
[BugFix] Fix FE restart problem when enable query queue v2 (backport #62161) (#62167)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-21 14:59:27 +08:00
mergify[bot] e98f32834b
[UT] fix unstable case (backport #62157) (#62160)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
2025-08-21 11:30:00 +08:00
mergify[bot] 0e39d339cb
[UT] Fix the test case: test_files_sink (backport #62138) (#62154)
Signed-off-by: trueeyu <lxhhust350@qq.com>
Co-authored-by: trueeyu <lxhhust350@qq.com>
2025-08-21 09:07:18 +08:00
mergify[bot] 94bd28bbd8
[Doc] datacache_mem_size and datacache_disk_size are mutable now (backport #62111) (#62151)
Signed-off-by: Pei Yu <125331682@qq.com>
Co-authored-by: Pei Yu <125331682@qq.com>
2025-08-20 12:49:09 +00:00
mergify[bot] ec4c0ecd2b
[Doc] Update the default value of cbo_eq_base_type. (backport #62084) (#62086)
Signed-off-by: edwinhzhang <edwinhzhang@tencent.com>
Co-authored-by: zhanghe <edwinhzhang@tencent.com>
2025-08-20 08:44:52 -04:00
mergify[bot] d90d3bc5b6
[BugFix] Fix group by compressed key cause wrong result on decimal (backport #62022) (#62147)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-20 12:17:12 +00:00
mergify[bot] 8d11089dcb
[Enhancement] support group by compressed key (backport #61632) (#62145)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-20 10:37:08 +00:00
mergify[bot] ed1d4cc111
[BugFix] Fix throw exception issue in low-cardinality optimization error in ALLOW_THROW_EXCEPTION mode (backport #62098) (#62144)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-20 10:07:30 +00:00
mergify[bot] c42eaf88df
[Enhancement] Optimize accessing non-existent JSON field (backport #62003) (#62133)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-20 16:05:19 +08:00
zombee0 843806e61e
[BugFix] fix some minor bugs and add comment (backport #61902) (#62125)
Signed-off-by: zombee0 <ewang2027@gmail.com>
2025-08-20 15:16:01 +08:00
mergify[bot] 85b141ca97
[BugFix] fix json global dict with heterogeneous schema (backport #62001) (#62119)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-20 03:56:35 +00:00
mergify[bot] f93eadcee6
[BugFix] fix insert into select's audit log (backport #61381) (#62104)
Co-authored-by: before-Sunrise <71162020+before-Sunrise@users.noreply.github.com>
2025-08-20 11:26:37 +08:00
mergify[bot] 7fb868e211
[BugFix] Fix set_tablet_schema for partition_morsel_queue(split_morsel_queue) (backport #62034) (#62118)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-20 11:15:48 +08:00
mergify[bot] b68721abdc
[Enhancement] revise iceberg rewrite data (backport #61851) (#61913)
Signed-off-by: SevenJ <wenjun7j@gmail.com>
Co-authored-by: SevenJ <166966490+Wenjun7J@users.noreply.github.com>
2025-08-20 10:18:55 +08:00
mergify[bot] 4217260158
[Enhancement] Improve fe tablet schedules system table (backport #62073) (#62112)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-19 16:31:40 +00:00
mergify[bot] 945d51a80b
[BugFix] Fix missing clone copy size and duration (backport #62074) (#62108)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-19 13:42:17 +00:00
mergify[bot] 7b3f2789b9
[BugFix] fail to calculate nested cte's statistics outside memo in table pruning (backport #62070) (#62094)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-19 19:35:48 +08:00
mergify[bot] 5b41a92084
[BugFix] Fix error base version in schema change job with lake rollup (backport #62046) (#62089)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-08-19 11:11:50 +00:00
mergify[bot] 0df4fb0522
[BugFix] avoid get file size in report tablet stat thread (backport #61901) (#62028)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-19 18:45:36 +08:00
mergify[bot] ae28c45368
[Enhancement] support encode_sort_key function (backport #61781) (#61976)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-19 18:36:47 +08:00
mergify[bot] 90f1f3be58
[Enhancement] optimize GlobalDictCodeColumnIterator::decode_string_dict_codes (backport #62002) (#62015)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-19 18:36:08 +08:00
mergify[bot] cf4a3df21c
[Refactor] Change some vlog level (backport #61995) (#62083)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-08-19 10:12:34 +00:00
mergify[bot] 7e26ff974e
[Doc] Fix SQL Digest (backport #62075) (#62080)
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-19 08:46:16 +00:00
mergify[bot] 24d26c33ac
[Enhancement] Introduce a connector partition chunk writer to support spilling chunk data for iceberg table sink. (backport #61963) (#62062)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
Co-authored-by: Gavin <yangguansuo@starrocks.com>
2025-08-19 14:41:42 +08:00
mergify[bot] 3e09498f8f
[BugFix][CVE] CVE-2025-55163 fix, bump io.netty version (backport #62041) (#62057)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-08-19 04:51:47 +00:00
mergify[bot] 89bc4ff068
[Enhancement] integrate global dict with flatjson (backport #61681) (#62055)
Signed-off-by: Murphy <mofei@starrocks.com>
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-19 11:30:16 +08:00
mergify[bot] 04bb4e3f1b
[UT] move test_groupby_array_agg to another test file (backport #61971) (#62054)
Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Co-authored-by: eyes_on_me <nopainnofame@sina.com>
2025-08-19 01:54:57 +00:00
mergify[bot] e0fe6d4e72
[BugFix] arrow build respect avx2 settings (backport #62006) (#62045)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-08-18 14:24:32 +00:00
mergify[bot] 8413284035
[Enhancement] add new column cngroupname in `show nodes` sql (backport #62020) (#62040)
Signed-off-by: Kevin Cai <kevin.cai@celerdata.com>
Co-authored-by: Kevin Cai <caixiaohua@starrocks.com>
2025-08-18 09:57:56 +00:00
mergify[bot] 8dd56fd7ad
[BugFix] Fix create mv with case-when incompatible varchar type (backport #61996) (#62036)
Signed-off-by: shuming.li <ming.moriarty@gmail.com>
Co-authored-by: shuming.li <ming.moriarty@gmail.com>
2025-08-18 17:19:45 +08:00
mergify[bot] d989b56d51
[Enhancement] dump distro and arch info in crash log (backport #62017) (#62032)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-18 09:09:59 +00:00
mergify[bot] ea8c32a0d8
[Enhancement] VacuumFull Implementation (backport #61602) (#62016)
Signed-off-by: srlch <linzichao@starrocks.com>
Co-authored-by: srlch <111035020+srlch@users.noreply.github.com>
Co-authored-by: Connor Brennan <cbrennan@pinterest.com>
2025-08-18 03:51:37 +00:00
mergify[bot] dbb3e1d5f8
[BugFix] fix parquet array write when split null string (backport #61999) (#62012)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-18 03:37:13 +00:00
mergify[bot] 982f2ebd3e
[UT] disable lake compaction scheduler in unit test (backport #61968) (#61987)
Signed-off-by: starrocks-xupeng <xupeng@starrocks.com>
Co-authored-by: starrocks-xupeng <xupeng@starrocks.com>
2025-08-15 18:58:03 +08:00
mergify[bot] f5fac98bdb
[BugFix] Fix NullPointerException when column partition statistics are not found (backport #61935) (#61979)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-08-15 18:47:18 +08:00
mergify[bot] 960c351557
[Doc] Add Mapping for Packages (backport #61898) (#61983)
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-15 08:27:52 +00:00
mergify[bot] 17f92859be
[Doc] add new fe config for controlling array ndv colleciton to cbo section (backport #61921) (#61982)
Signed-off-by: stephen <stephen5217@163.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-15 08:25:53 +00:00
mergify[bot] a670068304
[Enhancement] Add logs for the reason why tablet cannot be repaired (backport #61959) (#61969)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-15 07:16:12 +00:00
mergify[bot] 492586e993
[Doc] add table name insensitive doc (backport #61923) (#61974)
Signed-off-by: stephen <stephen5217@163.com>
Signed-off-by: 絵空事スピリット <wanglichen@starrocks.com>
Co-authored-by: stephen <91597003+stephen-shelby@users.noreply.github.com>
Co-authored-by: 絵空事スピリット <wanglichen@starrocks.com>
2025-08-15 15:02:20 +08:00
Gavin 9df260eee1
[Refactor] Introduce a load chunk spiller and refactor the load spill memtable sink based on it. (backport #61867) (#61964)
Signed-off-by: GavinMar <yangguansuo@starrocks.com>
2025-08-15 14:15:31 +08:00
mergify[bot] f5a74aa16d
[Enhancement] enhance drop partition log information (backport #61787) (#61962)
Signed-off-by: crossoverJie <crossoverJie@gmail.com>
Co-authored-by: crossoverJie <crossoverJie@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-15 14:04:15 +08:00
mergify[bot] 3708c97461
[BugFix] Correct add query context to context conditions (backport #61929) (#61945)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-15 14:00:14 +08:00
mergify[bot] f571bb1ac0
[Enhancement] assign a large but configurable row count to unknown stats table (backport #61332) (#61953)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-15 04:33:37 +00:00
mergify[bot] 3635b317d8
[BugFix] Disable sync_publish for shadow tablet (backport #61887) (#61941)
Signed-off-by: sevev <qiangzh95@gmail.com>
Co-authored-by: zhangqiang <qiangzh95@gmail.com>
2025-08-15 09:51:39 +08:00
mergify[bot] f96b93e208
[BugFix] fix dict version of random distribution table (backport #61933) (#61948)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
2025-08-14 12:18:03 +00:00
mergify[bot] 7082f55ab0
[Enhancement] Support bucket-aware execution for Iceberg (backport #61756) (#61931)
Signed-off-by: zombee0 <ewang2027@gmail.com>
Co-authored-by: zombee0 <ewang2027@gmail.com>
2025-08-14 17:04:11 +08:00
mergify[bot] c7f97d8f46
[Enhancement] Remove compatible meta_dir code (backport #61924) (#61934)
Signed-off-by: gengjun-git <gengjun@starrocks.com>
Co-authored-by: gengjun-git <gengjun@starrocks.com>
2025-08-14 09:03:25 +00:00
mergify[bot] a65a4e2eb9
[BugFix] CBO Table Pruning misses other predicates (backport #61881) (#61910)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-14 15:51:48 +08:00
mergify[bot] 6b4f0cbef5
[Enhancement] Add label location balance statistic (backport #61905) (#61927)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-14 06:54:41 +00:00
mergify[bot] c6da99c2bb
[BugFix] avoid hold tablet shard lock to get compaction score (backport #61899) (#61919)
Signed-off-by: luohaha <18810541851@163.com>
Co-authored-by: Yixin Luo <18810541851@163.com>
2025-08-14 04:18:26 +00:00
mergify[bot] 6bebdbac4d
[BugFix] Fix NPE for JoinHashTable::mem_usage (backport #61872) (#61915)
Signed-off-by: zihe.liu <ziheliu1024@gmail.com>
Co-authored-by: zihe.liu <ziheliu1024@gmail.com>
2025-08-14 03:20:04 +00:00
mergify[bot] 1cf54d7670
[BugFix] Fix QueryContext cancel may cause use-after-free (backport #61897) (#61907)
Signed-off-by: stdpain <drfeng08@gmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
2025-08-14 02:52:23 +00:00
mergify[bot] 9837153661
[Enhancement] Separate path id from physical partition id (backport #61854) (#61894)
Signed-off-by: xiangguangyxg <xiangguangyxg@gmail.com>
Co-authored-by: xiangguangyxg <110401425+xiangguangyxg@users.noreply.github.com>
2025-08-13 12:21:16 +00:00
andyziye 109deb7a80
[Tool] Disable codeowner check (#61903)
Signed-off-by: andyziye <108652123+andyziye@users.noreply.github.com>
2025-08-13 19:10:17 +08:00
mergify[bot] c8e77680d7
[UT] Fix ShowDataDistributionStmtTest (backport #61880) (#61895)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-13 10:11:36 +00:00
mergify[bot] e371915c8c
[BugFix] Fix the problem with the number of rebuild file counted. (backport #61859) (#61890)
Signed-off-by: edwinhzhang <edwinhzhang@tencent.com>
Co-authored-by: zhanghe <edwinhzhang@tencent.com>
2025-08-13 09:50:43 +00:00
mergify[bot] 459a5fc3f0
[BugFix] Other predicates of Join contains non-push-down subfield should not be rewritten (backport #61868) (#61883)
Signed-off-by: satanson <ranpanf@gmail.com>
Co-authored-by: satanson <ranpanf@gmail.com>
2025-08-13 16:55:10 +08:00
mergify[bot] 288b12572d
[BugFix] Forbid statistics collection on generated expression columns (backport #61829) (#61870)
Signed-off-by: Seaven <seaven_7@qq.com>
Co-authored-by: Seaven <seaven_7@qq.com>
2025-08-13 16:01:33 +08:00
mergify[bot] 898d7a400e
[BugFix] fix min/max optimization on iceberg on partition columns (backport #61858) (#61878)
Signed-off-by: yan zhang <dirtysalt1987@gmail.com>
Co-authored-by: yan zhang <dirtysalt1987@gmail.com>
2025-08-13 07:23:53 +00:00
mergify[bot] e70b5139dd
[Enhancement] Implement function json_contains (backport #61403) (#61867)
Co-authored-by: Murphy <96611012+murphyatwork@users.noreply.github.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2025-08-13 07:10:39 +00:00
mergify[bot] 1c0ffd7f4c
[Enhancement] Add colocate group balance statistic (backport #61736) (#61876)
Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
2025-08-13 06:13:14 +00:00
1352 changed files with 57815 additions and 8556 deletions

135
.github/CODEOWNERS vendored
View File

@ -1,137 +1,2 @@
# committer will be the owner of all codes
* @StarRocks/starrocks-committer
# cpp miscellaneous
/be/src/common/ @StarRocks/cpp-misc-maintainer
/be/src/gen_cpp/ @StarRocks/cpp-misc-maintainer
/be/src/gutil/ @StarRocks/cpp-misc-maintainer
/be/src/simd/ @StarRocks/cpp-misc-maintainer
/be/src/testutil/ @StarRocks/cpp-misc-maintainer
/be/src/util/ @StarRocks/cpp-misc-maintainer
# execution engine
/be/src/column/ @StarRocks/execution-maintainer
/be/src/exec/ @StarRocks/execution-maintainer
/be/src/exprs/ @StarRocks/execution-maintainer
/be/src/runtime/ @StarRocks/execution-maintainer
/be/src/types/ @StarRocks/execution-maintainer
/be/src/udf/ @StarRocks/execution-maintainer
# open formats
/be/src/formats/ @StarRocks/open-format-maintainer
# storage engine
/be/src/fs/ @StarRocks/storage-maintainer
/be/src/io/ @StarRocks/storage-maintainer
/be/src/storage/ @StarRocks/storage-maintainer
# /docs/ belong to docs-maintainer
/docs/ @StarRocks/docs-maintainer
# /docker
/docker/ @StarRocks/docker-maintainer
# metadata
/fe/fe-core/src/main/java/com/starrocks/authentication/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/privilege/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/common/util/concurrent/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/mysql/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/healthchecker/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/clone/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/consistency/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/ha/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/journal/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/leader/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/meta/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/persist/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/alter/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/backup/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/catalog/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/metric/ @StarRocks/metadata-maintainer
/fe/fe-core/src/main/java/com/starrocks/system/ @StarRocks/metadata-maintainer
# connector
/fe/fe-core/src/main/java/com/starrocks/connector/ @StarRocks/connector-maintainer
/fe/fe-core/src/main/java/com/starrocks/credential/ @StarRocks/connector-maintainer
# parser
/fe/fe-core/src/main/java/com/starrocks/sql/ast/ @StarRocks/parser
/fe/fe-core/src/main/java/com/starrocks/sql/parser/ @StarRocks/parser
# analyzer
/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/ @StarRocks/analyzer
/fe/fe-core/src/main/java/com/starrocks/analysis/ @StarRocks/analyzer
# optimizer
/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/ @StarRocks/optimizer
/fe/fe-core/src/main/java/com/starrocks/statistic/ @StarRocks/optimizer
# scheduler
/fe/fe-core/src/main/java/com/starrocks/qe/scheduler/ @StarRocks/scheduler-maintainer
# sql/parser/StarRocksLex.g4 sql/parser/StarRocks.g4 belong to syntax-committer
/fe/fe-core/src/main/java/com/starrocks/sql/parser/StarRocksLex.g4 @StarRocks/syntax-committer
/fe/fe-core/src/main/java/com/starrocks/sql/parser/StarRocks.g4 @StarRocks/syntax-committer
/gensrc/script/functions.py @StarRocks/syntax-committer
# /thirdparty/ /docker/dockerfiles/dev-env/dev-env.Dockerfile belong to thirdparty-maintainer
/be/src/thirdparty/ @StarRocks/thirdparty-maintainer
/thirdparty/ @StarRocks/thirdparty-maintainer
/docker/dockerfiles/dev-env/dev-env.Dockerfile @StarRocks/thirdparty-maintainer
# cloud native
/be/src/storage/lake/ @StarRocks/cloud-native-maintainer
/be/src/runtime/lake_tablets_channel.h @StarRocks/cloud-native-maintainer
/be/src/runtime/lake_tablets_channel.cpp @StarRocks/cloud-native-maintainer
# error message
/fe/fe-core/src/main/java/com/starrocks/common/ErrorCode.java @StarRocks/msg-reviewer
# StorageEngine/ExecEnv/GlobalEnv
/be/src/runtime/exec_env.h @StarRocks/thread-committer
/be/src/runtime/exec_env.cpp @StarRocks/thread-committer
/be/src/storage/olap_server.cpp @StarRocks/thread-committer
/be/src/storage/storage_engine.h @StarRocks/thread-committer
/be/src/storage/storage_engine.cpp @StarRocks/thread-committer
/be/src/service/starrocks_main.cpp @StarRocks/thread-committer
/be/src/service/service_be/starrocks_be.cpp @StarRocks/thread-committer
# restful
/fe/fe-core/src/main/java/com/starrocks/http @StarRocks/restful-maintainer
/be/src/http @StarRocks/restful-maintainer
# load and unload
/fe/fe-core/src/main/java/com/starrocks/load/* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/plan/StreamLoad* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/plan/*Sink.java @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/sql/InsertPlanner.java @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/sql/LoadPlanner.java @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/backup/* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/alter/Optimize* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/alter/Compaction* @StarRocks/load-unload-maintainer
/fe/fe-core/src/main/java/com/starrocks/catalog/*Partition* @StarRocks/load-unload-maintainer
/be/src/storage/* @StarRocks/load-unload-maintainer
/be/src/exec/tablet_sink* @StarRocks/load-unload-maintainer
/be/src/exec/csv_scanner.cpp @StarRocks/load-unload-maintainer
/be/src/exec/json_scanner.cpp @StarRocks/load-unload-maintainer
/be/src/exec/pipeline/olap_table_sink_operator.cpp @StarRocks/load-unload-maintainer
/be/src/formats/avro/* @StarRocks/load-unload-maintainer
/be/src/formats/csv/* @StarRocks/load-unload-maintainer
/be/src/formats/json/* @StarRocks/load-unload-maintainer
/be/src/http/action/compaction_action.cpp @StarRocks/load-unload-maintainer
/be/src/http/action/*stream_load.cpp @StarRocks/load-unload-maintainer
/be/src/http/action/restore* @StarRocks/load-unload-maintainer
/be/src/runtime/batch_write/* @StarRocks/load-unload-maintainer
/be/src/runtime/routine_load/* @StarRocks/load-unload-maintainer
/be/src/runtime/stream_load/* @StarRocks/load-unload-maintainer
/be/src/runtime/load* @StarRocks/load-unload-maintainer
/be/src/runtime/tablets_channel.cpp @StarRocks/load-unload-maintainer
/be/src/runtime/local_tablets_channel* @StarRocks/load-unload-maintainer
/be/src/runtime/export_sink.cpp @StarRocks/load-unload-maintainer
# meta upgrade/downgrade compatibility
/fe/fe-core/src/main/java/com/starrocks/persist/gson/GsonUtils.java @StarRocks/meta-compatibility-maintainer

View File

@ -451,22 +451,35 @@ void AgentServer::Impl::submit_tasks(TAgentResult& agent_result, const std::vect
}
}
#define HANDLE_TASK(t_task_type, all_tasks, do_func, AGENT_REQ, request, env) \
for (auto* task : all_tasks) { \
auto pool = get_thread_pool(t_task_type); \
auto signature = task->signature; \
std::pair<bool, size_t> register_pair = register_task_info(task_type, signature); \
if (register_pair.first) { \
LOG(INFO) << "Submit task success. type=" << t_task_type << ", signature=" << signature \
<< ", task_count_in_queue=" << register_pair.second; \
ret_st = pool->submit_func( \
std::bind(do_func, std::make_shared<AGENT_REQ>(*task, task->request, time(nullptr)), env)); \
if (!ret_st.ok()) { \
LOG(WARNING) << "fail to submit task. reason: " << ret_st.message() << ", task: " << task; \
} \
} else { \
LOG(INFO) << "Submit task failed, already exists type=" << t_task_type << ", signature=" << signature; \
} \
#define HANDLE_TASK(t_task_type, all_tasks, do_func, AGENT_REQ, request, env) \
{ \
std::string submit_log = "Submit task success. type=" + to_string(t_task_type) + ", signatures="; \
size_t log_count = 0; \
size_t queue_len = 0; \
for (auto* task : all_tasks) { \
auto pool = get_thread_pool(t_task_type); \
auto signature = task->signature; \
std::pair<bool, size_t> register_pair = register_task_info(task_type, signature); \
if (register_pair.first) { \
if (log_count++ < 100) { \
submit_log += std::to_string(signature) + ","; \
} \
queue_len = register_pair.second; \
ret_st = pool->submit_func( \
std::bind(do_func, std::make_shared<AGENT_REQ>(*task, task->request, time(nullptr)), env)); \
if (!ret_st.ok()) { \
LOG(WARNING) << "fail to submit task. reason: " << ret_st.message() << ", task: " << task; \
} \
} else { \
LOG(INFO) << "Submit task failed, already exists type=" << t_task_type << ", signature=" << signature; \
} \
} \
if (queue_len > 0) { \
if (log_count >= 100) { \
submit_log += "...,"; \
} \
LOG(INFO) << submit_log << " task_count_in_queue=" << queue_len; \
} \
}
// batch submit tasks

View File

@ -119,7 +119,7 @@ static void alter_tablet(const TAlterTabletReqV2& agent_task_req, int64_t signat
if (status == STARROCKS_SUCCESS) {
swap(finish_tablet_infos, finish_task_request->finish_tablet_infos);
finish_task_request->__isset.finish_tablet_infos = true;
LOG(INFO) << alter_msg_head << "alter success. signature: " << signature;
VLOG(2) << alter_msg_head << "alter success. signature: " << signature;
error_msgs.emplace_back("alter success");
task_status.__set_status_code(TStatusCode::OK);
} else if (status == STARROCKS_TASK_REQUEST_ERROR) {
@ -156,13 +156,11 @@ static void unify_finish_agent_task(TStatusCode::type status_code, const std::ve
finish_task(finish_task_request);
size_t task_queue_size = remove_task_info(task_type, signature);
LOG(INFO) << "Remove task success. type=" << task_type << ", signature=" << signature
<< ", task_count_in_queue=" << task_queue_size;
VLOG(1) << "Remove task success. type=" << task_type << ", signature=" << signature
<< ", task_count_in_queue=" << task_queue_size;
}
void run_drop_tablet_task(const std::shared_ptr<DropTabletAgentTaskRequest>& agent_task_req, ExecEnv* exec_env) {
StarRocksMetrics::instance()->clone_requests_total.increment(1);
const TDropTabletReq& drop_tablet_req = agent_task_req->task_req;
bool force_drop = drop_tablet_req.__isset.force && drop_tablet_req.force;
@ -348,6 +346,7 @@ void run_clear_transaction_task(const std::shared_ptr<ClearTransactionAgentTaskR
}
void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req, ExecEnv* exec_env) {
StarRocksMetrics::instance()->clone_requests_total.increment(1);
const TCloneReq& clone_req = agent_task_req->task_req;
AgentStatus status = STARROCKS_SUCCESS;
@ -366,6 +365,7 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
if (clone_req.__isset.is_local && clone_req.is_local) {
DataDir* dest_store = StorageEngine::instance()->get_store(clone_req.dest_path_hash);
if (dest_store == nullptr) {
StarRocksMetrics::instance()->clone_requests_failed.increment(1);
LOG(WARNING) << "fail to get dest store. path_hash:" << clone_req.dest_path_hash;
status_code = TStatusCode::RUNTIME_ERROR;
} else {
@ -374,6 +374,7 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
need_rebuild_pk_index);
Status res = StorageEngine::instance()->execute_task(&engine_task);
if (!res.ok()) {
StarRocksMetrics::instance()->clone_requests_failed.increment(1);
status_code = TStatusCode::RUNTIME_ERROR;
LOG(WARNING) << "local tablet migration failed. status: " << res
<< ", signature: " << agent_task_req->signature;
@ -392,6 +393,14 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
tablet_infos.push_back(tablet_info);
}
finish_task_request.__set_finish_tablet_infos(tablet_infos);
int64_t copy_size = engine_task.get_copy_size();
finish_task_request.__set_copy_size(copy_size);
StarRocksMetrics::instance()->clone_task_intra_node_copy_bytes.increment(copy_size);
int64_t copy_time_ms = engine_task.get_copy_time_ms();
finish_task_request.__set_copy_time_ms(copy_time_ms);
StarRocksMetrics::instance()->clone_task_intra_node_copy_duration_ms.increment(copy_time_ms);
}
}
} else {
@ -399,6 +408,7 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
&error_msgs, &tablet_infos, &status);
Status res = StorageEngine::instance()->execute_task(&engine_task);
if (!res.ok()) {
StarRocksMetrics::instance()->clone_requests_failed.increment(1);
status_code = TStatusCode::RUNTIME_ERROR;
LOG(WARNING) << "clone failed. status:" << res << ", signature:" << agent_task_req->signature;
error_msgs.emplace_back("clone failed.");
@ -412,6 +422,14 @@ void run_clone_task(const std::shared_ptr<CloneAgentTaskRequest>& agent_task_req
LOG(INFO) << "clone success, set tablet infos. status:" << status
<< ", signature:" << agent_task_req->signature;
finish_task_request.__set_finish_tablet_infos(tablet_infos);
int64_t copy_size = engine_task.get_copy_size();
finish_task_request.__set_copy_size(copy_size);
StarRocksMetrics::instance()->clone_task_inter_node_copy_bytes.increment(copy_size);
int64_t copy_time_ms = engine_task.get_copy_time_ms();
finish_task_request.__set_copy_time_ms(copy_time_ms);
StarRocksMetrics::instance()->clone_task_inter_node_copy_duration_ms.increment(copy_time_ms);
}
}
}
@ -708,8 +726,7 @@ void run_upload_task(const std::shared_ptr<UploadAgentTaskRequest>& agent_task_r
finish_task(finish_task_request);
remove_task_info(agent_task_req->task_type, agent_task_req->signature);
LOG(INFO) << "Finished uploaded task signature=" << agent_task_req->signature
<< " job id=" << upload_request.job_id;
VLOG(1) << "Finished uploaded task signature=" << agent_task_req->signature << " job id=" << upload_request.job_id;
}
void run_download_task(const std::shared_ptr<DownloadAgentTaskRequest>& agent_task_req, ExecEnv* exec_env) {
@ -744,8 +761,8 @@ void run_download_task(const std::shared_ptr<DownloadAgentTaskRequest>& agent_ta
finish_task(finish_task_request);
remove_task_info(agent_task_req->task_type, agent_task_req->signature);
LOG(INFO) << "Finished downloaded task signature=" << agent_task_req->signature
<< " job id=" << download_request.job_id;
VLOG(1) << "Finished downloaded task signature=" << agent_task_req->signature
<< " job id=" << download_request.job_id;
}
void run_make_snapshot_task(const std::shared_ptr<SnapshotAgentTaskRequest>& agent_task_req, ExecEnv* exec_env) {
@ -766,9 +783,9 @@ void run_make_snapshot_task(const std::shared_ptr<SnapshotAgentTaskRequest>& age
<< " status=" << st.to_string();
error_msgs.push_back("make_snapshot failed. status: " + st.to_string());
} else {
LOG(INFO) << "Created snapshot tablet_id=" << snapshot_request.tablet_id
<< " schema_hash=" << snapshot_request.schema_hash << " version=" << snapshot_request.version
<< " snapshot_path=" << snapshot_path;
VLOG(1) << "Created snapshot tablet_id=" << snapshot_request.tablet_id
<< " schema_hash=" << snapshot_request.schema_hash << " version=" << snapshot_request.version
<< " snapshot_path=" << snapshot_path;
if (snapshot_request.__isset.list_files) {
// list and save all snapshot files
// snapshot_path like: data/snapshot/20180417205230.1.86400
@ -818,7 +835,7 @@ void run_release_snapshot_task(const std::shared_ptr<ReleaseSnapshotAgentTaskReq
error_msgs.push_back("release_snapshot failed. status: " +
boost::lexical_cast<std::string>(release_snapshot_status));
} else {
LOG(INFO) << "Released snapshot path=" << snapshot_path << " status=" << release_snapshot_status;
VLOG(1) << "Released snapshot path=" << snapshot_path << " status=" << release_snapshot_status;
}
unify_finish_agent_task(status_code, error_msgs, agent_task_req->task_type, agent_task_req->signature);
@ -1045,8 +1062,8 @@ void run_remote_snapshot_task(const std::shared_ptr<RemoteSnapshotAgentTaskReque
finish_task(finish_task_request);
#endif
auto task_queue_size = remove_task_info(agent_task_req->task_type, agent_task_req->signature);
LOG(INFO) << "Remove task success. type=" << agent_task_req->task_type
<< ", signature=" << agent_task_req->signature << ", task_count_in_queue=" << task_queue_size;
VLOG(1) << "Remove task success. type=" << agent_task_req->task_type << ", signature=" << agent_task_req->signature
<< ", task_count_in_queue=" << task_queue_size;
}
void run_replicate_snapshot_task(const std::shared_ptr<ReplicateSnapshotAgentTaskRequest>& agent_task_req,
@ -1090,8 +1107,8 @@ void run_replicate_snapshot_task(const std::shared_ptr<ReplicateSnapshotAgentTas
finish_task(finish_task_request);
#endif
auto task_queue_size = remove_task_info(agent_task_req->task_type, agent_task_req->signature);
LOG(INFO) << "Remove task success. type=" << agent_task_req->task_type
<< ", signature=" << agent_task_req->signature << ", task_count_in_queue=" << task_queue_size;
VLOG(1) << "Remove task success. type=" << agent_task_req->task_type << ", signature=" << agent_task_req->signature
<< ", task_count_in_queue=" << task_queue_size;
}
} // namespace starrocks

View File

@ -27,6 +27,8 @@
#include "storage/tablet.h"
#include "storage/tablet_manager.h"
#include "storage/txn_manager.h"
#include "util/countdown_latch.h"
#include "util/defer_op.h"
#include "util/starrocks_metrics.h"
#include "util/threadpool.h"
#include "util/time.h"
@ -49,6 +51,7 @@ struct TabletPublishVersionTask {
// or 0 which means tablet not found or publish task cannot be submitted
int64_t max_continuous_version{0};
bool is_double_write{false};
bool is_shadow{false};
};
void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionRequest& publish_version_req,
@ -91,7 +94,7 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
}
}
} else {
std::vector<std::map<TabletInfo, RowsetSharedPtr>> partitions(num_partition);
std::vector<std::map<TabletInfo, std::pair<RowsetSharedPtr, bool>>> partitions(num_partition);
for (size_t i = 0; i < publish_version_req.partition_version_infos.size(); i++) {
StorageEngine::instance()->txn_manager()->get_txn_related_tablets(
transaction_id, publish_version_req.partition_version_infos[i].partition_id, &partitions[i]);
@ -108,7 +111,8 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
task.partition_id = publish_version_req.partition_version_infos[i].partition_id;
task.tablet_id = itr.first.tablet_id;
task.version = publish_version_req.partition_version_infos[i].version;
task.rowset = std::move(itr.second);
task.rowset = std::move(itr.second.first);
task.is_shadow = itr.second.second;
// rowset can be nullptr if it just prepared but not committed
if (task.rowset != nullptr) {
task.rowset->rowset_meta()->set_gtid(publish_version_req.gtid);
@ -122,87 +126,101 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
span->SetAttribute("num_tablet", num_active_tablet);
std::mutex affected_dirs_lock;
CountDownLatch latch(static_cast<int>(tablet_tasks.size()));
for (auto& tablet_task : tablet_tasks) {
uint32_t retry_time = 0;
Status st;
while (retry_time++ < PUBLISH_VERSION_SUBMIT_MAX_RETRY) {
st = token->submit_func([&]() {
auto& task = tablet_task;
auto tablet_span = Tracer::Instance().add_span("tablet_publish_txn", span);
auto scoped_tablet_span = trace::Scope(tablet_span);
tablet_span->SetAttribute("txn_id", transaction_id);
tablet_span->SetAttribute("tablet_id", task.tablet_id);
tablet_span->SetAttribute("version", task.version);
if (!is_replication_txn && !task.rowset) {
task.st = Status::NotFound(
fmt::format("rowset not found of tablet: {}, txn_id: {}", task.tablet_id, task.txn_id));
LOG(WARNING) << task.st;
return;
}
TabletSharedPtr tablet = StorageEngine::instance()->tablet_manager()->get_tablet(task.tablet_id);
if (!tablet) {
// tablet may get dropped, it's ok to ignore this situation
LOG(WARNING) << fmt::format(
"publish_version tablet not found tablet_id: {}, version: {} txn_id: {}", task.tablet_id,
task.version, task.txn_id);
return;
}
{
std::lock_guard lg(affected_dirs_lock);
affected_dirs.insert(tablet->data_dir());
}
if (is_replication_txn) {
task.st = StorageEngine::instance()->replication_txn_manager()->publish_txn(
task.txn_id, task.partition_id, tablet, task.version);
if (!task.st.ok()) {
LOG(WARNING) << "Publish txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id;
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
VLOG(2) << "Publish txn success tablet:" << tablet->tablet_id() << " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id;
}
} else if (is_version_overwrite) {
task.st = StorageEngine::instance()->txn_manager()->publish_overwrite_txn(
task.partition_id, tablet, task.txn_id, task.version, task.rowset, wait_time);
if (!task.st.ok()) {
LOG(WARNING) << "Publish overwrite txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id << " rowset:" << task.rowset->rowset_id();
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
LOG(INFO) << "Publish overwrite txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id
<< " rowset:" << task.rowset->rowset_id();
}
} else {
task.st = StorageEngine::instance()->txn_manager()->publish_txn(
task.partition_id, tablet, task.txn_id, task.version, task.rowset, wait_time,
task.is_double_write);
if (!task.st.ok()) {
LOG(WARNING) << "Publish txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id << " rowset:" << task.rowset->rowset_id();
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
if (task.is_double_write || VLOG_ROW_IS_ON) {
LOG(INFO) << "Publish txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " is_double_write:" << task.is_double_write
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id
<< " rowset:" << task.rowset->rowset_id();
auto task = std::make_shared<CancellableRunnable>(
[&]() {
DeferOp defer([&] { latch.count_down(); });
auto& task = tablet_task;
auto tablet_span = Tracer::Instance().add_span("tablet_publish_txn", span);
auto scoped_tablet_span = trace::Scope(tablet_span);
tablet_span->SetAttribute("txn_id", transaction_id);
tablet_span->SetAttribute("tablet_id", task.tablet_id);
tablet_span->SetAttribute("version", task.version);
if (!is_replication_txn && !task.rowset) {
task.st = Status::NotFound(fmt::format("rowset not found of tablet: {}, txn_id: {}",
task.tablet_id, task.txn_id));
LOG(WARNING) << task.st;
return;
}
}
}
});
TabletSharedPtr tablet =
StorageEngine::instance()->tablet_manager()->get_tablet(task.tablet_id);
if (!tablet) {
// tablet may get dropped, it's ok to ignore this situation
LOG(WARNING) << fmt::format(
"publish_version tablet not found tablet_id: {}, version: {} txn_id: {}",
task.tablet_id, task.version, task.txn_id);
return;
}
{
std::lock_guard lg(affected_dirs_lock);
affected_dirs.insert(tablet->data_dir());
}
if (is_replication_txn) {
task.st = StorageEngine::instance()->replication_txn_manager()->publish_txn(
task.txn_id, task.partition_id, tablet, task.version);
if (!task.st.ok()) {
LOG(WARNING) << "Publish txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id;
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
VLOG(2) << "Publish txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id;
}
} else if (is_version_overwrite) {
task.st = StorageEngine::instance()->txn_manager()->publish_overwrite_txn(
task.partition_id, tablet, task.txn_id, task.version, task.rowset, wait_time);
if (!task.st.ok()) {
LOG(WARNING) << "Publish overwrite txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id << " rowset:" << task.rowset->rowset_id();
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
LOG(INFO) << "Publish overwrite txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id
<< " rowset:" << task.rowset->rowset_id();
}
} else {
task.st = StorageEngine::instance()->txn_manager()->publish_txn(
task.partition_id, tablet, task.txn_id, task.version, task.rowset, wait_time,
task.is_double_write);
if (!task.st.ok()) {
LOG(WARNING) << "Publish txn failed tablet:" << tablet->tablet_id()
<< " version:" << task.version << " partition:" << task.partition_id
<< " txn_id: " << task.txn_id << " rowset:" << task.rowset->rowset_id();
std::string_view msg = task.st.message();
tablet_span->SetStatus(trace::StatusCode::kError, {msg.data(), msg.size()});
} else {
if (task.is_double_write || VLOG_ROW_IS_ON) {
LOG(INFO) << "Publish txn success tablet:" << tablet->tablet_id()
<< " version:" << task.version
<< " tablet_max_version:" << tablet->max_continuous_version()
<< " is_double_write:" << task.is_double_write
<< " partition:" << task.partition_id << " txn_id: " << task.txn_id
<< " rowset:" << task.rowset->rowset_id();
}
}
}
},
[&]() {
tablet_task.st = Status::Cancelled(
fmt::format("publish version task has been cancelled, tablet_id={}, version={}",
tablet_task.tablet_id, tablet_task.version));
VLOG(1) << tablet_task.st;
latch.count_down();
});
st = token->submit(std::move(task));
if (st.is_service_unavailable()) {
int64_t retry_sleep_ms = 50 * retry_time;
LOG(WARNING) << "publish version threadpool is busy, retry in " << retry_sleep_ms
@ -217,10 +235,11 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
}
if (!st.ok()) {
tablet_task.st = std::move(st);
latch.count_down();
}
}
span->AddEvent("all_task_submitted");
token->wait();
latch.wait();
span->AddEvent("all_task_finished");
Status st;
@ -235,10 +254,13 @@ void run_publish_version_task(ThreadPoolToken* token, const TPublishVersionReque
if (st.ok()) {
st = task.st;
}
} else {
} else if (!task.is_shadow) {
auto& pair = tablet_publish_versions.emplace_back();
pair.__set_tablet_id(task.tablet_id);
pair.__set_version(task.version);
} else {
VLOG(1) << "publish_version success tablet:" << task.tablet_id << " version:" << task.version
<< " is_shadow:" << task.is_shadow;
}
}
// return tablet and its version which has already finished.

View File

@ -401,8 +401,8 @@ void* DeleteTaskWorkerPool::_worker_thread_callback(void* arg_this) {
int num_of_remove_task = 0;
if (push_req.push_type == TPushType::CANCEL_DELETE) {
LOG(INFO) << "get delete push task. remove delete task txn_id: " << push_req.transaction_id
<< " priority: " << priority << " push_type: " << push_req.push_type;
VLOG(3) << "get delete push task. remove delete task txn_id: " << push_req.transaction_id
<< " priority: " << priority << " push_type: " << push_req.push_type;
std::lock_guard l(worker_pool_this->_worker_thread_lock);
auto& tasks = worker_pool_this->_tasks;
@ -435,8 +435,8 @@ void* DeleteTaskWorkerPool::_worker_thread_callback(void* arg_this) {
}
auto& push_req = agent_task_req->task_req;
LOG(INFO) << "get delete push task. signature: " << agent_task_req->signature << " priority: " << priority
<< " push_type: " << push_req.push_type;
VLOG(3) << "get delete push task. signature: " << agent_task_req->signature << " priority: " << priority
<< " push_type: " << push_req.push_type;
std::vector<TTabletInfo> tablet_infos;
EngineBatchLoadTask engine_task(push_req, &tablet_infos, agent_task_req->signature, &status,
GlobalEnv::GetInstance()->load_mem_tracker());
@ -848,7 +848,8 @@ void* ReportDataCacheMetricsTaskWorkerPool::_worker_thread_callback(void* arg_th
request.__set_report_version(g_report_version.load(std::memory_order_relaxed));
TDataCacheMetrics t_metrics{};
const LocalCacheEngine* cache = DataCache::GetInstance()->local_cache();
// TODO: mem_metrics + disk_metrics
const LocalCacheEngine* cache = DataCache::GetInstance()->local_disk_cache();
if (cache != nullptr && cache->is_initialized()) {
const auto metrics = cache->cache_metrics();
DataCacheUtils::set_metrics_from_thrift(t_metrics, metrics);

View File

@ -129,8 +129,6 @@ void ObjectCacheBench::init_cache(CacheType cache_type) {
_page_cache = std::make_shared<StoragePageCache>();
_page_cache->init(_lru_cache.get());
} else {
opt.engine = "starcache";
_star_cache = std::make_shared<StarCacheEngine>();
Status st = _star_cache->init(opt);
if (!st.ok()) {

View File

@ -36,7 +36,7 @@ BlockCache::~BlockCache() {
(void)shutdown();
}
Status BlockCache::init(const CacheOptions& options, std::shared_ptr<LocalCacheEngine> local_cache,
Status BlockCache::init(const BlockCacheOptions& options, std::shared_ptr<LocalCacheEngine> local_cache,
std::shared_ptr<RemoteCacheEngine> remote_cache) {
_block_size = std::min(options.block_size, MAX_BLOCK_SIZE);
_local_cache = std::move(local_cache);

View File

@ -33,7 +33,7 @@ public:
~BlockCache();
// Init the block cache instance
Status init(const CacheOptions& options, std::shared_ptr<LocalCacheEngine> local_cache,
Status init(const BlockCacheOptions& options, std::shared_ptr<LocalCacheEngine> local_cache,
std::shared_ptr<RemoteCacheEngine> remote_cache);
// Write data buffer to cache, the `offset` must be aligned by block size

View File

@ -42,7 +42,15 @@ struct DirSpace {
size_t size;
};
struct CacheOptions {
struct RemoteCacheOptions {
double skip_read_factor = 0;
};
struct MemCacheOptions {
size_t mem_space_size = 0;
};
struct DiskCacheOptions {
// basic
size_t mem_space_size = 0;
std::vector<DirSpace> dir_spaces;
@ -54,7 +62,6 @@ struct CacheOptions {
bool enable_direct_io = false;
bool enable_tiered_cache = true;
bool enable_datacache_persistence = false;
std::string engine;
size_t max_concurrent_inserts = 0;
size_t max_flying_memory_mb = 0;
double scheduler_threads_per_cpu = 0;
@ -63,6 +70,10 @@ struct CacheOptions {
std::string eviction_policy;
};
struct BlockCacheOptions {
size_t block_size = 0;
};
struct WriteCacheOptions {
int8_t priority = 0;
// If ttl_seconds=0 (default), no ttl restriction will be set. If an old one exists, remove it.

View File

@ -44,14 +44,9 @@ Status DataCache::init(const std::vector<StorePath>& store_paths) {
_page_cache = std::make_shared<StoragePageCache>();
#if defined(WITH_STARCACHE)
if (config::datacache_engine == "" || config::datacache_engine == "starcache") {
config::datacache_engine = "starcache";
} else {
config::datacache_engine = "lrucache";
}
#else
config::datacache_engine = "lrucache";
_local_disk_cache_engine = "starcache";
#endif
_local_mem_cache_engine = "lrucache";
if (!config::datacache_enable) {
config::disable_storage_page_cache = true;
@ -59,22 +54,22 @@ Status DataCache::init(const std::vector<StorePath>& store_paths) {
return Status::OK();
}
ASSIGN_OR_RETURN(auto cache_options, _init_cache_options());
ASSIGN_OR_RETURN(auto mem_cache_options, _init_mem_cache_options());
if (config::datacache_engine == "starcache") {
#if defined(WITH_STARCACHE)
RETURN_IF_ERROR(_init_starcache_engine(&cache_options));
RETURN_IF_ERROR(_init_peer_cache(cache_options));
ASSIGN_OR_RETURN(auto disk_cache_options, _init_disk_cache_options());
RETURN_IF_ERROR(_init_starcache_engine(&disk_cache_options));
if (config::block_cache_enable) {
RETURN_IF_ERROR(_block_cache->init(cache_options, _local_cache, _remote_cache));
}
#else
return Status::InternalError("starcache engine is not supported");
#endif
} else {
RETURN_IF_ERROR(_init_lrucache_engine(cache_options));
auto remote_cache_options = _init_remote_cache_options();
RETURN_IF_ERROR(_init_peer_cache(remote_cache_options));
if (config::block_cache_enable) {
auto block_cache_options = _init_block_cache_options();
RETURN_IF_ERROR(_block_cache->init(block_cache_options, _local_disk_cache, _remote_cache));
}
#endif
RETURN_IF_ERROR(_init_lrucache_engine(mem_cache_options));
RETURN_IF_ERROR(_init_page_cache());
@ -100,14 +95,15 @@ void DataCache::destroy() {
LOG(INFO) << "pagecache shutdown successfully";
_block_cache.reset();
_local_cache.reset();
_local_mem_cache.reset();
_local_disk_cache.reset();
_remote_cache.reset();
LOG(INFO) << "datacache shutdown successfully";
}
bool DataCache::adjust_mem_capacity(int64_t delta, size_t min_capacity) {
if (_local_cache != nullptr) {
Status st = _local_cache->adjust_mem_quota(delta, min_capacity);
if (_local_mem_cache != nullptr) {
Status st = _local_mem_cache->adjust_mem_quota(delta, min_capacity);
if (st.ok()) {
return true;
} else {
@ -119,52 +115,67 @@ bool DataCache::adjust_mem_capacity(int64_t delta, size_t min_capacity) {
}
size_t DataCache::get_mem_capacity() const {
if (_local_cache != nullptr) {
return _local_cache->mem_quota();
if (_local_mem_cache != nullptr) {
return _local_mem_cache->mem_quota();
} else {
return 0;
}
}
Status DataCache::_init_lrucache_engine(const CacheOptions& cache_options) {
_local_cache = std::make_shared<LRUCacheEngine>();
RETURN_IF_ERROR(_local_cache->init(cache_options));
Status DataCache::_init_lrucache_engine(const MemCacheOptions& cache_options) {
_local_mem_cache = std::make_shared<LRUCacheEngine>();
RETURN_IF_ERROR(reinterpret_cast<LRUCacheEngine*>(_local_mem_cache.get())->init(cache_options));
LOG(INFO) << "lrucache engine init successfully";
return Status::OK();
}
Status DataCache::_init_page_cache() {
_page_cache->init(_local_cache.get());
_page_cache->init(_local_mem_cache.get());
_page_cache->init_metrics();
LOG(INFO) << "storage page cache init successfully";
return Status::OK();
}
#if defined(WITH_STARCACHE)
Status DataCache::_init_starcache_engine(CacheOptions* cache_options) {
Status DataCache::_init_starcache_engine(DiskCacheOptions* cache_options) {
// init starcache & disk monitor
// TODO: DiskSpaceMonitor needs to be decoupled from StarCacheEngine.
_local_cache = std::make_shared<StarCacheEngine>();
_disk_space_monitor = std::make_shared<DiskSpaceMonitor>(_local_cache.get());
_local_disk_cache = std::make_shared<StarCacheEngine>();
_disk_space_monitor = std::make_shared<DiskSpaceMonitor>(_local_disk_cache.get());
RETURN_IF_ERROR(_disk_space_monitor->init(&cache_options->dir_spaces));
RETURN_IF_ERROR(_local_cache->init(*cache_options));
RETURN_IF_ERROR(reinterpret_cast<StarCacheEngine*>(_local_disk_cache.get())->init(*cache_options));
_disk_space_monitor->start();
return Status::OK();
}
Status DataCache::_init_peer_cache(const CacheOptions& cache_options) {
Status DataCache::_init_peer_cache(const RemoteCacheOptions& cache_options) {
_remote_cache = std::make_shared<PeerCacheEngine>();
return _remote_cache->init(cache_options);
}
#endif
StatusOr<CacheOptions> DataCache::_init_cache_options() {
CacheOptions cache_options;
RemoteCacheOptions DataCache::_init_remote_cache_options() {
RemoteCacheOptions cache_options{.skip_read_factor = config::datacache_skip_read_factor};
return cache_options;
}
StatusOr<MemCacheOptions> DataCache::_init_mem_cache_options() {
MemCacheOptions cache_options;
RETURN_IF_ERROR(DataCacheUtils::parse_conf_datacache_mem_size(
config::datacache_mem_size, _global_env->process_mem_limit(), &cache_options.mem_space_size));
cache_options.engine = config::datacache_engine;
return cache_options;
}
if (config::datacache_engine == "starcache") {
BlockCacheOptions DataCache::_init_block_cache_options() {
BlockCacheOptions cache_options;
cache_options.block_size = config::datacache_block_size;
return cache_options;
}
StatusOr<DiskCacheOptions> DataCache::_init_disk_cache_options() {
DiskCacheOptions cache_options;
if (_local_disk_cache_engine == "starcache") {
#ifdef USE_STAROS
std::vector<string> corresponding_starlet_dirs;
if (config::datacache_unified_instance_enable && !config::starlet_cache_dir.empty()) {
@ -276,8 +287,8 @@ void DataCache::try_release_resource_before_core_dump() {
return release_all || modules.contains(name);
};
if (_local_cache != nullptr && need_release("data_cache")) {
(void)_local_cache->update_mem_quota(0, false);
if (_local_mem_cache != nullptr && need_release("data_cache")) {
(void)_local_mem_cache->update_mem_quota(0, false);
}
}

View File

@ -23,7 +23,7 @@ namespace starrocks {
class Status;
class StorePath;
class RemoteCacheEngine;
class CacheOptions;
class DiskCacheOptions;
class GlobalEnv;
class DiskSpaceMonitor;
class MemSpaceMonitor;
@ -39,10 +39,16 @@ public:
void try_release_resource_before_core_dump();
void set_local_cache(std::shared_ptr<LocalCacheEngine> local_cache) { _local_cache = std::move(local_cache); }
void set_local_mem_cache(std::shared_ptr<LocalCacheEngine> local_mem_cache) {
_local_mem_cache = std::move(local_mem_cache);
}
void set_local_disk_cache(std::shared_ptr<LocalCacheEngine> local_disk_cache) {
_local_disk_cache = std::move(local_disk_cache);
}
void set_page_cache(std::shared_ptr<StoragePageCache> page_cache) { _page_cache = std::move(page_cache); }
LocalCacheEngine* local_cache() { return _local_cache.get(); }
LocalCacheEngine* local_mem_cache() { return _local_mem_cache.get(); }
LocalCacheEngine* local_disk_cache() { return _local_disk_cache.get(); }
BlockCache* block_cache() const { return _block_cache.get(); }
void set_block_cache(std::shared_ptr<BlockCache> block_cache) { _block_cache = std::move(block_cache); }
StoragePageCache* page_cache() const { return _page_cache.get(); }
@ -56,19 +62,26 @@ public:
size_t get_mem_capacity() const;
private:
StatusOr<CacheOptions> _init_cache_options();
StatusOr<MemCacheOptions> _init_mem_cache_options();
StatusOr<DiskCacheOptions> _init_disk_cache_options();
RemoteCacheOptions _init_remote_cache_options();
BlockCacheOptions _init_block_cache_options();
#if defined(WITH_STARCACHE)
Status _init_starcache_engine(CacheOptions* cache_options);
Status _init_peer_cache(const CacheOptions& cache_options);
Status _init_starcache_engine(DiskCacheOptions* cache_options);
Status _init_peer_cache(const RemoteCacheOptions& cache_options);
#endif
Status _init_lrucache_engine(const CacheOptions& cache_options);
Status _init_lrucache_engine(const MemCacheOptions& cache_options);
Status _init_page_cache();
GlobalEnv* _global_env;
std::vector<StorePath> _store_paths;
// cache engine
std::shared_ptr<LocalCacheEngine> _local_cache;
std::string _local_mem_cache_engine;
std::string _local_disk_cache_engine;
std::shared_ptr<LocalCacheEngine> _local_mem_cache;
std::shared_ptr<LocalCacheEngine> _local_disk_cache;
std::shared_ptr<RemoteCacheEngine> _remote_cache;
std::shared_ptr<BlockCache> _block_cache;

View File

@ -27,7 +27,6 @@ class LocalCacheEngine {
public:
virtual ~LocalCacheEngine() = default;
virtual Status init(const CacheOptions& options) = 0;
virtual bool is_initialized() const = 0;
// Write data to cache

View File

@ -17,7 +17,7 @@
#include <butil/fast_rand.h>
namespace starrocks {
Status LRUCacheEngine::init(const CacheOptions& options) {
Status LRUCacheEngine::init(const MemCacheOptions& options) {
_cache = std::make_unique<ShardedLRUCache>(options.mem_space_size);
_initialized.store(true, std::memory_order_relaxed);
return Status::OK();

View File

@ -25,7 +25,7 @@ public:
LRUCacheEngine() = default;
virtual ~LRUCacheEngine() override = default;
Status init(const CacheOptions& options) override;
Status init(const MemCacheOptions& options);
bool is_initialized() const override { return _initialized.load(std::memory_order_relaxed); }
Status write(const std::string& key, const IOBuffer& buffer, WriteCacheOptions* options) override;

View File

@ -23,7 +23,7 @@
namespace starrocks {
Status PeerCacheEngine::init(const CacheOptions& options) {
Status PeerCacheEngine::init(const RemoteCacheOptions& options) {
_cache_adaptor.reset(starcache::create_default_adaptor(options.skip_read_factor));
return Status::OK();
}

View File

@ -24,7 +24,7 @@ public:
PeerCacheEngine() = default;
~PeerCacheEngine() override = default;
Status init(const CacheOptions& options) override;
Status init(const RemoteCacheOptions& options) override;
Status read(const std::string& key, size_t off, size_t size, IOBuffer* buffer, ReadCacheOptions* options) override;

View File

@ -25,7 +25,7 @@ public:
virtual ~RemoteCacheEngine() = default;
// Init remote cache
virtual Status init(const CacheOptions& options) = 0;
virtual Status init(const RemoteCacheOptions& options) = 0;
// Write data to remote cache
virtual Status write(const std::string& key, const IOBuffer& buffer, WriteCacheOptions* options) = 0;

View File

@ -27,7 +27,7 @@
namespace starrocks {
Status StarCacheEngine::init(const CacheOptions& options) {
Status StarCacheEngine::init(const DiskCacheOptions& options) {
starcache::CacheOptions opt;
opt.mem_quota_bytes = options.mem_space_size;
for (auto& dir : options.dir_spaces) {

View File

@ -26,7 +26,7 @@ public:
StarCacheEngine() = default;
virtual ~StarCacheEngine() override = default;
Status init(const CacheOptions& options) override;
Status init(const DiskCacheOptions& options);
bool is_initialized() const override { return _initialized.load(std::memory_order_relaxed); }
Status write(const std::string& key, const IOBuffer& buffer, WriteCacheOptions* options) override;

View File

@ -37,12 +37,12 @@ void BinaryColumnBase<T>::check_or_die() const {
CHECK_EQ(_bytes.size(), _offsets.back());
size_t size = this->size();
for (size_t i = 0; i < size; i++) {
CHECK_GE(_offsets[i + 1], _offsets[i]);
DCHECK_GE(_offsets[i + 1], _offsets[i]);
}
if (_slices_cache) {
for (size_t i = 0; i < size; i++) {
CHECK_EQ(_slices[i].data, get_slice(i).data);
CHECK_EQ(_slices[i].size, get_slice(i).size);
DCHECK_EQ(_slices[i].data, get_slice(i).data);
DCHECK_EQ(_slices[i].size, get_slice(i).size);
}
}
}
@ -83,35 +83,69 @@ void BinaryColumnBase<T>::append(const Column& src, size_t offset, size_t count)
}
template <typename T>
void BinaryColumnBase<T>::append_selective(const Column& src, const uint32_t* indexes, uint32_t from, uint32_t size) {
void BinaryColumnBase<T>::append_selective(const Column& src, const uint32_t* indexes, uint32_t from,
const uint32_t size) {
if (src.is_binary_view()) {
down_cast<const ColumnView*>(&src)->append_to(*this, indexes, from, size);
return;
}
indexes += from;
const auto& src_column = down_cast<const BinaryColumnBase<T>&>(src);
const auto& src_offsets = src_column.get_offset();
const auto& src_bytes = src_column.get_bytes();
size_t cur_row_count = _offsets.size() - 1;
size_t cur_byte_size = _bytes.size();
const size_t prev_num_offsets = _offsets.size();
const size_t prev_num_rows = prev_num_offsets - 1;
_offsets.resize(cur_row_count + size + 1);
_offsets.resize(prev_num_offsets + size * 2);
auto* __restrict new_offsets = _offsets.data() + prev_num_offsets;
const auto* __restrict src_offsets = src_column.get_offset().data();
// Buffer i-th start offset and end offset in new_offsets[i * 2] and new_offsets[i * 2 + 1].
for (size_t i = 0; i < size; i++) {
uint32_t row_idx = indexes[from + i];
T str_size = src_offsets[row_idx + 1] - src_offsets[row_idx];
_offsets[cur_row_count + i + 1] = _offsets[cur_row_count + i] + str_size;
cur_byte_size += str_size;
const uint32_t src_idx = indexes[i];
new_offsets[i * 2] = src_offsets[src_idx];
new_offsets[i * 2 + 1] = src_offsets[src_idx + 1];
}
_bytes.resize(cur_byte_size);
auto* dest_bytes = _bytes.data();
for (size_t i = 0; i < size; i++) {
uint32_t row_idx = indexes[from + i];
T str_size = src_offsets[row_idx + 1] - src_offsets[row_idx];
strings::memcpy_inlined(dest_bytes + _offsets[cur_row_count + i], src_bytes.data() + src_offsets[row_idx],
str_size);
// Write bytes
{
size_t num_bytes = _bytes.size();
for (size_t i = 0; i < size; i++) {
num_bytes += new_offsets[i * 2 + 1] - new_offsets[i * 2];
}
_bytes.resize(num_bytes);
const auto* __restrict src_bytes = src_column.get_bytes().data();
auto* __restrict dest_bytes = _bytes.data();
size_t cur_offset = _offsets[prev_num_rows];
if (src_column.get_bytes().size() > 32 * 1024 * 1024ull) {
for (size_t i = 0; i < size; i++) {
if (i + 16 < size) {
// If the source column is large enough, use prefetch to speed up copying.
__builtin_prefetch(src_bytes + new_offsets[i * 2 + 32]);
}
const T str_size = new_offsets[i * 2 + 1] - new_offsets[i * 2];
strings::memcpy_inlined(dest_bytes + cur_offset, src_bytes + new_offsets[i * 2], str_size);
cur_offset += str_size;
}
} else {
for (size_t i = 0; i < size; i++) {
const T str_size = new_offsets[i * 2 + 1] - new_offsets[i * 2];
// Only copy 16 bytes extra when src_column is small enough, because the overhead of copying 16 bytes
// will be large when src_column is large enough.
strings::memcpy_inlined_overflow16(dest_bytes + cur_offset, src_bytes + new_offsets[i * 2], str_size);
cur_offset += str_size;
}
}
}
// Write offsets.
for (int64_t i = 0; i < size; i++) {
new_offsets[i] = new_offsets[i - 1] + (new_offsets[i * 2 + 1] - new_offsets[i * 2]);
}
_offsets.resize(prev_num_offsets + size);
_slices_cache = false;
}

View File

@ -255,7 +255,7 @@ std::unique_ptr<Chunk> Chunk::clone_empty_with_slot(size_t size) const {
columns[i] = _columns[i]->clone_empty();
columns[i]->reserve(size);
}
return std::make_unique<Chunk>(columns, _slot_id_to_index);
return std::make_unique<Chunk>(std::move(columns), _slot_id_to_index);
}
std::unique_ptr<Chunk> Chunk::clone_empty_with_schema() const {

View File

@ -74,6 +74,8 @@ public:
bool is_index() const { return _type == TAccessPathType::type::INDEX; }
bool is_root() const { return _type == TAccessPathType::type::ROOT; }
bool is_from_predicate() const { return _from_predicate; }
bool is_extended() const { return _extended; }

View File

@ -18,6 +18,7 @@
#include "runtime/mem_pool.h"
#include "storage/olap_type_infra.h"
#include "storage/type_traits.h"
#include "types/logical_type.h"
namespace starrocks {
@ -51,6 +52,7 @@ Status datum_from_string(TypeInfo* type_info, Datum* dst, const std::string& str
return Status::OK();
}
/* Type need memory allocated */
case TYPE_VARBINARY:
case TYPE_CHAR:
case TYPE_VARCHAR: {
/* Type need memory allocated */
@ -92,6 +94,7 @@ std::string datum_to_string(TypeInfo* type_info, const Datum& datum) {
switch (type) {
case TYPE_BOOLEAN:
return datum_to_string<TYPE_TINYINT>(type_info, datum);
case TYPE_VARBINARY:
case TYPE_CHAR:
case TYPE_VARCHAR:
return datum_to_string<TYPE_VARCHAR>(type_info, datum);

View File

@ -37,28 +37,36 @@ StatusOr<ColumnPtr> FixedLengthColumnBase<T>::upgrade_if_overflow() {
template <typename T>
void FixedLengthColumnBase<T>::append(const Column& src, size_t offset, size_t count) {
const auto& num_src = down_cast<const FixedLengthColumnBase<T>&>(src);
_data.insert(_data.end(), num_src._data.begin() + offset, num_src._data.begin() + offset + count);
DCHECK(this != &src);
const size_t orig_size = _data.size();
raw::stl_vector_resize_uninitialized(&_data, orig_size + count);
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
strings::memcpy_inlined(_data.data() + orig_size, src_data + offset, count * sizeof(T));
}
template <typename T>
void FixedLengthColumnBase<T>::append_selective(const Column& src, const uint32_t* indexes, uint32_t from,
uint32_t size) {
DCHECK(this != &src);
indexes += from;
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
const size_t orig_size = _data.size();
_data.resize(orig_size + size);
raw::stl_vector_resize_uninitialized(&_data, orig_size + size);
auto* dest_data = _data.data() + orig_size;
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
SIMDGather::gather(dest_data, src_data, indexes, size);
}
template <typename T>
void FixedLengthColumnBase<T>::append_value_multiple_times(const Column& src, uint32_t index, uint32_t size) {
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
DCHECK(this != &src);
size_t orig_size = _data.size();
_data.resize(orig_size + size);
const T* src_data = reinterpret_cast<const T*>(src.raw_data());
for (size_t i = 0; i < size; ++i) {
_data[orig_size + i] = src_data[index];
}

View File

@ -523,10 +523,10 @@ void NullableColumn::put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool
}
void NullableColumn::check_or_die() const {
CHECK_EQ(_null_column->size(), _data_column->size());
DCHECK_EQ(_null_column->size(), _data_column->size());
// when _has_null=true, the column may have no null value, so don't check.
if (!_has_null) {
CHECK(!SIMD::contain_nonzero(_null_column->get_data(), 0));
DCHECK(!SIMD::contain_nonzero(_null_column->get_data(), 0));
}
_data_column->check_or_die();
_null_column->check_or_die();

View File

@ -17,6 +17,8 @@
#include <algorithm>
#include <utility>
#include "exec/sorting/sorting.h"
namespace starrocks {
#ifdef BE_TEST
@ -28,8 +30,13 @@ Schema::Schema(Fields fields) : Schema(fields, KeysType::DUP_KEYS, {}) {
#endif
Schema::Schema(Fields fields, KeysType keys_type, std::vector<ColumnId> sort_key_idxes)
: Schema(std::move(fields), keys_type, std::move(sort_key_idxes), nullptr) {}
Schema::Schema(Fields fields, KeysType keys_type, std::vector<ColumnId> sort_key_idxes,
std::shared_ptr<SortDescs> sort_descs)
: _fields(std::move(fields)),
_sort_key_idxes(std::move(sort_key_idxes)),
_sort_descs(std::move(sort_descs)),
_name_to_index_append_buffer(nullptr),
_keys_type(static_cast<uint8_t>(keys_type)) {
@ -52,9 +59,16 @@ Schema::Schema(Schema* schema, const std::vector<ColumnId>& cids)
_fields[i] = schema->_fields[cids[i]];
cids_to_field_id[cids[i]] = i;
}
for (auto idx : ori_sort_idxes) {
if (schema->sort_descs()) {
_sort_descs = std::make_shared<SortDescs>();
}
for (size_t pos = 0; pos < ori_sort_idxes.size(); ++pos) {
auto idx = ori_sort_idxes[pos];
if (cids_to_field_id.count(idx) > 0) {
_sort_key_idxes.emplace_back(cids_to_field_id[idx]);
if (_sort_descs && pos < schema->sort_descs()->descs.size()) {
_sort_descs->descs.emplace_back(schema->sort_descs()->descs[pos]);
}
}
}
auto is_key = [](const FieldPtr& f) { return f->is_key(); };
@ -88,6 +102,7 @@ Schema::Schema(Schema* schema)
_fields[i] = schema->_fields[i];
}
_sort_key_idxes = schema->sort_key_idxes();
_sort_descs = schema->sort_descs();
if (schema->_name_to_index_append_buffer == nullptr) {
// share the name_to_index with schema, later append fields will be added to _name_to_index_append_buffer
schema->_share_name_to_index = true;
@ -109,6 +124,7 @@ Schema::Schema(const Schema& schema)
_fields[i] = schema._fields[i];
}
_sort_key_idxes = schema.sort_key_idxes();
_sort_descs = schema.sort_descs();
if (schema._name_to_index_append_buffer == nullptr) {
// share the name_to_index with schema&, later append fields will be added to _name_to_index_append_buffer
schema._share_name_to_index = true;
@ -132,6 +148,7 @@ Schema& Schema::operator=(const Schema& other) {
this->_fields[i] = other._fields[i];
}
this->_sort_key_idxes = other.sort_key_idxes();
this->_sort_descs = other.sort_descs();
if (other._name_to_index_append_buffer == nullptr) {
// share the name_to_index with schema&, later append fields will be added to _name_to_index_append_buffer
other._share_name_to_index = true;

View File

@ -24,6 +24,8 @@
namespace starrocks {
struct SortDescs;
// TODO: move constructor and move assignment
class Schema {
public:
@ -39,6 +41,9 @@ public:
explicit Schema(Fields fields, KeysType keys_type, std::vector<ColumnId> sort_key_idxes);
explicit Schema(Fields fields, KeysType keys_type, std::vector<ColumnId> sort_key_idxes,
std::shared_ptr<SortDescs> sort_descs);
// if we use this constructor and share the name_to_index with another schema,
// we must make sure another shema is read only!!!
explicit Schema(Schema* schema);
@ -61,6 +66,10 @@ public:
const std::vector<ColumnId> sort_key_idxes() const { return _sort_key_idxes; }
void append_sort_key_idx(ColumnId idx) { _sort_key_idxes.emplace_back(idx); }
void set_sort_key_idxes(const std::vector<ColumnId>& sort_key_idxes) { _sort_key_idxes = sort_key_idxes; }
std::shared_ptr<SortDescs> sort_descs() const { return _sort_descs; }
void set_sort_descs(const std::shared_ptr<SortDescs>& sort_descs) { _sort_descs = sort_descs; }
void reserve(size_t size) { _fields.reserve(size); }
@ -133,6 +142,7 @@ private:
Fields _fields;
size_t _num_keys = 0;
std::vector<ColumnId> _sort_key_idxes;
std::shared_ptr<SortDescs> _sort_descs;
std::shared_ptr<std::unordered_map<std::string_view, size_t>> _name_to_index;
// If we share the same _name_to_index with another vectorized schema,

View File

@ -323,6 +323,14 @@ CONF_mBool(enable_zonemap_index_memory_page_cache, "true");
// whether to enable the ordinal index memory cache
CONF_mBool(enable_ordinal_index_memory_page_cache, "true");
// ========================== ZONEMAP BEGIN ===================================
// Enable ZoneMap for string (CHAR/VARCHAR) columns using prefix-based min/max
CONF_mBool(enable_string_prefix_zonemap, "true");
// Prefix length used for string ZoneMap min/max when enabled
CONF_mInt32(string_prefix_zonemap_prefix_len, "16");
// ========================== ZONEMAP END ===================================
CONF_mInt32(base_compaction_check_interval_seconds, "60");
CONF_mInt64(min_base_compaction_num_singleton_deltas, "5");
CONF_mInt64(max_base_compaction_num_singleton_deltas, "100");
@ -568,6 +576,8 @@ CONF_mBool(enable_token_check, "true");
// to open/close system metrics
CONF_Bool(enable_system_metrics, "true");
CONF_Bool(enable_jvm_metrics, "false");
CONF_mBool(enable_prefetch, "true");
// Number of cores StarRocks will used, this will effect only when it's greater than 0.
@ -915,6 +925,9 @@ CONF_mInt64(tablet_internal_parallel_min_scan_dop, "4");
// Only the num rows of lake tablet less than lake_tablet_rows_splitted_ratio * splitted_scan_rows, than the lake tablet can be splitted.
CONF_mDouble(lake_tablet_rows_splitted_ratio, "1.5");
// Allow skipping invalid delete_predicate in order to get the segment data back, and do manual correction.
CONF_mBool(lake_tablet_ignore_invalid_delete_predicate, "false");
// The bitmap serialize version.
CONF_Int16(bitmap_serialize_version, "1");
// The max hdfs file handle.
@ -1073,6 +1086,8 @@ CONF_Int64(rpc_connect_timeout_ms, "30000");
CONF_Int32(max_batch_publish_latency_ms, "100");
// Config for opentelemetry tracing.
// Valid example: jaeger_endpoint = localhost:14268
// Invalid example: jaeger_endpoint = http://localhost:14268
CONF_String(jaeger_endpoint, "");
// Config for query debug trace
@ -1511,8 +1526,10 @@ CONF_mBool(lake_enable_vertical_compaction_fill_data_cache, "true");
CONF_mInt32(dictionary_cache_refresh_timeout_ms, "60000"); // 1 min
CONF_mInt32(dictionary_cache_refresh_threadpool_size, "8");
// ======================= FLAT JSON start ==============================================
// json flat flag
CONF_mBool(enable_json_flat, "false");
CONF_mBool(enable_json_flat, "true");
// enable compaction is base on flat json, not whole json
CONF_mBool(enable_compaction_flat_json, "true");
@ -1546,6 +1563,7 @@ CONF_mInt32(json_flat_column_max, "100");
// for whitelist on flat json remain data, max set 1kb
CONF_mInt32(json_flat_remain_filter_max_bytes, "1024");
// ======================= FLAT JSON end ==============================================
// Allowable intervals for continuous generation of pk dumps
// Disable when pk_dump_interval_seconds <= 0
@ -1589,6 +1607,8 @@ CONF_mBool(apply_del_vec_after_all_index_filter, "true");
CONF_mDouble(connector_sink_mem_high_watermark_ratio, "0.3");
CONF_mDouble(connector_sink_mem_low_watermark_ratio, "0.1");
CONF_mDouble(connector_sink_mem_urgent_space_ratio, "0.1");
// Whether enable spill intermediate data for connector sink.
CONF_mBool(enable_connector_sink_spill, "true");
// .crm file can be removed after 1day.
CONF_mInt32(unused_crm_file_threshold_second, "86400" /** 1day **/);
@ -1729,4 +1749,5 @@ CONF_mInt64(split_exchanger_buffer_chunk_num, "1000");
// when to split hashmap/hashset into two level hashmap/hashset, negative number means use default value
CONF_mInt64(two_level_memory_threshold, "-1");
} // namespace starrocks::config

View File

@ -210,6 +210,7 @@ void jemalloc_tracker_daemon(void* arg_this) {
static void init_starrocks_metrics(const std::vector<StorePath>& store_paths) {
bool init_system_metrics = config::enable_system_metrics;
bool init_jvm_metrics = config::enable_jvm_metrics;
std::set<std::string> disk_devices;
std::vector<std::string> network_interfaces;
std::vector<std::string> paths;
@ -229,7 +230,8 @@ static void init_starrocks_metrics(const std::vector<StorePath>& store_paths) {
return;
}
}
StarRocksMetrics::instance()->initialize(paths, init_system_metrics, disk_devices, network_interfaces);
StarRocksMetrics::instance()->initialize(paths, init_system_metrics, init_jvm_metrics, disk_devices,
network_interfaces);
}
void sigterm_handler(int signo, siginfo_t* info, void* context) {

View File

@ -78,11 +78,11 @@
#define VLOG_OPERATOR VLOG(3)
#define VLOG_ROW VLOG(10)
#define VLOG_PROGRESS VLOG(2)
#define VLOG_CACHE VLOG(1)
#define VLOG_CACHE VLOG(3)
#define VLOG_CONNECTION_IS_ON VLOG_IS_ON(1)
#define VLOG_CONNECTION_IS_ON VLOG_IS_ON(2)
#define VLOG_RPC_IS_ON VLOG_IS_ON(2)
#define VLOG_QUERY_IS_ON VLOG_IS_ON(1)
#define VLOG_QUERY_IS_ON VLOG_IS_ON(2)
#define VLOG_FILE_IS_ON VLOG_IS_ON(2)
#define VLOG_OPERATOR_IS_ON VLOG_IS_ON(3)
#define VLOG_ROW_IS_ON VLOG_IS_ON(10)

View File

@ -40,12 +40,17 @@ void Tracer::release_instance() {
Instance().shutdown();
}
static inline opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> create_no_op_tracer() {
return opentelemetry::trace::Provider::GetTracerProvider()->GetTracer("no-op", OPENTELEMETRY_SDK_VERSION);
}
void Tracer::init(const std::string& service_name) {
if (!config::jaeger_endpoint.empty()) {
opentelemetry::exporter::jaeger::JaegerExporterOptions opts;
vector<string> host_port = strings::Split(config::jaeger_endpoint, ":");
if (host_port.size() != 2) {
LOG(WARNING) << "bad jaeger_endpoint " << config::jaeger_endpoint;
_tracer = create_no_op_tracer();
return;
}
opts.endpoint = host_port[0];
@ -63,7 +68,7 @@ void Tracer::init(const std::string& service_name) {
new opentelemetry::sdk::trace::TracerProvider(std::move(processor), jaeger_resource));
_tracer = provider->GetTracer(service_name, OPENTELEMETRY_SDK_VERSION);
} else {
_tracer = opentelemetry::trace::Provider::GetTracerProvider()->GetTracer("no-op", OPENTELEMETRY_SDK_VERSION);
_tracer = create_no_op_tracer();
}
}

View File

@ -31,6 +31,8 @@ add_library(Connector STATIC
utils.cpp
async_flush_stream_poller.cpp
sink_memory_manager.cpp
partition_chunk_writer.cpp
connector_sink_executor.cpp
deletion_vector/deletion_vector.cpp
deletion_vector/deletion_bitmap.cpp
)

View File

@ -16,7 +16,7 @@
namespace starrocks::connector {
void AsyncFlushStreamPoller::enqueue(std::unique_ptr<Stream> stream) {
void AsyncFlushStreamPoller::enqueue(std::shared_ptr<Stream> stream) {
auto async_status = stream->io_status();
_queue.push_back({
.stream = std::move(stream),

View File

@ -34,7 +34,7 @@ public:
virtual ~AsyncFlushStreamPoller() = default;
virtual void enqueue(std::unique_ptr<Stream> stream);
virtual void enqueue(std::shared_ptr<Stream> stream);
// return a pair of
// 1. io status
@ -45,7 +45,7 @@ public:
private:
struct StreamWithStatus {
std::unique_ptr<Stream> stream;
std::shared_ptr<Stream> stream;
std::future<Status> async_status;
};

View File

@ -24,21 +24,18 @@ namespace starrocks::connector {
ConnectorChunkSink::ConnectorChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory,
int64_t max_file_size, RuntimeState* state, bool support_null_partition)
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory,
RuntimeState* state, bool support_null_partition)
: _partition_column_names(std::move(partition_columns)),
_partition_column_evaluators(std::move(partition_column_evaluators)),
_location_provider(std::move(location_provider)),
_file_writer_factory(std::move(file_writer_factory)),
_max_file_size(max_file_size),
_partition_chunk_writer_factory(std::move(partition_chunk_writer_factory)),
_state(state),
_support_null_partition(support_null_partition) {}
Status ConnectorChunkSink::init() {
RETURN_IF_ERROR(ColumnEvaluator::init(_partition_column_evaluators));
RETURN_IF_ERROR(_file_writer_factory->init());
_op_mem_mgr->init(&_writer_stream_pairs, _io_poller,
RETURN_IF_ERROR(_partition_chunk_writer_factory->init());
_op_mem_mgr->init(&_partition_chunk_writers, _io_poller,
[this](const CommitResult& r) { this->callback_on_commit(r); });
return Status::OK();
}
@ -49,38 +46,20 @@ Status ConnectorChunkSink::write_partition_chunk(const std::string& partition,
// They are under the same dir path, but should not in the same data file.
// We should record them in different files so that each data file could has its own meta info.
// otherwise, the scanFileTask may filter data incorrectly.
auto it = _writer_stream_pairs.find(std::make_pair(partition, partition_field_null_list));
if (it != _writer_stream_pairs.end()) {
Writer* writer = it->second.first.get();
if (writer->get_written_bytes() >= _max_file_size) {
string null_fingerprint(partition_field_null_list.size(), '0');
std::transform(partition_field_null_list.begin(), partition_field_null_list.end(), null_fingerprint.begin(),
[](int8_t b) { return b + '0'; });
callback_on_commit(writer->commit().set_extra_data(null_fingerprint));
_writer_stream_pairs.erase(it);
auto path =
!_partition_column_names.empty() ? _location_provider->get(partition) : _location_provider->get();
ASSIGN_OR_RETURN(auto new_writer_and_stream, _file_writer_factory->create(path));
std::unique_ptr<Writer> new_writer = std::move(new_writer_and_stream.writer);
std::unique_ptr<Stream> new_stream = std::move(new_writer_and_stream.stream);
RETURN_IF_ERROR(new_writer->init());
RETURN_IF_ERROR(new_writer->write(chunk));
_writer_stream_pairs[std::make_pair(partition, partition_field_null_list)] =
std::make_pair(std::move(new_writer), new_stream.get());
_io_poller->enqueue(std::move(new_stream));
} else {
RETURN_IF_ERROR(writer->write(chunk));
}
PartitionKey partition_key = std::make_pair(partition, partition_field_null_list);
auto it = _partition_chunk_writers.find(partition_key);
if (it != _partition_chunk_writers.end()) {
return it->second->write(chunk);
} else {
auto path = !_partition_column_names.empty() ? _location_provider->get(partition) : _location_provider->get();
ASSIGN_OR_RETURN(auto new_writer_and_stream, _file_writer_factory->create(path));
std::unique_ptr<Writer> new_writer = std::move(new_writer_and_stream.writer);
std::unique_ptr<Stream> new_stream = std::move(new_writer_and_stream.stream);
RETURN_IF_ERROR(new_writer->init());
RETURN_IF_ERROR(new_writer->write(chunk));
_writer_stream_pairs[std::make_pair(partition, partition_field_null_list)] =
std::make_pair(std::move(new_writer), new_stream.get());
_io_poller->enqueue(std::move(new_stream));
auto writer = _partition_chunk_writer_factory->create(partition, partition_field_null_list);
auto commit_callback = [this](const CommitResult& r) { this->callback_on_commit(r); };
auto error_handler = [this](const Status& s) { this->set_status(s); };
writer->set_commit_callback(commit_callback);
writer->set_error_handler(error_handler);
writer->set_io_poller(_io_poller);
RETURN_IF_ERROR(writer->init());
RETURN_IF_ERROR(writer->write(chunk));
_partition_chunk_writers[partition_key] = writer;
}
return Status::OK();
}
@ -100,19 +79,42 @@ Status ConnectorChunkSink::add(Chunk* chunk) {
}
Status ConnectorChunkSink::finish() {
for (auto& [partition_key, writer_and_stream] : _writer_stream_pairs) {
string extra_data(partition_key.second.size(), '0');
std::transform(partition_key.second.begin(), partition_key.second.end(), extra_data.begin(),
[](int8_t b) { return b + '0'; });
callback_on_commit(writer_and_stream.first->commit().set_extra_data(extra_data));
for (auto& [partition_key, writer] : _partition_chunk_writers) {
RETURN_IF_ERROR(writer->finish());
}
return Status::OK();
}
void ConnectorChunkSink::push_rollback_action(const std::function<void()>& action) {
// Not a very frequent operation, so use unique_lock here is ok.
std::unique_lock<std::shared_mutex> wlck(_mutex);
_rollback_actions.push_back(std::move(action));
}
void ConnectorChunkSink::rollback() {
std::shared_lock<std::shared_mutex> rlck(_mutex);
for (auto& action : _rollback_actions) {
action();
}
}
void ConnectorChunkSink::set_status(const Status& status) {
std::unique_lock<std::shared_mutex> wlck(_mutex);
_status = status;
}
Status ConnectorChunkSink::status() {
std::shared_lock<std::shared_mutex> rlck(_mutex);
return _status;
}
bool ConnectorChunkSink::is_finished() {
for (auto& [partition_key, writer] : _partition_chunk_writers) {
if (!writer->is_finished()) {
return false;
}
}
return true;
}
} // namespace starrocks::connector

View File

@ -20,8 +20,8 @@
#include "column/chunk.h"
#include "common/status.h"
#include "connector/partition_chunk_writer.h"
#include "connector/utils.h"
#include "formats/file_writer.h"
#include "fs/fs.h"
#include "runtime/runtime_state.h"
@ -30,20 +30,14 @@ namespace starrocks::connector {
class AsyncFlushStreamPoller;
class SinkOperatorMemoryManager;
using Writer = formats::FileWriter;
using Stream = io::AsyncFlushOutputStream;
using WriterStreamPair = std::pair<std::unique_ptr<Writer>, Stream*>;
using PartitionKey = std::pair<std::string, std::vector<int8_t>>;
using CommitResult = formats::FileWriter::CommitResult;
using CommitFunc = std::function<void(const CommitResult& result)>;
class ConnectorChunkSink {
public:
ConnectorChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
RuntimeState* state, bool support_null_partition);
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory, RuntimeState* state,
bool support_null_partition);
void set_io_poller(AsyncFlushStreamPoller* poller) { _io_poller = poller; }
@ -59,26 +53,35 @@ public:
void rollback();
bool is_finished();
virtual void callback_on_commit(const CommitResult& result) = 0;
Status write_partition_chunk(const std::string& partition, const vector<int8_t>& partition_field_null_list,
Chunk* chunk);
Status status();
void set_status(const Status& status);
protected:
void push_rollback_action(const std::function<void()>& action);
AsyncFlushStreamPoller* _io_poller = nullptr;
SinkOperatorMemoryManager* _op_mem_mgr = nullptr;
std::vector<std::string> _partition_column_names;
std::vector<std::unique_ptr<ColumnEvaluator>> _partition_column_evaluators;
std::unique_ptr<LocationProvider> _location_provider;
std::unique_ptr<formats::FileWriterFactory> _file_writer_factory;
int64_t _max_file_size = 1024L * 1024 * 1024;
std::unique_ptr<PartitionChunkWriterFactory> _partition_chunk_writer_factory;
RuntimeState* _state = nullptr;
bool _support_null_partition{false};
std::vector<std::function<void()>> _rollback_actions;
std::map<PartitionKey, WriterStreamPair> _writer_stream_pairs;
std::map<PartitionKey, PartitionChunkWriterPtr> _partition_chunk_writers;
inline static std::string DEFAULT_PARTITION = "__DEFAULT_PARTITION__";
std::shared_mutex _mutex;
Status _status;
};
struct ConnectorChunkSinkContext {

View File

@ -0,0 +1,66 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "connector/connector_sink_executor.h"
#include "column/chunk.h"
#include "common/status.h"
#include "connector/partition_chunk_writer.h"
#include "storage/load_chunk_spiller.h"
namespace starrocks::connector {
Status ConnectorSinkSpillExecutor::init() {
return ThreadPoolBuilder(_executor_name)
.set_min_threads(0)
.set_max_threads(calc_max_thread_num())
.build(&_thread_pool);
}
int ConnectorSinkSpillExecutor::calc_max_thread_num() {
int dir_count = 0;
std::vector<starrocks::StorePath> spill_local_storage_paths;
Status st = parse_conf_store_paths(config::spill_local_storage_dir, &spill_local_storage_paths);
if (st.ok()) {
dir_count = spill_local_storage_paths.size();
}
int threads = config::lake_flush_thread_num_per_store;
if (threads == 0) {
threads = -2;
}
if (threads <= 0) {
threads = -threads;
threads *= CpuInfo::num_cores();
}
dir_count = std::max(1, dir_count);
dir_count = std::min(8, dir_count);
return dir_count * threads;
}
void ChunkSpillTask::run() {
auto res = _load_chunk_spiller->spill(*_chunk);
if (_cb) {
_cb(_chunk, res);
}
}
void MergeBlockTask::run() {
auto st = _writer->merge_blocks();
if (_cb) {
_cb(st);
}
}
} // namespace starrocks::connector

View File

@ -0,0 +1,100 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fmt/format.h>
#include <map>
#include "column/chunk.h"
#include "common/status.h"
#include "connector/utils.h"
#include "util/threadpool.h"
namespace starrocks {
class LoadChunkSpiller;
}
namespace starrocks::connector {
class SpillPartitionChunkWriter;
class ConnectorSinkExecutor {
public:
ConnectorSinkExecutor(const std::string& executor_name) : _executor_name(executor_name) {}
virtual ~ConnectorSinkExecutor() {}
virtual Status init() = 0;
ThreadPool* get_thread_pool() { return _thread_pool.get(); }
std::unique_ptr<ThreadPoolToken> create_token() {
return _thread_pool->new_token(ThreadPool::ExecutionMode::SERIAL);
}
Status refresh_max_thread_num() {
if (_thread_pool != nullptr) {
return _thread_pool->update_max_threads(calc_max_thread_num());
}
return Status::OK();
}
protected:
virtual int calc_max_thread_num() = 0;
protected:
std::string _executor_name;
std::unique_ptr<ThreadPool> _thread_pool;
};
class ConnectorSinkSpillExecutor : public ConnectorSinkExecutor {
public:
ConnectorSinkSpillExecutor() : ConnectorSinkExecutor("conn_sink_spill") {}
Status init() override;
protected:
int calc_max_thread_num() override;
};
class ChunkSpillTask final : public Runnable {
public:
ChunkSpillTask(LoadChunkSpiller* load_chunk_spiller, ChunkPtr chunk,
std::function<void(ChunkPtr chunk, const StatusOr<size_t>&)> cb)
: _load_chunk_spiller(load_chunk_spiller), _chunk(chunk), _cb(std::move(cb)) {}
~ChunkSpillTask() override = default;
void run() override;
private:
LoadChunkSpiller* _load_chunk_spiller;
ChunkPtr _chunk;
std::function<void(ChunkPtr, const StatusOr<size_t>&)> _cb;
};
class MergeBlockTask : public Runnable {
public:
MergeBlockTask(SpillPartitionChunkWriter* writer, std::function<void(const Status&)> cb)
: _writer(writer), _cb(std::move(cb)) {}
void run() override;
private:
SpillPartitionChunkWriter* _writer;
std::function<void(const Status&)> _cb;
};
} // namespace starrocks::connector

View File

@ -14,7 +14,6 @@
#include "connector/es_connector.h"
#include "common/logging.h"
#include "exec/es/es_predicate.h"
#include "exec/es/es_query_builder.h"
#include "exec/es/es_scan_reader.h"
@ -22,6 +21,7 @@
#include "exec/es/es_scroll_query.h"
#include "exec/exec_node.h"
#include "exprs/expr.h"
#include "service/backend_options.h"
#include "storage/chunk_helper.h"
namespace starrocks::connector {

View File

@ -31,12 +31,10 @@ namespace starrocks::connector {
FileChunkSink::FileChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory,
RuntimeState* state)
: ConnectorChunkSink(std::move(partition_columns), std::move(partition_column_evaluators),
std::move(location_provider), std::move(file_writer_factory), max_file_size, state, true) {
}
std::move(partition_chunk_writer_factory), state, true) {}
void FileChunkSink::callback_on_commit(const CommitResult& result) {
_rollback_actions.push_back(std::move(result.rollback_action));
@ -49,27 +47,27 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> FileChunkSinkProvider::create_chun
std::shared_ptr<ConnectorChunkSinkContext> context, int32_t driver_id) {
auto ctx = std::dynamic_pointer_cast<FileChunkSinkContext>(context);
auto runtime_state = ctx->fragment_context->runtime_state();
auto fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
std::shared_ptr<FileSystem> fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
auto column_evaluators = ColumnEvaluator::clone(ctx->column_evaluators);
auto location_provider = std::make_unique<connector::LocationProvider>(
auto location_provider = std::make_shared<connector::LocationProvider>(
ctx->path, print_id(ctx->fragment_context->query_id()), runtime_state->be_number(), driver_id,
boost::to_lower_copy(ctx->format));
std::unique_ptr<formats::FileWriterFactory> file_writer_factory;
std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
file_writer_factory = std::make_unique<formats::ParquetFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
std::nullopt, ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators), std::nullopt,
ctx->executor, runtime_state);
} else if (boost::iequals(ctx->format, formats::ORC)) {
file_writer_factory = std::make_unique<formats::ORCFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::ORCFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators), ctx->executor,
runtime_state);
} else if (boost::iequals(ctx->format, formats::CSV)) {
file_writer_factory = std::make_unique<formats::CSVFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::CSVFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators), ctx->executor,
runtime_state);
} else {
file_writer_factory = std::make_unique<formats::UnknownFileWriterFactory>(ctx->format);
file_writer_factory = std::make_shared<formats::UnknownFileWriterFactory>(ctx->format);
}
std::vector<std::string> partition_columns;
@ -78,9 +76,28 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> FileChunkSinkProvider::create_chun
partition_columns.push_back(ctx->column_names[idx]);
partition_column_evaluators.push_back(ctx->column_evaluators[idx]->clone());
}
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory;
// Disable the load spill for file sink temperarily
if (/* config::enable_connector_sink_spill */ false) {
auto partition_chunk_writer_ctx =
std::make_shared<SpillPartitionChunkWriterContext>(SpillPartitionChunkWriterContext{
{file_writer_factory, location_provider, ctx->max_file_size, partition_columns.empty()},
fs,
ctx->fragment_context,
nullptr,
nullptr});
partition_chunk_writer_factory = std::make_unique<SpillPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
} else {
auto partition_chunk_writer_ctx =
std::make_shared<BufferPartitionChunkWriterContext>(BufferPartitionChunkWriterContext{
{file_writer_factory, location_provider, ctx->max_file_size, partition_columns.empty()}});
partition_chunk_writer_factory =
std::make_unique<BufferPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
}
return std::make_unique<connector::FileChunkSink>(partition_columns, std::move(partition_column_evaluators),
std::move(location_provider), std::move(file_writer_factory),
ctx->max_file_size, runtime_state);
std::move(partition_chunk_writer_factory), runtime_state);
}
} // namespace starrocks::connector

View File

@ -36,9 +36,7 @@ class FileChunkSink : public ConnectorChunkSink {
public:
FileChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
RuntimeState* state);
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory, RuntimeState* state);
~FileChunkSink() override = default;

View File

@ -29,12 +29,10 @@ namespace starrocks::connector {
HiveChunkSink::HiveChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory,
RuntimeState* state)
: ConnectorChunkSink(std::move(partition_columns), std::move(partition_column_evaluators),
std::move(location_provider), std::move(file_writer_factory), max_file_size, state,
false) {}
std::move(partition_chunk_writer_factory), state, false) {}
void HiveChunkSink::callback_on_commit(const CommitResult& result) {
_rollback_actions.push_back(std::move(result.rollback_action));
@ -55,36 +53,56 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> HiveChunkSinkProvider::create_chun
std::shared_ptr<ConnectorChunkSinkContext> context, int32_t driver_id) {
auto ctx = std::dynamic_pointer_cast<HiveChunkSinkContext>(context);
auto runtime_state = ctx->fragment_context->runtime_state();
auto fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value(); // must succeed
std::shared_ptr<FileSystem> fs =
FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value(); // must succeed
auto data_column_evaluators = ColumnEvaluator::clone(ctx->data_column_evaluators);
auto location_provider = std::make_unique<connector::LocationProvider>(
auto location_provider = std::make_shared<connector::LocationProvider>(
ctx->path, print_id(ctx->fragment_context->query_id()), runtime_state->be_number(), driver_id,
boost::to_lower_copy(ctx->format));
std::unique_ptr<formats::FileWriterFactory> file_writer_factory;
std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
// ensure hive compatibility since hive 3 and lower version accepts specific encoding
ctx->options[formats::ParquetWriterOptions::USE_LEGACY_DECIMAL_ENCODING] = "true";
ctx->options[formats::ParquetWriterOptions::USE_INT96_TIMESTAMP_ENCODING] = "true";
file_writer_factory = std::make_unique<formats::ParquetFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->data_column_names,
std::move(data_column_evaluators), std::nullopt, ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->data_column_names, std::move(data_column_evaluators),
std::nullopt, ctx->executor, runtime_state);
} else if (boost::iequals(ctx->format, formats::ORC)) {
file_writer_factory = std::make_unique<formats::ORCFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->data_column_names,
std::move(data_column_evaluators), ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::ORCFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->data_column_names, std::move(data_column_evaluators),
ctx->executor, runtime_state);
} else if (boost::iequals(ctx->format, formats::TEXTFILE)) {
file_writer_factory = std::make_unique<formats::CSVFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->data_column_names,
std::move(data_column_evaluators), ctx->executor, runtime_state);
file_writer_factory = std::make_shared<formats::CSVFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->data_column_names, std::move(data_column_evaluators),
ctx->executor, runtime_state);
} else {
file_writer_factory = std::make_unique<formats::UnknownFileWriterFactory>(ctx->format);
file_writer_factory = std::make_shared<formats::UnknownFileWriterFactory>(ctx->format);
}
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory;
// Disable the load spill for hive sink temperarily
if (/* config::enable_connector_sink_spill */ false) {
auto partition_chunk_writer_ctx = std::make_shared<SpillPartitionChunkWriterContext>(
SpillPartitionChunkWriterContext{{file_writer_factory, location_provider, ctx->max_file_size,
ctx->partition_column_names.empty()},
fs,
ctx->fragment_context,
nullptr,
nullptr});
partition_chunk_writer_factory = std::make_unique<SpillPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
} else {
auto partition_chunk_writer_ctx = std::make_shared<BufferPartitionChunkWriterContext>(
BufferPartitionChunkWriterContext{{file_writer_factory, location_provider, ctx->max_file_size,
ctx->partition_column_names.empty()}});
partition_chunk_writer_factory =
std::make_unique<BufferPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
}
auto partition_column_evaluators = ColumnEvaluator::clone(ctx->partition_column_evaluators);
return std::make_unique<connector::HiveChunkSink>(
ctx->partition_column_names, std::move(partition_column_evaluators), std::move(location_provider),
std::move(file_writer_factory), ctx->max_file_size, runtime_state);
return std::make_unique<connector::HiveChunkSink>(ctx->partition_column_names,
std::move(partition_column_evaluators),
std::move(partition_chunk_writer_factory), runtime_state);
}
} // namespace starrocks::connector

View File

@ -38,9 +38,7 @@ class HiveChunkSink : public ConnectorChunkSink {
public:
HiveChunkSink(std::vector<std::string> partition_columns,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
RuntimeState* state);
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory, RuntimeState* state);
~HiveChunkSink() override = default;

View File

@ -30,15 +30,14 @@ namespace starrocks::connector {
IcebergChunkSink::IcebergChunkSink(std::vector<std::string> partition_columns, std::vector<std::string> transform_exprs,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory,
int64_t max_file_size, RuntimeState* state)
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory,
RuntimeState* state)
: ConnectorChunkSink(std::move(partition_columns), std::move(partition_column_evaluators),
std::move(location_provider), std::move(file_writer_factory), max_file_size, state, true),
std::move(partition_chunk_writer_factory), state, true),
_transform_exprs(std::move(transform_exprs)) {}
void IcebergChunkSink::callback_on_commit(const CommitResult& result) {
_rollback_actions.push_back(std::move(result.rollback_action));
push_rollback_action(std::move(result.rollback_action));
if (result.io_status.ok()) {
_state->update_num_rows_load_sink(result.file_statistics.record_count);
@ -82,27 +81,46 @@ StatusOr<std::unique_ptr<ConnectorChunkSink>> IcebergChunkSinkProvider::create_c
std::shared_ptr<ConnectorChunkSinkContext> context, int32_t driver_id) {
auto ctx = std::dynamic_pointer_cast<IcebergChunkSinkContext>(context);
auto runtime_state = ctx->fragment_context->runtime_state();
auto fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
std::shared_ptr<FileSystem> fs = FileSystem::CreateUniqueFromString(ctx->path, FSOptions(&ctx->cloud_conf)).value();
auto column_evaluators = ColumnEvaluator::clone(ctx->column_evaluators);
auto location_provider = std::make_unique<connector::LocationProvider>(
auto location_provider = std::make_shared<connector::LocationProvider>(
ctx->path, print_id(ctx->fragment_context->query_id()), runtime_state->be_number(), driver_id,
boost::to_lower_copy(ctx->format));
std::unique_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
file_writer_factory = std::make_unique<formats::ParquetFileWriterFactory>(
std::move(fs), ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->parquet_field_ids, ctx->executor, runtime_state);
} else {
file_writer_factory = std::make_unique<formats::UnknownFileWriterFactory>(ctx->format);
}
std::vector<std::string>& partition_columns = ctx->partition_column_names;
std::vector<std::string>& transform_exprs = ctx->transform_exprs;
auto partition_evaluators = ColumnEvaluator::clone(ctx->partition_evaluators);
return std::make_unique<connector::IcebergChunkSink>(
partition_columns, transform_exprs, std::move(partition_evaluators), std::move(location_provider),
std::move(file_writer_factory), ctx->max_file_size, runtime_state);
std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
if (boost::iequals(ctx->format, formats::PARQUET)) {
file_writer_factory = std::make_shared<formats::ParquetFileWriterFactory>(
fs, ctx->compression_type, ctx->options, ctx->column_names, std::move(column_evaluators),
ctx->parquet_field_ids, ctx->executor, runtime_state);
} else {
file_writer_factory = std::make_shared<formats::UnknownFileWriterFactory>(ctx->format);
}
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory;
if (config::enable_connector_sink_spill) {
auto partition_chunk_writer_ctx =
std::make_shared<SpillPartitionChunkWriterContext>(SpillPartitionChunkWriterContext{
{file_writer_factory, location_provider, ctx->max_file_size, partition_columns.empty()},
fs,
ctx->fragment_context,
runtime_state->desc_tbl().get_tuple_descriptor(ctx->tuple_desc_id),
&ctx->column_evaluators,
ctx->sort_ordering});
partition_chunk_writer_factory = std::make_unique<SpillPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
} else {
auto partition_chunk_writer_ctx =
std::make_shared<BufferPartitionChunkWriterContext>(BufferPartitionChunkWriterContext{
{file_writer_factory, location_provider, ctx->max_file_size, partition_columns.empty()}});
partition_chunk_writer_factory =
std::make_unique<BufferPartitionChunkWriterFactory>(partition_chunk_writer_ctx);
}
return std::make_unique<connector::IcebergChunkSink>(partition_columns, transform_exprs,
std::move(partition_evaluators),
std::move(partition_chunk_writer_factory), runtime_state);
}
Status IcebergChunkSink::add(Chunk* chunk) {

View File

@ -37,9 +37,7 @@ class IcebergChunkSink : public ConnectorChunkSink {
public:
IcebergChunkSink(std::vector<std::string> partition_columns, std::vector<std::string> transform_exprs,
std::vector<std::unique_ptr<ColumnEvaluator>>&& partition_column_evaluators,
std::unique_ptr<LocationProvider> location_provider,
std::unique_ptr<formats::FileWriterFactory> file_writer_factory, int64_t max_file_size,
RuntimeState* state);
std::unique_ptr<PartitionChunkWriterFactory> partition_chunk_writer_factory, RuntimeState* state);
~IcebergChunkSink() override = default;
@ -70,6 +68,8 @@ struct IcebergChunkSinkContext : public ConnectorChunkSinkContext {
PriorityThreadPool* executor = nullptr;
TCloudConfiguration cloud_conf;
pipeline::FragmentContext* fragment_context = nullptr;
int tuple_desc_id = -1;
std::shared_ptr<SortOrdering> sort_ordering;
};
class IcebergChunkSinkProvider : public ConnectorChunkSinkProvider {

View File

@ -374,8 +374,9 @@ Status LakeDataSource::init_tablet_reader(RuntimeState* runtime_state) {
_params.plan_node_id = _morsel->get_plan_node_id();
_params.scan_range = _morsel->get_scan_range();
}
ASSIGN_OR_RETURN(_reader, _tablet.new_reader(std::move(child_schema), need_split,
_provider->could_split_physically(), _morsel->rowsets()));
ASSIGN_OR_RETURN(_reader,
_tablet.new_reader(std::move(child_schema), need_split, _provider->could_split_physically(),
_morsel->rowsets(), _tablet_schema));
if (reader_columns.size() == scanner_columns.size()) {
_prj_iter = _reader;
} else {
@ -434,7 +435,15 @@ Status LakeDataSource::_extend_schema_by_access_paths() {
column.set_type(value_type);
column.set_length(path->value_type().len);
column.set_is_nullable(true);
column.set_extended_info(std::make_unique<ExtendedColumnInfo>(path.get(), root_column_index));
int32_t root_uid = _tablet_schema->column(static_cast<size_t>(root_column_index)).unique_id();
column.set_extended_info(std::make_unique<ExtendedColumnInfo>(path.get(), root_uid));
// For UNIQUE/AGG tables, extended flat JSON subcolumns behave like value columns
// and must carry a valid aggregation for pre-aggregation. Use REPLACE.
auto keys_type = _tablet_schema->keys_type();
if (keys_type == KeysType::UNIQUE_KEYS || keys_type == KeysType::AGG_KEYS) {
column.set_aggregation(StorageAggregateType::STORAGE_AGGREGATE_REPLACE);
}
tmp_schema->append_column(column);
VLOG(2) << "extend the access path column: " << path->linear_path();
@ -464,6 +473,28 @@ Status LakeDataSource::init_column_access_paths(Schema* schema) {
LOG(WARNING) << "failed to find column in schema: " << root;
}
}
// Preserve access paths referenced by extended columns even if not selected by pushdown
{
std::unordered_set<const ColumnAccessPath*> kept;
kept.reserve(new_one.size());
for (const auto& p : new_one) kept.insert(p.get());
for (size_t i = 0; i < _tablet_schema->num_columns(); ++i) {
const auto& col = _tablet_schema->column(i);
if (!col.is_extended() || col.extended_info() == nullptr || col.extended_info()->access_path == nullptr) {
continue;
}
const ColumnAccessPath* needed = col.extended_info()->access_path;
if (kept.find(needed) != kept.end()) continue;
for (auto& owned : _column_access_paths) {
if (owned.get() == needed) {
new_one.emplace_back(std::move(owned));
kept.insert(needed);
break;
}
}
}
}
_column_access_paths = std::move(new_one);
_params.column_access_paths = &_column_access_paths;

View File

@ -0,0 +1,388 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "connector/partition_chunk_writer.h"
#include "column/chunk.h"
#include "common/status.h"
#include "connector/async_flush_stream_poller.h"
#include "connector/connector_sink_executor.h"
#include "connector/sink_memory_manager.h"
#include "exec/pipeline/fragment_context.h"
#include "formats/file_writer.h"
#include "runtime/runtime_state.h"
#include "storage/chunk_helper.h"
#include "storage/convert_helper.h"
#include "storage/load_spill_block_manager.h"
#include "storage/storage_engine.h"
#include "storage/types.h"
#include "util/monotime.h"
namespace starrocks::connector {
// Base partition writer: owns at most one active file writer per partition and
// rolls files via commit_file(). `partition` is the partition path fragment;
// `partition_field_null_list` records, per partition field, whether the value
// was NULL (non-zero) or not (0).
PartitionChunkWriter::PartitionChunkWriter(std::string partition, std::vector<int8_t> partition_field_null_list,
                                           const std::shared_ptr<PartitionChunkWriterContext>& ctx)
        : _partition(std::move(partition)),
          _partition_field_null_list(std::move(partition_field_null_list)),
          _file_writer_factory(ctx->file_writer_factory),
          _location_provider(ctx->location_provider),
          _max_file_size(ctx->max_file_size),
          _is_default_partition(ctx->is_default_partition) {
    // Encode the null flags as a printable '0'/'1' string; it is attached as
    // extra data to every commit result (see commit_file()).
    _commit_extra_data.resize(_partition_field_null_list.size(), '0');
    std::transform(_partition_field_null_list.begin(), _partition_field_null_list.end(), _commit_extra_data.begin(),
                   [](int8_t b) { return b + '0'; });
}

// Lazily create the file writer (and its async output stream) for this
// partition. The stream is handed to the IO poller so flushing proceeds
// asynchronously. No-op when a writer already exists.
// NOTE(review): assumes set_io_poller() was called before the first write —
// _io_poller is dereferenced unconditionally; confirm against the operator setup.
Status PartitionChunkWriter::create_file_writer_if_needed() {
    if (!_file_writer) {
        // The default (unpartitioned) sink writes to the root location.
        std::string path = _is_default_partition ? _location_provider->get() : _location_provider->get(_partition);
        ASSIGN_OR_RETURN(auto new_writer_and_stream, _file_writer_factory->create(path));
        _file_writer = std::move(new_writer_and_stream.writer);
        _out_stream = std::move(new_writer_and_stream.stream);
        RETURN_IF_ERROR(_file_writer->init());
        _io_poller->enqueue(_out_stream);
    }
    return Status::OK();
}

// Seal the current file and report the result through the commit callback.
// After this call both _file_writer and _out_stream are reset to nullptr, so
// the next write must call create_file_writer_if_needed() again.
void PartitionChunkWriter::commit_file() {
    if (!_file_writer) {
        return;
    }
    auto result = _file_writer->commit();
    _commit_callback(result.set_extra_data(_commit_extra_data));
    _file_writer = nullptr;
    VLOG(3) << "commit to remote file, filename: " << _out_stream->filename()
            << ", size: " << result.file_statistics.file_size;
    _out_stream = nullptr;
}
// Nothing to prepare for the buffered writer; files are created lazily on write.
Status BufferPartitionChunkWriter::init() {
    return Status::OK();
}
// Append `chunk` to the current file, rolling over to a new file once the
// current one reaches _max_file_size.
Status BufferPartitionChunkWriter::write(Chunk* chunk) {
    RETURN_IF_ERROR(create_file_writer_if_needed());
    if (_file_writer->get_written_bytes() >= _max_file_size) {
        // Seal the full file; this resets _file_writer to nullptr.
        commit_file();
        // BUGFIX: a new writer must be created after commit_file(), otherwise the
        // write below dereferences a null _file_writer. This mirrors the
        // commit-then-create ordering used by SpillPartitionChunkWriter::_write_chunk.
        RETURN_IF_ERROR(create_file_writer_if_needed());
    }
    return _file_writer->write(chunk);
}
// Flush simply seals the current file; the buffered writer keeps no other state.
Status BufferPartitionChunkWriter::flush() {
    commit_file();
    return Status::OK();
}

// Finish is identical to flush for the buffered writer: commit whatever file
// is open and return.
Status BufferPartitionChunkWriter::finish() {
    commit_file();
    return Status::OK();
}
SpillPartitionChunkWriter::SpillPartitionChunkWriter(std::string partition,
                                                     std::vector<int8_t> partition_field_null_list,
                                                     const std::shared_ptr<SpillPartitionChunkWriterContext>& ctx)
        : PartitionChunkWriter(std::move(partition), std::move(partition_field_null_list), ctx),
          _fs(ctx->fs),
          _fragment_context(ctx->fragment_context),
          _column_evaluators(ctx->column_evaluators),
          _sort_ordering(ctx->sort_ordering) {
    // Two independent task tokens: one for async chunk-spill tasks, one for
    // spill-block merge tasks (see finish()/merge_blocks()).
    _chunk_spill_token = ExecEnv::GetInstance()->connector_sink_spill_executor()->create_token();
    _block_merge_token = StorageEngine::instance()->load_spill_block_merge_executor()->create_token();
    _tuple_desc = ctx->tuple_desc;
    // Unique id isolating this writer's spill blocks; also used in logs.
    _writer_id = generate_uuid();
}

SpillPartitionChunkWriter::~SpillPartitionChunkWriter() {
    // Shut the tokens down so no queued spill/merge task outlives this writer.
    if (_chunk_spill_token) {
        _chunk_spill_token->shutdown();
    }
    if (_block_merge_token) {
        _block_merge_token->shutdown();
    }
}

// Set up the spill-block manager/spiller rooted at the sink's output location
// and initialize the optional column evaluators.
Status SpillPartitionChunkWriter::init() {
    std::string root_location = _location_provider->root_location();
    _load_spill_block_mgr =
            std::make_unique<LoadSpillBlockManager>(_fragment_context->query_id(), _writer_id, root_location, _fs);
    RETURN_IF_ERROR(_load_spill_block_mgr->init());
    _load_chunk_spiller = std::make_unique<LoadChunkSpiller>(_load_spill_block_mgr.get(),
                                                             _fragment_context->runtime_state()->runtime_profile());
    if (_column_evaluators) {
        RETURN_IF_ERROR(ColumnEvaluator::init(*_column_evaluators));
    }
    return Status::OK();
}
// Buffer a copy of the incoming chunk. Once enough bytes accumulate the buffer
// is flushed straight to the output file; if memory is insufficient it is
// spilled instead.
Status SpillPartitionChunkWriter::write(Chunk* chunk) {
    RETURN_IF_ERROR(create_file_writer_if_needed());
    _chunks.push_back(chunk->clone_unique());
    _chunk_bytes_usage += chunk->bytes_usage();
    if (!_base_chunk) {
        // Remember the first buffered chunk: its slot-id map is the reference
        // used later by _flush_chunk to rebuild slot mappings.
        _base_chunk = _chunks.back();
    }
    int64_t max_flush_batch_size = _file_writer->get_flush_batch_size();
    if (_sort_ordering || max_flush_batch_size == 0) {
        // Sorted output (or writers without a batch-size hint) batches up to a
        // whole file before flushing.
        max_flush_batch_size = _max_file_size;
    }
    if (_chunk_bytes_usage >= max_flush_batch_size) {
        return _flush_to_file();
    } else if (_mem_insufficent()) {
        return _spill();
    }
    return Status::OK();
}

// Spill whatever is buffered; invoked by the sink memory manager under memory
// pressure. A writer that never opened a file has nothing buffered to spill.
Status SpillPartitionChunkWriter::flush() {
    RETURN_IF(!_file_writer, Status::OK());
    return _spill();
}

Status SpillPartitionChunkWriter::finish() {
    // Wait for all in-flight spill tasks before deciding how to finalize.
    _chunk_spill_token->wait();
    // If no chunks have been spilled, flush data to remote file directly.
    if (_load_chunk_spiller->empty()) {
        VLOG(2) << "flush to remote directly when finish, query_id: " << print_id(_fragment_context->query_id())
                << ", writer_id: " << print_id(_writer_id);
        RETURN_IF_ERROR(_flush_to_file());
        commit_file();
        return Status::OK();
    }
    // Otherwise merge the spilled blocks asynchronously; the callback commits
    // the final file and routes any failure through the error handler.
    auto cb = [this](const Status& st) {
        LOG_IF(ERROR, !st.ok()) << "fail to merge spill blocks, query_id: " << print_id(_fragment_context->query_id())
                                << ", writer_id: " << print_id(_writer_id);
        _handle_err(st);
        commit_file();
    };
    auto merge_task = std::make_shared<MergeBlockTask>(this, cb);
    return _block_merge_token->submit(merge_task);
}

// Poll interval used by is_finished() while waiting for async tasks to drain.
const int64_t SpillPartitionChunkWriter::kWaitMilliseconds = 10;

// Finished only when both the spill-task queue and the merge-task queue have
// drained within the short wait window.
bool SpillPartitionChunkWriter::is_finished() {
    bool finished = _chunk_spill_token->wait_for(MonoDelta::FromMilliseconds(kWaitMilliseconds)) &&
                    _block_merge_token->wait_for(MonoDelta::FromMilliseconds(kWaitMilliseconds));
    return finished;
}
// Merge all spilled blocks into the final output file(s). Runs as a
// MergeBlockTask submitted from finish().
Status SpillPartitionChunkWriter::merge_blocks() {
    // Push any remaining buffered chunks to spill first, then wait for them.
    RETURN_IF_ERROR(flush());
    _chunk_spill_token->wait();
    auto write_func = [this](Chunk* chunk) { return _flush_chunk(chunk, false); };
    auto flush_func = [this]() {
        // Commit file after each merge function to ensure the data written to one file is ordered,
        // because data generated by different merge function may be unordered.
        if (_sort_ordering) {
            commit_file();
        }
        return Status::OK();
    };
    Status st = _load_chunk_spiller->merge_write(_max_file_size, _sort_ordering != nullptr, false /* do_agg */,
                                                 write_func, flush_func);
    // NOTE(review): other log lines format the query id via print_id() — consider
    // doing the same here for consistent log output.
    VLOG(2) << "finish merge blocks, query_id: " << _fragment_context->query_id() << ", status: " << st.message();
    return st;
}

// Stable-sort _result_chunk in place by the configured sort-key columns.
Status SpillPartitionChunkWriter::_sort() {
    RETURN_IF(!_result_chunk, Status::OK());
    // Swap the rows out of _result_chunk into `chunk` (leaving _result_chunk
    // empty), then append them back in sorted order.
    auto chunk = _result_chunk->clone_empty_with_schema(0);
    _result_chunk->swap_chunk(*chunk);
    SmallPermutation perm = create_small_permutation(static_cast<uint32_t>(chunk->num_rows()));
    Columns columns;
    for (auto sort_key_idx : _sort_ordering->sort_key_idxes) {
        columns.push_back(chunk->get_column_by_index(sort_key_idx));
    }
    RETURN_IF_ERROR(stable_sort_and_tie_columns(false, columns, _sort_ordering->sort_descs, &perm));
    std::vector<uint32_t> selective;
    permutate_to_selective(perm, &selective);
    _result_chunk->rolling_append_selective(*chunk, selective.data(), 0, chunk->num_rows());
    return Status::OK();
}
// Merge the buffered chunks into one result chunk (sorting it if required) and
// submit it as an async spill task. If spilling fails, the callback falls back
// to writing the chunk directly to the remote file.
Status SpillPartitionChunkWriter::_spill() {
    RETURN_IF(_chunks.empty(), Status::OK());
    RETURN_IF_ERROR(_merge_chunks());
    if (_sort_ordering) {
        RETURN_IF_ERROR(_sort());
    }
    auto callback = [this](const ChunkPtr& chunk, const StatusOr<size_t>& res) {
        if (!res.ok()) {
            LOG(ERROR) << "fail to spill connector partition chunk sink, write it to remote file directly. msg: "
                       << res.status().message();
            Status st = _flush_chunk(chunk.get(), true);
            _handle_err(st);
        } else {
            VLOG(3) << "spill chunk data, filename: " << out_stream()->filename() << ", size: " << chunk->bytes_usage()
                    << ", rows: " << chunk->num_rows() << ", partition: " << _partition
                    << ", writer_id: " << _writer_id;
        }
        // Spill (or fallback write) of this chunk is complete.
        _spilling_bytes_usage.fetch_sub(chunk->bytes_usage(), std::memory_order_relaxed);
    };
    auto spill_task = std::make_shared<ChunkSpillTask>(_load_chunk_spiller.get(), _result_chunk, callback);
    RETURN_IF_ERROR(_chunk_spill_token->submit(spill_task));
    // NOTE(review): the add happens after submit, so the callback's fetch_sub may
    // run first and make the counter transiently negative — confirm readers of
    // _spilling_bytes_usage tolerate that.
    _spilling_bytes_usage.fetch_add(_result_chunk->bytes_usage(), std::memory_order_relaxed);
    _chunk_bytes_usage = 0;
    return Status::OK();
}

// Flush buffered chunks directly to the output file. Unsorted chunks are
// written one by one; sorted output is merged + sorted first, and the file is
// committed immediately so each file stays internally ordered.
Status SpillPartitionChunkWriter::_flush_to_file() {
    RETURN_IF(_chunks.empty(), Status::OK());
    if (!_sort_ordering) {
        for (auto& chunk : _chunks) {
            RETURN_IF_ERROR(_flush_chunk(chunk.get(), false));
        }
    } else {
        RETURN_IF_ERROR(_merge_chunks());
        RETURN_IF_ERROR(_sort());
        RETURN_IF_ERROR(_flush_chunk(_result_chunk.get(), true));
        commit_file();
    }
    _chunks.clear();
    _chunk_bytes_usage = 0;
    return Status::OK();
};
// Write `chunk` to the current file. Chunks reloaded from spill lose their
// slot-id map, so it is rebuilt from the base chunk's map translated through
// _col_index_map. With `split` set, the chunk is written in
// config::vector_chunk_size pieces to bound per-write memory.
Status SpillPartitionChunkWriter::_flush_chunk(Chunk* chunk, bool split) {
    if (chunk->get_slot_id_to_index_map().empty()) {
        auto& slot_map = _base_chunk->get_slot_id_to_index_map();
        for (auto& it : slot_map) {
            chunk->set_slot_id_to_index(it.first, _col_index_map[it.second]);
        }
    }
    if (!split) {
        return _write_chunk(chunk);
    }
    size_t chunk_size = config::vector_chunk_size;
    for (size_t offset = 0; offset < chunk->num_rows(); offset += chunk_size) {
        auto sub_chunk = chunk->clone_empty(chunk_size);
        size_t num_rows = std::min(chunk_size, chunk->num_rows() - offset);
        sub_chunk->append(*chunk, offset, num_rows);
        RETURN_IF_ERROR(_write_chunk(sub_chunk.get()));
    }
    return Status::OK();
}

// Low-level write: roll the file when it exceeds _max_file_size (unsorted
// output only — sorted files are rolled by the merge flush callback instead).
Status SpillPartitionChunkWriter::_write_chunk(Chunk* chunk) {
    // BUGFIX: guard on _file_writer before reading its size. After a roll,
    // commit_file() leaves _file_writer == nullptr, and the next call used to
    // dereference it here before create_file_writer_if_needed() ran.
    if (!_sort_ordering && _file_writer && _file_writer->get_written_bytes() >= _max_file_size) {
        commit_file();
    }
    RETURN_IF_ERROR(create_file_writer_if_needed());
    RETURN_IF_ERROR(_file_writer->write(chunk));
    return Status::OK();
}
// Concatenate all buffered chunks into a single schema-carrying result chunk
// (so native-table helper routines can be reused). On the first chunk, also
// build _col_index_map (source column index -> result column index), used by
// _flush_chunk to restore slot-id mappings after spill round-trips.
Status SpillPartitionChunkWriter::_merge_chunks() {
    if (_chunks.empty()) {
        return Status::OK();
    }
    // Create a target chunk with schema to make it can use some
    // module functions of native table directly.
    // BUGFIX: accumulate in size_t — the previous `int` init value made the
    // whole accumulation int-typed, which can overflow on large row counts.
    size_t num_rows = std::accumulate(_chunks.begin(), _chunks.end(), static_cast<size_t>(0),
                                      [](size_t sum, const ChunkPtr& chunk) { return sum + chunk->num_rows(); });
    _result_chunk = _create_schema_chunk(_chunks.front(), num_rows);
    // Map each source column pointer of the first chunk to its index so the
    // evaluated column can be traced back to its source position below.
    std::unordered_map<Column*, size_t> col_ptr_index_map;
    auto& columns = _chunks.front()->columns();
    for (size_t i = 0; i < columns.size(); ++i) {
        col_ptr_index_map[columns[i]->get_ptr()] = i;
    }
    for (auto& chunk : _chunks) {
        for (size_t i = 0; i < _result_chunk->num_columns(); ++i) {
            auto* dst_col = _result_chunk->get_column_by_index(i).get();
            ColumnPtr src_col;
            if (_column_evaluators) {
                ASSIGN_OR_RETURN(src_col, (*_column_evaluators)[i]->evaluate(chunk.get()));
            } else {
                src_col = chunk->get_column_by_index(i);
            }
            dst_col->append(*src_col);
            if (chunk == _chunks.front()) {
                auto it = col_ptr_index_map.find(src_col.get());
                if (it != col_ptr_index_map.end()) {
                    _col_index_map[it->second] = i;
                } else {
                    return Status::InternalError("unknown column index: " + std::to_string(i));
                }
            }
        }
        // Release each source chunk as soon as it has been merged.
        chunk.reset();
    }
    _chunks.clear();
    return Status::OK();
}
// Writer-local memory-pressure check. Always false: spilling is driven
// externally by the sink memory manager (via flush()), not by the writer.
bool SpillPartitionChunkWriter::_mem_insufficent() {
    // Return false because we will trigger spill by sink memory manager.
    return false;
}

// Forward a failure to the registered error handler; OK statuses are ignored.
void SpillPartitionChunkWriter::_handle_err(const Status& st) {
    if (!st.ok()) {
        _error_handler(st);
    }
}

// Build a DUP_KEYS schema from the tuple descriptor, attaching sort keys when
// a sort ordering is configured.
SchemaPtr SpillPartitionChunkWriter::_make_schema() {
    Fields fields;
    for (auto& slot : _tuple_desc->slots()) {
        TypeDescriptor type_desc = slot->type();
        TypeInfoPtr type_info = get_type_info(type_desc.type, type_desc.precision, type_desc.scale);
        auto field = std::make_shared<Field>(slot->id(), slot->col_name(), type_info, slot->is_nullable());
        fields.push_back(field);
    }
    SchemaPtr schema;
    if (_sort_ordering) {
        schema = std::make_shared<Schema>(std::move(fields), KeysType::DUP_KEYS, _sort_ordering->sort_key_idxes,
                                          std::make_shared<SortDescs>(_sort_ordering->sort_descs));
    } else {
        schema = std::make_shared<Schema>(std::move(fields), KeysType::DUP_KEYS, std::vector<uint32_t>(), nullptr);
    }
    return schema;
}

// Create a chunk with capacity `num_rows` that carries a schema. The schema is
// taken from the base chunk when present, otherwise built from the tuple
// descriptor; it is cached in _schema after the first call.
ChunkPtr SpillPartitionChunkWriter::_create_schema_chunk(const ChunkPtr& base_chunk, size_t num_rows) {
    if (!_schema) {
        const SchemaPtr& schema = base_chunk->schema();
        if (schema) {
            _schema = schema;
            if (_sort_ordering) {
                // NOTE(review): this mutates a schema shared with the source
                // chunk — confirm no other reader depends on its original sort
                // metadata.
                _schema->set_sort_key_idxes(_sort_ordering->sort_key_idxes);
                _schema->set_sort_descs(std::make_shared<SortDescs>(_sort_ordering->sort_descs));
            }
        } else {
            _schema = _make_schema();
        }
    }
    auto chunk = ChunkHelper::new_chunk(*_schema, num_rows);
    return chunk;
}
} // namespace starrocks::connector

View File

@ -0,0 +1,256 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "column/chunk.h"
#include "common/status.h"
#include "connector/utils.h"
#include "formats/file_writer.h"
#include "fs/fs.h"
#include "runtime/exec_env.h"
#include "storage/load_chunk_spiller.h"
#include "util/threadpool.h"
#include "util/uid_util.h"
namespace starrocks::connector {
using CommitResult = formats::FileWriter::CommitResult;
using CommitFunc = std::function<void(const CommitResult& result)>;
using ErrorHandleFunc = std::function<void(const Status& status)>;
class AsyncFlushStreamPoller;
// Sort specification for writer output: indexes of the sort-key columns plus
// their sort directions.
struct SortOrdering {
    std::vector<uint32_t> sort_key_idxes;
    SortDescs sort_descs;
};

// Construction parameters shared by all partition chunk writers.
struct PartitionChunkWriterContext {
    std::shared_ptr<formats::FileWriterFactory> file_writer_factory;
    std::shared_ptr<LocationProvider> location_provider;
    // Roll to a new output file once the current one reaches this size.
    int64_t max_file_size = 0;
    // True when the sink writes a single unpartitioned location.
    bool is_default_partition = false;
};

// The buffered (non-spilling) writer needs nothing beyond the base context.
struct BufferPartitionChunkWriterContext : public PartitionChunkWriterContext {};
// Extra parameters needed by the spill-capable writer: the filesystem backing
// the spill blocks, the owning fragment, and (optionally) the tuple layout,
// column evaluators and sort ordering used when merging spilled data.
struct SpillPartitionChunkWriterContext : public PartitionChunkWriterContext {
    std::shared_ptr<FileSystem> fs;
    pipeline::FragmentContext* fragment_context = nullptr;
    TupleDescriptor* tuple_desc = nullptr;
    // BUGFIX: default to nullptr — this raw pointer was previously left
    // uninitialized, so contexts built without it held an indeterminate value.
    std::vector<std::unique_ptr<ColumnEvaluator>>* column_evaluators = nullptr;
    std::shared_ptr<SortOrdering> sort_ordering;
};
// Abstract per-partition writer. Concrete implementations either buffer rows
// directly in a file writer (BufferPartitionChunkWriter) or spill them first
// (SpillPartitionChunkWriter). File commit results and errors are reported
// through the callbacks set via set_commit_callback()/set_error_handler().
class PartitionChunkWriter {
public:
    PartitionChunkWriter(std::string partition, std::vector<int8_t> partition_field_null_list,
                         const std::shared_ptr<PartitionChunkWriterContext>& ctx);

    virtual ~PartitionChunkWriter() = default;

    virtual Status init() = 0;
    // Accept one chunk of rows belonging to this partition.
    virtual Status write(Chunk* chunk) = 0;
    // Force buffered data out (called e.g. under memory pressure).
    virtual Status flush() = 0;
    // Finalize: no more writes will follow.
    virtual Status finish() = 0;
    virtual bool is_finished() = 0;
    virtual int64_t get_written_bytes() = 0;
    // Bytes that could be freed by flushing/spilling right now.
    virtual int64_t get_flushable_bytes() = 0;

    const std::string& partition() const { return _partition; }
    const std::vector<int8_t>& partition_field_null_list() const { return _partition_field_null_list; }
    std::shared_ptr<formats::FileWriter> file_writer() { return _file_writer; }
    std::shared_ptr<io::AsyncFlushOutputStream> out_stream() { return _out_stream; }

    // Must be wired up before the first write; see create_file_writer_if_needed().
    void set_io_poller(AsyncFlushStreamPoller* io_poller) { _io_poller = io_poller; }
    void set_commit_callback(const CommitFunc& commit_callback) { _commit_callback = commit_callback; }
    void set_error_handler(const ErrorHandleFunc& error_handler) { _error_handler = error_handler; }

protected:
    // Lazily create the file writer + async stream for the current file.
    Status create_file_writer_if_needed();
    // Seal the current file and report it via _commit_callback; resets the writer.
    void commit_file();

protected:
    std::string _partition;
    std::vector<int8_t> _partition_field_null_list;
    std::shared_ptr<formats::FileWriterFactory> _file_writer_factory;
    std::shared_ptr<LocationProvider> _location_provider;
    int64_t _max_file_size = 0;
    bool _is_default_partition = false;

    AsyncFlushStreamPoller* _io_poller = nullptr;
    std::shared_ptr<formats::FileWriter> _file_writer;
    std::shared_ptr<io::AsyncFlushOutputStream> _out_stream;
    CommitFunc _commit_callback;
    // '0'/'1' encoding of _partition_field_null_list, attached to commit results.
    std::string _commit_extra_data;
    ErrorHandleFunc _error_handler = nullptr;
};
// Writer that streams rows straight into the underlying file writer and rolls
// files when they reach the configured max size. No spilling.
class BufferPartitionChunkWriter : public PartitionChunkWriter {
public:
    BufferPartitionChunkWriter(std::string partition, std::vector<int8_t> partition_field_null_list,
                               const std::shared_ptr<BufferPartitionChunkWriterContext>& ctx)
            : PartitionChunkWriter(std::move(partition), std::move(partition_field_null_list), ctx) {}

    Status init() override;
    Status write(Chunk* chunk) override;
    Status flush() override;
    Status finish() override;
    // All work happens synchronously in write/flush, so always finished.
    bool is_finished() override { return true; }
    int64_t get_written_bytes() override { return _file_writer ? _file_writer->get_written_bytes() : 0; }
    int64_t get_flushable_bytes() override { return _file_writer ? _file_writer->get_written_bytes() : 0; }
};
class SpillPartitionChunkWriter : public PartitionChunkWriter {
public:
SpillPartitionChunkWriter(std::string partition, std::vector<int8_t> partition_field_null_list,
const std::shared_ptr<SpillPartitionChunkWriterContext>& ctx);
~SpillPartitionChunkWriter();
Status init() override;
Status write(Chunk* chunk) override;
Status flush() override;
Status finish() override;
bool is_finished() override;
int64_t get_written_bytes() override {
if (!_file_writer) {
return 0;
}
return _chunk_bytes_usage + _spilling_bytes_usage.load(std::memory_order_relaxed) +
_file_writer->get_written_bytes();
}
int64_t get_flushable_bytes() override { return _chunk_bytes_usage; }
Status merge_blocks();
private:
Status _sort();
Status _spill();
Status _flush_to_file();
Status _flush_chunk(Chunk* chunk, bool split);
Status _write_chunk(Chunk* chunk);
Status _merge_chunks();
SchemaPtr _make_schema();
ChunkPtr _create_schema_chunk(const ChunkPtr& base_chunk, size_t row_nums);
bool _mem_insufficent();
void _handle_err(const Status& st);
private:
std::shared_ptr<FileSystem> _fs = nullptr;
pipeline::FragmentContext* _fragment_context = nullptr;
TupleDescriptor* _tuple_desc = nullptr;
std::vector<std::unique_ptr<ColumnEvaluator>>* _column_evaluators;
std::shared_ptr<SortOrdering> _sort_ordering;
std::unique_ptr<ThreadPoolToken> _chunk_spill_token;
std::unique_ptr<ThreadPoolToken> _block_merge_token;
std::unique_ptr<LoadSpillBlockManager> _load_spill_block_mgr;
std::shared_ptr<LoadChunkSpiller> _load_chunk_spiller;
//std::function<StatusOr<ColumnPtr>(Chunk*, size_t)> _column_eval_func;
TUniqueId _writer_id;
std::list<ChunkPtr> _chunks;
int64_t _chunk_bytes_usage = 0;
std::atomic<int64_t> _spilling_bytes_usage = 0;
ChunkPtr _result_chunk;
ChunkPtr _base_chunk;
SchemaPtr _schema;
std::unordered_map<int, int> _col_index_map; // result chunk index -> chunk index
static const int64_t kWaitMilliseconds;
};
using PartitionChunkWriterPtr = std::shared_ptr<PartitionChunkWriter>;
// Abstract factory producing one PartitionChunkWriter per partition key.
class PartitionChunkWriterFactory {
public:
    virtual ~PartitionChunkWriterFactory() = default;
    virtual Status init() = 0;
    virtual PartitionChunkWriterPtr create(std::string partition,
                                           std::vector<int8_t> partition_field_null_list) const = 0;
};

// Factory for the buffered (non-spilling) writer.
class BufferPartitionChunkWriterFactory : public PartitionChunkWriterFactory {
public:
    // `explicit` prevents accidental implicit conversion from the context ptr.
    explicit BufferPartitionChunkWriterFactory(std::shared_ptr<BufferPartitionChunkWriterContext> ctx)
            : _ctx(std::move(ctx)) {}
    // BUGFIX: marked `override` to keep the virtual-destructor chain checked.
    ~BufferPartitionChunkWriterFactory() override = default;

    Status init() override { return _ctx->file_writer_factory->init(); }

    PartitionChunkWriterPtr create(std::string partition,
                                   std::vector<int8_t> partition_field_null_list) const override {
        return std::make_shared<BufferPartitionChunkWriter>(std::move(partition), std::move(partition_field_null_list),
                                                            _ctx);
    }

private:
    std::shared_ptr<BufferPartitionChunkWriterContext> _ctx;
};

// Factory for the spill-capable writer.
class SpillPartitionChunkWriterFactory : public PartitionChunkWriterFactory {
public:
    explicit SpillPartitionChunkWriterFactory(std::shared_ptr<SpillPartitionChunkWriterContext> ctx)
            : _ctx(std::move(ctx)) {}
    ~SpillPartitionChunkWriterFactory() override = default;

    Status init() override { return _ctx->file_writer_factory->init(); }

    PartitionChunkWriterPtr create(std::string partition,
                                   std::vector<int8_t> partition_field_null_list) const override {
        return std::make_shared<SpillPartitionChunkWriter>(std::move(partition), std::move(partition_field_null_list),
                                                           _ctx);
    }

private:
    std::shared_ptr<SpillPartitionChunkWriterContext> _ctx;
};
} // namespace starrocks::connector

View File

@ -18,9 +18,9 @@
namespace starrocks::connector {
void SinkOperatorMemoryManager::init(std::map<PartitionKey, WriterStreamPair>* writer_stream_pairs,
void SinkOperatorMemoryManager::init(std::map<PartitionKey, PartitionChunkWriterPtr>* partition_chunk_writers,
AsyncFlushStreamPoller* io_poller, CommitFunc commit_func) {
_candidates = writer_stream_pairs;
_candidates = partition_chunk_writers;
_commit_func = std::move(commit_func);
_io_poller = io_poller;
}
@ -30,24 +30,29 @@ bool SinkOperatorMemoryManager::kill_victim() {
return false;
}
// find file writer with the largest file size
PartitionKey partition;
WriterStreamPair* victim = nullptr;
for (auto& [key, writer_and_stream] : *_candidates) {
if (victim && victim->first->get_written_bytes() > writer_and_stream.first->get_written_bytes()) {
// Find a target file writer to flush.
// For a buffered partition writer, choose the writer with the largest file size.
// For a spillable partition writer, choose the writer with the largest memory size that can be spilled.
PartitionChunkWriterPtr victim = nullptr;
for (auto& [key, writer] : *_candidates) {
int64_t flushable_bytes = writer->get_flushable_bytes();
if (flushable_bytes == 0) {
continue;
}
partition = key;
victim = &writer_and_stream;
if (victim && flushable_bytes < victim->get_flushable_bytes()) {
continue;
}
victim = writer;
}
if (victim == nullptr) {
return false;
}
auto result = victim->first->commit();
_commit_func(result);
LOG(INFO) << "kill victim: " << victim->second->filename() << " size: " << result.file_statistics.file_size;
_candidates->erase(partition);
// Flushing decreases the writer's flushable memory bytes, so this writer
// will usually not be chosen again for a short time.
const auto filename = victim->out_stream()->filename();
const auto result = victim->flush();
LOG(INFO) << "kill victim: " << filename << ", result: " << result;
return true;
}
@ -59,8 +64,8 @@ int64_t SinkOperatorMemoryManager::update_releasable_memory() {
int64_t SinkOperatorMemoryManager::update_writer_occupied_memory() {
int64_t writer_occupied_memory = 0;
for (auto& [_, writer_and_stream] : *_candidates) {
writer_occupied_memory += writer_and_stream.first->get_written_bytes();
for (auto& [_, writer] : *_candidates) {
writer_occupied_memory += writer->get_flushable_bytes();
}
_writer_occupied_memory.store(writer_occupied_memory);
return _writer_occupied_memory;
@ -113,33 +118,29 @@ bool SinkMemoryManager::_apply_on_mem_tracker(SinkOperatorMemoryManager* child_m
auto available_memory = [&]() { return mem_tracker->limit() - mem_tracker->consumption(); };
auto low_watermark = static_cast<int64_t>(mem_tracker->limit() * _low_watermark_ratio);
auto high_watermark = static_cast<int64_t>(mem_tracker->limit() * _high_watermark_ratio);
auto exceed_urgent_space = [&]() {
return _total_writer_occupied_memory() > _query_tracker->limit() * _urgent_space_ratio;
};
if (available_memory() <= low_watermark) {
child_manager->update_releasable_memory();
int64_t flush_watermark = _query_tracker->limit() * _urgent_space_ratio;
while (available_memory() <= low_watermark) {
child_manager->update_writer_occupied_memory();
LOG_EVERY_SECOND(WARNING) << "consumption: " << mem_tracker->consumption()
<< " releasable_memory: " << _total_releasable_memory()
<< " writer_allocated_memory: " << _total_writer_occupied_memory();
// trigger early close
while (exceed_urgent_space() && available_memory() + _total_releasable_memory() < high_watermark) {
bool found = child_manager->kill_victim();
if (!found) {
break;
}
child_manager->update_releasable_memory();
child_manager->update_writer_occupied_memory();
int64_t total_occupied_memory = _total_writer_occupied_memory();
LOG_EVERY_SECOND(INFO) << "consumption: " << mem_tracker->consumption()
<< ", total_occupied_memory: " << total_occupied_memory
<< ", flush_watermark: " << flush_watermark;
if (total_occupied_memory < flush_watermark) {
break;
}
bool found = child_manager->kill_victim();
if (!found) {
break;
}
}
child_manager->update_releasable_memory();
if (available_memory() <= low_watermark && _total_releasable_memory() > 0) {
LOG_EVERY_SECOND(WARNING) << "memory usage is still high after flush, : available_memory" << available_memory()
<< ", memory_low_watermark: " << low_watermark
<< ", total_releasable_memory: " << _total_releasable_memory();
return false;
}
return true;
}

View File

@ -28,8 +28,8 @@ class SinkOperatorMemoryManager {
public:
SinkOperatorMemoryManager() = default;
void init(std::map<PartitionKey, WriterStreamPair>* writer_stream_pairs, AsyncFlushStreamPoller* io_poller,
CommitFunc commit_func);
void init(std::map<PartitionKey, PartitionChunkWriterPtr>* partition_chunk_writers,
AsyncFlushStreamPoller* io_poller, CommitFunc commit_func);
// return true if a victim is found and killed, otherwise return false
bool kill_victim();
@ -45,7 +45,7 @@ public:
int64_t writer_occupied_memory() { return _writer_occupied_memory.load(); }
private:
std::map<PartitionKey, WriterStreamPair>* _candidates = nullptr; // reference, owned by sink operator
std::map<PartitionKey, PartitionChunkWriterPtr>* _candidates = nullptr; // reference, owned by sink operator
CommitFunc _commit_func;
AsyncFlushStreamPoller* _io_poller;
std::atomic_int64_t _releasable_memory{0};

View File

@ -104,6 +104,12 @@ public:
// location = base_path/{query_id}_{be_number}_{driver_id}_index.file_suffix
std::string get() { return fmt::format("{}/{}_{}.{}", _base_path, _file_name_prefix, _index++, _file_name_suffix); }
std::string root_location(const std::string& partition) {
return fmt::format("{}/{}", _base_path, PathUtils::remove_trailing_slash(partition));
}
std::string root_location() { return fmt::format("{}", PathUtils::remove_trailing_slash(_base_path)); }
private:
const std::string _base_path;
const std::string _file_name_prefix;

View File

@ -51,11 +51,13 @@ set(EXEC_FILES
aggregator.cpp
sorted_streaming_aggregator.cpp
aggregate/agg_hash_variant.cpp
aggregate/compress_serializer.cpp
aggregate/aggregate_base_node.cpp
aggregate/aggregate_blocking_node.cpp
aggregate/distinct_blocking_node.cpp
aggregate/aggregate_streaming_node.cpp
aggregate/distinct_streaming_node.cpp
partition/bucket_aware_partition.cpp
partition/chunks_partitioner.cpp
partition/partition_hash_variant.cpp
analytic_node.cpp
@ -156,6 +158,7 @@ set(EXEC_FILES
schema_scanner/schema_be_cloud_native_compactions_scanner.cpp
schema_scanner/schema_pipe_files.cpp
schema_scanner/schema_pipes.cpp
schema_scanner/schema_recyclebin_catalogs.cpp
schema_scanner/starrocks_role_edges_scanner.cpp
schema_scanner/starrocks_grants_to_scanner.cpp
schema_scanner/schema_helper.cpp

View File

@ -0,0 +1,28 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#pragma once

#include <cstdint>

namespace starrocks {
// Raw pointer to an aggregate state buffer (ownership is managed elsewhere --
// presumably by the aggregator's memory pool; confirm at the use sites).
using AggDataPtr = uint8_t*;
// 128-bit integer alias used for wide fixed-size keys/values.
using int128_t = __int128;

// Forward declarations of slice-key hashing helpers defined elsewhere.
class SliceWithHash;
class HashOnSliceWithHash;
class EqualOnSliceWithHash;
} // namespace starrocks

View File

@ -14,27 +14,25 @@
#pragma once
#include <any>
#include <cstdint>
#include <limits>
#include <type_traits>
#include <utility>
#include "column/column.h"
#include "column/column_hash.h"
#include "column/column_helper.h"
#include "column/hash_set.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
#include "common/compiler_util.h"
#include "exec/aggregate/agg_hash_set.h"
#include "exec/aggregate/agg_profile.h"
#include "exec/aggregate/compress_serializer.h"
#include "gutil/casts.h"
#include "gutil/strings/fastmem.h"
#include "runtime/mem_pool.h"
#include "util/fixed_hash_map.h"
#include "util/hash_util.hpp"
#include "util/phmap/phmap.h"
#include "util/phmap/phmap_dump.h"
namespace starrocks {
@ -245,9 +243,10 @@ struct AggHashMapWithOneNumberKeyWithNullable
DCHECK(!key_column->is_nullable());
const auto column = down_cast<const ColumnType*>(key_column);
size_t bucket_count = this->hash_map.bucket_count();
if (bucket_count < prefetch_threhold) {
if constexpr (is_no_prefetch_map<HashMap>) {
this->template compute_agg_noprefetch<Func, HTBuildOp>(column, agg_states,
std::forward<Func>(allocate_func), extra);
} else if (this->hash_map.bucket_count() < prefetch_threhold) {
this->template compute_agg_noprefetch<Func, HTBuildOp>(column, agg_states,
std::forward<Func>(allocate_func), extra);
} else {
@ -1091,4 +1090,151 @@ struct AggHashMapWithSerializedKeyFixedSize
int32_t _chunk_size;
};
// Aggregation hash map whose group-by key is a bit-compressed fixed-size
// integer: the raw key columns are packed into HashMap::key_type (an
// 1/4/8/16-byte integer) by bitcompress_serialize() and unpacked again by
// bitcompress_deserialize() when results are emitted.
template <typename HashMap>
struct AggHashMapWithCompressedKeyFixedSize
        : public AggHashMapWithKey<HashMap, AggHashMapWithCompressedKeyFixedSize<HashMap>> {
    using Self = AggHashMapWithCompressedKeyFixedSize<HashMap>;
    using Base = AggHashMapWithKey<HashMap, AggHashMapWithCompressedKeyFixedSize<HashMap>>;
    using KeyType = typename HashMap::key_type;
    using Iterator = typename HashMap::iterator;
    using FixedSizeSliceKey = typename HashMap::key_type;
    using ResultVector = typename std::vector<FixedSizeSliceKey>;

    template <class... Args>
    AggHashMapWithCompressedKeyFixedSize(int chunk_size, Args&&... args)
            : Base(chunk_size, std::forward<Args>(args)...),
              mem_pool(std::make_unique<MemPool>()),
              _chunk_size(chunk_size) {
        // FIX: was reserve(). fixed_keys is later written through data() and read
        // via operator[] for up to chunk_size entries; accessing elements beyond
        // size() is undefined behavior, so the vector must actually be sized.
        fixed_keys.resize(chunk_size);
    }

    // Compressed fixed-size keys have no dedicated NULL-key slot.
    AggDataPtr get_null_key_data() { return nullptr; }
    void set_null_key_data(AggDataPtr data) {}

    // Build/probe without software prefetching (used for small tables or
    // maps flagged is_no_prefetch_map).
    template <AllocFunc<Self> Func, typename HTBuildOp>
    ALWAYS_NOINLINE void compute_agg_noprefetch(size_t chunk_size, const Columns& key_columns, MemPool* pool,
                                                Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
                                                ExtraAggParam* extra) {
        [[maybe_unused]] size_t hash_table_size = this->hash_map.size();
        auto* __restrict not_founds = extra->not_founds;
        // Pack the key columns into fixed-size compressed keys.
        bitcompress_serialize(key_columns, bases, offsets, chunk_size, sizeof(FixedSizeSliceKey), fixed_keys.data());
        for (size_t i = 0; i < chunk_size; ++i) {
            if constexpr (HTBuildOp::process_limit) {
                // Stop allocating new groups once the limit is reached; later
                // keys only probe and mark misses.
                if (hash_table_size < extra->limits) {
                    _emplace_key(fixed_keys[i], (*agg_states)[i], allocate_func, [&] { hash_table_size++; });
                } else {
                    _find_key((*agg_states)[i], (*not_founds)[i], fixed_keys[i]);
                }
            } else if constexpr (HTBuildOp::allocate) {
                _emplace_key(fixed_keys[i], (*agg_states)[i], allocate_func,
                             FillNotFounds<HTBuildOp::fill_not_found>(not_founds, i));
            } else if constexpr (HTBuildOp::fill_not_found) {
                _find_key((*agg_states)[i], (*not_founds)[i], fixed_keys[i]);
            }
        }
    }

    // Build/probe with precomputed hashes and software prefetching of buckets.
    template <AllocFunc<Self> Func, typename HTBuildOp>
    ALWAYS_NOINLINE void compute_agg_prefetch(size_t chunk_size, const Columns& key_columns, MemPool* pool,
                                              Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
                                              ExtraAggParam* extra) {
        [[maybe_unused]] size_t hash_table_size = this->hash_map.size();
        auto* __restrict not_founds = extra->not_founds;
        // Pack the key columns into fixed-size compressed keys.
        bitcompress_serialize(key_columns, bases, offsets, chunk_size, sizeof(FixedSizeSliceKey), fixed_keys.data());
        // FIX: was reserve(); hashs[i] below indexes past size(), which is UB.
        hashs.resize(chunk_size);
        for (size_t i = 0; i < chunk_size; ++i) {
            hashs[i] = this->hash_map.hash_function()(fixed_keys[i]);
        }
        size_t prefetch_index = AGG_HASH_MAP_DEFAULT_PREFETCH_DIST;
        for (size_t i = 0; i < chunk_size; ++i) {
            if (prefetch_index < chunk_size) {
                this->hash_map.prefetch_hash(hashs[prefetch_index++]);
            }
            if constexpr (HTBuildOp::process_limit) {
                if (hash_table_size < extra->limits) {
                    _emplace_key_with_hash(fixed_keys[i], hashs[i], (*agg_states)[i], allocate_func,
                                           [&] { hash_table_size++; });
                } else {
                    _find_key((*agg_states)[i], (*not_founds)[i], fixed_keys[i]);
                }
            } else if constexpr (HTBuildOp::allocate) {
                _emplace_key_with_hash(fixed_keys[i], hashs[i], (*agg_states)[i], allocate_func,
                                       FillNotFounds<HTBuildOp::fill_not_found>(not_founds, i));
            } else if constexpr (HTBuildOp::fill_not_found) {
                _find_key((*agg_states)[i], (*not_founds)[i], fixed_keys[i]);
            }
        }
    }

    // Entry point: zero the key buffer, then dispatch to the prefetching or
    // non-prefetching build depending on map type and current bucket count.
    template <AllocFunc<Self> Func, typename HTBuildOp>
    void compute_agg_states(size_t chunk_size, const Columns& key_columns, MemPool* pool, Func&& allocate_func,
                            Buffer<AggDataPtr>* agg_states, ExtraAggParam* extra) {
        // Guard against chunks larger than the size used at construction.
        if (chunk_size > fixed_keys.size()) {
            fixed_keys.resize(chunk_size);
        }
        auto* buffer = reinterpret_cast<uint8_t*>(fixed_keys.data());
        memset(buffer, 0x0, sizeof(FixedSizeSliceKey) * chunk_size);
        if constexpr (is_no_prefetch_map<HashMap>) {
            this->template compute_agg_noprefetch<Func, HTBuildOp>(
                    chunk_size, key_columns, pool, std::forward<Func>(allocate_func), agg_states, extra);
        } else if (this->hash_map.bucket_count() < prefetch_threhold) {
            this->template compute_agg_noprefetch<Func, HTBuildOp>(
                    chunk_size, key_columns, pool, std::forward<Func>(allocate_func), agg_states, extra);
        } else {
            this->template compute_agg_prefetch<Func, HTBuildOp>(chunk_size, key_columns, pool,
                                                                 std::forward<Func>(allocate_func), agg_states, extra);
        }
    }

    // Insert-or-find; allocates a new agg state via allocate_func on first
    // sight of a key and invokes callback once per newly created group.
    template <AllocFunc<Self> Func, typename EmplaceCallBack>
    ALWAYS_INLINE void _emplace_key(KeyType key, AggDataPtr& target_state, Func&& allocate_func,
                                    EmplaceCallBack&& callback) {
        auto iter = this->hash_map.lazy_emplace(key, [&](const auto& ctor) {
            callback();
            AggDataPtr pv = allocate_func(key);
            ctor(key, pv);
        });
        target_state = iter->second;
    }

    // Same as _emplace_key but reuses a precomputed hash value.
    template <AllocFunc<Self> Func, typename EmplaceCallBack>
    ALWAYS_INLINE void _emplace_key_with_hash(KeyType key, size_t hash, AggDataPtr& target_state, Func&& allocate_func,
                                              EmplaceCallBack&& callback) {
        auto iter = this->hash_map.lazy_emplace_with_hash(key, hash, [&](const auto& ctor) {
            callback();
            AggDataPtr pv = allocate_func(key);
            ctor(key, pv);
        });
        target_state = iter->second;
    }

    // Probe-only lookup; sets not_found=1 when the key is absent.
    template <typename... Args>
    ALWAYS_INLINE void _find_key(AggDataPtr& target_state, uint8_t& not_found, Args&&... args) {
        if (auto iter = this->hash_map.find(std::forward<Args>(args)...); iter != this->hash_map.end()) {
            target_state = iter->second;
        } else {
            not_found = 1;
        }
    }

    // Unpack compressed keys back into the output key columns.
    void insert_keys_to_columns(ResultVector& keys, Columns& key_columns, int32_t chunk_size) {
        bitcompress_deserialize(key_columns, bases, offsets, used_bits, chunk_size, sizeof(FixedSizeSliceKey),
                                keys.data());
    }

    static constexpr bool has_single_null_key = false;

    std::vector<int> used_bits;                // bit widths used per key column (for deserialize)
    std::vector<int> offsets;                  // bit offsets of each column inside the packed key
    std::vector<std::any> bases;               // per-column base values for the bit compression
    std::vector<FixedSizeSliceKey> fixed_keys; // scratch buffer of packed keys, sized to chunk_size
    std::vector<size_t> hashs;                 // scratch buffer of precomputed hashes
    std::unique_ptr<MemPool> mem_pool;
    ResultVector results;
    int32_t _chunk_size;
};
} // namespace starrocks

View File

@ -14,19 +14,17 @@
#pragma once
#include <any>
#include "column/column_hash.h"
#include "column/column_helper.h"
#include "column/hash_set.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
#include "exec/aggregate/agg_profile.h"
#include "gutil/casts.h"
#include "runtime/mem_pool.h"
#include "runtime/runtime_state.h"
#include "util/fixed_hash_map.h"
#include "util/hash_util.hpp"
#include "util/phmap/phmap.h"
#include "util/runtime_profile.h"
namespace starrocks {
@ -111,14 +109,6 @@ struct AggHashSet {
}
};
template <typename T>
struct no_prefetch_set : std::false_type {};
template <PhmapSeed seed>
struct no_prefetch_set<Int8AggHashSet<seed>> : std::true_type {};
template <class T>
constexpr bool is_no_prefetch_set = no_prefetch_set<T>::value;
// handle one number hash key
template <LogicalType logical_type, typename HashSet>
struct AggHashSetOfOneNumberKey : public AggHashSet<HashSet, AggHashSetOfOneNumberKey<logical_type, HashSet>> {
@ -147,12 +137,10 @@ struct AggHashSetOfOneNumberKey : public AggHashSet<HashSet, AggHashSetOfOneNumb
if constexpr (is_no_prefetch_set<HashSet>) {
this->template build_set_noprefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
} else if (this->hash_set.bucket_count() < prefetch_threhold) {
this->template build_set_noprefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
} else {
if (this->hash_set.bucket_count() < prefetch_threhold) {
this->template build_set_noprefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
} else {
this->template build_set_prefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
}
this->template build_set_prefetch<compute_and_allocate>(chunk_size, key_columns, pool, not_founds);
}
}
@ -754,10 +742,94 @@ struct AggHashSetOfSerializedKeyFixedSize : public AggHashSet<HashSet, AggHashSe
uint8_t* buffer;
ResultVector results;
Buffer<Slice> tmp_slices;
// std::vector<Slice> tmp_slices;
int32_t _chunk_size;
std::vector<size_t> hashes;
};
// DISTINCT hash set counterpart of AggHashMapWithCompressedKeyFixedSize: the
// group-by key columns are bit-packed into HashSet::key_type (a fixed-size
// integer) by bitcompress_serialize() and unpacked by bitcompress_deserialize().
template <typename HashSet>
struct AggHashSetCompressedFixedSize : public AggHashSet<HashSet, AggHashSetCompressedFixedSize<HashSet>> {
    using Base = AggHashSet<HashSet, AggHashSetCompressedFixedSize<HashSet>>;
    using Iterator = typename HashSet::iterator;
    using KeyType = typename HashSet::key_type;
    using FixedSizeSliceKey = typename HashSet::key_type;
    using ResultVector = typename std::vector<FixedSizeSliceKey>;
    bool has_null_column = false;
    static constexpr size_t max_fixed_size = sizeof(FixedSizeSliceKey);

    template <class... Args>
    AggHashSetCompressedFixedSize(int32_t chunk_size, Args&&... args)
            : Base(chunk_size, std::forward<Args>(args)...), _chunk_size(chunk_size) {
        // FIX: was reserve(). fixed_keys is later written through data() and read
        // via operator[] for up to chunk_size entries; accessing elements beyond
        // size() is undefined behavior, so the vector must actually be sized.
        fixed_keys.resize(chunk_size);
    }

    // When compute_and_allocate=false:
    // Elements queried in HashSet will be added to HashSet
    // elements that cannot be queried are not processed,
    // and are mainly used in the first stage of two-stage aggregation when aggr reduction is low
    template <bool compute_and_allocate>
    void build_set(size_t chunk_size, const Columns& key_columns, MemPool* pool, Filter* not_founds) {
        if constexpr (!compute_and_allocate) {
            DCHECK(not_founds);
            not_founds->assign(chunk_size, 0);
        }
        // Guard against chunks larger than the size used at construction.
        if (chunk_size > fixed_keys.size()) {
            fixed_keys.resize(chunk_size);
        }
        auto* buffer = reinterpret_cast<uint8_t*>(fixed_keys.data());
        memset(buffer, 0x0, sizeof(FixedSizeSliceKey) * chunk_size);
        // Pack the key columns into fixed-size compressed keys.
        bitcompress_serialize(key_columns, bases, offsets, chunk_size, sizeof(FixedSizeSliceKey), fixed_keys.data());
        if constexpr (is_no_prefetch_set<HashSet>) {
            this->template build_set_noprefetch<compute_and_allocate>(chunk_size, pool, not_founds);
        } else if (this->hash_set.bucket_count() < prefetch_threhold) {
            this->template build_set_noprefetch<compute_and_allocate>(chunk_size, pool, not_founds);
        } else {
            this->template build_set_prefetch<compute_and_allocate>(chunk_size, pool, not_founds);
        }
    }

    // Build/probe with precomputed hashes and software prefetching of buckets.
    template <bool compute_and_allocate>
    ALWAYS_NOINLINE void build_set_prefetch(size_t chunk_size, MemPool* pool, Filter* not_founds) {
        auto* keys = reinterpret_cast<FixedSizeSliceKey*>(fixed_keys.data());
        AGG_HASH_SET_PRECOMPUTE_HASH_VALS();
        for (size_t i = 0; i < chunk_size; ++i) {
            AGG_HASH_SET_PREFETCH_HASH_VAL();
            if constexpr (compute_and_allocate) {
                this->hash_set.emplace_with_hash(hashes[i], keys[i]);
            } else {
                (*not_founds)[i] = this->hash_set.find(keys[i], hashes[i]) == this->hash_set.end();
            }
        }
    }

    // Build/probe without prefetching (small tables or no-prefetch sets).
    template <bool compute_and_allocate>
    ALWAYS_NOINLINE void build_set_noprefetch(size_t chunk_size, MemPool* pool, Filter* not_founds) {
        for (size_t i = 0; i < chunk_size; ++i) {
            if constexpr (compute_and_allocate) {
                this->hash_set.insert(fixed_keys[i]);
            } else {
                (*not_founds)[i] = !this->hash_set.contains(fixed_keys[i]);
            }
        }
    }

    // Unpack compressed keys back into the output key columns.
    void insert_keys_to_columns(ResultVector& keys, Columns& key_columns, int32_t chunk_size) {
        bitcompress_deserialize(key_columns, bases, offsets, used_bits, chunk_size, sizeof(FixedSizeSliceKey),
                                keys.data());
    }

    static constexpr bool has_single_null_key = false;
    bool has_null_key = false;

    std::vector<int> used_bits;                // bit widths used per key column (for deserialize)
    std::vector<int> offsets;                  // bit offsets of each column inside the packed key
    std::vector<std::any> bases;               // per-column base values for the bit compression
    std::vector<FixedSizeSliceKey> fixed_keys; // scratch buffer of packed keys, sized to chunk_size
    std::vector<size_t> hashes;                // scratch buffer of precomputed hashes
    ResultVector results;
    int32_t _chunk_size;
};
} // namespace starrocks

View File

@ -15,11 +15,91 @@
#include "exec/aggregate/agg_hash_variant.h"
#include <tuple>
#include <type_traits>
#include <variant>
#include "runtime/runtime_state.h"
#include "util/phmap/phmap.h"
#define APPLY_FOR_AGG_VARIANT_ALL(M) \
M(phase1_uint8) \
M(phase1_int8) \
M(phase1_int16) \
M(phase1_int32) \
M(phase1_int64) \
M(phase1_int128) \
M(phase1_decimal32) \
M(phase1_decimal64) \
M(phase1_decimal128) \
M(phase1_decimal256) \
M(phase1_date) \
M(phase1_timestamp) \
M(phase1_string) \
M(phase1_slice) \
M(phase1_null_uint8) \
M(phase1_null_int8) \
M(phase1_null_int16) \
M(phase1_null_int32) \
M(phase1_null_int64) \
M(phase1_null_int128) \
M(phase1_null_decimal32) \
M(phase1_null_decimal64) \
M(phase1_null_decimal128) \
M(phase1_null_decimal256) \
M(phase1_null_date) \
M(phase1_null_timestamp) \
M(phase1_null_string) \
M(phase1_slice_two_level) \
M(phase1_int32_two_level) \
M(phase1_null_string_two_level) \
M(phase1_string_two_level) \
\
M(phase2_uint8) \
M(phase2_int8) \
M(phase2_int16) \
M(phase2_int32) \
M(phase2_int64) \
M(phase2_int128) \
M(phase2_decimal32) \
M(phase2_decimal64) \
M(phase2_decimal128) \
M(phase2_decimal256) \
M(phase2_date) \
M(phase2_timestamp) \
M(phase2_string) \
M(phase2_slice) \
M(phase2_null_uint8) \
M(phase2_null_int8) \
M(phase2_null_int16) \
M(phase2_null_int32) \
M(phase2_null_int64) \
M(phase2_null_int128) \
M(phase2_null_decimal32) \
M(phase2_null_decimal64) \
M(phase2_null_decimal128) \
M(phase2_null_decimal256) \
M(phase2_null_date) \
M(phase2_null_timestamp) \
M(phase2_null_string) \
M(phase2_slice_two_level) \
M(phase2_int32_two_level) \
M(phase2_null_string_two_level) \
M(phase2_string_two_level) \
\
M(phase1_slice_fx4) \
M(phase1_slice_fx8) \
M(phase1_slice_fx16) \
M(phase2_slice_fx4) \
M(phase2_slice_fx8) \
M(phase2_slice_fx16) \
M(phase1_slice_cx1) \
M(phase1_slice_cx4) \
M(phase1_slice_cx8) \
M(phase1_slice_cx16) \
M(phase2_slice_cx1) \
M(phase2_slice_cx4) \
M(phase2_slice_cx8) \
M(phase2_slice_cx16)
namespace starrocks {
namespace detail {
template <AggHashMapVariant::Type>
@ -65,6 +145,10 @@ DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_string_two_level, OneStringTwoLe
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_fx4, SerializedKeyFixedSize4AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_fx8, SerializedKeyFixedSize8AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_fx16, SerializedKeyFixedSize16AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_cx1, CompressedFixedSize1AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_cx4, CompressedFixedSize4AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_cx8, CompressedFixedSize8AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase1_slice_cx16, CompressedFixedSize16AggHashMap<PhmapSeed1>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_uint8, UInt8AggHashMapWithOneNumberKey<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_int8, Int8AggHashMapWithOneNumberKey<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_int16, Int16AggHashMapWithOneNumberKey<PhmapSeed2>);
@ -99,6 +183,10 @@ DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_string_two_level, OneStringTwoLe
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_fx4, SerializedKeyFixedSize4AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_fx8, SerializedKeyFixedSize8AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_fx16, SerializedKeyFixedSize16AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_cx1, CompressedFixedSize1AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_cx4, CompressedFixedSize4AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_cx8, CompressedFixedSize8AggHashMap<PhmapSeed2>);
DEFINE_MAP_TYPE(AggHashMapVariant::Type::phase2_slice_cx16, CompressedFixedSize16AggHashMap<PhmapSeed2>);
template <AggHashSetVariant::Type>
struct AggHashSetVariantTypeTraits;
@ -180,6 +268,15 @@ DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_fx4, SerializedKeyAggHashS
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_fx8, SerializedKeyAggHashSetFixedSize8<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_fx16, SerializedKeyAggHashSetFixedSize16<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase1_slice_cx1, CompressedAggHashSetFixedSize1<PhmapSeed1>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase1_slice_cx4, CompressedAggHashSetFixedSize4<PhmapSeed1>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase1_slice_cx8, CompressedAggHashSetFixedSize8<PhmapSeed1>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase1_slice_cx16, CompressedAggHashSetFixedSize16<PhmapSeed1>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_cx1, CompressedAggHashSetFixedSize1<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_cx4, CompressedAggHashSetFixedSize4<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_cx8, CompressedAggHashSetFixedSize8<PhmapSeed2>);
DEFINE_SET_TYPE(AggHashSetVariant::Type::phase2_slice_cx16, CompressedAggHashSetFixedSize16<PhmapSeed2>);
} // namespace detail
void AggHashMapVariant::init(RuntimeState* state, Type type, AggStatistics* agg_stat) {
_type = type;

View File

@ -17,93 +17,15 @@
#pragma once
#include <type_traits>
#include <utility>
#include <variant>
#include "column/hash_set.h"
#include "exec/aggregate/agg_hash_map.h"
#include "exec/aggregate/agg_hash_set.h"
#include "exec/aggregate/agg_profile.h"
#include "types/logical_type.h"
#include "util/phmap/phmap.h"
namespace starrocks {
enum AggrPhase { AggrPhase1, AggrPhase2 };
#define APPLY_FOR_AGG_VARIANT_ALL(M) \
M(phase1_uint8) \
M(phase1_int8) \
M(phase1_int16) \
M(phase1_int32) \
M(phase1_int64) \
M(phase1_int128) \
M(phase1_decimal32) \
M(phase1_decimal64) \
M(phase1_decimal128) \
M(phase1_decimal256) \
M(phase1_date) \
M(phase1_timestamp) \
M(phase1_string) \
M(phase1_slice) \
M(phase1_null_uint8) \
M(phase1_null_int8) \
M(phase1_null_int16) \
M(phase1_null_int32) \
M(phase1_null_int64) \
M(phase1_null_int128) \
M(phase1_null_decimal32) \
M(phase1_null_decimal64) \
M(phase1_null_decimal128) \
M(phase1_null_decimal256) \
M(phase1_null_date) \
M(phase1_null_timestamp) \
M(phase1_null_string) \
M(phase1_slice_two_level) \
M(phase1_int32_two_level) \
M(phase1_null_string_two_level) \
M(phase1_string_two_level) \
\
M(phase2_uint8) \
M(phase2_int8) \
M(phase2_int16) \
M(phase2_int32) \
M(phase2_int64) \
M(phase2_int128) \
M(phase2_decimal32) \
M(phase2_decimal64) \
M(phase2_decimal128) \
M(phase2_decimal256) \
M(phase2_date) \
M(phase2_timestamp) \
M(phase2_string) \
M(phase2_slice) \
M(phase2_null_uint8) \
M(phase2_null_int8) \
M(phase2_null_int16) \
M(phase2_null_int32) \
M(phase2_null_int64) \
M(phase2_null_int128) \
M(phase2_null_decimal32) \
M(phase2_null_decimal64) \
M(phase2_null_decimal128) \
M(phase2_null_decimal256) \
M(phase2_null_date) \
M(phase2_null_timestamp) \
M(phase2_null_string) \
M(phase2_slice_two_level) \
M(phase2_int32_two_level) \
M(phase2_null_string_two_level) \
M(phase2_string_two_level) \
\
M(phase1_slice_fx4) \
M(phase1_slice_fx8) \
M(phase1_slice_fx16) \
M(phase2_slice_fx4) \
M(phase2_slice_fx8) \
M(phase2_slice_fx16)
// Aggregate Hash maps
// no-nullable single key maps:
@ -187,6 +109,16 @@ using SerializedKeyFixedSize8AggHashMap = AggHashMapWithSerializedKeyFixedSize<F
template <PhmapSeed seed>
using SerializedKeyFixedSize16AggHashMap = AggHashMapWithSerializedKeyFixedSize<FixedSize16SliceAggHashMap<seed>>;
// fixed compress key
template <PhmapSeed seed>
using CompressedFixedSize1AggHashMap = AggHashMapWithCompressedKeyFixedSize<Int8AggHashMap<seed>>;
template <PhmapSeed seed>
using CompressedFixedSize4AggHashMap = AggHashMapWithCompressedKeyFixedSize<Int32AggHashMap<seed>>;
template <PhmapSeed seed>
using CompressedFixedSize8AggHashMap = AggHashMapWithCompressedKeyFixedSize<Int64AggHashMap<seed>>;
template <PhmapSeed seed>
using CompressedFixedSize16AggHashMap = AggHashMapWithCompressedKeyFixedSize<Int128AggHashMap<seed>>;
// Hash sets
//
template <PhmapSeed seed>
@ -270,6 +202,15 @@ using SerializedKeyAggHashSetFixedSize8 = AggHashSetOfSerializedKeyFixedSize<Fix
template <PhmapSeed seed>
using SerializedKeyAggHashSetFixedSize16 = AggHashSetOfSerializedKeyFixedSize<FixedSize16SliceAggHashSet<seed>>;
template <PhmapSeed seed>
using CompressedAggHashSetFixedSize1 = AggHashSetCompressedFixedSize<Int8AggHashSet<seed>>;
template <PhmapSeed seed>
using CompressedAggHashSetFixedSize4 = AggHashSetCompressedFixedSize<Int32AggHashSet<seed>>;
template <PhmapSeed seed>
using CompressedAggHashSetFixedSize8 = AggHashSetCompressedFixedSize<Int64AggHashSet<seed>>;
template <PhmapSeed seed>
using CompressedAggHashSetFixedSize16 = AggHashSetCompressedFixedSize<Int128AggHashSet<seed>>;
// aggregate key
template <class HashMapWithKey>
struct CombinedFixedSizeKey {
@ -294,6 +235,24 @@ static_assert(!is_combined_fixed_size_key<Int32TwoLevelAggHashSetOfOneNumberKey<
static_assert(is_combined_fixed_size_key<SerializedKeyAggHashSetFixedSize4<PhmapSeed1>>);
static_assert(!is_combined_fixed_size_key<Int32TwoLevelAggHashMapWithOneNumberKey<PhmapSeed1>>);
// Type trait: true only for the bit-compressed fixed-size key containers
// (AggHashMapWithCompressedKeyFixedSize / AggHashSetCompressedFixedSize),
// false for every other hash map/set.
template <class HashMapWithKey>
struct CompressedFixedSizeKey {
    static auto constexpr value = false;
};

// Specialization: compressed-key aggregation hash map.
template <typename HashMap>
struct CompressedFixedSizeKey<AggHashMapWithCompressedKeyFixedSize<HashMap>> {
    static auto constexpr value = true;
};

// Specialization: compressed-key aggregation hash set.
template <typename HashSet>
struct CompressedFixedSizeKey<AggHashSetCompressedFixedSize<HashSet>> {
    static auto constexpr value = true;
};

// Convenience variable template for use in if constexpr / static_assert.
template <typename HashMapOrSetWithKey>
inline constexpr bool is_compressed_fixed_size_key = CompressedFixedSizeKey<HashMapOrSetWithKey>::value;
// 1) For different group by columns type, size, cardinality, volume, we should choose different
// hash functions and different hashmaps.
// When runtime, we will only have one hashmap.
@ -341,6 +300,10 @@ using AggHashMapWithKeyPtr = std::variant<
std::unique_ptr<SerializedKeyFixedSize4AggHashMap<PhmapSeed1>>,
std::unique_ptr<SerializedKeyFixedSize8AggHashMap<PhmapSeed1>>,
std::unique_ptr<SerializedKeyFixedSize16AggHashMap<PhmapSeed1>>,
std::unique_ptr<CompressedFixedSize1AggHashMap<PhmapSeed1>>,
std::unique_ptr<CompressedFixedSize4AggHashMap<PhmapSeed1>>,
std::unique_ptr<CompressedFixedSize8AggHashMap<PhmapSeed1>>,
std::unique_ptr<CompressedFixedSize16AggHashMap<PhmapSeed1>>,
std::unique_ptr<UInt8AggHashMapWithOneNumberKey<PhmapSeed2>>,
std::unique_ptr<Int8AggHashMapWithOneNumberKey<PhmapSeed2>>,
std::unique_ptr<Int16AggHashMapWithOneNumberKey<PhmapSeed2>>,
@ -373,7 +336,11 @@ using AggHashMapWithKeyPtr = std::variant<
std::unique_ptr<NullOneStringTwoLevelAggHashMap<PhmapSeed2>>,
std::unique_ptr<SerializedKeyFixedSize4AggHashMap<PhmapSeed2>>,
std::unique_ptr<SerializedKeyFixedSize8AggHashMap<PhmapSeed2>>,
std::unique_ptr<SerializedKeyFixedSize16AggHashMap<PhmapSeed2>>>;
std::unique_ptr<SerializedKeyFixedSize16AggHashMap<PhmapSeed2>>,
std::unique_ptr<CompressedFixedSize1AggHashMap<PhmapSeed2>>,
std::unique_ptr<CompressedFixedSize4AggHashMap<PhmapSeed2>>,
std::unique_ptr<CompressedFixedSize8AggHashMap<PhmapSeed2>>,
std::unique_ptr<CompressedFixedSize16AggHashMap<PhmapSeed2>>>;
using AggHashSetWithKeyPtr = std::variant<
std::unique_ptr<UInt8AggHashSetOfOneNumberKey<PhmapSeed1>>,
@ -441,7 +408,16 @@ using AggHashSetWithKeyPtr = std::variant<
std::unique_ptr<SerializedKeyAggHashSetFixedSize16<PhmapSeed1>>,
std::unique_ptr<SerializedKeyAggHashSetFixedSize4<PhmapSeed2>>,
std::unique_ptr<SerializedKeyAggHashSetFixedSize8<PhmapSeed2>>,
std::unique_ptr<SerializedKeyAggHashSetFixedSize16<PhmapSeed2>>>;
std::unique_ptr<SerializedKeyAggHashSetFixedSize16<PhmapSeed2>>,
std::unique_ptr<CompressedAggHashSetFixedSize1<PhmapSeed1>>,
std::unique_ptr<CompressedAggHashSetFixedSize4<PhmapSeed1>>,
std::unique_ptr<CompressedAggHashSetFixedSize8<PhmapSeed1>>,
std::unique_ptr<CompressedAggHashSetFixedSize16<PhmapSeed1>>,
std::unique_ptr<CompressedAggHashSetFixedSize1<PhmapSeed2>>,
std::unique_ptr<CompressedAggHashSetFixedSize4<PhmapSeed2>>,
std::unique_ptr<CompressedAggHashSetFixedSize8<PhmapSeed2>>,
std::unique_ptr<CompressedAggHashSetFixedSize16<PhmapSeed2>>>;
} // namespace detail
struct AggHashMapVariant {
enum class Type {
@ -481,6 +457,11 @@ struct AggHashMapVariant {
phase1_slice_fx8,
phase1_slice_fx16,
phase1_slice_cx1,
phase1_slice_cx4,
phase1_slice_cx8,
phase1_slice_cx16,
phase2_uint8,
phase2_int8,
phase2_int16,
@ -517,6 +498,10 @@ struct AggHashMapVariant {
phase2_slice_fx8,
phase2_slice_fx16,
phase2_slice_cx1,
phase2_slice_cx4,
phase2_slice_cx8,
phase2_slice_cx16,
};
detail::AggHashMapWithKeyPtr hash_map_with_key;
@ -630,6 +615,14 @@ struct AggHashSetVariant {
phase2_slice_fx8,
phase2_slice_fx16,
phase1_slice_cx1,
phase1_slice_cx4,
phase1_slice_cx8,
phase1_slice_cx16,
phase2_slice_cx1,
phase2_slice_cx4,
phase2_slice_cx8,
phase2_slice_cx16,
};
detail::AggHashSetWithKeyPtr hash_set_with_key;

View File

@ -14,7 +14,7 @@
#include "exec/aggregate/aggregate_base_node.h"
#include "gutil/strings/substitute.h"
#include "exec/aggregator.h"
namespace starrocks {

View File

@ -14,9 +14,7 @@
#pragma once
#include <any>
#include "exec/aggregator.h"
#include "exec/aggregator_fwd.h"
#include "exec/exec_node.h"
namespace starrocks {

View File

@ -16,13 +16,10 @@
#include <memory>
#include <type_traits>
#include <variant>
#include "exec/aggregator.h"
#include "exec/pipeline/aggregate/aggregate_blocking_sink_operator.h"
#include "exec/pipeline/aggregate/aggregate_blocking_source_operator.h"
#include "exec/pipeline/aggregate/aggregate_streaming_sink_operator.h"
#include "exec/pipeline/aggregate/aggregate_streaming_source_operator.h"
#include "exec/pipeline/aggregate/sorted_aggregate_streaming_sink_operator.h"
#include "exec/pipeline/aggregate/sorted_aggregate_streaming_source_operator.h"
#include "exec/pipeline/aggregate/spillable_aggregate_blocking_sink_operator.h"
@ -32,12 +29,8 @@
#include "exec/pipeline/chunk_accumulate_operator.h"
#include "exec/pipeline/exchange/local_exchange_source_operator.h"
#include "exec/pipeline/limit_operator.h"
#include "exec/pipeline/noop_sink_operator.h"
#include "exec/pipeline/operator.h"
#include "exec/pipeline/pipeline_builder.h"
#include "exec/pipeline/spill_process_operator.h"
#include "exec/sorted_streaming_aggregator.h"
#include "gutil/casts.h"
#include "runtime/current_thread.h"
#include "simd/simd.h"
@ -121,8 +114,7 @@ Status AggregateBlockingNode::open(RuntimeState* state) {
if (_aggregator->hash_map_variant().size() == 0) {
_aggregator->set_ht_eos();
}
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
} else if (_aggregator->is_none_group_by_exprs()) {
// for aggregate no group by, if _num_input_rows is 0,
// In update phase, we directly return empty chunk.

View File

@ -204,7 +204,7 @@ Status AggregateStreamingNode::get_next(RuntimeState* state, ChunkPtr* chunk, bo
Status AggregateStreamingNode::_output_chunk_from_hash_map(ChunkPtr* chunk) {
if (!_aggregator->it_hash().has_value()) {
_aggregator->it_hash() = _aggregator->_state_allocator.begin();
_aggregator->it_hash() = _aggregator->state_allocator().begin();
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_map_variant().size());
}

View File

@ -0,0 +1,296 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <any>
#include <optional>
#include "column/column_helper.h"
#include "column/column_visitor_adapter.h"
#include "column/decimalv3_column.h"
#include "column/nullable_column.h"
#include "common/status.h"
#include "exprs/literal.h"
#include "types/logical_type_infra.h"
#include "util/unaligned_access.h"
namespace starrocks {
// Maps a byte width N to the signed integer type of exactly that width.
// Group-by key bytes (integers, decimals, dates, ...) are reinterpreted
// through these types so the bit-compression arithmetic applies uniformly,
// regardless of the column's original C++ type.
template <size_t N>
struct int_type {};
template <>
struct int_type<1> {
    using type = int8_t;
};
template <>
struct int_type<2> {
    using type = int16_t;
};
template <>
struct int_type<4> {
    using type = int32_t;
};
template <>
struct int_type<8> {
    using type = int64_t;
};
template <>
struct int_type<16> {
    using type = __int128;
};
// Counts the leading zero bits of `v`, interpreting it as an unsigned value
// of T's width. Returns the full bit width for v == 0 (__builtin_clzll is
// undefined for a zero operand, so that case is handled up front).
//
// Fix: the original widened through `static_cast<size_t>` and subtracted
// `sizeof(size_t) * 8`, which conflates the platform size_t width with
// __builtin_clzll's fixed 64-bit operand (and would truncate a 64-bit value
// on an ILP32 target). Widen directly to unsigned long long instead.
template <class T>
int leading_zeros(T v) {
    constexpr int kWidth = static_cast<int>(sizeof(T) * 8);
    if (v == 0) {
        return kWidth;
    }
    // Convert through the unsigned counterpart first so negative values do
    // not sign-extend past T's width when widened.
    const auto widened = static_cast<unsigned long long>(static_cast<typename std::make_unsigned<T>::type>(v));
    return __builtin_clzll(widened) - (64 - kWidth);
}
// Specialization for 128-bit keys: split into two 64-bit halves and delegate
// to the 64-bit implementation.
template <>
int leading_zeros<int128_t>(int128_t v) {
    const auto high_part = static_cast<uint64_t>(v >> 64);
    if (high_part != 0) {
        return leading_zeros(high_part);
    }
    return 64 + leading_zeros(static_cast<uint64_t>(v));
}
template <class T>
int get_used_bits(T min, T max) {
using IntType = typename int_type<sizeof(T)>::type;
auto vmin = unaligned_load<IntType>(&min);
auto vmax = unaligned_load<IntType>(&max);
IntType delta = vmax - vmin;
return sizeof(T) * 8 - (leading_zeros<IntType>(delta));
}
// Computes the number of bits needed to encode a group-by key of logical
// type `ltype` whose value range is the constant literals [begin, end].
// On success, stores the range minimum into `base` (the subtraction base
// used later when compressing) and returns the bit count. Returns an empty
// optional for types that cannot be bit-compressed: anything that is not an
// integer / decimal / date type, or whose runtime value is wider than
// 16 bytes.
std::optional<int> get_used_bits(LogicalType ltype, const VectorizedLiteral& begin, const VectorizedLiteral& end,
                                 std::any& base) {
    size_t used_bits = 0;
    bool applied = scalar_type_dispatch(ltype, [&]<LogicalType Type>() {
        if constexpr ((lt_is_integer<Type> || lt_is_decimal<Type> ||
                       lt_is_date<Type>)&&(sizeof(RunTimeCppType<Type>) <= 16)) {
            // both bounds are constant columns produced by the planner
            RunTimeCppType<Type> cs_min = ColumnHelper::get_const_value<Type>(begin.value().get());
            RunTimeCppType<Type> cs_max = ColumnHelper::get_const_value<Type>(end.value().get());
            base = cs_min;
            used_bits = get_used_bits(cs_min, cs_max);
            return true;
        }
        return false;
    });
    if (applied) {
        return used_bits;
    }
    return {};
}
// Packs (val[i] - base) into dst[i] starting at bit `offset`.
// Non-nullable layout: [value bits] beginning at `offset`.
// Nullable layout:     [null flag (1 bit)][value bits] beginning at `offset`;
// a null row stores flag = 1 and its value bits are forced to zero.
// Bits are OR-ed in, so callers must pass a zero-initialized dst.
template <class TSrc, class TDst>
void bitcompress_serialize(const TSrc* __restrict val, const uint8_t* __restrict nulls, TSrc base, size_t n, int offset,
                           TDst* __restrict dst) {
    using UTSrc = typename std::make_unsigned<TSrc>::type;
    if (nulls != nullptr) {
        for (size_t row = 0; row < n; ++row) {
            // all-ones when the row is null, all-zeros otherwise
            const TSrc null_mask = -static_cast<TSrc>(nulls[row]);
            const TDst packed = UTSrc(val[row] - base) & ~null_mask;
            dst[row] |= TDst(nulls[row]) << offset;
            dst[row] |= packed << (offset + 1);
        }
    } else {
        for (size_t row = 0; row < n; ++row) {
            const TDst packed = UTSrc(val[row] - base);
            dst[row] |= packed << offset;
        }
    }
}
// Column visitor that bit-packs one group-by column into the shared
// fixed-size compressed key buffer `dst` (one Dst word per row). Each value
// is stored as (value - base) at bit `offset`; for nullable columns one
// extra bit holds the null flag (see bitcompress_serialize above).
template <class Dst>
class CompressSerializer : public ColumnVisitorAdapter<CompressSerializer<Dst>> {
public:
    using Base = ColumnVisitorAdapter<CompressSerializer<Dst>>;

    // `base` is the column's range minimum, type-erased (filled by
    // get_used_bits); `offset` is this column's starting bit in the packed key.
    CompressSerializer(Dst* dst, const std::any& base, int offset)
            : Base(this), _dst(dst), _base(base), _offset(offset) {}

    // Remember the null flags, then visit the wrapped data column.
    Status do_visit(const NullableColumn& column) {
        _null_data = column.null_column_data().data();
        return column.data_column()->accept(this);
    }

    // Packs a fixed-length column whose element width is 1/2/4/8/16 bytes.
    // The raw bytes are reinterpreted as a same-width integer (int_type) so
    // the subtraction/shift arithmetic is type-agnostic.
    template <typename Column, typename T>
    void bit_compress(const Column& column) {
        if constexpr (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16) {
            using SrcType = typename int_type<sizeof(T)>::type;
            const auto& container = column.get_data();
            const auto& raw_data = container.data();
            size_t n = container.size();
            auto base = std::any_cast<T>(_base);
            auto tbase = unaligned_load<SrcType>(&base);
            bitcompress_serialize((SrcType*)raw_data, _null_data, tbase, n, _offset, _dst);
        } else {
            CHECK(false) << "unreachable";
        }
    }

    template <typename T>
    Status do_visit(const FixedLengthColumn<T>& column) {
        bit_compress<FixedLengthColumn<T>, T>(column);
        return Status::OK();
    }

    template <typename T>
    Status do_visit(const DecimalV3Column<T>& column) {
        bit_compress<DecimalV3Column<T>, T>(column);
        return Status::OK();
    }

    // Fallback for unsupported column kinds; callers are expected to have
    // pre-filtered the types (could_apply_bitcompress_opt).
    template <typename T>
    Status do_visit(const T& column) {
        CHECK(false) << "unreachable";
        return Status::NotSupported("unsupported type");
    }

private:
    Dst* _dst;
    const std::any& _base;
    int _offset;
    const uint8_t* _null_data = nullptr;
};
// Returns a value with the low `bits` bits set. `bits` may equal T's full
// width, in which case all bits are set (shifting by the full width would be
// undefined behavior, so that case is special-cased).
template <class T>
T mask(T bits) {
    constexpr T kFullWidth = static_cast<T>(sizeof(T) * 8);
    if (bits == kFullWidth) {
        return static_cast<T>(~T(0));
    }
    return static_cast<T>((T(1) << bits) - 1);
}
// Inverse of bitcompress_serialize: extracts (packed value + base) rows from
// `src` into `dst`, and the per-row null flag into `nulls` when present.
// `used_bits` is the cumulative bit count through this column and `offset`
// its starting bit (see the offsets/used_bits bookkeeping in
// could_apply_bitcompress_opt), so this column's value width is
// used_bits - offset, minus one null-flag bit for nullable columns.
//
// Fixes: `n` was `int`, which silently narrowed the caller's size_t row
// count and made the `size_t i < n` comparison mixed-sign; the source
// pointer cast also stripped `const` via a C-style cast.
template <class TSrc, class TDst>
void bitcompress_deserialize(const TSrc* __restrict src, uint8_t* __restrict nulls, TDst base, size_t n, int used_bits,
                             int offset, TDst* __restrict dst) {
    using UTSrc = typename std::make_unsigned<TSrc>::type;
    // Read through the unsigned counterpart so right shifts are logical,
    // not arithmetic.
    const auto* usrc = reinterpret_cast<const UTSrc*>(src);
    const uint8_t mask1 = mask<uint8_t>(1);
    const TSrc mask2 = mask<TSrc>(used_bits - offset - (nulls != nullptr));
    if (nulls == nullptr) {
        for (size_t i = 0; i < n; ++i) {
            dst[i] = ((usrc[i] >> (offset)) & mask2) + base;
        }
    } else {
        for (size_t i = 0; i < n; ++i) {
            nulls[i] = (usrc[i] >> offset) & mask1;
            dst[i] = ((usrc[i] >> (offset + 1)) & mask2) + base;
        }
    }
}
// Column visitor that unpacks one group-by column from the fixed-size
// compressed key buffer `src` (one Src word per row), restoring
// value = packed + base. Mirror of CompressSerializer.
template <class Src>
class CompressDeserializer final : public ColumnVisitorMutableAdapter<CompressDeserializer<Src>> {
public:
    using Base = ColumnVisitorMutableAdapter<CompressDeserializer<Src>>;

    // `used_bits` is the cumulative bit count through this column and
    // `offset` its starting bit, matching the layout chosen at serialize time.
    explicit CompressDeserializer(size_t num_rows, Src* src, const std::any& base, int offset, int used_bits)
            : Base(this), _num_rows(num_rows), _src(src), _base(base), _offset(offset), _used_bits(used_bits) {}

    // Materialize the null flag column first, then fill the data column.
    Status do_visit(NullableColumn* column) {
        // TODO: opt me
        column->null_column_data().resize(_num_rows);
        _null_data = column->null_column_data().data();
        RETURN_IF_ERROR(column->data_column()->accept_mutable(this));
        // null flags were written directly into the raw buffer above, so the
        // cached has_null state must be recomputed
        column->update_has_null();
        return Status::OK();
    }

    // Decompresses into a fixed-length column of 1/2/4/8/16-byte elements;
    // raw bytes are reinterpreted as same-width integers (int_type).
    template <typename Column, typename T>
    void bit_decompress(Column* column) {
        if constexpr (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16) {
            using DstType = typename int_type<sizeof(T)>::type;
            column->resize(_num_rows);
            auto& container = column->get_data();
            auto* raw_data = container.data();
            auto base = std::any_cast<T>(_base);
            auto tbase = unaligned_load<DstType>(&base);
            bitcompress_deserialize(_src, _null_data, tbase, _num_rows, _used_bits, _offset, (DstType*)raw_data);
        } else {
            CHECK(false) << "unreachable";
        }
    }

    template <typename T>
    Status do_visit(FixedLengthColumn<T>* column) {
        bit_decompress<FixedLengthColumn<T>, T>(column);
        return Status::OK();
    }

    template <typename T>
    Status do_visit(DecimalV3Column<T>* column) {
        bit_decompress<DecimalV3Column<T>, T>(column);
        return Status::OK();
    }

    // Fallback for unsupported column kinds; callers pre-filter types.
    template <typename T>
    Status do_visit(const T& column) {
        DCHECK(false) << "unreachable";
        return Status::NotSupported("unsupported type");
    }

private:
    size_t _num_rows;
    const Src* _src;
    const std::any& _base;
    int _offset;
    int _used_bits;
    uint8_t* _null_data = nullptr;
};
// Serializes every group-by column into `buffer` in bit-compressed form.
// `fixed_key_size` selects the packed word width (1/4/8/16 bytes per row);
// each column is OR-ed into the shared buffer at its precomputed bit offset.
// NOTE(review): num_rows is not read here — each column carries its own row
// count; the parameter is kept for symmetry with bitcompress_deserialize.
void bitcompress_serialize(const Columns& columns, const std::vector<std::any>& bases, const std::vector<int>& offsets,
                           size_t num_rows, size_t fixed_key_size, void* buffer) {
    for (size_t i = 0; i < columns.size(); ++i) {
        if (fixed_key_size == 1) {
            CompressSerializer<uint8_t> serializer((uint8_t*)buffer, bases[i], offsets[i]);
            (void)columns[i]->accept(&serializer);
        } else if (fixed_key_size == 4) {
            CompressSerializer<int> serializer((int*)buffer, bases[i], offsets[i]);
            (void)columns[i]->accept(&serializer);
        } else if (fixed_key_size == 8) {
            CompressSerializer<int64_t> serializer((int64_t*)buffer, bases[i], offsets[i]);
            (void)columns[i]->accept(&serializer);
        } else if (fixed_key_size == 16) {
            CompressSerializer<int128_t> serializer((int128_t*)buffer, bases[i], offsets[i]);
            (void)columns[i]->accept(&serializer);
        } else {
            DCHECK(false) << "unreachable path";
        }
    }
}
// Deserializes every group-by column from the bit-compressed key buffer,
// resizing each output column to `num_rows`. `fixed_key_size` must match the
// word width used at serialize time (1/4/8/16 bytes per row).
void bitcompress_deserialize(Columns& columns, const std::vector<std::any>& bases, const std::vector<int>& offsets,
                             const std::vector<int>& used_bits, size_t num_rows, size_t fixed_key_size, void* buffer) {
    for (size_t i = 0; i < columns.size(); ++i) {
        if (fixed_key_size == 1) {
            CompressDeserializer<uint8_t> deserializer(num_rows, (uint8_t*)buffer, bases[i], offsets[i], used_bits[i]);
            (void)columns[i]->accept_mutable(&deserializer);
        } else if (fixed_key_size == 4) {
            CompressDeserializer<int> deserializer(num_rows, (int*)buffer, bases[i], offsets[i], used_bits[i]);
            (void)columns[i]->accept_mutable(&deserializer);
        } else if (fixed_key_size == 8) {
            CompressDeserializer<int64_t> deserializer(num_rows, (int64_t*)buffer, bases[i], offsets[i], used_bits[i]);
            (void)columns[i]->accept_mutable(&deserializer);
        } else if (fixed_key_size == 16) {
            CompressDeserializer<int128_t> deserializer(num_rows, (int128_t*)buffer, bases[i], offsets[i],
                                                        used_bits[i]);
            (void)columns[i]->accept_mutable(&deserializer);
        } else {
            DCHECK(false) << "unreachable path";
        }
    }
}
} // namespace starrocks

View File

@ -0,0 +1,48 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

// Fix: the declarations below use std::optional, std::vector and size_t but
// the header only included <any>, relying on transitive includes from
// "column/column.h". Include what we use.
#include <any>
#include <cstddef>
#include <optional>
#include <vector>

#include "column/column.h"
#include "types/logical_type.h"

namespace starrocks {

class VectorizedLiteral;

/**
 * Calculates the number of bits used between a given range for a specified logical type.
 *
 * This function calculates the number of bits required for a given logical type and a specified range
 * of start and end values. The result is an optional integer representing the calculated number of bits.
 * On success, `base` receives the range minimum, used later as the subtraction base when compressing.
 *
 * If we input a column that does not support bit compress, we will return an empty optional.
 */
std::optional<int> get_used_bits(LogicalType ltype, const VectorizedLiteral& begin, const VectorizedLiteral& end,
                                 std::any& base);

/**
 * serialize column data into a bit-compressed format.
 */
void bitcompress_serialize(const Columns& columns, const std::vector<std::any>& bases, const std::vector<int>& offsets,
                           size_t num_rows, size_t fixed_key_size, void* buffer);

/**
 * deserialize column data from a bit-compressed format.
 */
void bitcompress_deserialize(Columns& columns, const std::vector<std::any>& bases, const std::vector<int>& offsets,
                             const std::vector<int>& used_bits, size_t num_rows, size_t fixed_key_size, void* buffer);

} // namespace starrocks

View File

@ -17,27 +17,27 @@
#include <algorithm>
#include <memory>
#include <type_traits>
#include <variant>
#include <utility>
#include "column/chunk.h"
#include "column/column_helper.h"
#include "column/vectorized_fwd.h"
#include "common/config.h"
#include "common/logging.h"
#include "common/status.h"
#include "exec/agg_runtime_filter_builder.h"
#include "exec/aggregate/agg_hash_variant.h"
#include "exec/aggregate/agg_profile.h"
#include "exec/exec_node.h"
#include "exec/limited_pipeline_chunk_buffer.h"
#include "exec/pipeline/operator.h"
#include "exec/spill/spiller.hpp"
#include "exprs/agg/agg_state_if.h"
#include "exprs/agg/agg_state_merge.h"
#include "exprs/agg/agg_state_union.h"
#include "exprs/agg/aggregate_factory.h"
#include "exprs/agg/aggregate_state_allocator.h"
#include "exprs/literal.h"
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/current_thread.h"
#include "runtime/descriptors.h"
#include "runtime/memory/roaring_hook.h"
#include "types/logical_type.h"
#include "udf/java/utils.h"
#include "util/runtime_profile.h"
@ -52,6 +52,60 @@ static const std::string AGG_STATE_MERGE_SUFFIX = "_merge";
static const std::string AGG_STATE_IF_SUFFIX = "_if";
static const std::string FUNCTION_COUNT = "count";
// Functor handed to the hash table: allocates and initializes the per-group
// aggregate state block for a newly inserted key, or for the dedicated
// null-key slot (the nullptr_t overload).
template <class HashMapWithKey>
struct AllocateState {
    AllocateState(Aggregator* aggregator_) : aggregator(aggregator_) {}
    inline AggDataPtr operator()(const typename HashMapWithKey::KeyType& key);
    inline AggDataPtr operator()(std::nullptr_t);

private:
    Aggregator* aggregator;
};
// Allocates one aggregate-state block, writes the key at its head, then
// constructs every aggregate function's state in place. On bad_alloc the
// states created so far are destroyed, the allocation is rolled back, and
// the exception is rethrown so the caller sees a clean allocator.
template <class HashMapWithKey>
inline AggDataPtr AllocateState<HashMapWithKey>::operator()(const typename HashMapWithKey::KeyType& key) {
    AggDataPtr agg_state = aggregator->_state_allocator.allocate();
    // the key is stored inline at the start of the state block
    *reinterpret_cast<typename HashMapWithKey::KeyType*>(agg_state) = key;
    size_t created = 0;
    size_t aggregate_function_sz = aggregator->_agg_fn_ctxs.size();
    try {
        for (int i = 0; i < aggregate_function_sz; i++) {
            aggregator->_agg_functions[i]->create(aggregator->_agg_fn_ctxs[i],
                                                  agg_state + aggregator->_agg_states_offsets[i]);
            created++;
        }
        return agg_state;
    } catch (std::bad_alloc& e) {
        // destroy only the states whose create() completed
        for (size_t i = 0; i < created; ++i) {
            aggregator->_agg_functions[i]->destroy(aggregator->_agg_fn_ctxs[i],
                                                   agg_state + aggregator->_agg_states_offsets[i]);
        }
        aggregator->_state_allocator.rollback();
        throw;
    }
}
// Same as the keyed overload, but for the dedicated null-key state block
// (no key is written into the block).
// NOTE(review): unlike the keyed overload, the bad_alloc path does not call
// _state_allocator.rollback() — presumably allocate_null_key_data() needs no
// rollback; confirm against HashTableKeyAllocator.
template <class HashMapWithKey>
inline AggDataPtr AllocateState<HashMapWithKey>::operator()(std::nullptr_t) {
    AggDataPtr agg_state = aggregator->_state_allocator.allocate_null_key_data();
    size_t created = 0;
    size_t aggregate_function_sz = aggregator->_agg_fn_ctxs.size();
    try {
        for (int i = 0; i < aggregate_function_sz; i++) {
            aggregator->_agg_functions[i]->create(aggregator->_agg_fn_ctxs[i],
                                                  agg_state + aggregator->_agg_states_offsets[i]);
            created++;
        }
        return agg_state;
    } catch (std::bad_alloc& e) {
        // destroy only the states whose create() completed
        for (int i = 0; i < created; i++) {
            aggregator->_agg_functions[i]->destroy(aggregator->_agg_fn_ctxs[i],
                                                   agg_state + aggregator->_agg_states_offsets[i]);
        }
        throw;
    }
}
template <bool UseIntermediateAsOutput>
bool AggFunctionTypes::is_result_nullable() const {
if constexpr (UseIntermediateAsOutput) {
@ -143,6 +197,9 @@ AggregatorParamsPtr convert_to_aggregator_params(const TPlanNode& tnode) {
params->intermediate_aggr_exprs = tnode.agg_node.intermediate_aggr_exprs;
params->enable_pipeline_share_limit =
tnode.agg_node.__isset.enable_pipeline_share_limit ? tnode.agg_node.enable_pipeline_share_limit : false;
params->grouping_min_max =
tnode.agg_node.__isset.group_by_min_max ? tnode.agg_node.group_by_min_max : std::vector<TExpr>{};
break;
}
default:
@ -358,6 +415,16 @@ Status Aggregator::prepare(RuntimeState* state, ObjectPool* pool, RuntimeProfile
RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _params->conjuncts, &_conjunct_ctxs, state, true));
RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _params->grouping_exprs, &_group_by_expr_ctxs, state, true));
RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _params->grouping_min_max, &_group_by_min_max, state, true));
_ranges.resize(_group_by_expr_ctxs.size());
if (_group_by_min_max.size() == _group_by_expr_ctxs.size() * 2) {
for (size_t i = 0; i < _group_by_expr_ctxs.size(); ++i) {
std::pair<VectorizedLiteral*, VectorizedLiteral*> range;
range.first = down_cast<VectorizedLiteral*>(_group_by_min_max[i * 2]->root());
range.second = down_cast<VectorizedLiteral*>(_group_by_min_max[i * 2 + 1]->root());
_ranges[i] = range;
}
}
// add profile attributes
if (!_params->sql_grouping_keys.empty()) {
@ -582,7 +649,7 @@ Status Aggregator::_create_aggregate_function(starrocks::RuntimeState* state, co
TypeDescriptor return_type = TypeDescriptor::from_thrift(fn.ret_type);
TypeDescriptor serde_type = TypeDescriptor::from_thrift(fn.aggregate_fn.intermediate_type);
DCHECK_LE(1, fn.arg_types.size());
TypeDescriptor arg_type = arg_types[0];
const TypeDescriptor& arg_type = arg_types[0];
auto* func = get_aggregate_function(func_name, return_type, arg_types, is_result_nullable, fn.binary_type,
state->func_version());
if (func == nullptr) {
@ -1287,19 +1354,76 @@ Status Aggregator::evaluate_agg_fn_exprs(Chunk* chunk, bool use_intermediate) {
return Status::OK();
}
bool is_group_columns_fixed_size(std::vector<ExprContext*>& group_by_expr_ctxs, std::vector<ColumnType>& group_by_types,
size_t* max_size, bool* has_null) {
bool could_apply_bitcompress_opt(
const std::vector<ColumnType>& group_by_types,
const std::vector<std::optional<std::pair<VectorizedLiteral*, VectorizedLiteral*>>>& ranges,
std::vector<std::any>& base, std::vector<int>& used_bytes, size_t* max_size, bool* has_null) {
size_t accumulated = 0;
size_t accumulated_fixed_length_bits = 0;
for (size_t i = 0; i < group_by_types.size(); i++) {
size_t size = 0;
// 1 bytes for null flag.
if (group_by_types[i].is_nullable) {
*has_null = true;
size += 1;
}
if (group_by_types[i].result_type.is_complex_type()) {
return false;
}
LogicalType ltype = group_by_types[i].result_type.type;
size_t fixed_base_size = get_size_of_fixed_length_type(ltype);
if (fixed_base_size == 0) return false;
accumulated_fixed_length_bits += fixed_base_size * 8;
if (!ranges[i].has_value()) {
return false;
}
auto used_bits = get_used_bits(ltype, *ranges[i]->first, *ranges[i]->second, base[i]);
if (!used_bits.has_value()) {
return false;
}
size += used_bits.value();
accumulated += size;
used_bytes[i] = accumulated;
}
auto get_level = [](size_t used_bits) {
if (used_bits <= sizeof(uint8_t) * 8)
return 1;
else if (used_bits <= sizeof(uint16_t) * 8)
return 2;
else if (used_bits <= sizeof(uint32_t) * 8)
return 3;
else if (used_bits <= sizeof(uint64_t) * 8)
return 4;
else if (used_bits <= sizeof(int128_t) * 8)
return 5;
else
return 6;
};
// If they are at the same level, grouping by compressed key will not optimize performance, so we disable it.
// eg: For example, two int32 values both have a threshold of 0-2^32, so they need to use group by int64.
// In this case, there will be no optimization effect. We disable this situation.
if (get_level(accumulated_fixed_length_bits) > get_level(accumulated)) {
*max_size = accumulated;
return true;
}
return false;
}
bool is_group_columns_fixed_size(std::vector<ColumnType>& group_by_types, size_t* max_size, bool* has_null) {
size_t size = 0;
*has_null = false;
for (size_t i = 0; i < group_by_expr_ctxs.size(); i++) {
ExprContext* ctx = group_by_expr_ctxs[i];
for (size_t i = 0; i < group_by_types.size(); i++) {
// 1 bytes for null flag.
if (group_by_types[i].is_nullable) {
*has_null = true;
size += 1; // 1 bytes for null flag.
size += 1;
}
LogicalType ltype = ctx->root()->type().type;
if (ctx->root()->type().is_complex_type()) {
LogicalType ltype = group_by_types[i].result_type.type;
if (group_by_types[i].result_type.is_complex_type()) {
return false;
}
size_t byte_size = get_size_of_fixed_length_type(ltype);
@ -1311,20 +1435,30 @@ bool is_group_columns_fixed_size(std::vector<ExprContext*>& group_by_expr_ctxs,
}
template <typename HashVariantType>
void Aggregator::_init_agg_hash_variant(HashVariantType& hash_variant) {
typename HashVariantType::Type Aggregator::_get_hash_table_type() {
auto type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice : HashVariantType::Type::phase2_slice;
if (_group_by_expr_ctxs.size() == 1) {
type = HashVariantResolver<HashVariantType>::instance().get_unary_type(
_aggr_phase, _group_by_types[0].result_type.type, _has_nullable_key);
if (_group_by_types.empty()) {
return type;
}
// using one key hash table
if (_group_by_types.size() == 1) {
bool nullable = _group_by_types[0].is_nullable;
LogicalType type = _group_by_types[0].result_type.type;
return HashVariantResolver<HashVariantType>::instance().get_unary_type(_aggr_phase, type, nullable);
}
return type;
}
template <typename HashVariantType>
typename HashVariantType::Type Aggregator::_try_to_apply_fixed_size_opt(typename HashVariantType::Type type,
bool* has_null, int* fixed_size) {
bool has_null_column = false;
int fixed_byte_size = 0;
// this optimization don't need to be limited to multi-column group by.
// single column like float/double/decimal/largeint could also be applied to.
if (type == HashVariantType::Type::phase1_slice || type == HashVariantType::Type::phase2_slice) {
size_t max_size = 0;
if (is_group_columns_fixed_size(_group_by_expr_ctxs, _group_by_types, &max_size, &has_null_column)) {
if (is_group_columns_fixed_size(_group_by_types, &max_size, &has_null_column)) {
// we need reserve a byte for serialization length for nullable columns
if (max_size < 4 || (!has_null_column && max_size == 4)) {
type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_fx4
@ -1341,6 +1475,99 @@ void Aggregator::_init_agg_hash_variant(HashVariantType& hash_variant) {
}
}
}
*has_null = has_null_column;
*fixed_size = fixed_byte_size;
return type;
}
// Tries to upgrade `input_type` to a compressed-key hash table type.
// Requires a [min, max] literal range for every group-by column (collected
// in _ranges from the planner's group_by_min_max exprs). On success, returns
// a phase1/phase2 slice_cx{1,4,8,16} type that packs all keys into a single
// 1/4/8/16-byte word, and `ctx` receives the per-column subtraction bases,
// cumulative bit counts and bit offsets needed by the compressed containers.
// Returns `input_type` unchanged when the optimization does not apply.
template <typename HashVariantType>
typename HashVariantType::Type Aggregator::_try_to_apply_compressed_key_opt(typename HashVariantType::Type input_type,
                                                                            CompressKeyContext* ctx) {
    typename HashVariantType::Type type = input_type;
    if (_group_by_types.empty()) {
        return type;
    }
    // every group-by column must come with a min/max range
    for (size_t i = 0; i < _ranges.size(); ++i) {
        if (!_ranges[i].has_value()) {
            return type;
        }
    }
    // check apply bit compress opt
    {
        // NOTE(review): has_null_column is only conditionally written by
        // could_apply_bitcompress_opt and is not read here.
        bool has_null_column;
        size_t new_max_bit_size = 0;
        std::vector<int>& offsets = ctx->offsets;
        std::vector<int>& used_bits = ctx->used_bits;
        std::vector<std::any>& bases = ctx->bases;
        size_t group_by_keys = _group_by_types.size();
        used_bits.resize(group_by_keys);
        offsets.resize(group_by_keys);
        bases.resize(group_by_keys);
        if (could_apply_bitcompress_opt(_group_by_types, _ranges, bases, used_bits, &new_max_bit_size,
                                        &has_null_column)) {
            if (_group_by_types.size() > 0) {
                // choose the narrowest packed word that fits all key bits
                if (new_max_bit_size <= 8) {
                    type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_cx1
                                                    : HashVariantType::Type::phase2_slice_cx1;
                } else if (new_max_bit_size <= 4 * 8) {
                    type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_cx4
                                                    : HashVariantType::Type::phase2_slice_cx4;
                } else if (new_max_bit_size <= 8 * 8) {
                    type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_cx8
                                                    : HashVariantType::Type::phase2_slice_cx8;
                } else if (new_max_bit_size <= 16 * 8) {
                    type = _aggr_phase == AggrPhase1 ? HashVariantType::Type::phase1_slice_cx16
                                                    : HashVariantType::Type::phase2_slice_cx16;
                }
            }
        }
        // used_bits holds cumulative bit counts, so column i starts right
        // after the bits consumed by columns [0, i)
        offsets[0] = 0;
        for (size_t i = 1; i < group_by_keys; ++i) {
            offsets[i] = used_bits[i - 1];
        }
    }
    return type;
}
// Initializes `hash_variant` with the chosen container type and, when that
// container is keyed by bit-compressed fixed-size keys, moves the
// compression metadata (offsets / used_bits / bases) into it.
template <typename HashVariantType>
void Aggregator::_build_hash_variant(HashVariantType& hash_variant, typename HashVariantType::Type type,
                                     CompressKeyContext&& context) {
    hash_variant.init(_state, type, _agg_stat);
    hash_variant.visit([&](auto& variant) {
        // visit dispatches to the single active alternative, so `context`
        // is moved from at most once
        if constexpr (is_compressed_fixed_size_key<std::decay_t<decltype(*variant)>>) {
            variant->offsets = std::move(context.offsets);
            variant->used_bits = std::move(context.used_bits);
            variant->bases = std::move(context.bases);
        }
    });
}
template <typename HashVariantType>
void Aggregator::_init_agg_hash_variant(HashVariantType& hash_variant) {
auto type = _get_hash_table_type<HashVariantType>();
CompressKeyContext compress_key_ctx;
bool apply_compress_key_opt = false;
typename HashVariantType::Type prev_type = type;
type = _try_to_apply_compressed_key_opt<HashVariantType>(type, &compress_key_ctx);
apply_compress_key_opt = prev_type != type;
if (apply_compress_key_opt) {
// build with compressed key
VLOG_ROW << "apply compressed key";
_build_hash_variant<HashVariantType>(hash_variant, type, std::move(compress_key_ctx));
return;
}
bool has_null_column = false;
int fixed_byte_size = 0;
if (_group_by_types.size() > 1) {
type = _try_to_apply_fixed_size_opt<HashVariantType>(type, &has_null_column, &fixed_byte_size);
}
VLOG_ROW << "hash type is "
<< static_cast<typename std::underlying_type<typename HashVariantType::Type>::type>(type);

View File

@ -19,40 +19,34 @@
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <new>
#include <queue>
#include <utility>
#include "column/chunk.h"
#include "column/column_helper.h"
#include "column/type_traits.h"
#include "column/vectorized_fwd.h"
#include "common/object_pool.h"
#include "common/statusor.h"
#include "exec/aggregate/agg_hash_variant.h"
#include "exec/aggregate/agg_profile.h"
#include "exec/chunk_buffer_memory_manager.h"
#include "exec/aggregator_fwd.h"
#include "exec/limited_pipeline_chunk_buffer.h"
#include "exec/pipeline/context_with_dependency.h"
#include "exec/pipeline/schedule/observer.h"
#include "exec/pipeline/spill_process_channel.h"
#include "exprs/agg/aggregate_factory.h"
#include "exprs/agg/aggregate.h"
#include "exprs/expr.h"
#include "gen_cpp/QueryPlanExtra_types.h"
#include "gutil/strings/substitute.h"
#include "runtime/current_thread.h"
#include "runtime/descriptors.h"
#include "runtime/mem_pool.h"
#include "runtime/memory/counting_allocator.h"
#include "runtime/runtime_state.h"
#include "runtime/types.h"
#include "util/defer_op.h"
namespace starrocks {
class RuntimeFilter;
class AggInRuntimeFilterMerger;
struct HashTableKeyAllocator;
class VectorizedLiteral;
struct RawHashTableIterator {
RawHashTableIterator(HashTableKeyAllocator* alloc_, size_t x_, int y_) : alloc(alloc_), x(x_), y(y_) {}
@ -117,19 +111,6 @@ inline uint8_t* RawHashTableIterator::value() {
return static_cast<uint8_t*>(alloc->vecs[x].first) + alloc->aggregate_key_size * y;
}
class Aggregator;
class SortedStreamingAggregator;
template <class HashMapWithKey>
struct AllocateState {
AllocateState(Aggregator* aggregator_) : aggregator(aggregator_) {}
inline AggDataPtr operator()(const typename HashMapWithKey::KeyType& key);
inline AggDataPtr operator()(std::nullptr_t);
private:
Aggregator* aggregator;
};
struct AggFunctionTypes {
TypeDescriptor result_type;
TypeDescriptor serde_type; // for serialize
@ -227,6 +208,7 @@ struct AggregatorParams {
std::vector<TExpr> grouping_exprs;
std::vector<TExpr> aggregate_functions;
std::vector<TExpr> intermediate_aggr_exprs;
std::vector<TExpr> grouping_min_max;
// Incremental MV
// Whether it's testing, use MemStateTable in testing, instead use IMTStateTable.
@ -255,12 +237,6 @@ AggregatorParamsPtr convert_to_aggregator_params(const TPlanNode& tnode);
// it contains common data struct and algorithm of aggregation
class Aggregator : public pipeline::ContextWithDependency {
public:
#ifdef NDEBUG
static constexpr size_t two_level_memory_threshold = 33554432; // 32M, L3 Cache
#else
static constexpr size_t two_level_memory_threshold = 64;
#endif
Aggregator(AggregatorParamsPtr params);
~Aggregator() noexcept override {
@ -414,7 +390,7 @@ public:
bool is_streaming_all_states() const { return _streaming_all_states; }
HashTableKeyAllocator _state_allocator;
HashTableKeyAllocator& state_allocator() { return _state_allocator; }
void attach_sink_observer(RuntimeState* state, pipeline::PipelineObserver* observer) {
_pip_observable.attach_sink_observer(state, observer);
@ -435,6 +411,8 @@ protected:
std::unique_ptr<MemPool> _mem_pool;
// used to count heap memory usage of agg states
std::unique_ptr<CountingAllocatorWithHook> _allocator;
HashTableKeyAllocator _state_allocator;
// The open phase still relies on the TFunction object for some initialization operations
std::vector<TFunction> _fns;
@ -501,6 +479,8 @@ protected:
// Exprs used to evaluate group by column
std::vector<ExprContext*> _group_by_expr_ctxs;
std::vector<ExprContext*> _group_by_min_max;
std::vector<std::optional<std::pair<VectorizedLiteral*, VectorizedLiteral*>>> _ranges;
Columns _group_by_columns;
std::vector<ColumnType> _group_by_types;
@ -598,6 +578,24 @@ protected:
// Choose different agg hash map/set by different group by column's count, type, nullable
template <typename HashVariantType>
void _init_agg_hash_variant(HashVariantType& hash_variant);
// get spec hash table/set type
template <typename HashVariantType>
typename HashVariantType::Type _get_hash_table_type();
template <typename HashVariantType>
typename HashVariantType::Type _try_to_apply_fixed_size_opt(typename HashVariantType::Type type,
bool* has_null_column, int* fixed_byte_size);
struct CompressKeyContext {
std::vector<int> offsets;
std::vector<int> used_bits;
std::vector<std::any> bases;
};
template <typename HashVariantType>
typename HashVariantType::Type _try_to_apply_compressed_key_opt(typename HashVariantType::Type input_type,
CompressKeyContext* ctx);
template <typename HashVariantType>
void _build_hash_variant(HashVariantType& hash_variant, typename HashVariantType::Type type,
CompressKeyContext&& context);
void _release_agg_memory();
@ -608,7 +606,7 @@ protected:
int64_t get_two_level_threahold() {
if (config::two_level_memory_threshold < 0) {
return two_level_memory_threshold;
return agg::two_level_memory_threshold;
}
return config::two_level_memory_threshold;
}
@ -617,50 +615,6 @@ protected:
friend struct AllocateState;
};
// Allocates and initializes the per-group aggregation state for a newly inserted
// hash-table key. Layout: the key is stored at the head of the state buffer, and
// each aggregate function's state lives at its precomputed offset
// (_agg_states_offsets[i]).
// Exception safety: if any create() throws std::bad_alloc, the states created so
// far are destroyed, the key-slot allocation is rolled back, and the exception is
// rethrown, so the caller never observes a partially initialized state.
template <class HashMapWithKey>
inline AggDataPtr AllocateState<HashMapWithKey>::operator()(const typename HashMapWithKey::KeyType& key) {
    AggDataPtr agg_state = aggregator->_state_allocator.allocate();
    // Copy the key into the head of the state buffer.
    *reinterpret_cast<typename HashMapWithKey::KeyType*>(agg_state) = key;
    size_t created = 0;
    size_t aggregate_function_sz = aggregator->_agg_fn_ctxs.size();
    try {
        for (int i = 0; i < aggregate_function_sz; i++) {
            aggregator->_agg_functions[i]->create(aggregator->_agg_fn_ctxs[i],
                                                  agg_state + aggregator->_agg_states_offsets[i]);
            created++;
        }
        return agg_state;
    } catch (std::bad_alloc& e) {
        // Destroy only the states that were successfully created before the failure.
        for (size_t i = 0; i < created; ++i) {
            aggregator->_agg_functions[i]->destroy(aggregator->_agg_fn_ctxs[i],
                                                   agg_state + aggregator->_agg_states_offsets[i]);
        }
        aggregator->_state_allocator.rollback();
        throw;
    }
}
// Allocates and initializes the aggregation state for the dedicated null-key slot
// (group-by key is NULL). Mirrors the keyed overload above, but no key is stored.
// NOTE(review): unlike the keyed overload, this path does not call
// _state_allocator.rollback() on failure — presumably allocate_null_key_data()
// needs no rollback; confirm against HashTableKeyAllocator.
template <class HashMapWithKey>
inline AggDataPtr AllocateState<HashMapWithKey>::operator()(std::nullptr_t) {
    AggDataPtr agg_state = aggregator->_state_allocator.allocate_null_key_data();
    size_t created = 0;
    size_t aggregate_function_sz = aggregator->_agg_fn_ctxs.size();
    try {
        for (int i = 0; i < aggregate_function_sz; i++) {
            aggregator->_agg_functions[i]->create(aggregator->_agg_fn_ctxs[i],
                                                  agg_state + aggregator->_agg_states_offsets[i]);
            created++;
        }
        return agg_state;
    } catch (std::bad_alloc& e) {
        // Destroy only the states that were successfully created before rethrowing.
        for (int i = 0; i < created; i++) {
            aggregator->_agg_functions[i]->destroy(aggregator->_agg_fn_ctxs[i],
                                                   agg_state + aggregator->_agg_states_offsets[i]);
        }
        throw;
    }
}
// Returns true when a memory limit is configured and the aggregator's current
// memory usage has reached or exceeded that limit.
inline bool LimitedMemAggState::has_limited(const Aggregator& aggregator) const {
    if (limited_memory_size <= 0) {
        return false; // No limit configured.
    }
    return aggregator.memory_usage() >= limited_memory_size;
}
@ -702,11 +656,4 @@ private:
std::atomic<int64_t> _shared_limit_countdown;
};
using AggregatorFactory = AggregatorFactoryBase<Aggregator>;
using AggregatorFactoryPtr = std::shared_ptr<AggregatorFactory>;
using SortedStreamingAggregatorPtr = std::shared_ptr<SortedStreamingAggregator>;
using StreamingAggregatorFactory = AggregatorFactoryBase<SortedStreamingAggregator>;
using StreamingAggregatorFactoryPtr = std::shared_ptr<StreamingAggregatorFactory>;
} // namespace starrocks

View File

@ -0,0 +1,32 @@
#pragma once

#include <cstddef>
#include <memory>

namespace starrocks {

namespace agg {
// Memory threshold (bytes) above which aggregation switches to a two-level hash table.
#ifdef NDEBUG
constexpr size_t two_level_memory_threshold = 33554432; // 32M, L3 Cache
#else
// Tiny threshold in debug builds so the two-level code path gets exercised by tests.
constexpr size_t two_level_memory_threshold = 64;
#endif
} // namespace agg

// Forward declarations and shared aliases for the aggregator types, so other
// headers can refer to them without including the full aggregator definitions.
class Aggregator;
class SortedStreamingAggregator;

using AggregatorPtr = std::shared_ptr<Aggregator>;
using SortedStreamingAggregatorPtr = std::shared_ptr<SortedStreamingAggregator>;

template <class HashMapWithKey>
struct AllocateState;

template <class T>
class AggregatorFactoryBase;

using AggregatorFactory = AggregatorFactoryBase<Aggregator>;
using AggregatorFactoryPtr = std::shared_ptr<AggregatorFactory>;
using StreamingAggregatorFactory = AggregatorFactoryBase<SortedStreamingAggregator>;
using StreamingAggregatorFactoryPtr = std::shared_ptr<StreamingAggregatorFactory>;

} // namespace starrocks

View File

@ -88,6 +88,13 @@ Status CrossJoinNode::init(const TPlanNode& tnode, RuntimeState* state) {
_build_runtime_filters.emplace_back(rf_desc);
}
}
if (tnode.nestloop_join_node.__isset.common_slot_map) {
for (const auto& [key, val] : tnode.nestloop_join_node.common_slot_map) {
ExprContext* context;
RETURN_IF_ERROR(Expr::create_expr_tree(_pool, val, &context, state, true));
_common_expr_ctxs.insert({key, context});
}
}
return Status::OK();
}
@ -608,10 +615,10 @@ std::vector<std::shared_ptr<pipeline::OperatorFactory>> CrossJoinNode::_decompos
OpFactories left_ops = _children[0]->decompose_to_pipeline(context);
// communication with CrossJoinRight through shared_data.
auto left_factory =
std::make_shared<ProbeFactory>(context->next_operator_id(), id(), _row_descriptor, child(0)->row_desc(),
child(1)->row_desc(), _sql_join_conjuncts, std::move(_join_conjuncts),
std::move(_conjunct_ctxs), std::move(cross_join_context), _join_op);
auto left_factory = std::make_shared<ProbeFactory>(
context->next_operator_id(), id(), _row_descriptor, child(0)->row_desc(), child(1)->row_desc(),
_sql_join_conjuncts, std::move(_join_conjuncts), std::move(_conjunct_ctxs), std::move(_common_expr_ctxs),
std::move(cross_join_context), _join_op);
// Initialize OperatorFactory's fields involving runtime filters.
this->init_runtime_filter_for_operator(left_factory.get(), context, rc_rf_probe_collector);
if (!context->is_colocate_group()) {

View File

@ -128,6 +128,8 @@ private:
std::vector<RuntimeFilterBuildDescriptor*> _build_runtime_filters;
bool _interpolate_passthrough = false;
std::map<SlotId, ExprContext*> _common_expr_ctxs;
};
} // namespace starrocks

View File

@ -253,14 +253,13 @@ Status CSVScanner::_init_reader() {
_curr_reader = std::make_unique<ScannerCSVReader>(file, _state, _parse_options);
_curr_reader->set_counter(_counter);
if (_scan_range.ranges[_curr_file_index].size > 0 &&
_scan_range.ranges[_curr_file_index].format_type == TFileFormatType::FORMAT_CSV_PLAIN) {
if (range_desc.size > 0 && range_desc.format_type == TFileFormatType::FORMAT_CSV_PLAIN) {
// Does not set limit for compressed file.
_curr_reader->set_limit(_scan_range.ranges[_curr_file_index].size);
_curr_reader->set_limit(range_desc.size);
}
if (_scan_range.ranges[_curr_file_index].start_offset > 0) {
if (range_desc.start_offset > 0) {
// Skip the first record started from |start_offset|.
auto status = file->skip(_scan_range.ranges[_curr_file_index].start_offset);
auto status = file->skip(range_desc.start_offset);
if (status.is_time_out()) {
// open this file next time
--_curr_file_index;
@ -271,7 +270,8 @@ Status CSVScanner::_init_reader() {
RETURN_IF_ERROR(_curr_reader->next_record(&dummy));
}
if (_parse_options.skip_header) {
// only the first range needs to skip header
if (_parse_options.skip_header && range_desc.start_offset == 0) {
for (int64_t i = 0; i < _parse_options.skip_header; i++) {
CSVReader::Record dummy;
auto st = _curr_reader->next_record(&dummy);

View File

@ -35,7 +35,6 @@
#pragma once
#include <functional>
#include <mutex>
#include <sstream>
#include <vector>
@ -48,10 +47,7 @@
#include "runtime/descriptors.h"
#include "runtime/mem_pool.h"
#include "runtime/query_statistics.h"
#include "service/backend_options.h"
#include "util/blocking_queue.hpp"
#include "util/runtime_profile.h"
#include "util/uid_util.h" // for print_id
namespace starrocks {

View File

@ -120,6 +120,16 @@ public:
const ChunkPtr& back() { return _chunks.back(); }
void append_selective_to_back(const Chunk& src, const uint32_t* indexes, uint32_t from, uint32_t size) {
auto& chunk = _chunks.back();
const size_t prev_bytes = chunk->memory_usage();
chunk->append_selective(src, indexes, from, size);
const size_t new_bytes = chunk->memory_usage();
_tracker->consume(new_bytes - prev_bytes);
}
// The channel is considered full once it buffers 4 chunks or its tracked memory
// exceeds config::partition_hash_join_probe_limit_size.
bool is_full() const {
    return _chunks.size() >= 4 || _tracker->consumption() > config::partition_hash_join_probe_limit_size;
}
@ -213,10 +223,10 @@ Status PartitionedHashJoinProberImpl::push_probe_chunk(RuntimeState* state, Chun
}
std::vector<uint32_t> hash_values;
{
hash_values.assign(num_rows, HashUtil::FNV_SEED);
hash_values.assign(num_rows, 0);
for (const ColumnPtr& column : partition_columns) {
column->fnv_hash(hash_values.data(), 0, num_rows);
column->crc32_hash(hash_values.data(), 0, num_rows);
}
// find partition id
for (size_t i = 0; i < hash_values.size(); ++i) {
@ -329,7 +339,9 @@ StatusOr<ChunkPtr> PartitionedHashJoinProberImpl::probe_remain(RuntimeState* sta
}
void PartitionedHashJoinProberImpl::reset(RuntimeState* runtime_state) {
_probers.clear();
for (auto& prober : _probers) {
prober->reset(runtime_state);
}
_partition_input_channels.clear();
_all_input_finished = false;
_remain_partition_idx = 0;
@ -362,7 +374,7 @@ bool SingleHashJoinBuilder::anti_join_key_column_has_null() const {
return false;
}
Status SingleHashJoinBuilder::do_append_chunk(const ChunkPtr& chunk) {
Status SingleHashJoinBuilder::do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) {
if (UNLIKELY(_ht.get_row_count() + chunk->num_rows() >= max_hash_table_element_size)) {
return Status::NotSupported(strings::Substitute("row count of right table in hash join > $0", UINT32_MAX));
}
@ -404,7 +416,7 @@ enum class CacheLevel { L2, L3, MEMORY };
class AdaptivePartitionHashJoinBuilder final : public HashJoinBuilder {
public:
AdaptivePartitionHashJoinBuilder(HashJoiner& hash_joiner);
explicit AdaptivePartitionHashJoinBuilder(HashJoiner& hash_joiner);
~AdaptivePartitionHashJoinBuilder() override = default;
void create(const HashTableParam& param) override;
@ -413,7 +425,7 @@ public:
void reset(const HashTableParam& param) override;
Status do_append_chunk(const ChunkPtr& chunk) override;
Status do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) override;
Status build(RuntimeState* state) override;
@ -432,27 +444,53 @@ public:
void clone_readable(HashJoinBuilder* builder) override;
Status prepare_for_spill_start(RuntimeState* state) override;
ChunkPtr convert_to_spill_schema(const ChunkPtr& chunk) const override;
private:
size_t _estimated_row_size(const HashTableParam& param) const;
size_t _estimated_probe_cost(const HashTableParam& param) const;
static double _calculate_cache_miss_factor(const HashJoiner& hash_joiner);
size_t _estimate_hash_table_probing_bytes_per_row(const HashTableParam& param) const;
size_t _estimate_probe_row_bytes(const HashTableParam& param) const;
template <CacheLevel T>
size_t _estimated_build_cost(size_t build_row_size) const;
void _adjust_partition_rows(size_t build_row_size);
size_t _estimate_cost_by_bytes(size_t row_bytes) const;
void _init_partition_nums(const HashTableParam& param);
Status _convert_to_single_partition();
Status _append_chunk_to_partitions(const ChunkPtr& chunk);
void _adjust_partition_rows(size_t hash_table_bytes_per_row, size_t hash_table_probing_bytes_per_row);
Status _do_append_chunk(RuntimeState* state, const ChunkPtr& chunk);
Status _append_chunk_to_partitions(RuntimeState* state, const ChunkPtr& chunk);
Status _transfer_to_appending_stage(RuntimeState* state);
Status _convert_to_single_partition(RuntimeState* state);
Status _flush_buffer_chunks(RuntimeState* state);
bool _need_partition_join_for_build(size_t ht_num_rows) const;
bool _need_partition_join_for_append(size_t ht_num_rows) const;
private:
std::vector<std::unique_ptr<SingleHashJoinBuilder>> _builders;
size_t _partition_num = 0;
size_t _partition_join_min_rows = 0;
size_t _partition_join_max_rows = 0;
// Split append chunk into two stages:
// - BUFFERING: buffers chunks without partitioning until the number of rows exceeds _partition_join_l2_max_rows or _partition_join_l3_max_rows.
// - APPENDING: partitions all incoming chunks.
enum class Stage { BUFFERING, APPENDING };
Stage _stage = Stage::BUFFERING;
MemTracker _mem_tracker;
std::vector<PartitionChunkChannel> _partition_input_channels;
std::vector<ChunkPtr> _unpartition_chunks;
size_t _probe_estimated_costs = 0;
size_t _partition_num = 0;
size_t _hash_table_probing_bytes_per_row = 0;
size_t _hash_table_bytes_per_row = 0;
size_t _partition_join_l2_min_rows = 0;
size_t _partition_join_l2_max_rows = 0;
size_t _partition_join_l3_min_rows = 0;
size_t _partition_join_l3_max_rows = 0;
size_t _probe_row_shuffle_cost = 0;
size_t _l2_benefit = 0;
size_t _l3_benefit = 0;
size_t _fit_L2_cache_max_rows = 0;
size_t _fit_L3_cache_max_rows = 0;
@ -461,10 +499,15 @@ private:
size_t _L3_cache_size = 0;
size_t _pushed_chunks = 0;
// Shared read-only data accessed concurrently by threads can lead to better cache performance.
// Therefore, for broadcast joins, this parameter is used to reduce benefit of partitioned hash joins as the number
// of prober threads (DOP) increases.
const double _cache_miss_factor;
};
AdaptivePartitionHashJoinBuilder::AdaptivePartitionHashJoinBuilder(HashJoiner& hash_joiner)
: HashJoinBuilder(hash_joiner) {
: HashJoinBuilder(hash_joiner), _cache_miss_factor(_calculate_cache_miss_factor(hash_joiner)) {
static constexpr size_t DEFAULT_L2_CACHE_SIZE = 1 * 1024 * 1024;
static constexpr size_t DEFAULT_L3_CACHE_SIZE = 32 * 1024 * 1024;
const auto& cache_sizes = CpuInfo::get_cache_sizes();
@ -474,100 +517,173 @@ AdaptivePartitionHashJoinBuilder::AdaptivePartitionHashJoinBuilder(HashJoiner& h
_L3_cache_size = _L3_cache_size ? _L3_cache_size : DEFAULT_L3_CACHE_SIZE;
}
size_t AdaptivePartitionHashJoinBuilder::_estimated_row_size(const HashTableParam& param) const {
// Estimates how much of the partitioned-join cache benefit remains when several
// probers share one broadcast hash table: concurrent read-only access already
// reuses cache lines, so the per-prober benefit shrinks as prober DOP grows.
// Returns a factor in (0, 1]: 1.0 for non-broadcast joins or DOP <= 1, decreasing
// by 0.1 per extra prober, and a flat 0.1 once DOP exceeds 8.
// NOTE(review): the factor jumps from 0.3 (DOP=8) to 0.1 (DOP=9) instead of 0.2 —
// confirm the discontinuity is intended.
double AdaptivePartitionHashJoinBuilder::_calculate_cache_miss_factor(const HashJoiner& hash_joiner) {
    if (hash_joiner.distribution_mode() != TJoinDistributionMode::BROADCAST) {
        return 1.0; // No broadcast join, no cache reuse between different probers.
    }
    const size_t max_prober_dop = hash_joiner.max_dop();
    if (max_prober_dop <= 1) {
        return 1.0;
    }
    if (max_prober_dop > 8) {
        return 0.1;
    }
    return 1 - (max_prober_dop - 1) * 0.1;
}
size_t AdaptivePartitionHashJoinBuilder::_estimate_hash_table_probing_bytes_per_row(const HashTableParam& param) const {
size_t estimated_each_row = 0;
// Probing a row needs to:
// 1. touch the `first` and `next` vectors,
// 2. compare join keys between builder and prober,
// 3. output columns from the build side.
// 1. `first` and `next` bytes
estimated_each_row += 8;
// 2. key bytes
for (const auto& join_key : param.join_keys) {
if (join_key.type != nullptr) {
estimated_each_row += get_size_of_fixed_length_type(join_key.type->type);
// The benefit from non-fixed key columns is less than those from fixed key columns, so the penalty (/4) is applied here.
estimated_each_row += type_estimated_overhead_bytes(join_key.type->type) / 4;
}
}
// 3. output bytes
for (auto* tuple : param.build_row_desc->tuple_descriptors()) {
for (auto slot : tuple->slots()) {
if (param.build_output_slots.contains(slot->id())) {
for (const auto* slot : tuple->slots()) {
if (param.build_output_slots.empty() || param.build_output_slots.contains(slot->id())) {
estimated_each_row += get_size_of_fixed_length_type(slot->type().type);
estimated_each_row += type_estimated_overhead_bytes(slot->type().type);
}
}
}
// for hash table bucket
estimated_each_row += 4;
return estimated_each_row;
return std::max<size_t>(estimated_each_row * _cache_miss_factor, 1);
}
// We could use a better estimation model.
size_t AdaptivePartitionHashJoinBuilder::_estimated_probe_cost(const HashTableParam& param) const {
size_t AdaptivePartitionHashJoinBuilder::_estimate_probe_row_bytes(const HashTableParam& param) const {
size_t size = 0;
// shuffling probe bytes
for (auto* tuple : param.probe_row_desc->tuple_descriptors()) {
for (auto slot : tuple->slots()) {
if (param.probe_output_slots.contains(slot->id())) {
size += get_size_of_fixed_length_type(slot->type().type);
size += type_estimated_overhead_bytes(slot->type().type);
}
for (const auto* slot : tuple->slots()) {
size += get_size_of_fixed_length_type(slot->type().type);
size += type_estimated_overhead_bytes(slot->type().type);
}
}
// we define probe cost is bytes size * 6
return size * 6;
return std::max<size_t>(size, 1);
}
template <>
size_t AdaptivePartitionHashJoinBuilder::_estimated_build_cost<CacheLevel::L2>(size_t build_row_size) const {
return build_row_size / 2;
size_t AdaptivePartitionHashJoinBuilder::_estimate_cost_by_bytes<CacheLevel::L2>(size_t row_bytes) const {
return row_bytes / 2;
}
template <>
size_t AdaptivePartitionHashJoinBuilder::_estimated_build_cost<CacheLevel::L3>(size_t build_row_size) const {
return build_row_size;
size_t AdaptivePartitionHashJoinBuilder::_estimate_cost_by_bytes<CacheLevel::L3>(size_t row_bytes) const {
return row_bytes;
}
template <>
size_t AdaptivePartitionHashJoinBuilder::_estimated_build_cost<CacheLevel::MEMORY>(size_t build_row_size) const {
return build_row_size * 2;
size_t AdaptivePartitionHashJoinBuilder::_estimate_cost_by_bytes<CacheLevel::MEMORY>(size_t row_bytes) const {
return row_bytes * 2;
}
void AdaptivePartitionHashJoinBuilder::_adjust_partition_rows(size_t build_row_size) {
build_row_size = std::max(build_row_size, 4UL);
_fit_L2_cache_max_rows = _L2_cache_size / build_row_size;
_fit_L3_cache_max_rows = _L3_cache_size / build_row_size;
bool AdaptivePartitionHashJoinBuilder::_need_partition_join_for_build(size_t ht_num_rows) const {
return (_partition_join_l2_min_rows < ht_num_rows && ht_num_rows <= _partition_join_l2_max_rows) ||
(_partition_join_l3_min_rows < ht_num_rows && ht_num_rows <= _partition_join_l3_max_rows);
}
// If the hash table is smaller than the L2 cache, we don't think partitioned hash join is needed.
_partition_join_min_rows = _fit_L2_cache_max_rows;
// If the hash table after partitioning still can't fit into L3, we don't think partitioned hash join is needed.
_partition_join_max_rows = _fit_L3_cache_max_rows * _partition_num;
bool AdaptivePartitionHashJoinBuilder::_need_partition_join_for_append(size_t ht_num_rows) const {
return ht_num_rows <= _partition_join_l2_max_rows || ht_num_rows <= _partition_join_l3_max_rows;
}
if (_probe_estimated_costs + _estimated_build_cost<CacheLevel::L2>(build_row_size) <
_estimated_build_cost<CacheLevel::L3>(build_row_size)) {
// overhead after hash table partitioning + probe extra cost < cost before partitioning
// nothing to do
} else if (_probe_estimated_costs + _estimated_build_cost<CacheLevel::L3>(build_row_size) <
_estimated_build_cost<CacheLevel::MEMORY>(build_row_size)) {
// It is only after this that performance gains can be realized beyond the L3 cache.
_partition_join_min_rows = _fit_L3_cache_max_rows;
void AdaptivePartitionHashJoinBuilder::_adjust_partition_rows(size_t hash_table_bytes_per_row,
size_t hash_table_probing_bytes_per_row) {
if (hash_table_bytes_per_row == _hash_table_bytes_per_row &&
hash_table_probing_bytes_per_row == _hash_table_probing_bytes_per_row) {
return; // No need to adjust partition rows.
}
_hash_table_bytes_per_row = hash_table_bytes_per_row;
_hash_table_probing_bytes_per_row = hash_table_probing_bytes_per_row;
hash_table_bytes_per_row = std::max<size_t>(hash_table_bytes_per_row, 1);
_fit_L2_cache_max_rows = _L2_cache_size / hash_table_bytes_per_row;
_fit_L3_cache_max_rows = _L3_cache_size / hash_table_bytes_per_row;
_partition_join_l2_min_rows = -1;
_partition_join_l2_max_rows = 0;
_partition_join_l3_min_rows = -1;
_partition_join_l3_max_rows = 0;
const auto l2_benefit = _estimate_cost_by_bytes<CacheLevel::L3>(hash_table_probing_bytes_per_row) -
_estimate_cost_by_bytes<CacheLevel::L2>(hash_table_probing_bytes_per_row);
const auto l3_benefit = _estimate_cost_by_bytes<CacheLevel::MEMORY>(hash_table_probing_bytes_per_row) -
_estimate_cost_by_bytes<CacheLevel::L3>(hash_table_probing_bytes_per_row);
if (_probe_row_shuffle_cost < l3_benefit) { // Partitioned joins benefit from L3 cache.
// Partitioned joins benefit from L3 cache when probing a row has cache miss in non-partitioned join but not in partitioned join.
// 1. min_rows > (l3_cache_size/hash_table_bytes_per_row)*(l3_benefit/(l3_benefit-_probe_row_shuffle_cost)), because:
// - l3_benefit * non_partition_cache_miss_rate > _probe_row_shuffle_cost
// - non_partition_cache_miss_rate = 1 - l3_cache_size/(min_rows*hash_table_bytes_per_row)
// 2. max_rows < (l3_cache_size/hash_table_bytes_per_row)*(l3_benefit/_probe_row_shuffle_cost)*num_partitions, because:
// - l3_benefit * partition_cache_hit_rate > _probe_row_shuffle_cost
// - partition_cache_hit_rate = l3_cache_size/(max_rows_per_partition*hash_table_bytes_per_row)
_partition_join_l3_min_rows = _fit_L3_cache_max_rows * l3_benefit / (l3_benefit - _probe_row_shuffle_cost);
_partition_join_l3_max_rows = _fit_L3_cache_max_rows * _partition_num * l3_benefit / _probe_row_shuffle_cost;
_partition_join_l3_max_rows *= 2; // relax the restriction
if (_probe_row_shuffle_cost < l2_benefit) { // Partitioned joins benefit from L2 cache.
_partition_join_l2_min_rows = _fit_L2_cache_max_rows * l2_benefit / (l2_benefit - _probe_row_shuffle_cost);
_partition_join_l2_min_rows *= 2; // Make the restriction more stringent
_partition_join_l2_max_rows =
(_fit_L2_cache_max_rows * _partition_num) * l2_benefit / _probe_row_shuffle_cost;
}
} else {
// Partitioned joins don't have performance gains. Not using partition hash join.
_partition_num = 1;
}
VLOG_OPERATOR << "TRACE:"
<< "partition_num=" << _partition_num << " partition_join_min_rows=" << _partition_join_min_rows
<< " partition_join_max_rows=" << _partition_join_max_rows << " probe cost=" << _probe_estimated_costs
<< " build cost L2=" << _estimated_build_cost<CacheLevel::L2>(build_row_size)
<< " build cost L3=" << _estimated_build_cost<CacheLevel::L3>(build_row_size)
<< " build cost Mem=" << _estimated_build_cost<CacheLevel::MEMORY>(build_row_size);
_l2_benefit = l2_benefit;
_l3_benefit = l3_benefit;
VLOG_OPERATOR << "TRACE: _adjust_partition_rows "
<< "[partition_num=" << _partition_num << "] "
<< "[partition_join_l2_min_rows=" << _partition_join_l2_min_rows << "] "
<< "[partition_join_l2_max_rows=" << _partition_join_l2_max_rows << "] "
<< "[partition_join_l3_min_rows=" << _partition_join_l3_min_rows << "] "
<< "[partition_join_l3_max_rows=" << _partition_join_l3_max_rows << "] "
<< "[hash_table_probing_bytes_per_row=" << hash_table_probing_bytes_per_row << "] "
<< "[hash_table_bytes_per_row=" << hash_table_bytes_per_row << "] "
<< "[l2_benefit=" << l2_benefit << "] "
<< "[l3_benefit=" << l3_benefit << "] "
<< "[probe_shuffle_cost=" << _probe_row_shuffle_cost << "] ";
}
void AdaptivePartitionHashJoinBuilder::_init_partition_nums(const HashTableParam& param) {
_partition_num = 16;
size_t estimated_bytes_each_row = _estimated_row_size(param);
_probe_row_shuffle_cost =
std::max<size_t>(_estimate_cost_by_bytes<CacheLevel::L3>(_estimate_probe_row_bytes(param)), 1);
_probe_estimated_costs = _estimated_probe_cost(param);
const size_t hash_table_probing_bytes_per_row = _estimate_hash_table_probing_bytes_per_row(param);
_adjust_partition_rows(1, hash_table_probing_bytes_per_row);
_adjust_partition_rows(estimated_bytes_each_row);
COUNTER_SET(_hash_joiner.build_metrics().partition_nums, (int64_t)_partition_num);
COUNTER_SET(_hash_joiner.build_metrics().partition_nums, static_cast<int64_t>(_partition_num));
}
void AdaptivePartitionHashJoinBuilder::create(const HashTableParam& param) {
_init_partition_nums(param);
if (_partition_num > 1) {
_partition_input_channels.resize(_partition_num, PartitionChunkChannel(&_mem_tracker));
}
for (size_t i = 0; i < _partition_num; ++i) {
_builders.emplace_back(std::make_unique<SingleHashJoinBuilder>(_hash_joiner));
_builders.back()->create(param);
@ -579,10 +695,14 @@ void AdaptivePartitionHashJoinBuilder::close() {
builder->close();
}
_builders.clear();
_partition_input_channels.clear();
_partition_num = 0;
_partition_join_min_rows = 0;
_partition_join_max_rows = 0;
_probe_estimated_costs = 0;
_partition_join_l2_min_rows = 0;
_partition_join_l2_max_rows = 0;
_partition_join_l3_min_rows = 0;
_partition_join_l3_max_rows = 0;
_probe_row_shuffle_cost = 0;
_hash_table_probing_bytes_per_row = 0;
_fit_L2_cache_max_rows = 0;
_fit_L3_cache_max_rows = 0;
_pushed_chunks = 0;
@ -637,17 +757,70 @@ int64_t AdaptivePartitionHashJoinBuilder::ht_mem_usage() const {
[](int64_t sum, const auto& builder) { return sum + builder->ht_mem_usage(); });
}
Status AdaptivePartitionHashJoinBuilder::_convert_to_single_partition() {
Status AdaptivePartitionHashJoinBuilder::_convert_to_single_partition(RuntimeState* state) {
VLOG_OPERATOR << "TRACE: convert_to_single_partition "
<< "[partition_num=" << _partition_num << "] "
<< "[partition_join_l2_min_rows=" << _partition_join_l2_min_rows << "] "
<< "[partition_join_l2_max_rows=" << _partition_join_l2_max_rows << "] "
<< "[partition_join_l3_min_rows=" << _partition_join_l3_min_rows << "] "
<< "[partition_join_l3_max_rows=" << _partition_join_l3_max_rows << "] "
<< "[hash_table_row_count=" << hash_table_row_count() << "] ";
// merge all partition data to the first partition
for (size_t i = 1; i < _builders.size(); ++i) {
_builders[0]->hash_table().merge_ht(_builders[i]->hash_table());
if (_stage == Stage::BUFFERING) {
_mem_tracker.set(0);
for (const auto& unpartition_chunk : _unpartition_chunks) {
RETURN_IF_ERROR(_builders[0]->do_append_chunk(state, unpartition_chunk));
}
_unpartition_chunks.clear();
} else {
for (size_t i = 0; i < _builders.size(); ++i) {
if (i != 0) {
_builders[0]->hash_table().merge_ht(_builders[i]->hash_table());
}
auto& channel = _partition_input_channels[i];
while (!channel.is_empty()) {
RETURN_IF_ERROR(_builders[0]->do_append_chunk(state, channel.pull()));
}
}
_partition_input_channels.clear();
}
_builders.resize(1);
_partition_num = 1;
COUNTER_SET(_hash_joiner.build_metrics().partition_nums, static_cast<int64_t>(1));
return Status::OK();
}
Status AdaptivePartitionHashJoinBuilder::_append_chunk_to_partitions(const ChunkPtr& chunk) {
// Switches from the BUFFERING stage to APPENDING: replays every chunk that was
// buffered unpartitioned through the partitioning path, then releases the buffer.
Status AdaptivePartitionHashJoinBuilder::_transfer_to_appending_stage(RuntimeState* state) {
    _stage = Stage::APPENDING;
    _mem_tracker.set(0); // All the buffered chunks are moved to the partition builders, so clear the memory tracker.
    for (const auto& unpartition_chunk : _unpartition_chunks) {
        RETURN_IF_ERROR(_append_chunk_to_partitions(state, unpartition_chunk));
    }
    _unpartition_chunks.clear();
    return Status::OK();
}
// Routes an input chunk according to the current stage:
// - BUFFERING: holds the chunk unpartitioned; once the total row count reaches a
//   partition-join lower bound, replays everything via the APPENDING path.
// - APPENDING: partitions the chunk across the per-partition builders directly.
Status AdaptivePartitionHashJoinBuilder::_do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) {
    if (_stage == Stage::BUFFERING) {
        _mem_tracker.consume(chunk->memory_usage());
        _unpartition_chunks.push_back(chunk);
        const size_t num_rows = hash_table_row_count();
        // The min-rows thresholds are initialized to size_t(-1) in
        // _adjust_partition_rows, so this only fires once partitioning is deemed beneficial.
        if (num_rows >= _partition_join_l2_min_rows || num_rows >= _partition_join_l3_min_rows) {
            RETURN_IF_ERROR(_transfer_to_appending_stage(state));
        }
        return Status::OK();
    } else {
        return _append_chunk_to_partitions(state, chunk);
    }
}
Status AdaptivePartitionHashJoinBuilder::_append_chunk_to_partitions(RuntimeState* state, const ChunkPtr& chunk) {
const std::vector<ExprContext*>& build_partition_keys = _hash_joiner.build_expr_ctxs();
size_t num_rows = chunk->num_rows();
@ -660,10 +833,10 @@ Status AdaptivePartitionHashJoinBuilder::_append_chunk_to_partitions(const Chunk
}
std::vector<uint32_t> hash_values;
{
hash_values.assign(num_rows, HashUtil::FNV_SEED);
hash_values.assign(num_rows, 0);
for (const ColumnPtr& column : partition_columns) {
column->fnv_hash(hash_values.data(), 0, num_rows);
column->crc32_hash(hash_values.data(), 0, num_rows);
}
// find partition id
for (size_t i = 0; i < hash_values.size(); ++i) {
@ -698,45 +871,83 @@ Status AdaptivePartitionHashJoinBuilder::_append_chunk_to_partitions(const Chunk
if (size == 0) {
continue;
}
// TODO: make builder implements append with selective
auto partition_chunk = chunk->clone_empty();
partition_chunk->append_selective(*chunk, selection.data(), from, size);
RETURN_IF_ERROR(_builders[i]->append_chunk(std::move(partition_chunk)));
auto& channel = _partition_input_channels[i];
if (channel.is_empty()) {
channel.push(chunk->clone_empty());
}
if (channel.back()->num_rows() + size <= state->chunk_size()) {
channel.append_selective_to_back(*chunk, selection.data(), from, size);
} else {
channel.push(chunk->clone_empty());
channel.append_selective_to_back(*chunk, selection.data(), from, size);
}
while (channel.is_full()) {
RETURN_IF_ERROR(_builders[i]->append_chunk(state, channel.pull()));
}
}
return Status::OK();
}
Status AdaptivePartitionHashJoinBuilder::do_append_chunk(const ChunkPtr& chunk) {
if (_partition_num > 1 && hash_table_row_count() > _partition_join_max_rows) {
RETURN_IF_ERROR(_convert_to_single_partition());
Status AdaptivePartitionHashJoinBuilder::do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) {
if (_partition_num > 1 && !_need_partition_join_for_append(hash_table_row_count())) {
RETURN_IF_ERROR(_convert_to_single_partition(state));
}
if (_partition_num > 1 && ++_pushed_chunks % 8 == 0) {
size_t build_row_size = ht_mem_usage() / hash_table_row_count();
_adjust_partition_rows(build_row_size);
const size_t build_row_size = (ht_mem_usage() + _mem_tracker.consumption()) / hash_table_row_count();
_adjust_partition_rows(build_row_size, _hash_table_probing_bytes_per_row);
if (_partition_num == 1) {
RETURN_IF_ERROR(_convert_to_single_partition());
RETURN_IF_ERROR(_convert_to_single_partition(state));
}
}
if (_partition_num > 1) {
RETURN_IF_ERROR(_append_chunk_to_partitions(chunk));
RETURN_IF_ERROR(_do_append_chunk(state, chunk));
} else {
RETURN_IF_ERROR(_builders[0]->do_append_chunk(chunk));
RETURN_IF_ERROR(_builders[0]->do_append_chunk(state, chunk));
}
return Status::OK();
}
// Before spilling begins, drain any chunks still buffered in the partition
// channels into their hash-table builders. A no-op for single-partition joins.
Status AdaptivePartitionHashJoinBuilder::prepare_for_spill_start(RuntimeState* state) {
    if (_partition_num <= 1) {
        return Status::OK();
    }
    return _flush_buffer_chunks(state);
}
// Delegates spill-schema conversion to the first partition's builder; every
// partition builder is created from the same HashTableParam, so schemas match.
ChunkPtr AdaptivePartitionHashJoinBuilder::convert_to_spill_schema(const ChunkPtr& chunk) const {
    return _builders[0]->convert_to_spill_schema(chunk);
}
// Pushes every chunk still sitting in the per-partition input channels into its
// builder. If we are still BUFFERING, first partition the unpartitioned buffer.
Status AdaptivePartitionHashJoinBuilder::_flush_buffer_chunks(RuntimeState* state) {
    if (_stage == Stage::BUFFERING) {
        RETURN_IF_ERROR(_transfer_to_appending_stage(state));
    }
    for (size_t i = 0; i < _partition_input_channels.size(); ++i) {
        auto& channel = _partition_input_channels[i];
        while (!channel.is_empty()) {
            RETURN_IF_ERROR(_builders[i]->do_append_chunk(state, channel.pull()));
        }
    }
    return Status::OK();
}
Status AdaptivePartitionHashJoinBuilder::build(RuntimeState* state) {
DCHECK_EQ(_partition_num, _builders.size());
if (_partition_num > 1 && hash_table_row_count() < _partition_join_min_rows) {
RETURN_IF_ERROR(_convert_to_single_partition());
if (_partition_num > 1) {
if (!_need_partition_join_for_build(hash_table_row_count())) {
RETURN_IF_ERROR(_convert_to_single_partition(state));
} else {
RETURN_IF_ERROR(_flush_buffer_chunks(state));
}
}
for (auto& builder : _builders) {
@ -769,17 +980,20 @@ std::unique_ptr<HashJoinProberImpl> AdaptivePartitionHashJoinBuilder::create_pro
}
}
void AdaptivePartitionHashJoinBuilder::clone_readable(HashJoinBuilder* builder) {
void AdaptivePartitionHashJoinBuilder::clone_readable(HashJoinBuilder* other_builder) {
for (auto& builder : _builders) {
DCHECK(builder->ready());
}
DCHECK(_ready);
DCHECK_EQ(_partition_num, _builders.size());
auto other = down_cast<AdaptivePartitionHashJoinBuilder*>(builder);
auto other = down_cast<AdaptivePartitionHashJoinBuilder*>(other_builder);
other->_builders.clear();
other->_partition_num = _partition_num;
other->_partition_join_max_rows = _partition_join_max_rows;
other->_partition_join_min_rows = _partition_join_min_rows;
other->_partition_join_l2_min_rows = _partition_join_l2_min_rows;
other->_partition_join_l2_max_rows = _partition_join_l2_max_rows;
other->_partition_join_l3_min_rows = _partition_join_l3_min_rows;
other->_partition_join_l3_max_rows = _partition_join_l3_max_rows;
other->_partition_join_l3_max_rows = _partition_join_l3_max_rows;
other->_ready = _ready;
for (size_t i = 0; i < _partition_num; ++i) {
other->_builders.emplace_back(std::make_unique<SingleHashJoinBuilder>(_hash_joiner));

View File

@ -92,11 +92,11 @@ public:
virtual void create(const HashTableParam& param) = 0;
// append chunk to hash table
Status append_chunk(const ChunkPtr& chunk) {
Status append_chunk(RuntimeState* state, const ChunkPtr& chunk) {
_inc_row_count(chunk->num_rows());
return do_append_chunk(chunk);
return do_append_chunk(state, chunk);
}
virtual Status do_append_chunk(const ChunkPtr& chunk) = 0;
virtual Status do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) = 0;
virtual Status build(RuntimeState* state) = 0;
@ -125,6 +125,7 @@ public:
// clone readable to to builder
virtual void clone_readable(HashJoinBuilder* builder) = 0;
virtual Status prepare_for_spill_start(RuntimeState* state) { return Status::OK(); }
virtual ChunkPtr convert_to_spill_schema(const ChunkPtr& chunk) const = 0;
protected:
@ -149,7 +150,7 @@ public:
void reset(const HashTableParam& param) override;
Status do_append_chunk(const ChunkPtr& chunk) override;
Status do_append_chunk(RuntimeState* state, const ChunkPtr& chunk) override;
Status build(RuntimeState* state) override;

View File

@ -126,6 +126,14 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) {
_build_equivalence_partition_expr_ctxs = _build_expr_ctxs;
}
if (tnode.__isset.hash_join_node && tnode.hash_join_node.__isset.common_slot_map) {
for (const auto& [key, val] : tnode.hash_join_node.common_slot_map) {
ExprContext* context;
RETURN_IF_ERROR(Expr::create_expr_tree(_pool, val, &context, state, true));
_common_expr_ctxs.insert({key, context});
}
}
RETURN_IF_ERROR(Expr::create_expr_trees(_pool, tnode.hash_join_node.other_join_conjuncts,
&_other_join_conjunct_ctxs, state));
@ -483,8 +491,8 @@ pipeline::OpFactories HashJoinNode::_decompose_to_pipeline(pipeline::PipelineBui
HashJoinerParam param(pool, _hash_join_node, _is_null_safes, _build_expr_ctxs, _probe_expr_ctxs,
_other_join_conjunct_ctxs, _conjunct_ctxs, child(1)->row_desc(), child(0)->row_desc(),
child(1)->type(), child(0)->type(), child(1)->conjunct_ctxs().empty(), _build_runtime_filters,
_output_slots, _output_slots, _distribution_mode, _enable_late_materialization,
_enable_partition_hash_join, _is_skew_join);
_output_slots, _output_slots, context->degree_of_parallelism(), _distribution_mode,
_enable_late_materialization, _enable_partition_hash_join, _is_skew_join, _common_expr_ctxs);
auto hash_joiner_factory = std::make_shared<starrocks::pipeline::HashJoinerFactory>(param);
// Create a shared RefCountedRuntimeFilterCollector

View File

@ -140,6 +140,8 @@ private:
bool _probe_eos = false; // probe table scan finished;
size_t _runtime_join_filter_pushdown_limit = 1024000;
std::map<SlotId, ExprContext*> _common_expr_ctxs;
RuntimeProfile::Counter* _build_timer = nullptr;
RuntimeProfile::Counter* _build_ht_timer = nullptr;
RuntimeProfile::Counter* _copy_right_table_chunk_timer = nullptr;

View File

@ -33,6 +33,7 @@
#include "pipeline/hashjoin/hash_joiner_fwd.h"
#include "runtime/current_thread.h"
#include "simd/simd.h"
#include "storage/chunk_helper.h"
#include "util/runtime_profile.h"
namespace starrocks {
@ -73,6 +74,7 @@ HashJoiner::HashJoiner(const HashJoinerParam& param)
_probe_expr_ctxs(param._probe_expr_ctxs),
_other_join_conjunct_ctxs(param._other_join_conjunct_ctxs),
_conjunct_ctxs(param._conjunct_ctxs),
_common_expr_ctxs(param._common_expr_ctxs),
_build_row_descriptor(param._build_row_descriptor),
_probe_row_descriptor(param._probe_row_descriptor),
_build_node_type(param._build_node_type),
@ -82,6 +84,7 @@ HashJoiner::HashJoiner(const HashJoinerParam& param)
_probe_output_slots(param._probe_output_slots),
_build_runtime_filters(param._build_runtime_filters.begin(), param._build_runtime_filters.end()),
_enable_late_materialization(param._enable_late_materialization),
_max_dop(param._max_dop),
_is_skew_join(param._is_skew_join) {
_is_push_down = param._hash_join_node.is_push_down;
if (_join_type == TJoinOp::LEFT_ANTI_JOIN && param._hash_join_node.is_rewritten_from_not_in) {
@ -157,6 +160,11 @@ void HashJoiner::_init_hash_table_param(HashTableParam* param, RuntimeState* sta
param->column_view_concat_rows_limit = state->column_view_concat_rows_limit();
param->column_view_concat_bytes_limit = state->column_view_concat_bytes_limit();
std::set<SlotId> predicate_slots;
for (const auto& [slot_id, ctx] : _common_expr_ctxs) {
std::vector<SlotId> expr_slots;
ctx->root()->get_slot_ids(&expr_slots);
predicate_slots.insert(expr_slots.begin(), expr_slots.end());
}
for (ExprContext* expr_context : _conjunct_ctxs) {
std::vector<SlotId> expr_slots;
expr_context->root()->get_slot_ids(&expr_slots);
@ -178,7 +186,7 @@ void HashJoiner::_init_hash_table_param(HashTableParam* param, RuntimeState* sta
}
}
}
Status HashJoiner::append_chunk_to_ht(const ChunkPtr& chunk) {
Status HashJoiner::append_chunk_to_ht(RuntimeState* state, const ChunkPtr& chunk) {
if (_phase != HashJoinPhase::BUILD) {
return Status::OK();
}
@ -187,7 +195,7 @@ Status HashJoiner::append_chunk_to_ht(const ChunkPtr& chunk) {
}
update_build_rows(chunk->num_rows());
return _hash_join_builder->append_chunk(chunk);
return _hash_join_builder->append_chunk(state, chunk);
}
Status HashJoiner::append_chunk_to_spill_buffer(RuntimeState* state, const ChunkPtr& chunk) {
@ -387,6 +395,9 @@ Status HashJoiner::_calc_filter_for_other_conjunct(ChunkPtr* chunk, Filter& filt
hit_all = false;
filter.assign((*chunk)->num_rows(), 1);
CommonExprEvalScopeGuard guard(*chunk, _common_expr_ctxs);
RETURN_IF_ERROR(guard.evaluate());
for (auto* ctx : _other_join_conjunct_ctxs) {
ASSIGN_OR_RETURN(ColumnPtr column, ctx->evaluate((*chunk).get()))
size_t true_count = ColumnHelper::count_true_with_notnull(column);
@ -515,6 +526,8 @@ Status HashJoiner::_process_other_conjunct(ChunkPtr* chunk, JoinHashTable& hash_
Status HashJoiner::_process_where_conjunct(ChunkPtr* chunk) {
SCOPED_TIMER(probe_metrics().where_conjunct_evaluate_timer);
CommonExprEvalScopeGuard guard(*chunk, _common_expr_ctxs);
RETURN_IF_ERROR(guard.evaluate());
return ExecNode::eval_conjuncts(_conjunct_ctxs, (*chunk).get());
}

View File

@ -70,9 +70,10 @@ struct HashJoinerParam {
const RowDescriptor& build_row_descriptor, const RowDescriptor& probe_row_descriptor,
TPlanNodeType::type build_node_type, TPlanNodeType::type probe_node_type,
bool build_conjunct_ctxs_is_empty, std::list<RuntimeFilterBuildDescriptor*> build_runtime_filters,
std::set<SlotId> build_output_slots, std::set<SlotId> probe_output_slots,
std::set<SlotId> build_output_slots, std::set<SlotId> probe_output_slots, size_t max_dop,
const TJoinDistributionMode::type distribution_mode, bool enable_late_materialization,
bool enable_partition_hash_join, bool is_skew_join)
bool enable_partition_hash_join, bool is_skew_join,
const std::map<SlotId, ExprContext*>& common_expr_ctxs)
: _pool(pool),
_hash_join_node(hash_join_node),
_is_null_safes(std::move(is_null_safes)),
@ -88,10 +89,12 @@ struct HashJoinerParam {
_build_runtime_filters(std::move(build_runtime_filters)),
_build_output_slots(std::move(build_output_slots)),
_probe_output_slots(std::move(probe_output_slots)),
_max_dop(max_dop),
_distribution_mode(distribution_mode),
_enable_late_materialization(enable_late_materialization),
_enable_partition_hash_join(enable_partition_hash_join),
_is_skew_join(is_skew_join) {}
_is_skew_join(is_skew_join),
_common_expr_ctxs(common_expr_ctxs) {}
HashJoinerParam(HashJoinerParam&&) = default;
HashJoinerParam(HashJoinerParam&) = default;
@ -113,10 +116,13 @@ struct HashJoinerParam {
std::set<SlotId> _build_output_slots;
std::set<SlotId> _probe_output_slots;
size_t _max_dop;
const TJoinDistributionMode::type _distribution_mode;
const bool _enable_late_materialization;
const bool _enable_partition_hash_join;
const bool _is_skew_join;
const std::map<SlotId, ExprContext*> _common_expr_ctxs;
};
inline bool could_short_circuit(TJoinOp::type join_type) {
@ -205,7 +211,7 @@ public:
void enter_eos_phase() { _phase = HashJoinPhase::EOS; }
// build phase
Status append_chunk_to_ht(const ChunkPtr& chunk);
Status append_chunk_to_ht(RuntimeState* state, const ChunkPtr& chunk);
Status append_chunk_to_spill_buffer(RuntimeState* state, const ChunkPtr& chunk);
@ -343,6 +349,9 @@ public:
return DeferOp([this]() { _probe_observable.notify_source_observers(); });
}
size_t max_dop() const { return _max_dop; }
TJoinDistributionMode::type distribution_mode() const { return _hash_join_node.distribution_mode; }
private:
static bool _has_null(const ColumnPtr& column);
@ -361,7 +370,7 @@ private:
const_column->data_column()->assign(chunk->num_rows(), 0);
key_columns.emplace_back(const_column->data_column());
} else {
key_columns.emplace_back(column_ptr);
key_columns.emplace_back(std::move(column_ptr));
}
}
return Status::OK();
@ -433,6 +442,7 @@ private:
const std::vector<ExprContext*>& _other_join_conjunct_ctxs;
// Conjuncts in Join followed by a filter predicate, usually in Where and Having.
const std::vector<ExprContext*>& _conjunct_ctxs;
const std::map<SlotId, ExprContext*>& _common_expr_ctxs;
const RowDescriptor& _build_row_descriptor;
const RowDescriptor& _probe_row_descriptor;
const TPlanNodeType::type _build_node_type;
@ -483,6 +493,8 @@ private:
pipeline::Observable _builder_observable;
pipeline::Observable _probe_observable;
size_t _max_dop = 0;
bool _is_skew_join = false;
};

View File

@ -237,8 +237,9 @@ Status HdfsScanner::get_next(RuntimeState* runtime_state, ChunkPtr* chunk) {
// short circuit for min/max optimization.
if (_scanner_ctx.can_use_min_max_optimization()) {
// 3 means we output 3 values: min, max, and null
_scanner_ctx.append_or_update_min_max_column_to_chunk(chunk, 3);
size_t row_count = (*chunk)->num_rows();
const size_t row_count = 3;
(*chunk)->set_num_rows(row_count);
_scanner_ctx.append_or_update_min_max_column_to_chunk(chunk, row_count);
_scanner_ctx.append_or_update_partition_column_to_chunk(chunk, row_count);
_scanner_ctx.append_or_update_extended_column_to_chunk(chunk, row_count);
_scanner_ctx.no_more_chunks = true;

View File

@ -26,6 +26,7 @@
#include "simd/simd.h"
#include "types/logical_type_infra.h"
#include "util/runtime_profile.h"
#include "util/stack_util.h"
namespace starrocks {
@ -47,6 +48,10 @@ private:
template <LogicalType LT>
static std::pair<bool, JoinHashMapMethodUnaryType> _try_use_range_direct_mapping(RuntimeState* state,
JoinHashTableItems* table_items);
// @return: <can_use, JoinHashMapMethodUnaryType>, where `JoinHashMapMethodUnaryType` is effective only when `can_use` is true.
template <LogicalType LT>
static std::pair<bool, JoinHashMapMethodUnaryType> _try_use_linear_chained(RuntimeState* state,
JoinHashTableItems* table_items);
};
std::tuple<JoinKeyConstructorUnaryType, JoinHashMapMethodUnaryType>
@ -152,6 +157,10 @@ JoinHashMapMethodUnaryType JoinHashMapSelector::_determine_hash_map_method(
}
}
if (const auto [can_use, hash_map_type] = _try_use_linear_chained<LT>(state, table_items); can_use) {
return hash_map_type;
}
return JoinHashMapMethodTypeTraits<JoinHashMapMethodType::BUCKET_CHAINED, LT>::unary_type;
}
});
@ -220,6 +229,28 @@ std::pair<bool, JoinHashMapMethodUnaryType> JoinHashMapSelector::_try_use_range_
return {false, JoinHashMapMethodUnaryType::BUCKET_CHAINED_INT};
}
template <LogicalType LT>
std::pair<bool, JoinHashMapMethodUnaryType> JoinHashMapSelector::_try_use_linear_chained(
RuntimeState* state, JoinHashTableItems* table_items) {
if (!state->enable_hash_join_linear_chained_opt()) {
return {false, JoinHashMapMethodTypeTraits<JoinHashMapMethodType::BUCKET_CHAINED, LT>::unary_type};
}
const uint64_t bucket_size = JoinHashMapHelper::calc_bucket_size(table_items->row_count + 1);
if (bucket_size > LinearChainedJoinHashMap<LT>::max_supported_bucket_size()) {
return {false, JoinHashMapMethodTypeTraits<JoinHashMapMethodType::BUCKET_CHAINED, LT>::unary_type};
}
const bool is_left_anti_join_without_other_conjunct =
(table_items->join_type == TJoinOp::LEFT_ANTI_JOIN || table_items->join_type == TJoinOp::LEFT_SEMI_JOIN) &&
!table_items->with_other_conjunct;
if (is_left_anti_join_without_other_conjunct) {
return {true, JoinHashMapMethodTypeTraits<JoinHashMapMethodType::LINEAR_CHAINED_SET, LT>::unary_type};
} else {
return {true, JoinHashMapMethodTypeTraits<JoinHashMapMethodType::LINEAR_CHAINED, LT>::unary_type};
}
}
// ------------------------------------------------------------------------------------
// JoinHashMap
// ------------------------------------------------------------------------------------
@ -483,6 +514,15 @@ void JoinHashTable::_init_join_keys() {
}
int64_t JoinHashTable::mem_usage() const {
// Theoretically, `_table_items` may be a nullptr after a cancel, even though in practice we havent observed any
// cases where `_table_items` was unexpectedly cleared or left uninitialized.
// To prevent potential null pointer exceptions, we add a defensive check here.
if (_table_items == nullptr) {
LOG(WARNING) << "table_items is nullptr in mem_usage, stack:" << get_stack_trace();
DCHECK(false);
return 0;
}
int64_t usage = 0;
if (_table_items->build_chunk != nullptr) {
usage += _table_items->build_chunk->memory_usage();
@ -617,6 +657,21 @@ void JoinHashTable::merge_ht(const JoinHashTable& ht) {
}
columns[i]->append(*other_columns[i], 1, other_columns[i]->size() - 1);
}
auto& key_columns = _table_items->key_columns;
auto& other_key_columns = ht._table_items->key_columns;
for (size_t i = 0; i < key_columns.size(); i++) {
// If the join key is slot ref, will get from build chunk directly,
// otherwise will append from key_column of input
if (_table_items->join_keys[i].col_ref == nullptr) {
// upgrade to nullable column
if (!key_columns[i]->is_nullable() && other_key_columns[i]->is_nullable()) {
const size_t row_count = key_columns[i]->size();
key_columns[i] = NullableColumn::create(key_columns[i], NullColumn::create(row_count, 0));
}
key_columns[i]->append(*other_key_columns[i]);
}
}
}
ChunkPtr JoinHashTable::convert_to_spill_schema(const ChunkPtr& chunk) const {

View File

@ -327,26 +327,6 @@ private:
HashTableProbeState* _probe_state = nullptr;
};
#define JoinHashMapForOneKey(LT) JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::BUCKET_CHAINED>
#define JoinHashMapForDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::DIRECT_MAPPING>
#define JoinHashMapForFixedSizeKey(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::BUCKET_CHAINED>
#define JoinHashMapForSerializedKey(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED, JoinHashMapMethodType::BUCKET_CHAINED>
#define JoinHashMapForOneKeyRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::RANGE_DIRECT_MAPPING>
#define JoinHashSetForOneKeyRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::RANGE_DIRECT_MAPPING_SET>
#define JoinHashMapForOneKeyDenseRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::DENSE_RANGE_DIRECT_MAPPING>
#define JoinHashMapForFixedSizeKeyRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::RANGE_DIRECT_MAPPING>
#define JoinHashSetForFixedSizeKeyRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::RANGE_DIRECT_MAPPING_SET>
#define JoinHashMapForFixedSizeKeyDenseRangeDirectMapping(LT) \
JoinHashMap<LT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::DENSE_RANGE_DIRECT_MAPPING>
// ------------------------------------------------------------------------------------
// JoinHashTable
// ------------------------------------------------------------------------------------
@ -420,42 +400,55 @@ private:
void _remove_duplicate_index_for_right_anti_join(Filter* filter);
void _remove_duplicate_index_for_full_outer_join(Filter* filter);
using JoinHashMapVariant =
std::variant<std::unique_ptr<JoinHashMapForEmpty>, //
std::unique_ptr<JoinHashMapForDirectMapping(TYPE_BOOLEAN)>,
std::unique_ptr<JoinHashMapForDirectMapping(TYPE_TINYINT)>,
std::unique_ptr<JoinHashMapForDirectMapping(TYPE_SMALLINT)>,
#define JoinHashMapForIntBigintKey(MT) \
std::unique_ptr<JoinHashMap<TYPE_INT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_BIGINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr< \
JoinHashMap<TYPE_INT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_BIGINT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, \
JoinHashMapMethodType::MT>>
std::unique_ptr<JoinHashMapForOneKey(TYPE_INT)>, //
std::unique_ptr<JoinHashMapForOneKey(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_LARGEINT)>, //
std::unique_ptr<JoinHashMapForOneKey(TYPE_FLOAT)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DOUBLE)>, //
std::unique_ptr<JoinHashMapForOneKey(TYPE_VARCHAR)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DATE)>, //
std::unique_ptr<JoinHashMapForOneKey(TYPE_DATETIME)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DECIMALV2)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DECIMAL32)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DECIMAL64)>,
std::unique_ptr<JoinHashMapForOneKey(TYPE_DECIMAL128)>,
#define JoinHashMapForSmallKey(MT) \
std::unique_ptr<JoinHashMap<TYPE_BOOLEAN, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_TINYINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_SMALLINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>
std::unique_ptr<JoinHashMapForSerializedKey(TYPE_VARCHAR)>,
std::unique_ptr<JoinHashMapForFixedSizeKey(TYPE_INT)>,
std::unique_ptr<JoinHashMapForFixedSizeKey(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForFixedSizeKey(TYPE_LARGEINT)>,
#define JoinHashMapForNonSmallKey(MT) \
std::unique_ptr<JoinHashMap<TYPE_INT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_BIGINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_LARGEINT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_FLOAT, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DOUBLE, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DATE, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DATETIME, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DECIMALV2, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DECIMAL32, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DECIMAL64, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_DECIMAL128, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_VARCHAR, JoinKeyConstructorType::ONE_KEY, JoinHashMapMethodType::MT>>, \
\
std::unique_ptr< \
JoinHashMap<TYPE_INT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_BIGINT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, \
JoinHashMapMethodType::MT>>, \
std::unique_ptr<JoinHashMap<TYPE_LARGEINT, JoinKeyConstructorType::SERIALIZED_FIXED_SIZE, \
JoinHashMapMethodType::MT>>, \
\
std::unique_ptr<JoinHashMap<TYPE_VARCHAR, JoinKeyConstructorType::SERIALIZED, JoinHashMapMethodType::MT>>
std::unique_ptr<JoinHashMapForOneKeyRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashMapForOneKeyRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashSetForOneKeyRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashSetForOneKeyRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForOneKeyDenseRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashMapForOneKeyDenseRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForFixedSizeKeyRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashMapForFixedSizeKeyRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashSetForFixedSizeKeyRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashSetForFixedSizeKeyRangeDirectMapping(TYPE_BIGINT)>,
std::unique_ptr<JoinHashMapForFixedSizeKeyDenseRangeDirectMapping(TYPE_INT)>,
std::unique_ptr<JoinHashMapForFixedSizeKeyDenseRangeDirectMapping(TYPE_BIGINT)>>;
using JoinHashMapVariant = std::variant<std::unique_ptr<JoinHashMapForEmpty>,
JoinHashMapForSmallKey(DIRECT_MAPPING), //
JoinHashMapForNonSmallKey(BUCKET_CHAINED), //
JoinHashMapForNonSmallKey(LINEAR_CHAINED), //
JoinHashMapForNonSmallKey(LINEAR_CHAINED_SET), //
JoinHashMapForIntBigintKey(RANGE_DIRECT_MAPPING), //
JoinHashMapForIntBigintKey(RANGE_DIRECT_MAPPING_SET), //
JoinHashMapForIntBigintKey(DENSE_RANGE_DIRECT_MAPPING) //
>;
#undef JoinHashMapForNonSmallKey
#undef JoinHashMapForSmallKey
#undef JoinHashMapForIntBigintKey
bool _is_empty_map = true;
JoinKeyConstructorUnaryType _key_constructor_type;

View File

@ -400,7 +400,7 @@ void JoinHashMap<LT, CT, MT>::_search_ht(RuntimeState* state, ChunkPtr* probe_ch
auto& build_data = BuildKeyConstructor().get_key_data(*_table_items);
auto& probe_data = ProbeKeyConstructor().get_key_data(*_probe_state);
HashMapMethod().lookup_init(*_table_items, _probe_state, probe_data, _probe_state->null_array);
HashMapMethod().lookup_init(*_table_items, _probe_state, build_data, probe_data, _probe_state->null_array);
_probe_state->consider_probe_time_locality();
if (_table_items->is_collision_free_and_unique) {
@ -629,9 +629,11 @@ void JoinHashMap<LT, CT, MT>::_search_ht_impl(RuntimeState* state, const Buffer<
#define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#endif
#define PREFETCH_AND_COWAIT(x, y) \
XXH_PREFETCH(x); \
XXH_PREFETCH(y); \
#define PREFETCH_AND_COWAIT(cur_data, next_index) \
if constexpr (!HashMapMethod::AreKeysInChainIdentical) { \
XXH_PREFETCH(cur_data); \
} \
XXH_PREFETCH(next_index); \
co_await std::suspend_always{};
// When a probe row corresponds to multiple Build rows,
@ -994,6 +996,19 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_semi_join(RuntimeState* st
}
}
if (match_count == probe_row_count) {
_probe_state->match_flag = JoinMatchFlag::ALL_MATCH_ONE;
} else if (match_count * 2 >= probe_row_count) {
_probe_state->match_flag = JoinMatchFlag::MOST_MATCH_ONE;
uint8_t* match_filter_data = _probe_state->probe_match_filter.data();
memset(match_filter_data, 0, sizeof(uint8_t) * probe_row_count);
for (uint32_t i = 0; i < match_count; i++) {
match_filter_data[_probe_state->probe_index[i]] = 1;
}
} else {
_probe_state->match_flag = JoinMatchFlag::NORMAL;
}
PROBE_OVER()
}
@ -1001,10 +1016,10 @@ template <LogicalType LT, JoinKeyConstructorType CT, JoinHashMapMethodType MT>
template <bool first_probe, bool is_collision_free_and_unique>
void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_anti_join(RuntimeState* state, const Buffer<CppType>& build_data,
const Buffer<CppType>& probe_data) {
size_t match_count = 0;
size_t probe_row_count = _probe_state->probe_row_count;
DCHECK_LT(0, _table_items->row_count);
size_t match_count = 0;
const size_t probe_row_count = _probe_state->probe_row_count;
if (_table_items->join_type == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && _probe_state->null_array != nullptr) {
// process left anti join from not in
for (size_t i = 0; i < probe_row_count; i++) {
@ -1022,6 +1037,19 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_anti_join(RuntimeState* st
}
}
if (match_count == probe_row_count) {
_probe_state->match_flag = JoinMatchFlag::ALL_MATCH_ONE;
} else if (match_count * 2 >= probe_row_count) {
_probe_state->match_flag = JoinMatchFlag::MOST_MATCH_ONE;
uint8_t* match_filter_data = _probe_state->probe_match_filter.data();
memset(match_filter_data, 0, sizeof(uint8_t) * probe_row_count);
for (uint32_t i = 0; i < match_count; i++) {
match_filter_data[_probe_state->probe_index[i]] = 1;
}
} else {
_probe_state->match_flag = JoinMatchFlag::NORMAL;
}
PROBE_OVER()
}
@ -1122,7 +1150,13 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_right_outer_join(RuntimeState*
_probe_state->build_match_index[build_index] = 1;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
@ -1190,9 +1224,15 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_right_semi_join(RuntimeState* s
_probe_state->build_match_index[build_index] = 1;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
}
@ -1247,6 +1287,10 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_right_anti_join(RuntimeState* s
if (HashMapMethod().equal(build_data[index], probe_data[i])) {
_probe_state->build_match_index[index] = 1;
}
if constexpr (is_collision_free_and_unique) {
break;
}
index = _table_items->next[index];
}
}
@ -1311,7 +1355,13 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_full_outer_join(RuntimeState* s
_probe_state->cur_row_match_count++;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
@ -1399,8 +1449,15 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_semi_join_with_other_conju
_probe_state->build_index[match_count] = build_index;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
}
@ -1463,8 +1520,15 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_null_aware_anti_join_with_other
match_count++;
_probe_state->cur_row_match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
@ -1503,7 +1567,13 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_right_outer_right_semi_right_an
_probe_state->build_index[match_count] = build_index;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}
@ -1552,7 +1622,13 @@ void JoinHashMap<LT, CT, MT>::_probe_from_ht_for_left_outer_left_anti_full_outer
_probe_state->cur_row_match_count++;
match_count++;
RETURN_IF_CHUNK_FULL()
if constexpr (!is_collision_free_and_unique) {
RETURN_IF_CHUNK_FULL()
}
}
if constexpr (is_collision_free_and_unique) {
break;
}
build_index = _table_items->next[build_index];
}

View File

@ -56,16 +56,101 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = false;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return x == y; }
};
// The `LinearChainedJoinHashMap` uses linear probing to store distinct keys and chained to storage for linked lists of
// identical keys.
// - `first` stores the build index of the header for the linked list for each distinct key.
// - `next` maintains the linked list structure for each distinct key.
//
// Fingerprint
// - Each `first` entry uses the highest 1 byte to store the fingerprint and the lower 3 bytes for the build index,
// thus supporting up to 0xFFFFFF buckets.
// - The fingerprint is generated via hashing.
// During hashing, `bucket_num_with_fp = hash % (bucket_size * 8)` is computed instead of `hash % bucket_size`.
// - The lower 8 bits of `bucket_num_with_fp` represent the fingerprint (`fp`),
// - while `bucket_num_with_fp >> 8` yields the bucket number.
//
// Insert and probe
// - During insertion, linear probing is used in `first` to locate either the first empty bucket or an existing matching key.
// The new build index is then inserted into the corresponding linked list in `next`.
// - During probing, linear probing is used in `first` to locate either an empty bucket or the bucket_num for a matching key.
// - If an empty bucket is found, it indicates no matching key exists.
// - If a matching key exists, the entire linked list (with `first[bucket_num]` as its header) in `next` stores build
// indexes for all the same keys.
//
// The following diagram illustrates the structure of `LinearChainedJoinHashMap`:
//
// build keys first next
// ┌──────────────┐ ┌───┐
// │FP|build_index│ │ │◄───┐
// │1B 3B │ │ │◄┐ │
// ├──────────────┤ ├───┤ │ │
// ┌───────►│ │ │ │ │ │
// ┌────┐ │ ┌──┤ │ │ │ │ │
// ┌──────┐ │ │ │ │ ├──────────────┤ ├───┤ │ │
// │ key ├─►│hash├───┘ └─►│ │ │ ├─┘ │
// └──────┘ │ │ ┌──┤ │ │ │◄─┐ │
// └────┘ │ ├──────────────┤ ├───┤ │ │
// │ │ │ │ │ │ │
// │ │ │ │ │ │ │
// │ ├──────────────┤ ├───┤ │ │
// └─►│ ├──►│ │ │ │
// │ │ │ ├──┘ │
// ├──────────────┤ ├───┤ │
// │ │ │ │ │
// │ │ │ │ │
// ├──────────────┤ ├───┤ │
// │ │ │ │ │
// │ ├──►│ ├────┘
// └──────────────┘ └───┘
template <LogicalType LT, bool NeedBuildChained = true>
class LinearChainedJoinHashMap {
public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
static uint32_t max_supported_bucket_size() { return DATA_MASK; }
private:
static constexpr uint32_t FP_BITS = 8;
static constexpr uint32_t FP_MASK = 0xFF00'0000ul;
static constexpr uint32_t DATA_MASK = 0x00FF'FFFFul;
static uint32_t _combine_data_fp(const uint32_t data, const uint32_t fp) { return fp | data; }
static uint32_t _extract_data(const uint32_t v) { return v & DATA_MASK; }
static uint32_t _extract_fp(const uint32_t v) { return v & FP_MASK; }
static uint32_t _get_bucket_num_from_hash(const uint32_t hash) { return hash >> FP_BITS; }
static uint32_t _get_fp_from_hash(const uint32_t hash) { return hash << (32 - FP_BITS); }
};
template <LogicalType LT>
using LinearChainedJoinHashSet = LinearChainedJoinHashMap<LT, false>;
// The bucket-chained linked list formed by first` and `next` is the same as that of `BucketChainedJoinHashMap`.
//
// `DirectMappingJoinHashMap` maps to a position in `first` using `key-MIN_VALUE`.
@ -101,12 +186,15 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
};
@ -149,12 +237,15 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
};
@ -168,12 +259,15 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
};
@ -221,12 +315,15 @@ public:
using CppType = typename RunTimeTypeTraits<LT>::CppType;
using ColumnType = typename RunTimeTypeTraits<LT>::ColumnType;
static constexpr bool AreKeysInChainIdentical = true;
static void build_prepare(RuntimeState* state, JoinHashTableItems* table_items);
static void construct_hash_table(JoinHashTableItems* table_items, const Buffer<CppType>& keys,
const Buffer<uint8_t>* is_nulls);
static void lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls);
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls);
static bool equal(const CppType& x, const CppType& y) { return true; }
};

View File

@ -84,7 +84,8 @@ void BucketChainedJoinHashMap<LT>::construct_hash_table(JoinHashTableItems* tabl
template <LogicalType LT>
void BucketChainedJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls) {
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
const uint32_t row_count = probe_state->probe_row_count;
const auto* firsts = table_items.first.data();
const auto* buckets = probe_state->buckets.data();
@ -92,8 +93,8 @@ void BucketChainedJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_i
if (is_nulls == nullptr) {
for (uint32_t i = 0; i < row_count; i++) {
probe_state->buckets[i] = JoinHashMapHelper::calc_bucket_num<CppType>(keys[i], table_items.bucket_size,
table_items.log_bucket_size);
probe_state->buckets[i] = JoinHashMapHelper::calc_bucket_num<CppType>(
probe_keys[i], table_items.bucket_size, table_items.log_bucket_size);
}
SIMDGather::gather(nexts, firsts, buckets, row_count);
} else {
@ -107,14 +108,197 @@ void BucketChainedJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_i
};
for (uint32_t i = 0; i < row_count; i++) {
if (need_calc_bucket_num(i)) {
probe_state->buckets[i] = JoinHashMapHelper::calc_bucket_num<CppType>(keys[i], table_items.bucket_size,
table_items.log_bucket_size);
probe_state->buckets[i] = JoinHashMapHelper::calc_bucket_num<CppType>(
probe_keys[i], table_items.bucket_size, table_items.log_bucket_size);
}
}
SIMDGather::gather(nexts, firsts, buckets, is_nulls_data, row_count);
}
}
// ------------------------------------------------------------------------------------
// LinearChainedJoinHashMap
// ------------------------------------------------------------------------------------
template <LogicalType LT, bool NeedBuildChained>
void LinearChainedJoinHashMap<LT, NeedBuildChained>::build_prepare(RuntimeState* state,
                                                                   JoinHashTableItems* table_items) {
    // Size the table for all build rows plus one: row index 0 acts as the
    // "empty" sentinel, so usable rows are 1..row_count.
    const auto num_slots = table_items->row_count + 1;
    const auto num_buckets = JoinHashMapHelper::calc_bucket_size(num_slots);

    table_items->bucket_size = num_buckets;
    // `calc_bucket_size` is expected to return a power of two, so ctz acts as log2.
    table_items->log_bucket_size = __builtin_ctz(num_buckets);

    // Zero-fill: 0 marks an empty bucket in `first` and end-of-chain in `next`.
    table_items->first.resize(num_buckets, 0);
    table_items->next.resize(num_slots, 0);
}
template <LogicalType LT, bool NeedBuildChained>
void LinearChainedJoinHashMap<LT, NeedBuildChained>::construct_hash_table(JoinHashTableItems* table_items,
                                                                          const Buffer<CppType>& keys,
                                                                          const Buffer<uint8_t>* is_nulls) {
    // Builds an open-addressing table over build rows 1..row_count (row 0 is the
    // "empty" sentinel). Each occupied `first[bucket]` packs a row index in the low
    // DATA_MASK bits and an 8-bit fingerprint in the high FP_BITS bits. Collisions
    // are resolved by probing with increasing step sizes (1, 2, 3, ... — triangular
    // probing on a power-of-two table).
    // When `NeedBuildChained` is true, rows with equal keys are additionally linked
    // through `next` (the newest row becomes the chain head); otherwise only the
    // first inserted row of each distinct key is kept and `next` is discarded.
    auto process = [&]<bool IsNullable>() {
        const auto num_rows = 1 + table_items->row_count;
        const uint32_t bucket_size_mask = table_items->bucket_size - 1;
        auto* __restrict next = table_items->next.data();
        auto* __restrict first = table_items->first.data();
        const uint8_t* __restrict is_nulls_data = IsNullable ? is_nulls->data() : nullptr;

        auto need_calc_bucket_num = [&](const uint32_t index) {
            // Only check `is_nulls_data[i]` for the nullable slice type. The hash calculation overhead for
            // fixed-size types is small, and thus we do not check it to allow vectorization of the hash calculation.
            if constexpr (!IsNullable || !std::is_same_v<CppType, Slice>) {
                return true;
            } else {
                return is_nulls_data[index] == 0;
            }
        };
        auto is_null = [&](const uint32_t index) {
            if constexpr (!IsNullable) {
                return false;
            } else {
                return is_nulls_data[index] != 0;
            }
        };

        // Pass 1: reuse `next[i]` as scratch storage for row i's hash value so the
        // (vectorizable) hash computation is separated from the probing pass.
        for (uint32_t i = 1; i < num_rows; i++) {
            if (need_calc_bucket_num(i)) {
                next[i] = JoinHashMapHelper::calc_bucket_num<CppType>(keys[i], table_items->bucket_size << FP_BITS,
                                                                     table_items->log_bucket_size + FP_BITS);
            }
        }

        // Pass 2: insert each row, prefetching the bucket 16 rows ahead to hide
        // cache misses on `first`.
        for (uint32_t i = 1; i < num_rows; i++) {
            if (i + 16 < num_rows && !is_null(i + 16)) {
                __builtin_prefetch(first + _get_bucket_num_from_hash(next[i + 16]));
            }

            if (is_null(i)) {
                next[i] = 0; // NULL rows never match; terminate their chain.
                continue;
            }

            const uint32_t hash = next[i];
            const uint32_t fp = _get_fp_from_hash(hash);
            uint32_t bucket_num = _get_bucket_num_from_hash(hash);
            uint32_t probe_times = 1;
            while (true) {
                if (first[bucket_num] == 0) {
                    // Empty bucket: claim it for row i (chain of length 1).
                    if constexpr (NeedBuildChained) {
                        next[i] = 0;
                    }
                    first[bucket_num] = _combine_data_fp(i, fp);
                    break;
                }
                // Fingerprint pre-filter before the (potentially expensive) key compare.
                if (fp == _extract_fp(first[bucket_num]) && keys[i] == keys[_extract_data(first[bucket_num])]) {
                    // Key already present: push row i onto the chain head,
                    // or drop the duplicate entirely in set mode.
                    if constexpr (NeedBuildChained) {
                        next[i] = _extract_data(first[bucket_num]);
                        first[bucket_num] = _combine_data_fp(i, fp);
                    }
                    break;
                }
                bucket_num = (bucket_num + probe_times) & bucket_size_mask;
                probe_times++;
            }
        }

        if constexpr (!NeedBuildChained) {
            // Set mode: chains are not needed for probing, so release the memory.
            table_items->next.clear();
        }
    };

    if (is_nulls == nullptr) {
        process.template operator()<false>();
    } else {
        process.template operator()<true>();
    }
}
template <LogicalType LT, bool NeedBuildChained>
void LinearChainedJoinHashMap<LT, NeedBuildChained>::lookup_init(const JoinHashTableItems& table_items,
                                                                 HashTableProbeState* probe_state,
                                                                 const Buffer<CppType>& build_keys,
                                                                 const Buffer<CppType>& probe_keys,
                                                                 const Buffer<uint8_t>* is_nulls) {
    // For every probe row i, resolves its build-side match into probe_state->next[i]:
    //   - chained mode (NeedBuildChained): build row index of the chain head,
    //     or 0 when no key matches (row 0 is the sentinel);
    //   - set mode: 1 when any build row has an equal key, else 0.
    // Probing mirrors construct_hash_table: same hash layout (bucket bits +
    // FP_BITS fingerprint bits) and the same increasing-step probe sequence.
    auto process = [&]<bool IsNullable>() {
        const uint32_t bucket_size_mask = table_items.bucket_size - 1;
        const uint32_t row_count = probe_state->probe_row_count;
        const auto* firsts = table_items.first.data();
        auto* hashes = probe_state->buckets.data(); // reused as per-row hash scratch
        auto* nexts = probe_state->next.data();
        const uint8_t* is_nulls_data = IsNullable ? is_nulls->data() : nullptr;

        auto need_calc_bucket_num = [&](const uint32_t index) {
            if constexpr (!IsNullable || !std::is_same_v<CppType, Slice>) {
                // Only check `is_nulls_data[i]` for the nullable slice type. The hash calculation overhead for
                // fixed-size types is small, and thus we do not check it to allow vectorization of the hash calculation.
                return true;
            } else {
                return is_nulls_data[index] == 0;
            }
        };
        auto is_null = [&](const uint32_t index) {
            if constexpr (!IsNullable) {
                return false;
            } else {
                return is_nulls_data[index] != 0;
            }
        };

        // Pass 1: compute every probe row's hash first (vectorizable), stored in
        // the `buckets` scratch array.
        for (uint32_t i = 0; i < row_count; i++) {
            if (need_calc_bucket_num(i)) {
                hashes[i] = JoinHashMapHelper::calc_bucket_num<CppType>(
                        probe_keys[i], table_items.bucket_size << FP_BITS, table_items.log_bucket_size + FP_BITS);
            }
        }

        // Pass 2: probe, prefetching the bucket 16 rows ahead to hide cache misses.
        for (uint32_t i = 0; i < row_count; i++) {
            if (i + 16 < row_count && !is_null(i + 16)) {
                __builtin_prefetch(firsts + _get_bucket_num_from_hash(hashes[i + 16]));
            }

            if (is_null(i)) {
                nexts[i] = 0; // NULL probe keys never match.
                continue;
            }

            const uint32_t hash = hashes[i];
            const uint32_t fp = _get_fp_from_hash(hash);
            uint32_t bucket_num = _get_bucket_num_from_hash(hash);
            uint32_t probe_times = 1;
            while (true) {
                if (firsts[bucket_num] == 0) {
                    // Empty bucket ends the probe sequence: no build row has this key.
                    nexts[i] = 0;
                    break;
                }
                const uint32_t cur_fp = _extract_fp(firsts[bucket_num]);
                const uint32_t cur_index = _extract_data(firsts[bucket_num]);
                // Fingerprint pre-filter before the full key comparison.
                if (fp == cur_fp && probe_keys[i] == build_keys[cur_index]) {
                    if constexpr (NeedBuildChained) {
                        nexts[i] = cur_index; // head of the matching build chain
                    } else {
                        nexts[i] = 1; // set semantics: existence flag only
                    }
                    break;
                }
                bucket_num = (bucket_num + probe_times) & bucket_size_mask;
                probe_times++;
            }
        }
    };

    if (is_nulls == nullptr) {
        process.template operator()<false>();
    } else {
        process.template operator()<true>();
    }
}
// ------------------------------------------------------------------------------------
// DirectMappingJoinHashMap
// ------------------------------------------------------------------------------------
@ -155,7 +339,8 @@ void DirectMappingJoinHashMap<LT>::construct_hash_table(JoinHashTableItems* tabl
template <LogicalType LT>
void DirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_items, HashTableProbeState* probe_state,
const Buffer<CppType>& keys, const Buffer<uint8_t>* is_nulls) {
const Buffer<CppType>& build_keys, const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
probe_state->active_coroutines = 0; // the ht data is not large, so disable it always.
static constexpr CppType MIN_VALUE = RunTimeTypeLimits<LT>::min_value();
@ -163,13 +348,13 @@ void DirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_i
if (is_nulls == nullptr) {
for (size_t i = 0; i < probe_row_count; i++) {
probe_state->next[i] = table_items.first[keys[i] - MIN_VALUE];
probe_state->next[i] = table_items.first[probe_keys[i] - MIN_VALUE];
}
} else {
const auto* is_nulls_data = is_nulls->data();
for (size_t i = 0; i < probe_row_count; i++) {
if (is_nulls_data[i] == 0) {
probe_state->next[i] = table_items.first[keys[i] - MIN_VALUE];
probe_state->next[i] = table_items.first[probe_keys[i] - MIN_VALUE];
} else {
probe_state->next[i] = 0;
}
@ -215,7 +400,8 @@ void RangeDirectMappingJoinHashMap<LT>::construct_hash_table(JoinHashTableItems*
template <LogicalType LT>
void RangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_items,
HashTableProbeState* probe_state, const Buffer<CppType>& keys,
HashTableProbeState* probe_state, const Buffer<CppType>& build_keys,
const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
probe_state->active_coroutines = 0; // the ht data is not large, so disable it always.
@ -224,8 +410,8 @@ void RangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& ta
const size_t num_rows = probe_state->probe_row_count;
if (is_nulls == nullptr) {
for (size_t i = 0; i < num_rows; i++) {
if ((keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t index = keys[i] - min_value;
if ((probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t index = probe_keys[i] - min_value;
probe_state->next[i] = table_items.first[index];
} else {
probe_state->next[i] = 0;
@ -234,8 +420,8 @@ void RangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& ta
} else {
const auto* is_nulls_data = is_nulls->data();
for (size_t i = 0; i < num_rows; i++) {
if ((is_nulls_data[i] == 0) & (keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t index = keys[i] - min_value;
if ((is_nulls_data[i] == 0) & (probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t index = probe_keys[i] - min_value;
probe_state->next[i] = table_items.first[index];
} else {
probe_state->next[i] = 0;
@ -281,7 +467,8 @@ void RangeDirectMappingJoinHashSet<LT>::construct_hash_table(JoinHashTableItems*
template <LogicalType LT>
void RangeDirectMappingJoinHashSet<LT>::lookup_init(const JoinHashTableItems& table_items,
HashTableProbeState* probe_state, const Buffer<CppType>& keys,
HashTableProbeState* probe_state, const Buffer<CppType>& build_keys,
const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
probe_state->active_coroutines = 0; // the ht data is not large, so disable it always.
@ -290,8 +477,8 @@ void RangeDirectMappingJoinHashSet<LT>::lookup_init(const JoinHashTableItems& ta
const size_t num_rows = probe_state->probe_row_count;
if (is_nulls == nullptr) {
for (size_t i = 0; i < num_rows; i++) {
if ((keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t index = keys[i] - min_value;
if ((probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t index = probe_keys[i] - min_value;
const uint32_t group = index / 8;
const uint32_t offset = index % 8;
probe_state->next[i] = (table_items.key_bitset[group] & (1 << offset)) != 0;
@ -302,8 +489,8 @@ void RangeDirectMappingJoinHashSet<LT>::lookup_init(const JoinHashTableItems& ta
} else {
const auto* is_nulls_data = is_nulls->data();
for (size_t i = 0; i < num_rows; i++) {
if ((is_nulls_data[i] == 0) & (keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t index = keys[i] - min_value;
if ((is_nulls_data[i] == 0) & (probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t index = probe_keys[i] - min_value;
const uint32_t group = index / 8;
const uint32_t offset = index % 8;
probe_state->next[i] = (table_items.key_bitset[group] & (1 << offset)) != 0;
@ -387,7 +574,9 @@ void DenseRangeDirectMappingJoinHashMap<LT>::construct_hash_table(JoinHashTableI
template <LogicalType LT>
void DenseRangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItems& table_items,
HashTableProbeState* probe_state, const Buffer<CppType>& keys,
HashTableProbeState* probe_state,
const Buffer<CppType>& build_keys,
const Buffer<CppType>& probe_keys,
const Buffer<uint8_t>* is_nulls) {
probe_state->active_coroutines = 0; // the ht data is not large, so disable it always.
@ -415,8 +604,8 @@ void DenseRangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItem
const size_t num_rows = probe_state->probe_row_count;
if (is_nulls == nullptr) {
for (size_t i = 0; i < num_rows; i++) {
if ((keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t bucket_num = keys[i] - min_value;
if ((probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t bucket_num = probe_keys[i] - min_value;
probe_state->next[i] = get_dense_first(bucket_num);
} else {
probe_state->next[i] = 0;
@ -425,8 +614,8 @@ void DenseRangeDirectMappingJoinHashMap<LT>::lookup_init(const JoinHashTableItem
} else {
const auto* is_nulls_data = is_nulls->data();
for (size_t i = 0; i < num_rows; i++) {
if ((is_nulls_data[i] == 0) & (keys[i] >= min_value) & (keys[i] <= max_value)) {
const uint64_t bucket_num = keys[i] - min_value;
if ((is_nulls_data[i] == 0) & (probe_keys[i] >= min_value) & (probe_keys[i] <= max_value)) {
const uint64_t bucket_num = probe_keys[i] - min_value;
probe_state->next[i] = get_dense_first(bucket_num);
} else {
probe_state->next[i] = 0;

View File

@ -114,7 +114,8 @@ struct JoinHashTableItems {
// 1) the ht's size is enough large, for example, larger than (1UL << 27) bytes.
// 2) smaller ht but most buckets have more than one keys
cache_miss_serious = row_count > (1UL << 18) &&
((probe_bytes > (1UL << 25) && keys_per_bucket > 2) ||
((probe_bytes > (1UL << 24) && keys_per_bucket >= 10) ||
(probe_bytes > (1UL << 25) && keys_per_bucket > 2) ||
(probe_bytes > (1UL << 26) && keys_per_bucket > 1.5) || probe_bytes > (1UL << 27));
VLOG_QUERY << "ht cache miss serious = " << cache_miss_serious << " row# = " << row_count
<< " , bytes = " << probe_bytes << " , depth = " << keys_per_bucket;

View File

@ -43,7 +43,9 @@ namespace starrocks {
M(DIRECT_MAPPING) \
M(RANGE_DIRECT_MAPPING) \
M(RANGE_DIRECT_MAPPING_SET) \
M(DENSE_RANGE_DIRECT_MAPPING)
M(DENSE_RANGE_DIRECT_MAPPING) \
M(LINEAR_CHAINED) \
M(LINEAR_CHAINED_SET)
#define APPLY_JOIN_KEY_CONSTRUCTOR_UNARY_TYPE(M) \
M(ONE_KEY_BOOLEAN) \
@ -89,7 +91,33 @@ namespace starrocks {
M(RANGE_DIRECT_MAPPING_SET_INT) \
M(RANGE_DIRECT_MAPPING_SET_BIGINT) \
M(DENSE_RANGE_DIRECT_MAPPING_INT) \
M(DENSE_RANGE_DIRECT_MAPPING_BIGINT)
M(DENSE_RANGE_DIRECT_MAPPING_BIGINT) \
\
M(LINEAR_CHAINED_INT) \
M(LINEAR_CHAINED_BIGINT) \
M(LINEAR_CHAINED_LARGEINT) \
M(LINEAR_CHAINED_FLOAT) \
M(LINEAR_CHAINED_DOUBLE) \
M(LINEAR_CHAINED_DATE) \
M(LINEAR_CHAINED_DATETIME) \
M(LINEAR_CHAINED_DECIMALV2) \
M(LINEAR_CHAINED_DECIMAL32) \
M(LINEAR_CHAINED_DECIMAL64) \
M(LINEAR_CHAINED_DECIMAL128) \
M(LINEAR_CHAINED_VARCHAR) \
\
M(LINEAR_CHAINED_SET_INT) \
M(LINEAR_CHAINED_SET_BIGINT) \
M(LINEAR_CHAINED_SET_LARGEINT) \
M(LINEAR_CHAINED_SET_FLOAT) \
M(LINEAR_CHAINED_SET_DOUBLE) \
M(LINEAR_CHAINED_SET_DATE) \
M(LINEAR_CHAINED_SET_DATETIME) \
M(LINEAR_CHAINED_SET_DECIMALV2) \
M(LINEAR_CHAINED_SET_DECIMAL32) \
M(LINEAR_CHAINED_SET_DECIMAL64) \
M(LINEAR_CHAINED_SET_DECIMAL128) \
M(LINEAR_CHAINED_SET_VARCHAR)
enum class JoinKeyConstructorType {
#define NAME_TO_ENUM(NAME) NAME,
@ -237,6 +265,36 @@ REGISTER_JOIN_MAP_METHOD_TYPE(DENSE_RANGE_DIRECT_MAPPING, TYPE_INT, DenseRangeDi
REGISTER_JOIN_MAP_METHOD_TYPE(DENSE_RANGE_DIRECT_MAPPING, TYPE_BIGINT, DenseRangeDirectMappingJoinHashMap,
DENSE_RANGE_DIRECT_MAPPING_BIGINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_INT, LinearChainedJoinHashMap, LINEAR_CHAINED_INT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_BIGINT, LinearChainedJoinHashMap, LINEAR_CHAINED_BIGINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_LARGEINT, LinearChainedJoinHashMap, LINEAR_CHAINED_LARGEINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_FLOAT, LinearChainedJoinHashMap, LINEAR_CHAINED_FLOAT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DOUBLE, LinearChainedJoinHashMap, LINEAR_CHAINED_DOUBLE);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DATE, LinearChainedJoinHashMap, LINEAR_CHAINED_DATE);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DATETIME, LinearChainedJoinHashMap, LINEAR_CHAINED_DATETIME);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DECIMALV2, LinearChainedJoinHashMap, LINEAR_CHAINED_DECIMALV2);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DECIMAL32, LinearChainedJoinHashMap, LINEAR_CHAINED_DECIMAL32);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DECIMAL64, LinearChainedJoinHashMap, LINEAR_CHAINED_DECIMAL64);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_DECIMAL128, LinearChainedJoinHashMap, LINEAR_CHAINED_DECIMAL128);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED, TYPE_VARCHAR, LinearChainedJoinHashMap, LINEAR_CHAINED_VARCHAR);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_INT, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_INT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_BIGINT, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_BIGINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_LARGEINT, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_LARGEINT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_FLOAT, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_FLOAT);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DOUBLE, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_DOUBLE);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DATE, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_DATE);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DATETIME, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_DATETIME);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DECIMALV2, LinearChainedJoinHashSet,
LINEAR_CHAINED_SET_DECIMALV2);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DECIMAL32, LinearChainedJoinHashSet,
LINEAR_CHAINED_SET_DECIMAL32);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DECIMAL64, LinearChainedJoinHashSet,
LINEAR_CHAINED_SET_DECIMAL64);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_DECIMAL128, LinearChainedJoinHashSet,
LINEAR_CHAINED_SET_DECIMAL128);
REGISTER_JOIN_MAP_METHOD_TYPE(LINEAR_CHAINED_SET, TYPE_VARCHAR, LinearChainedJoinHashSet, LINEAR_CHAINED_SET_VARCHAR);
#undef REGISTER_JOIN_MAP_TYPE
// ------------------------------------------------------------------------------------

View File

@ -44,6 +44,10 @@ Status LakeMetaScanner::_real_init() {
reader_params.low_card_threshold = _parent->_meta_scan_node.__isset.low_cardinality_threshold
? _parent->_meta_scan_node.low_cardinality_threshold
: DICT_DECODE_MAX_SIZE;
// Pass column access paths and extend schema similar to OLAP path
if (_parent->_meta_scan_node.__isset.column_access_paths && !_parent->_column_access_paths.empty()) {
reader_params.column_access_paths = &_parent->_column_access_paths;
}
_reader = std::make_unique<LakeMetaReader>();
TEST_SYNC_POINT_CALLBACK("lake_meta_scanner:open_mock_reader", &_reader);

View File

@ -72,7 +72,8 @@ Status OlapMetaScanner::_init_meta_reader_params() {
column.set_type(path->value_type().type);
column.set_length(path->value_type().len);
column.set_is_nullable(true);
column.set_extended_info(std::make_unique<ExtendedColumnInfo>(path.get(), root_column_index));
int32_t root_uid = tmp_schema->column(static_cast<size_t>(root_column_index)).unique_id();
column.set_extended_info(std::make_unique<ExtendedColumnInfo>(path.get(), root_uid));
tmp_schema->append_column(column);
VLOG(2) << "extend the tablet-schema: " << column.debug_string();

View File

@ -0,0 +1,63 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "exec/partition/bucket_aware_partition.h"
#include "column/nullable_column.h"
#include "gutil/casts.h"
namespace starrocks {
// Computes, for each input row, a combined murmur hash (XOR across all partition
// columns) and a composed bucket id that matches the FE's bucket-mapping rule.
//
// `partitions_columns` must be non-empty and all columns equally sized.
// Results are written through the reference members of `ctx`:
//   - ctx.hash_values[j]: XOR of the per-column murmur3_x86_32 hashes of row j.
//   - ctx.bucket_ids[j]:  mixed-radix combination of per-column bucket ids; a NULL
//     cell maps to the column's extra overflow bucket `bucket_num`.
// `ctx.round_hashes` / `ctx.round_ids` are per-column scratch buffers.
void calc_hash_values_and_bucket_ids(const std::vector<const Column*>& partitions_columns,
                                     BucketAwarePartitionCtx ctx) {
    const size_t num_rows = partitions_columns[0]->size();
    const auto& bucket_properties = ctx.bucket_properties;
    auto& hash_values = ctx.hash_values;
    auto& bucket_ids = ctx.bucket_ids;
    auto& round_hashes = ctx.round_hashes;
    auto& round_ids = ctx.round_ids;
    hash_values.assign(num_rows, 0);
    bucket_ids.assign(num_rows, 0);
    // Early out on an empty chunk: outputs are already sized, and taking the
    // address of element 0 of an empty scratch vector below would be UB.
    if (num_rows == 0) {
        return;
    }
    for (size_t i = 0; i < partitions_columns.size(); ++i) {
        // TODO, enhance it if we try to support more bucket functions.
        DCHECK(bucket_properties[i].bucket_func == TBucketFunction::MURMUR3_X86_32);
        round_hashes.assign(num_rows, 0);
        round_ids.assign(num_rows, 0);
        partitions_columns[i]->murmur_hash3_x86_32(round_hashes.data(), 0, num_rows);
        for (size_t j = 0; j < num_rows; j++) {
            hash_values[j] ^= round_hashes[j];
            // Mask off the sign bit so the modulo result is non-negative.
            round_ids[j] = (round_hashes[j] & std::numeric_limits<int>::max()) % bucket_properties[i].bucket_num;
        }
        if (partitions_columns[i]->has_null()) {
            // NULL cells go to the dedicated overflow bucket `bucket_num`.
            const auto& null_data = down_cast<const NullableColumn*>(partitions_columns[i])->null_column()->get_data();
            for (size_t j = 0; j < num_rows; j++) {
                round_ids[j] = null_data[j] ? bucket_properties[i].bucket_num : round_ids[j];
            }
        }
        if (i == partitions_columns.size() - 1) {
            for (size_t j = 0; j < num_rows; j++) {
                bucket_ids[j] += round_ids[j];
            }
        } else {
            for (size_t j = 0; j < num_rows; j++) {
                // bucket mapping, same behavior as FE
                bucket_ids[j] = (round_ids[j] + bucket_ids[j]) * (bucket_properties[i + 1].bucket_num + 1);
            }
        }
    }
}
} // namespace starrocks

View File

@ -0,0 +1,42 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "column/column.h"
#include "gen_cpp/Partitions_types.h"
namespace starrocks {
// Bundles the output and scratch buffers used by calc_hash_values_and_bucket_ids().
// All members are references: the context is a cheap-to-copy view over
// caller-owned vectors, which must outlive every copy of the context.
struct BucketAwarePartitionCtx {
    BucketAwarePartitionCtx(const std::vector<TBucketProperty>& bucket_properties, std::vector<uint32_t>& hash_values,
                            std::vector<uint32_t>& round_hashes, std::vector<uint32_t>& bucket_ids,
                            std::vector<uint32_t>& round_ids)
            : bucket_properties(bucket_properties),
              hash_values(hash_values),
              round_hashes(round_hashes),
              bucket_ids(bucket_ids),
              round_ids(round_ids) {}

    const std::vector<TBucketProperty>& bucket_properties; // one entry per bucketed partition column
    std::vector<uint32_t>& hash_values;  // out: XOR-combined murmur hash per row
    std::vector<uint32_t>& round_hashes; // scratch: per-column hashes of the current round
    std::vector<uint32_t>& bucket_ids;   // out: final composed bucket id per row
    std::vector<uint32_t>& round_ids;    // scratch: per-column bucket ids of the current round
};
void calc_hash_values_and_bucket_ids(const std::vector<const Column*>& partitions_columns, BucketAwarePartitionCtx ctx);
} // namespace starrocks

View File

@ -69,8 +69,7 @@ Status AggregateBlockingSinkOperator::set_finishing(RuntimeState* state) {
if (_aggregator->hash_map_variant().size() == 0) {
_aggregator->set_ht_eos();
}
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
} else if (_aggregator->is_none_group_by_exprs()) {
// for aggregate no group by, if _num_input_rows is 0,

View File

@ -89,8 +89,7 @@ DEFINE_FAIL_POINT(force_reset_aggregator_after_agg_streaming_sink_finish);
Status AggregateStreamingSourceOperator::_output_chunk_from_hash_map(ChunkPtr* chunk, RuntimeState* state) {
if (!_aggregator->it_hash().has_value()) {
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
COUNTER_SET(_aggregator->hash_table_size(), (int64_t)_aggregator->hash_map_variant().size());
}

View File

@ -14,7 +14,7 @@
#pragma once
#include "exec/aggregator.h"
#include "exec/aggregator_fwd.h"
#include "exec/pipeline/source_operator.h"
namespace starrocks::pipeline {

View File

@ -66,8 +66,7 @@ Status SpillableAggregateBlockingSinkOperator::set_finishing(RuntimeState* state
}
if (!_aggregator->spill_channel()->has_task()) {
if (_aggregator->hash_map_variant().size() > 0 || !_streaming_chunks.empty()) {
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
_aggregator->spill_channel()->add_spill_task(_build_spill_task(state));
}
}
@ -270,8 +269,7 @@ Status SpillableAggregateBlockingSinkOperator::_try_to_spill_by_auto(RuntimeStat
Status SpillableAggregateBlockingSinkOperator::_spill_all_data(RuntimeState* state, bool should_spill_hash_table) {
RETURN_IF(_aggregator->hash_map_variant().size() == 0, Status::OK());
if (should_spill_hash_table) {
_aggregator->hash_map_variant().visit(
[&](auto& hash_map_with_key) { _aggregator->it_hash() = _aggregator->_state_allocator.begin(); });
_aggregator->it_hash() = _aggregator->state_allocator().begin();
}
CHECK(!_aggregator->spill_channel()->has_task());
RETURN_IF_ERROR(_aggregator->spill_aggregate_data(state, _build_spill_task(state, should_spill_hash_table)));

View File

@ -16,9 +16,8 @@
#include <utility>
#include "exec/aggregator.h"
#include "exec/aggregator_fwd.h"
#include "exec/pipeline/aggregate/aggregate_blocking_source_operator.h"
#include "exec/sorted_streaming_aggregator.h"
#include "runtime/runtime_state.h"
#include "storage/chunk_helper.h"

View File

@ -16,7 +16,7 @@
#include <utility>
#include "exec/aggregator.h"
#include "exec/aggregator_fwd.h"
#include "exec/pipeline/aggregate/aggregate_distinct_blocking_sink_operator.h"
#include "exec/pipeline/aggregate/aggregate_distinct_blocking_source_operator.h"
#include "exec/pipeline/operator.h"

View File

@ -49,9 +49,7 @@ Status SpillablePartitionWiseAggregateSinkOperator::set_finishing(RuntimeState*
}
if (!_agg_op->aggregator()->spill_channel()->has_task()) {
if (_agg_op->aggregator()->hash_map_variant().size() > 0 || !_streaming_chunks.empty()) {
_agg_op->aggregator()->hash_map_variant().visit([&](auto& hash_map_with_key) {
_agg_op->aggregator()->it_hash() = _agg_op->aggregator()->_state_allocator.begin();
});
_agg_op->aggregator()->it_hash() = _agg_op->aggregator()->state_allocator().begin();
_agg_op->aggregator()->spill_channel()->add_spill_task(_build_spill_task(state));
}
}
@ -279,9 +277,7 @@ ChunkPtr& SpillablePartitionWiseAggregateSinkOperator::_append_hash_column(Chunk
Status SpillablePartitionWiseAggregateSinkOperator::_spill_all_data(RuntimeState* state, bool should_spill_hash_table) {
RETURN_IF(_agg_op->aggregator()->hash_map_variant().size() == 0, Status::OK());
if (should_spill_hash_table) {
_agg_op->aggregator()->hash_map_variant().visit([&](auto& hash_map_with_key) {
_agg_op->aggregator()->it_hash() = _agg_op->aggregator()->_state_allocator.begin();
});
_agg_op->aggregator()->it_hash() = _agg_op->aggregator()->state_allocator().begin();
}
CHECK(!_agg_op->aggregator()->spill_channel()->has_task());
RETURN_IF_ERROR(

View File

@ -23,6 +23,7 @@
#include <utility>
#include "common/config.h"
#include "exec/partition/bucket_aware_partition.h"
#include "exec/pipeline/exchange/shuffler.h"
#include "exec/pipeline/exchange/sink_buffer.h"
#include "exprs/expr.h"
@ -640,38 +641,13 @@ Status ExchangeSinkOperator::push_chunk(RuntimeState* state, const ChunkPtr& chu
}
void ExchangeSinkOperator::_calc_hash_values_and_bucket_ids() {
size_t num_rows = _partitions_columns[0]->size();
_hash_values.assign(num_rows, 0);
_bucket_ids.assign(num_rows, 0);
for (int i = 0; i < _partitions_columns.size(); ++i) {
// TODO, enhance it if we try to support more bucket functions.
DCHECK(_bucket_properties[i].bucket_func == TBucketFunction::MURMUR3_X86_32);
_round_hashes.assign(num_rows, 0);
_round_ids.assign(num_rows, 0);
_partitions_columns[i]->murmur_hash3_x86_32(&_round_hashes[0], 0, num_rows);
for (int j = 0; j < num_rows; j++) {
_hash_values[j] ^= _round_hashes[j];
_round_ids[j] = (_round_hashes[j] & std::numeric_limits<int>::max()) % _bucket_properties[i].bucket_num;
}
if (_partitions_columns[i]->has_null()) {
const auto& null_data =
down_cast<const NullableColumn*>(_partitions_columns[i].get())->null_column()->get_data();
for (int j = 0; j < num_rows; j++) {
_round_ids[j] = null_data[j] ? _bucket_properties[i].bucket_num : _round_ids[j];
}
}
if (i == _partitions_columns.size() - 1) {
for (int j = 0; j < num_rows; j++) {
_bucket_ids[j] += _round_ids[j];
}
} else {
for (int j = 0; j < num_rows; j++) {
// bucket mapping, same behavior as FE
_bucket_ids[j] = (_round_ids[j] + _bucket_ids[j]) * (_bucket_properties[i + 1].bucket_num + 1);
}
}
std::vector<const Column*> partitions_columns;
for (size_t i = 0; i < _partitions_columns.size(); i++) {
partitions_columns.emplace_back(_partitions_columns[i].get());
}
BucketAwarePartitionCtx bctx(_bucket_properties, _hash_values, _round_hashes, _bucket_ids, _round_ids);
calc_hash_values_and_bucket_ids(partitions_columns, bctx);
}
void ExchangeSinkOperator::update_metrics(RuntimeState* state) {

View File

@ -24,6 +24,7 @@
#include "runtime/exec_env.h"
#include "service/backend_options.h"
#include "util/network_util.h"
#include "util/starrocks_metrics.h"
#include "util/thrift_rpc_helper.h"
namespace starrocks::pipeline {
@ -247,7 +248,7 @@ Status ExecStateReporter::report_epoch(const TMVMaintenanceTasks& params, ExecEn
}
ExecStateReporter::ExecStateReporter(const CpuUtil::CpuIds& cpuids) {
auto status = ThreadPoolBuilder("ex_state_report") // exec state reporter
auto status = ThreadPoolBuilder("exec_state_report") // exec state reporter
.set_min_threads(1)
.set_max_threads(2)
.set_max_queue_size(1000)
@ -257,8 +258,9 @@ ExecStateReporter::ExecStateReporter(const CpuUtil::CpuIds& cpuids) {
if (!status.ok()) {
LOG(FATAL) << "Cannot create thread pool for ExecStateReport: error=" << status.to_string();
}
REGISTER_THREAD_POOL_METRICS(exec_state_report, _thread_pool);
status = ThreadPoolBuilder("priority_ex_state_report") // priority exec state reporter with infinite queue
status = ThreadPoolBuilder("priority_exec_state_report") // priority exec state reporter with infinite queue
.set_min_threads(1)
.set_max_threads(2)
.set_idle_timeout(MonoDelta::FromMilliseconds(2000))
@ -267,6 +269,7 @@ ExecStateReporter::ExecStateReporter(const CpuUtil::CpuIds& cpuids) {
if (!status.ok()) {
LOG(FATAL) << "Cannot create thread pool for priority ExecStateReport: error=" << status.to_string();
}
REGISTER_THREAD_POOL_METRICS(priority_exec_state_report, _priority_thread_pool);
}
void ExecStateReporter::submit(std::function<void()>&& report_task, bool priority) {

View File

@ -676,7 +676,8 @@ Status FragmentExecutor::_prepare_stream_load_pipe(ExecEnv* exec_env, const Unif
delete ctx;
}
});
RETURN_IF_ERROR(exec_env->stream_context_mgr()->put_channel_context(label, channel_id, ctx));
RETURN_IF_ERROR(
exec_env->stream_context_mgr()->put_channel_context(label, table_name, channel_id, ctx));
}
stream_load_contexts.push_back(ctx);
}
@ -975,7 +976,8 @@ void FragmentExecutor::_fail_cleanup(bool fragment_has_registed) {
}
}
Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const TExecPlanFragmentParams& request) {
Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const TExecPlanFragmentParams& request,
TExecPlanFragmentResult* response) {
DCHECK(!request.__isset.fragment);
DCHECK(request.__isset.params);
const TPlanFragmentExecParams& params = request.params;
@ -996,6 +998,7 @@ Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const
RuntimeState* runtime_state = fragment_ctx->runtime_state();
std::unordered_set<int> notify_ids;
std::vector<int32_t> closed_scan_nodes;
for (const auto& [node_id, scan_ranges] : params.per_node_scan_ranges) {
if (scan_ranges.size() == 0) continue;
@ -1019,6 +1022,10 @@ Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const
RETURN_IF_ERROR(morsel_queue_factory->append_morsels(0, std::move(morsels)));
morsel_queue_factory->set_has_more(has_more_morsel);
notify_ids.insert(node_id);
if (morsel_queue_factory->reach_limit()) {
closed_scan_nodes.push_back(node_id);
}
}
if (params.__isset.node_to_per_driver_seq_scan_ranges) {
@ -1047,6 +1054,14 @@ Status FragmentExecutor::append_incremental_scan_ranges(ExecEnv* exec_env, const
}
morsel_queue_factory->set_has_more(has_more_morsel);
notify_ids.insert(node_id);
if (morsel_queue_factory->reach_limit()) {
closed_scan_nodes.push_back(node_id);
}
}
if (closed_scan_nodes.size() > 0) {
response->__set_closed_scan_nodes(closed_scan_nodes);
}
}

View File

@ -107,7 +107,8 @@ public:
const TExecPlanFragmentParams& unique_request);
Status execute(ExecEnv* exec_env);
static Status append_incremental_scan_ranges(ExecEnv* exec_env, const TExecPlanFragmentParams& request);
static Status append_incremental_scan_ranges(ExecEnv* exec_env, const TExecPlanFragmentParams& request,
TExecPlanFragmentResult* response);
private:
void _fail_cleanup(bool fragment_has_registed);

View File

@ -38,7 +38,7 @@ HashJoinBuildOperator::HashJoinBuildOperator(OperatorFactory* factory, int32_t i
_distribution_mode(distribution_mode) {}
Status HashJoinBuildOperator::push_chunk(RuntimeState* state, const ChunkPtr& chunk) {
return _join_builder->append_chunk_to_ht(chunk);
return _join_builder->append_chunk_to_ht(state, chunk);
}
Status HashJoinBuildOperator::prepare(RuntimeState* state) {

Some files were not shown because too many files have changed in this diff Show More