1use std::ffi::c_void;
4use std::sync::atomic::Ordering;
5
6use crate::{
7 cuda_graph::{CapturedCudaGraph, CsmCudaGraphKey, CudaGraphNodeKind},
8 AsKernelParam, DeviceSlice, LaunchAsync, LaunchConfig,
9};
10use xlog_core::{Result, ScalarType, Schema, XlogError};
11
12use super::{
13 dedup_kernels, filter_kernels, ilp_kernels, join_kernels, pack_kernels, scan_kernels,
14 set_ops_kernels, sort_kernels, CsmCudaGraphEntry, CsmCudaGraphNodes, HashTableU64,
15 JoinHashTableV2, JoinIndexV2, JoinType, PackedKeyData, RadixSortScratch, DEDUP_MODULE,
16 DEFAULT_JOIN_MAX_OUTPUT, FILTER_MODULE, ILP_MODULE, JOIN_MODULE, NESTED_LOOP_TOTAL_THRESHOLD,
17 PACK_MODULE, SCAN_MODULE, SET_OPS_MODULE, SORT_MODULE,
18};
19use crate::device_runtime::{Access, BlockId, StreamId};
20use crate::launch::LaunchRecorder;
21use crate::memory::{CudaColumn, TrackedCudaSlice};
22use crate::CudaBuffer;
23
// Scalar type tags produced by `scalar_type_code_dedup`. The tags are uploaded
// to the device next to the per-column pointer/size tables consumed by the
// dedup and full-row diff kernels.
// NOTE(review): presumably these values must agree with constants on the
// kernel side — confirm against the kernel sources before renumbering.
const XLOG_TY_U32: u8 = 0;
const XLOG_TY_U64: u8 = 1;
const XLOG_TY_I32: u8 = 2;
const XLOG_TY_I64: u8 = 3;
const XLOG_TY_F32: u8 = 4;
const XLOG_TY_F64: u8 = 5;
const XLOG_TY_BOOL: u8 = 6;
const XLOG_TY_SYMBOL: u8 = 7;
// Largest row count for which the small full-row sort path is taken (see
// `union_gpu` and `dedup_full_row_deterministic`).
const SMALL_FULL_ROW_SORT_MAX_ROWS: usize = 1024;
37
38#[inline]
39fn scalar_type_code_dedup(ty: ScalarType) -> u8 {
40 match ty {
41 ScalarType::U32 => XLOG_TY_U32,
42 ScalarType::U64 => XLOG_TY_U64,
43 ScalarType::I32 => XLOG_TY_I32,
44 ScalarType::I64 => XLOG_TY_I64,
45 ScalarType::F32 => XLOG_TY_F32,
46 ScalarType::F64 => XLOG_TY_F64,
47 ScalarType::Bool => XLOG_TY_BOOL,
48 ScalarType::Symbol => XLOG_TY_SYMBOL,
49 }
50}
51
52impl super::CudaKernelProvider {
53 pub fn hash_join(
71 &self,
72 left: &CudaBuffer,
73 right: &CudaBuffer,
74 left_keys: &[usize],
75 right_keys: &[usize],
76 ) -> Result<CudaBuffer> {
77 self.hash_join_with_limit(left, right, left_keys, right_keys, None)
78 }
79
80 pub fn hash_join_with_limit(
99 &self,
100 left: &CudaBuffer,
101 right: &CudaBuffer,
102 left_keys: &[usize],
103 right_keys: &[usize],
104 max_output: Option<usize>,
105 ) -> Result<CudaBuffer> {
106 let max_output_limit = max_output.unwrap_or(DEFAULT_JOIN_MAX_OUTPUT);
107
108 if left_keys.is_empty() || right_keys.is_empty() {
110 return Err(XlogError::Kernel(
111 "Join requires at least one key column".to_string(),
112 ));
113 }
114 if left_keys.len() != right_keys.len() {
115 return Err(XlogError::Kernel(
116 "Left and right key columns must have same length".to_string(),
117 ));
118 }
119 for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
120 if left_idx >= left.arity() {
121 return Err(XlogError::Kernel(format!(
122 "Left key column index {} out of bounds (arity {})",
123 left_idx,
124 left.arity()
125 )));
126 }
127 if right_idx >= right.arity() {
128 return Err(XlogError::Kernel(format!(
129 "Right key column index {} out of bounds (arity {})",
130 right_idx,
131 right.arity()
132 )));
133 }
134 }
135
136 let right_key_set: std::collections::HashSet<usize> = right_keys.iter().copied().collect();
138 let mut result_columns_schema = left.schema().columns.clone();
139 let mut result_sort_labels = left.schema().sort_labels().to_vec();
140 for (idx, col) in right.schema().columns.iter().enumerate() {
141 if !right_key_set.contains(&idx) {
142 result_columns_schema.push(col.clone());
143 result_sort_labels.push(
144 right
145 .schema()
146 .column_sort_label(idx)
147 .unwrap_or(&col.0)
148 .to_string(),
149 );
150 }
151 }
152 let result_schema = Schema::new(result_columns_schema)
153 .with_sort_labels(result_sort_labels)
154 .expect("natural join sort labels match result schema arity");
155
156 if left.is_empty() || right.is_empty() {
158 return self.create_empty_buffer(result_schema);
159 }
160
161 let combined = self.hash_join_v2_with_limit(
163 left,
164 right,
165 left_keys,
166 right_keys,
167 JoinType::Inner,
168 Some(max_output_limit),
169 )?;
170
171 if combined.is_empty() {
172 return self.create_empty_buffer(result_schema);
173 }
174
175 let left_arity = left.arity();
176 let right_arity = right.arity();
177
178 let CudaBuffer {
179 columns: combined_columns,
180 row_cap,
181 d_num_rows,
182 schema: _,
183 ..
184 } = combined;
185
186 if combined_columns.len() != left_arity + right_arity {
187 return Err(XlogError::Kernel(format!(
188 "Join internal error: expected {} columns, got {}",
189 left_arity + right_arity,
190 combined_columns.len()
191 )));
192 }
193
194 let mut output_columns = Vec::with_capacity(result_schema.arity());
195 let mut it = combined_columns.into_iter();
196
197 for _ in 0..left_arity {
199 let col = it.next().ok_or_else(|| {
200 XlogError::Kernel("Join internal error: missing left columns".to_string())
201 })?;
202 output_columns.push(col);
203 }
204
205 for (right_col_idx, col) in it.enumerate() {
207 if !right_key_set.contains(&right_col_idx) {
208 output_columns.push(col);
209 }
210 }
211
212 Ok(CudaBuffer::from_columns(
213 output_columns,
214 row_cap,
215 d_num_rows,
216 result_schema,
217 ))
218 }
219 pub fn dedup(&self, input: &CudaBuffer, key_cols: &[usize]) -> Result<CudaBuffer> {
233 if input.is_empty() {
234 return self.create_empty_buffer(input.schema().clone());
235 }
236
237 if key_cols.is_empty() {
238 if input.arity() == 0 {
239 let rows = self.device_row_count(input)?;
242 if rows == 0 {
243 return self.create_empty_buffer(input.schema().clone());
244 }
245 return self.buffer_from_columns(Vec::new(), 1, input.schema().clone());
246 }
247 return Err(XlogError::Kernel(
248 "Dedup requires at least one key column".to_string(),
249 ));
250 }
251
252 if Self::is_full_row_key(key_cols, input.arity()) && input.arity() > 1 {
253 return self.dedup_full_row_deterministic(input);
254 }
255
256 let sorted = self.sort(input, key_cols)?;
257 self.dedup_sorted(&sorted, key_cols)
258 }
259
260 pub fn dedup_sorted(&self, input: &CudaBuffer, key_cols: &[usize]) -> Result<CudaBuffer> {
272 if input.is_empty() {
273 return self.create_empty_buffer(input.schema().clone());
274 }
275
276 if key_cols.is_empty() {
277 if input.arity() == 0 {
278 let rows = self.device_row_count(input)?;
279 if rows == 0 {
280 return self.create_empty_buffer(input.schema().clone());
281 }
282 return self.buffer_from_columns(Vec::new(), 1, input.schema().clone());
283 }
284 return Err(XlogError::Kernel(
285 "Dedup requires at least one key column".to_string(),
286 ));
287 }
288
289 if Self::is_full_row_key(key_cols, input.arity()) && input.arity() > 1 {
290 return self.dedup_full_row_deterministic(input);
291 }
292
293 if input.num_rows() <= 1 {
294 return self.clone_buffer(input);
295 }
296
297 if input.num_rows() > u32::MAX as u64 {
298 return Err(XlogError::Kernel(format!(
299 "Dedup supports at most {} rows, got {}",
300 u32::MAX,
301 input.num_rows()
302 )));
303 }
304
305 let scalar_type_code = scalar_type_code_dedup;
309
310 let device = self.device.inner();
311 let num_rows = input.num_rows() as u32;
312
313 let mut col_ptrs_host: Vec<u64> = Vec::with_capacity(key_cols.len());
314 let mut col_sizes_host: Vec<u32> = Vec::with_capacity(key_cols.len());
315 let mut col_types_host: Vec<u8> = Vec::with_capacity(key_cols.len());
316
317 for &key_col in key_cols {
318 if key_col >= input.arity() {
319 return Err(XlogError::Kernel(format!(
320 "Key column {} out of bounds (arity {})",
321 key_col,
322 input.arity()
323 )));
324 }
325
326 let col = input
327 .column(key_col)
328 .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", key_col)))?;
329 let ty = input.schema().column_type(key_col).ok_or_else(|| {
330 XlogError::Kernel(format!("Key column {} type not found in schema", key_col))
331 })?;
332
333 let elem_size = ty.size_bytes();
334 let expected_bytes = (num_rows as usize) * elem_size;
335 if col.num_bytes() != expected_bytes {
336 return Err(XlogError::Kernel(format!(
337 "Key column {} has {} bytes but expected {} (num_rows={}, elem_size={})",
338 key_col,
339 col.num_bytes(),
340 expected_bytes,
341 num_rows,
342 elem_size
343 )));
344 }
345
346 let ptr = *col.device_ptr();
347 col_ptrs_host.push(ptr);
348 col_sizes_host.push(elem_size as u32);
349 col_types_host.push(scalar_type_code(ty));
350 }
351
352 let num_key_cols = key_cols.len() as u32;
353 let mut d_col_ptrs = self.memory.alloc::<u64>(key_cols.len())?;
354 let mut d_col_sizes = self.memory.alloc::<u32>(key_cols.len())?;
355 let mut d_col_types = self.memory.alloc::<u8>(key_cols.len())?;
356
357 self.htod_launch_metadata_sync_copy_into(&col_ptrs_host, &mut d_col_ptrs)
358 .map_err(|e| XlogError::Kernel(format!("Failed to upload key column ptrs: {}", e)))?;
359 self.htod_launch_metadata_sync_copy_into(&col_sizes_host, &mut d_col_sizes)
360 .map_err(|e| XlogError::Kernel(format!("Failed to upload key column sizes: {}", e)))?;
361 self.htod_launch_metadata_sync_copy_into(&col_types_host, &mut d_col_types)
362 .map_err(|e| XlogError::Kernel(format!("Failed to upload key column types: {}", e)))?;
363
364 let block_size = 256u32;
365 let num_blocks = num_rows.div_ceil(block_size);
366 let config = LaunchConfig {
367 grid_dim: (num_blocks, 1, 1),
368 block_dim: (block_size, 1, 1),
369 shared_mem_bytes: 0,
370 };
371
372 let d_unique_mask = self.memory.alloc::<u8>(num_rows as usize)?;
373 let d_prefix_sum = self.memory.alloc::<u32>(num_rows as usize)?;
374 let mut d_block_sums = self.memory.alloc::<u32>(num_blocks as usize)?;
375
376 let mark_and_scan_fn = device
377 .get_func(DEDUP_MODULE, dedup_kernels::MARK_UNIQUE_AND_SCAN_COLUMNAR)
378 .ok_or_else(|| {
379 XlogError::Kernel("mark_unique_and_scan_columnar kernel not found".to_string())
380 })?;
381
382 unsafe {
387 mark_and_scan_fn.clone().launch(
388 config,
389 (
390 &d_col_ptrs,
391 &d_col_sizes,
392 &d_col_types,
393 num_key_cols,
394 input.num_rows_device(),
395 num_rows,
396 &d_unique_mask,
397 &d_prefix_sum,
398 &d_block_sums,
399 ),
400 )
401 }
402 .map_err(|e| XlogError::Kernel(format!("mark_unique_and_scan_columnar failed: {}", e)))?;
403 self.device.synchronize()?;
404
405 if num_blocks > 1 {
406 self.multiblock_scan_u32_inplace(&mut d_block_sums, num_blocks)?;
407
408 let phase3_fn = device
409 .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE3)
410 .ok_or_else(|| {
411 XlogError::Kernel("Failed to get multiblock_scan_phase3 kernel".to_string())
412 })?;
413
414 unsafe {
416 phase3_fn.clone().launch(
417 LaunchConfig {
418 grid_dim: (num_blocks, 1, 1),
419 block_dim: (block_size, 1, 1),
420 shared_mem_bytes: 0,
421 },
422 (&d_prefix_sum, &d_block_sums, num_rows),
423 )
424 }
425 .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase3 failed: {}", e)))?;
426 self.device.synchronize()?;
427 }
428
429 self.device.synchronize()?;
430
431 let d_out_count = self.capture_compact_count(&d_prefix_sum, &d_unique_mask, num_rows)?;
432 self.compact_buffer_by_device_mask_device_count(
433 input,
434 &d_unique_mask,
435 &d_prefix_sum,
436 d_out_count,
437 )
438 }
439 pub fn union(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
451 self.union_gpu(a, b)
452 }
453
454 fn concat_buffers_gpu(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
455 if !self.schemas_type_compatible(a.schema(), b.schema()) {
456 return Err(XlogError::Kernel(format!(
457 "Concat requires compatible schemas: {:?} vs {:?}",
458 a.schema(),
459 b.schema()
460 )));
461 }
462
463 let schema = a.schema().clone();
464 let a_rows = self.device_row_count(a)? as u64;
465 let b_rows = self.device_row_count(b)? as u64;
466
467 if a_rows == 0 && b_rows == 0 {
468 return self.create_empty_buffer(schema);
469 }
470 if a_rows == 0 {
471 return self.clone_buffer(b);
472 }
473 if b_rows == 0 {
474 return self.clone_buffer(a);
475 }
476
477 let total_rows = a_rows + b_rows;
478 if total_rows > u32::MAX as u64 {
479 return Err(XlogError::Kernel(format!(
480 "Concat supports at most {} rows, got {}",
481 u32::MAX,
482 total_rows
483 )));
484 }
485
486 let device = self.device.inner();
487 let concat_fn = device
488 .get_func(SET_OPS_MODULE, set_ops_kernels::CONCAT_BYTES)
489 .ok_or_else(|| XlogError::Kernel("concat_bytes kernel not found".to_string()))?;
490
491 let block_size = 256u32;
492
493 let a_rows = usize::try_from(a_rows)
494 .map_err(|_| XlogError::Kernel(format!("Concat: a has too many rows: {}", a_rows)))?;
495 let b_rows = usize::try_from(b_rows)
496 .map_err(|_| XlogError::Kernel(format!("Concat: b has too many rows: {}", b_rows)))?;
497
498 let mut result_columns = Vec::with_capacity(schema.arity());
499 for col_idx in 0..schema.arity() {
500 let elem_size = schema
501 .column_type(col_idx)
502 .map(|t| t.size_bytes())
503 .unwrap_or(4);
504
505 let a_bytes = a_rows
506 .checked_mul(elem_size)
507 .ok_or_else(|| XlogError::Kernel("Concat: a_bytes overflow".to_string()))?;
508 let b_bytes = b_rows
509 .checked_mul(elem_size)
510 .ok_or_else(|| XlogError::Kernel("Concat: b_bytes overflow".to_string()))?;
511 let total_bytes = a_bytes
512 .checked_add(b_bytes)
513 .ok_or_else(|| XlogError::Kernel("Concat: total_bytes overflow".to_string()))?;
514
515 let a_bytes_u32 = u32::try_from(a_bytes).map_err(|_| {
516 XlogError::Kernel(format!("Concat: a_bytes too large: {}", a_bytes))
517 })?;
518 let b_bytes_u32 = u32::try_from(b_bytes).map_err(|_| {
519 XlogError::Kernel(format!("Concat: b_bytes too large: {}", b_bytes))
520 })?;
521 let total_bytes_u32 = u32::try_from(total_bytes).map_err(|_| {
522 XlogError::Kernel(format!("Concat: total_bytes too large: {}", total_bytes))
523 })?;
524
525 let a_col = a
526 .column(col_idx)
527 .ok_or_else(|| XlogError::Kernel(format!("A column {} not found", col_idx)))?;
528 let b_col = b
529 .column(col_idx)
530 .ok_or_else(|| XlogError::Kernel(format!("B column {} not found", col_idx)))?;
531
532 let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
533
534 if total_bytes_u32 > 0 {
535 let grid_size = total_bytes_u32.div_ceil(block_size);
536 let config = LaunchConfig {
537 grid_dim: (grid_size, 1, 1),
538 block_dim: (block_size, 1, 1),
539 shared_mem_bytes: 0,
540 };
541
542 unsafe {
544 concat_fn.clone().launch(
545 config,
546 (a_col, a_bytes_u32, b_col, b_bytes_u32, &mut out_col),
547 )
548 }
549 .map_err(|e| XlogError::Kernel(format!("concat_bytes failed: {}", e)))?;
550 }
551
552 result_columns.push(out_col.into());
553 }
554
555 self.device.synchronize()?;
556
557 self.buffer_from_columns(result_columns, total_rows, schema)
558 }
559 pub fn diff(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
574 let num_a = self.device_row_count(a)?;
575 let num_b = self.device_row_count(b)?;
576 if num_a > u32::MAX as usize || num_b > u32::MAX as usize {
577 return Err(XlogError::Kernel(format!(
578 "Diff supports at most {} rows per side (a={}, b={})",
579 u32::MAX,
580 num_a,
581 num_b
582 )));
583 }
584
585 if num_a == 0 {
587 return self.create_empty_buffer(a.schema().clone());
588 }
589 if num_b == 0 {
590 return self.clone_buffer(a);
591 }
592
593 if !self.schemas_type_compatible(a.schema(), b.schema()) {
595 return Err(XlogError::Kernel(format!(
596 "Diff requires compatible schemas: {:?} vs {:?}",
597 a.schema(),
598 b.schema()
599 )));
600 }
601
602 if a.arity() == 0 {
604 return Err(XlogError::Kernel(
605 "Diff requires at least one column".to_string(),
606 ));
607 }
608
609 let num_b = num_b as u32;
610 let num_a = num_a as u32;
611
612 let hash_table_size = (num_b as usize * 2).max(1024) as u32;
614 let hash_table_alloc_size = (hash_table_size * 3) as usize;
615 let mut hash_table = self.memory.alloc::<u32>(hash_table_alloc_size)?;
616 let mut next_ptrs = self.memory.alloc::<u32>(num_b as usize)?;
617
618 let init_val = 0xFFFFFFFFu32;
620 self.device
621 .inner()
622 .htod_sync_copy_into(&vec![init_val; hash_table_alloc_size], &mut hash_table)
623 .map_err(|e| XlogError::Kernel(format!("Failed to init hash table: {}", e)))?;
624 self.device
625 .inner()
626 .htod_sync_copy_into(&vec![init_val; num_b as usize], &mut next_ptrs)
627 .map_err(|e| XlogError::Kernel(format!("Failed to init next pointers: {}", e)))?;
628
629 let build_func = self
631 .device
632 .inner()
633 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_BUILD)
634 .ok_or_else(|| XlogError::Kernel("hash_join_build kernel not found".to_string()))?;
635
636 let b_key_col = b
637 .column(0)
638 .ok_or_else(|| XlogError::Kernel("B key column not found".to_string()))?;
639 let b_keys_view = self.column_as_u32_view(b_key_col, num_b as usize)?;
640
641 let block_size = 256u32;
642 let build_grid = num_b.div_ceil(block_size);
643 let build_config = LaunchConfig {
644 grid_dim: (build_grid, 1, 1),
645 block_dim: (block_size, 1, 1),
646 shared_mem_bytes: 0,
647 };
648
649 unsafe {
651 build_func
652 .clone()
653 .launch(
654 build_config,
655 (
656 &b_keys_view,
657 &b_keys_view, num_b,
659 &hash_table,
660 &next_ptrs,
661 hash_table_size,
662 ),
663 )
664 .map_err(|e| XlogError::Kernel(format!("Build kernel failed: {}", e)))?;
665 }
666
667 self.device.synchronize()?;
669
670 let a_key_col = a
672 .column(0)
673 .ok_or_else(|| XlogError::Kernel("A key column not found".to_string()))?;
674
675 let mut a_keys_host = vec![0u8; (num_a as usize) * 4];
677 self.dtoh_sync_copy_into_tracked(a_key_col, &mut a_keys_host)
678 .map_err(|e| XlogError::Kernel(format!("Failed to read a keys: {}", e)))?;
679
680 let mut b_keys_host = vec![0u8; (num_b as usize) * 4];
681 self.dtoh_sync_copy_into_tracked(b_key_col, &mut b_keys_host)
682 .map_err(|e| XlogError::Kernel(format!("Failed to read b keys: {}", e)))?;
683
684 let b_keys_set: std::collections::HashSet<u32> = b_keys_host
686 .chunks_exact(4)
687 .map(|chunk| u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
688 .collect();
689
690 let diff_indices: Vec<usize> = a_keys_host
692 .chunks_exact(4)
693 .enumerate()
694 .map(|(i, chunk)| {
695 (
696 i,
697 u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]),
698 )
699 })
700 .filter(|(_, k)| !b_keys_set.contains(k))
701 .map(|(i, _)| i)
702 .collect();
703
704 let diff_count = diff_indices.len() as u64;
705
706 if diff_count == 0 {
707 return self.create_empty_buffer(a.schema().clone());
708 }
709
710 let schema = a.schema().clone();
712 let mut result_columns = Vec::with_capacity(schema.arity());
713
714 for col_idx in 0..schema.arity() {
715 let col_type_size = schema
716 .column_type(col_idx)
717 .map(|t| t.size_bytes())
718 .unwrap_or(4);
719 let result_bytes = (diff_count as usize) * col_type_size;
720
721 if let Some(a_col) = a.column(col_idx) {
722 let a_col_bytes = (num_a as usize) * col_type_size;
724 let mut a_col_host = vec![0u8; a_col_bytes];
725 self.dtoh_sync_copy_into_tracked(a_col, &mut a_col_host)
726 .map_err(|e| XlogError::Kernel(format!("Failed to read column: {}", e)))?;
727
728 let mut result_host = Vec::with_capacity(result_bytes);
730 for &idx in &diff_indices {
731 let start = idx * col_type_size;
732 let end = start + col_type_size;
733 result_host.extend_from_slice(&a_col_host[start..end]);
734 }
735
736 let mut result_col = self.memory.alloc::<u8>(result_bytes)?;
738 self.device
739 .inner()
740 .htod_sync_copy_into(&result_host, &mut result_col)
741 .map_err(|e| XlogError::Kernel(format!("Failed to upload result: {}", e)))?;
742
743 result_columns.push(result_col.into());
744 }
745 }
746
747 self.buffer_from_columns(result_columns, diff_count, schema)
748 }
749 pub fn union_gpu(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
768 if !self.schemas_type_compatible(a.schema(), b.schema()) {
770 return Err(XlogError::Kernel(format!(
771 "Union requires compatible schemas: {:?} vs {:?}",
772 a.schema(),
773 b.schema()
774 )));
775 }
776
777 let schema = a.schema().clone();
778 let a_rows = self.device_row_count(a)?;
779 let b_rows = self.device_row_count(b)?;
780 if schema.arity() == 0 {
781 if a_rows == 0 && b_rows == 0 {
783 return self.create_empty_buffer(schema);
784 }
785 return self.buffer_from_columns(Vec::new(), 1, schema);
786 }
787
788 let key_cols: Vec<usize> = (0..schema.arity()).collect();
789 if a_rows == 0 && b_rows == 0 {
790 return self.create_empty_buffer(schema);
791 }
792
793 if a_rows == 0 {
795 return self.dedup(b, &key_cols);
796 }
797 if b_rows == 0 {
798 return self.dedup(a, &key_cols);
799 }
800
801 let concat = self.concat_buffers_gpu(a, b)?;
802 if Self::use_csm_cuda_graph_env()
803 && schema.arity() > 1
804 && a_rows.saturating_add(b_rows) <= SMALL_FULL_ROW_SORT_MAX_ROWS
805 {
806 return self.dedup_full_row_deterministic(&concat);
807 }
808
809 let sorted = self.sort(&concat, &key_cols)?;
810 self.dedup_sorted(&sorted, &key_cols)
811 }
812
813 pub fn diff_gpu(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
831 let num_a = self.device_row_count(a)?;
832 let num_b = self.device_row_count(b)?;
833 if num_a > u32::MAX as usize || num_b > u32::MAX as usize {
834 return Err(XlogError::Kernel(format!(
835 "Diff supports at most {} rows per side (a={}, b={})",
836 u32::MAX,
837 num_a,
838 num_b
839 )));
840 }
841
842 if num_a == 0 {
843 return self.create_empty_buffer(a.schema().clone());
844 }
845
846 if !self.schemas_type_compatible(a.schema(), b.schema()) {
848 return Err(XlogError::Kernel(format!(
849 "Diff requires compatible schemas: {:?} vs {:?}",
850 a.schema(),
851 b.schema()
852 )));
853 }
854
855 if a.arity() == 0 {
856 if num_b == 0 {
858 return self.buffer_from_columns(Vec::new(), 1, a.schema().clone());
859 }
860 return self.create_empty_buffer(a.schema().clone());
861 }
862
863 let col_type = a
864 .schema()
865 .column_type(0)
866 .ok_or_else(|| XlogError::Kernel("No columns".to_string()))?;
867
868 if a.arity() == 1 && matches!(col_type, ScalarType::U32) && num_b != 0 {
871 return self.diff_gpu_u32(a, b);
872 }
873
874 self.diff_via_deterministic_set(a, b)
875 }
876
877 fn diff_gpu_u32(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
879 if a.arity() != 1 {
881 return self.diff_via_deterministic_set(a, b);
882 }
883
884 let sorted_a = self.sort(a, &[0])?;
886 let deduped_a = self.dedup_sorted(&sorted_a, &[0])?;
887
888 let sorted_b = self.sort(b, &[0])?;
889 let deduped_b = self.dedup_sorted(&sorted_b, &[0])?;
890
891 let num_a = self.device_row_count(&deduped_a)?;
892 let num_b = self.device_row_count(&deduped_b)?;
893 if num_a > u32::MAX as usize || num_b > u32::MAX as usize {
894 return Err(XlogError::Kernel(format!(
895 "Diff supports at most {} rows per side (a={}, b={})",
896 u32::MAX,
897 num_a,
898 num_b
899 )));
900 }
901
902 if num_a == 0 {
903 return self.create_empty_buffer(a.schema().clone());
904 }
905
906 let num_a = num_a as u32;
907 let num_b = num_b as u32;
908
909 let diff_mark_fn = self
911 .device
912 .inner()
913 .get_func(SET_OPS_MODULE, set_ops_kernels::SORTED_DIFF_MARK)
914 .ok_or_else(|| XlogError::Kernel("sorted_diff_mark kernel not found".to_string()))?;
915
916 let a_col = deduped_a
918 .column(0)
919 .ok_or_else(|| XlogError::Kernel("A column 0 not found".to_string()))?;
920 let b_col = deduped_b
921 .column(0)
922 .ok_or_else(|| XlogError::Kernel("B column 0 not found".to_string()))?;
923
924 let a_view = self.column_as_u32_view(a_col, num_a as usize)?;
925 let b_view = self.column_as_u32_view(b_col, num_b as usize)?;
926
927 let diff_mask = self.memory.alloc::<u8>(num_a as usize)?;
929
930 let block_size = 256u32;
932 let grid_size = num_a.div_ceil(block_size);
933 let config = LaunchConfig {
934 grid_dim: (grid_size, 1, 1),
935 block_dim: (block_size, 1, 1),
936 shared_mem_bytes: 0,
937 };
938
939 unsafe {
943 diff_mark_fn.clone().launch(
944 config,
945 (
946 &a_view,
947 deduped_a.num_rows_device(),
948 num_a,
949 &b_view,
950 deduped_b.num_rows_device(),
951 num_b,
952 &diff_mask,
953 ),
954 )
955 }
956 .map_err(|e| XlogError::Kernel(format!("sorted_diff_mark failed: {}", e)))?;
957
958 let device = self.device.inner();
960 let num_blocks = grid_size;
961 let d_prefix_sum = self.memory.alloc::<u32>(num_a as usize)?;
962 let mut d_block_sums = self.memory.alloc::<u32>(num_blocks as usize)?;
963
964 let phase1_fn = device
965 .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE1)
966 .ok_or_else(|| {
967 XlogError::Kernel("Failed to get multiblock_scan_phase1 kernel".to_string())
968 })?;
969
970 unsafe {
972 phase1_fn.clone().launch(
973 LaunchConfig {
974 grid_dim: (num_blocks, 1, 1),
975 block_dim: (block_size, 1, 1),
976 shared_mem_bytes: 0,
977 },
978 (&diff_mask, &d_prefix_sum, &d_block_sums, num_a),
979 )
980 }
981 .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase1 failed: {}", e)))?;
982
983 if num_blocks > 1 {
984 self.multiblock_scan_u32_inplace(&mut d_block_sums, num_blocks)?;
985
986 let phase3_fn = device
987 .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE3)
988 .ok_or_else(|| {
989 XlogError::Kernel("Failed to get multiblock_scan_phase3 kernel".to_string())
990 })?;
991
992 unsafe {
994 phase3_fn.clone().launch(
995 LaunchConfig {
996 grid_dim: (num_blocks, 1, 1),
997 block_dim: (block_size, 1, 1),
998 shared_mem_bytes: 0,
999 },
1000 (&d_prefix_sum, &d_block_sums, num_a),
1001 )
1002 }
1003 .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase3 failed: {}", e)))?;
1004 }
1005
1006 self.device.synchronize()?;
1007
1008 let d_out_count = self.capture_compact_count(&d_prefix_sum, &diff_mask, num_a)?;
1009 self.compact_buffer_by_device_mask_device_count(
1010 &deduped_a,
1011 &diff_mask,
1012 &d_prefix_sum,
1013 d_out_count,
1014 )
1015 }
1016
1017 fn diff_via_deterministic_set(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
1040 let deduped_a = self.dedup_full_row_deterministic(a)?;
1048 let deduped_b = self.dedup_full_row_deterministic(b)?;
1049
1050 let a_rows = self.device_row_count(&deduped_a)? as u32;
1051 let b_rows = self.device_row_count(&deduped_b)? as u32;
1052 if a_rows == 0 {
1053 return self.create_empty_buffer(a.schema().clone());
1054 }
1055 if b_rows == 0 {
1056 return Ok(deduped_a);
1057 }
1058 let arity = deduped_a.arity();
1059 if arity == 0 {
1060 return self.create_empty_buffer(a.schema().clone());
1062 }
1063
1064 let sorted_b = deduped_b;
1070
1071 let schema = deduped_a.schema().clone();
1075 let device = self.device.inner();
1076
1077 let mut a_col_ptrs: Vec<u64> = Vec::with_capacity(arity);
1078 let mut b_col_ptrs: Vec<u64> = Vec::with_capacity(arity);
1079 let mut col_sizes: Vec<u32> = Vec::with_capacity(arity);
1080 let mut col_types: Vec<u8> = Vec::with_capacity(arity);
1081 for col_idx in 0..arity {
1082 let a_col = deduped_a.column(col_idx).ok_or_else(|| {
1083 XlogError::Kernel(format!("diff_full_row: a column {} missing", col_idx))
1084 })?;
1085 let b_col = sorted_b.column(col_idx).ok_or_else(|| {
1086 XlogError::Kernel(format!("diff_full_row: b column {} missing", col_idx))
1087 })?;
1088 let ty = schema.column_type(col_idx).ok_or_else(|| {
1089 XlogError::Kernel(format!("diff_full_row: column {} type missing", col_idx))
1090 })?;
1091 a_col_ptrs.push(*a_col.device_ptr());
1092 b_col_ptrs.push(*b_col.device_ptr());
1093 col_sizes.push(ty.size_bytes() as u32);
1094 col_types.push(scalar_type_code_dedup(ty));
1095 }
1096
1097 let mut d_a_ptrs = self.memory.alloc::<u64>(arity)?;
1098 let mut d_b_ptrs = self.memory.alloc::<u64>(arity)?;
1099 let mut d_sizes = self.memory.alloc::<u32>(arity)?;
1100 let mut d_types = self.memory.alloc::<u8>(arity)?;
1101 self.htod_launch_metadata_sync_copy_into(&a_col_ptrs, &mut d_a_ptrs)
1102 .map_err(|e| XlogError::Kernel(format!("diff_full_row a ptr upload: {}", e)))?;
1103 self.htod_launch_metadata_sync_copy_into(&b_col_ptrs, &mut d_b_ptrs)
1104 .map_err(|e| XlogError::Kernel(format!("diff_full_row b ptr upload: {}", e)))?;
1105 self.htod_launch_metadata_sync_copy_into(&col_sizes, &mut d_sizes)
1106 .map_err(|e| XlogError::Kernel(format!("diff_full_row size upload: {}", e)))?;
1107 self.htod_launch_metadata_sync_copy_into(&col_types, &mut d_types)
1108 .map_err(|e| XlogError::Kernel(format!("diff_full_row type upload: {}", e)))?;
1109
1110 let block_size = 256u32;
1111 let grid = a_rows.div_ceil(block_size);
1112 let cfg = LaunchConfig {
1113 grid_dim: (grid, 1, 1),
1114 block_dim: (block_size, 1, 1),
1115 shared_mem_bytes: 0,
1116 };
1117
1118 let d_keep_mask = self.memory.alloc::<u8>(a_rows as usize)?;
1119 let diff_fn = device
1120 .get_func(DEDUP_MODULE, dedup_kernels::MARK_DIFF_FULL_ROW_TYPED_SORTED)
1121 .ok_or_else(|| {
1122 XlogError::Kernel("mark_diff_full_row_typed_sorted kernel not found".to_string())
1123 })?;
1124 unsafe {
1128 diff_fn.clone().launch(
1129 cfg,
1130 (
1131 &d_a_ptrs,
1132 &d_b_ptrs,
1133 &d_sizes,
1134 &d_types,
1135 arity as u32,
1136 deduped_a.num_rows_device(),
1137 b_rows,
1138 a_rows,
1139 &d_keep_mask,
1140 ),
1141 )
1142 }
1143 .map_err(|e| XlogError::Kernel(format!("mark_diff_full_row_typed_sorted launch: {}", e)))?;
1144 self.device.synchronize()?;
1145
1146 let (d_prefix_sum, d_out_count) =
1148 self.scan_mask_to_prefix_with_count(&d_keep_mask, a_rows)?;
1149
1150 self.compact_buffer_by_device_mask_device_count(
1151 &deduped_a,
1152 &d_keep_mask,
1153 &d_prefix_sum,
1154 d_out_count,
1155 )
1156 }
1157
1158 pub fn dedup_full_row(&self, input: &CudaBuffer) -> Result<CudaBuffer> {
1187 if Self::use_recorded_dedup_env() && input.num_rows() > 1 && input.arity() > 0 {
1191 if let Some(launch_stream) = self.recorded_op_stream_or_init() {
1192 let recorded_compatible = (0..input.arity()).all(|c| {
1193 matches!(
1194 input.schema.column_type(c),
1195 Some(ScalarType::U32) | Some(ScalarType::Symbol)
1196 )
1197 });
1198 if recorded_compatible {
1199 return self.dedup_full_row_recorded(input, launch_stream);
1200 }
1201 }
1202 }
1203 self.dedup_full_row_deterministic(input)
1204 }
1205
1206 pub fn diff_full_row(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
1211 self.diff_gpu(a, b)
1215 }
1216
1217 fn read_join_output_count_metadata(&self, d_count: &TrackedCudaSlice<u32>) -> Result<u32> {
1239 self.dtoh_scalar_untracked::<u32>(d_count, 0)
1246 .map_err(|e| match e {
1247 XlogError::Kernel(message) => {
1248 XlogError::Kernel(format!("Failed to read output count: {}", message))
1249 }
1250 other => XlogError::Kernel(format!("Failed to read output count: {}", other)),
1251 })
1252 }
1253
1254 fn is_full_row_key(key_cols: &[usize], arity: usize) -> bool {
1255 key_cols.len() == arity
1256 && key_cols
1257 .iter()
1258 .copied()
1259 .enumerate()
1260 .all(|(expected, actual)| expected == actual)
1261 }
1262
/// Deduplicates `input` over all of its columns by sorting and then compacting a row mask.
///
/// Steps performed below:
/// 1. Short-circuits: 0 rows -> empty buffer, 1 row -> clone of the input, more than
///    `u32::MAX` rows -> error, zero arity -> a column-less buffer built with a count of 1.
/// 2. Sorts on every column. The small single-launch sort is used when
///    `use_csm_cuda_graph_env()` is true and the row count is at most
///    `SMALL_FULL_ROW_SORT_MAX_ROWS`; otherwise the general `sort` is used.
/// 3. Launches `mark_unique_full_row_bytewise` over the sorted rows to fill a byte mask.
/// 4. Scans the mask and compacts the sorted buffer with the resulting prefix sums/count.
///
/// # Errors
/// Returns `XlogError::Kernel` when the row limit is exceeded, a sorted column or its
/// schema type is missing, the kernel is not found, or an upload/launch fails. Errors from
/// the helper calls are propagated unchanged.
fn dedup_full_row_deterministic(&self, input: &CudaBuffer) -> Result<CudaBuffer> {
    let row_count = self.device_row_count(input)?;
    if row_count == 0 {
        return self.create_empty_buffer(input.schema().clone());
    }
    if row_count == 1 {
        return self.clone_buffer(input);
    }
    // Row counts are passed to the kernels as `u32` below.
    if row_count > u32::MAX as usize {
        return Err(XlogError::Kernel(format!(
            "dedup_full_row supports at most {} rows, got {}",
            u32::MAX,
            row_count
        )));
    }
    let arity = input.arity();
    if arity == 0 {
        // No columns: the result carries no column data and a count of 1.
        return self.buffer_from_columns(Vec::new(), 1, input.schema().clone());
    }

    let sorted = if Self::use_csm_cuda_graph_env() && row_count <= SMALL_FULL_ROW_SORT_MAX_ROWS
    {
        self.small_sort_full_row_deterministic(input, row_count)?
    } else {
        let all_cols: Vec<usize> = (0..arity).collect();
        self.sort(input, &all_cols)?
    };

    // Re-read the row count from the sorted buffer; nothing is left to compact for <= 1 row.
    let n = self.device_row_count(&sorted)? as u32;
    if n <= 1 {
        return Ok(sorted);
    }

    // Collect per-column device pointers and element sizes for the kernel's metadata arrays.
    let device = self.device.inner();
    let mut col_ptrs_host: Vec<u64> = Vec::with_capacity(arity);
    let mut col_sizes_host: Vec<u32> = Vec::with_capacity(arity);
    for col_idx in 0..arity {
        let col = sorted
            .column(col_idx)
            .ok_or_else(|| XlogError::Kernel(format!("Sorted column {} not found", col_idx)))?;
        let ty = sorted.schema().column_type(col_idx).ok_or_else(|| {
            XlogError::Kernel(format!("Sorted column {} type missing", col_idx))
        })?;
        col_ptrs_host.push(*col.device_ptr());
        col_sizes_host.push(ty.size_bytes() as u32);
    }

    let mut d_col_ptrs = self.memory.alloc::<u64>(arity)?;
    let mut d_col_sizes = self.memory.alloc::<u32>(arity)?;
    self.htod_launch_metadata_sync_copy_into(&col_ptrs_host, &mut d_col_ptrs)
        .map_err(|e| XlogError::Kernel(format!("dedup_full_row_gpu col ptr upload: {}", e)))?;
    self.htod_launch_metadata_sync_copy_into(&col_sizes_host, &mut d_col_sizes)
        .map_err(|e| XlogError::Kernel(format!("dedup_full_row_gpu col size upload: {}", e)))?;

    // One thread per row, 256 threads per block.
    let block_size = 256u32;
    let grid = n.div_ceil(block_size);
    let cfg = LaunchConfig {
        grid_dim: (grid, 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    let d_unique_mask = self.memory.alloc::<u8>(n as usize)?;
    let mark_fn = device
        .get_func(DEDUP_MODULE, dedup_kernels::MARK_UNIQUE_FULL_ROW_BYTEWISE)
        .ok_or_else(|| {
            XlogError::Kernel("mark_unique_full_row_bytewise kernel not found".to_string())
        })?;

    // SAFETY (assumed): the argument tuple must match the kernel's parameter list, and all
    // referenced device buffers are locals that outlive the synchronize below.
    // NOTE(review): presumably the kernel flags the first row of each run of equal rows —
    // confirm against the kernel source.
    unsafe {
        mark_fn.clone().launch(
            cfg,
            (
                &d_col_ptrs,
                &d_col_sizes,
                arity as u32,
                sorted.num_rows_device(),
                n,
                &d_unique_mask,
            ),
        )
    }
    .map_err(|e| {
        XlogError::Kernel(format!(
            "mark_unique_full_row_bytewise launch failed: {}",
            e
        ))
    })?;
    self.device.synchronize()?;

    // Turn the mask into prefix sums plus a device-side output count.
    let (d_prefix_sum, d_out_count) = self.scan_mask_to_prefix_with_count(&d_unique_mask, n)?;

    self.compact_buffer_by_device_mask_device_count(
        &sorted,
        &d_unique_mask,
        &d_prefix_sum,
        d_out_count,
    )
}
1389
/// Sorts up to `SMALL_FULL_ROW_SORT_MAX_ROWS` rows on all columns with a single kernel launch.
///
/// The `small_sort_full_row_indices_typed` kernel is launched with one block of
/// `SMALL_FULL_ROW_SORT_MAX_ROWS` threads and fills an index buffer of `row_count` entries;
/// the input is then gathered through those indices into a new buffer.
///
/// # Parameters
/// * `input` - buffer to sort; each column must hold exactly `num_rows * elem_size` bytes.
/// * `row_count` - number of rows to sort; must not exceed `SMALL_FULL_ROW_SORT_MAX_ROWS`.
///
/// # Errors
/// Returns `XlogError::Kernel` when `row_count` is too large, a column or its type is
/// missing, a column's byte size does not match the schema, the kernel is not found, or an
/// upload/launch fails. Errors from helper calls are propagated.
fn small_sort_full_row_deterministic(
    &self,
    input: &CudaBuffer,
    row_count: usize,
) -> Result<CudaBuffer> {
    if row_count > SMALL_FULL_ROW_SORT_MAX_ROWS {
        return Err(XlogError::Kernel(format!(
            "small full-row sort supports at most {} rows, got {}",
            SMALL_FULL_ROW_SORT_MAX_ROWS, row_count
        )));
    }
    if row_count == 0 {
        return self.create_empty_buffer(input.schema().clone());
    }
    if row_count == 1 {
        return self.clone_buffer(input);
    }

    // Build the per-column metadata (pointer, element size, type code) on the host.
    let arity = input.arity();
    let device = self.device.inner();
    let mut col_ptrs_host: Vec<u64> = Vec::with_capacity(arity);
    let mut col_sizes_host: Vec<u32> = Vec::with_capacity(arity);
    let mut col_types_host: Vec<u8> = Vec::with_capacity(arity);
    for col_idx in 0..arity {
        let col = input.column(col_idx).ok_or_else(|| {
            XlogError::Kernel(format!("small full-row sort: column {} missing", col_idx))
        })?;
        let ty = input.schema().column_type(col_idx).ok_or_else(|| {
            XlogError::Kernel(format!(
                "small full-row sort: column {} type missing",
                col_idx
            ))
        })?;
        let elem_size = ty.size_bytes();
        // Checked arithmetic: reject columns whose expected byte size cannot be represented.
        let expected_bytes_u64 =
            input
                .num_rows()
                .checked_mul(elem_size as u64)
                .ok_or_else(|| {
                    XlogError::Kernel(
                        "small full-row sort: column byte-size overflow".to_string(),
                    )
                })?;
        let expected_bytes = usize::try_from(expected_bytes_u64).map_err(|_| {
            XlogError::Kernel(format!(
                "small full-row sort: expected byte size {} exceeds usize::MAX",
                expected_bytes_u64
            ))
        })?;
        // The column allocation must match the schema exactly before its pointer is handed
        // to the kernel.
        if col.num_bytes() != expected_bytes {
            return Err(XlogError::Kernel(format!(
                "small full-row sort: column {} has {} bytes but expected {}",
                col_idx,
                col.num_bytes(),
                expected_bytes
            )));
        }
        col_ptrs_host.push(*col.device_ptr());
        col_sizes_host.push(elem_size as u32);
        col_types_host.push(scalar_type_code_dedup(ty));
    }

    let mut d_col_ptrs = self.memory.alloc::<u64>(arity)?;
    let mut d_col_sizes = self.memory.alloc::<u32>(arity)?;
    let mut d_col_types = self.memory.alloc::<u8>(arity)?;
    self.htod_launch_metadata_sync_copy_into(&col_ptrs_host, &mut d_col_ptrs)
        .map_err(|e| XlogError::Kernel(format!("small full-row sort ptr upload: {}", e)))?;
    self.htod_launch_metadata_sync_copy_into(&col_sizes_host, &mut d_col_sizes)
        .map_err(|e| XlogError::Kernel(format!("small full-row sort size upload: {}", e)))?;
    self.htod_launch_metadata_sync_copy_into(&col_types_host, &mut d_col_types)
        .map_err(|e| XlogError::Kernel(format!("small full-row sort type upload: {}", e)))?;

    let mut d_indices = self.memory.alloc::<u32>(row_count)?;
    let sort_fn = device
        .get_func(
            DEDUP_MODULE,
            dedup_kernels::SMALL_SORT_FULL_ROW_INDICES_TYPED,
        )
        .ok_or_else(|| {
            XlogError::Kernel("small_sort_full_row_indices_typed kernel not found".to_string())
        })?;
    // A single block sized to the maximum supported row count, regardless of `row_count`.
    let cfg = LaunchConfig {
        grid_dim: (1, 1, 1),
        block_dim: (SMALL_FULL_ROW_SORT_MAX_ROWS as u32, 1, 1),
        shared_mem_bytes: 0,
    };

    // SAFETY (assumed): the argument tuple must match the kernel's parameter list; the
    // metadata and index buffers are locals that outlive the synchronize below.
    unsafe {
        sort_fn.clone().launch(
            cfg,
            (
                &d_col_ptrs,
                &d_col_sizes,
                &d_col_types,
                arity as u32,
                input.num_rows_device(),
                row_count as u32,
                &mut d_indices,
            ),
        )
    }
    .map_err(|e| {
        XlogError::Kernel(format!(
            "small_sort_full_row_indices_typed launch failed: {}",
            e
        ))
    })?;
    self.device.synchronize()?;
    // Counts completed small-sort launches.
    self.small_full_row_sort_invocations
        .fetch_add(1, Ordering::Relaxed);

    self.gather_buffer_by_indices(input, &d_indices, row_count as u32)
}
1506
/// Scans a byte mask of `n` entries into per-element prefix sums and a device-side count.
///
/// Launches `multiblock_scan_phase1` with 256-thread blocks. When more than one block is
/// needed, the per-block sums are scanned in place and `multiblock_scan_phase3` is launched
/// with both buffers. After synchronizing, the count is obtained from
/// `capture_compact_count`.
///
/// # Returns
/// `(prefix_sum, out_count)`, both device buffers.
/// NOTE(review): whether the prefix sum is inclusive or exclusive is defined by the scan
/// kernels and is not visible here — confirm before relying on it.
///
/// # Errors
/// Returns `XlogError::Kernel` when a scan kernel is missing or a launch fails; allocation,
/// synchronize and helper errors are propagated.
fn scan_mask_to_prefix_with_count(
    &self,
    d_mask: &cudarc::driver::CudaSlice<u8>,
    n: u32,
) -> Result<(
    crate::memory::TrackedCudaSlice<u32>,
    crate::memory::TrackedCudaSlice<u32>,
)> {
    let device = self.device.inner();
    let block_size = 256u32;
    let num_blocks = n.div_ceil(block_size);

    let d_prefix_sum = self.memory.alloc::<u32>(n as usize)?;
    let mut d_block_sums = self.memory.alloc::<u32>(num_blocks as usize)?;

    let phase1_fn = device
        .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE1)
        .ok_or_else(|| {
            XlogError::Kernel("Failed to get multiblock_scan_phase1 kernel".to_string())
        })?;
    // SAFETY (assumed): the argument tuple must match the kernel's parameter list; the
    // buffers are sized from `n` / `num_blocks` above.
    unsafe {
        phase1_fn.clone().launch(
            LaunchConfig {
                grid_dim: (num_blocks, 1, 1),
                block_dim: (block_size, 1, 1),
                shared_mem_bytes: 0,
            },
            (d_mask, &d_prefix_sum, &d_block_sums, n),
        )
    }
    .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase1 failed: {}", e)))?;

    // With a single block there are no cross-block offsets to combine.
    if num_blocks > 1 {
        self.multiblock_scan_u32_inplace(&mut d_block_sums, num_blocks)?;

        let phase3_fn = device
            .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE3)
            .ok_or_else(|| {
                XlogError::Kernel("Failed to get multiblock_scan_phase3 kernel".to_string())
            })?;
        // SAFETY (assumed): same buffers and sizes as phase 1; the tuple must match the
        // kernel's parameter list.
        unsafe {
            phase3_fn.clone().launch(
                LaunchConfig {
                    grid_dim: (num_blocks, 1, 1),
                    block_dim: (block_size, 1, 1),
                    shared_mem_bytes: 0,
                },
                (&d_prefix_sum, &d_block_sums, n),
            )
        }
        .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase3 failed: {}", e)))?;
    }
    self.device.synchronize()?;

    let d_out_count = self.capture_compact_count(&d_prefix_sum, d_mask, n)?;
    Ok((d_prefix_sum, d_out_count))
}
1570
/// Threads per block used by the sort, gather and permutation kernel launches in this impl.
pub(super) const SORT_BLOCK_SIZE: u32 = 256;
1574
/// Sorts the rows of `input` by the columns listed in `key_cols` and returns a new buffer.
///
/// A row permutation is built on the device: indices are initialised by `init_indices`,
/// then, for each key column from last to first, 32-bit key words are gathered in the
/// current permutation order and the (key, index) pairs are radix sorted. 64-bit column
/// types take two such rounds (low word, then high word). The final permutation is applied
/// to every column.
///
/// NOTE(review): processing keys last-to-first only yields a lexicographic order if each
/// radix round is stable; the scatter kernel is named `radix_scatter_stable`, which suggests
/// so — confirm against the kernel source.
///
/// When `use_recorded_sort_env()` is true, a recorded-op stream is available and every key
/// column is `U32` or `Symbol`, the call is delegated to `sort_recorded` instead.
///
/// # Errors
/// Returns `XlogError::Kernel` when `key_cols` is empty (for a non-empty input), the input
/// has more than `u32::MAX` rows, a key index is out of bounds, a column or its schema type
/// is missing, a kernel is not found, or a launch fails. Helper errors are propagated.
pub fn sort(&self, input: &CudaBuffer, key_cols: &[usize]) -> Result<CudaBuffer> {
    // Optional recorded path; only taken for non-empty inputs with U32/Symbol keys.
    if Self::use_recorded_sort_env() && !key_cols.is_empty() && input.num_rows() > 0 {
        if let Some(launch_stream) = self.recorded_op_stream_or_init() {
            let recorded_compatible = key_cols.iter().all(|&k| {
                matches!(
                    input.schema.column_type(k),
                    Some(ScalarType::U32) | Some(ScalarType::Symbol)
                )
            });
            if recorded_compatible {
                return self.sort_recorded(input, key_cols, launch_stream);
            }
        }
    }

    // An empty input sorts to an empty buffer, even when `key_cols` is empty.
    if input.num_rows() == 0 {
        return self.create_empty_buffer(input.schema.clone());
    }

    if key_cols.is_empty() {
        return Err(XlogError::Kernel(
            "Sort requires at least one key column".to_string(),
        ));
    }

    // Row indices are `u32` on the device.
    if input.num_rows() > u32::MAX as u64 {
        return Err(XlogError::Kernel(format!(
            "Sort supports at most {} rows, got {}",
            u32::MAX,
            input.num_rows()
        )));
    }

    for &key_col in key_cols {
        if key_col >= input.arity() {
            return Err(XlogError::Kernel(format!(
                "Key column index {} out of bounds (arity {})",
                key_col,
                input.arity()
            )));
        }
    }

    let n = input.num_rows() as u32;
    let d_num_rows = input.num_rows_device();
    let device = self.device.inner();

    // One thread per row for the init and gather kernels.
    let block_size = Self::SORT_BLOCK_SIZE;
    let grid_size = n.div_ceil(block_size);
    let launch_config = LaunchConfig {
        grid_dim: (grid_size, 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    let init_fn = device
        .get_func(SORT_MODULE, sort_kernels::INIT_INDICES)
        .ok_or_else(|| XlogError::Kernel("init_indices kernel not found".to_string()))?;

    // `*_a` hold the current state; `*_b` are the ping-pong partners for the radix passes.
    let mut indices_a = self.memory.alloc::<u32>(n as usize)?;
    let mut indices_b = self.memory.alloc::<u32>(n as usize)?;

    // SAFETY (assumed): the argument tuple must match the kernel's parameter list;
    // `indices_a` was allocated with `n` elements above.
    unsafe {
        init_fn
            .clone()
            .launch(launch_config, (&mut indices_a, d_num_rows, n))
    }
    .map_err(|e| XlogError::Kernel(format!("init_indices failed: {}", e)))?;
    self.device.synchronize()?;

    let mut keys_a = self.memory.alloc::<u32>(n as usize)?;
    let mut keys_b = self.memory.alloc::<u32>(n as usize)?;

    // Radix scratch shared by every round: 16 counters per block, 16 digit prefixes, and
    // one rank per row.
    let mut d_hist = self.memory.alloc::<u32>((grid_size as usize) * 16)?;
    let mut d_prefix = self.memory.alloc::<u32>(16)?;
    let mut d_ranks = self.memory.alloc::<u32>(n as usize)?;

    // Least-significant key column first (see the stability note in the doc comment).
    for &col_idx in key_cols.iter().rev() {
        let ty = input.schema.column_type(col_idx).ok_or_else(|| {
            XlogError::Kernel(format!("Key column {} type not found in schema", col_idx))
        })?;

        let col = input
            .column(col_idx)
            .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", col_idx)))?;

        match ty {
            // 32-bit keys used as-is: gather them through the current permutation.
            ScalarType::U32 | ScalarType::Symbol => {
                let col_view = self.column_as_u32_view(col, n as usize)?;
                let gather_fn = device
                    .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_U32)
                    .ok_or_else(|| {
                        XlogError::Kernel("apply_permutation_u32 kernel not found".to_string())
                    })?;

                // SAFETY (assumed): tuple matches the kernel's parameter list; all buffers
                // hold `n` elements.
                unsafe {
                    gather_fn.clone().launch(
                        launch_config,
                        (&col_view, &mut keys_a, &indices_a, d_num_rows, n),
                    )
                }
                .map_err(|e| {
                    XlogError::Kernel(format!("apply_permutation_u32 failed: {}", e))
                })?;

                self.radix_sort_u32_pairs_with_scratch(
                    &mut keys_a,
                    &mut keys_b,
                    &mut indices_a,
                    &mut indices_b,
                    &mut d_hist,
                    &mut d_prefix,
                    &mut d_ranks,
                    d_num_rows,
                    n,
                )?;
            }
            // Signed 32-bit keys: a dedicated gather kernel produces the u32 sort keys.
            ScalarType::I32 => {
                let col_bits = self.column_as_u32_view(col, n as usize)?;
                let gather_fn = device
                    .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_I32_ORDERED_U32)
                    .ok_or_else(|| {
                        XlogError::Kernel(
                            "gather_keys_i32_ordered_u32 kernel not found".to_string(),
                        )
                    })?;

                // SAFETY (assumed): tuple matches the kernel's parameter list; all buffers
                // hold `n` elements.
                unsafe {
                    gather_fn.clone().launch(
                        launch_config,
                        (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
                    )
                }
                .map_err(|e| {
                    XlogError::Kernel(format!("gather_keys_i32_ordered_u32 failed: {}", e))
                })?;

                self.radix_sort_u32_pairs_with_scratch(
                    &mut keys_a,
                    &mut keys_b,
                    &mut indices_a,
                    &mut indices_b,
                    &mut d_hist,
                    &mut d_prefix,
                    &mut d_ranks,
                    d_num_rows,
                    n,
                )?;
            }
            // 32-bit floats: a dedicated gather kernel produces the u32 sort keys.
            ScalarType::F32 => {
                let col_bits = self.column_as_u32_view(col, n as usize)?;
                let gather_fn = device
                    .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_F32_ORDERED_U32)
                    .ok_or_else(|| {
                        XlogError::Kernel(
                            "gather_keys_f32_ordered_u32 kernel not found".to_string(),
                        )
                    })?;

                // SAFETY (assumed): tuple matches the kernel's parameter list; all buffers
                // hold `n` elements.
                unsafe {
                    gather_fn.clone().launch(
                        launch_config,
                        (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
                    )
                }
                .map_err(|e| {
                    XlogError::Kernel(format!("gather_keys_f32_ordered_u32 failed: {}", e))
                })?;

                self.radix_sort_u32_pairs_with_scratch(
                    &mut keys_a,
                    &mut keys_b,
                    &mut indices_a,
                    &mut indices_b,
                    &mut d_hist,
                    &mut d_prefix,
                    &mut d_ranks,
                    d_num_rows,
                    n,
                )?;
            }
            // Booleans: the raw column is passed to the kernel, so only its byte length is
            // validated here.
            ScalarType::Bool => {
                if col.num_bytes() < n as usize {
                    return Err(XlogError::Kernel(format!(
                        "Bool column {} has {} bytes but expected {}",
                        col_idx,
                        col.num_bytes(),
                        n
                    )));
                }

                let gather_fn = device
                    .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_BOOL_ORDERED_U32)
                    .ok_or_else(|| {
                        XlogError::Kernel(
                            "gather_keys_bool_ordered_u32 kernel not found".to_string(),
                        )
                    })?;

                // SAFETY (assumed): tuple matches the kernel's parameter list; the column
                // was checked above to hold at least `n` bytes.
                unsafe {
                    gather_fn
                        .clone()
                        .launch(launch_config, (col, &indices_a, d_num_rows, n, &mut keys_a))
                }
                .map_err(|e| {
                    XlogError::Kernel(format!("gather_keys_bool_ordered_u32 failed: {}", e))
                })?;

                self.radix_sort_u32_pairs_with_scratch(
                    &mut keys_a,
                    &mut keys_b,
                    &mut indices_a,
                    &mut indices_b,
                    &mut d_hist,
                    &mut d_prefix,
                    &mut d_ranks,
                    d_num_rows,
                    n,
                )?;
            }
            // 64-bit keys: two rounds, low word first and high word second.
            ScalarType::U64 => {
                let col_bits = self.column_as_u64_view(col, n as usize)?;
                for &word in &[
                    sort_kernels::GATHER_KEYS_U64_LO_U32,
                    sort_kernels::GATHER_KEYS_U64_HI_U32,
                ] {
                    let gather_fn = device.get_func(SORT_MODULE, word).ok_or_else(|| {
                        XlogError::Kernel(format!("{} kernel not found", word))
                    })?;

                    // SAFETY (assumed): tuple matches the kernel's parameter list; all
                    // buffers hold `n` elements.
                    unsafe {
                        gather_fn.clone().launch(
                            launch_config,
                            (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
                        )
                    }
                    .map_err(|e| XlogError::Kernel(format!("{} failed: {}", word, e)))?;

                    self.radix_sort_u32_pairs_with_scratch(
                        &mut keys_a,
                        &mut keys_b,
                        &mut indices_a,
                        &mut indices_b,
                        &mut d_hist,
                        &mut d_prefix,
                        &mut d_ranks,
                        d_num_rows,
                        n,
                    )?;
                }
            }
            // Signed 64-bit keys: same two-round scheme with the I64 gather kernels.
            ScalarType::I64 => {
                let col_bits = self.column_as_u64_view(col, n as usize)?;
                for &word in &[
                    sort_kernels::GATHER_KEYS_I64_LO_U32,
                    sort_kernels::GATHER_KEYS_I64_HI_U32,
                ] {
                    let gather_fn = device.get_func(SORT_MODULE, word).ok_or_else(|| {
                        XlogError::Kernel(format!("{} kernel not found", word))
                    })?;

                    // SAFETY (assumed): tuple matches the kernel's parameter list; all
                    // buffers hold `n` elements.
                    unsafe {
                        gather_fn.clone().launch(
                            launch_config,
                            (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
                        )
                    }
                    .map_err(|e| XlogError::Kernel(format!("{} failed: {}", word, e)))?;

                    self.radix_sort_u32_pairs_with_scratch(
                        &mut keys_a,
                        &mut keys_b,
                        &mut indices_a,
                        &mut indices_b,
                        &mut d_hist,
                        &mut d_prefix,
                        &mut d_ranks,
                        d_num_rows,
                        n,
                    )?;
                }
            }
            // 64-bit floats: same two-round scheme with the F64 gather kernels.
            ScalarType::F64 => {
                let col_bits = self.column_as_u64_view(col, n as usize)?;
                for &word in &[
                    sort_kernels::GATHER_KEYS_F64_LO_U32,
                    sort_kernels::GATHER_KEYS_F64_HI_U32,
                ] {
                    let gather_fn = device.get_func(SORT_MODULE, word).ok_or_else(|| {
                        XlogError::Kernel(format!("{} kernel not found", word))
                    })?;

                    // SAFETY (assumed): tuple matches the kernel's parameter list; all
                    // buffers hold `n` elements.
                    unsafe {
                        gather_fn.clone().launch(
                            launch_config,
                            (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
                        )
                    }
                    .map_err(|e| XlogError::Kernel(format!("{} failed: {}", word, e)))?;

                    self.radix_sort_u32_pairs_with_scratch(
                        &mut keys_a,
                        &mut keys_b,
                        &mut indices_a,
                        &mut indices_b,
                        &mut d_hist,
                        &mut d_prefix,
                        &mut d_ranks,
                        d_num_rows,
                        n,
                    )?;
                }
            }
        }
    }

    // `indices_a` now holds the final row permutation; apply it to every column.
    self.apply_permutation_gpu(input, &indices_a)
}
1926
/// Radix sorts `(key, index)` pairs held in `keys_a` / `indices_a`, using caller-provided
/// scratch buffers.
///
/// Runs 8 passes of 4 bits each (`shift = pass * 4`), ping-ponging between the `*_a` and
/// `*_b` buffers. Each pass launches, in order: `radix_histogram`,
/// `compute_digit_prefix_sums`, a scan of each of the 16 per-digit histogram rows,
/// `compute_ranks`, and `radix_scatter_stable`, synchronizing the device between steps.
/// Because the pass count is even, the sorted data ends up back in `keys_a` / `indices_a`.
///
/// # Parameters
/// * `keys_a`, `indices_a` - input and final output buffers.
/// * `keys_b`, `indices_b` - ping-pong partners of the same length.
/// * `hist` - histogram scratch, sliced below as 16 rows of `grid_size` counters.
/// * `prefix` - per-digit prefix scratch written by `compute_digit_prefix_sums`.
/// * `ranks` - per-row rank scratch written by `compute_ranks`.
/// * `num_rows_device` - device-side row count passed through to the kernels.
/// * `row_cap` - host-side row count used to size the launch grid.
///
/// # Errors
/// Returns `XlogError::Kernel` when a kernel is missing or a launch fails, and propagates
/// synchronize/scan errors.
#[allow(clippy::too_many_arguments)]
fn radix_sort_u32_pairs_with_scratch(
    &self,
    keys_a: &mut crate::memory::TrackedCudaSlice<u32>,
    keys_b: &mut crate::memory::TrackedCudaSlice<u32>,
    indices_a: &mut crate::memory::TrackedCudaSlice<u32>,
    indices_b: &mut crate::memory::TrackedCudaSlice<u32>,
    hist: &mut crate::memory::TrackedCudaSlice<u32>,
    prefix: &mut crate::memory::TrackedCudaSlice<u32>,
    ranks: &mut crate::memory::TrackedCudaSlice<u32>,
    num_rows_device: &crate::memory::TrackedCudaSlice<u32>,
    row_cap: u32,
) -> Result<()> {
    if row_cap == 0 {
        return Ok(());
    }
    // Wait for work issued by the caller (e.g. the key gather) before starting the passes.
    self.device.synchronize()?;

    let device = self.device.inner();
    let block_size = Self::SORT_BLOCK_SIZE;
    let grid_size = row_cap.div_ceil(block_size);

    // One thread per row for the histogram, rank and scatter kernels.
    let sort_config = LaunchConfig {
        grid_dim: (grid_size, 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    // Resolve every kernel up front so a missing one fails before any pass runs.
    let histogram_fn = device
        .get_func(SORT_MODULE, sort_kernels::RADIX_HISTOGRAM)
        .ok_or_else(|| XlogError::Kernel("radix_histogram kernel not found".to_string()))?;
    let prefix_fn = device
        .get_func(SORT_MODULE, sort_kernels::COMPUTE_DIGIT_PREFIX_SUMS)
        .ok_or_else(|| {
            XlogError::Kernel("compute_digit_prefix_sums kernel not found".to_string())
        })?;
    let ranks_fn = device
        .get_func(SORT_MODULE, sort_kernels::COMPUTE_RANKS)
        .ok_or_else(|| XlogError::Kernel("compute_ranks kernel not found".to_string()))?;
    let scatter_fn = device
        .get_func(SORT_MODULE, sort_kernels::RADIX_SCATTER_STABLE)
        .ok_or_else(|| {
            XlogError::Kernel("radix_scatter_stable kernel not found".to_string())
        })?;

    // The digit-prefix kernel runs as a single 256-thread block.
    let prefix_config = LaunchConfig {
        grid_dim: (1, 1, 1),
        block_dim: (256, 1, 1),
        shared_mem_bytes: 0,
    };

    // `in_a` tracks which buffer pair currently holds the data.
    let mut in_a = true;
    for pass in 0..8u32 {
        let shift = pass * 4;

        let (keys_in, indices_in, keys_out, indices_out) = if in_a {
            (&*keys_a, &*indices_a, &mut *keys_b, &mut *indices_b)
        } else {
            (&*keys_b, &*indices_b, &mut *keys_a, &mut *indices_a)
        };

        // SAFETY (assumed): tuple matches the kernel's parameter list; `hist` is expected
        // to hold at least `16 * grid_size` counters — confirm at the call sites.
        unsafe {
            histogram_fn.clone().launch(
                sort_config,
                (keys_in, num_rows_device, row_cap, &mut *hist, shift),
            )
        }
        .map_err(|e| XlogError::Kernel(format!("radix_histogram failed: {}", e)))?;
        self.device.synchronize()?;

        // SAFETY (assumed): tuple matches the kernel's parameter list; `prefix` is expected
        // to hold at least 16 entries.
        unsafe {
            prefix_fn
                .clone()
                .launch(prefix_config, (&*hist, grid_size, &mut *prefix))
        }
        .map_err(|e| XlogError::Kernel(format!("compute_digit_prefix_sums failed: {}", e)))?;
        self.device.synchronize()?;

        // Scan each digit's row of per-block counters in place.
        for digit in 0..16u32 {
            let start = (digit * grid_size) as usize;
            let end = start + (grid_size as usize);
            let mut digit_slice = hist.slice_mut(start..end);
            self.multiblock_scan_u32_view_inplace(&mut digit_slice, grid_size)?;
        }
        self.device.synchronize()?;

        // SAFETY (assumed): tuple matches the kernel's parameter list; `ranks` is expected
        // to hold at least `row_cap` entries.
        unsafe {
            ranks_fn.clone().launch(
                sort_config,
                (keys_in, num_rows_device, row_cap, &mut *ranks, shift),
            )
        }
        .map_err(|e| XlogError::Kernel(format!("compute_ranks failed: {}", e)))?;
        self.device.synchronize()?;

        // SAFETY (assumed): tuple matches the kernel's parameter list; input and output
        // buffers are distinct slices chosen by the ping-pong selection above.
        unsafe {
            scatter_fn.clone().launch(
                sort_config,
                (
                    keys_in,
                    indices_in,
                    &*ranks,
                    keys_out,
                    indices_out,
                    &*prefix,
                    &*hist,
                    num_rows_device,
                    row_cap,
                    shift,
                ),
            )
        }
        .map_err(|e| XlogError::Kernel(format!("radix_scatter_stable failed: {}", e)))?;
        self.device.synchronize()?;

        in_a = !in_a;
    }

    // Defensive check: with an even pass count the result must be back in the `*_a` buffers.
    if !in_a {
        return Err(XlogError::Kernel(
            "Unexpected radix-sort buffer parity (expected even number of passes)".to_string(),
        ));
    }

    Ok(())
}
2063 pub fn init_indices(
2065 &self,
2066 indices: &mut crate::memory::TrackedCudaSlice<u32>,
2067 n: u32,
2068 ) -> Result<()> {
2069 if n == 0 {
2070 return Ok(());
2071 }
2072 if n as usize > indices.len() {
2073 return Err(XlogError::Kernel(format!(
2074 "init_indices: n={} exceeds indices len={}",
2075 n,
2076 indices.len()
2077 )));
2078 }
2079 let device = self.device.inner();
2080 let block_size = Self::SORT_BLOCK_SIZE;
2081 let grid_size = n.div_ceil(block_size);
2082 let config = LaunchConfig {
2083 grid_dim: (grid_size, 1, 1),
2084 block_dim: (block_size, 1, 1),
2085 shared_mem_bytes: 0,
2086 };
2087 let init_fn = device
2088 .get_func(SORT_MODULE, sort_kernels::INIT_INDICES)
2089 .ok_or_else(|| XlogError::Kernel("init_indices kernel not found".to_string()))?;
2090 let d_num_rows = self.upload_device_row_count(n)?;
2091 unsafe {
2093 init_fn
2094 .clone()
2095 .launch(config, (&mut *indices, &d_num_rows, n))
2096 }
2097 .map_err(|e| XlogError::Kernel(format!("init_indices failed: {}", e)))?;
2098 Ok(())
2099 }
2100
2101 pub fn gather_u32_by_indices(
2103 &self,
2104 input: &crate::memory::TrackedCudaSlice<u32>,
2105 indices: &crate::memory::TrackedCudaSlice<u32>,
2106 output: &mut crate::memory::TrackedCudaSlice<u32>,
2107 n: u32,
2108 ) -> Result<()> {
2109 if n == 0 {
2110 return Ok(());
2111 }
2112 if n as usize > output.len() {
2113 return Err(XlogError::Kernel(format!(
2114 "gather_u32_by_indices: n={} exceeds output len={}",
2115 n,
2116 output.len()
2117 )));
2118 }
2119 let device = self.device.inner();
2120 let block_size = Self::SORT_BLOCK_SIZE;
2121 let grid_size = n.div_ceil(block_size);
2122 let config = LaunchConfig {
2123 grid_dim: (grid_size, 1, 1),
2124 block_dim: (block_size, 1, 1),
2125 shared_mem_bytes: 0,
2126 };
2127 let gather_fn = device
2128 .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_U32)
2129 .ok_or_else(|| {
2130 XlogError::Kernel("apply_permutation_u32 kernel not found".to_string())
2131 })?;
2132 let d_num_rows = self.upload_device_row_count(n)?;
2133 unsafe {
2135 gather_fn
2136 .clone()
2137 .launch(config, (input, output, indices, &d_num_rows, n))
2138 }
2139 .map_err(|e| XlogError::Kernel(format!("gather_u32_by_indices failed: {}", e)))?;
2140 Ok(())
2141 }
2142
2143 pub fn gather_u8_by_indices(
2145 &self,
2146 input: &crate::memory::TrackedCudaSlice<u8>,
2147 indices: &crate::memory::TrackedCudaSlice<u32>,
2148 output: &mut crate::memory::TrackedCudaSlice<u8>,
2149 n: u32,
2150 ) -> Result<()> {
2151 if n == 0 {
2152 return Ok(());
2153 }
2154 if n as usize > output.len() {
2155 return Err(XlogError::Kernel(format!(
2156 "gather_u8_by_indices: n={} exceeds output len={}",
2157 n,
2158 output.len()
2159 )));
2160 }
2161 let device = self.device.inner();
2162 let block_size = Self::SORT_BLOCK_SIZE;
2163 let grid_size = n.div_ceil(block_size);
2164 let config = LaunchConfig {
2165 grid_dim: (grid_size, 1, 1),
2166 block_dim: (block_size, 1, 1),
2167 shared_mem_bytes: 0,
2168 };
2169 let gather_fn = device
2170 .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
2171 .ok_or_else(|| {
2172 XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
2173 })?;
2174 let d_num_rows = self.upload_device_row_count(n)?;
2175 unsafe {
2177 gather_fn
2178 .clone()
2179 .launch(config, (input, output, indices, &d_num_rows, n, 1u32))
2180 }
2181 .map_err(|e| XlogError::Kernel(format!("gather_u8_by_indices failed: {}", e)))?;
2182 Ok(())
2183 }
2184
2185 pub fn gather_u64_lo_by_indices(
2187 &self,
2188 input: &crate::memory::TrackedCudaSlice<u64>,
2189 indices: &crate::memory::TrackedCudaSlice<u32>,
2190 output: &mut crate::memory::TrackedCudaSlice<u32>,
2191 n: u32,
2192 ) -> Result<()> {
2193 if n == 0 {
2194 return Ok(());
2195 }
2196 let device = self.device.inner();
2197 let block_size = Self::SORT_BLOCK_SIZE;
2198 let grid_size = n.div_ceil(block_size);
2199 let config = LaunchConfig {
2200 grid_dim: (grid_size, 1, 1),
2201 block_dim: (block_size, 1, 1),
2202 shared_mem_bytes: 0,
2203 };
2204 let gather_fn = device
2205 .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_U64_LO_U32)
2206 .ok_or_else(|| XlogError::Kernel("gather_keys_u64_lo_u32 not found".to_string()))?;
2207 let d_num_rows = self.upload_device_row_count(n)?;
2208 unsafe {
2210 gather_fn
2211 .clone()
2212 .launch(config, (input, indices, &d_num_rows, n, output))
2213 }
2214 .map_err(|e| XlogError::Kernel(format!("gather_u64_lo_by_indices failed: {}", e)))?;
2215 Ok(())
2216 }
2217
2218 pub fn gather_u64_hi_by_indices(
2220 &self,
2221 input: &crate::memory::TrackedCudaSlice<u64>,
2222 indices: &crate::memory::TrackedCudaSlice<u32>,
2223 output: &mut crate::memory::TrackedCudaSlice<u32>,
2224 n: u32,
2225 ) -> Result<()> {
2226 if n == 0 {
2227 return Ok(());
2228 }
2229 let device = self.device.inner();
2230 let block_size = Self::SORT_BLOCK_SIZE;
2231 let grid_size = n.div_ceil(block_size);
2232 let config = LaunchConfig {
2233 grid_dim: (grid_size, 1, 1),
2234 block_dim: (block_size, 1, 1),
2235 shared_mem_bytes: 0,
2236 };
2237 let gather_fn = device
2238 .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_U64_HI_U32)
2239 .ok_or_else(|| XlogError::Kernel("gather_keys_u64_hi_u32 not found".to_string()))?;
2240 let d_num_rows = self.upload_device_row_count(n)?;
2241 unsafe {
2243 gather_fn
2244 .clone()
2245 .launch(config, (input, indices, &d_num_rows, n, output))
2246 }
2247 .map_err(|e| XlogError::Kernel(format!("gather_u64_hi_by_indices failed: {}", e)))?;
2248 Ok(())
2249 }
2250
2251 pub fn radix_sort_u32_pairs(
2253 &self,
2254 keys: &mut crate::memory::TrackedCudaSlice<u32>,
2255 values: &mut crate::memory::TrackedCudaSlice<u32>,
2256 n: u32,
2257 scratch: &mut RadixSortScratch,
2258 ) -> Result<()> {
2259 if n == 0 {
2260 return Ok(());
2261 }
2262 scratch.ensure_capacity(self, n)?;
2263 let d_num_rows = self.upload_device_row_count(n)?;
2264 self.radix_sort_u32_pairs_with_scratch(
2265 keys,
2266 &mut scratch.keys_b,
2267 values,
2268 &mut scratch.values_b,
2269 &mut scratch.hist,
2270 &mut scratch.prefix,
2271 &mut scratch.ranks,
2272 &d_num_rows,
2273 n,
2274 )
2275 }
2276 pub fn scan_u8_mask_device(
2278 &self,
2279 mask: &crate::memory::TrackedCudaSlice<u8>,
2280 n: u32,
2281 ) -> Result<crate::memory::TrackedCudaSlice<u32>> {
2282 if n == 0 {
2283 return self.memory.alloc::<u32>(0);
2284 }
2285 if n as usize > mask.len() {
2286 return Err(XlogError::Kernel(format!(
2287 "scan_u8_mask_device: n={} exceeds mask len={}",
2288 n,
2289 mask.len()
2290 )));
2291 }
2292 let device = self.device.inner();
2293 let block_size = 256u32;
2294 let num_blocks = n.div_ceil(block_size);
2295
2296 let mut prefix_sum = self.memory.alloc::<u32>(n as usize)?;
2297 let mut block_sums = self.memory.alloc::<u32>(num_blocks as usize)?;
2298
2299 let phase1_fn = device
2300 .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE1)
2301 .ok_or_else(|| {
2302 XlogError::Kernel("multiblock_scan_phase1 kernel not found".to_string())
2303 })?;
2304
2305 unsafe {
2307 phase1_fn.clone().launch(
2308 LaunchConfig {
2309 grid_dim: (num_blocks, 1, 1),
2310 block_dim: (block_size, 1, 1),
2311 shared_mem_bytes: 0,
2312 },
2313 (mask, &mut prefix_sum, &mut block_sums, n),
2314 )
2315 }
2316 .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase1 failed: {}", e)))?;
2317
2318 if num_blocks > 1 {
2319 self.multiblock_scan_u32_inplace(&mut block_sums, num_blocks)?;
2320
2321 let phase3_fn = device
2322 .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE3)
2323 .ok_or_else(|| {
2324 XlogError::Kernel("multiblock_scan_phase3 kernel not found".to_string())
2325 })?;
2326
2327 unsafe {
2329 phase3_fn.clone().launch(
2330 LaunchConfig {
2331 grid_dim: (num_blocks, 1, 1),
2332 block_dim: (block_size, 1, 1),
2333 shared_mem_bytes: 0,
2334 },
2335 (&mut prefix_sum, &block_sums, n),
2336 )
2337 }
2338 .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase3 failed: {}", e)))?;
2339 }
2340
2341 Ok(prefix_sum)
2342 }
2343
2344 pub fn count_mask_device(
2348 &self,
2349 mask: &crate::memory::TrackedCudaSlice<u8>,
2350 n: u32,
2351 ) -> Result<crate::memory::TrackedCudaSlice<u32>> {
2352 let mut d_count = self.memory.alloc::<u32>(1)?;
2353 self.htod_launch_metadata_sync_copy_into(&[0u32], &mut d_count)
2354 .map_err(|e| {
2355 XlogError::Kernel(format!("count_mask_device: zero init failed: {}", e))
2356 })?;
2357
2358 if n == 0 {
2359 return Ok(d_count);
2360 }
2361
2362 let device = self.device.inner();
2363 let block_size = 256u32;
2364 let grid_size = n.div_ceil(block_size);
2365
2366 let count_fn = device
2367 .get_func(SCAN_MODULE, scan_kernels::COUNT_MASK)
2368 .ok_or_else(|| XlogError::Kernel("count_mask kernel not found".to_string()))?;
2369
2370 unsafe {
2372 count_fn.clone().launch(
2373 LaunchConfig {
2374 grid_dim: (grid_size, 1, 1),
2375 block_dim: (block_size, 1, 1),
2376 shared_mem_bytes: 0,
2377 },
2378 (mask, n, &mut d_count),
2379 )
2380 }
2381 .map_err(|e| XlogError::Kernel(format!("count_mask kernel failed: {}", e)))?;
2382
2383 self.device.synchronize()?;
2384
2385 Ok(d_count)
2386 }
2387
2388 pub fn count_mask_into_slot(
2397 &self,
2398 mask: &crate::memory::TrackedCudaSlice<u8>,
2399 n: u32,
2400 task_counts: &mut crate::memory::TrackedCudaSlice<u32>,
2401 slot_idx: usize,
2402 ) -> Result<()> {
2403 if n == 0 {
2404 return Ok(());
2406 }
2407 if slot_idx >= task_counts.len() {
2408 return Err(XlogError::Kernel(format!(
2409 "count_mask_into_slot: slot_idx={} >= len={}",
2410 slot_idx,
2411 task_counts.len()
2412 )));
2413 }
2414
2415 let device = self.device.inner();
2416 let block_size = 256u32;
2417 let grid_size = n.div_ceil(block_size);
2418
2419 let count_fn = device
2420 .get_func(SCAN_MODULE, scan_kernels::COUNT_MASK)
2421 .ok_or_else(|| XlogError::Kernel("count_mask kernel not found".to_string()))?;
2422
2423 let mut slot = task_counts.slice_mut(slot_idx..slot_idx + 1);
2425
2426 unsafe {
2430 count_fn.clone().launch(
2431 LaunchConfig {
2432 grid_dim: (grid_size, 1, 1),
2433 block_dim: (block_size, 1, 1),
2434 shared_mem_bytes: 0,
2435 },
2436 (mask, n, &mut slot),
2437 )
2438 }
2439 .map_err(|e| XlogError::Kernel(format!("count_mask_into_slot kernel failed: {}", e)))?;
2440
2441 Ok(())
2442 }
2443 fn apply_permutation_gpu(
2445 &self,
2446 input: &CudaBuffer,
2447 permutation: &cudarc::driver::CudaSlice<u32>,
2448 ) -> Result<CudaBuffer> {
2449 let row_cap = input.num_rows() as u32;
2450 let d_num_rows = input.num_rows_device();
2451 let device = self.device.inner();
2452
2453 let grid_size = row_cap.div_ceil(Self::SORT_BLOCK_SIZE);
2454 let launch_config = LaunchConfig {
2455 grid_dim: (grid_size, 1, 1),
2456 block_dim: (Self::SORT_BLOCK_SIZE, 1, 1),
2457 shared_mem_bytes: 0,
2458 };
2459
2460 let apply_perm_fn = device
2461 .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
2462 .ok_or_else(|| {
2463 XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
2464 })?;
2465
2466 let mut new_columns = Vec::with_capacity(input.columns.len());
2467
2468 for col_idx in 0..input.columns.len() {
2469 let src_col = input
2470 .column(col_idx)
2471 .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
2472
2473 let elem_size = input
2474 .schema
2475 .column_type(col_idx)
2476 .ok_or_else(|| {
2477 XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
2478 })?
2479 .size_bytes() as u32;
2480
2481 let output_bytes = (row_cap as usize) * (elem_size as usize);
2482 if src_col.num_bytes() != output_bytes {
2483 return Err(XlogError::Kernel(format!(
2484 "Column {} has {} bytes but expected {} (num_rows={}, elem_size={})",
2485 col_idx,
2486 src_col.num_bytes(),
2487 output_bytes,
2488 row_cap,
2489 elem_size
2490 )));
2491 }
2492 let dst_col = self.memory.alloc::<u8>(output_bytes)?;
2493
2494 unsafe {
2496 apply_perm_fn.clone().launch(
2497 launch_config,
2498 (
2499 src_col,
2500 &dst_col,
2501 permutation,
2502 d_num_rows,
2503 row_cap,
2504 elem_size,
2505 ),
2506 )
2507 }
2508 .map_err(|e| XlogError::Kernel(format!("apply_permutation_bytes failed: {}", e)))?;
2509
2510 new_columns.push(dst_col.into());
2511 }
2512
2513 self.device.synchronize()?;
2514
2515 self.buffer_from_columns_with_device_count(
2516 new_columns,
2517 input.num_rows(),
2518 input.schema.clone(),
2519 input,
2520 )
2521 }
2522
2523 fn gather_buffer_by_indices(
2528 &self,
2529 input: &CudaBuffer,
2530 indices: &cudarc::driver::CudaSlice<u32>,
2531 output_rows: u32,
2532 ) -> Result<CudaBuffer> {
2533 if output_rows == 0 {
2534 return self.create_empty_buffer(input.schema().clone());
2535 }
2536
2537 if input.num_rows() > u32::MAX as u64 {
2538 return Err(XlogError::Kernel(format!(
2539 "GPU gather supports at most {} input rows, got {}",
2540 u32::MAX,
2541 input.num_rows()
2542 )));
2543 }
2544
2545 let d_output_rows = self.upload_device_row_count(output_rows)?;
2546 let device = self.device.inner();
2547 let block_size = 256u32;
2548 let grid_size = output_rows.div_ceil(block_size);
2549 let launch_config = LaunchConfig {
2550 grid_dim: (grid_size, 1, 1),
2551 block_dim: (block_size, 1, 1),
2552 shared_mem_bytes: 0,
2553 };
2554
2555 let gather_fn = device
2556 .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
2557 .ok_or_else(|| {
2558 XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
2559 })?;
2560
2561 let mut new_columns = Vec::with_capacity(input.columns.len());
2562 for col_idx in 0..input.columns.len() {
2563 let src_col = input
2564 .column(col_idx)
2565 .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
2566
2567 let elem_size = input
2568 .schema
2569 .column_type(col_idx)
2570 .ok_or_else(|| {
2571 XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
2572 })?
2573 .size_bytes() as u32;
2574
2575 let expected_src_bytes = (input.num_rows() as usize) * (elem_size as usize);
2576 if src_col.num_bytes() != expected_src_bytes {
2577 return Err(XlogError::Kernel(format!(
2578 "Column {} has {} bytes but expected {} (num_rows={}, elem_size={})",
2579 col_idx,
2580 src_col.num_bytes(),
2581 expected_src_bytes,
2582 input.num_rows(),
2583 elem_size
2584 )));
2585 }
2586
2587 let dst_bytes = (output_rows as usize) * (elem_size as usize);
2588 let dst_col = self.memory.alloc::<u8>(dst_bytes)?;
2589
2590 unsafe {
2592 gather_fn.clone().launch(
2593 launch_config,
2594 (
2595 src_col,
2596 &dst_col,
2597 indices,
2598 &d_output_rows,
2599 output_rows,
2600 elem_size,
2601 ),
2602 )
2603 }
2604 .map_err(|e| XlogError::Kernel(format!("apply_permutation_bytes failed: {}", e)))?;
2605
2606 new_columns.push(dst_col.into());
2607 }
2608
2609 self.device.synchronize()?;
2610
2611 Ok(CudaBuffer::from_columns(
2612 new_columns,
2613 output_rows as u64,
2614 d_output_rows,
2615 input.schema.clone(),
2616 ))
2617 }
2618 pub fn hash_join_v2(
2632 &self,
2633 left: &CudaBuffer,
2634 right: &CudaBuffer,
2635 left_keys: &[usize],
2636 right_keys: &[usize],
2637 join_type: JoinType,
2638 ) -> Result<CudaBuffer> {
2639 self.hash_join_v2_with_limit(left, right, left_keys, right_keys, join_type, None)
2640 }
2641
2642 pub fn hash_join_v2_with_limit(
2658 &self,
2659 left: &CudaBuffer,
2660 right: &CudaBuffer,
2661 left_keys: &[usize],
2662 right_keys: &[usize],
2663 join_type: JoinType,
2664 max_output: Option<usize>,
2665 ) -> Result<CudaBuffer> {
2666 if Self::use_recorded_hash_join_env()
2671 && !left_keys.is_empty()
2672 && left_keys.len() == right_keys.len()
2673 && left_keys.len() <= 4
2674 {
2675 if let Some(launch_stream) = self.recorded_op_stream_or_init() {
2676 return self.hash_join_v2_recorded(
2677 left,
2678 right,
2679 left_keys,
2680 right_keys,
2681 join_type,
2682 max_output,
2683 launch_stream,
2684 );
2685 }
2686 }
2687 match join_type {
2688 JoinType::Inner => {
2689 self.hash_join_inner_v2(left, right, left_keys, right_keys, max_output)
2690 }
2691 JoinType::Semi => self.hash_join_semi_impl(left, right, left_keys, right_keys),
2692 JoinType::Anti => self.hash_join_anti_impl(left, right, left_keys, right_keys),
2693 JoinType::LeftOuter => {
2694 self.hash_join_left_outer_impl(left, right, left_keys, right_keys, max_output)
2695 }
2696 }
2697 }
2698
2699 pub fn nested_loop_join_v2_inner_u32_1key(
2747 &self,
2748 left: &CudaBuffer,
2749 right: &CudaBuffer,
2750 left_key: usize,
2751 right_key: usize,
2752 ) -> Result<CudaBuffer> {
2753 let num_left = self.device_row_count(left)?;
2755 let num_right = self.device_row_count(right)?;
2756
2757 if num_left == 0 || num_right == 0 {
2759 let combined_schema = self.combine_schemas(left.schema(), right.schema());
2760 return self.create_empty_buffer(combined_schema);
2761 }
2762
2763 if left.arity() <= left_key {
2765 return Err(XlogError::Kernel(format!(
2766 "nested_loop: left_key={} out of bounds (arity={})",
2767 left_key,
2768 left.arity()
2769 )));
2770 }
2771 if right.arity() <= right_key {
2772 return Err(XlogError::Kernel(format!(
2773 "nested_loop: right_key={} out of bounds (arity={})",
2774 right_key,
2775 right.arity()
2776 )));
2777 }
2778 let lt = left.schema().column_type(left_key);
2779 let rt = right.schema().column_type(right_key);
2780 if lt != rt || !matches!(lt, Some(ScalarType::U32) | Some(ScalarType::Symbol)) {
2781 return Err(XlogError::Kernel(format!(
2782 "nested_loop: key types must be equal U32/Symbol; got left={:?} right={:?}",
2783 lt, rt
2784 )));
2785 }
2786 let left_col = left
2787 .column(left_key)
2788 .ok_or_else(|| XlogError::Kernel(format!("nested_loop: left.column({})", left_key)))?;
2789 let right_col = right.column(right_key).ok_or_else(|| {
2790 XlogError::Kernel(format!("nested_loop: right.column({})", right_key))
2791 })?;
2792 let required_left_bytes = num_left
2807 .checked_mul(4)
2808 .ok_or_else(|| XlogError::Kernel("nested_loop: left byte-count overflow".into()))?;
2809 let required_right_bytes = num_right
2810 .checked_mul(4)
2811 .ok_or_else(|| XlogError::Kernel("nested_loop: right byte-count overflow".into()))?;
2812 if left_col.num_bytes() < required_left_bytes {
2813 return Err(XlogError::Kernel(format!(
2814 "nested_loop: left key column has {} bytes; \
2815 require at least {} ({} rows × 4) — buffer allocation \
2816 is smaller than logical row count",
2817 left_col.num_bytes(),
2818 required_left_bytes,
2819 num_left
2820 )));
2821 }
2822 if right_col.num_bytes() < required_right_bytes {
2823 return Err(XlogError::Kernel(format!(
2824 "nested_loop: right key column has {} bytes; \
2825 require at least {} ({} rows × 4) — buffer allocation \
2826 is smaller than logical row count",
2827 right_col.num_bytes(),
2828 required_right_bytes,
2829 num_right
2830 )));
2831 }
2832
2833 let upper_bound: u64 = (num_left as u64)
2835 .checked_mul(num_right as u64)
2836 .ok_or_else(|| XlogError::Kernel("nested_loop: row-count product overflow".into()))?;
2837 if upper_bound > NESTED_LOOP_TOTAL_THRESHOLD {
2838 return Err(XlogError::Kernel(format!(
2839 "nested_loop: caller violated eligibility threshold: \
2840 num_left * num_right = {} > {} (NESTED_LOOP_TOTAL_THRESHOLD)",
2841 upper_bound, NESTED_LOOP_TOTAL_THRESHOLD
2842 )));
2843 }
2844
2845 let upper_bound_usize = upper_bound as usize;
2847 let mut d_output_left_idx = self.memory.alloc::<u32>(upper_bound_usize)?;
2848 let mut d_output_right_idx = self.memory.alloc::<u32>(upper_bound_usize)?;
2849 let mut d_output_count = self.memory.alloc::<u32>(1)?;
2850 self.device
2851 .inner()
2852 .memset_zeros(&mut d_output_count)
2853 .map_err(|e| XlogError::Kernel(format!("nested_loop: counter zero failed: {}", e)))?;
2854
2855 let func = self
2857 .device
2858 .inner()
2859 .get_func(
2860 JOIN_MODULE,
2861 join_kernels::NESTED_LOOP_JOIN_INNER_U32_1KEY_PAIRS,
2862 )
2863 .ok_or_else(|| {
2864 XlogError::Kernel("nested_loop_join_inner_u32_1key_pairs kernel not found".into())
2865 })?;
2866
2867 let num_left_u32 = num_left as u32;
2868 let num_right_u32 = num_right as u32;
2869 let upper_bound_u32 = upper_bound as u32;
2870 let block_size = 256u32;
2871 let grid_size = num_left_u32.div_ceil(block_size);
2872 let config = LaunchConfig {
2873 grid_dim: (grid_size, 1, 1),
2874 block_dim: (block_size, 1, 1),
2875 shared_mem_bytes: 0,
2876 };
2877
2878 unsafe {
2888 func.clone()
2889 .launch(
2890 config,
2891 (
2892 left_col,
2893 right_col,
2894 num_left_u32,
2895 num_right_u32,
2896 &mut d_output_left_idx,
2897 &mut d_output_right_idx,
2898 &mut d_output_count,
2899 upper_bound_u32,
2900 ),
2901 )
2902 .map_err(|e| XlogError::Kernel(format!("nested_loop launch failed: {}", e)))?;
2903 }
2904
2905 self.device.synchronize()?;
2906
2907 let output_rows = self.dtoh_scalar_untracked(&d_output_count, 0)?;
2909 if (output_rows as u64) > upper_bound {
2913 return Err(XlogError::Kernel(format!(
2914 "nested_loop: kernel reported {} output rows > upper_bound {}",
2915 output_rows, upper_bound
2916 )));
2917 }
2918
2919 let gathered_left = self.gather_buffer_by_indices(left, &d_output_left_idx, output_rows)?;
2921 let gathered_right =
2922 self.gather_buffer_by_indices(right, &d_output_right_idx, output_rows)?;
2923
2924 let combined_schema = self.combine_schemas(left.schema(), right.schema());
2926 let mut result_columns = Vec::with_capacity(combined_schema.arity());
2927 result_columns.extend(gathered_left.columns);
2928 result_columns.extend(gathered_right.columns);
2929 self.buffer_from_columns(result_columns, output_rows as u64, combined_schema)
2932 }
2933
2934 pub fn is_sorted_ascending_u32(&self, buf: &CudaBuffer, key_col: usize) -> Result<bool> {
2964 let n = self.device_row_count(buf)?;
2966 if n < 2 {
2967 return Ok(true);
2968 }
2969
2970 if buf.arity() <= key_col {
2972 return Err(XlogError::Kernel(format!(
2973 "is_sorted_ascending_u32: key_col={} out of bounds (arity={})",
2974 key_col,
2975 buf.arity()
2976 )));
2977 }
2978 let kt = buf.schema().column_type(key_col);
2979 if !matches!(kt, Some(ScalarType::U32) | Some(ScalarType::Symbol)) {
2980 return Err(XlogError::Kernel(format!(
2981 "is_sorted_ascending_u32: key column must be U32 or Symbol; got {:?}",
2982 kt
2983 )));
2984 }
2985 let key_column = buf.column(key_col).ok_or_else(|| {
2986 XlogError::Kernel(format!(
2987 "is_sorted_ascending_u32: column({}) missing",
2988 key_col
2989 ))
2990 })?;
2991 let required_bytes = n
2992 .checked_mul(4)
2993 .ok_or_else(|| XlogError::Kernel("is_sorted_ascending_u32: byte overflow".into()))?;
2994 if key_column.num_bytes() < required_bytes {
2995 return Err(XlogError::Kernel(format!(
2996 "is_sorted_ascending_u32: key column has {} bytes; require at least {} ({} rows × 4)",
2997 key_column.num_bytes(),
2998 required_bytes,
2999 n
3000 )));
3001 }
3002
3003 let mut d_result = self.memory.alloc::<u32>(1)?;
3007 self.htod_launch_metadata_sync_copy_into(&[1u32], &mut d_result)
3008 .map_err(|e| {
3009 XlogError::Kernel(format!("is_sorted_ascending_u32: htod result init: {}", e))
3010 })?;
3011
3012 let func = self
3014 .device
3015 .inner()
3016 .get_func(SORT_MODULE, sort_kernels::CHECK_ASCENDING_SORTED_U32)
3017 .ok_or_else(|| {
3018 XlogError::Kernel("check_ascending_sorted_u32 kernel not found".into())
3019 })?;
3020 let n_u32 = n as u32;
3021 let block_size = 256u32;
3022 let grid_size = n_u32.div_ceil(block_size);
3023 let config = LaunchConfig {
3024 grid_dim: (grid_size, 1, 1),
3025 block_dim: (block_size, 1, 1),
3026 shared_mem_bytes: 0,
3027 };
3028
3029 unsafe {
3037 func.clone()
3038 .launch(config, (key_column, n_u32, &mut d_result))
3039 .map_err(|e| {
3040 XlogError::Kernel(format!("check_ascending_sorted_u32 launch: {}", e))
3041 })?;
3042 }
3043
3044 self.device.synchronize()?;
3045 let result = self.dtoh_scalar_untracked(&d_result, 0)?;
3046 Ok(result == 1)
3047 }
3048
3049 pub fn sort_merge_join_v2_inner_u32_1key(
3107 &self,
3108 left: &CudaBuffer,
3109 right: &CudaBuffer,
3110 left_key: usize,
3111 right_key: usize,
3112 ) -> Result<CudaBuffer> {
3113 let num_left = self.device_row_count(left)?;
3115 let num_right = self.device_row_count(right)?;
3116
3117 if num_left == 0 || num_right == 0 {
3119 let combined_schema = self.combine_schemas(left.schema(), right.schema());
3120 return self.create_empty_buffer(combined_schema);
3121 }
3122
3123 if left.arity() <= left_key {
3125 return Err(XlogError::Kernel(format!(
3126 "sort_merge: left_key={} out of bounds (arity={})",
3127 left_key,
3128 left.arity()
3129 )));
3130 }
3131 if right.arity() <= right_key {
3132 return Err(XlogError::Kernel(format!(
3133 "sort_merge: right_key={} out of bounds (arity={})",
3134 right_key,
3135 right.arity()
3136 )));
3137 }
3138 let lt = left.schema().column_type(left_key);
3139 let rt = right.schema().column_type(right_key);
3140 if lt != rt || !matches!(lt, Some(ScalarType::U32) | Some(ScalarType::Symbol)) {
3141 return Err(XlogError::Kernel(format!(
3142 "sort_merge: key types must be equal U32/Symbol; got left={:?} right={:?}",
3143 lt, rt
3144 )));
3145 }
3146 let left_col = left
3147 .column(left_key)
3148 .ok_or_else(|| XlogError::Kernel(format!("sort_merge: left.column({})", left_key)))?;
3149 let right_col = right
3150 .column(right_key)
3151 .ok_or_else(|| XlogError::Kernel(format!("sort_merge: right.column({})", right_key)))?;
3152 let required_left_bytes = num_left
3153 .checked_mul(4)
3154 .ok_or_else(|| XlogError::Kernel("sort_merge: left byte overflow".into()))?;
3155 let required_right_bytes = num_right
3156 .checked_mul(4)
3157 .ok_or_else(|| XlogError::Kernel("sort_merge: right byte overflow".into()))?;
3158 if left_col.num_bytes() < required_left_bytes {
3159 return Err(XlogError::Kernel(format!(
3160 "sort_merge: left key column has {} bytes; \
3161 require at least {} ({} rows × 4)",
3162 left_col.num_bytes(),
3163 required_left_bytes,
3164 num_left
3165 )));
3166 }
3167 if right_col.num_bytes() < required_right_bytes {
3168 return Err(XlogError::Kernel(format!(
3169 "sort_merge: right key column has {} bytes; \
3170 require at least {} ({} rows × 4)",
3171 right_col.num_bytes(),
3172 required_right_bytes,
3173 num_right
3174 )));
3175 }
3176
3177 let upper_bound: u64 = (num_left as u64)
3179 .checked_mul(num_right as u64)
3180 .ok_or_else(|| XlogError::Kernel("sort_merge: row-count product overflow".into()))?;
3181 if upper_bound > NESTED_LOOP_TOTAL_THRESHOLD {
3182 return Err(XlogError::Kernel(format!(
3183 "sort_merge: caller violated eligibility threshold: \
3184 num_left * num_right = {} > {} (NESTED_LOOP_TOTAL_THRESHOLD)",
3185 upper_bound, NESTED_LOOP_TOTAL_THRESHOLD
3186 )));
3187 }
3188
3189 let upper_bound_usize = upper_bound as usize;
3191 let mut d_output_left_idx = self.memory.alloc::<u32>(upper_bound_usize)?;
3192 let mut d_output_right_idx = self.memory.alloc::<u32>(upper_bound_usize)?;
3193 let mut d_output_count = self.memory.alloc::<u32>(1)?;
3194 self.device
3195 .inner()
3196 .memset_zeros(&mut d_output_count)
3197 .map_err(|e| XlogError::Kernel(format!("sort_merge: counter zero: {}", e)))?;
3198
3199 let func = self
3201 .device
3202 .inner()
3203 .get_func(
3204 JOIN_MODULE,
3205 join_kernels::SORT_MERGE_JOIN_INNER_U32_1KEY_PAIRS,
3206 )
3207 .ok_or_else(|| {
3208 XlogError::Kernel("sort_merge_join_inner_u32_1key_pairs kernel not found".into())
3209 })?;
3210
3211 let num_left_u32 = num_left as u32;
3212 let num_right_u32 = num_right as u32;
3213 let upper_bound_u32 = upper_bound as u32;
3214 let block_size = 256u32;
3215 let grid_size = num_left_u32.div_ceil(block_size);
3216 let config = LaunchConfig {
3217 grid_dim: (grid_size, 1, 1),
3218 block_dim: (block_size, 1, 1),
3219 shared_mem_bytes: 0,
3220 };
3221
3222 unsafe {
3235 func.clone()
3236 .launch(
3237 config,
3238 (
3239 left_col,
3240 right_col,
3241 num_left_u32,
3242 num_right_u32,
3243 &mut d_output_left_idx,
3244 &mut d_output_right_idx,
3245 &mut d_output_count,
3246 upper_bound_u32,
3247 ),
3248 )
3249 .map_err(|e| XlogError::Kernel(format!("sort_merge launch: {}", e)))?;
3250 }
3251
3252 self.device.synchronize()?;
3253
3254 let output_rows = self.dtoh_scalar_untracked(&d_output_count, 0)?;
3256 if (output_rows as u64) > upper_bound {
3259 return Err(XlogError::Kernel(format!(
3260 "sort_merge: kernel reported {} output rows > upper_bound {}",
3261 output_rows, upper_bound
3262 )));
3263 }
3264
3265 let gathered_left = self.gather_buffer_by_indices(left, &d_output_left_idx, output_rows)?;
3267 let gathered_right =
3268 self.gather_buffer_by_indices(right, &d_output_right_idx, output_rows)?;
3269
3270 let combined_schema = self.combine_schemas(left.schema(), right.schema());
3272 let mut result_columns = Vec::with_capacity(combined_schema.arity());
3273 result_columns.extend(gathered_left.columns);
3274 result_columns.extend(gathered_right.columns);
3275 self.buffer_from_columns(result_columns, output_rows as u64, combined_schema)
3276 }
3277
3278 pub fn sort_merge_join_v2_inner_u32_1key_bounded(
3288 &self,
3289 left: &CudaBuffer,
3290 right: &CudaBuffer,
3291 left_key: usize,
3292 right_key: usize,
3293 output_capacity: usize,
3294 ) -> Result<CudaBuffer> {
3295 let num_left = self.device_row_count(left)?;
3296 let num_right = self.device_row_count(right)?;
3297
3298 if num_left == 0 || num_right == 0 {
3299 let combined_schema = self.combine_schemas(left.schema(), right.schema());
3300 return self.create_empty_buffer(combined_schema);
3301 }
3302 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
3303 return Err(XlogError::Kernel(format!(
3304 "sort_merge_bounded: row counts exceed u32 surface: left={} right={}",
3305 num_left, num_right
3306 )));
3307 }
3308 if output_capacity == 0 || output_capacity > u32::MAX as usize {
3309 return Err(XlogError::Kernel(format!(
3310 "sort_merge_bounded: invalid output capacity {}",
3311 output_capacity
3312 )));
3313 }
3314 if left.arity() <= left_key {
3315 return Err(XlogError::Kernel(format!(
3316 "sort_merge_bounded: left_key={} out of bounds (arity={})",
3317 left_key,
3318 left.arity()
3319 )));
3320 }
3321 if right.arity() <= right_key {
3322 return Err(XlogError::Kernel(format!(
3323 "sort_merge_bounded: right_key={} out of bounds (arity={})",
3324 right_key,
3325 right.arity()
3326 )));
3327 }
3328 let lt = left.schema().column_type(left_key);
3329 let rt = right.schema().column_type(right_key);
3330 if lt != rt || !matches!(lt, Some(ScalarType::U32) | Some(ScalarType::Symbol)) {
3331 return Err(XlogError::Kernel(format!(
3332 "sort_merge_bounded: key types must be equal U32/Symbol; got left={:?} right={:?}",
3333 lt, rt
3334 )));
3335 }
3336
3337 let left_col = left.column(left_key).ok_or_else(|| {
3338 XlogError::Kernel(format!("sort_merge_bounded: left.column({})", left_key))
3339 })?;
3340 let right_col = right.column(right_key).ok_or_else(|| {
3341 XlogError::Kernel(format!("sort_merge_bounded: right.column({})", right_key))
3342 })?;
3343 let required_left_bytes = num_left
3344 .checked_mul(4)
3345 .ok_or_else(|| XlogError::Kernel("sort_merge_bounded: left byte overflow".into()))?;
3346 let required_right_bytes = num_right
3347 .checked_mul(4)
3348 .ok_or_else(|| XlogError::Kernel("sort_merge_bounded: right byte overflow".into()))?;
3349 if left_col.num_bytes() < required_left_bytes {
3350 return Err(XlogError::Kernel(format!(
3351 "sort_merge_bounded: left key column has {} bytes; require at least {}",
3352 left_col.num_bytes(),
3353 required_left_bytes
3354 )));
3355 }
3356 if right_col.num_bytes() < required_right_bytes {
3357 return Err(XlogError::Kernel(format!(
3358 "sort_merge_bounded: right key column has {} bytes; require at least {}",
3359 right_col.num_bytes(),
3360 required_right_bytes
3361 )));
3362 }
3363
3364 let mut d_output_left_idx = self.memory.alloc::<u32>(output_capacity)?;
3365 let mut d_output_right_idx = self.memory.alloc::<u32>(output_capacity)?;
3366 let mut d_output_count = self.memory.alloc::<u32>(1)?;
3367 self.device
3368 .inner()
3369 .memset_zeros(&mut d_output_count)
3370 .map_err(|e| XlogError::Kernel(format!("sort_merge_bounded: counter zero: {}", e)))?;
3371
3372 let func = self
3373 .device
3374 .inner()
3375 .get_func(
3376 JOIN_MODULE,
3377 join_kernels::SORT_MERGE_JOIN_INNER_U32_1KEY_PAIRS,
3378 )
3379 .ok_or_else(|| {
3380 XlogError::Kernel("sort_merge_join_inner_u32_1key_pairs kernel not found".into())
3381 })?;
3382
3383 let num_left_u32 = num_left as u32;
3384 let num_right_u32 = num_right as u32;
3385 let output_capacity_u32 = output_capacity as u32;
3386 let block_size = 256u32;
3387 let grid_size = num_left_u32.div_ceil(block_size);
3388 let config = LaunchConfig {
3389 grid_dim: (grid_size, 1, 1),
3390 block_dim: (block_size, 1, 1),
3391 shared_mem_bytes: 0,
3392 };
3393
3394 unsafe {
3395 func.clone()
3396 .launch(
3397 config,
3398 (
3399 left_col,
3400 right_col,
3401 num_left_u32,
3402 num_right_u32,
3403 &mut d_output_left_idx,
3404 &mut d_output_right_idx,
3405 &mut d_output_count,
3406 output_capacity_u32,
3407 ),
3408 )
3409 .map_err(|e| XlogError::Kernel(format!("sort_merge_bounded launch: {}", e)))?;
3410 }
3411
3412 self.device.synchronize()?;
3413 let output_rows = self.dtoh_scalar_untracked(&d_output_count, 0)?;
3414 if output_rows as usize > output_capacity {
3415 return Err(XlogError::Kernel(format!(
3416 "sort_merge_bounded: output {} exceeded bounded capacity {}",
3417 output_rows, output_capacity
3418 )));
3419 }
3420
3421 let gathered_left = self.gather_buffer_by_indices(left, &d_output_left_idx, output_rows)?;
3422 let gathered_right =
3423 self.gather_buffer_by_indices(right, &d_output_right_idx, output_rows)?;
3424
3425 let combined_schema = self.combine_schemas(left.schema(), right.schema());
3426 let mut result_columns = Vec::with_capacity(combined_schema.arity());
3427 result_columns.extend(gathered_left.columns);
3428 result_columns.extend(gathered_right.columns);
3429 self.buffer_from_columns(result_columns, output_rows as u64, combined_schema)
3430 }
3431
3432 pub fn build_join_index_v2(
3434 &self,
3435 right: &CudaBuffer,
3436 right_keys: &[usize],
3437 ) -> Result<JoinIndexV2> {
3438 let num_right = self.device_row_count(right)?;
3439 if num_right == 0 {
3440 return Err(XlogError::Kernel(
3441 "Cannot build join index for empty relation".to_string(),
3442 ));
3443 }
3444 if num_right > u32::MAX as usize {
3445 return Err(XlogError::Kernel(format!(
3446 "Join index supports at most {} rows, got {}",
3447 u32::MAX,
3448 num_right
3449 )));
3450 }
3451 if right_keys.is_empty() {
3452 return Err(XlogError::Kernel(
3453 "Join requires at least one key column".to_string(),
3454 ));
3455 }
3456 for &k in right_keys {
3457 if k >= right.arity() {
3458 return Err(XlogError::Kernel(format!(
3459 "Right key column index {} out of bounds (arity {})",
3460 k,
3461 right.arity()
3462 )));
3463 }
3464 }
3465
3466 let num_right = num_right as u32;
3467 let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
3468 let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
3469
3470 Ok(JoinIndexV2 {
3471 right_num_rows: num_right,
3472 right_keys: right_keys.to_vec(),
3473 key_bytes: right_packed.key_bytes,
3474 packed_keys: right_packed.packed_keys,
3475 table,
3476 })
3477 }
3478
3479 pub fn build_join_index_v2_background(
3486 &self,
3487 right: &CudaBuffer,
3488 right_keys: &[usize],
3489 ) -> Result<JoinIndexV2> {
3490 if Self::use_recorded_hash_join_env()
3491 && !right_keys.is_empty()
3492 && right_keys.len() <= 4
3493 && right.num_rows() > 0
3494 {
3495 if let Some(launch_stream) = self.recorded_op_stream_or_init() {
3496 return self.build_join_index_v2_recorded(right, right_keys, launch_stream);
3497 }
3498 }
3499
3500 self.build_join_index_v2(right, right_keys)
3501 }
3502
3503 pub fn build_join_index_v2_recorded(
3509 &self,
3510 right: &CudaBuffer,
3511 right_keys: &[usize],
3512 launch_stream: StreamId,
3513 ) -> Result<JoinIndexV2> {
3514 let runtime = self.memory.runtime().ok_or_else(|| {
3515 XlogError::Kernel(
3516 "build_join_index_v2_recorded requires a runtime-backed GpuMemoryManager"
3517 .to_string(),
3518 )
3519 })?;
3520 let cu_stream = runtime
3521 .stream_pool()
3522 .resolve(launch_stream)
3523 .ok_or_else(|| {
3524 XlogError::Kernel(format!(
3525 "build_join_index_v2_recorded: launch_stream StreamId({}) does not resolve",
3526 launch_stream.0
3527 ))
3528 })?;
3529
3530 let num_right = self.device_row_count(right)?;
3531 if num_right == 0 {
3532 return Err(XlogError::Kernel(
3533 "Cannot build join index for empty relation".to_string(),
3534 ));
3535 }
3536 if num_right > u32::MAX as usize {
3537 return Err(XlogError::Kernel(format!(
3538 "Join index supports at most {} rows, got {}",
3539 u32::MAX,
3540 num_right
3541 )));
3542 }
3543 if right_keys.is_empty() {
3544 return Err(XlogError::Kernel(
3545 "Join requires at least one key column".to_string(),
3546 ));
3547 }
3548 if right_keys.len() > 4 {
3549 return Err(XlogError::Kernel(
3550 "build_join_index_v2_recorded: max 4 key columns supported".to_string(),
3551 ));
3552 }
3553 for &k in right_keys {
3554 if k >= right.arity() {
3555 return Err(XlogError::Kernel(format!(
3556 "Right key column index {} out of bounds (arity {})",
3557 k,
3558 right.arity()
3559 )));
3560 }
3561 }
3562
3563 let num_right = num_right as u32;
3564 let right_packed =
3565 self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
3566 let table = self.build_hash_table_v2_on_stream(
3567 &right_packed.hashes,
3568 num_right,
3569 &cu_stream,
3570 launch_stream,
3571 runtime,
3572 )?;
3573
3574 Ok(JoinIndexV2 {
3575 right_num_rows: num_right,
3576 right_keys: right_keys.to_vec(),
3577 key_bytes: right_packed.key_bytes,
3578 packed_keys: right_packed.packed_keys,
3579 table,
3580 })
3581 }
3582
3583 #[allow(clippy::too_many_arguments)]
3587 pub fn hash_join_v2_with_index(
3588 &self,
3589 left: &CudaBuffer,
3590 right: &CudaBuffer,
3591 left_keys: &[usize],
3592 right_keys: &[usize],
3593 join_type: JoinType,
3594 index: &JoinIndexV2,
3595 max_output: Option<usize>,
3596 ) -> Result<CudaBuffer> {
3597 if Self::use_recorded_hash_join_env()
3600 && !left_keys.is_empty()
3601 && left_keys.len() == right_keys.len()
3602 && left_keys.len() <= 4
3603 {
3604 if let Some(launch_stream) = self.recorded_op_stream_or_init() {
3605 return self.hash_join_v2_with_index_recorded(
3606 left,
3607 right,
3608 left_keys,
3609 right_keys,
3610 join_type,
3611 index,
3612 max_output,
3613 launch_stream,
3614 );
3615 }
3616 }
3617 let left_rows = self.device_row_count(left)?;
3618 let right_rows = self.device_row_count(right)?;
3619 if left_rows > u32::MAX as usize || right_rows > u32::MAX as usize {
3620 return Err(XlogError::Kernel(format!(
3621 "Join supports at most {} rows per side (left={}, right={})",
3622 u32::MAX,
3623 left_rows,
3624 right_rows
3625 )));
3626 }
3627
3628 if left_rows == 0 {
3630 return match join_type {
3631 JoinType::Inner | JoinType::LeftOuter => {
3632 let combined_schema = self.combine_schemas(left.schema(), right.schema());
3633 self.create_empty_buffer(combined_schema)
3634 }
3635 JoinType::Semi | JoinType::Anti => self.create_empty_buffer(left.schema().clone()),
3636 };
3637 }
3638 if right_rows == 0 {
3639 return match join_type {
3640 JoinType::Inner => {
3641 let combined_schema = self.combine_schemas(left.schema(), right.schema());
3642 self.create_empty_buffer(combined_schema)
3643 }
3644 JoinType::Semi => self.create_empty_buffer(left.schema().clone()),
3645 JoinType::Anti => self.clone_buffer(left),
3646 JoinType::LeftOuter => self.left_outer_with_nulls(left, right),
3647 };
3648 }
3649
3650 if left_keys.is_empty() || right_keys.is_empty() {
3652 return Err(XlogError::Kernel(
3653 "Join requires at least one key column".to_string(),
3654 ));
3655 }
3656 if left_keys.len() != right_keys.len() {
3657 return Err(XlogError::Kernel(
3658 "Left and right key columns must have same length".to_string(),
3659 ));
3660 }
3661 for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
3662 if left_idx >= left.arity() {
3663 return Err(XlogError::Kernel(format!(
3664 "Left key column index {} out of bounds (arity {})",
3665 left_idx,
3666 left.arity()
3667 )));
3668 }
3669 if right_idx >= right.arity() {
3670 return Err(XlogError::Kernel(format!(
3671 "Right key column index {} out of bounds (arity {})",
3672 right_idx,
3673 right.arity()
3674 )));
3675 }
3676 let left_type = left.schema().column_type(left_idx);
3677 let right_type = right.schema().column_type(right_idx);
3678 if left_type != right_type {
3679 return Err(XlogError::Kernel(format!(
3680 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
3681 left_idx, left_type, right_idx, right_type
3682 )));
3683 }
3684 }
3685
3686 if index.right_num_rows != right_rows as u32 {
3688 return Err(XlogError::Kernel(
3689 "Join index row count does not match right relation".to_string(),
3690 ));
3691 }
3692 if index.right_keys.as_slice() != right_keys {
3693 return Err(XlogError::Kernel(
3694 "Join index key columns do not match requested right_keys".to_string(),
3695 ));
3696 }
3697
3698 match join_type {
3699 JoinType::Inner => {
3700 self.hash_join_inner_v2_indexed(left, right, left_keys, index, max_output)
3701 }
3702 JoinType::Semi => self.hash_join_semi_indexed(left, left_keys, index),
3703 JoinType::Anti => self.hash_join_anti_indexed(left, right, left_keys, index),
3704 JoinType::LeftOuter => {
3705 self.hash_join_left_outer_indexed(left, right, left_keys, index, max_output)
3706 }
3707 }
3708 }
3709
3710 fn pack_keys_gpu(&self, buffer: &CudaBuffer, key_cols: &[usize]) -> Result<PackedKeyData> {
3730 if key_cols.is_empty() {
3731 return Err(XlogError::Kernel(
3732 "pack_keys_gpu: no key columns specified".into(),
3733 ));
3734 }
3735 if key_cols.len() > 4 {
3736 return Err(XlogError::Kernel(
3737 "pack_keys_gpu: max 4 key columns supported".into(),
3738 ));
3739 }
3740
3741 let num_rows = self.device_row_count(buffer)?;
3742 if num_rows > u32::MAX as usize {
3743 return Err(XlogError::Kernel(format!(
3744 "pack_keys_gpu supports at most {} rows, got {}",
3745 u32::MAX,
3746 num_rows
3747 )));
3748 }
3749 let num_rows = num_rows as u32;
3750 if num_rows == 0 {
3751 return Ok(PackedKeyData {
3753 hashes: self.memory.alloc::<u64>(0)?,
3754 packed_keys: self.memory.alloc::<u8>(0)?,
3755 key_bytes: 0,
3756 });
3757 }
3758
3759 let mut col_sizes: Vec<u32> = Vec::with_capacity(key_cols.len());
3761 let mut row_size: u32 = 0;
3762 for &col_idx in key_cols {
3763 let col_type = buffer
3764 .schema()
3765 .column_type(col_idx)
3766 .ok_or_else(|| XlogError::Kernel(format!("Invalid column index: {}", col_idx)))?;
3767 let size = col_type.size_bytes() as u32;
3768 col_sizes.push(size);
3769 row_size += size;
3770 }
3771
3772 let packed_bytes = (num_rows as u64) * (row_size as u64);
3774 let packed_slice = self.memory.alloc::<u8>(packed_bytes as usize)?;
3775 let hash_slice = self.memory.alloc::<u64>(num_rows as usize)?;
3776
3777 let mut col_ptrs: [u64; 4] = [0; 4];
3780 for (i, &col_idx) in key_cols.iter().enumerate() {
3781 let col = buffer
3782 .column(col_idx)
3783 .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", col_idx)))?;
3784 col_ptrs[i] = *col.device_ptr();
3786 }
3787 let mut packed_col_sizes = 0u64;
3788 for (i, size) in col_sizes.iter().copied().enumerate() {
3789 if size > u16::MAX as u32 {
3790 return Err(XlogError::Kernel(format!(
3791 "pack_keys_gpu: column element size {} exceeds 16-bit kernel argument",
3792 size
3793 )));
3794 }
3795 packed_col_sizes |= (size as u64) << (i * 16);
3796 }
3797
3798 let func = self
3800 .device
3801 .inner()
3802 .get_func(PACK_MODULE, pack_kernels::PACK_AND_HASH_KEYS)
3803 .ok_or_else(|| XlogError::Kernel("pack_and_hash_keys kernel not found".to_string()))?;
3804
3805 let block_size = 256u32;
3807 let grid_size = num_rows.div_ceil(block_size);
3808 let config = LaunchConfig {
3809 grid_dim: (grid_size, 1, 1),
3810 block_dim: (block_size, 1, 1),
3811 shared_mem_bytes: 0,
3812 };
3813
3814 unsafe {
3821 func.clone()
3822 .launch(
3823 config,
3824 (
3825 col_ptrs[0],
3826 col_ptrs[1],
3827 col_ptrs[2],
3828 col_ptrs[3],
3829 packed_col_sizes,
3830 key_cols.len() as u32,
3831 num_rows,
3832 row_size,
3833 &packed_slice,
3834 &hash_slice,
3835 ),
3836 )
3837 .map_err(|e| {
3838 XlogError::Kernel(format!("pack_and_hash_keys launch failed: {}", e))
3839 })?;
3840 }
3841
3842 self.device.synchronize()?;
3843
3844 Ok(PackedKeyData {
3845 hashes: hash_slice,
3846 packed_keys: packed_slice,
3847 key_bytes: row_size,
3848 })
3849 }
3850
3851 fn pack_keys_gpu_generic(
3853 &self,
3854 buffer: &CudaBuffer,
3855 key_cols: &[usize],
3856 ) -> Result<PackedKeyData> {
3857 if key_cols.is_empty() {
3858 return Err(XlogError::Kernel(
3859 "pack_keys_gpu_generic: no key columns specified".into(),
3860 ));
3861 }
3862
3863 let num_rows = self.device_row_count(buffer)?;
3864 if num_rows > u32::MAX as usize {
3865 return Err(XlogError::Kernel(format!(
3866 "pack_keys_gpu_generic supports at most {} rows, got {}",
3867 u32::MAX,
3868 num_rows
3869 )));
3870 }
3871 let num_rows = num_rows as u32;
3872 if num_rows == 0 {
3873 return Ok(PackedKeyData {
3874 hashes: self.memory.alloc::<u64>(0)?,
3875 packed_keys: self.memory.alloc::<u8>(0)?,
3876 key_bytes: 0,
3877 });
3878 }
3879
3880 let mut col_sizes: Vec<u32> = Vec::with_capacity(key_cols.len());
3881 let mut col_ptrs: Vec<u64> = Vec::with_capacity(key_cols.len());
3882 let mut row_size: u32 = 0;
3883
3884 for &col_idx in key_cols {
3885 let col_type = buffer
3886 .schema()
3887 .column_type(col_idx)
3888 .ok_or_else(|| XlogError::Kernel(format!("Invalid column index: {}", col_idx)))?;
3889 let size = col_type.size_bytes() as u32;
3890 row_size = row_size
3891 .checked_add(size)
3892 .ok_or_else(|| XlogError::Kernel("Row size overflow".to_string()))?;
3893 col_sizes.push(size);
3894
3895 let col = buffer
3896 .column(col_idx)
3897 .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", col_idx)))?;
3898 col_ptrs.push(*col.device_ptr());
3899 }
3900
3901 let packed_bytes = (num_rows as u64)
3902 .checked_mul(row_size as u64)
3903 .ok_or_else(|| XlogError::Kernel("Packed key byte size overflow".to_string()))?;
3904 let packed_slice = self.memory.alloc::<u8>(packed_bytes as usize)?;
3905 let hash_slice = self.memory.alloc::<u64>(num_rows as usize)?;
3906
3907 let mut d_col_sizes = self.memory.alloc::<u32>(col_sizes.len())?;
3908 self.htod_sync_copy_into_tracked(&col_sizes, &mut d_col_sizes)
3909 .map_err(|e| XlogError::Kernel(format!("Failed to upload col_sizes: {}", e)))?;
3910
3911 let mut d_col_ptrs = self.memory.alloc::<u64>(col_ptrs.len())?;
3912 self.htod_sync_copy_into_tracked(&col_ptrs, &mut d_col_ptrs)
3913 .map_err(|e| XlogError::Kernel(format!("Failed to upload col_ptrs: {}", e)))?;
3914
3915 let func = self
3916 .device
3917 .inner()
3918 .get_func(PACK_MODULE, pack_kernels::PACK_AND_HASH_KEYS_GENERIC)
3919 .ok_or_else(|| {
3920 XlogError::Kernel("pack_and_hash_keys_generic kernel not found".to_string())
3921 })?;
3922
3923 let block_size = 256u32;
3924 let grid_size = num_rows.div_ceil(block_size);
3925 let config = LaunchConfig {
3926 grid_dim: (grid_size, 1, 1),
3927 block_dim: (block_size, 1, 1),
3928 shared_mem_bytes: 0,
3929 };
3930
3931 unsafe {
3933 func.clone()
3934 .launch(
3935 config,
3936 (
3937 &d_col_ptrs,
3938 &d_col_sizes,
3939 key_cols.len() as u32,
3940 num_rows,
3941 row_size,
3942 &packed_slice,
3943 &hash_slice,
3944 ),
3945 )
3946 .map_err(|e| {
3947 XlogError::Kernel(format!("pack_and_hash_keys_generic launch failed: {}", e))
3948 })?;
3949 }
3950
3951 self.device.synchronize()?;
3952
3953 Ok(PackedKeyData {
3954 hashes: hash_slice,
3955 packed_keys: packed_slice,
3956 key_bytes: row_size,
3957 })
3958 }
3959
3960 pub(super) fn compute_hashes_and_pack_keys(
3968 &self,
3969 buffer: &CudaBuffer,
3970 key_cols: &[usize],
3971 ) -> Result<PackedKeyData> {
3972 if key_cols.is_empty() {
3973 return Err(XlogError::Kernel(
3974 "compute_hashes_and_pack_keys: no key columns specified".to_string(),
3975 ));
3976 }
3977
3978 if key_cols.len() <= 4 {
3979 self.pack_keys_gpu(buffer, key_cols)
3980 } else {
3981 self.pack_keys_gpu_generic(buffer, key_cols)
3982 }
3983 }
3984
3985 fn build_hash_table_v2(
3990 &self,
3991 hashes: &cudarc::driver::CudaSlice<u64>,
3992 num_rows: u32,
3993 ) -> Result<JoinHashTableV2> {
3994 let device = self.device.inner();
3995
3996 let target = (num_rows as u64).saturating_mul(2).max(1024);
3998 let num_buckets_u64 = target.next_power_of_two();
3999 let num_buckets = u32::try_from(num_buckets_u64).map_err(|_| {
4000 XlogError::Kernel(format!(
4001 "Join hash table too large: num_buckets={}",
4002 num_buckets_u64
4003 ))
4004 })?;
4005 let bucket_mask = num_buckets
4006 .checked_sub(1)
4007 .ok_or_else(|| XlogError::Kernel("Join hash table size underflow".to_string()))?;
4008
4009 let mut bucket_counts = self.memory.alloc::<u32>(num_buckets as usize)?;
4010 if num_buckets > 0 {
4011 device
4012 .memset_zeros(&mut bucket_counts)
4013 .map_err(|e| XlogError::Kernel(format!("Failed to zero bucket_counts: {}", e)))?;
4014 self.device.synchronize()?;
4015 }
4016
4017 let block_size = 256u32;
4018 let grid_size = num_rows.div_ceil(block_size);
4019 let config = LaunchConfig {
4020 grid_dim: (grid_size, 1, 1),
4021 block_dim: (block_size, 1, 1),
4022 shared_mem_bytes: 0,
4023 };
4024
4025 let count_fn = device
4026 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_BUCKET_COUNT_V2)
4027 .ok_or_else(|| {
4028 XlogError::Kernel("hash_join_bucket_count_v2 kernel not found".to_string())
4029 })?;
4030
4031 unsafe {
4033 count_fn
4034 .clone()
4035 .launch(config, (hashes, num_rows, &bucket_counts, bucket_mask))
4036 .map_err(|e| {
4037 XlogError::Kernel(format!("hash_join_bucket_count_v2 failed: {}", e))
4038 })?;
4039 }
4040 self.device.synchronize()?;
4041
4042 let mut bucket_offsets = self.memory.alloc::<u32>(num_buckets as usize)?;
4044 if num_buckets > 0 {
4045 device
4046 .dtod_copy(&bucket_counts, &mut bucket_offsets)
4047 .map_err(|e| XlogError::Kernel(format!("Failed to copy bucket_counts: {}", e)))?;
4048 self.device.synchronize()?;
4049 self.multiblock_scan_u32_inplace(&mut bucket_offsets, num_buckets)?;
4050 self.device.synchronize()?;
4051 }
4052
4053 let mut bucket_cursors = self.memory.alloc::<u32>(num_buckets as usize)?;
4055 if num_buckets > 0 {
4056 device
4057 .dtod_copy(&bucket_offsets, &mut bucket_cursors)
4058 .map_err(|e| XlogError::Kernel(format!("Failed to copy bucket_offsets: {}", e)))?;
4059 self.device.synchronize()?;
4060 }
4061
4062 let bucket_entries = self.memory.alloc::<u32>(num_rows as usize)?;
4063 let bucket_entry_hashes = self.memory.alloc::<u64>(num_rows as usize)?;
4064
4065 let scatter_fn = device
4066 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SCATTER_V2)
4067 .ok_or_else(|| {
4068 XlogError::Kernel("hash_join_scatter_v2 kernel not found".to_string())
4069 })?;
4070
4071 unsafe {
4073 scatter_fn
4074 .clone()
4075 .launch(
4076 config,
4077 (
4078 hashes,
4079 num_rows,
4080 &bucket_cursors,
4081 bucket_mask,
4082 &bucket_entries,
4083 &bucket_entry_hashes,
4084 ),
4085 )
4086 .map_err(|e| XlogError::Kernel(format!("hash_join_scatter_v2 failed: {}", e)))?;
4087 }
4088
4089 self.device.synchronize()?;
4090 Ok(JoinHashTableV2 {
4091 bucket_counts,
4092 bucket_offsets,
4093 bucket_entries,
4094 bucket_entry_hashes,
4095 bucket_mask,
4096 })
4097 }
4098
4099 pub fn build_hash_table_u64(
4101 &self,
4102 hashes: &crate::memory::TrackedCudaSlice<u64>,
4103 num_rows: u32,
4104 ) -> Result<HashTableU64> {
4105 let JoinHashTableV2 {
4106 bucket_counts,
4107 bucket_offsets,
4108 bucket_entries,
4109 bucket_entry_hashes,
4110 bucket_mask,
4111 } = self.build_hash_table_v2(hashes, num_rows)?;
4112 Ok(HashTableU64 {
4113 bucket_counts,
4114 bucket_offsets,
4115 bucket_entries,
4116 bucket_entry_hashes,
4117 bucket_mask,
4118 })
4119 }
4120
    /// Inner hash join of `left` and `right` on the given key columns.
    ///
    /// The right side is hashed into a bucket table; the left side probes it.
    /// The probe kernel is launched twice: first with a zero output capacity
    /// to obtain the total match count, then with exactly-sized index buffers.
    /// The matched row indices are finally used to gather both inputs into a
    /// buffer with the combined schema (left columns followed by right columns).
    ///
    /// `max_output`, when `Some`, caps the number of output rows.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when either side exceeds `u32::MAX` rows,
    /// key lists are empty or of different length, paired key column types
    /// differ, the probe kernel is missing, or a memset/launch fails. Errors
    /// from packing, table build, allocation, gather and synchronize are
    /// propagated.
    fn hash_join_inner_v2(
        &self,
        left: &CudaBuffer,
        right: &CudaBuffer,
        left_keys: &[usize],
        right_keys: &[usize],
        max_output: Option<usize>,
    ) -> Result<CudaBuffer> {
        let num_left = self.device_row_count(left)?;
        let num_right = self.device_row_count(right)?;
        // Row counts are passed to the kernels as u32.
        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
            return Err(XlogError::Kernel(format!(
                "Join supports at most {} rows per side (left={}, right={})",
                u32::MAX,
                num_left,
                num_right
            )));
        }

        // An inner join with an empty side is empty; note this returns before
        // the key lists are validated.
        if num_left == 0 || num_right == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }

        if left_keys.is_empty() || right_keys.is_empty() {
            return Err(XlogError::Kernel(
                "Join requires at least one key column".to_string(),
            ));
        }
        if left_keys.len() != right_keys.len() {
            return Err(XlogError::Kernel(
                "Left and right key columns must have same length".to_string(),
            ));
        }

        // Paired key columns must have the same type (compared as the
        // `column_type` lookup results, so two invalid indices compare equal
        // here and are rejected later during packing).
        for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
            let left_type = left.schema().column_type(left_idx);
            let right_type = right.schema().column_type(right_idx);
            if left_type != right_type {
                return Err(XlogError::Kernel(format!(
                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
                    left_idx, left_type, right_idx, right_type
                )));
            }
        }

        let num_left = num_left as u32;
        let num_right = num_right as u32;

        // Hash + pack keys for both sides, then build the table on the right.
        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
        let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;

        let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;

        let probe_func = self
            .device
            .inner()
            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;

        // One thread per left (probe) row, 256 threads per block.
        let block_size = 256u32;
        let probe_grid = num_left.div_ceil(block_size);
        let probe_config = LaunchConfig {
            grid_dim: (probe_grid, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // Pass 1 (count only): zeroed counter, 1-element placeholder output
        // buffers and an output capacity of 0.
        // NOTE(review): presumably the kernel only increments the counter and
        // writes no indices when the capacity is 0 — confirm against the kernel.
        let mut d_count_only = self.memory.alloc::<u32>(1)?;
        self.device
            .inner()
            .memset_zeros(&mut d_count_only)
            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
        self.device.synchronize()?;
        let d_dummy_left = self.memory.alloc::<u32>(1)?;
        let d_dummy_right = self.memory.alloc::<u32>(1)?;
        let max_output_count_only = 0u32;

        // SAFETY: the parameter order is assumed to match the
        // `hash_join_probe_v2` kernel signature (not visible here). The
        // `as_kernel_param` pointers presumably refer to the locals above,
        // all of which outlive the launch and the synchronize that follows.
        unsafe {
            let mut params: Vec<*mut c_void> = vec![
                (&left_packed.hashes).as_kernel_param(),
                num_left.as_kernel_param(),
                (&table.bucket_offsets).as_kernel_param(),
                (&table.bucket_counts).as_kernel_param(),
                (&table.bucket_entries).as_kernel_param(),
                (&table.bucket_entry_hashes).as_kernel_param(),
                table.bucket_mask.as_kernel_param(),
                (&left_packed.packed_keys).as_kernel_param(),
                (&right_packed.packed_keys).as_kernel_param(),
                left_packed.key_bytes.as_kernel_param(),
                (&d_dummy_left).as_kernel_param(),
                (&d_dummy_right).as_kernel_param(),
                (&d_count_only).as_kernel_param(),
                max_output_count_only.as_kernel_param(),
            ];
            probe_func
                .clone()
                .launch(probe_config, &mut params)
                .map_err(|e| {
                    XlogError::Kernel(format!("hash_join_probe_v2 (count) failed: {}", e))
                })?;
        }

        self.device.synchronize()?;

        // Total number of matches, then clamp to the caller's limit if any.
        let full_count = self.read_join_output_count_metadata(&d_count_only)? as u64;
        let requested = max_output
            .map(|limit| (limit as u64).min(full_count))
            .unwrap_or(full_count);

        if requested == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }

        // Output row indices are u32, so the result size must fit as well.
        if requested > u32::MAX as u64 {
            return Err(XlogError::Kernel(format!(
                "Join produced {} rows which exceeds the u32 index limit",
                requested
            )));
        }

        // Pass 2 (materialize): index buffers sized to `requested` and a fresh
        // zeroed counter. `max_output` is shadowed with the u32 capacity.
        let max_output = requested as u32;
        let d_output_left = self.memory.alloc::<u32>(max_output as usize)?;
        let d_output_right = self.memory.alloc::<u32>(max_output as usize)?;
        let mut d_output_count = self.memory.alloc::<u32>(1)?;
        self.device
            .inner()
            .memset_zeros(&mut d_output_count)
            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
        self.device.synchronize()?;

        // SAFETY: same assumptions as the count pass; the output buffers hold
        // `max_output` elements, matching the capacity passed to the kernel.
        unsafe {
            let mut params: Vec<*mut c_void> = vec![
                (&left_packed.hashes).as_kernel_param(),
                num_left.as_kernel_param(),
                (&table.bucket_offsets).as_kernel_param(),
                (&table.bucket_counts).as_kernel_param(),
                (&table.bucket_entries).as_kernel_param(),
                (&table.bucket_entry_hashes).as_kernel_param(),
                table.bucket_mask.as_kernel_param(),
                (&left_packed.packed_keys).as_kernel_param(),
                (&right_packed.packed_keys).as_kernel_param(),
                left_packed.key_bytes.as_kernel_param(),
                (&d_output_left).as_kernel_param(),
                (&d_output_right).as_kernel_param(),
                (&d_output_count).as_kernel_param(),
                max_output.as_kernel_param(),
            ];
            probe_func
                .clone()
                .launch(probe_config, &mut params)
                .map_err(|e| XlogError::Kernel(format!("hash_join_probe_v2 failed: {}", e)))?;
        }

        self.device.synchronize()?;

        // The counter read back is clamped to the buffer capacity, so the
        // gathers below never read past the index buffers.
        let result_count =
            (self.read_join_output_count_metadata(&d_output_count)? as u64).min(max_output as u64);

        if result_count == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }

        let output_rows = result_count as u32;

        // Materialize the matched rows of each side via the index buffers.
        let gathered_left = self.gather_buffer_by_indices(left, &d_output_left, output_rows)?;
        let gathered_right = self.gather_buffer_by_indices(right, &d_output_right, output_rows)?;

        // Output layout: all left columns followed by all right columns.
        let combined_schema = self.combine_schemas(left.schema(), right.schema());
        let mut result_columns = Vec::with_capacity(combined_schema.arity());
        result_columns.extend(gathered_left.columns);
        result_columns.extend(gathered_right.columns);

        self.buffer_from_columns(result_columns, result_count, combined_schema)
    }
4331
    /// Inner hash join of `left` against a prebuilt right-side index.
    ///
    /// Same two-pass probe as the non-indexed inner join (count pass with zero
    /// output capacity, then a materializing pass), but the hash table and the
    /// right side's packed keys come from `index` instead of being rebuilt.
    /// `right` is still used for its row count, schema and the final gather.
    ///
    /// `max_output`, when `Some`, caps the number of output rows.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when either side exceeds `u32::MAX` rows,
    /// the probe key byte width differs from `index.key_bytes`, the probe
    /// kernel is missing, or a memset/launch fails. Errors from packing,
    /// allocation, gather and synchronize are propagated.
    fn hash_join_inner_v2_indexed(
        &self,
        left: &CudaBuffer,
        right: &CudaBuffer,
        left_keys: &[usize],
        index: &JoinIndexV2,
        max_output: Option<usize>,
    ) -> Result<CudaBuffer> {
        let num_left = self.device_row_count(left)?;
        let num_right = self.device_row_count(right)?;
        // Row counts are passed to the kernels as u32.
        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
            return Err(XlogError::Kernel(format!(
                "Join supports at most {} rows per side (left={}, right={})",
                u32::MAX,
                num_left,
                num_right
            )));
        }

        // An inner join with an empty side is empty.
        if num_left == 0 || num_right == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }

        let num_left = num_left as u32;

        // Only the probe side is hashed/packed; its packed key width must
        // match the width stored in the index.
        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
        if left_packed.key_bytes != index.key_bytes {
            return Err(XlogError::Kernel(
                "Join key byte width mismatch between probe and cached index".to_string(),
            ));
        }

        let table = &index.table;

        let probe_func = self
            .device
            .inner()
            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;

        // One thread per left (probe) row, 256 threads per block.
        let block_size = 256u32;
        let probe_grid = num_left.div_ceil(block_size);
        let probe_config = LaunchConfig {
            grid_dim: (probe_grid, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // Pass 1 (count only): zeroed counter, 1-element placeholder output
        // buffers and an output capacity of 0.
        // NOTE(review): presumably the kernel only increments the counter and
        // writes no indices when the capacity is 0 — confirm against the kernel.
        let mut d_count_only = self.memory.alloc::<u32>(1)?;
        self.device
            .inner()
            .memset_zeros(&mut d_count_only)
            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
        self.device.synchronize()?;
        let d_dummy_left = self.memory.alloc::<u32>(1)?;
        let d_dummy_right = self.memory.alloc::<u32>(1)?;
        let max_output_count_only = 0u32;

        // SAFETY: the parameter order is assumed to match the
        // `hash_join_probe_v2` kernel signature (not visible here). The
        // `as_kernel_param` pointers presumably refer to locals and to fields
        // of `index`, all of which outlive the launch and the synchronize.
        unsafe {
            let mut params: Vec<*mut c_void> = vec![
                (&left_packed.hashes).as_kernel_param(),
                num_left.as_kernel_param(),
                (&table.bucket_offsets).as_kernel_param(),
                (&table.bucket_counts).as_kernel_param(),
                (&table.bucket_entries).as_kernel_param(),
                (&table.bucket_entry_hashes).as_kernel_param(),
                table.bucket_mask.as_kernel_param(),
                (&left_packed.packed_keys).as_kernel_param(),
                (&index.packed_keys).as_kernel_param(),
                index.key_bytes.as_kernel_param(),
                (&d_dummy_left).as_kernel_param(),
                (&d_dummy_right).as_kernel_param(),
                (&d_count_only).as_kernel_param(),
                max_output_count_only.as_kernel_param(),
            ];
            probe_func
                .clone()
                .launch(probe_config, &mut params)
                .map_err(|e| {
                    XlogError::Kernel(format!("hash_join_probe_v2 (count) failed: {}", e))
                })?;
        }

        self.device.synchronize()?;

        // Total number of matches, then clamp to the caller's limit if any.
        let full_count = self.read_join_output_count_metadata(&d_count_only)? as u64;
        let requested = max_output
            .map(|limit| (limit as u64).min(full_count))
            .unwrap_or(full_count);

        if requested == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }

        // Output row indices are u32, so the result size must fit as well.
        if requested > u32::MAX as u64 {
            return Err(XlogError::Kernel(format!(
                "Join produced {} rows which exceeds the u32 index limit",
                requested
            )));
        }

        // Pass 2 (materialize): index buffers sized to `requested` and a fresh
        // zeroed counter. `max_output` is shadowed with the u32 capacity.
        let max_output = requested as u32;
        let d_output_left = self.memory.alloc::<u32>(max_output as usize)?;
        let d_output_right = self.memory.alloc::<u32>(max_output as usize)?;
        let mut d_output_count = self.memory.alloc::<u32>(1)?;
        self.device
            .inner()
            .memset_zeros(&mut d_output_count)
            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
        self.device.synchronize()?;

        // SAFETY: same assumptions as the count pass; the output buffers hold
        // `max_output` elements, matching the capacity passed to the kernel.
        unsafe {
            let mut params: Vec<*mut c_void> = vec![
                (&left_packed.hashes).as_kernel_param(),
                num_left.as_kernel_param(),
                (&table.bucket_offsets).as_kernel_param(),
                (&table.bucket_counts).as_kernel_param(),
                (&table.bucket_entries).as_kernel_param(),
                (&table.bucket_entry_hashes).as_kernel_param(),
                table.bucket_mask.as_kernel_param(),
                (&left_packed.packed_keys).as_kernel_param(),
                (&index.packed_keys).as_kernel_param(),
                index.key_bytes.as_kernel_param(),
                (&d_output_left).as_kernel_param(),
                (&d_output_right).as_kernel_param(),
                (&d_output_count).as_kernel_param(),
                max_output.as_kernel_param(),
            ];
            probe_func
                .clone()
                .launch(probe_config, &mut params)
                .map_err(|e| XlogError::Kernel(format!("hash_join_probe_v2 failed: {}", e)))?;
        }

        self.device.synchronize()?;

        // The counter read back is clamped to the buffer capacity, so the
        // gathers below never read past the index buffers.
        let result_count =
            (self.read_join_output_count_metadata(&d_output_count)? as u64).min(max_output as u64);

        if result_count == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }

        let output_rows = result_count as u32;

        // Materialize the matched rows of each side via the index buffers.
        let gathered_left = self.gather_buffer_by_indices(left, &d_output_left, output_rows)?;
        let gathered_right = self.gather_buffer_by_indices(right, &d_output_right, output_rows)?;

        // Output layout: all left columns followed by all right columns.
        let combined_schema = self.combine_schemas(left.schema(), right.schema());
        let mut result_columns = Vec::with_capacity(combined_schema.arity());
        result_columns.extend(gathered_left.columns);
        result_columns.extend(gathered_right.columns);

        self.buffer_from_columns(result_columns, result_count, combined_schema)
    }
4501
4502 fn hash_join_semi_impl(
4504 &self,
4505 left: &CudaBuffer,
4506 right: &CudaBuffer,
4507 left_keys: &[usize],
4508 right_keys: &[usize],
4509 ) -> Result<CudaBuffer> {
4510 let num_left = self.device_row_count(left)?;
4511 let num_right = self.device_row_count(right)?;
4512 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
4513 return Err(XlogError::Kernel(format!(
4514 "Join supports at most {} rows per side (left={}, right={})",
4515 u32::MAX,
4516 num_left,
4517 num_right
4518 )));
4519 }
4520
4521 if num_left == 0 {
4523 return self.create_empty_buffer(left.schema().clone());
4524 }
4525 if num_right == 0 {
4526 return self.create_empty_buffer(left.schema().clone());
4528 }
4529
4530 if left_keys.is_empty() || right_keys.is_empty() {
4532 return Err(XlogError::Kernel(
4533 "Join requires at least one key column".to_string(),
4534 ));
4535 }
4536 if left_keys.len() != right_keys.len() {
4537 return Err(XlogError::Kernel(
4538 "Left and right key columns must have same length".to_string(),
4539 ));
4540 }
4541
4542 for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
4544 let left_type = left.schema().column_type(left_idx);
4545 let right_type = right.schema().column_type(right_idx);
4546 if left_type != right_type {
4547 return Err(XlogError::Kernel(format!(
4548 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
4549 left_idx, left_type, right_idx, right_type
4550 )));
4551 }
4552 }
4553
4554 let num_left = num_left as u32;
4555 let num_right = num_right as u32;
4556
4557 let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4559 let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
4560
4561 let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
4563
4564 let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
4566
4567 let semi_func = self
4569 .device
4570 .inner()
4571 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
4572 .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
4573
4574 let block_size = 256u32;
4575 let grid_size = num_left.div_ceil(block_size);
4576 let config = LaunchConfig {
4577 grid_dim: (grid_size, 1, 1),
4578 block_dim: (block_size, 1, 1),
4579 shared_mem_bytes: 0,
4580 };
4581
4582 unsafe {
4587 semi_func
4588 .clone()
4589 .launch(
4590 config,
4591 (
4592 &left_packed.hashes,
4593 num_left,
4594 &table.bucket_offsets,
4595 &table.bucket_counts,
4596 &table.bucket_entries,
4597 &table.bucket_entry_hashes,
4598 table.bucket_mask,
4599 &left_packed.packed_keys,
4600 &right_packed.packed_keys,
4601 left_packed.key_bytes,
4602 &d_has_match,
4603 ),
4604 )
4605 .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
4606 }
4607
4608 self.device.synchronize()?;
4609 self.filter_by_device_mask(left, &d_has_match)
4610 }
4611
4612 pub fn membership_mask_device(
4617 &self,
4618 probe: &CudaBuffer,
4619 build: &CudaBuffer,
4620 probe_keys: &[usize],
4621 build_keys: &[usize],
4622 ) -> Result<TrackedCudaSlice<u8>> {
4623 let num_probe = self.device_row_count(probe)?;
4624 let num_build = self.device_row_count(build)?;
4625
4626 if num_probe == 0 {
4628 return self.memory.alloc::<u8>(0);
4629 }
4630
4631 if num_build == 0 {
4633 let mut d_mask = self.memory.alloc::<u8>(num_probe)?;
4634 self.device.inner().memset_zeros(&mut d_mask).map_err(|e| {
4635 XlogError::Kernel(format!(
4636 "Failed to zero membership mask for empty build: {}",
4637 e
4638 ))
4639 })?;
4640 return Ok(d_mask);
4641 }
4642
4643 if num_probe > u32::MAX as usize || num_build > u32::MAX as usize {
4644 return Err(XlogError::Kernel(format!(
4645 "membership_mask supports at most {} rows per side (probe={}, build={})",
4646 u32::MAX,
4647 num_probe,
4648 num_build
4649 )));
4650 }
4651
4652 if probe_keys.is_empty() || build_keys.is_empty() {
4654 return Err(XlogError::Kernel(
4655 "membership_mask requires at least one key column".to_string(),
4656 ));
4657 }
4658 if probe_keys.len() != build_keys.len() {
4659 return Err(XlogError::Kernel(
4660 "Probe and build key columns must have same length".to_string(),
4661 ));
4662 }
4663
4664 for (&p_idx, &b_idx) in probe_keys.iter().zip(build_keys.iter()) {
4666 let p_type = probe.schema().column_type(p_idx);
4667 let b_type = build.schema().column_type(b_idx);
4668 if p_type != b_type {
4669 return Err(XlogError::Kernel(format!(
4670 "Key column type mismatch: probe[{}]={:?}, build[{}]={:?}",
4671 p_idx, p_type, b_idx, b_type
4672 )));
4673 }
4674 }
4675
4676 let num_probe_u32 = num_probe as u32;
4677 let num_build_u32 = num_build as u32;
4678
4679 let probe_packed = self.compute_hashes_and_pack_keys(probe, probe_keys)?;
4681 let build_packed = self.compute_hashes_and_pack_keys(build, build_keys)?;
4682
4683 let table = self.build_hash_table_v2(&build_packed.hashes, num_build_u32)?;
4685
4686 let d_has_match = self.memory.alloc::<u8>(num_probe)?;
4688
4689 let semi_func = self
4691 .device
4692 .inner()
4693 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
4694 .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
4695
4696 let block_size = 256u32;
4697 let grid_size = num_probe_u32.div_ceil(block_size);
4698 let config = LaunchConfig {
4699 grid_dim: (grid_size, 1, 1),
4700 block_dim: (block_size, 1, 1),
4701 shared_mem_bytes: 0,
4702 };
4703
4704 unsafe {
4710 semi_func
4711 .clone()
4712 .launch(
4713 config,
4714 (
4715 &probe_packed.hashes,
4716 num_probe_u32,
4717 &table.bucket_offsets,
4718 &table.bucket_counts,
4719 &table.bucket_entries,
4720 &table.bucket_entry_hashes,
4721 table.bucket_mask,
4722 &probe_packed.packed_keys,
4723 &build_packed.packed_keys,
4724 probe_packed.key_bytes,
4725 &d_has_match,
4726 ),
4727 )
4728 .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
4729 }
4730
4731 Ok(d_has_match)
4732 }
4733
4734 pub fn membership_mask(
4739 &self,
4740 probe: &CudaBuffer,
4741 build: &CudaBuffer,
4742 probe_keys: &[usize],
4743 build_keys: &[usize],
4744 ) -> Result<Vec<bool>> {
4745 let d_has_match = self.membership_mask_device(probe, build, probe_keys, build_keys)?;
4746 let num_probe = d_has_match.len();
4747 if num_probe == 0 {
4748 return Ok(Vec::new());
4749 }
4750 let mut host_mask = vec![0u8; num_probe];
4751 self.device
4752 .inner()
4753 .dtoh_sync_copy_into(&d_has_match, &mut host_mask)
4754 .map_err(|e| XlogError::Kernel(format!("Failed to download membership mask: {}", e)))?;
4755 Ok(host_mask.into_iter().map(|b| b != 0).collect())
4756 }
4757
4758 fn hash_join_semi_indexed(
4759 &self,
4760 left: &CudaBuffer,
4761 left_keys: &[usize],
4762 index: &JoinIndexV2,
4763 ) -> Result<CudaBuffer> {
4764 let num_left = self.device_row_count(left)?;
4765 if num_left > u32::MAX as usize {
4766 return Err(XlogError::Kernel(format!(
4767 "Join supports at most {} rows on left side (left={})",
4768 u32::MAX,
4769 num_left
4770 )));
4771 }
4772
4773 if num_left == 0 {
4775 return self.create_empty_buffer(left.schema().clone());
4776 }
4777 if index.right_num_rows == 0 {
4778 return self.create_empty_buffer(left.schema().clone());
4779 }
4780
4781 let num_left = num_left as u32;
4782
4783 let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4784 if left_packed.key_bytes != index.key_bytes {
4785 return Err(XlogError::Kernel(
4786 "Join key byte width mismatch between probe and cached index".to_string(),
4787 ));
4788 }
4789
4790 let table = &index.table;
4791
4792 let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
4794
4795 let semi_func = self
4796 .device
4797 .inner()
4798 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
4799 .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
4800
4801 let block_size = 256u32;
4802 let grid_size = num_left.div_ceil(block_size);
4803 let config = LaunchConfig {
4804 grid_dim: (grid_size, 1, 1),
4805 block_dim: (block_size, 1, 1),
4806 shared_mem_bytes: 0,
4807 };
4808
4809 unsafe {
4811 semi_func
4812 .clone()
4813 .launch(
4814 config,
4815 (
4816 &left_packed.hashes,
4817 num_left,
4818 &table.bucket_offsets,
4819 &table.bucket_counts,
4820 &table.bucket_entries,
4821 &table.bucket_entry_hashes,
4822 table.bucket_mask,
4823 &left_packed.packed_keys,
4824 &index.packed_keys,
4825 index.key_bytes,
4826 &d_has_match,
4827 ),
4828 )
4829 .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
4830 }
4831
4832 self.device.synchronize()?;
4833 self.filter_by_device_mask(left, &d_has_match)
4834 }
4835
4836 fn hash_join_anti_impl(
4838 &self,
4839 left: &CudaBuffer,
4840 right: &CudaBuffer,
4841 left_keys: &[usize],
4842 right_keys: &[usize],
4843 ) -> Result<CudaBuffer> {
4844 let num_left = self.device_row_count(left)?;
4845 let num_right = self.device_row_count(right)?;
4846 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
4847 return Err(XlogError::Kernel(format!(
4848 "Join supports at most {} rows per side (left={}, right={})",
4849 u32::MAX,
4850 num_left,
4851 num_right
4852 )));
4853 }
4854
4855 if num_left == 0 {
4857 return self.create_empty_buffer(left.schema().clone());
4858 }
4859 if num_right == 0 {
4860 return self.clone_buffer(left);
4862 }
4863
4864 if left_keys.is_empty() || right_keys.is_empty() {
4866 return Err(XlogError::Kernel(
4867 "Join requires at least one key column".to_string(),
4868 ));
4869 }
4870 if left_keys.len() != right_keys.len() {
4871 return Err(XlogError::Kernel(
4872 "Left and right key columns must have same length".to_string(),
4873 ));
4874 }
4875
4876 for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
4878 let left_type = left.schema().column_type(left_idx);
4879 let right_type = right.schema().column_type(right_idx);
4880 if left_type != right_type {
4881 return Err(XlogError::Kernel(format!(
4882 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
4883 left_idx, left_type, right_idx, right_type
4884 )));
4885 }
4886 }
4887
4888 let num_left = num_left as u32;
4889 let num_right = num_right as u32;
4890
4891 let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4893 let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
4894
4895 let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
4897
4898 let d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
4900
4901 let anti_func = self
4903 .device
4904 .inner()
4905 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_ANTI)
4906 .ok_or_else(|| XlogError::Kernel("hash_join_anti kernel not found".to_string()))?;
4907
4908 let block_size = 256u32;
4909 let grid_size = num_left.div_ceil(block_size);
4910 let config = LaunchConfig {
4911 grid_dim: (grid_size, 1, 1),
4912 block_dim: (block_size, 1, 1),
4913 shared_mem_bytes: 0,
4914 };
4915
4916 unsafe {
4921 anti_func
4922 .clone()
4923 .launch(
4924 config,
4925 (
4926 &left_packed.hashes,
4927 num_left,
4928 &table.bucket_offsets,
4929 &table.bucket_counts,
4930 &table.bucket_entries,
4931 &table.bucket_entry_hashes,
4932 table.bucket_mask,
4933 &left_packed.packed_keys,
4934 &right_packed.packed_keys,
4935 left_packed.key_bytes,
4936 &d_no_match,
4937 ),
4938 )
4939 .map_err(|e| XlogError::Kernel(format!("hash_join_anti failed: {}", e)))?;
4940 }
4941
4942 self.device.synchronize()?;
4943 self.filter_by_device_mask(left, &d_no_match)
4944 }
4945
4946 fn hash_join_anti_indexed(
4947 &self,
4948 left: &CudaBuffer,
4949 right: &CudaBuffer,
4950 left_keys: &[usize],
4951 index: &JoinIndexV2,
4952 ) -> Result<CudaBuffer> {
4953 let num_left = self.device_row_count(left)?;
4954 let num_right = self.device_row_count(right)?;
4955 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
4956 return Err(XlogError::Kernel(format!(
4957 "Join supports at most {} rows per side (left={}, right={})",
4958 u32::MAX,
4959 num_left,
4960 num_right
4961 )));
4962 }
4963 if num_left == 0 {
4964 return self.create_empty_buffer(left.schema().clone());
4965 }
4966 if num_right == 0 {
4967 return self.clone_buffer(left);
4968 }
4969
4970 let num_left = num_left as u32;
4971
4972 let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4973 if left_packed.key_bytes != index.key_bytes {
4974 return Err(XlogError::Kernel(
4975 "Join key byte width mismatch between probe and cached index".to_string(),
4976 ));
4977 }
4978
4979 let table = &index.table;
4980
4981 let d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
4982
4983 let anti_func = self
4984 .device
4985 .inner()
4986 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_ANTI)
4987 .ok_or_else(|| XlogError::Kernel("hash_join_anti kernel not found".to_string()))?;
4988
4989 let block_size = 256u32;
4990 let grid_size = num_left.div_ceil(block_size);
4991 let config = LaunchConfig {
4992 grid_dim: (grid_size, 1, 1),
4993 block_dim: (block_size, 1, 1),
4994 shared_mem_bytes: 0,
4995 };
4996
4997 unsafe {
4999 anti_func
5000 .clone()
5001 .launch(
5002 config,
5003 (
5004 &left_packed.hashes,
5005 num_left,
5006 &table.bucket_offsets,
5007 &table.bucket_counts,
5008 &table.bucket_entries,
5009 &table.bucket_entry_hashes,
5010 table.bucket_mask,
5011 &left_packed.packed_keys,
5012 &index.packed_keys,
5013 index.key_bytes,
5014 &d_no_match,
5015 ),
5016 )
5017 .map_err(|e| XlogError::Kernel(format!("hash_join_anti failed: {}", e)))?;
5018 }
5019
5020 self.device.synchronize()?;
5021 self.filter_by_device_mask(left, &d_no_match)
5022 }
5023
5024 fn hash_join_left_outer_indexed(
5025 &self,
5026 left: &CudaBuffer,
5027 right: &CudaBuffer,
5028 left_keys: &[usize],
5029 index: &JoinIndexV2,
5030 max_output: Option<usize>,
5031 ) -> Result<CudaBuffer> {
5032 let num_left = self.device_row_count(left)?;
5033 let num_right = self.device_row_count(right)?;
5034 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
5035 return Err(XlogError::Kernel(format!(
5036 "Join supports at most {} rows per side (left={}, right={})",
5037 u32::MAX,
5038 num_left,
5039 num_right
5040 )));
5041 }
5042
5043 if num_left == 0 {
5045 let combined_schema = self.combine_schemas(left.schema(), right.schema());
5046 return self.create_empty_buffer(combined_schema);
5047 }
5048 if num_right == 0 {
5050 return self.left_outer_with_nulls(left, right);
5051 }
5052
5053 let num_left = num_left as u32;
5054
5055 let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
5056 if left_packed.key_bytes != index.key_bytes {
5057 return Err(XlogError::Kernel(
5058 "Join key byte width mismatch between probe and cached index".to_string(),
5059 ));
5060 }
5061
5062 let table = &index.table;
5063
5064 let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
5066
5067 let semi_func = self
5068 .device
5069 .inner()
5070 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
5071 .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
5072
5073 let block_size = 256u32;
5074 let grid_size = num_left.div_ceil(block_size);
5075 let config = LaunchConfig {
5076 grid_dim: (grid_size, 1, 1),
5077 block_dim: (block_size, 1, 1),
5078 shared_mem_bytes: 0,
5079 };
5080
5081 unsafe {
5083 semi_func
5084 .clone()
5085 .launch(
5086 config,
5087 (
5088 &left_packed.hashes,
5089 num_left,
5090 &table.bucket_offsets,
5091 &table.bucket_counts,
5092 &table.bucket_entries,
5093 &table.bucket_entry_hashes,
5094 table.bucket_mask,
5095 &left_packed.packed_keys,
5096 &index.packed_keys,
5097 index.key_bytes,
5098 &d_has_match,
5099 ),
5100 )
5101 .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
5102 }
5103
5104 let probe_func = self
5105 .device
5106 .inner()
5107 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
5108 .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
5109
5110 let mut d_count_only = self.memory.alloc::<u32>(1)?;
5112 self.device
5113 .inner()
5114 .memset_zeros(&mut d_count_only)
5115 .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
5116 let d_dummy_left = self.memory.alloc::<u32>(1)?;
5117 let d_dummy_right = self.memory.alloc::<u32>(1)?;
5118 let max_output_count_only = 0u32;
5119
5120 unsafe {
5122 let mut params: Vec<*mut c_void> = vec![
5123 (&left_packed.hashes).as_kernel_param(),
5124 num_left.as_kernel_param(),
5125 (&table.bucket_offsets).as_kernel_param(),
5126 (&table.bucket_counts).as_kernel_param(),
5127 (&table.bucket_entries).as_kernel_param(),
5128 (&table.bucket_entry_hashes).as_kernel_param(),
5129 table.bucket_mask.as_kernel_param(),
5130 (&left_packed.packed_keys).as_kernel_param(),
5131 (&index.packed_keys).as_kernel_param(),
5132 index.key_bytes.as_kernel_param(),
5133 (&d_dummy_left).as_kernel_param(),
5134 (&d_dummy_right).as_kernel_param(),
5135 (&d_count_only).as_kernel_param(),
5136 max_output_count_only.as_kernel_param(),
5137 ];
5138 probe_func
5139 .clone()
5140 .launch(config, &mut params)
5141 .map_err(|e| {
5142 XlogError::Kernel(format!("hash_join_probe_v2 (count) failed: {}", e))
5143 })?;
5144 }
5145
5146 self.device.synchronize()?;
5147
5148 let full_inner = self.read_join_output_count_metadata(&d_count_only)? as u64;
5151 let requested_inner = max_output
5152 .map(|limit| (limit as u64).min(full_inner))
5153 .unwrap_or(full_inner);
5154
5155 if requested_inner > u32::MAX as u64 {
5156 return Err(XlogError::Kernel(format!(
5157 "Join produced {} rows which exceeds the u32 index limit",
5158 requested_inner
5159 )));
5160 }
5161
5162 let max_output = requested_inner as u32;
5163 let alloc_len = (requested_inner.max(1)) as usize;
5164 let d_output_left = self.memory.alloc::<u32>(alloc_len)?;
5165 let d_output_right = self.memory.alloc::<u32>(alloc_len)?;
5166 let mut d_output_count = self.memory.alloc::<u32>(1)?;
5167 self.device
5168 .inner()
5169 .memset_zeros(&mut d_output_count)
5170 .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
5171
5172 unsafe {
5174 let mut params: Vec<*mut c_void> = vec![
5175 (&left_packed.hashes).as_kernel_param(),
5176 num_left.as_kernel_param(),
5177 (&table.bucket_offsets).as_kernel_param(),
5178 (&table.bucket_counts).as_kernel_param(),
5179 (&table.bucket_entries).as_kernel_param(),
5180 (&table.bucket_entry_hashes).as_kernel_param(),
5181 table.bucket_mask.as_kernel_param(),
5182 (&left_packed.packed_keys).as_kernel_param(),
5183 (&index.packed_keys).as_kernel_param(),
5184 index.key_bytes.as_kernel_param(),
5185 (&d_output_left).as_kernel_param(),
5186 (&d_output_right).as_kernel_param(),
5187 (&d_output_count).as_kernel_param(),
5188 max_output.as_kernel_param(),
5189 ];
5190 probe_func
5191 .clone()
5192 .launch(config, &mut params)
5193 .map_err(|e| XlogError::Kernel(format!("hash_join_probe_v2 failed: {}", e)))?;
5194 }
5195
5196 self.device.synchronize()?;
5197
5198 let device = self.device.inner();
5199
5200 let inner_count = self
5205 .read_join_output_count_metadata(&d_output_count)?
5206 .min(max_output);
5207
5208 let mask_not_fn = device
5209 .get_func(FILTER_MODULE, filter_kernels::MASK_NOT)
5210 .ok_or_else(|| XlogError::Kernel("mask_not kernel not found".to_string()))?;
5211
5212 let mut d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
5213
5214 unsafe {
5216 mask_not_fn
5217 .clone()
5218 .launch(config, (&d_has_match, &mut d_no_match, num_left))
5219 }
5220 .map_err(|e| XlogError::Kernel(format!("mask_not failed: {}", e)))?;
5221
5222 let unmatched_left = self.filter_by_device_mask(left, &d_no_match)?;
5223
5224 let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
5225 let total_rows = (inner_count as u64) + unmatched_rows;
5226
5227 let combined_schema = self.combine_schemas(left.schema(), right.schema());
5228
5229 if total_rows == 0 {
5230 return self.create_empty_buffer(combined_schema);
5231 }
5232
5233 let inner_left = self.gather_buffer_by_indices(left, &d_output_left, inner_count)?;
5234 let inner_right = self.gather_buffer_by_indices(right, &d_output_right, inner_count)?;
5235
5236 if unmatched_rows == 0 {
5237 let mut result_columns = Vec::with_capacity(combined_schema.arity());
5238 result_columns.extend(inner_left.columns);
5239 result_columns.extend(inner_right.columns);
5240 return self.buffer_from_columns(result_columns, inner_count as u64, combined_schema);
5241 }
5242
5243 if inner_count == 0 {
5244 let mut result_columns = Vec::with_capacity(combined_schema.arity());
5245 result_columns.extend(unmatched_left.columns);
5246
5247 for col_idx in 0..right.arity() {
5248 let elem_size = right
5249 .schema()
5250 .column_type(col_idx)
5251 .map(|t| t.size_bytes())
5252 .unwrap_or(4);
5253
5254 let bytes = (unmatched_rows as usize)
5255 .checked_mul(elem_size)
5256 .ok_or_else(|| {
5257 XlogError::Kernel(
5258 "Left outer join: right column byte size overflow".to_string(),
5259 )
5260 })?;
5261
5262 let mut dst_col = self.memory.alloc::<u8>(bytes)?;
5263 if bytes > 0 {
5264 device.memset_zeros(&mut dst_col).map_err(|e| {
5265 XlogError::Kernel(format!("Failed to zero null right column: {}", e))
5266 })?;
5267 }
5268 result_columns.push(dst_col.into());
5269 }
5270
5271 self.device.synchronize()?;
5272 return self.buffer_from_columns(result_columns, unmatched_rows, combined_schema);
5273 }
5274
5275 let mut result_columns = Vec::with_capacity(combined_schema.arity());
5276 let inner_rows = inner_count as u64;
5277
5278 for (col_idx, (inner_col, unmatched_col)) in inner_left
5279 .columns
5280 .into_iter()
5281 .zip(unmatched_left.columns)
5282 .enumerate()
5283 {
5284 let elem_size = left
5285 .schema()
5286 .column_type(col_idx)
5287 .map(|t| t.size_bytes())
5288 .unwrap_or(4);
5289
5290 let inner_bytes = (inner_rows as usize)
5291 .checked_mul(elem_size)
5292 .ok_or_else(|| {
5293 XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
5294 })?;
5295 let unmatched_bytes = (unmatched_rows as usize)
5296 .checked_mul(elem_size)
5297 .ok_or_else(|| {
5298 XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
5299 })?;
5300 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
5301 XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
5302 })?;
5303
5304 let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
5305
5306 if inner_bytes > 0 {
5307 let mut out_view = out_col.slice_mut(0..inner_bytes);
5308 device.dtod_copy(&inner_col, &mut out_view).map_err(|e| {
5309 XlogError::Kernel(format!("Failed to copy inner left column: {}", e))
5310 })?;
5311 }
5312 if unmatched_bytes > 0 {
5313 let mut out_view = out_col.slice_mut(inner_bytes..total_bytes);
5314 let unmatched_view = self.column_bytes_view(&unmatched_col, unmatched_bytes)?;
5315 device
5316 .dtod_copy(&unmatched_view, &mut out_view)
5317 .map_err(|e| {
5318 XlogError::Kernel(format!("Failed to copy unmatched left column: {}", e))
5319 })?;
5320 }
5321
5322 result_columns.push(out_col.into());
5323 }
5324
5325 for (col_idx, inner_col) in inner_right.columns.into_iter().enumerate() {
5326 let elem_size = right
5327 .schema()
5328 .column_type(col_idx)
5329 .map(|t| t.size_bytes())
5330 .unwrap_or(4);
5331
5332 let inner_bytes = (inner_rows as usize)
5333 .checked_mul(elem_size)
5334 .ok_or_else(|| {
5335 XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
5336 })?;
5337 let unmatched_bytes = (unmatched_rows as usize)
5338 .checked_mul(elem_size)
5339 .ok_or_else(|| {
5340 XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
5341 })?;
5342 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
5343 XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
5344 })?;
5345
5346 let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
5347
5348 if total_bytes > 0 {
5349 device.memset_zeros(&mut out_col).map_err(|e| {
5350 XlogError::Kernel(format!("Failed to zero right outer column: {}", e))
5351 })?;
5352 }
5353
5354 if inner_bytes > 0 {
5355 let mut out_view = out_col.slice_mut(0..inner_bytes);
5356 device.dtod_copy(&inner_col, &mut out_view).map_err(|e| {
5357 XlogError::Kernel(format!("Failed to copy inner right column: {}", e))
5358 })?;
5359 }
5360
5361 result_columns.push(out_col.into());
5362 }
5363
5364 self.device.synchronize()?;
5365
5366 self.buffer_from_columns(result_columns, total_rows, combined_schema)
5367 }
5368
5369 fn hash_join_left_outer_impl(
5371 &self,
5372 left: &CudaBuffer,
5373 right: &CudaBuffer,
5374 left_keys: &[usize],
5375 right_keys: &[usize],
5376 max_output: Option<usize>,
5377 ) -> Result<CudaBuffer> {
5378 let num_left = self.device_row_count(left)?;
5379 let num_right = self.device_row_count(right)?;
5380 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
5381 return Err(XlogError::Kernel(format!(
5382 "Join supports at most {} rows per side (left={}, right={})",
5383 u32::MAX,
5384 num_left,
5385 num_right
5386 )));
5387 }
5388
5389 if num_left == 0 {
5391 let combined_schema = self.combine_schemas(left.schema(), right.schema());
5392 return self.create_empty_buffer(combined_schema);
5393 }
5394
5395 if num_right == 0 {
5397 return self.left_outer_with_nulls(left, right);
5398 }
5399
5400 if left_keys.is_empty() || right_keys.is_empty() {
5402 return Err(XlogError::Kernel(
5403 "Join requires at least one key column".to_string(),
5404 ));
5405 }
5406 if left_keys.len() != right_keys.len() {
5407 return Err(XlogError::Kernel(
5408 "Left and right key columns must have same length".to_string(),
5409 ));
5410 }
5411
5412 for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
5414 let left_type = left.schema().column_type(left_idx);
5415 let right_type = right.schema().column_type(right_idx);
5416 if left_type != right_type {
5417 return Err(XlogError::Kernel(format!(
5418 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
5419 left_idx, left_type, right_idx, right_type
5420 )));
5421 }
5422 }
5423
5424 let num_left = num_left as u32;
5425 let num_right = num_right as u32;
5426
5427 let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
5429 let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
5430
5431 let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
5433
5434 let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
5436
5437 let semi_func = self
5439 .device
5440 .inner()
5441 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
5442 .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
5443
5444 let block_size = 256u32;
5445 let grid_size = num_left.div_ceil(block_size);
5446 let config = LaunchConfig {
5447 grid_dim: (grid_size, 1, 1),
5448 block_dim: (block_size, 1, 1),
5449 shared_mem_bytes: 0,
5450 };
5451
5452 unsafe {
5457 semi_func
5458 .clone()
5459 .launch(
5460 config,
5461 (
5462 &left_packed.hashes,
5463 num_left,
5464 &table.bucket_offsets,
5465 &table.bucket_counts,
5466 &table.bucket_entries,
5467 &table.bucket_entry_hashes,
5468 table.bucket_mask,
5469 &left_packed.packed_keys,
5470 &right_packed.packed_keys,
5471 left_packed.key_bytes,
5472 &d_has_match,
5473 ),
5474 )
5475 .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
5476 }
5477
5478 let probe_func = self
5479 .device
5480 .inner()
5481 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
5482 .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
5483
5484 let mut d_count_only = self.memory.alloc::<u32>(1)?;
5486 self.device
5487 .inner()
5488 .memset_zeros(&mut d_count_only)
5489 .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
5490 let d_dummy_left = self.memory.alloc::<u32>(1)?;
5491 let d_dummy_right = self.memory.alloc::<u32>(1)?;
5492 let max_output_count_only = 0u32;
5493
5494 unsafe {
5496 let mut params: Vec<*mut c_void> = vec![
5497 (&left_packed.hashes).as_kernel_param(),
5498 num_left.as_kernel_param(),
5499 (&table.bucket_offsets).as_kernel_param(),
5500 (&table.bucket_counts).as_kernel_param(),
5501 (&table.bucket_entries).as_kernel_param(),
5502 (&table.bucket_entry_hashes).as_kernel_param(),
5503 table.bucket_mask.as_kernel_param(),
5504 (&left_packed.packed_keys).as_kernel_param(),
5505 (&right_packed.packed_keys).as_kernel_param(),
5506 left_packed.key_bytes.as_kernel_param(),
5507 (&d_dummy_left).as_kernel_param(),
5508 (&d_dummy_right).as_kernel_param(),
5509 (&d_count_only).as_kernel_param(),
5510 max_output_count_only.as_kernel_param(),
5511 ];
5512 probe_func
5513 .clone()
5514 .launch(config, &mut params)
5515 .map_err(|e| {
5516 XlogError::Kernel(format!("hash_join_probe_v2 (count) failed: {}", e))
5517 })?;
5518 }
5519
5520 self.device.synchronize()?;
5521
5522 let full_inner = self.read_join_output_count_metadata(&d_count_only)? as u64;
5525 let requested_inner = max_output
5526 .map(|limit| (limit as u64).min(full_inner))
5527 .unwrap_or(full_inner);
5528
5529 if requested_inner > u32::MAX as u64 {
5530 return Err(XlogError::Kernel(format!(
5531 "Join produced {} rows which exceeds the u32 index limit",
5532 requested_inner
5533 )));
5534 }
5535
5536 let max_output = requested_inner as u32;
5537 let alloc_len = (requested_inner.max(1)) as usize;
5538 let d_output_left = self.memory.alloc::<u32>(alloc_len)?;
5539 let d_output_right = self.memory.alloc::<u32>(alloc_len)?;
5540 let mut d_output_count = self.memory.alloc::<u32>(1)?;
5541 self.device
5542 .inner()
5543 .memset_zeros(&mut d_output_count)
5544 .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
5545
5546 unsafe {
5553 let mut params: Vec<*mut c_void> = vec![
5554 (&left_packed.hashes).as_kernel_param(),
5555 num_left.as_kernel_param(),
5556 (&table.bucket_offsets).as_kernel_param(),
5557 (&table.bucket_counts).as_kernel_param(),
5558 (&table.bucket_entries).as_kernel_param(),
5559 (&table.bucket_entry_hashes).as_kernel_param(),
5560 table.bucket_mask.as_kernel_param(),
5561 (&left_packed.packed_keys).as_kernel_param(),
5562 (&right_packed.packed_keys).as_kernel_param(),
5563 left_packed.key_bytes.as_kernel_param(),
5564 (&d_output_left).as_kernel_param(),
5565 (&d_output_right).as_kernel_param(),
5566 (&d_output_count).as_kernel_param(),
5567 max_output.as_kernel_param(),
5568 ];
5569 probe_func
5570 .clone()
5571 .launch(config, &mut params)
5572 .map_err(|e| XlogError::Kernel(format!("hash_join_probe_v2 failed: {}", e)))?;
5573 }
5574
5575 self.device.synchronize()?;
5576
5577 let device = self.device.inner();
5578
5579 let inner_count = self
5584 .read_join_output_count_metadata(&d_output_count)?
5585 .min(max_output);
5586
5587 let mask_not_fn = device
5589 .get_func(FILTER_MODULE, filter_kernels::MASK_NOT)
5590 .ok_or_else(|| XlogError::Kernel("mask_not kernel not found".to_string()))?;
5591
5592 let mut d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
5593
5594 unsafe {
5596 mask_not_fn
5597 .clone()
5598 .launch(config, (&d_has_match, &mut d_no_match, num_left))
5599 }
5600 .map_err(|e| XlogError::Kernel(format!("mask_not failed: {}", e)))?;
5601
5602 let unmatched_left = self.filter_by_device_mask(left, &d_no_match)?;
5603
5604 let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
5605 let total_rows = (inner_count as u64) + unmatched_rows;
5606
5607 let combined_schema = self.combine_schemas(left.schema(), right.schema());
5608
5609 if total_rows == 0 {
5610 return self.create_empty_buffer(combined_schema);
5611 }
5612
5613 let inner_left = self.gather_buffer_by_indices(left, &d_output_left, inner_count)?;
5615 let inner_right = self.gather_buffer_by_indices(right, &d_output_right, inner_count)?;
5616
5617 if unmatched_rows == 0 {
5618 let mut result_columns = Vec::with_capacity(combined_schema.arity());
5619 result_columns.extend(inner_left.columns);
5620 result_columns.extend(inner_right.columns);
5621 return self.buffer_from_columns(result_columns, inner_count as u64, combined_schema);
5622 }
5623
5624 if inner_count == 0 {
5625 let mut result_columns = Vec::with_capacity(combined_schema.arity());
5626 result_columns.extend(unmatched_left.columns);
5627
5628 for col_idx in 0..right.arity() {
5629 let elem_size = right
5630 .schema()
5631 .column_type(col_idx)
5632 .map(|t| t.size_bytes())
5633 .unwrap_or(4);
5634
5635 let bytes = (unmatched_rows as usize)
5636 .checked_mul(elem_size)
5637 .ok_or_else(|| {
5638 XlogError::Kernel(
5639 "Left outer join: right column byte size overflow".to_string(),
5640 )
5641 })?;
5642
5643 let mut dst_col = self.memory.alloc::<u8>(bytes)?;
5644 if bytes > 0 {
5645 device.memset_zeros(&mut dst_col).map_err(|e| {
5646 XlogError::Kernel(format!("Failed to zero null right column: {}", e))
5647 })?;
5648 }
5649 result_columns.push(dst_col.into());
5650 }
5651
5652 self.device.synchronize()?;
5653 return self.buffer_from_columns(result_columns, unmatched_rows, combined_schema);
5654 }
5655
5656 let mut result_columns = Vec::with_capacity(combined_schema.arity());
5658 let inner_rows = inner_count as u64;
5659
5660 for (col_idx, (inner_col, unmatched_col)) in inner_left
5662 .columns
5663 .into_iter()
5664 .zip(unmatched_left.columns)
5665 .enumerate()
5666 {
5667 let elem_size = left
5668 .schema()
5669 .column_type(col_idx)
5670 .map(|t| t.size_bytes())
5671 .unwrap_or(4);
5672
5673 let inner_bytes = (inner_rows as usize)
5674 .checked_mul(elem_size)
5675 .ok_or_else(|| {
5676 XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
5677 })?;
5678 let unmatched_bytes = (unmatched_rows as usize)
5679 .checked_mul(elem_size)
5680 .ok_or_else(|| {
5681 XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
5682 })?;
5683 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
5684 XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
5685 })?;
5686
5687 let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
5688
5689 if inner_bytes > 0 {
5690 let mut out_view = out_col.slice_mut(0..inner_bytes);
5691 device.dtod_copy(&inner_col, &mut out_view).map_err(|e| {
5692 XlogError::Kernel(format!("Failed to copy inner left column: {}", e))
5693 })?;
5694 }
5695 if unmatched_bytes > 0 {
5696 let mut out_view = out_col.slice_mut(inner_bytes..total_bytes);
5697 let unmatched_view = self.column_bytes_view(&unmatched_col, unmatched_bytes)?;
5698 device
5699 .dtod_copy(&unmatched_view, &mut out_view)
5700 .map_err(|e| {
5701 XlogError::Kernel(format!("Failed to copy unmatched left column: {}", e))
5702 })?;
5703 }
5704
5705 result_columns.push(out_col.into());
5706 }
5707
5708 for (col_idx, inner_col) in inner_right.columns.into_iter().enumerate() {
5710 let elem_size = right
5711 .schema()
5712 .column_type(col_idx)
5713 .map(|t| t.size_bytes())
5714 .unwrap_or(4);
5715
5716 let inner_bytes = (inner_rows as usize)
5717 .checked_mul(elem_size)
5718 .ok_or_else(|| {
5719 XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
5720 })?;
5721 let unmatched_bytes = (unmatched_rows as usize)
5722 .checked_mul(elem_size)
5723 .ok_or_else(|| {
5724 XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
5725 })?;
5726 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
5727 XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
5728 })?;
5729
5730 let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
5731
5732 if total_bytes > 0 {
5733 device.memset_zeros(&mut out_col).map_err(|e| {
5734 XlogError::Kernel(format!("Failed to zero right outer column: {}", e))
5735 })?;
5736 }
5737
5738 if inner_bytes > 0 {
5739 let mut out_view = out_col.slice_mut(0..inner_bytes);
5740 device.dtod_copy(&inner_col, &mut out_view).map_err(|e| {
5741 XlogError::Kernel(format!("Failed to copy inner right column: {}", e))
5742 })?;
5743 }
5744
5745 result_columns.push(out_col.into());
5746 }
5747
5748 self.device.synchronize()?;
5749
5750 self.buffer_from_columns(result_columns, total_rows, combined_schema)
5751 }
5752
5753 fn left_outer_with_nulls(&self, left: &CudaBuffer, right: &CudaBuffer) -> Result<CudaBuffer> {
5755 let combined_schema = self.combine_schemas(left.schema(), right.schema());
5756 let num_rows = self.device_row_count(left)? as u64;
5757 if num_rows == 0 {
5758 return self.create_empty_buffer(combined_schema);
5759 }
5760 let device = self.device.inner();
5761
5762 let mut result_columns = Vec::with_capacity(combined_schema.arity());
5763
5764 for col_idx in 0..left.arity() {
5766 let col = left
5767 .column(col_idx)
5768 .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
5769
5770 let elem_size = left
5771 .schema()
5772 .column_type(col_idx)
5773 .map(|t| t.size_bytes())
5774 .unwrap_or(4);
5775
5776 let bytes = (num_rows as usize) * elem_size;
5777 let mut dst_col = self.memory.alloc::<u8>(bytes)?;
5778 if bytes > 0 {
5779 let src_view = self.column_bytes_view(col, bytes)?;
5780 device
5781 .dtod_copy(&src_view, &mut dst_col)
5782 .map_err(|e| XlogError::Kernel(format!("Failed to copy left column: {}", e)))?;
5783 }
5784
5785 result_columns.push(dst_col.into());
5786 }
5787
5788 for col_idx in 0..right.arity() {
5790 let elem_size = right
5791 .schema()
5792 .column_type(col_idx)
5793 .map(|t| t.size_bytes())
5794 .unwrap_or(4);
5795
5796 let bytes = (num_rows as usize) * elem_size;
5797 let mut dst_col = self.memory.alloc::<u8>(bytes)?;
5798 if bytes > 0 {
5799 device
5800 .memset_zeros(&mut dst_col)
5801 .map_err(|e| XlogError::Kernel(format!("Failed to zero null column: {}", e)))?;
5802 }
5803
5804 result_columns.push(dst_col.into());
5805 }
5806
5807 self.device.synchronize()?;
5808
5809 self.buffer_from_columns(result_columns, num_rows, combined_schema)
5810 }
5811
5812 pub fn clone_buffer(&self, buffer: &CudaBuffer) -> Result<CudaBuffer> {
5817 let verify = {
5822 static ENABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
5823 *ENABLED.get_or_init(|| {
5824 std::env::var("XLOG_DEBUG_VERIFY_CLONES").map(|v| v == "1") == Ok(true)
5825 })
5826 };
5827
5828 let mut result_columns = Vec::with_capacity(buffer.arity());
5829 let device = self.device.inner();
5830
5831 for col_idx in 0..buffer.arity() {
5832 let src_col = buffer
5833 .column(col_idx)
5834 .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
5835 let mut dst_col = self.memory.alloc::<u8>(src_col.len())?;
5836 if !src_col.is_empty() {
5837 device
5838 .dtod_copy(src_col, &mut dst_col)
5839 .map_err(|e| XlogError::Kernel(format!("Failed to clone column: {}", e)))?;
5840 }
5841 if verify && !src_col.is_empty() {
5842 self.device.synchronize()?;
5843 let mut src_host = vec![0u8; src_col.len()];
5844 let mut dst_host = vec![0u8; dst_col.len()];
5845 device
5846 .dtoh_sync_copy_into(src_col, &mut src_host)
5847 .map_err(|e| XlogError::Kernel(format!("verify src dtoh: {}", e)))?;
5848 device
5849 .dtoh_sync_copy_into(&dst_col, &mut dst_host)
5850 .map_err(|e| XlogError::Kernel(format!("verify dst dtoh: {}", e)))?;
5851 if src_host != dst_host {
5852 let first_diff = src_host
5853 .iter()
5854 .zip(dst_host.iter())
5855 .position(|(a, b)| a != b)
5856 .unwrap_or(0);
5857 return Err(XlogError::Kernel(format!(
5858 "CLONE VERIFY FAILED: column {} differs from source at byte {} of {} (clone is wrong at birth)",
5859 col_idx,
5860 first_diff,
5861 src_col.len(),
5862 )));
5863 }
5864 }
5865 result_columns.push(dst_col.into());
5866 }
5867
5868 let mut d_num_rows = self.memory.alloc::<u32>(1)?;
5869 device
5870 .dtod_copy(buffer.num_rows_device(), &mut d_num_rows)
5871 .map_err(|e| XlogError::Kernel(format!("Failed to clone row count: {}", e)))?;
5872
5873 let cloned = CudaBuffer::from_columns(
5874 result_columns,
5875 buffer.row_cap,
5876 d_num_rows,
5877 buffer.schema().clone(),
5878 );
5879 if let Some(cached) = buffer.cached_row_count() {
5882 cloned.set_cached_row_count_if_unset(cached);
5883 }
5884 Ok(cloned)
5885 }
5886 pub fn extract_column(&self, buffer: &CudaBuffer, col_idx: usize) -> Result<CudaBuffer> {
5897 if buffer.is_empty() {
5898 let col_type = buffer
5899 .schema()
5900 .column_type(col_idx)
5901 .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
5902 let schema = Schema::new(vec![("col".to_string(), col_type)]);
5903 return self.create_empty_buffer(schema);
5904 }
5905
5906 let col_type = buffer
5907 .schema()
5908 .column_type(col_idx)
5909 .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
5910 let src_col = buffer
5911 .column(col_idx)
5912 .ok_or_else(|| XlogError::Kernel(format!("Column {} not found in buffer", col_idx)))?;
5913 let mut dst_col = self.memory.alloc::<u8>(src_col.len())?;
5914 let device = self.device.inner();
5915 if !src_col.is_empty() {
5916 device
5917 .dtod_copy(src_col, &mut dst_col)
5918 .map_err(|e| XlogError::Kernel(format!("Failed to copy column: {}", e)))?;
5919 }
5920
5921 let mut d_num_rows = self.memory.alloc::<u32>(1)?;
5922 device
5923 .dtod_copy(buffer.num_rows_device(), &mut d_num_rows)
5924 .map_err(|e| XlogError::Kernel(format!("Failed to copy row count: {}", e)))?;
5925 self.device.synchronize()?;
5926
5927 let schema = Schema::new(vec![("col".to_string(), col_type)]);
5928 Ok(CudaBuffer::from_columns(
5929 vec![dst_col.into()],
5930 buffer.row_cap,
5931 d_num_rows,
5932 schema,
5933 ))
5934 }
5935
5936 pub fn extract_active_rule_indices(
5939 &self,
5940 mask_hard: &CudaBuffer,
5941 mask_soft: &CudaBuffer,
5942 n: usize,
5943 max_active: usize,
5944 ) -> Result<Vec<(u32, u32, u32)>> {
5945 let total = n * n * n;
5946 let block_size = 256usize;
5947 let grid_size = total.div_ceil(block_size);
5948
5949 let mut out_i = self.memory().alloc::<u32>(total)?;
5950 let mut out_j = self.memory().alloc::<u32>(total)?;
5951 let mut out_k = self.memory().alloc::<u32>(total)?;
5952 let mut out_p = self.memory().alloc::<f32>(total)?;
5953 let mut count = self.memory().alloc::<u32>(1)?;
5954
5955 self.htod_launch_metadata_sync_copy_into(&[0u32], &mut count)
5956 .map_err(|e| XlogError::Kernel(format!("ILP htod count: {}", e)))?;
5957
5958 let hard_col = mask_hard
5959 .column(0)
5960 .ok_or_else(|| XlogError::Kernel("ILP hard mask has no column".into()))?;
5961 let soft_col = mask_soft
5962 .column(0)
5963 .ok_or_else(|| XlogError::Kernel("ILP soft mask has no column".into()))?;
5964
5965 let kernel = self
5966 .device()
5967 .inner()
5968 .get_func(ILP_MODULE, ilp_kernels::EXTRACT_NONZERO_INDICES)
5969 .ok_or_else(|| XlogError::Kernel("extract_nonzero_indices kernel not found".into()))?;
5970
5971 let hard_bytes = total * std::mem::size_of::<f32>();
5972 let soft_bytes = total * std::mem::size_of::<f32>();
5973 let hard_view = self.column_bytes_view(hard_col, hard_bytes)?;
5974 let soft_view = self.column_bytes_view(soft_col, soft_bytes)?;
5975
5976 unsafe {
5978 kernel
5979 .clone()
5980 .launch(
5981 cudarc::driver::LaunchConfig {
5982 grid_dim: (grid_size as u32, 1, 1),
5983 block_dim: (block_size as u32, 1, 1),
5984 shared_mem_bytes: 0,
5985 },
5986 (
5987 &hard_view, &soft_view, n as u32, &mut out_i, &mut out_j, &mut out_k,
5988 &mut out_p, &mut count,
5989 ),
5990 )
5991 .map_err(|e| {
5992 XlogError::Kernel(format!("Failed to launch extract_nonzero_indices: {}", e))
5993 })?;
5994 }
5995
5996 let mut count_host = [0u32];
5997 self.device()
5998 .inner()
5999 .dtoh_sync_copy_into(&count, &mut count_host)
6000 .map_err(|e| XlogError::Kernel(format!("ILP dtoh count: {}", e)))?;
6001 let active_count = count_host[0] as usize;
6002
6003 if active_count == 0 {
6004 return Ok(Vec::new());
6005 }
6006
6007 let mut i_host = vec![0u32; active_count];
6008 let mut j_host = vec![0u32; active_count];
6009 let mut k_host = vec![0u32; active_count];
6010 let mut p_host = vec![0f32; active_count];
6011
6012 let out_i_view = out_i
6013 .try_slice(0..active_count)
6014 .ok_or_else(|| XlogError::Kernel("ILP slice i out of bounds".into()))?;
6015 let out_j_view = out_j
6016 .try_slice(0..active_count)
6017 .ok_or_else(|| XlogError::Kernel("ILP slice j out of bounds".into()))?;
6018 let out_k_view = out_k
6019 .try_slice(0..active_count)
6020 .ok_or_else(|| XlogError::Kernel("ILP slice k out of bounds".into()))?;
6021 let out_p_view = out_p
6022 .try_slice(0..active_count)
6023 .ok_or_else(|| XlogError::Kernel("ILP slice p out of bounds".into()))?;
6024
6025 self.device()
6026 .inner()
6027 .dtoh_sync_copy_into(&out_i_view, &mut i_host)
6028 .map_err(|e| XlogError::Kernel(format!("ILP dtoh i: {}", e)))?;
6029 self.device()
6030 .inner()
6031 .dtoh_sync_copy_into(&out_j_view, &mut j_host)
6032 .map_err(|e| XlogError::Kernel(format!("ILP dtoh j: {}", e)))?;
6033 self.device()
6034 .inner()
6035 .dtoh_sync_copy_into(&out_k_view, &mut k_host)
6036 .map_err(|e| XlogError::Kernel(format!("ILP dtoh k: {}", e)))?;
6037 self.device()
6038 .inner()
6039 .dtoh_sync_copy_into(&out_p_view, &mut p_host)
6040 .map_err(|e| XlogError::Kernel(format!("ILP dtoh p: {}", e)))?;
6041
6042 let mut indices: Vec<(f32, u32, u32, u32)> = (0..active_count)
6043 .map(|idx| (p_host[idx], i_host[idx], j_host[idx], k_host[idx]))
6044 .collect();
6045 indices.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
6046 indices.truncate(max_active);
6047
6048 Ok(indices.into_iter().map(|(_, i, j, k)| (i, j, k)).collect())
6049 }
6050
    /// Radix-sorts `(key, index)` pairs of `u32` by key, enqueuing every kernel on
    /// `cu_stream`.
    ///
    /// The sort runs eight passes, each keyed on one 4-bit digit (`shift = pass * 4`),
    /// and ping-pongs between the `*_a` and `*_b` buffers: pass 0 reads `keys_a` /
    /// `indices_a` and writes `keys_b` / `indices_b`, pass 1 goes the other way, and so
    /// on. Because the pass count is even, the final output is back in `keys_a` /
    /// `indices_a`.
    ///
    /// Each pass launches, in order: the histogram kernel, the digit prefix-sum kernel,
    /// one in-place scan per digit over that digit's `grid_size`-element region of
    /// `hist`, the rank kernel, and the scatter kernel.
    ///
    /// # Arguments
    /// * `keys_a` / `indices_a` - input pairs; also hold the sorted result on success.
    /// * `keys_b` / `indices_b` - ping-pong scratch for the alternate passes.
    /// * `hist` - histogram scratch; it is sliced as 16 consecutive regions of
    ///   `grid_size` elements, so it must hold at least `16 * grid_size` elements.
    /// * `prefix` - output of the digit prefix-sum kernel, read by the scatter kernel.
    ///   NOTE(review): presumably 16 entries, one per digit - confirm against the kernel.
    /// * `ranks` - scratch written by the rank kernel and read by the scatter kernel.
    /// * `num_rows_device` - device-resident value forwarded to the kernels next to
    ///   `row_cap`. NOTE(review): presumably the live row count - confirm.
    /// * `row_cap` - host-side row bound used to size the launch grid.
    /// * `launch_stream` / `runtime` - only forwarded to the per-digit scan helper.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when a sort kernel cannot be found, a launch fails,
    /// the scan helper fails, or the buffer parity check after the loop fails.
    #[allow(clippy::too_many_arguments)]
    fn radix_sort_u32_pairs_with_scratch_on_stream(
        &self,
        keys_a: &mut TrackedCudaSlice<u32>,
        keys_b: &mut TrackedCudaSlice<u32>,
        indices_a: &mut TrackedCudaSlice<u32>,
        indices_b: &mut TrackedCudaSlice<u32>,
        hist: &mut TrackedCudaSlice<u32>,
        prefix: &mut TrackedCudaSlice<u32>,
        ranks: &mut TrackedCudaSlice<u32>,
        num_rows_device: &TrackedCudaSlice<u32>,
        row_cap: u32,
        cu_stream: &cudarc::driver::CudaStream,
        launch_stream: StreamId,
        runtime: &crate::device_runtime::XlogDeviceRuntime,
    ) -> Result<()> {
        // Nothing to sort; this also avoids computing a zero-sized launch grid below.
        if row_cap == 0 {
            return Ok(());
        }
        let device = self.device.inner();
        let block_size = Self::SORT_BLOCK_SIZE;
        let grid_size = row_cap.div_ceil(block_size);
        // One thread per row slot; shared by the histogram, rank and scatter kernels.
        let sort_config = LaunchConfig {
            grid_dim: (grid_size, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // Resolve all four kernels up front so a missing kernel fails before any launch.
        let histogram_fn = device
            .get_func(SORT_MODULE, sort_kernels::RADIX_HISTOGRAM)
            .ok_or_else(|| XlogError::Kernel("radix_histogram kernel not found".to_string()))?;
        let prefix_fn = device
            .get_func(SORT_MODULE, sort_kernels::COMPUTE_DIGIT_PREFIX_SUMS)
            .ok_or_else(|| {
                XlogError::Kernel("compute_digit_prefix_sums kernel not found".to_string())
            })?;
        let ranks_fn = device
            .get_func(SORT_MODULE, sort_kernels::COMPUTE_RANKS)
            .ok_or_else(|| XlogError::Kernel("compute_ranks kernel not found".to_string()))?;
        let scatter_fn = device
            .get_func(SORT_MODULE, sort_kernels::RADIX_SCATTER_STABLE)
            .ok_or_else(|| {
                XlogError::Kernel("radix_scatter_stable kernel not found".to_string())
            })?;
        // The digit prefix-sum kernel runs as a single 256-thread block.
        let prefix_config = LaunchConfig {
            grid_dim: (1, 1, 1),
            block_dim: (256, 1, 1),
            shared_mem_bytes: 0,
        };

        // `in_a` tracks which buffer pair currently holds the input of the next pass.
        let mut in_a = true;
        for pass in 0..8u32 {
            // 8 passes x 4 bits cover the full 32-bit key, least significant digit first.
            let shift = pass * 4;
            // Reborrow so the input side is shared and the output side is exclusive.
            let (keys_in, indices_in, keys_out, indices_out) = if in_a {
                (&*keys_a, &*indices_a, &mut *keys_b, &mut *indices_b)
            } else {
                (&*keys_b, &*indices_b, &mut *keys_a, &mut *indices_a)
            };

            // SAFETY: NOTE(review): relies on the argument tuple matching the
            // radix_histogram kernel signature - confirm against the kernel source.
            unsafe {
                histogram_fn.clone().launch_on_stream(
                    cu_stream,
                    sort_config,
                    (keys_in, num_rows_device, row_cap, &mut *hist, shift),
                )
            }
            .map_err(|e| XlogError::Kernel(format!("radix_histogram (on_stream) failed: {}", e)))?;

            // SAFETY: NOTE(review): relies on the argument tuple matching the
            // compute_digit_prefix_sums kernel signature - confirm.
            unsafe {
                prefix_fn.clone().launch_on_stream(
                    cu_stream,
                    prefix_config,
                    (&*hist, grid_size, &mut *prefix),
                )
            }
            .map_err(|e| {
                XlogError::Kernel(format!(
                    "compute_digit_prefix_sums (on_stream) failed: {}",
                    e
                ))
            })?;

            // Scan each digit's `grid_size`-element histogram region in place.
            for digit in 0..16u32 {
                let start = (digit * grid_size) as usize;
                let end = start + (grid_size as usize);
                let mut digit_slice = hist.slice_mut(start..end);
                self.multiblock_scan_u32_view_inplace_on_stream(
                    &mut digit_slice,
                    grid_size,
                    cu_stream,
                    launch_stream,
                    runtime,
                )?;
            }

            // SAFETY: NOTE(review): relies on the argument tuple matching the
            // compute_ranks kernel signature - confirm.
            unsafe {
                ranks_fn.clone().launch_on_stream(
                    cu_stream,
                    sort_config,
                    (keys_in, num_rows_device, row_cap, &mut *ranks, shift),
                )
            }
            .map_err(|e| XlogError::Kernel(format!("compute_ranks (on_stream) failed: {}", e)))?;

            // SAFETY: NOTE(review): relies on the argument tuple matching the
            // radix_scatter_stable kernel signature and on the input and output
            // buffers being distinct allocations - confirm.
            unsafe {
                scatter_fn.clone().launch_on_stream(
                    cu_stream,
                    sort_config,
                    (
                        keys_in,
                        indices_in,
                        &*ranks,
                        keys_out,
                        indices_out,
                        &*prefix,
                        &*hist,
                        num_rows_device,
                        row_cap,
                        shift,
                    ),
                )
            }
            .map_err(|e| {
                XlogError::Kernel(format!("radix_scatter_stable (on_stream) failed: {}", e))
            })?;

            // The buffers just written become the input of the next pass.
            in_a = !in_a;
        }

        // After an even number of passes the result must be back in the `*_a` buffers;
        // callers read the sorted output from there.
        if !in_a {
            return Err(XlogError::Kernel(
                "Unexpected radix-sort buffer parity (expected even number of passes)".to_string(),
            ));
        }
        Ok(())
    }
6219
6220 fn apply_permutation_gpu_on_stream(
6226 &self,
6227 input: &CudaBuffer,
6228 permutation: &TrackedCudaSlice<u32>,
6229 dst_cols: &mut [TrackedCudaSlice<u8>],
6230 cu_stream: &cudarc::driver::CudaStream,
6231 ) -> Result<()> {
6232 let row_cap = input.num_rows() as u32;
6233 let d_num_rows = input.num_rows_device();
6234 let device = self.device.inner();
6235
6236 let grid_size = row_cap.div_ceil(Self::SORT_BLOCK_SIZE);
6237 let launch_config = LaunchConfig {
6238 grid_dim: (grid_size, 1, 1),
6239 block_dim: (Self::SORT_BLOCK_SIZE, 1, 1),
6240 shared_mem_bytes: 0,
6241 };
6242
6243 let apply_perm_fn = device
6244 .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
6245 .ok_or_else(|| {
6246 XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
6247 })?;
6248
6249 if dst_cols.len() != input.columns.len() {
6250 return Err(XlogError::Kernel(format!(
6251 "apply_permutation_gpu_on_stream: dst_cols.len()={} mismatches input.cols={}",
6252 dst_cols.len(),
6253 input.columns.len()
6254 )));
6255 }
6256
6257 for (col_idx, dst_col) in dst_cols.iter_mut().enumerate() {
6258 let src_col = input
6259 .column(col_idx)
6260 .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
6261 let elem_size = input
6262 .schema
6263 .column_type(col_idx)
6264 .ok_or_else(|| {
6265 XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
6266 })?
6267 .size_bytes() as u32;
6268 let output_bytes = (row_cap as usize) * (elem_size as usize);
6269 if src_col.num_bytes() != output_bytes {
6270 return Err(XlogError::Kernel(format!(
6271 "Column {} has {} bytes but expected {} (num_rows={}, elem_size={})",
6272 col_idx,
6273 src_col.num_bytes(),
6274 output_bytes,
6275 row_cap,
6276 elem_size
6277 )));
6278 }
6279 unsafe {
6282 apply_perm_fn.clone().launch_on_stream(
6283 cu_stream,
6284 launch_config,
6285 (
6286 src_col,
6287 &mut *dst_col,
6288 permutation,
6289 d_num_rows,
6290 row_cap,
6291 elem_size,
6292 ),
6293 )
6294 }
6295 .map_err(|e| {
6296 XlogError::Kernel(format!("apply_permutation_bytes (on_stream) failed: {}", e))
6297 })?;
6298 }
6299 Ok(())
6300 }
6301
    /// Sorts `input` by `key_cols` on the GPU, enqueuing all work on `launch_stream`
    /// and declaring the buffers it touches through a strict `LaunchRecorder`.
    ///
    /// The sort builds a row permutation: indices are initialised, then for each key
    /// column (processed from the last entry of `key_cols` to the first) the keys are
    /// gathered through the current permutation and radix-sorted together with it.
    /// `U64` keys are handled as two 32-bit sorts, low word first, then high word.
    /// Finally every column is gathered through the permutation into fresh buffers.
    /// NOTE(review): `key_cols[0]` being the most significant key relies on the radix
    /// scatter kernel being stable - confirm against the kernel.
    ///
    /// # Arguments
    /// * `input` - buffer to sort; it is not modified.
    /// * `key_cols` - column indices to sort by; must be non-empty, in bounds, and of
    ///   type `U32`, `Symbol` or `U64`.
    /// * `launch_stream` - id of the stream every kernel and the async copy run on.
    ///
    /// # Returns
    /// A new buffer with the same schema and host-side row count as `input`. Its
    /// device-side row count is an asynchronous copy of `input`'s. An empty input
    /// returns an empty buffer before the key columns are validated.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when the memory manager has no runtime, the stream
    /// id does not resolve, `key_cols` is empty or invalid, the input exceeds
    /// `u32::MAX` rows, an allocation fails, a kernel is missing, or any launch,
    /// copy, preflight or commit step fails.
    pub fn sort_recorded(
        &self,
        input: &CudaBuffer,
        key_cols: &[usize],
        launch_stream: StreamId,
    ) -> Result<CudaBuffer> {
        let runtime = self.memory.runtime().ok_or_else(|| {
            XlogError::Kernel(
                "sort_recorded requires a runtime-backed GpuMemoryManager (with_runtime)"
                    .to_string(),
            )
        })?;
        let cu_stream = runtime
            .stream_pool()
            .resolve(launch_stream)
            .ok_or_else(|| {
                XlogError::Kernel(format!(
                    "sort_recorded: launch_stream StreamId({}) does not resolve",
                    launch_stream.0
                ))
            })?;

        // Empty input short-circuits before key validation.
        if input.num_rows() == 0 {
            return self.create_empty_buffer(input.schema.clone());
        }
        if key_cols.is_empty() {
            return Err(XlogError::Kernel(
                "Sort requires at least one key column".to_string(),
            ));
        }
        // Row indices are u32 on the device, so larger inputs cannot be addressed.
        if input.num_rows() > u32::MAX as u64 {
            return Err(XlogError::Kernel(format!(
                "Sort supports at most {} rows, got {}",
                u32::MAX,
                input.num_rows()
            )));
        }
        // Validate every key column up front so nothing is allocated or launched for
        // an invalid request.
        for &k in key_cols {
            if k >= input.arity() {
                return Err(XlogError::Kernel(format!(
                    "Key column index {} out of bounds (arity {})",
                    k,
                    input.arity()
                )));
            }
            let ty = input.schema.column_type(k).ok_or_else(|| {
                XlogError::Kernel(format!("Key column {} type not found in schema", k))
            })?;
            if !matches!(ty, ScalarType::U32 | ScalarType::Symbol | ScalarType::U64) {
                return Err(XlogError::Kernel(format!(
                    "sort_recorded supports only U32 / Symbol / U64 key columns; \
                     got {:?} for column {}",
                    ty, k
                )));
            }
        }

        // Safe narrowing: the row count was bounded by u32::MAX above.
        let n = input.num_rows() as u32;
        let block_size = Self::SORT_BLOCK_SIZE;
        let grid_size = n.div_ceil(block_size);
        let device = self.device.inner();
        let launch_config = LaunchConfig {
            grid_dim: (grid_size, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // Ping-pong key/index buffers plus radix-sort scratch; all are allocated
        // before the recorder is built.
        let mut indices_a = self.memory.alloc::<u32>(n as usize)?;
        let mut indices_b = self.memory.alloc::<u32>(n as usize)?;
        let mut keys_a = self.memory.alloc::<u32>(n as usize)?;
        let mut keys_b = self.memory.alloc::<u32>(n as usize)?;
        // 16 digit regions of `grid_size` entries each, as the radix sort slices it.
        let mut d_hist = self.memory.alloc::<u32>((grid_size as usize) * 16)?;
        let mut d_prefix = self.memory.alloc::<u32>(16)?;
        let mut d_ranks = self.memory.alloc::<u32>(n as usize)?;
        let output_d_num_rows = self.memory.alloc::<u32>(1)?;

        // One destination byte buffer per input column, sized for `n` rows.
        let mut dst_cols: Vec<TrackedCudaSlice<u8>> = Vec::with_capacity(input.columns.len());
        for col_idx in 0..input.columns.len() {
            let elem_size = input
                .schema
                .column_type(col_idx)
                .ok_or_else(|| {
                    XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
                })?
                .size_bytes();
            dst_cols.push(self.memory.alloc::<u8>((n as usize) * elem_size)?);
        }

        // Declare the inputs as reads and every scratch and output buffer as writes,
        // then preflight before the first launch.
        let mut rec = LaunchRecorder::new_strict(launch_stream);
        rec.read(input.num_rows_device());
        for col_idx in 0..input.columns.len() {
            let c = input
                .column(col_idx)
                .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
            rec.read_column(c);
        }
        rec.write(&indices_a);
        rec.write(&indices_b);
        rec.write(&keys_a);
        rec.write(&keys_b);
        rec.write(&d_hist);
        rec.write(&d_prefix);
        rec.write(&d_ranks);
        rec.write(&output_d_num_rows);
        for dst_col in &dst_cols {
            rec.write(dst_col);
        }
        rec.preflight(runtime)
            .map_err(|e| XlogError::Kernel(format!("sort_recorded: preflight failed: {}", e)))?;

        // Initialise the permutation buffer before any key is gathered through it.
        let init_fn = device
            .get_func(SORT_MODULE, sort_kernels::INIT_INDICES)
            .ok_or_else(|| XlogError::Kernel("init_indices kernel not found".to_string()))?;
        // SAFETY: NOTE(review): relies on the argument tuple matching the
        // init_indices kernel signature - confirm against the kernel source.
        unsafe {
            init_fn.clone().launch_on_stream(
                &cu_stream,
                launch_config,
                (&mut indices_a, input.num_rows_device(), n),
            )
        }
        .map_err(|e| XlogError::Kernel(format!("init_indices (on_stream) failed: {}", e)))?;

        // Process key columns from last to first; each radix sort leaves its result
        // in `keys_a` / `indices_a`, which feeds the next gather.
        for &col_idx in key_cols.iter().rev() {
            let col = input
                .column(col_idx)
                .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", col_idx)))?;
            let ty = input.schema.column_type(col_idx).ok_or_else(|| {
                XlogError::Kernel(format!("Key column {} type not found in schema", col_idx))
            })?;
            match ty {
                ScalarType::U32 | ScalarType::Symbol => {
                    // Gather this column's keys through the current permutation.
                    let col_view = self.column_as_u32_view(col, n as usize)?;
                    let gather_fn = device
                        .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_U32)
                        .ok_or_else(|| {
                            XlogError::Kernel("apply_permutation_u32 kernel not found".to_string())
                        })?;
                    // SAFETY: NOTE(review): relies on the argument tuple matching the
                    // apply_permutation_u32 kernel signature - confirm.
                    unsafe {
                        gather_fn.clone().launch_on_stream(
                            &cu_stream,
                            launch_config,
                            (
                                &col_view,
                                &mut keys_a,
                                &indices_a,
                                input.num_rows_device(),
                                n,
                            ),
                        )
                    }
                    .map_err(|e| {
                        XlogError::Kernel(format!(
                            "apply_permutation_u32 (on_stream) failed: {}",
                            e
                        ))
                    })?;

                    self.radix_sort_u32_pairs_with_scratch_on_stream(
                        &mut keys_a,
                        &mut keys_b,
                        &mut indices_a,
                        &mut indices_b,
                        &mut d_hist,
                        &mut d_prefix,
                        &mut d_ranks,
                        input.num_rows_device(),
                        n,
                        &cu_stream,
                        launch_stream,
                        runtime,
                    )?;
                }
                ScalarType::U64 => {
                    let col_view = self.column_as_u64_view(col, n as usize)?;
                    // Sort by the low 32-bit word first, then by the high word.
                    for &word in &[
                        sort_kernels::GATHER_KEYS_U64_LO_U32,
                        sort_kernels::GATHER_KEYS_U64_HI_U32,
                    ] {
                        let gather_fn = device.get_func(SORT_MODULE, word).ok_or_else(|| {
                            XlogError::Kernel(format!("{} kernel not found", word))
                        })?;
                        // SAFETY: NOTE(review): relies on the argument tuple matching
                        // the gather_keys_u64_* kernel signatures, whose argument
                        // order differs from the u32 gather above - confirm.
                        unsafe {
                            gather_fn.clone().launch_on_stream(
                                &cu_stream,
                                launch_config,
                                (
                                    &col_view,
                                    &indices_a,
                                    input.num_rows_device(),
                                    n,
                                    &mut keys_a,
                                ),
                            )
                        }
                        .map_err(|e| {
                            XlogError::Kernel(format!("{} (on_stream) failed: {}", word, e))
                        })?;

                        self.radix_sort_u32_pairs_with_scratch_on_stream(
                            &mut keys_a,
                            &mut keys_b,
                            &mut indices_a,
                            &mut indices_b,
                            &mut d_hist,
                            &mut d_prefix,
                            &mut d_ranks,
                            input.num_rows_device(),
                            n,
                            &cu_stream,
                            launch_stream,
                            runtime,
                        )?;
                    }
                }
                // Defensive: the validation loop above already rejected other types.
                other => {
                    return Err(XlogError::Kernel(format!(
                        "sort_recorded: column {} unexpected type {:?} after guard",
                        col_idx, other
                    )));
                }
            }
        }

        // Materialise the sorted rows: gather every column through the permutation.
        self.apply_permutation_gpu_on_stream(input, &indices_a, &mut dst_cols, &cu_stream)?;

        // Copy the device-side row count from the input to the output on the same
        // stream, without a host round trip.
        // SAFETY: NOTE(review): both pointers come from live single-`u32`
        // allocations and the copy size is `size_of::<u32>()` - confirm the input's
        // row-count buffer is at least that large.
        unsafe {
            let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
                *output_d_num_rows.device_ptr(),
                *input.num_rows_device().device_ptr(),
                std::mem::size_of::<u32>(),
                cu_stream.cu_stream(),
            );
            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
                return Err(XlogError::Kernel(format!(
                    "sort_recorded: cuMemcpyDtoDAsync (output_d_num_rows) failed: {:?}",
                    res
                )));
            }
        }

        // Commit the recorded accesses once all work has been enqueued.
        rec.commit(runtime)
            .map_err(|e| XlogError::Kernel(format!("sort_recorded: commit failed: {}", e)))?;

        let new_columns: Vec<CudaColumn> = dst_cols.into_iter().map(|s| s.into()).collect();
        Ok(CudaBuffer::from_columns(
            new_columns,
            input.num_rows(),
            output_d_num_rows,
            input.schema.clone(),
        ))
    }
6599
6600 pub fn dedup_full_row_recorded(
6611 &self,
6612 input: &CudaBuffer,
6613 launch_stream: StreamId,
6614 ) -> Result<CudaBuffer> {
6615 let runtime = self.memory.runtime().ok_or_else(|| {
6616 XlogError::Kernel(
6617 "dedup_full_row_recorded requires a runtime-backed GpuMemoryManager".to_string(),
6618 )
6619 })?;
6620 let cu_stream = runtime
6621 .stream_pool()
6622 .resolve(launch_stream)
6623 .ok_or_else(|| {
6624 XlogError::Kernel(format!(
6625 "dedup_full_row_recorded: launch_stream StreamId({}) does not resolve",
6626 launch_stream.0
6627 ))
6628 })?;
6629
6630 let row_count = input.num_rows() as usize;
6631 if row_count == 0 {
6632 return self.create_empty_buffer(input.schema().clone());
6633 }
6634 if row_count == 1 {
6635 return self.clone_buffer(input);
6636 }
6637 if row_count > u32::MAX as usize {
6638 return Err(XlogError::Kernel(format!(
6639 "dedup_full_row_recorded supports at most {} rows, got {}",
6640 u32::MAX,
6641 row_count
6642 )));
6643 }
6644 let arity = input.arity();
6645 if arity == 0 {
6646 return self.buffer_from_columns(Vec::new(), 1, input.schema().clone());
6647 }
6648 for col_idx in 0..arity {
6649 let ty = input.schema.column_type(col_idx).ok_or_else(|| {
6650 XlogError::Kernel(format!("Column {} type not found in schema", col_idx))
6651 })?;
6652 if !matches!(ty, ScalarType::U32 | ScalarType::Symbol | ScalarType::U64) {
6653 return Err(XlogError::Kernel(format!(
6654 "dedup_full_row_recorded supports only U32 / Symbol / U64 columns; \
6655 got {:?} for column {}",
6656 ty, col_idx
6657 )));
6658 }
6659 }
6660
6661 let all_cols: Vec<usize> = (0..arity).collect();
6663 let sorted = self.sort_recorded(input, &all_cols, launch_stream)?;
6664 let n = sorted.num_rows() as u32;
6665 if n <= 1 {
6666 return Ok(sorted);
6667 }
6668
6669 let device = self.device.inner();
6675 let mut col_ptrs_host: Vec<u64> = Vec::with_capacity(arity);
6676 let mut col_sizes_host: Vec<u32> = Vec::with_capacity(arity);
6677 for col_idx in 0..arity {
6678 let c = sorted
6679 .column(col_idx)
6680 .ok_or_else(|| XlogError::Kernel(format!("Sorted column {} not found", col_idx)))?;
6681 let ty = sorted.schema().column_type(col_idx).ok_or_else(|| {
6682 XlogError::Kernel(format!("Sorted column {} type missing", col_idx))
6683 })?;
6684 col_ptrs_host.push(*c.device_ptr());
6685 col_sizes_host.push(ty.size_bytes() as u32);
6686 }
6687 let mut d_col_ptrs = self.memory.alloc::<u64>(arity)?;
6688 let mut d_col_sizes = self.memory.alloc::<u32>(arity)?;
6689 self.htod_launch_metadata_sync_copy_into(&col_ptrs_host, &mut d_col_ptrs)
6690 .map_err(|e| {
6691 XlogError::Kernel(format!("dedup_full_row_recorded col ptr upload: {}", e))
6692 })?;
6693 self.htod_launch_metadata_sync_copy_into(&col_sizes_host, &mut d_col_sizes)
6694 .map_err(|e| {
6695 XlogError::Kernel(format!("dedup_full_row_recorded col size upload: {}", e))
6696 })?;
6697 let d_unique_mask = self.memory.alloc::<u8>(n as usize)?;
6698
6699 let mut rec = LaunchRecorder::new_strict(launch_stream);
6700 for col_idx in 0..arity {
6701 let c = sorted
6702 .column(col_idx)
6703 .ok_or_else(|| XlogError::Kernel(format!("Sorted column {} not found", col_idx)))?;
6704 rec.read_column(c);
6705 }
6706 rec.read(sorted.num_rows_device());
6707 rec.write(&d_col_ptrs);
6708 rec.write(&d_col_sizes);
6709 rec.write(&d_unique_mask);
6710 rec.preflight(runtime).map_err(|e| {
6711 XlogError::Kernel(format!(
6712 "dedup_full_row_recorded: mark_unique preflight failed: {}",
6713 e
6714 ))
6715 })?;
6716
6717 let block_size = 256u32;
6718 let grid = n.div_ceil(block_size);
6719 let cfg = LaunchConfig {
6720 grid_dim: (grid, 1, 1),
6721 block_dim: (block_size, 1, 1),
6722 shared_mem_bytes: 0,
6723 };
6724 let mark_fn = device
6725 .get_func(DEDUP_MODULE, dedup_kernels::MARK_UNIQUE_FULL_ROW_BYTEWISE)
6726 .ok_or_else(|| {
6727 XlogError::Kernel("mark_unique_full_row_bytewise kernel not found".to_string())
6728 })?;
6729 unsafe {
6732 mark_fn.clone().launch_on_stream(
6733 &cu_stream,
6734 cfg,
6735 (
6736 &d_col_ptrs,
6737 &d_col_sizes,
6738 arity as u32,
6739 sorted.num_rows_device(),
6740 n,
6741 &d_unique_mask,
6742 ),
6743 )
6744 }
6745 .map_err(|e| {
6746 XlogError::Kernel(format!(
6747 "mark_unique_full_row_bytewise (on_stream) failed: {}",
6748 e
6749 ))
6750 })?;
6751
6752 rec.commit(runtime).map_err(|e| {
6753 XlogError::Kernel(format!(
6754 "dedup_full_row_recorded: mark_unique commit failed: {}",
6755 e
6756 ))
6757 })?;
6758
6759 self.compact_buffer_by_device_mask_counted_recorded(&sorted, &d_unique_mask, launch_stream)
6761 }
6762
6763 fn build_hash_table_v2_on_stream(
6797 &self,
6798 hashes: &TrackedCudaSlice<u64>,
6799 num_rows: u32,
6800 cu_stream: &cudarc::driver::CudaStream,
6801 launch_stream: StreamId,
6802 runtime: &crate::device_runtime::XlogDeviceRuntime,
6803 ) -> Result<crate::provider::JoinHashTableV2> {
6804 let device = self.device.inner();
6805
6806 let target = (num_rows as u64).saturating_mul(2).max(1024);
6807 let num_buckets_u64 = target.next_power_of_two();
6808 let num_buckets = u32::try_from(num_buckets_u64).map_err(|_| {
6809 XlogError::Kernel(format!(
6810 "Join hash table too large: num_buckets={}",
6811 num_buckets_u64
6812 ))
6813 })?;
6814 let bucket_mask = num_buckets
6815 .checked_sub(1)
6816 .ok_or_else(|| XlogError::Kernel("Join hash table size underflow".to_string()))?;
6817
6818 let bucket_counts = self.memory.alloc::<u32>(num_buckets as usize)?;
6819 runtime
6826 .prepare_first_use(&bucket_counts, launch_stream, Access::Write)
6827 .map_err(|e| {
6828 XlogError::Kernel(format!(
6829 "build_hash_table_v2_on_stream: prepare bucket_counts failed: {}",
6830 e
6831 ))
6832 })?;
6833 if num_buckets > 0 {
6835 unsafe {
6839 let res = cudarc::driver::sys::cuMemsetD8Async(
6840 *bucket_counts.device_ptr(),
6841 0,
6842 (num_buckets as usize) * std::mem::size_of::<u32>(),
6843 cu_stream.cu_stream(),
6844 );
6845 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
6846 return Err(XlogError::Kernel(format!(
6847 "cuMemsetD8Async (bucket_counts) failed: {:?}",
6848 res
6849 )));
6850 }
6851 }
6852 }
6853
6854 let block_size = 256u32;
6855 let grid_size = num_rows.div_ceil(block_size);
6856 let cfg = LaunchConfig {
6857 grid_dim: (grid_size, 1, 1),
6858 block_dim: (block_size, 1, 1),
6859 shared_mem_bytes: 0,
6860 };
6861
6862 let count_fn = device
6863 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_BUCKET_COUNT_V2)
6864 .ok_or_else(|| {
6865 XlogError::Kernel("hash_join_bucket_count_v2 kernel not found".to_string())
6866 })?;
6867 unsafe {
6869 count_fn.clone().launch_on_stream(
6870 cu_stream,
6871 cfg,
6872 (hashes, num_rows, &bucket_counts, bucket_mask),
6873 )
6874 }
6875 .map_err(|e| {
6876 XlogError::Kernel(format!(
6877 "hash_join_bucket_count_v2 (on_stream) failed: {}",
6878 e
6879 ))
6880 })?;
6881
6882 let mut bucket_offsets = self.memory.alloc::<u32>(num_buckets as usize)?;
6883 runtime
6886 .prepare_first_use(&bucket_offsets, launch_stream, Access::Write)
6887 .map_err(|e| {
6888 XlogError::Kernel(format!(
6889 "build_hash_table_v2_on_stream: prepare bucket_offsets failed: {}",
6890 e
6891 ))
6892 })?;
6893 if num_buckets > 0 {
6894 unsafe {
6898 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
6899 *bucket_offsets.device_ptr(),
6900 *bucket_counts.device_ptr(),
6901 (num_buckets as usize) * std::mem::size_of::<u32>(),
6902 cu_stream.cu_stream(),
6903 );
6904 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
6905 return Err(XlogError::Kernel(format!(
6906 "cuMemcpyDtoDAsync (bucket_counts → bucket_offsets) failed: {:?}",
6907 res
6908 )));
6909 }
6910 }
6911 self.multiblock_scan_u32_inplace_on_stream(
6912 &mut bucket_offsets,
6913 num_buckets,
6914 cu_stream,
6915 launch_stream,
6916 runtime,
6917 )?;
6918 }
6919
6920 let bucket_cursors = self.memory.alloc::<u32>(num_buckets as usize)?;
6921 runtime
6924 .prepare_first_use(&bucket_cursors, launch_stream, Access::Write)
6925 .map_err(|e| {
6926 XlogError::Kernel(format!(
6927 "build_hash_table_v2_on_stream: prepare bucket_cursors failed: {}",
6928 e
6929 ))
6930 })?;
6931 if num_buckets > 0 {
6932 unsafe {
6935 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
6936 *bucket_cursors.device_ptr(),
6937 *bucket_offsets.device_ptr(),
6938 (num_buckets as usize) * std::mem::size_of::<u32>(),
6939 cu_stream.cu_stream(),
6940 );
6941 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
6942 return Err(XlogError::Kernel(format!(
6943 "cuMemcpyDtoDAsync (bucket_offsets → bucket_cursors) failed: {:?}",
6944 res
6945 )));
6946 }
6947 }
6948 }
6949
6950 let bucket_entries = self.memory.alloc::<u32>(num_rows as usize)?;
6951 let bucket_entry_hashes = self.memory.alloc::<u64>(num_rows as usize)?;
6952 runtime
6955 .prepare_first_use(&bucket_entries, launch_stream, Access::Write)
6956 .map_err(|e| {
6957 XlogError::Kernel(format!(
6958 "build_hash_table_v2_on_stream: prepare bucket_entries failed: {}",
6959 e
6960 ))
6961 })?;
6962 runtime
6963 .prepare_first_use(&bucket_entry_hashes, launch_stream, Access::Write)
6964 .map_err(|e| {
6965 XlogError::Kernel(format!(
6966 "build_hash_table_v2_on_stream: prepare bucket_entry_hashes failed: {}",
6967 e
6968 ))
6969 })?;
6970
6971 let scatter_fn = device
6972 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SCATTER_V2)
6973 .ok_or_else(|| {
6974 XlogError::Kernel("hash_join_scatter_v2 kernel not found".to_string())
6975 })?;
6976 unsafe {
6978 scatter_fn.clone().launch_on_stream(
6979 cu_stream,
6980 cfg,
6981 (
6982 hashes,
6983 num_rows,
6984 &bucket_cursors,
6985 bucket_mask,
6986 &bucket_entries,
6987 &bucket_entry_hashes,
6988 ),
6989 )
6990 }
6991 .map_err(|e| {
6992 XlogError::Kernel(format!("hash_join_scatter_v2 (on_stream) failed: {}", e))
6993 })?;
6994
6995 for blk in [
7003 bucket_counts.runtime_block(),
7004 bucket_offsets.runtime_block(),
7005 bucket_cursors.runtime_block(),
7006 bucket_entries.runtime_block(),
7007 bucket_entry_hashes.runtime_block(),
7008 ] {
7009 if let Some(b) = blk {
7010 runtime
7011 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
7012 .map_err(|e| {
7013 XlogError::Kernel(format!(
7014 "build_hash_table_v2_on_stream: finish_block_use failed: {}",
7015 e
7016 ))
7017 })?;
7018 } else {
7019 return Err(XlogError::Kernel(
7020 "build_hash_table_v2_on_stream: buffer has no runtime block — \
7021 caller must use a runtime-backed manager"
7022 .to_string(),
7023 ));
7024 }
7025 }
7026
7027 Ok(crate::provider::JoinHashTableV2 {
7028 bucket_counts,
7029 bucket_offsets,
7030 bucket_entries,
7031 bucket_entry_hashes,
7032 bucket_mask,
7033 })
7034 }
7035
7036 fn gather_buffer_by_indices_on_stream(
7044 &self,
7045 input: &CudaBuffer,
7046 indices: &TrackedCudaSlice<u32>,
7047 output_rows: u32,
7048 cu_stream: &cudarc::driver::CudaStream,
7049 launch_stream: StreamId,
7050 runtime: &crate::device_runtime::XlogDeviceRuntime,
7051 ) -> Result<CudaBuffer> {
7052 if output_rows == 0 {
7053 return self.create_empty_buffer(input.schema().clone());
7054 }
7055 if input.num_rows() > u32::MAX as u64 {
7056 return Err(XlogError::Kernel(format!(
7057 "GPU gather supports at most {} input rows, got {}",
7058 u32::MAX,
7059 input.num_rows()
7060 )));
7061 }
7062
7063 let d_output_rows = self.upload_device_row_count(output_rows)?;
7064 runtime
7071 .finish_first_use(&d_output_rows, StreamId::DEFAULT, Access::Write)
7072 .map_err(|e| {
7073 XlogError::Kernel(format!(
7074 "gather_buffer_by_indices_on_stream: record d_output_rows upload failed: {}",
7075 e
7076 ))
7077 })?;
7078 runtime
7079 .prepare_first_use(&d_output_rows, launch_stream, Access::Read)
7080 .map_err(|e| {
7081 XlogError::Kernel(format!(
7082 "gather_buffer_by_indices_on_stream: prepare d_output_rows failed: {}",
7083 e
7084 ))
7085 })?;
7086 let device = self.device.inner();
7087 let block_size = 256u32;
7088 let grid_size = output_rows.div_ceil(block_size);
7089 let launch_config = LaunchConfig {
7090 grid_dim: (grid_size, 1, 1),
7091 block_dim: (block_size, 1, 1),
7092 shared_mem_bytes: 0,
7093 };
7094
7095 let gather_fn = device
7096 .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
7097 .ok_or_else(|| {
7098 XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
7099 })?;
7100
7101 let mut dst_cols: Vec<TrackedCudaSlice<u8>> = Vec::with_capacity(input.columns.len());
7102 for col_idx in 0..input.columns.len() {
7103 let elem_size = input
7104 .schema
7105 .column_type(col_idx)
7106 .ok_or_else(|| {
7107 XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
7108 })?
7109 .size_bytes() as u32;
7110 let dst_bytes = (output_rows as usize) * (elem_size as usize);
7111 let dst = self.memory.alloc::<u8>(dst_bytes)?;
7112 runtime
7115 .prepare_first_use(&dst, launch_stream, Access::Write)
7116 .map_err(|e| {
7117 XlogError::Kernel(format!(
7118 "gather_buffer_by_indices_on_stream: prepare dst_col {} failed: {}",
7119 col_idx, e
7120 ))
7121 })?;
7122 dst_cols.push(dst);
7123 }
7124
7125 for (col_idx, dst_col) in dst_cols.iter_mut().enumerate() {
7126 let src_col = input
7127 .column(col_idx)
7128 .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
7129 let elem_size = input
7130 .schema
7131 .column_type(col_idx)
7132 .map(|t| t.size_bytes() as u32)
7133 .unwrap_or(4);
7134 unsafe {
7136 gather_fn.clone().launch_on_stream(
7137 cu_stream,
7138 launch_config,
7139 (
7140 src_col,
7141 &mut *dst_col,
7142 indices,
7143 &d_output_rows,
7144 output_rows,
7145 elem_size,
7146 ),
7147 )
7148 }
7149 .map_err(|e| {
7150 XlogError::Kernel(format!("apply_permutation_bytes (on_stream) failed: {}", e))
7151 })?;
7152 }
7153
7154 runtime
7155 .finish_first_use(&d_output_rows, launch_stream, Access::Read)
7156 .map_err(|e| {
7157 XlogError::Kernel(format!(
7158 "gather_buffer_by_indices_on_stream: record d_output_rows read failed: {}",
7159 e
7160 ))
7161 })?;
7162
7163 for dst_col in &dst_cols {
7168 if let Some(b) = dst_col.runtime_block() {
7169 runtime
7170 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
7171 .map_err(|e| {
7172 XlogError::Kernel(format!(
7173 "gather_buffer_by_indices_on_stream: finish_block_use \
7174 (dst_col) failed: {}",
7175 e
7176 ))
7177 })?;
7178 } else {
7179 return Err(XlogError::Kernel(
7180 "gather_buffer_by_indices_on_stream: dst_col has no runtime block".to_string(),
7181 ));
7182 }
7183 }
7184
7185 let new_columns: Vec<CudaColumn> = dst_cols.into_iter().map(|s| s.into()).collect();
7186 Ok(CudaBuffer::from_columns(
7187 new_columns,
7188 output_rows as u64,
7189 d_output_rows,
7190 input.schema.clone(),
7191 ))
7192 }
7193
7194 pub fn hash_join_inner_v2_recorded(
7201 &self,
7202 left: &CudaBuffer,
7203 right: &CudaBuffer,
7204 left_keys: &[usize],
7205 right_keys: &[usize],
7206 max_output: Option<usize>,
7207 launch_stream: StreamId,
7208 ) -> Result<CudaBuffer> {
7209 use crate::launch::LaunchRecorder;
7210
7211 let runtime = self.memory.runtime().ok_or_else(|| {
7212 XlogError::Kernel(
7213 "hash_join_inner_v2_recorded requires a runtime-backed GpuMemoryManager"
7214 .to_string(),
7215 )
7216 })?;
7217 let cu_stream = runtime
7218 .stream_pool()
7219 .resolve(launch_stream)
7220 .ok_or_else(|| {
7221 XlogError::Kernel(format!(
7222 "hash_join_inner_v2_recorded: launch_stream StreamId({}) does not resolve",
7223 launch_stream.0
7224 ))
7225 })?;
7226
7227 let num_left = self.device_row_count(left)?;
7228 let num_right = self.device_row_count(right)?;
7229 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
7230 return Err(XlogError::Kernel(format!(
7231 "Join supports at most {} rows per side (left={}, right={})",
7232 u32::MAX,
7233 num_left,
7234 num_right
7235 )));
7236 }
7237 if num_left == 0 || num_right == 0 {
7238 let combined_schema = self.combine_schemas(left.schema(), right.schema());
7239 return self.create_empty_buffer(combined_schema);
7240 }
7241 if left_keys.is_empty() || right_keys.is_empty() {
7242 return Err(XlogError::Kernel(
7243 "Join requires at least one key column".to_string(),
7244 ));
7245 }
7246 if left_keys.len() != right_keys.len() {
7247 return Err(XlogError::Kernel(
7248 "Left and right key columns must have same length".to_string(),
7249 ));
7250 }
7251 if left_keys.len() > 4 {
7252 return Err(XlogError::Kernel(
7253 "hash_join_inner_v2_recorded: max 4 key columns supported (pack_keys constraint)"
7254 .to_string(),
7255 ));
7256 }
7257 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
7258 let lt = left.schema().column_type(l);
7259 let rt = right.schema().column_type(r);
7260 if lt != rt {
7261 return Err(XlogError::Kernel(format!(
7262 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
7263 l, lt, r, rt
7264 )));
7265 }
7266 }
7267
7268 let num_left = num_left as u32;
7269 let num_right = num_right as u32;
7270
7271 let left_packed =
7273 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
7274 let right_packed =
7275 self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
7276
7277 let table = self.build_hash_table_v2_on_stream(
7279 &right_packed.hashes,
7280 num_right,
7281 &cu_stream,
7282 launch_stream,
7283 runtime,
7284 )?;
7285
7286 let probe_func = self
7287 .device
7288 .inner()
7289 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
7290 .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
7291 let block_size = 256u32;
7292 let probe_grid = num_left.div_ceil(block_size);
7293 let probe_config = LaunchConfig {
7294 grid_dim: (probe_grid, 1, 1),
7295 block_dim: (block_size, 1, 1),
7296 shared_mem_bytes: 0,
7297 };
7298
7299 let d_count_only = self.memory.alloc::<u32>(1)?;
7304 let d_dummy_left = self.memory.alloc::<u32>(1)?;
7305 let d_dummy_right = self.memory.alloc::<u32>(1)?;
7306
7307 let max_output_count_only = 0u32;
7313 let mut rec_count = LaunchRecorder::new_strict(launch_stream);
7314 rec_count.read(&left_packed.hashes);
7315 rec_count.read(&left_packed.packed_keys);
7316 rec_count.read(&right_packed.packed_keys);
7317 rec_count.read(&table.bucket_offsets);
7318 rec_count.read(&table.bucket_counts);
7319 rec_count.read(&table.bucket_entries);
7320 rec_count.read(&table.bucket_entry_hashes);
7321 rec_count.write(&d_count_only);
7322 rec_count.write(&d_dummy_left);
7323 rec_count.write(&d_dummy_right);
7324 rec_count.preflight(runtime).map_err(|e| {
7325 XlogError::Kernel(format!(
7326 "hash_join_inner_v2_recorded: count-pass preflight failed: {}",
7327 e
7328 ))
7329 })?;
7330
7331 unsafe {
7337 let res = cudarc::driver::sys::cuMemsetD8Async(
7338 *d_count_only.device_ptr(),
7339 0,
7340 std::mem::size_of::<u32>(),
7341 cu_stream.cu_stream(),
7342 );
7343 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7344 return Err(XlogError::Kernel(format!(
7345 "cuMemsetD8Async (d_count_only) failed: {:?}",
7346 res
7347 )));
7348 }
7349 }
7350
7351 unsafe {
7356 let mut params: Vec<*mut c_void> = vec![
7357 (&left_packed.hashes).as_kernel_param(),
7358 num_left.as_kernel_param(),
7359 (&table.bucket_offsets).as_kernel_param(),
7360 (&table.bucket_counts).as_kernel_param(),
7361 (&table.bucket_entries).as_kernel_param(),
7362 (&table.bucket_entry_hashes).as_kernel_param(),
7363 table.bucket_mask.as_kernel_param(),
7364 (&left_packed.packed_keys).as_kernel_param(),
7365 (&right_packed.packed_keys).as_kernel_param(),
7366 left_packed.key_bytes.as_kernel_param(),
7367 (&d_dummy_left).as_kernel_param(),
7368 (&d_dummy_right).as_kernel_param(),
7369 (&d_count_only).as_kernel_param(),
7370 max_output_count_only.as_kernel_param(),
7371 ];
7372 probe_func
7373 .clone()
7374 .launch_on_stream(&cu_stream, probe_config, &mut params)
7375 .map_err(|e| {
7376 XlogError::Kernel(format!(
7377 "hash_join_probe_v2 (count, on_stream) failed: {}",
7378 e
7379 ))
7380 })?;
7381 }
7382
7383 rec_count.commit(runtime).map_err(|e| {
7384 XlogError::Kernel(format!(
7385 "hash_join_inner_v2_recorded: count-pass commit failed: {}",
7386 e
7387 ))
7388 })?;
7389
7390 cu_stream.synchronize().map_err(|e| {
7392 XlogError::Kernel(format!(
7393 "hash_join_inner_v2_recorded: launch_stream sync (count read) failed: {}",
7394 e
7395 ))
7396 })?;
7397 let full_count = self.read_join_output_count_metadata(&d_count_only)? as u64;
7398 let requested = max_output
7399 .map(|limit| (limit as u64).min(full_count))
7400 .unwrap_or(full_count);
7401 if requested == 0 {
7402 let combined_schema = self.combine_schemas(left.schema(), right.schema());
7403 return self.create_empty_buffer(combined_schema);
7404 }
7405 if requested > u32::MAX as u64 {
7406 return Err(XlogError::Kernel(format!(
7407 "Join produced {} rows which exceeds the u32 index limit",
7408 requested
7409 )));
7410 }
7411 let max_output_u32 = requested as u32;
7412
7413 let d_output_left = self.memory.alloc::<u32>(max_output_u32 as usize)?;
7417 let d_output_right = self.memory.alloc::<u32>(max_output_u32 as usize)?;
7418 let d_output_count = self.memory.alloc::<u32>(1)?;
7419
7420 let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
7421 rec_mat.read(&left_packed.hashes);
7422 rec_mat.read(&left_packed.packed_keys);
7423 rec_mat.read(&right_packed.packed_keys);
7424 rec_mat.read(&table.bucket_offsets);
7425 rec_mat.read(&table.bucket_counts);
7426 rec_mat.read(&table.bucket_entries);
7427 rec_mat.read(&table.bucket_entry_hashes);
7428 rec_mat.write(&d_output_left);
7429 rec_mat.write(&d_output_right);
7430 rec_mat.write(&d_output_count);
7431 rec_mat.preflight(runtime).map_err(|e| {
7432 XlogError::Kernel(format!(
7433 "hash_join_inner_v2_recorded: materialize-pass preflight failed: {}",
7434 e
7435 ))
7436 })?;
7437
7438 unsafe {
7442 let res = cudarc::driver::sys::cuMemsetD8Async(
7443 *d_output_count.device_ptr(),
7444 0,
7445 std::mem::size_of::<u32>(),
7446 cu_stream.cu_stream(),
7447 );
7448 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7449 return Err(XlogError::Kernel(format!(
7450 "cuMemsetD8Async (d_output_count) failed: {:?}",
7451 res
7452 )));
7453 }
7454 }
7455
7456 unsafe {
7458 let mut params: Vec<*mut c_void> = vec![
7459 (&left_packed.hashes).as_kernel_param(),
7460 num_left.as_kernel_param(),
7461 (&table.bucket_offsets).as_kernel_param(),
7462 (&table.bucket_counts).as_kernel_param(),
7463 (&table.bucket_entries).as_kernel_param(),
7464 (&table.bucket_entry_hashes).as_kernel_param(),
7465 table.bucket_mask.as_kernel_param(),
7466 (&left_packed.packed_keys).as_kernel_param(),
7467 (&right_packed.packed_keys).as_kernel_param(),
7468 left_packed.key_bytes.as_kernel_param(),
7469 (&d_output_left).as_kernel_param(),
7470 (&d_output_right).as_kernel_param(),
7471 (&d_output_count).as_kernel_param(),
7472 max_output_u32.as_kernel_param(),
7473 ];
7474 probe_func
7475 .clone()
7476 .launch_on_stream(&cu_stream, probe_config, &mut params)
7477 .map_err(|e| {
7478 XlogError::Kernel(format!(
7479 "hash_join_probe_v2 (materialize, on_stream) failed: {}",
7480 e
7481 ))
7482 })?;
7483 }
7484
7485 rec_mat.commit(runtime).map_err(|e| {
7486 XlogError::Kernel(format!(
7487 "hash_join_inner_v2_recorded: materialize-pass commit failed: {}",
7488 e
7489 ))
7490 })?;
7491
7492 cu_stream.synchronize().map_err(|e| {
7494 XlogError::Kernel(format!(
7495 "hash_join_inner_v2_recorded: launch_stream sync (mat read) failed: {}",
7496 e
7497 ))
7498 })?;
7499 let result_count = (self.read_join_output_count_metadata(&d_output_count)? as u64)
7500 .min(max_output_u32 as u64);
7501 if result_count == 0 {
7502 let combined_schema = self.combine_schemas(left.schema(), right.schema());
7503 return self.create_empty_buffer(combined_schema);
7504 }
7505 let output_rows = result_count as u32;
7506
7507 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
7511 for col_idx in 0..left.columns.len() {
7512 let c = left
7513 .column(col_idx)
7514 .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
7515 rec_gather.read_column(c);
7516 }
7517 for col_idx in 0..right.columns.len() {
7518 let c = right
7519 .column(col_idx)
7520 .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
7521 rec_gather.read_column(c);
7522 }
7523 rec_gather.read(&d_output_left);
7524 rec_gather.read(&d_output_right);
7525 rec_gather.preflight(runtime).map_err(|e| {
7526 XlogError::Kernel(format!(
7527 "hash_join_inner_v2_recorded: gather preflight failed: {}",
7528 e
7529 ))
7530 })?;
7531
7532 let gathered_left = self.gather_buffer_by_indices_on_stream(
7533 left,
7534 &d_output_left,
7535 output_rows,
7536 &cu_stream,
7537 launch_stream,
7538 runtime,
7539 )?;
7540 let gathered_right = self.gather_buffer_by_indices_on_stream(
7541 right,
7542 &d_output_right,
7543 output_rows,
7544 &cu_stream,
7545 launch_stream,
7546 runtime,
7547 )?;
7548
7549 rec_gather.commit(runtime).map_err(|e| {
7550 XlogError::Kernel(format!(
7551 "hash_join_inner_v2_recorded: gather commit failed: {}",
7552 e
7553 ))
7554 })?;
7555
7556 let combined_schema = self.combine_schemas(left.schema(), right.schema());
7557 let mut result_columns = Vec::with_capacity(combined_schema.arity());
7558 result_columns.extend(gathered_left.columns);
7559 result_columns.extend(gathered_right.columns);
7560 self.buffer_from_columns(result_columns, result_count, combined_schema)
7561 }
7562
7563 pub fn hash_join_inner_v2_count_scan_materialize_recorded(
7594 &self,
7595 left: &CudaBuffer,
7596 right: &CudaBuffer,
7597 left_keys: &[usize],
7598 right_keys: &[usize],
7599 max_output: Option<usize>,
7600 launch_stream: StreamId,
7601 ) -> Result<CudaBuffer> {
7602 if Self::use_csm_cuda_graph_env() {
7603 if let Some(result) = self
7604 .hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded(
7605 left,
7606 right,
7607 left_keys,
7608 right_keys,
7609 max_output,
7610 launch_stream,
7611 )?
7612 {
7613 return Ok(result);
7614 }
7615 self.csm_cuda_graph_fallbacks
7616 .fetch_add(1, Ordering::Relaxed);
7617 }
7618
7619 use crate::launch::LaunchRecorder;
7620
7621 let runtime = self.memory.runtime().ok_or_else(|| {
7622 XlogError::Kernel(
7623 "hash_join_inner_v2_count_scan_materialize_recorded requires a \
7624 runtime-backed GpuMemoryManager"
7625 .to_string(),
7626 )
7627 })?;
7628 let cu_stream = runtime
7629 .stream_pool()
7630 .resolve(launch_stream)
7631 .ok_or_else(|| {
7632 XlogError::Kernel(format!(
7633 "hash_join_inner_v2_count_scan_materialize_recorded: launch_stream \
7634 StreamId({}) does not resolve",
7635 launch_stream.0
7636 ))
7637 })?;
7638
7639 let num_left = self.device_row_count(left)?;
7641 let num_right = self.device_row_count(right)?;
7642 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
7643 return Err(XlogError::Kernel(format!(
7644 "Join supports at most {} rows per side (left={}, right={})",
7645 u32::MAX,
7646 num_left,
7647 num_right
7648 )));
7649 }
7650 if num_left == 0 || num_right == 0 {
7651 let combined_schema = self.combine_schemas(left.schema(), right.schema());
7652 return self.create_empty_buffer(combined_schema);
7653 }
7654 if left_keys.is_empty() || right_keys.is_empty() {
7655 return Err(XlogError::Kernel(
7656 "Join requires at least one key column".to_string(),
7657 ));
7658 }
7659 if left_keys.len() != right_keys.len() {
7660 return Err(XlogError::Kernel(
7661 "Left and right key columns must have same length".to_string(),
7662 ));
7663 }
7664 if left_keys.len() > 4 {
7665 return Err(XlogError::Kernel(
7666 "hash_join_inner_v2_count_scan_materialize_recorded: max 4 key \
7667 columns supported (pack_keys constraint)"
7668 .to_string(),
7669 ));
7670 }
7671 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
7672 let lt = left.schema().column_type(l);
7673 let rt = right.schema().column_type(r);
7674 if lt != rt {
7675 return Err(XlogError::Kernel(format!(
7676 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
7677 l, lt, r, rt
7678 )));
7679 }
7680 }
7681
7682 let _num_left = num_left as u32;
7683 let probe_cap = left.num_rows() as u32;
7684
7685 let left_packed =
7687 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
7688 let right_packed =
7689 self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
7690 let table = self.build_hash_table_v2_on_stream(
7691 &right_packed.hashes,
7692 num_right as u32,
7693 &cu_stream,
7694 launch_stream,
7695 runtime,
7696 )?;
7697
7698 let device = self.device.inner();
7699 let block_size = 256u32;
7700 let probe_grid = probe_cap.div_ceil(block_size);
7701 let probe_config = LaunchConfig {
7702 grid_dim: (probe_grid, 1, 1),
7703 block_dim: (block_size, 1, 1),
7704 shared_mem_bytes: 0,
7705 };
7706
7707 let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
7709 let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
7710 let d_logical_count = self.memory.alloc::<u32>(1)?;
7711 let d_overflow = self.memory.alloc::<u8>(1)?;
7712 runtime
7718 .prepare_first_use(&d_overflow, launch_stream, Access::Write)
7719 .map_err(|e| {
7720 XlogError::Kernel(format!(
7721 "hash_join_inner_v2_count_scan_materialize_recorded: prepare d_overflow \
7722 failed: {}",
7723 e
7724 ))
7725 })?;
7726 runtime
7727 .prepare_first_use(&d_logical_count, launch_stream, Access::Write)
7728 .map_err(|e| {
7729 XlogError::Kernel(format!(
7730 "hash_join_inner_v2_count_scan_materialize_recorded: prepare d_logical_count \
7731 failed: {}",
7732 e
7733 ))
7734 })?;
7735 unsafe {
7738 let res = cudarc::driver::sys::cuMemsetD8Async(
7739 *d_overflow.device_ptr(),
7740 0,
7741 1,
7742 cu_stream.cu_stream(),
7743 );
7744 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7745 return Err(XlogError::Kernel(format!(
7746 "cuMemsetD8Async (d_overflow init) failed: {:?}",
7747 res
7748 )));
7749 }
7750 let res = cudarc::driver::sys::cuMemsetD8Async(
7751 *d_logical_count.device_ptr(),
7752 0,
7753 std::mem::size_of::<u32>(),
7754 cu_stream.cu_stream(),
7755 );
7756 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7757 return Err(XlogError::Kernel(format!(
7758 "cuMemsetD8Async (d_logical_count init) failed: {:?}",
7759 res
7760 )));
7761 }
7762 }
7763
7764 let count_func = device
7770 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
7771 .ok_or_else(|| {
7772 XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
7773 })?;
7774 let total_func = device
7775 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
7776 .ok_or_else(|| {
7777 XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
7778 })?;
7779
7780 let mut rec_count = LaunchRecorder::new_strict(launch_stream);
7781 rec_count.read(&left_packed.hashes);
7782 rec_count.read(&left_packed.packed_keys);
7783 rec_count.read(&right_packed.packed_keys);
7784 rec_count.read(&table.bucket_offsets);
7785 rec_count.read(&table.bucket_counts);
7786 rec_count.read(&table.bucket_entries);
7787 rec_count.read(&table.bucket_entry_hashes);
7788 rec_count.read(left.num_rows_device());
7789 rec_count.write(&per_probe_count);
7790 rec_count.write(&per_probe_offsets);
7791 rec_count.write(&d_logical_count);
7792 rec_count.write(&d_overflow);
7793 rec_count.preflight(runtime).map_err(|e| {
7794 XlogError::Kernel(format!("csm inner: count/scan preflight failed: {}", e))
7795 })?;
7796
7797 unsafe {
7800 count_func.clone().launch_on_stream(
7801 &cu_stream,
7802 probe_config,
7803 (
7804 &left_packed.hashes,
7805 left.num_rows_device(),
7806 probe_cap,
7807 &table.bucket_offsets,
7808 &table.bucket_counts,
7809 &table.bucket_entries,
7810 &table.bucket_entry_hashes,
7811 table.bucket_mask,
7812 &left_packed.packed_keys,
7813 &right_packed.packed_keys,
7814 left_packed.key_bytes,
7815 &per_probe_count,
7816 ),
7817 )
7818 }
7819 .map_err(|e| {
7820 XlogError::Kernel(format!(
7821 "hash_join_probe_v2_count_per_row (on_stream) failed: {}",
7822 e
7823 ))
7824 })?;
7825
7826 unsafe {
7830 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
7831 *per_probe_offsets.device_ptr(),
7832 *per_probe_count.device_ptr(),
7833 (probe_cap as usize) * std::mem::size_of::<u32>(),
7834 cu_stream.cu_stream(),
7835 );
7836 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7837 return Err(XlogError::Kernel(format!(
7838 "csm inner: cuMemcpyDtoDAsync (per_probe_count → offsets) failed: {:?}",
7839 res
7840 )));
7841 }
7842 }
7843 self.multiblock_scan_u32_inplace_on_stream(
7844 &mut per_probe_offsets,
7845 probe_cap,
7846 &cu_stream,
7847 launch_stream,
7848 runtime,
7849 )?;
7850
7851 let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(num_right as u64);
7858 let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
7859 unsafe {
7860 total_func.clone().launch_on_stream(
7861 &cu_stream,
7862 LaunchConfig {
7863 grid_dim: (1, 1, 1),
7864 block_dim: (1, 1, 1),
7865 shared_mem_bytes: 0,
7866 },
7867 (
7868 &per_probe_offsets,
7869 &per_probe_count,
7870 left.num_rows_device(),
7871 probe_cap,
7872 materialize_capacity_u32,
7873 &d_logical_count,
7874 &d_overflow,
7875 ),
7876 )
7877 }
7878 .map_err(|e| {
7879 XlogError::Kernel(format!(
7880 "hash_join_total_from_scan (on_stream) failed: {}",
7881 e
7882 ))
7883 })?;
7884
7885 rec_count.commit(runtime).map_err(|e| {
7886 XlogError::Kernel(format!("csm inner: count/scan commit failed: {}", e))
7887 })?;
7888
7889 cu_stream.synchronize().map_err(|e| {
7892 XlogError::Kernel(format!("csm inner: sync (total read) failed: {}", e))
7893 })?;
7894 let total = self.read_join_output_count_metadata(&d_logical_count)? as u64;
7895 let requested = max_output
7896 .map(|limit| (limit as u64).min(total))
7897 .unwrap_or(total);
7898 if requested == 0 {
7899 let combined_schema = self.combine_schemas(left.schema(), right.schema());
7900 return self.create_empty_buffer(combined_schema);
7901 }
7902 if requested > u32::MAX as u64 {
7903 return Err(XlogError::Kernel(format!(
7904 "Join produced {} rows which exceeds the u32 index limit",
7905 requested
7906 )));
7907 }
7908 let output_capacity = requested as u32;
7909
7910 let d_output_left = self.memory.alloc::<u32>(output_capacity as usize)?;
7919 let d_output_right = self.memory.alloc::<u32>(output_capacity as usize)?;
7920
7921 let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
7922 rec_mat.read(&left_packed.hashes);
7923 rec_mat.read(&left_packed.packed_keys);
7924 rec_mat.read(&right_packed.packed_keys);
7925 rec_mat.read(&table.bucket_offsets);
7926 rec_mat.read(&table.bucket_counts);
7927 rec_mat.read(&table.bucket_entries);
7928 rec_mat.read(&table.bucket_entry_hashes);
7929 rec_mat.read(&per_probe_offsets);
7930 rec_mat.read(left.num_rows_device());
7931 rec_mat.write(&d_output_left);
7932 rec_mat.write(&d_output_right);
7933 rec_mat.write(&d_overflow);
7935 rec_mat.preflight(runtime).map_err(|e| {
7936 XlogError::Kernel(format!("csm inner: materialize preflight failed: {}", e))
7937 })?;
7938
7939 let materialize_func = device
7940 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
7941 .ok_or_else(|| {
7942 XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
7943 })?;
7944 unsafe {
7947 let mut params: Vec<*mut c_void> = vec![
7948 (&left_packed.hashes).as_kernel_param(),
7949 left.num_rows_device().as_kernel_param(),
7950 probe_cap.as_kernel_param(),
7951 (&table.bucket_offsets).as_kernel_param(),
7952 (&table.bucket_counts).as_kernel_param(),
7953 (&table.bucket_entries).as_kernel_param(),
7954 (&table.bucket_entry_hashes).as_kernel_param(),
7955 table.bucket_mask.as_kernel_param(),
7956 (&left_packed.packed_keys).as_kernel_param(),
7957 (&right_packed.packed_keys).as_kernel_param(),
7958 left_packed.key_bytes.as_kernel_param(),
7959 (&per_probe_offsets).as_kernel_param(),
7960 output_capacity.as_kernel_param(),
7961 (&d_output_left).as_kernel_param(),
7962 (&d_output_right).as_kernel_param(),
7963 (&d_overflow).as_kernel_param(),
7964 ];
7965 materialize_func
7966 .clone()
7967 .launch_on_stream(&cu_stream, probe_config, &mut params)
7968 .map_err(|e| {
7969 XlogError::Kernel(format!(
7970 "hash_join_probe_v2_materialize (on_stream) failed: {}",
7971 e
7972 ))
7973 })?;
7974 }
7975
7976 rec_mat.commit(runtime).map_err(|e| {
7977 XlogError::Kernel(format!("csm inner: materialize commit failed: {}", e))
7978 })?;
7979
7980 cu_stream.synchronize().map_err(|e| {
7981 XlogError::Kernel(format!("csm inner: sync (post-materialize) failed: {}", e))
7982 })?;
7983
7984 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
7986 for col_idx in 0..left.columns.len() {
7987 let c = left
7988 .column(col_idx)
7989 .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
7990 rec_gather.read_column(c);
7991 }
7992 for col_idx in 0..right.columns.len() {
7993 let c = right
7994 .column(col_idx)
7995 .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
7996 rec_gather.read_column(c);
7997 }
7998 rec_gather.read(&d_output_left);
7999 rec_gather.read(&d_output_right);
8000 rec_gather
8001 .preflight(runtime)
8002 .map_err(|e| XlogError::Kernel(format!("csm inner: gather preflight failed: {}", e)))?;
8003 let gathered_left = self.gather_buffer_by_indices_on_stream(
8004 left,
8005 &d_output_left,
8006 output_capacity,
8007 &cu_stream,
8008 launch_stream,
8009 runtime,
8010 )?;
8011 let gathered_right = self.gather_buffer_by_indices_on_stream(
8012 right,
8013 &d_output_right,
8014 output_capacity,
8015 &cu_stream,
8016 launch_stream,
8017 runtime,
8018 )?;
8019 rec_gather
8020 .commit(runtime)
8021 .map_err(|e| XlogError::Kernel(format!("csm inner: gather commit failed: {}", e)))?;
8022
8023 let combined_schema = self.combine_schemas(left.schema(), right.schema());
8024 let mut result_columns = Vec::with_capacity(combined_schema.arity());
8025 result_columns.extend(gathered_left.columns);
8026 result_columns.extend(gathered_right.columns);
8027 self.buffer_from_columns(result_columns, output_capacity as u64, combined_schema)
8028 }
8029
8030 fn hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded(
8031 &self,
8032 left: &CudaBuffer,
8033 right: &CudaBuffer,
8034 left_keys: &[usize],
8035 right_keys: &[usize],
8036 max_output: Option<usize>,
8037 launch_stream: StreamId,
8038 ) -> Result<Option<CudaBuffer>> {
8039 let runtime = self.memory.runtime().ok_or_else(|| {
8040 XlogError::Kernel(
8041 "hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded requires a \
8042 runtime-backed GpuMemoryManager"
8043 .to_string(),
8044 )
8045 })?;
8046 let cu_stream = runtime
8047 .stream_pool()
8048 .resolve(launch_stream)
8049 .ok_or_else(|| {
8050 XlogError::Kernel(format!(
8051 "hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded: \
8052 launch_stream StreamId({}) does not resolve",
8053 launch_stream.0
8054 ))
8055 })?;
8056
8057 let num_left = self.device_row_count(left)?;
8058 let num_right = self.device_row_count(right)?;
8059 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
8060 return Err(XlogError::Kernel(format!(
8061 "Join supports at most {} rows per side (left={}, right={})",
8062 u32::MAX,
8063 num_left,
8064 num_right
8065 )));
8066 }
8067 if num_left == 0 || num_right == 0 || max_output == Some(0) {
8068 let combined_schema = self.combine_schemas(left.schema(), right.schema());
8069 return self.create_empty_buffer(combined_schema).map(Some);
8070 }
8071 if left_keys.is_empty() || right_keys.is_empty() {
8072 return Err(XlogError::Kernel(
8073 "Join requires at least one key column".to_string(),
8074 ));
8075 }
8076 if left_keys.len() != right_keys.len() {
8077 return Err(XlogError::Kernel(
8078 "Left and right key columns must have same length".to_string(),
8079 ));
8080 }
8081 if left_keys.len() > 4 {
8082 return Err(XlogError::Kernel(
8083 "hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded: max 4 key \
8084 columns supported (pack_keys constraint)"
8085 .to_string(),
8086 ));
8087 }
8088 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
8089 let lt = left.schema().column_type(l);
8090 let rt = right.schema().column_type(r);
8091 if lt != rt {
8092 return Err(XlogError::Kernel(format!(
8093 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
8094 l, lt, r, rt
8095 )));
8096 }
8097 }
8098
8099 let logical_probe_cap = left.num_rows() as u32;
8100 let probe_cap = crate::cuda_graph::graph_capacity_class_u32(logical_probe_cap);
8101 let Some(output_capacity) =
8102 Self::csm_cuda_graph_output_capacity(logical_probe_cap, num_right as u32, max_output)?
8103 else {
8104 return Ok(None);
8105 };
8106 if output_capacity == 0 {
8107 let combined_schema = self.combine_schemas(left.schema(), right.schema());
8108 return self.create_empty_buffer(combined_schema).map(Some);
8109 }
8110
8111 let left_packed =
8112 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
8113 let right_packed =
8114 self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
8115 let graph_key = CsmCudaGraphKey::inner(
8116 left_keys.len(),
8117 left_packed.key_bytes,
8118 probe_cap,
8119 output_capacity,
8120 )?;
8121 let table = self.build_hash_table_v2_on_stream(
8122 &right_packed.hashes,
8123 num_right as u32,
8124 &cu_stream,
8125 launch_stream,
8126 runtime,
8127 )?;
8128
8129 let device = self.device.inner();
8130 let block_size = 256u32;
8131 let probe_grid = probe_cap.div_ceil(block_size);
8132 let probe_config = LaunchConfig {
8133 grid_dim: (probe_grid, 1, 1),
8134 block_dim: (block_size, 1, 1),
8135 shared_mem_bytes: 0,
8136 };
8137
8138 let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(num_right as u64);
8139 let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
8140
8141 {
8142 let mut cache = self.csm_cuda_graph_cache.lock().map_err(|e| {
8143 XlogError::Kernel(format!("csm CUDA Graph cache lock poisoned: {}", e))
8144 })?;
8145 if let Some(entry) = cache.get_mut(&graph_key) {
8146 let result = self.launch_csm_cuda_graph_entry(
8147 entry,
8148 left,
8149 right,
8150 &left_packed,
8151 &right_packed,
8152 &table,
8153 max_output,
8154 materialize_capacity_u32,
8155 probe_config,
8156 &cu_stream,
8157 launch_stream,
8158 runtime,
8159 )?;
8160 self.csm_cuda_graph_cache_hits
8161 .fetch_add(1, Ordering::Relaxed);
8162 return Ok(Some(result));
8163 }
8164 }
8165
8166 let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
8167 let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
8168 let d_logical_count = self.memory.alloc::<u32>(1)?;
8169 let d_overflow = self.memory.alloc::<u8>(1)?;
8170 let d_output_left = self.memory.alloc::<u32>(output_capacity as usize)?;
8171 let d_output_right = self.memory.alloc::<u32>(output_capacity as usize)?;
8172 let mut scan_scratch = self.multiblock_scan_u32_scratch_for_len(probe_cap)?;
8173
8174 let count_func = device
8175 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
8176 .ok_or_else(|| {
8177 XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
8178 })?;
8179 let total_func = device
8180 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
8181 .ok_or_else(|| {
8182 XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
8183 })?;
8184 let materialize_func = device
8185 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
8186 .ok_or_else(|| {
8187 XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
8188 })?;
8189
8190 let graph = CapturedCudaGraph::capture_on_stream(&cu_stream, || {
8191 unsafe {
8194 let res = cudarc::driver::sys::cuMemsetD8Async(
8195 *d_overflow.device_ptr(),
8196 0,
8197 1,
8198 cu_stream.cu_stream(),
8199 );
8200 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8201 return Err(XlogError::Kernel(format!(
8202 "csm inner graph: cuMemsetD8Async (d_overflow) failed: {:?}",
8203 res
8204 )));
8205 }
8206 let res = cudarc::driver::sys::cuMemsetD8Async(
8207 *d_logical_count.device_ptr(),
8208 0,
8209 std::mem::size_of::<u32>(),
8210 cu_stream.cu_stream(),
8211 );
8212 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8213 return Err(XlogError::Kernel(format!(
8214 "csm inner graph: cuMemsetD8Async (d_logical_count) failed: {:?}",
8215 res
8216 )));
8217 }
8218 }
8219
8220 unsafe {
8222 count_func.clone().launch_on_stream(
8223 &cu_stream,
8224 probe_config,
8225 (
8226 &left_packed.hashes,
8227 left.num_rows_device(),
8228 probe_cap,
8229 &table.bucket_offsets,
8230 &table.bucket_counts,
8231 &table.bucket_entries,
8232 &table.bucket_entry_hashes,
8233 table.bucket_mask,
8234 &left_packed.packed_keys,
8235 &right_packed.packed_keys,
8236 left_packed.key_bytes,
8237 &per_probe_count,
8238 ),
8239 )
8240 }
8241 .map_err(|e| {
8242 XlogError::Kernel(format!("csm inner graph: count_per_row failed: {}", e))
8243 })?;
8244
8245 unsafe {
8247 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
8248 *per_probe_offsets.device_ptr(),
8249 *per_probe_count.device_ptr(),
8250 (probe_cap as usize) * std::mem::size_of::<u32>(),
8251 cu_stream.cu_stream(),
8252 );
8253 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8254 return Err(XlogError::Kernel(format!(
8255 "csm inner graph: cuMemcpyDtoDAsync (count -> offsets) failed: {:?}",
8256 res
8257 )));
8258 }
8259 }
8260 self.multiblock_scan_u32_inplace_on_stream_with_scratch(
8261 &mut per_probe_offsets,
8262 probe_cap,
8263 &cu_stream,
8264 &mut scan_scratch,
8265 )?;
8266
8267 unsafe {
8269 total_func.clone().launch_on_stream(
8270 &cu_stream,
8271 LaunchConfig {
8272 grid_dim: (1, 1, 1),
8273 block_dim: (1, 1, 1),
8274 shared_mem_bytes: 0,
8275 },
8276 (
8277 &per_probe_offsets,
8278 &per_probe_count,
8279 left.num_rows_device(),
8280 probe_cap,
8281 materialize_capacity_u32,
8282 &d_logical_count,
8283 &d_overflow,
8284 ),
8285 )
8286 }
8287 .map_err(|e| XlogError::Kernel(format!("csm inner graph: total failed: {}", e)))?;
8288
8289 unsafe {
8291 let mut params: Vec<*mut c_void> = vec![
8292 (&left_packed.hashes).as_kernel_param(),
8293 left.num_rows_device().as_kernel_param(),
8294 probe_cap.as_kernel_param(),
8295 (&table.bucket_offsets).as_kernel_param(),
8296 (&table.bucket_counts).as_kernel_param(),
8297 (&table.bucket_entries).as_kernel_param(),
8298 (&table.bucket_entry_hashes).as_kernel_param(),
8299 table.bucket_mask.as_kernel_param(),
8300 (&left_packed.packed_keys).as_kernel_param(),
8301 (&right_packed.packed_keys).as_kernel_param(),
8302 left_packed.key_bytes.as_kernel_param(),
8303 (&per_probe_offsets).as_kernel_param(),
8304 output_capacity.as_kernel_param(),
8305 (&d_output_left).as_kernel_param(),
8306 (&d_output_right).as_kernel_param(),
8307 (&d_overflow).as_kernel_param(),
8308 ];
8309 materialize_func
8310 .clone()
8311 .launch_on_stream(&cu_stream, probe_config, &mut params)
8312 .map_err(|e| {
8313 XlogError::Kernel(format!("csm inner graph: materialize failed: {}", e))
8314 })?;
8315 }
8316 Ok(())
8317 })?;
8318 let nodes = Self::csm_cuda_graph_nodes(&graph)?;
8319 let mut entry = CsmCudaGraphEntry {
8320 graph,
8321 nodes,
8322 per_probe_count,
8323 per_probe_offsets,
8324 d_logical_count,
8325 d_overflow,
8326 d_output_left,
8327 d_output_right,
8328 scan_scratch,
8329 probe_capacity: probe_cap,
8330 output_capacity,
8331 };
8332 self.csm_cuda_graph_captures.fetch_add(1, Ordering::Relaxed);
8333
8334 let result = self.launch_csm_cuda_graph_entry(
8335 &mut entry,
8336 left,
8337 right,
8338 &left_packed,
8339 &right_packed,
8340 &table,
8341 max_output,
8342 materialize_capacity_u32,
8343 probe_config,
8344 &cu_stream,
8345 launch_stream,
8346 runtime,
8347 )?;
8348 self.csm_cuda_graph_cache
8349 .lock()
8350 .map_err(|e| XlogError::Kernel(format!("csm CUDA Graph cache lock poisoned: {}", e)))?
8351 .insert(graph_key, entry);
8352 Ok(Some(result))
8353 }
8354
8355 #[allow(clippy::too_many_arguments)]
8356 fn launch_csm_cuda_graph_entry(
8357 &self,
8358 entry: &mut CsmCudaGraphEntry,
8359 left: &CudaBuffer,
8360 right: &CudaBuffer,
8361 left_packed: &PackedKeyData,
8362 right_packed: &PackedKeyData,
8363 table: &JoinHashTableV2,
8364 max_output: Option<usize>,
8365 materialize_capacity_u32: u32,
8366 probe_config: LaunchConfig,
8367 cu_stream: &cudarc::driver::CudaStream,
8368 launch_stream: StreamId,
8369 runtime: &crate::device_runtime::XlogDeviceRuntime,
8370 ) -> Result<CudaBuffer> {
8371 let mut rec_graph = LaunchRecorder::new_strict(launch_stream);
8372 rec_graph.read(&left_packed.hashes);
8373 rec_graph.read(&left_packed.packed_keys);
8374 rec_graph.read(&right_packed.packed_keys);
8375 rec_graph.read(&table.bucket_offsets);
8376 rec_graph.read(&table.bucket_counts);
8377 rec_graph.read(&table.bucket_entries);
8378 rec_graph.read(&table.bucket_entry_hashes);
8379 rec_graph.read(left.num_rows_device());
8380 rec_graph.read_write(&entry.per_probe_count);
8381 rec_graph.read_write(&entry.per_probe_offsets);
8382 rec_graph.read_write(&entry.d_logical_count);
8383 rec_graph.read_write(&entry.d_overflow);
8384 rec_graph.write(&entry.d_output_left);
8385 rec_graph.write(&entry.d_output_right);
8386 for level in entry.scan_scratch.levels() {
8387 rec_graph.read_write(level);
8388 }
8389 rec_graph
8390 .preflight(runtime)
8391 .map_err(|e| XlogError::Kernel(format!("csm inner graph: preflight failed: {}", e)))?;
8392
8393 let probe_cap = entry.probe_capacity;
8394 let output_capacity = entry.output_capacity;
8395 if probe_config.grid_dim.0 != probe_cap.div_ceil(probe_config.block_dim.0) {
8396 return Err(XlogError::Kernel(format!(
8397 "csm CUDA Graph replay probe grid mismatch: graph probe_cap={}, grid={:?}",
8398 probe_cap, probe_config.grid_dim
8399 )));
8400 }
8401 if entry.nodes.node_count < 5 {
8402 return Err(XlogError::Kernel(format!(
8403 "csm CUDA Graph replay node inventory too small: {}",
8404 entry.nodes.node_count
8405 )));
8406 }
8407
8408 let mut count_params = entry.graph.kernel_node_params(entry.nodes.count)?;
8409 let mut total_params = entry.graph.kernel_node_params(entry.nodes.total)?;
8410 let mut materialize_params = entry.graph.kernel_node_params(entry.nodes.materialize)?;
8411 let mut count_args: Vec<*mut c_void> = vec![
8412 (&left_packed.hashes).as_kernel_param(),
8413 left.num_rows_device().as_kernel_param(),
8414 probe_cap.as_kernel_param(),
8415 (&table.bucket_offsets).as_kernel_param(),
8416 (&table.bucket_counts).as_kernel_param(),
8417 (&table.bucket_entries).as_kernel_param(),
8418 (&table.bucket_entry_hashes).as_kernel_param(),
8419 table.bucket_mask.as_kernel_param(),
8420 (&left_packed.packed_keys).as_kernel_param(),
8421 (&right_packed.packed_keys).as_kernel_param(),
8422 left_packed.key_bytes.as_kernel_param(),
8423 (&entry.per_probe_count).as_kernel_param(),
8424 ];
8425 let mut total_args: Vec<*mut c_void> = vec![
8426 (&entry.per_probe_offsets).as_kernel_param(),
8427 (&entry.per_probe_count).as_kernel_param(),
8428 left.num_rows_device().as_kernel_param(),
8429 probe_cap.as_kernel_param(),
8430 materialize_capacity_u32.as_kernel_param(),
8431 (&entry.d_logical_count).as_kernel_param(),
8432 (&entry.d_overflow).as_kernel_param(),
8433 ];
8434 let mut materialize_args: Vec<*mut c_void> = vec![
8435 (&left_packed.hashes).as_kernel_param(),
8436 left.num_rows_device().as_kernel_param(),
8437 probe_cap.as_kernel_param(),
8438 (&table.bucket_offsets).as_kernel_param(),
8439 (&table.bucket_counts).as_kernel_param(),
8440 (&table.bucket_entries).as_kernel_param(),
8441 (&table.bucket_entry_hashes).as_kernel_param(),
8442 table.bucket_mask.as_kernel_param(),
8443 (&left_packed.packed_keys).as_kernel_param(),
8444 (&right_packed.packed_keys).as_kernel_param(),
8445 left_packed.key_bytes.as_kernel_param(),
8446 (&entry.per_probe_offsets).as_kernel_param(),
8447 output_capacity.as_kernel_param(),
8448 (&entry.d_output_left).as_kernel_param(),
8449 (&entry.d_output_right).as_kernel_param(),
8450 (&entry.d_overflow).as_kernel_param(),
8451 ];
8452 count_params.kernelParams = count_args.as_mut_ptr();
8453 count_params.extra = std::ptr::null_mut();
8454 total_params.kernelParams = total_args.as_mut_ptr();
8455 total_params.extra = std::ptr::null_mut();
8456 materialize_params.kernelParams = materialize_args.as_mut_ptr();
8457 materialize_params.extra = std::ptr::null_mut();
8458 unsafe {
8459 entry
8460 .graph
8461 .set_kernel_node_params(entry.nodes.count, &count_params)?;
8462 entry
8463 .graph
8464 .set_kernel_node_params(entry.nodes.total, &total_params)?;
8465 entry
8466 .graph
8467 .set_kernel_node_params(entry.nodes.materialize, &materialize_params)?;
8468 }
8469
8470 entry.graph.launch(cu_stream)?;
8471 self.csm_cuda_graph_launches.fetch_add(1, Ordering::Relaxed);
8472 rec_graph
8473 .commit(runtime)
8474 .map_err(|e| XlogError::Kernel(format!("csm inner graph: commit failed: {}", e)))?;
8475
8476 cu_stream.synchronize().map_err(|e| {
8477 XlogError::Kernel(format!("csm inner graph: sync (total read) failed: {}", e))
8478 })?;
8479 let total = self.read_join_output_count_metadata(&entry.d_logical_count)? as u64;
8480 let requested = max_output
8481 .map(|limit| (limit as u64).min(total))
8482 .unwrap_or(total);
8483 if requested == 0 {
8484 let combined_schema = self.combine_schemas(left.schema(), right.schema());
8485 return self.create_empty_buffer(combined_schema);
8486 }
8487 if requested > output_capacity as u64 {
8488 return Err(XlogError::Kernel(format!(
8489 "csm inner graph produced {} rows but graph output capacity is {}",
8490 requested, output_capacity
8491 )));
8492 }
8493 let output_rows = requested as u32;
8494
8495 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
8496 for col_idx in 0..left.columns.len() {
8497 let c = left
8498 .column(col_idx)
8499 .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
8500 rec_gather.read_column(c);
8501 }
8502 for col_idx in 0..right.columns.len() {
8503 let c = right
8504 .column(col_idx)
8505 .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
8506 rec_gather.read_column(c);
8507 }
8508 rec_gather.read(&entry.d_output_left);
8509 rec_gather.read(&entry.d_output_right);
8510 rec_gather.preflight(runtime).map_err(|e| {
8511 XlogError::Kernel(format!("csm inner graph: gather preflight failed: {}", e))
8512 })?;
8513 let gathered_left = self.gather_buffer_by_indices_on_stream(
8514 left,
8515 &entry.d_output_left,
8516 output_rows,
8517 cu_stream,
8518 launch_stream,
8519 runtime,
8520 )?;
8521 let gathered_right = self.gather_buffer_by_indices_on_stream(
8522 right,
8523 &entry.d_output_right,
8524 output_rows,
8525 cu_stream,
8526 launch_stream,
8527 runtime,
8528 )?;
8529 rec_gather.commit(runtime).map_err(|e| {
8530 XlogError::Kernel(format!("csm inner graph: gather commit failed: {}", e))
8531 })?;
8532
8533 let combined_schema = self.combine_schemas(left.schema(), right.schema());
8534 let mut result_columns = Vec::with_capacity(combined_schema.arity());
8535 result_columns.extend(gathered_left.columns);
8536 result_columns.extend(gathered_right.columns);
8537 self.buffer_from_columns(result_columns, output_rows as u64, combined_schema)
8538 }
8539
8540 fn csm_cuda_graph_nodes(graph: &CapturedCudaGraph) -> Result<CsmCudaGraphNodes> {
8541 let nodes = graph.nodes()?;
8542 if nodes.len() < 5 {
8543 return Err(XlogError::Kernel(format!(
8544 "csm inner graph captured too few nodes: {}",
8545 nodes.len()
8546 )));
8547 }
8548 let kernel_nodes: Vec<_> = nodes
8549 .iter()
8550 .copied()
8551 .filter(|n| n.kind == CudaGraphNodeKind::Kernel)
8552 .collect();
8553 if kernel_nodes.len() < 3 {
8554 return Err(XlogError::Kernel(format!(
8555 "csm inner graph captured too few kernel nodes: {}",
8556 kernel_nodes.len()
8557 )));
8558 }
8559 Ok(CsmCudaGraphNodes {
8560 count: kernel_nodes[0],
8561 total: kernel_nodes[kernel_nodes.len() - 2],
8562 materialize: kernel_nodes[kernel_nodes.len() - 1],
8563 node_count: nodes.len(),
8564 })
8565 }
8566
8567 fn csm_cuda_graph_output_capacity(
8568 probe_cap: u32,
8569 num_right: u32,
8570 max_output: Option<usize>,
8571 ) -> Result<Option<u32>> {
8572 if let Some(limit) = max_output {
8573 let limit = u32::try_from(limit).map_err(|_| {
8574 XlogError::Kernel(format!(
8575 "csm CUDA Graph max_output {} exceeds u32::MAX",
8576 limit
8577 ))
8578 })?;
8579 return Ok(Some(crate::cuda_graph::graph_capacity_class_u32(limit)));
8580 }
8581
8582 let worst_case = (probe_cap as u64).saturating_mul(num_right as u64);
8583 if worst_case > u32::MAX as u64 {
8584 return Ok(None);
8585 }
8586 let auto_cap = std::env::var("XLOG_CSM_CUDA_GRAPH_AUTO_OUTPUT_CAP")
8587 .ok()
8588 .and_then(|v| v.parse::<u64>().ok())
8589 .unwrap_or(1_000_000);
8590 if worst_case <= auto_cap {
8591 Ok(Some(crate::cuda_graph::graph_capacity_class_u32(
8592 worst_case as u32,
8593 )))
8594 } else {
8595 Ok(None)
8596 }
8597 }
8598
8599 pub fn hash_join_left_outer_v2_count_scan_materialize_recorded(
8626 &self,
8627 left: &CudaBuffer,
8628 right: &CudaBuffer,
8629 left_keys: &[usize],
8630 right_keys: &[usize],
8631 max_output: Option<usize>,
8632 launch_stream: StreamId,
8633 ) -> Result<CudaBuffer> {
8634 use crate::launch::LaunchRecorder;
8635
8636 let runtime = self.memory.runtime().ok_or_else(|| {
8637 XlogError::Kernel(
8638 "hash_join_left_outer_v2_count_scan_materialize_recorded requires a \
8639 runtime-backed GpuMemoryManager"
8640 .to_string(),
8641 )
8642 })?;
8643 let cu_stream = runtime
8644 .stream_pool()
8645 .resolve(launch_stream)
8646 .ok_or_else(|| {
8647 XlogError::Kernel(format!(
8648 "csm left_outer: launch_stream StreamId({}) does not resolve",
8649 launch_stream.0
8650 ))
8651 })?;
8652
8653 let num_left = self.device_row_count(left)?;
8655 let num_right = self.device_row_count(right)?;
8656 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
8657 return Err(XlogError::Kernel(format!(
8658 "Join supports at most {} rows per side (left={}, right={})",
8659 u32::MAX,
8660 num_left,
8661 num_right
8662 )));
8663 }
8664 if num_left == 0 {
8665 let combined_schema = self.combine_schemas(left.schema(), right.schema());
8666 return self.create_empty_buffer(combined_schema);
8667 }
8668 if num_right == 0 {
8669 return self.left_outer_with_nulls(left, right);
8674 }
8675 if left_keys.is_empty() || right_keys.is_empty() {
8676 return Err(XlogError::Kernel(
8677 "Join requires at least one key column".to_string(),
8678 ));
8679 }
8680 if left_keys.len() != right_keys.len() {
8681 return Err(XlogError::Kernel(
8682 "Left and right key columns must have same length".to_string(),
8683 ));
8684 }
8685 if left_keys.len() > 4 {
8686 return Err(XlogError::Kernel(
8687 "csm left_outer: max 4 key columns supported (pack_keys constraint)".to_string(),
8688 ));
8689 }
8690 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
8691 let lt = left.schema().column_type(l);
8692 let rt = right.schema().column_type(r);
8693 if lt != rt {
8694 return Err(XlogError::Kernel(format!(
8695 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
8696 l, lt, r, rt
8697 )));
8698 }
8699 }
8700
8701 let probe_cap = u32::try_from(num_left).map_err(|_| {
8709 XlogError::Kernel("csm left_outer: left row count exceeds u32::MAX".to_string())
8710 })?;
8711 let num_right_u32 = u32::try_from(num_right).map_err(|_| {
8712 XlogError::Kernel("csm left_outer: right row count exceeds u32::MAX".to_string())
8713 })?;
8714
8715 let left_packed =
8717 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
8718 let right_packed =
8719 self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
8720 let table = self.build_hash_table_v2_on_stream(
8721 &right_packed.hashes,
8722 num_right_u32,
8723 &cu_stream,
8724 launch_stream,
8725 runtime,
8726 )?;
8727
8728 let device = self.device.inner();
8729 let block_size = 256u32;
8730 let probe_grid = probe_cap.div_ceil(block_size);
8731 let probe_config = LaunchConfig {
8732 grid_dim: (probe_grid, 1, 1),
8733 block_dim: (block_size, 1, 1),
8734 shared_mem_bytes: 0,
8735 };
8736
8737 let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
8739 let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
8740 let d_logical_count = self.memory.alloc::<u32>(1)?;
8741 let d_overflow = self.memory.alloc::<u8>(1)?;
8742 runtime
8746 .prepare_first_use(&d_overflow, launch_stream, Access::Write)
8747 .map_err(|e| {
8748 XlogError::Kernel(format!("csm left_outer: prepare d_overflow failed: {}", e))
8749 })?;
8750 runtime
8751 .prepare_first_use(&d_logical_count, launch_stream, Access::Write)
8752 .map_err(|e| {
8753 XlogError::Kernel(format!(
8754 "csm left_outer: prepare d_logical_count failed: {}",
8755 e
8756 ))
8757 })?;
8758 unsafe {
8761 let res = cudarc::driver::sys::cuMemsetD8Async(
8762 *d_overflow.device_ptr(),
8763 0,
8764 1,
8765 cu_stream.cu_stream(),
8766 );
8767 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8768 return Err(XlogError::Kernel(format!(
8769 "csm left_outer: cuMemsetD8Async (d_overflow) failed: {:?}",
8770 res
8771 )));
8772 }
8773 let res = cudarc::driver::sys::cuMemsetD8Async(
8774 *d_logical_count.device_ptr(),
8775 0,
8776 std::mem::size_of::<u32>(),
8777 cu_stream.cu_stream(),
8778 );
8779 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8780 return Err(XlogError::Kernel(format!(
8781 "csm left_outer: cuMemsetD8Async (d_logical_count) failed: {:?}",
8782 res
8783 )));
8784 }
8785 }
8786
8787 let count_func = device
8788 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
8789 .ok_or_else(|| {
8790 XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
8791 })?;
8792 let total_func = device
8793 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
8794 .ok_or_else(|| {
8795 XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
8796 })?;
8797
8798 let mut rec_count = LaunchRecorder::new_strict(launch_stream);
8799 rec_count.read(&left_packed.hashes);
8800 rec_count.read(&left_packed.packed_keys);
8801 rec_count.read(&right_packed.packed_keys);
8802 rec_count.read(&table.bucket_offsets);
8803 rec_count.read(&table.bucket_counts);
8804 rec_count.read(&table.bucket_entries);
8805 rec_count.read(&table.bucket_entry_hashes);
8806 rec_count.read(left.num_rows_device());
8807 rec_count.write(&per_probe_count);
8808 rec_count.write(&per_probe_offsets);
8809 rec_count.write(&d_logical_count);
8810 rec_count.write(&d_overflow);
8811 rec_count.preflight(runtime).map_err(|e| {
8812 XlogError::Kernel(format!(
8813 "csm left_outer: count/scan preflight failed: {}",
8814 e
8815 ))
8816 })?;
8817
8818 unsafe {
8821 count_func.clone().launch_on_stream(
8822 &cu_stream,
8823 probe_config,
8824 (
8825 &left_packed.hashes,
8826 left.num_rows_device(),
8827 probe_cap,
8828 &table.bucket_offsets,
8829 &table.bucket_counts,
8830 &table.bucket_entries,
8831 &table.bucket_entry_hashes,
8832 table.bucket_mask,
8833 &left_packed.packed_keys,
8834 &right_packed.packed_keys,
8835 left_packed.key_bytes,
8836 &per_probe_count,
8837 ),
8838 )
8839 }
8840 .map_err(|e| {
8841 XlogError::Kernel(format!(
8842 "hash_join_probe_v2_count_per_row (csm left_outer) failed: {}",
8843 e
8844 ))
8845 })?;
8846
8847 unsafe {
8851 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
8852 *per_probe_offsets.device_ptr(),
8853 *per_probe_count.device_ptr(),
8854 (probe_cap as usize) * std::mem::size_of::<u32>(),
8855 cu_stream.cu_stream(),
8856 );
8857 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8858 return Err(XlogError::Kernel(format!(
8859 "csm left_outer: cuMemcpyDtoDAsync (count → offsets) failed: {:?}",
8860 res
8861 )));
8862 }
8863 }
8864 self.multiblock_scan_u32_inplace_on_stream(
8865 &mut per_probe_offsets,
8866 probe_cap,
8867 &cu_stream,
8868 launch_stream,
8869 runtime,
8870 )?;
8871
8872 let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(num_right as u64);
8874 let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
8875 unsafe {
8877 total_func.clone().launch_on_stream(
8878 &cu_stream,
8879 LaunchConfig {
8880 grid_dim: (1, 1, 1),
8881 block_dim: (1, 1, 1),
8882 shared_mem_bytes: 0,
8883 },
8884 (
8885 &per_probe_offsets,
8886 &per_probe_count,
8887 left.num_rows_device(),
8888 probe_cap,
8889 materialize_capacity_u32,
8890 &d_logical_count,
8891 &d_overflow,
8892 ),
8893 )
8894 }
8895 .map_err(|e| {
8896 XlogError::Kernel(format!(
8897 "hash_join_total_from_scan (csm left_outer) failed: {}",
8898 e
8899 ))
8900 })?;
8901
8902 rec_count.commit(runtime).map_err(|e| {
8903 XlogError::Kernel(format!("csm left_outer: count/scan commit failed: {}", e))
8904 })?;
8905
8906 cu_stream.synchronize().map_err(|e| {
8907 XlogError::Kernel(format!("csm left_outer: sync (count read) failed: {}", e))
8908 })?;
8909 let inner_total = self.read_join_output_count_metadata(&d_logical_count)? as u64;
8910 let inner_clamped = max_output
8911 .map(|limit| (limit as u64).min(inner_total))
8912 .unwrap_or(inner_total);
8913 if inner_clamped > u32::MAX as u64 {
8914 return Err(XlogError::Kernel(format!(
8915 "Join produced {} matched rows which exceeds the u32 index limit",
8916 inner_clamped
8917 )));
8918 }
8919 let inner_count_u32 = inner_clamped as u32;
8920
8921 let materialize_func = device
8923 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
8924 .ok_or_else(|| {
8925 XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
8926 })?;
8927 let d_output_left = self.memory.alloc::<u32>(inner_count_u32.max(1) as usize)?;
8928 let d_output_right = self.memory.alloc::<u32>(inner_count_u32.max(1) as usize)?;
8929
8930 let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
8931 rec_mat.read(&left_packed.hashes);
8932 rec_mat.read(&left_packed.packed_keys);
8933 rec_mat.read(&right_packed.packed_keys);
8934 rec_mat.read(&table.bucket_offsets);
8935 rec_mat.read(&table.bucket_counts);
8936 rec_mat.read(&table.bucket_entries);
8937 rec_mat.read(&table.bucket_entry_hashes);
8938 rec_mat.read(&per_probe_offsets);
8939 rec_mat.read(left.num_rows_device());
8940 rec_mat.write(&d_output_left);
8941 rec_mat.write(&d_output_right);
8942 rec_mat.write(&d_overflow);
8944 rec_mat.preflight(runtime).map_err(|e| {
8945 XlogError::Kernel(format!(
8946 "csm left_outer: materialize preflight failed: {}",
8947 e
8948 ))
8949 })?;
8950 if inner_count_u32 > 0 {
8951 unsafe {
8953 let mut params: Vec<*mut c_void> = vec![
8954 (&left_packed.hashes).as_kernel_param(),
8955 left.num_rows_device().as_kernel_param(),
8956 probe_cap.as_kernel_param(),
8957 (&table.bucket_offsets).as_kernel_param(),
8958 (&table.bucket_counts).as_kernel_param(),
8959 (&table.bucket_entries).as_kernel_param(),
8960 (&table.bucket_entry_hashes).as_kernel_param(),
8961 table.bucket_mask.as_kernel_param(),
8962 (&left_packed.packed_keys).as_kernel_param(),
8963 (&right_packed.packed_keys).as_kernel_param(),
8964 left_packed.key_bytes.as_kernel_param(),
8965 (&per_probe_offsets).as_kernel_param(),
8966 inner_count_u32.as_kernel_param(),
8967 (&d_output_left).as_kernel_param(),
8968 (&d_output_right).as_kernel_param(),
8969 (&d_overflow).as_kernel_param(),
8970 ];
8971 materialize_func
8972 .clone()
8973 .launch_on_stream(&cu_stream, probe_config, &mut params)
8974 .map_err(|e| {
8975 XlogError::Kernel(format!(
8976 "hash_join_probe_v2_materialize (csm left_outer) failed: {}",
8977 e
8978 ))
8979 })?;
8980 }
8981 }
8982 rec_mat.commit(runtime).map_err(|e| {
8983 XlogError::Kernel(format!("csm left_outer: materialize commit failed: {}", e))
8984 })?;
8985
8986 let d_unmatched_mask = self.memory.alloc::<u8>(probe_cap as usize)?;
8988 let unmatched_mask_func = device
8989 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_CSM_UNMATCHED_MASK)
8990 .ok_or_else(|| {
8991 XlogError::Kernel("hash_join_csm_unmatched_mask kernel not found".to_string())
8992 })?;
8993 let mut rec_um = LaunchRecorder::new_strict(launch_stream);
8994 rec_um.read(&per_probe_count);
8995 rec_um.read(left.num_rows_device());
8996 rec_um.write(&d_unmatched_mask);
8997 rec_um.preflight(runtime).map_err(|e| {
8998 XlogError::Kernel(format!(
8999 "csm left_outer: unmatched mask preflight failed: {}",
9000 e
9001 ))
9002 })?;
9003 unsafe {
9005 unmatched_mask_func.clone().launch_on_stream(
9006 &cu_stream,
9007 probe_config,
9008 (
9009 &per_probe_count,
9010 left.num_rows_device(),
9011 probe_cap,
9012 &d_unmatched_mask,
9013 ),
9014 )
9015 }
9016 .map_err(|e| {
9017 XlogError::Kernel(format!(
9018 "hash_join_csm_unmatched_mask (on_stream) failed: {}",
9019 e
9020 ))
9021 })?;
9022 rec_um.commit(runtime).map_err(|e| {
9023 XlogError::Kernel(format!(
9024 "csm left_outer: unmatched mask commit failed: {}",
9025 e
9026 ))
9027 })?;
9028
9029 let unmatched_left = self.compact_buffer_by_device_mask_counted_recorded(
9030 left,
9031 &d_unmatched_mask,
9032 launch_stream,
9033 )?;
9034 let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
9035 let total_rows = (inner_count_u32 as u64) + unmatched_rows;
9036
9037 let combined_schema = self.combine_schemas(left.schema(), right.schema());
9038 if total_rows == 0 {
9039 return self.create_empty_buffer(combined_schema);
9040 }
9041
9042 let inner_left_buf;
9044 let inner_right_buf;
9045 if inner_count_u32 > 0 {
9046 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
9047 for col_idx in 0..left.columns.len() {
9048 let c = left.column(col_idx).ok_or_else(|| {
9049 XlogError::Kernel(format!("Left column {} not found", col_idx))
9050 })?;
9051 rec_gather.read_column(c);
9052 }
9053 for col_idx in 0..right.columns.len() {
9054 let c = right.column(col_idx).ok_or_else(|| {
9055 XlogError::Kernel(format!("Right column {} not found", col_idx))
9056 })?;
9057 rec_gather.read_column(c);
9058 }
9059 rec_gather.read(&d_output_left);
9060 rec_gather.read(&d_output_right);
9061 rec_gather.preflight(runtime).map_err(|e| {
9062 XlogError::Kernel(format!("csm left_outer: gather preflight failed: {}", e))
9063 })?;
9064 inner_left_buf = Some(self.gather_buffer_by_indices_on_stream(
9065 left,
9066 &d_output_left,
9067 inner_count_u32,
9068 &cu_stream,
9069 launch_stream,
9070 runtime,
9071 )?);
9072 inner_right_buf = Some(self.gather_buffer_by_indices_on_stream(
9073 right,
9074 &d_output_right,
9075 inner_count_u32,
9076 &cu_stream,
9077 launch_stream,
9078 runtime,
9079 )?);
9080 rec_gather.commit(runtime).map_err(|e| {
9081 XlogError::Kernel(format!("csm left_outer: gather commit failed: {}", e))
9082 })?;
9083 } else {
9084 inner_left_buf = None;
9085 inner_right_buf = None;
9086 }
9087
9088 let mut rec_d = LaunchRecorder::new_strict(launch_stream);
9091 for col_idx in 0..unmatched_left.columns.len() {
9092 let c = unmatched_left.column(col_idx).ok_or_else(|| {
9093 XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
9094 })?;
9095 rec_d.read_column(c);
9096 }
9097 if let Some(b) = inner_left_buf.as_ref() {
9098 for col_idx in 0..b.columns.len() {
9099 let c = b.column(col_idx).ok_or_else(|| {
9100 XlogError::Kernel(format!("inner_left col {} not found", col_idx))
9101 })?;
9102 rec_d.read_column(c);
9103 }
9104 }
9105 if let Some(b) = inner_right_buf.as_ref() {
9106 for col_idx in 0..b.columns.len() {
9107 let c = b.column(col_idx).ok_or_else(|| {
9108 XlogError::Kernel(format!("inner_right col {} not found", col_idx))
9109 })?;
9110 rec_d.read_column(c);
9111 }
9112 }
9113 rec_d.preflight(runtime).map_err(|e| {
9114 XlogError::Kernel(format!("csm left_outer: phase-E preflight failed: {}", e))
9115 })?;
9116
9117 let inner_rows = inner_count_u32 as u64;
9118 let mut result_columns: Vec<CudaColumn> = Vec::with_capacity(combined_schema.arity());
9119
9120 for col_idx in 0..left.arity() {
9122 let elem_size = left
9123 .schema()
9124 .column_type(col_idx)
9125 .map(|t| t.size_bytes())
9126 .unwrap_or(4);
9127 let inner_bytes = (inner_rows as usize)
9128 .checked_mul(elem_size)
9129 .ok_or_else(|| XlogError::Kernel("csm left_outer: inner_bytes overflow".into()))?;
9130 let unmatched_bytes = (unmatched_rows as usize)
9131 .checked_mul(elem_size)
9132 .ok_or_else(|| {
9133 XlogError::Kernel("csm left_outer: unmatched_bytes overflow".into())
9134 })?;
9135 let total_bytes = inner_bytes
9136 .checked_add(unmatched_bytes)
9137 .ok_or_else(|| XlogError::Kernel("csm left_outer: total_bytes overflow".into()))?;
9138 let out_col = self.memory.alloc::<u8>(total_bytes)?;
9139 let dst_ptr = *out_col.device_ptr();
9140 runtime
9142 .prepare_first_use(&out_col, launch_stream, Access::Write)
9143 .map_err(|e| {
9144 XlogError::Kernel(format!(
9145 "csm left_outer: prepare left out_col {} failed: {}",
9146 col_idx, e
9147 ))
9148 })?;
9149 if inner_bytes > 0 {
9150 let src_col = inner_left_buf
9151 .as_ref()
9152 .expect("inner_count > 0")
9153 .column(col_idx)
9154 .ok_or_else(|| XlogError::Kernel("inner_left col missing".into()))?;
9155 unsafe {
9157 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
9158 dst_ptr,
9159 *src_col.device_ptr(),
9160 inner_bytes,
9161 cu_stream.cu_stream(),
9162 );
9163 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9164 return Err(XlogError::Kernel(format!(
9165 "csm left_outer: dtod inner_left col {} failed: {:?}",
9166 col_idx, res
9167 )));
9168 }
9169 }
9170 }
9171 if unmatched_bytes > 0 {
9172 let src_col = unmatched_left.column(col_idx).ok_or_else(|| {
9173 XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
9174 })?;
9175 unsafe {
9177 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
9178 dst_ptr + inner_bytes as u64,
9179 *src_col.device_ptr(),
9180 unmatched_bytes,
9181 cu_stream.cu_stream(),
9182 );
9183 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9184 return Err(XlogError::Kernel(format!(
9185 "csm left_outer: dtod unmatched col {} failed: {:?}",
9186 col_idx, res
9187 )));
9188 }
9189 }
9190 }
9191 if let Some(b) = out_col.runtime_block() {
9192 runtime
9193 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
9194 .map_err(|e| {
9195 XlogError::Kernel(format!(
9196 "csm left_outer: finish_block_use (left col {}) failed: {}",
9197 col_idx, e
9198 ))
9199 })?;
9200 }
9201 result_columns.push(out_col.into());
9202 }
9203
9204 for col_idx in 0..right.arity() {
9206 let elem_size = right
9207 .schema()
9208 .column_type(col_idx)
9209 .map(|t| t.size_bytes())
9210 .unwrap_or(4);
9211 let inner_bytes = (inner_rows as usize)
9212 .checked_mul(elem_size)
9213 .ok_or_else(|| {
9214 XlogError::Kernel("csm left_outer: right inner_bytes overflow".into())
9215 })?;
9216 let unmatched_bytes = (unmatched_rows as usize)
9217 .checked_mul(elem_size)
9218 .ok_or_else(|| {
9219 XlogError::Kernel("csm left_outer: right unmatched_bytes overflow".into())
9220 })?;
9221 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
9222 XlogError::Kernel("csm left_outer: right total_bytes overflow".into())
9223 })?;
9224 let out_col = self.memory.alloc::<u8>(total_bytes)?;
9225 let dst_ptr = *out_col.device_ptr();
9226 runtime
9228 .prepare_first_use(&out_col, launch_stream, Access::Write)
9229 .map_err(|e| {
9230 XlogError::Kernel(format!(
9231 "csm left_outer: prepare right out_col {} failed: {}",
9232 col_idx, e
9233 ))
9234 })?;
9235 if total_bytes > 0 {
9236 unsafe {
9238 let res = cudarc::driver::sys::cuMemsetD8Async(
9239 dst_ptr,
9240 0,
9241 total_bytes,
9242 cu_stream.cu_stream(),
9243 );
9244 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9245 return Err(XlogError::Kernel(format!(
9246 "csm left_outer: zero-fill right col {} failed: {:?}",
9247 col_idx, res
9248 )));
9249 }
9250 }
9251 }
9252 if inner_bytes > 0 {
9253 let src_col = inner_right_buf
9254 .as_ref()
9255 .expect("inner_count > 0")
9256 .column(col_idx)
9257 .ok_or_else(|| XlogError::Kernel("inner_right col missing".into()))?;
9258 unsafe {
9260 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
9261 dst_ptr,
9262 *src_col.device_ptr(),
9263 inner_bytes,
9264 cu_stream.cu_stream(),
9265 );
9266 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9267 return Err(XlogError::Kernel(format!(
9268 "csm left_outer: dtod inner_right col {} failed: {:?}",
9269 col_idx, res
9270 )));
9271 }
9272 }
9273 }
9274 if let Some(b) = out_col.runtime_block() {
9275 runtime
9276 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
9277 .map_err(|e| {
9278 XlogError::Kernel(format!(
9279 "csm left_outer: finish_block_use (right col {}) failed: {}",
9280 col_idx, e
9281 ))
9282 })?;
9283 }
9284 result_columns.push(out_col.into());
9285 }
9286
9287 rec_d.commit(runtime).map_err(|e| {
9288 XlogError::Kernel(format!("csm left_outer: phase-E commit failed: {}", e))
9289 })?;
9290
9291 if total_rows > u32::MAX as u64 {
9300 return Err(XlogError::Kernel(format!(
9301 "csm left_outer: output row count {} exceeds u32::MAX",
9302 total_rows
9303 )));
9304 }
9305 let total_rows_u32 = total_rows as u32;
9306 let d_num_rows = self.upload_device_row_count(total_rows_u32)?;
9307 Ok(CudaBuffer::from_columns_with_host_count(
9308 result_columns,
9309 total_rows,
9310 d_num_rows,
9311 combined_schema,
9312 total_rows_u32,
9313 ))
9314 }
9315
9316 #[allow(clippy::too_many_arguments)]
9341 pub fn hash_join_inner_v2_with_index_count_scan_materialize_recorded(
9342 &self,
9343 left: &CudaBuffer,
9344 right: &CudaBuffer,
9345 left_keys: &[usize],
9346 right_keys: &[usize],
9347 index: &crate::provider::JoinIndexV2,
9348 max_output: Option<usize>,
9349 launch_stream: StreamId,
9350 ) -> Result<CudaBuffer> {
9351 use crate::launch::LaunchRecorder;
9352
9353 let runtime = self.memory.runtime().ok_or_else(|| {
9354 XlogError::Kernel(
9355 "hash_join_inner_v2_with_index_count_scan_materialize_recorded requires \
9356 a runtime-backed GpuMemoryManager"
9357 .to_string(),
9358 )
9359 })?;
9360 let cu_stream = runtime
9361 .stream_pool()
9362 .resolve(launch_stream)
9363 .ok_or_else(|| {
9364 XlogError::Kernel(format!(
9365 "indexed CSM inner: launch_stream StreamId({}) does not resolve",
9366 launch_stream.0
9367 ))
9368 })?;
9369
9370 let left_rows = self.device_row_count(left)?;
9373 let right_rows = self.device_row_count(right)?;
9374 if left_rows > u32::MAX as usize || right_rows > u32::MAX as usize {
9375 return Err(XlogError::Kernel(format!(
9376 "Join supports at most {} rows per side (left={}, right={})",
9377 u32::MAX,
9378 left_rows,
9379 right_rows
9380 )));
9381 }
9382 if left_rows == 0 || right_rows == 0 {
9383 let combined_schema = self.combine_schemas(left.schema(), right.schema());
9384 return self.create_empty_buffer(combined_schema);
9385 }
9386 if left_keys.is_empty() || right_keys.is_empty() {
9387 return Err(XlogError::Kernel(
9388 "Join requires at least one key column".to_string(),
9389 ));
9390 }
9391 if left_keys.len() != right_keys.len() {
9392 return Err(XlogError::Kernel(
9393 "Left and right key columns must have same length".to_string(),
9394 ));
9395 }
9396 if left_keys.len() > 4 {
9397 return Err(XlogError::Kernel(
9398 "indexed CSM inner: max 4 key columns supported (pack_keys constraint)".to_string(),
9399 ));
9400 }
9401 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
9402 if l >= left.arity() {
9403 return Err(XlogError::Kernel(format!(
9404 "Left key column index {} out of bounds (arity {})",
9405 l,
9406 left.arity()
9407 )));
9408 }
9409 if r >= right.arity() {
9410 return Err(XlogError::Kernel(format!(
9411 "Right key column index {} out of bounds (arity {})",
9412 r,
9413 right.arity()
9414 )));
9415 }
9416 let lt = left.schema().column_type(l);
9417 let rt = right.schema().column_type(r);
9418 if lt != rt {
9419 return Err(XlogError::Kernel(format!(
9420 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
9421 l, lt, r, rt
9422 )));
9423 }
9424 }
9425 if index.right_num_rows() != right_rows as u32 {
9426 return Err(XlogError::Kernel(
9427 "Join index row count does not match right relation".to_string(),
9428 ));
9429 }
9430 if index.right_keys() != right_keys {
9431 return Err(XlogError::Kernel(
9432 "Join index key columns do not match requested right_keys".to_string(),
9433 ));
9434 }
9435
9436 let probe_cap = left.num_rows() as u32;
9437 let table = &index.table;
9438
9439 let left_packed =
9442 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
9443 if left_packed.key_bytes != index.key_bytes {
9444 return Err(XlogError::Kernel(
9445 "Join key byte width mismatch between probe and cached index".to_string(),
9446 ));
9447 }
9448
9449 let device = self.device.inner();
9450 let block_size = 256u32;
9451 let probe_grid = probe_cap.div_ceil(block_size);
9452 let probe_config = LaunchConfig {
9453 grid_dim: (probe_grid, 1, 1),
9454 block_dim: (block_size, 1, 1),
9455 shared_mem_bytes: 0,
9456 };
9457
9458 let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
9460 let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
9461 let d_logical_count = self.memory.alloc::<u32>(1)?;
9462 let d_overflow = self.memory.alloc::<u8>(1)?;
9463 runtime
9465 .prepare_first_use(&d_overflow, launch_stream, Access::Write)
9466 .map_err(|e| {
9467 XlogError::Kernel(format!(
9468 "indexed CSM inner: prepare d_overflow failed: {}",
9469 e
9470 ))
9471 })?;
9472 runtime
9473 .prepare_first_use(&d_logical_count, launch_stream, Access::Write)
9474 .map_err(|e| {
9475 XlogError::Kernel(format!(
9476 "indexed CSM inner: prepare d_logical_count failed: {}",
9477 e
9478 ))
9479 })?;
9480 unsafe {
9483 let res = cudarc::driver::sys::cuMemsetD8Async(
9484 *d_overflow.device_ptr(),
9485 0,
9486 1,
9487 cu_stream.cu_stream(),
9488 );
9489 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9490 return Err(XlogError::Kernel(format!(
9491 "indexed CSM inner: cuMemsetD8Async (d_overflow) failed: {:?}",
9492 res
9493 )));
9494 }
9495 let res = cudarc::driver::sys::cuMemsetD8Async(
9496 *d_logical_count.device_ptr(),
9497 0,
9498 std::mem::size_of::<u32>(),
9499 cu_stream.cu_stream(),
9500 );
9501 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9502 return Err(XlogError::Kernel(format!(
9503 "indexed CSM inner: cuMemsetD8Async (d_logical_count) failed: {:?}",
9504 res
9505 )));
9506 }
9507 }
9508
9509 let count_func = device
9514 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
9515 .ok_or_else(|| {
9516 XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
9517 })?;
9518 let total_func = device
9519 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
9520 .ok_or_else(|| {
9521 XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
9522 })?;
9523
9524 let mut rec_count = LaunchRecorder::new_strict(launch_stream);
9525 rec_count.read(&left_packed.hashes);
9526 rec_count.read(&left_packed.packed_keys);
9527 rec_count.read(&index.packed_keys);
9528 rec_count.read(&table.bucket_offsets);
9529 rec_count.read(&table.bucket_counts);
9530 rec_count.read(&table.bucket_entries);
9531 rec_count.read(&table.bucket_entry_hashes);
9532 rec_count.read(left.num_rows_device());
9533 rec_count.write(&per_probe_count);
9534 rec_count.write(&per_probe_offsets);
9535 rec_count.write(&d_logical_count);
9536 rec_count.write(&d_overflow);
9537 rec_count.preflight(runtime).map_err(|e| {
9538 XlogError::Kernel(format!(
9539 "indexed CSM inner: count/scan preflight failed: {}",
9540 e
9541 ))
9542 })?;
9543
9544 unsafe {
9547 count_func.clone().launch_on_stream(
9548 &cu_stream,
9549 probe_config,
9550 (
9551 &left_packed.hashes,
9552 left.num_rows_device(),
9553 probe_cap,
9554 &table.bucket_offsets,
9555 &table.bucket_counts,
9556 &table.bucket_entries,
9557 &table.bucket_entry_hashes,
9558 table.bucket_mask,
9559 &left_packed.packed_keys,
9560 &index.packed_keys,
9561 index.key_bytes,
9562 &per_probe_count,
9563 ),
9564 )
9565 }
9566 .map_err(|e| {
9567 XlogError::Kernel(format!(
9568 "hash_join_probe_v2_count_per_row (on_stream, indexed) failed: {}",
9569 e
9570 ))
9571 })?;
9572
9573 unsafe {
9577 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
9578 *per_probe_offsets.device_ptr(),
9579 *per_probe_count.device_ptr(),
9580 (probe_cap as usize) * std::mem::size_of::<u32>(),
9581 cu_stream.cu_stream(),
9582 );
9583 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9584 return Err(XlogError::Kernel(format!(
9585 "indexed CSM inner: cuMemcpyDtoDAsync (count → offsets) failed: {:?}",
9586 res
9587 )));
9588 }
9589 }
9590 self.multiblock_scan_u32_inplace_on_stream(
9591 &mut per_probe_offsets,
9592 probe_cap,
9593 &cu_stream,
9594 launch_stream,
9595 runtime,
9596 )?;
9597
9598 let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(right_rows as u64);
9600 let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
9601 unsafe {
9603 total_func.clone().launch_on_stream(
9604 &cu_stream,
9605 LaunchConfig {
9606 grid_dim: (1, 1, 1),
9607 block_dim: (1, 1, 1),
9608 shared_mem_bytes: 0,
9609 },
9610 (
9611 &per_probe_offsets,
9612 &per_probe_count,
9613 left.num_rows_device(),
9614 probe_cap,
9615 materialize_capacity_u32,
9616 &d_logical_count,
9617 &d_overflow,
9618 ),
9619 )
9620 }
9621 .map_err(|e| {
9622 XlogError::Kernel(format!(
9623 "hash_join_total_from_scan (on_stream, indexed) failed: {}",
9624 e
9625 ))
9626 })?;
9627
9628 rec_count.commit(runtime).map_err(|e| {
9629 XlogError::Kernel(format!(
9630 "indexed CSM inner: count/scan commit failed: {}",
9631 e
9632 ))
9633 })?;
9634
9635 cu_stream.synchronize().map_err(|e| {
9636 XlogError::Kernel(format!(
9637 "indexed CSM inner: sync (total read) failed: {}",
9638 e
9639 ))
9640 })?;
9641 let total = self.read_join_output_count_metadata(&d_logical_count)? as u64;
9642 let requested = max_output
9643 .map(|limit| (limit as u64).min(total))
9644 .unwrap_or(total);
9645 if requested == 0 {
9646 let combined_schema = self.combine_schemas(left.schema(), right.schema());
9647 return self.create_empty_buffer(combined_schema);
9648 }
9649 if requested > u32::MAX as u64 {
9650 return Err(XlogError::Kernel(format!(
9651 "Join produced {} rows which exceeds the u32 index limit",
9652 requested
9653 )));
9654 }
9655 let output_capacity = requested as u32;
9656
9657 let d_output_left = self.memory.alloc::<u32>(output_capacity as usize)?;
9659 let d_output_right = self.memory.alloc::<u32>(output_capacity as usize)?;
9660
9661 let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
9662 rec_mat.read(&left_packed.hashes);
9663 rec_mat.read(&left_packed.packed_keys);
9664 rec_mat.read(&index.packed_keys);
9665 rec_mat.read(&table.bucket_offsets);
9666 rec_mat.read(&table.bucket_counts);
9667 rec_mat.read(&table.bucket_entries);
9668 rec_mat.read(&table.bucket_entry_hashes);
9669 rec_mat.read(&per_probe_offsets);
9670 rec_mat.read(left.num_rows_device());
9671 rec_mat.write(&d_output_left);
9672 rec_mat.write(&d_output_right);
9673 rec_mat.write(&d_overflow);
9675 rec_mat.preflight(runtime).map_err(|e| {
9676 XlogError::Kernel(format!(
9677 "indexed CSM inner: materialize preflight failed: {}",
9678 e
9679 ))
9680 })?;
9681
9682 let materialize_func = device
9683 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
9684 .ok_or_else(|| {
9685 XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
9686 })?;
9687 unsafe {
9689 let mut params: Vec<*mut c_void> = vec![
9690 (&left_packed.hashes).as_kernel_param(),
9691 left.num_rows_device().as_kernel_param(),
9692 probe_cap.as_kernel_param(),
9693 (&table.bucket_offsets).as_kernel_param(),
9694 (&table.bucket_counts).as_kernel_param(),
9695 (&table.bucket_entries).as_kernel_param(),
9696 (&table.bucket_entry_hashes).as_kernel_param(),
9697 table.bucket_mask.as_kernel_param(),
9698 (&left_packed.packed_keys).as_kernel_param(),
9699 (&index.packed_keys).as_kernel_param(),
9700 index.key_bytes.as_kernel_param(),
9701 (&per_probe_offsets).as_kernel_param(),
9702 output_capacity.as_kernel_param(),
9703 (&d_output_left).as_kernel_param(),
9704 (&d_output_right).as_kernel_param(),
9705 (&d_overflow).as_kernel_param(),
9706 ];
9707 materialize_func
9708 .clone()
9709 .launch_on_stream(&cu_stream, probe_config, &mut params)
9710 .map_err(|e| {
9711 XlogError::Kernel(format!(
9712 "hash_join_probe_v2_materialize (on_stream, indexed) failed: {}",
9713 e
9714 ))
9715 })?;
9716 }
9717
9718 rec_mat.commit(runtime).map_err(|e| {
9719 XlogError::Kernel(format!(
9720 "indexed CSM inner: materialize commit failed: {}",
9721 e
9722 ))
9723 })?;
9724
9725 cu_stream.synchronize().map_err(|e| {
9726 XlogError::Kernel(format!(
9727 "indexed CSM inner: sync (post-materialize) failed: {}",
9728 e
9729 ))
9730 })?;
9731
9732 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
9734 for col_idx in 0..left.columns.len() {
9735 let c = left
9736 .column(col_idx)
9737 .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
9738 rec_gather.read_column(c);
9739 }
9740 for col_idx in 0..right.columns.len() {
9741 let c = right
9742 .column(col_idx)
9743 .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
9744 rec_gather.read_column(c);
9745 }
9746 rec_gather.read(&d_output_left);
9747 rec_gather.read(&d_output_right);
9748 rec_gather.preflight(runtime).map_err(|e| {
9749 XlogError::Kernel(format!("indexed CSM inner: gather preflight failed: {}", e))
9750 })?;
9751 let gathered_left = self.gather_buffer_by_indices_on_stream(
9752 left,
9753 &d_output_left,
9754 output_capacity,
9755 &cu_stream,
9756 launch_stream,
9757 runtime,
9758 )?;
9759 let gathered_right = self.gather_buffer_by_indices_on_stream(
9760 right,
9761 &d_output_right,
9762 output_capacity,
9763 &cu_stream,
9764 launch_stream,
9765 runtime,
9766 )?;
9767 rec_gather.commit(runtime).map_err(|e| {
9768 XlogError::Kernel(format!("indexed CSM inner: gather commit failed: {}", e))
9769 })?;
9770
9771 let combined_schema = self.combine_schemas(left.schema(), right.schema());
9772 let mut result_columns = Vec::with_capacity(combined_schema.arity());
9773 result_columns.extend(gathered_left.columns);
9774 result_columns.extend(gathered_right.columns);
9775 self.buffer_from_columns(result_columns, output_capacity as u64, combined_schema)
9776 }
9777
9778 #[allow(clippy::too_many_arguments)]
9809 pub fn hash_join_left_outer_v2_with_index_count_scan_materialize_recorded(
9810 &self,
9811 left: &CudaBuffer,
9812 right: &CudaBuffer,
9813 left_keys: &[usize],
9814 right_keys: &[usize],
9815 index: &crate::provider::JoinIndexV2,
9816 max_output: Option<usize>,
9817 launch_stream: StreamId,
9818 ) -> Result<CudaBuffer> {
9819 use crate::launch::LaunchRecorder;
9820
9821 let runtime = self.memory.runtime().ok_or_else(|| {
9822 XlogError::Kernel(
9823 "hash_join_left_outer_v2_with_index_count_scan_materialize_recorded requires \
9824 a runtime-backed GpuMemoryManager"
9825 .to_string(),
9826 )
9827 })?;
9828 let cu_stream = runtime
9829 .stream_pool()
9830 .resolve(launch_stream)
9831 .ok_or_else(|| {
9832 XlogError::Kernel(format!(
9833 "indexed csm left_outer: launch_stream StreamId({}) does not resolve",
9834 launch_stream.0
9835 ))
9836 })?;
9837
9838 let num_left = self.device_row_count(left)?;
9841 let num_right = self.device_row_count(right)?;
9842 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
9843 return Err(XlogError::Kernel(format!(
9844 "Join supports at most {} rows per side (left={}, right={})",
9845 u32::MAX,
9846 num_left,
9847 num_right
9848 )));
9849 }
9850 if num_left == 0 {
9851 let combined_schema = self.combine_schemas(left.schema(), right.schema());
9852 return self.create_empty_buffer(combined_schema);
9853 }
9854 if num_right == 0 {
9855 return self.left_outer_with_nulls(left, right);
9860 }
9861 if left_keys.is_empty() || right_keys.is_empty() {
9862 return Err(XlogError::Kernel(
9863 "Join requires at least one key column".to_string(),
9864 ));
9865 }
9866 if left_keys.len() != right_keys.len() {
9867 return Err(XlogError::Kernel(
9868 "Left and right key columns must have same length".to_string(),
9869 ));
9870 }
9871 if left_keys.len() > 4 {
9872 return Err(XlogError::Kernel(
9873 "indexed csm left_outer: max 4 key columns supported (pack_keys constraint)"
9874 .to_string(),
9875 ));
9876 }
9877 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
9878 if l >= left.arity() {
9879 return Err(XlogError::Kernel(format!(
9880 "Left key column index {} out of bounds (arity {})",
9881 l,
9882 left.arity()
9883 )));
9884 }
9885 if r >= right.arity() {
9886 return Err(XlogError::Kernel(format!(
9887 "Right key column index {} out of bounds (arity {})",
9888 r,
9889 right.arity()
9890 )));
9891 }
9892 let lt = left.schema().column_type(l);
9893 let rt = right.schema().column_type(r);
9894 if lt != rt {
9895 return Err(XlogError::Kernel(format!(
9896 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
9897 l, lt, r, rt
9898 )));
9899 }
9900 }
9901 if index.right_num_rows() != num_right as u32 {
9902 return Err(XlogError::Kernel(
9903 "Join index row count does not match right relation".to_string(),
9904 ));
9905 }
9906 if index.right_keys() != right_keys {
9907 return Err(XlogError::Kernel(
9908 "Join index key columns do not match requested right_keys".to_string(),
9909 ));
9910 }
9911
9912 let probe_cap = u32::try_from(num_left).map_err(|_| {
9914 XlogError::Kernel("indexed csm left_outer: left row count exceeds u32::MAX".to_string())
9915 })?;
9916
9917 let table = &index.table;
9918
9919 let left_packed =
9922 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
9923 if left_packed.key_bytes != index.key_bytes {
9924 return Err(XlogError::Kernel(
9925 "Join key byte width mismatch between probe and cached index".to_string(),
9926 ));
9927 }
9928
9929 let device = self.device.inner();
9930 let block_size = 256u32;
9931 let probe_grid = probe_cap.div_ceil(block_size);
9932 let probe_config = LaunchConfig {
9933 grid_dim: (probe_grid, 1, 1),
9934 block_dim: (block_size, 1, 1),
9935 shared_mem_bytes: 0,
9936 };
9937
9938 let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
9940 let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
9941 let d_logical_count = self.memory.alloc::<u32>(1)?;
9942 let d_overflow = self.memory.alloc::<u8>(1)?;
9943 runtime
9944 .prepare_first_use(&d_overflow, launch_stream, Access::Write)
9945 .map_err(|e| {
9946 XlogError::Kernel(format!(
9947 "indexed csm left_outer: prepare d_overflow failed: {}",
9948 e
9949 ))
9950 })?;
9951 runtime
9952 .prepare_first_use(&d_logical_count, launch_stream, Access::Write)
9953 .map_err(|e| {
9954 XlogError::Kernel(format!(
9955 "indexed csm left_outer: prepare d_logical_count failed: {}",
9956 e
9957 ))
9958 })?;
9959 unsafe {
9962 let res = cudarc::driver::sys::cuMemsetD8Async(
9963 *d_overflow.device_ptr(),
9964 0,
9965 1,
9966 cu_stream.cu_stream(),
9967 );
9968 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9969 return Err(XlogError::Kernel(format!(
9970 "indexed csm left_outer: cuMemsetD8Async (d_overflow) failed: {:?}",
9971 res
9972 )));
9973 }
9974 let res = cudarc::driver::sys::cuMemsetD8Async(
9975 *d_logical_count.device_ptr(),
9976 0,
9977 std::mem::size_of::<u32>(),
9978 cu_stream.cu_stream(),
9979 );
9980 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9981 return Err(XlogError::Kernel(format!(
9982 "indexed csm left_outer: cuMemsetD8Async (d_logical_count) failed: {:?}",
9983 res
9984 )));
9985 }
9986 }
9987
9988 let count_func = device
9989 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
9990 .ok_or_else(|| {
9991 XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
9992 })?;
9993 let total_func = device
9994 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
9995 .ok_or_else(|| {
9996 XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
9997 })?;
9998
9999 let mut rec_count = LaunchRecorder::new_strict(launch_stream);
10000 rec_count.read(&left_packed.hashes);
10001 rec_count.read(&left_packed.packed_keys);
10002 rec_count.read(&index.packed_keys);
10003 rec_count.read(&table.bucket_offsets);
10004 rec_count.read(&table.bucket_counts);
10005 rec_count.read(&table.bucket_entries);
10006 rec_count.read(&table.bucket_entry_hashes);
10007 rec_count.read(left.num_rows_device());
10008 rec_count.write(&per_probe_count);
10009 rec_count.write(&per_probe_offsets);
10010 rec_count.write(&d_logical_count);
10011 rec_count.write(&d_overflow);
10012 rec_count.preflight(runtime).map_err(|e| {
10013 XlogError::Kernel(format!(
10014 "indexed csm left_outer: count/scan preflight failed: {}",
10015 e
10016 ))
10017 })?;
10018
10019 unsafe {
10022 count_func.clone().launch_on_stream(
10023 &cu_stream,
10024 probe_config,
10025 (
10026 &left_packed.hashes,
10027 left.num_rows_device(),
10028 probe_cap,
10029 &table.bucket_offsets,
10030 &table.bucket_counts,
10031 &table.bucket_entries,
10032 &table.bucket_entry_hashes,
10033 table.bucket_mask,
10034 &left_packed.packed_keys,
10035 &index.packed_keys,
10036 index.key_bytes,
10037 &per_probe_count,
10038 ),
10039 )
10040 }
10041 .map_err(|e| {
10042 XlogError::Kernel(format!(
10043 "hash_join_probe_v2_count_per_row (indexed csm left_outer) failed: {}",
10044 e
10045 ))
10046 })?;
10047
10048 unsafe {
10051 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
10052 *per_probe_offsets.device_ptr(),
10053 *per_probe_count.device_ptr(),
10054 (probe_cap as usize) * std::mem::size_of::<u32>(),
10055 cu_stream.cu_stream(),
10056 );
10057 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10058 return Err(XlogError::Kernel(format!(
10059 "indexed csm left_outer: cuMemcpyDtoDAsync (count → offsets) failed: {:?}",
10060 res
10061 )));
10062 }
10063 }
10064 self.multiblock_scan_u32_inplace_on_stream(
10065 &mut per_probe_offsets,
10066 probe_cap,
10067 &cu_stream,
10068 launch_stream,
10069 runtime,
10070 )?;
10071
10072 let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(num_right as u64);
10074 let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
10075 unsafe {
10077 total_func.clone().launch_on_stream(
10078 &cu_stream,
10079 LaunchConfig {
10080 grid_dim: (1, 1, 1),
10081 block_dim: (1, 1, 1),
10082 shared_mem_bytes: 0,
10083 },
10084 (
10085 &per_probe_offsets,
10086 &per_probe_count,
10087 left.num_rows_device(),
10088 probe_cap,
10089 materialize_capacity_u32,
10090 &d_logical_count,
10091 &d_overflow,
10092 ),
10093 )
10094 }
10095 .map_err(|e| {
10096 XlogError::Kernel(format!(
10097 "hash_join_total_from_scan (indexed csm left_outer) failed: {}",
10098 e
10099 ))
10100 })?;
10101
10102 rec_count.commit(runtime).map_err(|e| {
10103 XlogError::Kernel(format!(
10104 "indexed csm left_outer: count/scan commit failed: {}",
10105 e
10106 ))
10107 })?;
10108
10109 cu_stream.synchronize().map_err(|e| {
10110 XlogError::Kernel(format!(
10111 "indexed csm left_outer: sync (count read) failed: {}",
10112 e
10113 ))
10114 })?;
10115 let inner_total = self.read_join_output_count_metadata(&d_logical_count)? as u64;
10116 let inner_clamped = max_output
10117 .map(|limit| (limit as u64).min(inner_total))
10118 .unwrap_or(inner_total);
10119 if inner_clamped > u32::MAX as u64 {
10120 return Err(XlogError::Kernel(format!(
10121 "Join produced {} matched rows which exceeds the u32 index limit",
10122 inner_clamped
10123 )));
10124 }
10125 let inner_count_u32 = inner_clamped as u32;
10126
10127 let materialize_func = device
10129 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
10130 .ok_or_else(|| {
10131 XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
10132 })?;
10133 let d_output_left = self.memory.alloc::<u32>(inner_count_u32.max(1) as usize)?;
10134 let d_output_right = self.memory.alloc::<u32>(inner_count_u32.max(1) as usize)?;
10135
10136 let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
10137 rec_mat.read(&left_packed.hashes);
10138 rec_mat.read(&left_packed.packed_keys);
10139 rec_mat.read(&index.packed_keys);
10140 rec_mat.read(&table.bucket_offsets);
10141 rec_mat.read(&table.bucket_counts);
10142 rec_mat.read(&table.bucket_entries);
10143 rec_mat.read(&table.bucket_entry_hashes);
10144 rec_mat.read(&per_probe_offsets);
10145 rec_mat.read(left.num_rows_device());
10146 rec_mat.write(&d_output_left);
10147 rec_mat.write(&d_output_right);
10148 rec_mat.write(&d_overflow);
10150 rec_mat.preflight(runtime).map_err(|e| {
10151 XlogError::Kernel(format!(
10152 "indexed csm left_outer: materialize preflight failed: {}",
10153 e
10154 ))
10155 })?;
10156 if inner_count_u32 > 0 {
10157 unsafe {
10159 let mut params: Vec<*mut c_void> = vec![
10160 (&left_packed.hashes).as_kernel_param(),
10161 left.num_rows_device().as_kernel_param(),
10162 probe_cap.as_kernel_param(),
10163 (&table.bucket_offsets).as_kernel_param(),
10164 (&table.bucket_counts).as_kernel_param(),
10165 (&table.bucket_entries).as_kernel_param(),
10166 (&table.bucket_entry_hashes).as_kernel_param(),
10167 table.bucket_mask.as_kernel_param(),
10168 (&left_packed.packed_keys).as_kernel_param(),
10169 (&index.packed_keys).as_kernel_param(),
10170 index.key_bytes.as_kernel_param(),
10171 (&per_probe_offsets).as_kernel_param(),
10172 inner_count_u32.as_kernel_param(),
10173 (&d_output_left).as_kernel_param(),
10174 (&d_output_right).as_kernel_param(),
10175 (&d_overflow).as_kernel_param(),
10176 ];
10177 materialize_func
10178 .clone()
10179 .launch_on_stream(&cu_stream, probe_config, &mut params)
10180 .map_err(|e| {
10181 XlogError::Kernel(format!(
10182 "hash_join_probe_v2_materialize (indexed csm left_outer) failed: {}",
10183 e
10184 ))
10185 })?;
10186 }
10187 }
10188 rec_mat.commit(runtime).map_err(|e| {
10189 XlogError::Kernel(format!(
10190 "indexed csm left_outer: materialize commit failed: {}",
10191 e
10192 ))
10193 })?;
10194
10195 let d_unmatched_mask = self.memory.alloc::<u8>(probe_cap as usize)?;
10197 let unmatched_mask_func = device
10198 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_CSM_UNMATCHED_MASK)
10199 .ok_or_else(|| {
10200 XlogError::Kernel("hash_join_csm_unmatched_mask kernel not found".to_string())
10201 })?;
10202 let mut rec_um = LaunchRecorder::new_strict(launch_stream);
10203 rec_um.read(&per_probe_count);
10204 rec_um.read(left.num_rows_device());
10205 rec_um.write(&d_unmatched_mask);
10206 rec_um.preflight(runtime).map_err(|e| {
10207 XlogError::Kernel(format!(
10208 "indexed csm left_outer: unmatched mask preflight failed: {}",
10209 e
10210 ))
10211 })?;
10212 unsafe {
10214 unmatched_mask_func.clone().launch_on_stream(
10215 &cu_stream,
10216 probe_config,
10217 (
10218 &per_probe_count,
10219 left.num_rows_device(),
10220 probe_cap,
10221 &d_unmatched_mask,
10222 ),
10223 )
10224 }
10225 .map_err(|e| {
10226 XlogError::Kernel(format!(
10227 "hash_join_csm_unmatched_mask (indexed csm left_outer) failed: {}",
10228 e
10229 ))
10230 })?;
10231 rec_um.commit(runtime).map_err(|e| {
10232 XlogError::Kernel(format!(
10233 "indexed csm left_outer: unmatched mask commit failed: {}",
10234 e
10235 ))
10236 })?;
10237
10238 let unmatched_left = self.compact_buffer_by_device_mask_counted_recorded(
10239 left,
10240 &d_unmatched_mask,
10241 launch_stream,
10242 )?;
10243 let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
10244 let total_rows = (inner_count_u32 as u64) + unmatched_rows;
10245
10246 let combined_schema = self.combine_schemas(left.schema(), right.schema());
10247 if total_rows == 0 {
10248 return self.create_empty_buffer(combined_schema);
10249 }
10250
10251 let inner_left_buf;
10253 let inner_right_buf;
10254 if inner_count_u32 > 0 {
10255 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
10256 for col_idx in 0..left.columns.len() {
10257 let c = left.column(col_idx).ok_or_else(|| {
10258 XlogError::Kernel(format!("Left column {} not found", col_idx))
10259 })?;
10260 rec_gather.read_column(c);
10261 }
10262 for col_idx in 0..right.columns.len() {
10263 let c = right.column(col_idx).ok_or_else(|| {
10264 XlogError::Kernel(format!("Right column {} not found", col_idx))
10265 })?;
10266 rec_gather.read_column(c);
10267 }
10268 rec_gather.read(&d_output_left);
10269 rec_gather.read(&d_output_right);
10270 rec_gather.preflight(runtime).map_err(|e| {
10271 XlogError::Kernel(format!(
10272 "indexed csm left_outer: gather preflight failed: {}",
10273 e
10274 ))
10275 })?;
10276 inner_left_buf = Some(self.gather_buffer_by_indices_on_stream(
10277 left,
10278 &d_output_left,
10279 inner_count_u32,
10280 &cu_stream,
10281 launch_stream,
10282 runtime,
10283 )?);
10284 inner_right_buf = Some(self.gather_buffer_by_indices_on_stream(
10285 right,
10286 &d_output_right,
10287 inner_count_u32,
10288 &cu_stream,
10289 launch_stream,
10290 runtime,
10291 )?);
10292 rec_gather.commit(runtime).map_err(|e| {
10293 XlogError::Kernel(format!(
10294 "indexed csm left_outer: gather commit failed: {}",
10295 e
10296 ))
10297 })?;
10298 } else {
10299 inner_left_buf = None;
10300 inner_right_buf = None;
10301 }
10302
10303 let mut rec_d = LaunchRecorder::new_strict(launch_stream);
10305 for col_idx in 0..unmatched_left.columns.len() {
10306 let c = unmatched_left.column(col_idx).ok_or_else(|| {
10307 XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
10308 })?;
10309 rec_d.read_column(c);
10310 }
10311 if let Some(b) = inner_left_buf.as_ref() {
10312 for col_idx in 0..b.columns.len() {
10313 let c = b.column(col_idx).ok_or_else(|| {
10314 XlogError::Kernel(format!("inner_left col {} not found", col_idx))
10315 })?;
10316 rec_d.read_column(c);
10317 }
10318 }
10319 if let Some(b) = inner_right_buf.as_ref() {
10320 for col_idx in 0..b.columns.len() {
10321 let c = b.column(col_idx).ok_or_else(|| {
10322 XlogError::Kernel(format!("inner_right col {} not found", col_idx))
10323 })?;
10324 rec_d.read_column(c);
10325 }
10326 }
10327 rec_d.preflight(runtime).map_err(|e| {
10328 XlogError::Kernel(format!(
10329 "indexed csm left_outer: phase-E preflight failed: {}",
10330 e
10331 ))
10332 })?;
10333
10334 let inner_rows = inner_count_u32 as u64;
10335 let mut result_columns: Vec<CudaColumn> = Vec::with_capacity(combined_schema.arity());
10336
10337 for col_idx in 0..left.arity() {
10339 let elem_size = left
10340 .schema()
10341 .column_type(col_idx)
10342 .map(|t| t.size_bytes())
10343 .unwrap_or(4);
10344 let inner_bytes = (inner_rows as usize)
10345 .checked_mul(elem_size)
10346 .ok_or_else(|| {
10347 XlogError::Kernel("indexed csm left_outer: inner_bytes overflow".into())
10348 })?;
10349 let unmatched_bytes = (unmatched_rows as usize)
10350 .checked_mul(elem_size)
10351 .ok_or_else(|| {
10352 XlogError::Kernel("indexed csm left_outer: unmatched_bytes overflow".into())
10353 })?;
10354 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
10355 XlogError::Kernel("indexed csm left_outer: total_bytes overflow".into())
10356 })?;
10357 let out_col = self.memory.alloc::<u8>(total_bytes)?;
10358 let dst_ptr = *out_col.device_ptr();
10359 runtime
10360 .prepare_first_use(&out_col, launch_stream, Access::Write)
10361 .map_err(|e| {
10362 XlogError::Kernel(format!(
10363 "indexed csm left_outer: prepare left out_col {} failed: {}",
10364 col_idx, e
10365 ))
10366 })?;
10367 if inner_bytes > 0 {
10368 let src_col = inner_left_buf
10369 .as_ref()
10370 .expect("inner_count > 0")
10371 .column(col_idx)
10372 .ok_or_else(|| XlogError::Kernel("inner_left col missing".into()))?;
10373 unsafe {
10375 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
10376 dst_ptr,
10377 *src_col.device_ptr(),
10378 inner_bytes,
10379 cu_stream.cu_stream(),
10380 );
10381 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10382 return Err(XlogError::Kernel(format!(
10383 "indexed csm left_outer: dtod inner_left col {} failed: {:?}",
10384 col_idx, res
10385 )));
10386 }
10387 }
10388 }
10389 if unmatched_bytes > 0 {
10390 let src_col = unmatched_left.column(col_idx).ok_or_else(|| {
10391 XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
10392 })?;
10393 unsafe {
10395 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
10396 dst_ptr + inner_bytes as u64,
10397 *src_col.device_ptr(),
10398 unmatched_bytes,
10399 cu_stream.cu_stream(),
10400 );
10401 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10402 return Err(XlogError::Kernel(format!(
10403 "indexed csm left_outer: dtod unmatched col {} failed: {:?}",
10404 col_idx, res
10405 )));
10406 }
10407 }
10408 }
10409 if let Some(b) = out_col.runtime_block() {
10410 runtime
10411 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
10412 .map_err(|e| {
10413 XlogError::Kernel(format!(
10414 "indexed csm left_outer: finish_block_use (left col {}) failed: {}",
10415 col_idx, e
10416 ))
10417 })?;
10418 }
10419 result_columns.push(out_col.into());
10420 }
10421
10422 for col_idx in 0..right.arity() {
10424 let elem_size = right
10425 .schema()
10426 .column_type(col_idx)
10427 .map(|t| t.size_bytes())
10428 .unwrap_or(4);
10429 let inner_bytes = (inner_rows as usize)
10430 .checked_mul(elem_size)
10431 .ok_or_else(|| {
10432 XlogError::Kernel("indexed csm left_outer: right inner_bytes overflow".into())
10433 })?;
10434 let unmatched_bytes = (unmatched_rows as usize)
10435 .checked_mul(elem_size)
10436 .ok_or_else(|| {
10437 XlogError::Kernel(
10438 "indexed csm left_outer: right unmatched_bytes overflow".into(),
10439 )
10440 })?;
10441 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
10442 XlogError::Kernel("indexed csm left_outer: right total_bytes overflow".into())
10443 })?;
10444 let out_col = self.memory.alloc::<u8>(total_bytes)?;
10445 let dst_ptr = *out_col.device_ptr();
10446 runtime
10447 .prepare_first_use(&out_col, launch_stream, Access::Write)
10448 .map_err(|e| {
10449 XlogError::Kernel(format!(
10450 "indexed csm left_outer: prepare right out_col {} failed: {}",
10451 col_idx, e
10452 ))
10453 })?;
10454 if total_bytes > 0 {
10455 unsafe {
10457 let res = cudarc::driver::sys::cuMemsetD8Async(
10458 dst_ptr,
10459 0,
10460 total_bytes,
10461 cu_stream.cu_stream(),
10462 );
10463 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10464 return Err(XlogError::Kernel(format!(
10465 "indexed csm left_outer: zero-fill right col {} failed: {:?}",
10466 col_idx, res
10467 )));
10468 }
10469 }
10470 }
10471 if inner_bytes > 0 {
10472 let src_col = inner_right_buf
10473 .as_ref()
10474 .expect("inner_count > 0")
10475 .column(col_idx)
10476 .ok_or_else(|| XlogError::Kernel("inner_right col missing".into()))?;
10477 unsafe {
10479 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
10480 dst_ptr,
10481 *src_col.device_ptr(),
10482 inner_bytes,
10483 cu_stream.cu_stream(),
10484 );
10485 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10486 return Err(XlogError::Kernel(format!(
10487 "indexed csm left_outer: dtod inner_right col {} failed: {:?}",
10488 col_idx, res
10489 )));
10490 }
10491 }
10492 }
10493 if let Some(b) = out_col.runtime_block() {
10494 runtime
10495 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
10496 .map_err(|e| {
10497 XlogError::Kernel(format!(
10498 "indexed csm left_outer: finish_block_use (right col {}) failed: {}",
10499 col_idx, e
10500 ))
10501 })?;
10502 }
10503 result_columns.push(out_col.into());
10504 }
10505
10506 rec_d.commit(runtime).map_err(|e| {
10507 XlogError::Kernel(format!(
10508 "indexed csm left_outer: phase-E commit failed: {}",
10509 e
10510 ))
10511 })?;
10512
10513 if total_rows > u32::MAX as u64 {
10515 return Err(XlogError::Kernel(format!(
10516 "indexed csm left_outer: output row count {} exceeds u32::MAX",
10517 total_rows
10518 )));
10519 }
10520 let total_rows_u32 = total_rows as u32;
10521 let d_num_rows = self.upload_device_row_count(total_rows_u32)?;
10522 Ok(CudaBuffer::from_columns_with_host_count(
10523 result_columns,
10524 total_rows,
10525 d_num_rows,
10526 combined_schema,
10527 total_rows_u32,
10528 ))
10529 }
10530
10531 #[allow(clippy::too_many_arguments)]
10546 pub fn hash_join_v2_recorded(
10547 &self,
10548 left: &CudaBuffer,
10549 right: &CudaBuffer,
10550 left_keys: &[usize],
10551 right_keys: &[usize],
10552 join_type: JoinType,
10553 max_output: Option<usize>,
10554 launch_stream: StreamId,
10555 ) -> Result<CudaBuffer> {
10556 let csm_on = Self::use_recorded_csm_env();
10557 match join_type {
10558 JoinType::Inner => {
10559 if csm_on {
10560 self.csm_invocations.fetch_add(1, Ordering::Relaxed);
10561 self.hash_join_inner_v2_count_scan_materialize_recorded(
10562 left,
10563 right,
10564 left_keys,
10565 right_keys,
10566 max_output,
10567 launch_stream,
10568 )
10569 } else {
10570 self.hash_join_inner_v2_recorded(
10571 left,
10572 right,
10573 left_keys,
10574 right_keys,
10575 max_output,
10576 launch_stream,
10577 )
10578 }
10579 }
10580 JoinType::Semi => self.hash_join_semi_or_anti_v2_recorded(
10581 left,
10582 right,
10583 left_keys,
10584 right_keys,
10585 false,
10586 launch_stream,
10587 ),
10588 JoinType::Anti => self.hash_join_semi_or_anti_v2_recorded(
10589 left,
10590 right,
10591 left_keys,
10592 right_keys,
10593 true,
10594 launch_stream,
10595 ),
10596 JoinType::LeftOuter => {
10597 if csm_on {
10598 self.csm_invocations.fetch_add(1, Ordering::Relaxed);
10599 self.hash_join_left_outer_v2_count_scan_materialize_recorded(
10600 left,
10601 right,
10602 left_keys,
10603 right_keys,
10604 max_output,
10605 launch_stream,
10606 )
10607 } else {
10608 self.hash_join_left_outer_v2_recorded(
10609 left,
10610 right,
10611 left_keys,
10612 right_keys,
10613 max_output,
10614 launch_stream,
10615 )
10616 }
10617 }
10618 }
10619 }
10620
10621 fn hash_join_left_outer_v2_recorded(
10642 &self,
10643 left: &CudaBuffer,
10644 right: &CudaBuffer,
10645 left_keys: &[usize],
10646 right_keys: &[usize],
10647 max_output: Option<usize>,
10648 launch_stream: StreamId,
10649 ) -> Result<CudaBuffer> {
10650 use crate::launch::LaunchRecorder;
10651
10652 let runtime = self.memory.runtime().ok_or_else(|| {
10653 XlogError::Kernel(
10654 "hash_join_v2_recorded (left_outer) requires a runtime-backed GpuMemoryManager"
10655 .to_string(),
10656 )
10657 })?;
10658 let cu_stream = runtime
10659 .stream_pool()
10660 .resolve(launch_stream)
10661 .ok_or_else(|| {
10662 XlogError::Kernel(format!(
10663 "hash_join_v2_recorded (left_outer): launch_stream StreamId({}) does not resolve",
10664 launch_stream.0
10665 ))
10666 })?;
10667
10668 let num_left = self.device_row_count(left)?;
10669 let num_right = self.device_row_count(right)?;
10670 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
10671 return Err(XlogError::Kernel(format!(
10672 "Join supports at most {} rows per side (left={}, right={})",
10673 u32::MAX,
10674 num_left,
10675 num_right
10676 )));
10677 }
10678 if num_left == 0 {
10679 let combined_schema = self.combine_schemas(left.schema(), right.schema());
10680 return self.create_empty_buffer(combined_schema);
10681 }
10682 if num_right == 0 {
10683 return self.left_outer_with_nulls(left, right);
10688 }
10689 if left_keys.is_empty() || right_keys.is_empty() {
10690 return Err(XlogError::Kernel(
10691 "Join requires at least one key column".to_string(),
10692 ));
10693 }
10694 if left_keys.len() != right_keys.len() {
10695 return Err(XlogError::Kernel(
10696 "Left and right key columns must have same length".to_string(),
10697 ));
10698 }
10699 if left_keys.len() > 4 {
10700 return Err(XlogError::Kernel(
10701 "hash_join_v2_recorded (left_outer): max 4 key columns supported \
10702 (pack_keys constraint)"
10703 .to_string(),
10704 ));
10705 }
10706 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
10707 let lt = left.schema().column_type(l);
10708 let rt = right.schema().column_type(r);
10709 if lt != rt {
10710 return Err(XlogError::Kernel(format!(
10711 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
10712 l, lt, r, rt
10713 )));
10714 }
10715 }
10716
10717 let num_left = num_left as u32;
10718 let num_right = num_right as u32;
10719
10720 let left_packed =
10721 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
10722 let right_packed =
10723 self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
10724 let table = self.build_hash_table_v2_on_stream(
10725 &right_packed.hashes,
10726 num_right,
10727 &cu_stream,
10728 launch_stream,
10729 runtime,
10730 )?;
10731
10732 let device = self.device.inner();
10733 let block_size = 256u32;
10734 let grid_size = num_left.div_ceil(block_size);
10735 let cfg = LaunchConfig {
10736 grid_dim: (grid_size, 1, 1),
10737 block_dim: (block_size, 1, 1),
10738 shared_mem_bytes: 0,
10739 };
10740
10741 let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
10745 let d_count_only = self.memory.alloc::<u32>(1)?;
10746 let d_dummy_left = self.memory.alloc::<u32>(1)?;
10747 let d_dummy_right = self.memory.alloc::<u32>(1)?;
10748 runtime
10752 .prepare_first_use(&d_count_only, launch_stream, Access::Write)
10753 .map_err(|e| {
10754 XlogError::Kernel(format!(
10755 "left_outer recorded: prepare d_count_only failed: {}",
10756 e
10757 ))
10758 })?;
10759 unsafe {
10761 let res = cudarc::driver::sys::cuMemsetD8Async(
10762 *d_count_only.device_ptr(),
10763 0,
10764 std::mem::size_of::<u32>(),
10765 cu_stream.cu_stream(),
10766 );
10767 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10768 return Err(XlogError::Kernel(format!(
10769 "cuMemsetD8Async (left_outer d_count_only) failed: {:?}",
10770 res
10771 )));
10772 }
10773 }
10774
10775 let semi_func = device
10776 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
10777 .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
10778 let probe_func = device
10779 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
10780 .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
10781
10782 let mut rec_a = LaunchRecorder::new_strict(launch_stream);
10783 rec_a.read(&left_packed.hashes);
10784 rec_a.read(&left_packed.packed_keys);
10785 rec_a.read(&right_packed.packed_keys);
10786 rec_a.read(&table.bucket_offsets);
10787 rec_a.read(&table.bucket_counts);
10788 rec_a.read(&table.bucket_entries);
10789 rec_a.read(&table.bucket_entry_hashes);
10790 rec_a.write(&d_has_match);
10791 rec_a.write(&d_count_only);
10792 rec_a.write(&d_dummy_left);
10793 rec_a.write(&d_dummy_right);
10794 rec_a.preflight(runtime).map_err(|e| {
10795 XlogError::Kernel(format!(
10796 "hash_join_v2_recorded (left_outer): semi/count preflight failed: {}",
10797 e
10798 ))
10799 })?;
10800
10801 unsafe {
10803 semi_func.clone().launch_on_stream(
10804 &cu_stream,
10805 cfg,
10806 (
10807 &left_packed.hashes,
10808 num_left,
10809 &table.bucket_offsets,
10810 &table.bucket_counts,
10811 &table.bucket_entries,
10812 &table.bucket_entry_hashes,
10813 table.bucket_mask,
10814 &left_packed.packed_keys,
10815 &right_packed.packed_keys,
10816 left_packed.key_bytes,
10817 &d_has_match,
10818 ),
10819 )
10820 }
10821 .map_err(|e| XlogError::Kernel(format!("hash_join_semi (on_stream) failed: {}", e)))?;
10822
10823 let max_output_count_only = 0u32;
10824 unsafe {
10826 let mut params: Vec<*mut c_void> = vec![
10827 (&left_packed.hashes).as_kernel_param(),
10828 num_left.as_kernel_param(),
10829 (&table.bucket_offsets).as_kernel_param(),
10830 (&table.bucket_counts).as_kernel_param(),
10831 (&table.bucket_entries).as_kernel_param(),
10832 (&table.bucket_entry_hashes).as_kernel_param(),
10833 table.bucket_mask.as_kernel_param(),
10834 (&left_packed.packed_keys).as_kernel_param(),
10835 (&right_packed.packed_keys).as_kernel_param(),
10836 left_packed.key_bytes.as_kernel_param(),
10837 (&d_dummy_left).as_kernel_param(),
10838 (&d_dummy_right).as_kernel_param(),
10839 (&d_count_only).as_kernel_param(),
10840 max_output_count_only.as_kernel_param(),
10841 ];
10842 probe_func
10843 .clone()
10844 .launch_on_stream(&cu_stream, cfg, &mut params)
10845 .map_err(|e| {
10846 XlogError::Kernel(format!(
10847 "hash_join_probe_v2 (count, on_stream, left_outer) failed: {}",
10848 e
10849 ))
10850 })?;
10851 }
10852
10853 rec_a.commit(runtime).map_err(|e| {
10854 XlogError::Kernel(format!(
10855 "hash_join_v2_recorded (left_outer): semi/count commit failed: {}",
10856 e
10857 ))
10858 })?;
10859
10860 cu_stream.synchronize().map_err(|e| {
10862 XlogError::Kernel(format!(
10863 "hash_join_v2_recorded (left_outer): sync (count read) failed: {}",
10864 e
10865 ))
10866 })?;
10867 let full_inner = self.read_join_output_count_metadata(&d_count_only)? as u64;
10868 let requested_inner = max_output
10869 .map(|limit| (limit as u64).min(full_inner))
10870 .unwrap_or(full_inner);
10871 if requested_inner > u32::MAX as u64 {
10872 return Err(XlogError::Kernel(format!(
10873 "Join produced {} rows which exceeds the u32 index limit",
10874 requested_inner
10875 )));
10876 }
10877 let max_output_u32 = requested_inner as u32;
10878 let alloc_len = (requested_inner.max(1)) as usize;
10879
10880 let d_output_left = self.memory.alloc::<u32>(alloc_len)?;
10881 let d_output_right = self.memory.alloc::<u32>(alloc_len)?;
10882 let d_output_count = self.memory.alloc::<u32>(1)?;
10883 runtime
10886 .prepare_first_use(&d_output_count, launch_stream, Access::Write)
10887 .map_err(|e| {
10888 XlogError::Kernel(format!(
10889 "left_outer recorded: prepare d_output_count failed: {}",
10890 e
10891 ))
10892 })?;
10893 unsafe {
10895 let res = cudarc::driver::sys::cuMemsetD8Async(
10896 *d_output_count.device_ptr(),
10897 0,
10898 std::mem::size_of::<u32>(),
10899 cu_stream.cu_stream(),
10900 );
10901 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10902 return Err(XlogError::Kernel(format!(
10903 "cuMemsetD8Async (left_outer d_output_count) failed: {:?}",
10904 res
10905 )));
10906 }
10907 }
10908
10909 let mut rec_b = LaunchRecorder::new_strict(launch_stream);
10910 rec_b.read(&left_packed.hashes);
10911 rec_b.read(&left_packed.packed_keys);
10912 rec_b.read(&right_packed.packed_keys);
10913 rec_b.read(&table.bucket_offsets);
10914 rec_b.read(&table.bucket_counts);
10915 rec_b.read(&table.bucket_entries);
10916 rec_b.read(&table.bucket_entry_hashes);
10917 rec_b.write(&d_output_left);
10918 rec_b.write(&d_output_right);
10919 rec_b.write(&d_output_count);
10920 rec_b.preflight(runtime).map_err(|e| {
10921 XlogError::Kernel(format!(
10922 "hash_join_v2_recorded (left_outer): materialize preflight failed: {}",
10923 e
10924 ))
10925 })?;
10926
10927 unsafe {
10929 let mut params: Vec<*mut c_void> = vec![
10930 (&left_packed.hashes).as_kernel_param(),
10931 num_left.as_kernel_param(),
10932 (&table.bucket_offsets).as_kernel_param(),
10933 (&table.bucket_counts).as_kernel_param(),
10934 (&table.bucket_entries).as_kernel_param(),
10935 (&table.bucket_entry_hashes).as_kernel_param(),
10936 table.bucket_mask.as_kernel_param(),
10937 (&left_packed.packed_keys).as_kernel_param(),
10938 (&right_packed.packed_keys).as_kernel_param(),
10939 left_packed.key_bytes.as_kernel_param(),
10940 (&d_output_left).as_kernel_param(),
10941 (&d_output_right).as_kernel_param(),
10942 (&d_output_count).as_kernel_param(),
10943 max_output_u32.as_kernel_param(),
10944 ];
10945 probe_func
10946 .clone()
10947 .launch_on_stream(&cu_stream, cfg, &mut params)
10948 .map_err(|e| {
10949 XlogError::Kernel(format!(
10950 "hash_join_probe_v2 (materialize, on_stream, left_outer) failed: {}",
10951 e
10952 ))
10953 })?;
10954 }
10955
10956 rec_b.commit(runtime).map_err(|e| {
10957 XlogError::Kernel(format!(
10958 "hash_join_v2_recorded (left_outer): materialize commit failed: {}",
10959 e
10960 ))
10961 })?;
10962
10963 cu_stream.synchronize().map_err(|e| {
10964 XlogError::Kernel(format!(
10965 "hash_join_v2_recorded (left_outer): sync (materialize read) failed: {}",
10966 e
10967 ))
10968 })?;
10969 let inner_count = self
10970 .read_join_output_count_metadata(&d_output_count)?
10971 .min(max_output_u32);
10972
10973 let d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
10976 let mask_not_fn = device
10977 .get_func(FILTER_MODULE, filter_kernels::MASK_NOT)
10978 .ok_or_else(|| XlogError::Kernel("mask_not kernel not found".to_string()))?;
10979
10980 let mut rec_c = LaunchRecorder::new_strict(launch_stream);
10981 rec_c.read(&d_has_match);
10982 rec_c.write(&d_no_match);
10983 rec_c.preflight(runtime).map_err(|e| {
10984 XlogError::Kernel(format!(
10985 "hash_join_v2_recorded (left_outer): mask_not preflight failed: {}",
10986 e
10987 ))
10988 })?;
10989 unsafe {
10991 mask_not_fn.clone().launch_on_stream(
10992 &cu_stream,
10993 cfg,
10994 (&d_has_match, &d_no_match, num_left),
10995 )
10996 }
10997 .map_err(|e| XlogError::Kernel(format!("mask_not (on_stream) failed: {}", e)))?;
10998 rec_c.commit(runtime).map_err(|e| {
10999 XlogError::Kernel(format!(
11000 "hash_join_v2_recorded (left_outer): mask_not commit failed: {}",
11001 e
11002 ))
11003 })?;
11004
11005 let unmatched_left =
11006 self.compact_buffer_by_device_mask_counted_recorded(left, &d_no_match, launch_stream)?;
11007 let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
11008 let total_rows = (inner_count as u64) + unmatched_rows;
11009
11010 let combined_schema = self.combine_schemas(left.schema(), right.schema());
11011 if total_rows == 0 {
11012 return self.create_empty_buffer(combined_schema);
11013 }
11014
11015 let inner_count_u32 = inner_count;
11024 let inner_left_buf;
11025 let inner_right_buf;
11026 if inner_count > 0 {
11027 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
11028 for col_idx in 0..left.columns.len() {
11029 let c = left.column(col_idx).ok_or_else(|| {
11030 XlogError::Kernel(format!("Left column {} not found", col_idx))
11031 })?;
11032 rec_gather.read_column(c);
11033 }
11034 for col_idx in 0..right.columns.len() {
11035 let c = right.column(col_idx).ok_or_else(|| {
11036 XlogError::Kernel(format!("Right column {} not found", col_idx))
11037 })?;
11038 rec_gather.read_column(c);
11039 }
11040 rec_gather.read(&d_output_left);
11041 rec_gather.read(&d_output_right);
11042 rec_gather.preflight(runtime).map_err(|e| {
11043 XlogError::Kernel(format!(
11044 "hash_join_v2_recorded (left_outer): gather preflight failed: {}",
11045 e
11046 ))
11047 })?;
11048 inner_left_buf = Some(self.gather_buffer_by_indices_on_stream(
11049 left,
11050 &d_output_left,
11051 inner_count_u32,
11052 &cu_stream,
11053 launch_stream,
11054 runtime,
11055 )?);
11056 inner_right_buf = Some(self.gather_buffer_by_indices_on_stream(
11057 right,
11058 &d_output_right,
11059 inner_count_u32,
11060 &cu_stream,
11061 launch_stream,
11062 runtime,
11063 )?);
11064 rec_gather.commit(runtime).map_err(|e| {
11065 XlogError::Kernel(format!(
11066 "hash_join_v2_recorded (left_outer): gather commit failed: {}",
11067 e
11068 ))
11069 })?;
11070 } else {
11071 inner_left_buf = None;
11072 inner_right_buf = None;
11073 }
11074
11075 let mut rec_d = LaunchRecorder::new_strict(launch_stream);
11092 for col_idx in 0..unmatched_left.columns.len() {
11093 let c = unmatched_left.column(col_idx).ok_or_else(|| {
11094 XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
11095 })?;
11096 rec_d.read_column(c);
11097 }
11098 if let Some(b) = inner_left_buf.as_ref() {
11099 for col_idx in 0..b.columns.len() {
11100 let c = b.column(col_idx).ok_or_else(|| {
11101 XlogError::Kernel(format!("inner_left col {} not found", col_idx))
11102 })?;
11103 rec_d.read_column(c);
11104 }
11105 }
11106 if let Some(b) = inner_right_buf.as_ref() {
11107 for col_idx in 0..b.columns.len() {
11108 let c = b.column(col_idx).ok_or_else(|| {
11109 XlogError::Kernel(format!("inner_right col {} not found", col_idx))
11110 })?;
11111 rec_d.read_column(c);
11112 }
11113 }
11114 rec_d.preflight(runtime).map_err(|e| {
11115 XlogError::Kernel(format!(
11116 "hash_join_v2_recorded (left_outer): step-D preflight failed: {}",
11117 e
11118 ))
11119 })?;
11120
11121 let mut result_columns: Vec<CudaColumn> = Vec::with_capacity(combined_schema.arity());
11122 let inner_rows = inner_count as u64;
11123
11124 for col_idx in 0..left.arity() {
11126 let elem_size = left
11127 .schema()
11128 .column_type(col_idx)
11129 .map(|t| t.size_bytes())
11130 .unwrap_or(4);
11131 let inner_bytes = (inner_rows as usize)
11132 .checked_mul(elem_size)
11133 .ok_or_else(|| {
11134 XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
11135 })?;
11136 let unmatched_bytes = (unmatched_rows as usize)
11137 .checked_mul(elem_size)
11138 .ok_or_else(|| {
11139 XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
11140 })?;
11141 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
11142 XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
11143 })?;
11144
11145 let out_col = self.memory.alloc::<u8>(total_bytes)?;
11146 let dst_ptr = *out_col.device_ptr();
11147 runtime
11150 .prepare_first_use(&out_col, launch_stream, Access::Write)
11151 .map_err(|e| {
11152 XlogError::Kernel(format!(
11153 "left_outer recorded: prepare left out_col {} failed: {}",
11154 col_idx, e
11155 ))
11156 })?;
11157
11158 if inner_bytes > 0 {
11159 let src_col = inner_left_buf
11160 .as_ref()
11161 .expect("inner_count > 0 but inner_left_buf is None")
11162 .column(col_idx)
11163 .ok_or_else(|| {
11164 XlogError::Kernel(format!("inner_left col {} not found", col_idx))
11165 })?;
11166 unsafe {
11168 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
11169 dst_ptr,
11170 *src_col.device_ptr(),
11171 inner_bytes,
11172 cu_stream.cu_stream(),
11173 );
11174 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11175 return Err(XlogError::Kernel(format!(
11176 "cuMemcpyDtoDAsync (left_outer inner_left col {}) failed: {:?}",
11177 col_idx, res
11178 )));
11179 }
11180 }
11181 }
11182 if unmatched_bytes > 0 {
11183 let src_col = unmatched_left.column(col_idx).ok_or_else(|| {
11184 XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
11185 })?;
11186 unsafe {
11189 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
11190 dst_ptr + inner_bytes as u64,
11191 *src_col.device_ptr(),
11192 unmatched_bytes,
11193 cu_stream.cu_stream(),
11194 );
11195 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11196 return Err(XlogError::Kernel(format!(
11197 "cuMemcpyDtoDAsync (left_outer unmatched_left col {}) failed: {:?}",
11198 col_idx, res
11199 )));
11200 }
11201 }
11202 }
11203
11204 if let Some(b) = out_col.runtime_block() {
11208 runtime
11209 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
11210 .map_err(|e| {
11211 XlogError::Kernel(format!(
11212 "hash_join_v2_recorded (left_outer): finish_block_use \
11213 (left col {}) failed: {}",
11214 col_idx, e
11215 ))
11216 })?;
11217 }
11218 result_columns.push(out_col.into());
11219 }
11220
11221 for col_idx in 0..right.arity() {
11223 let elem_size = right
11224 .schema()
11225 .column_type(col_idx)
11226 .map(|t| t.size_bytes())
11227 .unwrap_or(4);
11228 let inner_bytes = (inner_rows as usize)
11229 .checked_mul(elem_size)
11230 .ok_or_else(|| {
11231 XlogError::Kernel("Left outer join: right inner_bytes overflow".to_string())
11232 })?;
11233 let unmatched_bytes = (unmatched_rows as usize)
11234 .checked_mul(elem_size)
11235 .ok_or_else(|| {
11236 XlogError::Kernel("Left outer join: right unmatched_bytes overflow".to_string())
11237 })?;
11238 let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
11239 XlogError::Kernel("Left outer join: right total_bytes overflow".to_string())
11240 })?;
11241
11242 let out_col = self.memory.alloc::<u8>(total_bytes)?;
11243 let dst_ptr = *out_col.device_ptr();
11244 runtime
11247 .prepare_first_use(&out_col, launch_stream, Access::Write)
11248 .map_err(|e| {
11249 XlogError::Kernel(format!(
11250 "left_outer recorded: prepare right out_col {} failed: {}",
11251 col_idx, e
11252 ))
11253 })?;
11254
11255 if total_bytes > 0 {
11259 unsafe {
11261 let res = cudarc::driver::sys::cuMemsetD8Async(
11262 dst_ptr,
11263 0,
11264 total_bytes,
11265 cu_stream.cu_stream(),
11266 );
11267 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11268 return Err(XlogError::Kernel(format!(
11269 "cuMemsetD8Async (left_outer right col {}) failed: {:?}",
11270 col_idx, res
11271 )));
11272 }
11273 }
11274 }
11275 if inner_bytes > 0 {
11276 let src_col = inner_right_buf
11277 .as_ref()
11278 .expect("inner_count > 0 but inner_right_buf is None")
11279 .column(col_idx)
11280 .ok_or_else(|| {
11281 XlogError::Kernel(format!("inner_right col {} not found", col_idx))
11282 })?;
11283 unsafe {
11285 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
11286 dst_ptr,
11287 *src_col.device_ptr(),
11288 inner_bytes,
11289 cu_stream.cu_stream(),
11290 );
11291 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11292 return Err(XlogError::Kernel(format!(
11293 "cuMemcpyDtoDAsync (left_outer inner_right col {}) failed: {:?}",
11294 col_idx, res
11295 )));
11296 }
11297 }
11298 }
11299
11300 if let Some(b) = out_col.runtime_block() {
11301 runtime
11302 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
11303 .map_err(|e| {
11304 XlogError::Kernel(format!(
11305 "hash_join_v2_recorded (left_outer): finish_block_use \
11306 (right col {}) failed: {}",
11307 col_idx, e
11308 ))
11309 })?;
11310 }
11311 result_columns.push(out_col.into());
11312 }
11313
11314 rec_d.commit(runtime).map_err(|e| {
11320 XlogError::Kernel(format!(
11321 "hash_join_v2_recorded (left_outer): step-D commit failed: {}",
11322 e
11323 ))
11324 })?;
11325
11326 let d_num_rows = self.upload_device_row_count(total_rows as u32)?;
11329 Ok(CudaBuffer::from_columns_with_host_count(
11330 result_columns,
11331 total_rows,
11332 d_num_rows,
11333 combined_schema,
11334 total_rows as u32,
11335 ))
11336 }
11337
11338 fn hash_join_semi_or_anti_v2_recorded(
11349 &self,
11350 left: &CudaBuffer,
11351 right: &CudaBuffer,
11352 left_keys: &[usize],
11353 right_keys: &[usize],
11354 anti: bool,
11355 launch_stream: StreamId,
11356 ) -> Result<CudaBuffer> {
11357 use crate::launch::LaunchRecorder;
11358
11359 let runtime = self.memory.runtime().ok_or_else(|| {
11360 XlogError::Kernel(
11361 "hash_join_v2_recorded (semi/anti) requires a runtime-backed GpuMemoryManager"
11362 .to_string(),
11363 )
11364 })?;
11365 let cu_stream = runtime
11366 .stream_pool()
11367 .resolve(launch_stream)
11368 .ok_or_else(|| {
11369 XlogError::Kernel(format!(
11370 "hash_join_v2_recorded (semi/anti): launch_stream StreamId({}) does not resolve",
11371 launch_stream.0
11372 ))
11373 })?;
11374
11375 let num_left = self.device_row_count(left)?;
11376 let num_right = self.device_row_count(right)?;
11377 if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
11378 return Err(XlogError::Kernel(format!(
11379 "Join supports at most {} rows per side (left={}, right={})",
11380 u32::MAX,
11381 num_left,
11382 num_right
11383 )));
11384 }
11385 if num_left == 0 {
11386 return self.create_empty_buffer(left.schema().clone());
11387 }
11388 if num_right == 0 {
11389 return if anti {
11402 self.clone_buffer(left)
11403 } else {
11404 self.create_empty_buffer(left.schema().clone())
11405 };
11406 }
11407 if left_keys.is_empty() || right_keys.is_empty() {
11408 return Err(XlogError::Kernel(
11409 "Join requires at least one key column".to_string(),
11410 ));
11411 }
11412 if left_keys.len() != right_keys.len() {
11413 return Err(XlogError::Kernel(
11414 "Left and right key columns must have same length".to_string(),
11415 ));
11416 }
11417 if left_keys.len() > 4 {
11418 return Err(XlogError::Kernel(
11419 "hash_join_v2_recorded (semi/anti): max 4 key columns supported (pack_keys constraint)"
11420 .to_string(),
11421 ));
11422 }
11423 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
11424 let lt = left.schema().column_type(l);
11425 let rt = right.schema().column_type(r);
11426 if lt != rt {
11427 return Err(XlogError::Kernel(format!(
11428 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
11429 l, lt, r, rt
11430 )));
11431 }
11432 }
11433
11434 let num_left = num_left as u32;
11435 let num_right = num_right as u32;
11436
11437 let left_packed =
11438 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
11439 let right_packed =
11440 self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
11441 let table = self.build_hash_table_v2_on_stream(
11442 &right_packed.hashes,
11443 num_right,
11444 &cu_stream,
11445 launch_stream,
11446 runtime,
11447 )?;
11448
11449 let d_mask = self.memory.alloc::<u8>(num_left as usize)?;
11450
11451 let kernel_name = if anti {
11452 join_kernels::HASH_JOIN_ANTI
11453 } else {
11454 join_kernels::HASH_JOIN_SEMI
11455 };
11456 let func = self
11457 .device
11458 .inner()
11459 .get_func(JOIN_MODULE, kernel_name)
11460 .ok_or_else(|| XlogError::Kernel(format!("{} kernel not found", kernel_name)))?;
11461
11462 let block_size = 256u32;
11463 let grid_size = num_left.div_ceil(block_size);
11464 let cfg = LaunchConfig {
11465 grid_dim: (grid_size, 1, 1),
11466 block_dim: (block_size, 1, 1),
11467 shared_mem_bytes: 0,
11468 };
11469
11470 let mut rec = LaunchRecorder::new_strict(launch_stream);
11471 rec.read(&left_packed.hashes);
11472 rec.read(&left_packed.packed_keys);
11473 rec.read(&right_packed.packed_keys);
11474 rec.read(&table.bucket_offsets);
11475 rec.read(&table.bucket_counts);
11476 rec.read(&table.bucket_entries);
11477 rec.read(&table.bucket_entry_hashes);
11478 rec.write(&d_mask);
11479 rec.preflight(runtime).map_err(|e| {
11480 XlogError::Kernel(format!(
11481 "hash_join_v2_recorded (semi/anti): preflight failed: {}",
11482 e
11483 ))
11484 })?;
11485
11486 unsafe {
11491 func.clone().launch_on_stream(
11492 &cu_stream,
11493 cfg,
11494 (
11495 &left_packed.hashes,
11496 num_left,
11497 &table.bucket_offsets,
11498 &table.bucket_counts,
11499 &table.bucket_entries,
11500 &table.bucket_entry_hashes,
11501 table.bucket_mask,
11502 &left_packed.packed_keys,
11503 &right_packed.packed_keys,
11504 left_packed.key_bytes,
11505 &d_mask,
11506 ),
11507 )
11508 }
11509 .map_err(|e| XlogError::Kernel(format!("{} (on_stream) failed: {}", kernel_name, e)))?;
11510
11511 rec.commit(runtime).map_err(|e| {
11512 XlogError::Kernel(format!(
11513 "hash_join_v2_recorded (semi/anti): commit failed: {}",
11514 e
11515 ))
11516 })?;
11517
11518 self.compact_buffer_by_device_mask_counted_recorded(left, &d_mask, launch_stream)
11525 }
11526
11527 #[allow(clippy::too_many_arguments)]
11554 pub fn hash_join_v2_with_index_recorded(
11555 &self,
11556 left: &CudaBuffer,
11557 right: &CudaBuffer,
11558 left_keys: &[usize],
11559 right_keys: &[usize],
11560 join_type: JoinType,
11561 index: &crate::provider::JoinIndexV2,
11562 max_output: Option<usize>,
11563 launch_stream: StreamId,
11564 ) -> Result<CudaBuffer> {
11565 let runtime = self.memory.runtime().ok_or_else(|| {
11566 XlogError::Kernel(
11567 "hash_join_v2_with_index_recorded requires a runtime-backed GpuMemoryManager"
11568 .to_string(),
11569 )
11570 })?;
11571 runtime
11573 .stream_pool()
11574 .resolve(launch_stream)
11575 .ok_or_else(|| {
11576 XlogError::Kernel(format!(
11577 "hash_join_v2_with_index_recorded: launch_stream StreamId({}) does not resolve",
11578 launch_stream.0
11579 ))
11580 })?;
11581
11582 let left_rows = self.device_row_count(left)?;
11584 let right_rows = self.device_row_count(right)?;
11585 if left_rows > u32::MAX as usize || right_rows > u32::MAX as usize {
11586 return Err(XlogError::Kernel(format!(
11587 "Join supports at most {} rows per side (left={}, right={})",
11588 u32::MAX,
11589 left_rows,
11590 right_rows
11591 )));
11592 }
11593 if left_rows == 0 {
11594 return match join_type {
11595 JoinType::Inner | JoinType::LeftOuter => {
11596 let combined_schema = self.combine_schemas(left.schema(), right.schema());
11597 self.create_empty_buffer(combined_schema)
11598 }
11599 JoinType::Semi | JoinType::Anti => self.create_empty_buffer(left.schema().clone()),
11600 };
11601 }
11602 if right_rows == 0 {
11603 return match join_type {
11604 JoinType::Inner => {
11605 let combined_schema = self.combine_schemas(left.schema(), right.schema());
11606 self.create_empty_buffer(combined_schema)
11607 }
11608 JoinType::Semi => self.create_empty_buffer(left.schema().clone()),
11609 JoinType::Anti => self.clone_buffer(left),
11610 JoinType::LeftOuter => self.left_outer_with_nulls(left, right),
11611 };
11612 }
11613 if left_keys.is_empty() || right_keys.is_empty() {
11614 return Err(XlogError::Kernel(
11615 "Join requires at least one key column".to_string(),
11616 ));
11617 }
11618 if left_keys.len() != right_keys.len() {
11619 return Err(XlogError::Kernel(
11620 "Left and right key columns must have same length".to_string(),
11621 ));
11622 }
11623 if left_keys.len() > 4 {
11624 return Err(XlogError::Kernel(
11625 "hash_join_v2_with_index_recorded: max 4 key columns supported \
11626 (pack_keys constraint)"
11627 .to_string(),
11628 ));
11629 }
11630 for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
11631 if l >= left.arity() {
11632 return Err(XlogError::Kernel(format!(
11633 "Left key column index {} out of bounds (arity {})",
11634 l,
11635 left.arity()
11636 )));
11637 }
11638 if r >= right.arity() {
11639 return Err(XlogError::Kernel(format!(
11640 "Right key column index {} out of bounds (arity {})",
11641 r,
11642 right.arity()
11643 )));
11644 }
11645 let lt = left.schema().column_type(l);
11646 let rt = right.schema().column_type(r);
11647 if lt != rt {
11648 return Err(XlogError::Kernel(format!(
11649 "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
11650 l, lt, r, rt
11651 )));
11652 }
11653 }
11654 if index.right_num_rows() != right_rows as u32 {
11655 return Err(XlogError::Kernel(
11656 "Join index row count does not match right relation".to_string(),
11657 ));
11658 }
11659 if index.right_keys() != right_keys {
11660 return Err(XlogError::Kernel(
11661 "Join index key columns do not match requested right_keys".to_string(),
11662 ));
11663 }
11664
11665 let csm_on = Self::use_recorded_csm_env();
11666 match join_type {
11667 JoinType::Inner => {
11668 if csm_on {
11669 self.csm_invocations.fetch_add(1, Ordering::Relaxed);
11670 self.hash_join_inner_v2_with_index_count_scan_materialize_recorded(
11671 left,
11672 right,
11673 left_keys,
11674 right_keys,
11675 index,
11676 max_output,
11677 launch_stream,
11678 )
11679 } else {
11680 self.hash_join_inner_v2_with_index_recorded(
11681 left,
11682 right,
11683 left_keys,
11684 index,
11685 max_output,
11686 launch_stream,
11687 )
11688 }
11689 }
11690 JoinType::Semi => self.hash_join_semi_or_anti_v2_with_index_recorded(
11691 left,
11692 left_keys,
11693 index,
11694 false,
11695 launch_stream,
11696 ),
11697 JoinType::Anti => self.hash_join_semi_or_anti_v2_with_index_recorded(
11698 left,
11699 left_keys,
11700 index,
11701 true,
11702 launch_stream,
11703 ),
11704 JoinType::LeftOuter => {
11705 if csm_on {
11706 self.csm_invocations.fetch_add(1, Ordering::Relaxed);
11707 self.hash_join_left_outer_v2_with_index_count_scan_materialize_recorded(
11708 left,
11709 right,
11710 left_keys,
11711 right_keys,
11712 index,
11713 max_output,
11714 launch_stream,
11715 )
11716 } else {
11717 self.hash_join_left_outer_v2_with_index_recorded(
11718 left,
11719 right,
11720 left_keys,
11721 index,
11722 max_output,
11723 launch_stream,
11724 )
11725 }
11726 }
11727 }
11728 }
11729
11730 fn hash_join_inner_v2_with_index_recorded(
11736 &self,
11737 left: &CudaBuffer,
11738 right: &CudaBuffer,
11739 left_keys: &[usize],
11740 index: &crate::provider::JoinIndexV2,
11741 max_output: Option<usize>,
11742 launch_stream: StreamId,
11743 ) -> Result<CudaBuffer> {
11744 use crate::launch::LaunchRecorder;
11745
11746 let runtime = self.memory.runtime().ok_or_else(|| {
11747 XlogError::Kernel(
11748 "hash_join_v2_with_index_recorded (inner) requires runtime-backed manager"
11749 .to_string(),
11750 )
11751 })?;
11752 let cu_stream = runtime
11753 .stream_pool()
11754 .resolve(launch_stream)
11755 .ok_or_else(|| {
11756 XlogError::Kernel("indexed inner: launch_stream does not resolve".to_string())
11757 })?;
11758
11759 let num_left = left.num_rows() as u32;
11760 let table = &index.table;
11761
11762 let left_packed =
11764 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
11765 if left_packed.key_bytes != index.key_bytes {
11766 return Err(XlogError::Kernel(
11767 "Join key byte width mismatch between probe and cached index".to_string(),
11768 ));
11769 }
11770
11771 let probe_func = self
11772 .device
11773 .inner()
11774 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
11775 .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
11776 let block_size = 256u32;
11777 let probe_grid = num_left.div_ceil(block_size);
11778 let probe_config = LaunchConfig {
11779 grid_dim: (probe_grid, 1, 1),
11780 block_dim: (block_size, 1, 1),
11781 shared_mem_bytes: 0,
11782 };
11783
11784 let d_count_only = self.memory.alloc::<u32>(1)?;
11786 let d_dummy_left = self.memory.alloc::<u32>(1)?;
11787 let d_dummy_right = self.memory.alloc::<u32>(1)?;
11788 runtime
11791 .prepare_first_use(&d_count_only, launch_stream, Access::Write)
11792 .map_err(|e| {
11793 XlogError::Kernel(format!("indexed inner: prepare d_count_only failed: {}", e))
11794 })?;
11795 unsafe {
11797 let res = cudarc::driver::sys::cuMemsetD8Async(
11798 *d_count_only.device_ptr(),
11799 0,
11800 std::mem::size_of::<u32>(),
11801 cu_stream.cu_stream(),
11802 );
11803 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11804 return Err(XlogError::Kernel(format!(
11805 "cuMemsetD8Async (indexed inner d_count_only) failed: {:?}",
11806 res
11807 )));
11808 }
11809 }
11810
11811 let max_output_count_only = 0u32;
11812 let mut rec_count = LaunchRecorder::new_strict(launch_stream);
11813 rec_count.read(&left_packed.hashes);
11814 rec_count.read(&left_packed.packed_keys);
11815 rec_count.read(&index.packed_keys);
11816 rec_count.read(&table.bucket_offsets);
11817 rec_count.read(&table.bucket_counts);
11818 rec_count.read(&table.bucket_entries);
11819 rec_count.read(&table.bucket_entry_hashes);
11820 rec_count.write(&d_count_only);
11821 rec_count.write(&d_dummy_left);
11822 rec_count.write(&d_dummy_right);
11823 rec_count.preflight(runtime).map_err(|e| {
11824 XlogError::Kernel(format!("indexed inner: count-pass preflight failed: {}", e))
11825 })?;
11826 unsafe {
11828 let mut params: Vec<*mut c_void> = vec![
11829 (&left_packed.hashes).as_kernel_param(),
11830 num_left.as_kernel_param(),
11831 (&table.bucket_offsets).as_kernel_param(),
11832 (&table.bucket_counts).as_kernel_param(),
11833 (&table.bucket_entries).as_kernel_param(),
11834 (&table.bucket_entry_hashes).as_kernel_param(),
11835 table.bucket_mask.as_kernel_param(),
11836 (&left_packed.packed_keys).as_kernel_param(),
11837 (&index.packed_keys).as_kernel_param(),
11838 index.key_bytes.as_kernel_param(),
11839 (&d_dummy_left).as_kernel_param(),
11840 (&d_dummy_right).as_kernel_param(),
11841 (&d_count_only).as_kernel_param(),
11842 max_output_count_only.as_kernel_param(),
11843 ];
11844 probe_func
11845 .clone()
11846 .launch_on_stream(&cu_stream, probe_config, &mut params)
11847 .map_err(|e| {
11848 XlogError::Kernel(format!(
11849 "hash_join_probe_v2 (indexed count, on_stream) failed: {}",
11850 e
11851 ))
11852 })?;
11853 }
11854 rec_count.commit(runtime).map_err(|e| {
11855 XlogError::Kernel(format!("indexed inner: count-pass commit failed: {}", e))
11856 })?;
11857
11858 cu_stream.synchronize().map_err(|e| {
11859 XlogError::Kernel(format!("indexed inner: sync (count read) failed: {}", e))
11860 })?;
11861 let full_count = self.read_join_output_count_metadata(&d_count_only)? as u64;
11862 let requested = max_output
11863 .map(|limit| (limit as u64).min(full_count))
11864 .unwrap_or(full_count);
11865 if requested == 0 {
11866 let combined_schema = self.combine_schemas(left.schema(), right.schema());
11867 return self.create_empty_buffer(combined_schema);
11868 }
11869 if requested > u32::MAX as u64 {
11870 return Err(XlogError::Kernel(format!(
11871 "Join produced {} rows which exceeds the u32 index limit",
11872 requested
11873 )));
11874 }
11875 let max_output_u32 = requested as u32;
11876
11877 let d_output_left = self.memory.alloc::<u32>(max_output_u32 as usize)?;
11879 let d_output_right = self.memory.alloc::<u32>(max_output_u32 as usize)?;
11880 let d_output_count = self.memory.alloc::<u32>(1)?;
11881 runtime
11884 .prepare_first_use(&d_output_count, launch_stream, Access::Write)
11885 .map_err(|e| {
11886 XlogError::Kernel(format!(
11887 "indexed inner: prepare d_output_count failed: {}",
11888 e
11889 ))
11890 })?;
11891 unsafe {
11893 let res = cudarc::driver::sys::cuMemsetD8Async(
11894 *d_output_count.device_ptr(),
11895 0,
11896 std::mem::size_of::<u32>(),
11897 cu_stream.cu_stream(),
11898 );
11899 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11900 return Err(XlogError::Kernel(format!(
11901 "cuMemsetD8Async (indexed inner d_output_count) failed: {:?}",
11902 res
11903 )));
11904 }
11905 }
11906
11907 let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
11908 rec_mat.read(&left_packed.hashes);
11909 rec_mat.read(&left_packed.packed_keys);
11910 rec_mat.read(&index.packed_keys);
11911 rec_mat.read(&table.bucket_offsets);
11912 rec_mat.read(&table.bucket_counts);
11913 rec_mat.read(&table.bucket_entries);
11914 rec_mat.read(&table.bucket_entry_hashes);
11915 rec_mat.write(&d_output_left);
11916 rec_mat.write(&d_output_right);
11917 rec_mat.write(&d_output_count);
11918 rec_mat.preflight(runtime).map_err(|e| {
11919 XlogError::Kernel(format!(
11920 "indexed inner: materialize preflight failed: {}",
11921 e
11922 ))
11923 })?;
11924 unsafe {
11926 let mut params: Vec<*mut c_void> = vec![
11927 (&left_packed.hashes).as_kernel_param(),
11928 num_left.as_kernel_param(),
11929 (&table.bucket_offsets).as_kernel_param(),
11930 (&table.bucket_counts).as_kernel_param(),
11931 (&table.bucket_entries).as_kernel_param(),
11932 (&table.bucket_entry_hashes).as_kernel_param(),
11933 table.bucket_mask.as_kernel_param(),
11934 (&left_packed.packed_keys).as_kernel_param(),
11935 (&index.packed_keys).as_kernel_param(),
11936 index.key_bytes.as_kernel_param(),
11937 (&d_output_left).as_kernel_param(),
11938 (&d_output_right).as_kernel_param(),
11939 (&d_output_count).as_kernel_param(),
11940 max_output_u32.as_kernel_param(),
11941 ];
11942 probe_func
11943 .clone()
11944 .launch_on_stream(&cu_stream, probe_config, &mut params)
11945 .map_err(|e| {
11946 XlogError::Kernel(format!(
11947 "hash_join_probe_v2 (indexed mat, on_stream) failed: {}",
11948 e
11949 ))
11950 })?;
11951 }
11952 rec_mat.commit(runtime).map_err(|e| {
11953 XlogError::Kernel(format!("indexed inner: materialize commit failed: {}", e))
11954 })?;
11955
11956 cu_stream.synchronize().map_err(|e| {
11957 XlogError::Kernel(format!("indexed inner: sync (mat read) failed: {}", e))
11958 })?;
11959 let result_count = (self.read_join_output_count_metadata(&d_output_count)? as u64)
11960 .min(max_output_u32 as u64);
11961 if result_count == 0 {
11962 let combined_schema = self.combine_schemas(left.schema(), right.schema());
11963 return self.create_empty_buffer(combined_schema);
11964 }
11965 let output_rows = result_count as u32;
11966
11967 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
11969 for col_idx in 0..left.columns.len() {
11970 let c = left
11971 .column(col_idx)
11972 .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
11973 rec_gather.read_column(c);
11974 }
11975 for col_idx in 0..right.columns.len() {
11976 let c = right
11977 .column(col_idx)
11978 .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
11979 rec_gather.read_column(c);
11980 }
11981 rec_gather.read(&d_output_left);
11982 rec_gather.read(&d_output_right);
11983 rec_gather.preflight(runtime).map_err(|e| {
11984 XlogError::Kernel(format!("indexed inner: gather preflight failed: {}", e))
11985 })?;
11986 let gathered_left = self.gather_buffer_by_indices_on_stream(
11987 left,
11988 &d_output_left,
11989 output_rows,
11990 &cu_stream,
11991 launch_stream,
11992 runtime,
11993 )?;
11994 let gathered_right = self.gather_buffer_by_indices_on_stream(
11995 right,
11996 &d_output_right,
11997 output_rows,
11998 &cu_stream,
11999 launch_stream,
12000 runtime,
12001 )?;
12002 rec_gather.commit(runtime).map_err(|e| {
12003 XlogError::Kernel(format!("indexed inner: gather commit failed: {}", e))
12004 })?;
12005
12006 let combined_schema = self.combine_schemas(left.schema(), right.schema());
12007 let mut result_columns = Vec::with_capacity(combined_schema.arity());
12008 result_columns.extend(gathered_left.columns);
12009 result_columns.extend(gathered_right.columns);
12010 self.buffer_from_columns(result_columns, result_count, combined_schema)
12011 }
12012
12013 fn hash_join_semi_or_anti_v2_with_index_recorded(
12019 &self,
12020 left: &CudaBuffer,
12021 left_keys: &[usize],
12022 index: &crate::provider::JoinIndexV2,
12023 anti: bool,
12024 launch_stream: StreamId,
12025 ) -> Result<CudaBuffer> {
12026 use crate::launch::LaunchRecorder;
12027
12028 let runtime = self.memory.runtime().ok_or_else(|| {
12029 XlogError::Kernel(
12030 "hash_join_v2_with_index_recorded (semi/anti) requires runtime-backed manager"
12031 .to_string(),
12032 )
12033 })?;
12034 let cu_stream = runtime
12035 .stream_pool()
12036 .resolve(launch_stream)
12037 .ok_or_else(|| {
12038 XlogError::Kernel("indexed semi/anti: launch_stream does not resolve".to_string())
12039 })?;
12040
12041 let num_left = left.num_rows() as u32;
12042 let table = &index.table;
12043
12044 let left_packed =
12045 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
12046 if left_packed.key_bytes != index.key_bytes {
12047 return Err(XlogError::Kernel(
12048 "Join key byte width mismatch between probe and cached index".to_string(),
12049 ));
12050 }
12051
12052 let d_mask = self.memory.alloc::<u8>(num_left as usize)?;
12053 let kernel_name = if anti {
12054 join_kernels::HASH_JOIN_ANTI
12055 } else {
12056 join_kernels::HASH_JOIN_SEMI
12057 };
12058 let func = self
12059 .device
12060 .inner()
12061 .get_func(JOIN_MODULE, kernel_name)
12062 .ok_or_else(|| XlogError::Kernel(format!("{} kernel not found", kernel_name)))?;
12063 let block_size = 256u32;
12064 let grid_size = num_left.div_ceil(block_size);
12065 let cfg = LaunchConfig {
12066 grid_dim: (grid_size, 1, 1),
12067 block_dim: (block_size, 1, 1),
12068 shared_mem_bytes: 0,
12069 };
12070
12071 let mut rec = LaunchRecorder::new_strict(launch_stream);
12072 rec.read(&left_packed.hashes);
12073 rec.read(&left_packed.packed_keys);
12074 rec.read(&index.packed_keys);
12075 rec.read(&table.bucket_offsets);
12076 rec.read(&table.bucket_counts);
12077 rec.read(&table.bucket_entries);
12078 rec.read(&table.bucket_entry_hashes);
12079 rec.write(&d_mask);
12080 rec.preflight(runtime).map_err(|e| {
12081 XlogError::Kernel(format!("indexed semi/anti: preflight failed: {}", e))
12082 })?;
12083 unsafe {
12085 func.clone().launch_on_stream(
12086 &cu_stream,
12087 cfg,
12088 (
12089 &left_packed.hashes,
12090 num_left,
12091 &table.bucket_offsets,
12092 &table.bucket_counts,
12093 &table.bucket_entries,
12094 &table.bucket_entry_hashes,
12095 table.bucket_mask,
12096 &left_packed.packed_keys,
12097 &index.packed_keys,
12098 index.key_bytes,
12099 &d_mask,
12100 ),
12101 )
12102 }
12103 .map_err(|e| {
12104 XlogError::Kernel(format!(
12105 "{} (on_stream, indexed) failed: {}",
12106 kernel_name, e
12107 ))
12108 })?;
12109 rec.commit(runtime)
12110 .map_err(|e| XlogError::Kernel(format!("indexed semi/anti: commit failed: {}", e)))?;
12111
12112 self.compact_buffer_by_device_mask_counted_recorded(left, &d_mask, launch_stream)
12113 }
12114
12115 fn hash_join_left_outer_v2_with_index_recorded(
12121 &self,
12122 left: &CudaBuffer,
12123 right: &CudaBuffer,
12124 left_keys: &[usize],
12125 index: &crate::provider::JoinIndexV2,
12126 max_output: Option<usize>,
12127 launch_stream: StreamId,
12128 ) -> Result<CudaBuffer> {
12129 use crate::launch::LaunchRecorder;
12130
12131 let runtime = self.memory.runtime().ok_or_else(|| {
12132 XlogError::Kernel(
12133 "hash_join_v2_with_index_recorded (left_outer) requires runtime-backed manager"
12134 .to_string(),
12135 )
12136 })?;
12137 let cu_stream = runtime
12138 .stream_pool()
12139 .resolve(launch_stream)
12140 .ok_or_else(|| {
12141 XlogError::Kernel("indexed left_outer: launch_stream does not resolve".to_string())
12142 })?;
12143
12144 let num_left = left.num_rows() as u32;
12145 let table = &index.table;
12146
12147 let left_packed =
12148 self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
12149 if left_packed.key_bytes != index.key_bytes {
12150 return Err(XlogError::Kernel(
12151 "Join key byte width mismatch between probe and cached index".to_string(),
12152 ));
12153 }
12154
12155 let device = self.device.inner();
12156 let block_size = 256u32;
12157 let grid_size = num_left.div_ceil(block_size);
12158 let cfg = LaunchConfig {
12159 grid_dim: (grid_size, 1, 1),
12160 block_dim: (block_size, 1, 1),
12161 shared_mem_bytes: 0,
12162 };
12163
12164 let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
12166 let d_count_only = self.memory.alloc::<u32>(1)?;
12167 let d_dummy_left = self.memory.alloc::<u32>(1)?;
12168 let d_dummy_right = self.memory.alloc::<u32>(1)?;
12169 runtime
12172 .prepare_first_use(&d_count_only, launch_stream, Access::Write)
12173 .map_err(|e| {
12174 XlogError::Kernel(format!(
12175 "indexed left_outer: prepare d_count_only failed: {}",
12176 e
12177 ))
12178 })?;
12179 unsafe {
12181 let res = cudarc::driver::sys::cuMemsetD8Async(
12182 *d_count_only.device_ptr(),
12183 0,
12184 std::mem::size_of::<u32>(),
12185 cu_stream.cu_stream(),
12186 );
12187 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12188 return Err(XlogError::Kernel(format!(
12189 "cuMemsetD8Async (indexed left_outer d_count_only) failed: {:?}",
12190 res
12191 )));
12192 }
12193 }
12194
12195 let semi_func = device
12196 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
12197 .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
12198 let probe_func = device
12199 .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
12200 .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
12201
12202 let mut rec_a = LaunchRecorder::new_strict(launch_stream);
12203 rec_a.read(&left_packed.hashes);
12204 rec_a.read(&left_packed.packed_keys);
12205 rec_a.read(&index.packed_keys);
12206 rec_a.read(&table.bucket_offsets);
12207 rec_a.read(&table.bucket_counts);
12208 rec_a.read(&table.bucket_entries);
12209 rec_a.read(&table.bucket_entry_hashes);
12210 rec_a.write(&d_has_match);
12211 rec_a.write(&d_count_only);
12212 rec_a.write(&d_dummy_left);
12213 rec_a.write(&d_dummy_right);
12214 rec_a.preflight(runtime).map_err(|e| {
12215 XlogError::Kernel(format!(
12216 "indexed left_outer: semi/count preflight failed: {}",
12217 e
12218 ))
12219 })?;
12220 unsafe {
12222 semi_func.clone().launch_on_stream(
12223 &cu_stream,
12224 cfg,
12225 (
12226 &left_packed.hashes,
12227 num_left,
12228 &table.bucket_offsets,
12229 &table.bucket_counts,
12230 &table.bucket_entries,
12231 &table.bucket_entry_hashes,
12232 table.bucket_mask,
12233 &left_packed.packed_keys,
12234 &index.packed_keys,
12235 index.key_bytes,
12236 &d_has_match,
12237 ),
12238 )
12239 }
12240 .map_err(|e| {
12241 XlogError::Kernel(format!(
12242 "hash_join_semi (on_stream, indexed left_outer) failed: {}",
12243 e
12244 ))
12245 })?;
12246
12247 let max_output_count_only = 0u32;
12248 unsafe {
12250 let mut params: Vec<*mut c_void> = vec![
12251 (&left_packed.hashes).as_kernel_param(),
12252 num_left.as_kernel_param(),
12253 (&table.bucket_offsets).as_kernel_param(),
12254 (&table.bucket_counts).as_kernel_param(),
12255 (&table.bucket_entries).as_kernel_param(),
12256 (&table.bucket_entry_hashes).as_kernel_param(),
12257 table.bucket_mask.as_kernel_param(),
12258 (&left_packed.packed_keys).as_kernel_param(),
12259 (&index.packed_keys).as_kernel_param(),
12260 index.key_bytes.as_kernel_param(),
12261 (&d_dummy_left).as_kernel_param(),
12262 (&d_dummy_right).as_kernel_param(),
12263 (&d_count_only).as_kernel_param(),
12264 max_output_count_only.as_kernel_param(),
12265 ];
12266 probe_func
12267 .clone()
12268 .launch_on_stream(&cu_stream, cfg, &mut params)
12269 .map_err(|e| {
12270 XlogError::Kernel(format!(
12271 "hash_join_probe_v2 (count, on_stream, indexed left_outer) failed: {}",
12272 e
12273 ))
12274 })?;
12275 }
12276 rec_a.commit(runtime).map_err(|e| {
12277 XlogError::Kernel(format!(
12278 "indexed left_outer: semi/count commit failed: {}",
12279 e
12280 ))
12281 })?;
12282
12283 cu_stream.synchronize().map_err(|e| {
12284 XlogError::Kernel(format!(
12285 "indexed left_outer: sync (count read) failed: {}",
12286 e
12287 ))
12288 })?;
12289 let full_inner = self.read_join_output_count_metadata(&d_count_only)? as u64;
12290 let requested_inner = max_output
12291 .map(|limit| (limit as u64).min(full_inner))
12292 .unwrap_or(full_inner);
12293 if requested_inner > u32::MAX as u64 {
12294 return Err(XlogError::Kernel(format!(
12295 "Join produced {} rows which exceeds the u32 index limit",
12296 requested_inner
12297 )));
12298 }
12299 let max_output_u32 = requested_inner as u32;
12300 let alloc_len = (requested_inner.max(1)) as usize;
12301
12302 let d_output_left = self.memory.alloc::<u32>(alloc_len)?;
12304 let d_output_right = self.memory.alloc::<u32>(alloc_len)?;
12305 let d_output_count = self.memory.alloc::<u32>(1)?;
12306 runtime
12309 .prepare_first_use(&d_output_count, launch_stream, Access::Write)
12310 .map_err(|e| {
12311 XlogError::Kernel(format!(
12312 "indexed left_outer: prepare d_output_count failed: {}",
12313 e
12314 ))
12315 })?;
12316 unsafe {
12318 let res = cudarc::driver::sys::cuMemsetD8Async(
12319 *d_output_count.device_ptr(),
12320 0,
12321 std::mem::size_of::<u32>(),
12322 cu_stream.cu_stream(),
12323 );
12324 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12325 return Err(XlogError::Kernel(format!(
12326 "cuMemsetD8Async (indexed left_outer d_output_count) failed: {:?}",
12327 res
12328 )));
12329 }
12330 }
12331
12332 let mut rec_b = LaunchRecorder::new_strict(launch_stream);
12333 rec_b.read(&left_packed.hashes);
12334 rec_b.read(&left_packed.packed_keys);
12335 rec_b.read(&index.packed_keys);
12336 rec_b.read(&table.bucket_offsets);
12337 rec_b.read(&table.bucket_counts);
12338 rec_b.read(&table.bucket_entries);
12339 rec_b.read(&table.bucket_entry_hashes);
12340 rec_b.write(&d_output_left);
12341 rec_b.write(&d_output_right);
12342 rec_b.write(&d_output_count);
12343 rec_b.preflight(runtime).map_err(|e| {
12344 XlogError::Kernel(format!(
12345 "indexed left_outer: materialize preflight failed: {}",
12346 e
12347 ))
12348 })?;
12349 unsafe {
12351 let mut params: Vec<*mut c_void> = vec![
12352 (&left_packed.hashes).as_kernel_param(),
12353 num_left.as_kernel_param(),
12354 (&table.bucket_offsets).as_kernel_param(),
12355 (&table.bucket_counts).as_kernel_param(),
12356 (&table.bucket_entries).as_kernel_param(),
12357 (&table.bucket_entry_hashes).as_kernel_param(),
12358 table.bucket_mask.as_kernel_param(),
12359 (&left_packed.packed_keys).as_kernel_param(),
12360 (&index.packed_keys).as_kernel_param(),
12361 index.key_bytes.as_kernel_param(),
12362 (&d_output_left).as_kernel_param(),
12363 (&d_output_right).as_kernel_param(),
12364 (&d_output_count).as_kernel_param(),
12365 max_output_u32.as_kernel_param(),
12366 ];
12367 probe_func
12368 .clone()
12369 .launch_on_stream(&cu_stream, cfg, &mut params)
12370 .map_err(|e| {
12371 XlogError::Kernel(format!(
12372 "hash_join_probe_v2 (mat, on_stream, indexed left_outer) failed: {}",
12373 e
12374 ))
12375 })?;
12376 }
12377 rec_b.commit(runtime).map_err(|e| {
12378 XlogError::Kernel(format!(
12379 "indexed left_outer: materialize commit failed: {}",
12380 e
12381 ))
12382 })?;
12383
12384 cu_stream.synchronize().map_err(|e| {
12385 XlogError::Kernel(format!("indexed left_outer: sync (mat read) failed: {}", e))
12386 })?;
12387 let inner_count = self
12388 .read_join_output_count_metadata(&d_output_count)?
12389 .min(max_output_u32);
12390
12391 let d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
12393 let mask_not_fn = device
12394 .get_func(FILTER_MODULE, filter_kernels::MASK_NOT)
12395 .ok_or_else(|| XlogError::Kernel("mask_not kernel not found".to_string()))?;
12396 let mut rec_c = LaunchRecorder::new_strict(launch_stream);
12397 rec_c.read(&d_has_match);
12398 rec_c.write(&d_no_match);
12399 rec_c.preflight(runtime).map_err(|e| {
12400 XlogError::Kernel(format!(
12401 "indexed left_outer: mask_not preflight failed: {}",
12402 e
12403 ))
12404 })?;
12405 unsafe {
12407 mask_not_fn.clone().launch_on_stream(
12408 &cu_stream,
12409 cfg,
12410 (&d_has_match, &d_no_match, num_left),
12411 )
12412 }
12413 .map_err(|e| {
12414 XlogError::Kernel(format!(
12415 "mask_not (on_stream, indexed left_outer) failed: {}",
12416 e
12417 ))
12418 })?;
12419 rec_c.commit(runtime).map_err(|e| {
12420 XlogError::Kernel(format!("indexed left_outer: mask_not commit failed: {}", e))
12421 })?;
12422
12423 let unmatched_left =
12424 self.compact_buffer_by_device_mask_counted_recorded(left, &d_no_match, launch_stream)?;
12425 let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
12426 let total_rows = (inner_count as u64) + unmatched_rows;
12427
12428 let combined_schema = self.combine_schemas(left.schema(), right.schema());
12429 if total_rows == 0 {
12430 return self.create_empty_buffer(combined_schema);
12431 }
12432
12433 let inner_count_u32 = inner_count;
12439 let inner_left_buf;
12440 let inner_right_buf;
12441 if inner_count > 0 {
12442 let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
12443 for col_idx in 0..left.columns.len() {
12444 let c = left.column(col_idx).ok_or_else(|| {
12445 XlogError::Kernel(format!("Left column {} not found", col_idx))
12446 })?;
12447 rec_gather.read_column(c);
12448 }
12449 for col_idx in 0..right.columns.len() {
12450 let c = right.column(col_idx).ok_or_else(|| {
12451 XlogError::Kernel(format!("Right column {} not found", col_idx))
12452 })?;
12453 rec_gather.read_column(c);
12454 }
12455 rec_gather.read(&d_output_left);
12456 rec_gather.read(&d_output_right);
12457 rec_gather.preflight(runtime).map_err(|e| {
12458 XlogError::Kernel(format!(
12459 "indexed left_outer: gather preflight failed: {}",
12460 e
12461 ))
12462 })?;
12463 inner_left_buf = Some(self.gather_buffer_by_indices_on_stream(
12464 left,
12465 &d_output_left,
12466 inner_count_u32,
12467 &cu_stream,
12468 launch_stream,
12469 runtime,
12470 )?);
12471 inner_right_buf = Some(self.gather_buffer_by_indices_on_stream(
12472 right,
12473 &d_output_right,
12474 inner_count_u32,
12475 &cu_stream,
12476 launch_stream,
12477 runtime,
12478 )?);
12479 rec_gather.commit(runtime).map_err(|e| {
12480 XlogError::Kernel(format!("indexed left_outer: gather commit failed: {}", e))
12481 })?;
12482 } else {
12483 inner_left_buf = None;
12484 inner_right_buf = None;
12485 }
12486
12487 let mut rec_d = LaunchRecorder::new_strict(launch_stream);
12493 for col_idx in 0..unmatched_left.columns.len() {
12494 let c = unmatched_left.column(col_idx).ok_or_else(|| {
12495 XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
12496 })?;
12497 rec_d.read_column(c);
12498 }
12499 if let Some(b) = inner_left_buf.as_ref() {
12500 for col_idx in 0..b.columns.len() {
12501 let c = b.column(col_idx).ok_or_else(|| {
12502 XlogError::Kernel(format!("inner_left col {} not found", col_idx))
12503 })?;
12504 rec_d.read_column(c);
12505 }
12506 }
12507 if let Some(b) = inner_right_buf.as_ref() {
12508 for col_idx in 0..b.columns.len() {
12509 let c = b.column(col_idx).ok_or_else(|| {
12510 XlogError::Kernel(format!("inner_right col {} not found", col_idx))
12511 })?;
12512 rec_d.read_column(c);
12513 }
12514 }
12515 rec_d.preflight(runtime).map_err(|e| {
12516 XlogError::Kernel(format!(
12517 "indexed left_outer: step-D preflight failed: {}",
12518 e
12519 ))
12520 })?;
12521
12522 let mut result_columns: Vec<CudaColumn> = Vec::with_capacity(combined_schema.arity());
12523 let inner_rows = inner_count as u64;
12524
12525 for col_idx in 0..left.arity() {
12526 let elem_size = left
12527 .schema()
12528 .column_type(col_idx)
12529 .map(|t| t.size_bytes())
12530 .unwrap_or(4);
12531 let inner_bytes = (inner_rows as usize)
12532 .checked_mul(elem_size)
12533 .ok_or_else(|| XlogError::Kernel("inner_bytes overflow".to_string()))?;
12534 let unmatched_bytes = (unmatched_rows as usize)
12535 .checked_mul(elem_size)
12536 .ok_or_else(|| XlogError::Kernel("unmatched_bytes overflow".to_string()))?;
12537 let total_bytes = inner_bytes
12538 .checked_add(unmatched_bytes)
12539 .ok_or_else(|| XlogError::Kernel("total_bytes overflow".to_string()))?;
12540 let out_col = self.memory.alloc::<u8>(total_bytes)?;
12541 let dst_ptr = *out_col.device_ptr();
12542 runtime
12544 .prepare_first_use(&out_col, launch_stream, Access::Write)
12545 .map_err(|e| {
12546 XlogError::Kernel(format!(
12547 "indexed left_outer: prepare left out_col {} failed: {}",
12548 col_idx, e
12549 ))
12550 })?;
12551 if inner_bytes > 0 {
12552 let src_col = inner_left_buf
12553 .as_ref()
12554 .expect("inner_count > 0")
12555 .column(col_idx)
12556 .ok_or_else(|| XlogError::Kernel("inner_left col missing".to_string()))?;
12557 unsafe {
12559 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
12560 dst_ptr,
12561 *src_col.device_ptr(),
12562 inner_bytes,
12563 cu_stream.cu_stream(),
12564 );
12565 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12566 return Err(XlogError::Kernel(format!(
12567 "indexed left_outer: dtod copy inner_left col {} failed: {:?}",
12568 col_idx, res
12569 )));
12570 }
12571 }
12572 }
12573 if unmatched_bytes > 0 {
12574 let src_col = unmatched_left
12575 .column(col_idx)
12576 .ok_or_else(|| XlogError::Kernel("unmatched col missing".to_string()))?;
12577 unsafe {
12579 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
12580 dst_ptr + inner_bytes as u64,
12581 *src_col.device_ptr(),
12582 unmatched_bytes,
12583 cu_stream.cu_stream(),
12584 );
12585 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12586 return Err(XlogError::Kernel(format!(
12587 "indexed left_outer: dtod copy unmatched col {} failed: {:?}",
12588 col_idx, res
12589 )));
12590 }
12591 }
12592 }
12593 if let Some(b) = out_col.runtime_block() {
12594 runtime
12595 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
12596 .map_err(|e| {
12597 XlogError::Kernel(format!(
12598 "indexed left_outer: finish_block_use (left col {}) failed: {}",
12599 col_idx, e
12600 ))
12601 })?;
12602 }
12603 result_columns.push(out_col.into());
12604 }
12605
12606 for col_idx in 0..right.arity() {
12607 let elem_size = right
12608 .schema()
12609 .column_type(col_idx)
12610 .map(|t| t.size_bytes())
12611 .unwrap_or(4);
12612 let inner_bytes = (inner_rows as usize)
12613 .checked_mul(elem_size)
12614 .ok_or_else(|| XlogError::Kernel("right inner_bytes overflow".to_string()))?;
12615 let unmatched_bytes = (unmatched_rows as usize)
12616 .checked_mul(elem_size)
12617 .ok_or_else(|| XlogError::Kernel("right unmatched_bytes overflow".to_string()))?;
12618 let total_bytes = inner_bytes
12619 .checked_add(unmatched_bytes)
12620 .ok_or_else(|| XlogError::Kernel("right total_bytes overflow".to_string()))?;
12621 let out_col = self.memory.alloc::<u8>(total_bytes)?;
12622 let dst_ptr = *out_col.device_ptr();
12623 runtime
12625 .prepare_first_use(&out_col, launch_stream, Access::Write)
12626 .map_err(|e| {
12627 XlogError::Kernel(format!(
12628 "indexed left_outer: prepare right out_col {} failed: {}",
12629 col_idx, e
12630 ))
12631 })?;
12632 if total_bytes > 0 {
12633 unsafe {
12635 let res = cudarc::driver::sys::cuMemsetD8Async(
12636 dst_ptr,
12637 0,
12638 total_bytes,
12639 cu_stream.cu_stream(),
12640 );
12641 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12642 return Err(XlogError::Kernel(format!(
12643 "indexed left_outer: zero-fill right col {} failed: {:?}",
12644 col_idx, res
12645 )));
12646 }
12647 }
12648 }
12649 if inner_bytes > 0 {
12650 let src_col = inner_right_buf
12651 .as_ref()
12652 .expect("inner_count > 0")
12653 .column(col_idx)
12654 .ok_or_else(|| XlogError::Kernel("inner_right col missing".to_string()))?;
12655 unsafe {
12657 let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
12658 dst_ptr,
12659 *src_col.device_ptr(),
12660 inner_bytes,
12661 cu_stream.cu_stream(),
12662 );
12663 if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12664 return Err(XlogError::Kernel(format!(
12665 "indexed left_outer: dtod copy inner_right col {} failed: {:?}",
12666 col_idx, res
12667 )));
12668 }
12669 }
12670 }
12671 if let Some(b) = out_col.runtime_block() {
12672 runtime
12673 .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
12674 .map_err(|e| {
12675 XlogError::Kernel(format!(
12676 "indexed left_outer: finish_block_use (right col {}) failed: {}",
12677 col_idx, e
12678 ))
12679 })?;
12680 }
12681 result_columns.push(out_col.into());
12682 }
12683
12684 rec_d.commit(runtime).map_err(|e| {
12687 XlogError::Kernel(format!("indexed left_outer: step-D commit failed: {}", e))
12688 })?;
12689
12690 let d_num_rows = self.upload_device_row_count(total_rows as u32)?;
12691 Ok(CudaBuffer::from_columns_with_host_count(
12692 result_columns,
12693 total_rows,
12694 d_num_rows,
12695 combined_schema,
12696 total_rows as u32,
12697 ))
12698 }
12699}