xlog_cuda/provider/
relational.rs

1//! Relational operations: join, dedup, union, diff, sort, and related helpers.
2
3use std::ffi::c_void;
4use std::sync::atomic::Ordering;
5
6use crate::{
7    cuda_graph::{CapturedCudaGraph, CsmCudaGraphKey, CudaGraphNodeKind},
8    AsKernelParam, DeviceSlice, LaunchAsync, LaunchConfig,
9};
10use xlog_core::{Result, ScalarType, Schema, XlogError};
11
12use super::{
13    dedup_kernels, filter_kernels, ilp_kernels, join_kernels, pack_kernels, scan_kernels,
14    set_ops_kernels, sort_kernels, CsmCudaGraphEntry, CsmCudaGraphNodes, HashTableU64,
15    JoinHashTableV2, JoinIndexV2, JoinType, PackedKeyData, RadixSortScratch, DEDUP_MODULE,
16    DEFAULT_JOIN_MAX_OUTPUT, FILTER_MODULE, ILP_MODULE, JOIN_MODULE, NESTED_LOOP_TOTAL_THRESHOLD,
17    PACK_MODULE, SCAN_MODULE, SET_OPS_MODULE, SORT_MODULE,
18};
19use crate::device_runtime::{Access, BlockId, StreamId};
20use crate::launch::LaunchRecorder;
21use crate::memory::{CudaColumn, TrackedCudaSlice};
22use crate::CudaBuffer;
23
24// Per-column scalar-type encoding used by the deterministic full-row
25// dedup/diff kernels. Must match the `XLOG_TY_*` defines in
26// `kernels/dedup.cu`. Centralized here as named constants and one helper
27// so all full-row callers in this module share one source of truth.
28const XLOG_TY_U32: u8 = 0;
29const XLOG_TY_U64: u8 = 1;
30const XLOG_TY_I32: u8 = 2;
31const XLOG_TY_I64: u8 = 3;
32const XLOG_TY_F32: u8 = 4;
33const XLOG_TY_F64: u8 = 5;
34const XLOG_TY_BOOL: u8 = 6;
35const XLOG_TY_SYMBOL: u8 = 7;
36const SMALL_FULL_ROW_SORT_MAX_ROWS: usize = 1024;
37
38#[inline]
39fn scalar_type_code_dedup(ty: ScalarType) -> u8 {
40    match ty {
41        ScalarType::U32 => XLOG_TY_U32,
42        ScalarType::U64 => XLOG_TY_U64,
43        ScalarType::I32 => XLOG_TY_I32,
44        ScalarType::I64 => XLOG_TY_I64,
45        ScalarType::F32 => XLOG_TY_F32,
46        ScalarType::F64 => XLOG_TY_F64,
47        ScalarType::Bool => XLOG_TY_BOOL,
48        ScalarType::Symbol => XLOG_TY_SYMBOL,
49    }
50}
51
52impl super::CudaKernelProvider {
53    /// Perform a hash join between two buffers
54    ///
55    /// Uses a two-phase hash join:
56    /// 1. Build phase: Insert keys from `right` into a hash table
57    /// 2. Probe phase: Match keys from `left` against the hash table
58    ///
59    /// # Arguments
60    /// * `left` - The left (probe) buffer
61    /// * `right` - The right (build) buffer
62    /// * `left_keys` - Column indices for join keys in left buffer
63    /// * `right_keys` - Column indices for join keys in right buffer
64    ///
65    /// # Returns
66    /// A buffer containing the joined rows with columns from both inputs
67    ///
68    /// # Errors
69    /// Returns `XlogError::Kernel` if kernel execution fails
70    pub fn hash_join(
71        &self,
72        left: &CudaBuffer,
73        right: &CudaBuffer,
74        left_keys: &[usize],
75        right_keys: &[usize],
76    ) -> Result<CudaBuffer> {
77        self.hash_join_with_limit(left, right, left_keys, right_keys, None)
78    }
79
80    /// Hash join with configurable maximum output size
81    ///
82    /// Uses a two-phase hash join:
83    /// 1. Build phase: Insert keys from `right` into a hash table
84    /// 2. Probe phase: Match keys from `left` against the hash table
85    ///
86    /// # Arguments
87    /// * `left` - The left (probe) buffer
88    /// * `right` - The right (build) buffer
89    /// * `left_keys` - Column indices for join keys in left buffer
90    /// * `right_keys` - Column indices for join keys in right buffer
91    /// * `max_output` - Maximum number of output rows (defaults to DEFAULT_JOIN_MAX_OUTPUT)
92    ///
93    /// # Returns
94    /// A buffer containing the joined rows with columns from both inputs
95    ///
96    /// # Errors
97    /// Returns `XlogError::Kernel` if kernel execution fails
98    pub fn hash_join_with_limit(
99        &self,
100        left: &CudaBuffer,
101        right: &CudaBuffer,
102        left_keys: &[usize],
103        right_keys: &[usize],
104        max_output: Option<usize>,
105    ) -> Result<CudaBuffer> {
106        let max_output_limit = max_output.unwrap_or(DEFAULT_JOIN_MAX_OUTPUT);
107
108        // Validate key columns early (even for empty inputs).
109        if left_keys.is_empty() || right_keys.is_empty() {
110            return Err(XlogError::Kernel(
111                "Join requires at least one key column".to_string(),
112            ));
113        }
114        if left_keys.len() != right_keys.len() {
115            return Err(XlogError::Kernel(
116                "Left and right key columns must have same length".to_string(),
117            ));
118        }
119        for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
120            if left_idx >= left.arity() {
121                return Err(XlogError::Kernel(format!(
122                    "Left key column index {} out of bounds (arity {})",
123                    left_idx,
124                    left.arity()
125                )));
126            }
127            if right_idx >= right.arity() {
128                return Err(XlogError::Kernel(format!(
129                    "Right key column index {} out of bounds (arity {})",
130                    right_idx,
131                    right.arity()
132                )));
133            }
134        }
135
136        // Natural-join output: all left columns + right non-key columns.
137        let right_key_set: std::collections::HashSet<usize> = right_keys.iter().copied().collect();
138        let mut result_columns_schema = left.schema().columns.clone();
139        let mut result_sort_labels = left.schema().sort_labels().to_vec();
140        for (idx, col) in right.schema().columns.iter().enumerate() {
141            if !right_key_set.contains(&idx) {
142                result_columns_schema.push(col.clone());
143                result_sort_labels.push(
144                    right
145                        .schema()
146                        .column_sort_label(idx)
147                        .unwrap_or(&col.0)
148                        .to_string(),
149                );
150            }
151        }
152        let result_schema = Schema::new(result_columns_schema)
153            .with_sort_labels(result_sort_labels)
154            .expect("natural join sort labels match result schema arity");
155
156        // Handle empty inputs
157        if left.is_empty() || right.is_empty() {
158            return self.create_empty_buffer(result_schema);
159        }
160
161        // Delegate to the v2 implementation for correctness across key types and cardinalities.
162        let combined = self.hash_join_v2_with_limit(
163            left,
164            right,
165            left_keys,
166            right_keys,
167            JoinType::Inner,
168            Some(max_output_limit),
169        )?;
170
171        if combined.is_empty() {
172            return self.create_empty_buffer(result_schema);
173        }
174
175        let left_arity = left.arity();
176        let right_arity = right.arity();
177
178        let CudaBuffer {
179            columns: combined_columns,
180            row_cap,
181            d_num_rows,
182            schema: _,
183            ..
184        } = combined;
185
186        if combined_columns.len() != left_arity + right_arity {
187            return Err(XlogError::Kernel(format!(
188                "Join internal error: expected {} columns, got {}",
189                left_arity + right_arity,
190                combined_columns.len()
191            )));
192        }
193
194        let mut output_columns = Vec::with_capacity(result_schema.arity());
195        let mut it = combined_columns.into_iter();
196
197        // Left columns (all preserved)
198        for _ in 0..left_arity {
199            let col = it.next().ok_or_else(|| {
200                XlogError::Kernel("Join internal error: missing left columns".to_string())
201            })?;
202            output_columns.push(col);
203        }
204
205        // Right columns, excluding join keys
206        for (right_col_idx, col) in it.enumerate() {
207            if !right_key_set.contains(&right_col_idx) {
208                output_columns.push(col);
209            }
210        }
211
212        Ok(CudaBuffer::from_columns(
213            output_columns,
214            row_cap,
215            d_num_rows,
216            result_schema,
217        ))
218    }
219    /// Remove duplicate rows based on key columns
220    ///
221    /// Sorts the input by the provided key columns, then removes adjacent duplicates.
222    ///
223    /// # Arguments
224    /// * `input` - The input buffer
225    /// * `key_cols` - Column indices to use for duplicate detection
226    ///
227    /// # Returns
228    /// A buffer containing one row per duplicate-equivalence class
229    ///
230    /// # Errors
231    /// Returns `XlogError::Kernel` if kernel execution fails
232    pub fn dedup(&self, input: &CudaBuffer, key_cols: &[usize]) -> Result<CudaBuffer> {
233        if input.is_empty() {
234            return self.create_empty_buffer(input.schema().clone());
235        }
236
237        if key_cols.is_empty() {
238            if input.arity() == 0 {
239                // A 0-arity relation is either empty or {()}, and dedup collapses any
240                // non-empty multiplicity to a single empty tuple.
241                let rows = self.device_row_count(input)?;
242                if rows == 0 {
243                    return self.create_empty_buffer(input.schema().clone());
244                }
245                return self.buffer_from_columns(Vec::new(), 1, input.schema().clone());
246            }
247            return Err(XlogError::Kernel(
248                "Dedup requires at least one key column".to_string(),
249            ));
250        }
251
252        if Self::is_full_row_key(key_cols, input.arity()) && input.arity() > 1 {
253            return self.dedup_full_row_deterministic(input);
254        }
255
256        let sorted = self.sort(input, key_cols)?;
257        self.dedup_sorted(&sorted, key_cols)
258    }
259
260    /// Remove duplicate rows from a buffer that is already sorted by key columns
261    ///
262    /// This is an optimized version of `dedup` that skips the sorting step.
263    /// The caller must ensure the input is already sorted by the key columns.
264    ///
265    /// # Arguments
266    /// * `input` - The input buffer (must be sorted by key columns)
267    /// * `key_cols` - Column indices to use for duplicate detection
268    ///
269    /// # Returns
270    /// A buffer containing one row per duplicate-equivalence class
271    pub fn dedup_sorted(&self, input: &CudaBuffer, key_cols: &[usize]) -> Result<CudaBuffer> {
272        if input.is_empty() {
273            return self.create_empty_buffer(input.schema().clone());
274        }
275
276        if key_cols.is_empty() {
277            if input.arity() == 0 {
278                let rows = self.device_row_count(input)?;
279                if rows == 0 {
280                    return self.create_empty_buffer(input.schema().clone());
281                }
282                return self.buffer_from_columns(Vec::new(), 1, input.schema().clone());
283            }
284            return Err(XlogError::Kernel(
285                "Dedup requires at least one key column".to_string(),
286            ));
287        }
288
289        if Self::is_full_row_key(key_cols, input.arity()) && input.arity() > 1 {
290            return self.dedup_full_row_deterministic(input);
291        }
292
293        if input.num_rows() <= 1 {
294            return self.clone_buffer(input);
295        }
296
297        if input.num_rows() > u32::MAX as u64 {
298            return Err(XlogError::Kernel(format!(
299                "Dedup supports at most {} rows, got {}",
300                u32::MAX,
301                input.num_rows()
302            )));
303        }
304
305        // Use the module-level `scalar_type_code_dedup` so the host
306        // encoding stays in lockstep with the `XLOG_TY_*` defines in
307        // `kernels/dedup.cu`.
308        let scalar_type_code = scalar_type_code_dedup;
309
310        let device = self.device.inner();
311        let num_rows = input.num_rows() as u32;
312
313        let mut col_ptrs_host: Vec<u64> = Vec::with_capacity(key_cols.len());
314        let mut col_sizes_host: Vec<u32> = Vec::with_capacity(key_cols.len());
315        let mut col_types_host: Vec<u8> = Vec::with_capacity(key_cols.len());
316
317        for &key_col in key_cols {
318            if key_col >= input.arity() {
319                return Err(XlogError::Kernel(format!(
320                    "Key column {} out of bounds (arity {})",
321                    key_col,
322                    input.arity()
323                )));
324            }
325
326            let col = input
327                .column(key_col)
328                .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", key_col)))?;
329            let ty = input.schema().column_type(key_col).ok_or_else(|| {
330                XlogError::Kernel(format!("Key column {} type not found in schema", key_col))
331            })?;
332
333            let elem_size = ty.size_bytes();
334            let expected_bytes = (num_rows as usize) * elem_size;
335            if col.num_bytes() != expected_bytes {
336                return Err(XlogError::Kernel(format!(
337                    "Key column {} has {} bytes but expected {} (num_rows={}, elem_size={})",
338                    key_col,
339                    col.num_bytes(),
340                    expected_bytes,
341                    num_rows,
342                    elem_size
343                )));
344            }
345
346            let ptr = *col.device_ptr();
347            col_ptrs_host.push(ptr);
348            col_sizes_host.push(elem_size as u32);
349            col_types_host.push(scalar_type_code(ty));
350        }
351
352        let num_key_cols = key_cols.len() as u32;
353        let mut d_col_ptrs = self.memory.alloc::<u64>(key_cols.len())?;
354        let mut d_col_sizes = self.memory.alloc::<u32>(key_cols.len())?;
355        let mut d_col_types = self.memory.alloc::<u8>(key_cols.len())?;
356
357        self.htod_launch_metadata_sync_copy_into(&col_ptrs_host, &mut d_col_ptrs)
358            .map_err(|e| XlogError::Kernel(format!("Failed to upload key column ptrs: {}", e)))?;
359        self.htod_launch_metadata_sync_copy_into(&col_sizes_host, &mut d_col_sizes)
360            .map_err(|e| XlogError::Kernel(format!("Failed to upload key column sizes: {}", e)))?;
361        self.htod_launch_metadata_sync_copy_into(&col_types_host, &mut d_col_types)
362            .map_err(|e| XlogError::Kernel(format!("Failed to upload key column types: {}", e)))?;
363
364        let block_size = 256u32;
365        let num_blocks = num_rows.div_ceil(block_size);
366        let config = LaunchConfig {
367            grid_dim: (num_blocks, 1, 1),
368            block_dim: (block_size, 1, 1),
369            shared_mem_bytes: 0,
370        };
371
372        let d_unique_mask = self.memory.alloc::<u8>(num_rows as usize)?;
373        let d_prefix_sum = self.memory.alloc::<u32>(num_rows as usize)?;
374        let mut d_block_sums = self.memory.alloc::<u32>(num_blocks as usize)?;
375
376        let mark_and_scan_fn = device
377            .get_func(DEDUP_MODULE, dedup_kernels::MARK_UNIQUE_AND_SCAN_COLUMNAR)
378            .ok_or_else(|| {
379                XlogError::Kernel("mark_unique_and_scan_columnar kernel not found".to_string())
380            })?;
381
382        // SAFETY: mark_unique_and_scan_columnar(col_ptrs, col_sizes, col_types, num_key_cols,
383        //                                       num_rows_device, row_cap,
384        //                                       unique_mask, prefix_sum, block_sums)
385        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
386        unsafe {
387            mark_and_scan_fn.clone().launch(
388                config,
389                (
390                    &d_col_ptrs,
391                    &d_col_sizes,
392                    &d_col_types,
393                    num_key_cols,
394                    input.num_rows_device(),
395                    num_rows,
396                    &d_unique_mask,
397                    &d_prefix_sum,
398                    &d_block_sums,
399                ),
400            )
401        }
402        .map_err(|e| XlogError::Kernel(format!("mark_unique_and_scan_columnar failed: {}", e)))?;
403        self.device.synchronize()?;
404
405        if num_blocks > 1 {
406            self.multiblock_scan_u32_inplace(&mut d_block_sums, num_blocks)?;
407
408            let phase3_fn = device
409                .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE3)
410                .ok_or_else(|| {
411                    XlogError::Kernel("Failed to get multiblock_scan_phase3 kernel".to_string())
412                })?;
413
414            // SAFETY: multiblock_scan_phase3(uint32_t* prefix_sum, const uint32_t* block_offsets, uint32_t n)
415            unsafe {
416                phase3_fn.clone().launch(
417                    LaunchConfig {
418                        grid_dim: (num_blocks, 1, 1),
419                        block_dim: (block_size, 1, 1),
420                        shared_mem_bytes: 0,
421                    },
422                    (&d_prefix_sum, &d_block_sums, num_rows),
423                )
424            }
425            .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase3 failed: {}", e)))?;
426            self.device.synchronize()?;
427        }
428
429        self.device.synchronize()?;
430
431        let d_out_count = self.capture_compact_count(&d_prefix_sum, &d_unique_mask, num_rows)?;
432        self.compact_buffer_by_device_mask_device_count(
433            input,
434            &d_unique_mask,
435            &d_prefix_sum,
436            d_out_count,
437        )
438    }
439    /// Compute union of two buffers (GPU-native, deduped)
440    ///
441    /// # Arguments
442    /// * `a` - First buffer
443    /// * `b` - Second buffer
444    ///
445    /// # Returns
446    /// A buffer containing the deduplicated union of both inputs
447    ///
448    /// # Errors
449    /// Returns `XlogError::Kernel` if schemas don't match or operation fails
450    pub fn union(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
451        self.union_gpu(a, b)
452    }
453
454    fn concat_buffers_gpu(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
455        if !self.schemas_type_compatible(a.schema(), b.schema()) {
456            return Err(XlogError::Kernel(format!(
457                "Concat requires compatible schemas: {:?} vs {:?}",
458                a.schema(),
459                b.schema()
460            )));
461        }
462
463        let schema = a.schema().clone();
464        let a_rows = self.device_row_count(a)? as u64;
465        let b_rows = self.device_row_count(b)? as u64;
466
467        if a_rows == 0 && b_rows == 0 {
468            return self.create_empty_buffer(schema);
469        }
470        if a_rows == 0 {
471            return self.clone_buffer(b);
472        }
473        if b_rows == 0 {
474            return self.clone_buffer(a);
475        }
476
477        let total_rows = a_rows + b_rows;
478        if total_rows > u32::MAX as u64 {
479            return Err(XlogError::Kernel(format!(
480                "Concat supports at most {} rows, got {}",
481                u32::MAX,
482                total_rows
483            )));
484        }
485
486        let device = self.device.inner();
487        let concat_fn = device
488            .get_func(SET_OPS_MODULE, set_ops_kernels::CONCAT_BYTES)
489            .ok_or_else(|| XlogError::Kernel("concat_bytes kernel not found".to_string()))?;
490
491        let block_size = 256u32;
492
493        let a_rows = usize::try_from(a_rows)
494            .map_err(|_| XlogError::Kernel(format!("Concat: a has too many rows: {}", a_rows)))?;
495        let b_rows = usize::try_from(b_rows)
496            .map_err(|_| XlogError::Kernel(format!("Concat: b has too many rows: {}", b_rows)))?;
497
498        let mut result_columns = Vec::with_capacity(schema.arity());
499        for col_idx in 0..schema.arity() {
500            let elem_size = schema
501                .column_type(col_idx)
502                .map(|t| t.size_bytes())
503                .unwrap_or(4);
504
505            let a_bytes = a_rows
506                .checked_mul(elem_size)
507                .ok_or_else(|| XlogError::Kernel("Concat: a_bytes overflow".to_string()))?;
508            let b_bytes = b_rows
509                .checked_mul(elem_size)
510                .ok_or_else(|| XlogError::Kernel("Concat: b_bytes overflow".to_string()))?;
511            let total_bytes = a_bytes
512                .checked_add(b_bytes)
513                .ok_or_else(|| XlogError::Kernel("Concat: total_bytes overflow".to_string()))?;
514
515            let a_bytes_u32 = u32::try_from(a_bytes).map_err(|_| {
516                XlogError::Kernel(format!("Concat: a_bytes too large: {}", a_bytes))
517            })?;
518            let b_bytes_u32 = u32::try_from(b_bytes).map_err(|_| {
519                XlogError::Kernel(format!("Concat: b_bytes too large: {}", b_bytes))
520            })?;
521            let total_bytes_u32 = u32::try_from(total_bytes).map_err(|_| {
522                XlogError::Kernel(format!("Concat: total_bytes too large: {}", total_bytes))
523            })?;
524
525            let a_col = a
526                .column(col_idx)
527                .ok_or_else(|| XlogError::Kernel(format!("A column {} not found", col_idx)))?;
528            let b_col = b
529                .column(col_idx)
530                .ok_or_else(|| XlogError::Kernel(format!("B column {} not found", col_idx)))?;
531
532            let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
533
534            if total_bytes_u32 > 0 {
535                let grid_size = total_bytes_u32.div_ceil(block_size);
536                let config = LaunchConfig {
537                    grid_dim: (grid_size, 1, 1),
538                    block_dim: (block_size, 1, 1),
539                    shared_mem_bytes: 0,
540                };
541
542                // SAFETY: concat_bytes(const uint8_t* a, uint32_t a_bytes, const uint8_t* b, uint32_t b_bytes, uint8_t* output)
543                unsafe {
544                    concat_fn.clone().launch(
545                        config,
546                        (a_col, a_bytes_u32, b_col, b_bytes_u32, &mut out_col),
547                    )
548                }
549                .map_err(|e| XlogError::Kernel(format!("concat_bytes failed: {}", e)))?;
550            }
551
552            result_columns.push(out_col.into());
553        }
554
555        self.device.synchronize()?;
556
557        self.buffer_from_columns(result_columns, total_rows, schema)
558    }
559    /// Compute set difference (a - b)
560    ///
561    /// Returns rows from `a` that don't exist in `b`.
562    /// Uses hash-based approach: build hash table from b, probe with a.
563    ///
564    /// # Arguments
565    /// * `a` - Source buffer
566    /// * `b` - Buffer to subtract
567    ///
568    /// # Returns
569    /// A buffer containing rows in `a` but not in `b`
570    ///
571    /// # Errors
572    /// Returns `XlogError::Kernel` if schemas don't match or operation fails
573    pub fn diff(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
574        let num_a = self.device_row_count(a)?;
575        let num_b = self.device_row_count(b)?;
576        if num_a > u32::MAX as usize || num_b > u32::MAX as usize {
577            return Err(XlogError::Kernel(format!(
578                "Diff supports at most {} rows per side (a={}, b={})",
579                u32::MAX,
580                num_a,
581                num_b
582            )));
583        }
584
585        // Handle empty cases
586        if num_a == 0 {
587            return self.create_empty_buffer(a.schema().clone());
588        }
589        if num_b == 0 {
590            return self.clone_buffer(a);
591        }
592
593        // Verify schemas have compatible types (ignore column names for Datalog negation)
594        if !self.schemas_type_compatible(a.schema(), b.schema()) {
595            return Err(XlogError::Kernel(format!(
596                "Diff requires compatible schemas: {:?} vs {:?}",
597                a.schema(),
598                b.schema()
599            )));
600        }
601
602        // Use first column as key for hash-based diff
603        if a.arity() == 0 {
604            return Err(XlogError::Kernel(
605                "Diff requires at least one column".to_string(),
606            ));
607        }
608
609        let num_b = num_b as u32;
610        let num_a = num_a as u32;
611
612        // Build hash table from b
613        let hash_table_size = (num_b as usize * 2).max(1024) as u32;
614        let hash_table_alloc_size = (hash_table_size * 3) as usize;
615        let mut hash_table = self.memory.alloc::<u32>(hash_table_alloc_size)?;
616        let mut next_ptrs = self.memory.alloc::<u32>(num_b as usize)?;
617
618        // Initialize all hash table entries
619        let init_val = 0xFFFFFFFFu32;
620        self.device
621            .inner()
622            .htod_sync_copy_into(&vec![init_val; hash_table_alloc_size], &mut hash_table)
623            .map_err(|e| XlogError::Kernel(format!("Failed to init hash table: {}", e)))?;
624        self.device
625            .inner()
626            .htod_sync_copy_into(&vec![init_val; num_b as usize], &mut next_ptrs)
627            .map_err(|e| XlogError::Kernel(format!("Failed to init next pointers: {}", e)))?;
628
629        // Build phase with b's keys using transmute for direct GPU access
630        let build_func = self
631            .device
632            .inner()
633            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_BUILD)
634            .ok_or_else(|| XlogError::Kernel("hash_join_build kernel not found".to_string()))?;
635
636        let b_key_col = b
637            .column(0)
638            .ok_or_else(|| XlogError::Kernel("B key column not found".to_string()))?;
639        let b_keys_view = self.column_as_u32_view(b_key_col, num_b as usize)?;
640
641        let block_size = 256u32;
642        let build_grid = num_b.div_ceil(block_size);
643        let build_config = LaunchConfig {
644            grid_dim: (build_grid, 1, 1),
645            block_dim: (block_size, 1, 1),
646            shared_mem_bytes: 0,
647        };
648
649        // SAFETY: Kernel parameters match expected signature
650        unsafe {
651            build_func
652                .clone()
653                .launch(
654                    build_config,
655                    (
656                        &b_keys_view,
657                        &b_keys_view, // payload = key for diff
658                        num_b,
659                        &hash_table,
660                        &next_ptrs,
661                        hash_table_size,
662                    ),
663                )
664                .map_err(|e| XlogError::Kernel(format!("Build kernel failed: {}", e)))?;
665        }
666
667        // Synchronize and build lookup set for filtering
668        self.device.synchronize()?;
669
670        // Get a's keys using transmute
671        let a_key_col = a
672            .column(0)
673            .ok_or_else(|| XlogError::Kernel("A key column not found".to_string()))?;
674
675        // Read keys to host for filtering (set difference requires iterating)
676        let mut a_keys_host = vec![0u8; (num_a as usize) * 4];
677        self.dtoh_sync_copy_into_tracked(a_key_col, &mut a_keys_host)
678            .map_err(|e| XlogError::Kernel(format!("Failed to read a keys: {}", e)))?;
679
680        let mut b_keys_host = vec![0u8; (num_b as usize) * 4];
681        self.dtoh_sync_copy_into_tracked(b_key_col, &mut b_keys_host)
682            .map_err(|e| XlogError::Kernel(format!("Failed to read b keys: {}", e)))?;
683
684        // Build lookup set from b
685        let b_keys_set: std::collections::HashSet<u32> = b_keys_host
686            .chunks_exact(4)
687            .map(|chunk| u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
688            .collect();
689
690        // Find indices of a rows not in b
691        let diff_indices: Vec<usize> = a_keys_host
692            .chunks_exact(4)
693            .enumerate()
694            .map(|(i, chunk)| {
695                (
696                    i,
697                    u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]),
698                )
699            })
700            .filter(|(_, k)| !b_keys_set.contains(k))
701            .map(|(i, _)| i)
702            .collect();
703
704        let diff_count = diff_indices.len() as u64;
705
706        if diff_count == 0 {
707            return self.create_empty_buffer(a.schema().clone());
708        }
709
710        // Build result by selecting rows
711        let schema = a.schema().clone();
712        let mut result_columns = Vec::with_capacity(schema.arity());
713
714        for col_idx in 0..schema.arity() {
715            let col_type_size = schema
716                .column_type(col_idx)
717                .map(|t| t.size_bytes())
718                .unwrap_or(4);
719            let result_bytes = (diff_count as usize) * col_type_size;
720
721            if let Some(a_col) = a.column(col_idx) {
722                // Read column data
723                let a_col_bytes = (num_a as usize) * col_type_size;
724                let mut a_col_host = vec![0u8; a_col_bytes];
725                self.dtoh_sync_copy_into_tracked(a_col, &mut a_col_host)
726                    .map_err(|e| XlogError::Kernel(format!("Failed to read column: {}", e)))?;
727
728                // Select rows matching diff indices
729                let mut result_host = Vec::with_capacity(result_bytes);
730                for &idx in &diff_indices {
731                    let start = idx * col_type_size;
732                    let end = start + col_type_size;
733                    result_host.extend_from_slice(&a_col_host[start..end]);
734                }
735
736                // Upload result
737                let mut result_col = self.memory.alloc::<u8>(result_bytes)?;
738                self.device
739                    .inner()
740                    .htod_sync_copy_into(&result_host, &mut result_col)
741                    .map_err(|e| XlogError::Kernel(format!("Failed to upload result: {}", e)))?;
742
743                result_columns.push(result_col.into());
744            }
745        }
746
747        self.buffer_from_columns(result_columns, diff_count, schema)
748    }
749    // ============== GPU-Native Set Operations ==============
750
751    /// GPU-native union (no host roundtrip)
752    ///
753    /// Computes the union of two buffers entirely on the GPU using:
754    /// 1. Concatenate arrays using concat_u32 kernel
755    /// 2. Sort the concatenated result
756    /// 3. Deduplicate using existing dedup()
757    ///
758    /// # Arguments
759    /// * `a` - First buffer
760    /// * `b` - Second buffer
761    ///
762    /// # Returns
763    /// A buffer containing deduplicated union of both inputs, sorted
764    ///
765    /// # Errors
766    /// Returns `XlogError::Kernel` if schemas don't match or operation fails
767    pub fn union_gpu(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
768        // Verify schemas have compatible types (ignore column names for Datalog union).
769        if !self.schemas_type_compatible(a.schema(), b.schema()) {
770            return Err(XlogError::Kernel(format!(
771                "Union requires compatible schemas: {:?} vs {:?}",
772                a.schema(),
773                b.schema()
774            )));
775        }
776
777        let schema = a.schema().clone();
778        let a_rows = self.device_row_count(a)?;
779        let b_rows = self.device_row_count(b)?;
780        if schema.arity() == 0 {
781            // 0-arity set union: empty ∪ empty = empty; otherwise {()}.
782            if a_rows == 0 && b_rows == 0 {
783                return self.create_empty_buffer(schema);
784            }
785            return self.buffer_from_columns(Vec::new(), 1, schema);
786        }
787
788        let key_cols: Vec<usize> = (0..schema.arity()).collect();
789        if a_rows == 0 && b_rows == 0 {
790            return self.create_empty_buffer(schema);
791        }
792
793        // Set semantics require dedup even when one side is empty.
794        if a_rows == 0 {
795            return self.dedup(b, &key_cols);
796        }
797        if b_rows == 0 {
798            return self.dedup(a, &key_cols);
799        }
800
801        let concat = self.concat_buffers_gpu(a, b)?;
802        if Self::use_csm_cuda_graph_env()
803            && schema.arity() > 1
804            && a_rows.saturating_add(b_rows) <= SMALL_FULL_ROW_SORT_MAX_ROWS
805        {
806            return self.dedup_full_row_deterministic(&concat);
807        }
808
809        let sorted = self.sort(&concat, &key_cols)?;
810        self.dedup_sorted(&sorted, &key_cols)
811    }
812
813    /// Set difference (a - b) with deterministic set semantics.
814    ///
815    /// Single-column `u32` buffers use a GPU sorted-diff fast path. General
816    /// multi-column buffers use a byte-exact host set fallback after GPU dedup;
817    /// the hash anti-join implementation is intentionally not used for Datalog
818    /// delta subtraction because its unordered parallel probe path can leak
819    /// nondeterminism into recursive fixed-point convergence.
820    ///
821    /// # Arguments
822    /// * `a` - Source buffer
823    /// * `b` - Buffer to subtract
824    ///
825    /// # Returns
826    /// A buffer containing elements in a but not in b, sorted and deduped
827    ///
828    /// # Errors
829    /// Returns `XlogError::Kernel` if schemas don't match or operation fails
830    pub fn diff_gpu(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
831        let num_a = self.device_row_count(a)?;
832        let num_b = self.device_row_count(b)?;
833        if num_a > u32::MAX as usize || num_b > u32::MAX as usize {
834            return Err(XlogError::Kernel(format!(
835                "Diff supports at most {} rows per side (a={}, b={})",
836                u32::MAX,
837                num_a,
838                num_b
839            )));
840        }
841
842        if num_a == 0 {
843            return self.create_empty_buffer(a.schema().clone());
844        }
845
846        // Verify schemas have compatible types (ignore column names for Datalog negation)
847        if !self.schemas_type_compatible(a.schema(), b.schema()) {
848            return Err(XlogError::Kernel(format!(
849                "Diff requires compatible schemas: {:?} vs {:?}",
850                a.schema(),
851                b.schema()
852            )));
853        }
854
855        if a.arity() == 0 {
856            // 0-arity set difference: {()} - empty = {()}, {()} - {()} = empty.
857            if num_b == 0 {
858                return self.buffer_from_columns(Vec::new(), 1, a.schema().clone());
859            }
860            return self.create_empty_buffer(a.schema().clone());
861        }
862
863        let col_type = a
864            .schema()
865            .column_type(0)
866            .ok_or_else(|| XlogError::Kernel("No columns".to_string()))?;
867
868        // Keep the single-column U32 fast path; all other cases use the
869        // deterministic byte-exact set-difference fallback.
870        if a.arity() == 1 && matches!(col_type, ScalarType::U32) && num_b != 0 {
871            return self.diff_gpu_u32(a, b);
872        }
873
874        self.diff_via_deterministic_set(a, b)
875    }
876
    /// Single-column `u32` set difference (`a \ b`) computed on sorted inputs.
    ///
    /// Steps visible in this function:
    ///   1. Sort both inputs on column 0 and run `dedup_sorted` on each.
    ///   2. Launch `sorted_diff_mark` to fill a per-row `u8` mask over the
    ///      deduplicated `a` (the marking rule lives in the set-ops kernel;
    ///      presumably 1 = row of `a` that is absent from `b` — confirm
    ///      against the kernel source).
    ///   3. Prefix-scan the mask with `multiblock_scan_phase1`, plus a scan of
    ///      the per-block sums and `multiblock_scan_phase3` when more than one
    ///      block was launched.
    ///   4. Compact the deduplicated `a` through
    ///      `compact_buffer_by_device_mask_device_count`.
    ///
    /// Buffers whose arity is not 1 are forwarded to
    /// `diff_via_deterministic_set`.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when either deduplicated side exceeds
    /// `u32::MAX` rows, when a kernel or column cannot be found, or when a
    /// launch fails. Errors from the helper calls are propagated via `?`.
    fn diff_gpu_u32(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
        // Defensive guard: for multi-column buffers, use the deterministic
        // set difference on all columns instead of the column-0 fast path.
        if a.arity() != 1 {
            return self.diff_via_deterministic_set(a, b);
        }

        // Step 1: Sort and dedup both inputs (sorted, so use dedup_sorted)
        let sorted_a = self.sort(a, &[0])?;
        let deduped_a = self.dedup_sorted(&sorted_a, &[0])?;

        let sorted_b = self.sort(b, &[0])?;
        let deduped_b = self.dedup_sorted(&sorted_b, &[0])?;

        // Re-read the row counts after dedup; they are narrowed to u32 below,
        // so reject anything that would not fit.
        let num_a = self.device_row_count(&deduped_a)?;
        let num_b = self.device_row_count(&deduped_b)?;
        if num_a > u32::MAX as usize || num_b > u32::MAX as usize {
            return Err(XlogError::Kernel(format!(
                "Diff supports at most {} rows per side (a={}, b={})",
                u32::MAX,
                num_a,
                num_b
            )));
        }

        if num_a == 0 {
            return self.create_empty_buffer(a.schema().clone());
        }

        // Lossless: both counts were checked against u32::MAX above.
        let num_a = num_a as u32;
        let num_b = num_b as u32;

        // Step 2: Mark elements in a not in b using sorted_diff_mark kernel
        let diff_mark_fn = self
            .device
            .inner()
            .get_func(SET_OPS_MODULE, set_ops_kernels::SORTED_DIFF_MARK)
            .ok_or_else(|| XlogError::Kernel("sorted_diff_mark kernel not found".to_string()))?;

        // Get column data as u32 views
        let a_col = deduped_a
            .column(0)
            .ok_or_else(|| XlogError::Kernel("A column 0 not found".to_string()))?;
        let b_col = deduped_b
            .column(0)
            .ok_or_else(|| XlogError::Kernel("B column 0 not found".to_string()))?;

        let a_view = self.column_as_u32_view(a_col, num_a as usize)?;
        let b_view = self.column_as_u32_view(b_col, num_b as usize)?;

        // Allocate mask for diff marking (one byte per deduplicated row of a)
        let diff_mask = self.memory.alloc::<u8>(num_a as usize)?;

        // Launch diff mark kernel with enough 256-thread blocks to cover
        // all `num_a` rows.
        let block_size = 256u32;
        let grid_size = num_a.div_ceil(block_size);
        let config = LaunchConfig {
            grid_dim: (grid_size, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // SAFETY: Kernel signature matches:
        // sorted_diff_mark(a, a_len_device, a_cap, b, b_len_device, b_cap, in_diff)
        // The views and the mask were sized from `num_a` / `num_b` above.
        unsafe {
            diff_mark_fn.clone().launch(
                config,
                (
                    &a_view,
                    deduped_a.num_rows_device(),
                    num_a,
                    &b_view,
                    deduped_b.num_rows_device(),
                    num_b,
                    &diff_mask,
                ),
            )
        }
        .map_err(|e| XlogError::Kernel(format!("sorted_diff_mark failed: {}", e)))?;

        // Step 3: Compute prefix sum of diff mask on GPU. The scan reuses the
        // mark kernel's grid, with one block-sum slot per block.
        let device = self.device.inner();
        let num_blocks = grid_size;
        let d_prefix_sum = self.memory.alloc::<u32>(num_a as usize)?;
        let mut d_block_sums = self.memory.alloc::<u32>(num_blocks as usize)?;

        let phase1_fn = device
            .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE1)
            .ok_or_else(|| {
                XlogError::Kernel("Failed to get multiblock_scan_phase1 kernel".to_string())
            })?;

        // SAFETY: multiblock_scan_phase1(const uint8_t* mask, uint32_t* prefix_sum, uint32_t* block_sums, uint32_t n)
        unsafe {
            phase1_fn.clone().launch(
                LaunchConfig {
                    grid_dim: (num_blocks, 1, 1),
                    block_dim: (block_size, 1, 1),
                    shared_mem_bytes: 0,
                },
                (&diff_mask, &d_prefix_sum, &d_block_sums, num_a),
            )
        }
        .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase1 failed: {}", e)))?;

        // With a single block there are no cross-block offsets to apply, so
        // the block-sum scan and phase 3 are skipped.
        if num_blocks > 1 {
            self.multiblock_scan_u32_inplace(&mut d_block_sums, num_blocks)?;

            let phase3_fn = device
                .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE3)
                .ok_or_else(|| {
                    XlogError::Kernel("Failed to get multiblock_scan_phase3 kernel".to_string())
                })?;

            // SAFETY: multiblock_scan_phase3(uint32_t* prefix_sum, const uint32_t* block_offsets, uint32_t n)
            unsafe {
                phase3_fn.clone().launch(
                    LaunchConfig {
                        grid_dim: (num_blocks, 1, 1),
                        block_dim: (block_size, 1, 1),
                        shared_mem_bytes: 0,
                    },
                    (&d_prefix_sum, &d_block_sums, num_a),
                )
            }
            .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase3 failed: {}", e)))?;
        }

        // NOTE(review): presumably waits for the launches above to finish
        // before the count is captured — confirm against `synchronize`.
        self.device.synchronize()?;

        // Step 4: obtain the output row count from the scan results and
        // gather the surviving rows of the deduplicated `a`.
        let d_out_count = self.capture_compact_count(&d_prefix_sum, &diff_mask, num_a)?;
        self.compact_buffer_by_device_mask_device_count(
            &deduped_a,
            &diff_mask,
            &d_prefix_sum,
            d_out_count,
        )
    }
1016
    /// General-arity deterministic full-row set difference (a \ b) on the GPU.
    ///
    /// Pipeline:
    ///   1. Dedup both sides to set semantics via
    ///      `dedup_full_row_deterministic` (`a` and `b` may carry
    ///      duplicates from upstream union/concat steps).
    ///   2. Reuse the deduplicated `b` as the sorted probe target — the
    ///      dedup step already sorts by all columns, so no second sort
    ///      is issued here.
    ///   3. Per-row binary search of each `a` row against sorted `b` using
    ///      the same typed comparator — `mark_diff_full_row_typed_sorted`.
    ///   4. Multi-block exclusive scan on the keep mask.
    ///   5. Column-wise gather via the existing
    ///      `compact_buffer_by_device_mask_device_count` helper.
    ///
    /// The typed comparator agrees with the multi-column sort's order
    /// convention (signed-int sign-flip; float total-order normalization),
    /// so the binary search converges. Equality under the typed comparator
    /// is bytewise equality, which is the same set semantics used by the
    /// host-side `BTreeSet<Vec<u8>>` fallback this method replaces.
    ///
    /// Recursive Datalog evaluation relies on stable delta subtraction for
    /// fixpoint convergence. This path deliberately avoids the GPU hash
    /// anti-join (whose unordered parallel probe path can leak
    /// nondeterminism into recursive fixed-point convergence).
    ///
    /// This method does not check schema compatibility itself; the visible
    /// caller `diff_gpu` does so before dispatching here.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when a column, column type, or the diff
    /// kernel cannot be found, or when a metadata upload or the launch
    /// fails. Errors from the dedup, scan, and compaction helpers are
    /// propagated via `?`.
    fn diff_via_deterministic_set(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
        // Step 1: dedup both inputs to set semantics. Route through
        // `dedup_full_row_deterministic` regardless of arity so the
        // dedup step and the diff-probe typed comparator agree on
        // equality even for single-column float buffers — the legacy
        // `dedup` single-column kernel uses IEEE `==` (collapses
        // +0/-0), which would mismatch the totalOrder-key probe and
        // could drop one of {+0, -0} silently from the result.
        let deduped_a = self.dedup_full_row_deterministic(a)?;
        let deduped_b = self.dedup_full_row_deterministic(b)?;

        // The `as u32` casts cannot truncate: `dedup_full_row_deterministic`
        // rejects inputs with more than `u32::MAX` rows.
        let a_rows = self.device_row_count(&deduped_a)? as u32;
        let b_rows = self.device_row_count(&deduped_b)? as u32;
        if a_rows == 0 {
            return self.create_empty_buffer(a.schema().clone());
        }
        if b_rows == 0 {
            // Nothing to subtract: the deduplicated `a` is the answer.
            return Ok(deduped_a);
        }
        let arity = deduped_a.arity();
        if arity == 0 {
            // 0-arity: {()} - {()} = empty.
            return self.create_empty_buffer(a.schema().clone());
        }

        // Step 2: `dedup_full_row_deterministic` already returns `b`
        // sorted by the same typed multi-column sort the diff probe's
        // comparator agrees with, so reuse the deduplicated buffer
        // directly — re-sorting would double the cost on a hot path
        // (negation / delta subtraction).
        let sorted_b = deduped_b;

        // Step 3: build the typed per-column descriptor arrays for the
        // diff kernel — ptrs and sizes per column for both a and b, plus
        // the type code per column (used by the typed comparator).
        // Sizes and type codes are taken from `a`'s schema and applied to
        // both sides.
        let schema = deduped_a.schema().clone();
        let device = self.device.inner();

        let mut a_col_ptrs: Vec<u64> = Vec::with_capacity(arity);
        let mut b_col_ptrs: Vec<u64> = Vec::with_capacity(arity);
        let mut col_sizes: Vec<u32> = Vec::with_capacity(arity);
        let mut col_types: Vec<u8> = Vec::with_capacity(arity);
        for col_idx in 0..arity {
            let a_col = deduped_a.column(col_idx).ok_or_else(|| {
                XlogError::Kernel(format!("diff_full_row: a column {} missing", col_idx))
            })?;
            let b_col = sorted_b.column(col_idx).ok_or_else(|| {
                XlogError::Kernel(format!("diff_full_row: b column {} missing", col_idx))
            })?;
            let ty = schema.column_type(col_idx).ok_or_else(|| {
                XlogError::Kernel(format!("diff_full_row: column {} type missing", col_idx))
            })?;
            a_col_ptrs.push(*a_col.device_ptr());
            b_col_ptrs.push(*b_col.device_ptr());
            col_sizes.push(ty.size_bytes() as u32);
            col_types.push(scalar_type_code_dedup(ty));
        }

        // Upload the descriptor arrays so the kernel can read them from
        // device memory.
        let mut d_a_ptrs = self.memory.alloc::<u64>(arity)?;
        let mut d_b_ptrs = self.memory.alloc::<u64>(arity)?;
        let mut d_sizes = self.memory.alloc::<u32>(arity)?;
        let mut d_types = self.memory.alloc::<u8>(arity)?;
        self.htod_launch_metadata_sync_copy_into(&a_col_ptrs, &mut d_a_ptrs)
            .map_err(|e| XlogError::Kernel(format!("diff_full_row a ptr upload: {}", e)))?;
        self.htod_launch_metadata_sync_copy_into(&b_col_ptrs, &mut d_b_ptrs)
            .map_err(|e| XlogError::Kernel(format!("diff_full_row b ptr upload: {}", e)))?;
        self.htod_launch_metadata_sync_copy_into(&col_sizes, &mut d_sizes)
            .map_err(|e| XlogError::Kernel(format!("diff_full_row size upload: {}", e)))?;
        self.htod_launch_metadata_sync_copy_into(&col_types, &mut d_types)
            .map_err(|e| XlogError::Kernel(format!("diff_full_row type upload: {}", e)))?;

        // Enough 256-thread blocks to cover every deduplicated row of `a`.
        let block_size = 256u32;
        let grid = a_rows.div_ceil(block_size);
        let cfg = LaunchConfig {
            grid_dim: (grid, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // One mask byte per deduplicated row of `a`.
        let d_keep_mask = self.memory.alloc::<u8>(a_rows as usize)?;
        let diff_fn = device
            .get_func(DEDUP_MODULE, dedup_kernels::MARK_DIFF_FULL_ROW_TYPED_SORTED)
            .ok_or_else(|| {
                XlogError::Kernel("mark_diff_full_row_typed_sorted kernel not found".to_string())
            })?;
        // SAFETY: kernel signature matches:
        //   mark_diff_full_row_typed_sorted(a_col_ptrs, b_col_ptrs, col_sizes,
        //       col_types, num_cols, num_a_device, num_b, a_cap, keep_mask)
        unsafe {
            diff_fn.clone().launch(
                cfg,
                (
                    &d_a_ptrs,
                    &d_b_ptrs,
                    &d_sizes,
                    &d_types,
                    arity as u32,
                    deduped_a.num_rows_device(),
                    b_rows,
                    a_rows,
                    &d_keep_mask,
                ),
            )
        }
        .map_err(|e| XlogError::Kernel(format!("mark_diff_full_row_typed_sorted launch: {}", e)))?;
        self.device.synchronize()?;

        // Step 4 + 5: scan + column-wise gather of kept a rows.
        let (d_prefix_sum, d_out_count) =
            self.scan_mask_to_prefix_with_count(&d_keep_mask, a_rows)?;

        self.compact_buffer_by_device_mask_device_count(
            &deduped_a,
            &d_keep_mask,
            &d_prefix_sum,
            d_out_count,
        )
    }
1157
1158    /// Public deterministic full-row dedup with totalOrder-bytewise
1159    /// equality semantics for *all* arities (including single-column
1160    /// float buffers).
1161    ///
1162    /// Differs from `dedup(input, &[0])` for single-column float
1163    /// columns: the legacy single-column GPU kernel collapses +0/-0
1164    /// (IEEE `==` says they're equal) and treats two NaNs with
1165    /// different payloads as distinct. `dedup_full_row` instead uses
1166    /// totalOrder-bijective bytewise equality, so:
1167    ///
1168    ///   * `+0.0` and `-0.0` are distinct.
1169    ///   * Two NaNs collapse iff bit-identical.
1170    ///
1171    /// Routing today:
1172    ///   * `dedup(input, &all_cols)` with `arity > 1` routes to the
1173    ///     full-row pipeline (same semantics as this method).
1174    ///   * `dedup(input, &[0])` with `arity == 1` keeps the legacy
1175    ///     single-column GPU kernel — IEEE `==` for floats, so +0/-0
1176    ///     collapse and NaNs collapse iff bit-identical-or-IEEE-eq.
1177    ///   * `dedup_full_row(input)` always uses bytewise totalOrder
1178    ///     equality for *all* arities, so single-column float
1179    ///     callers must use this method explicitly to get the
1180    ///     totalOrder semantics.
1181    ///
1182    /// Multi-column callers that pass the all-columns key vector to
1183    /// `dedup` already route through the same deterministic full-row
1184    /// pipeline; single-column callers that want totalOrder semantics
1185    /// must call `dedup_full_row` directly.
1186    pub fn dedup_full_row(&self, input: &CudaBuffer) -> Result<CudaBuffer> {
1187        // Env-gated recorded dispatch. `dedup_full_row_recorded`
1188        // requires every column to be U32 / Symbol;
1189        // mixed-type schemas fall through to the legacy path.
1190        if Self::use_recorded_dedup_env() && input.num_rows() > 1 && input.arity() > 0 {
1191            if let Some(launch_stream) = self.recorded_op_stream_or_init() {
1192                let recorded_compatible = (0..input.arity()).all(|c| {
1193                    matches!(
1194                        input.schema.column_type(c),
1195                        Some(ScalarType::U32) | Some(ScalarType::Symbol)
1196                    )
1197                });
1198                if recorded_compatible {
1199                    return self.dedup_full_row_recorded(input, launch_stream);
1200                }
1201            }
1202        }
1203        self.dedup_full_row_deterministic(input)
1204    }
1205
1206    /// Public deterministic full-row set difference. Equivalent to
1207    /// `diff_gpu(a, b)` for the multi-column path but named explicitly so
1208    /// callers cannot mistake it for the older first-column-key `diff`.
1209    /// `a` and `b` must have type-compatible schemas.
1210    pub fn diff_full_row(&self, a: &CudaBuffer, b: &CudaBuffer) -> Result<CudaBuffer> {
1211        // Single-column types still go through `diff_gpu` so the existing
1212        // u32 fast path is preserved; the deterministic-set fallback is
1213        // now the GPU pipeline regardless.
1214        self.diff_gpu(a, b)
1215    }
1216
1217    /// Read a binary-join output-count scalar from device memory.
1218    ///
1219    /// **Why this is metadata, not data-plane:** the value is a single
1220    /// `u32` produced by an atomic-increment counter inside the join
1221    /// kernel, used solely to size the next allocation (in the
1222    /// count-only pass) or to drive the result buffer's logical row
1223    /// count (in the post-materialize pass). It is control-plane state
1224    /// in the same sense as a relation's row count — never tuple data.
1225    ///
1226    /// The strict deterministic-Datalog D2H gate explicitly allows this
1227    /// category via `dtoh_scalar_untracked`, which is the auditable,
1228    /// single-purpose API for metadata reads.
1229    ///
1230    /// The v0.5.5 metadata-read hardening replaced eight
1231    /// `dtoh_sync_copy_into_tracked` reads of these scalars with this
1232    /// helper, so binary-join materialization runs strict-clean.
1233    /// **This does not make binary-join materialization fully
1234    /// GPU-resident**: the count is still a host scalar, used by host
1235    /// code to drive allocation. A future worst-case-bounded
1236    /// GPU-resident output buffer can localize the upgrade here once a
1237    /// memory-budget-aware upper bound exists.
1238    fn read_join_output_count_metadata(&self, d_count: &TrackedCudaSlice<u32>) -> Result<u32> {
1239        // Avoid double-prefixing the error string. `XlogError` Display
1240        // already prefixes its variants (e.g. "Kernel error: ..."), so
1241        // wrapping the whole error would produce
1242        // "Kernel error: Failed to read output count: Kernel error: ...".
1243        // Extract the inner message for the common Kernel variant; fall
1244        // back to Display for everything else.
1245        self.dtoh_scalar_untracked::<u32>(d_count, 0)
1246            .map_err(|e| match e {
1247                XlogError::Kernel(message) => {
1248                    XlogError::Kernel(format!("Failed to read output count: {}", message))
1249                }
1250                other => XlogError::Kernel(format!("Failed to read output count: {}", other)),
1251            })
1252    }
1253
1254    fn is_full_row_key(key_cols: &[usize], arity: usize) -> bool {
1255        key_cols.len() == arity
1256            && key_cols
1257                .iter()
1258                .copied()
1259                .enumerate()
1260                .all(|(expected, actual)| expected == actual)
1261    }
1262
    /// Deterministic GPU full-row dedup pipeline.
    ///
    /// Pipeline: typed multi-column sort → bytewise per-column adjacent
    /// equality mask → multi-block exclusive prefix scan → column-wise
    /// gather (via `compact_buffer_by_device_mask_device_count`).
    ///
    /// Bytewise equality matches the host-fallback `BTreeSet<Vec<u8>>`
    /// semantics. For floats it agrees with IEEE-754 totalOrder equality
    /// under the project's `f{32,64}_to_ordered_u{32,64}` normalization
    /// (kernels/sort.cu): the normalization is bijective, so distinct bit
    /// patterns map to distinct ordered keys, so bytewise eq on the
    /// post-sort buffer is the same membership relation. +0/-0 stay
    /// distinct; two NaNs collapse iff bit-identical.
    ///
    /// Replaces the host-side `BTreeSet<Vec<u8>>` fallback that the
    /// strict deterministic-Datalog D2H gate flags as a violator.
    ///
    /// Shortcuts taken before any kernel is launched:
    ///   * 0 rows → empty buffer with the input schema.
    ///   * 1 row → a clone of the input.
    ///   * 0-arity with more than one row → the single-row set `{()}`.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when the input has more than `u32::MAX`
    /// rows, when a sorted column, its type, or the mark kernel cannot be
    /// found, or when a metadata upload or the launch fails. Errors from
    /// the sort, scan, and compaction helpers are propagated via `?`.
    fn dedup_full_row_deterministic(&self, input: &CudaBuffer) -> Result<CudaBuffer> {
        let row_count = self.device_row_count(input)?;
        if row_count == 0 {
            return self.create_empty_buffer(input.schema().clone());
        }
        if row_count == 1 {
            // A single row has no duplicates to remove.
            return self.clone_buffer(input);
        }
        // Row counts are narrowed to u32 for the kernels below.
        if row_count > u32::MAX as usize {
            return Err(XlogError::Kernel(format!(
                "dedup_full_row supports at most {} rows, got {}",
                u32::MAX,
                row_count
            )));
        }
        let arity = input.arity();
        if arity == 0 {
            // 0-arity, non-empty: collapse to {()}.
            return self.buffer_from_columns(Vec::new(), 1, input.schema().clone());
        }

        // Step 1: typed multi-column sort. Float columns use total-order
        // normalization; signed integers use sign-flipped unsigned compare.
        // Small inputs use the single-block sort when the CSM CUDA-graph
        // env gate is enabled; everything else uses the general `sort`.
        let sorted = if Self::use_csm_cuda_graph_env() && row_count <= SMALL_FULL_ROW_SORT_MAX_ROWS
        {
            self.small_sort_full_row_deterministic(input, row_count)?
        } else {
            let all_cols: Vec<usize> = (0..arity).collect();
            self.sort(input, &all_cols)?
        };

        // Step 2: bytewise adjacent-equality mask on the sorted buffer.
        let n = self.device_row_count(&sorted)? as u32;
        if n <= 1 {
            return Ok(sorted);
        }

        // Per-column device pointers and element sizes, uploaded so the
        // mark kernel can walk every column of a row.
        let device = self.device.inner();
        let mut col_ptrs_host: Vec<u64> = Vec::with_capacity(arity);
        let mut col_sizes_host: Vec<u32> = Vec::with_capacity(arity);
        for col_idx in 0..arity {
            let col = sorted
                .column(col_idx)
                .ok_or_else(|| XlogError::Kernel(format!("Sorted column {} not found", col_idx)))?;
            let ty = sorted.schema().column_type(col_idx).ok_or_else(|| {
                XlogError::Kernel(format!("Sorted column {} type missing", col_idx))
            })?;
            col_ptrs_host.push(*col.device_ptr());
            col_sizes_host.push(ty.size_bytes() as u32);
        }

        let mut d_col_ptrs = self.memory.alloc::<u64>(arity)?;
        let mut d_col_sizes = self.memory.alloc::<u32>(arity)?;
        self.htod_launch_metadata_sync_copy_into(&col_ptrs_host, &mut d_col_ptrs)
            .map_err(|e| XlogError::Kernel(format!("dedup_full_row_gpu col ptr upload: {}", e)))?;
        self.htod_launch_metadata_sync_copy_into(&col_sizes_host, &mut d_col_sizes)
            .map_err(|e| XlogError::Kernel(format!("dedup_full_row_gpu col size upload: {}", e)))?;

        // Enough 256-thread blocks to cover all `n` sorted rows.
        let block_size = 256u32;
        let grid = n.div_ceil(block_size);
        let cfg = LaunchConfig {
            grid_dim: (grid, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // One mask byte per sorted row.
        let d_unique_mask = self.memory.alloc::<u8>(n as usize)?;
        let mark_fn = device
            .get_func(DEDUP_MODULE, dedup_kernels::MARK_UNIQUE_FULL_ROW_BYTEWISE)
            .ok_or_else(|| {
                XlogError::Kernel("mark_unique_full_row_bytewise kernel not found".to_string())
            })?;

        // SAFETY: kernel signature matches:
        //   mark_unique_full_row_bytewise(col_ptrs, col_sizes, num_cols,
        //                                 num_rows_device, row_cap, unique_mask)
        unsafe {
            mark_fn.clone().launch(
                cfg,
                (
                    &d_col_ptrs,
                    &d_col_sizes,
                    arity as u32,
                    sorted.num_rows_device(),
                    n,
                    &d_unique_mask,
                ),
            )
        }
        .map_err(|e| {
            XlogError::Kernel(format!(
                "mark_unique_full_row_bytewise launch failed: {}",
                e
            ))
        })?;
        self.device.synchronize()?;

        // Step 3: exclusive prefix scan over the mask.
        let (d_prefix_sum, d_out_count) = self.scan_mask_to_prefix_with_count(&d_unique_mask, n)?;

        // Step 4: gather the kept rows using the existing column-wise
        // compaction helper. This reuses the same machinery the
        // existing GPU dedup_sorted typed-columnar path uses.
        self.compact_buffer_by_device_mask_device_count(
            &sorted,
            &d_unique_mask,
            &d_prefix_sum,
            d_out_count,
        )
    }
1389
    /// Sorts a small buffer by all of its columns using the
    /// `small_sort_full_row_indices_typed` kernel.
    ///
    /// The kernel is launched as a single block of
    /// `SMALL_FULL_ROW_SORT_MAX_ROWS` threads and writes a `u32` index per
    /// row into `d_indices`; the rows are then materialized with
    /// `gather_buffer_by_indices`. Each successful launch increments the
    /// `small_full_row_sort_invocations` counter.
    ///
    /// # Arguments
    /// * `input` - Buffer to sort
    /// * `row_count` - Row count supplied by the caller; it sizes the index
    ///   buffer and is passed to the kernel as the row cap
    ///
    /// # Returns
    /// An empty buffer for `row_count == 0`, a clone of `input` for
    /// `row_count == 1`, otherwise the gathered buffer.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when `row_count` exceeds
    /// `SMALL_FULL_ROW_SORT_MAX_ROWS`, when a column or its type is
    /// missing, when a column's byte length differs from
    /// `input.num_rows() * element size`, when the kernel cannot be found,
    /// or when a metadata upload or the launch fails.
    fn small_sort_full_row_deterministic(
        &self,
        input: &CudaBuffer,
        row_count: usize,
    ) -> Result<CudaBuffer> {
        // The launch below uses exactly one block, so larger inputs are rejected.
        if row_count > SMALL_FULL_ROW_SORT_MAX_ROWS {
            return Err(XlogError::Kernel(format!(
                "small full-row sort supports at most {} rows, got {}",
                SMALL_FULL_ROW_SORT_MAX_ROWS, row_count
            )));
        }
        if row_count == 0 {
            return self.create_empty_buffer(input.schema().clone());
        }
        if row_count == 1 {
            // A single row is already sorted.
            return self.clone_buffer(input);
        }

        // Collect per-column device pointer, element size, and type code,
        // validating each column's byte length along the way.
        let arity = input.arity();
        let device = self.device.inner();
        let mut col_ptrs_host: Vec<u64> = Vec::with_capacity(arity);
        let mut col_sizes_host: Vec<u32> = Vec::with_capacity(arity);
        let mut col_types_host: Vec<u8> = Vec::with_capacity(arity);
        for col_idx in 0..arity {
            let col = input.column(col_idx).ok_or_else(|| {
                XlogError::Kernel(format!("small full-row sort: column {} missing", col_idx))
            })?;
            let ty = input.schema().column_type(col_idx).ok_or_else(|| {
                XlogError::Kernel(format!(
                    "small full-row sort: column {} type missing",
                    col_idx
                ))
            })?;
            let elem_size = ty.size_bytes();
            // Expected column length in bytes, computed with overflow checks.
            let expected_bytes_u64 =
                input
                    .num_rows()
                    .checked_mul(elem_size as u64)
                    .ok_or_else(|| {
                        XlogError::Kernel(
                            "small full-row sort: column byte-size overflow".to_string(),
                        )
                    })?;
            let expected_bytes = usize::try_from(expected_bytes_u64).map_err(|_| {
                XlogError::Kernel(format!(
                    "small full-row sort: expected byte size {} exceeds usize::MAX",
                    expected_bytes_u64
                ))
            })?;
            // Refuse columns whose storage does not match the row count.
            if col.num_bytes() != expected_bytes {
                return Err(XlogError::Kernel(format!(
                    "small full-row sort: column {} has {} bytes but expected {}",
                    col_idx,
                    col.num_bytes(),
                    expected_bytes
                )));
            }
            col_ptrs_host.push(*col.device_ptr());
            col_sizes_host.push(elem_size as u32);
            col_types_host.push(scalar_type_code_dedup(ty));
        }

        // Upload the descriptor arrays for the kernel.
        let mut d_col_ptrs = self.memory.alloc::<u64>(arity)?;
        let mut d_col_sizes = self.memory.alloc::<u32>(arity)?;
        let mut d_col_types = self.memory.alloc::<u8>(arity)?;
        self.htod_launch_metadata_sync_copy_into(&col_ptrs_host, &mut d_col_ptrs)
            .map_err(|e| XlogError::Kernel(format!("small full-row sort ptr upload: {}", e)))?;
        self.htod_launch_metadata_sync_copy_into(&col_sizes_host, &mut d_col_sizes)
            .map_err(|e| XlogError::Kernel(format!("small full-row sort size upload: {}", e)))?;
        self.htod_launch_metadata_sync_copy_into(&col_types_host, &mut d_col_types)
            .map_err(|e| XlogError::Kernel(format!("small full-row sort type upload: {}", e)))?;

        // Output of the kernel: one u32 index per row.
        let mut d_indices = self.memory.alloc::<u32>(row_count)?;
        let sort_fn = device
            .get_func(
                DEDUP_MODULE,
                dedup_kernels::SMALL_SORT_FULL_ROW_INDICES_TYPED,
            )
            .ok_or_else(|| {
                XlogError::Kernel("small_sort_full_row_indices_typed kernel not found".to_string())
            })?;
        // Single block sized to the maximum supported row count.
        let cfg = LaunchConfig {
            grid_dim: (1, 1, 1),
            block_dim: (SMALL_FULL_ROW_SORT_MAX_ROWS as u32, 1, 1),
            shared_mem_bytes: 0,
        };

        // SAFETY: kernel signature matches:
        //   small_sort_full_row_indices_typed(col_ptrs, col_sizes, col_types,
        //       num_cols, num_rows_device, row_cap, out_indices)
        unsafe {
            sort_fn.clone().launch(
                cfg,
                (
                    &d_col_ptrs,
                    &d_col_sizes,
                    &d_col_types,
                    arity as u32,
                    input.num_rows_device(),
                    row_count as u32,
                    &mut d_indices,
                ),
            )
        }
        .map_err(|e| {
            XlogError::Kernel(format!(
                "small_sort_full_row_indices_typed launch failed: {}",
                e
            ))
        })?;
        self.device.synchronize()?;
        // Count completed small-sort launches.
        self.small_full_row_sort_invocations
            .fetch_add(1, Ordering::Relaxed);

        // Materialize the rows in the order given by the computed indices.
        self.gather_buffer_by_indices(input, &d_indices, row_count as u32)
    }
1506
1507    /// Run the multi-block exclusive-scan pipeline on a u8 mask of length
1508    /// `n` and return the per-row prefix-sum buffer plus a device-resident
1509    /// scalar with the total number of marked rows. Mirrors the helper
1510    /// pattern already used by `diff_gpu_u32` and `dedup_sorted`.
1511    fn scan_mask_to_prefix_with_count(
1512        &self,
1513        d_mask: &cudarc::driver::CudaSlice<u8>,
1514        n: u32,
1515    ) -> Result<(
1516        crate::memory::TrackedCudaSlice<u32>,
1517        crate::memory::TrackedCudaSlice<u32>,
1518    )> {
1519        let device = self.device.inner();
1520        let block_size = 256u32;
1521        let num_blocks = n.div_ceil(block_size);
1522
1523        let d_prefix_sum = self.memory.alloc::<u32>(n as usize)?;
1524        let mut d_block_sums = self.memory.alloc::<u32>(num_blocks as usize)?;
1525
1526        let phase1_fn = device
1527            .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE1)
1528            .ok_or_else(|| {
1529                XlogError::Kernel("Failed to get multiblock_scan_phase1 kernel".to_string())
1530            })?;
1531        // SAFETY: kernel signature matches multiblock_scan_phase1.
1532        unsafe {
1533            phase1_fn.clone().launch(
1534                LaunchConfig {
1535                    grid_dim: (num_blocks, 1, 1),
1536                    block_dim: (block_size, 1, 1),
1537                    shared_mem_bytes: 0,
1538                },
1539                (d_mask, &d_prefix_sum, &d_block_sums, n),
1540            )
1541        }
1542        .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase1 failed: {}", e)))?;
1543
1544        if num_blocks > 1 {
1545            self.multiblock_scan_u32_inplace(&mut d_block_sums, num_blocks)?;
1546
1547            let phase3_fn = device
1548                .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE3)
1549                .ok_or_else(|| {
1550                    XlogError::Kernel("Failed to get multiblock_scan_phase3 kernel".to_string())
1551                })?;
1552            // SAFETY: kernel signature matches multiblock_scan_phase3.
1553            unsafe {
1554                phase3_fn.clone().launch(
1555                    LaunchConfig {
1556                        grid_dim: (num_blocks, 1, 1),
1557                        block_dim: (block_size, 1, 1),
1558                        shared_mem_bytes: 0,
1559                    },
1560                    (&d_prefix_sum, &d_block_sums, n),
1561                )
1562            }
1563            .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase3 failed: {}", e)))?;
1564        }
1565        self.device.synchronize()?;
1566
1567        let d_out_count = self.capture_compact_count(&d_prefix_sum, d_mask, n)?;
1568        Ok((d_prefix_sum, d_out_count))
1569    }
1570
1571    // ============== Sort Methods ==============
1572
    /// Threads per block used for the launch configurations of the sort,
    /// radix-sort, and gather helpers below; their grid sizes are derived from
    /// it with `div_ceil`.
    pub(super) const SORT_BLOCK_SIZE: u32 = 256;
1574
1575    /// Sort buffer by key columns.
1576    ///
1577    /// Computes a stable row permutation on the GPU (supports multi-column and all scalar types),
1578    /// then applies the permutation on the GPU to reorder all columns.
1579    ///
1580    /// # Arguments
1581    /// * `input` - The input buffer to sort
1582    /// * `key_cols` - Column indices to use for sorting (lexicographic, first key is most significant)
1583    ///
1584    /// # Returns
1585    /// A new buffer with rows sorted by the key columns
1586    ///
1587    /// # Errors
1588    /// Returns `XlogError::Kernel` if:
1589    /// - `key_cols` is empty or out of bounds
1590    /// - Input has more than `u32::MAX` rows
1591    /// - Download/upload or kernel execution fails
1592    pub fn sort(&self, input: &CudaBuffer, key_cols: &[usize]) -> Result<CudaBuffer> {
1593        // Env-gated recorded dispatch. Eligibility check
1594        // mirrors `sort_recorded`'s validation:
1595        // U32 / Symbol key columns only. Other types fall
1596        // through to the legacy multi-type path.
1597        if Self::use_recorded_sort_env() && !key_cols.is_empty() && input.num_rows() > 0 {
1598            if let Some(launch_stream) = self.recorded_op_stream_or_init() {
1599                let recorded_compatible = key_cols.iter().all(|&k| {
1600                    matches!(
1601                        input.schema.column_type(k),
1602                        Some(ScalarType::U32) | Some(ScalarType::Symbol)
1603                    )
1604                });
1605                if recorded_compatible {
1606                    return self.sort_recorded(input, key_cols, launch_stream);
1607                }
1608            }
1609        }
1610
1611        if input.num_rows() == 0 {
1612            return self.create_empty_buffer(input.schema.clone());
1613        }
1614
1615        if key_cols.is_empty() {
1616            return Err(XlogError::Kernel(
1617                "Sort requires at least one key column".to_string(),
1618            ));
1619        }
1620
1621        if input.num_rows() > u32::MAX as u64 {
1622            return Err(XlogError::Kernel(format!(
1623                "Sort supports at most {} rows, got {}",
1624                u32::MAX,
1625                input.num_rows()
1626            )));
1627        }
1628
1629        for &key_col in key_cols {
1630            if key_col >= input.arity() {
1631                return Err(XlogError::Kernel(format!(
1632                    "Key column index {} out of bounds (arity {})",
1633                    key_col,
1634                    input.arity()
1635                )));
1636            }
1637        }
1638
1639        let n = input.num_rows() as u32;
1640        let d_num_rows = input.num_rows_device();
1641        let device = self.device.inner();
1642
1643        let block_size = Self::SORT_BLOCK_SIZE;
1644        let grid_size = n.div_ceil(block_size);
1645        let launch_config = LaunchConfig {
1646            grid_dim: (grid_size, 1, 1),
1647            block_dim: (block_size, 1, 1),
1648            shared_mem_bytes: 0,
1649        };
1650
1651        // Allocate and initialize identity permutation.
1652        let init_fn = device
1653            .get_func(SORT_MODULE, sort_kernels::INIT_INDICES)
1654            .ok_or_else(|| XlogError::Kernel("init_indices kernel not found".to_string()))?;
1655
1656        let mut indices_a = self.memory.alloc::<u32>(n as usize)?;
1657        let mut indices_b = self.memory.alloc::<u32>(n as usize)?;
1658
1659        // SAFETY: init_indices(indices, num_rows_device, row_cap)
1660        unsafe {
1661            init_fn
1662                .clone()
1663                .launch(launch_config, (&mut indices_a, d_num_rows, n))
1664        }
1665        .map_err(|e| XlogError::Kernel(format!("init_indices failed: {}", e)))?;
1666        self.device.synchronize()?;
1667
1668        // Working key buffers (u32 words).
1669        let mut keys_a = self.memory.alloc::<u32>(n as usize)?;
1670        let mut keys_b = self.memory.alloc::<u32>(n as usize)?;
1671
1672        // Radix-sort scratch.
1673        let mut d_hist = self.memory.alloc::<u32>((grid_size as usize) * 16)?;
1674        let mut d_prefix = self.memory.alloc::<u32>(16)?;
1675        let mut d_ranks = self.memory.alloc::<u32>(n as usize)?;
1676
1677        // Process key columns from least-significant to most-significant (stable LSD).
1678        for &col_idx in key_cols.iter().rev() {
1679            let ty = input.schema.column_type(col_idx).ok_or_else(|| {
1680                XlogError::Kernel(format!("Key column {} type not found in schema", col_idx))
1681            })?;
1682
1683            let col = input
1684                .column(col_idx)
1685                .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", col_idx)))?;
1686
1687            match ty {
1688                ScalarType::U32 | ScalarType::Symbol => {
1689                    let col_view = self.column_as_u32_view(col, n as usize)?;
1690                    let gather_fn = device
1691                        .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_U32)
1692                        .ok_or_else(|| {
1693                            XlogError::Kernel("apply_permutation_u32 kernel not found".to_string())
1694                        })?;
1695
1696                    // SAFETY: apply_permutation_u32(input, output, permutation, num_rows_device, row_cap)
1697                    unsafe {
1698                        gather_fn.clone().launch(
1699                            launch_config,
1700                            (&col_view, &mut keys_a, &indices_a, d_num_rows, n),
1701                        )
1702                    }
1703                    .map_err(|e| {
1704                        XlogError::Kernel(format!("apply_permutation_u32 failed: {}", e))
1705                    })?;
1706
1707                    self.radix_sort_u32_pairs_with_scratch(
1708                        &mut keys_a,
1709                        &mut keys_b,
1710                        &mut indices_a,
1711                        &mut indices_b,
1712                        &mut d_hist,
1713                        &mut d_prefix,
1714                        &mut d_ranks,
1715                        d_num_rows,
1716                        n,
1717                    )?;
1718                }
1719                ScalarType::I32 => {
1720                    let col_bits = self.column_as_u32_view(col, n as usize)?;
1721                    let gather_fn = device
1722                        .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_I32_ORDERED_U32)
1723                        .ok_or_else(|| {
1724                            XlogError::Kernel(
1725                                "gather_keys_i32_ordered_u32 kernel not found".to_string(),
1726                            )
1727                        })?;
1728
1729                    // SAFETY: gather_keys_i32_ordered_u32(i32_bits, permutation, num_rows_device, row_cap, out_keys)
1730                    unsafe {
1731                        gather_fn.clone().launch(
1732                            launch_config,
1733                            (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
1734                        )
1735                    }
1736                    .map_err(|e| {
1737                        XlogError::Kernel(format!("gather_keys_i32_ordered_u32 failed: {}", e))
1738                    })?;
1739
1740                    self.radix_sort_u32_pairs_with_scratch(
1741                        &mut keys_a,
1742                        &mut keys_b,
1743                        &mut indices_a,
1744                        &mut indices_b,
1745                        &mut d_hist,
1746                        &mut d_prefix,
1747                        &mut d_ranks,
1748                        d_num_rows,
1749                        n,
1750                    )?;
1751                }
1752                ScalarType::F32 => {
1753                    let col_bits = self.column_as_u32_view(col, n as usize)?;
1754                    let gather_fn = device
1755                        .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_F32_ORDERED_U32)
1756                        .ok_or_else(|| {
1757                            XlogError::Kernel(
1758                                "gather_keys_f32_ordered_u32 kernel not found".to_string(),
1759                            )
1760                        })?;
1761
1762                    // SAFETY: gather_keys_f32_ordered_u32(f32_bits, permutation, num_rows_device, row_cap, out_keys)
1763                    unsafe {
1764                        gather_fn.clone().launch(
1765                            launch_config,
1766                            (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
1767                        )
1768                    }
1769                    .map_err(|e| {
1770                        XlogError::Kernel(format!("gather_keys_f32_ordered_u32 failed: {}", e))
1771                    })?;
1772
1773                    self.radix_sort_u32_pairs_with_scratch(
1774                        &mut keys_a,
1775                        &mut keys_b,
1776                        &mut indices_a,
1777                        &mut indices_b,
1778                        &mut d_hist,
1779                        &mut d_prefix,
1780                        &mut d_ranks,
1781                        d_num_rows,
1782                        n,
1783                    )?;
1784                }
1785                ScalarType::Bool => {
1786                    if col.num_bytes() < n as usize {
1787                        return Err(XlogError::Kernel(format!(
1788                            "Bool column {} has {} bytes but expected {}",
1789                            col_idx,
1790                            col.num_bytes(),
1791                            n
1792                        )));
1793                    }
1794
1795                    let gather_fn = device
1796                        .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_BOOL_ORDERED_U32)
1797                        .ok_or_else(|| {
1798                            XlogError::Kernel(
1799                                "gather_keys_bool_ordered_u32 kernel not found".to_string(),
1800                            )
1801                        })?;
1802
1803                    // SAFETY: gather_keys_bool_ordered_u32(bools, permutation, num_rows_device, row_cap, out_keys)
1804                    unsafe {
1805                        gather_fn
1806                            .clone()
1807                            .launch(launch_config, (col, &indices_a, d_num_rows, n, &mut keys_a))
1808                    }
1809                    .map_err(|e| {
1810                        XlogError::Kernel(format!("gather_keys_bool_ordered_u32 failed: {}", e))
1811                    })?;
1812
1813                    self.radix_sort_u32_pairs_with_scratch(
1814                        &mut keys_a,
1815                        &mut keys_b,
1816                        &mut indices_a,
1817                        &mut indices_b,
1818                        &mut d_hist,
1819                        &mut d_prefix,
1820                        &mut d_ranks,
1821                        d_num_rows,
1822                        n,
1823                    )?;
1824                }
1825                ScalarType::U64 => {
1826                    let col_bits = self.column_as_u64_view(col, n as usize)?;
1827                    for &word in &[
1828                        sort_kernels::GATHER_KEYS_U64_LO_U32,
1829                        sort_kernels::GATHER_KEYS_U64_HI_U32,
1830                    ] {
1831                        let gather_fn = device.get_func(SORT_MODULE, word).ok_or_else(|| {
1832                            XlogError::Kernel(format!("{} kernel not found", word))
1833                        })?;
1834
1835                        // SAFETY: gather_keys_u64_*_u32(vals, permutation, num_rows_device, row_cap, out_keys)
1836                        unsafe {
1837                            gather_fn.clone().launch(
1838                                launch_config,
1839                                (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
1840                            )
1841                        }
1842                        .map_err(|e| XlogError::Kernel(format!("{} failed: {}", word, e)))?;
1843
1844                        self.radix_sort_u32_pairs_with_scratch(
1845                            &mut keys_a,
1846                            &mut keys_b,
1847                            &mut indices_a,
1848                            &mut indices_b,
1849                            &mut d_hist,
1850                            &mut d_prefix,
1851                            &mut d_ranks,
1852                            d_num_rows,
1853                            n,
1854                        )?;
1855                    }
1856                }
1857                ScalarType::I64 => {
1858                    let col_bits = self.column_as_u64_view(col, n as usize)?;
1859                    for &word in &[
1860                        sort_kernels::GATHER_KEYS_I64_LO_U32,
1861                        sort_kernels::GATHER_KEYS_I64_HI_U32,
1862                    ] {
1863                        let gather_fn = device.get_func(SORT_MODULE, word).ok_or_else(|| {
1864                            XlogError::Kernel(format!("{} kernel not found", word))
1865                        })?;
1866
1867                        // SAFETY: gather_keys_i64_*_u32(i64_bits, permutation, num_rows_device, row_cap, out_keys)
1868                        unsafe {
1869                            gather_fn.clone().launch(
1870                                launch_config,
1871                                (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
1872                            )
1873                        }
1874                        .map_err(|e| XlogError::Kernel(format!("{} failed: {}", word, e)))?;
1875
1876                        self.radix_sort_u32_pairs_with_scratch(
1877                            &mut keys_a,
1878                            &mut keys_b,
1879                            &mut indices_a,
1880                            &mut indices_b,
1881                            &mut d_hist,
1882                            &mut d_prefix,
1883                            &mut d_ranks,
1884                            d_num_rows,
1885                            n,
1886                        )?;
1887                    }
1888                }
1889                ScalarType::F64 => {
1890                    let col_bits = self.column_as_u64_view(col, n as usize)?;
1891                    for &word in &[
1892                        sort_kernels::GATHER_KEYS_F64_LO_U32,
1893                        sort_kernels::GATHER_KEYS_F64_HI_U32,
1894                    ] {
1895                        let gather_fn = device.get_func(SORT_MODULE, word).ok_or_else(|| {
1896                            XlogError::Kernel(format!("{} kernel not found", word))
1897                        })?;
1898
1899                        // SAFETY: gather_keys_f64_*_u32(f64_bits, permutation, num_rows_device, row_cap, out_keys)
1900                        unsafe {
1901                            gather_fn.clone().launch(
1902                                launch_config,
1903                                (&col_bits, &indices_a, d_num_rows, n, &mut keys_a),
1904                            )
1905                        }
1906                        .map_err(|e| XlogError::Kernel(format!("{} failed: {}", word, e)))?;
1907
1908                        self.radix_sort_u32_pairs_with_scratch(
1909                            &mut keys_a,
1910                            &mut keys_b,
1911                            &mut indices_a,
1912                            &mut indices_b,
1913                            &mut d_hist,
1914                            &mut d_prefix,
1915                            &mut d_ranks,
1916                            d_num_rows,
1917                            n,
1918                        )?;
1919                    }
1920                }
1921            }
1922        }
1923
1924        self.apply_permutation_gpu(input, &indices_a)
1925    }
1926
1927    #[allow(clippy::too_many_arguments)]
1928    fn radix_sort_u32_pairs_with_scratch(
1929        &self,
1930        keys_a: &mut crate::memory::TrackedCudaSlice<u32>,
1931        keys_b: &mut crate::memory::TrackedCudaSlice<u32>,
1932        indices_a: &mut crate::memory::TrackedCudaSlice<u32>,
1933        indices_b: &mut crate::memory::TrackedCudaSlice<u32>,
1934        hist: &mut crate::memory::TrackedCudaSlice<u32>,
1935        prefix: &mut crate::memory::TrackedCudaSlice<u32>,
1936        ranks: &mut crate::memory::TrackedCudaSlice<u32>,
1937        num_rows_device: &crate::memory::TrackedCudaSlice<u32>,
1938        row_cap: u32,
1939    ) -> Result<()> {
1940        if row_cap == 0 {
1941            return Ok(());
1942        }
1943        self.device.synchronize()?;
1944
1945        let device = self.device.inner();
1946        let block_size = Self::SORT_BLOCK_SIZE;
1947        let grid_size = row_cap.div_ceil(block_size);
1948
1949        let sort_config = LaunchConfig {
1950            grid_dim: (grid_size, 1, 1),
1951            block_dim: (block_size, 1, 1),
1952            shared_mem_bytes: 0,
1953        };
1954
1955        let histogram_fn = device
1956            .get_func(SORT_MODULE, sort_kernels::RADIX_HISTOGRAM)
1957            .ok_or_else(|| XlogError::Kernel("radix_histogram kernel not found".to_string()))?;
1958        let prefix_fn = device
1959            .get_func(SORT_MODULE, sort_kernels::COMPUTE_DIGIT_PREFIX_SUMS)
1960            .ok_or_else(|| {
1961                XlogError::Kernel("compute_digit_prefix_sums kernel not found".to_string())
1962            })?;
1963        let ranks_fn = device
1964            .get_func(SORT_MODULE, sort_kernels::COMPUTE_RANKS)
1965            .ok_or_else(|| XlogError::Kernel("compute_ranks kernel not found".to_string()))?;
1966        let scatter_fn = device
1967            .get_func(SORT_MODULE, sort_kernels::RADIX_SCATTER_STABLE)
1968            .ok_or_else(|| {
1969                XlogError::Kernel("radix_scatter_stable kernel not found".to_string())
1970            })?;
1971
1972        let prefix_config = LaunchConfig {
1973            grid_dim: (1, 1, 1),
1974            block_dim: (256, 1, 1),
1975            shared_mem_bytes: 0,
1976        };
1977
1978        let mut in_a = true;
1979        for pass in 0..8u32 {
1980            let shift = pass * 4;
1981
1982            let (keys_in, indices_in, keys_out, indices_out) = if in_a {
1983                (&*keys_a, &*indices_a, &mut *keys_b, &mut *indices_b)
1984            } else {
1985                (&*keys_b, &*indices_b, &mut *keys_a, &mut *indices_a)
1986            };
1987
1988            // Histogram (digit-major): hist[digit * grid_size + block] = count
1989            // SAFETY: radix_histogram(keys, num_rows_device, row_cap, histograms, shift)
1990            unsafe {
1991                histogram_fn.clone().launch(
1992                    sort_config,
1993                    (keys_in, num_rows_device, row_cap, &mut *hist, shift),
1994                )
1995            }
1996            .map_err(|e| XlogError::Kernel(format!("radix_histogram failed: {}", e)))?;
1997            self.device.synchronize()?;
1998
1999            // Compute global digit prefix sums.
2000            // SAFETY: compute_digit_prefix_sums(histograms, grid_size, prefix_sums)
2001            unsafe {
2002                prefix_fn
2003                    .clone()
2004                    .launch(prefix_config, (&*hist, grid_size, &mut *prefix))
2005            }
2006            .map_err(|e| XlogError::Kernel(format!("compute_digit_prefix_sums failed: {}", e)))?;
2007            self.device.synchronize()?;
2008
2009            // Convert per-block histograms to per-block exclusive offsets (in-place scan per digit).
2010            for digit in 0..16u32 {
2011                let start = (digit * grid_size) as usize;
2012                let end = start + (grid_size as usize);
2013                let mut digit_slice = hist.slice_mut(start..end);
2014                self.multiblock_scan_u32_view_inplace(&mut digit_slice, grid_size)?;
2015            }
2016            self.device.synchronize()?;
2017
2018            // Compute per-element ranks for stability.
2019            // SAFETY: compute_ranks(keys, num_rows_device, row_cap, ranks, shift)
2020            unsafe {
2021                ranks_fn.clone().launch(
2022                    sort_config,
2023                    (keys_in, num_rows_device, row_cap, &mut *ranks, shift),
2024                )
2025            }
2026            .map_err(|e| XlogError::Kernel(format!("compute_ranks failed: {}", e)))?;
2027            self.device.synchronize()?;
2028
2029            // Stable scatter using digit prefix + per-block offsets + ranks.
2030            // SAFETY: radix_scatter_stable(keys_in, indices_in, ranks, keys_out, indices_out, prefix_sums, block_offsets, num_rows_device, row_cap, shift)
2031            unsafe {
2032                scatter_fn.clone().launch(
2033                    sort_config,
2034                    (
2035                        keys_in,
2036                        indices_in,
2037                        &*ranks,
2038                        keys_out,
2039                        indices_out,
2040                        &*prefix,
2041                        &*hist,
2042                        num_rows_device,
2043                        row_cap,
2044                        shift,
2045                    ),
2046                )
2047            }
2048            .map_err(|e| XlogError::Kernel(format!("radix_scatter_stable failed: {}", e)))?;
2049            self.device.synchronize()?;
2050
2051            in_a = !in_a;
2052        }
2053
2054        // 8 passes (32b/4b) => even number of swaps => sorted data ends in A.
2055        if !in_a {
2056            return Err(XlogError::Kernel(
2057                "Unexpected radix-sort buffer parity (expected even number of passes)".to_string(),
2058            ));
2059        }
2060
2061        Ok(())
2062    }
2063    /// Initialize indices array with 0..n-1 on device.
2064    pub fn init_indices(
2065        &self,
2066        indices: &mut crate::memory::TrackedCudaSlice<u32>,
2067        n: u32,
2068    ) -> Result<()> {
2069        if n == 0 {
2070            return Ok(());
2071        }
2072        if n as usize > indices.len() {
2073            return Err(XlogError::Kernel(format!(
2074                "init_indices: n={} exceeds indices len={}",
2075                n,
2076                indices.len()
2077            )));
2078        }
2079        let device = self.device.inner();
2080        let block_size = Self::SORT_BLOCK_SIZE;
2081        let grid_size = n.div_ceil(block_size);
2082        let config = LaunchConfig {
2083            grid_dim: (grid_size, 1, 1),
2084            block_dim: (block_size, 1, 1),
2085            shared_mem_bytes: 0,
2086        };
2087        let init_fn = device
2088            .get_func(SORT_MODULE, sort_kernels::INIT_INDICES)
2089            .ok_or_else(|| XlogError::Kernel("init_indices kernel not found".to_string()))?;
2090        let d_num_rows = self.upload_device_row_count(n)?;
2091        // SAFETY: init_indices(indices, num_rows_device, row_cap)
2092        unsafe {
2093            init_fn
2094                .clone()
2095                .launch(config, (&mut *indices, &d_num_rows, n))
2096        }
2097        .map_err(|e| XlogError::Kernel(format!("init_indices failed: {}", e)))?;
2098        Ok(())
2099    }
2100
2101    /// Gather u32 keys by permutation: out[i] = input[indices[i]].
2102    pub fn gather_u32_by_indices(
2103        &self,
2104        input: &crate::memory::TrackedCudaSlice<u32>,
2105        indices: &crate::memory::TrackedCudaSlice<u32>,
2106        output: &mut crate::memory::TrackedCudaSlice<u32>,
2107        n: u32,
2108    ) -> Result<()> {
2109        if n == 0 {
2110            return Ok(());
2111        }
2112        if n as usize > output.len() {
2113            return Err(XlogError::Kernel(format!(
2114                "gather_u32_by_indices: n={} exceeds output len={}",
2115                n,
2116                output.len()
2117            )));
2118        }
2119        let device = self.device.inner();
2120        let block_size = Self::SORT_BLOCK_SIZE;
2121        let grid_size = n.div_ceil(block_size);
2122        let config = LaunchConfig {
2123            grid_dim: (grid_size, 1, 1),
2124            block_dim: (block_size, 1, 1),
2125            shared_mem_bytes: 0,
2126        };
2127        let gather_fn = device
2128            .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_U32)
2129            .ok_or_else(|| {
2130                XlogError::Kernel("apply_permutation_u32 kernel not found".to_string())
2131            })?;
2132        let d_num_rows = self.upload_device_row_count(n)?;
2133        // SAFETY: apply_permutation_u32(input, output, permutation, num_rows_device, row_cap)
2134        unsafe {
2135            gather_fn
2136                .clone()
2137                .launch(config, (input, output, indices, &d_num_rows, n))
2138        }
2139        .map_err(|e| XlogError::Kernel(format!("gather_u32_by_indices failed: {}", e)))?;
2140        Ok(())
2141    }
2142
2143    /// Gather u8 values by permutation: out[i] = input[indices[i]].
2144    pub fn gather_u8_by_indices(
2145        &self,
2146        input: &crate::memory::TrackedCudaSlice<u8>,
2147        indices: &crate::memory::TrackedCudaSlice<u32>,
2148        output: &mut crate::memory::TrackedCudaSlice<u8>,
2149        n: u32,
2150    ) -> Result<()> {
2151        if n == 0 {
2152            return Ok(());
2153        }
2154        if n as usize > output.len() {
2155            return Err(XlogError::Kernel(format!(
2156                "gather_u8_by_indices: n={} exceeds output len={}",
2157                n,
2158                output.len()
2159            )));
2160        }
2161        let device = self.device.inner();
2162        let block_size = Self::SORT_BLOCK_SIZE;
2163        let grid_size = n.div_ceil(block_size);
2164        let config = LaunchConfig {
2165            grid_dim: (grid_size, 1, 1),
2166            block_dim: (block_size, 1, 1),
2167            shared_mem_bytes: 0,
2168        };
2169        let gather_fn = device
2170            .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
2171            .ok_or_else(|| {
2172                XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
2173            })?;
2174        let d_num_rows = self.upload_device_row_count(n)?;
2175        // SAFETY: apply_permutation_bytes(input, output, permutation, num_rows_device, row_cap, elem_size)
2176        unsafe {
2177            gather_fn
2178                .clone()
2179                .launch(config, (input, output, indices, &d_num_rows, n, 1u32))
2180        }
2181        .map_err(|e| XlogError::Kernel(format!("gather_u8_by_indices failed: {}", e)))?;
2182        Ok(())
2183    }
2184
2185    /// Gather low 32 bits of u64 values by permutation.
2186    pub fn gather_u64_lo_by_indices(
2187        &self,
2188        input: &crate::memory::TrackedCudaSlice<u64>,
2189        indices: &crate::memory::TrackedCudaSlice<u32>,
2190        output: &mut crate::memory::TrackedCudaSlice<u32>,
2191        n: u32,
2192    ) -> Result<()> {
2193        if n == 0 {
2194            return Ok(());
2195        }
2196        let device = self.device.inner();
2197        let block_size = Self::SORT_BLOCK_SIZE;
2198        let grid_size = n.div_ceil(block_size);
2199        let config = LaunchConfig {
2200            grid_dim: (grid_size, 1, 1),
2201            block_dim: (block_size, 1, 1),
2202            shared_mem_bytes: 0,
2203        };
2204        let gather_fn = device
2205            .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_U64_LO_U32)
2206            .ok_or_else(|| XlogError::Kernel("gather_keys_u64_lo_u32 not found".to_string()))?;
2207        let d_num_rows = self.upload_device_row_count(n)?;
2208        // SAFETY: gather_keys_u64_lo_u32(vals, permutation, num_rows_device, row_cap, out_keys)
2209        unsafe {
2210            gather_fn
2211                .clone()
2212                .launch(config, (input, indices, &d_num_rows, n, output))
2213        }
2214        .map_err(|e| XlogError::Kernel(format!("gather_u64_lo_by_indices failed: {}", e)))?;
2215        Ok(())
2216    }
2217
2218    /// Gather high 32 bits of u64 values by permutation.
2219    pub fn gather_u64_hi_by_indices(
2220        &self,
2221        input: &crate::memory::TrackedCudaSlice<u64>,
2222        indices: &crate::memory::TrackedCudaSlice<u32>,
2223        output: &mut crate::memory::TrackedCudaSlice<u32>,
2224        n: u32,
2225    ) -> Result<()> {
2226        if n == 0 {
2227            return Ok(());
2228        }
2229        let device = self.device.inner();
2230        let block_size = Self::SORT_BLOCK_SIZE;
2231        let grid_size = n.div_ceil(block_size);
2232        let config = LaunchConfig {
2233            grid_dim: (grid_size, 1, 1),
2234            block_dim: (block_size, 1, 1),
2235            shared_mem_bytes: 0,
2236        };
2237        let gather_fn = device
2238            .get_func(SORT_MODULE, sort_kernels::GATHER_KEYS_U64_HI_U32)
2239            .ok_or_else(|| XlogError::Kernel("gather_keys_u64_hi_u32 not found".to_string()))?;
2240        let d_num_rows = self.upload_device_row_count(n)?;
2241        // SAFETY: gather_keys_u64_hi_u32(vals, permutation, num_rows_device, row_cap, out_keys)
2242        unsafe {
2243            gather_fn
2244                .clone()
2245                .launch(config, (input, indices, &d_num_rows, n, output))
2246        }
2247        .map_err(|e| XlogError::Kernel(format!("gather_u64_hi_by_indices failed: {}", e)))?;
2248        Ok(())
2249    }
2250
2251    /// Stable radix sort of (key, value) u32 pairs using reusable scratch.
2252    pub fn radix_sort_u32_pairs(
2253        &self,
2254        keys: &mut crate::memory::TrackedCudaSlice<u32>,
2255        values: &mut crate::memory::TrackedCudaSlice<u32>,
2256        n: u32,
2257        scratch: &mut RadixSortScratch,
2258    ) -> Result<()> {
2259        if n == 0 {
2260            return Ok(());
2261        }
2262        scratch.ensure_capacity(self, n)?;
2263        let d_num_rows = self.upload_device_row_count(n)?;
2264        self.radix_sort_u32_pairs_with_scratch(
2265            keys,
2266            &mut scratch.keys_b,
2267            values,
2268            &mut scratch.values_b,
2269            &mut scratch.hist,
2270            &mut scratch.prefix,
2271            &mut scratch.ranks,
2272            &d_num_rows,
2273            n,
2274        )
2275    }
2276    /// Compute exclusive prefix sum of u8 mask on device (no host reads).
2277    pub fn scan_u8_mask_device(
2278        &self,
2279        mask: &crate::memory::TrackedCudaSlice<u8>,
2280        n: u32,
2281    ) -> Result<crate::memory::TrackedCudaSlice<u32>> {
2282        if n == 0 {
2283            return self.memory.alloc::<u32>(0);
2284        }
2285        if n as usize > mask.len() {
2286            return Err(XlogError::Kernel(format!(
2287                "scan_u8_mask_device: n={} exceeds mask len={}",
2288                n,
2289                mask.len()
2290            )));
2291        }
2292        let device = self.device.inner();
2293        let block_size = 256u32;
2294        let num_blocks = n.div_ceil(block_size);
2295
2296        let mut prefix_sum = self.memory.alloc::<u32>(n as usize)?;
2297        let mut block_sums = self.memory.alloc::<u32>(num_blocks as usize)?;
2298
2299        let phase1_fn = device
2300            .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE1)
2301            .ok_or_else(|| {
2302                XlogError::Kernel("multiblock_scan_phase1 kernel not found".to_string())
2303            })?;
2304
2305        // SAFETY: multiblock_scan_phase1(mask, prefix_sum, block_sums, n)
2306        unsafe {
2307            phase1_fn.clone().launch(
2308                LaunchConfig {
2309                    grid_dim: (num_blocks, 1, 1),
2310                    block_dim: (block_size, 1, 1),
2311                    shared_mem_bytes: 0,
2312                },
2313                (mask, &mut prefix_sum, &mut block_sums, n),
2314            )
2315        }
2316        .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase1 failed: {}", e)))?;
2317
2318        if num_blocks > 1 {
2319            self.multiblock_scan_u32_inplace(&mut block_sums, num_blocks)?;
2320
2321            let phase3_fn = device
2322                .get_func(SCAN_MODULE, scan_kernels::MULTIBLOCK_SCAN_PHASE3)
2323                .ok_or_else(|| {
2324                    XlogError::Kernel("multiblock_scan_phase3 kernel not found".to_string())
2325                })?;
2326
2327            // SAFETY: multiblock_scan_phase3(prefix_sum, block_offsets, n)
2328            unsafe {
2329                phase3_fn.clone().launch(
2330                    LaunchConfig {
2331                        grid_dim: (num_blocks, 1, 1),
2332                        block_dim: (block_size, 1, 1),
2333                        shared_mem_bytes: 0,
2334                    },
2335                    (&mut prefix_sum, &block_sums, n),
2336                )
2337            }
2338            .map_err(|e| XlogError::Kernel(format!("multiblock_scan_phase3 failed: {}", e)))?;
2339        }
2340
2341        Ok(prefix_sum)
2342    }
2343
2344    /// Count non-zero entries in a u8 mask on device (no host reads).
2345    ///
2346    /// Returns a 1-element device buffer containing the count.
2347    pub fn count_mask_device(
2348        &self,
2349        mask: &crate::memory::TrackedCudaSlice<u8>,
2350        n: u32,
2351    ) -> Result<crate::memory::TrackedCudaSlice<u32>> {
2352        let mut d_count = self.memory.alloc::<u32>(1)?;
2353        self.htod_launch_metadata_sync_copy_into(&[0u32], &mut d_count)
2354            .map_err(|e| {
2355                XlogError::Kernel(format!("count_mask_device: zero init failed: {}", e))
2356            })?;
2357
2358        if n == 0 {
2359            return Ok(d_count);
2360        }
2361
2362        let device = self.device.inner();
2363        let block_size = 256u32;
2364        let grid_size = n.div_ceil(block_size);
2365
2366        let count_fn = device
2367            .get_func(SCAN_MODULE, scan_kernels::COUNT_MASK)
2368            .ok_or_else(|| XlogError::Kernel("count_mask kernel not found".to_string()))?;
2369
2370        // SAFETY: count_mask(mask, n, count)
2371        unsafe {
2372            count_fn.clone().launch(
2373                LaunchConfig {
2374                    grid_dim: (grid_size, 1, 1),
2375                    block_dim: (block_size, 1, 1),
2376                    shared_mem_bytes: 0,
2377                },
2378                (mask, n, &mut d_count),
2379            )
2380        }
2381        .map_err(|e| XlogError::Kernel(format!("count_mask kernel failed: {}", e)))?;
2382
2383        self.device.synchronize()?;
2384
2385        Ok(d_count)
2386    }
2387
2388    /// Count 1-bits in `mask[0..n]` and write the result into
2389    /// `task_counts[slot_idx]` via the existing `count_mask` kernel.
2390    ///
2391    /// The caller MUST ensure `task_counts[slot_idx]` is zero before
2392    /// calling (e.g. by zeroing the whole array once).
2393    ///
2394    /// This avoids allocating a fresh 1-element device buffer per call,
2395    /// which matters when iterating over hundreds of tasks.
2396    pub fn count_mask_into_slot(
2397        &self,
2398        mask: &crate::memory::TrackedCudaSlice<u8>,
2399        n: u32,
2400        task_counts: &mut crate::memory::TrackedCudaSlice<u32>,
2401        slot_idx: usize,
2402    ) -> Result<()> {
2403        if n == 0 {
2404            // Slot is already zero (caller pre-zeroed); nothing to do.
2405            return Ok(());
2406        }
2407        if slot_idx >= task_counts.len() {
2408            return Err(XlogError::Kernel(format!(
2409                "count_mask_into_slot: slot_idx={} >= len={}",
2410                slot_idx,
2411                task_counts.len()
2412            )));
2413        }
2414
2415        let device = self.device.inner();
2416        let block_size = 256u32;
2417        let grid_size = n.div_ceil(block_size);
2418
2419        let count_fn = device
2420            .get_func(SCAN_MODULE, scan_kernels::COUNT_MASK)
2421            .ok_or_else(|| XlogError::Kernel("count_mask kernel not found".to_string()))?;
2422
2423        // Get a mutable sub-slice pointing at task_counts[slot_idx..slot_idx+1].
2424        let mut slot = task_counts.slice_mut(slot_idx..slot_idx + 1);
2425
2426        // SAFETY: count_mask(mask, n, count) — writes atomicAdd into count ptr.
2427        // The slot was pre-zeroed by the caller.
2428        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
2429        unsafe {
2430            count_fn.clone().launch(
2431                LaunchConfig {
2432                    grid_dim: (grid_size, 1, 1),
2433                    block_dim: (block_size, 1, 1),
2434                    shared_mem_bytes: 0,
2435                },
2436                (mask, n, &mut slot),
2437            )
2438        }
2439        .map_err(|e| XlogError::Kernel(format!("count_mask_into_slot kernel failed: {}", e)))?;
2440
2441        Ok(())
2442    }
2443    /// Apply permutation to reorder all columns in buffer using GPU
2444    fn apply_permutation_gpu(
2445        &self,
2446        input: &CudaBuffer,
2447        permutation: &cudarc::driver::CudaSlice<u32>,
2448    ) -> Result<CudaBuffer> {
2449        let row_cap = input.num_rows() as u32;
2450        let d_num_rows = input.num_rows_device();
2451        let device = self.device.inner();
2452
2453        let grid_size = row_cap.div_ceil(Self::SORT_BLOCK_SIZE);
2454        let launch_config = LaunchConfig {
2455            grid_dim: (grid_size, 1, 1),
2456            block_dim: (Self::SORT_BLOCK_SIZE, 1, 1),
2457            shared_mem_bytes: 0,
2458        };
2459
2460        let apply_perm_fn = device
2461            .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
2462            .ok_or_else(|| {
2463                XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
2464            })?;
2465
2466        let mut new_columns = Vec::with_capacity(input.columns.len());
2467
2468        for col_idx in 0..input.columns.len() {
2469            let src_col = input
2470                .column(col_idx)
2471                .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
2472
2473            let elem_size = input
2474                .schema
2475                .column_type(col_idx)
2476                .ok_or_else(|| {
2477                    XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
2478                })?
2479                .size_bytes() as u32;
2480
2481            let output_bytes = (row_cap as usize) * (elem_size as usize);
2482            if src_col.num_bytes() != output_bytes {
2483                return Err(XlogError::Kernel(format!(
2484                    "Column {} has {} bytes but expected {} (num_rows={}, elem_size={})",
2485                    col_idx,
2486                    src_col.num_bytes(),
2487                    output_bytes,
2488                    row_cap,
2489                    elem_size
2490                )));
2491            }
2492            let dst_col = self.memory.alloc::<u8>(output_bytes)?;
2493
2494            // SAFETY: Kernel signature matches: apply_permutation_bytes(input, output, permutation, num_rows_device, row_cap, elem_size)
2495            unsafe {
2496                apply_perm_fn.clone().launch(
2497                    launch_config,
2498                    (
2499                        src_col,
2500                        &dst_col,
2501                        permutation,
2502                        d_num_rows,
2503                        row_cap,
2504                        elem_size,
2505                    ),
2506                )
2507            }
2508            .map_err(|e| XlogError::Kernel(format!("apply_permutation_bytes failed: {}", e)))?;
2509
2510            new_columns.push(dst_col.into());
2511        }
2512
2513        self.device.synchronize()?;
2514
2515        self.buffer_from_columns_with_device_count(
2516            new_columns,
2517            input.num_rows(),
2518            input.schema.clone(),
2519            input,
2520        )
2521    }
2522
2523    /// Gather rows by explicit indices on GPU: output[i] = input[indices[i]].
2524    ///
2525    /// This is like `apply_permutation_gpu`, but the input can be larger than the output
2526    /// (i.e. `output_rows` < input.num_rows()).
2527    fn gather_buffer_by_indices(
2528        &self,
2529        input: &CudaBuffer,
2530        indices: &cudarc::driver::CudaSlice<u32>,
2531        output_rows: u32,
2532    ) -> Result<CudaBuffer> {
2533        if output_rows == 0 {
2534            return self.create_empty_buffer(input.schema().clone());
2535        }
2536
2537        if input.num_rows() > u32::MAX as u64 {
2538            return Err(XlogError::Kernel(format!(
2539                "GPU gather supports at most {} input rows, got {}",
2540                u32::MAX,
2541                input.num_rows()
2542            )));
2543        }
2544
2545        let d_output_rows = self.upload_device_row_count(output_rows)?;
2546        let device = self.device.inner();
2547        let block_size = 256u32;
2548        let grid_size = output_rows.div_ceil(block_size);
2549        let launch_config = LaunchConfig {
2550            grid_dim: (grid_size, 1, 1),
2551            block_dim: (block_size, 1, 1),
2552            shared_mem_bytes: 0,
2553        };
2554
2555        let gather_fn = device
2556            .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
2557            .ok_or_else(|| {
2558                XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
2559            })?;
2560
2561        let mut new_columns = Vec::with_capacity(input.columns.len());
2562        for col_idx in 0..input.columns.len() {
2563            let src_col = input
2564                .column(col_idx)
2565                .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
2566
2567            let elem_size = input
2568                .schema
2569                .column_type(col_idx)
2570                .ok_or_else(|| {
2571                    XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
2572                })?
2573                .size_bytes() as u32;
2574
2575            let expected_src_bytes = (input.num_rows() as usize) * (elem_size as usize);
2576            if src_col.num_bytes() != expected_src_bytes {
2577                return Err(XlogError::Kernel(format!(
2578                    "Column {} has {} bytes but expected {} (num_rows={}, elem_size={})",
2579                    col_idx,
2580                    src_col.num_bytes(),
2581                    expected_src_bytes,
2582                    input.num_rows(),
2583                    elem_size
2584                )));
2585            }
2586
2587            let dst_bytes = (output_rows as usize) * (elem_size as usize);
2588            let dst_col = self.memory.alloc::<u8>(dst_bytes)?;
2589
2590            // SAFETY: Kernel signature matches: apply_permutation_bytes(input, output, permutation, num_rows_device, row_cap, elem_size)
2591            unsafe {
2592                gather_fn.clone().launch(
2593                    launch_config,
2594                    (
2595                        src_col,
2596                        &dst_col,
2597                        indices,
2598                        &d_output_rows,
2599                        output_rows,
2600                        elem_size,
2601                    ),
2602                )
2603            }
2604            .map_err(|e| XlogError::Kernel(format!("apply_permutation_bytes failed: {}", e)))?;
2605
2606            new_columns.push(dst_col.into());
2607        }
2608
2609        self.device.synchronize()?;
2610
2611        Ok(CudaBuffer::from_columns(
2612            new_columns,
2613            output_rows as u64,
2614            d_output_rows,
2615            input.schema.clone(),
2616        ))
2617    }
2618    // ============== Hash Join V2 Implementation ==============
2619
2620    /// Multi-column hash join with support for different join types.
2621    ///
2622    /// # Arguments
2623    /// * `left` - The left (probe) buffer
2624    /// * `right` - The right (build) buffer
2625    /// * `left_keys` - Column indices for join keys in left buffer
2626    /// * `right_keys` - Column indices for join keys in right buffer
2627    /// * `join_type` - Type of join to perform (Inner, Semi, Anti, LeftOuter)
2628    ///
2629    /// # Errors
2630    /// Returns `XlogError::Kernel` if kernel execution fails or parameters are invalid
2631    pub fn hash_join_v2(
2632        &self,
2633        left: &CudaBuffer,
2634        right: &CudaBuffer,
2635        left_keys: &[usize],
2636        right_keys: &[usize],
2637        join_type: JoinType,
2638    ) -> Result<CudaBuffer> {
2639        self.hash_join_v2_with_limit(left, right, left_keys, right_keys, join_type, None)
2640    }
2641
2642    /// V2 hash join with configurable maximum output size
2643    ///
2644    /// Multi-column join with typed key comparison, supporting different join types.
2645    /// Uses composite hashing (FNV-1a) for multi-column keys with full key verification.
2646    ///
2647    /// # Arguments
2648    /// * `left` - The left (probe) buffer
2649    /// * `right` - The right (build) buffer
2650    /// * `left_keys` - Column indices for join keys in left buffer
2651    /// * `right_keys` - Column indices for join keys in right buffer
2652    /// * `join_type` - Type of join to perform (Inner, Semi, Anti, LeftOuter)
2653    /// * `max_output` - Optional maximum number of output rows (None = unlimited, subject to memory budget)
2654    ///
2655    /// # Errors
2656    /// Returns `XlogError::Kernel` if kernel execution fails or parameters are invalid
2657    pub fn hash_join_v2_with_limit(
2658        &self,
2659        left: &CudaBuffer,
2660        right: &CudaBuffer,
2661        left_keys: &[usize],
2662        right_keys: &[usize],
2663        join_type: JoinType,
2664        max_output: Option<usize>,
2665    ) -> Result<CudaBuffer> {
2666        // Env-gated recorded dispatch. The `pack_keys`
2667        // constraint inherited by `hash_join_v2_recorded`
2668        // requires `left_keys.len() <= 4`. Mismatch falls
2669        // through to the legacy path.
2670        if Self::use_recorded_hash_join_env()
2671            && !left_keys.is_empty()
2672            && left_keys.len() == right_keys.len()
2673            && left_keys.len() <= 4
2674        {
2675            if let Some(launch_stream) = self.recorded_op_stream_or_init() {
2676                return self.hash_join_v2_recorded(
2677                    left,
2678                    right,
2679                    left_keys,
2680                    right_keys,
2681                    join_type,
2682                    max_output,
2683                    launch_stream,
2684                );
2685            }
2686        }
2687        match join_type {
2688            JoinType::Inner => {
2689                self.hash_join_inner_v2(left, right, left_keys, right_keys, max_output)
2690            }
2691            JoinType::Semi => self.hash_join_semi_impl(left, right, left_keys, right_keys),
2692            JoinType::Anti => self.hash_join_anti_impl(left, right, left_keys, right_keys),
2693            JoinType::LeftOuter => {
2694                self.hash_join_left_outer_impl(left, right, left_keys, right_keys, max_output)
2695            }
2696        }
2697    }
2698
    /// Nested-loop inner join (emit-pairs design).
    ///
    /// Drop-in compatible with `hash_join_v2(_, _, &[left_key],
    /// &[right_key], JoinType::Inner)`: same input types, same
    /// output schema (`combine_schemas(left, right)`), same row
    /// set. Caller (the executor's dispatch site) is
    /// responsible for choosing between `hash_join_v2` and this
    /// fn based on the eligibility predicate + threshold check;
    /// this fn validates the same contract fail-closed and
    /// returns `Err` if a caller violates it.
    ///
    /// # Eligibility (validated inside; `Err` on violation)
    ///
    /// Validation runs only after the empty-input fast path (step 2
    /// below): when either side has zero logical rows the function
    /// returns an empty buffer without checking the key arguments.
    ///
    /// * `left.arity() > left_key && right.arity() > right_key`.
    /// * Left and right key columns share the same `ScalarType`,
    ///   and that shared type is `U32` or `Symbol` (Symbol is
    ///   `u32` at the byte level — same kernel applies).
    /// * Each key column's allocation is at least `num_rows * 4`
    ///   bytes (preflight lower-bound validation; mirrors the
    ///   `crates/xlog-cuda/src/provider/ilp.rs:18` codebase idiom
    ///   `col.num_bytes() < required_bytes`). `CudaColumn::num_bytes()`
    ///   reports the allocation size, which can exceed
    ///   `num_rows * 4` when the buffer has spare capacity
    ///   (`row_cap > num_rows`); strict-equality validation would
    ///   false-positive-reject normal over-allocated buffers
    ///   reaching this path through `Executor::execute_node`.
    /// * `num_left * num_right <= NESTED_LOOP_TOTAL_THRESHOLD`
    ///   (computed via `checked_mul`; release-mode wrapping
    ///   multiply is forbidden).
    ///
    /// # Implementation outline
    ///
    /// The step numbers match the `----- N. -----` markers in the body.
    ///
    /// 1. Read logical row counts via `device_row_count` (NOT
    ///    `row_cap`).
    /// 2. Empty-input fast path: if either side is empty, return
    ///    `create_empty_buffer(combine_schemas(...))` — mirrors
    ///    `hash_join_inner_v2`'s pattern at
    ///    `crates/xlog-cuda/src/provider/relational.rs:3165-3170`.
    /// 3. Validate key indices, key types and key-column byte
    ///    lengths (see Eligibility above).
    /// 4. Fail-closed threshold check on `num_left * num_right`
    ///    via `checked_mul`.
    /// 5. Allocate two `u32` index arrays of length
    ///    `num_left * num_right` (bounded at 32 MB total under
    ///    the threshold) plus a zeroed one-element output counter.
    /// 6. Launch `nested_loop_join_inner_u32_1key_pairs` with
    ///    `&CudaColumn` key pointers (variant-agnostic), using a
    ///    grid sized from `num_left`, then synchronize the device.
    /// 7. D2H the output count and reject a count above the
    ///    upper bound.
    /// 8. Materialize both sides via `gather_buffer_by_indices`.
    /// 9. Concatenate the gathered columns (left first, then
    ///    right) and build the result with `buffer_from_columns`.
    ///
    /// # Errors
    ///
    /// Returns `XlogError::Kernel` on any eligibility violation, if
    /// the kernel is missing, if the launch fails, or if the kernel
    /// reports more output rows than the upper bound. Errors from
    /// row-count reads, allocation, synchronization and the gather
    /// helpers are propagated.
    pub fn nested_loop_join_v2_inner_u32_1key(
        &self,
        left: &CudaBuffer,
        right: &CudaBuffer,
        left_key: usize,
        right_key: usize,
    ) -> Result<CudaBuffer> {
        // ----- 1. Logical row counts (NOT row_cap) -----
        let num_left = self.device_row_count(left)?;
        let num_right = self.device_row_count(right)?;

        // ----- 2. Empty-input fast path (inner-join schema) -----
        // Note: this returns before any key validation below runs.
        if num_left == 0 || num_right == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }

        // ----- 3. Eligibility validation -----
        if left.arity() <= left_key {
            return Err(XlogError::Kernel(format!(
                "nested_loop: left_key={} out of bounds (arity={})",
                left_key,
                left.arity()
            )));
        }
        if right.arity() <= right_key {
            return Err(XlogError::Kernel(format!(
                "nested_loop: right_key={} out of bounds (arity={})",
                right_key,
                right.arity()
            )));
        }
        // Both key types must be present, equal, and one of U32 / Symbol.
        let lt = left.schema().column_type(left_key);
        let rt = right.schema().column_type(right_key);
        if lt != rt || !matches!(lt, Some(ScalarType::U32) | Some(ScalarType::Symbol)) {
            return Err(XlogError::Kernel(format!(
                "nested_loop: key types must be equal U32/Symbol; got left={:?} right={:?}",
                lt, rt
            )));
        }
        let left_col = left
            .column(left_key)
            .ok_or_else(|| XlogError::Kernel(format!("nested_loop: left.column({})", left_key)))?;
        let right_col = right.column(right_key).ok_or_else(|| {
            XlogError::Kernel(format!("nested_loop: right.column({})", right_key))
        })?;
        // Byte-length lower-bound check (corrected to lower-bound
        // semantics). The codebase convention is that
        // `CudaColumn::num_bytes()` reports the ALLOCATION size,
        // which can exceed `num_rows * sizeof(T)` when the buffer
        // has spare capacity (row_cap > num_rows). The check
        // must therefore be a lower-bound (column has AT LEAST
        // enough bytes for the kernel's `num_rows` reads), NOT
        // strict equality. Mirrors the `ilp.rs:18` idiom in this
        // codebase (`col.num_bytes() < required_bytes` for the
        // failure case). Strict equality would falsely reject
        // any normal buffer with spare allocation — surfaced as
        // a regression in `test_simple_join` and
        // `test_transitive_closure` after the executor dispatch
        // wiring routed those joins through this path.
        let required_left_bytes = num_left
            .checked_mul(4)
            .ok_or_else(|| XlogError::Kernel("nested_loop: left byte-count overflow".into()))?;
        let required_right_bytes = num_right
            .checked_mul(4)
            .ok_or_else(|| XlogError::Kernel("nested_loop: right byte-count overflow".into()))?;
        if left_col.num_bytes() < required_left_bytes {
            return Err(XlogError::Kernel(format!(
                "nested_loop: left key column has {} bytes; \
                 require at least {} ({} rows × 4) — buffer allocation \
                 is smaller than logical row count",
                left_col.num_bytes(),
                required_left_bytes,
                num_left
            )));
        }
        if right_col.num_bytes() < required_right_bytes {
            return Err(XlogError::Kernel(format!(
                "nested_loop: right key column has {} bytes; \
                 require at least {} ({} rows × 4) — buffer allocation \
                 is smaller than logical row count",
                right_col.num_bytes(),
                required_right_bytes,
                num_right
            )));
        }

        // ----- 4. Fail-closed threshold check via checked_mul -----
        // `upper_bound` is the worst-case pair count (every left row
        // matching every right row); it sizes the index arrays below.
        let upper_bound: u64 = (num_left as u64)
            .checked_mul(num_right as u64)
            .ok_or_else(|| XlogError::Kernel("nested_loop: row-count product overflow".into()))?;
        if upper_bound > NESTED_LOOP_TOTAL_THRESHOLD {
            return Err(XlogError::Kernel(format!(
                "nested_loop: caller violated eligibility threshold: \
                 num_left * num_right = {} > {} (NESTED_LOOP_TOTAL_THRESHOLD)",
                upper_bound, NESTED_LOOP_TOTAL_THRESHOLD
            )));
        }

        // ----- 5. Allocate index arrays + counter -----
        let upper_bound_usize = upper_bound as usize;
        let mut d_output_left_idx = self.memory.alloc::<u32>(upper_bound_usize)?;
        let mut d_output_right_idx = self.memory.alloc::<u32>(upper_bound_usize)?;
        let mut d_output_count = self.memory.alloc::<u32>(1)?;
        self.device
            .inner()
            .memset_zeros(&mut d_output_count)
            .map_err(|e| XlogError::Kernel(format!("nested_loop: counter zero failed: {}", e)))?;

        // ----- 6. Launch kernel (variant-agnostic column refs) -----
        let func = self
            .device
            .inner()
            .get_func(
                JOIN_MODULE,
                join_kernels::NESTED_LOOP_JOIN_INNER_U32_1KEY_PAIRS,
            )
            .ok_or_else(|| {
                XlogError::Kernel("nested_loop_join_inner_u32_1key_pairs kernel not found".into())
            })?;

        // Narrowing casts: both counts are non-zero here and their
        // product passed the threshold check above.
        let num_left_u32 = num_left as u32;
        let num_right_u32 = num_right as u32;
        let upper_bound_u32 = upper_bound as u32;
        let block_size = 256u32;
        // Grid is sized from the left row count only.
        let grid_size = num_left_u32.div_ceil(block_size);
        let config = LaunchConfig {
            grid_dim: (grid_size, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // SAFETY: kernel signature matches PTX:
        //   nested_loop_join_inner_u32_1key_pairs(
        //     const uint32_t* left_keys, const uint32_t* right_keys,
        //     uint32_t num_left, uint32_t num_right,
        //     uint32_t* output_left_idx, uint32_t* output_right_idx,
        //     uint32_t* output_count, uint32_t output_capacity)
        // Byte lengths validated above; counts fit in u32 by the
        // threshold; allocations sized to upper_bound; counter
        // pre-zeroed.
        unsafe {
            func.clone()
                .launch(
                    config,
                    (
                        left_col,
                        right_col,
                        num_left_u32,
                        num_right_u32,
                        &mut d_output_left_idx,
                        &mut d_output_right_idx,
                        &mut d_output_count,
                        upper_bound_u32,
                    ),
                )
                .map_err(|e| XlogError::Kernel(format!("nested_loop launch failed: {}", e)))?;
        }

        // Wait for the kernel before reading the counter back.
        self.device.synchronize()?;

        // ----- 7. D2H the output count (single u32) -----
        let output_rows = self.dtoh_scalar_untracked(&d_output_count, 0)?;
        // Defense-in-depth: kernel guarantees output_rows ≤
        // upper_bound by the in-kernel atomic-cap branch, but
        // double-check to surface contract violations early.
        if (output_rows as u64) > upper_bound {
            return Err(XlogError::Kernel(format!(
                "nested_loop: kernel reported {} output rows > upper_bound {}",
                output_rows, upper_bound
            )));
        }

        // ----- 8. Gather both sides via existing GPU machinery -----
        let gathered_left = self.gather_buffer_by_indices(left, &d_output_left_idx, output_rows)?;
        let gathered_right =
            self.gather_buffer_by_indices(right, &d_output_right_idx, output_rows)?;

        // ----- 9. Combine columns + return drop-in result -----
        // Left columns come first, then right columns.
        let combined_schema = self.combine_schemas(left.schema(), right.schema());
        let mut result_columns = Vec::with_capacity(combined_schema.arity());
        result_columns.extend(gathered_left.columns);
        result_columns.extend(gathered_right.columns);
        // `buffer_from_columns` takes `row_cap: u64` — see
        // `crates/xlog-cuda/src/provider/mod.rs:2133-2138`.
        self.buffer_from_columns(result_columns, output_rows as u64, combined_schema)
    }
2933
2934    /// Sort-merge sortedness-detection wrapper. Returns `Ok(true)` iff
2935    /// the column at `key_col` of `buf` is sorted ascending
2936    /// (`keys[i] <= keys[i+1]` for all i in `[0, num_rows-1)`),
2937    /// `Ok(false)` if a violation is detected, `Err(_)` on
2938    /// kernel-launch / D2H failure.
2939    ///
2940    /// **Empty / single-row fast path**: `n < 2` returns `Ok(true)` BEFORE allocation
2941    ///   or kernel launch. The detection kernel's grid `(n + 255)
2942    ///   / 256` is undefined for `n == 0`; single-row sequences
2943    ///   are trivially sorted. This is the load-bearing
2944    ///   invariant the empty-input sortedness checks verify.
2945    ///
2946    /// Validation:
2947    ///   * Key column index within arity bounds.
2948    ///   * Key column type is `U32` or `Symbol`
2949    ///     (byte-identical at the kernel level).
2950    ///   * Key column allocation `>= num_rows * 4` bytes
2951    ///     (mirrors the nested-loop byte-length lower-bound idiom).
2952    ///
2953    /// **Caller surface**: this fn has no executor-dispatch caller after
2954    /// benchmark-backed unwiring. Its only callers are operator-level tests
2955    /// and the production sort-merge benchmark
2956    /// (sort-merge-with-detection timing). The provider returns the honest `Result<bool>`
2957    /// — the kernel can fail (allocation, launch, D2H), and
2958    /// `Err(_)` is preserved so callers can log or surface it
2959    /// at their abstraction level. There is no fail-closed
2960    /// dispatch contract anymore. Earlier fail-closed callers used
2961    /// `matches!(_, Ok(true))`; after the dispatch site was unwired,
2962    /// any later caller must decide its own Err-handling policy.
2963    pub fn is_sorted_ascending_u32(&self, buf: &CudaBuffer, key_col: usize) -> Result<bool> {
2964        // Empty / single-row fast path.
2965        let n = self.device_row_count(buf)?;
2966        if n < 2 {
2967            return Ok(true);
2968        }
2969
2970        // Validate key column.
2971        if buf.arity() <= key_col {
2972            return Err(XlogError::Kernel(format!(
2973                "is_sorted_ascending_u32: key_col={} out of bounds (arity={})",
2974                key_col,
2975                buf.arity()
2976            )));
2977        }
2978        let kt = buf.schema().column_type(key_col);
2979        if !matches!(kt, Some(ScalarType::U32) | Some(ScalarType::Symbol)) {
2980            return Err(XlogError::Kernel(format!(
2981                "is_sorted_ascending_u32: key column must be U32 or Symbol; got {:?}",
2982                kt
2983            )));
2984        }
2985        let key_column = buf.column(key_col).ok_or_else(|| {
2986            XlogError::Kernel(format!(
2987                "is_sorted_ascending_u32: column({}) missing",
2988                key_col
2989            ))
2990        })?;
2991        let required_bytes = n
2992            .checked_mul(4)
2993            .ok_or_else(|| XlogError::Kernel("is_sorted_ascending_u32: byte overflow".into()))?;
2994        if key_column.num_bytes() < required_bytes {
2995            return Err(XlogError::Kernel(format!(
2996                "is_sorted_ascending_u32: key column has {} bytes; require at least {} ({} rows × 4)",
2997                key_column.num_bytes(),
2998                required_bytes,
2999                n
3000            )));
3001        }
3002
3003        // Allocate result flag, initialize to 1 (sorted by
3004        // default; kernel atomically writes 0 only on
3005        // detected violation).
3006        let mut d_result = self.memory.alloc::<u32>(1)?;
3007        self.htod_launch_metadata_sync_copy_into(&[1u32], &mut d_result)
3008            .map_err(|e| {
3009                XlogError::Kernel(format!("is_sorted_ascending_u32: htod result init: {}", e))
3010            })?;
3011
3012        // Launch detection kernel.
3013        let func = self
3014            .device
3015            .inner()
3016            .get_func(SORT_MODULE, sort_kernels::CHECK_ASCENDING_SORTED_U32)
3017            .ok_or_else(|| {
3018                XlogError::Kernel("check_ascending_sorted_u32 kernel not found".into())
3019            })?;
3020        let n_u32 = n as u32;
3021        let block_size = 256u32;
3022        let grid_size = n_u32.div_ceil(block_size);
3023        let config = LaunchConfig {
3024            grid_dim: (grid_size, 1, 1),
3025            block_dim: (block_size, 1, 1),
3026            shared_mem_bytes: 0,
3027        };
3028
3029        // SAFETY: kernel signature
3030        //   check_ascending_sorted_u32(
3031        //     const uint32_t* keys, uint32_t num_rows,
3032        //     uint32_t* result)
3033        // Byte length validated above; `n` fits in u32 by
3034        // device_row_count's u32 underlying representation;
3035        // result allocation is 1 u32, initialized to 1.
3036        unsafe {
3037            func.clone()
3038                .launch(config, (key_column, n_u32, &mut d_result))
3039                .map_err(|e| {
3040                    XlogError::Kernel(format!("check_ascending_sorted_u32 launch: {}", e))
3041                })?;
3042        }
3043
3044        self.device.synchronize()?;
3045        let result = self.dtoh_scalar_untracked(&d_result, 0)?;
3046        Ok(result == 1)
3047    }
3048
3049    /// Sort-merge inner join (caller-asserted pre-sorted
3050    /// inputs). Drop-in compatible with `hash_join_v2(_, _,
3051    /// &[left_key], &[right_key], JoinType::Inner)`: same
3052    /// input types, same output schema
3053    /// (`combine_schemas(left, right)`), same row set.
3054    ///
3055    /// **Caller surface**: this fn has no executor-dispatch caller after
3056    /// benchmark-backed unwiring. Production benchmark evidence rejected
3057    /// default executor precedence for sort-merge at `execute_join`; this fn
3058    /// remains graduated operator work for direct provider callers and tests.
3059    /// Current callers: operator-level provider parity tests in
3060    /// `crates/xlog-integration/tests/test_w43_sort_merge_dispatch.rs`
3061    /// and the production sort-merge benchmark at
3062    /// `crates/xlog-integration/benches/sort_merge_production_bench.rs`
3063    /// (sort-merge-with-detection Path 1 timing).
3064    ///
3065    /// **Caller contract**: both inputs are pre-sorted ascending
3066    /// by their respective key column. The kernel does NOT
3067    /// detect or enforce sortedness; callers may pre-check via
3068    /// `is_sorted_ascending_u32`. On unsorted inputs the row-set
3069    /// output is undefined; the dispatch-site fallback path no longer exists.
3070    ///
3071    /// # Eligibility (validated inside; `Err` on violation)
3072    ///
3073    /// * `left.arity() > left_key && right.arity() > right_key`.
3074    /// * Left and right key columns share the same `ScalarType`,
3075    ///   and that shared type is `U32` or `Symbol`.
3076    /// * Each key column's allocation is at least `num_rows * 4`
3077    ///   bytes (lower-bound check, mirrors the nested-loop byte-length guard).
3078    /// * `num_left * num_right <= NESTED_LOOP_TOTAL_THRESHOLD`
3079    ///   (shared with the nested-loop operator; computed via
3080    ///   `checked_mul`; release-mode wrapping multiply is
3081    ///   forbidden).
3082    ///
3083    /// # Implementation outline
3084    ///
3085    /// Mirrors `nested_loop_join_v2_inner_u32_1key` implementation idioms:
3086    /// empty fast path with no `?`, byte-length lower-bound `<` check,
3087    /// `checked_mul` for threshold, `as u64` for `row_cap`, and
3088    /// variant-agnostic `&CudaColumn` launch.
3089    ///
3090    /// 1. Read logical row counts via `device_row_count` (NOT
3091    ///    `row_cap`).
3092    /// 2. Empty-input fast path: if either side is empty,
3093    ///    return `create_empty_buffer(combine_schemas(...))` —
3094    ///    mirrors `hash_join_inner_v2` at `relational.rs:3165-3170`
3095    ///    AND `nested_loop_join_v2_inner_u32_1key`'s identical
3096    ///    pattern.
3097    /// 3. Validate eligibility (above).
3098    /// 4. Allocate two `u32` index arrays of length
3099    ///    `num_left * num_right` (bounded at 32 MB total under
3100    ///    the shared threshold).
3101    /// 5. Launch `sort_merge_join_inner_u32_1key_pairs` with
3102    ///    `&CudaColumn` key pointers (variant-agnostic).
3103    /// 6. D2H the output count.
3104    /// 7. Materialize via `gather_buffer_by_indices` for both
3105    ///    sides + concatenate columns.
3106    pub fn sort_merge_join_v2_inner_u32_1key(
3107        &self,
3108        left: &CudaBuffer,
3109        right: &CudaBuffer,
3110        left_key: usize,
3111        right_key: usize,
3112    ) -> Result<CudaBuffer> {
3113        // ----- 1. Logical row counts (NOT row_cap) -----
3114        let num_left = self.device_row_count(left)?;
3115        let num_right = self.device_row_count(right)?;
3116
3117        // ----- 2. Empty-input fast path (inner-join schema) -----
3118        if num_left == 0 || num_right == 0 {
3119            let combined_schema = self.combine_schemas(left.schema(), right.schema());
3120            return self.create_empty_buffer(combined_schema);
3121        }
3122
3123        // ----- 3. Eligibility validation -----
3124        if left.arity() <= left_key {
3125            return Err(XlogError::Kernel(format!(
3126                "sort_merge: left_key={} out of bounds (arity={})",
3127                left_key,
3128                left.arity()
3129            )));
3130        }
3131        if right.arity() <= right_key {
3132            return Err(XlogError::Kernel(format!(
3133                "sort_merge: right_key={} out of bounds (arity={})",
3134                right_key,
3135                right.arity()
3136            )));
3137        }
3138        let lt = left.schema().column_type(left_key);
3139        let rt = right.schema().column_type(right_key);
3140        if lt != rt || !matches!(lt, Some(ScalarType::U32) | Some(ScalarType::Symbol)) {
3141            return Err(XlogError::Kernel(format!(
3142                "sort_merge: key types must be equal U32/Symbol; got left={:?} right={:?}",
3143                lt, rt
3144            )));
3145        }
3146        let left_col = left
3147            .column(left_key)
3148            .ok_or_else(|| XlogError::Kernel(format!("sort_merge: left.column({})", left_key)))?;
3149        let right_col = right
3150            .column(right_key)
3151            .ok_or_else(|| XlogError::Kernel(format!("sort_merge: right.column({})", right_key)))?;
3152        let required_left_bytes = num_left
3153            .checked_mul(4)
3154            .ok_or_else(|| XlogError::Kernel("sort_merge: left byte overflow".into()))?;
3155        let required_right_bytes = num_right
3156            .checked_mul(4)
3157            .ok_or_else(|| XlogError::Kernel("sort_merge: right byte overflow".into()))?;
3158        if left_col.num_bytes() < required_left_bytes {
3159            return Err(XlogError::Kernel(format!(
3160                "sort_merge: left key column has {} bytes; \
3161                 require at least {} ({} rows × 4)",
3162                left_col.num_bytes(),
3163                required_left_bytes,
3164                num_left
3165            )));
3166        }
3167        if right_col.num_bytes() < required_right_bytes {
3168            return Err(XlogError::Kernel(format!(
3169                "sort_merge: right key column has {} bytes; \
3170                 require at least {} ({} rows × 4)",
3171                right_col.num_bytes(),
3172                required_right_bytes,
3173                num_right
3174            )));
3175        }
3176
3177        // ----- 4. Fail-closed threshold check via checked_mul -----
3178        let upper_bound: u64 = (num_left as u64)
3179            .checked_mul(num_right as u64)
3180            .ok_or_else(|| XlogError::Kernel("sort_merge: row-count product overflow".into()))?;
3181        if upper_bound > NESTED_LOOP_TOTAL_THRESHOLD {
3182            return Err(XlogError::Kernel(format!(
3183                "sort_merge: caller violated eligibility threshold: \
3184                 num_left * num_right = {} > {} (NESTED_LOOP_TOTAL_THRESHOLD)",
3185                upper_bound, NESTED_LOOP_TOTAL_THRESHOLD
3186            )));
3187        }
3188
3189        // ----- 5. Allocate index arrays + counter -----
3190        let upper_bound_usize = upper_bound as usize;
3191        let mut d_output_left_idx = self.memory.alloc::<u32>(upper_bound_usize)?;
3192        let mut d_output_right_idx = self.memory.alloc::<u32>(upper_bound_usize)?;
3193        let mut d_output_count = self.memory.alloc::<u32>(1)?;
3194        self.device
3195            .inner()
3196            .memset_zeros(&mut d_output_count)
3197            .map_err(|e| XlogError::Kernel(format!("sort_merge: counter zero: {}", e)))?;
3198
3199        // ----- 6. Launch kernel (variant-agnostic column refs) -----
3200        let func = self
3201            .device
3202            .inner()
3203            .get_func(
3204                JOIN_MODULE,
3205                join_kernels::SORT_MERGE_JOIN_INNER_U32_1KEY_PAIRS,
3206            )
3207            .ok_or_else(|| {
3208                XlogError::Kernel("sort_merge_join_inner_u32_1key_pairs kernel not found".into())
3209            })?;
3210
3211        let num_left_u32 = num_left as u32;
3212        let num_right_u32 = num_right as u32;
3213        let upper_bound_u32 = upper_bound as u32;
3214        let block_size = 256u32;
3215        let grid_size = num_left_u32.div_ceil(block_size);
3216        let config = LaunchConfig {
3217            grid_dim: (grid_size, 1, 1),
3218            block_dim: (block_size, 1, 1),
3219            shared_mem_bytes: 0,
3220        };
3221
3222        // SAFETY: kernel signature matches PTX:
3223        //   sort_merge_join_inner_u32_1key_pairs(
3224        //     const uint32_t* left_keys (sorted ascending),
3225        //     const uint32_t* right_keys (sorted ascending),
3226        //     uint32_t num_left, uint32_t num_right,
3227        //     uint32_t* output_left_idx, uint32_t* output_right_idx,
3228        //     uint32_t* output_count, uint32_t output_capacity)
3229        // Byte length validated above; sortedness is a caller-
3230        // supplied invariant; no dispatch-site pre-check exists after
3231        // the executor unwiring. Counts fit
3232        // in u32 by caller-supplied input-size bound; allocations
3233        // sized to upper_bound; counter pre-zeroed.
3234        unsafe {
3235            func.clone()
3236                .launch(
3237                    config,
3238                    (
3239                        left_col,
3240                        right_col,
3241                        num_left_u32,
3242                        num_right_u32,
3243                        &mut d_output_left_idx,
3244                        &mut d_output_right_idx,
3245                        &mut d_output_count,
3246                        upper_bound_u32,
3247                    ),
3248                )
3249                .map_err(|e| XlogError::Kernel(format!("sort_merge launch: {}", e)))?;
3250        }
3251
3252        self.device.synchronize()?;
3253
3254        // ----- 7. D2H the output count (single u32) -----
3255        let output_rows = self.dtoh_scalar_untracked(&d_output_count, 0)?;
3256        // Defense-in-depth: kernel's atomic-cap branch ensures
3257        // output_rows ≤ upper_bound, but double-check.
3258        if (output_rows as u64) > upper_bound {
3259            return Err(XlogError::Kernel(format!(
3260                "sort_merge: kernel reported {} output rows > upper_bound {}",
3261                output_rows, upper_bound
3262            )));
3263        }
3264
3265        // ----- 8. Gather both sides via existing GPU machinery -----
3266        let gathered_left = self.gather_buffer_by_indices(left, &d_output_left_idx, output_rows)?;
3267        let gathered_right =
3268            self.gather_buffer_by_indices(right, &d_output_right_idx, output_rows)?;
3269
3270        // ----- 9. Combine columns + return drop-in result -----
3271        let combined_schema = self.combine_schemas(left.schema(), right.schema());
3272        let mut result_columns = Vec::with_capacity(combined_schema.arity());
3273        result_columns.extend(gathered_left.columns);
3274        result_columns.extend(gathered_right.columns);
3275        self.buffer_from_columns(result_columns, output_rows as u64, combined_schema)
3276    }
3277
3278    /// Sorted-chain variant of [`Self::sort_merge_join_v2_inner_u32_1key`].
3279    ///
3280    /// The sort-merge operator is product-thresholded because it allocates
3281    /// `|left| * |right|` candidate pairs. Chain routing uses this bounded
3282    /// variant only for sorted large inputs where the expected
3283    /// fanout is one-to-one; capacity is caller supplied and the kernel's
3284    /// logical output counter is checked after launch. If duplicates make
3285    /// the true output exceed `output_capacity`, this returns an error so
3286    /// the caller can fail closed to the hash fallback.
3287    pub fn sort_merge_join_v2_inner_u32_1key_bounded(
3288        &self,
3289        left: &CudaBuffer,
3290        right: &CudaBuffer,
3291        left_key: usize,
3292        right_key: usize,
3293        output_capacity: usize,
3294    ) -> Result<CudaBuffer> {
3295        let num_left = self.device_row_count(left)?;
3296        let num_right = self.device_row_count(right)?;
3297
3298        if num_left == 0 || num_right == 0 {
3299            let combined_schema = self.combine_schemas(left.schema(), right.schema());
3300            return self.create_empty_buffer(combined_schema);
3301        }
3302        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
3303            return Err(XlogError::Kernel(format!(
3304                "sort_merge_bounded: row counts exceed u32 surface: left={} right={}",
3305                num_left, num_right
3306            )));
3307        }
3308        if output_capacity == 0 || output_capacity > u32::MAX as usize {
3309            return Err(XlogError::Kernel(format!(
3310                "sort_merge_bounded: invalid output capacity {}",
3311                output_capacity
3312            )));
3313        }
3314        if left.arity() <= left_key {
3315            return Err(XlogError::Kernel(format!(
3316                "sort_merge_bounded: left_key={} out of bounds (arity={})",
3317                left_key,
3318                left.arity()
3319            )));
3320        }
3321        if right.arity() <= right_key {
3322            return Err(XlogError::Kernel(format!(
3323                "sort_merge_bounded: right_key={} out of bounds (arity={})",
3324                right_key,
3325                right.arity()
3326            )));
3327        }
3328        let lt = left.schema().column_type(left_key);
3329        let rt = right.schema().column_type(right_key);
3330        if lt != rt || !matches!(lt, Some(ScalarType::U32) | Some(ScalarType::Symbol)) {
3331            return Err(XlogError::Kernel(format!(
3332                "sort_merge_bounded: key types must be equal U32/Symbol; got left={:?} right={:?}",
3333                lt, rt
3334            )));
3335        }
3336
3337        let left_col = left.column(left_key).ok_or_else(|| {
3338            XlogError::Kernel(format!("sort_merge_bounded: left.column({})", left_key))
3339        })?;
3340        let right_col = right.column(right_key).ok_or_else(|| {
3341            XlogError::Kernel(format!("sort_merge_bounded: right.column({})", right_key))
3342        })?;
3343        let required_left_bytes = num_left
3344            .checked_mul(4)
3345            .ok_or_else(|| XlogError::Kernel("sort_merge_bounded: left byte overflow".into()))?;
3346        let required_right_bytes = num_right
3347            .checked_mul(4)
3348            .ok_or_else(|| XlogError::Kernel("sort_merge_bounded: right byte overflow".into()))?;
3349        if left_col.num_bytes() < required_left_bytes {
3350            return Err(XlogError::Kernel(format!(
3351                "sort_merge_bounded: left key column has {} bytes; require at least {}",
3352                left_col.num_bytes(),
3353                required_left_bytes
3354            )));
3355        }
3356        if right_col.num_bytes() < required_right_bytes {
3357            return Err(XlogError::Kernel(format!(
3358                "sort_merge_bounded: right key column has {} bytes; require at least {}",
3359                right_col.num_bytes(),
3360                required_right_bytes
3361            )));
3362        }
3363
3364        let mut d_output_left_idx = self.memory.alloc::<u32>(output_capacity)?;
3365        let mut d_output_right_idx = self.memory.alloc::<u32>(output_capacity)?;
3366        let mut d_output_count = self.memory.alloc::<u32>(1)?;
3367        self.device
3368            .inner()
3369            .memset_zeros(&mut d_output_count)
3370            .map_err(|e| XlogError::Kernel(format!("sort_merge_bounded: counter zero: {}", e)))?;
3371
3372        let func = self
3373            .device
3374            .inner()
3375            .get_func(
3376                JOIN_MODULE,
3377                join_kernels::SORT_MERGE_JOIN_INNER_U32_1KEY_PAIRS,
3378            )
3379            .ok_or_else(|| {
3380                XlogError::Kernel("sort_merge_join_inner_u32_1key_pairs kernel not found".into())
3381            })?;
3382
3383        let num_left_u32 = num_left as u32;
3384        let num_right_u32 = num_right as u32;
3385        let output_capacity_u32 = output_capacity as u32;
3386        let block_size = 256u32;
3387        let grid_size = num_left_u32.div_ceil(block_size);
3388        let config = LaunchConfig {
3389            grid_dim: (grid_size, 1, 1),
3390            block_dim: (block_size, 1, 1),
3391            shared_mem_bytes: 0,
3392        };
3393
3394        unsafe {
3395            func.clone()
3396                .launch(
3397                    config,
3398                    (
3399                        left_col,
3400                        right_col,
3401                        num_left_u32,
3402                        num_right_u32,
3403                        &mut d_output_left_idx,
3404                        &mut d_output_right_idx,
3405                        &mut d_output_count,
3406                        output_capacity_u32,
3407                    ),
3408                )
3409                .map_err(|e| XlogError::Kernel(format!("sort_merge_bounded launch: {}", e)))?;
3410        }
3411
3412        self.device.synchronize()?;
3413        let output_rows = self.dtoh_scalar_untracked(&d_output_count, 0)?;
3414        if output_rows as usize > output_capacity {
3415            return Err(XlogError::Kernel(format!(
3416                "sort_merge_bounded: output {} exceeded bounded capacity {}",
3417                output_rows, output_capacity
3418            )));
3419        }
3420
3421        let gathered_left = self.gather_buffer_by_indices(left, &d_output_left_idx, output_rows)?;
3422        let gathered_right =
3423            self.gather_buffer_by_indices(right, &d_output_right_idx, output_rows)?;
3424
3425        let combined_schema = self.combine_schemas(left.schema(), right.schema());
3426        let mut result_columns = Vec::with_capacity(combined_schema.arity());
3427        result_columns.extend(gathered_left.columns);
3428        result_columns.extend(gathered_right.columns);
3429        self.buffer_from_columns(result_columns, output_rows as u64, combined_schema)
3430    }
3431
3432    /// Build a cached join index for the right/build side of v2 hash join.
3433    pub fn build_join_index_v2(
3434        &self,
3435        right: &CudaBuffer,
3436        right_keys: &[usize],
3437    ) -> Result<JoinIndexV2> {
3438        let num_right = self.device_row_count(right)?;
3439        if num_right == 0 {
3440            return Err(XlogError::Kernel(
3441                "Cannot build join index for empty relation".to_string(),
3442            ));
3443        }
3444        if num_right > u32::MAX as usize {
3445            return Err(XlogError::Kernel(format!(
3446                "Join index supports at most {} rows, got {}",
3447                u32::MAX,
3448                num_right
3449            )));
3450        }
3451        if right_keys.is_empty() {
3452            return Err(XlogError::Kernel(
3453                "Join requires at least one key column".to_string(),
3454            ));
3455        }
3456        for &k in right_keys {
3457            if k >= right.arity() {
3458                return Err(XlogError::Kernel(format!(
3459                    "Right key column index {} out of bounds (arity {})",
3460                    k,
3461                    right.arity()
3462                )));
3463            }
3464        }
3465
3466        let num_right = num_right as u32;
3467        let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
3468        let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
3469
3470        Ok(JoinIndexV2 {
3471            right_num_rows: num_right,
3472            right_keys: right_keys.to_vec(),
3473            key_bytes: right_packed.key_bytes,
3474            packed_keys: right_packed.packed_keys,
3475            table,
3476        })
3477    }
3478
3479    /// Build a cached join index for background persistent-index mode.
3480    ///
3481    /// When recorded hash joins are enabled and the provider has a runtime-backed
3482    /// manager, the build is enqueued on the provider's recorded operation stream
3483    /// and dependency-recorded like the indexed join consumer path. Otherwise this
3484    /// falls back to the legacy synchronous builder.
3485    pub fn build_join_index_v2_background(
3486        &self,
3487        right: &CudaBuffer,
3488        right_keys: &[usize],
3489    ) -> Result<JoinIndexV2> {
3490        if Self::use_recorded_hash_join_env()
3491            && !right_keys.is_empty()
3492            && right_keys.len() <= 4
3493            && right.num_rows() > 0
3494        {
3495            if let Some(launch_stream) = self.recorded_op_stream_or_init() {
3496                return self.build_join_index_v2_recorded(right, right_keys, launch_stream);
3497            }
3498        }
3499
3500        self.build_join_index_v2(right, right_keys)
3501    }
3502
3503    /// Recorded-stream join-index builder used by persistent background builds.
3504    ///
3505    /// The build side is packed and bucketized on `launch_stream`; the returned
3506    /// `JoinIndexV2` carries runtime-tracked buffers whose writes were committed
3507    /// through the launch recorder / stream dependency machinery.
3508    pub fn build_join_index_v2_recorded(
3509        &self,
3510        right: &CudaBuffer,
3511        right_keys: &[usize],
3512        launch_stream: StreamId,
3513    ) -> Result<JoinIndexV2> {
3514        let runtime = self.memory.runtime().ok_or_else(|| {
3515            XlogError::Kernel(
3516                "build_join_index_v2_recorded requires a runtime-backed GpuMemoryManager"
3517                    .to_string(),
3518            )
3519        })?;
3520        let cu_stream = runtime
3521            .stream_pool()
3522            .resolve(launch_stream)
3523            .ok_or_else(|| {
3524                XlogError::Kernel(format!(
3525                    "build_join_index_v2_recorded: launch_stream StreamId({}) does not resolve",
3526                    launch_stream.0
3527                ))
3528            })?;
3529
3530        let num_right = self.device_row_count(right)?;
3531        if num_right == 0 {
3532            return Err(XlogError::Kernel(
3533                "Cannot build join index for empty relation".to_string(),
3534            ));
3535        }
3536        if num_right > u32::MAX as usize {
3537            return Err(XlogError::Kernel(format!(
3538                "Join index supports at most {} rows, got {}",
3539                u32::MAX,
3540                num_right
3541            )));
3542        }
3543        if right_keys.is_empty() {
3544            return Err(XlogError::Kernel(
3545                "Join requires at least one key column".to_string(),
3546            ));
3547        }
3548        if right_keys.len() > 4 {
3549            return Err(XlogError::Kernel(
3550                "build_join_index_v2_recorded: max 4 key columns supported".to_string(),
3551            ));
3552        }
3553        for &k in right_keys {
3554            if k >= right.arity() {
3555                return Err(XlogError::Kernel(format!(
3556                    "Right key column index {} out of bounds (arity {})",
3557                    k,
3558                    right.arity()
3559                )));
3560            }
3561        }
3562
3563        let num_right = num_right as u32;
3564        let right_packed =
3565            self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
3566        let table = self.build_hash_table_v2_on_stream(
3567            &right_packed.hashes,
3568            num_right,
3569            &cu_stream,
3570            launch_stream,
3571            runtime,
3572        )?;
3573
3574        Ok(JoinIndexV2 {
3575            right_num_rows: num_right,
3576            right_keys: right_keys.to_vec(),
3577            key_bytes: right_packed.key_bytes,
3578            packed_keys: right_packed.packed_keys,
3579            table,
3580        })
3581    }
3582
3583    /// Hash join using a cached build-side join index.
3584    ///
3585    /// The `index` must have been built for the same `right` buffer and `right_keys`.
3586    #[allow(clippy::too_many_arguments)]
3587    pub fn hash_join_v2_with_index(
3588        &self,
3589        left: &CudaBuffer,
3590        right: &CudaBuffer,
3591        left_keys: &[usize],
3592        right_keys: &[usize],
3593        join_type: JoinType,
3594        index: &JoinIndexV2,
3595        max_output: Option<usize>,
3596    ) -> Result<CudaBuffer> {
3597        // Env-gated recorded dispatch. Same `≤4 key column`
3598        // constraint as the non-indexed variant.
3599        if Self::use_recorded_hash_join_env()
3600            && !left_keys.is_empty()
3601            && left_keys.len() == right_keys.len()
3602            && left_keys.len() <= 4
3603        {
3604            if let Some(launch_stream) = self.recorded_op_stream_or_init() {
3605                return self.hash_join_v2_with_index_recorded(
3606                    left,
3607                    right,
3608                    left_keys,
3609                    right_keys,
3610                    join_type,
3611                    index,
3612                    max_output,
3613                    launch_stream,
3614                );
3615            }
3616        }
3617        let left_rows = self.device_row_count(left)?;
3618        let right_rows = self.device_row_count(right)?;
3619        if left_rows > u32::MAX as usize || right_rows > u32::MAX as usize {
3620            return Err(XlogError::Kernel(format!(
3621                "Join supports at most {} rows per side (left={}, right={})",
3622                u32::MAX,
3623                left_rows,
3624                right_rows
3625            )));
3626        }
3627
3628        // Handle empty inputs early.
3629        if left_rows == 0 {
3630            return match join_type {
3631                JoinType::Inner | JoinType::LeftOuter => {
3632                    let combined_schema = self.combine_schemas(left.schema(), right.schema());
3633                    self.create_empty_buffer(combined_schema)
3634                }
3635                JoinType::Semi | JoinType::Anti => self.create_empty_buffer(left.schema().clone()),
3636            };
3637        }
3638        if right_rows == 0 {
3639            return match join_type {
3640                JoinType::Inner => {
3641                    let combined_schema = self.combine_schemas(left.schema(), right.schema());
3642                    self.create_empty_buffer(combined_schema)
3643                }
3644                JoinType::Semi => self.create_empty_buffer(left.schema().clone()),
3645                JoinType::Anti => self.clone_buffer(left),
3646                JoinType::LeftOuter => self.left_outer_with_nulls(left, right),
3647            };
3648        }
3649
3650        // Validate key columns.
3651        if left_keys.is_empty() || right_keys.is_empty() {
3652            return Err(XlogError::Kernel(
3653                "Join requires at least one key column".to_string(),
3654            ));
3655        }
3656        if left_keys.len() != right_keys.len() {
3657            return Err(XlogError::Kernel(
3658                "Left and right key columns must have same length".to_string(),
3659            ));
3660        }
3661        for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
3662            if left_idx >= left.arity() {
3663                return Err(XlogError::Kernel(format!(
3664                    "Left key column index {} out of bounds (arity {})",
3665                    left_idx,
3666                    left.arity()
3667                )));
3668            }
3669            if right_idx >= right.arity() {
3670                return Err(XlogError::Kernel(format!(
3671                    "Right key column index {} out of bounds (arity {})",
3672                    right_idx,
3673                    right.arity()
3674                )));
3675            }
3676            let left_type = left.schema().column_type(left_idx);
3677            let right_type = right.schema().column_type(right_idx);
3678            if left_type != right_type {
3679                return Err(XlogError::Kernel(format!(
3680                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
3681                    left_idx, left_type, right_idx, right_type
3682                )));
3683            }
3684        }
3685
3686        // Validate index matches the right side.
3687        if index.right_num_rows != right_rows as u32 {
3688            return Err(XlogError::Kernel(
3689                "Join index row count does not match right relation".to_string(),
3690            ));
3691        }
3692        if index.right_keys.as_slice() != right_keys {
3693            return Err(XlogError::Kernel(
3694                "Join index key columns do not match requested right_keys".to_string(),
3695            ));
3696        }
3697
3698        match join_type {
3699            JoinType::Inner => {
3700                self.hash_join_inner_v2_indexed(left, right, left_keys, index, max_output)
3701            }
3702            JoinType::Semi => self.hash_join_semi_indexed(left, left_keys, index),
3703            JoinType::Anti => self.hash_join_anti_indexed(left, right, left_keys, index),
3704            JoinType::LeftOuter => {
3705                self.hash_join_left_outer_indexed(left, right, left_keys, index, max_output)
3706            }
3707        }
3708    }
3709
3710    /// Pack key columns on GPU and compute hashes (no host roundtrip).
3711    ///
3712    /// Uses the fused `pack_and_hash_keys` kernel for optimal performance when both
3713    /// packed keys and hashes are needed. This eliminates the host roundtrip that
3714    /// was previously required for column-major to row-major conversion.
3715    ///
3716    /// # Arguments
3717    /// * `buffer` - Source buffer with columns to pack
3718    /// * `key_cols` - Indices of columns to pack as keys (max 4)
3719    ///
3720    /// # Returns
3721    /// `PackedKeyData` containing GPU-resident packed keys and hashes
3722    ///
3723    /// # Errors
3724    /// Returns `XlogError::Kernel` if:
3725    /// - No key columns specified
3726    /// - More than 4 key columns specified (kernel limitation)
3727    /// - Column index is out of bounds
3728    /// - Kernel launch fails
3729    fn pack_keys_gpu(&self, buffer: &CudaBuffer, key_cols: &[usize]) -> Result<PackedKeyData> {
3730        if key_cols.is_empty() {
3731            return Err(XlogError::Kernel(
3732                "pack_keys_gpu: no key columns specified".into(),
3733            ));
3734        }
3735        if key_cols.len() > 4 {
3736            return Err(XlogError::Kernel(
3737                "pack_keys_gpu: max 4 key columns supported".into(),
3738            ));
3739        }
3740
3741        let num_rows = self.device_row_count(buffer)?;
3742        if num_rows > u32::MAX as usize {
3743            return Err(XlogError::Kernel(format!(
3744                "pack_keys_gpu supports at most {} rows, got {}",
3745                u32::MAX,
3746                num_rows
3747            )));
3748        }
3749        let num_rows = num_rows as u32;
3750        if num_rows == 0 {
3751            // Handle empty buffer case
3752            return Ok(PackedKeyData {
3753                hashes: self.memory.alloc::<u64>(0)?,
3754                packed_keys: self.memory.alloc::<u8>(0)?,
3755                key_bytes: 0,
3756            });
3757        }
3758
3759        // Calculate column sizes and total row size
3760        let mut col_sizes: Vec<u32> = Vec::with_capacity(key_cols.len());
3761        let mut row_size: u32 = 0;
3762        for &col_idx in key_cols {
3763            let col_type = buffer
3764                .schema()
3765                .column_type(col_idx)
3766                .ok_or_else(|| XlogError::Kernel(format!("Invalid column index: {}", col_idx)))?;
3767            let size = col_type.size_bytes() as u32;
3768            col_sizes.push(size);
3769            row_size += size;
3770        }
3771
3772        // Allocate output buffers on GPU
3773        let packed_bytes = (num_rows as u64) * (row_size as u64);
3774        let packed_slice = self.memory.alloc::<u8>(packed_bytes as usize)?;
3775        let hash_slice = self.memory.alloc::<u64>(num_rows as usize)?;
3776
3777        // Get column device pointers as u64 values for the kernel
3778        // The kernel expects raw pointers as u64
3779        let mut col_ptrs: [u64; 4] = [0; 4];
3780        for (i, &col_idx) in key_cols.iter().enumerate() {
3781            let col = buffer
3782                .column(col_idx)
3783                .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", col_idx)))?;
3784            // Get the device pointer as a raw u64 value
3785            col_ptrs[i] = *col.device_ptr();
3786        }
3787        let mut packed_col_sizes = 0u64;
3788        for (i, size) in col_sizes.iter().copied().enumerate() {
3789            if size > u16::MAX as u32 {
3790                return Err(XlogError::Kernel(format!(
3791                    "pack_keys_gpu: column element size {} exceeds 16-bit kernel argument",
3792                    size
3793                )));
3794            }
3795            packed_col_sizes |= (size as u64) << (i * 16);
3796        }
3797
3798        // Get the kernel function
3799        let func = self
3800            .device
3801            .inner()
3802            .get_func(PACK_MODULE, pack_kernels::PACK_AND_HASH_KEYS)
3803            .ok_or_else(|| XlogError::Kernel("pack_and_hash_keys kernel not found".to_string()))?;
3804
3805        // Launch configuration
3806        let block_size = 256u32;
3807        let grid_size = num_rows.div_ceil(block_size);
3808        let config = LaunchConfig {
3809            grid_dim: (grid_size, 1, 1),
3810            block_dim: (block_size, 1, 1),
3811            shared_mem_bytes: 0,
3812        };
3813
3814        // Launch the fused pack+hash kernel
3815        // SAFETY: Kernel signature matches pack_and_hash_keys in pack.cu:
3816        // pack_and_hash_keys(col0, col1, col2, col3, packed_col_sizes, num_cols, num_rows, row_size, packed_output, hashes)
3817        // Column pointers are passed as CudaSlice references - the kernel sees raw device pointers.
3818        // We pass column data as raw pointers cast to u8* in the kernel.
3819        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
3820        unsafe {
3821            func.clone()
3822                .launch(
3823                    config,
3824                    (
3825                        col_ptrs[0],
3826                        col_ptrs[1],
3827                        col_ptrs[2],
3828                        col_ptrs[3],
3829                        packed_col_sizes,
3830                        key_cols.len() as u32,
3831                        num_rows,
3832                        row_size,
3833                        &packed_slice,
3834                        &hash_slice,
3835                    ),
3836                )
3837                .map_err(|e| {
3838                    XlogError::Kernel(format!("pack_and_hash_keys launch failed: {}", e))
3839                })?;
3840        }
3841
3842        self.device.synchronize()?;
3843
3844        Ok(PackedKeyData {
3845            hashes: hash_slice,
3846            packed_keys: packed_slice,
3847            key_bytes: row_size,
3848        })
3849    }
3850
3851    /// Pack key columns on GPU and compute hashes for arbitrary column counts.
3852    fn pack_keys_gpu_generic(
3853        &self,
3854        buffer: &CudaBuffer,
3855        key_cols: &[usize],
3856    ) -> Result<PackedKeyData> {
3857        if key_cols.is_empty() {
3858            return Err(XlogError::Kernel(
3859                "pack_keys_gpu_generic: no key columns specified".into(),
3860            ));
3861        }
3862
3863        let num_rows = self.device_row_count(buffer)?;
3864        if num_rows > u32::MAX as usize {
3865            return Err(XlogError::Kernel(format!(
3866                "pack_keys_gpu_generic supports at most {} rows, got {}",
3867                u32::MAX,
3868                num_rows
3869            )));
3870        }
3871        let num_rows = num_rows as u32;
3872        if num_rows == 0 {
3873            return Ok(PackedKeyData {
3874                hashes: self.memory.alloc::<u64>(0)?,
3875                packed_keys: self.memory.alloc::<u8>(0)?,
3876                key_bytes: 0,
3877            });
3878        }
3879
3880        let mut col_sizes: Vec<u32> = Vec::with_capacity(key_cols.len());
3881        let mut col_ptrs: Vec<u64> = Vec::with_capacity(key_cols.len());
3882        let mut row_size: u32 = 0;
3883
3884        for &col_idx in key_cols {
3885            let col_type = buffer
3886                .schema()
3887                .column_type(col_idx)
3888                .ok_or_else(|| XlogError::Kernel(format!("Invalid column index: {}", col_idx)))?;
3889            let size = col_type.size_bytes() as u32;
3890            row_size = row_size
3891                .checked_add(size)
3892                .ok_or_else(|| XlogError::Kernel("Row size overflow".to_string()))?;
3893            col_sizes.push(size);
3894
3895            let col = buffer
3896                .column(col_idx)
3897                .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", col_idx)))?;
3898            col_ptrs.push(*col.device_ptr());
3899        }
3900
3901        let packed_bytes = (num_rows as u64)
3902            .checked_mul(row_size as u64)
3903            .ok_or_else(|| XlogError::Kernel("Packed key byte size overflow".to_string()))?;
3904        let packed_slice = self.memory.alloc::<u8>(packed_bytes as usize)?;
3905        let hash_slice = self.memory.alloc::<u64>(num_rows as usize)?;
3906
3907        let mut d_col_sizes = self.memory.alloc::<u32>(col_sizes.len())?;
3908        self.htod_sync_copy_into_tracked(&col_sizes, &mut d_col_sizes)
3909            .map_err(|e| XlogError::Kernel(format!("Failed to upload col_sizes: {}", e)))?;
3910
3911        let mut d_col_ptrs = self.memory.alloc::<u64>(col_ptrs.len())?;
3912        self.htod_sync_copy_into_tracked(&col_ptrs, &mut d_col_ptrs)
3913            .map_err(|e| XlogError::Kernel(format!("Failed to upload col_ptrs: {}", e)))?;
3914
3915        let func = self
3916            .device
3917            .inner()
3918            .get_func(PACK_MODULE, pack_kernels::PACK_AND_HASH_KEYS_GENERIC)
3919            .ok_or_else(|| {
3920                XlogError::Kernel("pack_and_hash_keys_generic kernel not found".to_string())
3921            })?;
3922
3923        let block_size = 256u32;
3924        let grid_size = num_rows.div_ceil(block_size);
3925        let config = LaunchConfig {
3926            grid_dim: (grid_size, 1, 1),
3927            block_dim: (block_size, 1, 1),
3928            shared_mem_bytes: 0,
3929        };
3930
3931        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
3932        unsafe {
3933            func.clone()
3934                .launch(
3935                    config,
3936                    (
3937                        &d_col_ptrs,
3938                        &d_col_sizes,
3939                        key_cols.len() as u32,
3940                        num_rows,
3941                        row_size,
3942                        &packed_slice,
3943                        &hash_slice,
3944                    ),
3945                )
3946                .map_err(|e| {
3947                    XlogError::Kernel(format!("pack_and_hash_keys_generic launch failed: {}", e))
3948                })?;
3949        }
3950
3951        self.device.synchronize()?;
3952
3953        Ok(PackedKeyData {
3954            hashes: hash_slice,
3955            packed_keys: packed_slice,
3956            key_bytes: row_size,
3957        })
3958    }
3959
3960    /// Compute composite hashes AND return packed key data for key verification.
3961    ///
3962    /// Uses GPU-side packing via `pack_keys_gpu` when possible (1-4 key columns).
3963    /// Falls back to CPU packing for edge cases or when GPU packing fails.
3964    ///
3965    /// Uses FNV-1a hash to combine all key columns into a single u64 hash per row.
3966    /// Also returns the packed key data for byte-by-byte comparison in join kernels.
3967    pub(super) fn compute_hashes_and_pack_keys(
3968        &self,
3969        buffer: &CudaBuffer,
3970        key_cols: &[usize],
3971    ) -> Result<PackedKeyData> {
3972        if key_cols.is_empty() {
3973            return Err(XlogError::Kernel(
3974                "compute_hashes_and_pack_keys: no key columns specified".to_string(),
3975            ));
3976        }
3977
3978        if key_cols.len() <= 4 {
3979            self.pack_keys_gpu(buffer, key_cols)
3980        } else {
3981            self.pack_keys_gpu_generic(buffer, key_cols)
3982        }
3983    }
3984
3985    /// Build a cache-friendly hash table from u64 hashes (v2).
3986    ///
3987    /// The table uses a bucketed CSR layout (counts + offsets + entries), avoiding linked-list
3988    /// pointer chasing during probe.
3989    fn build_hash_table_v2(
3990        &self,
3991        hashes: &cudarc::driver::CudaSlice<u64>,
3992        num_rows: u32,
3993    ) -> Result<JoinHashTableV2> {
3994        let device = self.device.inner();
3995
3996        // Number of buckets: next power-of-two >= max(2*num_rows, 1024)
3997        let target = (num_rows as u64).saturating_mul(2).max(1024);
3998        let num_buckets_u64 = target.next_power_of_two();
3999        let num_buckets = u32::try_from(num_buckets_u64).map_err(|_| {
4000            XlogError::Kernel(format!(
4001                "Join hash table too large: num_buckets={}",
4002                num_buckets_u64
4003            ))
4004        })?;
4005        let bucket_mask = num_buckets
4006            .checked_sub(1)
4007            .ok_or_else(|| XlogError::Kernel("Join hash table size underflow".to_string()))?;
4008
4009        let mut bucket_counts = self.memory.alloc::<u32>(num_buckets as usize)?;
4010        if num_buckets > 0 {
4011            device
4012                .memset_zeros(&mut bucket_counts)
4013                .map_err(|e| XlogError::Kernel(format!("Failed to zero bucket_counts: {}", e)))?;
4014            self.device.synchronize()?;
4015        }
4016
4017        let block_size = 256u32;
4018        let grid_size = num_rows.div_ceil(block_size);
4019        let config = LaunchConfig {
4020            grid_dim: (grid_size, 1, 1),
4021            block_dim: (block_size, 1, 1),
4022            shared_mem_bytes: 0,
4023        };
4024
4025        let count_fn = device
4026            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_BUCKET_COUNT_V2)
4027            .ok_or_else(|| {
4028                XlogError::Kernel("hash_join_bucket_count_v2 kernel not found".to_string())
4029            })?;
4030
4031        // SAFETY: hash_join_bucket_count_v2(hashes, num_rows, bucket_counts, bucket_mask)
4032        unsafe {
4033            count_fn
4034                .clone()
4035                .launch(config, (hashes, num_rows, &bucket_counts, bucket_mask))
4036                .map_err(|e| {
4037                    XlogError::Kernel(format!("hash_join_bucket_count_v2 failed: {}", e))
4038                })?;
4039        }
4040        self.device.synchronize()?;
4041
4042        // bucket_offsets = exclusive scan(bucket_counts)
4043        let mut bucket_offsets = self.memory.alloc::<u32>(num_buckets as usize)?;
4044        if num_buckets > 0 {
4045            device
4046                .dtod_copy(&bucket_counts, &mut bucket_offsets)
4047                .map_err(|e| XlogError::Kernel(format!("Failed to copy bucket_counts: {}", e)))?;
4048            self.device.synchronize()?;
4049            self.multiblock_scan_u32_inplace(&mut bucket_offsets, num_buckets)?;
4050            self.device.synchronize()?;
4051        }
4052
4053        // bucket_cursors = bucket_offsets (then atomically incremented during scatter)
4054        let mut bucket_cursors = self.memory.alloc::<u32>(num_buckets as usize)?;
4055        if num_buckets > 0 {
4056            device
4057                .dtod_copy(&bucket_offsets, &mut bucket_cursors)
4058                .map_err(|e| XlogError::Kernel(format!("Failed to copy bucket_offsets: {}", e)))?;
4059            self.device.synchronize()?;
4060        }
4061
4062        let bucket_entries = self.memory.alloc::<u32>(num_rows as usize)?;
4063        let bucket_entry_hashes = self.memory.alloc::<u64>(num_rows as usize)?;
4064
4065        let scatter_fn = device
4066            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SCATTER_V2)
4067            .ok_or_else(|| {
4068                XlogError::Kernel("hash_join_scatter_v2 kernel not found".to_string())
4069            })?;
4070
4071        // SAFETY: hash_join_scatter_v2(hashes, num_rows, bucket_cursors, bucket_mask, bucket_entries, bucket_entry_hashes)
4072        unsafe {
4073            scatter_fn
4074                .clone()
4075                .launch(
4076                    config,
4077                    (
4078                        hashes,
4079                        num_rows,
4080                        &bucket_cursors,
4081                        bucket_mask,
4082                        &bucket_entries,
4083                        &bucket_entry_hashes,
4084                    ),
4085                )
4086                .map_err(|e| XlogError::Kernel(format!("hash_join_scatter_v2 failed: {}", e)))?;
4087        }
4088
4089        self.device.synchronize()?;
4090        Ok(JoinHashTableV2 {
4091            bucket_counts,
4092            bucket_offsets,
4093            bucket_entries,
4094            bucket_entry_hashes,
4095            bucket_mask,
4096        })
4097    }
4098
4099    /// Build a bucketed hash table from a u64 hash array.
4100    pub fn build_hash_table_u64(
4101        &self,
4102        hashes: &crate::memory::TrackedCudaSlice<u64>,
4103        num_rows: u32,
4104    ) -> Result<HashTableU64> {
4105        let JoinHashTableV2 {
4106            bucket_counts,
4107            bucket_offsets,
4108            bucket_entries,
4109            bucket_entry_hashes,
4110            bucket_mask,
4111        } = self.build_hash_table_v2(hashes, num_rows)?;
4112        Ok(HashTableU64 {
4113            bucket_counts,
4114            bucket_offsets,
4115            bucket_entries,
4116            bucket_entry_hashes,
4117            bucket_mask,
4118        })
4119    }
4120
4121    /// Inner join implementation using v2 kernels
4122    fn hash_join_inner_v2(
4123        &self,
4124        left: &CudaBuffer,
4125        right: &CudaBuffer,
4126        left_keys: &[usize],
4127        right_keys: &[usize],
4128        max_output: Option<usize>,
4129    ) -> Result<CudaBuffer> {
4130        let num_left = self.device_row_count(left)?;
4131        let num_right = self.device_row_count(right)?;
4132        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
4133            return Err(XlogError::Kernel(format!(
4134                "Join supports at most {} rows per side (left={}, right={})",
4135                u32::MAX,
4136                num_left,
4137                num_right
4138            )));
4139        }
4140
4141        // Handle empty inputs
4142        if num_left == 0 || num_right == 0 {
4143            let combined_schema = self.combine_schemas(left.schema(), right.schema());
4144            return self.create_empty_buffer(combined_schema);
4145        }
4146
4147        // Validate key columns
4148        if left_keys.is_empty() || right_keys.is_empty() {
4149            return Err(XlogError::Kernel(
4150                "Join requires at least one key column".to_string(),
4151            ));
4152        }
4153        if left_keys.len() != right_keys.len() {
4154            return Err(XlogError::Kernel(
4155                "Left and right key columns must have same length".to_string(),
4156            ));
4157        }
4158
4159        // Validate key column types match
4160        for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
4161            let left_type = left.schema().column_type(left_idx);
4162            let right_type = right.schema().column_type(right_idx);
4163            if left_type != right_type {
4164                return Err(XlogError::Kernel(format!(
4165                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
4166                    left_idx, left_type, right_idx, right_type
4167                )));
4168            }
4169        }
4170
4171        let num_left = num_left as u32;
4172        let num_right = num_right as u32;
4173
4174        // Compute composite hashes and pack keys for both sides
4175        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4176        let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
4177
4178        // Build hash table from right side (cache-friendly bucket layout).
4179        let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
4180
4181        // Count join output (no truncation) to size buffers precisely.
4182        //
4183        // The probe kernel always increments output_count, even when max_output==0,
4184        // so we can run a first pass with max_output=0 to get the full match count.
4185        let probe_func = self
4186            .device
4187            .inner()
4188            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
4189            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
4190
4191        let block_size = 256u32;
4192        let probe_grid = num_left.div_ceil(block_size);
4193        let probe_config = LaunchConfig {
4194            grid_dim: (probe_grid, 1, 1),
4195            block_dim: (block_size, 1, 1),
4196            shared_mem_bytes: 0,
4197        };
4198
4199        let mut d_count_only = self.memory.alloc::<u32>(1)?;
4200        self.device
4201            .inner()
4202            .memset_zeros(&mut d_count_only)
4203            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
4204        self.device.synchronize()?;
4205        let d_dummy_left = self.memory.alloc::<u32>(1)?;
4206        let d_dummy_right = self.memory.alloc::<u32>(1)?;
4207        let max_output_count_only = 0u32;
4208
4209        // SAFETY: hash_join_probe_v2(probe_hashes, num_probe,
4210        //                            bucket_offsets, bucket_counts, bucket_entries, bucket_entry_hashes, bucket_mask,
4211        //                            probe_keys, build_keys, key_bytes,
4212        //                            output_left, output_right, output_count, max_output)
4213        // Note: Using raw pointer launch because tuple exceeds 12-element limit
4214        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4215        unsafe {
4216            let mut params: Vec<*mut c_void> = vec![
4217                (&left_packed.hashes).as_kernel_param(),
4218                num_left.as_kernel_param(),
4219                (&table.bucket_offsets).as_kernel_param(),
4220                (&table.bucket_counts).as_kernel_param(),
4221                (&table.bucket_entries).as_kernel_param(),
4222                (&table.bucket_entry_hashes).as_kernel_param(),
4223                table.bucket_mask.as_kernel_param(),
4224                (&left_packed.packed_keys).as_kernel_param(),
4225                (&right_packed.packed_keys).as_kernel_param(),
4226                left_packed.key_bytes.as_kernel_param(),
4227                (&d_dummy_left).as_kernel_param(),
4228                (&d_dummy_right).as_kernel_param(),
4229                (&d_count_only).as_kernel_param(),
4230                max_output_count_only.as_kernel_param(),
4231            ];
4232            probe_func
4233                .clone()
4234                .launch(probe_config, &mut params)
4235                .map_err(|e| {
4236                    XlogError::Kernel(format!("hash_join_probe_v2 (count) failed: {}", e))
4237                })?;
4238        }
4239
4240        self.device.synchronize()?;
4241
4242        // Metadata read: this u32 is the join's count-only result, used to
4243        // size the next allocation. See `read_join_output_count_metadata`
4244        // for the metadata-vs-data-plane rationale.
4245        let full_count = self.read_join_output_count_metadata(&d_count_only)? as u64;
4246        let requested = max_output
4247            .map(|limit| (limit as u64).min(full_count))
4248            .unwrap_or(full_count);
4249
4250        if requested == 0 {
4251            let combined_schema = self.combine_schemas(left.schema(), right.schema());
4252            return self.create_empty_buffer(combined_schema);
4253        }
4254
4255        if requested > u32::MAX as u64 {
4256            return Err(XlogError::Kernel(format!(
4257                "Join produced {} rows which exceeds the u32 index limit",
4258                requested
4259            )));
4260        }
4261
4262        // Allocate output buffers for row index pairs and rerun probe to materialize results.
4263        let max_output = requested as u32;
4264        let d_output_left = self.memory.alloc::<u32>(max_output as usize)?;
4265        let d_output_right = self.memory.alloc::<u32>(max_output as usize)?;
4266        let mut d_output_count = self.memory.alloc::<u32>(1)?;
4267        self.device
4268            .inner()
4269            .memset_zeros(&mut d_output_count)
4270            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
4271        self.device.synchronize()?;
4272
4273        // SAFETY: hash_join_probe_v2(probe_hashes, num_probe,
4274        //                            bucket_offsets, bucket_counts, bucket_entries, bucket_entry_hashes, bucket_mask,
4275        //                            probe_keys, build_keys, key_bytes,
4276        //                            output_left, output_right, output_count, max_output)
4277        // Note: Using raw pointer launch because tuple exceeds 12-element limit
4278        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4279        unsafe {
4280            let mut params: Vec<*mut c_void> = vec![
4281                (&left_packed.hashes).as_kernel_param(),
4282                num_left.as_kernel_param(),
4283                (&table.bucket_offsets).as_kernel_param(),
4284                (&table.bucket_counts).as_kernel_param(),
4285                (&table.bucket_entries).as_kernel_param(),
4286                (&table.bucket_entry_hashes).as_kernel_param(),
4287                table.bucket_mask.as_kernel_param(),
4288                (&left_packed.packed_keys).as_kernel_param(),
4289                (&right_packed.packed_keys).as_kernel_param(),
4290                left_packed.key_bytes.as_kernel_param(),
4291                (&d_output_left).as_kernel_param(),
4292                (&d_output_right).as_kernel_param(),
4293                (&d_output_count).as_kernel_param(),
4294                max_output.as_kernel_param(),
4295            ];
4296            probe_func
4297                .clone()
4298                .launch(probe_config, &mut params)
4299                .map_err(|e| XlogError::Kernel(format!("hash_join_probe_v2 failed: {}", e)))?;
4300        }
4301
4302        self.device.synchronize()?;
4303
4304        // Metadata read: post-materialize device-side atomic count.
4305        // Used as the result buffer's logical row count after clamping
4306        // to the host-allocated upper bound. See
4307        // `read_join_output_count_metadata` for the rationale.
4308        // Clamp to max_output to prevent buffer overflow (kernel atomically
4309        // increments before bounds check, so count can exceed max_output).
4310        let result_count =
4311            (self.read_join_output_count_metadata(&d_output_count)? as u64).min(max_output as u64);
4312
4313        if result_count == 0 {
4314            let combined_schema = self.combine_schemas(left.schema(), right.schema());
4315            return self.create_empty_buffer(combined_schema);
4316        }
4317
4318        let output_rows = result_count as u32;
4319
4320        // Gather join results fully on-GPU (avoid host index download + host gather).
4321        let gathered_left = self.gather_buffer_by_indices(left, &d_output_left, output_rows)?;
4322        let gathered_right = self.gather_buffer_by_indices(right, &d_output_right, output_rows)?;
4323
4324        let combined_schema = self.combine_schemas(left.schema(), right.schema());
4325        let mut result_columns = Vec::with_capacity(combined_schema.arity());
4326        result_columns.extend(gathered_left.columns);
4327        result_columns.extend(gathered_right.columns);
4328
4329        self.buffer_from_columns(result_columns, result_count, combined_schema)
4330    }
4331
4332    fn hash_join_inner_v2_indexed(
4333        &self,
4334        left: &CudaBuffer,
4335        right: &CudaBuffer,
4336        left_keys: &[usize],
4337        index: &JoinIndexV2,
4338        max_output: Option<usize>,
4339    ) -> Result<CudaBuffer> {
4340        let num_left = self.device_row_count(left)?;
4341        let num_right = self.device_row_count(right)?;
4342        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
4343            return Err(XlogError::Kernel(format!(
4344                "Join supports at most {} rows per side (left={}, right={})",
4345                u32::MAX,
4346                num_left,
4347                num_right
4348            )));
4349        }
4350
4351        // Handle empty inputs.
4352        if num_left == 0 || num_right == 0 {
4353            let combined_schema = self.combine_schemas(left.schema(), right.schema());
4354            return self.create_empty_buffer(combined_schema);
4355        }
4356
4357        let num_left = num_left as u32;
4358
4359        // Compute composite hashes and pack probe keys.
4360        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4361        if left_packed.key_bytes != index.key_bytes {
4362            return Err(XlogError::Kernel(
4363                "Join key byte width mismatch between probe and cached index".to_string(),
4364            ));
4365        }
4366
4367        let table = &index.table;
4368
4369        // Count join output (no truncation) to size buffers precisely.
4370        let probe_func = self
4371            .device
4372            .inner()
4373            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
4374            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
4375
4376        let block_size = 256u32;
4377        let probe_grid = num_left.div_ceil(block_size);
4378        let probe_config = LaunchConfig {
4379            grid_dim: (probe_grid, 1, 1),
4380            block_dim: (block_size, 1, 1),
4381            shared_mem_bytes: 0,
4382        };
4383
4384        let mut d_count_only = self.memory.alloc::<u32>(1)?;
4385        self.device
4386            .inner()
4387            .memset_zeros(&mut d_count_only)
4388            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
4389        self.device.synchronize()?;
4390        let d_dummy_left = self.memory.alloc::<u32>(1)?;
4391        let d_dummy_right = self.memory.alloc::<u32>(1)?;
4392        let max_output_count_only = 0u32;
4393
4394        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4395        unsafe {
4396            let mut params: Vec<*mut c_void> = vec![
4397                (&left_packed.hashes).as_kernel_param(),
4398                num_left.as_kernel_param(),
4399                (&table.bucket_offsets).as_kernel_param(),
4400                (&table.bucket_counts).as_kernel_param(),
4401                (&table.bucket_entries).as_kernel_param(),
4402                (&table.bucket_entry_hashes).as_kernel_param(),
4403                table.bucket_mask.as_kernel_param(),
4404                (&left_packed.packed_keys).as_kernel_param(),
4405                (&index.packed_keys).as_kernel_param(),
4406                index.key_bytes.as_kernel_param(),
4407                (&d_dummy_left).as_kernel_param(),
4408                (&d_dummy_right).as_kernel_param(),
4409                (&d_count_only).as_kernel_param(),
4410                max_output_count_only.as_kernel_param(),
4411            ];
4412            probe_func
4413                .clone()
4414                .launch(probe_config, &mut params)
4415                .map_err(|e| {
4416                    XlogError::Kernel(format!("hash_join_probe_v2 (count) failed: {}", e))
4417                })?;
4418        }
4419
4420        self.device.synchronize()?;
4421
4422        // Metadata read: this u32 is the join's count-only result, used to
4423        // size the next allocation. See `read_join_output_count_metadata`
4424        // for the metadata-vs-data-plane rationale.
4425        let full_count = self.read_join_output_count_metadata(&d_count_only)? as u64;
4426        let requested = max_output
4427            .map(|limit| (limit as u64).min(full_count))
4428            .unwrap_or(full_count);
4429
4430        if requested == 0 {
4431            let combined_schema = self.combine_schemas(left.schema(), right.schema());
4432            return self.create_empty_buffer(combined_schema);
4433        }
4434
4435        if requested > u32::MAX as u64 {
4436            return Err(XlogError::Kernel(format!(
4437                "Join produced {} rows which exceeds the u32 index limit",
4438                requested
4439            )));
4440        }
4441
4442        // Allocate output buffers for row index pairs and rerun probe to materialize results.
4443        let max_output = requested as u32;
4444        let d_output_left = self.memory.alloc::<u32>(max_output as usize)?;
4445        let d_output_right = self.memory.alloc::<u32>(max_output as usize)?;
4446        let mut d_output_count = self.memory.alloc::<u32>(1)?;
4447        self.device
4448            .inner()
4449            .memset_zeros(&mut d_output_count)
4450            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
4451        self.device.synchronize()?;
4452
4453        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4454        unsafe {
4455            let mut params: Vec<*mut c_void> = vec![
4456                (&left_packed.hashes).as_kernel_param(),
4457                num_left.as_kernel_param(),
4458                (&table.bucket_offsets).as_kernel_param(),
4459                (&table.bucket_counts).as_kernel_param(),
4460                (&table.bucket_entries).as_kernel_param(),
4461                (&table.bucket_entry_hashes).as_kernel_param(),
4462                table.bucket_mask.as_kernel_param(),
4463                (&left_packed.packed_keys).as_kernel_param(),
4464                (&index.packed_keys).as_kernel_param(),
4465                index.key_bytes.as_kernel_param(),
4466                (&d_output_left).as_kernel_param(),
4467                (&d_output_right).as_kernel_param(),
4468                (&d_output_count).as_kernel_param(),
4469                max_output.as_kernel_param(),
4470            ];
4471            probe_func
4472                .clone()
4473                .launch(probe_config, &mut params)
4474                .map_err(|e| XlogError::Kernel(format!("hash_join_probe_v2 failed: {}", e)))?;
4475        }
4476
4477        self.device.synchronize()?;
4478
4479        // Metadata read: post-materialize device-side atomic count, used
4480        // as the result buffer's logical row count after clamping.
4481        let result_count =
4482            (self.read_join_output_count_metadata(&d_output_count)? as u64).min(max_output as u64);
4483
4484        if result_count == 0 {
4485            let combined_schema = self.combine_schemas(left.schema(), right.schema());
4486            return self.create_empty_buffer(combined_schema);
4487        }
4488
4489        let output_rows = result_count as u32;
4490
4491        let gathered_left = self.gather_buffer_by_indices(left, &d_output_left, output_rows)?;
4492        let gathered_right = self.gather_buffer_by_indices(right, &d_output_right, output_rows)?;
4493
4494        let combined_schema = self.combine_schemas(left.schema(), right.schema());
4495        let mut result_columns = Vec::with_capacity(combined_schema.arity());
4496        result_columns.extend(gathered_left.columns);
4497        result_columns.extend(gathered_right.columns);
4498
4499        self.buffer_from_columns(result_columns, result_count, combined_schema)
4500    }
4501
4502    /// Semi-join implementation: return left rows that have matches in right
4503    fn hash_join_semi_impl(
4504        &self,
4505        left: &CudaBuffer,
4506        right: &CudaBuffer,
4507        left_keys: &[usize],
4508        right_keys: &[usize],
4509    ) -> Result<CudaBuffer> {
4510        let num_left = self.device_row_count(left)?;
4511        let num_right = self.device_row_count(right)?;
4512        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
4513            return Err(XlogError::Kernel(format!(
4514                "Join supports at most {} rows per side (left={}, right={})",
4515                u32::MAX,
4516                num_left,
4517                num_right
4518            )));
4519        }
4520
4521        // Handle empty inputs
4522        if num_left == 0 {
4523            return self.create_empty_buffer(left.schema().clone());
4524        }
4525        if num_right == 0 {
4526            // No matches possible - return empty with left schema
4527            return self.create_empty_buffer(left.schema().clone());
4528        }
4529
4530        // Validate key columns
4531        if left_keys.is_empty() || right_keys.is_empty() {
4532            return Err(XlogError::Kernel(
4533                "Join requires at least one key column".to_string(),
4534            ));
4535        }
4536        if left_keys.len() != right_keys.len() {
4537            return Err(XlogError::Kernel(
4538                "Left and right key columns must have same length".to_string(),
4539            ));
4540        }
4541
4542        // Validate key column types match
4543        for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
4544            let left_type = left.schema().column_type(left_idx);
4545            let right_type = right.schema().column_type(right_idx);
4546            if left_type != right_type {
4547                return Err(XlogError::Kernel(format!(
4548                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
4549                    left_idx, left_type, right_idx, right_type
4550                )));
4551            }
4552        }
4553
4554        let num_left = num_left as u32;
4555        let num_right = num_right as u32;
4556
4557        // Compute composite hashes and pack keys for both sides
4558        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4559        let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
4560
4561        // Build hash table from right side (cache-friendly bucket layout).
4562        let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
4563
4564        // Allocate output mask
4565        let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
4566
4567        // Launch semi-join kernel
4568        let semi_func = self
4569            .device
4570            .inner()
4571            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
4572            .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
4573
4574        let block_size = 256u32;
4575        let grid_size = num_left.div_ceil(block_size);
4576        let config = LaunchConfig {
4577            grid_dim: (grid_size, 1, 1),
4578            block_dim: (block_size, 1, 1),
4579            shared_mem_bytes: 0,
4580        };
4581
4582        // SAFETY: hash_join_semi(probe_hashes, num_probe,
4583        //                        bucket_offsets, bucket_counts, bucket_entries, bucket_entry_hashes, bucket_mask,
4584        //                        probe_keys, build_keys, key_bytes, has_match)
4585        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4586        unsafe {
4587            semi_func
4588                .clone()
4589                .launch(
4590                    config,
4591                    (
4592                        &left_packed.hashes,
4593                        num_left,
4594                        &table.bucket_offsets,
4595                        &table.bucket_counts,
4596                        &table.bucket_entries,
4597                        &table.bucket_entry_hashes,
4598                        table.bucket_mask,
4599                        &left_packed.packed_keys,
4600                        &right_packed.packed_keys,
4601                        left_packed.key_bytes,
4602                        &d_has_match,
4603                    ),
4604                )
4605                .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
4606        }
4607
4608        self.device.synchronize()?;
4609        self.filter_by_device_mask(left, &d_has_match)
4610    }
4611
4612    /// Compute a per-row membership mask on device: for each row in `probe`,
4613    /// check whether a matching row exists in `build` (by the specified key
4614    /// columns).  Returns a `TrackedCudaSlice<u8>` of length = probe row count
4615    /// that stays GPU-resident (no D2H transfer).
4616    pub fn membership_mask_device(
4617        &self,
4618        probe: &CudaBuffer,
4619        build: &CudaBuffer,
4620        probe_keys: &[usize],
4621        build_keys: &[usize],
4622    ) -> Result<TrackedCudaSlice<u8>> {
4623        let num_probe = self.device_row_count(probe)?;
4624        let num_build = self.device_row_count(build)?;
4625
4626        // Edge case: empty probe → empty device allocation
4627        if num_probe == 0 {
4628            return self.memory.alloc::<u8>(0);
4629        }
4630
4631        // Edge case: empty build → no matches possible, return zeroed mask
4632        if num_build == 0 {
4633            let mut d_mask = self.memory.alloc::<u8>(num_probe)?;
4634            self.device.inner().memset_zeros(&mut d_mask).map_err(|e| {
4635                XlogError::Kernel(format!(
4636                    "Failed to zero membership mask for empty build: {}",
4637                    e
4638                ))
4639            })?;
4640            return Ok(d_mask);
4641        }
4642
4643        if num_probe > u32::MAX as usize || num_build > u32::MAX as usize {
4644            return Err(XlogError::Kernel(format!(
4645                "membership_mask supports at most {} rows per side (probe={}, build={})",
4646                u32::MAX,
4647                num_probe,
4648                num_build
4649            )));
4650        }
4651
4652        // Validate key columns
4653        if probe_keys.is_empty() || build_keys.is_empty() {
4654            return Err(XlogError::Kernel(
4655                "membership_mask requires at least one key column".to_string(),
4656            ));
4657        }
4658        if probe_keys.len() != build_keys.len() {
4659            return Err(XlogError::Kernel(
4660                "Probe and build key columns must have same length".to_string(),
4661            ));
4662        }
4663
4664        // Validate key column types match
4665        for (&p_idx, &b_idx) in probe_keys.iter().zip(build_keys.iter()) {
4666            let p_type = probe.schema().column_type(p_idx);
4667            let b_type = build.schema().column_type(b_idx);
4668            if p_type != b_type {
4669                return Err(XlogError::Kernel(format!(
4670                    "Key column type mismatch: probe[{}]={:?}, build[{}]={:?}",
4671                    p_idx, p_type, b_idx, b_type
4672                )));
4673            }
4674        }
4675
4676        let num_probe_u32 = num_probe as u32;
4677        let num_build_u32 = num_build as u32;
4678
4679        // Compute composite hashes and pack keys for both sides
4680        let probe_packed = self.compute_hashes_and_pack_keys(probe, probe_keys)?;
4681        let build_packed = self.compute_hashes_and_pack_keys(build, build_keys)?;
4682
4683        // Build hash table from build side
4684        let table = self.build_hash_table_v2(&build_packed.hashes, num_build_u32)?;
4685
4686        // Allocate output mask on device
4687        let d_has_match = self.memory.alloc::<u8>(num_probe)?;
4688
4689        // Launch semi-join kernel to populate the mask
4690        let semi_func = self
4691            .device
4692            .inner()
4693            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
4694            .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
4695
4696        let block_size = 256u32;
4697        let grid_size = num_probe_u32.div_ceil(block_size);
4698        let config = LaunchConfig {
4699            grid_dim: (grid_size, 1, 1),
4700            block_dim: (block_size, 1, 1),
4701            shared_mem_bytes: 0,
4702        };
4703
4704        // SAFETY: hash_join_semi(probe_hashes, num_probe,
4705        //                        bucket_offsets, bucket_counts, bucket_entries,
4706        //                        bucket_entry_hashes, bucket_mask,
4707        //                        probe_keys, build_keys, key_bytes, has_match)
4708        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4709        unsafe {
4710            semi_func
4711                .clone()
4712                .launch(
4713                    config,
4714                    (
4715                        &probe_packed.hashes,
4716                        num_probe_u32,
4717                        &table.bucket_offsets,
4718                        &table.bucket_counts,
4719                        &table.bucket_entries,
4720                        &table.bucket_entry_hashes,
4721                        table.bucket_mask,
4722                        &probe_packed.packed_keys,
4723                        &build_packed.packed_keys,
4724                        probe_packed.key_bytes,
4725                        &d_has_match,
4726                    ),
4727                )
4728                .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
4729        }
4730
4731        Ok(d_has_match)
4732    }
4733
4734    /// Compute a per-row membership mask: for each row in `probe`, check whether
4735    /// a matching row exists in `build` (by the specified key columns).
4736    /// Returns a `Vec<bool>` of length = probe row count.
4737    /// This downloads only num_probe bytes (the mask), NOT column data.
4738    pub fn membership_mask(
4739        &self,
4740        probe: &CudaBuffer,
4741        build: &CudaBuffer,
4742        probe_keys: &[usize],
4743        build_keys: &[usize],
4744    ) -> Result<Vec<bool>> {
4745        let d_has_match = self.membership_mask_device(probe, build, probe_keys, build_keys)?;
4746        let num_probe = d_has_match.len();
4747        if num_probe == 0 {
4748            return Ok(Vec::new());
4749        }
4750        let mut host_mask = vec![0u8; num_probe];
4751        self.device
4752            .inner()
4753            .dtoh_sync_copy_into(&d_has_match, &mut host_mask)
4754            .map_err(|e| XlogError::Kernel(format!("Failed to download membership mask: {}", e)))?;
4755        Ok(host_mask.into_iter().map(|b| b != 0).collect())
4756    }
4757
4758    fn hash_join_semi_indexed(
4759        &self,
4760        left: &CudaBuffer,
4761        left_keys: &[usize],
4762        index: &JoinIndexV2,
4763    ) -> Result<CudaBuffer> {
4764        let num_left = self.device_row_count(left)?;
4765        if num_left > u32::MAX as usize {
4766            return Err(XlogError::Kernel(format!(
4767                "Join supports at most {} rows on left side (left={})",
4768                u32::MAX,
4769                num_left
4770            )));
4771        }
4772
4773        // Handle empty inputs.
4774        if num_left == 0 {
4775            return self.create_empty_buffer(left.schema().clone());
4776        }
4777        if index.right_num_rows == 0 {
4778            return self.create_empty_buffer(left.schema().clone());
4779        }
4780
4781        let num_left = num_left as u32;
4782
4783        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4784        if left_packed.key_bytes != index.key_bytes {
4785            return Err(XlogError::Kernel(
4786                "Join key byte width mismatch between probe and cached index".to_string(),
4787            ));
4788        }
4789
4790        let table = &index.table;
4791
4792        // Allocate output mask.
4793        let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
4794
4795        let semi_func = self
4796            .device
4797            .inner()
4798            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
4799            .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
4800
4801        let block_size = 256u32;
4802        let grid_size = num_left.div_ceil(block_size);
4803        let config = LaunchConfig {
4804            grid_dim: (grid_size, 1, 1),
4805            block_dim: (block_size, 1, 1),
4806            shared_mem_bytes: 0,
4807        };
4808
4809        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4810        unsafe {
4811            semi_func
4812                .clone()
4813                .launch(
4814                    config,
4815                    (
4816                        &left_packed.hashes,
4817                        num_left,
4818                        &table.bucket_offsets,
4819                        &table.bucket_counts,
4820                        &table.bucket_entries,
4821                        &table.bucket_entry_hashes,
4822                        table.bucket_mask,
4823                        &left_packed.packed_keys,
4824                        &index.packed_keys,
4825                        index.key_bytes,
4826                        &d_has_match,
4827                    ),
4828                )
4829                .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
4830        }
4831
4832        self.device.synchronize()?;
4833        self.filter_by_device_mask(left, &d_has_match)
4834    }
4835
4836    /// Anti-join implementation: return left rows that have NO matches in right
4837    fn hash_join_anti_impl(
4838        &self,
4839        left: &CudaBuffer,
4840        right: &CudaBuffer,
4841        left_keys: &[usize],
4842        right_keys: &[usize],
4843    ) -> Result<CudaBuffer> {
4844        let num_left = self.device_row_count(left)?;
4845        let num_right = self.device_row_count(right)?;
4846        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
4847            return Err(XlogError::Kernel(format!(
4848                "Join supports at most {} rows per side (left={}, right={})",
4849                u32::MAX,
4850                num_left,
4851                num_right
4852            )));
4853        }
4854
4855        // Handle empty inputs
4856        if num_left == 0 {
4857            return self.create_empty_buffer(left.schema().clone());
4858        }
4859        if num_right == 0 {
4860            // No matches possible - return all left rows
4861            return self.clone_buffer(left);
4862        }
4863
4864        // Validate key columns
4865        if left_keys.is_empty() || right_keys.is_empty() {
4866            return Err(XlogError::Kernel(
4867                "Join requires at least one key column".to_string(),
4868            ));
4869        }
4870        if left_keys.len() != right_keys.len() {
4871            return Err(XlogError::Kernel(
4872                "Left and right key columns must have same length".to_string(),
4873            ));
4874        }
4875
4876        // Validate key column types match
4877        for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
4878            let left_type = left.schema().column_type(left_idx);
4879            let right_type = right.schema().column_type(right_idx);
4880            if left_type != right_type {
4881                return Err(XlogError::Kernel(format!(
4882                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
4883                    left_idx, left_type, right_idx, right_type
4884                )));
4885            }
4886        }
4887
4888        let num_left = num_left as u32;
4889        let num_right = num_right as u32;
4890
4891        // Compute composite hashes and pack keys for both sides
4892        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4893        let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
4894
4895        // Build hash table from right side (cache-friendly bucket layout).
4896        let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
4897
4898        // Allocate output mask
4899        let d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
4900
4901        // Launch anti-join kernel
4902        let anti_func = self
4903            .device
4904            .inner()
4905            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_ANTI)
4906            .ok_or_else(|| XlogError::Kernel("hash_join_anti kernel not found".to_string()))?;
4907
4908        let block_size = 256u32;
4909        let grid_size = num_left.div_ceil(block_size);
4910        let config = LaunchConfig {
4911            grid_dim: (grid_size, 1, 1),
4912            block_dim: (block_size, 1, 1),
4913            shared_mem_bytes: 0,
4914        };
4915
4916        // SAFETY: hash_join_anti(probe_hashes, num_probe,
4917        //                        bucket_offsets, bucket_counts, bucket_entries, bucket_entry_hashes, bucket_mask,
4918        //                        probe_keys, build_keys, key_bytes, no_match)
4919        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4920        unsafe {
4921            anti_func
4922                .clone()
4923                .launch(
4924                    config,
4925                    (
4926                        &left_packed.hashes,
4927                        num_left,
4928                        &table.bucket_offsets,
4929                        &table.bucket_counts,
4930                        &table.bucket_entries,
4931                        &table.bucket_entry_hashes,
4932                        table.bucket_mask,
4933                        &left_packed.packed_keys,
4934                        &right_packed.packed_keys,
4935                        left_packed.key_bytes,
4936                        &d_no_match,
4937                    ),
4938                )
4939                .map_err(|e| XlogError::Kernel(format!("hash_join_anti failed: {}", e)))?;
4940        }
4941
4942        self.device.synchronize()?;
4943        self.filter_by_device_mask(left, &d_no_match)
4944    }
4945
4946    fn hash_join_anti_indexed(
4947        &self,
4948        left: &CudaBuffer,
4949        right: &CudaBuffer,
4950        left_keys: &[usize],
4951        index: &JoinIndexV2,
4952    ) -> Result<CudaBuffer> {
4953        let num_left = self.device_row_count(left)?;
4954        let num_right = self.device_row_count(right)?;
4955        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
4956            return Err(XlogError::Kernel(format!(
4957                "Join supports at most {} rows per side (left={}, right={})",
4958                u32::MAX,
4959                num_left,
4960                num_right
4961            )));
4962        }
4963        if num_left == 0 {
4964            return self.create_empty_buffer(left.schema().clone());
4965        }
4966        if num_right == 0 {
4967            return self.clone_buffer(left);
4968        }
4969
4970        let num_left = num_left as u32;
4971
4972        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
4973        if left_packed.key_bytes != index.key_bytes {
4974            return Err(XlogError::Kernel(
4975                "Join key byte width mismatch between probe and cached index".to_string(),
4976            ));
4977        }
4978
4979        let table = &index.table;
4980
4981        let d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
4982
4983        let anti_func = self
4984            .device
4985            .inner()
4986            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_ANTI)
4987            .ok_or_else(|| XlogError::Kernel("hash_join_anti kernel not found".to_string()))?;
4988
4989        let block_size = 256u32;
4990        let grid_size = num_left.div_ceil(block_size);
4991        let config = LaunchConfig {
4992            grid_dim: (grid_size, 1, 1),
4993            block_dim: (block_size, 1, 1),
4994            shared_mem_bytes: 0,
4995        };
4996
4997        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
4998        unsafe {
4999            anti_func
5000                .clone()
5001                .launch(
5002                    config,
5003                    (
5004                        &left_packed.hashes,
5005                        num_left,
5006                        &table.bucket_offsets,
5007                        &table.bucket_counts,
5008                        &table.bucket_entries,
5009                        &table.bucket_entry_hashes,
5010                        table.bucket_mask,
5011                        &left_packed.packed_keys,
5012                        &index.packed_keys,
5013                        index.key_bytes,
5014                        &d_no_match,
5015                    ),
5016                )
5017                .map_err(|e| XlogError::Kernel(format!("hash_join_anti failed: {}", e)))?;
5018        }
5019
5020        self.device.synchronize()?;
5021        self.filter_by_device_mask(left, &d_no_match)
5022    }
5023
5024    fn hash_join_left_outer_indexed(
5025        &self,
5026        left: &CudaBuffer,
5027        right: &CudaBuffer,
5028        left_keys: &[usize],
5029        index: &JoinIndexV2,
5030        max_output: Option<usize>,
5031    ) -> Result<CudaBuffer> {
5032        let num_left = self.device_row_count(left)?;
5033        let num_right = self.device_row_count(right)?;
5034        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
5035            return Err(XlogError::Kernel(format!(
5036                "Join supports at most {} rows per side (left={}, right={})",
5037                u32::MAX,
5038                num_left,
5039                num_right
5040            )));
5041        }
5042
5043        // Handle empty left - return empty with combined schema.
5044        if num_left == 0 {
5045            let combined_schema = self.combine_schemas(left.schema(), right.schema());
5046            return self.create_empty_buffer(combined_schema);
5047        }
5048        // Handle empty right - return left rows with null right columns.
5049        if num_right == 0 {
5050            return self.left_outer_with_nulls(left, right);
5051        }
5052
5053        let num_left = num_left as u32;
5054
5055        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
5056        if left_packed.key_bytes != index.key_bytes {
5057            return Err(XlogError::Kernel(
5058                "Join key byte width mismatch between probe and cached index".to_string(),
5059            ));
5060        }
5061
5062        let table = &index.table;
5063
5064        // Allocate mask for semi-join to check which left rows have matches.
5065        let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
5066
5067        let semi_func = self
5068            .device
5069            .inner()
5070            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
5071            .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
5072
5073        let block_size = 256u32;
5074        let grid_size = num_left.div_ceil(block_size);
5075        let config = LaunchConfig {
5076            grid_dim: (grid_size, 1, 1),
5077            block_dim: (block_size, 1, 1),
5078            shared_mem_bytes: 0,
5079        };
5080
5081        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
5082        unsafe {
5083            semi_func
5084                .clone()
5085                .launch(
5086                    config,
5087                    (
5088                        &left_packed.hashes,
5089                        num_left,
5090                        &table.bucket_offsets,
5091                        &table.bucket_counts,
5092                        &table.bucket_entries,
5093                        &table.bucket_entry_hashes,
5094                        table.bucket_mask,
5095                        &left_packed.packed_keys,
5096                        &index.packed_keys,
5097                        index.key_bytes,
5098                        &d_has_match,
5099                    ),
5100                )
5101                .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
5102        }
5103
5104        let probe_func = self
5105            .device
5106            .inner()
5107            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
5108            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
5109
5110        // Count inner-join matches to size buffers precisely.
5111        let mut d_count_only = self.memory.alloc::<u32>(1)?;
5112        self.device
5113            .inner()
5114            .memset_zeros(&mut d_count_only)
5115            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
5116        let d_dummy_left = self.memory.alloc::<u32>(1)?;
5117        let d_dummy_right = self.memory.alloc::<u32>(1)?;
5118        let max_output_count_only = 0u32;
5119
5120        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
5121        unsafe {
5122            let mut params: Vec<*mut c_void> = vec![
5123                (&left_packed.hashes).as_kernel_param(),
5124                num_left.as_kernel_param(),
5125                (&table.bucket_offsets).as_kernel_param(),
5126                (&table.bucket_counts).as_kernel_param(),
5127                (&table.bucket_entries).as_kernel_param(),
5128                (&table.bucket_entry_hashes).as_kernel_param(),
5129                table.bucket_mask.as_kernel_param(),
5130                (&left_packed.packed_keys).as_kernel_param(),
5131                (&index.packed_keys).as_kernel_param(),
5132                index.key_bytes.as_kernel_param(),
5133                (&d_dummy_left).as_kernel_param(),
5134                (&d_dummy_right).as_kernel_param(),
5135                (&d_count_only).as_kernel_param(),
5136                max_output_count_only.as_kernel_param(),
5137            ];
5138            probe_func
5139                .clone()
5140                .launch(config, &mut params)
5141                .map_err(|e| {
5142                    XlogError::Kernel(format!("hash_join_probe_v2 (count) failed: {}", e))
5143                })?;
5144        }
5145
5146        self.device.synchronize()?;
5147
5148        // Metadata read: this u32 is the join's count-only result, used
5149        // to size the next allocation.
5150        let full_inner = self.read_join_output_count_metadata(&d_count_only)? as u64;
5151        let requested_inner = max_output
5152            .map(|limit| (limit as u64).min(full_inner))
5153            .unwrap_or(full_inner);
5154
5155        if requested_inner > u32::MAX as u64 {
5156            return Err(XlogError::Kernel(format!(
5157                "Join produced {} rows which exceeds the u32 index limit",
5158                requested_inner
5159            )));
5160        }
5161
5162        let max_output = requested_inner as u32;
5163        let alloc_len = (requested_inner.max(1)) as usize;
5164        let d_output_left = self.memory.alloc::<u32>(alloc_len)?;
5165        let d_output_right = self.memory.alloc::<u32>(alloc_len)?;
5166        let mut d_output_count = self.memory.alloc::<u32>(1)?;
5167        self.device
5168            .inner()
5169            .memset_zeros(&mut d_output_count)
5170            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
5171
5172        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
5173        unsafe {
5174            let mut params: Vec<*mut c_void> = vec![
5175                (&left_packed.hashes).as_kernel_param(),
5176                num_left.as_kernel_param(),
5177                (&table.bucket_offsets).as_kernel_param(),
5178                (&table.bucket_counts).as_kernel_param(),
5179                (&table.bucket_entries).as_kernel_param(),
5180                (&table.bucket_entry_hashes).as_kernel_param(),
5181                table.bucket_mask.as_kernel_param(),
5182                (&left_packed.packed_keys).as_kernel_param(),
5183                (&index.packed_keys).as_kernel_param(),
5184                index.key_bytes.as_kernel_param(),
5185                (&d_output_left).as_kernel_param(),
5186                (&d_output_right).as_kernel_param(),
5187                (&d_output_count).as_kernel_param(),
5188                max_output.as_kernel_param(),
5189            ];
5190            probe_func
5191                .clone()
5192                .launch(config, &mut params)
5193                .map_err(|e| XlogError::Kernel(format!("hash_join_probe_v2 failed: {}", e)))?;
5194        }
5195
5196        self.device.synchronize()?;
5197
5198        let device = self.device.inner();
5199
5200        // Metadata read: post-materialize device-side atomic count, used
5201        // as the inner-join result's logical row count after clamping.
5202        // Clamp to max_output to prevent buffer overflow (kernel atomically
5203        // increments before bounds check, so count can exceed max_output).
5204        let inner_count = self
5205            .read_join_output_count_metadata(&d_output_count)?
5206            .min(max_output);
5207
5208        let mask_not_fn = device
5209            .get_func(FILTER_MODULE, filter_kernels::MASK_NOT)
5210            .ok_or_else(|| XlogError::Kernel("mask_not kernel not found".to_string()))?;
5211
5212        let mut d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
5213
5214        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
5215        unsafe {
5216            mask_not_fn
5217                .clone()
5218                .launch(config, (&d_has_match, &mut d_no_match, num_left))
5219        }
5220        .map_err(|e| XlogError::Kernel(format!("mask_not failed: {}", e)))?;
5221
5222        let unmatched_left = self.filter_by_device_mask(left, &d_no_match)?;
5223
5224        let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
5225        let total_rows = (inner_count as u64) + unmatched_rows;
5226
5227        let combined_schema = self.combine_schemas(left.schema(), right.schema());
5228
5229        if total_rows == 0 {
5230            return self.create_empty_buffer(combined_schema);
5231        }
5232
5233        let inner_left = self.gather_buffer_by_indices(left, &d_output_left, inner_count)?;
5234        let inner_right = self.gather_buffer_by_indices(right, &d_output_right, inner_count)?;
5235
5236        if unmatched_rows == 0 {
5237            let mut result_columns = Vec::with_capacity(combined_schema.arity());
5238            result_columns.extend(inner_left.columns);
5239            result_columns.extend(inner_right.columns);
5240            return self.buffer_from_columns(result_columns, inner_count as u64, combined_schema);
5241        }
5242
5243        if inner_count == 0 {
5244            let mut result_columns = Vec::with_capacity(combined_schema.arity());
5245            result_columns.extend(unmatched_left.columns);
5246
5247            for col_idx in 0..right.arity() {
5248                let elem_size = right
5249                    .schema()
5250                    .column_type(col_idx)
5251                    .map(|t| t.size_bytes())
5252                    .unwrap_or(4);
5253
5254                let bytes = (unmatched_rows as usize)
5255                    .checked_mul(elem_size)
5256                    .ok_or_else(|| {
5257                        XlogError::Kernel(
5258                            "Left outer join: right column byte size overflow".to_string(),
5259                        )
5260                    })?;
5261
5262                let mut dst_col = self.memory.alloc::<u8>(bytes)?;
5263                if bytes > 0 {
5264                    device.memset_zeros(&mut dst_col).map_err(|e| {
5265                        XlogError::Kernel(format!("Failed to zero null right column: {}", e))
5266                    })?;
5267                }
5268                result_columns.push(dst_col.into());
5269            }
5270
5271            self.device.synchronize()?;
5272            return self.buffer_from_columns(result_columns, unmatched_rows, combined_schema);
5273        }
5274
5275        let mut result_columns = Vec::with_capacity(combined_schema.arity());
5276        let inner_rows = inner_count as u64;
5277
5278        for (col_idx, (inner_col, unmatched_col)) in inner_left
5279            .columns
5280            .into_iter()
5281            .zip(unmatched_left.columns)
5282            .enumerate()
5283        {
5284            let elem_size = left
5285                .schema()
5286                .column_type(col_idx)
5287                .map(|t| t.size_bytes())
5288                .unwrap_or(4);
5289
5290            let inner_bytes = (inner_rows as usize)
5291                .checked_mul(elem_size)
5292                .ok_or_else(|| {
5293                    XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
5294                })?;
5295            let unmatched_bytes = (unmatched_rows as usize)
5296                .checked_mul(elem_size)
5297                .ok_or_else(|| {
5298                    XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
5299                })?;
5300            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
5301                XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
5302            })?;
5303
5304            let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
5305
5306            if inner_bytes > 0 {
5307                let mut out_view = out_col.slice_mut(0..inner_bytes);
5308                device.dtod_copy(&inner_col, &mut out_view).map_err(|e| {
5309                    XlogError::Kernel(format!("Failed to copy inner left column: {}", e))
5310                })?;
5311            }
5312            if unmatched_bytes > 0 {
5313                let mut out_view = out_col.slice_mut(inner_bytes..total_bytes);
5314                let unmatched_view = self.column_bytes_view(&unmatched_col, unmatched_bytes)?;
5315                device
5316                    .dtod_copy(&unmatched_view, &mut out_view)
5317                    .map_err(|e| {
5318                        XlogError::Kernel(format!("Failed to copy unmatched left column: {}", e))
5319                    })?;
5320            }
5321
5322            result_columns.push(out_col.into());
5323        }
5324
5325        for (col_idx, inner_col) in inner_right.columns.into_iter().enumerate() {
5326            let elem_size = right
5327                .schema()
5328                .column_type(col_idx)
5329                .map(|t| t.size_bytes())
5330                .unwrap_or(4);
5331
5332            let inner_bytes = (inner_rows as usize)
5333                .checked_mul(elem_size)
5334                .ok_or_else(|| {
5335                    XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
5336                })?;
5337            let unmatched_bytes = (unmatched_rows as usize)
5338                .checked_mul(elem_size)
5339                .ok_or_else(|| {
5340                    XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
5341                })?;
5342            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
5343                XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
5344            })?;
5345
5346            let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
5347
5348            if total_bytes > 0 {
5349                device.memset_zeros(&mut out_col).map_err(|e| {
5350                    XlogError::Kernel(format!("Failed to zero right outer column: {}", e))
5351                })?;
5352            }
5353
5354            if inner_bytes > 0 {
5355                let mut out_view = out_col.slice_mut(0..inner_bytes);
5356                device.dtod_copy(&inner_col, &mut out_view).map_err(|e| {
5357                    XlogError::Kernel(format!("Failed to copy inner right column: {}", e))
5358                })?;
5359            }
5360
5361            result_columns.push(out_col.into());
5362        }
5363
5364        self.device.synchronize()?;
5365
5366        self.buffer_from_columns(result_columns, total_rows, combined_schema)
5367    }
5368
5369    /// Left outer join implementation: return all left rows with matched right columns or nulls
5370    fn hash_join_left_outer_impl(
5371        &self,
5372        left: &CudaBuffer,
5373        right: &CudaBuffer,
5374        left_keys: &[usize],
5375        right_keys: &[usize],
5376        max_output: Option<usize>,
5377    ) -> Result<CudaBuffer> {
5378        let num_left = self.device_row_count(left)?;
5379        let num_right = self.device_row_count(right)?;
5380        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
5381            return Err(XlogError::Kernel(format!(
5382                "Join supports at most {} rows per side (left={}, right={})",
5383                u32::MAX,
5384                num_left,
5385                num_right
5386            )));
5387        }
5388
5389        // Handle empty left - return empty with combined schema
5390        if num_left == 0 {
5391            let combined_schema = self.combine_schemas(left.schema(), right.schema());
5392            return self.create_empty_buffer(combined_schema);
5393        }
5394
5395        // Handle empty right - return left rows with null right columns
5396        if num_right == 0 {
5397            return self.left_outer_with_nulls(left, right);
5398        }
5399
5400        // Validate key columns
5401        if left_keys.is_empty() || right_keys.is_empty() {
5402            return Err(XlogError::Kernel(
5403                "Join requires at least one key column".to_string(),
5404            ));
5405        }
5406        if left_keys.len() != right_keys.len() {
5407            return Err(XlogError::Kernel(
5408                "Left and right key columns must have same length".to_string(),
5409            ));
5410        }
5411
5412        // Validate key column types match
5413        for (&left_idx, &right_idx) in left_keys.iter().zip(right_keys.iter()) {
5414            let left_type = left.schema().column_type(left_idx);
5415            let right_type = right.schema().column_type(right_idx);
5416            if left_type != right_type {
5417                return Err(XlogError::Kernel(format!(
5418                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
5419                    left_idx, left_type, right_idx, right_type
5420                )));
5421            }
5422        }
5423
5424        let num_left = num_left as u32;
5425        let num_right = num_right as u32;
5426
5427        // Compute composite hashes and pack keys for both sides
5428        let left_packed = self.compute_hashes_and_pack_keys(left, left_keys)?;
5429        let right_packed = self.compute_hashes_and_pack_keys(right, right_keys)?;
5430
5431        // Build hash table from right side (cache-friendly bucket layout).
5432        let table = self.build_hash_table_v2(&right_packed.hashes, num_right)?;
5433
5434        // Allocate mask for semi-join to check which left rows have matches
5435        let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
5436
5437        // Launch semi-join kernel to get match information
5438        let semi_func = self
5439            .device
5440            .inner()
5441            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
5442            .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
5443
5444        let block_size = 256u32;
5445        let grid_size = num_left.div_ceil(block_size);
5446        let config = LaunchConfig {
5447            grid_dim: (grid_size, 1, 1),
5448            block_dim: (block_size, 1, 1),
5449            shared_mem_bytes: 0,
5450        };
5451
5452        // SAFETY: hash_join_semi(probe_hashes, num_probe,
5453        //                        bucket_offsets, bucket_counts, bucket_entries, bucket_entry_hashes, bucket_mask,
5454        //                        probe_keys, build_keys, key_bytes, has_match)
5455        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
5456        unsafe {
5457            semi_func
5458                .clone()
5459                .launch(
5460                    config,
5461                    (
5462                        &left_packed.hashes,
5463                        num_left,
5464                        &table.bucket_offsets,
5465                        &table.bucket_counts,
5466                        &table.bucket_entries,
5467                        &table.bucket_entry_hashes,
5468                        table.bucket_mask,
5469                        &left_packed.packed_keys,
5470                        &right_packed.packed_keys,
5471                        left_packed.key_bytes,
5472                        &d_has_match,
5473                    ),
5474                )
5475                .map_err(|e| XlogError::Kernel(format!("hash_join_semi failed: {}", e)))?;
5476        }
5477
5478        let probe_func = self
5479            .device
5480            .inner()
5481            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
5482            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
5483
5484        // Count inner-join matches to size buffers precisely.
5485        let mut d_count_only = self.memory.alloc::<u32>(1)?;
5486        self.device
5487            .inner()
5488            .memset_zeros(&mut d_count_only)
5489            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
5490        let d_dummy_left = self.memory.alloc::<u32>(1)?;
5491        let d_dummy_right = self.memory.alloc::<u32>(1)?;
5492        let max_output_count_only = 0u32;
5493
5494        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
5495        unsafe {
5496            let mut params: Vec<*mut c_void> = vec![
5497                (&left_packed.hashes).as_kernel_param(),
5498                num_left.as_kernel_param(),
5499                (&table.bucket_offsets).as_kernel_param(),
5500                (&table.bucket_counts).as_kernel_param(),
5501                (&table.bucket_entries).as_kernel_param(),
5502                (&table.bucket_entry_hashes).as_kernel_param(),
5503                table.bucket_mask.as_kernel_param(),
5504                (&left_packed.packed_keys).as_kernel_param(),
5505                (&right_packed.packed_keys).as_kernel_param(),
5506                left_packed.key_bytes.as_kernel_param(),
5507                (&d_dummy_left).as_kernel_param(),
5508                (&d_dummy_right).as_kernel_param(),
5509                (&d_count_only).as_kernel_param(),
5510                max_output_count_only.as_kernel_param(),
5511            ];
5512            probe_func
5513                .clone()
5514                .launch(config, &mut params)
5515                .map_err(|e| {
5516                    XlogError::Kernel(format!("hash_join_probe_v2 (count) failed: {}", e))
5517                })?;
5518        }
5519
5520        self.device.synchronize()?;
5521
5522        // Metadata read: this u32 is the join's count-only result, used
5523        // to size the next allocation.
5524        let full_inner = self.read_join_output_count_metadata(&d_count_only)? as u64;
5525        let requested_inner = max_output
5526            .map(|limit| (limit as u64).min(full_inner))
5527            .unwrap_or(full_inner);
5528
5529        if requested_inner > u32::MAX as u64 {
5530            return Err(XlogError::Kernel(format!(
5531                "Join produced {} rows which exceeds the u32 index limit",
5532                requested_inner
5533            )));
5534        }
5535
5536        let max_output = requested_inner as u32;
5537        let alloc_len = (requested_inner.max(1)) as usize;
5538        let d_output_left = self.memory.alloc::<u32>(alloc_len)?;
5539        let d_output_right = self.memory.alloc::<u32>(alloc_len)?;
5540        let mut d_output_count = self.memory.alloc::<u32>(1)?;
5541        self.device
5542            .inner()
5543            .memset_zeros(&mut d_output_count)
5544            .map_err(|e| XlogError::Kernel(format!("Failed to zero output count: {}", e)))?;
5545
5546        // SAFETY: hash_join_probe_v2(probe_hashes, num_probe,
5547        //                            bucket_offsets, bucket_counts, bucket_entries, bucket_entry_hashes, bucket_mask,
5548        //                            probe_keys, build_keys, key_bytes,
5549        //                            output_left, output_right, output_count, max_output)
5550        // Note: Using raw pointer launch because tuple exceeds 12-element limit
5551        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
5552        unsafe {
5553            let mut params: Vec<*mut c_void> = vec![
5554                (&left_packed.hashes).as_kernel_param(),
5555                num_left.as_kernel_param(),
5556                (&table.bucket_offsets).as_kernel_param(),
5557                (&table.bucket_counts).as_kernel_param(),
5558                (&table.bucket_entries).as_kernel_param(),
5559                (&table.bucket_entry_hashes).as_kernel_param(),
5560                table.bucket_mask.as_kernel_param(),
5561                (&left_packed.packed_keys).as_kernel_param(),
5562                (&right_packed.packed_keys).as_kernel_param(),
5563                left_packed.key_bytes.as_kernel_param(),
5564                (&d_output_left).as_kernel_param(),
5565                (&d_output_right).as_kernel_param(),
5566                (&d_output_count).as_kernel_param(),
5567                max_output.as_kernel_param(),
5568            ];
5569            probe_func
5570                .clone()
5571                .launch(config, &mut params)
5572                .map_err(|e| XlogError::Kernel(format!("hash_join_probe_v2 failed: {}", e)))?;
5573        }
5574
5575        self.device.synchronize()?;
5576
5577        let device = self.device.inner();
5578
5579        // Metadata read: post-materialize device-side atomic count, used
5580        // as the inner-join result's logical row count after clamping.
5581        // Clamp to max_output to prevent buffer overflow (kernel atomically
5582        // increments before bounds check, so count can exceed max_output).
5583        let inner_count = self
5584            .read_join_output_count_metadata(&d_output_count)?
5585            .min(max_output);
5586
5587        // Build unmatched-left buffer by inverting has_match mask and compacting on-GPU.
5588        let mask_not_fn = device
5589            .get_func(FILTER_MODULE, filter_kernels::MASK_NOT)
5590            .ok_or_else(|| XlogError::Kernel("mask_not kernel not found".to_string()))?;
5591
5592        let mut d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
5593
5594        // SAFETY: mask_not(const uint8_t* a, uint8_t* out, uint32_t n)
5595        unsafe {
5596            mask_not_fn
5597                .clone()
5598                .launch(config, (&d_has_match, &mut d_no_match, num_left))
5599        }
5600        .map_err(|e| XlogError::Kernel(format!("mask_not failed: {}", e)))?;
5601
5602        let unmatched_left = self.filter_by_device_mask(left, &d_no_match)?;
5603
5604        let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
5605        let total_rows = (inner_count as u64) + unmatched_rows;
5606
5607        let combined_schema = self.combine_schemas(left.schema(), right.schema());
5608
5609        if total_rows == 0 {
5610            return self.create_empty_buffer(combined_schema);
5611        }
5612
5613        // Gather matched rows (if any) using on-GPU indices.
5614        let inner_left = self.gather_buffer_by_indices(left, &d_output_left, inner_count)?;
5615        let inner_right = self.gather_buffer_by_indices(right, &d_output_right, inner_count)?;
5616
5617        if unmatched_rows == 0 {
5618            let mut result_columns = Vec::with_capacity(combined_schema.arity());
5619            result_columns.extend(inner_left.columns);
5620            result_columns.extend(inner_right.columns);
5621            return self.buffer_from_columns(result_columns, inner_count as u64, combined_schema);
5622        }
5623
5624        if inner_count == 0 {
5625            let mut result_columns = Vec::with_capacity(combined_schema.arity());
5626            result_columns.extend(unmatched_left.columns);
5627
5628            for col_idx in 0..right.arity() {
5629                let elem_size = right
5630                    .schema()
5631                    .column_type(col_idx)
5632                    .map(|t| t.size_bytes())
5633                    .unwrap_or(4);
5634
5635                let bytes = (unmatched_rows as usize)
5636                    .checked_mul(elem_size)
5637                    .ok_or_else(|| {
5638                        XlogError::Kernel(
5639                            "Left outer join: right column byte size overflow".to_string(),
5640                        )
5641                    })?;
5642
5643                let mut dst_col = self.memory.alloc::<u8>(bytes)?;
5644                if bytes > 0 {
5645                    device.memset_zeros(&mut dst_col).map_err(|e| {
5646                        XlogError::Kernel(format!("Failed to zero null right column: {}", e))
5647                    })?;
5648                }
5649                result_columns.push(dst_col.into());
5650            }
5651
5652            self.device.synchronize()?;
5653            return self.buffer_from_columns(result_columns, unmatched_rows, combined_schema);
5654        }
5655
5656        // Concatenate: matched rows followed by unmatched rows (null-extended on right).
5657        let mut result_columns = Vec::with_capacity(combined_schema.arity());
5658        let inner_rows = inner_count as u64;
5659
5660        // Left columns: inner-left then unmatched-left.
5661        for (col_idx, (inner_col, unmatched_col)) in inner_left
5662            .columns
5663            .into_iter()
5664            .zip(unmatched_left.columns)
5665            .enumerate()
5666        {
5667            let elem_size = left
5668                .schema()
5669                .column_type(col_idx)
5670                .map(|t| t.size_bytes())
5671                .unwrap_or(4);
5672
5673            let inner_bytes = (inner_rows as usize)
5674                .checked_mul(elem_size)
5675                .ok_or_else(|| {
5676                    XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
5677                })?;
5678            let unmatched_bytes = (unmatched_rows as usize)
5679                .checked_mul(elem_size)
5680                .ok_or_else(|| {
5681                    XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
5682                })?;
5683            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
5684                XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
5685            })?;
5686
5687            let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
5688
5689            if inner_bytes > 0 {
5690                let mut out_view = out_col.slice_mut(0..inner_bytes);
5691                device.dtod_copy(&inner_col, &mut out_view).map_err(|e| {
5692                    XlogError::Kernel(format!("Failed to copy inner left column: {}", e))
5693                })?;
5694            }
5695            if unmatched_bytes > 0 {
5696                let mut out_view = out_col.slice_mut(inner_bytes..total_bytes);
5697                let unmatched_view = self.column_bytes_view(&unmatched_col, unmatched_bytes)?;
5698                device
5699                    .dtod_copy(&unmatched_view, &mut out_view)
5700                    .map_err(|e| {
5701                        XlogError::Kernel(format!("Failed to copy unmatched left column: {}", e))
5702                    })?;
5703            }
5704
5705            result_columns.push(out_col.into());
5706        }
5707
5708        // Right columns: inner-right then zeros.
5709        for (col_idx, inner_col) in inner_right.columns.into_iter().enumerate() {
5710            let elem_size = right
5711                .schema()
5712                .column_type(col_idx)
5713                .map(|t| t.size_bytes())
5714                .unwrap_or(4);
5715
5716            let inner_bytes = (inner_rows as usize)
5717                .checked_mul(elem_size)
5718                .ok_or_else(|| {
5719                    XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
5720                })?;
5721            let unmatched_bytes = (unmatched_rows as usize)
5722                .checked_mul(elem_size)
5723                .ok_or_else(|| {
5724                    XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
5725                })?;
5726            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
5727                XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
5728            })?;
5729
5730            let mut out_col = self.memory.alloc::<u8>(total_bytes)?;
5731
5732            if total_bytes > 0 {
5733                device.memset_zeros(&mut out_col).map_err(|e| {
5734                    XlogError::Kernel(format!("Failed to zero right outer column: {}", e))
5735                })?;
5736            }
5737
5738            if inner_bytes > 0 {
5739                let mut out_view = out_col.slice_mut(0..inner_bytes);
5740                device.dtod_copy(&inner_col, &mut out_view).map_err(|e| {
5741                    XlogError::Kernel(format!("Failed to copy inner right column: {}", e))
5742                })?;
5743            }
5744
5745            result_columns.push(out_col.into());
5746        }
5747
5748        self.device.synchronize()?;
5749
5750        self.buffer_from_columns(result_columns, total_rows, combined_schema)
5751    }
5752
5753    /// Helper for left outer join with empty right: all left rows with null right columns
5754    fn left_outer_with_nulls(&self, left: &CudaBuffer, right: &CudaBuffer) -> Result<CudaBuffer> {
5755        let combined_schema = self.combine_schemas(left.schema(), right.schema());
5756        let num_rows = self.device_row_count(left)? as u64;
5757        if num_rows == 0 {
5758            return self.create_empty_buffer(combined_schema);
5759        }
5760        let device = self.device.inner();
5761
5762        let mut result_columns = Vec::with_capacity(combined_schema.arity());
5763
5764        // Copy all left columns device-to-device
5765        for col_idx in 0..left.arity() {
5766            let col = left
5767                .column(col_idx)
5768                .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
5769
5770            let elem_size = left
5771                .schema()
5772                .column_type(col_idx)
5773                .map(|t| t.size_bytes())
5774                .unwrap_or(4);
5775
5776            let bytes = (num_rows as usize) * elem_size;
5777            let mut dst_col = self.memory.alloc::<u8>(bytes)?;
5778            if bytes > 0 {
5779                let src_view = self.column_bytes_view(col, bytes)?;
5780                device
5781                    .dtod_copy(&src_view, &mut dst_col)
5782                    .map_err(|e| XlogError::Kernel(format!("Failed to copy left column: {}", e)))?;
5783            }
5784
5785            result_columns.push(dst_col.into());
5786        }
5787
5788        // Create null (zero) columns for right side on-device
5789        for col_idx in 0..right.arity() {
5790            let elem_size = right
5791                .schema()
5792                .column_type(col_idx)
5793                .map(|t| t.size_bytes())
5794                .unwrap_or(4);
5795
5796            let bytes = (num_rows as usize) * elem_size;
5797            let mut dst_col = self.memory.alloc::<u8>(bytes)?;
5798            if bytes > 0 {
5799                device
5800                    .memset_zeros(&mut dst_col)
5801                    .map_err(|e| XlogError::Kernel(format!("Failed to zero null column: {}", e)))?;
5802            }
5803
5804            result_columns.push(dst_col.into());
5805        }
5806
5807        self.device.synchronize()?;
5808
5809        self.buffer_from_columns(result_columns, num_rows, combined_schema)
5810    }
5811
5812    /// Clone a buffer (deep copy) on-device.
5813    ///
5814    /// This is primarily used when a caller needs owned buffer state for a
5815    /// separate runtime object while preserving the original relation store.
5816    pub fn clone_buffer(&self, buffer: &CudaBuffer) -> Result<CudaBuffer> {
5817        // Debug probe (XLOG_DEBUG_VERIFY_CLONES=1): byte-compare every
5818        // cloned column against its source immediately after the copy.
5819        // Discriminates transport faults (clone wrong at birth) from
5820        // source faults (clone faithful, source already corrupt).
5821        let verify = {
5822            static ENABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
5823            *ENABLED.get_or_init(|| {
5824                std::env::var("XLOG_DEBUG_VERIFY_CLONES").map(|v| v == "1") == Ok(true)
5825            })
5826        };
5827
5828        let mut result_columns = Vec::with_capacity(buffer.arity());
5829        let device = self.device.inner();
5830
5831        for col_idx in 0..buffer.arity() {
5832            let src_col = buffer
5833                .column(col_idx)
5834                .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
5835            let mut dst_col = self.memory.alloc::<u8>(src_col.len())?;
5836            if !src_col.is_empty() {
5837                device
5838                    .dtod_copy(src_col, &mut dst_col)
5839                    .map_err(|e| XlogError::Kernel(format!("Failed to clone column: {}", e)))?;
5840            }
5841            if verify && !src_col.is_empty() {
5842                self.device.synchronize()?;
5843                let mut src_host = vec![0u8; src_col.len()];
5844                let mut dst_host = vec![0u8; dst_col.len()];
5845                device
5846                    .dtoh_sync_copy_into(src_col, &mut src_host)
5847                    .map_err(|e| XlogError::Kernel(format!("verify src dtoh: {}", e)))?;
5848                device
5849                    .dtoh_sync_copy_into(&dst_col, &mut dst_host)
5850                    .map_err(|e| XlogError::Kernel(format!("verify dst dtoh: {}", e)))?;
5851                if src_host != dst_host {
5852                    let first_diff = src_host
5853                        .iter()
5854                        .zip(dst_host.iter())
5855                        .position(|(a, b)| a != b)
5856                        .unwrap_or(0);
5857                    return Err(XlogError::Kernel(format!(
5858                        "CLONE VERIFY FAILED: column {} differs from source at byte {} of {} (clone is wrong at birth)",
5859                        col_idx,
5860                        first_diff,
5861                        src_col.len(),
5862                    )));
5863                }
5864            }
5865            result_columns.push(dst_col.into());
5866        }
5867
5868        let mut d_num_rows = self.memory.alloc::<u32>(1)?;
5869        device
5870            .dtod_copy(buffer.num_rows_device(), &mut d_num_rows)
5871            .map_err(|e| XlogError::Kernel(format!("Failed to clone row count: {}", e)))?;
5872
5873        let cloned = CudaBuffer::from_columns(
5874            result_columns,
5875            buffer.row_cap,
5876            d_num_rows,
5877            buffer.schema().clone(),
5878        );
5879        // Preserve the host-side row-count cache so downstream code can avoid
5880        // a D2H read of num_rows_device() just to learn the row count.
5881        if let Some(cached) = buffer.cached_row_count() {
5882            cloned.set_cached_row_count_if_unset(cached);
5883        }
5884        Ok(cloned)
5885    }
5886    // ============== Arithmetic Operations (GPU) ==============
5887
5888    /// Extract a single column from a buffer as a new single-column buffer
5889    ///
5890    /// # Arguments
5891    /// * `buffer` - The source buffer
5892    /// * `col_idx` - The column index to extract
5893    ///
5894    /// # Returns
5895    /// A new single-column CudaBuffer containing just the specified column
5896    pub fn extract_column(&self, buffer: &CudaBuffer, col_idx: usize) -> Result<CudaBuffer> {
5897        if buffer.is_empty() {
5898            let col_type = buffer
5899                .schema()
5900                .column_type(col_idx)
5901                .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
5902            let schema = Schema::new(vec![("col".to_string(), col_type)]);
5903            return self.create_empty_buffer(schema);
5904        }
5905
5906        let col_type = buffer
5907            .schema()
5908            .column_type(col_idx)
5909            .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
5910        let src_col = buffer
5911            .column(col_idx)
5912            .ok_or_else(|| XlogError::Kernel(format!("Column {} not found in buffer", col_idx)))?;
5913        let mut dst_col = self.memory.alloc::<u8>(src_col.len())?;
5914        let device = self.device.inner();
5915        if !src_col.is_empty() {
5916            device
5917                .dtod_copy(src_col, &mut dst_col)
5918                .map_err(|e| XlogError::Kernel(format!("Failed to copy column: {}", e)))?;
5919        }
5920
5921        let mut d_num_rows = self.memory.alloc::<u32>(1)?;
5922        device
5923            .dtod_copy(buffer.num_rows_device(), &mut d_num_rows)
5924            .map_err(|e| XlogError::Kernel(format!("Failed to copy row count: {}", e)))?;
5925        self.device.synchronize()?;
5926
5927        let schema = Schema::new(vec![("col".to_string(), col_type)]);
5928        Ok(CudaBuffer::from_columns(
5929            vec![dst_col.into()],
5930            buffer.row_cap,
5931            d_num_rows,
5932            schema,
5933        ))
5934    }
5935
5936    /// Extract active (i,j,k) rule indices from a flattened N×N×N mask.
5937    /// Returns up to `max_active` entries sorted by soft-mask priority.
5938    pub fn extract_active_rule_indices(
5939        &self,
5940        mask_hard: &CudaBuffer,
5941        mask_soft: &CudaBuffer,
5942        n: usize,
5943        max_active: usize,
5944    ) -> Result<Vec<(u32, u32, u32)>> {
5945        let total = n * n * n;
5946        let block_size = 256usize;
5947        let grid_size = total.div_ceil(block_size);
5948
5949        let mut out_i = self.memory().alloc::<u32>(total)?;
5950        let mut out_j = self.memory().alloc::<u32>(total)?;
5951        let mut out_k = self.memory().alloc::<u32>(total)?;
5952        let mut out_p = self.memory().alloc::<f32>(total)?;
5953        let mut count = self.memory().alloc::<u32>(1)?;
5954
5955        self.htod_launch_metadata_sync_copy_into(&[0u32], &mut count)
5956            .map_err(|e| XlogError::Kernel(format!("ILP htod count: {}", e)))?;
5957
5958        let hard_col = mask_hard
5959            .column(0)
5960            .ok_or_else(|| XlogError::Kernel("ILP hard mask has no column".into()))?;
5961        let soft_col = mask_soft
5962            .column(0)
5963            .ok_or_else(|| XlogError::Kernel("ILP soft mask has no column".into()))?;
5964
5965        let kernel = self
5966            .device()
5967            .inner()
5968            .get_func(ILP_MODULE, ilp_kernels::EXTRACT_NONZERO_INDICES)
5969            .ok_or_else(|| XlogError::Kernel("extract_nonzero_indices kernel not found".into()))?;
5970
5971        let hard_bytes = total * std::mem::size_of::<f32>();
5972        let soft_bytes = total * std::mem::size_of::<f32>();
5973        let hard_view = self.column_bytes_view(hard_col, hard_bytes)?;
5974        let soft_view = self.column_bytes_view(soft_col, soft_bytes)?;
5975
5976        // SAFETY: kernel arguments match the PTX signature; device buffers were allocated with sufficient size
5977        unsafe {
5978            kernel
5979                .clone()
5980                .launch(
5981                    cudarc::driver::LaunchConfig {
5982                        grid_dim: (grid_size as u32, 1, 1),
5983                        block_dim: (block_size as u32, 1, 1),
5984                        shared_mem_bytes: 0,
5985                    },
5986                    (
5987                        &hard_view, &soft_view, n as u32, &mut out_i, &mut out_j, &mut out_k,
5988                        &mut out_p, &mut count,
5989                    ),
5990                )
5991                .map_err(|e| {
5992                    XlogError::Kernel(format!("Failed to launch extract_nonzero_indices: {}", e))
5993                })?;
5994        }
5995
5996        let mut count_host = [0u32];
5997        self.device()
5998            .inner()
5999            .dtoh_sync_copy_into(&count, &mut count_host)
6000            .map_err(|e| XlogError::Kernel(format!("ILP dtoh count: {}", e)))?;
6001        let active_count = count_host[0] as usize;
6002
6003        if active_count == 0 {
6004            return Ok(Vec::new());
6005        }
6006
6007        let mut i_host = vec![0u32; active_count];
6008        let mut j_host = vec![0u32; active_count];
6009        let mut k_host = vec![0u32; active_count];
6010        let mut p_host = vec![0f32; active_count];
6011
6012        let out_i_view = out_i
6013            .try_slice(0..active_count)
6014            .ok_or_else(|| XlogError::Kernel("ILP slice i out of bounds".into()))?;
6015        let out_j_view = out_j
6016            .try_slice(0..active_count)
6017            .ok_or_else(|| XlogError::Kernel("ILP slice j out of bounds".into()))?;
6018        let out_k_view = out_k
6019            .try_slice(0..active_count)
6020            .ok_or_else(|| XlogError::Kernel("ILP slice k out of bounds".into()))?;
6021        let out_p_view = out_p
6022            .try_slice(0..active_count)
6023            .ok_or_else(|| XlogError::Kernel("ILP slice p out of bounds".into()))?;
6024
6025        self.device()
6026            .inner()
6027            .dtoh_sync_copy_into(&out_i_view, &mut i_host)
6028            .map_err(|e| XlogError::Kernel(format!("ILP dtoh i: {}", e)))?;
6029        self.device()
6030            .inner()
6031            .dtoh_sync_copy_into(&out_j_view, &mut j_host)
6032            .map_err(|e| XlogError::Kernel(format!("ILP dtoh j: {}", e)))?;
6033        self.device()
6034            .inner()
6035            .dtoh_sync_copy_into(&out_k_view, &mut k_host)
6036            .map_err(|e| XlogError::Kernel(format!("ILP dtoh k: {}", e)))?;
6037        self.device()
6038            .inner()
6039            .dtoh_sync_copy_into(&out_p_view, &mut p_host)
6040            .map_err(|e| XlogError::Kernel(format!("ILP dtoh p: {}", e)))?;
6041
6042        let mut indices: Vec<(f32, u32, u32, u32)> = (0..active_count)
6043            .map(|idx| (p_host[idx], i_host[idx], j_host[idx], k_host[idx]))
6044            .collect();
6045        indices.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
6046        indices.truncate(max_active);
6047
6048        Ok(indices.into_iter().map(|(_, i, j, k)| (i, j, k)).collect())
6049    }
6050
6051    // ============== Recorded sort + dedup_full_row ==============
6052    //
6053    // Strict-recorder, launch_stream-routed siblings of `sort` and
6054    // `dedup_full_row`. Scope is intentionally narrow:
    //   * `sort_recorded` accepts only u32 / Symbol / u64 key columns; other
    //     key types return XlogError::Kernel before any kernel work is
    //     queued.
6058    //   * `dedup_full_row_recorded` requires every column to be u32 / Symbol
6059    //     (it composes sort_recorded internally). Mixed-type full-row dedup
6060    //     remains on the legacy `dedup_full_row`.
6061    //
6062    // No legacy default-routed code is touched. Existing callers are
6063    // unchanged. Runtime/provider opt-in wiring is NOT included in this
6064    // path — callers that want recorded sort/dedup must invoke the
6065    // recorded methods directly with a launch_stream.
6066
6067    /// Stream-aware variant of
6068    /// [`Self::radix_sort_u32_pairs_with_scratch`]. Same kernel
6069    /// chain but every launch is `launch_on_stream` and there
6070    /// are no internal `device.synchronize()` calls. Caller-owned
6071    /// scratch (`keys_a/b`, `indices_a/b`, `hist`, `prefix`,
6072    /// `ranks`) is recorded by the caller; intermediate
6073    /// `block_sums` allocations created by the inner scan are
6074    /// recorded directly inside
6075    /// [`Self::multiblock_scan_u32_view_inplace_on_stream`].
6076    #[allow(clippy::too_many_arguments)]
6077    fn radix_sort_u32_pairs_with_scratch_on_stream(
6078        &self,
6079        keys_a: &mut TrackedCudaSlice<u32>,
6080        keys_b: &mut TrackedCudaSlice<u32>,
6081        indices_a: &mut TrackedCudaSlice<u32>,
6082        indices_b: &mut TrackedCudaSlice<u32>,
6083        hist: &mut TrackedCudaSlice<u32>,
6084        prefix: &mut TrackedCudaSlice<u32>,
6085        ranks: &mut TrackedCudaSlice<u32>,
6086        num_rows_device: &TrackedCudaSlice<u32>,
6087        row_cap: u32,
6088        cu_stream: &cudarc::driver::CudaStream,
6089        launch_stream: StreamId,
6090        runtime: &crate::device_runtime::XlogDeviceRuntime,
6091    ) -> Result<()> {
6092        if row_cap == 0 {
6093            return Ok(());
6094        }
6095        let device = self.device.inner();
6096        let block_size = Self::SORT_BLOCK_SIZE;
6097        let grid_size = row_cap.div_ceil(block_size);
6098        let sort_config = LaunchConfig {
6099            grid_dim: (grid_size, 1, 1),
6100            block_dim: (block_size, 1, 1),
6101            shared_mem_bytes: 0,
6102        };
6103
6104        let histogram_fn = device
6105            .get_func(SORT_MODULE, sort_kernels::RADIX_HISTOGRAM)
6106            .ok_or_else(|| XlogError::Kernel("radix_histogram kernel not found".to_string()))?;
6107        let prefix_fn = device
6108            .get_func(SORT_MODULE, sort_kernels::COMPUTE_DIGIT_PREFIX_SUMS)
6109            .ok_or_else(|| {
6110                XlogError::Kernel("compute_digit_prefix_sums kernel not found".to_string())
6111            })?;
6112        let ranks_fn = device
6113            .get_func(SORT_MODULE, sort_kernels::COMPUTE_RANKS)
6114            .ok_or_else(|| XlogError::Kernel("compute_ranks kernel not found".to_string()))?;
6115        let scatter_fn = device
6116            .get_func(SORT_MODULE, sort_kernels::RADIX_SCATTER_STABLE)
6117            .ok_or_else(|| {
6118                XlogError::Kernel("radix_scatter_stable kernel not found".to_string())
6119            })?;
6120        let prefix_config = LaunchConfig {
6121            grid_dim: (1, 1, 1),
6122            block_dim: (256, 1, 1),
6123            shared_mem_bytes: 0,
6124        };
6125
6126        let mut in_a = true;
6127        for pass in 0..8u32 {
6128            let shift = pass * 4;
6129            let (keys_in, indices_in, keys_out, indices_out) = if in_a {
6130                (&*keys_a, &*indices_a, &mut *keys_b, &mut *indices_b)
6131            } else {
6132                (&*keys_b, &*indices_b, &mut *keys_a, &mut *indices_a)
6133            };
6134
6135            // SAFETY: radix_histogram(keys, num_rows_device, row_cap, histograms, shift)
6136            unsafe {
6137                histogram_fn.clone().launch_on_stream(
6138                    cu_stream,
6139                    sort_config,
6140                    (keys_in, num_rows_device, row_cap, &mut *hist, shift),
6141                )
6142            }
6143            .map_err(|e| XlogError::Kernel(format!("radix_histogram (on_stream) failed: {}", e)))?;
6144
6145            // SAFETY: compute_digit_prefix_sums(histograms, grid_size, prefix_sums)
6146            unsafe {
6147                prefix_fn.clone().launch_on_stream(
6148                    cu_stream,
6149                    prefix_config,
6150                    (&*hist, grid_size, &mut *prefix),
6151                )
6152            }
6153            .map_err(|e| {
6154                XlogError::Kernel(format!(
6155                    "compute_digit_prefix_sums (on_stream) failed: {}",
6156                    e
6157                ))
6158            })?;
6159
6160            // Per-digit per-block exclusive offsets — in-place
6161            // scan on a 16-strided view of `hist`.
6162            for digit in 0..16u32 {
6163                let start = (digit * grid_size) as usize;
6164                let end = start + (grid_size as usize);
6165                let mut digit_slice = hist.slice_mut(start..end);
6166                self.multiblock_scan_u32_view_inplace_on_stream(
6167                    &mut digit_slice,
6168                    grid_size,
6169                    cu_stream,
6170                    launch_stream,
6171                    runtime,
6172                )?;
6173            }
6174
6175            // SAFETY: compute_ranks(keys, num_rows_device, row_cap, ranks, shift)
6176            unsafe {
6177                ranks_fn.clone().launch_on_stream(
6178                    cu_stream,
6179                    sort_config,
6180                    (keys_in, num_rows_device, row_cap, &mut *ranks, shift),
6181                )
6182            }
6183            .map_err(|e| XlogError::Kernel(format!("compute_ranks (on_stream) failed: {}", e)))?;
6184
6185            // SAFETY: radix_scatter_stable(keys_in, indices_in, ranks, keys_out,
6186            // indices_out, prefix_sums, block_offsets, num_rows_device, row_cap, shift)
6187            unsafe {
6188                scatter_fn.clone().launch_on_stream(
6189                    cu_stream,
6190                    sort_config,
6191                    (
6192                        keys_in,
6193                        indices_in,
6194                        &*ranks,
6195                        keys_out,
6196                        indices_out,
6197                        &*prefix,
6198                        &*hist,
6199                        num_rows_device,
6200                        row_cap,
6201                        shift,
6202                    ),
6203                )
6204            }
6205            .map_err(|e| {
6206                XlogError::Kernel(format!("radix_scatter_stable (on_stream) failed: {}", e))
6207            })?;
6208
6209            in_a = !in_a;
6210        }
6211
6212        if !in_a {
6213            return Err(XlogError::Kernel(
6214                "Unexpected radix-sort buffer parity (expected even number of passes)".to_string(),
6215            ));
6216        }
6217        Ok(())
6218    }
6219
6220    /// Stream-aware variant of
6221    /// [`Self::apply_permutation_gpu`]. Permutes every input
6222    /// column on `launch_stream` into caller-allocated
6223    /// `dst_cols`. No internal sync; caller records both the
6224    /// permutation slice and `dst_cols`.
6225    fn apply_permutation_gpu_on_stream(
6226        &self,
6227        input: &CudaBuffer,
6228        permutation: &TrackedCudaSlice<u32>,
6229        dst_cols: &mut [TrackedCudaSlice<u8>],
6230        cu_stream: &cudarc::driver::CudaStream,
6231    ) -> Result<()> {
6232        let row_cap = input.num_rows() as u32;
6233        let d_num_rows = input.num_rows_device();
6234        let device = self.device.inner();
6235
6236        let grid_size = row_cap.div_ceil(Self::SORT_BLOCK_SIZE);
6237        let launch_config = LaunchConfig {
6238            grid_dim: (grid_size, 1, 1),
6239            block_dim: (Self::SORT_BLOCK_SIZE, 1, 1),
6240            shared_mem_bytes: 0,
6241        };
6242
6243        let apply_perm_fn = device
6244            .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
6245            .ok_or_else(|| {
6246                XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
6247            })?;
6248
6249        if dst_cols.len() != input.columns.len() {
6250            return Err(XlogError::Kernel(format!(
6251                "apply_permutation_gpu_on_stream: dst_cols.len()={} mismatches input.cols={}",
6252                dst_cols.len(),
6253                input.columns.len()
6254            )));
6255        }
6256
6257        for (col_idx, dst_col) in dst_cols.iter_mut().enumerate() {
6258            let src_col = input
6259                .column(col_idx)
6260                .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
6261            let elem_size = input
6262                .schema
6263                .column_type(col_idx)
6264                .ok_or_else(|| {
6265                    XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
6266                })?
6267                .size_bytes() as u32;
6268            let output_bytes = (row_cap as usize) * (elem_size as usize);
6269            if src_col.num_bytes() != output_bytes {
6270                return Err(XlogError::Kernel(format!(
6271                    "Column {} has {} bytes but expected {} (num_rows={}, elem_size={})",
6272                    col_idx,
6273                    src_col.num_bytes(),
6274                    output_bytes,
6275                    row_cap,
6276                    elem_size
6277                )));
6278            }
6279            // SAFETY: apply_permutation_bytes(input, output, permutation,
6280            // num_rows_device, row_cap, elem_size)
6281            unsafe {
6282                apply_perm_fn.clone().launch_on_stream(
6283                    cu_stream,
6284                    launch_config,
6285                    (
6286                        src_col,
6287                        &mut *dst_col,
6288                        permutation,
6289                        d_num_rows,
6290                        row_cap,
6291                        elem_size,
6292                    ),
6293                )
6294            }
6295            .map_err(|e| {
6296                XlogError::Kernel(format!("apply_permutation_bytes (on_stream) failed: {}", e))
6297            })?;
6298        }
6299        Ok(())
6300    }
6301
6302    /// Strict-recorder variant of [`Self::sort`] — narrow to
6303    /// `u32` / `Symbol` key columns. The whole sort chain
6304    /// (init → LSD radix passes → multi-column gather) runs on
6305    /// the caller-supplied `launch_stream`; every input column
6306    /// and the input row-count buffer are recorded as reads
6307    /// before preflight; every fresh runtime-backed allocation
6308    /// (scratch + output columns + output `d_num_rows`) is
6309    /// recorded via `write` BEFORE preflight (snapshot drops the borrow so kernel `&mut` borrows after preflight remain valid)
6310    /// enqueue.
6311    ///
6312    /// # Errors
6313    ///   * Manager not runtime-backed.
6314    ///   * `launch_stream` does not resolve.
6315    ///   * Empty `key_cols` or out-of-bounds index.
6316    ///   * Any key column type other than `U32` / `Symbol`
6317    ///     (multi-type recorded sort is outside this API surface).
6318    ///   * Preflight / kernel / commit failures.
6319    pub fn sort_recorded(
6320        &self,
6321        input: &CudaBuffer,
6322        key_cols: &[usize],
6323        launch_stream: StreamId,
6324    ) -> Result<CudaBuffer> {
6325        let runtime = self.memory.runtime().ok_or_else(|| {
6326            XlogError::Kernel(
6327                "sort_recorded requires a runtime-backed GpuMemoryManager (with_runtime)"
6328                    .to_string(),
6329            )
6330        })?;
6331        let cu_stream = runtime
6332            .stream_pool()
6333            .resolve(launch_stream)
6334            .ok_or_else(|| {
6335                XlogError::Kernel(format!(
6336                    "sort_recorded: launch_stream StreamId({}) does not resolve",
6337                    launch_stream.0
6338                ))
6339            })?;
6340
6341        if input.num_rows() == 0 {
6342            return self.create_empty_buffer(input.schema.clone());
6343        }
6344        if key_cols.is_empty() {
6345            return Err(XlogError::Kernel(
6346                "Sort requires at least one key column".to_string(),
6347            ));
6348        }
6349        if input.num_rows() > u32::MAX as u64 {
6350            return Err(XlogError::Kernel(format!(
6351                "Sort supports at most {} rows, got {}",
6352                u32::MAX,
6353                input.num_rows()
6354            )));
6355        }
6356        for &k in key_cols {
6357            if k >= input.arity() {
6358                return Err(XlogError::Kernel(format!(
6359                    "Key column index {} out of bounds (arity {})",
6360                    k,
6361                    input.arity()
6362                )));
6363            }
6364            let ty = input.schema.column_type(k).ok_or_else(|| {
6365                XlogError::Kernel(format!("Key column {} type not found in schema", k))
6366            })?;
6367            if !matches!(ty, ScalarType::U32 | ScalarType::Symbol | ScalarType::U64) {
6368                return Err(XlogError::Kernel(format!(
6369                    "sort_recorded supports only U32 / Symbol / U64 key columns; \
6370                     got {:?} for column {}",
6371                    ty, k
6372                )));
6373            }
6374        }
6375
6376        let n = input.num_rows() as u32;
6377        let block_size = Self::SORT_BLOCK_SIZE;
6378        let grid_size = n.div_ceil(block_size);
6379        let device = self.device.inner();
6380        let launch_config = LaunchConfig {
6381            grid_dim: (grid_size, 1, 1),
6382            block_dim: (block_size, 1, 1),
6383            shared_mem_bytes: 0,
6384        };
6385
6386        // Pre-allocate ALL fresh runtime-backed buffers BEFORE
6387        // recorder construction (Rust drop order).
6388        let mut indices_a = self.memory.alloc::<u32>(n as usize)?;
6389        let mut indices_b = self.memory.alloc::<u32>(n as usize)?;
6390        let mut keys_a = self.memory.alloc::<u32>(n as usize)?;
6391        let mut keys_b = self.memory.alloc::<u32>(n as usize)?;
6392        let mut d_hist = self.memory.alloc::<u32>((grid_size as usize) * 16)?;
6393        let mut d_prefix = self.memory.alloc::<u32>(16)?;
6394        let mut d_ranks = self.memory.alloc::<u32>(n as usize)?;
6395        // Output's d_num_rows must reflect input's LOGICAL
6396        // row count (not row_cap). The legacy `sort` clones
6397        // `input.num_rows_device()` via `dtod_copy`; recorded
6398        // sort does the same on launch_stream so downstream
6399        // consumers see the correct logical count.
6400        let output_d_num_rows = self.memory.alloc::<u32>(1)?;
6401
6402        let mut dst_cols: Vec<TrackedCudaSlice<u8>> = Vec::with_capacity(input.columns.len());
6403        for col_idx in 0..input.columns.len() {
6404            let elem_size = input
6405                .schema
6406                .column_type(col_idx)
6407                .ok_or_else(|| {
6408                    XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
6409                })?
6410                .size_bytes();
6411            dst_cols.push(self.memory.alloc::<u8>((n as usize) * elem_size)?);
6412        }
6413
6414        let mut rec = LaunchRecorder::new_strict(launch_stream);
6415        rec.read(input.num_rows_device());
6416        for col_idx in 0..input.columns.len() {
6417            let c = input
6418                .column(col_idx)
6419                .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
6420            rec.read_column(c);
6421        }
6422        // Pre-launch fresh writes: the recorder snapshots block
6423        // identity at record time and drops the slice borrow,
6424        // so kernel `&mut` borrows after preflight are unaffected.
6425        rec.write(&indices_a);
6426        rec.write(&indices_b);
6427        rec.write(&keys_a);
6428        rec.write(&keys_b);
6429        rec.write(&d_hist);
6430        rec.write(&d_prefix);
6431        rec.write(&d_ranks);
6432        rec.write(&output_d_num_rows);
6433        for dst_col in &dst_cols {
6434            rec.write(dst_col);
6435        }
6436        rec.preflight(runtime)
6437            .map_err(|e| XlogError::Kernel(format!("sort_recorded: preflight failed: {}", e)))?;
6438
6439        // Step 1: init_indices.
6440        let init_fn = device
6441            .get_func(SORT_MODULE, sort_kernels::INIT_INDICES)
6442            .ok_or_else(|| XlogError::Kernel("init_indices kernel not found".to_string()))?;
6443        // SAFETY: init_indices(indices, num_rows_device, row_cap)
6444        unsafe {
6445            init_fn.clone().launch_on_stream(
6446                &cu_stream,
6447                launch_config,
6448                (&mut indices_a, input.num_rows_device(), n),
6449            )
6450        }
6451        .map_err(|e| XlogError::Kernel(format!("init_indices (on_stream) failed: {}", e)))?;
6452
6453        // Step 2: LSD radix passes per key column. U32 / Symbol
6454        // are 4-byte → one radix pass per column. U64 keys use
6455        // the hi/lo gather pair (mirrors legacy `sort()`'s
6456        // strategy at line ~1691): one radix pass per half,
6457        // lo-first then hi, so the stable LSD ordering is
6458        // hi-most-significant.
6459        for &col_idx in key_cols.iter().rev() {
6460            let col = input
6461                .column(col_idx)
6462                .ok_or_else(|| XlogError::Kernel(format!("Key column {} not found", col_idx)))?;
6463            let ty = input.schema.column_type(col_idx).ok_or_else(|| {
6464                XlogError::Kernel(format!("Key column {} type not found in schema", col_idx))
6465            })?;
6466            match ty {
6467                ScalarType::U32 | ScalarType::Symbol => {
6468                    let col_view = self.column_as_u32_view(col, n as usize)?;
6469                    let gather_fn = device
6470                        .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_U32)
6471                        .ok_or_else(|| {
6472                            XlogError::Kernel("apply_permutation_u32 kernel not found".to_string())
6473                        })?;
6474                    // SAFETY: apply_permutation_u32(input, output, permutation,
6475                    // num_rows_device, row_cap)
6476                    unsafe {
6477                        gather_fn.clone().launch_on_stream(
6478                            &cu_stream,
6479                            launch_config,
6480                            (
6481                                &col_view,
6482                                &mut keys_a,
6483                                &indices_a,
6484                                input.num_rows_device(),
6485                                n,
6486                            ),
6487                        )
6488                    }
6489                    .map_err(|e| {
6490                        XlogError::Kernel(format!(
6491                            "apply_permutation_u32 (on_stream) failed: {}",
6492                            e
6493                        ))
6494                    })?;
6495
6496                    self.radix_sort_u32_pairs_with_scratch_on_stream(
6497                        &mut keys_a,
6498                        &mut keys_b,
6499                        &mut indices_a,
6500                        &mut indices_b,
6501                        &mut d_hist,
6502                        &mut d_prefix,
6503                        &mut d_ranks,
6504                        input.num_rows_device(),
6505                        n,
6506                        &cu_stream,
6507                        launch_stream,
6508                        runtime,
6509                    )?;
6510                }
6511                ScalarType::U64 => {
6512                    let col_view = self.column_as_u64_view(col, n as usize)?;
6513                    for &word in &[
6514                        sort_kernels::GATHER_KEYS_U64_LO_U32,
6515                        sort_kernels::GATHER_KEYS_U64_HI_U32,
6516                    ] {
6517                        let gather_fn = device.get_func(SORT_MODULE, word).ok_or_else(|| {
6518                            XlogError::Kernel(format!("{} kernel not found", word))
6519                        })?;
6520                        // SAFETY: gather_keys_u64_*_u32(vals, permutation,
6521                        // num_rows_device, row_cap, out_keys)
6522                        unsafe {
6523                            gather_fn.clone().launch_on_stream(
6524                                &cu_stream,
6525                                launch_config,
6526                                (
6527                                    &col_view,
6528                                    &indices_a,
6529                                    input.num_rows_device(),
6530                                    n,
6531                                    &mut keys_a,
6532                                ),
6533                            )
6534                        }
6535                        .map_err(|e| {
6536                            XlogError::Kernel(format!("{} (on_stream) failed: {}", word, e))
6537                        })?;
6538
6539                        self.radix_sort_u32_pairs_with_scratch_on_stream(
6540                            &mut keys_a,
6541                            &mut keys_b,
6542                            &mut indices_a,
6543                            &mut indices_b,
6544                            &mut d_hist,
6545                            &mut d_prefix,
6546                            &mut d_ranks,
6547                            input.num_rows_device(),
6548                            n,
6549                            &cu_stream,
6550                            launch_stream,
6551                            runtime,
6552                        )?;
6553                    }
6554                }
6555                other => {
6556                    return Err(XlogError::Kernel(format!(
6557                        "sort_recorded: column {} unexpected type {:?} after guard",
6558                        col_idx, other
6559                    )));
6560                }
6561            }
6562        }
6563
6564        // Step 3: gather all input columns by the final permutation.
6565        self.apply_permutation_gpu_on_stream(input, &indices_a, &mut dst_cols, &cu_stream)?;
6566
6567        // Step 4: copy input's logical d_num_rows into the
6568        // output's slot via dtod-async on launch_stream.
6569        // Sort preserves row count, so this matches what
6570        // legacy `apply_permutation_gpu` does via
6571        // `clone_device_row_count`.
6572        // SAFETY: runtime-backed buffers, 4-byte u32 copy.
6573        unsafe {
6574            let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
6575                *output_d_num_rows.device_ptr(),
6576                *input.num_rows_device().device_ptr(),
6577                std::mem::size_of::<u32>(),
6578                cu_stream.cu_stream(),
6579            );
6580            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
6581                return Err(XlogError::Kernel(format!(
6582                    "sort_recorded: cuMemcpyDtoDAsync (output_d_num_rows) failed: {:?}",
6583                    res
6584                )));
6585            }
6586        }
6587
6588        rec.commit(runtime)
6589            .map_err(|e| XlogError::Kernel(format!("sort_recorded: commit failed: {}", e)))?;
6590
6591        let new_columns: Vec<CudaColumn> = dst_cols.into_iter().map(|s| s.into()).collect();
6592        Ok(CudaBuffer::from_columns(
6593            new_columns,
6594            input.num_rows(),
6595            output_d_num_rows,
6596            input.schema.clone(),
6597        ))
6598    }
6599
6600    /// Strict-recorder variant of [`Self::dedup_full_row`] —
6601    /// narrow to U32 / Symbol / U64 columns.
6602    ///
6603    /// Composes [`Self::sort_recorded`] (typed multi-column
6604    /// sort) → on-stream `mark_unique_full_row_bytewise` →
6605    /// [`Self::compact_buffer_by_device_mask_counted_recorded`]
6606    /// (gather kept rows). All three primitives commit
6607    /// independently; the runtime's record-all + wait-all
6608    /// `last_use_events: Vec<CudaEvent>` semantics chain the
6609    /// deallocate safety end-to-end.
6610    pub fn dedup_full_row_recorded(
6611        &self,
6612        input: &CudaBuffer,
6613        launch_stream: StreamId,
6614    ) -> Result<CudaBuffer> {
6615        let runtime = self.memory.runtime().ok_or_else(|| {
6616            XlogError::Kernel(
6617                "dedup_full_row_recorded requires a runtime-backed GpuMemoryManager".to_string(),
6618            )
6619        })?;
6620        let cu_stream = runtime
6621            .stream_pool()
6622            .resolve(launch_stream)
6623            .ok_or_else(|| {
6624                XlogError::Kernel(format!(
6625                    "dedup_full_row_recorded: launch_stream StreamId({}) does not resolve",
6626                    launch_stream.0
6627                ))
6628            })?;
6629
6630        let row_count = input.num_rows() as usize;
6631        if row_count == 0 {
6632            return self.create_empty_buffer(input.schema().clone());
6633        }
6634        if row_count == 1 {
6635            return self.clone_buffer(input);
6636        }
6637        if row_count > u32::MAX as usize {
6638            return Err(XlogError::Kernel(format!(
6639                "dedup_full_row_recorded supports at most {} rows, got {}",
6640                u32::MAX,
6641                row_count
6642            )));
6643        }
6644        let arity = input.arity();
6645        if arity == 0 {
6646            return self.buffer_from_columns(Vec::new(), 1, input.schema().clone());
6647        }
6648        for col_idx in 0..arity {
6649            let ty = input.schema.column_type(col_idx).ok_or_else(|| {
6650                XlogError::Kernel(format!("Column {} type not found in schema", col_idx))
6651            })?;
6652            if !matches!(ty, ScalarType::U32 | ScalarType::Symbol | ScalarType::U64) {
6653                return Err(XlogError::Kernel(format!(
6654                    "dedup_full_row_recorded supports only U32 / Symbol / U64 columns; \
6655                     got {:?} for column {}",
6656                    ty, col_idx
6657                )));
6658            }
6659        }
6660
6661        // Step 1: typed sort on launch_stream.
6662        let all_cols: Vec<usize> = (0..arity).collect();
6663        let sorted = self.sort_recorded(input, &all_cols, launch_stream)?;
6664        let n = sorted.num_rows() as u32;
6665        if n <= 1 {
6666            return Ok(sorted);
6667        }
6668
6669        // Step 2: bytewise adjacent-equality mask. Allocate
6670        // d_col_ptrs / d_col_sizes / d_unique_mask up front
6671        // before the recorder. col_ptrs/sizes are populated
6672        // synchronously as launch metadata (ordered before the
6673        // launch_stream kernel sees them).
6674        let device = self.device.inner();
6675        let mut col_ptrs_host: Vec<u64> = Vec::with_capacity(arity);
6676        let mut col_sizes_host: Vec<u32> = Vec::with_capacity(arity);
6677        for col_idx in 0..arity {
6678            let c = sorted
6679                .column(col_idx)
6680                .ok_or_else(|| XlogError::Kernel(format!("Sorted column {} not found", col_idx)))?;
6681            let ty = sorted.schema().column_type(col_idx).ok_or_else(|| {
6682                XlogError::Kernel(format!("Sorted column {} type missing", col_idx))
6683            })?;
6684            col_ptrs_host.push(*c.device_ptr());
6685            col_sizes_host.push(ty.size_bytes() as u32);
6686        }
6687        let mut d_col_ptrs = self.memory.alloc::<u64>(arity)?;
6688        let mut d_col_sizes = self.memory.alloc::<u32>(arity)?;
6689        self.htod_launch_metadata_sync_copy_into(&col_ptrs_host, &mut d_col_ptrs)
6690            .map_err(|e| {
6691                XlogError::Kernel(format!("dedup_full_row_recorded col ptr upload: {}", e))
6692            })?;
6693        self.htod_launch_metadata_sync_copy_into(&col_sizes_host, &mut d_col_sizes)
6694            .map_err(|e| {
6695                XlogError::Kernel(format!("dedup_full_row_recorded col size upload: {}", e))
6696            })?;
6697        let d_unique_mask = self.memory.alloc::<u8>(n as usize)?;
6698
6699        let mut rec = LaunchRecorder::new_strict(launch_stream);
6700        for col_idx in 0..arity {
6701            let c = sorted
6702                .column(col_idx)
6703                .ok_or_else(|| XlogError::Kernel(format!("Sorted column {} not found", col_idx)))?;
6704            rec.read_column(c);
6705        }
6706        rec.read(sorted.num_rows_device());
6707        rec.write(&d_col_ptrs);
6708        rec.write(&d_col_sizes);
6709        rec.write(&d_unique_mask);
6710        rec.preflight(runtime).map_err(|e| {
6711            XlogError::Kernel(format!(
6712                "dedup_full_row_recorded: mark_unique preflight failed: {}",
6713                e
6714            ))
6715        })?;
6716
6717        let block_size = 256u32;
6718        let grid = n.div_ceil(block_size);
6719        let cfg = LaunchConfig {
6720            grid_dim: (grid, 1, 1),
6721            block_dim: (block_size, 1, 1),
6722            shared_mem_bytes: 0,
6723        };
6724        let mark_fn = device
6725            .get_func(DEDUP_MODULE, dedup_kernels::MARK_UNIQUE_FULL_ROW_BYTEWISE)
6726            .ok_or_else(|| {
6727                XlogError::Kernel("mark_unique_full_row_bytewise kernel not found".to_string())
6728            })?;
6729        // SAFETY: mark_unique_full_row_bytewise(col_ptrs, col_sizes,
6730        // num_cols, num_rows_device, row_cap, unique_mask)
6731        unsafe {
6732            mark_fn.clone().launch_on_stream(
6733                &cu_stream,
6734                cfg,
6735                (
6736                    &d_col_ptrs,
6737                    &d_col_sizes,
6738                    arity as u32,
6739                    sorted.num_rows_device(),
6740                    n,
6741                    &d_unique_mask,
6742                ),
6743            )
6744        }
6745        .map_err(|e| {
6746            XlogError::Kernel(format!(
6747                "mark_unique_full_row_bytewise (on_stream) failed: {}",
6748                e
6749            ))
6750        })?;
6751
6752        rec.commit(runtime).map_err(|e| {
6753            XlogError::Kernel(format!(
6754                "dedup_full_row_recorded: mark_unique commit failed: {}",
6755                e
6756            ))
6757        })?;
6758
6759        // Step 3: gather kept rows via the recorded compact tail.
6760        self.compact_buffer_by_device_mask_counted_recorded(&sorted, &d_unique_mask, launch_stream)
6761    }
6762
6763    // ============== Recorded hash join: inner only ==============
6764    //
6765    // Strict-recorder, launch_stream-routed sibling of
6766    // `hash_join_inner_v2`. Composes the existing recorded
6767    // pack helper (`pack_keys_gpu_on_stream`) with
6768    // two new on-stream helpers — `build_hash_table_v2_on_stream`
6769    // and `gather_buffer_by_indices_on_stream` — and runs the
6770    // probe kernel + count + materialize chain entirely on
6771    // launch_stream. Existing `hash_join_v2_*` callers keep
6772    // their bit-for-bit semantics; runtime/planner wiring is
6773    // not part of this provider helper.
6774    //
6775    // Scope:
6776    //   * `JoinType::Inner` only. Semi/Anti/LeftOuter and the
6777    //     indexed variant (`hash_join_v2_with_index`) are outside
6778    //     this recorded provider surface.
6779    //   * Pack-keys constraint of ≤4 columns inherits from
6780    //     `pack_keys_gpu_on_stream`.
6781    //   * Algorithm is unchanged: count-then-materialize
6782    //     (two probe passes); the GPU-resident
6783    //     count-prefix-materialize prototype is not reintroduced here.
6784
6785    /// Stream-aware variant of `build_hash_table_v2`. Mirrors
6786    /// the legacy bucket-count → exclusive-scan → scatter chain
6787    /// on the caller-supplied `launch_stream` (no internal
6788    /// `device.synchronize()`). Each fresh scratch allocation is
6789    /// fenced via `prepare_first_use(Access::Write)` immediately
6790    /// after alloc so the first cross-stream consumer (memset /
6791    /// dtod-copy / kernel) waits for cuMemAllocAsync to complete;
6792    /// at exit, every block that escapes (the four returned
6793    /// bucket buffers) plus the internal `bucket_cursors` scratch
6794    /// is finalized with `finish_block_use(Access::Write)` so
6795    /// end-of-scope drops are correctly serialized.
6796    fn build_hash_table_v2_on_stream(
6797        &self,
6798        hashes: &TrackedCudaSlice<u64>,
6799        num_rows: u32,
6800        cu_stream: &cudarc::driver::CudaStream,
6801        launch_stream: StreamId,
6802        runtime: &crate::device_runtime::XlogDeviceRuntime,
6803    ) -> Result<crate::provider::JoinHashTableV2> {
6804        let device = self.device.inner();
6805
6806        let target = (num_rows as u64).saturating_mul(2).max(1024);
6807        let num_buckets_u64 = target.next_power_of_two();
6808        let num_buckets = u32::try_from(num_buckets_u64).map_err(|_| {
6809            XlogError::Kernel(format!(
6810                "Join hash table too large: num_buckets={}",
6811                num_buckets_u64
6812            ))
6813        })?;
6814        let bucket_mask = num_buckets
6815            .checked_sub(1)
6816            .ok_or_else(|| XlogError::Kernel("Join hash table size underflow".to_string()))?;
6817
6818        let bucket_counts = self.memory.alloc::<u32>(num_buckets as usize)?;
6819        // Fence the alloc-ready event from `bucket_counts`'s
6820        // alloc_stream onto `launch_stream` BEFORE the memset
6821        // below — the memset would otherwise execute against a
6822        // stream that has not waited on cuMemAllocAsync's
6823        // completion event, producing garbage / pool-recycled
6824        // bytes when the alloc and use streams differ.
6825        runtime
6826            .prepare_first_use(&bucket_counts, launch_stream, Access::Write)
6827            .map_err(|e| {
6828                XlogError::Kernel(format!(
6829                    "build_hash_table_v2_on_stream: prepare bucket_counts failed: {}",
6830                    e
6831                ))
6832            })?;
6833        // Async u32 zero-fill on launch_stream.
6834        if num_buckets > 0 {
6835            // SAFETY: bucket_counts is runtime-backed for
6836            // num_buckets * 4 bytes; cu_stream is a valid
6837            // stream the runtime owns.
6838            unsafe {
6839                let res = cudarc::driver::sys::cuMemsetD8Async(
6840                    *bucket_counts.device_ptr(),
6841                    0,
6842                    (num_buckets as usize) * std::mem::size_of::<u32>(),
6843                    cu_stream.cu_stream(),
6844                );
6845                if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
6846                    return Err(XlogError::Kernel(format!(
6847                        "cuMemsetD8Async (bucket_counts) failed: {:?}",
6848                        res
6849                    )));
6850                }
6851            }
6852        }
6853
6854        let block_size = 256u32;
6855        let grid_size = num_rows.div_ceil(block_size);
6856        let cfg = LaunchConfig {
6857            grid_dim: (grid_size, 1, 1),
6858            block_dim: (block_size, 1, 1),
6859            shared_mem_bytes: 0,
6860        };
6861
6862        let count_fn = device
6863            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_BUCKET_COUNT_V2)
6864            .ok_or_else(|| {
6865                XlogError::Kernel("hash_join_bucket_count_v2 kernel not found".to_string())
6866            })?;
6867        // SAFETY: hash_join_bucket_count_v2(hashes, num_rows, bucket_counts, bucket_mask)
6868        unsafe {
6869            count_fn.clone().launch_on_stream(
6870                cu_stream,
6871                cfg,
6872                (hashes, num_rows, &bucket_counts, bucket_mask),
6873            )
6874        }
6875        .map_err(|e| {
6876            XlogError::Kernel(format!(
6877                "hash_join_bucket_count_v2 (on_stream) failed: {}",
6878                e
6879            ))
6880        })?;
6881
6882        let mut bucket_offsets = self.memory.alloc::<u32>(num_buckets as usize)?;
6883        // See `bucket_counts` rationale above: fence
6884        // alloc-ready → launch_stream before the dtod-copy.
6885        runtime
6886            .prepare_first_use(&bucket_offsets, launch_stream, Access::Write)
6887            .map_err(|e| {
6888                XlogError::Kernel(format!(
6889                    "build_hash_table_v2_on_stream: prepare bucket_offsets failed: {}",
6890                    e
6891                ))
6892            })?;
6893        if num_buckets > 0 {
6894            // dtod copy bucket_counts → bucket_offsets on launch_stream.
6895            // SAFETY: both buffers are runtime-backed for the
6896            // same num_buckets * 4 bytes; cu_stream is valid.
6897            unsafe {
6898                let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
6899                    *bucket_offsets.device_ptr(),
6900                    *bucket_counts.device_ptr(),
6901                    (num_buckets as usize) * std::mem::size_of::<u32>(),
6902                    cu_stream.cu_stream(),
6903                );
6904                if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
6905                    return Err(XlogError::Kernel(format!(
6906                        "cuMemcpyDtoDAsync (bucket_counts → bucket_offsets) failed: {:?}",
6907                        res
6908                    )));
6909                }
6910            }
6911            self.multiblock_scan_u32_inplace_on_stream(
6912                &mut bucket_offsets,
6913                num_buckets,
6914                cu_stream,
6915                launch_stream,
6916                runtime,
6917            )?;
6918        }
6919
6920        let bucket_cursors = self.memory.alloc::<u32>(num_buckets as usize)?;
6921        // Fence alloc-ready → launch_stream for cursors before
6922        // the dtod-copy.
6923        runtime
6924            .prepare_first_use(&bucket_cursors, launch_stream, Access::Write)
6925            .map_err(|e| {
6926                XlogError::Kernel(format!(
6927                    "build_hash_table_v2_on_stream: prepare bucket_cursors failed: {}",
6928                    e
6929                ))
6930            })?;
6931        if num_buckets > 0 {
6932            // dtod copy bucket_offsets → bucket_cursors on launch_stream.
6933            // SAFETY: same shape and size constraints as above.
6934            unsafe {
6935                let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
6936                    *bucket_cursors.device_ptr(),
6937                    *bucket_offsets.device_ptr(),
6938                    (num_buckets as usize) * std::mem::size_of::<u32>(),
6939                    cu_stream.cu_stream(),
6940                );
6941                if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
6942                    return Err(XlogError::Kernel(format!(
6943                        "cuMemcpyDtoDAsync (bucket_offsets → bucket_cursors) failed: {:?}",
6944                        res
6945                    )));
6946                }
6947            }
6948        }
6949
6950        let bucket_entries = self.memory.alloc::<u32>(num_rows as usize)?;
6951        let bucket_entry_hashes = self.memory.alloc::<u64>(num_rows as usize)?;
6952        // Fence alloc-ready → launch_stream for both before the
6953        // scatter kernel writes them.
6954        runtime
6955            .prepare_first_use(&bucket_entries, launch_stream, Access::Write)
6956            .map_err(|e| {
6957                XlogError::Kernel(format!(
6958                    "build_hash_table_v2_on_stream: prepare bucket_entries failed: {}",
6959                    e
6960                ))
6961            })?;
6962        runtime
6963            .prepare_first_use(&bucket_entry_hashes, launch_stream, Access::Write)
6964            .map_err(|e| {
6965                XlogError::Kernel(format!(
6966                    "build_hash_table_v2_on_stream: prepare bucket_entry_hashes failed: {}",
6967                    e
6968                ))
6969            })?;
6970
6971        let scatter_fn = device
6972            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SCATTER_V2)
6973            .ok_or_else(|| {
6974                XlogError::Kernel("hash_join_scatter_v2 kernel not found".to_string())
6975            })?;
6976        // SAFETY: hash_join_scatter_v2(hashes, num_rows, bucket_cursors, bucket_mask, bucket_entries, bucket_entry_hashes)
6977        unsafe {
6978            scatter_fn.clone().launch_on_stream(
6979                cu_stream,
6980                cfg,
6981                (
6982                    hashes,
6983                    num_rows,
6984                    &bucket_cursors,
6985                    bucket_mask,
6986                    &bucket_entries,
6987                    &bucket_entry_hashes,
6988                ),
6989            )
6990        }
6991        .map_err(|e| {
6992            XlogError::Kernel(format!("hash_join_scatter_v2 (on_stream) failed: {}", e))
6993        })?;
6994
6995        // Record uses on launch_stream:
6996        // * bucket_cursors drops at end of helper — must be
6997        //   recorded so the runtime defers its free behind
6998        //   the scatter kernel.
6999        // * bucket_counts / bucket_offsets / bucket_entries /
7000        //   bucket_entry_hashes escape via JoinHashTableV2;
7001        //   record so downstream drops are gated.
7002        for blk in [
7003            bucket_counts.runtime_block(),
7004            bucket_offsets.runtime_block(),
7005            bucket_cursors.runtime_block(),
7006            bucket_entries.runtime_block(),
7007            bucket_entry_hashes.runtime_block(),
7008        ] {
7009            if let Some(b) = blk {
7010                runtime
7011                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
7012                    .map_err(|e| {
7013                        XlogError::Kernel(format!(
7014                            "build_hash_table_v2_on_stream: finish_block_use failed: {}",
7015                            e
7016                        ))
7017                    })?;
7018            } else {
7019                return Err(XlogError::Kernel(
7020                    "build_hash_table_v2_on_stream: buffer has no runtime block — \
7021                     caller must use a runtime-backed manager"
7022                        .to_string(),
7023                ));
7024            }
7025        }
7026
7027        Ok(crate::provider::JoinHashTableV2 {
7028            bucket_counts,
7029            bucket_offsets,
7030            bucket_entries,
7031            bucket_entry_hashes,
7032            bucket_mask,
7033        })
7034    }
7035
7036    /// Stream-aware variant of `gather_buffer_by_indices`.
7037    /// Allocates output column storage, runs
7038    /// `apply_permutation_bytes` per input column on
7039    /// `launch_stream`, and assembles a `CudaBuffer`. Records
7040    /// every fresh allocation directly via the runtime so the
7041    /// returned buffer is safe to drop before any pending
7042    /// gather kernel completes.
7043    fn gather_buffer_by_indices_on_stream(
7044        &self,
7045        input: &CudaBuffer,
7046        indices: &TrackedCudaSlice<u32>,
7047        output_rows: u32,
7048        cu_stream: &cudarc::driver::CudaStream,
7049        launch_stream: StreamId,
7050        runtime: &crate::device_runtime::XlogDeviceRuntime,
7051    ) -> Result<CudaBuffer> {
7052        if output_rows == 0 {
7053            return self.create_empty_buffer(input.schema().clone());
7054        }
7055        if input.num_rows() > u32::MAX as u64 {
7056            return Err(XlogError::Kernel(format!(
7057                "GPU gather supports at most {} input rows, got {}",
7058                u32::MAX,
7059                input.num_rows()
7060            )));
7061        }
7062
7063        let d_output_rows = self.upload_device_row_count(output_rows)?;
7064        // `upload_device_row_count` initializes this scalar on
7065        // the manager/default stream. Publish that write into
7066        // the runtime dependency state, then fence launch_stream
7067        // before the gather kernels read it. The scalar is local
7068        // scratch, so we also finish a read after the kernels so
7069        // its drop/free waits for launch_stream completion.
7070        runtime
7071            .finish_first_use(&d_output_rows, StreamId::DEFAULT, Access::Write)
7072            .map_err(|e| {
7073                XlogError::Kernel(format!(
7074                    "gather_buffer_by_indices_on_stream: record d_output_rows upload failed: {}",
7075                    e
7076                ))
7077            })?;
7078        runtime
7079            .prepare_first_use(&d_output_rows, launch_stream, Access::Read)
7080            .map_err(|e| {
7081                XlogError::Kernel(format!(
7082                    "gather_buffer_by_indices_on_stream: prepare d_output_rows failed: {}",
7083                    e
7084                ))
7085            })?;
7086        let device = self.device.inner();
7087        let block_size = 256u32;
7088        let grid_size = output_rows.div_ceil(block_size);
7089        let launch_config = LaunchConfig {
7090            grid_dim: (grid_size, 1, 1),
7091            block_dim: (block_size, 1, 1),
7092            shared_mem_bytes: 0,
7093        };
7094
7095        let gather_fn = device
7096            .get_func(SORT_MODULE, sort_kernels::APPLY_PERMUTATION_BYTES)
7097            .ok_or_else(|| {
7098                XlogError::Kernel("apply_permutation_bytes kernel not found".to_string())
7099            })?;
7100
7101        let mut dst_cols: Vec<TrackedCudaSlice<u8>> = Vec::with_capacity(input.columns.len());
7102        for col_idx in 0..input.columns.len() {
7103            let elem_size = input
7104                .schema
7105                .column_type(col_idx)
7106                .ok_or_else(|| {
7107                    XlogError::Kernel(format!("Schema type for column {} not found", col_idx))
7108                })?
7109                .size_bytes() as u32;
7110            let dst_bytes = (output_rows as usize) * (elem_size as usize);
7111            let dst = self.memory.alloc::<u8>(dst_bytes)?;
7112            // Fence alloc-ready → launch_stream for each fresh
7113            // dst_col before the gather kernel writes it.
7114            runtime
7115                .prepare_first_use(&dst, launch_stream, Access::Write)
7116                .map_err(|e| {
7117                    XlogError::Kernel(format!(
7118                        "gather_buffer_by_indices_on_stream: prepare dst_col {} failed: {}",
7119                        col_idx, e
7120                    ))
7121                })?;
7122            dst_cols.push(dst);
7123        }
7124
7125        for (col_idx, dst_col) in dst_cols.iter_mut().enumerate() {
7126            let src_col = input
7127                .column(col_idx)
7128                .ok_or_else(|| XlogError::Kernel(format!("Column {} not found", col_idx)))?;
7129            let elem_size = input
7130                .schema
7131                .column_type(col_idx)
7132                .map(|t| t.size_bytes() as u32)
7133                .unwrap_or(4);
7134            // SAFETY: apply_permutation_bytes(input, output, permutation, num_rows_device, row_cap, elem_size)
7135            unsafe {
7136                gather_fn.clone().launch_on_stream(
7137                    cu_stream,
7138                    launch_config,
7139                    (
7140                        src_col,
7141                        &mut *dst_col,
7142                        indices,
7143                        &d_output_rows,
7144                        output_rows,
7145                        elem_size,
7146                    ),
7147                )
7148            }
7149            .map_err(|e| {
7150                XlogError::Kernel(format!("apply_permutation_bytes (on_stream) failed: {}", e))
7151            })?;
7152        }
7153
7154        runtime
7155            .finish_first_use(&d_output_rows, launch_stream, Access::Read)
7156            .map_err(|e| {
7157                XlogError::Kernel(format!(
7158                    "gather_buffer_by_indices_on_stream: record d_output_rows read failed: {}",
7159                    e
7160                ))
7161            })?;
7162
7163        // Record uses on launch_stream for buffers we wrote
7164        // (the dst_cols escape via the returned CudaBuffer).
7165        // input.column[i] reads will be recorded by the
7166        // caller's outer LaunchRecorder.
7167        for dst_col in &dst_cols {
7168            if let Some(b) = dst_col.runtime_block() {
7169                runtime
7170                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
7171                    .map_err(|e| {
7172                        XlogError::Kernel(format!(
7173                            "gather_buffer_by_indices_on_stream: finish_block_use \
7174                         (dst_col) failed: {}",
7175                            e
7176                        ))
7177                    })?;
7178            } else {
7179                return Err(XlogError::Kernel(
7180                    "gather_buffer_by_indices_on_stream: dst_col has no runtime block".to_string(),
7181                ));
7182            }
7183        }
7184
7185        let new_columns: Vec<CudaColumn> = dst_cols.into_iter().map(|s| s.into()).collect();
7186        Ok(CudaBuffer::from_columns(
7187            new_columns,
7188            output_rows as u64,
7189            d_output_rows,
7190            input.schema.clone(),
7191        ))
7192    }
7193
7194    /// Strict-recorder variant of `hash_join_inner_v2`.
7195    /// `JoinType::Inner` only. Same count-then-materialize
7196    /// algorithm as the legacy variant, but every kernel
7197    /// runs on the caller-supplied `launch_stream` and host
7198    /// scalar reads of the join output count are explicitly
7199    /// ordered against the stream.
    ///
    /// The hash table is built from `right`'s packed key hashes and the
    /// probe kernel is launched over `left`'s rows. The probe kernel is
    /// launched twice: first with an output cap of 0 to obtain the match
    /// count, then with index buffers sized to the (optionally capped)
    /// count. The resulting left/right row indices are used to gather both
    /// inputs into the output buffer.
    ///
    /// # Arguments
    /// * `left`, `right` - input buffers to join.
    /// * `left_keys`, `right_keys` - key column indices, matched pairwise.
    ///   Must be non-empty, of equal length, at most 4 entries, and each
    ///   pair must have the same column type.
    /// * `max_output` - optional upper bound on the number of output rows.
    /// * `launch_stream` - stream on which all memsets and kernels of this
    ///   call are enqueued.
    ///
    /// # Returns
    /// A buffer with the combined schema whose columns are the gathered
    /// left columns followed by the gathered right columns. An empty buffer
    /// with the combined schema is returned when either input has zero rows
    /// or when the (capped) match count is zero.
    ///
    /// Note: the zero-row early return happens before the key arguments are
    /// validated, so invalid keys are not reported when a side is empty.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when the memory manager is not
    /// runtime-backed, `launch_stream` does not resolve, a side has more
    /// than `u32::MAX` rows, the key arguments fail validation, the probe
    /// kernel is not found, or any preflight / memset / launch / commit /
    /// stream synchronization step fails. Errors from the helper calls
    /// (`pack_keys_gpu_on_stream`, `build_hash_table_v2_on_stream`,
    /// `gather_buffer_by_indices_on_stream`, allocation, count reads) are
    /// propagated unchanged.
    ///
    /// This call blocks the host: `launch_stream` is synchronized once
    /// before each of the two host reads of a device-side count.
7200    pub fn hash_join_inner_v2_recorded(
7201        &self,
7202        left: &CudaBuffer,
7203        right: &CudaBuffer,
7204        left_keys: &[usize],
7205        right_keys: &[usize],
7206        max_output: Option<usize>,
7207        launch_stream: StreamId,
7208    ) -> Result<CudaBuffer> {
        // NOTE(review): `LaunchRecorder` is also imported at module scope,
        // so this local `use` appears redundant.
7209        use crate::launch::LaunchRecorder;
7210
        // The recorded path needs the device runtime (stream pool + use
        // tracking); fail early when the memory manager has none.
7211        let runtime = self.memory.runtime().ok_or_else(|| {
7212            XlogError::Kernel(
7213                "hash_join_inner_v2_recorded requires a runtime-backed GpuMemoryManager"
7214                    .to_string(),
7215            )
7216        })?;
        // Resolve the logical stream id to the stream handle used for
        // launches, memsets and synchronization below.
7217        let cu_stream = runtime
7218            .stream_pool()
7219            .resolve(launch_stream)
7220            .ok_or_else(|| {
7221                XlogError::Kernel(format!(
7222                    "hash_join_inner_v2_recorded: launch_stream StreamId({}) does not resolve",
7223                    launch_stream.0
7224                ))
7225            })?;
7226
        // Both row counts are narrowed to u32 further down, so reject
        // anything that does not fit.
7227        let num_left = self.device_row_count(left)?;
7228        let num_right = self.device_row_count(right)?;
7229        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
7230            return Err(XlogError::Kernel(format!(
7231                "Join supports at most {} rows per side (left={}, right={})",
7232                u32::MAX,
7233                num_left,
7234                num_right
7235            )));
7236        }
        // An empty side yields an empty result. This returns before the
        // key arguments below are validated.
7237        if num_left == 0 || num_right == 0 {
7238            let combined_schema = self.combine_schemas(left.schema(), right.schema());
7239            return self.create_empty_buffer(combined_schema);
7240        }
7241        if left_keys.is_empty() || right_keys.is_empty() {
7242            return Err(XlogError::Kernel(
7243                "Join requires at least one key column".to_string(),
7244            ));
7245        }
7246        if left_keys.len() != right_keys.len() {
7247            return Err(XlogError::Kernel(
7248                "Left and right key columns must have same length".to_string(),
7249            ));
7250        }
7251        if left_keys.len() > 4 {
7252            return Err(XlogError::Kernel(
7253                "hash_join_inner_v2_recorded: max 4 key columns supported (pack_keys constraint)"
7254                    .to_string(),
7255            ));
7256        }
        // Key columns are compared pairwise; each pair must share a type.
7257        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
7258            let lt = left.schema().column_type(l);
7259            let rt = right.schema().column_type(r);
7260            if lt != rt {
7261                return Err(XlogError::Kernel(format!(
7262                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
7263                    l, lt, r, rt
7264                )));
7265            }
7266        }
7267
        // Lossless narrowing: both counts were bounded by u32::MAX above.
7268        let num_left = num_left as u32;
7269        let num_right = num_right as u32;
7270
7271        // Step 1+2: pack keys for both sides on launch_stream.
7272        let left_packed =
7273            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
7274        let right_packed =
7275            self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
7276
7277        // Step 3: build hash table on launch_stream.
7278        let table = self.build_hash_table_v2_on_stream(
7279            &right_packed.hashes,
7280            num_right,
7281            &cu_stream,
7282            launch_stream,
7283            runtime,
7284        )?;
7285
        // The same kernel handle and launch configuration are used for
        // both the count pass and the materialize pass: enough 256-thread
        // blocks to cover `num_left` rows.
7286        let probe_func = self
7287            .device
7288            .inner()
7289            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
7290            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
7291        let block_size = 256u32;
7292        let probe_grid = num_left.div_ceil(block_size);
7293        let probe_config = LaunchConfig {
7294            grid_dim: (probe_grid, 1, 1),
7295            block_dim: (block_size, 1, 1),
7296            shared_mem_bytes: 0,
7297        };
7298
7299        // Step 4: count-only pass. Allocate count + dummy
7300        // output buffers up front. The recorder's preflight will
7301        // queue the alloc-ready waits before either the memset
7302        // OR the kernel runs on launch_stream.
7303        let d_count_only = self.memory.alloc::<u32>(1)?;
7304        let d_dummy_left = self.memory.alloc::<u32>(1)?;
7305        let d_dummy_right = self.memory.alloc::<u32>(1)?;
7306
7307        // Build the recorder for the probe / output stage.
7308        // Reads BEFORE preflight: hashes + packed_keys (already
7309        // recorded by pack_keys_gpu_on_stream as writes on
7310        // launch_stream — recording reads here adds the next
7311        // event in the chain), table buckets, dummy buffers.
        // The count pass is launched with an output cap of 0 and the
        // 1-element dummy index buffers; presumably the kernel then only
        // accumulates the match count — confirm against the kernel source.
7312        let max_output_count_only = 0u32;
7313        let mut rec_count = LaunchRecorder::new_strict(launch_stream);
7314        rec_count.read(&left_packed.hashes);
7315        rec_count.read(&left_packed.packed_keys);
7316        rec_count.read(&right_packed.packed_keys);
7317        rec_count.read(&table.bucket_offsets);
7318        rec_count.read(&table.bucket_counts);
7319        rec_count.read(&table.bucket_entries);
7320        rec_count.read(&table.bucket_entry_hashes);
7321        rec_count.write(&d_count_only);
7322        rec_count.write(&d_dummy_left);
7323        rec_count.write(&d_dummy_right);
7324        rec_count.preflight(runtime).map_err(|e| {
7325            XlogError::Kernel(format!(
7326                "hash_join_inner_v2_recorded: count-pass preflight failed: {}",
7327                e
7328            ))
7329        })?;
7330
7331        // Zero-init d_count_only via async memset on
7332        // launch_stream — runs AFTER preflight has queued the
7333        // alloc-ready waits, so the memset is correctly fenced
7334        // behind cuMemAllocAsync's completion.
7335        // SAFETY: d_count_only is runtime-backed for 4 bytes.
7336        unsafe {
7337            let res = cudarc::driver::sys::cuMemsetD8Async(
7338                *d_count_only.device_ptr(),
7339                0,
7340                std::mem::size_of::<u32>(),
7341                cu_stream.cu_stream(),
7342            );
7343            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7344                return Err(XlogError::Kernel(format!(
7345                    "cuMemsetD8Async (d_count_only) failed: {:?}",
7346                    res
7347                )));
7348            }
7349        }
7350
7351        // SAFETY: hash_join_probe_v2 14-arg signature — see
7352        // legacy hash_join_inner_v2 for the canonical
7353        // documentation. Tuple exceeds 12-element limit, so
7354        // we use the raw-pointer launch path.
7355        unsafe {
7356            let mut params: Vec<*mut c_void> = vec![
7357                (&left_packed.hashes).as_kernel_param(),
7358                num_left.as_kernel_param(),
7359                (&table.bucket_offsets).as_kernel_param(),
7360                (&table.bucket_counts).as_kernel_param(),
7361                (&table.bucket_entries).as_kernel_param(),
7362                (&table.bucket_entry_hashes).as_kernel_param(),
7363                table.bucket_mask.as_kernel_param(),
7364                (&left_packed.packed_keys).as_kernel_param(),
7365                (&right_packed.packed_keys).as_kernel_param(),
7366                left_packed.key_bytes.as_kernel_param(),
7367                (&d_dummy_left).as_kernel_param(),
7368                (&d_dummy_right).as_kernel_param(),
7369                (&d_count_only).as_kernel_param(),
7370                max_output_count_only.as_kernel_param(),
7371            ];
7372            probe_func
7373                .clone()
7374                .launch_on_stream(&cu_stream, probe_config, &mut params)
7375                .map_err(|e| {
7376                    XlogError::Kernel(format!(
7377                        "hash_join_probe_v2 (count, on_stream) failed: {}",
7378                        e
7379                    ))
7380                })?;
7381        }
7382
        // Commit the recorder now that the count-pass work is enqueued.
7383        rec_count.commit(runtime).map_err(|e| {
7384            XlogError::Kernel(format!(
7385                "hash_join_inner_v2_recorded: count-pass commit failed: {}",
7386                e
7387            ))
7388        })?;
7389
7390        // Explicit barrier before host scalar read of full_count.
7391        cu_stream.synchronize().map_err(|e| {
7392            XlogError::Kernel(format!(
7393                "hash_join_inner_v2_recorded: launch_stream sync (count read) failed: {}",
7394                e
7395            ))
7396        })?;
7397        let full_count = self.read_join_output_count_metadata(&d_count_only)? as u64;
        // Apply the caller's optional cap; `requested` becomes the
        // capacity of the materialize pass.
7398        let requested = max_output
7399            .map(|limit| (limit as u64).min(full_count))
7400            .unwrap_or(full_count);
7401        if requested == 0 {
7402            let combined_schema = self.combine_schemas(left.schema(), right.schema());
7403            return self.create_empty_buffer(combined_schema);
7404        }
        // Output row indices are stored as u32, so the capacity must fit.
7405        if requested > u32::MAX as u64 {
7406            return Err(XlogError::Kernel(format!(
7407                "Join produced {} rows which exceeds the u32 index limit",
7408                requested
7409            )));
7410        }
7411        let max_output_u32 = requested as u32;
7412
7413        // Step 5: materialize pass. Allocate output index
7414        // buffers + count. Memset runs AFTER preflight has
7415        // queued the alloc-ready waits.
7416        let d_output_left = self.memory.alloc::<u32>(max_output_u32 as usize)?;
7417        let d_output_right = self.memory.alloc::<u32>(max_output_u32 as usize)?;
7418        let d_output_count = self.memory.alloc::<u32>(1)?;
7419
        // Same read set as the count pass; the writes are now the real
        // index buffers plus a fresh counter.
7420        let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
7421        rec_mat.read(&left_packed.hashes);
7422        rec_mat.read(&left_packed.packed_keys);
7423        rec_mat.read(&right_packed.packed_keys);
7424        rec_mat.read(&table.bucket_offsets);
7425        rec_mat.read(&table.bucket_counts);
7426        rec_mat.read(&table.bucket_entries);
7427        rec_mat.read(&table.bucket_entry_hashes);
7428        rec_mat.write(&d_output_left);
7429        rec_mat.write(&d_output_right);
7430        rec_mat.write(&d_output_count);
7431        rec_mat.preflight(runtime).map_err(|e| {
7432            XlogError::Kernel(format!(
7433                "hash_join_inner_v2_recorded: materialize-pass preflight failed: {}",
7434                e
7435            ))
7436        })?;
7437
7438        // Zero-init d_output_count via async memset on
7439        // launch_stream — fenced behind alloc-ready waits.
7440        // SAFETY: runtime-backed 4-byte buffer.
7441        unsafe {
7442            let res = cudarc::driver::sys::cuMemsetD8Async(
7443                *d_output_count.device_ptr(),
7444                0,
7445                std::mem::size_of::<u32>(),
7446                cu_stream.cu_stream(),
7447            );
7448            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7449                return Err(XlogError::Kernel(format!(
7450                    "cuMemsetD8Async (d_output_count) failed: {:?}",
7451                    res
7452                )));
7453            }
7454        }
7455
7456        // SAFETY: same 14-arg probe signature.
7457        unsafe {
7458            let mut params: Vec<*mut c_void> = vec![
7459                (&left_packed.hashes).as_kernel_param(),
7460                num_left.as_kernel_param(),
7461                (&table.bucket_offsets).as_kernel_param(),
7462                (&table.bucket_counts).as_kernel_param(),
7463                (&table.bucket_entries).as_kernel_param(),
7464                (&table.bucket_entry_hashes).as_kernel_param(),
7465                table.bucket_mask.as_kernel_param(),
7466                (&left_packed.packed_keys).as_kernel_param(),
7467                (&right_packed.packed_keys).as_kernel_param(),
7468                left_packed.key_bytes.as_kernel_param(),
7469                (&d_output_left).as_kernel_param(),
7470                (&d_output_right).as_kernel_param(),
7471                (&d_output_count).as_kernel_param(),
7472                max_output_u32.as_kernel_param(),
7473            ];
7474            probe_func
7475                .clone()
7476                .launch_on_stream(&cu_stream, probe_config, &mut params)
7477                .map_err(|e| {
7478                    XlogError::Kernel(format!(
7479                        "hash_join_probe_v2 (materialize, on_stream) failed: {}",
7480                        e
7481                    ))
7482                })?;
7483        }
7484
7485        rec_mat.commit(runtime).map_err(|e| {
7486            XlogError::Kernel(format!(
7487                "hash_join_inner_v2_recorded: materialize-pass commit failed: {}",
7488                e
7489            ))
7490        })?;
7491
7492        // Explicit barrier before host scalar read of result_count.
7493        cu_stream.synchronize().map_err(|e| {
7494            XlogError::Kernel(format!(
7495                "hash_join_inner_v2_recorded: launch_stream sync (mat read) failed: {}",
7496                e
7497            ))
7498        })?;
        // Clamp the reported count to the capacity the index buffers were
        // allocated with before using it as the gather row count.
7499        let result_count = (self.read_join_output_count_metadata(&d_output_count)? as u64)
7500            .min(max_output_u32 as u64);
7501        if result_count == 0 {
7502            let combined_schema = self.combine_schemas(left.schema(), right.schema());
7503            return self.create_empty_buffer(combined_schema);
7504        }
        // Lossless: result_count <= max_output_u32, which is a u32.
7505        let output_rows = result_count as u32;
7506
7507        // Step 6: gather both sides on launch_stream. Each
7508        // gather records reads of input.column[i] via its own
7509        // outer LaunchRecorder — set up below.
7510        let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
7511        for col_idx in 0..left.columns.len() {
7512            let c = left
7513                .column(col_idx)
7514                .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
7515            rec_gather.read_column(c);
7516        }
7517        for col_idx in 0..right.columns.len() {
7518            let c = right
7519                .column(col_idx)
7520                .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
7521            rec_gather.read_column(c);
7522        }
        // The index buffers written by the materialize pass are inputs to
        // the gathers.
7523        rec_gather.read(&d_output_left);
7524        rec_gather.read(&d_output_right);
7525        rec_gather.preflight(runtime).map_err(|e| {
7526            XlogError::Kernel(format!(
7527                "hash_join_inner_v2_recorded: gather preflight failed: {}",
7528                e
7529            ))
7530        })?;
7531
7532        let gathered_left = self.gather_buffer_by_indices_on_stream(
7533            left,
7534            &d_output_left,
7535            output_rows,
7536            &cu_stream,
7537            launch_stream,
7538            runtime,
7539        )?;
7540        let gathered_right = self.gather_buffer_by_indices_on_stream(
7541            right,
7542            &d_output_right,
7543            output_rows,
7544            &cu_stream,
7545            launch_stream,
7546            runtime,
7547        )?;
7548
7549        rec_gather.commit(runtime).map_err(|e| {
7550            XlogError::Kernel(format!(
7551                "hash_join_inner_v2_recorded: gather commit failed: {}",
7552                e
7553            ))
7554        })?;
7555
        // Assemble the result: gathered left columns first, then gathered
        // right columns, under the combined schema.
7556        let combined_schema = self.combine_schemas(left.schema(), right.schema());
7557        let mut result_columns = Vec::with_capacity(combined_schema.arity());
7558        result_columns.extend(gathered_left.columns);
7559        result_columns.extend(gathered_right.columns);
7560        self.buffer_from_columns(result_columns, result_count, combined_schema)
7561    }
7562
7563    /// Strict-recorder, deterministic-ordering Inner hash
7564    /// join using the deterministic binary-join path.
7565    ///
7566    /// Algorithm: count → exclusive scan → device-resident
7567    /// total → host scalar read → materialize with
7568    /// per-probe-row offsets. Each probe row writes its
7569    /// `local`-th match to
7570    /// `output[per_probe_offsets[tid] + local]` directly —
7571    /// no global `atomicAdd(output_count)` on the
7572    /// materialize pass, so the output ordering is a
7573    /// deterministic function of (probe-row index,
7574    /// per-row match discovery order). Compare to
7575    /// [`Self::hash_join_inner_v2_recorded`] which uses the
7576    /// legacy count-then-atomic-materialize chain (correct
7577    /// but with atomic-induced order non-determinism across
7578    /// threads/blocks).
7579    ///
7580    /// Sourced from the archived `archive/gpu-resident-binary-join-prototype-*`
7581    /// branches — three new kernels migrated:
7582    /// `hash_join_probe_v2_count_per_row`,
7583    /// `hash_join_probe_v2_materialize`,
7584    /// `hash_join_total_from_scan`. LeftOuter / Semi / Anti
7585    /// / indexed variants from the prototype are
7586    /// intentionally not migrated here.
7587    ///
7588    /// Reuses the recorded helpers `pack_keys_gpu_on_stream`,
7589    /// `build_hash_table_v2_on_stream`,
7590    /// `multiblock_scan_u32_inplace_on_stream`, and
7591    /// `gather_buffer_by_indices_on_stream`. Inherits the compact / pack
7592    /// fixes via composition.
7593    pub fn hash_join_inner_v2_count_scan_materialize_recorded(
7594        &self,
7595        left: &CudaBuffer,
7596        right: &CudaBuffer,
7597        left_keys: &[usize],
7598        right_keys: &[usize],
7599        max_output: Option<usize>,
7600        launch_stream: StreamId,
7601    ) -> Result<CudaBuffer> {
7602        if Self::use_csm_cuda_graph_env() {
7603            if let Some(result) = self
7604                .hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded(
7605                    left,
7606                    right,
7607                    left_keys,
7608                    right_keys,
7609                    max_output,
7610                    launch_stream,
7611                )?
7612            {
7613                return Ok(result);
7614            }
7615            self.csm_cuda_graph_fallbacks
7616                .fetch_add(1, Ordering::Relaxed);
7617        }
7618
7619        use crate::launch::LaunchRecorder;
7620
7621        let runtime = self.memory.runtime().ok_or_else(|| {
7622            XlogError::Kernel(
7623                "hash_join_inner_v2_count_scan_materialize_recorded requires a \
7624                 runtime-backed GpuMemoryManager"
7625                    .to_string(),
7626            )
7627        })?;
7628        let cu_stream = runtime
7629            .stream_pool()
7630            .resolve(launch_stream)
7631            .ok_or_else(|| {
7632                XlogError::Kernel(format!(
7633                    "hash_join_inner_v2_count_scan_materialize_recorded: launch_stream \
7634                 StreamId({}) does not resolve",
7635                    launch_stream.0
7636                ))
7637            })?;
7638
7639        // Validation (mirrors `hash_join_inner_v2_recorded`).
7640        let num_left = self.device_row_count(left)?;
7641        let num_right = self.device_row_count(right)?;
7642        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
7643            return Err(XlogError::Kernel(format!(
7644                "Join supports at most {} rows per side (left={}, right={})",
7645                u32::MAX,
7646                num_left,
7647                num_right
7648            )));
7649        }
7650        if num_left == 0 || num_right == 0 {
7651            let combined_schema = self.combine_schemas(left.schema(), right.schema());
7652            return self.create_empty_buffer(combined_schema);
7653        }
7654        if left_keys.is_empty() || right_keys.is_empty() {
7655            return Err(XlogError::Kernel(
7656                "Join requires at least one key column".to_string(),
7657            ));
7658        }
7659        if left_keys.len() != right_keys.len() {
7660            return Err(XlogError::Kernel(
7661                "Left and right key columns must have same length".to_string(),
7662            ));
7663        }
7664        if left_keys.len() > 4 {
7665            return Err(XlogError::Kernel(
7666                "hash_join_inner_v2_count_scan_materialize_recorded: max 4 key \
7667                 columns supported (pack_keys constraint)"
7668                    .to_string(),
7669            ));
7670        }
7671        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
7672            let lt = left.schema().column_type(l);
7673            let rt = right.schema().column_type(r);
7674            if lt != rt {
7675                return Err(XlogError::Kernel(format!(
7676                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
7677                    l, lt, r, rt
7678                )));
7679            }
7680        }
7681
7682        let _num_left = num_left as u32;
7683        let probe_cap = left.num_rows() as u32;
7684
7685        // Steps 1+2: pack + table on launch_stream.
7686        let left_packed =
7687            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
7688        let right_packed =
7689            self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
7690        let table = self.build_hash_table_v2_on_stream(
7691            &right_packed.hashes,
7692            num_right as u32,
7693            &cu_stream,
7694            launch_stream,
7695            runtime,
7696        )?;
7697
7698        let device = self.device.inner();
7699        let block_size = 256u32;
7700        let probe_grid = probe_cap.div_ceil(block_size);
7701        let probe_config = LaunchConfig {
7702            grid_dim: (probe_grid, 1, 1),
7703            block_dim: (block_size, 1, 1),
7704            shared_mem_bytes: 0,
7705        };
7706
7707        // Allocate count + offsets + total scalar + overflow flag.
7708        let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
7709        let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
7710        let d_logical_count = self.memory.alloc::<u32>(1)?;
7711        let d_overflow = self.memory.alloc::<u8>(1)?;
7712        // Fence alloc-ready → launch_stream for both before
7713        // the memset writes them. The recorder below will
7714        // attach further dependencies, but the memset runs
7715        // ahead of the recorder's preflight so we need this
7716        // direct fence.
7717        runtime
7718            .prepare_first_use(&d_overflow, launch_stream, Access::Write)
7719            .map_err(|e| {
7720                XlogError::Kernel(format!(
7721                    "hash_join_inner_v2_count_scan_materialize_recorded: prepare d_overflow \
7722                     failed: {}",
7723                    e
7724                ))
7725            })?;
7726        runtime
7727            .prepare_first_use(&d_logical_count, launch_stream, Access::Write)
7728            .map_err(|e| {
7729                XlogError::Kernel(format!(
7730                    "hash_join_inner_v2_count_scan_materialize_recorded: prepare d_logical_count \
7731                     failed: {}",
7732                    e
7733                ))
7734            })?;
7735        // Zero-init overflow + logical_count on launch_stream.
7736        // SAFETY: 1-byte and 4-byte runtime-backed buffers.
7737        unsafe {
7738            let res = cudarc::driver::sys::cuMemsetD8Async(
7739                *d_overflow.device_ptr(),
7740                0,
7741                1,
7742                cu_stream.cu_stream(),
7743            );
7744            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7745                return Err(XlogError::Kernel(format!(
7746                    "cuMemsetD8Async (d_overflow init) failed: {:?}",
7747                    res
7748                )));
7749            }
7750            let res = cudarc::driver::sys::cuMemsetD8Async(
7751                *d_logical_count.device_ptr(),
7752                0,
7753                std::mem::size_of::<u32>(),
7754                cu_stream.cu_stream(),
7755            );
7756            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7757                return Err(XlogError::Kernel(format!(
7758                    "cuMemsetD8Async (d_logical_count init) failed: {:?}",
7759                    res
7760                )));
7761            }
7762        }
7763
7764        // Build the count/scan recorder. Reads on inputs that
7765        // outlive this recorder (left/right packed + table)
7766        // BEFORE preflight; fresh writes on per_probe_count /
7767        // per_probe_offsets / d_logical_count / d_overflow
7768        // AFTER kernels enqueue.
7769        let count_func = device
7770            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
7771            .ok_or_else(|| {
7772                XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
7773            })?;
7774        let total_func = device
7775            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
7776            .ok_or_else(|| {
7777                XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
7778            })?;
7779
7780        let mut rec_count = LaunchRecorder::new_strict(launch_stream);
7781        rec_count.read(&left_packed.hashes);
7782        rec_count.read(&left_packed.packed_keys);
7783        rec_count.read(&right_packed.packed_keys);
7784        rec_count.read(&table.bucket_offsets);
7785        rec_count.read(&table.bucket_counts);
7786        rec_count.read(&table.bucket_entries);
7787        rec_count.read(&table.bucket_entry_hashes);
7788        rec_count.read(left.num_rows_device());
7789        rec_count.write(&per_probe_count);
7790        rec_count.write(&per_probe_offsets);
7791        rec_count.write(&d_logical_count);
7792        rec_count.write(&d_overflow);
7793        rec_count.preflight(runtime).map_err(|e| {
7794            XlogError::Kernel(format!("csm inner: count/scan preflight failed: {}", e))
7795        })?;
7796
7797        // Step 3: count_per_row.
7798        // SAFETY: 12-arg signature matches the PTX kernel.
7799        unsafe {
7800            count_func.clone().launch_on_stream(
7801                &cu_stream,
7802                probe_config,
7803                (
7804                    &left_packed.hashes,
7805                    left.num_rows_device(),
7806                    probe_cap,
7807                    &table.bucket_offsets,
7808                    &table.bucket_counts,
7809                    &table.bucket_entries,
7810                    &table.bucket_entry_hashes,
7811                    table.bucket_mask,
7812                    &left_packed.packed_keys,
7813                    &right_packed.packed_keys,
7814                    left_packed.key_bytes,
7815                    &per_probe_count,
7816                ),
7817            )
7818        }
7819        .map_err(|e| {
7820            XlogError::Kernel(format!(
7821                "hash_join_probe_v2_count_per_row (on_stream) failed: {}",
7822                e
7823            ))
7824        })?;
7825
7826        // Step 4: dtod-async copy per_probe_count → per_probe_offsets,
7827        // then exclusive in-place scan.
7828        // SAFETY: same length, both runtime-backed u32 buffers.
7829        unsafe {
7830            let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
7831                *per_probe_offsets.device_ptr(),
7832                *per_probe_count.device_ptr(),
7833                (probe_cap as usize) * std::mem::size_of::<u32>(),
7834                cu_stream.cu_stream(),
7835            );
7836            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
7837                return Err(XlogError::Kernel(format!(
7838                    "csm inner: cuMemcpyDtoDAsync (per_probe_count → offsets) failed: {:?}",
7839                    res
7840                )));
7841            }
7842        }
7843        self.multiblock_scan_u32_inplace_on_stream(
7844            &mut per_probe_offsets,
7845            probe_cap,
7846            &cu_stream,
7847            launch_stream,
7848            runtime,
7849        )?;
7850
7851        // Step 5: total_from_scan — writes d_logical_count + d_overflow.
7852        // SAFETY: 7-arg signature. capacity = probe_cap *
7853        // num_right is the worst-case bound (cross-product);
7854        // in practice the chain caps the actual write count
7855        // after the host scalar read below sizes the output
7856        // index buffers exactly.
7857        let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(num_right as u64);
7858        let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
7859        unsafe {
7860            total_func.clone().launch_on_stream(
7861                &cu_stream,
7862                LaunchConfig {
7863                    grid_dim: (1, 1, 1),
7864                    block_dim: (1, 1, 1),
7865                    shared_mem_bytes: 0,
7866                },
7867                (
7868                    &per_probe_offsets,
7869                    &per_probe_count,
7870                    left.num_rows_device(),
7871                    probe_cap,
7872                    materialize_capacity_u32,
7873                    &d_logical_count,
7874                    &d_overflow,
7875                ),
7876            )
7877        }
7878        .map_err(|e| {
7879            XlogError::Kernel(format!(
7880                "hash_join_total_from_scan (on_stream) failed: {}",
7881                e
7882            ))
7883        })?;
7884
7885        rec_count.commit(runtime).map_err(|e| {
7886            XlogError::Kernel(format!("csm inner: count/scan commit failed: {}", e))
7887        })?;
7888
7889        // Sync + host scalar read of total. dtoh_scalar_untracked
7890        // is the sanctioned metadata-read API.
7891        cu_stream.synchronize().map_err(|e| {
7892            XlogError::Kernel(format!("csm inner: sync (total read) failed: {}", e))
7893        })?;
7894        let total = self.read_join_output_count_metadata(&d_logical_count)? as u64;
7895        let requested = max_output
7896            .map(|limit| (limit as u64).min(total))
7897            .unwrap_or(total);
7898        if requested == 0 {
7899            let combined_schema = self.combine_schemas(left.schema(), right.schema());
7900            return self.create_empty_buffer(combined_schema);
7901        }
7902        if requested > u32::MAX as u64 {
7903            return Err(XlogError::Kernel(format!(
7904                "Join produced {} rows which exceeds the u32 index limit",
7905                requested
7906            )));
7907        }
7908        let output_capacity = requested as u32;
7909
7910        // Step 6: materialize. Allocate index outputs sized to
7911        // `output_capacity` (the user-clamped total). If
7912        // `requested < total`, the kernel suppresses writes
7913        // past `output_capacity` and raises d_overflow — a
7914        // separate metadata flag the caller can inspect via
7915        // a future helper. For now, this path trusts the
7916        // tail of the result is the deterministic "last
7917        // requested" rows.)
7918        let d_output_left = self.memory.alloc::<u32>(output_capacity as usize)?;
7919        let d_output_right = self.memory.alloc::<u32>(output_capacity as usize)?;
7920
7921        let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
7922        rec_mat.read(&left_packed.hashes);
7923        rec_mat.read(&left_packed.packed_keys);
7924        rec_mat.read(&right_packed.packed_keys);
7925        rec_mat.read(&table.bucket_offsets);
7926        rec_mat.read(&table.bucket_counts);
7927        rec_mat.read(&table.bucket_entries);
7928        rec_mat.read(&table.bucket_entry_hashes);
7929        rec_mat.read(&per_probe_offsets);
7930        rec_mat.read(left.num_rows_device());
7931        rec_mat.write(&d_output_left);
7932        rec_mat.write(&d_output_right);
7933        // d_overflow is consumed by the materialize kernel — recorder must own it through commit.
7934        rec_mat.write(&d_overflow);
7935        rec_mat.preflight(runtime).map_err(|e| {
7936            XlogError::Kernel(format!("csm inner: materialize preflight failed: {}", e))
7937        })?;
7938
7939        let materialize_func = device
7940            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
7941            .ok_or_else(|| {
7942                XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
7943            })?;
7944        // SAFETY: 16-arg signature; tuple form supports up to
7945        // 12 elements, so we use the raw-param launch path.
7946        unsafe {
7947            let mut params: Vec<*mut c_void> = vec![
7948                (&left_packed.hashes).as_kernel_param(),
7949                left.num_rows_device().as_kernel_param(),
7950                probe_cap.as_kernel_param(),
7951                (&table.bucket_offsets).as_kernel_param(),
7952                (&table.bucket_counts).as_kernel_param(),
7953                (&table.bucket_entries).as_kernel_param(),
7954                (&table.bucket_entry_hashes).as_kernel_param(),
7955                table.bucket_mask.as_kernel_param(),
7956                (&left_packed.packed_keys).as_kernel_param(),
7957                (&right_packed.packed_keys).as_kernel_param(),
7958                left_packed.key_bytes.as_kernel_param(),
7959                (&per_probe_offsets).as_kernel_param(),
7960                output_capacity.as_kernel_param(),
7961                (&d_output_left).as_kernel_param(),
7962                (&d_output_right).as_kernel_param(),
7963                (&d_overflow).as_kernel_param(),
7964            ];
7965            materialize_func
7966                .clone()
7967                .launch_on_stream(&cu_stream, probe_config, &mut params)
7968                .map_err(|e| {
7969                    XlogError::Kernel(format!(
7970                        "hash_join_probe_v2_materialize (on_stream) failed: {}",
7971                        e
7972                    ))
7973                })?;
7974        }
7975
7976        rec_mat.commit(runtime).map_err(|e| {
7977            XlogError::Kernel(format!("csm inner: materialize commit failed: {}", e))
7978        })?;
7979
7980        cu_stream.synchronize().map_err(|e| {
7981            XlogError::Kernel(format!("csm inner: sync (post-materialize) failed: {}", e))
7982        })?;
7983
7984        // Step 7: gather both sides on launch_stream.
7985        let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
7986        for col_idx in 0..left.columns.len() {
7987            let c = left
7988                .column(col_idx)
7989                .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
7990            rec_gather.read_column(c);
7991        }
7992        for col_idx in 0..right.columns.len() {
7993            let c = right
7994                .column(col_idx)
7995                .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
7996            rec_gather.read_column(c);
7997        }
7998        rec_gather.read(&d_output_left);
7999        rec_gather.read(&d_output_right);
8000        rec_gather
8001            .preflight(runtime)
8002            .map_err(|e| XlogError::Kernel(format!("csm inner: gather preflight failed: {}", e)))?;
8003        let gathered_left = self.gather_buffer_by_indices_on_stream(
8004            left,
8005            &d_output_left,
8006            output_capacity,
8007            &cu_stream,
8008            launch_stream,
8009            runtime,
8010        )?;
8011        let gathered_right = self.gather_buffer_by_indices_on_stream(
8012            right,
8013            &d_output_right,
8014            output_capacity,
8015            &cu_stream,
8016            launch_stream,
8017            runtime,
8018        )?;
8019        rec_gather
8020            .commit(runtime)
8021            .map_err(|e| XlogError::Kernel(format!("csm inner: gather commit failed: {}", e)))?;
8022
8023        let combined_schema = self.combine_schemas(left.schema(), right.schema());
8024        let mut result_columns = Vec::with_capacity(combined_schema.arity());
8025        result_columns.extend(gathered_left.columns);
8026        result_columns.extend(gathered_right.columns);
8027        self.buffer_from_columns(result_columns, output_capacity as u64, combined_schema)
8028    }
8029
    /// Inner hash join (count → scan → materialize) run through a cached CUDA Graph.
    ///
    /// On every call the keys of both sides are packed and the hash table over
    /// `right` is built on `launch_stream`. The count / scan / total /
    /// materialize chain is then looked up in `csm_cuda_graph_cache` under a
    /// [`CsmCudaGraphKey`] built from the key-column count, the packed key
    /// width, the probe capacity class and the output capacity:
    ///
    ///   * hit — the cached entry is launched through
    ///     `launch_csm_cuda_graph_entry` and `csm_cuda_graph_cache_hits` is
    ///     incremented;
    ///   * miss — scratch and output buffers are allocated, the chain is
    ///     captured into a new graph, `csm_cuda_graph_captures` is
    ///     incremented, the new entry is launched once and then inserted into
    ///     the cache.
    ///
    /// # Returns
    ///   * `Ok(Some(buffer))` — the join result with the combined left+right
    ///     schema. The buffer is empty when either side has no rows, when
    ///     `max_output == Some(0)`, or when the chosen output capacity is 0.
    ///   * `Ok(None)` — `csm_cuda_graph_output_capacity` did not pick an output
    ///     capacity; presumably the caller then uses a non-graph path
    ///     (TODO confirm against the call site).
    ///
    /// # Errors
    ///   * Manager not runtime-backed, or `launch_stream` does not resolve.
    ///   * Either side reports more than `u32::MAX` rows.
    ///   * `left_keys`/`right_keys` empty, mismatched length, or > 4
    ///     (pack_keys constraint).
    ///   * Key column type mismatch.
    ///   * Allocation, kernel lookup, capture, launch or cache-lock failures.
    fn hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded(
        &self,
        left: &CudaBuffer,
        right: &CudaBuffer,
        left_keys: &[usize],
        right_keys: &[usize],
        max_output: Option<usize>,
        launch_stream: StreamId,
    ) -> Result<Option<CudaBuffer>> {
        let runtime = self.memory.runtime().ok_or_else(|| {
            XlogError::Kernel(
                "hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded requires a \
                 runtime-backed GpuMemoryManager"
                    .to_string(),
            )
        })?;
        let cu_stream = runtime
            .stream_pool()
            .resolve(launch_stream)
            .ok_or_else(|| {
                XlogError::Kernel(format!(
                    "hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded: \
                     launch_stream StreamId({}) does not resolve",
                    launch_stream.0
                ))
            })?;

        // Row counts as reported by `device_row_count`. Both are narrowed to
        // u32 further down (`num_right as u32`), hence the range check.
        let num_left = self.device_row_count(left)?;
        let num_right = self.device_row_count(right)?;
        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
            return Err(XlogError::Kernel(format!(
                "Join supports at most {} rows per side (left={}, right={})",
                u32::MAX,
                num_left,
                num_right
            )));
        }
        // Trivially empty result: returned before key validation and before
        // any GPU work is issued.
        if num_left == 0 || num_right == 0 || max_output == Some(0) {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema).map(Some);
        }
        if left_keys.is_empty() || right_keys.is_empty() {
            return Err(XlogError::Kernel(
                "Join requires at least one key column".to_string(),
            ));
        }
        if left_keys.len() != right_keys.len() {
            return Err(XlogError::Kernel(
                "Left and right key columns must have same length".to_string(),
            ));
        }
        if left_keys.len() > 4 {
            return Err(XlogError::Kernel(
                "hash_join_inner_v2_count_scan_materialize_cuda_graph_recorded: max 4 key \
                 columns supported (pack_keys constraint)"
                    .to_string(),
            ));
        }
        // Key columns are compared pairwise; each pair must have the same type.
        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
            let lt = left.schema().column_type(l);
            let rt = right.schema().column_type(r);
            if lt != rt {
                return Err(XlogError::Kernel(format!(
                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
                    l, lt, r, rt
                )));
            }
        }

        // The probe capacity comes from the host-side `left.num_rows()` and is
        // mapped to a capacity class; the class (not the exact row count) is
        // what goes into the cache key and sizes the per-probe buffers.
        // NOTE(review): this `as u32` narrows `left.num_rows()`, while the
        // range check above was made on `device_row_count(left)` — confirm the
        // two cannot disagree.
        let logical_probe_cap = left.num_rows() as u32;
        let probe_cap = crate::cuda_graph::graph_capacity_class_u32(logical_probe_cap);
        // `None` here means no graph output capacity was chosen for this input.
        let Some(output_capacity) =
            Self::csm_cuda_graph_output_capacity(logical_probe_cap, num_right as u32, max_output)?
        else {
            return Ok(None);
        };
        if output_capacity == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema).map(Some);
        }

        // Key packing and the hash-table build run on every call, outside the
        // graph. `left_packed.key_bytes` is needed to form the cache key.
        let left_packed =
            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
        let right_packed =
            self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
        let graph_key = CsmCudaGraphKey::inner(
            left_keys.len(),
            left_packed.key_bytes,
            probe_cap,
            output_capacity,
        )?;
        let table = self.build_hash_table_v2_on_stream(
            &right_packed.hashes,
            num_right as u32,
            &cu_stream,
            launch_stream,
            runtime,
        )?;

        // Launch geometry for the count and materialize kernels: 256-thread
        // blocks, enough blocks to cover `probe_cap`.
        let device = self.device.inner();
        let block_size = 256u32;
        let probe_grid = probe_cap.div_ceil(block_size);
        let probe_config = LaunchConfig {
            grid_dim: (probe_grid, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // probe_cap × num_right, clamped to u32::MAX; handed to the total kernel.
        let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(num_right as u64);
        let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;

        // Cache lookup. The guard is scoped to this block: the lock is held
        // while a cached entry is launched, and released before a capture.
        {
            let mut cache = self.csm_cuda_graph_cache.lock().map_err(|e| {
                XlogError::Kernel(format!("csm CUDA Graph cache lock poisoned: {}", e))
            })?;
            if let Some(entry) = cache.get_mut(&graph_key) {
                let result = self.launch_csm_cuda_graph_entry(
                    entry,
                    left,
                    right,
                    &left_packed,
                    &right_packed,
                    &table,
                    max_output,
                    materialize_capacity_u32,
                    probe_config,
                    &cu_stream,
                    launch_stream,
                    runtime,
                )?;
                self.csm_cuda_graph_cache_hits
                    .fetch_add(1, Ordering::Relaxed);
                return Ok(Some(result));
            }
        }

        // Cache miss: allocate the buffers the graph works on. They are moved
        // into the `CsmCudaGraphEntry` below and so live as long as the entry.
        let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
        let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
        let d_logical_count = self.memory.alloc::<u32>(1)?;
        let d_overflow = self.memory.alloc::<u8>(1)?;
        let d_output_left = self.memory.alloc::<u32>(output_capacity as usize)?;
        let d_output_right = self.memory.alloc::<u32>(output_capacity as usize)?;
        let mut scan_scratch = self.multiblock_scan_u32_scratch_for_len(probe_cap)?;

        let count_func = device
            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
            .ok_or_else(|| {
                XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
            })?;
        let total_func = device
            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
            .ok_or_else(|| {
                XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
            })?;
        let materialize_func = device
            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
            .ok_or_else(|| {
                XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
            })?;

        // Capture the whole chain on `cu_stream`. The entry is launched
        // explicitly after the capture, via `launch_csm_cuda_graph_entry`.
        let graph = CapturedCudaGraph::capture_on_stream(&cu_stream, || {
            // Step 1: zero the overflow flag (1 byte) and the logical count
            // (size_of::<u32>() bytes).
            // SAFETY: graph capture records these writes; replay preflight orders
            // the runtime-backed buffers before the graph is launched.
            unsafe {
                let res = cudarc::driver::sys::cuMemsetD8Async(
                    *d_overflow.device_ptr(),
                    0,
                    1,
                    cu_stream.cu_stream(),
                );
                if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
                    return Err(XlogError::Kernel(format!(
                        "csm inner graph: cuMemsetD8Async (d_overflow) failed: {:?}",
                        res
                    )));
                }
                let res = cudarc::driver::sys::cuMemsetD8Async(
                    *d_logical_count.device_ptr(),
                    0,
                    std::mem::size_of::<u32>(),
                    cu_stream.cu_stream(),
                );
                if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
                    return Err(XlogError::Kernel(format!(
                        "csm inner graph: cuMemsetD8Async (d_logical_count) failed: {:?}",
                        res
                    )));
                }
            }

            // Step 2: count kernel; `per_probe_count` is its last argument.
            // SAFETY: 12-arg signature matches the PTX kernel.
            unsafe {
                count_func.clone().launch_on_stream(
                    &cu_stream,
                    probe_config,
                    (
                        &left_packed.hashes,
                        left.num_rows_device(),
                        probe_cap,
                        &table.bucket_offsets,
                        &table.bucket_counts,
                        &table.bucket_entries,
                        &table.bucket_entry_hashes,
                        table.bucket_mask,
                        &left_packed.packed_keys,
                        &right_packed.packed_keys,
                        left_packed.key_bytes,
                        &per_probe_count,
                    ),
                )
            }
            .map_err(|e| {
                XlogError::Kernel(format!("csm inner graph: count_per_row failed: {}", e))
            })?;

            // Step 3: copy the counts into `per_probe_offsets`, then scan that
            // buffer in place. `per_probe_count` keeps the raw counts.
            // SAFETY: same length, both runtime-backed u32 buffers.
            unsafe {
                let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
                    *per_probe_offsets.device_ptr(),
                    *per_probe_count.device_ptr(),
                    (probe_cap as usize) * std::mem::size_of::<u32>(),
                    cu_stream.cu_stream(),
                );
                if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
                    return Err(XlogError::Kernel(format!(
                        "csm inner graph: cuMemcpyDtoDAsync (count -> offsets) failed: {:?}",
                        res
                    )));
                }
            }
            self.multiblock_scan_u32_inplace_on_stream_with_scratch(
                &mut per_probe_offsets,
                probe_cap,
                &cu_stream,
                &mut scan_scratch,
            )?;

            // Step 4: total kernel, launched with a single thread; it receives
            // both the offsets and the counts plus the count/overflow outputs.
            // SAFETY: 7-arg signature matches the PTX kernel.
            unsafe {
                total_func.clone().launch_on_stream(
                    &cu_stream,
                    LaunchConfig {
                        grid_dim: (1, 1, 1),
                        block_dim: (1, 1, 1),
                        shared_mem_bytes: 0,
                    },
                    (
                        &per_probe_offsets,
                        &per_probe_count,
                        left.num_rows_device(),
                        probe_cap,
                        materialize_capacity_u32,
                        &d_logical_count,
                        &d_overflow,
                    ),
                )
            }
            .map_err(|e| XlogError::Kernel(format!("csm inner graph: total failed: {}", e)))?;

            // Step 5: materialize kernel, same launch geometry as the count kernel.
            // SAFETY: 16-arg signature; tuple form supports up to 12 elements, so use raw params.
            unsafe {
                let mut params: Vec<*mut c_void> = vec![
                    (&left_packed.hashes).as_kernel_param(),
                    left.num_rows_device().as_kernel_param(),
                    probe_cap.as_kernel_param(),
                    (&table.bucket_offsets).as_kernel_param(),
                    (&table.bucket_counts).as_kernel_param(),
                    (&table.bucket_entries).as_kernel_param(),
                    (&table.bucket_entry_hashes).as_kernel_param(),
                    table.bucket_mask.as_kernel_param(),
                    (&left_packed.packed_keys).as_kernel_param(),
                    (&right_packed.packed_keys).as_kernel_param(),
                    left_packed.key_bytes.as_kernel_param(),
                    (&per_probe_offsets).as_kernel_param(),
                    output_capacity.as_kernel_param(),
                    (&d_output_left).as_kernel_param(),
                    (&d_output_right).as_kernel_param(),
                    (&d_overflow).as_kernel_param(),
                ];
                materialize_func
                    .clone()
                    .launch_on_stream(&cu_stream, probe_config, &mut params)
                    .map_err(|e| {
                        XlogError::Kernel(format!("csm inner graph: materialize failed: {}", e))
                    })?;
            }
            Ok(())
        })?;
        // Locate the count / total / materialize kernel nodes so that each
        // launch can rebind their arguments to that call's inputs.
        let nodes = Self::csm_cuda_graph_nodes(&graph)?;
        let mut entry = CsmCudaGraphEntry {
            graph,
            nodes,
            per_probe_count,
            per_probe_offsets,
            d_logical_count,
            d_overflow,
            d_output_left,
            d_output_right,
            scan_scratch,
            probe_capacity: probe_cap,
            output_capacity,
        };
        self.csm_cuda_graph_captures.fetch_add(1, Ordering::Relaxed);

        // First launch of the freshly captured graph. If it fails, `?` returns
        // before the insert below, so a failing entry is never cached.
        let result = self.launch_csm_cuda_graph_entry(
            &mut entry,
            left,
            right,
            &left_packed,
            &right_packed,
            &table,
            max_output,
            materialize_capacity_u32,
            probe_config,
            &cu_stream,
            launch_stream,
            runtime,
        )?;
        // NOTE(review): the cache lock was not held during capture, so two
        // concurrent misses on the same key can both reach this insert —
        // confirm that replacing an existing entry here is acceptable.
        self.csm_cuda_graph_cache
            .lock()
            .map_err(|e| XlogError::Kernel(format!("csm CUDA Graph cache lock poisoned: {}", e)))?
            .insert(graph_key, entry);
        Ok(Some(result))
    }
8354
    /// Launches a captured count → scan → total → materialize graph against
    /// this call's inputs and gathers the joined rows.
    ///
    /// Steps, in order:
    ///   1. Declare every buffer the graph touches on a strict
    ///      `LaunchRecorder` and preflight it.
    ///   2. Check that `probe_config` matches `entry.probe_capacity` and that
    ///      the entry's node inventory has at least 5 nodes.
    ///   3. Rebind the arguments of the count, total and materialize kernel
    ///      nodes to this call's packed keys and hash table plus the entry's
    ///      own buffers.
    ///   4. Launch the graph, commit the recorder, synchronize `cu_stream` and
    ///      read the total from `entry.d_logical_count`.
    ///   5. Cap the total by `max_output` and gather that many rows of `left`
    ///      and `right` through `entry.d_output_left` / `entry.d_output_right`.
    ///
    /// Returns a buffer with the combined left+right schema holding
    /// `min(total, max_output)` rows, or an empty buffer when that is 0.
    ///
    /// # Errors
    ///   * Recorder preflight / commit failures.
    ///   * Probe grid mismatch, or fewer than 5 graph nodes.
    ///   * Reading or updating kernel node params, or the graph launch, fails.
    ///   * Stream synchronization or the count read fails.
    ///   * The requested row count exceeds `entry.output_capacity`.
    ///   * A column is missing, or a gather fails.
    #[allow(clippy::too_many_arguments)]
    fn launch_csm_cuda_graph_entry(
        &self,
        entry: &mut CsmCudaGraphEntry,
        left: &CudaBuffer,
        right: &CudaBuffer,
        left_packed: &PackedKeyData,
        right_packed: &PackedKeyData,
        table: &JoinHashTableV2,
        max_output: Option<usize>,
        materialize_capacity_u32: u32,
        probe_config: LaunchConfig,
        cu_stream: &cudarc::driver::CudaStream,
        launch_stream: StreamId,
        runtime: &crate::device_runtime::XlogDeviceRuntime,
    ) -> Result<CudaBuffer> {
        // Declare accesses for the whole graph: per-call inputs are read-only,
        // the entry's scratch buffers are read_write, the index outputs are
        // write-only.
        let mut rec_graph = LaunchRecorder::new_strict(launch_stream);
        rec_graph.read(&left_packed.hashes);
        rec_graph.read(&left_packed.packed_keys);
        rec_graph.read(&right_packed.packed_keys);
        rec_graph.read(&table.bucket_offsets);
        rec_graph.read(&table.bucket_counts);
        rec_graph.read(&table.bucket_entries);
        rec_graph.read(&table.bucket_entry_hashes);
        rec_graph.read(left.num_rows_device());
        rec_graph.read_write(&entry.per_probe_count);
        rec_graph.read_write(&entry.per_probe_offsets);
        rec_graph.read_write(&entry.d_logical_count);
        rec_graph.read_write(&entry.d_overflow);
        rec_graph.write(&entry.d_output_left);
        rec_graph.write(&entry.d_output_right);
        for level in entry.scan_scratch.levels() {
            rec_graph.read_write(level);
        }
        rec_graph
            .preflight(runtime)
            .map_err(|e| XlogError::Kernel(format!("csm inner graph: preflight failed: {}", e)))?;

        // Capacities come from the entry, not the caller: they are the values
        // the graph's buffers were sized with.
        let probe_cap = entry.probe_capacity;
        let output_capacity = entry.output_capacity;
        // The caller's launch geometry must agree with the entry's probe capacity.
        if probe_config.grid_dim.0 != probe_cap.div_ceil(probe_config.block_dim.0) {
            return Err(XlogError::Kernel(format!(
                "csm CUDA Graph replay probe grid mismatch: graph probe_cap={}, grid={:?}",
                probe_cap, probe_config.grid_dim
            )));
        }
        if entry.nodes.node_count < 5 {
            return Err(XlogError::Kernel(format!(
                "csm CUDA Graph replay node inventory too small: {}",
                entry.nodes.node_count
            )));
        }

        // Start from the params the graph reports for each kernel node; only
        // the argument pointers (`kernelParams`, `extra`) are replaced below.
        let mut count_params = entry.graph.kernel_node_params(entry.nodes.count)?;
        let mut total_params = entry.graph.kernel_node_params(entry.nodes.total)?;
        let mut materialize_params = entry.graph.kernel_node_params(entry.nodes.materialize)?;
        // Count kernel arguments (12), positional.
        let mut count_args: Vec<*mut c_void> = vec![
            (&left_packed.hashes).as_kernel_param(),
            left.num_rows_device().as_kernel_param(),
            probe_cap.as_kernel_param(),
            (&table.bucket_offsets).as_kernel_param(),
            (&table.bucket_counts).as_kernel_param(),
            (&table.bucket_entries).as_kernel_param(),
            (&table.bucket_entry_hashes).as_kernel_param(),
            table.bucket_mask.as_kernel_param(),
            (&left_packed.packed_keys).as_kernel_param(),
            (&right_packed.packed_keys).as_kernel_param(),
            left_packed.key_bytes.as_kernel_param(),
            (&entry.per_probe_count).as_kernel_param(),
        ];
        // Total kernel arguments (7), positional.
        let mut total_args: Vec<*mut c_void> = vec![
            (&entry.per_probe_offsets).as_kernel_param(),
            (&entry.per_probe_count).as_kernel_param(),
            left.num_rows_device().as_kernel_param(),
            probe_cap.as_kernel_param(),
            materialize_capacity_u32.as_kernel_param(),
            (&entry.d_logical_count).as_kernel_param(),
            (&entry.d_overflow).as_kernel_param(),
        ];
        // Materialize kernel arguments (16), positional.
        let mut materialize_args: Vec<*mut c_void> = vec![
            (&left_packed.hashes).as_kernel_param(),
            left.num_rows_device().as_kernel_param(),
            probe_cap.as_kernel_param(),
            (&table.bucket_offsets).as_kernel_param(),
            (&table.bucket_counts).as_kernel_param(),
            (&table.bucket_entries).as_kernel_param(),
            (&table.bucket_entry_hashes).as_kernel_param(),
            table.bucket_mask.as_kernel_param(),
            (&left_packed.packed_keys).as_kernel_param(),
            (&right_packed.packed_keys).as_kernel_param(),
            left_packed.key_bytes.as_kernel_param(),
            (&entry.per_probe_offsets).as_kernel_param(),
            output_capacity.as_kernel_param(),
            (&entry.d_output_left).as_kernel_param(),
            (&entry.d_output_right).as_kernel_param(),
            (&entry.d_overflow).as_kernel_param(),
        ];
        // Arguments are supplied through `kernelParams` only; `extra` is cleared.
        count_params.kernelParams = count_args.as_mut_ptr();
        count_params.extra = std::ptr::null_mut();
        total_params.kernelParams = total_args.as_mut_ptr();
        total_params.extra = std::ptr::null_mut();
        materialize_params.kernelParams = materialize_args.as_mut_ptr();
        materialize_params.extra = std::ptr::null_mut();
        // NOTE(review) on safety: the three argument vectors and the values
        // they point at (`probe_cap`, `output_capacity`,
        // `materialize_capacity_u32`, the borrowed buffers) are all still in
        // scope here. This assumes `set_kernel_node_params` does not retain
        // the pointers after it returns — confirm against its implementation.
        unsafe {
            entry
                .graph
                .set_kernel_node_params(entry.nodes.count, &count_params)?;
            entry
                .graph
                .set_kernel_node_params(entry.nodes.total, &total_params)?;
            entry
                .graph
                .set_kernel_node_params(entry.nodes.materialize, &materialize_params)?;
        }

        entry.graph.launch(cu_stream)?;
        self.csm_cuda_graph_launches.fetch_add(1, Ordering::Relaxed);
        rec_graph
            .commit(runtime)
            .map_err(|e| XlogError::Kernel(format!("csm inner graph: commit failed: {}", e)))?;

        // The total is read on the host, so wait for the stream first.
        cu_stream.synchronize().map_err(|e| {
            XlogError::Kernel(format!("csm inner graph: sync (total read) failed: {}", e))
        })?;
        let total = self.read_join_output_count_metadata(&entry.d_logical_count)? as u64;
        // Rows actually returned: the total, capped by `max_output` when given.
        let requested = max_output
            .map(|limit| (limit as u64).min(total))
            .unwrap_or(total);
        if requested == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }
        // The index buffers hold at most `output_capacity` pairs, so a larger
        // request cannot be served from this entry.
        if requested > output_capacity as u64 {
            return Err(XlogError::Kernel(format!(
                "csm inner graph produced {} rows but graph output capacity is {}",
                requested, output_capacity
            )));
        }
        // Lossless: `requested <= output_capacity`, which is a u32.
        let output_rows = requested as u32;

        // Gather phase: declare reads of every source column and of both
        // index buffers, then gather `output_rows` rows from each side.
        let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
        for col_idx in 0..left.columns.len() {
            let c = left
                .column(col_idx)
                .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
            rec_gather.read_column(c);
        }
        for col_idx in 0..right.columns.len() {
            let c = right
                .column(col_idx)
                .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
            rec_gather.read_column(c);
        }
        rec_gather.read(&entry.d_output_left);
        rec_gather.read(&entry.d_output_right);
        rec_gather.preflight(runtime).map_err(|e| {
            XlogError::Kernel(format!("csm inner graph: gather preflight failed: {}", e))
        })?;
        let gathered_left = self.gather_buffer_by_indices_on_stream(
            left,
            &entry.d_output_left,
            output_rows,
            cu_stream,
            launch_stream,
            runtime,
        )?;
        let gathered_right = self.gather_buffer_by_indices_on_stream(
            right,
            &entry.d_output_right,
            output_rows,
            cu_stream,
            launch_stream,
            runtime,
        )?;
        rec_gather.commit(runtime).map_err(|e| {
            XlogError::Kernel(format!("csm inner graph: gather commit failed: {}", e))
        })?;

        // Result layout: all gathered left columns, then all gathered right columns.
        let combined_schema = self.combine_schemas(left.schema(), right.schema());
        let mut result_columns = Vec::with_capacity(combined_schema.arity());
        result_columns.extend(gathered_left.columns);
        result_columns.extend(gathered_right.columns);
        self.buffer_from_columns(result_columns, output_rows as u64, combined_schema)
    }
8539
8540    fn csm_cuda_graph_nodes(graph: &CapturedCudaGraph) -> Result<CsmCudaGraphNodes> {
8541        let nodes = graph.nodes()?;
8542        if nodes.len() < 5 {
8543            return Err(XlogError::Kernel(format!(
8544                "csm inner graph captured too few nodes: {}",
8545                nodes.len()
8546            )));
8547        }
8548        let kernel_nodes: Vec<_> = nodes
8549            .iter()
8550            .copied()
8551            .filter(|n| n.kind == CudaGraphNodeKind::Kernel)
8552            .collect();
8553        if kernel_nodes.len() < 3 {
8554            return Err(XlogError::Kernel(format!(
8555                "csm inner graph captured too few kernel nodes: {}",
8556                kernel_nodes.len()
8557            )));
8558        }
8559        Ok(CsmCudaGraphNodes {
8560            count: kernel_nodes[0],
8561            total: kernel_nodes[kernel_nodes.len() - 2],
8562            materialize: kernel_nodes[kernel_nodes.len() - 1],
8563            node_count: nodes.len(),
8564        })
8565    }
8566
8567    fn csm_cuda_graph_output_capacity(
8568        probe_cap: u32,
8569        num_right: u32,
8570        max_output: Option<usize>,
8571    ) -> Result<Option<u32>> {
8572        if let Some(limit) = max_output {
8573            let limit = u32::try_from(limit).map_err(|_| {
8574                XlogError::Kernel(format!(
8575                    "csm CUDA Graph max_output {} exceeds u32::MAX",
8576                    limit
8577                ))
8578            })?;
8579            return Ok(Some(crate::cuda_graph::graph_capacity_class_u32(limit)));
8580        }
8581
8582        let worst_case = (probe_cap as u64).saturating_mul(num_right as u64);
8583        if worst_case > u32::MAX as u64 {
8584            return Ok(None);
8585        }
8586        let auto_cap = std::env::var("XLOG_CSM_CUDA_GRAPH_AUTO_OUTPUT_CAP")
8587            .ok()
8588            .and_then(|v| v.parse::<u64>().ok())
8589            .unwrap_or(1_000_000);
8590        if worst_case <= auto_cap {
8591            Ok(Some(crate::cuda_graph::graph_capacity_class_u32(
8592                worst_case as u32,
8593            )))
8594        } else {
8595            Ok(None)
8596        }
8597    }
8598
8599    /// Non-indexed LeftOuter CSM using the deterministic binary-join path.
8600    ///
8601    /// Deterministic count → scan → materialize chain producing
8602    /// MATCHED `(left_idx, right_idx)` pairs first (Inner CSM
8603    /// machinery), then a per-probe-row unmatched mask
8604    /// (`hash_join_csm_unmatched_mask`) compacted via the
8605    /// recorded compact tail to produce `unmatched_left`. The
8606    /// final result is `inner_left | unmatched_left` per left
8607    /// column and `inner_right | zeros` per right column —
8608    /// matching the legacy `hash_join_left_outer_v2_recorded`
8609    /// row-ordering invariant downstream consumers depend on.
8610    ///
8611    /// This path does not adopt the archived prototype's
8612    /// `hash_join_left_outer_count_per_row` /
8613    /// `hash_join_left_outer_materialize` design — those
8614    /// kernels interleave matched and null-sentinel rows by
8615    /// probe-row index, which would change the legacy
8616    /// LeftOuter ordering downstream consumers depend on.
8617    ///
8618    /// # Errors
8619    ///   * Manager not runtime-backed.
8620    ///   * `launch_stream` does not resolve.
8621    ///   * `left_keys`/`right_keys` empty, mismatched length,
8622    ///     or > 4 (pack_keys constraint).
8623    ///   * Key column type mismatch.
8624    ///   * Preflight / kernel / commit failures.
8625    pub fn hash_join_left_outer_v2_count_scan_materialize_recorded(
8626        &self,
8627        left: &CudaBuffer,
8628        right: &CudaBuffer,
8629        left_keys: &[usize],
8630        right_keys: &[usize],
8631        max_output: Option<usize>,
8632        launch_stream: StreamId,
8633    ) -> Result<CudaBuffer> {
8634        use crate::launch::LaunchRecorder;
8635
8636        let runtime = self.memory.runtime().ok_or_else(|| {
8637            XlogError::Kernel(
8638                "hash_join_left_outer_v2_count_scan_materialize_recorded requires a \
8639                 runtime-backed GpuMemoryManager"
8640                    .to_string(),
8641            )
8642        })?;
8643        let cu_stream = runtime
8644            .stream_pool()
8645            .resolve(launch_stream)
8646            .ok_or_else(|| {
8647                XlogError::Kernel(format!(
8648                    "csm left_outer: launch_stream StreamId({}) does not resolve",
8649                    launch_stream.0
8650                ))
8651            })?;
8652
8653        // Validation (mirrors hash_join_left_outer_v2_recorded).
8654        let num_left = self.device_row_count(left)?;
8655        let num_right = self.device_row_count(right)?;
8656        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
8657            return Err(XlogError::Kernel(format!(
8658                "Join supports at most {} rows per side (left={}, right={})",
8659                u32::MAX,
8660                num_left,
8661                num_right
8662            )));
8663        }
8664        if num_left == 0 {
8665            let combined_schema = self.combine_schemas(left.schema(), right.schema());
8666            return self.create_empty_buffer(combined_schema);
8667        }
8668        if num_right == 0 {
8669            // Empty right → all left rows with zero-filled right
8670            // columns. Same legacy fallback as
8671            // `hash_join_left_outer_v2_recorded` — host-sync,
8672            // no launch_stream work queued.
8673            return self.left_outer_with_nulls(left, right);
8674        }
8675        if left_keys.is_empty() || right_keys.is_empty() {
8676            return Err(XlogError::Kernel(
8677                "Join requires at least one key column".to_string(),
8678            ));
8679        }
8680        if left_keys.len() != right_keys.len() {
8681            return Err(XlogError::Kernel(
8682                "Left and right key columns must have same length".to_string(),
8683            ));
8684        }
8685        if left_keys.len() > 4 {
8686            return Err(XlogError::Kernel(
8687                "csm left_outer: max 4 key columns supported (pack_keys constraint)".to_string(),
8688            ));
8689        }
8690        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
8691            let lt = left.schema().column_type(l);
8692            let rt = right.schema().column_type(r);
8693            if lt != rt {
8694                return Err(XlogError::Kernel(format!(
8695                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
8696                    l, lt, r, rt
8697                )));
8698            }
8699        }
8700
8701        // Base `probe_cap` on the validated logical row count
8702        // (`num_left`) with a checked cast — `left.num_rows()` is
8703        // the row capacity and could over-allocate per-probe
8704        // scratch when `row_cap > num_left`, and silently
8705        // truncate if either exceeded `u32::MAX`. The earlier
8706        // validation already rejects `num_left > u32::MAX as
8707        // usize`, but make the cast explicit at the use site.
8708        let probe_cap = u32::try_from(num_left).map_err(|_| {
8709            XlogError::Kernel("csm left_outer: left row count exceeds u32::MAX".to_string())
8710        })?;
8711        let num_right_u32 = u32::try_from(num_right).map_err(|_| {
8712            XlogError::Kernel("csm left_outer: right row count exceeds u32::MAX".to_string())
8713        })?;
8714
8715        // Steps 1+2: pack + table on launch_stream.
8716        let left_packed =
8717            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
8718        let right_packed =
8719            self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
8720        let table = self.build_hash_table_v2_on_stream(
8721            &right_packed.hashes,
8722            num_right_u32,
8723            &cu_stream,
8724            launch_stream,
8725            runtime,
8726        )?;
8727
8728        let device = self.device.inner();
8729        let block_size = 256u32;
8730        let probe_grid = probe_cap.div_ceil(block_size);
8731        let probe_config = LaunchConfig {
8732            grid_dim: (probe_grid, 1, 1),
8733            block_dim: (block_size, 1, 1),
8734            shared_mem_bytes: 0,
8735        };
8736
8737        // Phase A: count + scan + total (Inner CSM machinery).
8738        let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
8739        let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
8740        let d_logical_count = self.memory.alloc::<u32>(1)?;
8741        let d_overflow = self.memory.alloc::<u8>(1)?;
8742        // Fence alloc-ready → launch_stream for the scalars
8743        // before the memsets below run (memsets enqueue ahead
8744        // of any preflight that registers them).
8745        runtime
8746            .prepare_first_use(&d_overflow, launch_stream, Access::Write)
8747            .map_err(|e| {
8748                XlogError::Kernel(format!("csm left_outer: prepare d_overflow failed: {}", e))
8749            })?;
8750        runtime
8751            .prepare_first_use(&d_logical_count, launch_stream, Access::Write)
8752            .map_err(|e| {
8753                XlogError::Kernel(format!(
8754                    "csm left_outer: prepare d_logical_count failed: {}",
8755                    e
8756                ))
8757            })?;
8758        // Zero-init overflow + logical_count on launch_stream.
8759        // SAFETY: 1-byte and 4-byte runtime-backed buffers.
8760        unsafe {
8761            let res = cudarc::driver::sys::cuMemsetD8Async(
8762                *d_overflow.device_ptr(),
8763                0,
8764                1,
8765                cu_stream.cu_stream(),
8766            );
8767            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8768                return Err(XlogError::Kernel(format!(
8769                    "csm left_outer: cuMemsetD8Async (d_overflow) failed: {:?}",
8770                    res
8771                )));
8772            }
8773            let res = cudarc::driver::sys::cuMemsetD8Async(
8774                *d_logical_count.device_ptr(),
8775                0,
8776                std::mem::size_of::<u32>(),
8777                cu_stream.cu_stream(),
8778            );
8779            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8780                return Err(XlogError::Kernel(format!(
8781                    "csm left_outer: cuMemsetD8Async (d_logical_count) failed: {:?}",
8782                    res
8783                )));
8784            }
8785        }
8786
8787        let count_func = device
8788            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
8789            .ok_or_else(|| {
8790                XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
8791            })?;
8792        let total_func = device
8793            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
8794            .ok_or_else(|| {
8795                XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
8796            })?;
8797
8798        let mut rec_count = LaunchRecorder::new_strict(launch_stream);
8799        rec_count.read(&left_packed.hashes);
8800        rec_count.read(&left_packed.packed_keys);
8801        rec_count.read(&right_packed.packed_keys);
8802        rec_count.read(&table.bucket_offsets);
8803        rec_count.read(&table.bucket_counts);
8804        rec_count.read(&table.bucket_entries);
8805        rec_count.read(&table.bucket_entry_hashes);
8806        rec_count.read(left.num_rows_device());
8807        rec_count.write(&per_probe_count);
8808        rec_count.write(&per_probe_offsets);
8809        rec_count.write(&d_logical_count);
8810        rec_count.write(&d_overflow);
8811        rec_count.preflight(runtime).map_err(|e| {
8812            XlogError::Kernel(format!(
8813                "csm left_outer: count/scan preflight failed: {}",
8814                e
8815            ))
8816        })?;
8817
8818        // Step A1: count_per_row.
8819        // SAFETY: 12-arg signature.
8820        unsafe {
8821            count_func.clone().launch_on_stream(
8822                &cu_stream,
8823                probe_config,
8824                (
8825                    &left_packed.hashes,
8826                    left.num_rows_device(),
8827                    probe_cap,
8828                    &table.bucket_offsets,
8829                    &table.bucket_counts,
8830                    &table.bucket_entries,
8831                    &table.bucket_entry_hashes,
8832                    table.bucket_mask,
8833                    &left_packed.packed_keys,
8834                    &right_packed.packed_keys,
8835                    left_packed.key_bytes,
8836                    &per_probe_count,
8837                ),
8838            )
8839        }
8840        .map_err(|e| {
8841            XlogError::Kernel(format!(
8842                "hash_join_probe_v2_count_per_row (csm left_outer) failed: {}",
8843                e
8844            ))
8845        })?;
8846
8847        // Step A2: dtod copy per_probe_count → per_probe_offsets,
8848        // then exclusive in-place scan.
8849        // SAFETY: same length, both runtime-backed u32.
8850        unsafe {
8851            let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
8852                *per_probe_offsets.device_ptr(),
8853                *per_probe_count.device_ptr(),
8854                (probe_cap as usize) * std::mem::size_of::<u32>(),
8855                cu_stream.cu_stream(),
8856            );
8857            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
8858                return Err(XlogError::Kernel(format!(
8859                    "csm left_outer: cuMemcpyDtoDAsync (count → offsets) failed: {:?}",
8860                    res
8861                )));
8862            }
8863        }
8864        self.multiblock_scan_u32_inplace_on_stream(
8865            &mut per_probe_offsets,
8866            probe_cap,
8867            &cu_stream,
8868            launch_stream,
8869            runtime,
8870        )?;
8871
8872        // Step A3: total_from_scan — writes d_logical_count + d_overflow.
8873        let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(num_right as u64);
8874        let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
8875        // SAFETY: 7-arg signature.
8876        unsafe {
8877            total_func.clone().launch_on_stream(
8878                &cu_stream,
8879                LaunchConfig {
8880                    grid_dim: (1, 1, 1),
8881                    block_dim: (1, 1, 1),
8882                    shared_mem_bytes: 0,
8883                },
8884                (
8885                    &per_probe_offsets,
8886                    &per_probe_count,
8887                    left.num_rows_device(),
8888                    probe_cap,
8889                    materialize_capacity_u32,
8890                    &d_logical_count,
8891                    &d_overflow,
8892                ),
8893            )
8894        }
8895        .map_err(|e| {
8896            XlogError::Kernel(format!(
8897                "hash_join_total_from_scan (csm left_outer) failed: {}",
8898                e
8899            ))
8900        })?;
8901
8902        rec_count.commit(runtime).map_err(|e| {
8903            XlogError::Kernel(format!("csm left_outer: count/scan commit failed: {}", e))
8904        })?;
8905
8906        cu_stream.synchronize().map_err(|e| {
8907            XlogError::Kernel(format!("csm left_outer: sync (count read) failed: {}", e))
8908        })?;
8909        let inner_total = self.read_join_output_count_metadata(&d_logical_count)? as u64;
8910        let inner_clamped = max_output
8911            .map(|limit| (limit as u64).min(inner_total))
8912            .unwrap_or(inner_total);
8913        if inner_clamped > u32::MAX as u64 {
8914            return Err(XlogError::Kernel(format!(
8915                "Join produced {} matched rows which exceeds the u32 index limit",
8916                inner_clamped
8917            )));
8918        }
8919        let inner_count_u32 = inner_clamped as u32;
8920
8921        // Phase B: materialize matched index pairs.
8922        let materialize_func = device
8923            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
8924            .ok_or_else(|| {
8925                XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
8926            })?;
8927        let d_output_left = self.memory.alloc::<u32>(inner_count_u32.max(1) as usize)?;
8928        let d_output_right = self.memory.alloc::<u32>(inner_count_u32.max(1) as usize)?;
8929
8930        let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
8931        rec_mat.read(&left_packed.hashes);
8932        rec_mat.read(&left_packed.packed_keys);
8933        rec_mat.read(&right_packed.packed_keys);
8934        rec_mat.read(&table.bucket_offsets);
8935        rec_mat.read(&table.bucket_counts);
8936        rec_mat.read(&table.bucket_entries);
8937        rec_mat.read(&table.bucket_entry_hashes);
8938        rec_mat.read(&per_probe_offsets);
8939        rec_mat.read(left.num_rows_device());
8940        rec_mat.write(&d_output_left);
8941        rec_mat.write(&d_output_right);
8942        // d_overflow is consumed by the materialize kernel — recorder must own it through commit.
8943        rec_mat.write(&d_overflow);
8944        rec_mat.preflight(runtime).map_err(|e| {
8945            XlogError::Kernel(format!(
8946                "csm left_outer: materialize preflight failed: {}",
8947                e
8948            ))
8949        })?;
8950        if inner_count_u32 > 0 {
8951            // SAFETY: 16-arg signature; raw-param launch.
8952            unsafe {
8953                let mut params: Vec<*mut c_void> = vec![
8954                    (&left_packed.hashes).as_kernel_param(),
8955                    left.num_rows_device().as_kernel_param(),
8956                    probe_cap.as_kernel_param(),
8957                    (&table.bucket_offsets).as_kernel_param(),
8958                    (&table.bucket_counts).as_kernel_param(),
8959                    (&table.bucket_entries).as_kernel_param(),
8960                    (&table.bucket_entry_hashes).as_kernel_param(),
8961                    table.bucket_mask.as_kernel_param(),
8962                    (&left_packed.packed_keys).as_kernel_param(),
8963                    (&right_packed.packed_keys).as_kernel_param(),
8964                    left_packed.key_bytes.as_kernel_param(),
8965                    (&per_probe_offsets).as_kernel_param(),
8966                    inner_count_u32.as_kernel_param(),
8967                    (&d_output_left).as_kernel_param(),
8968                    (&d_output_right).as_kernel_param(),
8969                    (&d_overflow).as_kernel_param(),
8970                ];
8971                materialize_func
8972                    .clone()
8973                    .launch_on_stream(&cu_stream, probe_config, &mut params)
8974                    .map_err(|e| {
8975                        XlogError::Kernel(format!(
8976                            "hash_join_probe_v2_materialize (csm left_outer) failed: {}",
8977                            e
8978                        ))
8979                    })?;
8980            }
8981        }
8982        rec_mat.commit(runtime).map_err(|e| {
8983            XlogError::Kernel(format!("csm left_outer: materialize commit failed: {}", e))
8984        })?;
8985
8986        // Phase C: unmatched-left mask + recorded compact tail.
8987        let d_unmatched_mask = self.memory.alloc::<u8>(probe_cap as usize)?;
8988        let unmatched_mask_func = device
8989            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_CSM_UNMATCHED_MASK)
8990            .ok_or_else(|| {
8991                XlogError::Kernel("hash_join_csm_unmatched_mask kernel not found".to_string())
8992            })?;
8993        let mut rec_um = LaunchRecorder::new_strict(launch_stream);
8994        rec_um.read(&per_probe_count);
8995        rec_um.read(left.num_rows_device());
8996        rec_um.write(&d_unmatched_mask);
8997        rec_um.preflight(runtime).map_err(|e| {
8998            XlogError::Kernel(format!(
8999                "csm left_outer: unmatched mask preflight failed: {}",
9000                e
9001            ))
9002        })?;
9003        // SAFETY: 4-arg signature.
9004        unsafe {
9005            unmatched_mask_func.clone().launch_on_stream(
9006                &cu_stream,
9007                probe_config,
9008                (
9009                    &per_probe_count,
9010                    left.num_rows_device(),
9011                    probe_cap,
9012                    &d_unmatched_mask,
9013                ),
9014            )
9015        }
9016        .map_err(|e| {
9017            XlogError::Kernel(format!(
9018                "hash_join_csm_unmatched_mask (on_stream) failed: {}",
9019                e
9020            ))
9021        })?;
9022        rec_um.commit(runtime).map_err(|e| {
9023            XlogError::Kernel(format!(
9024                "csm left_outer: unmatched mask commit failed: {}",
9025                e
9026            ))
9027        })?;
9028
9029        let unmatched_left = self.compact_buffer_by_device_mask_counted_recorded(
9030            left,
9031            &d_unmatched_mask,
9032            launch_stream,
9033        )?;
9034        let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
9035        let total_rows = (inner_count_u32 as u64) + unmatched_rows;
9036
9037        let combined_schema = self.combine_schemas(left.schema(), right.schema());
9038        if total_rows == 0 {
9039            return self.create_empty_buffer(combined_schema);
9040        }
9041
9042        // Phase D: gather matched left + right (only if inner_count > 0).
9043        let inner_left_buf;
9044        let inner_right_buf;
9045        if inner_count_u32 > 0 {
9046            let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
9047            for col_idx in 0..left.columns.len() {
9048                let c = left.column(col_idx).ok_or_else(|| {
9049                    XlogError::Kernel(format!("Left column {} not found", col_idx))
9050                })?;
9051                rec_gather.read_column(c);
9052            }
9053            for col_idx in 0..right.columns.len() {
9054                let c = right.column(col_idx).ok_or_else(|| {
9055                    XlogError::Kernel(format!("Right column {} not found", col_idx))
9056                })?;
9057                rec_gather.read_column(c);
9058            }
9059            rec_gather.read(&d_output_left);
9060            rec_gather.read(&d_output_right);
9061            rec_gather.preflight(runtime).map_err(|e| {
9062                XlogError::Kernel(format!("csm left_outer: gather preflight failed: {}", e))
9063            })?;
9064            inner_left_buf = Some(self.gather_buffer_by_indices_on_stream(
9065                left,
9066                &d_output_left,
9067                inner_count_u32,
9068                &cu_stream,
9069                launch_stream,
9070                runtime,
9071            )?);
9072            inner_right_buf = Some(self.gather_buffer_by_indices_on_stream(
9073                right,
9074                &d_output_right,
9075                inner_count_u32,
9076                &cu_stream,
9077                launch_stream,
9078                runtime,
9079            )?);
9080            rec_gather.commit(runtime).map_err(|e| {
9081                XlogError::Kernel(format!("csm left_outer: gather commit failed: {}", e))
9082            })?;
9083        } else {
9084            inner_left_buf = None;
9085            inner_right_buf = None;
9086        }
9087
9088        // Phase E: per-column dtod-async concat.
9089        // Same step-D pattern as `hash_join_left_outer_v2_recorded`.
9090        let mut rec_d = LaunchRecorder::new_strict(launch_stream);
9091        for col_idx in 0..unmatched_left.columns.len() {
9092            let c = unmatched_left.column(col_idx).ok_or_else(|| {
9093                XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
9094            })?;
9095            rec_d.read_column(c);
9096        }
9097        if let Some(b) = inner_left_buf.as_ref() {
9098            for col_idx in 0..b.columns.len() {
9099                let c = b.column(col_idx).ok_or_else(|| {
9100                    XlogError::Kernel(format!("inner_left col {} not found", col_idx))
9101                })?;
9102                rec_d.read_column(c);
9103            }
9104        }
9105        if let Some(b) = inner_right_buf.as_ref() {
9106            for col_idx in 0..b.columns.len() {
9107                let c = b.column(col_idx).ok_or_else(|| {
9108                    XlogError::Kernel(format!("inner_right col {} not found", col_idx))
9109                })?;
9110                rec_d.read_column(c);
9111            }
9112        }
9113        rec_d.preflight(runtime).map_err(|e| {
9114            XlogError::Kernel(format!("csm left_outer: phase-E preflight failed: {}", e))
9115        })?;
9116
9117        let inner_rows = inner_count_u32 as u64;
9118        let mut result_columns: Vec<CudaColumn> = Vec::with_capacity(combined_schema.arity());
9119
9120        // Per-left-column: inner_left | unmatched_left.
9121        for col_idx in 0..left.arity() {
9122            let elem_size = left
9123                .schema()
9124                .column_type(col_idx)
9125                .map(|t| t.size_bytes())
9126                .unwrap_or(4);
9127            let inner_bytes = (inner_rows as usize)
9128                .checked_mul(elem_size)
9129                .ok_or_else(|| XlogError::Kernel("csm left_outer: inner_bytes overflow".into()))?;
9130            let unmatched_bytes = (unmatched_rows as usize)
9131                .checked_mul(elem_size)
9132                .ok_or_else(|| {
9133                    XlogError::Kernel("csm left_outer: unmatched_bytes overflow".into())
9134                })?;
9135            let total_bytes = inner_bytes
9136                .checked_add(unmatched_bytes)
9137                .ok_or_else(|| XlogError::Kernel("csm left_outer: total_bytes overflow".into()))?;
9138            let out_col = self.memory.alloc::<u8>(total_bytes)?;
9139            let dst_ptr = *out_col.device_ptr();
9140            // Fence alloc-ready → launch_stream for out_col.
9141            runtime
9142                .prepare_first_use(&out_col, launch_stream, Access::Write)
9143                .map_err(|e| {
9144                    XlogError::Kernel(format!(
9145                        "csm left_outer: prepare left out_col {} failed: {}",
9146                        col_idx, e
9147                    ))
9148                })?;
9149            if inner_bytes > 0 {
9150                let src_col = inner_left_buf
9151                    .as_ref()
9152                    .expect("inner_count > 0")
9153                    .column(col_idx)
9154                    .ok_or_else(|| XlogError::Kernel("inner_left col missing".into()))?;
9155                // SAFETY: dtod async on cu_stream.
9156                unsafe {
9157                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
9158                        dst_ptr,
9159                        *src_col.device_ptr(),
9160                        inner_bytes,
9161                        cu_stream.cu_stream(),
9162                    );
9163                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9164                        return Err(XlogError::Kernel(format!(
9165                            "csm left_outer: dtod inner_left col {} failed: {:?}",
9166                            col_idx, res
9167                        )));
9168                    }
9169                }
9170            }
9171            if unmatched_bytes > 0 {
9172                let src_col = unmatched_left.column(col_idx).ok_or_else(|| {
9173                    XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
9174                })?;
9175                // SAFETY: bounded by total_bytes.
9176                unsafe {
9177                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
9178                        dst_ptr + inner_bytes as u64,
9179                        *src_col.device_ptr(),
9180                        unmatched_bytes,
9181                        cu_stream.cu_stream(),
9182                    );
9183                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9184                        return Err(XlogError::Kernel(format!(
9185                            "csm left_outer: dtod unmatched col {} failed: {:?}",
9186                            col_idx, res
9187                        )));
9188                    }
9189                }
9190            }
9191            if let Some(b) = out_col.runtime_block() {
9192                runtime
9193                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
9194                    .map_err(|e| {
9195                        XlogError::Kernel(format!(
9196                            "csm left_outer: finish_block_use (left col {}) failed: {}",
9197                            col_idx, e
9198                        ))
9199                    })?;
9200            }
9201            result_columns.push(out_col.into());
9202        }
9203
9204        // Per-right-column: inner_right | zeros.
9205        for col_idx in 0..right.arity() {
9206            let elem_size = right
9207                .schema()
9208                .column_type(col_idx)
9209                .map(|t| t.size_bytes())
9210                .unwrap_or(4);
9211            let inner_bytes = (inner_rows as usize)
9212                .checked_mul(elem_size)
9213                .ok_or_else(|| {
9214                    XlogError::Kernel("csm left_outer: right inner_bytes overflow".into())
9215                })?;
9216            let unmatched_bytes = (unmatched_rows as usize)
9217                .checked_mul(elem_size)
9218                .ok_or_else(|| {
9219                    XlogError::Kernel("csm left_outer: right unmatched_bytes overflow".into())
9220                })?;
9221            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
9222                XlogError::Kernel("csm left_outer: right total_bytes overflow".into())
9223            })?;
9224            let out_col = self.memory.alloc::<u8>(total_bytes)?;
9225            let dst_ptr = *out_col.device_ptr();
9226            // Fence alloc-ready → launch_stream for out_col.
9227            runtime
9228                .prepare_first_use(&out_col, launch_stream, Access::Write)
9229                .map_err(|e| {
9230                    XlogError::Kernel(format!(
9231                        "csm left_outer: prepare right out_col {} failed: {}",
9232                        col_idx, e
9233                    ))
9234                })?;
9235            if total_bytes > 0 {
9236                // SAFETY: zero-fill whole column on cu_stream.
9237                unsafe {
9238                    let res = cudarc::driver::sys::cuMemsetD8Async(
9239                        dst_ptr,
9240                        0,
9241                        total_bytes,
9242                        cu_stream.cu_stream(),
9243                    );
9244                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9245                        return Err(XlogError::Kernel(format!(
9246                            "csm left_outer: zero-fill right col {} failed: {:?}",
9247                            col_idx, res
9248                        )));
9249                    }
9250                }
9251            }
9252            if inner_bytes > 0 {
9253                let src_col = inner_right_buf
9254                    .as_ref()
9255                    .expect("inner_count > 0")
9256                    .column(col_idx)
9257                    .ok_or_else(|| XlogError::Kernel("inner_right col missing".into()))?;
9258                // SAFETY: dtod async on cu_stream.
9259                unsafe {
9260                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
9261                        dst_ptr,
9262                        *src_col.device_ptr(),
9263                        inner_bytes,
9264                        cu_stream.cu_stream(),
9265                    );
9266                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9267                        return Err(XlogError::Kernel(format!(
9268                            "csm left_outer: dtod inner_right col {} failed: {:?}",
9269                            col_idx, res
9270                        )));
9271                    }
9272                }
9273            }
9274            if let Some(b) = out_col.runtime_block() {
9275                runtime
9276                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
9277                    .map_err(|e| {
9278                        XlogError::Kernel(format!(
9279                            "csm left_outer: finish_block_use (right col {}) failed: {}",
9280                            col_idx, e
9281                        ))
9282                    })?;
9283            }
9284            result_columns.push(out_col.into());
9285        }
9286
9287        rec_d.commit(runtime).map_err(|e| {
9288            XlogError::Kernel(format!("csm left_outer: phase-E commit failed: {}", e))
9289        })?;
9290
9291        // Guard the u32 metadata cast: `inner_count_u32` is
9292        // already u32, but `unmatched_rows` is read from the
9293        // device row count of `unmatched_left` and could push
9294        // `total_rows` past u32::MAX in pathological inputs.
9295        // Truncating `total_rows as u32` would corrupt the
9296        // host-side row-count cache and the device-side
9297        // `d_num_rows` scalar, leading to OOB reads in
9298        // downstream consumers.
9299        if total_rows > u32::MAX as u64 {
9300            return Err(XlogError::Kernel(format!(
9301                "csm left_outer: output row count {} exceeds u32::MAX",
9302                total_rows
9303            )));
9304        }
9305        let total_rows_u32 = total_rows as u32;
9306        let d_num_rows = self.upload_device_row_count(total_rows_u32)?;
9307        Ok(CudaBuffer::from_columns_with_host_count(
9308            result_columns,
9309            total_rows,
9310            d_num_rows,
9311            combined_schema,
9312            total_rows_u32,
9313        ))
9314    }
9315
9316    /// Indexed-Inner CSM using the deterministic binary-join path.
9317    ///
9318    /// Same deterministic count→scan→materialize algorithm as
9319    /// [`Self::hash_join_inner_v2_count_scan_materialize_recorded`]
9320    /// but skips pack-right + table-build — the cached
9321    /// [`crate::provider::JoinIndexV2`] supplies
9322    /// `index.packed_keys` and `&index.table`. Only the probe
9323    /// (left) side is packed on `launch_stream`.
9324    ///
9325    /// Reuses the three CSM kernels from the non-indexed inner path
9326    /// (`hash_join_probe_v2_count_per_row`,
9327    /// `hash_join_probe_v2_materialize`,
9328    /// `hash_join_total_from_scan`) — no new kernel additions.
9329    /// Composes `pack_keys_gpu_on_stream`,
9330    /// `multiblock_scan_u32_inplace_on_stream`, and
9331    /// `gather_buffer_by_indices_on_stream` unchanged from
9332    /// recorded helper paths.
9333    ///
9334    /// Index buffers (packed_keys + 4 table buckets) are
9335    /// owned by the caller and recorded as reads on
9336    /// `launch_stream` for the count and materialize
9337    /// recorders — dropping the index after the call returns
9338    /// is correctly serialized through the runtime's
9339    /// record-all + wait-all event chain.
9340    #[allow(clippy::too_many_arguments)]
9341    pub fn hash_join_inner_v2_with_index_count_scan_materialize_recorded(
9342        &self,
9343        left: &CudaBuffer,
9344        right: &CudaBuffer,
9345        left_keys: &[usize],
9346        right_keys: &[usize],
9347        index: &crate::provider::JoinIndexV2,
9348        max_output: Option<usize>,
9349        launch_stream: StreamId,
9350    ) -> Result<CudaBuffer> {
9351        use crate::launch::LaunchRecorder;
9352
9353        let runtime = self.memory.runtime().ok_or_else(|| {
9354            XlogError::Kernel(
9355                "hash_join_inner_v2_with_index_count_scan_materialize_recorded requires \
9356                 a runtime-backed GpuMemoryManager"
9357                    .to_string(),
9358            )
9359        })?;
9360        let cu_stream = runtime
9361            .stream_pool()
9362            .resolve(launch_stream)
9363            .ok_or_else(|| {
9364                XlogError::Kernel(format!(
9365                    "indexed CSM inner: launch_stream StreamId({}) does not resolve",
9366                    launch_stream.0
9367                ))
9368            })?;
9369
9370        // Validation (mirror legacy hash_join_v2_with_index +
9371        // CSM constraints).
9372        let left_rows = self.device_row_count(left)?;
9373        let right_rows = self.device_row_count(right)?;
9374        if left_rows > u32::MAX as usize || right_rows > u32::MAX as usize {
9375            return Err(XlogError::Kernel(format!(
9376                "Join supports at most {} rows per side (left={}, right={})",
9377                u32::MAX,
9378                left_rows,
9379                right_rows
9380            )));
9381        }
9382        if left_rows == 0 || right_rows == 0 {
9383            let combined_schema = self.combine_schemas(left.schema(), right.schema());
9384            return self.create_empty_buffer(combined_schema);
9385        }
9386        if left_keys.is_empty() || right_keys.is_empty() {
9387            return Err(XlogError::Kernel(
9388                "Join requires at least one key column".to_string(),
9389            ));
9390        }
9391        if left_keys.len() != right_keys.len() {
9392            return Err(XlogError::Kernel(
9393                "Left and right key columns must have same length".to_string(),
9394            ));
9395        }
9396        if left_keys.len() > 4 {
9397            return Err(XlogError::Kernel(
9398                "indexed CSM inner: max 4 key columns supported (pack_keys constraint)".to_string(),
9399            ));
9400        }
9401        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
9402            if l >= left.arity() {
9403                return Err(XlogError::Kernel(format!(
9404                    "Left key column index {} out of bounds (arity {})",
9405                    l,
9406                    left.arity()
9407                )));
9408            }
9409            if r >= right.arity() {
9410                return Err(XlogError::Kernel(format!(
9411                    "Right key column index {} out of bounds (arity {})",
9412                    r,
9413                    right.arity()
9414                )));
9415            }
9416            let lt = left.schema().column_type(l);
9417            let rt = right.schema().column_type(r);
9418            if lt != rt {
9419                return Err(XlogError::Kernel(format!(
9420                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
9421                    l, lt, r, rt
9422                )));
9423            }
9424        }
9425        if index.right_num_rows() != right_rows as u32 {
9426            return Err(XlogError::Kernel(
9427                "Join index row count does not match right relation".to_string(),
9428            ));
9429        }
9430        if index.right_keys() != right_keys {
9431            return Err(XlogError::Kernel(
9432                "Join index key columns do not match requested right_keys".to_string(),
9433            ));
9434        }
9435
9436        let probe_cap = left.num_rows() as u32;
9437        let table = &index.table;
9438
9439        // Pack only LEFT on launch_stream. Build side comes
9440        // from the cached index.
9441        let left_packed =
9442            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
9443        if left_packed.key_bytes != index.key_bytes {
9444            return Err(XlogError::Kernel(
9445                "Join key byte width mismatch between probe and cached index".to_string(),
9446            ));
9447        }
9448
9449        let device = self.device.inner();
9450        let block_size = 256u32;
9451        let probe_grid = probe_cap.div_ceil(block_size);
9452        let probe_config = LaunchConfig {
9453            grid_dim: (probe_grid, 1, 1),
9454            block_dim: (block_size, 1, 1),
9455            shared_mem_bytes: 0,
9456        };
9457
9458        // Allocate count + offsets + total scalar + overflow flag.
9459        let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
9460        let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
9461        let d_logical_count = self.memory.alloc::<u32>(1)?;
9462        let d_overflow = self.memory.alloc::<u8>(1)?;
9463        // Fence alloc-ready → launch_stream for both before memset.
9464        runtime
9465            .prepare_first_use(&d_overflow, launch_stream, Access::Write)
9466            .map_err(|e| {
9467                XlogError::Kernel(format!(
9468                    "indexed CSM inner: prepare d_overflow failed: {}",
9469                    e
9470                ))
9471            })?;
9472        runtime
9473            .prepare_first_use(&d_logical_count, launch_stream, Access::Write)
9474            .map_err(|e| {
9475                XlogError::Kernel(format!(
9476                    "indexed CSM inner: prepare d_logical_count failed: {}",
9477                    e
9478                ))
9479            })?;
9480        // Zero-init both scalars on launch_stream.
9481        // SAFETY: 1-byte and 4-byte runtime-backed buffers.
9482        unsafe {
9483            let res = cudarc::driver::sys::cuMemsetD8Async(
9484                *d_overflow.device_ptr(),
9485                0,
9486                1,
9487                cu_stream.cu_stream(),
9488            );
9489            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9490                return Err(XlogError::Kernel(format!(
9491                    "indexed CSM inner: cuMemsetD8Async (d_overflow) failed: {:?}",
9492                    res
9493                )));
9494            }
9495            let res = cudarc::driver::sys::cuMemsetD8Async(
9496                *d_logical_count.device_ptr(),
9497                0,
9498                std::mem::size_of::<u32>(),
9499                cu_stream.cu_stream(),
9500            );
9501            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9502                return Err(XlogError::Kernel(format!(
9503                    "indexed CSM inner: cuMemsetD8Async (d_logical_count) failed: {:?}",
9504                    res
9505                )));
9506            }
9507        }
9508
9509        // Count/scan recorder. Reads on left_packed + index
9510        // buffers + left.num_rows_device BEFORE preflight;
9511        // post-preflight fresh writes for the four newly
9512        // allocated buffers.
9513        let count_func = device
9514            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
9515            .ok_or_else(|| {
9516                XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
9517            })?;
9518        let total_func = device
9519            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
9520            .ok_or_else(|| {
9521                XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
9522            })?;
9523
9524        let mut rec_count = LaunchRecorder::new_strict(launch_stream);
9525        rec_count.read(&left_packed.hashes);
9526        rec_count.read(&left_packed.packed_keys);
9527        rec_count.read(&index.packed_keys);
9528        rec_count.read(&table.bucket_offsets);
9529        rec_count.read(&table.bucket_counts);
9530        rec_count.read(&table.bucket_entries);
9531        rec_count.read(&table.bucket_entry_hashes);
9532        rec_count.read(left.num_rows_device());
9533        rec_count.write(&per_probe_count);
9534        rec_count.write(&per_probe_offsets);
9535        rec_count.write(&d_logical_count);
9536        rec_count.write(&d_overflow);
9537        rec_count.preflight(runtime).map_err(|e| {
9538            XlogError::Kernel(format!(
9539                "indexed CSM inner: count/scan preflight failed: {}",
9540                e
9541            ))
9542        })?;
9543
9544        // Step 3: count_per_row.
9545        // SAFETY: 12-arg signature matches the PTX kernel.
9546        unsafe {
9547            count_func.clone().launch_on_stream(
9548                &cu_stream,
9549                probe_config,
9550                (
9551                    &left_packed.hashes,
9552                    left.num_rows_device(),
9553                    probe_cap,
9554                    &table.bucket_offsets,
9555                    &table.bucket_counts,
9556                    &table.bucket_entries,
9557                    &table.bucket_entry_hashes,
9558                    table.bucket_mask,
9559                    &left_packed.packed_keys,
9560                    &index.packed_keys,
9561                    index.key_bytes,
9562                    &per_probe_count,
9563                ),
9564            )
9565        }
9566        .map_err(|e| {
9567            XlogError::Kernel(format!(
9568                "hash_join_probe_v2_count_per_row (on_stream, indexed) failed: {}",
9569                e
9570            ))
9571        })?;
9572
9573        // Step 4: dtod-async copy per_probe_count → per_probe_offsets,
9574        // then exclusive in-place scan.
9575        // SAFETY: same length, both runtime-backed u32 buffers.
9576        unsafe {
9577            let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
9578                *per_probe_offsets.device_ptr(),
9579                *per_probe_count.device_ptr(),
9580                (probe_cap as usize) * std::mem::size_of::<u32>(),
9581                cu_stream.cu_stream(),
9582            );
9583            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9584                return Err(XlogError::Kernel(format!(
9585                    "indexed CSM inner: cuMemcpyDtoDAsync (count → offsets) failed: {:?}",
9586                    res
9587                )));
9588            }
9589        }
9590        self.multiblock_scan_u32_inplace_on_stream(
9591            &mut per_probe_offsets,
9592            probe_cap,
9593            &cu_stream,
9594            launch_stream,
9595            runtime,
9596        )?;
9597
9598        // Step 5: total_from_scan.
9599        let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(right_rows as u64);
9600        let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
9601        // SAFETY: 7-arg signature.
9602        unsafe {
9603            total_func.clone().launch_on_stream(
9604                &cu_stream,
9605                LaunchConfig {
9606                    grid_dim: (1, 1, 1),
9607                    block_dim: (1, 1, 1),
9608                    shared_mem_bytes: 0,
9609                },
9610                (
9611                    &per_probe_offsets,
9612                    &per_probe_count,
9613                    left.num_rows_device(),
9614                    probe_cap,
9615                    materialize_capacity_u32,
9616                    &d_logical_count,
9617                    &d_overflow,
9618                ),
9619            )
9620        }
9621        .map_err(|e| {
9622            XlogError::Kernel(format!(
9623                "hash_join_total_from_scan (on_stream, indexed) failed: {}",
9624                e
9625            ))
9626        })?;
9627
9628        rec_count.commit(runtime).map_err(|e| {
9629            XlogError::Kernel(format!(
9630                "indexed CSM inner: count/scan commit failed: {}",
9631                e
9632            ))
9633        })?;
9634
9635        cu_stream.synchronize().map_err(|e| {
9636            XlogError::Kernel(format!(
9637                "indexed CSM inner: sync (total read) failed: {}",
9638                e
9639            ))
9640        })?;
9641        let total = self.read_join_output_count_metadata(&d_logical_count)? as u64;
9642        let requested = max_output
9643            .map(|limit| (limit as u64).min(total))
9644            .unwrap_or(total);
9645        if requested == 0 {
9646            let combined_schema = self.combine_schemas(left.schema(), right.schema());
9647            return self.create_empty_buffer(combined_schema);
9648        }
9649        if requested > u32::MAX as u64 {
9650            return Err(XlogError::Kernel(format!(
9651                "Join produced {} rows which exceeds the u32 index limit",
9652                requested
9653            )));
9654        }
9655        let output_capacity = requested as u32;
9656
9657        // Step 6: materialize.
9658        let d_output_left = self.memory.alloc::<u32>(output_capacity as usize)?;
9659        let d_output_right = self.memory.alloc::<u32>(output_capacity as usize)?;
9660
9661        let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
9662        rec_mat.read(&left_packed.hashes);
9663        rec_mat.read(&left_packed.packed_keys);
9664        rec_mat.read(&index.packed_keys);
9665        rec_mat.read(&table.bucket_offsets);
9666        rec_mat.read(&table.bucket_counts);
9667        rec_mat.read(&table.bucket_entries);
9668        rec_mat.read(&table.bucket_entry_hashes);
9669        rec_mat.read(&per_probe_offsets);
9670        rec_mat.read(left.num_rows_device());
9671        rec_mat.write(&d_output_left);
9672        rec_mat.write(&d_output_right);
9673        // d_overflow is consumed by the materialize kernel — recorder must own it through commit.
9674        rec_mat.write(&d_overflow);
9675        rec_mat.preflight(runtime).map_err(|e| {
9676            XlogError::Kernel(format!(
9677                "indexed CSM inner: materialize preflight failed: {}",
9678                e
9679            ))
9680        })?;
9681
9682        let materialize_func = device
9683            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
9684            .ok_or_else(|| {
9685                XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
9686            })?;
9687        // SAFETY: 16-arg signature; raw-param launch path.
9688        unsafe {
9689            let mut params: Vec<*mut c_void> = vec![
9690                (&left_packed.hashes).as_kernel_param(),
9691                left.num_rows_device().as_kernel_param(),
9692                probe_cap.as_kernel_param(),
9693                (&table.bucket_offsets).as_kernel_param(),
9694                (&table.bucket_counts).as_kernel_param(),
9695                (&table.bucket_entries).as_kernel_param(),
9696                (&table.bucket_entry_hashes).as_kernel_param(),
9697                table.bucket_mask.as_kernel_param(),
9698                (&left_packed.packed_keys).as_kernel_param(),
9699                (&index.packed_keys).as_kernel_param(),
9700                index.key_bytes.as_kernel_param(),
9701                (&per_probe_offsets).as_kernel_param(),
9702                output_capacity.as_kernel_param(),
9703                (&d_output_left).as_kernel_param(),
9704                (&d_output_right).as_kernel_param(),
9705                (&d_overflow).as_kernel_param(),
9706            ];
9707            materialize_func
9708                .clone()
9709                .launch_on_stream(&cu_stream, probe_config, &mut params)
9710                .map_err(|e| {
9711                    XlogError::Kernel(format!(
9712                        "hash_join_probe_v2_materialize (on_stream, indexed) failed: {}",
9713                        e
9714                    ))
9715                })?;
9716        }
9717
9718        rec_mat.commit(runtime).map_err(|e| {
9719            XlogError::Kernel(format!(
9720                "indexed CSM inner: materialize commit failed: {}",
9721                e
9722            ))
9723        })?;
9724
9725        cu_stream.synchronize().map_err(|e| {
9726            XlogError::Kernel(format!(
9727                "indexed CSM inner: sync (post-materialize) failed: {}",
9728                e
9729            ))
9730        })?;
9731
9732        // Step 7: gather both sides on launch_stream.
9733        let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
9734        for col_idx in 0..left.columns.len() {
9735            let c = left
9736                .column(col_idx)
9737                .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
9738            rec_gather.read_column(c);
9739        }
9740        for col_idx in 0..right.columns.len() {
9741            let c = right
9742                .column(col_idx)
9743                .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
9744            rec_gather.read_column(c);
9745        }
9746        rec_gather.read(&d_output_left);
9747        rec_gather.read(&d_output_right);
9748        rec_gather.preflight(runtime).map_err(|e| {
9749            XlogError::Kernel(format!("indexed CSM inner: gather preflight failed: {}", e))
9750        })?;
9751        let gathered_left = self.gather_buffer_by_indices_on_stream(
9752            left,
9753            &d_output_left,
9754            output_capacity,
9755            &cu_stream,
9756            launch_stream,
9757            runtime,
9758        )?;
9759        let gathered_right = self.gather_buffer_by_indices_on_stream(
9760            right,
9761            &d_output_right,
9762            output_capacity,
9763            &cu_stream,
9764            launch_stream,
9765            runtime,
9766        )?;
9767        rec_gather.commit(runtime).map_err(|e| {
9768            XlogError::Kernel(format!("indexed CSM inner: gather commit failed: {}", e))
9769        })?;
9770
9771        let combined_schema = self.combine_schemas(left.schema(), right.schema());
9772        let mut result_columns = Vec::with_capacity(combined_schema.arity());
9773        result_columns.extend(gathered_left.columns);
9774        result_columns.extend(gathered_right.columns);
9775        self.buffer_from_columns(result_columns, output_capacity as u64, combined_schema)
9776    }
9777
9778    /// Indexed LeftOuter CSM using the indexed deterministic binary-join path.
9779    ///
9780    /// Combines the indexed-Inner CSM Phases A+B (probe-only
9781    /// pack on `launch_stream`; cached
9782    /// [`crate::provider::JoinIndexV2`] supplies the build
9783    /// side's `packed_keys` and `&index.table`) with the
9784    /// non-indexed LeftOuter CSM Phases C–E (per-probe
9785    /// unmatched-mask via `hash_join_csm_unmatched_mask` →
9786    /// recorded compact tail → gather matched left + right →
9787    /// per-column `inner | unmatched` / `inner | zeros`
9788    /// concat). Same row-ordering invariant as
9789    /// [`Self::hash_join_left_outer_v2_count_scan_materialize_recorded`]:
9790    /// matched rows first, unmatched-with-zero-right second.
9791    ///
9792    /// No new kernels — reuses the four already-migrated CSM
9793    /// kernels plus `hash_join_csm_unmatched_mask` from
9794    /// the non-indexed LeftOuter CSM path.
9795    ///
9796    /// # Errors
9797    ///   * Manager not runtime-backed.
9798    ///   * `launch_stream` does not resolve.
9799    ///   * `left_keys`/`right_keys` empty, mismatched length,
9800    ///     or > 4 (pack_keys constraint).
9801    ///   * Key column type mismatch.
9802    ///   * `index.right_num_rows()` mismatches the right
9803    ///     buffer's logical row count.
9804    ///   * `index.right_keys()` mismatches the requested
9805    ///     `right_keys`.
9806    ///   * `left_packed.key_bytes` mismatches `index.key_bytes`.
9807    ///   * Preflight / kernel / commit failures.
9808    #[allow(clippy::too_many_arguments)]
9809    pub fn hash_join_left_outer_v2_with_index_count_scan_materialize_recorded(
9810        &self,
9811        left: &CudaBuffer,
9812        right: &CudaBuffer,
9813        left_keys: &[usize],
9814        right_keys: &[usize],
9815        index: &crate::provider::JoinIndexV2,
9816        max_output: Option<usize>,
9817        launch_stream: StreamId,
9818    ) -> Result<CudaBuffer> {
9819        use crate::launch::LaunchRecorder;
9820
9821        let runtime = self.memory.runtime().ok_or_else(|| {
9822            XlogError::Kernel(
9823                "hash_join_left_outer_v2_with_index_count_scan_materialize_recorded requires \
9824                 a runtime-backed GpuMemoryManager"
9825                    .to_string(),
9826            )
9827        })?;
9828        let cu_stream = runtime
9829            .stream_pool()
9830            .resolve(launch_stream)
9831            .ok_or_else(|| {
9832                XlogError::Kernel(format!(
9833                    "indexed csm left_outer: launch_stream StreamId({}) does not resolve",
9834                    launch_stream.0
9835                ))
9836            })?;
9837
9838        // Validation (mirror hash_join_inner_v2_with_index_count_scan_materialize_recorded
9839        // + non-indexed LeftOuter CSM).
9840        let num_left = self.device_row_count(left)?;
9841        let num_right = self.device_row_count(right)?;
9842        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
9843            return Err(XlogError::Kernel(format!(
9844                "Join supports at most {} rows per side (left={}, right={})",
9845                u32::MAX,
9846                num_left,
9847                num_right
9848            )));
9849        }
9850        if num_left == 0 {
9851            let combined_schema = self.combine_schemas(left.schema(), right.schema());
9852            return self.create_empty_buffer(combined_schema);
9853        }
9854        if num_right == 0 {
9855            // Empty right → all left rows with zero-filled
9856            // right columns. Same legacy fallback as the
9857            // non-indexed LeftOuter CSM and
9858            // `hash_join_left_outer_v2_recorded`.
9859            return self.left_outer_with_nulls(left, right);
9860        }
9861        if left_keys.is_empty() || right_keys.is_empty() {
9862            return Err(XlogError::Kernel(
9863                "Join requires at least one key column".to_string(),
9864            ));
9865        }
9866        if left_keys.len() != right_keys.len() {
9867            return Err(XlogError::Kernel(
9868                "Left and right key columns must have same length".to_string(),
9869            ));
9870        }
9871        if left_keys.len() > 4 {
9872            return Err(XlogError::Kernel(
9873                "indexed csm left_outer: max 4 key columns supported (pack_keys constraint)"
9874                    .to_string(),
9875            ));
9876        }
9877        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
9878            if l >= left.arity() {
9879                return Err(XlogError::Kernel(format!(
9880                    "Left key column index {} out of bounds (arity {})",
9881                    l,
9882                    left.arity()
9883                )));
9884            }
9885            if r >= right.arity() {
9886                return Err(XlogError::Kernel(format!(
9887                    "Right key column index {} out of bounds (arity {})",
9888                    r,
9889                    right.arity()
9890                )));
9891            }
9892            let lt = left.schema().column_type(l);
9893            let rt = right.schema().column_type(r);
9894            if lt != rt {
9895                return Err(XlogError::Kernel(format!(
9896                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
9897                    l, lt, r, rt
9898                )));
9899            }
9900        }
9901        if index.right_num_rows() != num_right as u32 {
9902            return Err(XlogError::Kernel(
9903                "Join index row count does not match right relation".to_string(),
9904            ));
9905        }
9906        if index.right_keys() != right_keys {
9907            return Err(XlogError::Kernel(
9908                "Join index key columns do not match requested right_keys".to_string(),
9909            ));
9910        }
9911
9912        // Base `probe_cap` on the validated logical row count.
9913        let probe_cap = u32::try_from(num_left).map_err(|_| {
9914            XlogError::Kernel("indexed csm left_outer: left row count exceeds u32::MAX".to_string())
9915        })?;
9916
9917        let table = &index.table;
9918
9919        // Pack only LEFT on launch_stream. Build side comes from
9920        // the cached index.
9921        let left_packed =
9922            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
9923        if left_packed.key_bytes != index.key_bytes {
9924            return Err(XlogError::Kernel(
9925                "Join key byte width mismatch between probe and cached index".to_string(),
9926            ));
9927        }
9928
9929        let device = self.device.inner();
9930        let block_size = 256u32;
9931        let probe_grid = probe_cap.div_ceil(block_size);
9932        let probe_config = LaunchConfig {
9933            grid_dim: (probe_grid, 1, 1),
9934            block_dim: (block_size, 1, 1),
9935            shared_mem_bytes: 0,
9936        };
9937
9938        // Phase A: count + scan + total.
9939        let per_probe_count = self.memory.alloc::<u32>(probe_cap as usize)?;
9940        let mut per_probe_offsets = self.memory.alloc::<u32>(probe_cap as usize)?;
9941        let d_logical_count = self.memory.alloc::<u32>(1)?;
9942        let d_overflow = self.memory.alloc::<u8>(1)?;
9943        runtime
9944            .prepare_first_use(&d_overflow, launch_stream, Access::Write)
9945            .map_err(|e| {
9946                XlogError::Kernel(format!(
9947                    "indexed csm left_outer: prepare d_overflow failed: {}",
9948                    e
9949                ))
9950            })?;
9951        runtime
9952            .prepare_first_use(&d_logical_count, launch_stream, Access::Write)
9953            .map_err(|e| {
9954                XlogError::Kernel(format!(
9955                    "indexed csm left_outer: prepare d_logical_count failed: {}",
9956                    e
9957                ))
9958            })?;
9959        // Zero-init scalars on launch_stream.
9960        // SAFETY: 1-byte and 4-byte runtime-backed buffers.
9961        unsafe {
9962            let res = cudarc::driver::sys::cuMemsetD8Async(
9963                *d_overflow.device_ptr(),
9964                0,
9965                1,
9966                cu_stream.cu_stream(),
9967            );
9968            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9969                return Err(XlogError::Kernel(format!(
9970                    "indexed csm left_outer: cuMemsetD8Async (d_overflow) failed: {:?}",
9971                    res
9972                )));
9973            }
9974            let res = cudarc::driver::sys::cuMemsetD8Async(
9975                *d_logical_count.device_ptr(),
9976                0,
9977                std::mem::size_of::<u32>(),
9978                cu_stream.cu_stream(),
9979            );
9980            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
9981                return Err(XlogError::Kernel(format!(
9982                    "indexed csm left_outer: cuMemsetD8Async (d_logical_count) failed: {:?}",
9983                    res
9984                )));
9985            }
9986        }
9987
9988        let count_func = device
9989            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_COUNT_PER_ROW)
9990            .ok_or_else(|| {
9991                XlogError::Kernel("hash_join_probe_v2_count_per_row kernel not found".to_string())
9992            })?;
9993        let total_func = device
9994            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_TOTAL_FROM_SCAN)
9995            .ok_or_else(|| {
9996                XlogError::Kernel("hash_join_total_from_scan kernel not found".to_string())
9997            })?;
9998
9999        let mut rec_count = LaunchRecorder::new_strict(launch_stream);
10000        rec_count.read(&left_packed.hashes);
10001        rec_count.read(&left_packed.packed_keys);
10002        rec_count.read(&index.packed_keys);
10003        rec_count.read(&table.bucket_offsets);
10004        rec_count.read(&table.bucket_counts);
10005        rec_count.read(&table.bucket_entries);
10006        rec_count.read(&table.bucket_entry_hashes);
10007        rec_count.read(left.num_rows_device());
10008        rec_count.write(&per_probe_count);
10009        rec_count.write(&per_probe_offsets);
10010        rec_count.write(&d_logical_count);
10011        rec_count.write(&d_overflow);
10012        rec_count.preflight(runtime).map_err(|e| {
10013            XlogError::Kernel(format!(
10014                "indexed csm left_outer: count/scan preflight failed: {}",
10015                e
10016            ))
10017        })?;
10018
10019        // Step A1: count_per_row.
10020        // SAFETY: 12-arg signature.
10021        unsafe {
10022            count_func.clone().launch_on_stream(
10023                &cu_stream,
10024                probe_config,
10025                (
10026                    &left_packed.hashes,
10027                    left.num_rows_device(),
10028                    probe_cap,
10029                    &table.bucket_offsets,
10030                    &table.bucket_counts,
10031                    &table.bucket_entries,
10032                    &table.bucket_entry_hashes,
10033                    table.bucket_mask,
10034                    &left_packed.packed_keys,
10035                    &index.packed_keys,
10036                    index.key_bytes,
10037                    &per_probe_count,
10038                ),
10039            )
10040        }
10041        .map_err(|e| {
10042            XlogError::Kernel(format!(
10043                "hash_join_probe_v2_count_per_row (indexed csm left_outer) failed: {}",
10044                e
10045            ))
10046        })?;
10047
10048        // Step A2: dtod copy + exclusive scan.
10049        // SAFETY: same length, both runtime-backed u32.
10050        unsafe {
10051            let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
10052                *per_probe_offsets.device_ptr(),
10053                *per_probe_count.device_ptr(),
10054                (probe_cap as usize) * std::mem::size_of::<u32>(),
10055                cu_stream.cu_stream(),
10056            );
10057            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10058                return Err(XlogError::Kernel(format!(
10059                    "indexed csm left_outer: cuMemcpyDtoDAsync (count → offsets) failed: {:?}",
10060                    res
10061                )));
10062            }
10063        }
10064        self.multiblock_scan_u32_inplace_on_stream(
10065            &mut per_probe_offsets,
10066            probe_cap,
10067            &cu_stream,
10068            launch_stream,
10069            runtime,
10070        )?;
10071
10072        // Step A3: total_from_scan — writes d_logical_count + d_overflow.
10073        let materialize_capacity_bound: u64 = (probe_cap as u64).saturating_mul(num_right as u64);
10074        let materialize_capacity_u32 = materialize_capacity_bound.min(u32::MAX as u64) as u32;
10075        // SAFETY: 7-arg signature.
10076        unsafe {
10077            total_func.clone().launch_on_stream(
10078                &cu_stream,
10079                LaunchConfig {
10080                    grid_dim: (1, 1, 1),
10081                    block_dim: (1, 1, 1),
10082                    shared_mem_bytes: 0,
10083                },
10084                (
10085                    &per_probe_offsets,
10086                    &per_probe_count,
10087                    left.num_rows_device(),
10088                    probe_cap,
10089                    materialize_capacity_u32,
10090                    &d_logical_count,
10091                    &d_overflow,
10092                ),
10093            )
10094        }
10095        .map_err(|e| {
10096            XlogError::Kernel(format!(
10097                "hash_join_total_from_scan (indexed csm left_outer) failed: {}",
10098                e
10099            ))
10100        })?;
10101
10102        rec_count.commit(runtime).map_err(|e| {
10103            XlogError::Kernel(format!(
10104                "indexed csm left_outer: count/scan commit failed: {}",
10105                e
10106            ))
10107        })?;
10108
10109        cu_stream.synchronize().map_err(|e| {
10110            XlogError::Kernel(format!(
10111                "indexed csm left_outer: sync (count read) failed: {}",
10112                e
10113            ))
10114        })?;
10115        let inner_total = self.read_join_output_count_metadata(&d_logical_count)? as u64;
10116        let inner_clamped = max_output
10117            .map(|limit| (limit as u64).min(inner_total))
10118            .unwrap_or(inner_total);
10119        if inner_clamped > u32::MAX as u64 {
10120            return Err(XlogError::Kernel(format!(
10121                "Join produced {} matched rows which exceeds the u32 index limit",
10122                inner_clamped
10123            )));
10124        }
10125        let inner_count_u32 = inner_clamped as u32;
10126
10127        // Phase B: materialize matched index pairs.
10128        let materialize_func = device
10129            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2_MATERIALIZE)
10130            .ok_or_else(|| {
10131                XlogError::Kernel("hash_join_probe_v2_materialize kernel not found".to_string())
10132            })?;
10133        let d_output_left = self.memory.alloc::<u32>(inner_count_u32.max(1) as usize)?;
10134        let d_output_right = self.memory.alloc::<u32>(inner_count_u32.max(1) as usize)?;
10135
10136        let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
10137        rec_mat.read(&left_packed.hashes);
10138        rec_mat.read(&left_packed.packed_keys);
10139        rec_mat.read(&index.packed_keys);
10140        rec_mat.read(&table.bucket_offsets);
10141        rec_mat.read(&table.bucket_counts);
10142        rec_mat.read(&table.bucket_entries);
10143        rec_mat.read(&table.bucket_entry_hashes);
10144        rec_mat.read(&per_probe_offsets);
10145        rec_mat.read(left.num_rows_device());
10146        rec_mat.write(&d_output_left);
10147        rec_mat.write(&d_output_right);
10148        // d_overflow is consumed by the materialize kernel — recorder must own it through commit.
10149        rec_mat.write(&d_overflow);
10150        rec_mat.preflight(runtime).map_err(|e| {
10151            XlogError::Kernel(format!(
10152                "indexed csm left_outer: materialize preflight failed: {}",
10153                e
10154            ))
10155        })?;
10156        if inner_count_u32 > 0 {
10157            // SAFETY: 16-arg signature; raw-param launch.
10158            unsafe {
10159                let mut params: Vec<*mut c_void> = vec![
10160                    (&left_packed.hashes).as_kernel_param(),
10161                    left.num_rows_device().as_kernel_param(),
10162                    probe_cap.as_kernel_param(),
10163                    (&table.bucket_offsets).as_kernel_param(),
10164                    (&table.bucket_counts).as_kernel_param(),
10165                    (&table.bucket_entries).as_kernel_param(),
10166                    (&table.bucket_entry_hashes).as_kernel_param(),
10167                    table.bucket_mask.as_kernel_param(),
10168                    (&left_packed.packed_keys).as_kernel_param(),
10169                    (&index.packed_keys).as_kernel_param(),
10170                    index.key_bytes.as_kernel_param(),
10171                    (&per_probe_offsets).as_kernel_param(),
10172                    inner_count_u32.as_kernel_param(),
10173                    (&d_output_left).as_kernel_param(),
10174                    (&d_output_right).as_kernel_param(),
10175                    (&d_overflow).as_kernel_param(),
10176                ];
10177                materialize_func
10178                    .clone()
10179                    .launch_on_stream(&cu_stream, probe_config, &mut params)
10180                    .map_err(|e| {
10181                        XlogError::Kernel(format!(
10182                            "hash_join_probe_v2_materialize (indexed csm left_outer) failed: {}",
10183                            e
10184                        ))
10185                    })?;
10186            }
10187        }
10188        rec_mat.commit(runtime).map_err(|e| {
10189            XlogError::Kernel(format!(
10190                "indexed csm left_outer: materialize commit failed: {}",
10191                e
10192            ))
10193        })?;
10194
10195        // Phase C: unmatched-left mask + recorded compact tail.
10196        let d_unmatched_mask = self.memory.alloc::<u8>(probe_cap as usize)?;
10197        let unmatched_mask_func = device
10198            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_CSM_UNMATCHED_MASK)
10199            .ok_or_else(|| {
10200                XlogError::Kernel("hash_join_csm_unmatched_mask kernel not found".to_string())
10201            })?;
10202        let mut rec_um = LaunchRecorder::new_strict(launch_stream);
10203        rec_um.read(&per_probe_count);
10204        rec_um.read(left.num_rows_device());
10205        rec_um.write(&d_unmatched_mask);
10206        rec_um.preflight(runtime).map_err(|e| {
10207            XlogError::Kernel(format!(
10208                "indexed csm left_outer: unmatched mask preflight failed: {}",
10209                e
10210            ))
10211        })?;
10212        // SAFETY: 4-arg signature.
10213        unsafe {
10214            unmatched_mask_func.clone().launch_on_stream(
10215                &cu_stream,
10216                probe_config,
10217                (
10218                    &per_probe_count,
10219                    left.num_rows_device(),
10220                    probe_cap,
10221                    &d_unmatched_mask,
10222                ),
10223            )
10224        }
10225        .map_err(|e| {
10226            XlogError::Kernel(format!(
10227                "hash_join_csm_unmatched_mask (indexed csm left_outer) failed: {}",
10228                e
10229            ))
10230        })?;
10231        rec_um.commit(runtime).map_err(|e| {
10232            XlogError::Kernel(format!(
10233                "indexed csm left_outer: unmatched mask commit failed: {}",
10234                e
10235            ))
10236        })?;
10237
10238        let unmatched_left = self.compact_buffer_by_device_mask_counted_recorded(
10239            left,
10240            &d_unmatched_mask,
10241            launch_stream,
10242        )?;
10243        let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
10244        let total_rows = (inner_count_u32 as u64) + unmatched_rows;
10245
10246        let combined_schema = self.combine_schemas(left.schema(), right.schema());
10247        if total_rows == 0 {
10248            return self.create_empty_buffer(combined_schema);
10249        }
10250
10251        // Phase D: gather matched left + right (only if inner_count > 0).
10252        let inner_left_buf;
10253        let inner_right_buf;
10254        if inner_count_u32 > 0 {
10255            let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
10256            for col_idx in 0..left.columns.len() {
10257                let c = left.column(col_idx).ok_or_else(|| {
10258                    XlogError::Kernel(format!("Left column {} not found", col_idx))
10259                })?;
10260                rec_gather.read_column(c);
10261            }
10262            for col_idx in 0..right.columns.len() {
10263                let c = right.column(col_idx).ok_or_else(|| {
10264                    XlogError::Kernel(format!("Right column {} not found", col_idx))
10265                })?;
10266                rec_gather.read_column(c);
10267            }
10268            rec_gather.read(&d_output_left);
10269            rec_gather.read(&d_output_right);
10270            rec_gather.preflight(runtime).map_err(|e| {
10271                XlogError::Kernel(format!(
10272                    "indexed csm left_outer: gather preflight failed: {}",
10273                    e
10274                ))
10275            })?;
10276            inner_left_buf = Some(self.gather_buffer_by_indices_on_stream(
10277                left,
10278                &d_output_left,
10279                inner_count_u32,
10280                &cu_stream,
10281                launch_stream,
10282                runtime,
10283            )?);
10284            inner_right_buf = Some(self.gather_buffer_by_indices_on_stream(
10285                right,
10286                &d_output_right,
10287                inner_count_u32,
10288                &cu_stream,
10289                launch_stream,
10290                runtime,
10291            )?);
10292            rec_gather.commit(runtime).map_err(|e| {
10293                XlogError::Kernel(format!(
10294                    "indexed csm left_outer: gather commit failed: {}",
10295                    e
10296                ))
10297            })?;
10298        } else {
10299            inner_left_buf = None;
10300            inner_right_buf = None;
10301        }
10302
10303        // Phase E: per-column dtod-async concat.
10304        let mut rec_d = LaunchRecorder::new_strict(launch_stream);
10305        for col_idx in 0..unmatched_left.columns.len() {
10306            let c = unmatched_left.column(col_idx).ok_or_else(|| {
10307                XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
10308            })?;
10309            rec_d.read_column(c);
10310        }
10311        if let Some(b) = inner_left_buf.as_ref() {
10312            for col_idx in 0..b.columns.len() {
10313                let c = b.column(col_idx).ok_or_else(|| {
10314                    XlogError::Kernel(format!("inner_left col {} not found", col_idx))
10315                })?;
10316                rec_d.read_column(c);
10317            }
10318        }
10319        if let Some(b) = inner_right_buf.as_ref() {
10320            for col_idx in 0..b.columns.len() {
10321                let c = b.column(col_idx).ok_or_else(|| {
10322                    XlogError::Kernel(format!("inner_right col {} not found", col_idx))
10323                })?;
10324                rec_d.read_column(c);
10325            }
10326        }
10327        rec_d.preflight(runtime).map_err(|e| {
10328            XlogError::Kernel(format!(
10329                "indexed csm left_outer: phase-E preflight failed: {}",
10330                e
10331            ))
10332        })?;
10333
10334        let inner_rows = inner_count_u32 as u64;
10335        let mut result_columns: Vec<CudaColumn> = Vec::with_capacity(combined_schema.arity());
10336
10337        // Per-left-column: inner_left | unmatched_left.
10338        for col_idx in 0..left.arity() {
10339            let elem_size = left
10340                .schema()
10341                .column_type(col_idx)
10342                .map(|t| t.size_bytes())
10343                .unwrap_or(4);
10344            let inner_bytes = (inner_rows as usize)
10345                .checked_mul(elem_size)
10346                .ok_or_else(|| {
10347                    XlogError::Kernel("indexed csm left_outer: inner_bytes overflow".into())
10348                })?;
10349            let unmatched_bytes = (unmatched_rows as usize)
10350                .checked_mul(elem_size)
10351                .ok_or_else(|| {
10352                    XlogError::Kernel("indexed csm left_outer: unmatched_bytes overflow".into())
10353                })?;
10354            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
10355                XlogError::Kernel("indexed csm left_outer: total_bytes overflow".into())
10356            })?;
10357            let out_col = self.memory.alloc::<u8>(total_bytes)?;
10358            let dst_ptr = *out_col.device_ptr();
10359            runtime
10360                .prepare_first_use(&out_col, launch_stream, Access::Write)
10361                .map_err(|e| {
10362                    XlogError::Kernel(format!(
10363                        "indexed csm left_outer: prepare left out_col {} failed: {}",
10364                        col_idx, e
10365                    ))
10366                })?;
10367            if inner_bytes > 0 {
10368                let src_col = inner_left_buf
10369                    .as_ref()
10370                    .expect("inner_count > 0")
10371                    .column(col_idx)
10372                    .ok_or_else(|| XlogError::Kernel("inner_left col missing".into()))?;
10373                // SAFETY: dtod async on cu_stream.
10374                unsafe {
10375                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
10376                        dst_ptr,
10377                        *src_col.device_ptr(),
10378                        inner_bytes,
10379                        cu_stream.cu_stream(),
10380                    );
10381                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10382                        return Err(XlogError::Kernel(format!(
10383                            "indexed csm left_outer: dtod inner_left col {} failed: {:?}",
10384                            col_idx, res
10385                        )));
10386                    }
10387                }
10388            }
10389            if unmatched_bytes > 0 {
10390                let src_col = unmatched_left.column(col_idx).ok_or_else(|| {
10391                    XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
10392                })?;
10393                // SAFETY: bounded by total_bytes.
10394                unsafe {
10395                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
10396                        dst_ptr + inner_bytes as u64,
10397                        *src_col.device_ptr(),
10398                        unmatched_bytes,
10399                        cu_stream.cu_stream(),
10400                    );
10401                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10402                        return Err(XlogError::Kernel(format!(
10403                            "indexed csm left_outer: dtod unmatched col {} failed: {:?}",
10404                            col_idx, res
10405                        )));
10406                    }
10407                }
10408            }
10409            if let Some(b) = out_col.runtime_block() {
10410                runtime
10411                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
10412                    .map_err(|e| {
10413                        XlogError::Kernel(format!(
10414                            "indexed csm left_outer: finish_block_use (left col {}) failed: {}",
10415                            col_idx, e
10416                        ))
10417                    })?;
10418            }
10419            result_columns.push(out_col.into());
10420        }
10421
10422        // Per-right-column: inner_right | zeros.
10423        for col_idx in 0..right.arity() {
10424            let elem_size = right
10425                .schema()
10426                .column_type(col_idx)
10427                .map(|t| t.size_bytes())
10428                .unwrap_or(4);
10429            let inner_bytes = (inner_rows as usize)
10430                .checked_mul(elem_size)
10431                .ok_or_else(|| {
10432                    XlogError::Kernel("indexed csm left_outer: right inner_bytes overflow".into())
10433                })?;
10434            let unmatched_bytes = (unmatched_rows as usize)
10435                .checked_mul(elem_size)
10436                .ok_or_else(|| {
10437                    XlogError::Kernel(
10438                        "indexed csm left_outer: right unmatched_bytes overflow".into(),
10439                    )
10440                })?;
10441            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
10442                XlogError::Kernel("indexed csm left_outer: right total_bytes overflow".into())
10443            })?;
10444            let out_col = self.memory.alloc::<u8>(total_bytes)?;
10445            let dst_ptr = *out_col.device_ptr();
10446            runtime
10447                .prepare_first_use(&out_col, launch_stream, Access::Write)
10448                .map_err(|e| {
10449                    XlogError::Kernel(format!(
10450                        "indexed csm left_outer: prepare right out_col {} failed: {}",
10451                        col_idx, e
10452                    ))
10453                })?;
10454            if total_bytes > 0 {
10455                // SAFETY: zero-fill whole column on cu_stream.
10456                unsafe {
10457                    let res = cudarc::driver::sys::cuMemsetD8Async(
10458                        dst_ptr,
10459                        0,
10460                        total_bytes,
10461                        cu_stream.cu_stream(),
10462                    );
10463                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10464                        return Err(XlogError::Kernel(format!(
10465                            "indexed csm left_outer: zero-fill right col {} failed: {:?}",
10466                            col_idx, res
10467                        )));
10468                    }
10469                }
10470            }
10471            if inner_bytes > 0 {
10472                let src_col = inner_right_buf
10473                    .as_ref()
10474                    .expect("inner_count > 0")
10475                    .column(col_idx)
10476                    .ok_or_else(|| XlogError::Kernel("inner_right col missing".into()))?;
10477                // SAFETY: dtod async on cu_stream.
10478                unsafe {
10479                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
10480                        dst_ptr,
10481                        *src_col.device_ptr(),
10482                        inner_bytes,
10483                        cu_stream.cu_stream(),
10484                    );
10485                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10486                        return Err(XlogError::Kernel(format!(
10487                            "indexed csm left_outer: dtod inner_right col {} failed: {:?}",
10488                            col_idx, res
10489                        )));
10490                    }
10491                }
10492            }
10493            if let Some(b) = out_col.runtime_block() {
10494                runtime
10495                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
10496                    .map_err(|e| {
10497                        XlogError::Kernel(format!(
10498                            "indexed csm left_outer: finish_block_use (right col {}) failed: {}",
10499                            col_idx, e
10500                        ))
10501                    })?;
10502            }
10503            result_columns.push(out_col.into());
10504        }
10505
10506        rec_d.commit(runtime).map_err(|e| {
10507            XlogError::Kernel(format!(
10508                "indexed csm left_outer: phase-E commit failed: {}",
10509                e
10510            ))
10511        })?;
10512
10513        // Guard u32 metadata cast.
10514        if total_rows > u32::MAX as u64 {
10515            return Err(XlogError::Kernel(format!(
10516                "indexed csm left_outer: output row count {} exceeds u32::MAX",
10517                total_rows
10518            )));
10519        }
10520        let total_rows_u32 = total_rows as u32;
10521        let d_num_rows = self.upload_device_row_count(total_rows_u32)?;
10522        Ok(CudaBuffer::from_columns_with_host_count(
10523            result_columns,
10524            total_rows,
10525            d_num_rows,
10526            combined_schema,
10527            total_rows_u32,
10528        ))
10529    }
10530
10531    /// Strict-recorder, launch_stream-routed variant of
10532    /// `hash_join_v2`. Covers all four join types
10533    /// (`Inner` / `Semi` / `Anti` / `LeftOuter`) via dedicated
10534    /// per-type recorded methods.
10535    ///
10536    /// When [`Self::use_recorded_csm_env`] is on, `Inner` and
10537    /// `LeftOuter` route through the CSM (count-scan-materialize)
10538    /// methods; otherwise they route through the legacy recorded
10539    /// methods. `Semi` / `Anti` always route through their
10540    /// existing recorded methods — no CSM implementation exists
10541    /// for them. All eligibility checks (runtime-backed manager,
10542    /// ≤4 keys, key-type match, row-count caps) are validated
10543    /// upstream by the public `hash_join_v2_with_limit` and inside
10544    /// each per-type method.
10545    #[allow(clippy::too_many_arguments)]
10546    pub fn hash_join_v2_recorded(
10547        &self,
10548        left: &CudaBuffer,
10549        right: &CudaBuffer,
10550        left_keys: &[usize],
10551        right_keys: &[usize],
10552        join_type: JoinType,
10553        max_output: Option<usize>,
10554        launch_stream: StreamId,
10555    ) -> Result<CudaBuffer> {
10556        let csm_on = Self::use_recorded_csm_env();
10557        match join_type {
10558            JoinType::Inner => {
10559                if csm_on {
10560                    self.csm_invocations.fetch_add(1, Ordering::Relaxed);
10561                    self.hash_join_inner_v2_count_scan_materialize_recorded(
10562                        left,
10563                        right,
10564                        left_keys,
10565                        right_keys,
10566                        max_output,
10567                        launch_stream,
10568                    )
10569                } else {
10570                    self.hash_join_inner_v2_recorded(
10571                        left,
10572                        right,
10573                        left_keys,
10574                        right_keys,
10575                        max_output,
10576                        launch_stream,
10577                    )
10578                }
10579            }
10580            JoinType::Semi => self.hash_join_semi_or_anti_v2_recorded(
10581                left,
10582                right,
10583                left_keys,
10584                right_keys,
10585                false,
10586                launch_stream,
10587            ),
10588            JoinType::Anti => self.hash_join_semi_or_anti_v2_recorded(
10589                left,
10590                right,
10591                left_keys,
10592                right_keys,
10593                true,
10594                launch_stream,
10595            ),
10596            JoinType::LeftOuter => {
10597                if csm_on {
10598                    self.csm_invocations.fetch_add(1, Ordering::Relaxed);
10599                    self.hash_join_left_outer_v2_count_scan_materialize_recorded(
10600                        left,
10601                        right,
10602                        left_keys,
10603                        right_keys,
10604                        max_output,
10605                        launch_stream,
10606                    )
10607                } else {
10608                    self.hash_join_left_outer_v2_recorded(
10609                        left,
10610                        right,
10611                        left_keys,
10612                        right_keys,
10613                        max_output,
10614                        launch_stream,
10615                    )
10616                }
10617            }
10618        }
10619    }
10620
10621    /// Strict-recorder LeftOuter hash join.
10622    ///
10623    /// Mirrors the legacy `hash_join_left_outer_impl` chain on
10624    /// `launch_stream`:
10625    ///   1. pack keys both sides + build hash table on stream
10626    ///      (via the recorded pack and hash-table helpers).
10627    ///   2. SEMI kernel → `d_has_match` mask.
10628    ///   3. PROBE count + materialize → inner-join indices.
10629    ///   4. `mask_not` → `d_no_match`; recorded compact tail
10630    ///      filters `left` to `unmatched_left`.
10631    ///   5. Gather inner left + inner right on stream.
10632    ///   6. Concatenate: per-left-column `inner | unmatched`,
10633    ///      per-right-column `inner | zeros`. All copies and
10634    ///      zero-fills go on `cu_stream` via
10635    ///      `cuMemcpyDtoDAsync_v2` / `cuMemsetD8Async`.
10636    ///
10637    /// Empty-right edge case keeps the legacy
10638    /// `left_outer_with_nulls` (synchronous on default stream)
10639    /// — no launch_stream work is queued for that path, which
10640    /// is the correct semantic.
10641    fn hash_join_left_outer_v2_recorded(
10642        &self,
10643        left: &CudaBuffer,
10644        right: &CudaBuffer,
10645        left_keys: &[usize],
10646        right_keys: &[usize],
10647        max_output: Option<usize>,
10648        launch_stream: StreamId,
10649    ) -> Result<CudaBuffer> {
10650        use crate::launch::LaunchRecorder;
10651
10652        let runtime = self.memory.runtime().ok_or_else(|| {
10653            XlogError::Kernel(
10654                "hash_join_v2_recorded (left_outer) requires a runtime-backed GpuMemoryManager"
10655                    .to_string(),
10656            )
10657        })?;
10658        let cu_stream = runtime
10659            .stream_pool()
10660            .resolve(launch_stream)
10661            .ok_or_else(|| {
10662                XlogError::Kernel(format!(
10663                "hash_join_v2_recorded (left_outer): launch_stream StreamId({}) does not resolve",
10664                launch_stream.0
10665            ))
10666            })?;
10667
10668        let num_left = self.device_row_count(left)?;
10669        let num_right = self.device_row_count(right)?;
10670        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
10671            return Err(XlogError::Kernel(format!(
10672                "Join supports at most {} rows per side (left={}, right={})",
10673                u32::MAX,
10674                num_left,
10675                num_right
10676            )));
10677        }
10678        if num_left == 0 {
10679            let combined_schema = self.combine_schemas(left.schema(), right.schema());
10680            return self.create_empty_buffer(combined_schema);
10681        }
10682        if num_right == 0 {
10683            // Empty right: all left rows, null right columns.
10684            // Falls back to legacy default-stream path. No
10685            // launch_stream work is queued; caller drops are
10686            // safe because legacy syncs before returning.
10687            return self.left_outer_with_nulls(left, right);
10688        }
10689        if left_keys.is_empty() || right_keys.is_empty() {
10690            return Err(XlogError::Kernel(
10691                "Join requires at least one key column".to_string(),
10692            ));
10693        }
10694        if left_keys.len() != right_keys.len() {
10695            return Err(XlogError::Kernel(
10696                "Left and right key columns must have same length".to_string(),
10697            ));
10698        }
10699        if left_keys.len() > 4 {
10700            return Err(XlogError::Kernel(
10701                "hash_join_v2_recorded (left_outer): max 4 key columns supported \
10702                 (pack_keys constraint)"
10703                    .to_string(),
10704            ));
10705        }
10706        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
10707            let lt = left.schema().column_type(l);
10708            let rt = right.schema().column_type(r);
10709            if lt != rt {
10710                return Err(XlogError::Kernel(format!(
10711                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
10712                    l, lt, r, rt
10713                )));
10714            }
10715        }
10716
10717        let num_left = num_left as u32;
10718        let num_right = num_right as u32;
10719
10720        let left_packed =
10721            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
10722        let right_packed =
10723            self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
10724        let table = self.build_hash_table_v2_on_stream(
10725            &right_packed.hashes,
10726            num_right,
10727            &cu_stream,
10728            launch_stream,
10729            runtime,
10730        )?;
10731
10732        let device = self.device.inner();
10733        let block_size = 256u32;
10734        let grid_size = num_left.div_ceil(block_size);
10735        let cfg = LaunchConfig {
10736            grid_dim: (grid_size, 1, 1),
10737            block_dim: (block_size, 1, 1),
10738            shared_mem_bytes: 0,
10739        };
10740
10741        // Step A: SEMI mask (d_has_match) — used for unmatched
10742        // detection later. Plus PROBE count + materialize for
10743        // inner-join row indices.
10744        let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
10745        let d_count_only = self.memory.alloc::<u32>(1)?;
10746        let d_dummy_left = self.memory.alloc::<u32>(1)?;
10747        let d_dummy_right = self.memory.alloc::<u32>(1)?;
10748        // Fence alloc-ready → launch_stream for d_count_only
10749        // before the memset writes it (the memset runs ahead
10750        // of the recorder's preflight below).
10751        runtime
10752            .prepare_first_use(&d_count_only, launch_stream, Access::Write)
10753            .map_err(|e| {
10754                XlogError::Kernel(format!(
10755                    "left_outer recorded: prepare d_count_only failed: {}",
10756                    e
10757                ))
10758            })?;
10759        // SAFETY: runtime-backed 4-byte buffer.
10760        unsafe {
10761            let res = cudarc::driver::sys::cuMemsetD8Async(
10762                *d_count_only.device_ptr(),
10763                0,
10764                std::mem::size_of::<u32>(),
10765                cu_stream.cu_stream(),
10766            );
10767            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10768                return Err(XlogError::Kernel(format!(
10769                    "cuMemsetD8Async (left_outer d_count_only) failed: {:?}",
10770                    res
10771                )));
10772            }
10773        }
10774
10775        let semi_func = device
10776            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
10777            .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
10778        let probe_func = device
10779            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
10780            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
10781
10782        let mut rec_a = LaunchRecorder::new_strict(launch_stream);
10783        rec_a.read(&left_packed.hashes);
10784        rec_a.read(&left_packed.packed_keys);
10785        rec_a.read(&right_packed.packed_keys);
10786        rec_a.read(&table.bucket_offsets);
10787        rec_a.read(&table.bucket_counts);
10788        rec_a.read(&table.bucket_entries);
10789        rec_a.read(&table.bucket_entry_hashes);
10790        rec_a.write(&d_has_match);
10791        rec_a.write(&d_count_only);
10792        rec_a.write(&d_dummy_left);
10793        rec_a.write(&d_dummy_right);
10794        rec_a.preflight(runtime).map_err(|e| {
10795            XlogError::Kernel(format!(
10796                "hash_join_v2_recorded (left_outer): semi/count preflight failed: {}",
10797                e
10798            ))
10799        })?;
10800
10801        // SAFETY: hash_join_semi 11-arg signature.
10802        unsafe {
10803            semi_func.clone().launch_on_stream(
10804                &cu_stream,
10805                cfg,
10806                (
10807                    &left_packed.hashes,
10808                    num_left,
10809                    &table.bucket_offsets,
10810                    &table.bucket_counts,
10811                    &table.bucket_entries,
10812                    &table.bucket_entry_hashes,
10813                    table.bucket_mask,
10814                    &left_packed.packed_keys,
10815                    &right_packed.packed_keys,
10816                    left_packed.key_bytes,
10817                    &d_has_match,
10818                ),
10819            )
10820        }
10821        .map_err(|e| XlogError::Kernel(format!("hash_join_semi (on_stream) failed: {}", e)))?;
10822
10823        let max_output_count_only = 0u32;
10824        // SAFETY: hash_join_probe_v2 14-arg signature; raw-param launch.
10825        unsafe {
10826            let mut params: Vec<*mut c_void> = vec![
10827                (&left_packed.hashes).as_kernel_param(),
10828                num_left.as_kernel_param(),
10829                (&table.bucket_offsets).as_kernel_param(),
10830                (&table.bucket_counts).as_kernel_param(),
10831                (&table.bucket_entries).as_kernel_param(),
10832                (&table.bucket_entry_hashes).as_kernel_param(),
10833                table.bucket_mask.as_kernel_param(),
10834                (&left_packed.packed_keys).as_kernel_param(),
10835                (&right_packed.packed_keys).as_kernel_param(),
10836                left_packed.key_bytes.as_kernel_param(),
10837                (&d_dummy_left).as_kernel_param(),
10838                (&d_dummy_right).as_kernel_param(),
10839                (&d_count_only).as_kernel_param(),
10840                max_output_count_only.as_kernel_param(),
10841            ];
10842            probe_func
10843                .clone()
10844                .launch_on_stream(&cu_stream, cfg, &mut params)
10845                .map_err(|e| {
10846                    XlogError::Kernel(format!(
10847                        "hash_join_probe_v2 (count, on_stream, left_outer) failed: {}",
10848                        e
10849                    ))
10850                })?;
10851        }
10852
10853        rec_a.commit(runtime).map_err(|e| {
10854            XlogError::Kernel(format!(
10855                "hash_join_v2_recorded (left_outer): semi/count commit failed: {}",
10856                e
10857            ))
10858        })?;
10859
10860        // Sync + read inner-count.
10861        cu_stream.synchronize().map_err(|e| {
10862            XlogError::Kernel(format!(
10863                "hash_join_v2_recorded (left_outer): sync (count read) failed: {}",
10864                e
10865            ))
10866        })?;
10867        let full_inner = self.read_join_output_count_metadata(&d_count_only)? as u64;
10868        let requested_inner = max_output
10869            .map(|limit| (limit as u64).min(full_inner))
10870            .unwrap_or(full_inner);
10871        if requested_inner > u32::MAX as u64 {
10872            return Err(XlogError::Kernel(format!(
10873                "Join produced {} rows which exceeds the u32 index limit",
10874                requested_inner
10875            )));
10876        }
10877        let max_output_u32 = requested_inner as u32;
10878        let alloc_len = (requested_inner.max(1)) as usize;
10879
10880        let d_output_left = self.memory.alloc::<u32>(alloc_len)?;
10881        let d_output_right = self.memory.alloc::<u32>(alloc_len)?;
10882        let d_output_count = self.memory.alloc::<u32>(1)?;
10883        // Fence alloc-ready → launch_stream for d_output_count
10884        // before the memset (memset runs ahead of preflight).
10885        runtime
10886            .prepare_first_use(&d_output_count, launch_stream, Access::Write)
10887            .map_err(|e| {
10888                XlogError::Kernel(format!(
10889                    "left_outer recorded: prepare d_output_count failed: {}",
10890                    e
10891                ))
10892            })?;
10893        // SAFETY: runtime-backed 4-byte buffer.
10894        unsafe {
10895            let res = cudarc::driver::sys::cuMemsetD8Async(
10896                *d_output_count.device_ptr(),
10897                0,
10898                std::mem::size_of::<u32>(),
10899                cu_stream.cu_stream(),
10900            );
10901            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
10902                return Err(XlogError::Kernel(format!(
10903                    "cuMemsetD8Async (left_outer d_output_count) failed: {:?}",
10904                    res
10905                )));
10906            }
10907        }
10908
10909        let mut rec_b = LaunchRecorder::new_strict(launch_stream);
10910        rec_b.read(&left_packed.hashes);
10911        rec_b.read(&left_packed.packed_keys);
10912        rec_b.read(&right_packed.packed_keys);
10913        rec_b.read(&table.bucket_offsets);
10914        rec_b.read(&table.bucket_counts);
10915        rec_b.read(&table.bucket_entries);
10916        rec_b.read(&table.bucket_entry_hashes);
10917        rec_b.write(&d_output_left);
10918        rec_b.write(&d_output_right);
10919        rec_b.write(&d_output_count);
10920        rec_b.preflight(runtime).map_err(|e| {
10921            XlogError::Kernel(format!(
10922                "hash_join_v2_recorded (left_outer): materialize preflight failed: {}",
10923                e
10924            ))
10925        })?;
10926
10927        // SAFETY: hash_join_probe_v2 14-arg materialize.
10928        unsafe {
10929            let mut params: Vec<*mut c_void> = vec![
10930                (&left_packed.hashes).as_kernel_param(),
10931                num_left.as_kernel_param(),
10932                (&table.bucket_offsets).as_kernel_param(),
10933                (&table.bucket_counts).as_kernel_param(),
10934                (&table.bucket_entries).as_kernel_param(),
10935                (&table.bucket_entry_hashes).as_kernel_param(),
10936                table.bucket_mask.as_kernel_param(),
10937                (&left_packed.packed_keys).as_kernel_param(),
10938                (&right_packed.packed_keys).as_kernel_param(),
10939                left_packed.key_bytes.as_kernel_param(),
10940                (&d_output_left).as_kernel_param(),
10941                (&d_output_right).as_kernel_param(),
10942                (&d_output_count).as_kernel_param(),
10943                max_output_u32.as_kernel_param(),
10944            ];
10945            probe_func
10946                .clone()
10947                .launch_on_stream(&cu_stream, cfg, &mut params)
10948                .map_err(|e| {
10949                    XlogError::Kernel(format!(
10950                        "hash_join_probe_v2 (materialize, on_stream, left_outer) failed: {}",
10951                        e
10952                    ))
10953                })?;
10954        }
10955
10956        rec_b.commit(runtime).map_err(|e| {
10957            XlogError::Kernel(format!(
10958                "hash_join_v2_recorded (left_outer): materialize commit failed: {}",
10959                e
10960            ))
10961        })?;
10962
10963        cu_stream.synchronize().map_err(|e| {
10964            XlogError::Kernel(format!(
10965                "hash_join_v2_recorded (left_outer): sync (materialize read) failed: {}",
10966                e
10967            ))
10968        })?;
10969        let inner_count = self
10970            .read_join_output_count_metadata(&d_output_count)?
10971            .min(max_output_u32);
10972
10973        // Step B: mask_not(d_has_match) → d_no_match, then
10974        // recorded compact tail filters `left` to unmatched_left.
10975        let d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
10976        let mask_not_fn = device
10977            .get_func(FILTER_MODULE, filter_kernels::MASK_NOT)
10978            .ok_or_else(|| XlogError::Kernel("mask_not kernel not found".to_string()))?;
10979
10980        let mut rec_c = LaunchRecorder::new_strict(launch_stream);
10981        rec_c.read(&d_has_match);
10982        rec_c.write(&d_no_match);
10983        rec_c.preflight(runtime).map_err(|e| {
10984            XlogError::Kernel(format!(
10985                "hash_join_v2_recorded (left_outer): mask_not preflight failed: {}",
10986                e
10987            ))
10988        })?;
10989        // SAFETY: mask_not(in_mask, out_mask, num_rows)
10990        unsafe {
10991            mask_not_fn.clone().launch_on_stream(
10992                &cu_stream,
10993                cfg,
10994                (&d_has_match, &d_no_match, num_left),
10995            )
10996        }
10997        .map_err(|e| XlogError::Kernel(format!("mask_not (on_stream) failed: {}", e)))?;
10998        rec_c.commit(runtime).map_err(|e| {
10999            XlogError::Kernel(format!(
11000                "hash_join_v2_recorded (left_outer): mask_not commit failed: {}",
11001                e
11002            ))
11003        })?;
11004
11005        let unmatched_left =
11006            self.compact_buffer_by_device_mask_counted_recorded(left, &d_no_match, launch_stream)?;
11007        let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
11008        let total_rows = (inner_count as u64) + unmatched_rows;
11009
11010        let combined_schema = self.combine_schemas(left.schema(), right.schema());
11011        if total_rows == 0 {
11012            return self.create_empty_buffer(combined_schema);
11013        }
11014
11015        // Step C: gather inner-left and inner-right on stream
11016        // (only when there are inner matches). Wrap the
11017        // gather kernels in a recorder so reads of
11018        // `left.column[i]`, `right.column[i]`, and the index
11019        // buffers `d_output_{left,right}` are registered on
11020        // launch_stream — without it, dropping `left` /
11021        // `right` after this method returns could race the
11022        // still-pending gather reads.
11023        let inner_count_u32 = inner_count;
11024        let inner_left_buf;
11025        let inner_right_buf;
11026        if inner_count > 0 {
11027            let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
11028            for col_idx in 0..left.columns.len() {
11029                let c = left.column(col_idx).ok_or_else(|| {
11030                    XlogError::Kernel(format!("Left column {} not found", col_idx))
11031                })?;
11032                rec_gather.read_column(c);
11033            }
11034            for col_idx in 0..right.columns.len() {
11035                let c = right.column(col_idx).ok_or_else(|| {
11036                    XlogError::Kernel(format!("Right column {} not found", col_idx))
11037                })?;
11038                rec_gather.read_column(c);
11039            }
11040            rec_gather.read(&d_output_left);
11041            rec_gather.read(&d_output_right);
11042            rec_gather.preflight(runtime).map_err(|e| {
11043                XlogError::Kernel(format!(
11044                    "hash_join_v2_recorded (left_outer): gather preflight failed: {}",
11045                    e
11046                ))
11047            })?;
11048            inner_left_buf = Some(self.gather_buffer_by_indices_on_stream(
11049                left,
11050                &d_output_left,
11051                inner_count_u32,
11052                &cu_stream,
11053                launch_stream,
11054                runtime,
11055            )?);
11056            inner_right_buf = Some(self.gather_buffer_by_indices_on_stream(
11057                right,
11058                &d_output_right,
11059                inner_count_u32,
11060                &cu_stream,
11061                launch_stream,
11062                runtime,
11063            )?);
11064            rec_gather.commit(runtime).map_err(|e| {
11065                XlogError::Kernel(format!(
11066                    "hash_join_v2_recorded (left_outer): gather commit failed: {}",
11067                    e
11068                ))
11069            })?;
11070        } else {
11071            inner_left_buf = None;
11072            inner_right_buf = None;
11073        }
11074
11075        // Step D: concatenate per-column on launch_stream.
11076        // Left columns: inner_left | unmatched_left.
11077        // Right columns: inner_right | zeros.
11078        //
11079        // The dtod copies queue AFTER the gather/compact
11080        // commits, so the events those commits recorded on
11081        // the source buffers (`unmatched_left.column[i]`,
11082        // `inner_*_buf.column[i]`) do NOT cover the still-
11083        // pending dtod copies. We open a new recorder around
11084        // the entire concat block, record reads on every
11085        // source column, preflight, run the dtod copies and
11086        // zero-fills, and commit. The commit's event is
11087        // recorded AFTER all dtod copies are queued — so a
11088        // subsequent drop of `unmatched_left` /
11089        // `inner_*_buf` correctly waits for the dtod copies
11090        // to complete.
11091        let mut rec_d = LaunchRecorder::new_strict(launch_stream);
11092        for col_idx in 0..unmatched_left.columns.len() {
11093            let c = unmatched_left.column(col_idx).ok_or_else(|| {
11094                XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
11095            })?;
11096            rec_d.read_column(c);
11097        }
11098        if let Some(b) = inner_left_buf.as_ref() {
11099            for col_idx in 0..b.columns.len() {
11100                let c = b.column(col_idx).ok_or_else(|| {
11101                    XlogError::Kernel(format!("inner_left col {} not found", col_idx))
11102                })?;
11103                rec_d.read_column(c);
11104            }
11105        }
11106        if let Some(b) = inner_right_buf.as_ref() {
11107            for col_idx in 0..b.columns.len() {
11108                let c = b.column(col_idx).ok_or_else(|| {
11109                    XlogError::Kernel(format!("inner_right col {} not found", col_idx))
11110                })?;
11111                rec_d.read_column(c);
11112            }
11113        }
11114        rec_d.preflight(runtime).map_err(|e| {
11115            XlogError::Kernel(format!(
11116                "hash_join_v2_recorded (left_outer): step-D preflight failed: {}",
11117                e
11118            ))
11119        })?;
11120
11121        let mut result_columns: Vec<CudaColumn> = Vec::with_capacity(combined_schema.arity());
11122        let inner_rows = inner_count as u64;
11123
11124        // Per-left-column concat.
11125        for col_idx in 0..left.arity() {
11126            let elem_size = left
11127                .schema()
11128                .column_type(col_idx)
11129                .map(|t| t.size_bytes())
11130                .unwrap_or(4);
11131            let inner_bytes = (inner_rows as usize)
11132                .checked_mul(elem_size)
11133                .ok_or_else(|| {
11134                    XlogError::Kernel("Left outer join: inner_bytes overflow".to_string())
11135                })?;
11136            let unmatched_bytes = (unmatched_rows as usize)
11137                .checked_mul(elem_size)
11138                .ok_or_else(|| {
11139                    XlogError::Kernel("Left outer join: unmatched_bytes overflow".to_string())
11140                })?;
11141            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
11142                XlogError::Kernel("Left outer join: total_bytes overflow".to_string())
11143            })?;
11144
11145            let out_col = self.memory.alloc::<u8>(total_bytes)?;
11146            let dst_ptr = *out_col.device_ptr();
11147            // Fence alloc-ready → launch_stream for out_col
11148            // before the dtod-copies write it.
11149            runtime
11150                .prepare_first_use(&out_col, launch_stream, Access::Write)
11151                .map_err(|e| {
11152                    XlogError::Kernel(format!(
11153                        "left_outer recorded: prepare left out_col {} failed: {}",
11154                        col_idx, e
11155                    ))
11156                })?;
11157
11158            if inner_bytes > 0 {
11159                let src_col = inner_left_buf
11160                    .as_ref()
11161                    .expect("inner_count > 0 but inner_left_buf is None")
11162                    .column(col_idx)
11163                    .ok_or_else(|| {
11164                        XlogError::Kernel(format!("inner_left col {} not found", col_idx))
11165                    })?;
11166                // SAFETY: cuMemcpyDtoDAsync_v2 on cu_stream.
11167                unsafe {
11168                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
11169                        dst_ptr,
11170                        *src_col.device_ptr(),
11171                        inner_bytes,
11172                        cu_stream.cu_stream(),
11173                    );
11174                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11175                        return Err(XlogError::Kernel(format!(
11176                            "cuMemcpyDtoDAsync (left_outer inner_left col {}) failed: {:?}",
11177                            col_idx, res
11178                        )));
11179                    }
11180                }
11181            }
11182            if unmatched_bytes > 0 {
11183                let src_col = unmatched_left.column(col_idx).ok_or_else(|| {
11184                    XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
11185                })?;
11186                // SAFETY: dst_ptr + inner_bytes is in-bounds
11187                // (inner_bytes + unmatched_bytes == total_bytes).
11188                unsafe {
11189                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
11190                        dst_ptr + inner_bytes as u64,
11191                        *src_col.device_ptr(),
11192                        unmatched_bytes,
11193                        cu_stream.cu_stream(),
11194                    );
11195                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11196                        return Err(XlogError::Kernel(format!(
11197                            "cuMemcpyDtoDAsync (left_outer unmatched_left col {}) failed: {:?}",
11198                            col_idx, res
11199                        )));
11200                    }
11201                }
11202            }
11203
11204            // Record use on launch_stream so end-of-scope drop
11205            // (when result_columns goes out of scope down the
11206            // line via output buffer drop) defers correctly.
11207            if let Some(b) = out_col.runtime_block() {
11208                runtime
11209                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
11210                    .map_err(|e| {
11211                        XlogError::Kernel(format!(
11212                            "hash_join_v2_recorded (left_outer): finish_block_use \
11213                         (left col {}) failed: {}",
11214                            col_idx, e
11215                        ))
11216                    })?;
11217            }
11218            result_columns.push(out_col.into());
11219        }
11220
11221        // Per-right-column: inner_right | zeros.
11222        for col_idx in 0..right.arity() {
11223            let elem_size = right
11224                .schema()
11225                .column_type(col_idx)
11226                .map(|t| t.size_bytes())
11227                .unwrap_or(4);
11228            let inner_bytes = (inner_rows as usize)
11229                .checked_mul(elem_size)
11230                .ok_or_else(|| {
11231                    XlogError::Kernel("Left outer join: right inner_bytes overflow".to_string())
11232                })?;
11233            let unmatched_bytes = (unmatched_rows as usize)
11234                .checked_mul(elem_size)
11235                .ok_or_else(|| {
11236                    XlogError::Kernel("Left outer join: right unmatched_bytes overflow".to_string())
11237                })?;
11238            let total_bytes = inner_bytes.checked_add(unmatched_bytes).ok_or_else(|| {
11239                XlogError::Kernel("Left outer join: right total_bytes overflow".to_string())
11240            })?;
11241
11242            let out_col = self.memory.alloc::<u8>(total_bytes)?;
11243            let dst_ptr = *out_col.device_ptr();
11244            // Fence alloc-ready → launch_stream for out_col
11245            // before the memset / dtod-copy write it.
11246            runtime
11247                .prepare_first_use(&out_col, launch_stream, Access::Write)
11248                .map_err(|e| {
11249                    XlogError::Kernel(format!(
11250                        "left_outer recorded: prepare right out_col {} failed: {}",
11251                        col_idx, e
11252                    ))
11253                })?;
11254
11255            // Zero whole column (unmatched portion will stay
11256            // zero; inner portion will be overwritten by the
11257            // dtod copy below if inner_bytes > 0).
11258            if total_bytes > 0 {
11259                // SAFETY: out_col has total_bytes bytes; cu_stream is valid.
11260                unsafe {
11261                    let res = cudarc::driver::sys::cuMemsetD8Async(
11262                        dst_ptr,
11263                        0,
11264                        total_bytes,
11265                        cu_stream.cu_stream(),
11266                    );
11267                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11268                        return Err(XlogError::Kernel(format!(
11269                            "cuMemsetD8Async (left_outer right col {}) failed: {:?}",
11270                            col_idx, res
11271                        )));
11272                    }
11273                }
11274            }
11275            if inner_bytes > 0 {
11276                let src_col = inner_right_buf
11277                    .as_ref()
11278                    .expect("inner_count > 0 but inner_right_buf is None")
11279                    .column(col_idx)
11280                    .ok_or_else(|| {
11281                        XlogError::Kernel(format!("inner_right col {} not found", col_idx))
11282                    })?;
11283                // SAFETY: cuMemcpyDtoDAsync_v2 on cu_stream.
11284                unsafe {
11285                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
11286                        dst_ptr,
11287                        *src_col.device_ptr(),
11288                        inner_bytes,
11289                        cu_stream.cu_stream(),
11290                    );
11291                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
11292                        return Err(XlogError::Kernel(format!(
11293                            "cuMemcpyDtoDAsync (left_outer inner_right col {}) failed: {:?}",
11294                            col_idx, res
11295                        )));
11296                    }
11297                }
11298            }
11299
11300            if let Some(b) = out_col.runtime_block() {
11301                runtime
11302                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
11303                    .map_err(|e| {
11304                        XlogError::Kernel(format!(
11305                            "hash_join_v2_recorded (left_outer): finish_block_use \
11306                         (right col {}) failed: {}",
11307                            col_idx, e
11308                        ))
11309                    })?;
11310            }
11311            result_columns.push(out_col.into());
11312        }
11313
11314        // Commit the step-D recorder NOW that every dtod
11315        // copy is queued. The recorded event captures up to
11316        // commit time, so a subsequent drop of any source
11317        // buffer (unmatched_left / inner_*_buf) correctly
11318        // waits.
11319        rec_d.commit(runtime).map_err(|e| {
11320            XlogError::Kernel(format!(
11321                "hash_join_v2_recorded (left_outer): step-D commit failed: {}",
11322                e
11323            ))
11324        })?;
11325
11326        // d_num_rows scalar for the output buffer (uploaded
11327        // synchronously; no launch_stream work touches it).
11328        let d_num_rows = self.upload_device_row_count(total_rows as u32)?;
11329        Ok(CudaBuffer::from_columns_with_host_count(
11330            result_columns,
11331            total_rows,
11332            d_num_rows,
11333            combined_schema,
11334            total_rows as u32,
11335        ))
11336    }
11337
    /// Strict-recorder Semi/Anti hash join.
    ///
    /// A single helper, parametrized by `anti`, serves both join kinds
    /// because they share the same kernel-argument shape and the same
    /// call chain: pack keys for both sides, build the hash table, run
    /// the `hash_join_semi` or `hash_join_anti` kernel to produce a
    /// per-left-row mask, then hand that mask to
    /// `compact_buffer_by_device_mask_counted_recorded` to filter
    /// `left`. The Semi mask means "has match" and the Anti mask means
    /// "no match"; in either case the recorded compact tail keeps the
    /// rows whose mask byte is non-zero.
11348    fn hash_join_semi_or_anti_v2_recorded(
11349        &self,
11350        left: &CudaBuffer,
11351        right: &CudaBuffer,
11352        left_keys: &[usize],
11353        right_keys: &[usize],
11354        anti: bool,
11355        launch_stream: StreamId,
11356    ) -> Result<CudaBuffer> {
11357        use crate::launch::LaunchRecorder;
11358
11359        let runtime = self.memory.runtime().ok_or_else(|| {
11360            XlogError::Kernel(
11361                "hash_join_v2_recorded (semi/anti) requires a runtime-backed GpuMemoryManager"
11362                    .to_string(),
11363            )
11364        })?;
11365        let cu_stream = runtime
11366            .stream_pool()
11367            .resolve(launch_stream)
11368            .ok_or_else(|| {
11369                XlogError::Kernel(format!(
11370                "hash_join_v2_recorded (semi/anti): launch_stream StreamId({}) does not resolve",
11371                launch_stream.0
11372            ))
11373            })?;
11374
11375        let num_left = self.device_row_count(left)?;
11376        let num_right = self.device_row_count(right)?;
11377        if num_left > u32::MAX as usize || num_right > u32::MAX as usize {
11378            return Err(XlogError::Kernel(format!(
11379                "Join supports at most {} rows per side (left={}, right={})",
11380                u32::MAX,
11381                num_left,
11382                num_right
11383            )));
11384        }
11385        if num_left == 0 {
11386            return self.create_empty_buffer(left.schema().clone());
11387        }
11388        if num_right == 0 {
11389            // No matches possible.
11390            //   * Semi: empty result.
11391            //   * Anti: keep all left rows.
11392            //
11393            // The Anti edge case is a copy of `left`. We use
11394            // legacy `clone_buffer` here, which runs on the
11395            // default stream and synchronizes before
11396            // returning. No launch_stream work is queued, so
11397            // there are no recorded events on the original
11398            // input columns from this call — which is the
11399            // correct semantic: we did not touch them on
11400            // launch_stream.
11401            return if anti {
11402                self.clone_buffer(left)
11403            } else {
11404                self.create_empty_buffer(left.schema().clone())
11405            };
11406        }
11407        if left_keys.is_empty() || right_keys.is_empty() {
11408            return Err(XlogError::Kernel(
11409                "Join requires at least one key column".to_string(),
11410            ));
11411        }
11412        if left_keys.len() != right_keys.len() {
11413            return Err(XlogError::Kernel(
11414                "Left and right key columns must have same length".to_string(),
11415            ));
11416        }
11417        if left_keys.len() > 4 {
11418            return Err(XlogError::Kernel(
11419                "hash_join_v2_recorded (semi/anti): max 4 key columns supported (pack_keys constraint)"
11420                    .to_string(),
11421            ));
11422        }
11423        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
11424            let lt = left.schema().column_type(l);
11425            let rt = right.schema().column_type(r);
11426            if lt != rt {
11427                return Err(XlogError::Kernel(format!(
11428                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
11429                    l, lt, r, rt
11430                )));
11431            }
11432        }
11433
11434        let num_left = num_left as u32;
11435        let num_right = num_right as u32;
11436
11437        let left_packed =
11438            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
11439        let right_packed =
11440            self.pack_keys_gpu_on_stream(right, right_keys, &cu_stream, launch_stream, runtime)?;
11441        let table = self.build_hash_table_v2_on_stream(
11442            &right_packed.hashes,
11443            num_right,
11444            &cu_stream,
11445            launch_stream,
11446            runtime,
11447        )?;
11448
11449        let d_mask = self.memory.alloc::<u8>(num_left as usize)?;
11450
11451        let kernel_name = if anti {
11452            join_kernels::HASH_JOIN_ANTI
11453        } else {
11454            join_kernels::HASH_JOIN_SEMI
11455        };
11456        let func = self
11457            .device
11458            .inner()
11459            .get_func(JOIN_MODULE, kernel_name)
11460            .ok_or_else(|| XlogError::Kernel(format!("{} kernel not found", kernel_name)))?;
11461
11462        let block_size = 256u32;
11463        let grid_size = num_left.div_ceil(block_size);
11464        let cfg = LaunchConfig {
11465            grid_dim: (grid_size, 1, 1),
11466            block_dim: (block_size, 1, 1),
11467            shared_mem_bytes: 0,
11468        };
11469
11470        let mut rec = LaunchRecorder::new_strict(launch_stream);
11471        rec.read(&left_packed.hashes);
11472        rec.read(&left_packed.packed_keys);
11473        rec.read(&right_packed.packed_keys);
11474        rec.read(&table.bucket_offsets);
11475        rec.read(&table.bucket_counts);
11476        rec.read(&table.bucket_entries);
11477        rec.read(&table.bucket_entry_hashes);
11478        rec.write(&d_mask);
11479        rec.preflight(runtime).map_err(|e| {
11480            XlogError::Kernel(format!(
11481                "hash_join_v2_recorded (semi/anti): preflight failed: {}",
11482                e
11483            ))
11484        })?;
11485
11486        // SAFETY: hash_join_{semi,anti}(probe_hashes, num_probe,
11487        //   bucket_offsets, bucket_counts, bucket_entries,
11488        //   bucket_entry_hashes, bucket_mask, probe_keys,
11489        //   build_keys, key_bytes, mask). 11 args.
11490        unsafe {
11491            func.clone().launch_on_stream(
11492                &cu_stream,
11493                cfg,
11494                (
11495                    &left_packed.hashes,
11496                    num_left,
11497                    &table.bucket_offsets,
11498                    &table.bucket_counts,
11499                    &table.bucket_entries,
11500                    &table.bucket_entry_hashes,
11501                    table.bucket_mask,
11502                    &left_packed.packed_keys,
11503                    &right_packed.packed_keys,
11504                    left_packed.key_bytes,
11505                    &d_mask,
11506                ),
11507            )
11508        }
11509        .map_err(|e| XlogError::Kernel(format!("{} (on_stream) failed: {}", kernel_name, e)))?;
11510
11511        rec.commit(runtime).map_err(|e| {
11512            XlogError::Kernel(format!(
11513                "hash_join_v2_recorded (semi/anti): commit failed: {}",
11514                e
11515            ))
11516        })?;
11517
11518        // Filter `left` by the mask via the recorded compact
11519        // tail. Its own LaunchRecorder records reads on
11520        // left.column[i] and left.num_rows_device against
11521        // launch_stream — so dropping `left` after this
11522        // method returns is correctly serialized through the
11523        // runtime's record-all + wait-all event chain.
11524        self.compact_buffer_by_device_mask_counted_recorded(left, &d_mask, launch_stream)
11525    }
11526
11527    // ============== Recorded indexed hash join ==============
11528    //
11529    // Strict-recorder, launch_stream-routed sibling of
11530    // `hash_join_v2_with_index`. Covers Inner, Semi, Anti, and
11531    // LeftOuter via a single dispatcher. The build-side
11532    // packed keys + hash table come from the cached
11533    // `JoinIndexV2`; the probe (left) side is packed on
11534    // launch_stream via `pack_keys_gpu_on_stream`. Recorded
11535    // gather / compact / mask_not helpers from earlier recorded paths
11536    // are reused unchanged.
11537    //
11538    // Existing legacy `hash_join_v2_with_index*` paths are
11539    // unchanged; runtime/planner wiring is NOT included.
11540
11541    /// Strict-recorder, launch_stream-routed variant of
11542    /// `hash_join_v2_with_index`. Supports all four join
11543    /// types — the indexed variants share the same
11544    /// `(packed_keys, table)` shape, so a single recorded
11545    /// surface covers them.
11546    ///
11547    /// When [`Self::use_recorded_csm_env`] is on, `Inner` and
11548    /// `LeftOuter` route through the indexed CSM
11549    /// (count-scan-materialize) methods; otherwise they route
11550    /// through the legacy indexed recorded methods. `Semi` /
11551    /// `Anti` always route through their existing indexed
11552    /// recorded methods — no CSM implementation exists for them.
11553    #[allow(clippy::too_many_arguments)]
11554    pub fn hash_join_v2_with_index_recorded(
11555        &self,
11556        left: &CudaBuffer,
11557        right: &CudaBuffer,
11558        left_keys: &[usize],
11559        right_keys: &[usize],
11560        join_type: JoinType,
11561        index: &crate::provider::JoinIndexV2,
11562        max_output: Option<usize>,
11563        launch_stream: StreamId,
11564    ) -> Result<CudaBuffer> {
11565        let runtime = self.memory.runtime().ok_or_else(|| {
11566            XlogError::Kernel(
11567                "hash_join_v2_with_index_recorded requires a runtime-backed GpuMemoryManager"
11568                    .to_string(),
11569            )
11570        })?;
11571        // Resolve once; sub-helpers re-resolve as needed.
11572        runtime
11573            .stream_pool()
11574            .resolve(launch_stream)
11575            .ok_or_else(|| {
11576                XlogError::Kernel(format!(
11577                    "hash_join_v2_with_index_recorded: launch_stream StreamId({}) does not resolve",
11578                    launch_stream.0
11579                ))
11580            })?;
11581
11582        // Validate inputs (mirror legacy hash_join_v2_with_index).
11583        let left_rows = self.device_row_count(left)?;
11584        let right_rows = self.device_row_count(right)?;
11585        if left_rows > u32::MAX as usize || right_rows > u32::MAX as usize {
11586            return Err(XlogError::Kernel(format!(
11587                "Join supports at most {} rows per side (left={}, right={})",
11588                u32::MAX,
11589                left_rows,
11590                right_rows
11591            )));
11592        }
11593        if left_rows == 0 {
11594            return match join_type {
11595                JoinType::Inner | JoinType::LeftOuter => {
11596                    let combined_schema = self.combine_schemas(left.schema(), right.schema());
11597                    self.create_empty_buffer(combined_schema)
11598                }
11599                JoinType::Semi | JoinType::Anti => self.create_empty_buffer(left.schema().clone()),
11600            };
11601        }
11602        if right_rows == 0 {
11603            return match join_type {
11604                JoinType::Inner => {
11605                    let combined_schema = self.combine_schemas(left.schema(), right.schema());
11606                    self.create_empty_buffer(combined_schema)
11607                }
11608                JoinType::Semi => self.create_empty_buffer(left.schema().clone()),
11609                JoinType::Anti => self.clone_buffer(left),
11610                JoinType::LeftOuter => self.left_outer_with_nulls(left, right),
11611            };
11612        }
11613        if left_keys.is_empty() || right_keys.is_empty() {
11614            return Err(XlogError::Kernel(
11615                "Join requires at least one key column".to_string(),
11616            ));
11617        }
11618        if left_keys.len() != right_keys.len() {
11619            return Err(XlogError::Kernel(
11620                "Left and right key columns must have same length".to_string(),
11621            ));
11622        }
11623        if left_keys.len() > 4 {
11624            return Err(XlogError::Kernel(
11625                "hash_join_v2_with_index_recorded: max 4 key columns supported \
11626                 (pack_keys constraint)"
11627                    .to_string(),
11628            ));
11629        }
11630        for (&l, &r) in left_keys.iter().zip(right_keys.iter()) {
11631            if l >= left.arity() {
11632                return Err(XlogError::Kernel(format!(
11633                    "Left key column index {} out of bounds (arity {})",
11634                    l,
11635                    left.arity()
11636                )));
11637            }
11638            if r >= right.arity() {
11639                return Err(XlogError::Kernel(format!(
11640                    "Right key column index {} out of bounds (arity {})",
11641                    r,
11642                    right.arity()
11643                )));
11644            }
11645            let lt = left.schema().column_type(l);
11646            let rt = right.schema().column_type(r);
11647            if lt != rt {
11648                return Err(XlogError::Kernel(format!(
11649                    "Key column type mismatch: left[{}]={:?}, right[{}]={:?}",
11650                    l, lt, r, rt
11651                )));
11652            }
11653        }
11654        if index.right_num_rows() != right_rows as u32 {
11655            return Err(XlogError::Kernel(
11656                "Join index row count does not match right relation".to_string(),
11657            ));
11658        }
11659        if index.right_keys() != right_keys {
11660            return Err(XlogError::Kernel(
11661                "Join index key columns do not match requested right_keys".to_string(),
11662            ));
11663        }
11664
11665        let csm_on = Self::use_recorded_csm_env();
11666        match join_type {
11667            JoinType::Inner => {
11668                if csm_on {
11669                    self.csm_invocations.fetch_add(1, Ordering::Relaxed);
11670                    self.hash_join_inner_v2_with_index_count_scan_materialize_recorded(
11671                        left,
11672                        right,
11673                        left_keys,
11674                        right_keys,
11675                        index,
11676                        max_output,
11677                        launch_stream,
11678                    )
11679                } else {
11680                    self.hash_join_inner_v2_with_index_recorded(
11681                        left,
11682                        right,
11683                        left_keys,
11684                        index,
11685                        max_output,
11686                        launch_stream,
11687                    )
11688                }
11689            }
11690            JoinType::Semi => self.hash_join_semi_or_anti_v2_with_index_recorded(
11691                left,
11692                left_keys,
11693                index,
11694                false,
11695                launch_stream,
11696            ),
11697            JoinType::Anti => self.hash_join_semi_or_anti_v2_with_index_recorded(
11698                left,
11699                left_keys,
11700                index,
11701                true,
11702                launch_stream,
11703            ),
11704            JoinType::LeftOuter => {
11705                if csm_on {
11706                    self.csm_invocations.fetch_add(1, Ordering::Relaxed);
11707                    self.hash_join_left_outer_v2_with_index_count_scan_materialize_recorded(
11708                        left,
11709                        right,
11710                        left_keys,
11711                        right_keys,
11712                        index,
11713                        max_output,
11714                        launch_stream,
11715                    )
11716                } else {
11717                    self.hash_join_left_outer_v2_with_index_recorded(
11718                        left,
11719                        right,
11720                        left_keys,
11721                        index,
11722                        max_output,
11723                        launch_stream,
11724                    )
11725                }
11726            }
11727        }
11728    }
11729
    /// Indexed-Inner recorded. Mirrors `hash_join_inner_v2_recorded`
    /// minus the right-side pack + hash-table build (the
    /// cached `JoinIndexV2` provides `index.packed_keys` and
    /// `&index.table`). Probe count + materialize + gather all
    /// run on `launch_stream`.
    ///
    /// Steps, in order:
    /// 1. Look up the runtime and resolve `launch_stream`.
    /// 2. Pack the left (probe) keys on `launch_stream` and check that
    ///    their byte width equals `index.key_bytes`.
    /// 3. Count pass: launch `HASH_JOIN_PROBE_V2` with a max-output of 0
    ///    and 1-element dummy output buffers, then synchronize the stream
    ///    and read the counter back.
    /// 4. Clamp the count by `max_output` (when given); return an empty
    ///    buffer with the combined schema if nothing is requested.
    /// 5. Materialize pass: launch the same kernel again with real output
    ///    index buffers sized to the clamped count, synchronize, and read
    ///    the produced count (clamped to the buffer size).
    /// 6. Gather left and right rows through the index buffers and return
    ///    them as one buffer, left columns first.
    ///
    /// Performs no input validation of its own beyond the key-width
    /// check; the visible caller (`hash_join_v2_with_index_recorded`)
    /// validates row counts, key lists, and index compatibility first.
    ///
    /// # Errors
    /// Returns `XlogError::Kernel` when the runtime or stream is
    /// unavailable, the key widths differ, the probe kernel is missing, a
    /// memset / preflight / launch / commit / synchronize step fails, or
    /// the requested output exceeds `u32::MAX` rows. Errors from the
    /// allocation, packing, count-read, gather, and buffer-construction
    /// helpers are propagated unchanged.
    fn hash_join_inner_v2_with_index_recorded(
        &self,
        left: &CudaBuffer,
        right: &CudaBuffer,
        left_keys: &[usize],
        index: &crate::provider::JoinIndexV2,
        max_output: Option<usize>,
        launch_stream: StreamId,
    ) -> Result<CudaBuffer> {
        use crate::launch::LaunchRecorder;

        let runtime = self.memory.runtime().ok_or_else(|| {
            XlogError::Kernel(
                "hash_join_v2_with_index_recorded (inner) requires runtime-backed manager"
                    .to_string(),
            )
        })?;
        let cu_stream = runtime
            .stream_pool()
            .resolve(launch_stream)
            .ok_or_else(|| {
                XlogError::Kernel("indexed inner: launch_stream does not resolve".to_string())
            })?;

        // NOTE(review): truncating cast. The dispatcher rejects sides above
        // u32::MAX rows, but it measures them with `device_row_count`, not
        // `num_rows()` — confirm the two always agree.
        let num_left = left.num_rows() as u32;
        let table = &index.table;

        // Pack left only on launch_stream.
        let left_packed =
            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
        // Probe and build keys are compared by the kernel using a single
        // key width, so both sides must have been packed to the same size.
        if left_packed.key_bytes != index.key_bytes {
            return Err(XlogError::Kernel(
                "Join key byte width mismatch between probe and cached index".to_string(),
            ));
        }

        let probe_func = self
            .device
            .inner()
            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
        // One thread per left row, 256 threads per block; the same launch
        // configuration is reused for the count and materialize passes.
        let block_size = 256u32;
        let probe_grid = num_left.div_ceil(block_size);
        let probe_config = LaunchConfig {
            grid_dim: (probe_grid, 1, 1),
            block_dim: (block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // Count pass.
        // The dummy buffers only fill the kernel's output-index parameter
        // slots for this pass.
        let d_count_only = self.memory.alloc::<u32>(1)?;
        let d_dummy_left = self.memory.alloc::<u32>(1)?;
        let d_dummy_right = self.memory.alloc::<u32>(1)?;
        // Fence alloc-ready → launch_stream for d_count_only
        // before the memset (memset runs ahead of preflight).
        runtime
            .prepare_first_use(&d_count_only, launch_stream, Access::Write)
            .map_err(|e| {
                XlogError::Kernel(format!("indexed inner: prepare d_count_only failed: {}", e))
            })?;
        // SAFETY: 4-byte runtime-backed buffer.
        // Zero the counter on launch_stream so the kernel starts from 0.
        unsafe {
            let res = cudarc::driver::sys::cuMemsetD8Async(
                *d_count_only.device_ptr(),
                0,
                std::mem::size_of::<u32>(),
                cu_stream.cu_stream(),
            );
            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
                return Err(XlogError::Kernel(format!(
                    "cuMemsetD8Async (indexed inner d_count_only) failed: {:?}",
                    res
                )));
            }
        }

        // Max-output of 0 for the count pass. Presumably the kernel then
        // only accumulates the match count and stores no index pairs —
        // confirm against the kernel source.
        let max_output_count_only = 0u32;
        let mut rec_count = LaunchRecorder::new_strict(launch_stream);
        rec_count.read(&left_packed.hashes);
        rec_count.read(&left_packed.packed_keys);
        rec_count.read(&index.packed_keys);
        rec_count.read(&table.bucket_offsets);
        rec_count.read(&table.bucket_counts);
        rec_count.read(&table.bucket_entries);
        rec_count.read(&table.bucket_entry_hashes);
        rec_count.write(&d_count_only);
        rec_count.write(&d_dummy_left);
        rec_count.write(&d_dummy_right);
        rec_count.preflight(runtime).map_err(|e| {
            XlogError::Kernel(format!("indexed inner: count-pass preflight failed: {}", e))
        })?;
        // SAFETY: 14-arg probe via raw-param launch.
        // Every buffer passed below was registered with `rec_count` and
        // preflighted above.
        unsafe {
            let mut params: Vec<*mut c_void> = vec![
                (&left_packed.hashes).as_kernel_param(),
                num_left.as_kernel_param(),
                (&table.bucket_offsets).as_kernel_param(),
                (&table.bucket_counts).as_kernel_param(),
                (&table.bucket_entries).as_kernel_param(),
                (&table.bucket_entry_hashes).as_kernel_param(),
                table.bucket_mask.as_kernel_param(),
                (&left_packed.packed_keys).as_kernel_param(),
                (&index.packed_keys).as_kernel_param(),
                index.key_bytes.as_kernel_param(),
                (&d_dummy_left).as_kernel_param(),
                (&d_dummy_right).as_kernel_param(),
                (&d_count_only).as_kernel_param(),
                max_output_count_only.as_kernel_param(),
            ];
            probe_func
                .clone()
                .launch_on_stream(&cu_stream, probe_config, &mut params)
                .map_err(|e| {
                    XlogError::Kernel(format!(
                        "hash_join_probe_v2 (indexed count, on_stream) failed: {}",
                        e
                    ))
                })?;
        }
        rec_count.commit(runtime).map_err(|e| {
            XlogError::Kernel(format!("indexed inner: count-pass commit failed: {}", e))
        })?;

        // Wait for the count pass to finish before reading the counter.
        cu_stream.synchronize().map_err(|e| {
            XlogError::Kernel(format!("indexed inner: sync (count read) failed: {}", e))
        })?;
        let full_count = self.read_join_output_count_metadata(&d_count_only)? as u64;
        // Honour the caller's cap, if any; never request more than matched.
        let requested = max_output
            .map(|limit| (limit as u64).min(full_count))
            .unwrap_or(full_count);
        if requested == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }
        // Output row indices are u32, so the result size must fit in u32.
        if requested > u32::MAX as u64 {
            return Err(XlogError::Kernel(format!(
                "Join produced {} rows which exceeds the u32 index limit",
                requested
            )));
        }
        let max_output_u32 = requested as u32;

        // Materialize pass.
        let d_output_left = self.memory.alloc::<u32>(max_output_u32 as usize)?;
        let d_output_right = self.memory.alloc::<u32>(max_output_u32 as usize)?;
        let d_output_count = self.memory.alloc::<u32>(1)?;
        // Fence alloc-ready → launch_stream for d_output_count
        // before the memset.
        runtime
            .prepare_first_use(&d_output_count, launch_stream, Access::Write)
            .map_err(|e| {
                XlogError::Kernel(format!(
                    "indexed inner: prepare d_output_count failed: {}",
                    e
                ))
            })?;
        // SAFETY: 4-byte runtime-backed buffer.
        // Zero the materialize-pass counter on launch_stream.
        unsafe {
            let res = cudarc::driver::sys::cuMemsetD8Async(
                *d_output_count.device_ptr(),
                0,
                std::mem::size_of::<u32>(),
                cu_stream.cu_stream(),
            );
            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
                return Err(XlogError::Kernel(format!(
                    "cuMemsetD8Async (indexed inner d_output_count) failed: {:?}",
                    res
                )));
            }
        }

        let mut rec_mat = LaunchRecorder::new_strict(launch_stream);
        rec_mat.read(&left_packed.hashes);
        rec_mat.read(&left_packed.packed_keys);
        rec_mat.read(&index.packed_keys);
        rec_mat.read(&table.bucket_offsets);
        rec_mat.read(&table.bucket_counts);
        rec_mat.read(&table.bucket_entries);
        rec_mat.read(&table.bucket_entry_hashes);
        rec_mat.write(&d_output_left);
        rec_mat.write(&d_output_right);
        rec_mat.write(&d_output_count);
        rec_mat.preflight(runtime).map_err(|e| {
            XlogError::Kernel(format!(
                "indexed inner: materialize preflight failed: {}",
                e
            ))
        })?;
        // SAFETY: 14-arg probe via raw-param launch.
        // Same parameter order as the count pass, now with the real output
        // buffers and `max_output_u32` as the cap.
        unsafe {
            let mut params: Vec<*mut c_void> = vec![
                (&left_packed.hashes).as_kernel_param(),
                num_left.as_kernel_param(),
                (&table.bucket_offsets).as_kernel_param(),
                (&table.bucket_counts).as_kernel_param(),
                (&table.bucket_entries).as_kernel_param(),
                (&table.bucket_entry_hashes).as_kernel_param(),
                table.bucket_mask.as_kernel_param(),
                (&left_packed.packed_keys).as_kernel_param(),
                (&index.packed_keys).as_kernel_param(),
                index.key_bytes.as_kernel_param(),
                (&d_output_left).as_kernel_param(),
                (&d_output_right).as_kernel_param(),
                (&d_output_count).as_kernel_param(),
                max_output_u32.as_kernel_param(),
            ];
            probe_func
                .clone()
                .launch_on_stream(&cu_stream, probe_config, &mut params)
                .map_err(|e| {
                    XlogError::Kernel(format!(
                        "hash_join_probe_v2 (indexed mat, on_stream) failed: {}",
                        e
                    ))
                })?;
        }
        rec_mat.commit(runtime).map_err(|e| {
            XlogError::Kernel(format!("indexed inner: materialize commit failed: {}", e))
        })?;

        // Wait for the materialize pass before reading its counter.
        cu_stream.synchronize().map_err(|e| {
            XlogError::Kernel(format!("indexed inner: sync (mat read) failed: {}", e))
        })?;
        // Clamp to the buffer size so the gather never reads past the
        // allocated index buffers, whatever the counter reports.
        let result_count = (self.read_join_output_count_metadata(&d_output_count)? as u64)
            .min(max_output_u32 as u64);
        if result_count == 0 {
            let combined_schema = self.combine_schemas(left.schema(), right.schema());
            return self.create_empty_buffer(combined_schema);
        }
        let output_rows = result_count as u32;

        // Gather both sides on launch_stream.
        // This recorder registers reads of every input column and of both
        // index buffers around the two gather calls.
        let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
        for col_idx in 0..left.columns.len() {
            let c = left
                .column(col_idx)
                .ok_or_else(|| XlogError::Kernel(format!("Left column {} not found", col_idx)))?;
            rec_gather.read_column(c);
        }
        for col_idx in 0..right.columns.len() {
            let c = right
                .column(col_idx)
                .ok_or_else(|| XlogError::Kernel(format!("Right column {} not found", col_idx)))?;
            rec_gather.read_column(c);
        }
        rec_gather.read(&d_output_left);
        rec_gather.read(&d_output_right);
        rec_gather.preflight(runtime).map_err(|e| {
            XlogError::Kernel(format!("indexed inner: gather preflight failed: {}", e))
        })?;
        let gathered_left = self.gather_buffer_by_indices_on_stream(
            left,
            &d_output_left,
            output_rows,
            &cu_stream,
            launch_stream,
            runtime,
        )?;
        let gathered_right = self.gather_buffer_by_indices_on_stream(
            right,
            &d_output_right,
            output_rows,
            &cu_stream,
            launch_stream,
            runtime,
        )?;
        rec_gather.commit(runtime).map_err(|e| {
            XlogError::Kernel(format!("indexed inner: gather commit failed: {}", e))
        })?;

        // Output layout: all left columns followed by all right columns.
        let combined_schema = self.combine_schemas(left.schema(), right.schema());
        let mut result_columns = Vec::with_capacity(combined_schema.arity());
        result_columns.extend(gathered_left.columns);
        result_columns.extend(gathered_right.columns);
        self.buffer_from_columns(result_columns, result_count, combined_schema)
    }
12012
12013    /// Indexed Semi/Anti recorded. Mirrors
12014    /// `hash_join_semi_or_anti_v2_recorded` minus the
12015    /// right-side pack + table build. Composes pack-left →
12016    /// SEMI/ANTI kernel → recorded compact tail. Anti-empty-
12017    /// right edge case is handled by the dispatcher.
12018    fn hash_join_semi_or_anti_v2_with_index_recorded(
12019        &self,
12020        left: &CudaBuffer,
12021        left_keys: &[usize],
12022        index: &crate::provider::JoinIndexV2,
12023        anti: bool,
12024        launch_stream: StreamId,
12025    ) -> Result<CudaBuffer> {
12026        use crate::launch::LaunchRecorder;
12027
12028        let runtime = self.memory.runtime().ok_or_else(|| {
12029            XlogError::Kernel(
12030                "hash_join_v2_with_index_recorded (semi/anti) requires runtime-backed manager"
12031                    .to_string(),
12032            )
12033        })?;
12034        let cu_stream = runtime
12035            .stream_pool()
12036            .resolve(launch_stream)
12037            .ok_or_else(|| {
12038                XlogError::Kernel("indexed semi/anti: launch_stream does not resolve".to_string())
12039            })?;
12040
12041        let num_left = left.num_rows() as u32;
12042        let table = &index.table;
12043
12044        let left_packed =
12045            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
12046        if left_packed.key_bytes != index.key_bytes {
12047            return Err(XlogError::Kernel(
12048                "Join key byte width mismatch between probe and cached index".to_string(),
12049            ));
12050        }
12051
12052        let d_mask = self.memory.alloc::<u8>(num_left as usize)?;
12053        let kernel_name = if anti {
12054            join_kernels::HASH_JOIN_ANTI
12055        } else {
12056            join_kernels::HASH_JOIN_SEMI
12057        };
12058        let func = self
12059            .device
12060            .inner()
12061            .get_func(JOIN_MODULE, kernel_name)
12062            .ok_or_else(|| XlogError::Kernel(format!("{} kernel not found", kernel_name)))?;
12063        let block_size = 256u32;
12064        let grid_size = num_left.div_ceil(block_size);
12065        let cfg = LaunchConfig {
12066            grid_dim: (grid_size, 1, 1),
12067            block_dim: (block_size, 1, 1),
12068            shared_mem_bytes: 0,
12069        };
12070
12071        let mut rec = LaunchRecorder::new_strict(launch_stream);
12072        rec.read(&left_packed.hashes);
12073        rec.read(&left_packed.packed_keys);
12074        rec.read(&index.packed_keys);
12075        rec.read(&table.bucket_offsets);
12076        rec.read(&table.bucket_counts);
12077        rec.read(&table.bucket_entries);
12078        rec.read(&table.bucket_entry_hashes);
12079        rec.write(&d_mask);
12080        rec.preflight(runtime).map_err(|e| {
12081            XlogError::Kernel(format!("indexed semi/anti: preflight failed: {}", e))
12082        })?;
12083        // SAFETY: 11-arg semi/anti.
12084        unsafe {
12085            func.clone().launch_on_stream(
12086                &cu_stream,
12087                cfg,
12088                (
12089                    &left_packed.hashes,
12090                    num_left,
12091                    &table.bucket_offsets,
12092                    &table.bucket_counts,
12093                    &table.bucket_entries,
12094                    &table.bucket_entry_hashes,
12095                    table.bucket_mask,
12096                    &left_packed.packed_keys,
12097                    &index.packed_keys,
12098                    index.key_bytes,
12099                    &d_mask,
12100                ),
12101            )
12102        }
12103        .map_err(|e| {
12104            XlogError::Kernel(format!(
12105                "{} (on_stream, indexed) failed: {}",
12106                kernel_name, e
12107            ))
12108        })?;
12109        rec.commit(runtime)
12110            .map_err(|e| XlogError::Kernel(format!("indexed semi/anti: commit failed: {}", e)))?;
12111
12112        self.compact_buffer_by_device_mask_counted_recorded(left, &d_mask, launch_stream)
12113    }
12114
12115    /// Indexed LeftOuter recorded. Mirrors
12116    /// `hash_join_left_outer_v2_recorded` minus the right-side
12117    /// pack + table build. Same chain shape: SEMI mask + PROBE
12118    /// count/materialize + mask_not + recorded compact for
12119    /// unmatched + gather inner + per-column dtod-async concat.
12120    fn hash_join_left_outer_v2_with_index_recorded(
12121        &self,
12122        left: &CudaBuffer,
12123        right: &CudaBuffer,
12124        left_keys: &[usize],
12125        index: &crate::provider::JoinIndexV2,
12126        max_output: Option<usize>,
12127        launch_stream: StreamId,
12128    ) -> Result<CudaBuffer> {
12129        use crate::launch::LaunchRecorder;
12130
12131        let runtime = self.memory.runtime().ok_or_else(|| {
12132            XlogError::Kernel(
12133                "hash_join_v2_with_index_recorded (left_outer) requires runtime-backed manager"
12134                    .to_string(),
12135            )
12136        })?;
12137        let cu_stream = runtime
12138            .stream_pool()
12139            .resolve(launch_stream)
12140            .ok_or_else(|| {
12141                XlogError::Kernel("indexed left_outer: launch_stream does not resolve".to_string())
12142            })?;
12143
12144        let num_left = left.num_rows() as u32;
12145        let table = &index.table;
12146
12147        let left_packed =
12148            self.pack_keys_gpu_on_stream(left, left_keys, &cu_stream, launch_stream, runtime)?;
12149        if left_packed.key_bytes != index.key_bytes {
12150            return Err(XlogError::Kernel(
12151                "Join key byte width mismatch between probe and cached index".to_string(),
12152            ));
12153        }
12154
12155        let device = self.device.inner();
12156        let block_size = 256u32;
12157        let grid_size = num_left.div_ceil(block_size);
12158        let cfg = LaunchConfig {
12159            grid_dim: (grid_size, 1, 1),
12160            block_dim: (block_size, 1, 1),
12161            shared_mem_bytes: 0,
12162        };
12163
12164        // Step A: SEMI mask + PROBE count.
12165        let d_has_match = self.memory.alloc::<u8>(num_left as usize)?;
12166        let d_count_only = self.memory.alloc::<u32>(1)?;
12167        let d_dummy_left = self.memory.alloc::<u32>(1)?;
12168        let d_dummy_right = self.memory.alloc::<u32>(1)?;
12169        // Fence alloc-ready → launch_stream for d_count_only
12170        // before the memset.
12171        runtime
12172            .prepare_first_use(&d_count_only, launch_stream, Access::Write)
12173            .map_err(|e| {
12174                XlogError::Kernel(format!(
12175                    "indexed left_outer: prepare d_count_only failed: {}",
12176                    e
12177                ))
12178            })?;
12179        // SAFETY: 4-byte runtime-backed buffer.
12180        unsafe {
12181            let res = cudarc::driver::sys::cuMemsetD8Async(
12182                *d_count_only.device_ptr(),
12183                0,
12184                std::mem::size_of::<u32>(),
12185                cu_stream.cu_stream(),
12186            );
12187            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12188                return Err(XlogError::Kernel(format!(
12189                    "cuMemsetD8Async (indexed left_outer d_count_only) failed: {:?}",
12190                    res
12191                )));
12192            }
12193        }
12194
12195        let semi_func = device
12196            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_SEMI)
12197            .ok_or_else(|| XlogError::Kernel("hash_join_semi kernel not found".to_string()))?;
12198        let probe_func = device
12199            .get_func(JOIN_MODULE, join_kernels::HASH_JOIN_PROBE_V2)
12200            .ok_or_else(|| XlogError::Kernel("hash_join_probe_v2 kernel not found".to_string()))?;
12201
12202        let mut rec_a = LaunchRecorder::new_strict(launch_stream);
12203        rec_a.read(&left_packed.hashes);
12204        rec_a.read(&left_packed.packed_keys);
12205        rec_a.read(&index.packed_keys);
12206        rec_a.read(&table.bucket_offsets);
12207        rec_a.read(&table.bucket_counts);
12208        rec_a.read(&table.bucket_entries);
12209        rec_a.read(&table.bucket_entry_hashes);
12210        rec_a.write(&d_has_match);
12211        rec_a.write(&d_count_only);
12212        rec_a.write(&d_dummy_left);
12213        rec_a.write(&d_dummy_right);
12214        rec_a.preflight(runtime).map_err(|e| {
12215            XlogError::Kernel(format!(
12216                "indexed left_outer: semi/count preflight failed: {}",
12217                e
12218            ))
12219        })?;
12220        // SAFETY: hash_join_semi 11-arg.
12221        unsafe {
12222            semi_func.clone().launch_on_stream(
12223                &cu_stream,
12224                cfg,
12225                (
12226                    &left_packed.hashes,
12227                    num_left,
12228                    &table.bucket_offsets,
12229                    &table.bucket_counts,
12230                    &table.bucket_entries,
12231                    &table.bucket_entry_hashes,
12232                    table.bucket_mask,
12233                    &left_packed.packed_keys,
12234                    &index.packed_keys,
12235                    index.key_bytes,
12236                    &d_has_match,
12237                ),
12238            )
12239        }
12240        .map_err(|e| {
12241            XlogError::Kernel(format!(
12242                "hash_join_semi (on_stream, indexed left_outer) failed: {}",
12243                e
12244            ))
12245        })?;
12246
12247        let max_output_count_only = 0u32;
12248        // SAFETY: hash_join_probe_v2 14-arg count pass.
12249        unsafe {
12250            let mut params: Vec<*mut c_void> = vec![
12251                (&left_packed.hashes).as_kernel_param(),
12252                num_left.as_kernel_param(),
12253                (&table.bucket_offsets).as_kernel_param(),
12254                (&table.bucket_counts).as_kernel_param(),
12255                (&table.bucket_entries).as_kernel_param(),
12256                (&table.bucket_entry_hashes).as_kernel_param(),
12257                table.bucket_mask.as_kernel_param(),
12258                (&left_packed.packed_keys).as_kernel_param(),
12259                (&index.packed_keys).as_kernel_param(),
12260                index.key_bytes.as_kernel_param(),
12261                (&d_dummy_left).as_kernel_param(),
12262                (&d_dummy_right).as_kernel_param(),
12263                (&d_count_only).as_kernel_param(),
12264                max_output_count_only.as_kernel_param(),
12265            ];
12266            probe_func
12267                .clone()
12268                .launch_on_stream(&cu_stream, cfg, &mut params)
12269                .map_err(|e| {
12270                    XlogError::Kernel(format!(
12271                        "hash_join_probe_v2 (count, on_stream, indexed left_outer) failed: {}",
12272                        e
12273                    ))
12274                })?;
12275        }
12276        rec_a.commit(runtime).map_err(|e| {
12277            XlogError::Kernel(format!(
12278                "indexed left_outer: semi/count commit failed: {}",
12279                e
12280            ))
12281        })?;
12282
12283        cu_stream.synchronize().map_err(|e| {
12284            XlogError::Kernel(format!(
12285                "indexed left_outer: sync (count read) failed: {}",
12286                e
12287            ))
12288        })?;
12289        let full_inner = self.read_join_output_count_metadata(&d_count_only)? as u64;
12290        let requested_inner = max_output
12291            .map(|limit| (limit as u64).min(full_inner))
12292            .unwrap_or(full_inner);
12293        if requested_inner > u32::MAX as u64 {
12294            return Err(XlogError::Kernel(format!(
12295                "Join produced {} rows which exceeds the u32 index limit",
12296                requested_inner
12297            )));
12298        }
12299        let max_output_u32 = requested_inner as u32;
12300        let alloc_len = (requested_inner.max(1)) as usize;
12301
12302        // PROBE materialize.
12303        let d_output_left = self.memory.alloc::<u32>(alloc_len)?;
12304        let d_output_right = self.memory.alloc::<u32>(alloc_len)?;
12305        let d_output_count = self.memory.alloc::<u32>(1)?;
12306        // Fence alloc-ready → launch_stream for d_output_count
12307        // before the memset.
12308        runtime
12309            .prepare_first_use(&d_output_count, launch_stream, Access::Write)
12310            .map_err(|e| {
12311                XlogError::Kernel(format!(
12312                    "indexed left_outer: prepare d_output_count failed: {}",
12313                    e
12314                ))
12315            })?;
12316        // SAFETY: 4-byte runtime-backed buffer.
12317        unsafe {
12318            let res = cudarc::driver::sys::cuMemsetD8Async(
12319                *d_output_count.device_ptr(),
12320                0,
12321                std::mem::size_of::<u32>(),
12322                cu_stream.cu_stream(),
12323            );
12324            if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12325                return Err(XlogError::Kernel(format!(
12326                    "cuMemsetD8Async (indexed left_outer d_output_count) failed: {:?}",
12327                    res
12328                )));
12329            }
12330        }
12331
12332        let mut rec_b = LaunchRecorder::new_strict(launch_stream);
12333        rec_b.read(&left_packed.hashes);
12334        rec_b.read(&left_packed.packed_keys);
12335        rec_b.read(&index.packed_keys);
12336        rec_b.read(&table.bucket_offsets);
12337        rec_b.read(&table.bucket_counts);
12338        rec_b.read(&table.bucket_entries);
12339        rec_b.read(&table.bucket_entry_hashes);
12340        rec_b.write(&d_output_left);
12341        rec_b.write(&d_output_right);
12342        rec_b.write(&d_output_count);
12343        rec_b.preflight(runtime).map_err(|e| {
12344            XlogError::Kernel(format!(
12345                "indexed left_outer: materialize preflight failed: {}",
12346                e
12347            ))
12348        })?;
12349        // SAFETY: hash_join_probe_v2 14-arg materialize.
12350        unsafe {
12351            let mut params: Vec<*mut c_void> = vec![
12352                (&left_packed.hashes).as_kernel_param(),
12353                num_left.as_kernel_param(),
12354                (&table.bucket_offsets).as_kernel_param(),
12355                (&table.bucket_counts).as_kernel_param(),
12356                (&table.bucket_entries).as_kernel_param(),
12357                (&table.bucket_entry_hashes).as_kernel_param(),
12358                table.bucket_mask.as_kernel_param(),
12359                (&left_packed.packed_keys).as_kernel_param(),
12360                (&index.packed_keys).as_kernel_param(),
12361                index.key_bytes.as_kernel_param(),
12362                (&d_output_left).as_kernel_param(),
12363                (&d_output_right).as_kernel_param(),
12364                (&d_output_count).as_kernel_param(),
12365                max_output_u32.as_kernel_param(),
12366            ];
12367            probe_func
12368                .clone()
12369                .launch_on_stream(&cu_stream, cfg, &mut params)
12370                .map_err(|e| {
12371                    XlogError::Kernel(format!(
12372                        "hash_join_probe_v2 (mat, on_stream, indexed left_outer) failed: {}",
12373                        e
12374                    ))
12375                })?;
12376        }
12377        rec_b.commit(runtime).map_err(|e| {
12378            XlogError::Kernel(format!(
12379                "indexed left_outer: materialize commit failed: {}",
12380                e
12381            ))
12382        })?;
12383
12384        cu_stream.synchronize().map_err(|e| {
12385            XlogError::Kernel(format!("indexed left_outer: sync (mat read) failed: {}", e))
12386        })?;
12387        let inner_count = self
12388            .read_join_output_count_metadata(&d_output_count)?
12389            .min(max_output_u32);
12390
12391        // Step B: mask_not → unmatched filter via recorded compact tail.
12392        let d_no_match = self.memory.alloc::<u8>(num_left as usize)?;
12393        let mask_not_fn = device
12394            .get_func(FILTER_MODULE, filter_kernels::MASK_NOT)
12395            .ok_or_else(|| XlogError::Kernel("mask_not kernel not found".to_string()))?;
12396        let mut rec_c = LaunchRecorder::new_strict(launch_stream);
12397        rec_c.read(&d_has_match);
12398        rec_c.write(&d_no_match);
12399        rec_c.preflight(runtime).map_err(|e| {
12400            XlogError::Kernel(format!(
12401                "indexed left_outer: mask_not preflight failed: {}",
12402                e
12403            ))
12404        })?;
12405        // SAFETY: mask_not(in, out, n).
12406        unsafe {
12407            mask_not_fn.clone().launch_on_stream(
12408                &cu_stream,
12409                cfg,
12410                (&d_has_match, &d_no_match, num_left),
12411            )
12412        }
12413        .map_err(|e| {
12414            XlogError::Kernel(format!(
12415                "mask_not (on_stream, indexed left_outer) failed: {}",
12416                e
12417            ))
12418        })?;
12419        rec_c.commit(runtime).map_err(|e| {
12420            XlogError::Kernel(format!("indexed left_outer: mask_not commit failed: {}", e))
12421        })?;
12422
12423        let unmatched_left =
12424            self.compact_buffer_by_device_mask_counted_recorded(left, &d_no_match, launch_stream)?;
12425        let unmatched_rows = self.device_row_count(&unmatched_left)? as u64;
12426        let total_rows = (inner_count as u64) + unmatched_rows;
12427
12428        let combined_schema = self.combine_schemas(left.schema(), right.schema());
12429        if total_rows == 0 {
12430            return self.create_empty_buffer(combined_schema);
12431        }
12432
12433        // Step C: gather inner sides. Same outer-recorder
12434        // wrapping as the non-indexed LeftOuter — registers
12435        // launch_stream reads on left/right columns and the
12436        // probe-output index buffers so the caller's drop of
12437        // those inputs is correctly serialized.
12438        let inner_count_u32 = inner_count;
12439        let inner_left_buf;
12440        let inner_right_buf;
12441        if inner_count > 0 {
12442            let mut rec_gather = LaunchRecorder::new_strict(launch_stream);
12443            for col_idx in 0..left.columns.len() {
12444                let c = left.column(col_idx).ok_or_else(|| {
12445                    XlogError::Kernel(format!("Left column {} not found", col_idx))
12446                })?;
12447                rec_gather.read_column(c);
12448            }
12449            for col_idx in 0..right.columns.len() {
12450                let c = right.column(col_idx).ok_or_else(|| {
12451                    XlogError::Kernel(format!("Right column {} not found", col_idx))
12452                })?;
12453                rec_gather.read_column(c);
12454            }
12455            rec_gather.read(&d_output_left);
12456            rec_gather.read(&d_output_right);
12457            rec_gather.preflight(runtime).map_err(|e| {
12458                XlogError::Kernel(format!(
12459                    "indexed left_outer: gather preflight failed: {}",
12460                    e
12461                ))
12462            })?;
12463            inner_left_buf = Some(self.gather_buffer_by_indices_on_stream(
12464                left,
12465                &d_output_left,
12466                inner_count_u32,
12467                &cu_stream,
12468                launch_stream,
12469                runtime,
12470            )?);
12471            inner_right_buf = Some(self.gather_buffer_by_indices_on_stream(
12472                right,
12473                &d_output_right,
12474                inner_count_u32,
12475                &cu_stream,
12476                launch_stream,
12477                runtime,
12478            )?);
12479            rec_gather.commit(runtime).map_err(|e| {
12480                XlogError::Kernel(format!("indexed left_outer: gather commit failed: {}", e))
12481            })?;
12482        } else {
12483            inner_left_buf = None;
12484            inner_right_buf = None;
12485        }
12486
12487        // Step D: concatenate per-column on launch_stream.
12488        // Same step-D recorder discipline as the non-indexed
12489        // LeftOuter: re-record source columns AFTER the dtod
12490        // copies are queued, so a drop of `unmatched_left` /
12491        // `inner_*_buf` waits on the correct event.
12492        let mut rec_d = LaunchRecorder::new_strict(launch_stream);
12493        for col_idx in 0..unmatched_left.columns.len() {
12494            let c = unmatched_left.column(col_idx).ok_or_else(|| {
12495                XlogError::Kernel(format!("unmatched_left col {} not found", col_idx))
12496            })?;
12497            rec_d.read_column(c);
12498        }
12499        if let Some(b) = inner_left_buf.as_ref() {
12500            for col_idx in 0..b.columns.len() {
12501                let c = b.column(col_idx).ok_or_else(|| {
12502                    XlogError::Kernel(format!("inner_left col {} not found", col_idx))
12503                })?;
12504                rec_d.read_column(c);
12505            }
12506        }
12507        if let Some(b) = inner_right_buf.as_ref() {
12508            for col_idx in 0..b.columns.len() {
12509                let c = b.column(col_idx).ok_or_else(|| {
12510                    XlogError::Kernel(format!("inner_right col {} not found", col_idx))
12511                })?;
12512                rec_d.read_column(c);
12513            }
12514        }
12515        rec_d.preflight(runtime).map_err(|e| {
12516            XlogError::Kernel(format!(
12517                "indexed left_outer: step-D preflight failed: {}",
12518                e
12519            ))
12520        })?;
12521
12522        let mut result_columns: Vec<CudaColumn> = Vec::with_capacity(combined_schema.arity());
12523        let inner_rows = inner_count as u64;
12524
12525        for col_idx in 0..left.arity() {
12526            let elem_size = left
12527                .schema()
12528                .column_type(col_idx)
12529                .map(|t| t.size_bytes())
12530                .unwrap_or(4);
12531            let inner_bytes = (inner_rows as usize)
12532                .checked_mul(elem_size)
12533                .ok_or_else(|| XlogError::Kernel("inner_bytes overflow".to_string()))?;
12534            let unmatched_bytes = (unmatched_rows as usize)
12535                .checked_mul(elem_size)
12536                .ok_or_else(|| XlogError::Kernel("unmatched_bytes overflow".to_string()))?;
12537            let total_bytes = inner_bytes
12538                .checked_add(unmatched_bytes)
12539                .ok_or_else(|| XlogError::Kernel("total_bytes overflow".to_string()))?;
12540            let out_col = self.memory.alloc::<u8>(total_bytes)?;
12541            let dst_ptr = *out_col.device_ptr();
12542            // Fence alloc-ready → launch_stream for out_col.
12543            runtime
12544                .prepare_first_use(&out_col, launch_stream, Access::Write)
12545                .map_err(|e| {
12546                    XlogError::Kernel(format!(
12547                        "indexed left_outer: prepare left out_col {} failed: {}",
12548                        col_idx, e
12549                    ))
12550                })?;
12551            if inner_bytes > 0 {
12552                let src_col = inner_left_buf
12553                    .as_ref()
12554                    .expect("inner_count > 0")
12555                    .column(col_idx)
12556                    .ok_or_else(|| XlogError::Kernel("inner_left col missing".to_string()))?;
12557                // SAFETY: dtod async on cu_stream.
12558                unsafe {
12559                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
12560                        dst_ptr,
12561                        *src_col.device_ptr(),
12562                        inner_bytes,
12563                        cu_stream.cu_stream(),
12564                    );
12565                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12566                        return Err(XlogError::Kernel(format!(
12567                            "indexed left_outer: dtod copy inner_left col {} failed: {:?}",
12568                            col_idx, res
12569                        )));
12570                    }
12571                }
12572            }
12573            if unmatched_bytes > 0 {
12574                let src_col = unmatched_left
12575                    .column(col_idx)
12576                    .ok_or_else(|| XlogError::Kernel("unmatched col missing".to_string()))?;
12577                // SAFETY: bounded by total_bytes.
12578                unsafe {
12579                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
12580                        dst_ptr + inner_bytes as u64,
12581                        *src_col.device_ptr(),
12582                        unmatched_bytes,
12583                        cu_stream.cu_stream(),
12584                    );
12585                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12586                        return Err(XlogError::Kernel(format!(
12587                            "indexed left_outer: dtod copy unmatched col {} failed: {:?}",
12588                            col_idx, res
12589                        )));
12590                    }
12591                }
12592            }
12593            if let Some(b) = out_col.runtime_block() {
12594                runtime
12595                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
12596                    .map_err(|e| {
12597                        XlogError::Kernel(format!(
12598                            "indexed left_outer: finish_block_use (left col {}) failed: {}",
12599                            col_idx, e
12600                        ))
12601                    })?;
12602            }
12603            result_columns.push(out_col.into());
12604        }
12605
12606        for col_idx in 0..right.arity() {
12607            let elem_size = right
12608                .schema()
12609                .column_type(col_idx)
12610                .map(|t| t.size_bytes())
12611                .unwrap_or(4);
12612            let inner_bytes = (inner_rows as usize)
12613                .checked_mul(elem_size)
12614                .ok_or_else(|| XlogError::Kernel("right inner_bytes overflow".to_string()))?;
12615            let unmatched_bytes = (unmatched_rows as usize)
12616                .checked_mul(elem_size)
12617                .ok_or_else(|| XlogError::Kernel("right unmatched_bytes overflow".to_string()))?;
12618            let total_bytes = inner_bytes
12619                .checked_add(unmatched_bytes)
12620                .ok_or_else(|| XlogError::Kernel("right total_bytes overflow".to_string()))?;
12621            let out_col = self.memory.alloc::<u8>(total_bytes)?;
12622            let dst_ptr = *out_col.device_ptr();
12623            // Fence alloc-ready → launch_stream for out_col.
12624            runtime
12625                .prepare_first_use(&out_col, launch_stream, Access::Write)
12626                .map_err(|e| {
12627                    XlogError::Kernel(format!(
12628                        "indexed left_outer: prepare right out_col {} failed: {}",
12629                        col_idx, e
12630                    ))
12631                })?;
12632            if total_bytes > 0 {
12633                // SAFETY: zero-fill the whole column.
12634                unsafe {
12635                    let res = cudarc::driver::sys::cuMemsetD8Async(
12636                        dst_ptr,
12637                        0,
12638                        total_bytes,
12639                        cu_stream.cu_stream(),
12640                    );
12641                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12642                        return Err(XlogError::Kernel(format!(
12643                            "indexed left_outer: zero-fill right col {} failed: {:?}",
12644                            col_idx, res
12645                        )));
12646                    }
12647                }
12648            }
12649            if inner_bytes > 0 {
12650                let src_col = inner_right_buf
12651                    .as_ref()
12652                    .expect("inner_count > 0")
12653                    .column(col_idx)
12654                    .ok_or_else(|| XlogError::Kernel("inner_right col missing".to_string()))?;
12655                // SAFETY: dtod async on cu_stream.
12656                unsafe {
12657                    let res = cudarc::driver::sys::cuMemcpyDtoDAsync_v2(
12658                        dst_ptr,
12659                        *src_col.device_ptr(),
12660                        inner_bytes,
12661                        cu_stream.cu_stream(),
12662                    );
12663                    if res != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
12664                        return Err(XlogError::Kernel(format!(
12665                            "indexed left_outer: dtod copy inner_right col {} failed: {:?}",
12666                            col_idx, res
12667                        )));
12668                    }
12669                }
12670            }
12671            if let Some(b) = out_col.runtime_block() {
12672                runtime
12673                    .finish_block_use(BlockId::from_block(b), launch_stream, Access::Write)
12674                    .map_err(|e| {
12675                        XlogError::Kernel(format!(
12676                            "indexed left_outer: finish_block_use (right col {}) failed: {}",
12677                            col_idx, e
12678                        ))
12679                    })?;
12680            }
12681            result_columns.push(out_col.into());
12682        }
12683
12684        // Commit step-D recorder; see non-indexed LeftOuter
12685        // for the rationale.
12686        rec_d.commit(runtime).map_err(|e| {
12687            XlogError::Kernel(format!("indexed left_outer: step-D commit failed: {}", e))
12688        })?;
12689
12690        let d_num_rows = self.upload_device_row_count(total_rows as u32)?;
12691        Ok(CudaBuffer::from_columns_with_host_count(
12692            result_columns,
12693            total_rows,
12694            d_num_rows,
12695            combined_schema,
12696            total_rows as u32,
12697        ))
12698    }
12699}
xlog_cuda/provider/relational.rs

xlog_cuda/provider/
relational.rs