Skip to main content

xlog_runtime/executor/
epistemic_workspace.rs

1//! Epistemic GPU workspace allocation.
2
3use std::{collections::BTreeSet, ffi::c_void, sync::Arc};
4
5use cudarc::driver::LaunchConfig;
6use xlog_core::{RelId, Result, ScalarType, Schema, XlogError};
7use xlog_cuda::provider::{
8    epistemic_kernels, HostLaunchMetadataTransferStats, HostTransferStats, EPISTEMIC_MODULE,
9};
10use xlog_cuda::{
11    memory::{validate_logical_row_count, TrackedCudaSlice},
12    sys, AsKernelParam, CudaBuffer, CudaColumn, DeviceSlice, DriverError, LaunchAsync,
13};
14use xlog_ir::rir::{MultiwayPlan, PlannedHashReason, RirNode, StreamGroupId};
15use xlog_ir::{
16    EirEpistemicMode, EirEpistemicOp, EirTerm, EpistemicCpuFallbackCounters,
17    EpistemicExecutablePlan, EpistemicGpuBufferKind, EpistemicGpuHotPathPhase, EpistemicGpuPlan,
18    EpistemicTupleMembershipBinding, EpistemicWcojReductionStatus,
19};
20
21use super::Executor;
22
23const XLOG_CONSTRAINT_RELATION_PREFIX: &str = "__xlog_constraint_";
24
25/// Capacity limits for an epistemic GPU workspace allocation.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27pub struct EpistemicGpuWorkspaceCapacities {
28    /// Maximum generated epistemic candidates.
29    pub max_candidates: usize,
30    /// Maximum worlds tracked per candidate.
31    pub max_worlds: usize,
32    /// Maximum reduced-program models tracked per reduction.
33    pub max_models_per_reduction: usize,
34}
35
36/// Concrete device-buffer layout for an epistemic GPU workspace.
37#[derive(Debug, Clone, Copy, PartialEq, Eq)]
38pub struct EpistemicGpuWorkspaceLayout {
39    /// Candidate-assumption buffer size in bytes.
40    pub candidate_assumption_bytes: usize,
41    /// World-view buffer size in bytes.
42    pub world_view_bytes: usize,
43    /// Model-membership buffer size in bytes.
44    pub model_membership_bytes: usize,
45    /// Rejection-reason slot count.
46    pub rejection_reason_slots: usize,
47}
48
49impl EpistemicGpuWorkspaceLayout {
50    /// Build a workspace layout from an epistemic GPU plan and capacity limits.
51    pub fn for_plan(
52        plan: &EpistemicGpuPlan,
53        capacities: EpistemicGpuWorkspaceCapacities,
54    ) -> Result<Self> {
55        require_positive(
56            capacities.max_candidates,
57            "epistemic GPU workspace candidates",
58        )?;
59        require_positive(capacities.max_worlds, "epistemic GPU workspace worlds")?;
60        require_positive(
61            capacities.max_models_per_reduction,
62            "epistemic GPU workspace models",
63        )?;
64        require_positive(
65            plan.epistemic_literals.len(),
66            "epistemic GPU workspace literals",
67        )?;
68        require_positive(plan.reductions.len(), "epistemic GPU workspace reductions")?;
69
70        let literal_count = plan.epistemic_literals.len();
71        let reduction_count = plan.reductions.len();
72        let candidate_assumption_bytes = checked_product(capacities.max_candidates, literal_count)?;
73        let world_view_stride = capacities
74            .max_worlds
75            .max(world_view_bitset_bytes_per_candidate(literal_count)?);
76        let world_view_bytes = checked_product(capacities.max_candidates, world_view_stride)?;
77        let model_membership_bytes = checked_product(
78            checked_product(
79                checked_product(
80                    capacities.max_candidates,
81                    capacities.max_models_per_reduction,
82                )?,
83                reduction_count,
84            )?,
85            literal_count,
86        )?;
87
88        Ok(Self {
89            candidate_assumption_bytes,
90            world_view_bytes,
91            model_membership_bytes,
92            rejection_reason_slots: capacities.max_candidates,
93        })
94    }
95
96    /// Total workspace byte size across every device buffer category.
97    pub fn total_bytes(&self) -> usize {
98        self.try_total_bytes()
99            .expect("epistemic GPU workspace layout byte total overflowed")
100    }
101
102    /// Checked total workspace byte size across every device buffer category.
103    pub fn try_total_bytes(&self) -> Result<usize> {
104        let rejection_reason_bytes =
105            checked_product(self.rejection_reason_slots, std::mem::size_of::<u32>())?;
106        checked_sum(
107            checked_sum(
108                checked_sum(self.candidate_assumption_bytes, self.world_view_bytes)?,
109                self.model_membership_bytes,
110            )?,
111            rejection_reason_bytes,
112        )
113    }
114}
115
116/// Device-resident buffers for epistemic Generate-Propagate-Test execution.
117pub struct EpistemicGpuWorkspace {
118    /// Workspace layout used for allocation.
119    pub layout: EpistemicGpuWorkspaceLayout,
120    /// Candidate-assumption bitset buffer.
121    pub candidate_assumptions: TrackedCudaSlice<u8>,
122    /// Candidate and accepted world-view bitset buffer.
123    pub world_views: TrackedCudaSlice<u8>,
124    /// Per-model membership check buffer.
125    pub model_membership: TrackedCudaSlice<u8>,
126    /// Structured rejection-reason code buffer.
127    pub rejection_reasons: TrackedCudaSlice<u32>,
128    /// Per-candidate firing integrity-constraint index buffer. Parallel to
129    /// `rejection_reasons`, sized `layout.rejection_reason_slots`. Holds the
130    /// declaration-order index of the constraint that rejected a candidate, or
131    /// the sentinel `u32::MAX` when no integrity constraint rejected it. The
132    /// reason code in `rejection_reasons` is left at 6 for constraint
133    /// violations; this buffer adds the constraint-specific detail.
134    pub constraint_violation_index: TrackedCudaSlice<u32>,
135}
136
137impl EpistemicGpuWorkspace {
138    /// Require retained device buffers to match the certified workspace layout.
139    pub fn require_buffer_lengths_match_layout(&self, construct: &str) -> Result<()> {
140        if self.candidate_assumptions.len() != self.layout.candidate_assumption_bytes
141            || self.world_views.len() != self.layout.world_view_bytes
142            || self.model_membership.len() != self.layout.model_membership_bytes
143            || self.rejection_reasons.len() != self.layout.rejection_reason_slots
144            || self.constraint_violation_index.len() != self.layout.rejection_reason_slots
145        {
146            return Err(XlogError::UnsupportedEpistemicConstruct {
147                construct: construct.to_string(),
148                context: format!(
149                    "prepared GPU workspace buffer lengths do not match layout: \
150                     candidate_bytes={}/{} world_view_bytes={}/{} model_membership_bytes={}/{} \
151                     rejection_reason_slots={}/{} constraint_violation_index_slots={}/{}",
152                    self.candidate_assumptions.len(),
153                    self.layout.candidate_assumption_bytes,
154                    self.world_views.len(),
155                    self.layout.world_view_bytes,
156                    self.model_membership.len(),
157                    self.layout.model_membership_bytes,
158                    self.rejection_reasons.len(),
159                    self.layout.rejection_reason_slots,
160                    self.constraint_violation_index.len(),
161                    self.layout.rejection_reason_slots
162                ),
163            });
164        }
165
166        Ok(())
167    }
168}
169
170/// Trace proving an epistemic GPU workspace was initialized on device.
171#[derive(Debug, Clone, Copy, PartialEq, Eq)]
172pub struct EpistemicGpuWorkspaceResetTrace {
173    /// Candidate-assumption bytes zeroed on device.
174    pub candidate_assumption_bytes: usize,
175    /// World-view bytes zeroed on device.
176    pub world_view_bytes: usize,
177    /// Model-membership bytes zeroed on device.
178    pub model_membership_bytes: usize,
179    /// Rejection-reason bytes zeroed on device.
180    pub rejection_reason_bytes: usize,
181    /// Device zeroing operations submitted by the reset path.
182    pub device_zero_ops: u32,
183    /// Host writes used by the reset path. Accepted GPU execution requires zero.
184    pub host_write_ops: u32,
185}
186
187/// CUDA-event timing captured around one epistemic GPU kernel launch.
188#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
189pub struct EpistemicGpuKernelTimingTrace {
190    /// CUDA event pairs recorded around the launch. Runtime traces require one.
191    pub cuda_event_pairs: u32,
192    /// CUDA event synchronizations used to make elapsed time observable on host.
193    pub timing_sync_ops: u32,
194    /// Event-measured stream elapsed time, converted from milliseconds to nanoseconds.
195    pub kernel_elapsed_nanos: u64,
196}
197
198/// Trace proving candidate assumptions were generated by a GPU kernel.
199#[derive(Debug, Clone, Copy, PartialEq, Eq)]
200pub struct EpistemicGpuCandidateGenerationTrace {
201    /// Number of epistemic literals represented per candidate.
202    pub literal_count: usize,
203    /// Number of candidate rows generated on device.
204    pub generated_candidates: usize,
205    /// Candidate-assumption bytes written by the kernel.
206    pub candidate_assumption_bytes: usize,
207    /// Candidate-generation kernel launches.
208    pub kernel_launches: u32,
209    /// Host writes used by candidate generation. Accepted execution requires zero.
210    pub host_write_ops: u32,
211    /// CUDA-event timing for the launched kernel.
212    pub kernel_timing: EpistemicGpuKernelTimingTrace,
213}
214
215/// Trace proving staged candidate buffers were validated by a GPU kernel.
216#[derive(Debug, Clone, Copy, PartialEq, Eq)]
217pub struct EpistemicGpuCandidateValidationTrace {
218    /// Number of epistemic literals represented per candidate.
219    pub literal_count: usize,
220    /// Number of candidate rows validated on device.
221    pub validated_candidates: usize,
222    /// Candidate-assumption bytes checked by the kernel.
223    pub candidate_assumption_bytes_checked: usize,
224    /// World-view staging bytes checked by the kernel.
225    pub world_view_bytes_checked: usize,
226    /// Rejection-reason slots written by the kernel.
227    pub rejection_reason_slots_written: usize,
228    /// Candidate-validation kernel launches.
229    pub kernel_launches: u32,
230    /// Host writes used by validation. Accepted GPU execution requires zero.
231    pub host_write_ops: u32,
232    /// CUDA-event timing for the launched kernel.
233    pub kernel_timing: EpistemicGpuKernelTimingTrace,
234}
235
236/// Trace proving accepted-candidate materialization staging used a GPU kernel.
237#[derive(Debug, Clone, Copy, PartialEq, Eq)]
238pub struct EpistemicGpuMaterializationTrace {
239    /// Number of candidate rows materialized on device.
240    pub materialized_candidates: usize,
241    /// World-view slots written by the kernel.
242    pub world_view_slots_written: usize,
243    /// Materialization kernel launches.
244    pub kernel_launches: u32,
245    /// Host writes used by materialization. Accepted GPU execution requires zero.
246    pub host_write_ops: u32,
247    /// CUDA-event timing for the launched kernel.
248    pub kernel_timing: EpistemicGpuKernelTimingTrace,
249}
250
251/// Trace proving final result flags were materialized from device-side output metadata.
252#[derive(Debug, Clone, Copy, PartialEq, Eq)]
253pub struct EpistemicGpuFinalResultMaterializationTrace {
254    /// Number of candidate rows materialized on device.
255    pub materialized_candidates: usize,
256    /// Device output row-count scalars read by the kernel.
257    pub output_row_count_device_reads: u32,
258    /// World-view result slots written by the kernel.
259    pub world_view_slots_written: usize,
260    /// Final-result materialization kernel launches.
261    pub kernel_launches: u32,
262    /// Host writes used by final-result materialization. Accepted execution requires zero.
263    pub host_write_ops: u32,
264    /// CUDA-event timing for the launched kernel.
265    pub kernel_timing: EpistemicGpuKernelTimingTrace,
266}
267
268/// Trace proving final query tuples were materialized into a device-resident buffer.
269#[derive(Debug, Clone, Copy, PartialEq, Eq)]
270pub struct EpistemicGpuFinalTupleMaterializationTrace {
271    /// Number of output columns copied into the final device buffer.
272    pub output_column_count: usize,
273    /// Row capacity of the final output buffer.
274    pub output_row_capacity: usize,
275    /// Device tuple bytes covered by the materialization kernels.
276    pub tuple_bytes_capacity: usize,
277    /// Device output row-count scalars read by the kernels.
278    pub output_row_count_device_reads: u32,
279    /// Model-membership bytes checked by the kernels before tuple materialization.
280    pub model_membership_bytes_checked: usize,
281    /// Bounded model slots available per reduction during final tuple materialization.
282    pub bounded_model_slots_per_reduction: usize,
283    /// Output row capacity that can be checked against row-specific model slots.
284    pub row_specific_membership_row_capacity: usize,
285    /// Output row capacity beyond the bounded model-slot window.
286    pub row_filter_row_capacity_outside_model_slot_window: usize,
287    /// World-view slots checked by the kernels before tuple materialization.
288    pub world_view_slots_checked: usize,
289    /// Variable-bound tuple row filters applied by the final-row map kernel.
290    pub row_filter_count: usize,
291    /// Negated variable-bound tuple row filters applied by the final-row map kernel.
292    pub negated_row_filter_count: usize,
293    /// Device final row-count scalars written by the kernels.
294    pub final_row_count_device_writes: u32,
295    /// Final tuple materialization kernel launches.
296    pub kernel_launches: u32,
297    /// Host writes used by final tuple materialization. Accepted execution requires zero.
298    pub host_write_ops: u32,
299    /// CUDA-event timing for the launched kernel batch.
300    pub kernel_timing: EpistemicGpuKernelTimingTrace,
301}
302
303/// Trace proving the epistemic GPU hot path avoided tracked data-plane host transfers.
304#[derive(Debug, Clone, Copy, PartialEq, Eq)]
305pub struct EpistemicGpuTransferBudgetTrace {
306    /// Number of candidate rows covered by this transfer-budget check.
307    pub candidate_count: usize,
308    /// Tracked device-to-host bytes observed inside the GPU hot path.
309    pub tracked_dtoh_bytes: u64,
310    /// Tracked data-plane host-to-device bytes observed inside the GPU hot path.
311    pub tracked_htod_bytes: u64,
312    /// Tracked device-to-host calls observed inside the GPU hot path.
313    pub tracked_dtoh_calls: u64,
314    /// Tracked data-plane host-to-device calls observed inside the GPU hot path.
315    pub tracked_htod_calls: u64,
316    /// Tracked aggregate host-to-device bytes observed inside the GPU hot path.
317    pub tracked_aggregate_htod_bytes: u64,
318    /// Tracked aggregate host-to-device calls observed inside the GPU hot path.
319    pub tracked_aggregate_htod_calls: u64,
320    /// Tracked launch-metadata host-to-device bytes observed inside the GPU hot path.
321    pub tracked_launch_metadata_htod_bytes: u64,
322    /// Tracked launch-metadata host-to-device calls observed inside the GPU hot path.
323    pub tracked_launch_metadata_htod_calls: u64,
324    /// Tracked data-plane host-to-device bytes observed inside the GPU hot path.
325    pub tracked_data_plane_htod_bytes: u64,
326    /// Tracked data-plane host-to-device calls observed inside the GPU hot path.
327    pub tracked_data_plane_htod_calls: u64,
328    /// Per-candidate host round trips observed inside the GPU hot path.
329    pub per_candidate_host_round_trips: u64,
330}
331
332/// Trace accounting for the bounded final-result transfer after the GPU hot path.
333#[derive(Debug, Clone, Copy, PartialEq, Eq)]
334pub struct EpistemicGpuFinalResultTransferTrace {
335    /// Logical rows in the final device-resident output buffer.
336    pub final_output_rows: usize,
337    /// Number of final output columns that a caller may export.
338    pub final_output_column_count: usize,
339    /// Bytes in one final output row.
340    pub final_output_row_width_bytes: usize,
341    /// Bounded data-plane payload bytes represented by the final output.
342    pub final_output_payload_bytes: u64,
343    /// Device row-count metadata reads used for this accounting.
344    pub row_count_device_reads: u32,
345    /// Data-plane device-to-host calls issued by accepted execution. Execution returns a device buffer, so this is zero.
346    pub tracked_data_plane_dtoh_calls: u64,
347    /// Data-plane device-to-host bytes issued by accepted execution. Execution returns a device buffer, so this is zero.
348    pub tracked_data_plane_dtoh_bytes: u64,
349}
350
351/// Bounded validation of reduced integrity-constraint relations after GPU execution.
352#[derive(Debug, Clone, Copy, PartialEq, Eq)]
353pub struct EpistemicGpuConstraintValidationTrace {
354    /// Number of compiler-generated `__xlog_constraint_N` relations checked.
355    pub checked_constraint_relations: usize,
356    /// Number of checked constraint relations that contained violating rows.
357    pub violated_constraint_relations: usize,
358    /// Constraint row-count reads that had to consult device metadata.
359    pub row_count_device_reads: u32,
360}
361
362impl EpistemicGpuConstraintValidationTrace {
363    /// Require reduced integrity-constraint validation to match preflight obligations.
364    pub fn require_matches_preflight(
365        &self,
366        construct: &str,
367        preflight: &EpistemicGpuRuntimePreflight,
368    ) -> Result<()> {
369        if self.checked_constraint_relations != preflight.reduced_constraint_relation_count
370            || self.violated_constraint_relations != 0
371            || self.row_count_device_reads as usize > self.checked_constraint_relations
372        {
373            return Err(XlogError::UnsupportedEpistemicConstruct {
374                construct: construct.to_string(),
375                context: format!(
376                    "constraint validation trace must match reduced runtime preflight, got \
377                     checked={} expected_checked={} violations={} row_count_reads={}",
378                    self.checked_constraint_relations,
379                    preflight.reduced_constraint_relation_count,
380                    self.violated_constraint_relations,
381                    self.row_count_device_reads
382                ),
383            });
384        }
385
386        Ok(())
387    }
388}
389
390/// Typed interpretation of nonzero GPU epistemic rejection codes.
391#[derive(Debug, Clone, Copy, PartialEq, Eq)]
392pub enum EpistemicGpuRejectionReason {
393    /// Candidate was rejected because its world-view row was inactive.
394    InactiveWorld,
395    /// Candidate buffer contained a value outside the valid boolean bit range.
396    InvalidCandidateBit,
397    /// Candidate did not have a reduced-model tuple source to validate against.
398    MissingReducedModel,
399    /// Candidate assumptions were not supported by model-membership evidence.
400    UnsatisfiedMembership,
401    /// Accepted world view satisfied an epistemic integrity constraint body.
402    WorldViewConstraintViolation,
403}
404
405impl EpistemicGpuRejectionReason {
406    /// Return the raw device rejection code used by the CUDA kernels.
407    pub const fn code(self) -> u32 {
408        match self {
409            Self::InactiveWorld => 2,
410            Self::InvalidCandidateBit => 3,
411            Self::MissingReducedModel => 4,
412            Self::UnsatisfiedMembership => 5,
413            Self::WorldViewConstraintViolation => 6,
414        }
415    }
416
417    /// Decode a nonzero device rejection code into a typed reason.
418    pub fn from_code(code: u32) -> Result<Self> {
419        match code {
420            2 => Ok(Self::InactiveWorld),
421            3 => Ok(Self::InvalidCandidateBit),
422            4 => Ok(Self::MissingReducedModel),
423            5 => Ok(Self::UnsatisfiedMembership),
424            6 => Ok(Self::WorldViewConstraintViolation),
425            other => Err(XlogError::UnsupportedEpistemicConstruct {
426                construct: "epistemic GPU rejection reason".to_string(),
427                context: format!("unknown device rejection code {other}"),
428            }),
429        }
430    }
431}
432
433/// Device-derived semantic summary for Generate-Propagate-Test execution.
434#[derive(Debug, Clone, PartialEq, Eq)]
435pub struct EpistemicGpuSemanticTrace {
436    /// Number of candidate rows generated on device.
437    pub generated_candidates: usize,
438    /// Number of epistemic guesses represented by generated candidate rows.
439    pub guesses: usize,
440    /// Number of candidate rows propagated on device.
441    pub propagated_candidates: usize,
442    /// Number of generated candidates not propagated.
443    pub pruned_candidates: usize,
444    /// Number of candidate rows checked by world-view validation.
445    pub tested_candidates: usize,
446    /// Number of reduced model slots checked by model-membership/world-view kernels.
447    pub reduced_model_slots_checked: usize,
448    /// Number of accepted candidates observed in the device rejection buffer.
449    pub accepted_candidates: usize,
450    /// Candidate indices accepted by the device rejection buffer.
451    pub accepted_candidate_indices: Vec<usize>,
452    /// Number of accepted world views represented by accepted candidates.
453    pub accepted_world_views: usize,
454    /// Number of rejected candidates observed in the device rejection buffer.
455    pub rejected_candidates: usize,
456    /// Candidate indices rejected by the device rejection buffer.
457    pub rejected_candidate_indices: Vec<usize>,
458    /// Nonzero rejection reason codes copied from the device rejection buffer.
459    pub rejection_reasons: Vec<u32>,
460    /// Constraint-specific reason per rejected candidate, aligned 1:1 with
461    /// `rejected_candidate_indices`. `Some(idx)` when an integrity constraint
462    /// (reason code 6) rejected the candidate, where `idx` is the firing
463    /// constraint's declaration-order index; `None` for every other rejection
464    /// reason. Surfaces constraint-specific rejection detail.
465    pub constraint_violation_indices: Vec<Option<u32>>,
466    /// Bounded metadata reads from the device rejection buffer after the hot path.
467    pub rejection_reason_device_reads: u32,
468    /// Bytes read as bounded rejection-reason metadata after the hot path.
469    pub rejection_reason_metadata_bytes: u64,
470    /// CPU candidate enumerations used by the accepted path.
471    pub cpu_candidate_enumerations: u32,
472    /// CPU world-view validations used by the accepted path.
473    pub cpu_world_view_validations: u32,
474}
475
476/// Trace proving model-membership staging was performed by a GPU kernel.
477#[derive(Debug, Clone, Copy, PartialEq, Eq)]
478pub struct EpistemicGpuModelMembershipTrace {
479    /// Number of epistemic literals represented per candidate/model.
480    pub literal_count: usize,
481    /// Number of candidate rows checked on device.
482    pub candidates_checked: usize,
483    /// Number of reduced-program summaries represented in the membership layout.
484    pub reduction_count: usize,
485    /// Maximum models represented per reduction.
486    pub models_per_reduction: usize,
487    /// Model-membership bytes written by the kernel.
488    pub model_membership_bytes_written: usize,
489    /// Device output row-count scalars read by the kernel.
490    pub output_row_count_device_reads: u32,
491    /// Device tuple-source row-count scalars read by the kernel.
492    pub tuple_source_row_count_device_reads: u32,
493    /// Device tuple-key columns read by tuple-source membership kernels.
494    pub tuple_source_key_column_device_reads: u32,
495    /// Rejection-reason slots checked by the kernel.
496    pub rejection_reason_slots_checked: usize,
497    /// Source used to populate model-membership bytes.
498    pub membership_source: EpistemicGpuModelMembershipSource,
499    /// Model-membership staging kernel launches.
500    pub kernel_launches: u32,
501    /// Host writes used by model-membership staging. Accepted execution requires zero.
502    pub host_write_ops: u32,
503    /// CUDA-event timing for the launched kernel.
504    pub kernel_timing: EpistemicGpuKernelTimingTrace,
505}
506
507/// Source of GPU model-membership bytes for epistemic world-view validation.
508#[derive(Debug, Clone, Copy, PartialEq, Eq)]
509pub enum EpistemicGpuModelMembershipSource {
510    /// Current bounded staging only proves the reduced output has rows.
511    ReducedOutputRowCountOnly,
512    /// Model-membership bytes were populated from reduced stable-model tuple buffers.
513    StableModelTupleBuffer,
514}
515
516/// Trace proving staged model memberships were validated against world views on GPU.
517#[derive(Debug, Clone, Copy, PartialEq, Eq)]
518pub struct EpistemicGpuWorldViewValidationTrace {
519    /// Number of epistemic literals represented per candidate/model.
520    pub literal_count: usize,
521    /// Number of candidate rows checked on device.
522    pub candidates_checked: usize,
523    /// Number of reduced-program summaries represented in the membership layout.
524    pub reduction_count: usize,
525    /// Maximum models represented per reduction.
526    pub models_per_reduction: usize,
527    /// Model-membership bytes checked by the kernel.
528    pub model_membership_bytes_checked: usize,
529    /// World-view staging slots checked by the kernel.
530    pub world_view_slots_checked: usize,
531    /// Rejection-reason slots written by the kernel.
532    pub rejection_reason_slots_written: usize,
533    /// World-view validation kernel launches.
534    pub kernel_launches: u32,
535    /// Host writes used by world-view validation. Accepted execution requires zero.
536    pub host_write_ops: u32,
537    /// CUDA-event timing for the launched kernel.
538    pub kernel_timing: EpistemicGpuKernelTimingTrace,
539}
540
541/// Trace proving epistemic integrity constraints were evaluated against world
542/// views on GPU.
543///
544/// World-view integrity constraints (`:- know unsafe().`) prune accepted
545/// candidate world views on device after modal world-view validation. The
546/// device kernel never reads accepted worlds back to the host, so accepted
547/// execution keeps `host_write_ops` at zero.
548#[derive(Debug, Clone, Copy, PartialEq, Eq)]
549pub struct EpistemicGpuConstraintWorldViewValidationTrace {
550    /// Number of epistemic integrity constraints checked on device.
551    pub constraint_count: usize,
552    /// Number of constraint-body literal references checked on device.
553    pub constraint_literal_refs: usize,
554    /// Number of candidate world views checked by the constraint kernel.
555    pub candidates_checked: usize,
556    /// Rejection-reason slots written by the kernel.
557    pub rejection_reason_slots_written: usize,
558    /// Constraint world-view validation kernel launches.
559    pub kernel_launches: u32,
560    /// Host writes used by constraint validation. Accepted execution requires zero.
561    pub host_write_ops: u32,
562    /// CUDA-event timing for the launched kernel.
563    pub kernel_timing: EpistemicGpuKernelTimingTrace,
564}
565
566/// Trace proving candidate propagation staging was performed by a GPU kernel.
567#[derive(Debug, Clone, Copy, PartialEq, Eq)]
568pub struct EpistemicGpuPropagationTrace {
569    /// Number of epistemic literals represented per candidate.
570    pub literal_count: usize,
571    /// Number of candidate rows propagated on device.
572    pub propagated_candidates: usize,
573    /// World-view staging bytes written by the kernel.
574    pub world_view_bytes_written: usize,
575    /// Rejection-reason slots initialized by the kernel.
576    pub rejection_reason_slots_written: usize,
577    /// Candidate-propagation kernel launches.
578    pub kernel_launches: u32,
579    /// Host writes used by propagation. Accepted GPU execution requires zero.
580    pub host_write_ops: u32,
581    /// CUDA-event timing for the launched kernel.
582    pub kernel_timing: EpistemicGpuKernelTimingTrace,
583}
584
585impl EpistemicGpuKernelTimingTrace {
586    /// Empty timing marker used before a runtime launch records CUDA events.
587    pub const fn unrecorded() -> Self {
588        Self {
589            cuda_event_pairs: 0,
590            timing_sync_ops: 0,
591            kernel_elapsed_nanos: 0,
592        }
593    }
594
595    /// Convert CUDA's native event elapsed time in milliseconds to a trace.
596    pub fn from_cuda_elapsed_ms(elapsed_ms: f32) -> Result<Self> {
597        if !elapsed_ms.is_finite() || elapsed_ms < 0.0 {
598            return Err(XlogError::Execution(format!(
599                "invalid epistemic GPU kernel elapsed time: {elapsed_ms}"
600            )));
601        }
602        let elapsed_nanos = ((elapsed_ms as f64) * 1_000_000.0).round();
603        if elapsed_nanos >= u64::MAX as f64 {
604            return Err(XlogError::UnsupportedEpistemicConstruct {
605                construct: "epistemic GPU kernel timing trace".to_string(),
606                context: format!(
607                    "CUDA elapsed time {elapsed_ms}ms exceeds the u64 nanosecond trace counter"
608                ),
609            });
610        }
611
612        Ok(Self {
613            cuda_event_pairs: 1,
614            timing_sync_ops: 1,
615            kernel_elapsed_nanos: elapsed_nanos as u64,
616        })
617    }
618
619    /// Whether CUDA-event timing was recorded for this trace.
620    pub const fn is_recorded(&self) -> bool {
621        self.cuda_event_pairs > 0 && self.timing_sync_ops > 0
622    }
623
624    /// Saturating sum used when aggregating multi-kernel or split-batch traces.
625    pub fn saturating_add(self, other: Self) -> Self {
626        Self {
627            cuda_event_pairs: self.cuda_event_pairs.saturating_add(other.cuda_event_pairs),
628            timing_sync_ops: self.timing_sync_ops.saturating_add(other.timing_sync_ops),
629            kernel_elapsed_nanos: self
630                .kernel_elapsed_nanos
631                .saturating_add(other.kernel_elapsed_nanos),
632        }
633    }
634
635    /// Checked sum used by accepted certification paths.
636    pub fn checked_add(self, other: Self) -> Result<Self> {
637        Ok(Self {
638            cuda_event_pairs: self
639                .cuda_event_pairs
640                .checked_add(other.cuda_event_pairs)
641                .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
642                    construct: "epistemic GPU kernel timing trace".to_string(),
643                    context: format!(
644                        "CUDA event-pair counter overflowed while adding {} to {}",
645                        other.cuda_event_pairs, self.cuda_event_pairs
646                    ),
647                })?,
648            timing_sync_ops: self
649                .timing_sync_ops
650                .checked_add(other.timing_sync_ops)
651                .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
652                    construct: "epistemic GPU kernel timing trace".to_string(),
653                    context: format!(
654                        "CUDA timing-sync counter overflowed while adding {} to {}",
655                        other.timing_sync_ops, self.timing_sync_ops
656                    ),
657                })?,
658            kernel_elapsed_nanos: self
659                .kernel_elapsed_nanos
660                .checked_add(other.kernel_elapsed_nanos)
661                .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
662                    construct: "epistemic GPU kernel timing trace".to_string(),
663                    context: format!(
664                        "kernel elapsed-time counter overflowed while adding {} to {}",
665                        other.kernel_elapsed_nanos, self.kernel_elapsed_nanos
666                    ),
667                })?,
668        })
669    }
670
671    /// Aggregate timing traces from a single execution or split-batch result.
672    pub fn sum(traces: impl IntoIterator<Item = Self>) -> Self {
673        traces
674            .into_iter()
675            .fold(Self::unrecorded(), Self::saturating_add)
676    }
677
678    /// Checked aggregate timing traces for accepted certification paths.
679    pub fn checked_sum(traces: impl IntoIterator<Item = Self>) -> Result<Self> {
680        traces
681            .into_iter()
682            .try_fold(Self::unrecorded(), Self::checked_add)
683    }
684}
685
686impl EpistemicGpuCandidateGenerationTrace {
687    /// Build a candidate-generation trace for a bounded device launch.
688    pub fn for_counts(literal_count: usize, candidate_count: usize) -> Result<Self> {
689        require_positive(literal_count, "epistemic GPU candidate literals")?;
690        require_positive(candidate_count, "epistemic GPU candidate count")?;
691        if literal_count > 31 {
692            return Err(XlogError::UnsupportedEpistemicConstruct {
693                construct: "epistemic GPU candidate generation".to_string(),
694                context: format!("literal count {literal_count} exceeds 31-bit candidate mask"),
695            });
696        }
697        if candidate_count > (1usize << literal_count) {
698            return Err(XlogError::ResourceExhausted {
699                context: "epistemic GPU candidate count".to_string(),
700                estimated_bytes: candidate_count as u64,
701                budget_bytes: (1usize << literal_count) as u64,
702            });
703        }
704
705        let candidate_assumption_bytes = checked_product(literal_count, candidate_count)?;
706        require_u32_launch_bound(
707            candidate_assumption_bytes,
708            "epistemic GPU candidate generation launch",
709        )?;
710
711        Ok(Self {
712            literal_count,
713            generated_candidates: candidate_count,
714            candidate_assumption_bytes,
715            kernel_launches: 1,
716            host_write_ops: 0,
717            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
718        })
719    }
720
721    /// Attach CUDA-event timing captured by the runtime launch path.
722    pub const fn with_kernel_timing(
723        mut self,
724        kernel_timing: EpistemicGpuKernelTimingTrace,
725    ) -> Self {
726        self.kernel_timing = kernel_timing;
727        self
728    }
729}
730
731impl EpistemicGpuCandidateValidationTrace {
732    /// Build a validation trace for a bounded device launch.
733    pub fn for_counts(literal_count: usize, candidate_count: usize) -> Result<Self> {
734        require_positive(literal_count, "epistemic GPU candidate validation literals")?;
735        require_positive(
736            candidate_count,
737            "epistemic GPU candidate validation candidates",
738        )?;
739        require_u32_launch_dimensions(
740            &[literal_count, candidate_count],
741            "epistemic GPU validation launch",
742        )?;
743        let candidate_assumption_bytes_checked = checked_product(literal_count, candidate_count)?;
744
745        Ok(Self {
746            literal_count,
747            validated_candidates: candidate_count,
748            candidate_assumption_bytes_checked,
749            world_view_bytes_checked: candidate_count,
750            rejection_reason_slots_written: candidate_count,
751            kernel_launches: 1,
752            host_write_ops: 0,
753            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
754        })
755    }
756
757    /// Attach CUDA-event timing captured by the runtime launch path.
758    pub const fn with_kernel_timing(
759        mut self,
760        kernel_timing: EpistemicGpuKernelTimingTrace,
761    ) -> Self {
762        self.kernel_timing = kernel_timing;
763        self
764    }
765
766    /// Require validation coverage to match the generated candidate workspace.
767    pub fn require_matches_candidate_generation(
768        &self,
769        construct: &str,
770        candidate_generation: &EpistemicGpuCandidateGenerationTrace,
771    ) -> Result<()> {
772        let expected_world_view_bytes = checked_product(
773            world_view_bitset_bytes_per_candidate(candidate_generation.literal_count)?,
774            candidate_generation.generated_candidates,
775        )?;
776        if self.literal_count != candidate_generation.literal_count
777            || self.validated_candidates != candidate_generation.generated_candidates
778            || self.candidate_assumption_bytes_checked
779                != candidate_generation.candidate_assumption_bytes
780            || self.world_view_bytes_checked != expected_world_view_bytes
781            || self.rejection_reason_slots_written != candidate_generation.generated_candidates
782        {
783            return Err(XlogError::UnsupportedEpistemicConstruct {
784                construct: construct.to_string(),
785                context: format!(
786                    "candidate validation trace does not match generated GPU candidates: \
787                     literals={}/{} candidates={}/{} candidate_bytes={}/{} \
788                     world_view_bytes={}/{} rejection_slots={}/{}",
789                    self.literal_count,
790                    candidate_generation.literal_count,
791                    self.validated_candidates,
792                    candidate_generation.generated_candidates,
793                    self.candidate_assumption_bytes_checked,
794                    candidate_generation.candidate_assumption_bytes,
795                    self.world_view_bytes_checked,
796                    expected_world_view_bytes,
797                    self.rejection_reason_slots_written,
798                    candidate_generation.generated_candidates
799                ),
800            });
801        }
802
803        Ok(())
804    }
805}
806
807impl EpistemicGpuMaterializationTrace {
808    /// Build a materialization trace for a bounded device launch.
809    pub fn for_count(candidate_count: usize) -> Result<Self> {
810        require_positive(candidate_count, "epistemic GPU materialization candidates")?;
811        require_u32_launch_bound(candidate_count, "epistemic GPU materialization launch")?;
812
813        Ok(Self {
814            materialized_candidates: candidate_count,
815            world_view_slots_written: candidate_count,
816            kernel_launches: 1,
817            host_write_ops: 0,
818            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
819        })
820    }
821
822    /// Attach CUDA-event timing captured by the runtime launch path.
823    pub const fn with_kernel_timing(
824        mut self,
825        kernel_timing: EpistemicGpuKernelTimingTrace,
826    ) -> Self {
827        self.kernel_timing = kernel_timing;
828        self
829    }
830}
831
832impl EpistemicGpuFinalResultMaterializationTrace {
833    /// Build a final-result materialization trace for a bounded device launch.
834    pub fn for_count(candidate_count: usize) -> Result<Self> {
835        require_positive(
836            candidate_count,
837            "epistemic GPU final-result materialization candidates",
838        )?;
839        require_u32_launch_bound(candidate_count, "epistemic GPU final-result launch")?;
840
841        Ok(Self {
842            materialized_candidates: candidate_count,
843            output_row_count_device_reads: 1,
844            world_view_slots_written: candidate_count,
845            kernel_launches: 1,
846            host_write_ops: 0,
847            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
848        })
849    }
850
851    /// Attach CUDA-event timing captured by the runtime launch path.
852    pub const fn with_kernel_timing(
853        mut self,
854        kernel_timing: EpistemicGpuKernelTimingTrace,
855    ) -> Self {
856        self.kernel_timing = kernel_timing;
857        self
858    }
859}
860
861impl EpistemicGpuFinalTupleMaterializationTrace {
862    /// Build a final tuple materialization trace for a device-side output buffer.
863    pub fn for_counts(
864        output_column_count: usize,
865        output_row_capacity: usize,
866        tuple_bytes_capacity: usize,
867        literal_count: usize,
868        candidate_count: usize,
869        reduction_count: usize,
870        models_per_reduction: usize,
871    ) -> Result<Self> {
872        if output_column_count > u32::MAX as usize {
873            return Err(XlogError::ResourceExhausted {
874                context: "epistemic GPU final-tuple output columns".to_string(),
875                estimated_bytes: output_column_count as u64,
876                budget_bytes: u32::MAX as u64,
877            });
878        }
879        require_u32_launch_bound(output_row_capacity, "epistemic GPU final-tuple output rows")?;
880        require_positive(literal_count, "epistemic GPU final-tuple literals")?;
881        require_positive(candidate_count, "epistemic GPU final-tuple candidates")?;
882        require_positive(reduction_count, "epistemic GPU final-tuple reductions")?;
883        require_positive(models_per_reduction, "epistemic GPU final-tuple models")?;
884        let model_membership_bytes_checked = checked_product(
885            checked_product(
886                checked_product(candidate_count, reduction_count)?,
887                models_per_reduction,
888            )?,
889            literal_count,
890        )?;
891        require_u32_launch_bound(
892            model_membership_bytes_checked,
893            "epistemic GPU final-tuple membership launch",
894        )?;
895        let output_row_count_device_reads = checked_sum(output_column_count, 1)?;
896        let kernel_launches = checked_sum(output_row_count_device_reads, 1)?;
897        if kernel_launches > u32::MAX as usize {
898            return Err(XlogError::ResourceExhausted {
899                context: "epistemic GPU final-tuple kernel launches".to_string(),
900                estimated_bytes: kernel_launches as u64,
901                budget_bytes: u32::MAX as u64,
902            });
903        }
904
905        Ok(Self {
906            output_column_count,
907            output_row_capacity,
908            tuple_bytes_capacity,
909            output_row_count_device_reads: output_row_count_device_reads as u32,
910            model_membership_bytes_checked,
911            bounded_model_slots_per_reduction: models_per_reduction,
912            row_specific_membership_row_capacity: 0,
913            row_filter_row_capacity_outside_model_slot_window: 0,
914            world_view_slots_checked: candidate_count,
915            row_filter_count: 0,
916            negated_row_filter_count: 0,
917            final_row_count_device_writes: 1,
918            kernel_launches: kernel_launches as u32,
919            host_write_ops: 0,
920            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
921        })
922    }
923
924    /// Attach CUDA-event timing captured by the runtime launch path.
925    pub const fn with_kernel_timing(
926        mut self,
927        kernel_timing: EpistemicGpuKernelTimingTrace,
928    ) -> Self {
929        self.kernel_timing = kernel_timing;
930        self
931    }
932
933    /// Attach final-row filter metadata captured before launching the row-map kernel.
934    pub fn with_row_filter_counts(
935        mut self,
936        row_filter_count: usize,
937        negated_row_filter_count: usize,
938    ) -> Result<Self> {
939        if negated_row_filter_count > row_filter_count {
940            return Err(XlogError::ResourceExhausted {
941                context: "epistemic GPU final-tuple negated row filters".to_string(),
942                estimated_bytes: negated_row_filter_count as u64,
943                budget_bytes: row_filter_count as u64,
944            });
945        }
946        self.row_filter_count = row_filter_count;
947        self.negated_row_filter_count = negated_row_filter_count;
948        if row_filter_count > 0 {
949            self.row_specific_membership_row_capacity = self
950                .output_row_capacity
951                .min(self.bounded_model_slots_per_reduction);
952            self.row_filter_row_capacity_outside_model_slot_window = self
953                .output_row_capacity
954                .saturating_sub(self.row_specific_membership_row_capacity);
955        }
956        Ok(self)
957    }
958
959    /// Require GPU evidence that row-filtered tuple output fits the validated coverage window.
960    pub fn require_row_filter_materialization_evidence(
961        &self,
962        construct: &str,
963        final_output_rows: usize,
964    ) -> Result<()> {
965        if final_output_rows > self.output_row_capacity {
966            return Err(XlogError::UnsupportedEpistemicConstruct {
967                construct: construct.to_string(),
968                context: format!(
969                    "final tuple materialization reported {} logical rows for output row \
970                     capacity {}",
971                    final_output_rows, self.output_row_capacity
972                ),
973            });
974        }
975        if self.negated_row_filter_count > self.row_filter_count {
976            return Err(XlogError::UnsupportedEpistemicConstruct {
977                construct: construct.to_string(),
978                context: format!(
979                    "row-filtered final tuple materialization reported {} negated row filters \
980                     for {} total row filters",
981                    self.negated_row_filter_count, self.row_filter_count
982                ),
983            });
984        }
985        if self.row_filter_count == 0 {
986            if self.row_specific_membership_row_capacity != 0
987                || self.row_filter_row_capacity_outside_model_slot_window != 0
988            {
989                return Err(XlogError::UnsupportedEpistemicConstruct {
990                    construct: construct.to_string(),
991                    context: format!(
992                        "final tuple materialization without row filters reported row-filter \
993                         coverage row_specific_capacity={} fallback_capacity={}",
994                        self.row_specific_membership_row_capacity,
995                        self.row_filter_row_capacity_outside_model_slot_window
996                    ),
997                });
998            }
999            return Ok(());
1000        }
1001
1002        // EMPTY FOUNDED EXTENSION: a row-filtered reduction whose reduced base is
1003        // empty (e.g. an unfounded FAEEL self-support rule excluded from the founded
1004        // model) legitimately materializes zero output rows. With no candidate output
1005        // rows there is no row-specific membership window to cover, so the
1006        // coverage-equality invariant below (which requires a positive output capacity)
1007        // does not apply. This mirrors the `row_filter_count == 0` early-Ok above: an
1008        // all-empty result is sound, not under-coverage.
1009        if final_output_rows == 0 && self.output_row_capacity == 0 {
1010            return Ok(());
1011        }
1012
1013        let covered_row_capacity = checked_sum(
1014            self.row_specific_membership_row_capacity,
1015            self.row_filter_row_capacity_outside_model_slot_window,
1016        )?;
1017        if self.output_row_capacity == 0
1018            || self.row_specific_membership_row_capacity == 0
1019            || covered_row_capacity != self.output_row_capacity
1020        {
1021            return Err(XlogError::UnsupportedEpistemicConstruct {
1022                construct: construct.to_string(),
1023                context: format!(
1024                    "row-filtered final tuple materialization requires GPU row-filter coverage, \
1025                     got row_filters={} final_output_rows={} output_row_capacity={} \
1026                     row_specific_capacity={} fallback_capacity={} model_slots_per_reduction={}",
1027                    self.row_filter_count,
1028                    final_output_rows,
1029                    self.output_row_capacity,
1030                    self.row_specific_membership_row_capacity,
1031                    self.row_filter_row_capacity_outside_model_slot_window,
1032                    self.bounded_model_slots_per_reduction
1033                ),
1034            });
1035        }
1036
1037        let fallback_rows =
1038            final_output_rows.saturating_sub(self.row_specific_membership_row_capacity);
1039        if fallback_rows > self.row_filter_row_capacity_outside_model_slot_window {
1040            return Err(XlogError::UnsupportedEpistemicConstruct {
1041                construct: construct.to_string(),
1042                context: format!(
1043                    "row-filtered final tuple materialization has {} logical rows beyond the \
1044                     row-specific model-slot window but only {} fallback row-filter capacity",
1045                    fallback_rows, self.row_filter_row_capacity_outside_model_slot_window
1046                ),
1047            });
1048        }
1049        Ok(())
1050    }
1051}
1052
1053impl EpistemicGpuTransferBudgetTrace {
1054    /// Build a hot-path transfer trace from provider host-transfer snapshots.
1055    pub fn from_host_transfer_stats(
1056        candidate_count: usize,
1057        before: HostTransferStats,
1058        after: HostTransferStats,
1059    ) -> Result<Self> {
1060        Self::from_host_transfer_stats_with_launch_metadata(
1061            candidate_count,
1062            before,
1063            after,
1064            HostLaunchMetadataTransferStats::default(),
1065            HostLaunchMetadataTransferStats::default(),
1066        )
1067    }
1068
1069    /// Build a hot-path transfer trace while distinguishing bounded launch
1070    /// metadata host-to-device transfers from data-plane transfers.
1071    pub fn from_host_transfer_stats_with_launch_metadata(
1072        candidate_count: usize,
1073        before: HostTransferStats,
1074        after: HostTransferStats,
1075        launch_metadata_before: HostLaunchMetadataTransferStats,
1076        launch_metadata_after: HostLaunchMetadataTransferStats,
1077    ) -> Result<Self> {
1078        require_positive(candidate_count, "epistemic GPU transfer-budget candidates")?;
1079
1080        let tracked_dtoh_bytes =
1081            transfer_counter_delta("dtoh_bytes", before.dtoh_bytes, after.dtoh_bytes)?;
1082        let tracked_data_plane_htod_bytes =
1083            transfer_counter_delta("htod_bytes", before.htod_bytes, after.htod_bytes)?;
1084        let tracked_dtoh_calls =
1085            transfer_counter_delta("dtoh_calls", before.dtoh_calls, after.dtoh_calls)?;
1086        let tracked_data_plane_htod_calls =
1087            transfer_counter_delta("htod_calls", before.htod_calls, after.htod_calls)?;
1088        let tracked_launch_metadata_htod_bytes = transfer_counter_delta(
1089            "launch_metadata_htod_bytes",
1090            launch_metadata_before.htod_bytes,
1091            launch_metadata_after.htod_bytes,
1092        )?;
1093        let tracked_launch_metadata_htod_calls = transfer_counter_delta(
1094            "launch_metadata_htod_calls",
1095            launch_metadata_before.htod_calls,
1096            launch_metadata_after.htod_calls,
1097        )?;
1098        let tracked_aggregate_htod_bytes = tracked_data_plane_htod_bytes
1099            .checked_add(tracked_launch_metadata_htod_bytes)
1100            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
1101                construct: "epistemic GPU transfer budget".to_string(),
1102                context: format!(
1103                    "aggregate H2D bytes overflowed while adding launch metadata: \
1104                     data_plane_htod_bytes={tracked_data_plane_htod_bytes}, \
1105                     launch_metadata_htod_bytes={tracked_launch_metadata_htod_bytes}"
1106                ),
1107            })?;
1108        let tracked_aggregate_htod_calls = tracked_data_plane_htod_calls
1109            .checked_add(tracked_launch_metadata_htod_calls)
1110            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
1111                construct: "epistemic GPU transfer budget".to_string(),
1112                context: format!(
1113                    "aggregate H2D calls overflowed while adding launch metadata: \
1114                     data_plane_htod_calls={tracked_data_plane_htod_calls}, \
1115                     launch_metadata_htod_calls={tracked_launch_metadata_htod_calls}"
1116                ),
1117            })?;
1118
1119        if tracked_launch_metadata_htod_bytes != 0 && tracked_launch_metadata_htod_calls == 0 {
1120            return Err(XlogError::UnsupportedEpistemicConstruct {
1121                construct: "epistemic GPU transfer budget".to_string(),
1122                context: format!(
1123                    "launch metadata H2D bytes require matching H2D calls, got bytes={} calls=0",
1124                    tracked_launch_metadata_htod_bytes
1125                ),
1126            });
1127        }
1128        if tracked_launch_metadata_htod_calls != 0 && tracked_launch_metadata_htod_bytes == 0 {
1129            return Err(XlogError::UnsupportedEpistemicConstruct {
1130                construct: "epistemic GPU transfer budget".to_string(),
1131                context: format!(
1132                    "launch metadata H2D calls require matching payload bytes, got calls={} bytes=0",
1133                    tracked_launch_metadata_htod_calls
1134                ),
1135            });
1136        }
1137
1138        if tracked_dtoh_bytes != 0
1139            || tracked_data_plane_htod_bytes != 0
1140            || tracked_dtoh_calls != 0
1141            || tracked_data_plane_htod_calls != 0
1142        {
1143            return Err(XlogError::UnsupportedEpistemicConstruct {
1144                construct: "epistemic GPU transfer budget".to_string(),
1145                context: format!(
1146                    "tracked host transfer in GPU hot path: tracked data-plane host transfer: \
1147                     dtoh_bytes={tracked_dtoh_bytes}, \
1148                     data_plane_htod_bytes={tracked_data_plane_htod_bytes}, \
1149                     dtoh_calls={tracked_dtoh_calls}, \
1150                     data_plane_htod_calls={tracked_data_plane_htod_calls}, \
1151                     launch_metadata_htod_bytes={tracked_launch_metadata_htod_bytes}, \
1152                     launch_metadata_htod_calls={tracked_launch_metadata_htod_calls}"
1153                ),
1154            });
1155        }
1156
1157        Ok(Self {
1158            candidate_count,
1159            tracked_dtoh_bytes,
1160            tracked_htod_bytes: tracked_data_plane_htod_bytes,
1161            tracked_dtoh_calls,
1162            tracked_htod_calls: tracked_data_plane_htod_calls,
1163            tracked_aggregate_htod_bytes,
1164            tracked_aggregate_htod_calls,
1165            tracked_launch_metadata_htod_bytes,
1166            tracked_launch_metadata_htod_calls,
1167            tracked_data_plane_htod_bytes,
1168            tracked_data_plane_htod_calls,
1169            per_candidate_host_round_trips: 0,
1170        })
1171    }
1172}
1173
1174impl EpistemicGpuFinalResultTransferTrace {
1175    /// Account for the final device-resident output after the hot-path budget window closes.
1176    pub fn from_final_output(
1177        provider: &xlog_cuda::CudaKernelProvider,
1178        final_output: &CudaBuffer,
1179    ) -> Result<Self> {
1180        let row_count_was_cached = final_output.cached_row_count().is_some();
1181        let final_output_rows = provider.device_row_count(final_output)?;
1182        let final_output_column_count = final_output.arity();
1183        let final_output_row_width_bytes = final_output.schema().row_size_bytes();
1184        let final_output_payload_bytes =
1185            checked_product(final_output_rows, final_output_row_width_bytes)? as u64;
1186
1187        Ok(Self {
1188            final_output_rows,
1189            final_output_column_count,
1190            final_output_row_width_bytes,
1191            final_output_payload_bytes,
1192            row_count_device_reads: u32::from(!row_count_was_cached),
1193            tracked_data_plane_dtoh_calls: 0,
1194            tracked_data_plane_dtoh_bytes: 0,
1195        })
1196    }
1197
1198    /// Require retained final-result transfer accounting to match the final device buffer.
1199    pub fn require_matches_final_output(
1200        &self,
1201        construct: &str,
1202        final_output: &CudaBuffer,
1203    ) -> Result<()> {
1204        let Some(cached_rows) = final_output.cached_row_count() else {
1205            return Err(XlogError::UnsupportedEpistemicConstruct {
1206                construct: construct.to_string(),
1207                context:
1208                    "final-result transfer certification requires cached device final row count"
1209                        .to_string(),
1210            });
1211        };
1212        let logical_rows =
1213            validate_logical_row_count(final_output.num_rows(), cached_rows as usize).map_err(
1214                |err| XlogError::UnsupportedEpistemicConstruct {
1215                    construct: construct.to_string(),
1216                    context: format!("invalid final-output logical row count: {err}"),
1217                },
1218            )?;
1219        let row_width = final_output.schema().row_size_bytes();
1220        let payload_bytes = checked_product(logical_rows, row_width)? as u64;
1221        if self.final_output_rows != logical_rows
1222            || self.final_output_column_count != final_output.arity()
1223            || self.final_output_row_width_bytes != row_width
1224            || self.final_output_payload_bytes != payload_bytes
1225        {
1226            return Err(XlogError::UnsupportedEpistemicConstruct {
1227                construct: construct.to_string(),
1228                context: format!(
1229                    "final-result transfer trace does not match final device output: rows={}/{} \
1230                     columns={}/{} row_width={}/{} payload_bytes={}/{}",
1231                    self.final_output_rows,
1232                    logical_rows,
1233                    self.final_output_column_count,
1234                    final_output.arity(),
1235                    self.final_output_row_width_bytes,
1236                    row_width,
1237                    self.final_output_payload_bytes,
1238                    payload_bytes
1239                ),
1240            });
1241        }
1242        if self.row_count_device_reads > 1 {
1243            return Err(XlogError::UnsupportedEpistemicConstruct {
1244                construct: construct.to_string(),
1245                context: format!(
1246                    "final-result transfer reads one device row-count scalar at most, got {}",
1247                    self.row_count_device_reads
1248                ),
1249            });
1250        }
1251
1252        Ok(())
1253    }
1254}
1255
1256impl EpistemicGpuSemanticTrace {
1257    /// Require semantic phase counts to match the retained GPU execution traces.
1258    pub fn require_matches_execution_traces(
1259        &self,
1260        construct: &str,
1261        candidate_generation: &EpistemicGpuCandidateGenerationTrace,
1262        propagation: &EpistemicGpuPropagationTrace,
1263        model_membership: &EpistemicGpuModelMembershipTrace,
1264        world_view_validation: &EpistemicGpuWorldViewValidationTrace,
1265    ) -> Result<()> {
1266        let expected_pruned = self
1267            .generated_candidates
1268            .checked_sub(propagation.propagated_candidates)
1269            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
1270                construct: construct.to_string(),
1271                context: format!(
1272                    "semantic trace phase counts cannot propagate more candidates than were \
1273                     generated: generated={} propagated={}",
1274                    self.generated_candidates, propagation.propagated_candidates
1275                ),
1276            })?;
1277        let expected_reduced_model_slots = checked_product(
1278            checked_product(
1279                world_view_validation.candidates_checked,
1280                model_membership.reduction_count,
1281            )?,
1282            model_membership.models_per_reduction,
1283        )?;
1284        let expected_guesses = checked_product(
1285            candidate_generation.generated_candidates,
1286            candidate_generation.literal_count,
1287        )?;
1288        if self.generated_candidates != candidate_generation.generated_candidates
1289            || self.guesses != expected_guesses
1290            || self.propagated_candidates != propagation.propagated_candidates
1291            || self.pruned_candidates != expected_pruned
1292            || self.tested_candidates != world_view_validation.candidates_checked
1293            || self.reduced_model_slots_checked != expected_reduced_model_slots
1294        {
1295            return Err(XlogError::UnsupportedEpistemicConstruct {
1296                construct: construct.to_string(),
1297                context: format!(
1298                    "semantic trace phase counts must match retained GPU execution traces, got \
1299                     generated={} expected_generated={} guesses={} expected_guesses={} \
1300                     propagated={} expected_propagated={} pruned={} expected_pruned={} \
1301                     tested={} expected_tested={} reduced_model_slots={} \
1302                     expected_reduced_model_slots={}",
1303                    self.generated_candidates,
1304                    candidate_generation.generated_candidates,
1305                    self.guesses,
1306                    expected_guesses,
1307                    self.propagated_candidates,
1308                    propagation.propagated_candidates,
1309                    self.pruned_candidates,
1310                    expected_pruned,
1311                    self.tested_candidates,
1312                    world_view_validation.candidates_checked,
1313                    self.reduced_model_slots_checked,
1314                    expected_reduced_model_slots
1315                ),
1316            });
1317        }
1318
1319        Ok(())
1320    }
1321
1322    /// Require bounded rejection-buffer metadata accounting to match generated candidates.
1323    pub fn require_rejection_metadata_accounting(&self, construct: &str) -> Result<()> {
1324        let expected_metadata_bytes =
1325            checked_product(self.generated_candidates, std::mem::size_of::<u32>())? as u64;
1326        if self.rejection_reason_device_reads != 1
1327            || self.rejection_reason_metadata_bytes != expected_metadata_bytes
1328        {
1329            return Err(XlogError::UnsupportedEpistemicConstruct {
1330                construct: construct.to_string(),
1331                context: format!(
1332                    "semantic trace rejection metadata accounting must match the bounded device \
1333                     rejection-buffer read, got reads={} bytes={} expected_reads=1 \
1334                     expected_bytes={}",
1335                    self.rejection_reason_device_reads,
1336                    self.rejection_reason_metadata_bytes,
1337                    expected_metadata_bytes
1338                ),
1339            });
1340        }
1341
1342        Ok(())
1343    }
1344
1345    /// Require accepted/rejected candidate indices to partition generated candidates.
1346    pub fn require_candidate_index_partition(&self, construct: &str) -> Result<()> {
1347        let accounted_candidates = self.accepted_candidates.checked_add(self.rejected_candidates).ok_or_else(|| {
1348            XlogError::UnsupportedEpistemicConstruct {
1349                construct: construct.to_string(),
1350                context: format!(
1351                    "semantic trace candidate index partition accounting overflowed: accepted={} rejected={}",
1352                    self.accepted_candidates, self.rejected_candidates
1353                ),
1354            }
1355        })?;
1356        if self.accepted_candidate_indices.len() != self.accepted_candidates
1357            || self.rejected_candidate_indices.len() != self.rejected_candidates
1358            || self.accepted_world_views != self.accepted_candidates
1359            || accounted_candidates != self.generated_candidates
1360        {
1361            return Err(XlogError::UnsupportedEpistemicConstruct {
1362                construct: construct.to_string(),
1363                context: format!(
1364                    "semantic trace candidate index partition requires counts and index vectors \
1365                     to match generated candidates, got generated={} accepted={} \
1366                     accepted_indices={} accepted_world_views={} rejected={} rejected_indices={}",
1367                    self.generated_candidates,
1368                    self.accepted_candidates,
1369                    self.accepted_candidate_indices.len(),
1370                    self.accepted_world_views,
1371                    self.rejected_candidates,
1372                    self.rejected_candidate_indices.len()
1373                ),
1374            });
1375        }
1376        if self.rejection_reasons.len() != self.rejected_candidates {
1377            return Err(XlogError::UnsupportedEpistemicConstruct {
1378                construct: construct.to_string(),
1379                context: format!(
1380                    "semantic trace rejection reason count must match rejected candidates, got \
1381                     reasons={} rejected={}",
1382                    self.rejection_reasons.len(),
1383                    self.rejected_candidates
1384                ),
1385            });
1386        }
1387        self.typed_rejection_reasons()?;
1388
1389        let mut seen = BTreeSet::new();
1390        for (kind, indices) in [
1391            ("accepted", self.accepted_candidate_indices.as_slice()),
1392            ("rejected", self.rejected_candidate_indices.as_slice()),
1393        ] {
1394            for &index in indices {
1395                if index >= self.generated_candidates {
1396                    return Err(XlogError::UnsupportedEpistemicConstruct {
1397                        construct: construct.to_string(),
1398                        context: format!(
1399                            "semantic trace candidate index partition has out-of-range {kind} \
1400                             index {index} for generated candidate count {}",
1401                            self.generated_candidates
1402                        ),
1403                    });
1404                }
1405                if !seen.insert(index) {
1406                    return Err(XlogError::UnsupportedEpistemicConstruct {
1407                        construct: construct.to_string(),
1408                        context: format!(
1409                            "semantic trace candidate index partition contains duplicate \
1410                             candidate index {index}"
1411                        ),
1412                    });
1413                }
1414            }
1415        }
1416        if seen.len() != self.generated_candidates {
1417            return Err(XlogError::UnsupportedEpistemicConstruct {
1418                construct: construct.to_string(),
1419                context: format!(
1420                    "semantic trace candidate index partition covers {} of {} generated \
1421                     candidates",
1422                    seen.len(),
1423                    self.generated_candidates
1424                ),
1425            });
1426        }
1427
1428        Ok(())
1429    }
1430
1431    /// Decode nonzero device rejection codes into typed GPU semantic reasons.
1432    pub fn typed_rejection_reasons(&self) -> Result<Vec<EpistemicGpuRejectionReason>> {
1433        self.rejection_reasons
1434            .iter()
1435            .copied()
1436            .map(EpistemicGpuRejectionReason::from_code)
1437            .collect()
1438    }
1439
1440    /// Summarize accepted/rejected candidates from the device rejection buffer.
1441    pub fn from_device_rejection_reasons(
1442        provider: &xlog_cuda::CudaKernelProvider,
1443        workspace: &EpistemicGpuWorkspace,
1444        candidate_generation: &EpistemicGpuCandidateGenerationTrace,
1445        propagation: &EpistemicGpuPropagationTrace,
1446        model_membership: &EpistemicGpuModelMembershipTrace,
1447        world_view_validation: &EpistemicGpuWorldViewValidationTrace,
1448    ) -> Result<Self> {
1449        let candidate_count = candidate_generation.generated_candidates;
1450        require_positive(candidate_count, "epistemic GPU semantic-trace candidates")?;
1451        if candidate_count > workspace.layout.rejection_reason_slots {
1452            return Err(XlogError::ResourceExhausted {
1453                context: "epistemic GPU semantic-trace rejection metadata".to_string(),
1454                estimated_bytes: candidate_count as u64,
1455                budget_bytes: workspace.layout.rejection_reason_slots as u64,
1456            });
1457        }
1458        if propagation.literal_count != candidate_generation.literal_count
1459            || model_membership.literal_count != candidate_generation.literal_count
1460            || world_view_validation.literal_count != candidate_generation.literal_count
1461        {
1462            return Err(XlogError::UnsupportedEpistemicConstruct {
1463                construct: "epistemic GPU semantic trace".to_string(),
1464                context: format!(
1465                    "semantic trace requires all GPU stages to agree on literal count, got \
1466                     generated={} propagated={} membership={} validation={}",
1467                    candidate_generation.literal_count,
1468                    propagation.literal_count,
1469                    model_membership.literal_count,
1470                    world_view_validation.literal_count
1471                ),
1472            });
1473        }
1474        let pruned_candidates = candidate_count
1475            .checked_sub(propagation.propagated_candidates)
1476            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
1477                construct: "epistemic GPU semantic trace".to_string(),
1478                context: format!(
1479                    "semantic trace cannot prune more candidates than were generated: \
1480                     generated={} propagated={}",
1481                    candidate_count, propagation.propagated_candidates
1482                ),
1483            })?;
1484        if propagation.rejection_reason_slots_written < candidate_count {
1485            return Err(XlogError::UnsupportedEpistemicConstruct {
1486                construct: "epistemic GPU semantic trace".to_string(),
1487                context: format!(
1488                    "semantic trace requires rejection metadata for every generated candidate, \
1489                     got generated={} rejection_slots_initialized={}",
1490                    candidate_count, propagation.rejection_reason_slots_written
1491                ),
1492            });
1493        }
1494        if model_membership.candidates_checked != candidate_count
1495            || world_view_validation.candidates_checked != candidate_count
1496        {
1497            return Err(XlogError::UnsupportedEpistemicConstruct {
1498                construct: "epistemic GPU semantic trace".to_string(),
1499                context: format!(
1500                    "semantic trace requires GPU validation coverage for every generated \
1501                     candidate, got generated={} membership_checked={} validation_checked={}",
1502                    candidate_count,
1503                    model_membership.candidates_checked,
1504                    world_view_validation.candidates_checked
1505                ),
1506            });
1507        }
1508        if model_membership.reduction_count != world_view_validation.reduction_count
1509            || model_membership.models_per_reduction != world_view_validation.models_per_reduction
1510        {
1511            return Err(XlogError::UnsupportedEpistemicConstruct {
1512                construct: "epistemic GPU semantic trace".to_string(),
1513                context: format!(
1514                    "semantic trace requires model-membership and world-view validation layouts \
1515                     to match, got membership_reductions={} validation_reductions={} \
1516                     membership_models_per_reduction={} validation_models_per_reduction={}",
1517                    model_membership.reduction_count,
1518                    world_view_validation.reduction_count,
1519                    model_membership.models_per_reduction,
1520                    world_view_validation.models_per_reduction
1521                ),
1522            });
1523        }
1524
1525        let raw_rejection_reasons = provider
1526            .dtoh_small_metadata_untracked(&workspace.rejection_reasons, candidate_count)?;
1527        // Bounded metadata read of the parallel constraint-violation index buffer.
1528        // Like `rejection_reasons`, this is an untracked post-hot-path metadata
1529        // read, not a data-plane transfer.
1530        let raw_constraint_violation_index = provider.dtoh_small_metadata_untracked(
1531            &workspace.constraint_violation_index,
1532            candidate_count,
1533        )?;
1534        let constraint_violation_code =
1535            EpistemicGpuRejectionReason::WorldViewConstraintViolation.code();
1536        let mut accepted_candidate_indices = Vec::new();
1537        let mut rejected_candidate_indices = Vec::new();
1538        let mut rejection_reasons = Vec::new();
1539        let mut constraint_violation_indices: Vec<Option<u32>> = Vec::new();
1540        for (candidate_index, reason) in raw_rejection_reasons.into_iter().enumerate() {
1541            if reason == 0 {
1542                accepted_candidate_indices.push(candidate_index);
1543            } else {
1544                EpistemicGpuRejectionReason::from_code(reason)?;
1545                rejected_candidate_indices.push(candidate_index);
1546                rejection_reasons.push(reason);
1547                // Gate the constraint-specific index on the integrity-constraint
1548                // reason code: the kernel writes `rejection_reasons[c] = 6` and
1549                // `constraint_violation_index[c] = constraint` together, so the
1550                // index is trustworthy exactly when the reason is 6. Any other
1551                // reason -> None, independent of buffer contents (also defends
1552                // the zero-constraint path where the sentinel is never written).
1553                let firing = raw_constraint_violation_index
1554                    .get(candidate_index)
1555                    .copied()
1556                    .unwrap_or(u32::MAX);
1557                if reason == constraint_violation_code && firing != u32::MAX {
1558                    constraint_violation_indices.push(Some(firing));
1559                } else {
1560                    constraint_violation_indices.push(None);
1561                }
1562            }
1563        }
1564        let accepted_candidates = accepted_candidate_indices.len();
1565        let rejected_candidates = rejection_reasons.len();
1566        let reduced_model_slots_checked = checked_product(
1567            checked_product(
1568                world_view_validation.candidates_checked,
1569                model_membership.reduction_count,
1570            )?,
1571            model_membership.models_per_reduction,
1572        )?;
1573        let rejection_reason_metadata_bytes =
1574            checked_product(candidate_count, std::mem::size_of::<u32>())? as u64;
1575
1576        Ok(Self {
1577            generated_candidates: candidate_count,
1578            guesses: checked_product(candidate_count, candidate_generation.literal_count)?,
1579            propagated_candidates: propagation.propagated_candidates,
1580            pruned_candidates,
1581            tested_candidates: world_view_validation.candidates_checked,
1582            reduced_model_slots_checked,
1583            accepted_candidates,
1584            accepted_candidate_indices,
1585            accepted_world_views: accepted_candidates,
1586            rejected_candidates,
1587            rejected_candidate_indices,
1588            rejection_reasons,
1589            constraint_violation_indices,
1590            // Counts the bounded metadata read of the rejection-reason code buffer
1591            // specifically (the certification invariant scopes to that buffer's
1592            // bytes). The parallel constraint-violation index buffer is a
1593            // separate bounded metadata read tracked alongside it, not folded
1594            // into this rejection-reason-specific counter.
1595            rejection_reason_device_reads: 1,
1596            rejection_reason_metadata_bytes,
1597            cpu_candidate_enumerations: 0,
1598            cpu_world_view_validations: 0,
1599        })
1600    }
1601}
1602
1603fn transfer_counter_delta(name: &str, before: u64, after: u64) -> Result<u64> {
1604    after
1605        .checked_sub(before)
1606        .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
1607            construct: "epistemic GPU transfer budget".to_string(),
1608            context: format!(
1609                "host transfer counter decreased during GPU hot path: {name} before={before}, \
1610                 after={after}"
1611            ),
1612        })
1613}
1614
1615impl EpistemicGpuModelMembershipTrace {
1616    /// Build a model-membership trace for a bounded device launch.
1617    pub fn for_counts(
1618        literal_count: usize,
1619        candidate_count: usize,
1620        reduction_count: usize,
1621        models_per_reduction: usize,
1622    ) -> Result<Self> {
1623        require_positive(literal_count, "epistemic GPU model-membership literals")?;
1624        require_positive(candidate_count, "epistemic GPU model-membership candidates")?;
1625        require_positive(reduction_count, "epistemic GPU model-membership reductions")?;
1626        require_positive(
1627            models_per_reduction,
1628            "epistemic GPU model-membership models",
1629        )?;
1630        let model_membership_bytes_written = checked_product(
1631            checked_product(
1632                checked_product(candidate_count, reduction_count)?,
1633                models_per_reduction,
1634            )?,
1635            literal_count,
1636        )?;
1637        require_u32_launch_bound(
1638            model_membership_bytes_written,
1639            "epistemic GPU model-membership launch",
1640        )?;
1641
1642        Ok(Self {
1643            literal_count,
1644            candidates_checked: candidate_count,
1645            reduction_count,
1646            models_per_reduction,
1647            model_membership_bytes_written,
1648            output_row_count_device_reads: 1,
1649            tuple_source_row_count_device_reads: 0,
1650            tuple_source_key_column_device_reads: 0,
1651            rejection_reason_slots_checked: candidate_count,
1652            membership_source: EpistemicGpuModelMembershipSource::ReducedOutputRowCountOnly,
1653            kernel_launches: 1,
1654            host_write_ops: 0,
1655            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
1656        })
1657    }
1658
1659    /// Build a model-membership trace backed by reduced stable-model tuple sources.
1660    pub fn for_stable_model_tuple_sources(
1661        literal_count: usize,
1662        candidate_count: usize,
1663        reduction_count: usize,
1664        models_per_reduction: usize,
1665        tuple_source_count: usize,
1666    ) -> Result<Self> {
1667        Self::for_stable_model_tuple_sources_with_key_columns(
1668            literal_count,
1669            candidate_count,
1670            reduction_count,
1671            models_per_reduction,
1672            tuple_source_count,
1673            0,
1674        )
1675    }
1676
1677    /// Build a model-membership trace backed by tuple sources and key columns.
1678    pub fn for_stable_model_tuple_sources_with_key_columns(
1679        literal_count: usize,
1680        candidate_count: usize,
1681        reduction_count: usize,
1682        models_per_reduction: usize,
1683        tuple_source_count: usize,
1684        tuple_source_key_column_count: usize,
1685    ) -> Result<Self> {
1686        require_positive(literal_count, "epistemic GPU model-membership literals")?;
1687        require_positive(candidate_count, "epistemic GPU model-membership candidates")?;
1688        require_positive(reduction_count, "epistemic GPU model-membership reductions")?;
1689        require_positive(
1690            models_per_reduction,
1691            "epistemic GPU model-membership models",
1692        )?;
1693        require_positive(
1694            tuple_source_count,
1695            "epistemic GPU model-membership tuple sources",
1696        )?;
1697        if tuple_source_count > u32::MAX as usize {
1698            return Err(XlogError::ResourceExhausted {
1699                context: "epistemic GPU model-membership tuple sources".to_string(),
1700                estimated_bytes: tuple_source_count as u64,
1701                budget_bytes: u32::MAX as u64,
1702            });
1703        }
1704        if tuple_source_key_column_count > u32::MAX as usize {
1705            return Err(XlogError::ResourceExhausted {
1706                context: "epistemic GPU model-membership tuple key columns".to_string(),
1707                estimated_bytes: tuple_source_key_column_count as u64,
1708                budget_bytes: u32::MAX as u64,
1709            });
1710        }
1711        let model_membership_bytes_written = checked_product(
1712            checked_product(
1713                checked_product(candidate_count, reduction_count)?,
1714                models_per_reduction,
1715            )?,
1716            literal_count,
1717        )?;
1718        require_u32_launch_bound(
1719            model_membership_bytes_written,
1720            "epistemic GPU model-membership launch",
1721        )?;
1722
1723        Ok(Self {
1724            literal_count,
1725            candidates_checked: candidate_count,
1726            reduction_count,
1727            models_per_reduction,
1728            model_membership_bytes_written,
1729            output_row_count_device_reads: 0,
1730            tuple_source_row_count_device_reads: tuple_source_count as u32,
1731            tuple_source_key_column_device_reads: tuple_source_key_column_count as u32,
1732            rejection_reason_slots_checked: candidate_count,
1733            membership_source: EpistemicGpuModelMembershipSource::StableModelTupleBuffer,
1734            kernel_launches: tuple_source_count as u32,
1735            host_write_ops: 0,
1736            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
1737        })
1738    }
1739
1740    /// Attach CUDA-event timing captured by the runtime launch path.
1741    pub const fn with_kernel_timing(
1742        mut self,
1743        kernel_timing: EpistemicGpuKernelTimingTrace,
1744    ) -> Self {
1745        self.kernel_timing = kernel_timing;
1746        self
1747    }
1748
1749    /// Require semantic stable-model tuple membership before accepting execution.
1750    pub fn require_stable_model_tuple_source(&self) -> Result<()> {
1751        if self.membership_source != EpistemicGpuModelMembershipSource::StableModelTupleBuffer {
1752            return Err(XlogError::UnsupportedEpistemicConstruct {
1753                construct: "epistemic GPU stable-model membership certification".to_string(),
1754                context: format!(
1755                    "model-membership source {:?} is bounded staging only; actual reduced \
1756                     stable-model tuple membership is required before returning accepted \
1757                     epistemic execution",
1758                    self.membership_source
1759                ),
1760            });
1761        }
1762
1763        Ok(())
1764    }
1765
1766    /// Require the tuple-key device reads planned for this model-membership trace.
1767    pub fn require_planned_tuple_key_column_reads(
1768        &self,
1769        expected_key_column_reads: usize,
1770    ) -> Result<()> {
1771        if self.tuple_source_key_column_device_reads as usize != expected_key_column_reads {
1772            return Err(XlogError::UnsupportedEpistemicConstruct {
1773                construct: "epistemic GPU stable-model membership certification".to_string(),
1774                context: format!(
1775                    "model-membership tuple-key device column reads must match the planned \
1776                     nonzero-arity tuple keys, got reads={} expected={}",
1777                    self.tuple_source_key_column_device_reads, expected_key_column_reads
1778                ),
1779            });
1780        }
1781
1782        Ok(())
1783    }
1784}
1785
1786impl EpistemicGpuWorldViewValidationTrace {
1787    /// Build a world-view validation trace for a bounded device launch.
1788    pub fn for_counts(
1789        literal_count: usize,
1790        candidate_count: usize,
1791        reduction_count: usize,
1792        models_per_reduction: usize,
1793    ) -> Result<Self> {
1794        require_positive(
1795            literal_count,
1796            "epistemic GPU world-view validation literals",
1797        )?;
1798        require_positive(
1799            candidate_count,
1800            "epistemic GPU world-view validation candidates",
1801        )?;
1802        require_positive(
1803            reduction_count,
1804            "epistemic GPU world-view validation reductions",
1805        )?;
1806        require_positive(
1807            models_per_reduction,
1808            "epistemic GPU world-view validation models",
1809        )?;
1810        let model_membership_bytes_checked = checked_product(
1811            checked_product(
1812                checked_product(candidate_count, reduction_count)?,
1813                models_per_reduction,
1814            )?,
1815            literal_count,
1816        )?;
1817        require_u32_launch_bound(
1818            model_membership_bytes_checked,
1819            "epistemic GPU world-view validation membership launch",
1820        )?;
1821
1822        Ok(Self {
1823            literal_count,
1824            candidates_checked: candidate_count,
1825            reduction_count,
1826            models_per_reduction,
1827            model_membership_bytes_checked,
1828            world_view_slots_checked: candidate_count,
1829            rejection_reason_slots_written: candidate_count,
1830            kernel_launches: 1,
1831            host_write_ops: 0,
1832            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
1833        })
1834    }
1835
1836    /// Attach CUDA-event timing captured by the runtime launch path.
1837    pub const fn with_kernel_timing(
1838        mut self,
1839        kernel_timing: EpistemicGpuKernelTimingTrace,
1840    ) -> Self {
1841        self.kernel_timing = kernel_timing;
1842        self
1843    }
1844}
1845
1846impl EpistemicGpuPropagationTrace {
1847    /// Build a propagation trace for a bounded device launch.
1848    pub fn for_counts(literal_count: usize, candidate_count: usize) -> Result<Self> {
1849        require_positive(literal_count, "epistemic GPU propagation literals")?;
1850        require_positive(candidate_count, "epistemic GPU propagation candidates")?;
1851        require_u32_launch_dimensions(
1852            &[literal_count, candidate_count],
1853            "epistemic GPU propagation launch",
1854        )?;
1855
1856        Ok(Self {
1857            literal_count,
1858            propagated_candidates: candidate_count,
1859            world_view_bytes_written: candidate_count,
1860            rejection_reason_slots_written: candidate_count,
1861            kernel_launches: 1,
1862            host_write_ops: 0,
1863            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
1864        })
1865    }
1866
1867    /// Attach CUDA-event timing captured by the runtime launch path.
1868    pub const fn with_kernel_timing(
1869        mut self,
1870        kernel_timing: EpistemicGpuKernelTimingTrace,
1871    ) -> Self {
1872        self.kernel_timing = kernel_timing;
1873        self
1874    }
1875}
1876
1877impl EpistemicGpuWorkspaceResetTrace {
1878    /// Build the reset trace implied by a workspace layout.
1879    pub fn for_layout(layout: EpistemicGpuWorkspaceLayout) -> Self {
1880        Self::try_for_layout(layout)
1881            .expect("epistemic GPU workspace reset trace byte total overflowed")
1882    }
1883
1884    /// Build the reset trace implied by a workspace layout, failing closed on overflow.
1885    pub fn try_for_layout(layout: EpistemicGpuWorkspaceLayout) -> Result<Self> {
1886        Ok(Self {
1887            candidate_assumption_bytes: layout.candidate_assumption_bytes,
1888            world_view_bytes: layout.world_view_bytes,
1889            model_membership_bytes: layout.model_membership_bytes,
1890            rejection_reason_bytes: checked_product(
1891                layout.rejection_reason_slots,
1892                std::mem::size_of::<u32>(),
1893            )?,
1894            device_zero_ops: 4,
1895            host_write_ops: 0,
1896        })
1897    }
1898
1899    /// Total bytes zeroed by the reset path.
1900    pub fn total_zeroed_bytes(&self) -> usize {
1901        self.try_total_zeroed_bytes()
1902            .expect("epistemic GPU workspace reset byte total overflowed")
1903    }
1904
1905    /// Checked total bytes zeroed by the reset path.
1906    pub fn try_total_zeroed_bytes(&self) -> Result<usize> {
1907        checked_sum(
1908            checked_sum(
1909                checked_sum(self.candidate_assumption_bytes, self.world_view_bytes)?,
1910                self.model_membership_bytes,
1911            )?,
1912            self.rejection_reason_bytes,
1913        )
1914    }
1915
1916    /// Require the retained reset trace to match the prepared workspace layout.
1917    pub fn require_matches_layout(
1918        &self,
1919        construct: &str,
1920        layout: EpistemicGpuWorkspaceLayout,
1921    ) -> Result<()> {
1922        let expected = Self::try_for_layout(layout)?;
1923        if *self != expected {
1924            return Err(XlogError::UnsupportedEpistemicConstruct {
1925                construct: construct.to_string(),
1926                context: format!(
1927                    "workspace reset trace does not match prepared GPU workspace layout: \
1928                     candidate_bytes={}/{} world_view_bytes={}/{} model_membership_bytes={}/{} \
1929                     rejection_reason_bytes={}/{} device_zero_ops={}/{} host_write_ops={}/{}",
1930                    self.candidate_assumption_bytes,
1931                    expected.candidate_assumption_bytes,
1932                    self.world_view_bytes,
1933                    expected.world_view_bytes,
1934                    self.model_membership_bytes,
1935                    expected.model_membership_bytes,
1936                    self.rejection_reason_bytes,
1937                    expected.rejection_reason_bytes,
1938                    self.device_zero_ops,
1939                    expected.device_zero_ops,
1940                    self.host_write_ops,
1941                    expected.host_write_ops
1942                ),
1943            });
1944        }
1945
1946        Ok(())
1947    }
1948}
1949
1950/// Runtime preflight summary for an epistemic executable plan.
1951#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1952pub struct EpistemicGpuRuntimePreflight {
1953    /// Selected epistemic semantics mode for the accepted GPU execution.
1954    pub epistemic_mode: EirEpistemicMode,
1955    /// GPU workspace layout required by the executable plan.
1956    pub workspace_layout: EpistemicGpuWorkspaceLayout,
1957    /// Compiled reduced-runtime rule count.
1958    pub reduced_runtime_rule_count: usize,
1959    /// Compiler-generated reduced integrity-constraint relations to validate.
1960    pub reduced_constraint_relation_count: usize,
1961    /// Reduced rules that the epistemic planner marked as requiring WCOJ eligibility.
1962    pub wcoj_required_reduction_count: usize,
1963    /// Number of reduced rules carrying a `MultiWayJoin` route.
1964    pub multiway_reduction_count: usize,
1965    /// Number of K-clique WCOJ plans reused from the production planner.
1966    pub kclique_wcoj_plan_count: usize,
1967    /// Number of triangle WCOJ routes reused from the production runtime.
1968    pub wcoj_triangle_route_count: usize,
1969    /// Number of 4-cycle WCOJ routes reused from the production runtime.
1970    pub wcoj_4cycle_route_count: usize,
1971    /// Generic Free Join multiway routes: `plan: None` nodes
1972    /// matching no dedicated kernel shape. Opportunistic by contract —
1973    /// the dispatcher may structurally decline to the embedded binary
1974    /// fallback (non-prefix bound columns, non-u32 inputs, …), so these
1975    /// routes carry no hard dispatch obligation and are excluded from
1976    /// the dedicated-WCOJ certification arithmetic.
1977    pub free_join_route_count: usize,
1978    /// K-clique WCOJ plan counts by arity K=5..8.
1979    pub kclique_wcoj_plan_count_by_arity: [usize; 4],
1980    /// Maximum K-clique arity observed across production WCOJ plans.
1981    pub kclique_wcoj_max_arity: u8,
1982    /// Live edge-permutation slots carried by production K-clique plans.
1983    pub kclique_wcoj_edge_permutation_count: usize,
1984    /// Distinct K-clique stream groups carried by production WCOJ plans.
1985    pub kclique_stream_group_count: usize,
1986    /// K-clique WCOJ plans carrying helper-split skew scheduling metadata.
1987    pub kclique_skew_scheduled_plan_count: usize,
1988    /// Number of structured planned-hash routes.
1989    pub planned_hash_route_count: usize,
1990    /// Planned-hash routes where complete planner costs predicted hash wins.
1991    pub planned_hash_planner_wins_count: usize,
1992    /// Planned-hash routes selected because complete WCOJ stats were unavailable.
1993    pub planned_hash_incomplete_stats_count: usize,
1994    /// Planned-hash routes carrying finite hash-vs-WCOJ cost evidence.
1995    pub planned_hash_cost_evidence_count: usize,
1996    /// Sorted-layout edge-slot requirements carried by WCOJ plans.
1997    pub sorted_layout_requirement_count: usize,
1998    /// Helper-splitting specs carried by WCOJ plans.
1999    pub helper_split_spec_count: usize,
2000    /// Compiler-created helper-split relation rules in the reduced runtime plan.
2001    pub helper_relation_rule_count: usize,
2002    /// WCOJ input scans of compiler-created helper-split relations.
2003    pub helper_relation_scan_count: usize,
2004    /// Tuple-membership bindings certified for stable-model membership checks.
2005    pub tuple_membership_binding_count: usize,
2006    /// Solver assumption bindings exported by the semantic plan.
2007    pub solver_assumption_binding_count: usize,
2008    /// Solver production capabilities required by the semantic plan.
2009    pub solver_required_capability_count: usize,
2010    /// Distinct solver statuses required by the semantic plan.
2011    pub solver_required_status_count: usize,
2012    /// Non-negated `know` operators represented by the executable GPU plan.
2013    pub know_operator_count: usize,
2014    /// Non-negated `possible` operators represented by the executable GPU plan.
2015    pub possible_operator_count: usize,
2016    /// Negated `know` operators represented as `not know`.
2017    pub not_know_operator_count: usize,
2018    /// Negated `possible` operators represented as `not possible`.
2019    pub not_possible_operator_count: usize,
2020    /// Forbidden CPU fallback counters copied from the GPU semantic contract.
2021    pub cpu_fallbacks: EpistemicCpuFallbackCounters,
2022}
2023
2024impl EpistemicGpuRuntimePreflight {
2025    /// Whether this accepted execution used Gelfond-1991 (G91) compatibility semantics.
2026    pub fn is_g91_mode(&self) -> bool {
2027        matches!(self.epistemic_mode, EirEpistemicMode::G91)
2028    }
2029
2030    /// Whether this accepted execution used default FAEEL semantics.
2031    pub fn is_faeel_mode(&self) -> bool {
2032        matches!(self.epistemic_mode, EirEpistemicMode::Faeel)
2033    }
2034
2035    /// Inspect an executable epistemic plan before GPU kernel dispatch.
2036    pub fn for_executable_plan(
2037        executable: &EpistemicExecutablePlan,
2038        capacities: EpistemicGpuWorkspaceCapacities,
2039    ) -> Result<Self> {
2040        if !executable.gpu_plan.cpu_fallbacks.is_zero() {
2041            return Err(XlogError::UnsupportedEpistemicConstruct {
2042                construct: "epistemic GPU runtime preflight".to_string(),
2043                context: "nonzero CPU fallback counters".to_string(),
2044            });
2045        }
2046        executable.gpu_plan.validate_tuple_membership_bindings()?;
2047        executable.gpu_plan.validate_solver_contract()?;
2048        // A plan may carry MULTIPLE epistemic output heads: a JOINT-SOLVED
2049        // coalesced multi-head component shares ONE candidate enumeration +
2050        // world-view validation and materializes each head against the shared
2051        // accepted world view (see `execute_epistemic_gpu_execution`). Soundness of
2052        // the coupling is gated in the logic lowering
2053        // (`classify_cross_component_modal_coupling`); the runtime executes the
2054        // resulting well-formed plan and is no longer restricted to one head.
2055        require_epistemic_gpu_kernel_phases(&executable.gpu_plan)?;
2056        require_epistemic_gpu_buffer_contract(&executable.gpu_plan)?;
2057
2058        let workspace_layout =
2059            EpistemicGpuWorkspaceLayout::for_plan(&executable.gpu_plan, capacities)?;
2060        let mut routes = RuntimeRouteSummary::default();
2061        let mut reduced_runtime_rule_count = 0usize;
2062        let mut reduced_constraint_relation_names = Vec::new();
2063        let wcoj_required_reduction_count = executable
2064            .gpu_plan
2065            .reductions
2066            .iter()
2067            .filter(|reduction| {
2068                matches!(
2069                    reduction.wcoj_status,
2070                    EpistemicWcojReductionStatus::RequiresPlannerEligibility
2071                )
2072            })
2073            .count();
2074        let helper_relation_ids = helper_relation_ids(executable);
2075        let mut helper_relation_rule_count = 0usize;
2076        let mut helper_relation_scan_count = 0usize;
2077
2078        for rule in executable
2079            .reduced_runtime_plan
2080            .rules_by_scc
2081            .iter()
2082            .flatten()
2083        {
2084            reduced_runtime_rule_count += 1;
2085            if rule.head.starts_with(XLOG_CONSTRAINT_RELATION_PREFIX)
2086                && !reduced_constraint_relation_names
2087                    .iter()
2088                    .any(|name| name == &rule.head)
2089            {
2090                reduced_constraint_relation_names.push(rule.head.as_str());
2091            }
2092            if rule.head.starts_with("__kclique_helper_") {
2093                helper_relation_rule_count += 1;
2094            }
2095            helper_relation_scan_count +=
2096                count_helper_relation_scans(&rule.body, &helper_relation_ids);
2097            summarize_runtime_routes(&rule.body, &mut routes);
2098        }
2099
2100        if wcoj_required_reduction_count > routes.multiway_reduction_count {
2101            return Err(XlogError::UnsupportedEpistemicConstruct {
2102                construct: "epistemic GPU WCOJ route certification".to_string(),
2103                context: format!(
2104                    "plan requires {} WCOJ-eligible epistemic reductions, but reduced runtime \
2105                     plan exposes {} MultiWayJoin routes",
2106                    wcoj_required_reduction_count, routes.multiway_reduction_count
2107                ),
2108            });
2109        }
2110
2111        let planned_hash_reason_count = routes
2112            .planned_hash_planner_wins_count
2113            .checked_add(routes.planned_hash_incomplete_stats_count)
2114            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
2115                construct: "epistemic GPU planned-hash certification".to_string(),
2116                context: "planned-hash reason counters overflowed".to_string(),
2117            })?;
2118        if planned_hash_reason_count != routes.planned_hash_route_count
2119            || routes.planned_hash_cost_evidence_count < routes.planned_hash_planner_wins_count
2120        {
2121            return Err(XlogError::UnsupportedEpistemicConstruct {
2122                construct: "epistemic GPU planned-hash certification".to_string(),
2123                context: format!(
2124                    "planned_hash_routes={}, planner_wins={}, incomplete_stats={}, \
2125                     finite_cost_evidence={}",
2126                    routes.planned_hash_route_count,
2127                    routes.planned_hash_planner_wins_count,
2128                    routes.planned_hash_incomplete_stats_count,
2129                    routes.planned_hash_cost_evidence_count
2130                ),
2131            });
2132        }
2133
2134        if routes.kclique_wcoj_plan_count > 0 && routes.kclique_wcoj_edge_permutation_count == 0 {
2135            return Err(XlogError::UnsupportedEpistemicConstruct {
2136                construct: "epistemic GPU K-clique WCOJ certification".to_string(),
2137                context: format!(
2138                    "K-clique WCOJ plans require live edge-permutation slots, got \
2139                     kclique_plans={} edge_permutation_slots=0",
2140                    routes.kclique_wcoj_plan_count
2141                ),
2142            });
2143        }
2144
2145        if routes.helper_split_spec_count > 0
2146            && (helper_relation_rule_count < routes.helper_split_spec_count
2147                || helper_relation_scan_count < routes.helper_split_spec_count)
2148        {
2149            return Err(XlogError::UnsupportedEpistemicConstruct {
2150                construct: "epistemic GPU helper-split certification".to_string(),
2151                context: format!(
2152                    "helper_split_specs={}, helper_relation_rules={}, \
2153                     helper_relation_scans={}",
2154                    routes.helper_split_spec_count,
2155                    helper_relation_rule_count,
2156                    helper_relation_scan_count
2157                ),
2158            });
2159        }
2160
2161        let mut know_operator_count = 0usize;
2162        let mut possible_operator_count = 0usize;
2163        let mut not_know_operator_count = 0usize;
2164        let mut not_possible_operator_count = 0usize;
2165        for literal in &executable.gpu_plan.epistemic_literals {
2166            match (literal.op, literal.negated) {
2167                (EirEpistemicOp::Know, false) => know_operator_count += 1,
2168                (EirEpistemicOp::Possible, false) => possible_operator_count += 1,
2169                (EirEpistemicOp::Know, true) => not_know_operator_count += 1,
2170                (EirEpistemicOp::Possible, true) => not_possible_operator_count += 1,
2171            }
2172        }
2173
2174        Ok(Self {
2175            epistemic_mode: executable.gpu_plan.mode,
2176            workspace_layout,
2177            reduced_runtime_rule_count,
2178            reduced_constraint_relation_count: reduced_constraint_relation_names.len(),
2179            wcoj_required_reduction_count,
2180            multiway_reduction_count: routes.multiway_reduction_count,
2181            kclique_wcoj_plan_count: routes.kclique_wcoj_plan_count,
2182            wcoj_triangle_route_count: routes.wcoj_triangle_route_count,
2183            wcoj_4cycle_route_count: routes.wcoj_4cycle_route_count,
2184            free_join_route_count: routes.free_join_route_count,
2185            kclique_wcoj_plan_count_by_arity: routes.kclique_wcoj_plan_count_by_arity,
2186            kclique_wcoj_max_arity: routes.kclique_wcoj_max_arity,
2187            kclique_wcoj_edge_permutation_count: routes.kclique_wcoj_edge_permutation_count,
2188            kclique_stream_group_count: routes.kclique_stream_groups.len(),
2189            kclique_skew_scheduled_plan_count: routes.kclique_skew_scheduled_plan_count,
2190            planned_hash_route_count: routes.planned_hash_route_count,
2191            planned_hash_planner_wins_count: routes.planned_hash_planner_wins_count,
2192            planned_hash_incomplete_stats_count: routes.planned_hash_incomplete_stats_count,
2193            planned_hash_cost_evidence_count: routes.planned_hash_cost_evidence_count,
2194            sorted_layout_requirement_count: routes.sorted_layout_requirement_count,
2195            helper_split_spec_count: routes.helper_split_spec_count,
2196            helper_relation_rule_count,
2197            helper_relation_scan_count,
2198            tuple_membership_binding_count: executable.gpu_plan.tuple_membership_bindings.len(),
2199            solver_assumption_binding_count: executable
2200                .gpu_plan
2201                .solver_contract
2202                .assumption_bindings
2203                .len(),
2204            solver_required_capability_count: executable
2205                .gpu_plan
2206                .solver_contract
2207                .distinct_required_capability_count(),
2208            solver_required_status_count: executable
2209                .gpu_plan
2210                .solver_contract
2211                .distinct_required_status_count(),
2212            know_operator_count,
2213            possible_operator_count,
2214            not_know_operator_count,
2215            not_possible_operator_count,
2216            cpu_fallbacks: executable.gpu_plan.cpu_fallbacks,
2217        })
2218    }
2219}
2220
2221/// Prepared runtime state for epistemic GPU execution.
2222pub struct EpistemicGpuPreparedExecution {
2223    /// Static preflight summary.
2224    pub preflight: EpistemicGpuRuntimePreflight,
2225    /// Planned tuple-membership bindings certified before GPU execution.
2226    pub tuple_membership_bindings: Vec<EpistemicTupleMembershipBinding>,
2227    /// Device-resident workspace buffers.
2228    pub workspace: EpistemicGpuWorkspace,
2229    /// Device-side initialization trace for the workspace buffers.
2230    pub workspace_reset: EpistemicGpuWorkspaceResetTrace,
2231}
2232
2233/// Counter trace captured around a reduced production runtime dispatch.
2234#[derive(Debug, Clone, Copy, PartialEq, Eq)]
2235pub struct EpistemicGpuRuntimeTrace {
2236    /// Static preflight summary for the executed plan.
2237    pub preflight: EpistemicGpuRuntimePreflight,
2238    /// Runtime counters before dispatch.
2239    pub counters_before: EpistemicGpuRuntimeCounters,
2240    /// Runtime counters after dispatch.
2241    pub counters_after: EpistemicGpuRuntimeCounters,
2242    /// Checked counter delta for the dispatch window.
2243    pub counter_delta: EpistemicGpuRuntimeCounters,
2244    /// WCOJ certification result derived from preflight obligations and deltas.
2245    pub wcoj_certification: EpistemicGpuRuntimeWcojCertification,
2246}
2247
2248impl EpistemicGpuRuntimeTrace {
2249    /// Build a trace from static preflight data and runtime counter snapshots.
2250    pub fn from_preflight_and_counters(
2251        preflight: EpistemicGpuRuntimePreflight,
2252        counters_before: EpistemicGpuRuntimeCounters,
2253        counters_after: EpistemicGpuRuntimeCounters,
2254    ) -> Self {
2255        Self::try_from_preflight_and_counters(preflight, counters_before, counters_after)
2256            .expect("runtime counter snapshots must be monotonic")
2257    }
2258
2259    /// Build a trace from static preflight data and runtime counter snapshots, failing closed
2260    /// if runtime proof counters move backwards or overflow while being summarized.
2261    pub fn try_from_preflight_and_counters(
2262        preflight: EpistemicGpuRuntimePreflight,
2263        counters_before: EpistemicGpuRuntimeCounters,
2264        counters_after: EpistemicGpuRuntimeCounters,
2265    ) -> Result<Self> {
2266        let counter_delta = counters_after.checked_delta_since(counters_before)?;
2267        let wcoj_certification = EpistemicGpuRuntimeWcojCertification::try_for_preflight_and_delta(
2268            &preflight,
2269            &counter_delta,
2270        )?;
2271
2272        Ok(Self {
2273            preflight,
2274            counters_before,
2275            counters_after,
2276            counter_delta,
2277            wcoj_certification,
2278        })
2279    }
2280
2281    /// Fail closed when a WCOJ-required epistemic reduction lacks runtime evidence.
2282    pub fn require_wcoj_certification(&self) -> Result<()> {
2283        match self.wcoj_certification {
2284            EpistemicGpuRuntimeWcojCertification::MissingRequiredWcojDispatch {
2285                required_multiway_reductions,
2286                required_kclique_plans,
2287                observed_wcoj_dispatches,
2288                observed_kclique_dispatches,
2289            } => Err(XlogError::UnsupportedEpistemicConstruct {
2290                construct: "epistemic GPU WCOJ dispatch certification".to_string(),
2291                context: format!(
2292                    "required_multiway_reductions={required_multiway_reductions}, \
2293                     required_kclique_plans={required_kclique_plans}, \
2294                     observed_wcoj_dispatches={observed_wcoj_dispatches}, \
2295                     observed_kclique_dispatches={observed_kclique_dispatches}"
2296                ),
2297            }),
2298            EpistemicGpuRuntimeWcojCertification::MissingRequiredWcojLayout {
2299                required_sorted_layouts,
2300                observed_layout_events,
2301            } => Err(XlogError::UnsupportedEpistemicConstruct {
2302                construct: "epistemic GPU WCOJ layout certification".to_string(),
2303                context: format!(
2304                    "required_sorted_layouts={required_sorted_layouts}, \
2305                     observed_layout_events={observed_layout_events}"
2306                ),
2307            }),
2308            EpistemicGpuRuntimeWcojCertification::MissingRequiredKcliqueMetadata {
2309                required_kclique_plans,
2310                observed_metadata_builds,
2311                observed_metadata_build_nanos,
2312            } => Err(XlogError::UnsupportedEpistemicConstruct {
2313                construct: "epistemic GPU K-clique metadata certification".to_string(),
2314                context: format!(
2315                    "required_kclique_plans={required_kclique_plans}, \
2316                     observed_metadata_builds={observed_metadata_builds}, \
2317                     observed_metadata_build_nanos={observed_metadata_build_nanos}"
2318                ),
2319            }),
2320            EpistemicGpuRuntimeWcojCertification::NotRequired { .. }
2321            | EpistemicGpuRuntimeWcojCertification::Certified { .. } => Ok(()),
2322        }
2323    }
2324}
2325
2326/// Runtime counters relevant to epistemic GPU certification.
2327#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
2328pub struct EpistemicGpuRuntimeCounters {
2329    /// Successful triangle WCOJ dispatches installed by the executor.
2330    pub wcoj_triangle_dispatch_count: u64,
2331    /// Successful 4-cycle WCOJ dispatches installed by the executor.
2332    pub wcoj_4cycle_dispatch_count: u64,
2333    /// Successful chain dispatches installed by the executor.
2334    pub chain_dispatch_count: u64,
2335    /// Successful K=5 clique WCOJ dispatches installed by the executor.
2336    pub wcoj_clique5_dispatch_count: u64,
2337    /// Successful K=6 clique WCOJ dispatches installed by the executor.
2338    pub wcoj_clique6_dispatch_count: u64,
2339    /// Successful K=7 clique WCOJ dispatches installed by the executor.
2340    pub wcoj_clique7_dispatch_count: u64,
2341    /// Successful K=8 clique WCOJ dispatches installed by the executor.
2342    pub wcoj_clique8_dispatch_count: u64,
2343    /// Successful generic Free Join dispatches installed by the
2344    /// executor. Observability only: free-join routes carry no hard
2345    /// dispatch obligation (structural declines execute the embedded
2346    /// binary fallback by contract), so this counter never gates
2347    /// certification.
2348    pub free_join_dispatch_count: u64,
2349    /// D3 — successful factorized recursive-delta dispatches installed
2350    /// by the executor. Observability only, same contract as the Free
2351    /// Join counter: declines execute the legacy semi-naive path, so
2352    /// this counter never gates certification.
2353    pub factorized_delta_dispatch_count: u64,
2354    /// Provider-level HG triangle dispatch counter.
2355    pub provider_wcoj_triangle_hg_dispatch_count: u64,
2356    /// WCOJ layout-sort invocations observed by the provider.
2357    pub wcoj_layout_sort_invocation_count: u64,
2358    /// WCOJ layout fast-path hits observed by the provider.
2359    pub wcoj_layout_fast_path_hit_count: u64,
2360    /// K-clique metadata builds observed by the provider.
2361    pub kclique_metadata_build_count: u64,
2362    /// Provider-observed nanoseconds spent building K-clique metadata.
2363    pub kclique_metadata_build_nanos: u64,
2364    /// Recursive Merge-phase K-clique histogram refresh boundaries observed by the executor.
2365    pub kclique_histogram_refresh_count: u64,
2366    /// Recursive Merge-phase K-clique histogram refresh accounting time observed by the executor.
2367    pub kclique_histogram_refresh_nanos: u128,
2368}
2369
2370impl EpistemicGpuRuntimeCounters {
2371    fn checked_counter_delta(counter: &str, after: u64, before: u64) -> Result<u64> {
2372        after
2373            .checked_sub(before)
2374            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
2375                construct: "epistemic GPU runtime counter trace".to_string(),
2376                context: format!(
2377                    "runtime proof counter {counter} decreased from {before} to {after}"
2378                ),
2379            })
2380    }
2381
2382    fn checked_counter_delta_u128(counter: &str, after: u128, before: u128) -> Result<u128> {
2383        after
2384            .checked_sub(before)
2385            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
2386                construct: "epistemic GPU runtime counter trace".to_string(),
2387                context: format!(
2388                    "runtime proof counter {counter} decreased from {before} to {after}"
2389                ),
2390            })
2391    }
2392
2393    fn checked_counter_sum(counter: &str, values: &[u64]) -> Result<u64> {
2394        values.iter().try_fold(0u64, |acc, value| {
2395            acc.checked_add(*value)
2396                .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
2397                    construct: "epistemic GPU runtime counter trace".to_string(),
2398                    context: format!(
2399                        "runtime proof counter {counter} overflowed while adding {value} to {acc}"
2400                    ),
2401                })
2402        })
2403    }
2404
2405    /// Checked delta from an earlier snapshot.
2406    pub fn checked_delta_since(self, before: Self) -> Result<Self> {
2407        Ok(Self {
2408            wcoj_triangle_dispatch_count: Self::checked_counter_delta(
2409                "wcoj_triangle_dispatch_count",
2410                self.wcoj_triangle_dispatch_count,
2411                before.wcoj_triangle_dispatch_count,
2412            )?,
2413            wcoj_4cycle_dispatch_count: Self::checked_counter_delta(
2414                "wcoj_4cycle_dispatch_count",
2415                self.wcoj_4cycle_dispatch_count,
2416                before.wcoj_4cycle_dispatch_count,
2417            )?,
2418            chain_dispatch_count: Self::checked_counter_delta(
2419                "chain_dispatch_count",
2420                self.chain_dispatch_count,
2421                before.chain_dispatch_count,
2422            )?,
2423            wcoj_clique5_dispatch_count: Self::checked_counter_delta(
2424                "wcoj_clique5_dispatch_count",
2425                self.wcoj_clique5_dispatch_count,
2426                before.wcoj_clique5_dispatch_count,
2427            )?,
2428            wcoj_clique6_dispatch_count: Self::checked_counter_delta(
2429                "wcoj_clique6_dispatch_count",
2430                self.wcoj_clique6_dispatch_count,
2431                before.wcoj_clique6_dispatch_count,
2432            )?,
2433            wcoj_clique7_dispatch_count: Self::checked_counter_delta(
2434                "wcoj_clique7_dispatch_count",
2435                self.wcoj_clique7_dispatch_count,
2436                before.wcoj_clique7_dispatch_count,
2437            )?,
2438            wcoj_clique8_dispatch_count: Self::checked_counter_delta(
2439                "wcoj_clique8_dispatch_count",
2440                self.wcoj_clique8_dispatch_count,
2441                before.wcoj_clique8_dispatch_count,
2442            )?,
2443            factorized_delta_dispatch_count: Self::checked_counter_delta(
2444                "factorized_delta_dispatch_count",
2445                self.factorized_delta_dispatch_count,
2446                before.factorized_delta_dispatch_count,
2447            )?,
2448            free_join_dispatch_count: Self::checked_counter_delta(
2449                "free_join_dispatch_count",
2450                self.free_join_dispatch_count,
2451                before.free_join_dispatch_count,
2452            )?,
2453            provider_wcoj_triangle_hg_dispatch_count: Self::checked_counter_delta(
2454                "provider_wcoj_triangle_hg_dispatch_count",
2455                self.provider_wcoj_triangle_hg_dispatch_count,
2456                before.provider_wcoj_triangle_hg_dispatch_count,
2457            )?,
2458            wcoj_layout_sort_invocation_count: Self::checked_counter_delta(
2459                "wcoj_layout_sort_invocation_count",
2460                self.wcoj_layout_sort_invocation_count,
2461                before.wcoj_layout_sort_invocation_count,
2462            )?,
2463            wcoj_layout_fast_path_hit_count: Self::checked_counter_delta(
2464                "wcoj_layout_fast_path_hit_count",
2465                self.wcoj_layout_fast_path_hit_count,
2466                before.wcoj_layout_fast_path_hit_count,
2467            )?,
2468            kclique_metadata_build_count: Self::checked_counter_delta(
2469                "kclique_metadata_build_count",
2470                self.kclique_metadata_build_count,
2471                before.kclique_metadata_build_count,
2472            )?,
2473            kclique_metadata_build_nanos: Self::checked_counter_delta(
2474                "kclique_metadata_build_nanos",
2475                self.kclique_metadata_build_nanos,
2476                before.kclique_metadata_build_nanos,
2477            )?,
2478            kclique_histogram_refresh_count: Self::checked_counter_delta(
2479                "kclique_histogram_refresh_count",
2480                self.kclique_histogram_refresh_count,
2481                before.kclique_histogram_refresh_count,
2482            )?,
2483            kclique_histogram_refresh_nanos: Self::checked_counter_delta_u128(
2484                "kclique_histogram_refresh_nanos",
2485                self.kclique_histogram_refresh_nanos,
2486                before.kclique_histogram_refresh_nanos,
2487            )?,
2488        })
2489    }
2490
2491    /// Saturating delta from an earlier snapshot.
2492    pub fn saturating_delta_since(self, before: Self) -> Self {
2493        Self {
2494            wcoj_triangle_dispatch_count: self
2495                .wcoj_triangle_dispatch_count
2496                .saturating_sub(before.wcoj_triangle_dispatch_count),
2497            wcoj_4cycle_dispatch_count: self
2498                .wcoj_4cycle_dispatch_count
2499                .saturating_sub(before.wcoj_4cycle_dispatch_count),
2500            chain_dispatch_count: self
2501                .chain_dispatch_count
2502                .saturating_sub(before.chain_dispatch_count),
2503            wcoj_clique5_dispatch_count: self
2504                .wcoj_clique5_dispatch_count
2505                .saturating_sub(before.wcoj_clique5_dispatch_count),
2506            wcoj_clique6_dispatch_count: self
2507                .wcoj_clique6_dispatch_count
2508                .saturating_sub(before.wcoj_clique6_dispatch_count),
2509            wcoj_clique7_dispatch_count: self
2510                .wcoj_clique7_dispatch_count
2511                .saturating_sub(before.wcoj_clique7_dispatch_count),
2512            wcoj_clique8_dispatch_count: self
2513                .wcoj_clique8_dispatch_count
2514                .saturating_sub(before.wcoj_clique8_dispatch_count),
2515            free_join_dispatch_count: self
2516                .free_join_dispatch_count
2517                .saturating_sub(before.free_join_dispatch_count),
2518            factorized_delta_dispatch_count: self
2519                .factorized_delta_dispatch_count
2520                .saturating_sub(before.factorized_delta_dispatch_count),
2521            provider_wcoj_triangle_hg_dispatch_count: self
2522                .provider_wcoj_triangle_hg_dispatch_count
2523                .saturating_sub(before.provider_wcoj_triangle_hg_dispatch_count),
2524            wcoj_layout_sort_invocation_count: self
2525                .wcoj_layout_sort_invocation_count
2526                .saturating_sub(before.wcoj_layout_sort_invocation_count),
2527            wcoj_layout_fast_path_hit_count: self
2528                .wcoj_layout_fast_path_hit_count
2529                .saturating_sub(before.wcoj_layout_fast_path_hit_count),
2530            kclique_metadata_build_count: self
2531                .kclique_metadata_build_count
2532                .saturating_sub(before.kclique_metadata_build_count),
2533            kclique_metadata_build_nanos: self
2534                .kclique_metadata_build_nanos
2535                .saturating_sub(before.kclique_metadata_build_nanos),
2536            kclique_histogram_refresh_count: self
2537                .kclique_histogram_refresh_count
2538                .saturating_sub(before.kclique_histogram_refresh_count),
2539            kclique_histogram_refresh_nanos: self
2540                .kclique_histogram_refresh_nanos
2541                .saturating_sub(before.kclique_histogram_refresh_nanos),
2542        }
2543    }
2544
2545    /// Total WCOJ dispatches installed by the executor.
2546    pub fn wcoj_dispatch_count(&self) -> u64 {
2547        self.wcoj_triangle_dispatch_count
2548            .saturating_add(self.wcoj_4cycle_dispatch_count)
2549            .saturating_add(self.wcoj_clique_dispatch_count())
2550    }
2551
2552    /// Checked total WCOJ dispatches installed by the executor.
2553    pub fn checked_wcoj_dispatch_count(&self) -> Result<u64> {
2554        Self::checked_counter_sum(
2555            "wcoj_dispatch_count",
2556            &[
2557                self.wcoj_triangle_dispatch_count,
2558                self.wcoj_4cycle_dispatch_count,
2559                self.checked_wcoj_clique_dispatch_count()?,
2560            ],
2561        )
2562    }
2563
2564    /// Total K-clique WCOJ dispatches installed by the executor.
2565    pub fn wcoj_clique_dispatch_count(&self) -> u64 {
2566        self.wcoj_clique5_dispatch_count
2567            .saturating_add(self.wcoj_clique6_dispatch_count)
2568            .saturating_add(self.wcoj_clique7_dispatch_count)
2569            .saturating_add(self.wcoj_clique8_dispatch_count)
2570    }
2571
2572    /// Checked total K-clique WCOJ dispatches installed by the executor.
2573    pub fn checked_wcoj_clique_dispatch_count(&self) -> Result<u64> {
2574        Self::checked_counter_sum(
2575            "wcoj_clique_dispatch_count",
2576            &[
2577                self.wcoj_clique5_dispatch_count,
2578                self.wcoj_clique6_dispatch_count,
2579                self.wcoj_clique7_dispatch_count,
2580                self.wcoj_clique8_dispatch_count,
2581            ],
2582        )
2583    }
2584}
2585
2586/// WCOJ certification status for an epistemic runtime dispatch attempt.
2587#[derive(Debug, Clone, Copy, PartialEq, Eq)]
2588pub enum EpistemicGpuRuntimeWcojCertification {
2589    /// The preflight did not require a WCOJ dispatch.
2590    NotRequired {
2591        /// Observed executor-installed WCOJ dispatches.
2592        observed_wcoj_dispatches: u64,
2593        /// Structured planned-hash routes that replaced WCOJ dispatch obligations.
2594        planned_hash_routes: usize,
2595        /// Planned-hash routes where complete planner costs predicted hash wins.
2596        planned_hash_planner_wins: usize,
2597        /// Planned-hash routes selected because complete WCOJ stats were unavailable.
2598        planned_hash_incomplete_stats: usize,
2599        /// Planned-hash routes carrying finite hash-vs-WCOJ cost evidence.
2600        planned_hash_cost_evidence: usize,
2601    },
2602    /// Runtime counters prove the required WCOJ dispatch happened.
2603    Certified {
2604        /// Observed executor-installed WCOJ dispatches.
2605        observed_wcoj_dispatches: u64,
2606        /// MultiWayJoin reductions certified by the observed WCOJ dispatches.
2607        certified_multiway_reductions: usize,
2608        /// Observed executor-installed K-clique dispatches.
2609        observed_kclique_dispatches: u64,
2610        /// Edge-permutation slots certified by the dispatched K-clique plans.
2611        certified_edge_permutation_slots: usize,
2612        /// Distinct stream groups certified by the dispatched K-clique plans.
2613        certified_stream_groups: usize,
2614        /// Helper-split skew-scheduled K-clique plans certified by dispatch.
2615        certified_skew_scheduled_plans: usize,
2616        /// Sorted-layout requirements certified by the dispatched K-clique plans.
2617        certified_sorted_layout_requirements: usize,
2618        /// Helper-split specs certified by the dispatched K-clique plans.
2619        certified_helper_split_specs: usize,
2620        /// Helper relation rules proving production helper-split rewrite happened.
2621        certified_helper_relation_rules: usize,
2622        /// Helper relation scans proving WCOJ consumed production helper output.
2623        certified_helper_relation_scans: usize,
2624        /// Observed provider WCOJ layout-sort invocations.
2625        observed_layout_sorts: u64,
2626        /// Observed provider WCOJ layout fast-path hits.
2627        observed_layout_fast_path_hits: u64,
2628        /// Observed provider K-clique metadata builds.
2629        observed_metadata_builds: u64,
2630        /// Observed provider time spent building K-clique metadata.
2631        observed_metadata_build_nanos: u64,
2632        /// Observed recursive K-clique histogram refresh boundaries.
2633        observed_histogram_refreshes: u64,
2634        /// Observed recursive K-clique histogram refresh accounting time.
2635        observed_histogram_refresh_nanos: u128,
2636    },
2637    /// The plan required sorted layouts, but no layout path executed.
2638    MissingRequiredWcojLayout {
2639        /// Sorted-layout requirements found during preflight.
2640        required_sorted_layouts: usize,
2641        /// Observed layout sort or fast-path events.
2642        observed_layout_events: u64,
2643    },
2644    /// The plan dispatched a K-clique WCOJ route, but metadata-build counters did not advance.
2645    MissingRequiredKcliqueMetadata {
2646        /// K-clique WCOJ plans found during preflight.
2647        required_kclique_plans: usize,
2648        /// Observed provider K-clique metadata builds.
2649        observed_metadata_builds: u64,
2650        /// Observed provider time spent building K-clique metadata.
2651        observed_metadata_build_nanos: u64,
2652    },
2653    /// The plan had WCOJ obligations, but counters did not advance.
2654    MissingRequiredWcojDispatch {
2655        /// MultiWayJoin reductions found during preflight after excluding planned hash routes.
2656        required_multiway_reductions: usize,
2657        /// K-clique WCOJ plans found during preflight.
2658        required_kclique_plans: usize,
2659        /// Observed executor-installed WCOJ dispatches.
2660        observed_wcoj_dispatches: u64,
2661        /// Observed executor-installed K-clique dispatches.
2662        observed_kclique_dispatches: u64,
2663    },
2664}
2665
2666/// CUDA provider identity that produced an epistemic GPU execution result.
2667#[derive(Debug, Clone, Copy, PartialEq, Eq)]
2668pub struct EpistemicGpuProviderIdentity {
2669    /// CUDA device ordinal used by the executor.
2670    pub device_ordinal: usize,
2671    /// Stable address of the executor's CUDA device wrapper.
2672    pub device_ptr: usize,
2673    /// Stable address of the executor's GPU memory manager.
2674    pub memory_ptr: usize,
2675}
2676
2677impl EpistemicGpuProviderIdentity {
2678    /// Capture the device and memory-manager identity for a CUDA provider.
2679    pub fn from_provider(provider: &xlog_cuda::CudaKernelProvider) -> Self {
2680        Self {
2681            device_ordinal: provider.device().ordinal(),
2682            device_ptr: Arc::as_ptr(provider.device()) as usize,
2683            memory_ptr: Arc::as_ptr(provider.memory()) as usize,
2684        }
2685    }
2686}
2687
2688/// Output from executing the reduced production runtime plan for an epistemic program.
2689pub struct EpistemicGpuExecutionResult {
2690    /// CUDA provider identity that owns this result's device-resident buffers.
2691    pub provider_identity: EpistemicGpuProviderIdentity,
2692    /// Prepared workspace and preflight state.
2693    pub prepared: EpistemicGpuPreparedExecution,
2694    /// Candidate-generation trace captured before reduced-plan dispatch.
2695    pub candidate_generation: EpistemicGpuCandidateGenerationTrace,
2696    /// Candidate-propagation trace captured before reduced-plan dispatch.
2697    pub propagation: EpistemicGpuPropagationTrace,
2698    /// Candidate-validation trace captured before reduced-plan dispatch.
2699    pub candidate_validation: EpistemicGpuCandidateValidationTrace,
2700    /// Model-membership staging trace captured after reduced-plan dispatch.
2701    pub model_membership: EpistemicGpuModelMembershipTrace,
2702    /// World-view validation trace captured after model-membership staging.
2703    pub world_view_validation: EpistemicGpuWorldViewValidationTrace,
2704    /// World-view integrity-constraint validation trace captured after world-view validation.
2705    pub constraint_world_view_validation: EpistemicGpuConstraintWorldViewValidationTrace,
2706    /// Accepted-candidate materialization trace captured after world-view validation.
2707    pub materialization: EpistemicGpuMaterializationTrace,
2708    /// Final result materialization trace captured from reduced output metadata.
2709    pub final_result_materialization: EpistemicGpuFinalResultMaterializationTrace,
2710    /// Final query tuple materialization trace captured after final-result gating.
2711    pub final_tuple_materialization: EpistemicGpuFinalTupleMaterializationTrace,
2712    /// Hot-path host-transfer budget trace for epistemic GPU execution.
2713    pub transfer_budget: EpistemicGpuTransferBudgetTrace,
2714    /// Final-result transfer accounting after the GPU hot path.
2715    pub final_result_transfer: EpistemicGpuFinalResultTransferTrace,
2716    /// Reduced integrity-constraint validation after production runtime dispatch.
2717    pub constraint_validation: EpistemicGpuConstraintValidationTrace,
2718    /// Device-derived semantic summary after world-view validation.
2719    pub semantic_trace: EpistemicGpuSemanticTrace,
2720    /// Tuple-membership bindings that were validated and executed for this result.
2721    pub tuple_membership_bindings: Vec<EpistemicTupleMembershipBinding>,
2722    /// Device-resident final query output buffer.
2723    ///
2724    /// For a single epistemic output head this is the only materialized relation.
2725    /// For a JOINT-SOLVED coalesced multi-head component this is the PRIMARY head's
2726    /// output (the last reduction's head); the remaining coupled heads, each
2727    /// materialized against the SAME accepted world view, are in
2728    /// [`Self::additional_head_outputs`].
2729    pub final_output: CudaBuffer,
2730    /// Additional coupled-head outputs for a JOINT-SOLVED multi-head component.
2731    ///
2732    /// Empty for single-head execution. Each entry is `(head_predicate, buffer)`
2733    /// for a distinct epistemic output head OTHER than the primary head, filtered
2734    /// against the shared accepted world view via that head's row-filter bindings.
2735    pub additional_head_outputs: Vec<(String, CudaBuffer)>,
2736    /// Device-resident final tuple evidence buffer before public projection.
2737    pub tuple_evidence_output: Option<CudaBuffer>,
2738    /// Output buffer returned by the reduced production execution plan.
2739    pub output: CudaBuffer,
2740    /// Runtime counter trace for the reduced production plan dispatch.
2741    pub trace: EpistemicGpuRuntimeTrace,
2742}
2743
2744impl EpistemicGpuExecutionResult {
2745    /// Device-resident output used to derive concrete tuple-membership evidence.
2746    pub fn tuple_evidence_output(&self) -> &CudaBuffer {
2747        self.tuple_evidence_output
2748            .as_ref()
2749            .unwrap_or(&self.final_output)
2750    }
2751
2752    /// Require that the retained runtime trace certifies the prepared execution.
2753    pub fn require_runtime_dispatch_certification(&self) -> Result<()> {
2754        if self.trace.preflight != self.prepared.preflight {
2755            return Err(XlogError::UnsupportedEpistemicConstruct {
2756                construct: "epistemic GPU runtime dispatch certification".to_string(),
2757                context: "runtime trace preflight does not match prepared execution preflight"
2758                    .to_string(),
2759            });
2760        }
2761        if self.prepared.workspace.layout != self.prepared.preflight.workspace_layout {
2762            return Err(XlogError::UnsupportedEpistemicConstruct {
2763                construct: "epistemic GPU runtime dispatch certification".to_string(),
2764                context: "prepared GPU workspace layout does not match preflight workspace layout"
2765                    .to_string(),
2766            });
2767        }
2768        self.prepared
2769            .workspace
2770            .require_buffer_lengths_match_layout("epistemic GPU runtime dispatch certification")?;
2771        if self.tuple_membership_bindings.len()
2772            != self.prepared.preflight.tuple_membership_binding_count
2773        {
2774            return Err(XlogError::UnsupportedEpistemicConstruct {
2775                construct: "epistemic GPU runtime dispatch certification".to_string(),
2776                context: format!(
2777                    "runtime tuple-membership bindings do not match prepared preflight, got {} \
2778                     bindings for preflight count {}",
2779                    self.tuple_membership_bindings.len(),
2780                    self.prepared.preflight.tuple_membership_binding_count
2781                ),
2782            });
2783        }
2784        if self.tuple_membership_bindings != self.prepared.tuple_membership_bindings {
2785            return Err(XlogError::UnsupportedEpistemicConstruct {
2786                construct: "epistemic GPU runtime dispatch certification".to_string(),
2787                context: "runtime tuple-membership bindings do not match prepared GPU execution"
2788                    .to_string(),
2789            });
2790        }
2791        self.model_membership
2792            .require_planned_tuple_key_column_reads(expected_tuple_key_column_reads(
2793                &self.prepared.tuple_membership_bindings,
2794            )?)?;
2795        self.prepared.workspace_reset.require_matches_layout(
2796            "epistemic GPU runtime dispatch certification",
2797            self.prepared.preflight.workspace_layout,
2798        )?;
2799        self.final_result_transfer.require_matches_final_output(
2800            "epistemic GPU runtime dispatch certification",
2801            &self.final_output,
2802        )?;
2803        self.constraint_validation.require_matches_preflight(
2804            "epistemic GPU runtime dispatch certification",
2805            &self.prepared.preflight,
2806        )?;
2807        self.candidate_validation
2808            .require_matches_candidate_generation(
2809                "epistemic GPU runtime dispatch certification",
2810                &self.candidate_generation,
2811            )?;
2812        self.semantic_trace.require_matches_execution_traces(
2813            "epistemic GPU runtime dispatch certification",
2814            &self.candidate_generation,
2815            &self.propagation,
2816            &self.model_membership,
2817            &self.world_view_validation,
2818        )?;
2819        self.semantic_trace.require_rejection_metadata_accounting(
2820            "epistemic GPU runtime dispatch certification",
2821        )?;
2822        self.semantic_trace
2823            .require_candidate_index_partition("epistemic GPU runtime dispatch certification")?;
2824        let aggregate_kernel_timing = self.try_aggregate_kernel_timing()?;
2825        if !aggregate_kernel_timing.is_recorded() {
2826            return Err(XlogError::UnsupportedEpistemicConstruct {
2827                construct: "epistemic GPU runtime dispatch certification".to_string(),
2828                context: "accepted GPU execution did not record CUDA-event timing".to_string(),
2829            });
2830        }
2831        self.trace.require_wcoj_certification()
2832    }
2833
2834    /// Aggregate CUDA-event timing from all epistemic GPU hot-path kernels.
2835    pub fn aggregate_kernel_timing(&self) -> EpistemicGpuKernelTimingTrace {
2836        self.try_aggregate_kernel_timing()
2837            .expect("epistemic GPU kernel timing aggregation overflowed")
2838    }
2839
2840    /// Checked CUDA-event timing aggregation for certification paths.
2841    pub fn try_aggregate_kernel_timing(&self) -> Result<EpistemicGpuKernelTimingTrace> {
2842        let traces = [
2843            self.candidate_generation.kernel_timing,
2844            self.propagation.kernel_timing,
2845            self.candidate_validation.kernel_timing,
2846            self.model_membership.kernel_timing,
2847            self.world_view_validation.kernel_timing,
2848            self.materialization.kernel_timing,
2849            self.final_result_materialization.kernel_timing,
2850            self.final_tuple_materialization.kernel_timing,
2851        ];
2852
2853        if traces
2854            .iter()
2855            .all(EpistemicGpuKernelTimingTrace::is_recorded)
2856        {
2857            EpistemicGpuKernelTimingTrace::checked_sum(traces)
2858        } else {
2859            Ok(EpistemicGpuKernelTimingTrace::unrecorded())
2860        }
2861    }
2862}
2863
2864/// Batch-level trace proving split components reused the single-plan GPU path.
2865#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
2866pub struct EpistemicGpuBatchExecutionTrace {
2867    /// Number of executable components requested by the batch.
2868    pub component_count: usize,
2869    /// Number of components executed through `execute_epistemic_gpu_execution`.
2870    pub gpu_runtime_component_executions: usize,
2871    /// CPU recomposition steps performed by this batch adapter.
2872    pub cpu_recomposition_steps: u64,
2873    /// CPU candidate enumerations observed across component semantic traces.
2874    pub cpu_candidate_enumerations: u64,
2875    /// CPU world-view validations observed across component semantic traces.
2876    pub cpu_world_view_validations: u64,
2877    /// CPU solver-search fallbacks observed across component preflight traces.
2878    pub cpu_solver_search_fallbacks: u64,
2879    /// CPU probability recomputations observed across component preflight traces.
2880    pub cpu_probability_recomputations: u64,
2881    /// Hot-path device-to-host calls tracked across all components.
2882    pub tracked_dtoh_calls: u64,
2883    /// Hot-path data-plane host-to-device calls tracked across all components.
2884    pub tracked_htod_calls: u64,
2885    /// Hot-path aggregate host-to-device calls tracked across all components.
2886    pub tracked_aggregate_htod_calls: u64,
2887    /// Hot-path launch-metadata host-to-device calls tracked across all components.
2888    pub tracked_launch_metadata_htod_calls: u64,
2889    /// Hot-path data-plane host-to-device calls tracked across all components.
2890    pub tracked_data_plane_htod_calls: u64,
2891    /// Per-candidate host round trips tracked across all components.
2892    pub per_candidate_host_round_trips: u64,
2893    /// Final output rows represented across all component device buffers.
2894    pub final_output_rows: usize,
2895    /// Final output payload bytes represented across all component device buffers.
2896    pub final_output_payload_bytes: u64,
2897    /// Device row-count metadata reads used for component final-result accounting.
2898    pub final_result_row_count_device_reads: u32,
2899    /// Post-hot-path final-result data-plane device-to-host calls across all components.
2900    pub final_result_data_plane_dtoh_calls: u64,
2901    /// Post-hot-path final-result data-plane device-to-host bytes across all components.
2902    pub final_result_data_plane_dtoh_bytes: u64,
2903    /// Reduced integrity-constraint relations checked across all components.
2904    pub checked_constraint_relations: usize,
2905    /// Reduced integrity-constraint relations with violating rows across all components.
2906    pub violated_constraint_relations: usize,
2907    /// Constraint row-count metadata reads used across all components.
2908    pub constraint_row_count_device_reads: u32,
2909    /// Accepted world views observed across component semantic traces.
2910    pub accepted_world_views: usize,
2911    /// Rejected candidates observed across component semantic traces.
2912    pub rejected_candidates: usize,
2913    /// Non-negated `know` operators observed across component preflight traces.
2914    pub know_operator_count: usize,
2915    /// Non-negated `possible` operators observed across component preflight traces.
2916    pub possible_operator_count: usize,
2917    /// Negated `know` operators observed as `not know` across component preflight traces.
2918    pub not_know_operator_count: usize,
2919    /// Negated `possible` operators observed as `not possible` across component preflight traces.
2920    pub not_possible_operator_count: usize,
2921    /// Aggregate CUDA-event timing from all component hot-path kernels.
2922    pub aggregate_kernel_timing: EpistemicGpuKernelTimingTrace,
2923}
2924
2925impl EpistemicGpuBatchExecutionTrace {
2926    /// Build an aggregate trace from completed component results.
2927    pub fn from_component_results(results: &[EpistemicGpuExecutionResult]) -> Self {
2928        Self::try_from_component_results(results)
2929            .expect("epistemic GPU batch trace aggregation overflowed")
2930    }
2931
2932    /// Build an aggregate trace from completed component results and fail closed
2933    /// if any certification counter overflows.
2934    pub fn try_from_component_results(results: &[EpistemicGpuExecutionResult]) -> Result<Self> {
2935        let component_kernel_timings = results
2936            .iter()
2937            .map(EpistemicGpuExecutionResult::try_aggregate_kernel_timing)
2938            .collect::<Result<Vec<_>>>()?;
2939        let aggregate_kernel_timing = if component_kernel_timings
2940            .iter()
2941            .all(EpistemicGpuKernelTimingTrace::is_recorded)
2942        {
2943            EpistemicGpuKernelTimingTrace::checked_sum(component_kernel_timings)
2944        } else {
2945            Ok(EpistemicGpuKernelTimingTrace::unrecorded())
2946        };
2947        let aggregate_kernel_timing = aggregate_kernel_timing?;
2948
2949        Ok(Self {
2950            component_count: results.len(),
2951            gpu_runtime_component_executions: results.len(),
2952            cpu_recomposition_steps: 0,
2953            cpu_candidate_enumerations: checked_batch_sum_u64(
2954                "cpu_candidate_enumerations",
2955                results
2956                    .iter()
2957                    .map(|result| u64::from(result.semantic_trace.cpu_candidate_enumerations)),
2958            )?,
2959            cpu_world_view_validations: checked_batch_sum_u64(
2960                "cpu_world_view_validations",
2961                results
2962                    .iter()
2963                    .map(|result| u64::from(result.semantic_trace.cpu_world_view_validations)),
2964            )?,
2965            cpu_solver_search_fallbacks: checked_batch_sum_u64(
2966                "cpu_solver_search_fallbacks",
2967                results
2968                    .iter()
2969                    .map(|result| result.prepared.preflight.cpu_fallbacks.solver_search),
2970            )?,
2971            cpu_probability_recomputations: checked_batch_sum_u64(
2972                "cpu_probability_recomputations",
2973                results.iter().map(|result| {
2974                    result
2975                        .prepared
2976                        .preflight
2977                        .cpu_fallbacks
2978                        .probabilistic_recompute
2979                }),
2980            )?,
2981            tracked_dtoh_calls: checked_batch_sum_u64(
2982                "tracked_dtoh_calls",
2983                results
2984                    .iter()
2985                    .map(|result| result.transfer_budget.tracked_dtoh_calls),
2986            )?,
2987            tracked_htod_calls: checked_batch_sum_u64(
2988                "tracked_htod_calls",
2989                results
2990                    .iter()
2991                    .map(|result| result.transfer_budget.tracked_htod_calls),
2992            )?,
2993            tracked_aggregate_htod_calls: checked_batch_sum_u64(
2994                "tracked_aggregate_htod_calls",
2995                results
2996                    .iter()
2997                    .map(|result| result.transfer_budget.tracked_aggregate_htod_calls),
2998            )?,
2999            tracked_launch_metadata_htod_calls: checked_batch_sum_u64(
3000                "tracked_launch_metadata_htod_calls",
3001                results
3002                    .iter()
3003                    .map(|result| result.transfer_budget.tracked_launch_metadata_htod_calls),
3004            )?,
3005            tracked_data_plane_htod_calls: checked_batch_sum_u64(
3006                "tracked_data_plane_htod_calls",
3007                results
3008                    .iter()
3009                    .map(|result| result.transfer_budget.tracked_data_plane_htod_calls),
3010            )?,
3011            per_candidate_host_round_trips: checked_batch_sum_u64(
3012                "per_candidate_host_round_trips",
3013                results
3014                    .iter()
3015                    .map(|result| result.transfer_budget.per_candidate_host_round_trips),
3016            )?,
3017            final_output_rows: checked_batch_sum_usize(
3018                "final_output_rows",
3019                results
3020                    .iter()
3021                    .map(|result| result.final_result_transfer.final_output_rows),
3022            )?,
3023            final_output_payload_bytes: checked_batch_sum_u64(
3024                "final_output_payload_bytes",
3025                results
3026                    .iter()
3027                    .map(|result| result.final_result_transfer.final_output_payload_bytes),
3028            )?,
3029            final_result_row_count_device_reads: checked_batch_sum_u32(
3030                "final_result_row_count_device_reads",
3031                results
3032                    .iter()
3033                    .map(|result| result.final_result_transfer.row_count_device_reads),
3034            )?,
3035            final_result_data_plane_dtoh_calls: checked_batch_sum_u64(
3036                "final_result_data_plane_dtoh_calls",
3037                results
3038                    .iter()
3039                    .map(|result| result.final_result_transfer.tracked_data_plane_dtoh_calls),
3040            )?,
3041            final_result_data_plane_dtoh_bytes: checked_batch_sum_u64(
3042                "final_result_data_plane_dtoh_bytes",
3043                results
3044                    .iter()
3045                    .map(|result| result.final_result_transfer.tracked_data_plane_dtoh_bytes),
3046            )?,
3047            checked_constraint_relations: checked_batch_sum_usize(
3048                "checked_constraint_relations",
3049                results
3050                    .iter()
3051                    .map(|result| result.constraint_validation.checked_constraint_relations),
3052            )?,
3053            violated_constraint_relations: checked_batch_sum_usize(
3054                "violated_constraint_relations",
3055                results
3056                    .iter()
3057                    .map(|result| result.constraint_validation.violated_constraint_relations),
3058            )?,
3059            constraint_row_count_device_reads: checked_batch_sum_u32(
3060                "constraint_row_count_device_reads",
3061                results
3062                    .iter()
3063                    .map(|result| result.constraint_validation.row_count_device_reads),
3064            )?,
3065            accepted_world_views: checked_batch_sum_usize(
3066                "accepted_world_views",
3067                results
3068                    .iter()
3069                    .map(|result| result.semantic_trace.accepted_world_views),
3070            )?,
3071            rejected_candidates: checked_batch_sum_usize(
3072                "rejected_candidates",
3073                results
3074                    .iter()
3075                    .map(|result| result.semantic_trace.rejected_candidates),
3076            )?,
3077            know_operator_count: checked_batch_sum_usize(
3078                "know_operator_count",
3079                results
3080                    .iter()
3081                    .map(|result| result.prepared.preflight.know_operator_count),
3082            )?,
3083            possible_operator_count: checked_batch_sum_usize(
3084                "possible_operator_count",
3085                results
3086                    .iter()
3087                    .map(|result| result.prepared.preflight.possible_operator_count),
3088            )?,
3089            not_know_operator_count: checked_batch_sum_usize(
3090                "not_know_operator_count",
3091                results
3092                    .iter()
3093                    .map(|result| result.prepared.preflight.not_know_operator_count),
3094            )?,
3095            not_possible_operator_count: checked_batch_sum_usize(
3096                "not_possible_operator_count",
3097                results
3098                    .iter()
3099                    .map(|result| result.prepared.preflight.not_possible_operator_count),
3100            )?,
3101            aggregate_kernel_timing,
3102        })
3103    }
3104}
3105
3106fn checked_batch_sum_u64(
3107    counter: &'static str,
3108    values: impl IntoIterator<Item = u64>,
3109) -> Result<u64> {
3110    values.into_iter().try_fold(0u64, |acc, value| {
3111        acc.checked_add(value)
3112            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
3113                construct: "epistemic GPU batch execution trace".to_string(),
3114                context: format!(
3115                    "batch counter {counter} overflowed while aggregating component traces: \
3116                     acc={acc} next={value}"
3117                ),
3118            })
3119    })
3120}
3121
3122fn checked_batch_sum_u32(
3123    counter: &'static str,
3124    values: impl IntoIterator<Item = u32>,
3125) -> Result<u32> {
3126    values.into_iter().try_fold(0u32, |acc, value| {
3127        acc.checked_add(value)
3128            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
3129                construct: "epistemic GPU batch execution trace".to_string(),
3130                context: format!(
3131                    "batch counter {counter} overflowed while aggregating component traces: \
3132                     acc={acc} next={value}"
3133                ),
3134            })
3135    })
3136}
3137
3138fn checked_batch_sum_usize(
3139    counter: &'static str,
3140    values: impl IntoIterator<Item = usize>,
3141) -> Result<usize> {
3142    values.into_iter().try_fold(0usize, |acc, value| {
3143        acc.checked_add(value)
3144            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
3145                construct: "epistemic GPU batch execution trace".to_string(),
3146                context: format!(
3147                    "batch counter {counter} overflowed while aggregating component traces: \
3148                     acc={acc} next={value}"
3149                ),
3150            })
3151    })
3152}
3153
3154/// Results plus aggregate trace from a split/batch epistemic GPU execution.
3155pub struct EpistemicGpuBatchExecutionResult {
3156    /// Per-component execution results from the existing single-plan GPU path.
3157    pub results: Vec<EpistemicGpuExecutionResult>,
3158    /// Aggregate batch certification trace.
3159    pub trace: EpistemicGpuBatchExecutionTrace,
3160}
3161
3162impl EpistemicGpuBatchExecutionResult {
3163    /// Require the retained aggregate trace to be derived from the component results.
3164    pub fn require_trace_matches_components(&self, construct: &str) -> Result<()> {
3165        if self.results.is_empty() {
3166            return Err(XlogError::UnsupportedEpistemicConstruct {
3167                construct: construct.to_string(),
3168                context: "batch evidence requires at least one GPU component".to_string(),
3169            });
3170        }
3171        let expected = EpistemicGpuBatchExecutionTrace::try_from_component_results(&self.results)?;
3172        if self.trace != expected {
3173            return Err(XlogError::UnsupportedEpistemicConstruct {
3174                construct: construct.to_string(),
3175                context: format!(
3176                    "batch aggregate trace does not match component GPU execution results: \
3177                     trace_components={}/{} expected_components={}/{} \
3178                     trace_final_rows={} expected_final_rows={} trace_dtoh_calls={} \
3179                     expected_dtoh_calls={} trace_data_plane_htod_calls={} \
3180                     expected_data_plane_htod_calls={} trace_constraint_violations={} \
3181                     expected_constraint_violations={} trace_accepted_world_views={} \
3182                     expected_accepted_world_views={}",
3183                    self.trace.gpu_runtime_component_executions,
3184                    self.trace.component_count,
3185                    expected.gpu_runtime_component_executions,
3186                    expected.component_count,
3187                    self.trace.final_output_rows,
3188                    expected.final_output_rows,
3189                    self.trace.tracked_dtoh_calls,
3190                    expected.tracked_dtoh_calls,
3191                    self.trace.tracked_data_plane_htod_calls,
3192                    expected.tracked_data_plane_htod_calls,
3193                    self.trace.violated_constraint_relations,
3194                    expected.violated_constraint_relations,
3195                    self.trace.accepted_world_views,
3196                    expected.accepted_world_views
3197                ),
3198            });
3199        }
3200        if !self.trace.aggregate_kernel_timing.is_recorded() {
3201            return Err(XlogError::UnsupportedEpistemicConstruct {
3202                construct: construct.to_string(),
3203                context: "batch GPU execution did not record aggregate CUDA-event timing"
3204                    .to_string(),
3205            });
3206        }
3207        Ok(())
3208    }
3209}
3210
3211impl EpistemicGpuRuntimeWcojCertification {
3212    /// Compare static preflight obligations with runtime counter deltas.
3213    pub fn for_preflight_and_delta(
3214        preflight: &EpistemicGpuRuntimePreflight,
3215        delta: &EpistemicGpuRuntimeCounters,
3216    ) -> Self {
3217        Self::try_for_preflight_and_delta(preflight, delta)
3218            .expect("runtime WCOJ certification counters must not overflow")
3219    }
3220
3221    /// Compare static preflight obligations with runtime counter deltas, failing closed
3222    /// if certification counters overflow while being summarized.
3223    pub fn try_for_preflight_and_delta(
3224        preflight: &EpistemicGpuRuntimePreflight,
3225        delta: &EpistemicGpuRuntimeCounters,
3226    ) -> Result<Self> {
3227        let observed_wcoj_dispatches = delta.checked_wcoj_dispatch_count()?;
3228        let observed_kclique_dispatches = delta.checked_wcoj_clique_dispatch_count()?;
3229        let wcoj_routed_reduction_count = preflight
3230            .multiway_reduction_count
3231            .checked_sub(preflight.planned_hash_route_count)
3232            // Generic Free Join routes are opportunistic by
3233            // contract (structural decline executes the embedded
3234            // binary fallback), so they never form part of the hard
3235            // dedicated-WCOJ dispatch obligation.
3236            .and_then(|count| count.checked_sub(preflight.free_join_route_count))
3237            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
3238                construct: "epistemic GPU WCOJ route certification".to_string(),
3239                context: format!(
3240                    "planned hash + free-join routes exceed observed route obligations: \
3241                     multiway_reductions={} planned_hash_routes={} free_join_routes={}",
3242                    preflight.multiway_reduction_count,
3243                    preflight.planned_hash_route_count,
3244                    preflight.free_join_route_count
3245                ),
3246            })?;
3247        let required_multiway_reductions = wcoj_routed_reduction_count;
3248
3249        if required_multiway_reductions == 0 {
3250            return Ok(Self::NotRequired {
3251                observed_wcoj_dispatches,
3252                planned_hash_routes: preflight.planned_hash_route_count,
3253                planned_hash_planner_wins: preflight.planned_hash_planner_wins_count,
3254                planned_hash_incomplete_stats: preflight.planned_hash_incomplete_stats_count,
3255                planned_hash_cost_evidence: preflight.planned_hash_cost_evidence_count,
3256            });
3257        }
3258
3259        if observed_wcoj_dispatches < required_multiway_reductions as u64
3260            || observed_kclique_dispatches < preflight.kclique_wcoj_plan_count as u64
3261            || delta.wcoj_triangle_dispatch_count < preflight.wcoj_triangle_route_count as u64
3262            || delta.wcoj_4cycle_dispatch_count < preflight.wcoj_4cycle_route_count as u64
3263            || delta.wcoj_clique5_dispatch_count
3264                < preflight.kclique_wcoj_plan_count_by_arity[0] as u64
3265            || delta.wcoj_clique6_dispatch_count
3266                < preflight.kclique_wcoj_plan_count_by_arity[1] as u64
3267            || delta.wcoj_clique7_dispatch_count
3268                < preflight.kclique_wcoj_plan_count_by_arity[2] as u64
3269            || delta.wcoj_clique8_dispatch_count
3270                < preflight.kclique_wcoj_plan_count_by_arity[3] as u64
3271        {
3272            return Ok(Self::MissingRequiredWcojDispatch {
3273                required_multiway_reductions,
3274                required_kclique_plans: preflight.kclique_wcoj_plan_count,
3275                observed_wcoj_dispatches,
3276                observed_kclique_dispatches,
3277            });
3278        }
3279
3280        let observed_layout_events = EpistemicGpuRuntimeCounters::checked_counter_sum(
3281            "wcoj_layout_events",
3282            &[
3283                delta.wcoj_layout_sort_invocation_count,
3284                delta.wcoj_layout_fast_path_hit_count,
3285            ],
3286        )?;
3287        if observed_layout_events < preflight.sorted_layout_requirement_count as u64 {
3288            return Ok(Self::MissingRequiredWcojLayout {
3289                required_sorted_layouts: preflight.sorted_layout_requirement_count,
3290                observed_layout_events,
3291            });
3292        }
3293
3294        if preflight.kclique_wcoj_plan_count > 0
3295            && (delta.kclique_metadata_build_count < preflight.kclique_wcoj_plan_count as u64
3296                || delta.kclique_metadata_build_nanos == 0)
3297        {
3298            return Ok(Self::MissingRequiredKcliqueMetadata {
3299                required_kclique_plans: preflight.kclique_wcoj_plan_count,
3300                observed_metadata_builds: delta.kclique_metadata_build_count,
3301                observed_metadata_build_nanos: delta.kclique_metadata_build_nanos,
3302            });
3303        }
3304
3305        Ok(Self::Certified {
3306            observed_wcoj_dispatches,
3307            certified_multiway_reductions: required_multiway_reductions,
3308            observed_kclique_dispatches,
3309            certified_edge_permutation_slots: preflight.kclique_wcoj_edge_permutation_count,
3310            certified_stream_groups: preflight.kclique_stream_group_count,
3311            certified_skew_scheduled_plans: preflight.kclique_skew_scheduled_plan_count,
3312            certified_sorted_layout_requirements: preflight.sorted_layout_requirement_count,
3313            certified_helper_split_specs: preflight.helper_split_spec_count,
3314            certified_helper_relation_rules: preflight.helper_relation_rule_count,
3315            certified_helper_relation_scans: preflight.helper_relation_scan_count,
3316            observed_layout_sorts: delta.wcoj_layout_sort_invocation_count,
3317            observed_layout_fast_path_hits: delta.wcoj_layout_fast_path_hit_count,
3318            observed_metadata_builds: delta.kclique_metadata_build_count,
3319            observed_metadata_build_nanos: delta.kclique_metadata_build_nanos,
3320            observed_histogram_refreshes: delta.kclique_histogram_refresh_count,
3321            observed_histogram_refresh_nanos: delta.kclique_histogram_refresh_nanos,
3322        })
3323    }
3324}
3325
3326#[allow(clippy::large_enum_variant)]
3327enum TupleSourceLaunch<'a> {
3328    ArityZero {
3329        literal_index: u32,
3330        reduction_index: u32,
3331        negated: u8,
3332        row_count: &'a TrackedCudaSlice<u32>,
3333    },
3334    ArityOne {
3335        literal_index: u32,
3336        reduction_index: u32,
3337        negated: u8,
3338        row_count: &'a TrackedCudaSlice<u32>,
3339        key_col0: &'a CudaColumn,
3340        key_col0_width: u32,
3341        expected_key_col0_bits: u64,
3342        expected_key_col0_type_code: u8,
3343    },
3344    ArityTwo {
3345        literal_index: u32,
3346        reduction_index: u32,
3347        negated: u8,
3348        row_count: &'a TrackedCudaSlice<u32>,
3349        key_col0: &'a CudaColumn,
3350        key_col0_width: u32,
3351        expected_key_col0_bits: u64,
3352        expected_key_col0_type_code: u8,
3353        key_col1: &'a CudaColumn,
3354        key_col1_width: u32,
3355        expected_key_col1_bits: u64,
3356        expected_key_col1_type_code: u8,
3357    },
3358    ArityThree {
3359        literal_index: u32,
3360        reduction_index: u32,
3361        negated: u8,
3362        row_count: &'a TrackedCudaSlice<u32>,
3363        key_col0: &'a CudaColumn,
3364        key_col0_width: u32,
3365        expected_key_col0_bits: u64,
3366        expected_key_col0_type_code: u8,
3367        key_col1: &'a CudaColumn,
3368        key_col1_width: u32,
3369        expected_key_col1_bits: u64,
3370        expected_key_col1_type_code: u8,
3371        key_col2: &'a CudaColumn,
3372        key_col2_width: u32,
3373        expected_key_col2_bits: u64,
3374        expected_key_col2_type_code: u8,
3375    },
3376    ArityN {
3377        literal_index: u32,
3378        reduction_index: u32,
3379        negated: u8,
3380        row_count: &'a TrackedCudaSlice<u32>,
3381        bound_value_row_count: &'a TrackedCudaSlice<u32>,
3382        key_col_count: u32,
3383        key_col_ptrs: TrackedCudaSlice<u64>,
3384        key_col_widths: TrackedCudaSlice<u32>,
3385        expected_key_bits: TrackedCudaSlice<u64>,
3386        expected_key_type_codes: TrackedCudaSlice<u8>,
3387        tuple_key_match_modes: TrackedCudaSlice<u8>,
3388        bound_value_col_ptrs: TrackedCudaSlice<u64>,
3389        bound_value_col_widths: TrackedCudaSlice<u32>,
3390        has_bound_value_keys: u8,
3391    },
3392}
3393
3394const TUPLE_KEY_MATCH_MODE_GROUND: u8 = 0;
3395const TUPLE_KEY_MATCH_MODE_BOUND_OUTPUT: u8 = 1;
3396/// Anonymous wildcard tuple-key position: matches any stable-model value.
3397const TUPLE_KEY_MATCH_MODE_WILDCARD: u8 = 2;
3398
3399#[derive(Debug, Clone, Copy, PartialEq, Eq)]
3400struct TupleKeyExpectation {
3401    bits: u64,
3402    type_code: u8,
3403}
3404
3405impl TupleKeyExpectation {
3406    fn from_term(term: &EirTerm, column_type: ScalarType) -> Result<Self> {
3407        let bits = match (term, column_type) {
3408            (EirTerm::Integer(value), ScalarType::U32) => {
3409                u32::try_from(*value).map(u64::from).map_err(|_| {
3410                    tuple_key_expectation_error(format!(
3411                        "integer {value} is out of range for U32 tuple-key column"
3412                    ))
3413                })?
3414            }
3415            (EirTerm::Integer(value), ScalarType::I32) => i32::try_from(*value)
3416                .map(|v| v as u32 as u64)
3417                .map_err(|_| {
3418                    tuple_key_expectation_error(format!(
3419                        "integer {value} is out of range for I32 tuple-key column"
3420                    ))
3421                })?,
3422            (EirTerm::Integer(value), ScalarType::U64) => u64::try_from(*value).map_err(|_| {
3423                tuple_key_expectation_error(format!(
3424                    "integer {value} is out of range for U64 tuple-key column"
3425                ))
3426            })?,
3427            (EirTerm::Integer(value), ScalarType::I64) => *value as u64,
3428            (EirTerm::Integer(value), ScalarType::Bool) => match *value {
3429                0 => 0,
3430                1 => 1,
3431                _ => {
3432                    return Err(tuple_key_expectation_error(format!(
3433                        "integer {value} is out of range for Bool tuple-key column"
3434                    )))
3435                }
3436            },
3437            (EirTerm::Symbol(value), ScalarType::Symbol) => u64::from(*value),
3438            (EirTerm::String(value), ScalarType::Symbol) => {
3439                u64::from(xlog_core::symbol::intern(value))
3440            }
3441            (EirTerm::FloatBits(bits), ScalarType::F64) => *bits,
3442            (EirTerm::FloatBits(bits), ScalarType::F32) => {
3443                (f64::from_bits(*bits) as f32).to_bits() as u64
3444            }
3445            (EirTerm::Variable(_), _) => {
3446                return Err(tuple_key_expectation_error(format!(
3447                    "term {term:?} cannot be encoded as a ground tuple-key expectation"
3448                )))
3449            }
3450            (
3451                EirTerm::Anonymous
3452                | EirTerm::List(_)
3453                | EirTerm::Cons { .. }
3454                | EirTerm::Compound { .. }
3455                | EirTerm::PredRef(_)
3456                | EirTerm::Aggregate { .. },
3457                _,
3458            ) => {
3459                return Err(tuple_key_expectation_error(format!(
3460                    "term {term:?} cannot be used for GPU tuple-key matching"
3461                )))
3462            }
3463            _ => {
3464                return Err(tuple_key_expectation_error(format!(
3465                    "term {term:?} cannot be encoded for {column_type:?} tuple-key column"
3466                )))
3467            }
3468        };
3469
3470        Ok(Self {
3471            bits,
3472            type_code: column_type.to_code(),
3473        })
3474    }
3475}
3476
3477fn tuple_key_expectation_error(context: String) -> XlogError {
3478    XlogError::UnsupportedEpistemicConstruct {
3479        construct: "epistemic GPU tuple-key expectation".to_string(),
3480        context,
3481    }
3482}
3483
3484impl Executor {
3485    /// Resolve a modal tuple-source relation, disambiguating same-name multi-arity
3486    /// modal predicates by arity.
3487    ///
3488    /// The relation store is keyed by name, so a program using the SAME predicate
3489    /// name at two different arities in modal literals (`know p(X)` over `p/1` AND
3490    /// `possible p(X,Y)` over `p/2`) could not resolve both sources under the bare
3491    /// name. Distinct arities ARE distinct relations, so this resolves the
3492    /// ARITY-QUALIFIED store key (`"p/1"`, `"p/2"`) FIRST, falling back to the bare
3493    /// predicate name when no qualified entry exists. Single-arity epistemic
3494    /// programs keep uploading under the bare name and hit the fallback unchanged
3495    /// (no regression); a multi-arity program uploads each arity under its own
3496    /// qualified key and both resolve distinctly.
3497    ///
3498    /// The `"/"` separator is collision-safe: parser predicate names cannot contain
3499    /// `"/"`, so a qualified key can never shadow a real bare-name relation.
3500    ///
3501    /// Resolution is structural (driven by `arity`, never by a specific arity VALUE
3502    /// or predicate NAME), so it introduces no special-casing.
3503    fn resolve_modal_tuple_source(&self, predicate: &str, arity: usize) -> Option<&CudaBuffer> {
3504        let qualified = format!("{predicate}/{arity}");
3505        self.store()
3506            .get(qualified.as_str())
3507            .or_else(|| self.store().get(predicate))
3508    }
3509
3510    /// Snapshot runtime counters used by epistemic GPU certification.
3511    pub fn epistemic_gpu_runtime_counters(&self) -> EpistemicGpuRuntimeCounters {
3512        EpistemicGpuRuntimeCounters {
3513            wcoj_triangle_dispatch_count: self.wcoj_triangle_dispatch_count(),
3514            wcoj_4cycle_dispatch_count: self.wcoj_4cycle_dispatch_count(),
3515            chain_dispatch_count: self.chain_dispatch_count(),
3516            wcoj_clique5_dispatch_count: self.wcoj_clique5_dispatch_count(),
3517            wcoj_clique6_dispatch_count: self.wcoj_clique6_dispatch_count(),
3518            wcoj_clique7_dispatch_count: self.wcoj_clique7_dispatch_count(),
3519            wcoj_clique8_dispatch_count: self.wcoj_clique8_dispatch_count(),
3520            free_join_dispatch_count: self.free_join_dispatch_count(),
3521            factorized_delta_dispatch_count: self.factorized_delta_dispatch_count(),
3522            provider_wcoj_triangle_hg_dispatch_count: self
3523                .provider
3524                .wcoj_triangle_hg_dispatch_count(),
3525            wcoj_layout_sort_invocation_count: self.provider.wcoj_layout_sort_invocation_count(),
3526            wcoj_layout_fast_path_hit_count: self.provider.wcoj_layout_fast_path_hit_count(),
3527            kclique_metadata_build_count: self.provider.kclique_metadata_build_count(),
3528            kclique_metadata_build_nanos: self.provider.kclique_metadata_build_nanos(),
3529            kclique_histogram_refresh_count: self.kclique_histogram_refresh_count(),
3530            kclique_histogram_refresh_nanos: self.kclique_histogram_refresh_nanos(),
3531        }
3532    }
3533
3534    fn time_epistemic_gpu_kernel_launch(
3535        &self,
3536        operation: &str,
3537        launch: impl FnOnce() -> std::result::Result<(), DriverError>,
3538    ) -> Result<EpistemicGpuKernelTimingTrace> {
3539        let stream = self.provider.device().inner().stream().clone();
3540        let start = stream
3541            .record_event(Some(sys::CUevent_flags::CU_EVENT_DEFAULT))
3542            .map_err(|e| XlogError::execution_ctx(operation, "record start timing event", &e))?;
3543        launch().map_err(|e| XlogError::execution_ctx(operation, "launch kernel", &e))?;
3544        let end = stream
3545            .record_event(Some(sys::CUevent_flags::CU_EVENT_DEFAULT))
3546            .map_err(|e| XlogError::execution_ctx(operation, "record end timing event", &e))?;
3547        let elapsed_ms = start
3548            .elapsed_ms(&end)
3549            .map_err(|e| XlogError::execution_ctx(operation, "measure CUDA event elapsed", &e))?;
3550
3551        EpistemicGpuKernelTimingTrace::from_cuda_elapsed_ms(elapsed_ms)
3552    }
3553
3554    /// Allocate GPU-resident buffers required by an epistemic GPU plan.
3555    pub fn allocate_epistemic_gpu_workspace(
3556        &self,
3557        plan: &EpistemicGpuPlan,
3558        capacities: EpistemicGpuWorkspaceCapacities,
3559    ) -> Result<EpistemicGpuWorkspace> {
3560        let layout = EpistemicGpuWorkspaceLayout::for_plan(plan, capacities)?;
3561        let memory = self.provider.memory();
3562
3563        Ok(EpistemicGpuWorkspace {
3564            layout,
3565            candidate_assumptions: memory.alloc::<u8>(layout.candidate_assumption_bytes)?,
3566            world_views: memory.alloc::<u8>(layout.world_view_bytes)?,
3567            model_membership: memory.alloc::<u8>(layout.model_membership_bytes)?,
3568            rejection_reasons: memory.alloc::<u32>(layout.rejection_reason_slots)?,
3569            constraint_violation_index: memory.alloc::<u32>(layout.rejection_reason_slots)?,
3570        })
3571    }
3572
3573    /// Zero every epistemic workspace buffer on device before hot-path use.
3574    pub fn reset_epistemic_gpu_workspace(
3575        &self,
3576        workspace: &mut EpistemicGpuWorkspace,
3577    ) -> Result<EpistemicGpuWorkspaceResetTrace> {
3578        let device = self.provider.device().inner();
3579
3580        device
3581            .memset_zeros(&mut workspace.candidate_assumptions)
3582            .map_err(|e| {
3583                XlogError::execution_ctx(
3584                    "epistemic GPU workspace reset",
3585                    "candidate assumptions memset",
3586                    &e,
3587                )
3588            })?;
3589        device
3590            .memset_zeros(&mut workspace.world_views)
3591            .map_err(|e| {
3592                XlogError::execution_ctx("epistemic GPU workspace reset", "world views memset", &e)
3593            })?;
3594        device
3595            .memset_zeros(&mut workspace.model_membership)
3596            .map_err(|e| {
3597                XlogError::execution_ctx(
3598                    "epistemic GPU workspace reset",
3599                    "model membership memset",
3600                    &e,
3601                )
3602            })?;
3603        device
3604            .memset_zeros(&mut workspace.rejection_reasons)
3605            .map_err(|e| {
3606                XlogError::execution_ctx(
3607                    "epistemic GPU workspace reset",
3608                    "rejection reasons memset",
3609                    &e,
3610                )
3611            })?;
3612
3613        EpistemicGpuWorkspaceResetTrace::try_for_layout(workspace.layout)
3614    }
3615
3616    /// Generate candidate-assumption bitsets directly into the GPU workspace.
3617    pub fn generate_epistemic_gpu_candidates(
3618        &self,
3619        workspace: &mut EpistemicGpuWorkspace,
3620        literal_count: usize,
3621        candidate_count: usize,
3622    ) -> Result<EpistemicGpuCandidateGenerationTrace> {
3623        let trace =
3624            EpistemicGpuCandidateGenerationTrace::for_counts(literal_count, candidate_count)?;
3625        if trace.candidate_assumption_bytes > workspace.layout.candidate_assumption_bytes {
3626            return Err(XlogError::ResourceExhausted {
3627                context: "epistemic GPU candidate assumption workspace".to_string(),
3628                estimated_bytes: trace.candidate_assumption_bytes as u64,
3629                budget_bytes: workspace.layout.candidate_assumption_bytes as u64,
3630            });
3631        }
3632        if trace.candidate_assumption_bytes > u32::MAX as usize {
3633            return Err(XlogError::ResourceExhausted {
3634                context: "epistemic GPU candidate generation launch".to_string(),
3635                estimated_bytes: trace.candidate_assumption_bytes as u64,
3636                budget_bytes: u32::MAX as u64,
3637            });
3638        }
3639
3640        let literal_count =
3641            checked_u32_dimension(literal_count, "epistemic GPU candidate generation literals")?;
3642        let candidate_count = checked_u32_dimension(
3643            candidate_count,
3644            "epistemic GPU candidate generation candidates",
3645        )?;
3646        let total = checked_u32_dimension(
3647            trace.candidate_assumption_bytes,
3648            "epistemic GPU candidate generation launch elements",
3649        )?;
3650        let func = self
3651            .provider
3652            .device()
3653            .inner()
3654            .get_func(
3655                EPISTEMIC_MODULE,
3656                epistemic_kernels::EPISTEMIC_GENERATE_CANDIDATE_ASSUMPTIONS_U8,
3657            )
3658            .ok_or_else(|| {
3659                XlogError::Execution("epistemic candidate generation kernel not found".to_string())
3660            })?;
3661        let config = LaunchConfig::for_num_elems(total);
3662
3663        let kernel_timing = self.time_epistemic_gpu_kernel_launch(
3664            "epistemic GPU candidate generation",
3665            || unsafe {
3666                // SAFETY: kernel arguments match the PTX signature; the workspace capacity check
3667                // above proves the output buffer covers literal_count * candidate_count bytes.
3668                func.clone().launch(
3669                    config,
3670                    (
3671                        literal_count,
3672                        candidate_count,
3673                        &mut workspace.candidate_assumptions,
3674                    ),
3675                )
3676            },
3677        )?;
3678
3679        Ok(trace.with_kernel_timing(kernel_timing))
3680    }
3681
3682    /// Propagate generated candidates into GPU-resident world-view staging buffers.
3683    pub fn propagate_epistemic_gpu_candidates(
3684        &self,
3685        workspace: &mut EpistemicGpuWorkspace,
3686        literal_count: usize,
3687        candidate_count: usize,
3688    ) -> Result<EpistemicGpuPropagationTrace> {
3689        let mut trace = EpistemicGpuPropagationTrace::for_counts(literal_count, candidate_count)?;
3690        let candidate_assumption_bytes = checked_product(literal_count, candidate_count)?;
3691        if candidate_assumption_bytes > workspace.layout.candidate_assumption_bytes {
3692            return Err(XlogError::ResourceExhausted {
3693                context: "epistemic GPU propagation candidate workspace".to_string(),
3694                estimated_bytes: candidate_assumption_bytes as u64,
3695                budget_bytes: workspace.layout.candidate_assumption_bytes as u64,
3696            });
3697        }
3698        if trace.rejection_reason_slots_written > workspace.layout.rejection_reason_slots {
3699            return Err(XlogError::ResourceExhausted {
3700                context: "epistemic GPU propagation rejection workspace".to_string(),
3701                estimated_bytes: trace.rejection_reason_slots_written as u64,
3702                budget_bytes: workspace.layout.rejection_reason_slots as u64,
3703            });
3704        }
3705        if literal_count > u32::MAX as usize || candidate_count > u32::MAX as usize {
3706            return Err(XlogError::ResourceExhausted {
3707                context: "epistemic GPU propagation launch".to_string(),
3708                estimated_bytes: literal_count.max(candidate_count) as u64,
3709                budget_bytes: u32::MAX as u64,
3710            });
3711        }
3712
3713        let world_stride =
3714            workspace.layout.world_view_bytes / workspace.layout.rejection_reason_slots;
3715        if world_stride == 0 || world_stride > u32::MAX as usize {
3716            return Err(XlogError::ResourceExhausted {
3717                context: "epistemic GPU propagation world stride".to_string(),
3718                estimated_bytes: world_stride as u64,
3719                budget_bytes: u32::MAX as u64,
3720            });
3721        }
3722        let world_view_bitset_bytes_per_candidate =
3723            world_view_bitset_bytes_per_candidate(literal_count)?;
3724        if world_view_bitset_bytes_per_candidate > world_stride {
3725            return Err(XlogError::ResourceExhausted {
3726                context: "epistemic GPU propagation world-view bitset stride".to_string(),
3727                estimated_bytes: world_view_bitset_bytes_per_candidate as u64,
3728                budget_bytes: world_stride as u64,
3729            });
3730        }
3731        let world_view_bitset_bytes =
3732            checked_product(world_view_bitset_bytes_per_candidate, candidate_count)?;
3733        if world_view_bitset_bytes > workspace.layout.world_view_bytes {
3734            return Err(XlogError::ResourceExhausted {
3735                context: "epistemic GPU propagation world-view bitsets".to_string(),
3736                estimated_bytes: world_view_bitset_bytes as u64,
3737                budget_bytes: workspace.layout.world_view_bytes as u64,
3738            });
3739        }
3740        trace.world_view_bytes_written = checked_product(world_stride, candidate_count)?;
3741        if trace.world_view_bytes_written > workspace.layout.world_view_bytes {
3742            return Err(XlogError::ResourceExhausted {
3743                context: "epistemic GPU propagation world-view workspace".to_string(),
3744                estimated_bytes: trace.world_view_bytes_written as u64,
3745                budget_bytes: workspace.layout.world_view_bytes as u64,
3746            });
3747        }
3748
3749        let literal_count =
3750            checked_u32_dimension(literal_count, "epistemic GPU propagation literals")?;
3751        let candidate_count =
3752            checked_u32_dimension(candidate_count, "epistemic GPU propagation candidates")?;
3753        let world_stride =
3754            checked_u32_dimension(world_stride, "epistemic GPU propagation world stride")?;
3755        let func = self
3756            .provider
3757            .device()
3758            .inner()
3759            .get_func(
3760                EPISTEMIC_MODULE,
3761                epistemic_kernels::EPISTEMIC_PROPAGATE_CANDIDATES_U8,
3762            )
3763            .ok_or_else(|| {
3764                XlogError::Execution("epistemic candidate propagation kernel not found".to_string())
3765            })?;
3766        let config = LaunchConfig::for_num_elems(candidate_count);
3767
3768        let kernel_timing = self.time_epistemic_gpu_kernel_launch(
3769            "epistemic GPU candidate propagation",
3770            || unsafe {
3771                // SAFETY: kernel arguments match the PTX signature; the capacity checks
3772                // above prove candidate, world-view, and rejection buffers cover all writes.
3773                func.clone().launch(
3774                    config,
3775                    (
3776                        literal_count,
3777                        candidate_count,
3778                        world_stride,
3779                        &workspace.candidate_assumptions,
3780                        &mut workspace.world_views,
3781                        &mut workspace.rejection_reasons,
3782                    ),
3783                )
3784            },
3785        )?;
3786
3787        Ok(trace.with_kernel_timing(kernel_timing))
3788    }
3789
3790    /// Validate staged candidate bitsets and world-view activity on device.
3791    pub fn validate_epistemic_gpu_candidates(
3792        &self,
3793        workspace: &mut EpistemicGpuWorkspace,
3794        literal_count: usize,
3795        candidate_count: usize,
3796    ) -> Result<EpistemicGpuCandidateValidationTrace> {
3797        let mut trace =
3798            EpistemicGpuCandidateValidationTrace::for_counts(literal_count, candidate_count)?;
3799        if trace.candidate_assumption_bytes_checked > workspace.layout.candidate_assumption_bytes {
3800            return Err(XlogError::ResourceExhausted {
3801                context: "epistemic GPU validation candidate workspace".to_string(),
3802                estimated_bytes: trace.candidate_assumption_bytes_checked as u64,
3803                budget_bytes: workspace.layout.candidate_assumption_bytes as u64,
3804            });
3805        }
3806        if trace.rejection_reason_slots_written > workspace.layout.rejection_reason_slots {
3807            return Err(XlogError::ResourceExhausted {
3808                context: "epistemic GPU validation rejection workspace".to_string(),
3809                estimated_bytes: trace.rejection_reason_slots_written as u64,
3810                budget_bytes: workspace.layout.rejection_reason_slots as u64,
3811            });
3812        }
3813        if literal_count > u32::MAX as usize || candidate_count > u32::MAX as usize {
3814            return Err(XlogError::ResourceExhausted {
3815                context: "epistemic GPU validation launch".to_string(),
3816                estimated_bytes: literal_count.max(candidate_count) as u64,
3817                budget_bytes: u32::MAX as u64,
3818            });
3819        }
3820
3821        let world_stride =
3822            workspace.layout.world_view_bytes / workspace.layout.rejection_reason_slots;
3823        if world_stride == 0 || world_stride > u32::MAX as usize {
3824            return Err(XlogError::ResourceExhausted {
3825                context: "epistemic GPU validation world stride".to_string(),
3826                estimated_bytes: world_stride as u64,
3827                budget_bytes: u32::MAX as u64,
3828            });
3829        }
3830        let world_view_bitset_bytes_per_candidate =
3831            world_view_bitset_bytes_per_candidate(literal_count)?;
3832        if world_view_bitset_bytes_per_candidate > world_stride {
3833            return Err(XlogError::ResourceExhausted {
3834                context: "epistemic GPU validation world-view bitset stride".to_string(),
3835                estimated_bytes: world_view_bitset_bytes_per_candidate as u64,
3836                budget_bytes: world_stride as u64,
3837            });
3838        }
3839        let world_view_bitset_bytes =
3840            checked_product(world_view_bitset_bytes_per_candidate, candidate_count)?;
3841        if world_view_bitset_bytes > workspace.layout.world_view_bytes {
3842            return Err(XlogError::ResourceExhausted {
3843                context: "epistemic GPU validation world-view bitsets".to_string(),
3844                estimated_bytes: world_view_bitset_bytes as u64,
3845                budget_bytes: workspace.layout.world_view_bytes as u64,
3846            });
3847        }
3848        trace.world_view_bytes_checked = world_view_bitset_bytes;
3849        if trace.world_view_bytes_checked > workspace.layout.world_view_bytes {
3850            return Err(XlogError::ResourceExhausted {
3851                context: "epistemic GPU validation world-view workspace".to_string(),
3852                estimated_bytes: trace.world_view_bytes_checked as u64,
3853                budget_bytes: workspace.layout.world_view_bytes as u64,
3854            });
3855        }
3856
3857        let literal_count =
3858            checked_u32_dimension(literal_count, "epistemic GPU validation literals")?;
3859        let candidate_count =
3860            checked_u32_dimension(candidate_count, "epistemic GPU validation candidates")?;
3861        let world_stride =
3862            checked_u32_dimension(world_stride, "epistemic GPU validation world stride")?;
3863        let func = self
3864            .provider
3865            .device()
3866            .inner()
3867            .get_func(
3868                EPISTEMIC_MODULE,
3869                epistemic_kernels::EPISTEMIC_VALIDATE_CANDIDATE_BITS_U8,
3870            )
3871            .ok_or_else(|| {
3872                XlogError::Execution("epistemic candidate validation kernel not found".to_string())
3873            })?;
3874        let config = LaunchConfig::for_num_elems(candidate_count);
3875
3876        let kernel_timing = self.time_epistemic_gpu_kernel_launch(
3877            "epistemic GPU candidate validation",
3878            || unsafe {
3879                // SAFETY: kernel arguments match the PTX signature; the capacity checks
3880                // above prove candidate, world-view, and rejection buffers cover all accesses.
3881                func.clone().launch(
3882                    config,
3883                    (
3884                        literal_count,
3885                        candidate_count,
3886                        world_stride,
3887                        &workspace.candidate_assumptions,
3888                        &workspace.world_views,
3889                        &mut workspace.rejection_reasons,
3890                    ),
3891                )
3892            },
3893        )?;
3894
3895        Ok(trace.with_kernel_timing(kernel_timing))
3896    }
3897
3898    /// Populate candidate-scoped model-membership staging buffers on device.
3899    pub fn populate_epistemic_gpu_model_membership(
3900        &self,
3901        workspace: &mut EpistemicGpuWorkspace,
3902        output: &CudaBuffer,
3903        literal_count: usize,
3904        candidate_count: usize,
3905        reduction_count: usize,
3906        models_per_reduction: usize,
3907    ) -> Result<EpistemicGpuModelMembershipTrace> {
3908        let trace = EpistemicGpuModelMembershipTrace::for_counts(
3909            literal_count,
3910            candidate_count,
3911            reduction_count,
3912            models_per_reduction,
3913        )?;
3914        let candidate_assumption_bytes = checked_product(literal_count, candidate_count)?;
3915        if candidate_assumption_bytes > workspace.layout.candidate_assumption_bytes {
3916            return Err(XlogError::ResourceExhausted {
3917                context: "epistemic GPU model-membership candidate workspace".to_string(),
3918                estimated_bytes: candidate_assumption_bytes as u64,
3919                budget_bytes: workspace.layout.candidate_assumption_bytes as u64,
3920            });
3921        }
3922        if candidate_count > workspace.layout.world_view_bytes {
3923            return Err(XlogError::ResourceExhausted {
3924                context: "epistemic GPU model-membership world-view workspace".to_string(),
3925                estimated_bytes: candidate_count as u64,
3926                budget_bytes: workspace.layout.world_view_bytes as u64,
3927            });
3928        }
3929        if trace.model_membership_bytes_written > workspace.layout.model_membership_bytes {
3930            return Err(XlogError::ResourceExhausted {
3931                context: "epistemic GPU model-membership workspace".to_string(),
3932                estimated_bytes: trace.model_membership_bytes_written as u64,
3933                budget_bytes: workspace.layout.model_membership_bytes as u64,
3934            });
3935        }
3936        if trace.rejection_reason_slots_checked > workspace.layout.rejection_reason_slots {
3937            return Err(XlogError::ResourceExhausted {
3938                context: "epistemic GPU model-membership rejection workspace".to_string(),
3939                estimated_bytes: trace.rejection_reason_slots_checked as u64,
3940                budget_bytes: workspace.layout.rejection_reason_slots as u64,
3941            });
3942        }
3943        if trace.model_membership_bytes_written > u32::MAX as usize {
3944            return Err(XlogError::ResourceExhausted {
3945                context: "epistemic GPU model-membership launch".to_string(),
3946                estimated_bytes: trace.model_membership_bytes_written as u64,
3947                budget_bytes: u32::MAX as u64,
3948            });
3949        }
3950        if literal_count > u32::MAX as usize
3951            || candidate_count > u32::MAX as usize
3952            || reduction_count > u32::MAX as usize
3953            || models_per_reduction > u32::MAX as usize
3954        {
3955            return Err(XlogError::ResourceExhausted {
3956                context: "epistemic GPU model-membership dimensions".to_string(),
3957                estimated_bytes: literal_count
3958                    .max(candidate_count)
3959                    .max(reduction_count)
3960                    .max(models_per_reduction) as u64,
3961                budget_bytes: u32::MAX as u64,
3962            });
3963        }
3964
3965        let world_stride =
3966            workspace.layout.world_view_bytes / workspace.layout.rejection_reason_slots;
3967        if world_stride == 0 || world_stride > u32::MAX as usize {
3968            return Err(XlogError::ResourceExhausted {
3969                context: "epistemic GPU model-membership world stride".to_string(),
3970                estimated_bytes: world_stride as u64,
3971                budget_bytes: u32::MAX as u64,
3972            });
3973        }
3974
3975        let literal_count = literal_count as u32;
3976        let candidate_count = candidate_count as u32;
3977        let reduction_count = reduction_count as u32;
3978        let models_per_reduction = models_per_reduction as u32;
3979        let world_stride = world_stride as u32;
3980        let total = trace.model_membership_bytes_written as u32;
3981        let func = self
3982            .provider
3983            .device()
3984            .inner()
3985            .get_func(
3986                EPISTEMIC_MODULE,
3987                epistemic_kernels::EPISTEMIC_POPULATE_MODEL_MEMBERSHIP_U8,
3988            )
3989            .ok_or_else(|| {
3990                XlogError::Execution("epistemic model-membership kernel not found".to_string())
3991            })?;
3992        let config = LaunchConfig::for_num_elems(total);
3993
3994        let kernel_timing =
3995            self.time_epistemic_gpu_kernel_launch("epistemic GPU model membership", || unsafe {
3996                // SAFETY: kernel arguments match the PTX signature; the capacity checks
3997                // above prove candidate, world-view, membership, and rejection buffers
3998                // cover all reads and writes.
3999                func.clone().launch(
4000                    config,
4001                    (
4002                        literal_count,
4003                        candidate_count,
4004                        reduction_count,
4005                        models_per_reduction,
4006                        world_stride,
4007                        output.num_rows_device(),
4008                        &workspace.candidate_assumptions,
4009                        &workspace.world_views,
4010                        &mut workspace.model_membership,
4011                        &mut workspace.rejection_reasons,
4012                    ),
4013                )
4014            })?;
4015
4016        Ok(trace.with_kernel_timing(kernel_timing))
4017    }
4018
4019    /// Populate model-membership bytes from reduced stable-model tuple sources.
4020    pub fn populate_epistemic_gpu_model_membership_from_tuple_sources(
4021        &self,
4022        workspace: &mut EpistemicGpuWorkspace,
4023        output: &CudaBuffer,
4024        gpu_plan: &EpistemicGpuPlan,
4025        candidate_count: usize,
4026        models_per_reduction: usize,
4027    ) -> Result<EpistemicGpuModelMembershipTrace> {
4028        gpu_plan.validate_tuple_membership_bindings()?;
4029
4030        let literal_count = gpu_plan.epistemic_literals.len();
4031        let reduction_count = gpu_plan.reductions.len();
4032        let tuple_source_key_column_count = gpu_plan
4033            .tuple_membership_bindings
4034            .iter()
4035            .try_fold(0usize, |acc, binding| {
4036                checked_sum(acc, binding.key_columns.len())
4037            })?;
4038        let mut trace =
4039            EpistemicGpuModelMembershipTrace::for_stable_model_tuple_sources_with_key_columns(
4040                literal_count,
4041                candidate_count,
4042                reduction_count,
4043                models_per_reduction,
4044                gpu_plan.tuple_membership_bindings.len(),
4045                tuple_source_key_column_count,
4046            )?;
4047        trace.output_row_count_device_reads = trace.kernel_launches;
4048        let candidate_assumption_bytes = checked_product(literal_count, candidate_count)?;
4049        if candidate_assumption_bytes > workspace.layout.candidate_assumption_bytes {
4050            return Err(XlogError::ResourceExhausted {
4051                context: "epistemic GPU model-membership candidate workspace".to_string(),
4052                estimated_bytes: candidate_assumption_bytes as u64,
4053                budget_bytes: workspace.layout.candidate_assumption_bytes as u64,
4054            });
4055        }
4056        if candidate_count > workspace.layout.world_view_bytes {
4057            return Err(XlogError::ResourceExhausted {
4058                context: "epistemic GPU model-membership world-view workspace".to_string(),
4059                estimated_bytes: candidate_count as u64,
4060                budget_bytes: workspace.layout.world_view_bytes as u64,
4061            });
4062        }
4063        if trace.model_membership_bytes_written > workspace.layout.model_membership_bytes {
4064            return Err(XlogError::ResourceExhausted {
4065                context: "epistemic GPU model-membership workspace".to_string(),
4066                estimated_bytes: trace.model_membership_bytes_written as u64,
4067                budget_bytes: workspace.layout.model_membership_bytes as u64,
4068            });
4069        }
4070        if trace.rejection_reason_slots_checked > workspace.layout.rejection_reason_slots {
4071            return Err(XlogError::ResourceExhausted {
4072                context: "epistemic GPU model-membership rejection workspace".to_string(),
4073                estimated_bytes: trace.rejection_reason_slots_checked as u64,
4074                budget_bytes: workspace.layout.rejection_reason_slots as u64,
4075            });
4076        }
4077        if trace.model_membership_bytes_written > u32::MAX as usize {
4078            return Err(XlogError::ResourceExhausted {
4079                context: "epistemic GPU model-membership launch".to_string(),
4080                estimated_bytes: trace.model_membership_bytes_written as u64,
4081                budget_bytes: u32::MAX as u64,
4082            });
4083        }
4084        if literal_count > u32::MAX as usize
4085            || candidate_count > u32::MAX as usize
4086            || reduction_count > u32::MAX as usize
4087            || models_per_reduction > u32::MAX as usize
4088        {
4089            return Err(XlogError::ResourceExhausted {
4090                context: "epistemic GPU model-membership dimensions".to_string(),
4091                estimated_bytes: literal_count
4092                    .max(candidate_count)
4093                    .max(reduction_count)
4094                    .max(models_per_reduction) as u64,
4095                budget_bytes: u32::MAX as u64,
4096            });
4097        }
4098
4099        let world_stride =
4100            workspace.layout.world_view_bytes / workspace.layout.rejection_reason_slots;
4101        if world_stride == 0 || world_stride > u32::MAX as usize {
4102            return Err(XlogError::ResourceExhausted {
4103                context: "epistemic GPU model-membership world stride".to_string(),
4104                estimated_bytes: world_stride as u64,
4105                budget_bytes: u32::MAX as u64,
4106            });
4107        }
4108
4109        let per_binding_launch_elems = checked_product(candidate_count, models_per_reduction)?;
4110        if per_binding_launch_elems > u32::MAX as usize {
4111            return Err(XlogError::ResourceExhausted {
4112                context: "epistemic GPU model-membership tuple-source launch".to_string(),
4113                estimated_bytes: per_binding_launch_elems as u64,
4114                budget_bytes: u32::MAX as u64,
4115            });
4116        }
4117
4118        let mut tuple_sources = Vec::with_capacity(gpu_plan.tuple_membership_bindings.len());
4119        for binding in &gpu_plan.tuple_membership_bindings {
4120            let source_relation = self
4121                .resolve_modal_tuple_source(binding.predicate.as_str(), binding.arity)
4122                .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4123                    construct: "epistemic GPU stable-model tuple membership".to_string(),
4124                    context: format!(
4125                        "missing reduced stable-model tuple source relation {} (arity {})",
4126                        binding.predicate, binding.arity
4127                    ),
4128                })?;
4129            if source_relation.arity() != binding.arity {
4130                return Err(XlogError::UnsupportedEpistemicConstruct {
4131                    construct: "epistemic GPU stable-model tuple membership".to_string(),
4132                    context: format!(
4133                        "tuple source relation {} arity {} does not match binding arity {}",
4134                        binding.predicate,
4135                        source_relation.arity(),
4136                        binding.arity
4137                    ),
4138                });
4139            }
4140            let has_bound_value_keys = binding
4141                .key_terms
4142                .iter()
4143                .any(|term| matches!(term, EirTerm::Variable(_)));
4144            // Anonymous wildcards are value-level matches handled only by the
4145            // general arm; route any binding carrying a variable or an anonymous
4146            // term there. The specialized arity arms remain a fast path for
4147            // all-ground tuple keys.
4148            let has_value_level_keys = binding
4149                .key_terms
4150                .iter()
4151                .any(|term| matches!(term, EirTerm::Variable(_) | EirTerm::Anonymous));
4152            match binding.key_columns.as_slice() {
4153                [] => tuple_sources.push(TupleSourceLaunch::ArityZero {
4154                    literal_index: binding.literal_index as u32,
4155                    reduction_index: binding.reduction_index as u32,
4156                    negated: binding.negated as u8,
4157                    row_count: source_relation.num_rows_device(),
4158                }),
4159                &[key_col] if !has_value_level_keys => {
4160                    let key_col0 = source_relation.column(key_col).ok_or_else(|| {
4161                        XlogError::UnsupportedEpistemicConstruct {
4162                            construct: "epistemic GPU stable-model tuple membership".to_string(),
4163                            context: format!(
4164                                "tuple source relation {} missing key column {}",
4165                                binding.predicate, key_col
4166                            ),
4167                        }
4168                    })?;
4169                    let key_col0_type =
4170                        source_relation
4171                            .schema()
4172                            .column_type(key_col)
4173                            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4174                                construct: "epistemic GPU stable-model tuple membership"
4175                                    .to_string(),
4176                                context: format!(
4177                                    "tuple source relation {} missing schema for key column {}",
4178                                    binding.predicate, key_col
4179                                ),
4180                            })?;
4181                    let key_col0_width = key_col0_type.size_bytes();
4182                    let key_col0_expectation =
4183                        TupleKeyExpectation::from_term(&binding.key_terms[0], key_col0_type)?;
4184                    if key_col0_width > u32::MAX as usize {
4185                        return Err(XlogError::ResourceExhausted {
4186                            context: "epistemic GPU tuple-key column width".to_string(),
4187                            estimated_bytes: key_col0_width as u64,
4188                            budget_bytes: u32::MAX as u64,
4189                        });
4190                    }
4191                    tuple_sources.push(TupleSourceLaunch::ArityOne {
4192                        literal_index: binding.literal_index as u32,
4193                        reduction_index: binding.reduction_index as u32,
4194                        negated: binding.negated as u8,
4195                        row_count: source_relation.num_rows_device(),
4196                        key_col0,
4197                        key_col0_width: key_col0_width as u32,
4198                        expected_key_col0_bits: key_col0_expectation.bits,
4199                        expected_key_col0_type_code: key_col0_expectation.type_code,
4200                    });
4201                }
4202                &[key_col0, key_col1] if !has_value_level_keys => {
4203                    let key_col0_ref = source_relation.column(key_col0).ok_or_else(|| {
4204                        XlogError::UnsupportedEpistemicConstruct {
4205                            construct: "epistemic GPU stable-model tuple membership".to_string(),
4206                            context: format!(
4207                                "tuple source relation {} missing key column {}",
4208                                binding.predicate, key_col0
4209                            ),
4210                        }
4211                    })?;
4212                    let key_col1_ref = source_relation.column(key_col1).ok_or_else(|| {
4213                        XlogError::UnsupportedEpistemicConstruct {
4214                            construct: "epistemic GPU stable-model tuple membership".to_string(),
4215                            context: format!(
4216                                "tuple source relation {} missing key column {}",
4217                                binding.predicate, key_col1
4218                            ),
4219                        }
4220                    })?;
4221                    let key_col0_type =
4222                        source_relation
4223                            .schema()
4224                            .column_type(key_col0)
4225                            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4226                                construct: "epistemic GPU stable-model tuple membership"
4227                                    .to_string(),
4228                                context: format!(
4229                                    "tuple source relation {} missing schema for key column {}",
4230                                    binding.predicate, key_col0
4231                                ),
4232                            })?;
4233                    let key_col1_type =
4234                        source_relation
4235                            .schema()
4236                            .column_type(key_col1)
4237                            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4238                                construct: "epistemic GPU stable-model tuple membership"
4239                                    .to_string(),
4240                                context: format!(
4241                                    "tuple source relation {} missing schema for key column {}",
4242                                    binding.predicate, key_col1
4243                                ),
4244                            })?;
4245                    let key_col0_width = key_col0_type.size_bytes();
4246                    let key_col1_width = key_col1_type.size_bytes();
4247                    let key_col0_expectation =
4248                        TupleKeyExpectation::from_term(&binding.key_terms[0], key_col0_type)?;
4249                    let key_col1_expectation =
4250                        TupleKeyExpectation::from_term(&binding.key_terms[1], key_col1_type)?;
4251                    let max_width = key_col0_width.max(key_col1_width);
4252                    if max_width > u32::MAX as usize {
4253                        return Err(XlogError::ResourceExhausted {
4254                            context: "epistemic GPU tuple-key column width".to_string(),
4255                            estimated_bytes: max_width as u64,
4256                            budget_bytes: u32::MAX as u64,
4257                        });
4258                    }
4259                    tuple_sources.push(TupleSourceLaunch::ArityTwo {
4260                        literal_index: binding.literal_index as u32,
4261                        reduction_index: binding.reduction_index as u32,
4262                        negated: binding.negated as u8,
4263                        row_count: source_relation.num_rows_device(),
4264                        key_col0: key_col0_ref,
4265                        key_col0_width: key_col0_width as u32,
4266                        expected_key_col0_bits: key_col0_expectation.bits,
4267                        expected_key_col0_type_code: key_col0_expectation.type_code,
4268                        key_col1: key_col1_ref,
4269                        key_col1_width: key_col1_width as u32,
4270                        expected_key_col1_bits: key_col1_expectation.bits,
4271                        expected_key_col1_type_code: key_col1_expectation.type_code,
4272                    });
4273                }
4274                &[key_col0, key_col1, key_col2] if !has_value_level_keys => {
4275                    let key_col0_ref = source_relation.column(key_col0).ok_or_else(|| {
4276                        XlogError::UnsupportedEpistemicConstruct {
4277                            construct: "epistemic GPU stable-model tuple membership".to_string(),
4278                            context: format!(
4279                                "tuple source relation {} missing key column {}",
4280                                binding.predicate, key_col0
4281                            ),
4282                        }
4283                    })?;
4284                    let key_col1_ref = source_relation.column(key_col1).ok_or_else(|| {
4285                        XlogError::UnsupportedEpistemicConstruct {
4286                            construct: "epistemic GPU stable-model tuple membership".to_string(),
4287                            context: format!(
4288                                "tuple source relation {} missing key column {}",
4289                                binding.predicate, key_col1
4290                            ),
4291                        }
4292                    })?;
4293                    let key_col2_ref = source_relation.column(key_col2).ok_or_else(|| {
4294                        XlogError::UnsupportedEpistemicConstruct {
4295                            construct: "epistemic GPU stable-model tuple membership".to_string(),
4296                            context: format!(
4297                                "tuple source relation {} missing key column {}",
4298                                binding.predicate, key_col2
4299                            ),
4300                        }
4301                    })?;
4302                    let key_col0_type =
4303                        source_relation
4304                            .schema()
4305                            .column_type(key_col0)
4306                            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4307                                construct: "epistemic GPU stable-model tuple membership"
4308                                    .to_string(),
4309                                context: format!(
4310                                    "tuple source relation {} missing schema for key column {}",
4311                                    binding.predicate, key_col0
4312                                ),
4313                            })?;
4314                    let key_col1_type =
4315                        source_relation
4316                            .schema()
4317                            .column_type(key_col1)
4318                            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4319                                construct: "epistemic GPU stable-model tuple membership"
4320                                    .to_string(),
4321                                context: format!(
4322                                    "tuple source relation {} missing schema for key column {}",
4323                                    binding.predicate, key_col1
4324                                ),
4325                            })?;
4326                    let key_col2_type =
4327                        source_relation
4328                            .schema()
4329                            .column_type(key_col2)
4330                            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4331                                construct: "epistemic GPU stable-model tuple membership"
4332                                    .to_string(),
4333                                context: format!(
4334                                    "tuple source relation {} missing schema for key column {}",
4335                                    binding.predicate, key_col2
4336                                ),
4337                            })?;
4338                    let key_col0_width = key_col0_type.size_bytes();
4339                    let key_col1_width = key_col1_type.size_bytes();
4340                    let key_col2_width = key_col2_type.size_bytes();
4341                    let key_col0_expectation =
4342                        TupleKeyExpectation::from_term(&binding.key_terms[0], key_col0_type)?;
4343                    let key_col1_expectation =
4344                        TupleKeyExpectation::from_term(&binding.key_terms[1], key_col1_type)?;
4345                    let key_col2_expectation =
4346                        TupleKeyExpectation::from_term(&binding.key_terms[2], key_col2_type)?;
4347                    let max_width = key_col0_width.max(key_col1_width).max(key_col2_width);
4348                    if max_width > u32::MAX as usize {
4349                        return Err(XlogError::ResourceExhausted {
4350                            context: "epistemic GPU tuple-key column width".to_string(),
4351                            estimated_bytes: max_width as u64,
4352                            budget_bytes: u32::MAX as u64,
4353                        });
4354                    }
4355                    tuple_sources.push(TupleSourceLaunch::ArityThree {
4356                        literal_index: binding.literal_index as u32,
4357                        reduction_index: binding.reduction_index as u32,
4358                        negated: binding.negated as u8,
4359                        row_count: source_relation.num_rows_device(),
4360                        key_col0: key_col0_ref,
4361                        key_col0_width: key_col0_width as u32,
4362                        expected_key_col0_bits: key_col0_expectation.bits,
4363                        expected_key_col0_type_code: key_col0_expectation.type_code,
4364                        key_col1: key_col1_ref,
4365                        key_col1_width: key_col1_width as u32,
4366                        expected_key_col1_bits: key_col1_expectation.bits,
4367                        expected_key_col1_type_code: key_col1_expectation.type_code,
4368                        key_col2: key_col2_ref,
4369                        key_col2_width: key_col2_width as u32,
4370                        expected_key_col2_bits: key_col2_expectation.bits,
4371                        expected_key_col2_type_code: key_col2_expectation.type_code,
4372                    });
4373                }
4374                key_columns => {
4375                    if key_columns.len() > u32::MAX as usize {
4376                        return Err(XlogError::ResourceExhausted {
4377                            context: "epistemic GPU tuple-key arity".to_string(),
4378                            estimated_bytes: key_columns.len() as u64,
4379                            budget_bytes: u32::MAX as u64,
4380                        });
4381                    }
4382
4383                    let mut key_col_ptrs_host = Vec::with_capacity(key_columns.len());
4384                    let mut key_col_widths_host = Vec::with_capacity(key_columns.len());
4385                    let mut expected_key_bits_host = Vec::with_capacity(key_columns.len());
4386                    let mut expected_key_type_codes_host = Vec::with_capacity(key_columns.len());
4387                    let mut tuple_key_match_modes_host = Vec::with_capacity(key_columns.len());
4388                    let mut bound_value_col_ptrs_host = Vec::with_capacity(key_columns.len());
4389                    let mut bound_value_col_widths_host = Vec::with_capacity(key_columns.len());
4390                    for (term_index, &key_col) in key_columns.iter().enumerate() {
4391                        let key_col_ref = source_relation.column(key_col).ok_or_else(|| {
4392                            XlogError::UnsupportedEpistemicConstruct {
4393                                construct: "epistemic GPU stable-model tuple membership"
4394                                    .to_string(),
4395                                context: format!(
4396                                    "tuple source relation {} missing key column {}",
4397                                    binding.predicate, key_col
4398                                ),
4399                            }
4400                        })?;
4401                        let key_col_type = source_relation
4402                            .schema()
4403                            .column_type(key_col)
4404                            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4405                                construct: "epistemic GPU stable-model tuple membership"
4406                                    .to_string(),
4407                                context: format!(
4408                                    "tuple source relation {} missing schema for key column {}",
4409                                    binding.predicate, key_col
4410                                ),
4411                            })?;
4412                        let key_col_width = key_col_type.size_bytes();
4413                        if key_col_width > u32::MAX as usize {
4414                            return Err(XlogError::ResourceExhausted {
4415                                context: "epistemic GPU tuple-key column width".to_string(),
4416                                estimated_bytes: key_col_width as u64,
4417                                budget_bytes: u32::MAX as u64,
4418                            });
4419                        }
4420
4421                        key_col_ptrs_host.push(*key_col_ref.device_ptr());
4422                        key_col_widths_host.push(key_col_width as u32);
4423                        match &binding.key_terms[term_index] {
4424                            EirTerm::Variable(variable_name) => {
4425                                let bound_col_index = binding.bound_output_columns[term_index]
4426                                    .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
4427                                        construct: "epistemic GPU bound tuple-key matching"
4428                                            .to_string(),
4429                                        context: format!(
4430                                            "tuple key variable {variable_name} has no reduced \
4431                                             output column binding"
4432                                        ),
4433                                    })?;
4434                                let bound_col =
4435                                    output.column(bound_col_index).ok_or_else(|| {
4436                                        XlogError::UnsupportedEpistemicConstruct {
4437                                            construct: "epistemic GPU bound tuple-key matching"
4438                                                .to_string(),
4439                                            context: format!(
4440                                                "reduced output is missing device column \
4441                                             {bound_col_index} for variable {variable_name}"
4442                                            ),
4443                                        }
4444                                    })?;
4445                                let bound_col_type =
4446                                    output.schema().column_type(bound_col_index).ok_or_else(
4447                                        || XlogError::UnsupportedEpistemicConstruct {
4448                                            construct: "epistemic GPU bound tuple-key matching"
4449                                                .to_string(),
4450                                            context: format!(
4451                                                "reduced output is missing schema for variable \
4452                                             {variable_name}"
4453                                            ),
4454                                        },
4455                                    )?;
4456                                if bound_col_type != key_col_type {
4457                                    return Err(XlogError::UnsupportedEpistemicConstruct {
4458                                        construct: "epistemic GPU bound tuple-key matching"
4459                                            .to_string(),
4460                                        context: format!(
4461                                            "bound variable {variable_name} has output type \
4462                                             {bound_col_type:?}, but tuple source {} key column \
4463                                             {} has type {key_col_type:?}",
4464                                            binding.predicate, key_col
4465                                        ),
4466                                    });
4467                                }
4468                                let bound_col_width = bound_col_type.size_bytes();
4469                                if bound_col_width > u32::MAX as usize {
4470                                    return Err(XlogError::ResourceExhausted {
4471                                        context: "epistemic GPU bound tuple-key column width"
4472                                            .to_string(),
4473                                        estimated_bytes: bound_col_width as u64,
4474                                        budget_bytes: u32::MAX as u64,
4475                                    });
4476                                }
4477
4478                                expected_key_bits_host.push(0);
4479                                expected_key_type_codes_host.push(key_col_type.to_code());
4480                                tuple_key_match_modes_host.push(TUPLE_KEY_MATCH_MODE_BOUND_OUTPUT);
4481                                bound_value_col_ptrs_host.push(*bound_col.device_ptr());
4482                                bound_value_col_widths_host.push(bound_col_width as u32);
4483                            }
4484                            EirTerm::Anonymous => {
4485                                // Wildcard: no equality requirement on this
4486                                // tuple-key column. The device still reads the
4487                                // column pointer/width, but the kernel matches
4488                                // every stable-model value in this position.
4489                                expected_key_bits_host.push(0);
4490                                expected_key_type_codes_host.push(key_col_type.to_code());
4491                                tuple_key_match_modes_host.push(TUPLE_KEY_MATCH_MODE_WILDCARD);
4492                                bound_value_col_ptrs_host.push(0);
4493                                bound_value_col_widths_host.push(0);
4494                            }
4495                            term => {
4496                                let expectation =
4497                                    TupleKeyExpectation::from_term(term, key_col_type)?;
4498                                expected_key_bits_host.push(expectation.bits);
4499                                expected_key_type_codes_host.push(expectation.type_code);
4500                                tuple_key_match_modes_host.push(TUPLE_KEY_MATCH_MODE_GROUND);
4501                                bound_value_col_ptrs_host.push(0);
4502                                bound_value_col_widths_host.push(0);
4503                            }
4504                        }
4505                    }
4506
4507                    let memory = self.provider.memory();
4508                    let mut key_col_ptrs = memory.alloc::<u64>(key_columns.len())?;
4509                    let mut key_col_widths = memory.alloc::<u32>(key_columns.len())?;
4510                    let mut expected_key_bits = memory.alloc::<u64>(key_columns.len())?;
4511                    let mut expected_key_type_codes = memory.alloc::<u8>(key_columns.len())?;
4512                    let mut tuple_key_match_modes = memory.alloc::<u8>(key_columns.len())?;
4513                    let mut bound_value_col_ptrs = memory.alloc::<u64>(key_columns.len())?;
4514                    let mut bound_value_col_widths = memory.alloc::<u32>(key_columns.len())?;
4515                    self.provider
4516                        .htod_launch_metadata_sync_copy_into(&key_col_ptrs_host, &mut key_col_ptrs)
4517                        .map_err(|e| {
4518                            XlogError::execution_ctx(
4519                                "epistemic GPU tuple-key metadata",
4520                                "upload key column pointers",
4521                                &e,
4522                            )
4523                        })?;
4524                    self.provider
4525                        .htod_launch_metadata_sync_copy_into(
4526                            &key_col_widths_host,
4527                            &mut key_col_widths,
4528                        )
4529                        .map_err(|e| {
4530                            XlogError::execution_ctx(
4531                                "epistemic GPU tuple-key metadata",
4532                                "upload key column widths",
4533                                &e,
4534                            )
4535                        })?;
4536                    self.provider
4537                        .htod_launch_metadata_sync_copy_into(
4538                            &expected_key_bits_host,
4539                            &mut expected_key_bits,
4540                        )
4541                        .map_err(|e| {
4542                            XlogError::execution_ctx(
4543                                "epistemic GPU tuple-key metadata",
4544                                "upload expected key bits",
4545                                &e,
4546                            )
4547                        })?;
4548                    self.provider
4549                        .htod_launch_metadata_sync_copy_into(
4550                            &expected_key_type_codes_host,
4551                            &mut expected_key_type_codes,
4552                        )
4553                        .map_err(|e| {
4554                            XlogError::execution_ctx(
4555                                "epistemic GPU tuple-key metadata",
4556                                "upload expected key type codes",
4557                                &e,
4558                            )
4559                        })?;
4560                    self.provider
4561                        .htod_launch_metadata_sync_copy_into(
4562                            &tuple_key_match_modes_host,
4563                            &mut tuple_key_match_modes,
4564                        )
4565                        .map_err(|e| {
4566                            XlogError::execution_ctx(
4567                                "epistemic GPU tuple-key metadata",
4568                                "upload tuple key match modes",
4569                                &e,
4570                            )
4571                        })?;
4572                    self.provider
4573                        .htod_launch_metadata_sync_copy_into(
4574                            &bound_value_col_ptrs_host,
4575                            &mut bound_value_col_ptrs,
4576                        )
4577                        .map_err(|e| {
4578                            XlogError::execution_ctx(
4579                                "epistemic GPU tuple-key metadata",
4580                                "upload bound value column pointers",
4581                                &e,
4582                            )
4583                        })?;
4584                    self.provider
4585                        .htod_launch_metadata_sync_copy_into(
4586                            &bound_value_col_widths_host,
4587                            &mut bound_value_col_widths,
4588                        )
4589                        .map_err(|e| {
4590                            XlogError::execution_ctx(
4591                                "epistemic GPU tuple-key metadata",
4592                                "upload bound value column widths",
4593                                &e,
4594                            )
4595                        })?;
4596
4597                    tuple_sources.push(TupleSourceLaunch::ArityN {
4598                        literal_index: binding.literal_index as u32,
4599                        reduction_index: binding.reduction_index as u32,
4600                        negated: binding.negated as u8,
4601                        row_count: source_relation.num_rows_device(),
4602                        bound_value_row_count: output.num_rows_device(),
4603                        key_col_count: key_columns.len() as u32,
4604                        key_col_ptrs,
4605                        key_col_widths,
4606                        expected_key_bits,
4607                        expected_key_type_codes,
4608                        tuple_key_match_modes,
4609                        bound_value_col_ptrs,
4610                        bound_value_col_widths,
4611                        has_bound_value_keys: has_bound_value_keys as u8,
4612                    });
4613                }
4614            }
4615        }
4616
4617        let literal_count = literal_count as u32;
4618        let candidate_count = candidate_count as u32;
4619        let reduction_count = reduction_count as u32;
4620        let models_per_reduction = models_per_reduction as u32;
4621        let world_stride = world_stride as u32;
4622        let func = self
4623            .provider
4624            .device()
4625            .inner()
4626            .get_func(
4627                EPISTEMIC_MODULE,
4628                epistemic_kernels::EPISTEMIC_POPULATE_MODEL_MEMBERSHIP_FROM_TUPLE_SOURCE_U8,
4629            )
4630            .ok_or_else(|| {
4631                XlogError::Execution(
4632                    "epistemic tuple-source model-membership kernel not found".to_string(),
4633                )
4634            })?;
4635        let func_arity1 = self
4636            .provider
4637            .device()
4638            .inner()
4639            .get_func(
4640                EPISTEMIC_MODULE,
4641                epistemic_kernels::EPISTEMIC_POPULATE_MODEL_MEMBERSHIP_FROM_TUPLE_SOURCE_ARITY1_U8,
4642            )
4643            .ok_or_else(|| {
4644                XlogError::Execution(
4645                    "epistemic arity-one tuple-source model-membership kernel not found"
4646                        .to_string(),
4647                )
4648            })?;
4649        let func_arity2 = self
4650            .provider
4651            .device()
4652            .inner()
4653            .get_func(
4654                EPISTEMIC_MODULE,
4655                epistemic_kernels::EPISTEMIC_POPULATE_MODEL_MEMBERSHIP_FROM_TUPLE_SOURCE_ARITY2_U8,
4656            )
4657            .ok_or_else(|| {
4658                XlogError::Execution(
4659                    "epistemic arity-two tuple-source model-membership kernel not found"
4660                        .to_string(),
4661                )
4662            })?;
4663        let func_arity3 = self
4664            .provider
4665            .device()
4666            .inner()
4667            .get_func(
4668                EPISTEMIC_MODULE,
4669                epistemic_kernels::EPISTEMIC_POPULATE_MODEL_MEMBERSHIP_FROM_TUPLE_SOURCE_ARITY3_U8,
4670            )
4671            .ok_or_else(|| {
4672                XlogError::Execution(
4673                    "epistemic arity-three tuple-source model-membership kernel not found"
4674                        .to_string(),
4675                )
4676            })?;
4677        let func_arity_n = self
4678            .provider
4679            .device()
4680            .inner()
4681            .get_func(
4682                EPISTEMIC_MODULE,
4683                epistemic_kernels::EPISTEMIC_POPULATE_MODEL_MEMBERSHIP_FROM_TUPLE_SOURCE_ARITY_N_U8,
4684            )
4685            .ok_or_else(|| {
4686                XlogError::Execution(
4687                    "epistemic generic-arity tuple-source model-membership kernel not found"
4688                        .to_string(),
4689                )
4690            })?;
4691        let config = LaunchConfig::for_num_elems(per_binding_launch_elems as u32);
4692
4693        let mut kernel_timings = Vec::with_capacity(tuple_sources.len());
4694        for tuple_source in &tuple_sources {
4695            let kernel_timing = self.time_epistemic_gpu_kernel_launch(
4696                "epistemic GPU tuple-source model membership",
4697                || unsafe {
4698                    match tuple_source {
4699                        TupleSourceLaunch::ArityZero {
4700                            literal_index,
4701                            reduction_index,
4702                            negated,
4703                            row_count,
4704                        } => {
4705                            // SAFETY: kernel arguments match the PTX signature; the capacity
4706                            // checks above prove candidate, world-view, membership, rejection,
4707                            // and tuple-source row-count buffers cover all accesses.
4708                            let mut params: Vec<*mut c_void> = vec![
4709                                literal_count.as_kernel_param(),
4710                                candidate_count.as_kernel_param(),
4711                                reduction_count.as_kernel_param(),
4712                                models_per_reduction.as_kernel_param(),
4713                                world_stride.as_kernel_param(),
4714                                literal_index.as_kernel_param(),
4715                                reduction_index.as_kernel_param(),
4716                                negated.as_kernel_param(),
4717                                output.num_rows_device().as_kernel_param(),
4718                                row_count.as_kernel_param(),
4719                                (&workspace.candidate_assumptions).as_kernel_param(),
4720                                (&workspace.world_views).as_kernel_param(),
4721                                (&workspace.model_membership).as_kernel_param(),
4722                                (&workspace.rejection_reasons).as_kernel_param(),
4723                            ];
4724                            func.clone().launch(config, &mut params)?;
4725                        }
4726                        TupleSourceLaunch::ArityOne {
4727                            literal_index,
4728                            reduction_index,
4729                            negated,
4730                            row_count,
4731                            key_col0,
4732                            key_col0_width,
4733                            expected_key_col0_bits,
4734                            expected_key_col0_type_code,
4735                        } => {
4736                            // SAFETY: kernel arguments match the PTX signature; capacity checks
4737                            // above cover workspace buffers, row_count comes from the named source
4738                            // relation, and key_col0/key_col0_width are schema-validated.
4739                            let mut params: Vec<*mut c_void> = vec![
4740                                literal_count.as_kernel_param(),
4741                                candidate_count.as_kernel_param(),
4742                                reduction_count.as_kernel_param(),
4743                                models_per_reduction.as_kernel_param(),
4744                                world_stride.as_kernel_param(),
4745                                literal_index.as_kernel_param(),
4746                                reduction_index.as_kernel_param(),
4747                                negated.as_kernel_param(),
4748                                output.num_rows_device().as_kernel_param(),
4749                                row_count.as_kernel_param(),
4750                                key_col0.as_kernel_param(),
4751                                key_col0_width.as_kernel_param(),
4752                                expected_key_col0_bits.as_kernel_param(),
4753                                expected_key_col0_type_code.as_kernel_param(),
4754                                (&workspace.candidate_assumptions).as_kernel_param(),
4755                                (&workspace.world_views).as_kernel_param(),
4756                                (&workspace.model_membership).as_kernel_param(),
4757                                (&workspace.rejection_reasons).as_kernel_param(),
4758                            ];
4759                            func_arity1.clone().launch(config, &mut params)?;
4760                        }
4761                        TupleSourceLaunch::ArityTwo {
4762                            literal_index,
4763                            reduction_index,
4764                            negated,
4765                            row_count,
4766                            key_col0,
4767                            key_col0_width,
4768                            expected_key_col0_bits,
4769                            expected_key_col0_type_code,
4770                            key_col1,
4771                            key_col1_width,
4772                            expected_key_col1_bits,
4773                            expected_key_col1_type_code,
4774                        } => {
4775                            // SAFETY: kernel arguments match the PTX signature; capacity checks
4776                            // above cover workspace buffers, row_count comes from the named source
4777                            // relation, and both key columns are schema-validated.
4778                            let mut params: Vec<*mut c_void> = vec![
4779                                literal_count.as_kernel_param(),
4780                                candidate_count.as_kernel_param(),
4781                                reduction_count.as_kernel_param(),
4782                                models_per_reduction.as_kernel_param(),
4783                                world_stride.as_kernel_param(),
4784                                literal_index.as_kernel_param(),
4785                                reduction_index.as_kernel_param(),
4786                                negated.as_kernel_param(),
4787                                output.num_rows_device().as_kernel_param(),
4788                                row_count.as_kernel_param(),
4789                                key_col0.as_kernel_param(),
4790                                key_col0_width.as_kernel_param(),
4791                                expected_key_col0_bits.as_kernel_param(),
4792                                expected_key_col0_type_code.as_kernel_param(),
4793                                key_col1.as_kernel_param(),
4794                                key_col1_width.as_kernel_param(),
4795                                expected_key_col1_bits.as_kernel_param(),
4796                                expected_key_col1_type_code.as_kernel_param(),
4797                                (&workspace.candidate_assumptions).as_kernel_param(),
4798                                (&workspace.world_views).as_kernel_param(),
4799                                (&workspace.model_membership).as_kernel_param(),
4800                                (&workspace.rejection_reasons).as_kernel_param(),
4801                            ];
4802                            func_arity2.clone().launch(config, &mut params)?;
4803                        }
4804                        TupleSourceLaunch::ArityThree {
4805                            literal_index,
4806                            reduction_index,
4807                            negated,
4808                            row_count,
4809                            key_col0,
4810                            key_col0_width,
4811                            expected_key_col0_bits,
4812                            expected_key_col0_type_code,
4813                            key_col1,
4814                            key_col1_width,
4815                            expected_key_col1_bits,
4816                            expected_key_col1_type_code,
4817                            key_col2,
4818                            key_col2_width,
4819                            expected_key_col2_bits,
4820                            expected_key_col2_type_code,
4821                        } => {
4822                            // SAFETY: kernel arguments match the PTX signature; capacity checks
4823                            // above cover workspace buffers, row_count comes from the named source
4824                            // relation, and all key columns are schema-validated.
4825                            let mut params: Vec<*mut c_void> = vec![
4826                                literal_count.as_kernel_param(),
4827                                candidate_count.as_kernel_param(),
4828                                reduction_count.as_kernel_param(),
4829                                models_per_reduction.as_kernel_param(),
4830                                world_stride.as_kernel_param(),
4831                                literal_index.as_kernel_param(),
4832                                reduction_index.as_kernel_param(),
4833                                negated.as_kernel_param(),
4834                                output.num_rows_device().as_kernel_param(),
4835                                row_count.as_kernel_param(),
4836                                key_col0.as_kernel_param(),
4837                                key_col0_width.as_kernel_param(),
4838                                expected_key_col0_bits.as_kernel_param(),
4839                                expected_key_col0_type_code.as_kernel_param(),
4840                                key_col1.as_kernel_param(),
4841                                key_col1_width.as_kernel_param(),
4842                                expected_key_col1_bits.as_kernel_param(),
4843                                expected_key_col1_type_code.as_kernel_param(),
4844                                key_col2.as_kernel_param(),
4845                                key_col2_width.as_kernel_param(),
4846                                expected_key_col2_bits.as_kernel_param(),
4847                                expected_key_col2_type_code.as_kernel_param(),
4848                                (&workspace.candidate_assumptions).as_kernel_param(),
4849                                (&workspace.world_views).as_kernel_param(),
4850                                (&workspace.model_membership).as_kernel_param(),
4851                                (&workspace.rejection_reasons).as_kernel_param(),
4852                            ];
4853                            func_arity3.clone().launch(config, &mut params)?;
4854                        }
4855                        TupleSourceLaunch::ArityN {
4856                            literal_index,
4857                            reduction_index,
4858                            negated,
4859                            row_count,
4860                            bound_value_row_count,
4861                            key_col_count,
4862                            key_col_ptrs,
4863                            key_col_widths,
4864                            expected_key_bits,
4865                            expected_key_type_codes,
4866                            tuple_key_match_modes,
4867                            bound_value_col_ptrs,
4868                            bound_value_col_widths,
4869                            has_bound_value_keys,
4870                        } => {
4871                            // SAFETY: kernel arguments match the PTX signature; capacity checks
4872                            // above cover workspace buffers, row_count comes from the named source
4873                            // relation, and pointer/width/expectation arrays are device-resident
4874                            // launch metadata for existing relation and reduced-output columns.
4875                            let mut params: Vec<*mut c_void> = vec![
4876                                literal_count.as_kernel_param(),
4877                                candidate_count.as_kernel_param(),
4878                                reduction_count.as_kernel_param(),
4879                                models_per_reduction.as_kernel_param(),
4880                                world_stride.as_kernel_param(),
4881                                literal_index.as_kernel_param(),
4882                                reduction_index.as_kernel_param(),
4883                                negated.as_kernel_param(),
4884                                output.num_rows_device().as_kernel_param(),
4885                                row_count.as_kernel_param(),
4886                                key_col_ptrs.as_kernel_param(),
4887                                key_col_widths.as_kernel_param(),
4888                                expected_key_bits.as_kernel_param(),
4889                                expected_key_type_codes.as_kernel_param(),
4890                                tuple_key_match_modes.as_kernel_param(),
4891                                bound_value_col_ptrs.as_kernel_param(),
4892                                bound_value_col_widths.as_kernel_param(),
4893                                bound_value_row_count.as_kernel_param(),
4894                                key_col_count.as_kernel_param(),
4895                                has_bound_value_keys.as_kernel_param(),
4896                                (&workspace.candidate_assumptions).as_kernel_param(),
4897                                (&workspace.world_views).as_kernel_param(),
4898                                (&workspace.model_membership).as_kernel_param(),
4899                                (&workspace.rejection_reasons).as_kernel_param(),
4900                            ];
4901                            func_arity_n.clone().launch(config, &mut params)?;
4902                        }
4903                    };
4904                    Ok(())
4905                },
4906            )?;
4907            kernel_timings.push(kernel_timing);
4908        }
4909        let kernel_timing = EpistemicGpuKernelTimingTrace::checked_sum(kernel_timings)?;
4910
4911        Ok(trace.with_kernel_timing(kernel_timing))
4912    }
4913
4914    /// Validate staged model memberships against candidate world views on device.
4915    pub fn validate_epistemic_gpu_world_views(
4916        &self,
4917        workspace: &mut EpistemicGpuWorkspace,
4918        gpu_plan: &EpistemicGpuPlan,
4919        candidate_count: usize,
4920        models_per_reduction: usize,
4921    ) -> Result<EpistemicGpuWorldViewValidationTrace> {
4922        gpu_plan.validate_tuple_membership_bindings()?;
4923        let literal_count = gpu_plan.epistemic_literals.len();
4924        let reduction_count = gpu_plan.reductions.len();
4925        let trace = EpistemicGpuWorldViewValidationTrace::for_counts(
4926            literal_count,
4927            candidate_count,
4928            reduction_count,
4929            models_per_reduction,
4930        )?;
4931        if trace.model_membership_bytes_checked > workspace.layout.model_membership_bytes {
4932            return Err(XlogError::ResourceExhausted {
4933                context: "epistemic GPU world-view validation membership workspace".to_string(),
4934                estimated_bytes: trace.model_membership_bytes_checked as u64,
4935                budget_bytes: workspace.layout.model_membership_bytes as u64,
4936            });
4937        }
4938        if trace.world_view_slots_checked > workspace.layout.world_view_bytes {
4939            return Err(XlogError::ResourceExhausted {
4940                context: "epistemic GPU world-view validation world-view workspace".to_string(),
4941                estimated_bytes: trace.world_view_slots_checked as u64,
4942                budget_bytes: workspace.layout.world_view_bytes as u64,
4943            });
4944        }
4945        if trace.rejection_reason_slots_written > workspace.layout.rejection_reason_slots {
4946            return Err(XlogError::ResourceExhausted {
4947                context: "epistemic GPU world-view validation rejection workspace".to_string(),
4948                estimated_bytes: trace.rejection_reason_slots_written as u64,
4949                budget_bytes: workspace.layout.rejection_reason_slots as u64,
4950            });
4951        }
4952        if trace.model_membership_bytes_checked > u32::MAX as usize {
4953            return Err(XlogError::ResourceExhausted {
4954                context: "epistemic GPU world-view validation membership launch".to_string(),
4955                estimated_bytes: trace.model_membership_bytes_checked as u64,
4956                budget_bytes: u32::MAX as u64,
4957            });
4958        }
4959        if literal_count > u32::MAX as usize
4960            || candidate_count > u32::MAX as usize
4961            || reduction_count > u32::MAX as usize
4962            || models_per_reduction > u32::MAX as usize
4963        {
4964            return Err(XlogError::ResourceExhausted {
4965                context: "epistemic GPU world-view validation dimensions".to_string(),
4966                estimated_bytes: literal_count
4967                    .max(candidate_count)
4968                    .max(reduction_count)
4969                    .max(models_per_reduction) as u64,
4970                budget_bytes: u32::MAX as u64,
4971            });
4972        }
4973
4974        let mut literal_op_codes_host = vec![0u8; literal_count];
4975        let mut literal_negated_host = vec![0u8; literal_count];
4976        let mut literal_bound_to_output_host = vec![0u8; literal_count];
4977        let mut literal_reduction_indices_host = vec![0u32; literal_count];
4978        for binding in &gpu_plan.tuple_membership_bindings {
4979            literal_op_codes_host[binding.literal_index] = epistemic_operator_code(binding.op);
4980            literal_negated_host[binding.literal_index] = u8::from(binding.negated);
4981            literal_bound_to_output_host[binding.literal_index] =
4982                u8::from(binding.bound_output_columns.iter().any(Option::is_some));
4983            literal_reduction_indices_host[binding.literal_index] = binding.reduction_index as u32;
4984        }
4985        let memory = self.provider.memory();
4986        let mut literal_op_codes = memory.alloc::<u8>(literal_count)?;
4987        let mut literal_negated = memory.alloc::<u8>(literal_count)?;
4988        let mut literal_bound_to_output = memory.alloc::<u8>(literal_count)?;
4989        let mut literal_reduction_indices = memory.alloc::<u32>(literal_count)?;
4990        self.provider
4991            .htod_launch_metadata_sync_copy_into(&literal_op_codes_host, &mut literal_op_codes)
4992            .map_err(|e| {
4993                XlogError::execution_ctx(
4994                    "epistemic GPU world-view validation metadata",
4995                    "upload literal operator codes",
4996                    &e,
4997                )
4998            })?;
4999        self.provider
5000            .htod_launch_metadata_sync_copy_into(&literal_negated_host, &mut literal_negated)
5001            .map_err(|e| {
5002                XlogError::execution_ctx(
5003                    "epistemic GPU world-view validation metadata",
5004                    "upload literal negation flags",
5005                    &e,
5006                )
5007            })?;
5008        self.provider
5009            .htod_launch_metadata_sync_copy_into(
5010                &literal_bound_to_output_host,
5011                &mut literal_bound_to_output,
5012            )
5013            .map_err(|e| {
5014                XlogError::execution_ctx(
5015                    "epistemic GPU world-view validation metadata",
5016                    "upload literal output-binding flags",
5017                    &e,
5018                )
5019            })?;
5020        self.provider
5021            .htod_launch_metadata_sync_copy_into(
5022                &literal_reduction_indices_host,
5023                &mut literal_reduction_indices,
5024            )
5025            .map_err(|e| {
5026                XlogError::execution_ctx(
5027                    "epistemic GPU world-view validation metadata",
5028                    "upload literal reduction indices",
5029                    &e,
5030                )
5031            })?;
5032
5033        let world_stride =
5034            workspace.layout.world_view_bytes / workspace.layout.rejection_reason_slots;
5035        if world_stride == 0 || world_stride > u32::MAX as usize {
5036            return Err(XlogError::ResourceExhausted {
5037                context: "epistemic GPU world-view validation world stride".to_string(),
5038                estimated_bytes: world_stride as u64,
5039                budget_bytes: u32::MAX as u64,
5040            });
5041        }
5042
5043        let literal_count = literal_count as u32;
5044        let candidate_count = candidate_count as u32;
5045        let reduction_count = reduction_count as u32;
5046        let models_per_reduction = models_per_reduction as u32;
5047        let world_stride = world_stride as u32;
5048        let func = self
5049            .provider
5050            .device()
5051            .inner()
5052            .get_func(
5053                EPISTEMIC_MODULE,
5054                epistemic_kernels::EPISTEMIC_VALIDATE_WORLD_VIEWS_U8,
5055            )
5056            .ok_or_else(|| {
5057                XlogError::Execution("epistemic world-view validation kernel not found".to_string())
5058            })?;
5059        let config = LaunchConfig::for_num_elems(candidate_count);
5060
5061        let kernel_timing = self.time_epistemic_gpu_kernel_launch(
5062            "epistemic GPU world-view validation",
5063            || unsafe {
5064                // SAFETY: kernel arguments match the PTX signature; the capacity checks
5065                // above prove model-membership, world-view, and rejection buffers cover
5066                // all reads and writes for the candidate range.
5067                let mut params: Vec<*mut c_void> = vec![
5068                    literal_count.as_kernel_param(),
5069                    candidate_count.as_kernel_param(),
5070                    reduction_count.as_kernel_param(),
5071                    models_per_reduction.as_kernel_param(),
5072                    world_stride.as_kernel_param(),
5073                    (&literal_op_codes).as_kernel_param(),
5074                    (&literal_negated).as_kernel_param(),
5075                    (&literal_bound_to_output).as_kernel_param(),
5076                    (&literal_reduction_indices).as_kernel_param(),
5077                    (&workspace.candidate_assumptions).as_kernel_param(),
5078                    (&workspace.model_membership).as_kernel_param(),
5079                    (&workspace.world_views).as_kernel_param(),
5080                    (&workspace.rejection_reasons).as_kernel_param(),
5081                ];
5082                func.clone().launch(config, &mut params)
5083            },
5084        )?;
5085
5086        Ok(trace.with_kernel_timing(kernel_timing))
5087    }
5088
5089    /// Prune accepted candidate world views that satisfy an epistemic integrity
5090    /// constraint body.
5091    ///
5092    /// Runs after [`Self::validate_epistemic_gpu_world_views`]: each surviving
5093    /// candidate's assumption bit equals the negation-folded observed modal
5094    /// value of its literal, so a constraint body holds in this accepted world
5095    /// view exactly when every referenced literal's assumption bit is set. Such
5096    /// candidates are pruned on device with the world-view constraint-violation
5097    /// rejection code; no accepted world is read back to the host.
5098    pub fn validate_epistemic_gpu_world_view_constraints(
5099        &self,
5100        workspace: &mut EpistemicGpuWorkspace,
5101        gpu_plan: &EpistemicGpuPlan,
5102        candidate_count: usize,
5103    ) -> Result<EpistemicGpuConstraintWorldViewValidationTrace> {
5104        gpu_plan.validate_constraints()?;
5105        let literal_count = gpu_plan.epistemic_literals.len();
5106        let constraint_count = gpu_plan.constraints.len();
5107
5108        // Initialize the parallel constraint-violation index buffer to the
5109        // sentinel `u32::MAX` ("not rejected by a constraint") for every
5110        // candidate, BEFORE the zero-constraint early return below. Zero is a
5111        // valid constraint index, so the buffer cannot be left zeroed: any
5112        // candidate rejected by reason codes 1-5 (or accepted) must read back as
5113        // the sentinel, never a spurious `Some(0)`. The upload rides the
5114        // launch-metadata channel (like the CSR buffers below), so it adds no
5115        // tracked data-plane host-to-device transfer and keeps `host_write_ops` at zero.
5116        if candidate_count > workspace.layout.rejection_reason_slots {
5117            return Err(XlogError::ResourceExhausted {
5118                context: "epistemic GPU constraint-violation index workspace".to_string(),
5119                estimated_bytes: candidate_count as u64,
5120                budget_bytes: workspace.layout.rejection_reason_slots as u64,
5121            });
5122        }
5123        if candidate_count > 0 {
5124            let sentinel_host = vec![u32::MAX; candidate_count];
5125            let fill_len = candidate_count;
5126            let mut sentinel_view = workspace.constraint_violation_index.slice_mut(0..fill_len);
5127            self.provider
5128                .htod_launch_metadata_sync_copy_into(&sentinel_host, &mut sentinel_view)
5129                .map_err(|e| {
5130                    XlogError::execution_ctx(
5131                        "epistemic GPU world-view constraint metadata",
5132                        "initialize constraint-violation index sentinel",
5133                        &e,
5134                    )
5135                })?;
5136        }
5137
5138        // Flatten constraint -> literal index references into CSR-style buffers.
5139        let mut offsets_host = Vec::with_capacity(constraint_count);
5140        let mut counts_host = Vec::with_capacity(constraint_count);
5141        let mut indices_host: Vec<u32> = Vec::new();
5142        for constraint in &gpu_plan.constraints {
5143            offsets_host.push(indices_host.len() as u32);
5144            counts_host.push(constraint.literal_indices.len() as u32);
5145            for &literal_index in &constraint.literal_indices {
5146                indices_host.push(literal_index as u32);
5147            }
5148        }
5149        let constraint_literal_refs = indices_host.len();
5150
5151        let trace = EpistemicGpuConstraintWorldViewValidationTrace {
5152            constraint_count,
5153            constraint_literal_refs,
5154            candidates_checked: candidate_count,
5155            rejection_reason_slots_written: candidate_count,
5156            kernel_launches: 0,
5157            host_write_ops: 0,
5158            kernel_timing: EpistemicGpuKernelTimingTrace::unrecorded(),
5159        };
5160
5161        if constraint_count == 0 {
5162            // No world-view constraints to evaluate; leave the rejection buffer
5163            // untouched so accepted candidates flow through unchanged.
5164            return Ok(trace);
5165        }
5166
5167        if candidate_count > workspace.layout.rejection_reason_slots {
5168            return Err(XlogError::ResourceExhausted {
5169                context: "epistemic GPU world-view constraint rejection workspace".to_string(),
5170                estimated_bytes: candidate_count as u64,
5171                budget_bytes: workspace.layout.rejection_reason_slots as u64,
5172            });
5173        }
5174        if candidate_count > u32::MAX as usize
5175            || literal_count > u32::MAX as usize
5176            || constraint_count > u32::MAX as usize
5177            || constraint_literal_refs > u32::MAX as usize
5178        {
5179            return Err(XlogError::ResourceExhausted {
5180                context: "epistemic GPU world-view constraint dimensions".to_string(),
5181                estimated_bytes: candidate_count
5182                    .max(literal_count)
5183                    .max(constraint_count)
5184                    .max(constraint_literal_refs) as u64,
5185                budget_bytes: u32::MAX as u64,
5186            });
5187        }
5188
5189        let memory = self.provider.memory();
5190        let mut constraint_literal_offsets = memory.alloc::<u32>(constraint_count)?;
5191        let mut constraint_literal_counts = memory.alloc::<u32>(constraint_count)?;
5192        let mut constraint_literal_indices = memory.alloc::<u32>(constraint_literal_refs.max(1))?;
5193        self.provider
5194            .htod_launch_metadata_sync_copy_into(&offsets_host, &mut constraint_literal_offsets)
5195            .map_err(|e| {
5196                XlogError::execution_ctx(
5197                    "epistemic GPU world-view constraint metadata",
5198                    "upload constraint literal offsets",
5199                    &e,
5200                )
5201            })?;
5202        self.provider
5203            .htod_launch_metadata_sync_copy_into(&counts_host, &mut constraint_literal_counts)
5204            .map_err(|e| {
5205                XlogError::execution_ctx(
5206                    "epistemic GPU world-view constraint metadata",
5207                    "upload constraint literal counts",
5208                    &e,
5209                )
5210            })?;
5211        if !indices_host.is_empty() {
5212            self.provider
5213                .htod_launch_metadata_sync_copy_into(&indices_host, &mut constraint_literal_indices)
5214                .map_err(|e| {
5215                    XlogError::execution_ctx(
5216                        "epistemic GPU world-view constraint metadata",
5217                        "upload constraint literal indices",
5218                        &e,
5219                    )
5220                })?;
5221        }
5222
5223        let literal_count_u32 = literal_count as u32;
5224        let candidate_count_u32 = candidate_count as u32;
5225        let constraint_count_u32 = constraint_count as u32;
5226        let func = self
5227            .provider
5228            .device()
5229            .inner()
5230            .get_func(
5231                EPISTEMIC_MODULE,
5232                epistemic_kernels::EPISTEMIC_VALIDATE_CONSTRAINTS_U8,
5233            )
5234            .ok_or_else(|| {
5235                XlogError::Execution(
5236                    "epistemic world-view constraint validation kernel not found".to_string(),
5237                )
5238            })?;
5239        let config = LaunchConfig::for_num_elems(candidate_count_u32);
5240
5241        let kernel_timing = self.time_epistemic_gpu_kernel_launch(
5242            "epistemic GPU world-view constraint validation",
5243            || unsafe {
5244                // SAFETY: kernel arguments match the PTX signature; the capacity
5245                // check above proves the rejection buffer covers every candidate,
5246                // and CSR offset/count/index buffers are sized to the constraint
5247                // literal references uploaded above.
5248                let mut params: Vec<*mut c_void> = vec![
5249                    literal_count_u32.as_kernel_param(),
5250                    candidate_count_u32.as_kernel_param(),
5251                    constraint_count_u32.as_kernel_param(),
5252                    (&constraint_literal_offsets).as_kernel_param(),
5253                    (&constraint_literal_counts).as_kernel_param(),
5254                    (&constraint_literal_indices).as_kernel_param(),
5255                    (&workspace.candidate_assumptions).as_kernel_param(),
5256                    (&mut workspace.rejection_reasons).as_kernel_param(),
5257                    (&mut workspace.constraint_violation_index).as_kernel_param(),
5258                ];
5259                func.clone().launch(config, &mut params)
5260            },
5261        )?;
5262
5263        Ok(EpistemicGpuConstraintWorldViewValidationTrace {
5264            kernel_launches: 1,
5265            kernel_timing,
5266            ..trace
5267        })
5268    }
5269
5270    /// Materialize accepted candidate flags into the GPU world-view buffer.
5271    pub fn materialize_epistemic_gpu_candidates(
5272        &self,
5273        workspace: &mut EpistemicGpuWorkspace,
5274        candidate_count: usize,
5275    ) -> Result<EpistemicGpuMaterializationTrace> {
5276        let trace = EpistemicGpuMaterializationTrace::for_count(candidate_count)?;
5277        if trace.world_view_slots_written > workspace.layout.world_view_bytes {
5278            return Err(XlogError::ResourceExhausted {
5279                context: "epistemic GPU materialization world-view workspace".to_string(),
5280                estimated_bytes: trace.world_view_slots_written as u64,
5281                budget_bytes: workspace.layout.world_view_bytes as u64,
5282            });
5283        }
5284        if candidate_count > workspace.layout.rejection_reason_slots {
5285            return Err(XlogError::ResourceExhausted {
5286                context: "epistemic GPU materialization rejection workspace".to_string(),
5287                estimated_bytes: candidate_count as u64,
5288                budget_bytes: workspace.layout.rejection_reason_slots as u64,
5289            });
5290        }
5291        if candidate_count > u32::MAX as usize {
5292            return Err(XlogError::ResourceExhausted {
5293                context: "epistemic GPU materialization launch".to_string(),
5294                estimated_bytes: candidate_count as u64,
5295                budget_bytes: u32::MAX as u64,
5296            });
5297        }
5298
5299        let world_stride =
5300            workspace.layout.world_view_bytes / workspace.layout.rejection_reason_slots;
5301        if world_stride == 0 || world_stride > u32::MAX as usize {
5302            return Err(XlogError::ResourceExhausted {
5303                context: "epistemic GPU materialization world stride".to_string(),
5304                estimated_bytes: world_stride as u64,
5305                budget_bytes: u32::MAX as u64,
5306            });
5307        }
5308
5309        let candidate_count = candidate_count as u32;
5310        let world_stride = world_stride as u32;
5311        let func = self
5312            .provider
5313            .device()
5314            .inner()
5315            .get_func(
5316                EPISTEMIC_MODULE,
5317                epistemic_kernels::EPISTEMIC_MATERIALIZE_ACCEPTED_CANDIDATES_U8,
5318            )
5319            .ok_or_else(|| {
5320                XlogError::Execution(
5321                    "epistemic candidate materialization kernel not found".to_string(),
5322                )
5323            })?;
5324        let config = LaunchConfig::for_num_elems(candidate_count);
5325
5326        let kernel_timing = self.time_epistemic_gpu_kernel_launch(
5327            "epistemic GPU candidate materialization",
5328            || unsafe {
5329                // SAFETY: kernel arguments match the PTX signature; the capacity checks
5330                // above prove world-view and rejection buffers cover all accesses.
5331                func.clone().launch(
5332                    config,
5333                    (
5334                        candidate_count,
5335                        world_stride,
5336                        &workspace.rejection_reasons,
5337                        &mut workspace.world_views,
5338                    ),
5339                )
5340            },
5341        )?;
5342
5343        Ok(trace.with_kernel_timing(kernel_timing))
5344    }
5345
5346    /// Materialize final result flags from the reduced runtime output row count.
5347    pub fn materialize_epistemic_gpu_final_results(
5348        &self,
5349        workspace: &mut EpistemicGpuWorkspace,
5350        output: &CudaBuffer,
5351        candidate_count: usize,
5352    ) -> Result<EpistemicGpuFinalResultMaterializationTrace> {
5353        let trace = EpistemicGpuFinalResultMaterializationTrace::for_count(candidate_count)?;
5354        if trace.world_view_slots_written > workspace.layout.world_view_bytes {
5355            return Err(XlogError::ResourceExhausted {
5356                context: "epistemic GPU final-result world-view workspace".to_string(),
5357                estimated_bytes: trace.world_view_slots_written as u64,
5358                budget_bytes: workspace.layout.world_view_bytes as u64,
5359            });
5360        }
5361        if candidate_count > workspace.layout.rejection_reason_slots {
5362            return Err(XlogError::ResourceExhausted {
5363                context: "epistemic GPU final-result rejection workspace".to_string(),
5364                estimated_bytes: candidate_count as u64,
5365                budget_bytes: workspace.layout.rejection_reason_slots as u64,
5366            });
5367        }
5368        if candidate_count > u32::MAX as usize {
5369            return Err(XlogError::ResourceExhausted {
5370                context: "epistemic GPU final-result launch".to_string(),
5371                estimated_bytes: candidate_count as u64,
5372                budget_bytes: u32::MAX as u64,
5373            });
5374        }
5375
5376        let world_stride =
5377            workspace.layout.world_view_bytes / workspace.layout.rejection_reason_slots;
5378        if world_stride == 0 || world_stride > u32::MAX as usize {
5379            return Err(XlogError::ResourceExhausted {
5380                context: "epistemic GPU final-result world stride".to_string(),
5381                estimated_bytes: world_stride as u64,
5382                budget_bytes: u32::MAX as u64,
5383            });
5384        }
5385
5386        let candidate_count = candidate_count as u32;
5387        let world_stride = world_stride as u32;
5388        let func = self
5389            .provider
5390            .device()
5391            .inner()
5392            .get_func(
5393                EPISTEMIC_MODULE,
5394                epistemic_kernels::EPISTEMIC_MATERIALIZE_FINAL_RESULT_FLAGS_U8,
5395            )
5396            .ok_or_else(|| {
5397                XlogError::Execution(
5398                    "epistemic final-result materialization kernel not found".to_string(),
5399                )
5400            })?;
5401        let config = LaunchConfig::for_num_elems(candidate_count);
5402
5403        let kernel_timing = self.time_epistemic_gpu_kernel_launch(
5404            "epistemic GPU final result materialization",
5405            || unsafe {
5406                // SAFETY: kernel arguments match the PTX signature; the capacity checks
5407                // above prove world-view and rejection buffers cover all accesses, and
5408                // output.num_rows_device() is the runtime-owned device scalar for output
5409                // row count metadata.
5410                func.clone().launch(
5411                    config,
5412                    (
5413                        candidate_count,
5414                        world_stride,
5415                        output.num_rows_device(),
5416                        &workspace.rejection_reasons,
5417                        &mut workspace.world_views,
5418                    ),
5419                )
5420            },
5421        )?;
5422
5423        Ok(trace.with_kernel_timing(kernel_timing))
5424    }
5425
5426    /// Materialize final query tuples into a device-resident output buffer.
5427    #[allow(clippy::too_many_arguments)]
5428    pub fn materialize_epistemic_gpu_final_tuples(
5429        &self,
5430        workspace: &mut EpistemicGpuWorkspace,
5431        output: &CudaBuffer,
5432        gpu_plan: &EpistemicGpuPlan,
5433        literal_count: usize,
5434        candidate_count: usize,
5435        reduction_count: usize,
5436        models_per_reduction: usize,
5437    ) -> Result<(CudaBuffer, EpistemicGpuFinalTupleMaterializationTrace)> {
5438        self.materialize_epistemic_gpu_final_tuples_scoped(
5439            workspace,
5440            output,
5441            gpu_plan,
5442            literal_count,
5443            candidate_count,
5444            reduction_count,
5445            models_per_reduction,
5446            None,
5447        )
5448    }
5449
5450    /// Materialize final query tuples, optionally scoping the modal row-filter to a
5451    /// single coalesced head's reductions.
5452    ///
5453    /// `head_reduction_filter` is the JOINT-SOLVING multi-output seam: the joint
5454    /// candidate enumeration + world-view validation runs ONCE over the combined
5455    /// modal literals (so the accepted world view in `workspace` is shared by every
5456    /// head), then this method is called once per distinct head with that head's
5457    /// reduction indices. Only the row-filter bindings whose `reduction_index` is
5458    /// in the filter drive that head's output filtering; the full joint plan
5459    /// (`gpu_plan`) is still validated and the joint workspace dimensions are
5460    /// preserved, so each head is materialized against the SAME accepted world
5461    /// view. `None` materializes against every binding (the single-head path).
5462    #[allow(clippy::too_many_arguments)]
5463    fn materialize_epistemic_gpu_final_tuples_scoped(
5464        &self,
5465        workspace: &mut EpistemicGpuWorkspace,
5466        output: &CudaBuffer,
5467        gpu_plan: &EpistemicGpuPlan,
5468        literal_count: usize,
5469        candidate_count: usize,
5470        reduction_count: usize,
5471        models_per_reduction: usize,
5472        head_reduction_filter: Option<&BTreeSet<usize>>,
5473    ) -> Result<(CudaBuffer, EpistemicGpuFinalTupleMaterializationTrace)> {
5474        gpu_plan.validate_tuple_membership_bindings()?;
5475        if candidate_count > workspace.layout.rejection_reason_slots {
5476            return Err(XlogError::ResourceExhausted {
5477                context: "epistemic GPU final-tuple rejection workspace".to_string(),
5478                estimated_bytes: candidate_count as u64,
5479                budget_bytes: workspace.layout.rejection_reason_slots as u64,
5480            });
5481        }
5482        let literal_count_u32 =
5483            checked_u32_dimension(literal_count, "epistemic GPU final-tuple literals")?;
5484        let candidate_count_u32 =
5485            checked_u32_dimension(candidate_count, "epistemic GPU final-tuple candidates")?;
5486        let reduction_count_u32 =
5487            checked_u32_dimension(reduction_count, "epistemic GPU final-tuple reductions")?;
5488        let models_per_reduction_u32 = checked_u32_dimension(
5489            models_per_reduction,
5490            "epistemic GPU final-tuple models per reduction",
5491        )?;
5492        let output_row_capacity =
5493            usize::try_from(output.num_rows()).map_err(|_| XlogError::ResourceExhausted {
5494                context: "epistemic GPU final-tuple output rows".to_string(),
5495                estimated_bytes: output.num_rows(),
5496                budget_bytes: usize::MAX as u64,
5497            })?;
5498        let output_row_capacity_u32 =
5499            checked_u32_dimension(output_row_capacity, "epistemic GPU final-tuple output rows")?;
5500        let final_output_columns =
5501            final_output_columns_for_materialization(output, gpu_plan, head_reduction_filter)?;
5502        let mut tuple_bytes_capacity = 0usize;
5503        let mut source_columns: Vec<(&CudaColumn, u32, u32)> =
5504            Vec::with_capacity(final_output_columns.len());
5505        let mut result_columns_raw: Vec<TrackedCudaSlice<u8>> =
5506            Vec::with_capacity(final_output_columns.len());
5507        let mut final_schema_columns = Vec::with_capacity(final_output_columns.len());
5508        let mut final_schema_sort_labels = Vec::with_capacity(final_output_columns.len());
5509        for &col_idx in &final_output_columns {
5510            let src_col = output.column(col_idx).ok_or_else(|| {
5511                XlogError::Execution(format!("epistemic final tuple missing column {col_idx}"))
5512            })?;
5513            let (column_name, column_type) = output
5514                .schema()
5515                .columns
5516                .get(col_idx)
5517                .ok_or_else(|| {
5518                    XlogError::Execution(format!(
5519                        "epistemic final tuple missing schema column {col_idx}"
5520                    ))
5521                })?
5522                .clone();
5523            let column_width = column_type.size_bytes();
5524            let expected_column_bytes = checked_product(output_row_capacity, column_width)?;
5525            if src_col.len() < expected_column_bytes {
5526                return Err(XlogError::ResourceExhausted {
5527                    context: "epistemic GPU final-tuple column capacity".to_string(),
5528                    estimated_bytes: expected_column_bytes as u64,
5529                    budget_bytes: src_col.len() as u64,
5530                });
5531            }
5532            let column_byte_len =
5533                checked_u32_dimension(src_col.len(), "epistemic GPU final-tuple column")?;
5534            let column_width =
5535                checked_u32_dimension(column_width, "epistemic GPU final-tuple column width")?;
5536            tuple_bytes_capacity = checked_sum(tuple_bytes_capacity, src_col.len())?;
5537            source_columns.push((src_col, column_byte_len, column_width));
5538            result_columns_raw.push(self.provider.memory().alloc::<u8>(src_col.len())?);
5539            final_schema_columns.push((column_name, column_type));
5540            final_schema_sort_labels.push(
5541                output
5542                    .schema()
5543                    .column_sort_label(col_idx)
5544                    .unwrap_or("")
5545                    .to_string(),
5546            );
5547        }
5548
5549        let mut final_row_count = self.provider.memory().alloc::<u32>(1)?;
5550        let mut row_map = self
5551            .provider
5552            .memory()
5553            .alloc::<u32>(output_row_capacity.max(1))?;
5554        let row_filter_bindings: Vec<_> = gpu_plan
5555            .tuple_membership_bindings
5556            .iter()
5557            .filter(|binding| binding.bound_output_columns.iter().any(Option::is_some))
5558            .filter(|binding| {
5559                head_reduction_filter
5560                    .map(|reductions| reductions.contains(&binding.reduction_index))
5561                    .unwrap_or(true)
5562            })
5563            .collect();
5564        if row_filter_bindings.len() > u32::MAX as usize {
5565            return Err(XlogError::ResourceExhausted {
5566                context: "epistemic GPU final tuple row-filter count".to_string(),
5567                estimated_bytes: row_filter_bindings.len() as u64,
5568                budget_bytes: u32::MAX as u64,
5569            });
5570        }
5571        let negated_row_filter_count = row_filter_bindings
5572            .iter()
5573            .filter(|binding| binding.negated)
5574            .count();
5575        let trace = EpistemicGpuFinalTupleMaterializationTrace::for_counts(
5576            final_output_columns.len(),
5577            output_row_capacity,
5578            tuple_bytes_capacity,
5579            literal_count,
5580            candidate_count,
5581            reduction_count,
5582            models_per_reduction,
5583        )?
5584        .with_row_filter_counts(row_filter_bindings.len(), negated_row_filter_count)?;
5585        if trace.model_membership_bytes_checked > workspace.layout.model_membership_bytes {
5586            return Err(XlogError::ResourceExhausted {
5587                context: "epistemic GPU final-tuple membership workspace".to_string(),
5588                estimated_bytes: trace.model_membership_bytes_checked as u64,
5589                budget_bytes: workspace.layout.model_membership_bytes as u64,
5590            });
5591        }
5592        if trace.world_view_slots_checked > workspace.layout.world_view_bytes {
5593            return Err(XlogError::ResourceExhausted {
5594                context: "epistemic GPU final-tuple world-view workspace".to_string(),
5595                estimated_bytes: trace.world_view_slots_checked as u64,
5596                budget_bytes: workspace.layout.world_view_bytes as u64,
5597            });
5598        }
5599        if trace.model_membership_bytes_checked > u32::MAX as usize {
5600            return Err(XlogError::ResourceExhausted {
5601                context: "epistemic GPU final-tuple membership launch".to_string(),
5602                estimated_bytes: trace.model_membership_bytes_checked as u64,
5603                budget_bytes: u32::MAX as u64,
5604            });
5605        }
5606
5607        let world_stride =
5608            workspace.layout.world_view_bytes / workspace.layout.rejection_reason_slots;
5609        if world_stride == 0 || world_stride > u32::MAX as usize {
5610            return Err(XlogError::ResourceExhausted {
5611                context: "epistemic GPU final-tuple world stride".to_string(),
5612                estimated_bytes: world_stride as u64,
5613                budget_bytes: u32::MAX as u64,
5614            });
5615        }
5616        let world_stride =
5617            checked_u32_dimension(world_stride, "epistemic GPU final-tuple world stride")?;
5618        let mut metadata_len = 0usize;
5619        for binding in &row_filter_bindings {
5620            metadata_len = checked_sum(metadata_len, binding.key_columns.len())?;
5621        }
5622        let metadata_len = metadata_len.max(1);
5623        let row_filter_metadata_len = row_filter_bindings.len().max(1);
5624        checked_u32_dimension(
5625            metadata_len,
5626            "epistemic GPU final tuple row-filter key metadata",
5627        )?;
5628        checked_u32_dimension(
5629            row_filter_metadata_len,
5630            "epistemic GPU final tuple row-filter metadata",
5631        )?;
5632        let memory = self.provider.memory();
5633        let device = self.provider.device().inner();
5634        let mut tuple_source_row_count_ptrs = memory.alloc::<u64>(row_filter_metadata_len)?;
5635        let mut row_filter_negated = memory.alloc::<u8>(row_filter_metadata_len)?;
5636        let mut row_filter_key_offsets = memory.alloc::<u32>(row_filter_metadata_len)?;
5637        let mut row_filter_key_counts = memory.alloc::<u32>(row_filter_metadata_len)?;
5638        let mut key_col_ptrs = memory.alloc::<u64>(metadata_len)?;
5639        let mut key_col_widths = memory.alloc::<u32>(metadata_len)?;
5640        let mut expected_key_bits = memory.alloc::<u64>(metadata_len)?;
5641        let mut expected_key_type_codes = memory.alloc::<u8>(metadata_len)?;
5642        let mut tuple_key_match_modes = memory.alloc::<u8>(metadata_len)?;
5643        let mut bound_value_col_ptrs = memory.alloc::<u64>(metadata_len)?;
5644        let mut bound_value_col_widths = memory.alloc::<u32>(metadata_len)?;
5645        let row_filter_count = checked_u32_dimension(
5646            row_filter_bindings.len(),
5647            "epistemic GPU final tuple row-filter count",
5648        )?;
5649        let mut tuple_source_row_counts = Vec::with_capacity(row_filter_bindings.len());
5650
5651        if !row_filter_bindings.is_empty() {
5652            let mut tuple_source_row_count_ptrs_host =
5653                Vec::with_capacity(row_filter_bindings.len());
5654            let mut row_filter_negated_host = Vec::with_capacity(row_filter_bindings.len());
5655            let mut row_filter_key_offsets_host = Vec::with_capacity(row_filter_bindings.len());
5656            let mut row_filter_key_counts_host = Vec::with_capacity(row_filter_bindings.len());
5657            let mut key_col_ptrs_host = Vec::with_capacity(metadata_len);
5658            let mut key_col_widths_host = Vec::with_capacity(metadata_len);
5659            let mut expected_key_bits_host = Vec::with_capacity(metadata_len);
5660            let mut expected_key_type_codes_host = Vec::with_capacity(metadata_len);
5661            let mut tuple_key_match_modes_host = Vec::with_capacity(metadata_len);
5662            let mut bound_value_col_ptrs_host = Vec::with_capacity(metadata_len);
5663            let mut bound_value_col_widths_host = Vec::with_capacity(metadata_len);
5664
5665            for binding in &row_filter_bindings {
5666                let row_filter_key_offset = checked_u32_dimension(
5667                    key_col_ptrs_host.len(),
5668                    "epistemic GPU final tuple row-filter key offset",
5669                )?;
5670                let row_filter_key_count = checked_u32_dimension(
5671                    binding.key_columns.len(),
5672                    "epistemic GPU final tuple row-filter key arity",
5673                )?;
5674                row_filter_key_offsets_host.push(row_filter_key_offset);
5675                row_filter_key_counts_host.push(row_filter_key_count);
5676                row_filter_negated_host.push(binding.negated as u8);
5677
5678                let source_relation = self
5679                    .resolve_modal_tuple_source(binding.predicate.as_str(), binding.arity)
5680                    .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
5681                        construct: "epistemic GPU final tuple row filtering".to_string(),
5682                        context: format!(
5683                            "missing tuple source relation {} (arity {}) for final row filter",
5684                            binding.predicate, binding.arity
5685                        ),
5686                    })?;
5687                let tuple_source_row_count = self.clone_device_row_count(source_relation)?;
5688                tuple_source_row_count_ptrs_host.push(*tuple_source_row_count.device_ptr());
5689                tuple_source_row_counts.push(tuple_source_row_count);
5690
5691                for (term_index, &key_col) in binding.key_columns.iter().enumerate() {
5692                    let key_col_ref = source_relation.column(key_col).ok_or_else(|| {
5693                        XlogError::UnsupportedEpistemicConstruct {
5694                            construct: "epistemic GPU final tuple row filtering".to_string(),
5695                            context: format!(
5696                                "tuple source relation {} missing key column {}",
5697                                binding.predicate, key_col
5698                            ),
5699                        }
5700                    })?;
5701                    let key_col_type =
5702                        source_relation
5703                            .schema()
5704                            .column_type(key_col)
5705                            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
5706                                construct: "epistemic GPU final tuple row filtering".to_string(),
5707                                context: format!(
5708                                    "tuple source relation {} missing schema for key column {}",
5709                                    binding.predicate, key_col
5710                                ),
5711                            })?;
5712                    let key_col_width = key_col_type.size_bytes();
5713                    if key_col_width > u32::MAX as usize {
5714                        return Err(XlogError::ResourceExhausted {
5715                            context: "epistemic GPU final tuple row-filter key width".to_string(),
5716                            estimated_bytes: key_col_width as u64,
5717                            budget_bytes: u32::MAX as u64,
5718                        });
5719                    }
5720
5721                    key_col_ptrs_host.push(*key_col_ref.device_ptr());
5722                    key_col_widths_host.push(key_col_width as u32);
5723                    match &binding.key_terms[term_index] {
5724                        EirTerm::Variable(variable_name) => {
5725                            let bound_col_index = binding.bound_output_columns[term_index]
5726                                .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
5727                                    construct: "epistemic GPU final tuple row filtering"
5728                                        .to_string(),
5729                                    context: format!(
5730                                        "tuple key variable {variable_name} has no reduced \
5731                                             output column binding"
5732                                    ),
5733                                })?;
5734                            let bound_col = output.column(bound_col_index).ok_or_else(|| {
5735                                XlogError::UnsupportedEpistemicConstruct {
5736                                    construct: "epistemic GPU final tuple row filtering"
5737                                        .to_string(),
5738                                    context: format!(
5739                                        "reduced output missing device column {bound_col_index} \
5740                                         for variable {variable_name}"
5741                                    ),
5742                                }
5743                            })?;
5744                            let bound_col_type = output
5745                                .schema()
5746                                .column_type(bound_col_index)
5747                                .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
5748                                    construct: "epistemic GPU final tuple row filtering"
5749                                        .to_string(),
5750                                    context: format!(
5751                                        "reduced output missing schema for variable \
5752                                             {variable_name}"
5753                                    ),
5754                                })?;
5755                            if bound_col_type != key_col_type {
5756                                return Err(XlogError::UnsupportedEpistemicConstruct {
5757                                    construct: "epistemic GPU final tuple row filtering"
5758                                        .to_string(),
5759                                    context: format!(
5760                                        "bound variable {variable_name} has output type \
5761                                         {bound_col_type:?}, but tuple source {} key column {} \
5762                                         has type {key_col_type:?}",
5763                                        binding.predicate, key_col
5764                                    ),
5765                                });
5766                            }
5767                            let bound_col_width = bound_col_type.size_bytes();
5768                            if bound_col_width > u32::MAX as usize {
5769                                return Err(XlogError::ResourceExhausted {
5770                                    context: "epistemic GPU final tuple row-filter bound width"
5771                                        .to_string(),
5772                                    estimated_bytes: bound_col_width as u64,
5773                                    budget_bytes: u32::MAX as u64,
5774                                });
5775                            }
5776                            expected_key_bits_host.push(0);
5777                            expected_key_type_codes_host.push(key_col_type.to_code());
5778                            tuple_key_match_modes_host.push(TUPLE_KEY_MATCH_MODE_BOUND_OUTPUT);
5779                            bound_value_col_ptrs_host.push(*bound_col.device_ptr());
5780                            bound_value_col_widths_host.push(bound_col_width as u32);
5781                        }
5782                        EirTerm::Anonymous => {
5783                            // Wildcard: this tuple-key column imposes no
5784                            // equality requirement when filtering output rows.
5785                            expected_key_bits_host.push(0);
5786                            expected_key_type_codes_host.push(key_col_type.to_code());
5787                            tuple_key_match_modes_host.push(TUPLE_KEY_MATCH_MODE_WILDCARD);
5788                            bound_value_col_ptrs_host.push(0);
5789                            bound_value_col_widths_host.push(0);
5790                        }
5791                        term => {
5792                            let expectation = TupleKeyExpectation::from_term(term, key_col_type)?;
5793                            expected_key_bits_host.push(expectation.bits);
5794                            expected_key_type_codes_host.push(expectation.type_code);
5795                            tuple_key_match_modes_host.push(TUPLE_KEY_MATCH_MODE_GROUND);
5796                            bound_value_col_ptrs_host.push(0);
5797                            bound_value_col_widths_host.push(0);
5798                        }
5799                    }
5800                }
5801            }
5802
5803            let metadata_context = "epistemic GPU final tuple row-filter metadata";
5804            self.provider
5805                .htod_launch_metadata_sync_copy_into(
5806                    &tuple_source_row_count_ptrs_host,
5807                    &mut tuple_source_row_count_ptrs,
5808                )
5809                .map_err(|e| {
5810                    XlogError::execution_ctx(
5811                        metadata_context,
5812                        "upload tuple source row-count pointers",
5813                        &e,
5814                    )
5815                })?;
5816            self.provider
5817                .htod_launch_metadata_sync_copy_into(
5818                    &row_filter_negated_host,
5819                    &mut row_filter_negated,
5820                )
5821                .map_err(|e| {
5822                    XlogError::execution_ctx(metadata_context, "upload row-filter polarity", &e)
5823                })?;
5824            self.provider
5825                .htod_launch_metadata_sync_copy_into(
5826                    &row_filter_key_offsets_host,
5827                    &mut row_filter_key_offsets,
5828                )
5829                .map_err(|e| {
5830                    XlogError::execution_ctx(metadata_context, "upload row-filter key offsets", &e)
5831                })?;
5832            self.provider
5833                .htod_launch_metadata_sync_copy_into(
5834                    &row_filter_key_counts_host,
5835                    &mut row_filter_key_counts,
5836                )
5837                .map_err(|e| {
5838                    XlogError::execution_ctx(metadata_context, "upload row-filter key counts", &e)
5839                })?;
5840            self.provider
5841                .htod_launch_metadata_sync_copy_into(&key_col_ptrs_host, &mut key_col_ptrs)
5842                .map_err(|e| {
5843                    XlogError::execution_ctx(metadata_context, "upload key column pointers", &e)
5844                })?;
5845            self.provider
5846                .htod_launch_metadata_sync_copy_into(&key_col_widths_host, &mut key_col_widths)
5847                .map_err(|e| {
5848                    XlogError::execution_ctx(metadata_context, "upload key column widths", &e)
5849                })?;
5850            self.provider
5851                .htod_launch_metadata_sync_copy_into(
5852                    &expected_key_bits_host,
5853                    &mut expected_key_bits,
5854                )
5855                .map_err(|e| {
5856                    XlogError::execution_ctx(metadata_context, "upload expected key bits", &e)
5857                })?;
5858            self.provider
5859                .htod_launch_metadata_sync_copy_into(
5860                    &expected_key_type_codes_host,
5861                    &mut expected_key_type_codes,
5862                )
5863                .map_err(|e| {
5864                    XlogError::execution_ctx(metadata_context, "upload expected key type codes", &e)
5865                })?;
5866            self.provider
5867                .htod_launch_metadata_sync_copy_into(
5868                    &tuple_key_match_modes_host,
5869                    &mut tuple_key_match_modes,
5870                )
5871                .map_err(|e| {
5872                    XlogError::execution_ctx(metadata_context, "upload tuple key match modes", &e)
5873                })?;
5874            self.provider
5875                .htod_launch_metadata_sync_copy_into(
5876                    &bound_value_col_ptrs_host,
5877                    &mut bound_value_col_ptrs,
5878                )
5879                .map_err(|e| {
5880                    XlogError::execution_ctx(
5881                        metadata_context,
5882                        "upload bound value column pointers",
5883                        &e,
5884                    )
5885                })?;
5886            self.provider
5887                .htod_launch_metadata_sync_copy_into(
5888                    &bound_value_col_widths_host,
5889                    &mut bound_value_col_widths,
5890                )
5891                .map_err(|e| {
5892                    XlogError::execution_ctx(
5893                        metadata_context,
5894                        "upload bound value column widths",
5895                        &e,
5896                    )
5897                })?;
5898        } else {
5899            let metadata_context = "epistemic GPU final tuple row-filter metadata";
5900            device
5901                .memset_zeros(&mut tuple_source_row_count_ptrs)
5902                .map_err(|e| {
5903                    XlogError::execution_ctx(
5904                        metadata_context,
5905                        "tuple source row-count pointer memset",
5906                        &e,
5907                    )
5908                })?;
5909            device.memset_zeros(&mut row_filter_negated).map_err(|e| {
5910                XlogError::execution_ctx(metadata_context, "row-filter polarity memset", &e)
5911            })?;
5912            device
5913                .memset_zeros(&mut row_filter_key_offsets)
5914                .map_err(|e| {
5915                    XlogError::execution_ctx(metadata_context, "row-filter key offset memset", &e)
5916                })?;
5917            device
5918                .memset_zeros(&mut row_filter_key_counts)
5919                .map_err(|e| {
5920                    XlogError::execution_ctx(metadata_context, "row-filter key count memset", &e)
5921                })?;
5922            device.memset_zeros(&mut key_col_ptrs).map_err(|e| {
5923                XlogError::execution_ctx(metadata_context, "key column pointer memset", &e)
5924            })?;
5925            device.memset_zeros(&mut key_col_widths).map_err(|e| {
5926                XlogError::execution_ctx(metadata_context, "key column width memset", &e)
5927            })?;
5928            device.memset_zeros(&mut expected_key_bits).map_err(|e| {
5929                XlogError::execution_ctx(metadata_context, "expected key bits memset", &e)
5930            })?;
5931            device
5932                .memset_zeros(&mut expected_key_type_codes)
5933                .map_err(|e| {
5934                    XlogError::execution_ctx(metadata_context, "expected key type code memset", &e)
5935                })?;
5936            device
5937                .memset_zeros(&mut tuple_key_match_modes)
5938                .map_err(|e| {
5939                    XlogError::execution_ctx(metadata_context, "tuple key match mode memset", &e)
5940                })?;
5941            device
5942                .memset_zeros(&mut bound_value_col_ptrs)
5943                .map_err(|e| {
5944                    XlogError::execution_ctx(
5945                        metadata_context,
5946                        "bound value column pointer memset",
5947                        &e,
5948                    )
5949                })?;
5950            device
5951                .memset_zeros(&mut bound_value_col_widths)
5952                .map_err(|e| {
5953                    XlogError::execution_ctx(
5954                        metadata_context,
5955                        "bound value column width memset",
5956                        &e,
5957                    )
5958                })?;
5959        }
5960
5961        // Global-gate literal mask: a literal that does not bind any reduced
5962        // output column (pure-ground, pure-anonymous, or arity-0) is checked by
5963        // the global membership gate rather than a per-row filter. For those
5964        // literals the body literal must actually hold in the accepted
5965        // candidate's world view; per-row (bound-variable) literals are already
5966        // enforced by the row-filter loop above. The accepted candidate's
5967        // assumption bit already folds in `know`/`possible` modality and
5968        // negation (the validation kernel guarantees assumption == observed for
5969        // accepted candidates), so the gate requires the assumption bit to be
5970        // set for every global-gate literal.
5971        // Constraint literals participate in modal world-view evaluation (model
5972        // membership + assumption-bit pinning) but must NOT gate output rows:
5973        // their pruning is enforced by the separate world-view constraint kernel,
5974        // which rejects candidates whose accepted world view satisfies the
5975        // constraint body. Treating them as required gates would invert the
5976        // semantics (emit rows only when the forbidden body holds), so exclude
5977        // them from the output-gating mask.
5978        let mut is_constraint_literal = vec![false; literal_count.max(1)];
5979        for constraint in &gpu_plan.constraints {
5980            for &literal_index in &constraint.literal_indices {
5981                if literal_index < literal_count {
5982                    is_constraint_literal[literal_index] = true;
5983                }
5984            }
5985        }
5986        let mut gate_literal_required_host = vec![0u8; literal_count.max(1)];
5987        for binding in &gpu_plan.tuple_membership_bindings {
5988            if !binding.bound_output_columns.iter().any(Option::is_some)
5989                && binding.literal_index < literal_count
5990                && !is_constraint_literal[binding.literal_index]
5991            {
5992                gate_literal_required_host[binding.literal_index] = 1u8;
5993            }
5994        }
5995        // A rule mixing a per-row (bound-variable) modal literal with a global
5996        // gate (pure-ground/anonymous/arity-0) literal is materialized soundly:
5997        // the row-map kernel applies the global-gate `gate_literal_required`
5998        // mask on BOTH the global membership path and the per-row membership
5999        // path, so global-gate literals and per-row bound tuple-key gates
6000        // compose conjunctively. The two gate buffers below are passed to the
6001        // row-map kernel for both paths.
6002        let mut gate_literal_required = memory.alloc::<u8>(literal_count.max(1))?;
6003        self.provider
6004            .htod_launch_metadata_sync_copy_into(
6005                &gate_literal_required_host,
6006                &mut gate_literal_required,
6007            )
6008            .map_err(|e| {
6009                XlogError::execution_ctx(
6010                    "epistemic GPU final tuple gate metadata",
6011                    "upload global-gate literal mask",
6012                    &e,
6013                )
6014            })?;
6015
6016        let row_map_func = self
6017            .provider
6018            .device()
6019            .inner()
6020            .get_func(
6021                EPISTEMIC_MODULE,
6022                epistemic_kernels::EPISTEMIC_BUILD_FINAL_TUPLE_ROW_MAP_U8,
6023            )
6024            .ok_or_else(|| {
6025                XlogError::Execution("epistemic final tuple row-map kernel not found".to_string())
6026            })?;
6027        let close_rejections_func = self
6028            .provider
6029            .device()
6030            .inner()
6031            .get_func(
6032                EPISTEMIC_MODULE,
6033                epistemic_kernels::EPISTEMIC_CLOSE_FINAL_TUPLE_REJECTIONS_U8,
6034            )
6035            .ok_or_else(|| {
6036                XlogError::Execution(
6037                    "epistemic final tuple rejection-close kernel not found".to_string(),
6038                )
6039            })?;
6040        let func = self
6041            .provider
6042            .device()
6043            .inner()
6044            .get_func(
6045                EPISTEMIC_MODULE,
6046                epistemic_kernels::EPISTEMIC_MATERIALIZE_FINAL_TUPLE_COLUMN_U8,
6047            )
6048            .ok_or_else(|| {
6049                XlogError::Execution(
6050                    "epistemic final tuple materialization kernel not found".to_string(),
6051                )
6052            })?;
6053
6054        let mut kernel_timings = Vec::with_capacity(checked_sum(source_columns.len(), 2)?);
6055        let row_map_timing = self.time_epistemic_gpu_kernel_launch(
6056            "epistemic GPU final tuple row map",
6057            || unsafe {
6058                self.provider
6059                    .device()
6060                    .inner()
6061                    .memset_zeros(&mut final_row_count)?;
6062                self.provider.device().inner().memset_zeros(&mut row_map)?;
6063                let mut row_map_params: Vec<*mut c_void> = vec![
6064                    output_row_capacity_u32.as_kernel_param(),
6065                    literal_count_u32.as_kernel_param(),
6066                    candidate_count_u32.as_kernel_param(),
6067                    reduction_count_u32.as_kernel_param(),
6068                    models_per_reduction_u32.as_kernel_param(),
6069                    world_stride.as_kernel_param(),
6070                    output.num_rows_device().as_kernel_param(),
6071                    (&workspace.rejection_reasons).as_kernel_param(),
6072                    (&workspace.model_membership).as_kernel_param(),
6073                    (&workspace.world_views).as_kernel_param(),
6074                    (&tuple_source_row_count_ptrs).as_kernel_param(),
6075                    (&row_filter_negated).as_kernel_param(),
6076                    (&row_filter_key_offsets).as_kernel_param(),
6077                    (&row_filter_key_counts).as_kernel_param(),
6078                    (&key_col_ptrs).as_kernel_param(),
6079                    (&key_col_widths).as_kernel_param(),
6080                    (&expected_key_bits).as_kernel_param(),
6081                    (&expected_key_type_codes).as_kernel_param(),
6082                    (&tuple_key_match_modes).as_kernel_param(),
6083                    (&bound_value_col_ptrs).as_kernel_param(),
6084                    (&bound_value_col_widths).as_kernel_param(),
6085                    row_filter_count.as_kernel_param(),
6086                    (&row_map).as_kernel_param(),
6087                    (&final_row_count).as_kernel_param(),
6088                    (&workspace.candidate_assumptions).as_kernel_param(),
6089                    (&gate_literal_required).as_kernel_param(),
6090                ];
6091                row_map_func.clone().launch(
6092                    LaunchConfig::for_num_elems(output_row_capacity_u32.max(1)),
6093                    &mut row_map_params,
6094                )?;
6095                Ok(())
6096            },
6097        )?;
6098        kernel_timings.push(row_map_timing);
6099
6100        let close_rejections_timing = self.time_epistemic_gpu_kernel_launch(
6101            "epistemic GPU final tuple rejection closeout",
6102            || unsafe {
6103                let mut close_rejections_params: Vec<*mut c_void> = vec![
6104                    candidate_count_u32.as_kernel_param(),
6105                    world_stride.as_kernel_param(),
6106                    (&final_row_count).as_kernel_param(),
6107                    (&workspace.rejection_reasons).as_kernel_param(),
6108                    (&workspace.world_views).as_kernel_param(),
6109                ];
6110                close_rejections_func.clone().launch(
6111                    LaunchConfig::for_num_elems(candidate_count_u32.max(1)),
6112                    &mut close_rejections_params,
6113                )?;
6114                Ok(())
6115            },
6116        )?;
6117        kernel_timings.push(close_rejections_timing);
6118
6119        for ((src_col, column_byte_len, column_row_width), dst_col) in
6120            source_columns.iter().zip(result_columns_raw.iter_mut())
6121        {
6122            let column_timing = self.time_epistemic_gpu_kernel_launch(
6123                "epistemic GPU final tuple column materialization",
6124                || unsafe {
6125                    // SAFETY: source and destination columns are valid device byte
6126                    // buffers of identical length, the row-count scalar and schema
6127                    // row width are runtime-owned, and membership/world-view buffers
6128                    // were capacity-checked.
6129                    let mut params: Vec<*mut c_void> = vec![
6130                        column_byte_len.as_kernel_param(),
6131                        column_row_width.as_kernel_param(),
6132                        literal_count_u32.as_kernel_param(),
6133                        candidate_count_u32.as_kernel_param(),
6134                        reduction_count_u32.as_kernel_param(),
6135                        models_per_reduction_u32.as_kernel_param(),
6136                        world_stride.as_kernel_param(),
6137                        output.num_rows_device().as_kernel_param(),
6138                        (&workspace.rejection_reasons).as_kernel_param(),
6139                        (&workspace.model_membership).as_kernel_param(),
6140                        (&workspace.world_views).as_kernel_param(),
6141                        (&row_map).as_kernel_param(),
6142                        (*src_col).as_kernel_param(),
6143                        dst_col.as_kernel_param(),
6144                        (&final_row_count).as_kernel_param(),
6145                    ];
6146                    func.clone().launch(
6147                        LaunchConfig::for_num_elems((*column_byte_len).max(1)),
6148                        &mut params,
6149                    )?;
6150                    Ok(())
6151                },
6152            )?;
6153            kernel_timings.push(column_timing);
6154        }
6155        let kernel_timing = EpistemicGpuKernelTimingTrace::checked_sum(kernel_timings)?;
6156
6157        let result_columns: Vec<CudaColumn> =
6158            result_columns_raw.into_iter().map(Into::into).collect();
6159        let final_schema = Schema::new(final_schema_columns)
6160            .with_sort_labels(final_schema_sort_labels)
6161            .map_err(|err| XlogError::Execution(format!("epistemic final schema: {err}")))?;
6162        let final_output = CudaBuffer::from_columns(
6163            result_columns,
6164            output.num_rows(),
6165            final_row_count,
6166            final_schema,
6167        );
6168        let final_output = if gpu_plan.final_output_columns.is_none() {
6169            final_output
6170        } else {
6171            self.provider.dedup_full_row(&final_output)?
6172        };
6173
6174        Ok((final_output, trace.with_kernel_timing(kernel_timing)))
6175    }
6176
6177    /// Prepare runtime-owned GPU buffers for an epistemic executable plan.
6178    pub fn prepare_epistemic_gpu_execution(
6179        &self,
6180        executable: &EpistemicExecutablePlan,
6181        capacities: EpistemicGpuWorkspaceCapacities,
6182    ) -> Result<EpistemicGpuPreparedExecution> {
6183        let preflight = EpistemicGpuRuntimePreflight::for_executable_plan(executable, capacities)?;
6184        let mut workspace =
6185            self.allocate_epistemic_gpu_workspace(&executable.gpu_plan, capacities)?;
6186        let workspace_reset = self.reset_epistemic_gpu_workspace(&mut workspace)?;
6187
6188        Ok(EpistemicGpuPreparedExecution {
6189            preflight,
6190            tuple_membership_bindings: executable.gpu_plan.tuple_membership_bindings.clone(),
6191            workspace,
6192            workspace_reset,
6193        })
6194    }
6195
6196    fn validate_epistemic_gpu_reduced_constraints(
6197        &self,
6198        executable: &EpistemicExecutablePlan,
6199    ) -> Result<EpistemicGpuConstraintValidationTrace> {
6200        let mut checked_constraint_relations = 0usize;
6201        let mut violated_constraint_relations = 0usize;
6202        let mut row_count_device_reads = 0u32;
6203        let mut violations = Vec::new();
6204
6205        let mut relation_names = Vec::new();
6206        for rule in executable
6207            .reduced_runtime_plan
6208            .rules_by_scc
6209            .iter()
6210            .flatten()
6211        {
6212            if rule.head.starts_with(XLOG_CONSTRAINT_RELATION_PREFIX)
6213                && !relation_names.iter().any(|name| name == &rule.head)
6214            {
6215                relation_names.push(rule.head.as_str());
6216            }
6217        }
6218
6219        for relation_name in relation_names {
6220            checked_constraint_relations += 1;
6221            let relation = self.store().get(relation_name).ok_or_else(|| {
6222                XlogError::Execution(format!(
6223                    "missing reduced constraint relation {relation_name} after production runtime \
6224                     dispatch"
6225                ))
6226            })?;
6227            let row_count_was_cached = relation.cached_row_count().is_some();
6228            let rows = self.provider.device_row_count(relation)?;
6229            row_count_device_reads += u32::from(!row_count_was_cached);
6230            if rows > 0 {
6231                violated_constraint_relations += 1;
6232                violations.push(format!("{relation_name}={rows}"));
6233            }
6234        }
6235
6236        if !violations.is_empty() {
6237            return Err(XlogError::Execution(format!(
6238                "epistemic GPU reduced constraint violation: {}",
6239                violations.join(", ")
6240            )));
6241        }
6242
6243        Ok(EpistemicGpuConstraintValidationTrace {
6244            checked_constraint_relations,
6245            violated_constraint_relations,
6246            row_count_device_reads,
6247        })
6248    }
6249
6250    /// Materialize a stratum's GATED epistemic head output into the relation store
6251    /// as a base relation, for stratified epistemic execution.
6252    ///
6253    /// After a lower stratum computes its modal-gated head extension (the
6254    /// `final_output`/additional-head buffer), the higher stratum's `know`/
6255    /// `possible` over that head must read the GATED extension — not the ungated
6256    /// reduced relation the reduced runtime plan leaves in the store. This OVERWRITES
6257    /// the store relation under `name` with a device-side clone of the gated buffer,
6258    /// so the existing tuple-membership filter (which reads the source
6259    /// relation from the store by predicate name) gates the higher stratum against
6260    /// the correct extension. No resolve-into-body is performed, so there is no
6261    /// double-gating against the GPU world-view filter.
6262    pub fn materialize_epistemic_head_relation(
6263        &mut self,
6264        name: &str,
6265        gated_output: &CudaBuffer,
6266    ) -> Result<()> {
6267        let cloned = self.clone_buffer(gated_output)?;
6268        self.put_relation(name, cloned);
6269        Ok(())
6270    }
6271
6272    /// Device-side clone of a store-resident relation buffer, for surfacing a
6273    /// stratified ordinary stratum's output as a query result without moving it out
6274    /// of the store.
6275    pub fn clone_store_relation(&self, buffer: &CudaBuffer) -> Result<CudaBuffer> {
6276        self.clone_buffer(buffer)
6277    }
6278
6279    /// Execute the reduced production runtime plan and capture epistemic GPU evidence.
6280    pub fn execute_epistemic_gpu_execution(
6281        &mut self,
6282        executable: &EpistemicExecutablePlan,
6283        capacities: EpistemicGpuWorkspaceCapacities,
6284    ) -> Result<EpistemicGpuExecutionResult> {
6285        let mut prepared = self.prepare_epistemic_gpu_execution(executable, capacities)?;
6286        let literal_count = executable.gpu_plan.epistemic_literals.len();
6287        let candidate_count = bounded_candidate_count(literal_count, capacities.max_candidates)?;
6288        let transfer_budget_start = self.provider.host_transfer_stats();
6289        let launch_metadata_transfer_start = self.provider.host_launch_metadata_transfer_stats();
6290        let candidate_generation = self.generate_epistemic_gpu_candidates(
6291            &mut prepared.workspace,
6292            literal_count,
6293            candidate_count,
6294        )?;
6295        let propagation = self.propagate_epistemic_gpu_candidates(
6296            &mut prepared.workspace,
6297            literal_count,
6298            candidate_count,
6299        )?;
6300        let candidate_validation = self.validate_epistemic_gpu_candidates(
6301            &mut prepared.workspace,
6302            literal_count,
6303            candidate_count,
6304        )?;
6305        let counters_before = self.epistemic_gpu_runtime_counters();
6306        let _reduced_return = self.execute_plan(&executable.reduced_runtime_plan)?;
6307        let counters_after = self.epistemic_gpu_runtime_counters();
6308        let trace = EpistemicGpuRuntimeTrace::try_from_preflight_and_counters(
6309            prepared.preflight,
6310            counters_before,
6311            counters_after,
6312        )?;
6313        trace.require_wcoj_certification()?;
6314        let output_relation = executable
6315            .gpu_plan
6316            .reductions
6317            .last()
6318            .ok_or_else(|| XlogError::UnsupportedEpistemicConstruct {
6319                construct: "epistemic GPU reduced output".to_string(),
6320                context: "executable plan has no epistemic reductions".to_string(),
6321            })?
6322            .head_predicate
6323            .as_str();
6324        let output = {
6325            let reduced_output = self.store().get(output_relation).ok_or_else(|| {
6326                XlogError::UnsupportedEpistemicConstruct {
6327                    construct: "epistemic GPU reduced output".to_string(),
6328                    context: format!(
6329                        "missing reduced output relation {output_relation} after production \
6330                         runtime dispatch"
6331                    ),
6332                }
6333            })?;
6334            self.clone_buffer(reduced_output)?
6335        };
6336        let model_membership = self.populate_epistemic_gpu_model_membership_from_tuple_sources(
6337            &mut prepared.workspace,
6338            &output,
6339            &executable.gpu_plan,
6340            candidate_count,
6341            capacities.max_models_per_reduction,
6342        )?;
6343        model_membership.require_stable_model_tuple_source()?;
6344        let expected_tuple_key_column_reads =
6345            expected_tuple_key_column_reads(&executable.gpu_plan.tuple_membership_bindings)?;
6346        model_membership.require_planned_tuple_key_column_reads(expected_tuple_key_column_reads)?;
6347        let world_view_validation = self.validate_epistemic_gpu_world_views(
6348            &mut prepared.workspace,
6349            &executable.gpu_plan,
6350            candidate_count,
6351            capacities.max_models_per_reduction,
6352        )?;
6353        let constraint_world_view_validation = self.validate_epistemic_gpu_world_view_constraints(
6354            &mut prepared.workspace,
6355            &executable.gpu_plan,
6356            candidate_count,
6357        )?;
6358        let materialization =
6359            self.materialize_epistemic_gpu_candidates(&mut prepared.workspace, candidate_count)?;
6360        let final_result_materialization = self.materialize_epistemic_gpu_final_results(
6361            &mut prepared.workspace,
6362            &output,
6363            candidate_count,
6364        )?;
6365        // Distinct epistemic output heads and the reduction indices that feed each.
6366        // A single-head plan keeps the unscoped (None) row filter. A JOINT-SOLVED
6367        // coalesced multi-head plan materializes EACH head against the SAME accepted
6368        // world view by scoping the modal row-filter to that head's reductions.
6369        let head_reductions = epistemic_head_reduction_indices(&executable.gpu_plan);
6370        let is_multi_head = head_reductions.len() > 1;
6371        let primary_head_filter = if is_multi_head {
6372            head_reductions.get(output_relation).cloned()
6373        } else {
6374            None
6375        };
6376        let (final_output, final_tuple_materialization) = self
6377            .materialize_epistemic_gpu_final_tuples_scoped(
6378                &mut prepared.workspace,
6379                &output,
6380                &executable.gpu_plan,
6381                literal_count,
6382                candidate_count,
6383                executable.gpu_plan.reductions.len(),
6384                capacities.max_models_per_reduction,
6385                primary_head_filter.as_ref(),
6386            )?;
6387        // Materialize every OTHER coupled head against the shared accepted world
6388        // view. Each head's reduced relation (already computed jointly by the single
6389        // reduced-program dispatch above) is the materialization source; only that
6390        // head's modal row-filter bindings apply.
6391        let mut additional_head_outputs: Vec<(String, CudaBuffer)> = Vec::new();
6392        if is_multi_head {
6393            for (head, reductions) in &head_reductions {
6394                if head.as_str() == output_relation {
6395                    continue;
6396                }
6397                let head_output = {
6398                    let reduced_head = self.store().get(head.as_str()).ok_or_else(|| {
6399                        XlogError::UnsupportedEpistemicConstruct {
6400                            construct: "epistemic GPU reduced output".to_string(),
6401                            context: format!(
6402                                "missing reduced output relation {head} after production runtime \
6403                                 dispatch for joint multi-head materialization"
6404                            ),
6405                        }
6406                    })?;
6407                    self.clone_buffer(reduced_head)?
6408                };
6409                let (head_final_output, _head_trace) = self
6410                    .materialize_epistemic_gpu_final_tuples_scoped(
6411                        &mut prepared.workspace,
6412                        &head_output,
6413                        &executable.gpu_plan,
6414                        literal_count,
6415                        candidate_count,
6416                        executable.gpu_plan.reductions.len(),
6417                        capacities.max_models_per_reduction,
6418                        Some(reductions),
6419                    )?;
6420                additional_head_outputs.push((head.clone(), head_final_output));
6421            }
6422        }
6423        let tuple_evidence_output = if executable.gpu_plan.final_output_columns.is_some() {
6424            let mut evidence_plan = executable.gpu_plan.clone();
6425            evidence_plan.final_output_columns = None;
6426            let (evidence_output, _) = self.materialize_epistemic_gpu_final_tuples(
6427                &mut prepared.workspace,
6428                &output,
6429                &evidence_plan,
6430                literal_count,
6431                candidate_count,
6432                executable.gpu_plan.reductions.len(),
6433                capacities.max_models_per_reduction,
6434            )?;
6435            Some(evidence_output)
6436        } else {
6437            None
6438        };
6439        let transfer_budget_end = self.provider.host_transfer_stats();
6440        let launch_metadata_transfer_end = self.provider.host_launch_metadata_transfer_stats();
6441        let transfer_budget =
6442            EpistemicGpuTransferBudgetTrace::from_host_transfer_stats_with_launch_metadata(
6443                candidate_count,
6444                transfer_budget_start,
6445                transfer_budget_end,
6446                launch_metadata_transfer_start,
6447                launch_metadata_transfer_end,
6448            )?;
6449        let final_result_transfer =
6450            EpistemicGpuFinalResultTransferTrace::from_final_output(&self.provider, &final_output)?;
6451        final_tuple_materialization.require_row_filter_materialization_evidence(
6452            "epistemic GPU final tuple materialization",
6453            final_result_transfer.final_output_rows,
6454        )?;
6455        let constraint_validation = self.validate_epistemic_gpu_reduced_constraints(executable)?;
6456        let semantic_trace = EpistemicGpuSemanticTrace::from_device_rejection_reasons(
6457            &self.provider,
6458            &prepared.workspace,
6459            &candidate_generation,
6460            &propagation,
6461            &model_membership,
6462            &world_view_validation,
6463        )?;
6464
6465        Ok(EpistemicGpuExecutionResult {
6466            provider_identity: EpistemicGpuProviderIdentity::from_provider(&self.provider),
6467            prepared,
6468            candidate_generation,
6469            propagation,
6470            candidate_validation,
6471            model_membership,
6472            world_view_validation,
6473            constraint_world_view_validation,
6474            materialization,
6475            final_result_materialization,
6476            final_tuple_materialization,
6477            transfer_budget,
6478            final_result_transfer,
6479            constraint_validation,
6480            semantic_trace,
6481            tuple_membership_bindings: executable.gpu_plan.tuple_membership_bindings.clone(),
6482            final_output,
6483            additional_head_outputs,
6484            tuple_evidence_output,
6485            output,
6486            trace,
6487        })
6488    }
6489
6490    /// Execute multiple accepted epistemic GPU executable plans in order.
6491    ///
6492    /// This is the runtime adapter used by split execution evidence: each
6493    /// component is still dispatched through [`Self::execute_epistemic_gpu_execution`],
6494    /// so candidate generation, model-membership, world-view validation,
6495    /// materialization, transfer-budget, and production runtime counters are
6496    /// recorded by the existing single-plan path.
6497    pub fn execute_epistemic_gpu_execution_batch(
6498        &mut self,
6499        executables: &[&EpistemicExecutablePlan],
6500        capacities: EpistemicGpuWorkspaceCapacities,
6501    ) -> Result<Vec<EpistemicGpuExecutionResult>> {
6502        if executables.is_empty() {
6503            return Err(XlogError::UnsupportedEpistemicConstruct {
6504                construct: "epistemic GPU batch execution".to_string(),
6505                context: "batch execution requires at least one executable component".to_string(),
6506            });
6507        }
6508
6509        let mut results = Vec::with_capacity(executables.len());
6510        for executable in executables {
6511            results.push(self.execute_epistemic_gpu_execution(executable, capacities)?);
6512        }
6513        Ok(results)
6514    }
6515
6516    /// Execute multiple epistemic GPU executable plans and return an aggregate trace.
6517    ///
6518    /// This is used by split-execution certification: every component still
6519    /// routes through the existing single-plan GPU runtime path, and the batch
6520    /// trace only aggregates those component traces. It does not perform CPU
6521    /// recomposition.
6522    pub fn execute_epistemic_gpu_execution_batch_with_trace(
6523        &mut self,
6524        executables: &[&EpistemicExecutablePlan],
6525        capacities: EpistemicGpuWorkspaceCapacities,
6526    ) -> Result<EpistemicGpuBatchExecutionResult> {
6527        let results = self.execute_epistemic_gpu_execution_batch(executables, capacities)?;
6528        let trace = EpistemicGpuBatchExecutionTrace::try_from_component_results(&results)?;
6529        Ok(EpistemicGpuBatchExecutionResult { results, trace })
6530    }
6531}
6532
6533#[derive(Default)]
6534struct RuntimeRouteSummary {
6535    multiway_reduction_count: usize,
6536    kclique_wcoj_plan_count: usize,
6537    wcoj_triangle_route_count: usize,
6538    wcoj_4cycle_route_count: usize,
6539    free_join_route_count: usize,
6540    kclique_wcoj_plan_count_by_arity: [usize; 4],
6541    kclique_wcoj_max_arity: u8,
6542    kclique_wcoj_edge_permutation_count: usize,
6543    kclique_stream_groups: BTreeSet<StreamGroupId>,
6544    kclique_skew_scheduled_plan_count: usize,
6545    planned_hash_route_count: usize,
6546    planned_hash_planner_wins_count: usize,
6547    planned_hash_incomplete_stats_count: usize,
6548    planned_hash_cost_evidence_count: usize,
6549    sorted_layout_requirement_count: usize,
6550    helper_split_spec_count: usize,
6551}
6552
6553fn summarize_runtime_routes(node: &RirNode, routes: &mut RuntimeRouteSummary) {
6554    match node {
6555        RirNode::MultiWayJoin { inputs, plan, .. } => {
6556            routes.multiway_reduction_count += 1;
6557            match plan {
6558                Some(MultiwayPlan::WcojWithPlan(order)) => {
6559                    routes.kclique_wcoj_plan_count += 1;
6560                    if let Some(slot) = usize::from(order.k).checked_sub(5) {
6561                        if slot < routes.kclique_wcoj_plan_count_by_arity.len() {
6562                            routes.kclique_wcoj_plan_count_by_arity[slot] += 1;
6563                        }
6564                    }
6565                    routes.kclique_wcoj_max_arity = routes.kclique_wcoj_max_arity.max(order.k);
6566                    routes.kclique_wcoj_edge_permutation_count += order
6567                        .edge_permutation
6568                        .iter()
6569                        .take_while(|slot| **slot != u8::MAX)
6570                        .count();
6571                    routes.kclique_stream_groups.insert(order.stream_group);
6572                    if !order.helper_split_specs.is_empty() {
6573                        routes.kclique_skew_scheduled_plan_count += 1;
6574                    }
6575                    routes.sorted_layout_requirement_count +=
6576                        order.sorted_layout_requirements.edge_slots.len();
6577                    routes.helper_split_spec_count += order.helper_split_specs.len();
6578                }
6579                Some(MultiwayPlan::PlannedHashRoute {
6580                    reason,
6581                    planner_evidence,
6582                }) => {
6583                    routes.planned_hash_route_count += 1;
6584                    match reason {
6585                        PlannedHashReason::PlannerPredictsHashWins => {
6586                            routes.planned_hash_planner_wins_count += 1;
6587                            if planner_evidence.wcoj_cost.is_finite()
6588                                && planner_evidence.hash_cost.is_finite()
6589                                && planner_evidence.hash_cost <= planner_evidence.wcoj_cost
6590                            {
6591                                routes.planned_hash_cost_evidence_count += 1;
6592                            }
6593                        }
6594                        PlannedHashReason::IncompleteStatsSafeDefault => {
6595                            routes.planned_hash_incomplete_stats_count += 1;
6596                        }
6597                    }
6598                }
6599                // Generic Free Join route: provenance variant set
6600                // only by the general multiway promoter. Opportunistic
6601                // — the dispatcher's structural decline executes the
6602                // embedded binary fallback, so no hard dispatch
6603                // obligation accrues here.
6604                Some(MultiwayPlan::FreeJoin) => {
6605                    routes.free_join_route_count += 1;
6606                }
6607                None => {
6608                    if super::wcoj_dispatch::match_multiway_triangle(node).is_some() {
6609                        routes.wcoj_triangle_route_count += 1;
6610                    } else if super::wcoj_dispatch::match_multiway_4cycle(node).is_some() {
6611                        routes.wcoj_4cycle_route_count += 1;
6612                    }
6613                }
6614            }
6615
6616            for input in inputs {
6617                summarize_runtime_routes(input, routes);
6618            }
6619        }
6620        RirNode::Filter { input, .. }
6621        | RirNode::Project { input, .. }
6622        | RirNode::Distinct { input, .. }
6623        | RirNode::GroupBy { input, .. } => summarize_runtime_routes(input, routes),
6624        RirNode::Join { left, right, .. } | RirNode::Diff { left, right } => {
6625            summarize_runtime_routes(left, routes);
6626            summarize_runtime_routes(right, routes);
6627        }
6628        RirNode::Union { inputs } => {
6629            for input in inputs {
6630                summarize_runtime_routes(input, routes);
6631            }
6632        }
6633        RirNode::Fixpoint {
6634            base, recursive, ..
6635        } => {
6636            summarize_runtime_routes(base, routes);
6637            summarize_runtime_routes(recursive, routes);
6638        }
6639        RirNode::ChainJoin { left, right, .. } => {
6640            summarize_runtime_routes(left, routes);
6641            summarize_runtime_routes(right, routes);
6642        }
6643        RirNode::TensorMaskedJoin { .. } | RirNode::Scan { .. } | RirNode::Unit => {}
6644    }
6645}
6646
6647fn helper_relation_ids(executable: &EpistemicExecutablePlan) -> BTreeSet<RelId> {
6648    executable
6649        .relation_ids
6650        .iter()
6651        .filter_map(|(name, rel)| name.starts_with("__kclique_helper_").then_some(*rel))
6652        .collect()
6653}
6654
6655fn count_helper_relation_scans(node: &RirNode, helper_relations: &BTreeSet<RelId>) -> usize {
6656    match node {
6657        RirNode::Scan { .. } => 0,
6658        RirNode::MultiWayJoin { plan, inputs, .. } => {
6659            let own_wcoj_inputs = if matches!(plan, Some(MultiwayPlan::WcojWithPlan(_))) {
6660                inputs
6661                    .iter()
6662                    .map(|input| count_helper_relation_leaf_scans(input, helper_relations))
6663                    .sum()
6664            } else {
6665                0
6666            };
6667            own_wcoj_inputs
6668                + inputs
6669                    .iter()
6670                    .map(|input| count_helper_relation_scans(input, helper_relations))
6671                    .sum::<usize>()
6672        }
6673        RirNode::Filter { input, .. }
6674        | RirNode::Project { input, .. }
6675        | RirNode::Distinct { input, .. }
6676        | RirNode::GroupBy { input, .. } => count_helper_relation_scans(input, helper_relations),
6677        RirNode::Join { left, right, .. } | RirNode::Diff { left, right } => {
6678            count_helper_relation_scans(left, helper_relations)
6679                + count_helper_relation_scans(right, helper_relations)
6680        }
6681        RirNode::Union { inputs } => inputs
6682            .iter()
6683            .map(|input| count_helper_relation_scans(input, helper_relations))
6684            .sum(),
6685        RirNode::Fixpoint {
6686            base, recursive, ..
6687        } => {
6688            count_helper_relation_scans(base, helper_relations)
6689                + count_helper_relation_scans(recursive, helper_relations)
6690        }
6691        RirNode::ChainJoin { left, right, .. } => {
6692            count_helper_relation_scans(left, helper_relations)
6693                + count_helper_relation_scans(right, helper_relations)
6694        }
6695        RirNode::TensorMaskedJoin { .. } | RirNode::Unit => 0,
6696    }
6697}
6698
6699fn count_helper_relation_leaf_scans(node: &RirNode, helper_relations: &BTreeSet<RelId>) -> usize {
6700    match node {
6701        RirNode::Scan { rel } => usize::from(helper_relations.contains(rel)),
6702        RirNode::Filter { input, .. }
6703        | RirNode::Project { input, .. }
6704        | RirNode::Distinct { input, .. }
6705        | RirNode::GroupBy { input, .. } => {
6706            count_helper_relation_leaf_scans(input, helper_relations)
6707        }
6708        RirNode::Join { left, right, .. } | RirNode::Diff { left, right } => {
6709            count_helper_relation_leaf_scans(left, helper_relations)
6710                + count_helper_relation_leaf_scans(right, helper_relations)
6711        }
6712        RirNode::Union { inputs } => inputs
6713            .iter()
6714            .map(|input| count_helper_relation_leaf_scans(input, helper_relations))
6715            .sum(),
6716        RirNode::Fixpoint {
6717            base, recursive, ..
6718        } => {
6719            count_helper_relation_leaf_scans(base, helper_relations)
6720                + count_helper_relation_leaf_scans(recursive, helper_relations)
6721        }
6722        RirNode::MultiWayJoin { inputs, .. } => inputs
6723            .iter()
6724            .map(|input| count_helper_relation_leaf_scans(input, helper_relations))
6725            .sum(),
6726        RirNode::ChainJoin { left, right, .. } => {
6727            count_helper_relation_leaf_scans(left, helper_relations)
6728                + count_helper_relation_leaf_scans(right, helper_relations)
6729        }
6730        RirNode::TensorMaskedJoin { .. } | RirNode::Unit => 0,
6731    }
6732}
6733
6734fn require_positive(value: usize, context: &str) -> Result<()> {
6735    if value == 0 {
6736        return Err(XlogError::ResourceExhausted {
6737            context: context.to_string(),
6738            estimated_bytes: 0,
6739            budget_bytes: 1,
6740        });
6741    }
6742    Ok(())
6743}
6744
6745fn checked_u32_dimension(value: usize, context: &str) -> Result<u32> {
6746    u32::try_from(value).map_err(|_| XlogError::ResourceExhausted {
6747        context: context.to_string(),
6748        estimated_bytes: value as u64,
6749        budget_bytes: u32::MAX as u64,
6750    })
6751}
6752
6753/// Map each distinct epistemic output head to the reduction indices feeding it.
6754///
6755/// Reduction index = position in `gpu_plan.reductions`, which is exactly the
6756/// `reduction_index` carried by every tuple-membership binding, so the returned
6757/// sets scope each head's modal row-filter for joint multi-head materialization.
6758fn epistemic_head_reduction_indices(
6759    gpu_plan: &EpistemicGpuPlan,
6760) -> std::collections::BTreeMap<String, BTreeSet<usize>> {
6761    let mut heads: std::collections::BTreeMap<String, BTreeSet<usize>> =
6762        std::collections::BTreeMap::new();
6763    for (reduction_index, reduction) in gpu_plan.reductions.iter().enumerate() {
6764        heads
6765            .entry(reduction.head_predicate.clone())
6766            .or_default()
6767            .insert(reduction_index);
6768    }
6769    heads
6770}
6771
6772fn final_output_columns_for_materialization(
6773    output: &CudaBuffer,
6774    gpu_plan: &EpistemicGpuPlan,
6775    head_reduction_filter: Option<&BTreeSet<usize>>,
6776) -> Result<Vec<usize>> {
6777    // PER-HEAD augmented projection: in a JOINT-SOLVED multi-head component each head
6778    // is materialized from its OWN reduced relation buffer with its OWN reduction
6779    // filter. The plan-global `final_output_columns` is derived from the first
6780    // epistemic rule's head and would mis-project coupled heads of DIFFERING arity.
6781    // When a head filter is supplied, project the first `public_head_arity` columns of
6782    // THAT head (the augmented modal-literal columns are appended after the public head
6783    // terms), so heads of differing arity/projection each materialize their own public
6784    // tuple shape. This reads only the store/world-view boundary (the reduced relation
6785    // buffer's arity + the plan's recorded public arity) — never a resolved body.
6786    if let Some(filter) = head_reduction_filter {
6787        if let Some(public_head_arity) = gpu_plan
6788            .reductions
6789            .iter()
6790            .enumerate()
6791            .filter(|(reduction_index, _)| filter.contains(reduction_index))
6792            .map(|(_, reduction)| reduction.public_head_arity)
6793            .max()
6794        {
6795            if public_head_arity > output.arity() {
6796                return Err(XlogError::UnsupportedEpistemicConstruct {
6797                    construct: "epistemic GPU final output projection".to_string(),
6798                    context: format!(
6799                        "per-head public arity {} exceeds reduced output arity {} for the \
6800                         joint multi-head materialization",
6801                        public_head_arity,
6802                        output.arity()
6803                    ),
6804                });
6805            }
6806            return Ok((0..public_head_arity).collect());
6807        }
6808    }
6809
6810    let Some(final_output_columns) = &gpu_plan.final_output_columns else {
6811        return Ok((0..output.arity()).collect());
6812    };
6813
6814    let mut seen = vec![false; output.arity()];
6815    for &column in final_output_columns {
6816        if column >= output.arity() {
6817            return Err(XlogError::UnsupportedEpistemicConstruct {
6818                construct: "epistemic GPU final output projection".to_string(),
6819                context: format!(
6820                    "final output column {} exceeds reduced output arity {}",
6821                    column,
6822                    output.arity()
6823                ),
6824            });
6825        }
6826        if seen[column] {
6827            return Err(XlogError::UnsupportedEpistemicConstruct {
6828                construct: "epistemic GPU final output projection".to_string(),
6829                context: format!("duplicate final output column {}", column),
6830            });
6831        }
6832        seen[column] = true;
6833    }
6834
6835    Ok(final_output_columns.clone())
6836}
6837
6838fn require_u32_launch_bound(value: usize, context: &str) -> Result<()> {
6839    checked_u32_dimension(value, context).map(|_| ())
6840}
6841
6842fn require_u32_launch_dimensions(values: &[usize], context: &str) -> Result<()> {
6843    let max_value = values.iter().copied().max().unwrap_or(0);
6844    require_u32_launch_bound(max_value, context)
6845}
6846
6847fn checked_product(left: usize, right: usize) -> Result<usize> {
6848    left.checked_mul(right).ok_or_else(|| {
6849        XlogError::Kernel(format!(
6850            "epistemic GPU workspace size overflow: {left} * {right}"
6851        ))
6852    })
6853}
6854
6855fn checked_sum(left: usize, right: usize) -> Result<usize> {
6856    left.checked_add(right).ok_or_else(|| {
6857        XlogError::Kernel(format!(
6858            "epistemic GPU workspace size overflow: {left} + {right}"
6859        ))
6860    })
6861}
6862
6863fn require_epistemic_gpu_kernel_phases(gpu_plan: &EpistemicGpuPlan) -> Result<()> {
6864    let required = [
6865        EpistemicGpuHotPathPhase::CandidateGeneration,
6866        EpistemicGpuHotPathPhase::Propagation,
6867        EpistemicGpuHotPathPhase::CandidateValidation,
6868        EpistemicGpuHotPathPhase::ModelMembership,
6869        EpistemicGpuHotPathPhase::WorldViewValidation,
6870        EpistemicGpuHotPathPhase::ResultMaterialization,
6871        EpistemicGpuHotPathPhase::FinalResultMaterialization,
6872        EpistemicGpuHotPathPhase::FinalTupleMaterialization,
6873    ];
6874
6875    for phase in required {
6876        if !gpu_plan.required_kernel_phases.contains(&phase) {
6877            return Err(XlogError::UnsupportedEpistemicConstruct {
6878                construct: "epistemic GPU kernel phase contract".to_string(),
6879                context: format!(
6880                    "accepted GPU execution requires kernel phase {:?}, but the plan declared {:?}",
6881                    phase, gpu_plan.required_kernel_phases
6882                ),
6883            });
6884        }
6885    }
6886
6887    Ok(())
6888}
6889
6890fn require_epistemic_gpu_buffer_contract(gpu_plan: &EpistemicGpuPlan) -> Result<()> {
6891    let required = [
6892        EpistemicGpuBufferKind::CandidateAssumptions,
6893        EpistemicGpuBufferKind::WorldViews,
6894        EpistemicGpuBufferKind::ModelMembership,
6895        EpistemicGpuBufferKind::RejectionReasons,
6896    ];
6897
6898    for buffer in required {
6899        if !gpu_plan.required_buffers.contains(&buffer) {
6900            return Err(XlogError::UnsupportedEpistemicConstruct {
6901                construct: "epistemic GPU buffer contract".to_string(),
6902                context: format!(
6903                    "accepted GPU execution requires buffer {:?}, but the plan declared {:?}",
6904                    buffer, gpu_plan.required_buffers
6905                ),
6906            });
6907        }
6908    }
6909
6910    Ok(())
6911}
6912
6913fn expected_tuple_key_column_reads(bindings: &[EpistemicTupleMembershipBinding]) -> Result<usize> {
6914    bindings.iter().try_fold(0usize, |acc, binding| {
6915        checked_sum(acc, binding.key_columns.len())
6916    })
6917}
6918
6919fn world_view_bitset_bytes_per_candidate(literal_count: usize) -> Result<usize> {
6920    Ok(checked_sum(literal_count, 7)? / 8)
6921}
6922
6923fn epistemic_operator_code(op: EirEpistemicOp) -> u8 {
6924    match op {
6925        EirEpistemicOp::Know => 1,
6926        EirEpistemicOp::Possible => 2,
6927    }
6928}
6929
6930fn bounded_candidate_count(literal_count: usize, max_candidates: usize) -> Result<usize> {
6931    require_positive(literal_count, "epistemic GPU execution literals")?;
6932    require_positive(max_candidates, "epistemic GPU execution candidates")?;
6933    if literal_count > 31 {
6934        return Err(XlogError::UnsupportedEpistemicConstruct {
6935            construct: "epistemic GPU execution candidate generation".to_string(),
6936            context: format!("literal count {literal_count} exceeds 31-bit candidate mask"),
6937        });
6938    }
6939    let required_candidates = 1usize << literal_count;
6940    if max_candidates < required_candidates {
6941        return Err(XlogError::ResourceExhausted {
6942            context: "epistemic GPU execution candidate capacity".to_string(),
6943            estimated_bytes: required_candidates as u64,
6944            budget_bytes: max_candidates as u64,
6945        });
6946    }
6947    Ok(required_candidates)
6948}
6949
6950#[cfg(test)]
6951mod tests {
6952    use super::*;
6953    use xlog_core::ScalarType;
6954    use xlog_ir::EirTerm;
6955
6956    #[test]
6957    fn tuple_key_expectation_encodes_ground_integer_for_u32_column() {
6958        let expectation =
6959            TupleKeyExpectation::from_term(&EirTerm::Integer(42), ScalarType::U32).unwrap();
6960
6961        assert_eq!(
6962            expectation,
6963            TupleKeyExpectation {
6964                bits: 42,
6965                type_code: ScalarType::U32.to_code(),
6966            }
6967        );
6968    }
6969
6970    #[test]
6971    fn tuple_key_expectation_encodes_symbol_for_symbol_column() {
6972        let expectation =
6973            TupleKeyExpectation::from_term(&EirTerm::Symbol(7), ScalarType::Symbol).unwrap();
6974
6975        assert_eq!(
6976            expectation,
6977            TupleKeyExpectation {
6978                bits: 7,
6979                type_code: ScalarType::Symbol.to_code(),
6980            }
6981        );
6982    }
6983
6984    #[test]
6985    fn tuple_key_expectation_rejects_variable_as_ground_expectation() {
6986        let err =
6987            TupleKeyExpectation::from_term(&EirTerm::Variable("X".to_string()), ScalarType::U32)
6988                .expect_err("variable tuple keys require bound-output matching");
6989
6990        match err {
6991            XlogError::UnsupportedEpistemicConstruct { construct, context } => {
6992                assert_eq!(construct, "epistemic GPU tuple-key expectation");
6993                assert!(context.contains("cannot be encoded as a ground tuple-key expectation"));
6994            }
6995            other => panic!("expected tuple-key expectation error, got {other:?}"),
6996        }
6997    }
6998}