Skip to main content

xlog_prob/compilation/
mod.rs

1//! GPU-native knowledge compilation.
2//!
3//! This module is the home of GPU-native compilation + verification utilities.
4//!
5//! Production correctness requires the GPU CDCL equivalence verifier (see `validation`).
6
7use std::sync::Arc;
8use std::time::Instant;
9
10use cudarc::driver::DeviceSlice;
11use xlog_core::{Result, XlogError};
12use xlog_cuda::memory::TrackedCudaSlice;
13use xlog_cuda::CudaKernelProvider;
14use xlog_solve::{GpuCdclConfig, GpuCnf};
15
16use crate::compilation::gpu_cache::{GpuCircuitCache, GpuCircuitCacheHandle};
17use crate::gpu::{GpuCircuitBuilder, GpuCircuitLayout, GpuXgcf};
18
19pub mod disk_cache;
20pub mod gpu_cache;
21pub mod gpu_cnf;
22pub mod gpu_d4;
23pub mod gpu_pir;
24pub mod gpu_pir_intern;
25pub mod gpu_weights;
26pub mod sparse_matrix;
27pub mod validation;
28
29pub use gpu_cnf::{encode_cnf_gpu, GpuCnfEncoding, GpuCnfVarTables};
30pub use gpu_d4::GpuCompileConfig;
31pub use gpu_pir::{GpuPirGraph, GpuPirRoots, PIR_AND, PIR_LIT, PIR_NEG_LIT, PIR_OR};
32// PIR_CONST and PIR_DECISION are used within gpu_pir.rs and gpu_pir_intern.rs
33// via direct module paths; no crate-level re-export needed.
34pub use gpu_pir_intern::{GpuPirInterner, PirBatch};
35pub use gpu_weights::GpuWeights;
36pub use gpu_weights::{
37    apply_query_vars_device, build_evidence_by_var_gpu, build_weights_gpu, map_nodes_to_vars_gpu,
38    restore_query_vars_device,
39};
40// GpuCsrCnf is currently unused (dead code); remove re-export.
41pub use validation::{
42    build_equivalence_queries_gpu, validate_equivalence_gpu, validate_equivalence_gpu_gated,
43    GpuEquivalenceConfig, GpuEquivalenceQueries,
44};
45// check_equivalence_gpu and check_equivalence_gpu_gated are called only
46// within validation.rs itself; no crate-level re-export needed.
47
48/// Per-stage compilation timing (populated only when XLOG_WARMUP_PROFILE=1).
49#[derive(Debug, Clone, Default)]
50pub struct CircuitCompileProfile {
51    pub cnf_hash_sec: f64,
52    pub d4_compile_sec: f64,
53    pub verify_sec: f64,
54    pub smooth_sec: f64,
55    pub cache_store_sec: f64,
56    pub free_var_mask_sec: f64,
57    pub gpu_cache_hit: bool,
58    pub disk_cache_hit: bool,
59    /// BFS frontier item count after `frontier_depth` expansion steps
60    /// (0 on cache hits or when profiling is disabled).
61    pub frontier_items: u32,
62}
63
64pub(crate) fn warmup_profiling_enabled() -> bool {
65    std::env::var("XLOG_WARMUP_PROFILE")
66        .map(|v| v == "1")
67        .unwrap_or(false)
68}
69
70/// Device-resident random-variable list for GPU smoothing.
71pub struct DeviceRandomVarList {
72    list: TrackedCudaSlice<u32>,
73    count: u32,
74}
75
76impl DeviceRandomVarList {
77    pub fn from_device(list: TrackedCudaSlice<u32>, count: u32) -> Result<Self> {
78        let len = u32::try_from(list.len()).map_err(|_| {
79            XlogError::Compilation("DeviceRandomVarList: list length exceeds u32".to_string())
80        })?;
81        if count > len {
82            return Err(XlogError::Compilation(format!(
83                "DeviceRandomVarList: count {} exceeds list len {}",
84                count, len
85            )));
86        }
87        Ok(Self { list, count })
88    }
89
90    pub fn from_host(provider: &CudaKernelProvider, host: &[u32]) -> Result<Self> {
91        let memory = provider.memory();
92        let mut list = memory.alloc::<u32>(host.len())?;
93        if !host.is_empty() {
94            provider
95                .htod_sync_copy_into_tracked(host, &mut list)
96                .map_err(|e| {
97                    XlogError::Kernel(format!("DeviceRandomVarList upload failed: {}", e))
98                })?;
99        }
100        let count = u32::try_from(host.len()).map_err(|_| {
101            XlogError::Compilation("DeviceRandomVarList: host len exceeds u32".to_string())
102        })?;
103        Ok(Self { list, count })
104    }
105
106    pub fn is_empty(&self) -> bool {
107        self.count == 0
108    }
109
110    pub fn count(&self) -> u32 {
111        self.count
112    }
113
114    pub fn list(&self) -> &TrackedCudaSlice<u32> {
115        &self.list
116    }
117}
118
119fn upload_disk_artifact_for_verification(
120    artifact: &disk_cache::CircuitArtifact,
121    provider: &Arc<CudaKernelProvider>,
122) -> Result<GpuXgcf> {
123    let memory = provider.memory().clone();
124
125    macro_rules! upload {
126        ($field:expr, $ty:ty, $name:literal) => {{
127            let mut device = memory.alloc::<$ty>($field.len())?;
128            provider
129                .htod_sync_copy_into_tracked($field, &mut device)
130                .map_err(|e| {
131                    XlogError::Kernel(format!("disk cache verify upload {} failed: {}", $name, e))
132                })?;
133            device
134        }};
135    }
136
137    let builder = GpuCircuitBuilder {
138        node_type: upload!(&artifact.node_type, u8, "node_type"),
139        child_offsets: upload!(&artifact.child_offsets, u32, "child_offsets"),
140        child_indices: upload!(&artifact.child_indices, u32, "child_indices"),
141        lit: upload!(&artifact.lit, i32, "lit"),
142        decision_var: upload!(&artifact.decision_var, u32, "decision_var"),
143        decision_child_false: upload!(&artifact.decision_child_false, u32, "decision_child_false"),
144        decision_child_true: upload!(&artifact.decision_child_true, u32, "decision_child_true"),
145    };
146    let layout = GpuCircuitLayout {
147        num_nodes: artifact.num_nodes,
148        num_edges: artifact.num_edges,
149        num_levels: artifact.num_levels,
150        level_offsets: upload!(&artifact.level_offsets, u32, "level_offsets"),
151        level_nodes: upload!(&artifact.level_nodes, u32, "level_nodes"),
152        root: artifact.root,
153        max_var: artifact.max_var,
154        num_nodes_device: None,
155        num_edges_device: None,
156    };
157
158    GpuXgcf::from_device(builder, layout, provider)
159}
160
161/// Compile CNF on GPU, then verify equivalence with GPU CDCL.
162pub fn compile_gpu_d4_and_verify(
163    cnf: &GpuCnf,
164    decision_var_limit: &TrackedCudaSlice<u32>,
165    provider: &Arc<CudaKernelProvider>,
166    config: &GpuCompileConfig,
167) -> Result<GpuXgcf> {
168    if config.cdcl_conflict_budget.is_some() {
169        return Err(XlogError::Compilation(
170            "cdcl_conflict_budget is not supported by the GPU CDCL verifier".to_string(),
171        ));
172    }
173    // Size guard BEFORE compile — the D4 compile itself can crash with a
174    // context-poisoning launch failure on a large CNF, earlier than the verify.
175    validation::check_verify_size_bound(cnf, "compile_gpu_d4_and_verify")?;
176    let circuit = gpu_d4::compile_gpu_d4(cnf, provider, config)?;
177    let cdcl = cdcl_config_from_compile(config)?;
178    validate_equivalence_gpu(
179        cnf,
180        decision_var_limit,
181        &circuit,
182        provider,
183        GpuEquivalenceConfig {
184            cdcl,
185            reuse_workspace: config.incremental_verify,
186        },
187    )?;
188    Ok(circuit)
189}
190
191/// Compile CNF on GPU, cache the circuit, then verify equivalence with GPU CDCL.
192///
193/// `canonical_cnf_hash`: a process-independent hash of the PIR structure, used as
194/// the `cnf_hash` in the disk cache key. Computed via [`crate::cnf::canonical_pir_hash`].
195/// If `None`, disk caching is skipped.
196pub fn compile_gpu_d4_and_verify_cached(
197    cnf: &GpuCnf,
198    decision_var_limit: &TrackedCudaSlice<u32>,
199    provider: &Arc<CudaKernelProvider>,
200    config: &GpuCompileConfig,
201    cache: &mut GpuCircuitCache,
202    random_vars: &DeviceRandomVarList,
203    canonical_cnf_hash: Option<u64>,
204) -> Result<(GpuCircuitCacheHandle, Option<CircuitCompileProfile>)> {
205    if config.cdcl_conflict_budget.is_some() {
206        return Err(XlogError::Compilation(
207            "cdcl_conflict_budget is not supported by the GPU CDCL verifier".to_string(),
208        ));
209    }
210    // Size guard BEFORE compile (the D4 compile can crash earlier than
211    // the verify on a large CNF).
212    validation::check_verify_size_bound(cnf, "compile_gpu_d4_and_verify_cached")?;
213
214    let profiling = warmup_profiling_enabled();
215    let mut profile = CircuitCompileProfile::default();
216
217    // --- CNF hash stage ---
218    #[cfg(debug_assertions)]
219    eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: hash_cnf_gpu");
220    let t_hash = if profiling {
221        Some(Instant::now())
222    } else {
223        None
224    };
225    let key = gpu_cache::hash_cnf_gpu(cnf, provider)?;
226    if let Some(t0) = t_hash {
227        provider
228            .device()
229            .synchronize()
230            .map_err(|e| XlogError::Kernel(format!("sync after hash_cnf_gpu: {}", e)))?;
231        profile.cnf_hash_sec = t0.elapsed().as_secs_f64();
232    }
233    #[cfg(debug_assertions)]
234    {
235        if !profiling {
236            provider
237                .device()
238                .synchronize()
239                .map_err(|e| XlogError::Kernel(format!("sync after hash_cnf_gpu failed: {}", e)))?;
240        }
241    }
242    #[cfg(debug_assertions)]
243    eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: lookup_or_insert_device");
244    let lookup = cache.lookup_or_insert_device(&key)?;
245    let mut handle = lookup.into_handle()?;
246
247    // --- Disk cache check (only on GPU cache miss) ---
248    //
249    // D→H copy compile_needed to decide whether we need to compile at all.
250    // If compile_needed == 0, the GPU cache already has the circuit (GPU cache hit).
251    // If compile_needed == 1, we check the disk cache before falling through to the
252    // GPU-native Decision-DNNF compiler.
253    let compile_needed_host: Vec<u32> = provider
254        .device()
255        .inner()
256        .dtoh_sync_copy(handle.compile_needed_device())
257        .map_err(|e| XlogError::Kernel(format!("dtoh compile_needed: {}", e)))?;
258    let compile_needed = compile_needed_host[0];
259
260    // GPU cache hit — short-circuit the entire compile pipeline.
261    if compile_needed == 0 {
262        profile.gpu_cache_hit = true;
263        let out_profile = if profiling { Some(profile) } else { None };
264        return Ok((handle, out_profile));
265    }
266
267    // Build the disk cache key (we know compile_needed == 1 at this point).
268    // The canonical PIR hash keeps semantically stable cache identity, while the encoded CNF
269    // hash guards symmetric conditioned programs whose PIR shape is identical but whose CNF is not.
270    let cache_key = if compile_needed == 1 {
271        if let Some(canonical_hash) = canonical_cnf_hash {
272            let config_hash = hash_compile_config(config);
273            let random_vars_hash = hash_random_vars(random_vars, provider)?;
274            let gpu_cnf_hash = read_gpu_cnf_hash(&key, provider)?;
275            let cnf_hash = combine_disk_cnf_hash(canonical_hash, gpu_cnf_hash);
276            let sm = detect_compute_capability(provider)?;
277            Some(disk_cache::CircuitCacheKey {
278                cnf_hash,
279                config_hash,
280                random_vars_hash,
281                sm,
282            })
283        } else {
284            None
285        }
286    } else {
287        None
288    };
289
290    // Check disk cache on GPU cache miss
291    if let Some(ref disk_key) = cache_key {
292        #[cfg(debug_assertions)]
293        eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: checking disk cache");
294        if let Ok(Some(artifact)) = disk_cache::read_artifact(disk_key) {
295            #[cfg(debug_assertions)]
296            eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: disk cache hit");
297            // Stale-artifact guard: the canonical PIR hash keeps semantically-stable
298            // cache identity, so an entry written by an earlier engine whose ENCODING
299            // differed (different var universe) can be served for the same key. A
300            // cached circuit referencing vars beyond the freshly-encoded CNF is
301            // staleness, not an engine bug — evict and fall through to a fresh
302            // compile (which overwrites the entry). Fail-closed equivalence checking
303            // stays authoritative for freshly-compiled circuits below.
304            if artifact.max_var > cnf.var_cap {
305                #[cfg(debug_assertions)]
306                eprintln!(
307                    "[xlog-prob] compile_gpu_d4_and_verify_cached: stale disk artifact \
308                     (max_var {} > cnf var_cap {}), evicting",
309                    artifact.max_var, cnf.var_cap
310                );
311                disk_cache::evict_artifact(disk_key);
312            } else {
313                let circuit = upload_disk_artifact_for_verification(&artifact, provider)?;
314                let verifier_decision_var_limit = if random_vars.is_empty() {
315                    &cnf.num_vars
316                } else {
317                    decision_var_limit
318                };
319                let cdcl = cdcl_config_from_compile(config)?;
320                let t_verify = if profiling {
321                    Some(Instant::now())
322                } else {
323                    None
324                };
325                match validate_equivalence_gpu_gated(
326                    cnf,
327                    verifier_decision_var_limit,
328                    &circuit,
329                    provider,
330                    GpuEquivalenceConfig {
331                        cdcl,
332                        reuse_workspace: config.incremental_verify,
333                    },
334                    handle.compile_needed_device(),
335                ) {
336                    Ok(()) => {
337                        if let Some(t0) = t_verify {
338                            provider.device().synchronize().map_err(|e| {
339                                XlogError::Kernel(format!("sync after disk cache verify: {}", e))
340                            })?;
341                            profile.verify_sec = t0.elapsed().as_secs_f64();
342                        }
343                        cache.restore_from_host_arrays(&mut handle, &artifact)?;
344                        provider.device().synchronize().map_err(|e| {
345                            XlogError::Kernel(format!("sync after disk cache restore: {}", e))
346                        })?;
347                        profile.disk_cache_hit = true;
348                        let out_profile = if profiling { Some(profile) } else { None };
349                        return Ok((handle, out_profile));
350                    }
351                    Err(_stale) => {
352                        // A cached artifact failing equivalence against the current
353                        // CNF is a stale entry (fresh-compile verification below
354                        // remains fail-closed). Evict and recompile.
355                        #[cfg(debug_assertions)]
356                        eprintln!(
357                            "[xlog-prob] compile_gpu_d4_and_verify_cached: cached circuit \
358                             failed equivalence against current CNF ({_stale}), evicting"
359                        );
360                        disk_cache::evict_artifact(disk_key);
361                    }
362                }
363            }
364        }
365        #[cfg(debug_assertions)]
366        eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: disk cache miss");
367    }
368
369    let d4_config = d4_config_for_smoothing(config, random_vars.count())?;
370
371    // --- GPU-native Decision-DNNF compile stage ---
372    #[cfg(debug_assertions)]
373    eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: compile_gpu_d4_gated");
374    let t_d4 = if profiling {
375        Some(Instant::now())
376    } else {
377        None
378    };
379    let (circuit_base, frontier_items) = gpu_d4::compile_gpu_d4_gated_with_stats(
380        cnf,
381        provider,
382        &d4_config,
383        handle.compile_needed_device(),
384    )?;
385    profile.frontier_items = frontier_items;
386    if let Some(t0) = t_d4 {
387        provider
388            .device()
389            .synchronize()
390            .map_err(|e| XlogError::Kernel(format!("sync after d4 compile: {}", e)))?;
391        profile.d4_compile_sec = t0.elapsed().as_secs_f64();
392    }
393    #[cfg(debug_assertions)]
394    {
395        if !profiling {
396            provider.device().synchronize().map_err(|e| {
397                XlogError::Kernel(format!("sync after compile_gpu_d4_gated failed: {}", e))
398            })?;
399        }
400    }
401    if circuit_base.num_nodes() == 0 || circuit_base.num_levels() == 0 {
402        // Defensive: the GPU-native Decision-DNNF compiler returned an empty circuit
403        // (the primary GPU cache hit is handled by the compile_needed == 0 early return
404        // above; this catches degenerate CNFs).
405        let out_profile = if profiling { Some(profile) } else { None };
406        return Ok((handle, out_profile));
407    }
408
409    // --- Verify equivalence stage ---
410    //
411    // Verify equivalence on the *base* circuit (pre-smoothing) to keep the verifier CNFs minimal.
412    //
413    // `encode_cnf_gpu` sets `decision_var_limit` to the end of the leaf+choice var range. For
414    // deterministic programs with no probabilistic vars, this range is empty (limit=0). In that
415    // case, the verifier must still be able to branch, so fall back to `cnf.num_vars` (all CNF
416    // vars are semantically meaningful when there is no probabilistic decision set).
417    let verifier_decision_var_limit = if random_vars.is_empty() {
418        &cnf.num_vars
419    } else {
420        decision_var_limit
421    };
422    let cdcl = cdcl_config_from_compile(config)?;
423    #[cfg(debug_assertions)]
424    eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: validate_equivalence_gpu_gated");
425    let t_verify = if profiling {
426        Some(Instant::now())
427    } else {
428        None
429    };
430    validate_equivalence_gpu_gated(
431        cnf,
432        verifier_decision_var_limit,
433        &circuit_base,
434        provider,
435        GpuEquivalenceConfig {
436            cdcl,
437            reuse_workspace: config.incremental_verify,
438        },
439        handle.compile_needed_device(),
440    )?;
441    if let Some(t0) = t_verify {
442        provider
443            .device()
444            .synchronize()
445            .map_err(|e| XlogError::Kernel(format!("sync after verify: {}", e)))?;
446        profile.verify_sec = t0.elapsed().as_secs_f64();
447    }
448    #[cfg(debug_assertions)]
449    {
450        if !profiling {
451            provider.device().synchronize().map_err(|e| {
452                XlogError::Kernel(format!(
453                    "sync after validate_equivalence_gpu_gated failed: {}",
454                    e
455                ))
456            })?;
457        }
458    }
459
460    // --- Smoothing stage ---
461    //
462    // Smoothing is evaluation-only (WMC/grad correctness); it is semantics-preserving and does not
463    // need to participate in the equivalence check.
464    let t_smooth = if profiling {
465        Some(Instant::now())
466    } else {
467        None
468    };
469    let circuit_eval = if random_vars.is_empty() {
470        circuit_base
471    } else {
472        #[cfg(debug_assertions)]
473        eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: smooth_random_vars_device");
474        let smoothed = circuit_base.smooth_random_vars_device(
475            provider,
476            random_vars.list(),
477            random_vars.count(),
478            config.smooth_node_cap,
479            config.smooth_edge_cap,
480        )?;
481        #[cfg(debug_assertions)]
482        {
483            if !profiling {
484                provider.device().synchronize().map_err(|e| {
485                    XlogError::Kernel(format!(
486                        "sync after smooth_random_vars_device failed: {}",
487                        e
488                    ))
489                })?;
490            }
491        }
492        smoothed
493    };
494    if let Some(t0) = t_smooth {
495        provider
496            .device()
497            .synchronize()
498            .map_err(|e| XlogError::Kernel(format!("sync after smooth: {}", e)))?;
499        profile.smooth_sec = t0.elapsed().as_secs_f64();
500    }
501
502    // --- Cache store stage ---
503    #[cfg(debug_assertions)]
504    eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: store_from_xgcf");
505    let t_store = if profiling {
506        Some(Instant::now())
507    } else {
508        None
509    };
510    cache.store_from_xgcf(&mut handle, &circuit_eval)?;
511    if let Some(t0) = t_store {
512        provider
513            .device()
514            .synchronize()
515            .map_err(|e| XlogError::Kernel(format!("sync after cache store: {}", e)))?;
516        profile.cache_store_sec = t0.elapsed().as_secs_f64();
517    }
518    #[cfg(debug_assertions)]
519    {
520        if !profiling {
521            provider.device().synchronize().map_err(|e| {
522                XlogError::Kernel(format!("sync after store_from_xgcf failed: {}", e))
523            })?;
524        }
525    }
526
527    // --- Free-var mask stage ---
528    #[cfg(debug_assertions)]
529    eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: compute_free_var_mask_gpu_gated");
530    let t_fvm = if profiling {
531        Some(Instant::now())
532    } else {
533        None
534    };
535    let free_var_mask = gpu_d4::compute_free_var_mask_gpu_gated(
536        cnf,
537        &circuit_eval,
538        provider,
539        handle.compile_needed_device(),
540    )?;
541    #[cfg(debug_assertions)]
542    {
543        if !profiling {
544            provider.device().synchronize().map_err(|e| {
545                XlogError::Kernel(format!(
546                    "sync after compute_free_var_mask_gpu_gated failed: {}",
547                    e
548                ))
549            })?;
550        }
551    }
552    // Only enable free-var correction if there are actual free variables.
553    // When the mask is all-zero (common for smoothed d-DNNF circuits),
554    // skipping this keeps has_free_var_mask[slot]=false, which avoids unnecessary
555    // free-var correction kernel launches on every subsequent eval.
556    let mask_host: Vec<u8> = provider
557        .device()
558        .inner()
559        .dtoh_sync_copy(&free_var_mask)
560        .map_err(|e| XlogError::Kernel(format!("Failed to read free_var_mask: {}", e)))?;
561    let has_free_vars = mask_host.iter().any(|&b| b != 0);
562    #[cfg(debug_assertions)]
563    eprintln!(
564        "[xlog-prob] free_var_mask: {} free vars, batched eval {}",
565        if has_free_vars { "has" } else { "no" },
566        if has_free_vars { "DISABLED" } else { "ENABLED" },
567    );
568    if has_free_vars {
569        cache.store_free_var_mask(&handle, &free_var_mask)?;
570        #[cfg(debug_assertions)]
571        {
572            if !profiling {
573                provider.device().synchronize().map_err(|e| {
574                    XlogError::Kernel(format!("sync after store_free_var_mask failed: {}", e))
575                })?;
576            }
577        }
578    }
579    if let Some(t0) = t_fvm {
580        provider
581            .device()
582            .synchronize()
583            .map_err(|e| XlogError::Kernel(format!("sync after free_var_mask: {}", e)))?;
584        profile.free_var_mask_sec = t0.elapsed().as_secs_f64();
585    }
586
587    // --- Disk cache write (opportunistic) ---
588    //
589    // After a successful compilation, write the artifact to disk for next warm start.
590    // Errors are silently ignored — the disk cache is best-effort.
591    if let Some(ref disk_key) = cache_key {
592        if let Ok(artifact) = cache.build_artifact_from_device(&handle, provider) {
593            let _ = disk_cache::write_artifact(disk_key, &artifact);
594            #[cfg(debug_assertions)]
595            eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: wrote disk cache artifact");
596        }
597    }
598
599    let out_profile = if profiling { Some(profile) } else { None };
600    Ok((handle, out_profile))
601}
602
603fn d4_config_for_smoothing(
604    config: &GpuCompileConfig,
605    random_var_count: u32,
606) -> Result<GpuCompileConfig> {
607    if random_var_count == 0 {
608        return Ok(*config);
609    }
610    let headroom = 2u32
611        .checked_add(random_var_count)
612        .ok_or_else(|| XlogError::Compilation("smooth headroom overflow".to_string()))?;
613    if config.smooth_node_cap <= headroom {
614        return Err(XlogError::Compilation(format!(
615            "GpuCompileConfig smooth_node_cap {} too small for smoothing headroom {}",
616            config.smooth_node_cap, headroom
617        )));
618    }
619    let base_cap = config
620        .smooth_node_cap
621        .checked_sub(headroom)
622        .ok_or_else(|| XlogError::Compilation("smooth node cap underflow".to_string()))?;
623    if base_cap < 3 {
624        return Err(XlogError::Compilation(
625            "GpuCompileConfig smooth_node_cap leaves <3 base nodes".to_string(),
626        ));
627    }
628    let mut out = *config;
629    out.smooth_node_cap = base_cap;
630    Ok(out)
631}
632
633fn cdcl_config_from_compile(config: &GpuCompileConfig) -> Result<GpuCdclConfig> {
634    if config.cdcl_restart_interval == 0 {
635        return Err(XlogError::Compilation(
636            "cdcl_restart_interval must be > 0".to_string(),
637        ));
638    }
639    if config.cdcl_learned_bytes == 0 {
640        return Err(XlogError::Compilation(
641            "cdcl_learned_bytes must be > 0".to_string(),
642        ));
643    }
644
645    // Deterministic sizing: assume average learned clause length = 4.
646    const AVG_LEN: u64 = 4;
647    const META_BYTES_PER_CLAUSE: u64 = 24; // offsets + lbd + activity + flags + proof offsets (rounded up)
648    const PROOF_BYTES_PER_CLAUSE: u64 = 8 + (8 * AVG_LEN); // (conflict, steps) + 2*u32 per lit
649    const LIT_BYTES_PER_CLAUSE: u64 = 4 * AVG_LEN;
650
651    let bytes_per_clause = META_BYTES_PER_CLAUSE
652        .checked_add(PROOF_BYTES_PER_CLAUSE)
653        .and_then(|v| v.checked_add(LIT_BYTES_PER_CLAUSE))
654        .ok_or_else(|| XlogError::Compilation("cdcl bytes per clause overflow".to_string()))?;
655
656    let max_clauses = config
657        .cdcl_learned_bytes
658        .checked_div(bytes_per_clause)
659        .ok_or_else(|| XlogError::Compilation("cdcl_learned_bytes div overflow".to_string()))?;
660    if max_clauses == 0 {
661        return Err(XlogError::Compilation(
662            "cdcl_learned_bytes too small for learned clause arena".to_string(),
663        ));
664    }
665
666    let max_lits = max_clauses
667        .checked_mul(AVG_LEN)
668        .ok_or_else(|| XlogError::Compilation("max_learned_lits overflow".to_string()))?;
669    let max_proof_u32 = max_clauses
670        .checked_mul(2 + 2 * AVG_LEN)
671        .ok_or_else(|| XlogError::Compilation("max_proof_u32 overflow".to_string()))?;
672
673    let max_learned_clauses = u32::try_from(max_clauses)
674        .map_err(|_| XlogError::Compilation("max_learned_clauses exceeds u32::MAX".to_string()))?;
675    let max_learned_lits = u32::try_from(max_lits)
676        .map_err(|_| XlogError::Compilation("max_learned_lits exceeds u32::MAX".to_string()))?;
677    let max_proof_u32 = u32::try_from(max_proof_u32)
678        .map_err(|_| XlogError::Compilation("max_proof_u32 exceeds u32::MAX".to_string()))?;
679
680    let reduce_interval = config
681        .cdcl_restart_interval
682        .checked_mul(20)
683        .ok_or_else(|| XlogError::Compilation("cdcl reduce_interval overflow".to_string()))?;
684
685    let mut gpu_cdcl = GpuCdclConfig::default();
686    gpu_cdcl.max_learned_clauses = max_learned_clauses;
687    gpu_cdcl.max_learned_lits = max_learned_lits;
688    gpu_cdcl.max_proof_u32 = max_proof_u32;
689    gpu_cdcl.restart_base = config.cdcl_restart_interval;
690    gpu_cdcl.reduce_interval = reduce_interval;
691    // Fail-closed conflict budget. Treewidth-hard equivalence verifies can
692    // spin past the GPU watchdog and crash with a context-poisoning launch
693    // failure; a finite budget makes them decline (VerifyBudgetExceeded) before
694    // that. Default 0 = unlimited (no behavior change). A recommended value is a
695    // calibration follow-up (between a completing verify's conflict count and
696    // the watchdog boundary); operators opt in via XLOG_D4_VERIFY_MAX_CONFLICTS.
697    gpu_cdcl.max_conflicts = verify_max_conflicts();
698    Ok(gpu_cdcl)
699}
700
701/// Per-verify conflict budget, read once from
702/// `XLOG_D4_VERIFY_MAX_CONFLICTS`. Default 0 = unlimited.
703fn verify_max_conflicts() -> u32 {
704    static BUDGET: std::sync::OnceLock<u32> = std::sync::OnceLock::new();
705    *BUDGET.get_or_init(|| {
706        std::env::var("XLOG_D4_VERIFY_MAX_CONFLICTS")
707            .ok()
708            .and_then(|v| v.trim().parse::<u32>().ok())
709            .unwrap_or(0)
710    })
711}
712
713// ---------------------------------------------------------------------------
714// Disk cache helpers
715// ---------------------------------------------------------------------------
716
717/// FNV-1a 64-bit hash — deterministic across processes and Rust versions.
718/// Matches the FNV-1a algorithm used in the GPU hash kernel (kernels/cache.cu).
719fn fnv1a_u64(bytes: &[u8]) -> u64 {
720    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
721    const FNV_PRIME: u64 = 0x100000001b3;
722    let mut h = FNV_OFFSET;
723    for &b in bytes {
724        h ^= b as u64;
725        h = h.wrapping_mul(FNV_PRIME);
726    }
727    h
728}
729
730fn combine_disk_cnf_hash(canonical_hash: u64, gpu_cnf_hash: u64) -> u64 {
731    let mut buf = Vec::with_capacity(16);
732    buf.extend_from_slice(&canonical_hash.to_le_bytes());
733    buf.extend_from_slice(&gpu_cnf_hash.to_le_bytes());
734    fnv1a_u64(&buf)
735}
736
737fn read_gpu_cnf_hash(
738    hash: &TrackedCudaSlice<u64>,
739    provider: &Arc<CudaKernelProvider>,
740) -> Result<u64> {
741    let host: Vec<u64> = provider
742        .device()
743        .inner()
744        .dtoh_sync_copy(hash)
745        .map_err(|e| XlogError::Kernel(format!("dtoh GPU CNF hash for disk cache key: {}", e)))?;
746    host.first()
747        .copied()
748        .ok_or_else(|| XlogError::Kernel("empty GPU CNF hash for disk cache key".to_string()))
749}
750
751/// Hash the compile config fields that affect circuit topology output.
752fn hash_compile_config(config: &GpuCompileConfig) -> u64 {
753    let mut buf = Vec::new();
754    buf.extend_from_slice(&config.frontier_depth.to_le_bytes());
755    buf.extend_from_slice(&config.max_frontier_items.to_le_bytes());
756    buf.extend_from_slice(&config.max_depth.to_le_bytes());
757    buf.extend_from_slice(&config.smooth_node_cap.to_le_bytes());
758    buf.extend_from_slice(&config.smooth_edge_cap.to_le_bytes());
759    // CDCL verifier params do not affect the compiled circuit topology,
760    // but we include them for safety so a verifier config change invalidates the cache.
761    buf.extend_from_slice(&config.cdcl_restart_interval.to_le_bytes());
762    buf.extend_from_slice(&config.cdcl_learned_bytes.to_le_bytes());
763    fnv1a_u64(&buf)
764}
765
766/// Hash the random variable list (D→H copy + hash).
767fn hash_random_vars(
768    random_vars: &DeviceRandomVarList,
769    provider: &Arc<CudaKernelProvider>,
770) -> Result<u64> {
771    let count = random_vars.count();
772    let mut buf = Vec::new();
773    buf.extend_from_slice(&count.to_le_bytes());
774    if count > 0 {
775        let host: Vec<u32> = provider
776            .device()
777            .inner()
778            .dtoh_sync_copy(random_vars.list())
779            .map_err(|e| {
780                XlogError::Kernel(format!("dtoh random_vars for disk cache hash: {}", e))
781            })?;
782        // Hash only the valid elements (count may be less than the allocation).
783        for &v in &host[..count as usize] {
784            buf.extend_from_slice(&v.to_le_bytes());
785        }
786    }
787    Ok(fnv1a_u64(&buf))
788}
789
790/// Query the device compute capability and encode as `major * 10 + minor` (e.g. 89 for sm_89).
791fn detect_compute_capability(provider: &Arc<CudaKernelProvider>) -> Result<u32> {
792    use cudarc::driver::sys::CUdevice_attribute;
793
794    let device = provider.device().inner();
795    let major = device
796        .attribute(CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
797        .map_err(|e| {
798            XlogError::Kernel(format!("Failed to query compute capability major: {}", e))
799        })?;
800    let minor = device
801        .attribute(CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
802        .map_err(|e| {
803            XlogError::Kernel(format!("Failed to query compute capability minor: {}", e))
804        })?;
805    let major_u32: u32 = major.try_into().map_err(|_| {
806        XlogError::Kernel(format!(
807            "compute capability major {} cannot be converted to u32",
808            major
809        ))
810    })?;
811    let minor_u32: u32 = minor.try_into().map_err(|_| {
812        XlogError::Kernel(format!(
813            "compute capability minor {} cannot be converted to u32",
814            minor
815        ))
816    })?;
817    Ok(major_u32 * 10 + minor_u32)
818}