Skip to main content

xlog_prob/mc/
mod.rs

1//! Approximate probabilistic inference via Monte Carlo sampling.
2//!
3//! This engine samples probabilistic facts / annotated disjunction decisions on the GPU and
4//! evaluates the deterministic core in each sampled world.
5//!
6//! For programs with non-monotone recursion (cycles through `not` and/or aggregates),
7//! Monte Carlo evaluation requires the user to opt into the approximate probabilistic
8//! engine. The deterministic evaluation uses a bounded, cycle-aware semantics:
9//!
10//! - If an SCC reaches a fixpoint under synchronous iteration, that fixpoint is used.
11//! - If the SCC enters a cycle, the interpretation is the intersection of all states in the cycle
12//!   (skeptical, invariant tuples only). This avoids parity/oscillation dependence on iteration
13//!   count while remaining fully deterministic and explicit.
14
15mod buffers;
16mod evidence;
17mod resident;
18mod results;
19
20pub use evidence::{EvidenceForcing, ForceabilityReason};
21pub use resident::{
22    compile_resident_plan, McNoHostStats, McResidentResult, ResidentPlan, ResidentRejectKind,
23    ResidentRejection,
24};
25
26#[cfg(feature = "host-io")]
27use std::collections::{HashMap, HashSet};
28use std::sync::Arc;
29
30#[cfg(feature = "host-io")]
31use cudarc::driver::DeviceSlice;
32use xlog_core::{MemoryBudget, Result, XlogError};
33use xlog_cuda::memory::TrackedCudaSlice;
34use xlog_cuda::{CudaDevice, CudaKernelProvider, GpuMemoryManager};
35#[cfg(feature = "host-io")]
36use xlog_logic::ast::{BodyLiteral, Rule};
37use xlog_logic::ast::{Directives, Evidence, ProbMethod, ProbQuery, Program};
38
39use crate::exact::GpuConfig;
40#[cfg(feature = "host-io")]
41use crate::provenance::Value;
42use crate::provenance::{atom_key_from_ground_atom, GroundAtom};
43
44/// Sampling method for Monte Carlo inference.
45#[derive(Debug, Clone, Copy, PartialEq, Eq)]
46pub enum McSamplingMethod {
47    /// Sample from prior, discard worlds where evidence is not satisfied.
48    Rejection,
49    /// Force evidence variables in the sampler; every sample counts.
50    EvidenceClamping,
51}
52
53impl McSamplingMethod {
54    pub fn as_str(self) -> &'static str {
55        match self {
56            McSamplingMethod::Rejection => "rejection",
57            McSamplingMethod::EvidenceClamping => "evidence_clamping",
58        }
59    }
60}
61
62impl From<ProbMethod> for McSamplingMethod {
63    fn from(value: ProbMethod) -> Self {
64        match value {
65            ProbMethod::Rejection => Self::Rejection,
66            ProbMethod::EvidenceClamping => Self::EvidenceClamping,
67        }
68    }
69}
70
71/// Strategy for counting evidence-satisfied samples in the MC loop.
72///
73/// In `QueriesOnly` mode (used with evidence clamping), evidence is
74/// guaranteed to hold in every sample, so we skip the truth-kernel's
75/// evidence check and evidence-side buffer allocations.
76#[derive(Debug, Clone, Copy, PartialEq, Eq)]
77pub enum McCountStrategy {
78    /// Full path: check both queries and evidence each sample.
79    QueriesAndEvidence,
80    /// Clamped path: evidence is always satisfied; only accumulate query flags.
81    QueriesOnly,
82}
83
84impl McCountStrategy {
85    /// Derive the count strategy from the chosen sampling method.
86    pub fn from_method(method: McSamplingMethod) -> Self {
87        match method {
88            McSamplingMethod::Rejection => Self::QueriesAndEvidence,
89            McSamplingMethod::EvidenceClamping => Self::QueriesOnly,
90        }
91    }
92}
93
94/// Breakdown of time spent in each phase of MC evaluation.
95/// Gate with `XLOG_MC_PROFILE=1` to print at the end of evaluation.
96#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
97pub struct McTimingBreakdown {
98    pub sampler_us: u64,
99    pub sample_reset_us: u64,
100    pub sample_build_us: u64,
101    pub eval_us: u64,
102    pub count_us: u64,
103}
104
105impl McTimingBreakdown {
106    pub fn total_us(&self) -> u64 {
107        self.sampler_us
108            .saturating_add(self.sample_reset_us)
109            .saturating_add(self.sample_build_us)
110            .saturating_add(self.eval_us)
111            .saturating_add(self.count_us)
112    }
113}
114
115/// Bounded semantics for non-monotone SCC evaluation inside MC sampling.
116pub const NONMONOTONE_SEMANTICS: &str = "Synchronous iteration per SCC; if a fixpoint is reached, use it; if a cycle is detected, use the intersection of all states in the cycle (skeptical tuples only); if the iteration budget is exceeded, use the intersection across all visited states (conservative).";
117
118#[derive(Debug, Clone)]
119#[non_exhaustive]
120/// Configuration for Monte Carlo probabilistic inference.
121///
122/// Use [`McEvalConfig::default()`] as a starting point and then update the
123/// individual fields you need.
124pub struct McEvalConfig {
125    /// Number of Monte Carlo samples.
126    pub samples: usize,
127    /// RNG seed (deterministic).
128    pub seed: u64,
129    /// Two-sided confidence level in (0,1) (e.g., 0.95).
130    pub confidence: f64,
131    /// Maximum SCC iteration steps for non-monotone cycle detection.
132    pub max_nonmonotone_iterations: usize,
133    /// Sampling method override. `None` = auto-select (EvidenceClamping when forceable, Rejection otherwise).
134    pub sampling_method: Option<McSamplingMethod>,
135    /// Allow the host CPU oracle ([`McProgram::evaluate_cpu`]) when the
136    /// resident GPU engine rejects the program. Default `false`: a rejected
137    /// program fails closed with the typed rejection instead of silently
138    /// running on the CPU. When set, the oracle result is labeled
139    /// [`McEngine::CpuOracle`] and must never serve as GPU-native evidence.
140    pub allow_cpu_oracle_fallback: bool,
141}
142
143impl Default for McEvalConfig {
144    fn default() -> Self {
145        Self {
146            samples: 10000,
147            seed: 0,
148            confidence: 0.95,
149            max_nonmonotone_iterations: 1024,
150            sampling_method: None,
151            allow_cpu_oracle_fallback: false,
152        }
153    }
154}
155
156impl McEvalConfig {
157    pub fn from_directives(directives: &Directives) -> Result<Self> {
158        let mut cfg = Self::default();
159        if let Some(samples) = directives.prob_samples {
160            cfg.samples = samples;
161        }
162        if let Some(seed) = directives.prob_seed {
163            cfg.seed = seed;
164        }
165        if let Some(confidence) = directives.prob_confidence {
166            cfg.confidence = confidence;
167        }
168        if let Some(iterations) = directives.prob_max_nonmonotone_iterations {
169            cfg.max_nonmonotone_iterations = iterations;
170        }
171        cfg.sampling_method = directives.prob_method.map(McSamplingMethod::from);
172        cfg.validate()?;
173        Ok(cfg)
174    }
175
176    pub fn validate(&self) -> Result<()> {
177        if self.samples == 0 {
178            return Err(XlogError::Compilation(
179                "MC inference requires samples > 0".to_string(),
180            ));
181        }
182        if !(0.0 < self.confidence && self.confidence < 1.0) || self.confidence.is_nan() {
183            return Err(XlogError::Compilation(format!(
184                "MC inference requires 0 < confidence < 1, got {}",
185                self.confidence
186            )));
187        }
188        if self.max_nonmonotone_iterations == 0 {
189            return Err(XlogError::Compilation(
190                "MC inference requires max_nonmonotone_iterations > 0".to_string(),
191            ));
192        }
193        Ok(())
194    }
195}
196
197#[derive(Debug, Clone)]
198pub struct McQueryEstimate {
199    pub atom: GroundAtom,
200    pub prob: f64,
201    pub log_prob: f64,
202    pub stderr: f64,
203    pub ci_low: f64,
204    pub ci_high: f64,
205}
206
207/// Which engine produced an [`McResult`].
208#[derive(Debug, Clone, Copy, PartialEq, Eq)]
209pub enum McEngine {
210    /// GPU-resident megakernel engine — the production MC path.
211    GpuResident,
212    /// Host CPU oracle ([`McProgram::evaluate_cpu`]) — explicit opt-in only
213    /// (see [`McEvalConfig::allow_cpu_oracle_fallback`]); never valid as
214    /// GPU-native or zero-host evidence.
215    CpuOracle,
216}
217
218impl McEngine {
219    /// Stable string form for result metadata surfaces (CLI JSON, pyxlog).
220    pub fn as_str(&self) -> &'static str {
221        match self {
222            McEngine::GpuResident => "gpu-resident",
223            McEngine::CpuOracle => "cpu-oracle",
224        }
225    }
226}
227
228#[derive(Debug, Clone)]
229pub struct McResult {
230    pub total_samples: usize,
231    pub evidence_samples: usize,
232    pub seed: u64,
233    pub confidence: f64,
234    pub query_estimates: Vec<McQueryEstimate>,
235    pub nonmonotone_sccs: usize,
236    pub nonmonotone_cycles: usize,
237    pub nonmonotone_iteration_limit_hits: usize,
238    pub sampling_method: McSamplingMethod,
239    /// Engine that produced this result; CPU-oracle results are reachable
240    /// only through explicit opt-in and are labeled so downstream consumers
241    /// can never mistake them for GPU-resident evidence.
242    pub engine: McEngine,
243}
244
245/// **Legacy back-compat surface** — tracked (data-plane) host<->device transfer
246/// deltas measured around the MC measured region.
247///
248/// This struct dates from the predecessor `a894aab4` engine, which removed only
249/// *tracked* data-plane transfers from a still-host-orchestrated loop. The
250/// current resident megakernel engine has a *stronger* property — **no host
251/// interaction at all** in the measured region — so the authoritative contract
252/// now lives in [`McNoHostStats`] (`McResidentResult::no_host`), which also
253/// counts untracked metadata reads, host fixpoint iterations, and in-region
254/// device allocations. `McDeviceResult` retains this field for API
255/// back-compatibility; for the resident engine its tracked-call fields are zero.
256#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
257pub struct McHotLoopTransfers {
258    /// Tracked host-to-device calls in the measured region.
259    pub htod_calls: u64,
260    /// Tracked device-to-host calls in the measured region.
261    pub dtoh_calls: u64,
262    /// Tracked host-to-device bytes in the measured region.
263    pub htod_bytes: u64,
264    /// Tracked device-to-host bytes in the measured region.
265    pub dtoh_bytes: u64,
266}
267
268impl McHotLoopTransfers {
269    /// True when no tracked host/device transfer occurred in the measured region.
270    pub fn is_zero(&self) -> bool {
271        self.htod_calls == 0 && self.dtoh_calls == 0
272    }
273}
274
275/// Device-resident Monte Carlo result counts.
276pub struct McDeviceResult {
277    pub query_counts: TrackedCudaSlice<u32>,
278    pub evidence_count: TrackedCudaSlice<u32>,
279    pub total_samples: usize,
280    pub seed: u64,
281    pub confidence: f64,
282    pub nonmonotone_sccs: usize,
283    pub nonmonotone_cycles: usize,
284    pub nonmonotone_iteration_limit_hits: usize,
285    pub sampling_method: McSamplingMethod,
286    /// Legacy back-compat field: tracked transfers measured around the resident
287    /// engine's measured region (zero). The authoritative no-host contract is
288    /// [`McNoHostStats`] on [`McResidentResult`]; see [`McHotLoopTransfers`].
289    pub hot_loop_transfers: McHotLoopTransfers,
290    /// Authoritative no-host counters for the resident engine measured region.
291    pub no_host: McNoHostStats,
292}
293
294#[derive(Debug, Clone)]
295pub(super) struct ProbFactSpec {
296    pub(super) var_idx: usize,
297    pub(super) atom: GroundAtom,
298}
299
300#[derive(Debug, Clone)]
301pub(super) struct AdSpec {
302    pub(super) decision_vars: Vec<usize>,
303    pub(super) choices: Vec<GroundAtom>,
304    #[cfg_attr(not(feature = "host-io"), allow(dead_code))]
305    pub(super) has_none: bool,
306}
307
308#[cfg(feature = "host-io")]
309#[derive(Debug, Clone, Default, PartialEq, Eq)]
310pub(super) struct Relation {
311    pub(super) tuples: HashSet<Vec<Value>>,
312}
313
314#[cfg(feature = "host-io")]
315impl Relation {
316    pub(super) fn insert_tuple(&mut self, tuple: Vec<Value>) {
317        self.tuples.insert(tuple);
318    }
319
320    pub(super) fn contains(&self, tuple: &[Value]) -> bool {
321        self.tuples.contains(tuple)
322    }
323
324    pub(super) fn is_empty(&self) -> bool {
325        self.tuples.is_empty()
326    }
327}
328
329#[cfg(feature = "host-io")]
330#[derive(Debug, Clone)]
331pub(super) enum SccKind {
332    MonotoneNonRecursive,
333    MonotoneRecursive,
334    NonMonotone,
335}
336
337#[cfg(feature = "host-io")]
338#[derive(Debug, Clone)]
339pub(super) struct SccPlan {
340    pub(super) predicates: Vec<String>,
341    pub(super) rules: Vec<Rule>,
342    pub(super) kind: SccKind,
343}
344
345#[cfg_attr(not(feature = "host-io"), allow(dead_code))]
346#[derive(Debug, Clone, Default)]
347pub(super) struct EvalStats {
348    pub(super) nonmonotone_sccs: usize,
349    pub(super) nonmonotone_cycles: usize,
350    pub(super) nonmonotone_iteration_limit_hits: usize,
351}
352
353#[derive(Clone)]
354pub struct McProgram {
355    pub(super) gpu_config: GpuConfig,
356    pub(super) program: Program,
357    #[cfg(feature = "host-io")]
358    pub(super) base_store: HashMap<String, Relation>,
359    #[cfg(feature = "host-io")]
360    pub(super) scc_plans: Vec<SccPlan>,
361    pub(super) queries: Vec<GroundAtom>,
362    pub(super) evidence: Vec<(GroundAtom, bool)>,
363    pub(super) bernoulli_probs: Vec<f32>,
364    pub(super) prob_facts: Vec<ProbFactSpec>,
365    pub(super) annotated_disjunctions: Vec<AdSpec>,
366}
367
368impl McProgram {
369    pub fn compile_source(source: &str) -> Result<Self> {
370        let program = xlog_logic::parse_program(source)?;
371        Self::compile_program(&program)
372    }
373
374    pub fn compile_source_with_gpu(source: &str, config: GpuConfig) -> Result<Self> {
375        let mut program = Self::compile_source(source)?;
376        program.gpu_config = config;
377        Ok(program)
378    }
379
380    pub fn num_vars(&self) -> usize {
381        self.bernoulli_probs.len()
382    }
383
384    /// Host-facing MC evaluation: runs the resident megakernel engine
385    /// ([`Self::evaluate_gpu_device_with_provider`]) and then **materializes the
386    /// result on the host** by downloading the final query/evidence counts
387    /// *after* the measured region. The download is a host-result
388    /// materialization, not part of the measured region — the no-host property
389    /// belongs to the resident engine, not to this convenience wrapper. Use
390    /// [`Self::evaluate_gpu_device`] when you want device-resident counts with no
391    /// host download at all.
392    ///
393    /// **Fail-closed contract:** if the resident engine rejects the program
394    /// (negation, aggregates, unbounded terms, ...), this returns the typed
395    /// rejection error. The CPU oracle is reachable only via the explicit
396    /// [`McEvalConfig::allow_cpu_oracle_fallback`] opt-in and its result is
397    /// labeled [`McEngine::CpuOracle`].
398    #[cfg(feature = "host-io")]
399    pub fn evaluate(&self, cfg: McEvalConfig) -> Result<McResult> {
400        let provider = Arc::new(self.provider()?);
401        let cfg_clone = cfg.clone();
402        let device_result =
403            match self.evaluate_gpu_device_with_provider(cfg_clone, provider.clone()) {
404                Ok(result) => result,
405                Err(XlogError::Compilation(message))
406                    if message.starts_with("resident MC engine rejected program") =>
407                {
408                    // Fail closed by default: the CPU oracle is an explicit,
409                    // labeled opt-in (`allow_cpu_oracle_fallback`), never a
410                    // silent substitute for the resident GPU engine.
411                    if cfg.allow_cpu_oracle_fallback {
412                        return self.evaluate_cpu(cfg);
413                    }
414                    return Err(XlogError::Compilation(format!(
415                        "{message}; MC fail-closed: set \
416                         McEvalConfig::allow_cpu_oracle_fallback to explicitly \
417                         run the labeled CPU oracle instead"
418                    )));
419                }
420                Err(err) => return Err(err),
421            };
422
423        let mut host_counts = vec![0u32; device_result.query_counts.len()];
424        if !host_counts.is_empty() {
425            provider
426                .device()
427                .inner()
428                .dtoh_sync_copy_into(&device_result.query_counts, &mut host_counts)
429                .map_err(|e| {
430                    XlogError::Kernel(format!("Failed to download MC query counts: {}", e))
431                })?;
432        }
433
434        let mut host_evidence = [0u32];
435        provider
436            .device()
437            .inner()
438            .dtoh_sync_copy_into(&device_result.evidence_count, &mut host_evidence)
439            .map_err(|e| {
440                XlogError::Kernel(format!("Failed to download MC evidence count: {}", e))
441            })?;
442
443        let evidence_samples = if self.evidence.is_empty() {
444            cfg.samples
445        } else {
446            host_evidence[0] as usize
447        };
448
449        if device_result.sampling_method != McSamplingMethod::EvidenceClamping
450            && !self.evidence.is_empty()
451            && evidence_samples == 0
452        {
453            return Err(XlogError::Execution(format!(
454                "MC inference error: evidence was never satisfied across {} samples (seed={})",
455                cfg.samples, cfg.seed
456            )));
457        }
458
459        let z = results::normal_quantile(0.5 + cfg.confidence / 2.0);
460        let mut query_estimates: Vec<McQueryEstimate> = Vec::with_capacity(self.queries.len());
461        for (i, atom) in self.queries.iter().enumerate() {
462            let k = host_counts.get(i).copied().unwrap_or(0) as usize;
463            let (p, stderr, ci_low, ci_high) = results::binomial_estimate(k, evidence_samples, z);
464            let log_prob = if p == 0.0 { f64::NEG_INFINITY } else { p.ln() };
465
466            query_estimates.push(McQueryEstimate {
467                atom: atom.clone(),
468                prob: p,
469                log_prob,
470                stderr,
471                ci_low,
472                ci_high,
473            });
474        }
475
476        Ok(McResult {
477            total_samples: cfg.samples,
478            evidence_samples,
479            seed: cfg.seed,
480            confidence: cfg.confidence,
481            query_estimates,
482            nonmonotone_sccs: device_result.nonmonotone_sccs,
483            nonmonotone_cycles: device_result.nonmonotone_cycles,
484            engine: McEngine::GpuResident,
485            nonmonotone_iteration_limit_hits: device_result.nonmonotone_iteration_limit_hits,
486            sampling_method: device_result.sampling_method,
487        })
488    }
489
490    /// CPU **oracle / debug** MC path. Downloads the full sampled-bit matrix to
491    /// the host and evaluates every sampled world on a host relation store.
492    ///
493    /// This is intentionally *not* GPU-native: it performs a large DtoH of the
494    /// sample matrix and runs the deterministic core on the CPU. It exists solely
495    /// as a deterministic, seed-matched oracle for validating the GPU-native
496    /// device counts (the GPU sampler is shared, so for the same program/seed the
497    /// two paths see identical samples). It must **never** be used as zero-host /
498    /// GPU-native release evidence, and the acceptance matrix excludes it and the
499    /// tests that call it (`tests/gpu_mc_vs_cpu.rs`, `tests/mc.rs`).
500    #[cfg(feature = "host-io")]
501    pub fn evaluate_cpu(&self, cfg: McEvalConfig) -> Result<McResult> {
502        if cfg.samples == 0 {
503            return Err(XlogError::Execution(
504                "MC inference requires samples > 0".to_string(),
505            ));
506        }
507        if !(0.0 < cfg.confidence && cfg.confidence < 1.0) || cfg.confidence.is_nan() {
508            return Err(XlogError::Execution(format!(
509                "MC inference requires 0 < confidence < 1, got {}",
510                cfg.confidence
511            )));
512        }
513        if cfg.max_nonmonotone_iterations == 0 {
514            return Err(XlogError::Execution(
515                "MC inference requires max_nonmonotone_iterations > 0".to_string(),
516            ));
517        }
518
519        let (method, forcing) = self.resolve_sampling_method(cfg.sampling_method)?;
520        let is_clamped = method == McSamplingMethod::EvidenceClamping;
521
522        let mut n_evidence: usize = 0;
523        let mut n_query_true: Vec<usize> = vec![0; self.queries.len()];
524        let mut stats = EvalStats::default();
525
526        let num_vars = self.bernoulli_probs.len();
527        let samples_matrix: Vec<u8> = if num_vars == 0 {
528            Vec::new()
529        } else {
530            let total = num_vars
531                .checked_mul(cfg.samples)
532                .ok_or_else(|| XlogError::Execution("MC sample matrix overflow".to_string()))?;
533            let provider = Arc::new(self.provider()?);
534
535            // Allocate force arrays: upload actual forcing data in clamped mode, zero-fill otherwise
536            let mut d_force_mask = provider.memory().alloc::<u8>(num_vars.max(1))?;
537            let mut d_forced_value = provider.memory().alloc::<u8>(num_vars.max(1))?;
538            if is_clamped {
539                provider
540                    .htod_sync_copy_into_tracked(&forcing.force_mask, &mut d_force_mask)
541                    .map_err(|e| {
542                        XlogError::Kernel(format!("Failed to upload force_mask: {}", e))
543                    })?;
544                provider
545                    .htod_sync_copy_into_tracked(&forcing.forced_value, &mut d_forced_value)
546                    .map_err(|e| {
547                        XlogError::Kernel(format!("Failed to upload forced_value: {}", e))
548                    })?;
549            } else {
550                provider
551                    .device()
552                    .inner()
553                    .memset_zeros(&mut d_force_mask)
554                    .map_err(|e| XlogError::Kernel(format!("Failed to zero force_mask: {}", e)))?;
555                provider
556                    .device()
557                    .inner()
558                    .memset_zeros(&mut d_forced_value)
559                    .map_err(|e| {
560                        XlogError::Kernel(format!("Failed to zero forced_value: {}", e))
561                    })?;
562            }
563
564            let samples_device = provider.sample_bernoulli_matrix_device(
565                &self.bernoulli_probs,
566                cfg.samples,
567                cfg.seed,
568                &d_force_mask.slice(..),
569                &d_forced_value.slice(..),
570            )?;
571            let mut host = vec![0u8; total];
572            if !host.is_empty() {
573                provider
574                    .device()
575                    .inner()
576                    .dtoh_sync_copy_into(&samples_device, &mut host)
577                    .map_err(|e| {
578                        XlogError::Kernel(format!("Failed to download MC samples: {}", e))
579                    })?;
580            }
581            host
582        };
583
584        for sample_idx in 0..cfg.samples {
585            let sample_bits = if num_vars == 0 {
586                &[][..]
587            } else {
588                let start = sample_idx * num_vars;
589                let end = start + num_vars;
590                &samples_matrix[start..end]
591            };
592
593            let mut store = self.base_store.clone();
594            self.apply_sample_facts(&mut store, sample_bits)?;
595
596            let sample_stats = results::evaluate_program_inplace(
597                &self.scc_plans,
598                &mut store,
599                cfg.max_nonmonotone_iterations,
600            )?;
601            stats.nonmonotone_sccs += sample_stats.nonmonotone_sccs;
602            stats.nonmonotone_cycles += sample_stats.nonmonotone_cycles;
603            stats.nonmonotone_iteration_limit_hits += sample_stats.nonmonotone_iteration_limit_hits;
604
605            // In clamped mode, skip evidence check — all samples count
606            if !is_clamped && !results::evidence_satisfied(&store, &self.evidence) {
607                continue;
608            }
609
610            n_evidence += 1;
611            for (i, q) in self.queries.iter().enumerate() {
612                if results::atom_holds(&store, q) {
613                    n_query_true[i] += 1;
614                }
615            }
616        }
617
618        if !is_clamped && !self.evidence.is_empty() && n_evidence == 0 {
619            return Err(XlogError::Execution(format!(
620                "MC inference error: evidence was never satisfied across {} samples (seed={})",
621                cfg.samples, cfg.seed
622            )));
623        }
624
625        // If there is no evidence (or clamped mode), treat all samples as evidence-satisfying.
626        let denom = if self.evidence.is_empty() || is_clamped {
627            cfg.samples
628        } else {
629            n_evidence
630        };
631
632        let z = results::normal_quantile(0.5 + cfg.confidence / 2.0);
633
634        let mut query_estimates: Vec<McQueryEstimate> = Vec::with_capacity(self.queries.len());
635        for (i, atom) in self.queries.iter().enumerate() {
636            let k = n_query_true[i];
637            let (p, stderr, ci_low, ci_high) = results::binomial_estimate(k, denom, z);
638            let log_prob = if p == 0.0 { f64::NEG_INFINITY } else { p.ln() };
639
640            query_estimates.push(McQueryEstimate {
641                atom: atom.clone(),
642                prob: p,
643                log_prob,
644                stderr,
645                ci_low,
646                ci_high,
647            });
648        }
649
650        Ok(McResult {
651            total_samples: cfg.samples,
652            evidence_samples: denom,
653            seed: cfg.seed,
654            confidence: cfg.confidence,
655            query_estimates,
656            nonmonotone_sccs: stats.nonmonotone_sccs,
657            nonmonotone_cycles: stats.nonmonotone_cycles,
658            nonmonotone_iteration_limit_hits: stats.nonmonotone_iteration_limit_hits,
659            sampling_method: method,
660            engine: McEngine::CpuOracle,
661        })
662    }
663
664    /// Alias for [`Self::evaluate`]: GPU device evaluation followed by host-result
665    /// materialization (final-count download after the measured region). The
666    /// `_gpu` suffix denotes that the *compute* runs on the GPU — it does **not**
667    /// imply a zero-host result, since it returns a host [`McResult`]. For the
668    /// device-resident, no-host-download API use [`Self::evaluate_gpu_device`].
669    #[cfg(feature = "host-io")]
670    pub fn evaluate_gpu(&self, cfg: McEvalConfig) -> Result<McResult> {
671        self.evaluate(cfg)
672    }
673
674    /// GPU-native device-resident MC evaluation via the resident megakernel
675    /// engine ([`resident`]). Returns [`McDeviceResult`] with counts left on the
676    /// device (no host download). The engine evaluates all worlds in a single
677    /// launch with **no host interaction in the measured region** (no host
678    /// sample loop, no per-sample/per-operator host launches or allocations, no
679    /// tracked transfers, and no untracked metadata reads); see
680    /// [`McResidentResult::no_host`] / [`McNoHostStats::is_no_host`] for the full
681    /// measured contract.
682    pub fn evaluate_gpu_device(&self, cfg: McEvalConfig) -> Result<McDeviceResult> {
683        let provider = Arc::new(self.provider()?);
684        self.evaluate_gpu_device_with_provider(cfg, provider)
685    }
686
687    /// GPU-native device-resident MC evaluation using a caller-supplied provider
688    /// (enables provider/buffer reuse across calls). Static setup (arena
689    /// allocation, plan upload, sampling) happens before the measured region;
690    /// the measured region is a single resident-engine launch with **zero host
691    /// interaction** — no host loop over samples or fixpoint iterations, no
692    /// per-sample/per-operator host launches or device allocations, no tracked
693    /// HtoD/DtoH transfers, and no untracked metadata reads. The full contract is
694    /// measured by [`McNoHostStats`] (`McResidentResult::no_host`) and gated by
695    /// `tests/mc_resident.rs`. Counts remain device-resident; the caller decides
696    /// whether/when to download them.
697    pub fn evaluate_gpu_device_with_provider(
698        &self,
699        cfg: McEvalConfig,
700        provider: Arc<CudaKernelProvider>,
701    ) -> Result<McDeviceResult> {
702        // The GPU-resident megakernel engine is the sole MC execution path: there
703        // is no host-orchestrated per-sample fallback. It evaluates ALL worlds in
704        // a single launch with zero host interaction in the measured region, then
705        // returns device-resident counts. Programs outside the supported bounded
706        // fragment fail closed (typed `ResidentRejection`).
707        let r = self.evaluate_resident_with_provider(cfg, provider)?;
708        let no_host = r.no_host;
709        Ok(McDeviceResult {
710            query_counts: r.query_counts,
711            evidence_count: r.evidence_count,
712            total_samples: r.total_samples,
713            seed: r.seed,
714            confidence: r.confidence,
715            // The resident engine accepts only bounded positive Datalog and
716            // device-evaluates its fixpoint; legacy non-monotone SCC bookkeeping
717            // is not part of this engine's reported state.
718            nonmonotone_sccs: 0,
719            nonmonotone_cycles: 0,
720            nonmonotone_iteration_limit_hits: 0,
721            sampling_method: r.sampling_method,
722            // Back-compat surface: `hot_loop_transfers` reports the resident
723            // engine's tracked transfers (zero). Richer no-host evidence (untracked
724            // reads, fixpoint/alloc counts) lives in `McResidentResult::no_host`.
725            hot_loop_transfers: McHotLoopTransfers {
726                htod_calls: no_host.tracked_htod_calls,
727                dtoh_calls: no_host.tracked_dtoh_calls,
728                htod_bytes: 0,
729                dtoh_bytes: 0,
730            },
731            no_host,
732        })
733    }
734
735    fn compile_program(program: &Program) -> Result<Self> {
736        let mut queries: Vec<GroundAtom> = Vec::new();
737        for ProbQuery { atom } in &program.prob_queries {
738            queries.push(atom_key_from_ground_atom(atom)?);
739        }
740
741        let mut evidence: Vec<(GroundAtom, bool)> = Vec::new();
742        for Evidence { atom, value } in &program.evidence {
743            evidence.push((atom_key_from_ground_atom(atom)?, *value));
744        }
745
746        let mut prob_facts = program.prob_facts.clone();
747        buffers::extend_prob_facts_with_coin(program, &mut prob_facts)?;
748        let (bernoulli_probs, prob_facts, annotated_disjunctions) =
749            buffers::compile_sampling_plan(&prob_facts, &program.annotated_disjunctions)?;
750
751        #[cfg(feature = "host-io")]
752        let mut base_store: HashMap<String, Relation> = HashMap::new();
753
754        // Deterministic facts.
755        #[cfg(feature = "host-io")]
756        {
757            for fact in program.facts() {
758                let atom = atom_key_from_ground_atom(&fact.head)?;
759                base_store
760                    .entry(atom.predicate.clone())
761                    .or_default()
762                    .insert_tuple(atom.args);
763            }
764        }
765
766        // Ensure relations exist for all referenced predicates so evaluation treats missing as empty,
767        // but never errors due to an unknown predicate (CPU eval path only).
768        #[cfg(feature = "host-io")]
769        {
770            let mut referenced: HashSet<String> = HashSet::new();
771            for rule in &program.rules {
772                referenced.insert(rule.head.predicate.clone());
773                for lit in &rule.body {
774                    match lit {
775                        BodyLiteral::Positive(a) | BodyLiteral::Negated(a) => {
776                            referenced.insert(a.predicate.clone());
777                        }
778                        BodyLiteral::Epistemic(lit) => {
779                            referenced.insert(lit.atom.predicate.clone());
780                        }
781                        BodyLiteral::Comparison(_)
782                        | BodyLiteral::IsExpr(_)
783                        | BodyLiteral::Univ(_) => {}
784                    }
785                }
786            }
787            for pf in &program.prob_facts {
788                referenced.insert(pf.atom.predicate.clone());
789            }
790            for ad in &program.annotated_disjunctions {
791                for pf in &ad.choices {
792                    referenced.insert(pf.atom.predicate.clone());
793                }
794            }
795            for q in &queries {
796                referenced.insert(q.predicate.clone());
797            }
798            for (e, _) in &evidence {
799                referenced.insert(e.predicate.clone());
800            }
801            for pred in referenced {
802                base_store.entry(pred).or_default();
803            }
804        }
805
806        #[cfg(feature = "host-io")]
807        let scc_plans = results::build_scc_plans(program)?;
808
809        Ok(Self {
810            gpu_config: GpuConfig::default(),
811            program: program.clone(),
812            #[cfg(feature = "host-io")]
813            base_store,
814            #[cfg(feature = "host-io")]
815            scc_plans,
816            queries,
817            evidence,
818            bernoulli_probs,
819            prob_facts,
820            annotated_disjunctions,
821        })
822    }
823
824    fn provider(&self) -> Result<CudaKernelProvider> {
825        if self.gpu_config.memory_bytes == 0 {
826            return Err(XlogError::Kernel(
827                "GPU memory budget must be non-zero".to_string(),
828            ));
829        }
830
831        let device = Arc::new(CudaDevice::new(self.gpu_config.device_ordinal)?);
832        let memory = Arc::new(GpuMemoryManager::new(
833            device.clone(),
834            MemoryBudget::with_limit(self.gpu_config.memory_bytes),
835        ));
836        CudaKernelProvider::new(device, memory)
837    }
838}