1use std::sync::Arc;
8use std::time::Instant;
9
10use cudarc::driver::DeviceSlice;
11use xlog_core::{Result, XlogError};
12use xlog_cuda::memory::TrackedCudaSlice;
13use xlog_cuda::CudaKernelProvider;
14use xlog_solve::{GpuCdclConfig, GpuCnf};
15
16use crate::compilation::gpu_cache::{GpuCircuitCache, GpuCircuitCacheHandle};
17use crate::gpu::{GpuCircuitBuilder, GpuCircuitLayout, GpuXgcf};
18
19pub mod disk_cache;
20pub mod gpu_cache;
21pub mod gpu_cnf;
22pub mod gpu_d4;
23pub mod gpu_pir;
24pub mod gpu_pir_intern;
25pub mod gpu_weights;
26pub mod sparse_matrix;
27pub mod validation;
28
29pub use gpu_cnf::{encode_cnf_gpu, GpuCnfEncoding, GpuCnfVarTables};
30pub use gpu_d4::GpuCompileConfig;
31pub use gpu_pir::{GpuPirGraph, GpuPirRoots, PIR_AND, PIR_LIT, PIR_NEG_LIT, PIR_OR};
32pub use gpu_pir_intern::{GpuPirInterner, PirBatch};
35pub use gpu_weights::GpuWeights;
36pub use gpu_weights::{
37 apply_query_vars_device, build_evidence_by_var_gpu, build_weights_gpu, map_nodes_to_vars_gpu,
38 restore_query_vars_device,
39};
40pub use validation::{
42 build_equivalence_queries_gpu, validate_equivalence_gpu, validate_equivalence_gpu_gated,
43 GpuEquivalenceConfig, GpuEquivalenceQueries,
44};
45#[derive(Debug, Clone, Default)]
50pub struct CircuitCompileProfile {
51 pub cnf_hash_sec: f64,
52 pub d4_compile_sec: f64,
53 pub verify_sec: f64,
54 pub smooth_sec: f64,
55 pub cache_store_sec: f64,
56 pub free_var_mask_sec: f64,
57 pub gpu_cache_hit: bool,
58 pub disk_cache_hit: bool,
59 pub frontier_items: u32,
62}
63
64pub(crate) fn warmup_profiling_enabled() -> bool {
65 std::env::var("XLOG_WARMUP_PROFILE")
66 .map(|v| v == "1")
67 .unwrap_or(false)
68}
69
70pub struct DeviceRandomVarList {
72 list: TrackedCudaSlice<u32>,
73 count: u32,
74}
75
76impl DeviceRandomVarList {
77 pub fn from_device(list: TrackedCudaSlice<u32>, count: u32) -> Result<Self> {
78 let len = u32::try_from(list.len()).map_err(|_| {
79 XlogError::Compilation("DeviceRandomVarList: list length exceeds u32".to_string())
80 })?;
81 if count > len {
82 return Err(XlogError::Compilation(format!(
83 "DeviceRandomVarList: count {} exceeds list len {}",
84 count, len
85 )));
86 }
87 Ok(Self { list, count })
88 }
89
90 pub fn from_host(provider: &CudaKernelProvider, host: &[u32]) -> Result<Self> {
91 let memory = provider.memory();
92 let mut list = memory.alloc::<u32>(host.len())?;
93 if !host.is_empty() {
94 provider
95 .htod_sync_copy_into_tracked(host, &mut list)
96 .map_err(|e| {
97 XlogError::Kernel(format!("DeviceRandomVarList upload failed: {}", e))
98 })?;
99 }
100 let count = u32::try_from(host.len()).map_err(|_| {
101 XlogError::Compilation("DeviceRandomVarList: host len exceeds u32".to_string())
102 })?;
103 Ok(Self { list, count })
104 }
105
106 pub fn is_empty(&self) -> bool {
107 self.count == 0
108 }
109
110 pub fn count(&self) -> u32 {
111 self.count
112 }
113
114 pub fn list(&self) -> &TrackedCudaSlice<u32> {
115 &self.list
116 }
117}
118
119fn upload_disk_artifact_for_verification(
120 artifact: &disk_cache::CircuitArtifact,
121 provider: &Arc<CudaKernelProvider>,
122) -> Result<GpuXgcf> {
123 let memory = provider.memory().clone();
124
125 macro_rules! upload {
126 ($field:expr, $ty:ty, $name:literal) => {{
127 let mut device = memory.alloc::<$ty>($field.len())?;
128 provider
129 .htod_sync_copy_into_tracked($field, &mut device)
130 .map_err(|e| {
131 XlogError::Kernel(format!("disk cache verify upload {} failed: {}", $name, e))
132 })?;
133 device
134 }};
135 }
136
137 let builder = GpuCircuitBuilder {
138 node_type: upload!(&artifact.node_type, u8, "node_type"),
139 child_offsets: upload!(&artifact.child_offsets, u32, "child_offsets"),
140 child_indices: upload!(&artifact.child_indices, u32, "child_indices"),
141 lit: upload!(&artifact.lit, i32, "lit"),
142 decision_var: upload!(&artifact.decision_var, u32, "decision_var"),
143 decision_child_false: upload!(&artifact.decision_child_false, u32, "decision_child_false"),
144 decision_child_true: upload!(&artifact.decision_child_true, u32, "decision_child_true"),
145 };
146 let layout = GpuCircuitLayout {
147 num_nodes: artifact.num_nodes,
148 num_edges: artifact.num_edges,
149 num_levels: artifact.num_levels,
150 level_offsets: upload!(&artifact.level_offsets, u32, "level_offsets"),
151 level_nodes: upload!(&artifact.level_nodes, u32, "level_nodes"),
152 root: artifact.root,
153 max_var: artifact.max_var,
154 num_nodes_device: None,
155 num_edges_device: None,
156 };
157
158 GpuXgcf::from_device(builder, layout, provider)
159}
160
161pub fn compile_gpu_d4_and_verify(
163 cnf: &GpuCnf,
164 decision_var_limit: &TrackedCudaSlice<u32>,
165 provider: &Arc<CudaKernelProvider>,
166 config: &GpuCompileConfig,
167) -> Result<GpuXgcf> {
168 if config.cdcl_conflict_budget.is_some() {
169 return Err(XlogError::Compilation(
170 "cdcl_conflict_budget is not supported by the GPU CDCL verifier".to_string(),
171 ));
172 }
173 validation::check_verify_size_bound(cnf, "compile_gpu_d4_and_verify")?;
176 let circuit = gpu_d4::compile_gpu_d4(cnf, provider, config)?;
177 let cdcl = cdcl_config_from_compile(config)?;
178 validate_equivalence_gpu(
179 cnf,
180 decision_var_limit,
181 &circuit,
182 provider,
183 GpuEquivalenceConfig {
184 cdcl,
185 reuse_workspace: config.incremental_verify,
186 },
187 )?;
188 Ok(circuit)
189}
190
191pub fn compile_gpu_d4_and_verify_cached(
197 cnf: &GpuCnf,
198 decision_var_limit: &TrackedCudaSlice<u32>,
199 provider: &Arc<CudaKernelProvider>,
200 config: &GpuCompileConfig,
201 cache: &mut GpuCircuitCache,
202 random_vars: &DeviceRandomVarList,
203 canonical_cnf_hash: Option<u64>,
204) -> Result<(GpuCircuitCacheHandle, Option<CircuitCompileProfile>)> {
205 if config.cdcl_conflict_budget.is_some() {
206 return Err(XlogError::Compilation(
207 "cdcl_conflict_budget is not supported by the GPU CDCL verifier".to_string(),
208 ));
209 }
210 validation::check_verify_size_bound(cnf, "compile_gpu_d4_and_verify_cached")?;
213
214 let profiling = warmup_profiling_enabled();
215 let mut profile = CircuitCompileProfile::default();
216
217 #[cfg(debug_assertions)]
219 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: hash_cnf_gpu");
220 let t_hash = if profiling {
221 Some(Instant::now())
222 } else {
223 None
224 };
225 let key = gpu_cache::hash_cnf_gpu(cnf, provider)?;
226 if let Some(t0) = t_hash {
227 provider
228 .device()
229 .synchronize()
230 .map_err(|e| XlogError::Kernel(format!("sync after hash_cnf_gpu: {}", e)))?;
231 profile.cnf_hash_sec = t0.elapsed().as_secs_f64();
232 }
233 #[cfg(debug_assertions)]
234 {
235 if !profiling {
236 provider
237 .device()
238 .synchronize()
239 .map_err(|e| XlogError::Kernel(format!("sync after hash_cnf_gpu failed: {}", e)))?;
240 }
241 }
242 #[cfg(debug_assertions)]
243 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: lookup_or_insert_device");
244 let lookup = cache.lookup_or_insert_device(&key)?;
245 let mut handle = lookup.into_handle()?;
246
247 let compile_needed_host: Vec<u32> = provider
254 .device()
255 .inner()
256 .dtoh_sync_copy(handle.compile_needed_device())
257 .map_err(|e| XlogError::Kernel(format!("dtoh compile_needed: {}", e)))?;
258 let compile_needed = compile_needed_host[0];
259
260 if compile_needed == 0 {
262 profile.gpu_cache_hit = true;
263 let out_profile = if profiling { Some(profile) } else { None };
264 return Ok((handle, out_profile));
265 }
266
267 let cache_key = if compile_needed == 1 {
271 if let Some(canonical_hash) = canonical_cnf_hash {
272 let config_hash = hash_compile_config(config);
273 let random_vars_hash = hash_random_vars(random_vars, provider)?;
274 let gpu_cnf_hash = read_gpu_cnf_hash(&key, provider)?;
275 let cnf_hash = combine_disk_cnf_hash(canonical_hash, gpu_cnf_hash);
276 let sm = detect_compute_capability(provider)?;
277 Some(disk_cache::CircuitCacheKey {
278 cnf_hash,
279 config_hash,
280 random_vars_hash,
281 sm,
282 })
283 } else {
284 None
285 }
286 } else {
287 None
288 };
289
290 if let Some(ref disk_key) = cache_key {
292 #[cfg(debug_assertions)]
293 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: checking disk cache");
294 if let Ok(Some(artifact)) = disk_cache::read_artifact(disk_key) {
295 #[cfg(debug_assertions)]
296 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: disk cache hit");
297 if artifact.max_var > cnf.var_cap {
305 #[cfg(debug_assertions)]
306 eprintln!(
307 "[xlog-prob] compile_gpu_d4_and_verify_cached: stale disk artifact \
308 (max_var {} > cnf var_cap {}), evicting",
309 artifact.max_var, cnf.var_cap
310 );
311 disk_cache::evict_artifact(disk_key);
312 } else {
313 let circuit = upload_disk_artifact_for_verification(&artifact, provider)?;
314 let verifier_decision_var_limit = if random_vars.is_empty() {
315 &cnf.num_vars
316 } else {
317 decision_var_limit
318 };
319 let cdcl = cdcl_config_from_compile(config)?;
320 let t_verify = if profiling {
321 Some(Instant::now())
322 } else {
323 None
324 };
325 match validate_equivalence_gpu_gated(
326 cnf,
327 verifier_decision_var_limit,
328 &circuit,
329 provider,
330 GpuEquivalenceConfig {
331 cdcl,
332 reuse_workspace: config.incremental_verify,
333 },
334 handle.compile_needed_device(),
335 ) {
336 Ok(()) => {
337 if let Some(t0) = t_verify {
338 provider.device().synchronize().map_err(|e| {
339 XlogError::Kernel(format!("sync after disk cache verify: {}", e))
340 })?;
341 profile.verify_sec = t0.elapsed().as_secs_f64();
342 }
343 cache.restore_from_host_arrays(&mut handle, &artifact)?;
344 provider.device().synchronize().map_err(|e| {
345 XlogError::Kernel(format!("sync after disk cache restore: {}", e))
346 })?;
347 profile.disk_cache_hit = true;
348 let out_profile = if profiling { Some(profile) } else { None };
349 return Ok((handle, out_profile));
350 }
351 Err(_stale) => {
352 #[cfg(debug_assertions)]
356 eprintln!(
357 "[xlog-prob] compile_gpu_d4_and_verify_cached: cached circuit \
358 failed equivalence against current CNF ({_stale}), evicting"
359 );
360 disk_cache::evict_artifact(disk_key);
361 }
362 }
363 }
364 }
365 #[cfg(debug_assertions)]
366 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: disk cache miss");
367 }
368
369 let d4_config = d4_config_for_smoothing(config, random_vars.count())?;
370
371 #[cfg(debug_assertions)]
373 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: compile_gpu_d4_gated");
374 let t_d4 = if profiling {
375 Some(Instant::now())
376 } else {
377 None
378 };
379 let (circuit_base, frontier_items) = gpu_d4::compile_gpu_d4_gated_with_stats(
380 cnf,
381 provider,
382 &d4_config,
383 handle.compile_needed_device(),
384 )?;
385 profile.frontier_items = frontier_items;
386 if let Some(t0) = t_d4 {
387 provider
388 .device()
389 .synchronize()
390 .map_err(|e| XlogError::Kernel(format!("sync after d4 compile: {}", e)))?;
391 profile.d4_compile_sec = t0.elapsed().as_secs_f64();
392 }
393 #[cfg(debug_assertions)]
394 {
395 if !profiling {
396 provider.device().synchronize().map_err(|e| {
397 XlogError::Kernel(format!("sync after compile_gpu_d4_gated failed: {}", e))
398 })?;
399 }
400 }
401 if circuit_base.num_nodes() == 0 || circuit_base.num_levels() == 0 {
402 let out_profile = if profiling { Some(profile) } else { None };
406 return Ok((handle, out_profile));
407 }
408
409 let verifier_decision_var_limit = if random_vars.is_empty() {
418 &cnf.num_vars
419 } else {
420 decision_var_limit
421 };
422 let cdcl = cdcl_config_from_compile(config)?;
423 #[cfg(debug_assertions)]
424 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: validate_equivalence_gpu_gated");
425 let t_verify = if profiling {
426 Some(Instant::now())
427 } else {
428 None
429 };
430 validate_equivalence_gpu_gated(
431 cnf,
432 verifier_decision_var_limit,
433 &circuit_base,
434 provider,
435 GpuEquivalenceConfig {
436 cdcl,
437 reuse_workspace: config.incremental_verify,
438 },
439 handle.compile_needed_device(),
440 )?;
441 if let Some(t0) = t_verify {
442 provider
443 .device()
444 .synchronize()
445 .map_err(|e| XlogError::Kernel(format!("sync after verify: {}", e)))?;
446 profile.verify_sec = t0.elapsed().as_secs_f64();
447 }
448 #[cfg(debug_assertions)]
449 {
450 if !profiling {
451 provider.device().synchronize().map_err(|e| {
452 XlogError::Kernel(format!(
453 "sync after validate_equivalence_gpu_gated failed: {}",
454 e
455 ))
456 })?;
457 }
458 }
459
460 let t_smooth = if profiling {
465 Some(Instant::now())
466 } else {
467 None
468 };
469 let circuit_eval = if random_vars.is_empty() {
470 circuit_base
471 } else {
472 #[cfg(debug_assertions)]
473 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: smooth_random_vars_device");
474 let smoothed = circuit_base.smooth_random_vars_device(
475 provider,
476 random_vars.list(),
477 random_vars.count(),
478 config.smooth_node_cap,
479 config.smooth_edge_cap,
480 )?;
481 #[cfg(debug_assertions)]
482 {
483 if !profiling {
484 provider.device().synchronize().map_err(|e| {
485 XlogError::Kernel(format!(
486 "sync after smooth_random_vars_device failed: {}",
487 e
488 ))
489 })?;
490 }
491 }
492 smoothed
493 };
494 if let Some(t0) = t_smooth {
495 provider
496 .device()
497 .synchronize()
498 .map_err(|e| XlogError::Kernel(format!("sync after smooth: {}", e)))?;
499 profile.smooth_sec = t0.elapsed().as_secs_f64();
500 }
501
502 #[cfg(debug_assertions)]
504 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: store_from_xgcf");
505 let t_store = if profiling {
506 Some(Instant::now())
507 } else {
508 None
509 };
510 cache.store_from_xgcf(&mut handle, &circuit_eval)?;
511 if let Some(t0) = t_store {
512 provider
513 .device()
514 .synchronize()
515 .map_err(|e| XlogError::Kernel(format!("sync after cache store: {}", e)))?;
516 profile.cache_store_sec = t0.elapsed().as_secs_f64();
517 }
518 #[cfg(debug_assertions)]
519 {
520 if !profiling {
521 provider.device().synchronize().map_err(|e| {
522 XlogError::Kernel(format!("sync after store_from_xgcf failed: {}", e))
523 })?;
524 }
525 }
526
527 #[cfg(debug_assertions)]
529 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: compute_free_var_mask_gpu_gated");
530 let t_fvm = if profiling {
531 Some(Instant::now())
532 } else {
533 None
534 };
535 let free_var_mask = gpu_d4::compute_free_var_mask_gpu_gated(
536 cnf,
537 &circuit_eval,
538 provider,
539 handle.compile_needed_device(),
540 )?;
541 #[cfg(debug_assertions)]
542 {
543 if !profiling {
544 provider.device().synchronize().map_err(|e| {
545 XlogError::Kernel(format!(
546 "sync after compute_free_var_mask_gpu_gated failed: {}",
547 e
548 ))
549 })?;
550 }
551 }
552 let mask_host: Vec<u8> = provider
557 .device()
558 .inner()
559 .dtoh_sync_copy(&free_var_mask)
560 .map_err(|e| XlogError::Kernel(format!("Failed to read free_var_mask: {}", e)))?;
561 let has_free_vars = mask_host.iter().any(|&b| b != 0);
562 #[cfg(debug_assertions)]
563 eprintln!(
564 "[xlog-prob] free_var_mask: {} free vars, batched eval {}",
565 if has_free_vars { "has" } else { "no" },
566 if has_free_vars { "DISABLED" } else { "ENABLED" },
567 );
568 if has_free_vars {
569 cache.store_free_var_mask(&handle, &free_var_mask)?;
570 #[cfg(debug_assertions)]
571 {
572 if !profiling {
573 provider.device().synchronize().map_err(|e| {
574 XlogError::Kernel(format!("sync after store_free_var_mask failed: {}", e))
575 })?;
576 }
577 }
578 }
579 if let Some(t0) = t_fvm {
580 provider
581 .device()
582 .synchronize()
583 .map_err(|e| XlogError::Kernel(format!("sync after free_var_mask: {}", e)))?;
584 profile.free_var_mask_sec = t0.elapsed().as_secs_f64();
585 }
586
587 if let Some(ref disk_key) = cache_key {
592 if let Ok(artifact) = cache.build_artifact_from_device(&handle, provider) {
593 let _ = disk_cache::write_artifact(disk_key, &artifact);
594 #[cfg(debug_assertions)]
595 eprintln!("[xlog-prob] compile_gpu_d4_and_verify_cached: wrote disk cache artifact");
596 }
597 }
598
599 let out_profile = if profiling { Some(profile) } else { None };
600 Ok((handle, out_profile))
601}
602
603fn d4_config_for_smoothing(
604 config: &GpuCompileConfig,
605 random_var_count: u32,
606) -> Result<GpuCompileConfig> {
607 if random_var_count == 0 {
608 return Ok(*config);
609 }
610 let headroom = 2u32
611 .checked_add(random_var_count)
612 .ok_or_else(|| XlogError::Compilation("smooth headroom overflow".to_string()))?;
613 if config.smooth_node_cap <= headroom {
614 return Err(XlogError::Compilation(format!(
615 "GpuCompileConfig smooth_node_cap {} too small for smoothing headroom {}",
616 config.smooth_node_cap, headroom
617 )));
618 }
619 let base_cap = config
620 .smooth_node_cap
621 .checked_sub(headroom)
622 .ok_or_else(|| XlogError::Compilation("smooth node cap underflow".to_string()))?;
623 if base_cap < 3 {
624 return Err(XlogError::Compilation(
625 "GpuCompileConfig smooth_node_cap leaves <3 base nodes".to_string(),
626 ));
627 }
628 let mut out = *config;
629 out.smooth_node_cap = base_cap;
630 Ok(out)
631}
632
633fn cdcl_config_from_compile(config: &GpuCompileConfig) -> Result<GpuCdclConfig> {
634 if config.cdcl_restart_interval == 0 {
635 return Err(XlogError::Compilation(
636 "cdcl_restart_interval must be > 0".to_string(),
637 ));
638 }
639 if config.cdcl_learned_bytes == 0 {
640 return Err(XlogError::Compilation(
641 "cdcl_learned_bytes must be > 0".to_string(),
642 ));
643 }
644
645 const AVG_LEN: u64 = 4;
647 const META_BYTES_PER_CLAUSE: u64 = 24; const PROOF_BYTES_PER_CLAUSE: u64 = 8 + (8 * AVG_LEN); const LIT_BYTES_PER_CLAUSE: u64 = 4 * AVG_LEN;
650
651 let bytes_per_clause = META_BYTES_PER_CLAUSE
652 .checked_add(PROOF_BYTES_PER_CLAUSE)
653 .and_then(|v| v.checked_add(LIT_BYTES_PER_CLAUSE))
654 .ok_or_else(|| XlogError::Compilation("cdcl bytes per clause overflow".to_string()))?;
655
656 let max_clauses = config
657 .cdcl_learned_bytes
658 .checked_div(bytes_per_clause)
659 .ok_or_else(|| XlogError::Compilation("cdcl_learned_bytes div overflow".to_string()))?;
660 if max_clauses == 0 {
661 return Err(XlogError::Compilation(
662 "cdcl_learned_bytes too small for learned clause arena".to_string(),
663 ));
664 }
665
666 let max_lits = max_clauses
667 .checked_mul(AVG_LEN)
668 .ok_or_else(|| XlogError::Compilation("max_learned_lits overflow".to_string()))?;
669 let max_proof_u32 = max_clauses
670 .checked_mul(2 + 2 * AVG_LEN)
671 .ok_or_else(|| XlogError::Compilation("max_proof_u32 overflow".to_string()))?;
672
673 let max_learned_clauses = u32::try_from(max_clauses)
674 .map_err(|_| XlogError::Compilation("max_learned_clauses exceeds u32::MAX".to_string()))?;
675 let max_learned_lits = u32::try_from(max_lits)
676 .map_err(|_| XlogError::Compilation("max_learned_lits exceeds u32::MAX".to_string()))?;
677 let max_proof_u32 = u32::try_from(max_proof_u32)
678 .map_err(|_| XlogError::Compilation("max_proof_u32 exceeds u32::MAX".to_string()))?;
679
680 let reduce_interval = config
681 .cdcl_restart_interval
682 .checked_mul(20)
683 .ok_or_else(|| XlogError::Compilation("cdcl reduce_interval overflow".to_string()))?;
684
685 let mut gpu_cdcl = GpuCdclConfig::default();
686 gpu_cdcl.max_learned_clauses = max_learned_clauses;
687 gpu_cdcl.max_learned_lits = max_learned_lits;
688 gpu_cdcl.max_proof_u32 = max_proof_u32;
689 gpu_cdcl.restart_base = config.cdcl_restart_interval;
690 gpu_cdcl.reduce_interval = reduce_interval;
691 gpu_cdcl.max_conflicts = verify_max_conflicts();
698 Ok(gpu_cdcl)
699}
700
701fn verify_max_conflicts() -> u32 {
704 static BUDGET: std::sync::OnceLock<u32> = std::sync::OnceLock::new();
705 *BUDGET.get_or_init(|| {
706 std::env::var("XLOG_D4_VERIFY_MAX_CONFLICTS")
707 .ok()
708 .and_then(|v| v.trim().parse::<u32>().ok())
709 .unwrap_or(0)
710 })
711}
712
713fn fnv1a_u64(bytes: &[u8]) -> u64 {
720 const FNV_OFFSET: u64 = 0xcbf29ce484222325;
721 const FNV_PRIME: u64 = 0x100000001b3;
722 let mut h = FNV_OFFSET;
723 for &b in bytes {
724 h ^= b as u64;
725 h = h.wrapping_mul(FNV_PRIME);
726 }
727 h
728}
729
730fn combine_disk_cnf_hash(canonical_hash: u64, gpu_cnf_hash: u64) -> u64 {
731 let mut buf = Vec::with_capacity(16);
732 buf.extend_from_slice(&canonical_hash.to_le_bytes());
733 buf.extend_from_slice(&gpu_cnf_hash.to_le_bytes());
734 fnv1a_u64(&buf)
735}
736
737fn read_gpu_cnf_hash(
738 hash: &TrackedCudaSlice<u64>,
739 provider: &Arc<CudaKernelProvider>,
740) -> Result<u64> {
741 let host: Vec<u64> = provider
742 .device()
743 .inner()
744 .dtoh_sync_copy(hash)
745 .map_err(|e| XlogError::Kernel(format!("dtoh GPU CNF hash for disk cache key: {}", e)))?;
746 host.first()
747 .copied()
748 .ok_or_else(|| XlogError::Kernel("empty GPU CNF hash for disk cache key".to_string()))
749}
750
751fn hash_compile_config(config: &GpuCompileConfig) -> u64 {
753 let mut buf = Vec::new();
754 buf.extend_from_slice(&config.frontier_depth.to_le_bytes());
755 buf.extend_from_slice(&config.max_frontier_items.to_le_bytes());
756 buf.extend_from_slice(&config.max_depth.to_le_bytes());
757 buf.extend_from_slice(&config.smooth_node_cap.to_le_bytes());
758 buf.extend_from_slice(&config.smooth_edge_cap.to_le_bytes());
759 buf.extend_from_slice(&config.cdcl_restart_interval.to_le_bytes());
762 buf.extend_from_slice(&config.cdcl_learned_bytes.to_le_bytes());
763 fnv1a_u64(&buf)
764}
765
766fn hash_random_vars(
768 random_vars: &DeviceRandomVarList,
769 provider: &Arc<CudaKernelProvider>,
770) -> Result<u64> {
771 let count = random_vars.count();
772 let mut buf = Vec::new();
773 buf.extend_from_slice(&count.to_le_bytes());
774 if count > 0 {
775 let host: Vec<u32> = provider
776 .device()
777 .inner()
778 .dtoh_sync_copy(random_vars.list())
779 .map_err(|e| {
780 XlogError::Kernel(format!("dtoh random_vars for disk cache hash: {}", e))
781 })?;
782 for &v in &host[..count as usize] {
784 buf.extend_from_slice(&v.to_le_bytes());
785 }
786 }
787 Ok(fnv1a_u64(&buf))
788}
789
790fn detect_compute_capability(provider: &Arc<CudaKernelProvider>) -> Result<u32> {
792 use cudarc::driver::sys::CUdevice_attribute;
793
794 let device = provider.device().inner();
795 let major = device
796 .attribute(CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
797 .map_err(|e| {
798 XlogError::Kernel(format!("Failed to query compute capability major: {}", e))
799 })?;
800 let minor = device
801 .attribute(CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
802 .map_err(|e| {
803 XlogError::Kernel(format!("Failed to query compute capability minor: {}", e))
804 })?;
805 let major_u32: u32 = major.try_into().map_err(|_| {
806 XlogError::Kernel(format!(
807 "compute capability major {} cannot be converted to u32",
808 major
809 ))
810 })?;
811 let minor_u32: u32 = minor.try_into().map_err(|_| {
812 XlogError::Kernel(format!(
813 "compute capability minor {} cannot be converted to u32",
814 minor
815 ))
816 })?;
817 Ok(major_u32 * 10 + minor_u32)
818}