xlog_cuda/device_runtime/
mod.rs

1//! Stream-ordered device memory runtime, RMM-inspired.
2//!
3//! v0.6 architecture work. Replaces the per-`CudaKernelProvider`
4//! `GpuMemoryManager` model (which cannot enforce a real per-device
5//! budget across parallel tests, Python users, or multiple executors
6//! on a single physical GPU) with a per-CUDA-ordinal singleton
7//! [`XlogDeviceRuntime`] composed of swappable
8//! [`DeviceMemoryResource`] adaptors:
9//!
10//! ```text
11//! XlogDeviceRuntime per CUDA ordinal
12//!   -> StreamPool of non-blocking streams
13//!   -> GlobalDeviceBudget per physical GPU
14//!   -> Logging / Debug adaptor (optional)
15//!   -> AsyncCudaResource (production) | DirectCudaResource (sanitizer/cert)
16//! ```
17//!
18//! Required resources:
19//!   * [`DirectCudaResource`] — cudarc default (non-pooled) allocation
20//!     backend (`CudaDeviceInner::alloc::<u8>` / drop, which on
21//!     async-alloc hosts forwards to `cuMemAllocAsync`). Candidate
22//!     for the sanitizer/cert role because there is no `xlog`-level
23//!     pool suballocation hiding out-of-bounds access from Compute
24//!     Sanitizer; the sanitizer-visibility property itself is
25//!     **unproven** until the manual Compute Sanitizer acceptance
26//!     gate runs on a supported host. A genuine raw-driver
27//!     `cuMemAlloc`/`cuMemFree` backend is a separate future commit.
28//!   * [`AsyncCudaResource`] — `cuMemAllocAsync`/`cuMemFreeAsync`
29//!     bound to a caller-supplied stream via the stream pool;
30//!     production default when supported.
31//!   * `PoolResource` — performance tier, not yet implemented; gated
32//!     behind correctness certification of the direct/async backends.
33//!   * `DebugGuardResource` — optional canary/poison/quarantine layer.
34//!   * [`LoggingResource`] — CSV allocation log: thread, time, action,
35//!     ptr, bytes, stream, device, tag, query id.
36//!
37//! Stream-ordered contract: every alloc / dealloc names a stream;
38//! reuse across streams requires explicit event/sync. No reliance on
39//! the CUDA legacy null/default stream. Mirrors RMM's stream-ordered
40//! rule — see <https://github.com/rapidsai/RMM>.
41//!
42//! v0.5.5 closed at PRs #49 / #50 / #52 (metadata-read state for
43//! binary-join output counts). The fully GPU-resident binary-join
44//! materialization rebase is gated on this allocator landing first.
45
46pub mod async_resource;
47pub mod budget;
48pub mod direct;
49pub mod logging;
50pub mod resource;
51pub mod runtime;
52pub mod stream_pool;
53
54pub use async_resource::AsyncCudaResource;
55pub use budget::GlobalDeviceBudget;
56pub use direct::DirectCudaResource;
57pub use logging::{
58    InMemorySink, LogAction, LogRecord, LogResult, LoggingResource, LoggingSink, NullSink,
59    SinkError,
60};
61pub use resource::{
62    Access, AllocTag, BlockId, BlockState, DeviceBlock, DeviceMemoryResource, Generation,
63    ResourceError, ResourceResult, StreamId,
64};
65pub use runtime::{XlogDeviceRuntime, MAX_DEVICE_ORDINALS};
66pub use stream_pool::{StreamPool, StreamPoolError, DEFAULT_MAX_STREAMS};
xlog_cuda/device_runtime/mod.rs

xlog_cuda/device_runtime/
mod.rs