xlog_cuda/device_runtime/
resource.rs

1//! Core [`DeviceMemoryResource`] trait and supporting types.
2//!
3//! Mirrors RMM's `device_memory_resource` shape so a future optional
4//! RMM backend can satisfy the same trait without requiring callers to
5//! change. Stream-ordered: every alloc/dealloc names a stream; cross-
6//! stream reuse requires explicit event-based synchronization.
7
8use std::fmt;
9use std::sync::atomic::{AtomicU64, Ordering};
10
/// Handle naming one CUDA stream in the runtime's stream pool.
///
/// Resources use it to decide which stream a `cuMemAllocAsync` /
/// `cuMemFreeAsync` call is ordered on. Values are meant to be handed
/// out by the runtime; callers should not invent their own.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct StreamId(pub u32);

impl StreamId {
    /// Fallback pool stream (id `0`) for tests and synchronous code
    /// paths that carry no stream context of their own. Production
    /// callers are expected to pass the real stream from their
    /// executor / kernel launch site instead.
    pub const DEFAULT: Self = Self(0);
}
25
/// Label a caller attaches to an allocation so that it shows up in
/// allocation log lines. The payload is a `&'static str`, which keeps
/// the tag `Copy`.
///
/// NOTE(review): earlier wording said short-lived strings are interned
/// by the logging resource and long-lived borrows are not retained;
/// that behavior lives outside this file — confirm against the logging
/// resource.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct AllocTag(pub &'static str);

impl AllocTag {
    /// Tag used when the caller has nothing more specific to say.
    pub const UNTAGGED: Self = Self("untagged");
}
35
/// Monotonic stamp that tells apart successive allocations which happen
/// to land on the same byte address after a drop / reallocate cycle.
/// Logging and debug-guard resources compare generations to detect
/// use-after-free.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Generation(pub u64);

/// Process-wide source of generation numbers; the first value issued
/// is `1`.
static GENERATION_COUNTER: AtomicU64 = AtomicU64::new(1);

impl Generation {
    /// Hand out the next generation number.
    ///
    /// Each call atomically bumps the shared counter and returns the
    /// value it held before the bump, so concurrent callers always
    /// receive distinct, increasing values. `Relaxed` ordering is
    /// enough here: only the uniqueness of the returned number matters,
    /// no other memory is published through the counter.
    pub fn next() -> Generation {
        let issued = GENERATION_COUNTER.fetch_add(1, Ordering::Relaxed);
        Self(issued)
    }
}
51
/// How a single piece of work touches a block. The access kind decides
/// which cross-stream dependency edges the resource queues in
/// [`DeviceMemoryResource::prepare_block_use`] and which events it
/// records in [`DeviceMemoryResource::finish_block_use`].
///
///   * [`Access::Read`] — the work only consumes the block's bytes.
///     It has to wait for any earlier write issued on a different
///     stream. Its completion event joins the block's
///     outstanding-reads list, so later writers (and the eventual
///     deallocate) can wait on it.
///   * [`Access::Write`] — the work unconditionally overwrites the
///     block's bytes. It has to wait for the block's previous write
///     AND for every outstanding read on a different stream. Its
///     completion event becomes the block's new last-write event, and
///     the outstanding-reads list is cleared at finish time.
///   * [`Access::ReadWrite`] — both of the above. The wait set is the
///     same as for `Write`, and the resulting event likewise replaces
///     last-write.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Access {
    Read,
    Write,
    ReadWrite,
}

impl Access {
    /// `true` when work of this kind reads the block's bytes
    /// (`Read` and `ReadWrite`).
    pub fn reads(self) -> bool {
        match self {
            Access::Read | Access::ReadWrite => true,
            Access::Write => false,
        }
    }

    /// `true` when work of this kind writes the block's bytes
    /// (`Write` and `ReadWrite`).
    pub fn writes(self) -> bool {
        match self {
            Access::Write | Access::ReadWrite => true,
            Access::Read => false,
        }
    }
}
87
88/// Compact identity of a [`DeviceBlock`] suitable for snapshotting
89/// into structures whose lifetime should not be tied to the source
90/// slice's borrow. The fields needed to validate `(ptr, generation)`
91/// against the resource's live map and to resolve `alloc_stream` for
92/// cross-stream waits / dealloc ordering.
93///
94/// Created via [`BlockId::from_block`]. Pure data; no resource
95/// handle, no `Drop`. Cheap to copy.
96#[derive(Clone, Copy, Debug, Eq, PartialEq)]
97pub struct BlockId {
98    pub ptr: u64,
99    pub generation: Generation,
100    pub alloc_stream: StreamId,
101    pub device_ordinal: u32,
102}
103
104impl BlockId {
105    /// Snapshot a [`DeviceBlock`]'s identity. The returned id is
106    /// independent of the original block's borrow lifetime; the
107    /// runtime's generation guard catches stale ids whose backing
108    /// allocation has been recycled.
109    pub fn from_block(block: &DeviceBlock) -> Self {
110        Self {
111            ptr: block.ptr,
112            generation: block.generation,
113            alloc_stream: block.alloc_stream,
114            device_ordinal: block.device_ordinal,
115        }
116    }
117}
118
/// Lifecycle state of an outstanding [`DeviceBlock`] as the runtime
/// sees it. Adaptors move blocks from one state to another, and
/// bug-detection resources refuse operations on a block whose state is
/// not the expected one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockState {
    /// Fresh from `allocate`. May be read or written on `alloc_stream`,
    /// or on another stream once that stream has been synchronized.
    Live,
    /// Handed back through `deallocate`, but kernels on the owning
    /// stream may still be running. Reuse has to wait for stream sync.
    Retired,
    /// Parked by `DebugGuardResource` for delayed reuse / canary
    /// validation. Not handed out again until the quarantine window
    /// has passed.
    Quarantined,
    /// The memory has been physically released. Touching the block
    /// from here on is a bug.
    Freed,
}
136
137/// One outstanding device-memory allocation. Owned by the caller until
138/// returned to its originating resource via
139/// [`DeviceMemoryResource::deallocate`].
140///
141/// Carries the metadata required for stream-ordered correctness and
142/// post-mortem debugging: the resource that owns the block, the device
143/// ordinal, the stream the allocation is bound to, byte size, alignment,
144/// caller tag, generation number, and current state.
145#[derive(Debug)]
146pub struct DeviceBlock {
147    /// Raw device pointer (opaque to safe Rust callers).
148    pub ptr: u64,
149    /// CUDA ordinal of the device this block lives on.
150    pub device_ordinal: u32,
151    /// Allocation stream. Reads/writes on a different stream require
152    /// explicit synchronization (event wait or device sync).
153    pub alloc_stream: StreamId,
154    /// Size in bytes. May exceed the caller-requested size when the
155    /// resource rounds up for alignment or pool granularity.
156    pub bytes: usize,
157    /// Alignment in bytes (always ≥ caller request).
158    pub align: usize,
159    /// Caller-supplied tag, surfaced in allocation logs.
160    pub tag: AllocTag,
161    /// Monotonic generation. Reused addresses get fresh generations.
162    pub generation: Generation,
163    /// Current state. Adaptors transition this; tests assert on it.
164    pub state: BlockState,
165}
166
167/// Errors returned by resource implementations. Distinct variants for
168/// the cases stress tests need to pin (out-of-budget vs CUDA driver
169/// failure vs use-after-free etc.).
170#[derive(Debug)]
171pub enum ResourceError {
172    /// The requested allocation would exceed the resource's budget.
173    /// Carries the requested bytes and the remaining budget so tests
174    /// can assert deterministic refusal.
175    OutOfBudget { requested: usize, remaining: usize },
176    /// CUDA driver returned an error. Carries the wrapped message.
177    Driver(String),
178    /// A stream-ordered contract was violated (e.g. dealloc on a
179    /// stream that does not match the alloc stream without an
180    /// intervening sync).
181    StreamMisuse(String),
182    /// A debug-guard or logging adaptor detected a use-after-free or
183    /// double-free. Hard error in debug builds; surfaced upward.
184    UseAfterFree { generation: Generation },
185    /// A debug-guard adaptor detected an out-of-bounds write past a
186    /// canary boundary.
187    OutOfBounds { generation: Generation },
188}
189
190impl fmt::Display for ResourceError {
191    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
192        match self {
193            Self::OutOfBudget {
194                requested,
195                remaining,
196            } => write!(
197                f,
198                "out of budget: requested {} bytes, remaining {} bytes",
199                requested, remaining
200            ),
201            Self::Driver(msg) => write!(f, "CUDA driver error: {}", msg),
202            Self::StreamMisuse(msg) => write!(f, "stream-ordered contract violated: {}", msg),
203            Self::UseAfterFree { generation } => {
204                write!(f, "use-after-free on generation {:?}", generation)
205            }
206            Self::OutOfBounds { generation } => {
207                write!(f, "out-of-bounds write on generation {:?}", generation)
208            }
209        }
210    }
211}
212
213impl std::error::Error for ResourceError {}
214
215pub type ResourceResult<T> = std::result::Result<T, ResourceError>;
216
217/// Stream-ordered device memory resource. Implementations:
218///   * [`crate::device_runtime::direct::DirectCudaResource`] —
219///     cudarc default (non-pooled) backend; **candidate** for the
220///     sanitizer/cert role, **unproven** until the manual Compute Sanitizer
221///     acceptance gate runs on a supported host.
222///   * [`crate::device_runtime::async_resource::AsyncCudaResource`] —
223///     stream-ordered cuMemAllocAsync/cuMemFreeAsync backend;
224///     production default when the context supports async-alloc.
225///   * [`crate::device_runtime::logging::LoggingResource`] —
226///     telemetry decorator over any inner resource.
227///   * [`crate::device_runtime::budget::GlobalDeviceBudget`] —
228///     per-runtime byte-limit decorator over any inner resource.
229///   * `PoolResource` — performance tier; v0.7+ (not implemented).
230///   * `DebugGuardResource` — canary/poison/quarantine; v0.7+
231///     (not implemented).
232///
233/// Implementations must be thread-safe. The runtime composes resources
234/// via decoration (each resource wraps an inner `Box<dyn
235/// DeviceMemoryResource + Send + Sync>`).
236pub trait DeviceMemoryResource: Send + Sync {
237    /// Allocate `bytes` bytes on the resource's device, ordered on
238    /// `stream`. The returned block is in [`BlockState::Live`].
239    fn allocate(
240        &self,
241        bytes: usize,
242        stream: StreamId,
243        tag: AllocTag,
244    ) -> ResourceResult<DeviceBlock>;
245
246    /// Return `block` to the resource. After this call the block's
247    /// state is [`BlockState::Retired`] (or [`BlockState::Quarantined`]
248    /// for debug-guard resources). Reuse of the underlying memory is
249    /// resource-specific but must respect the stream-ordered contract.
250    ///
251    /// `block.alloc_stream` is authoritative for ordering. If the
252    /// caller has touched the memory on a different stream, they must
253    /// have synchronized before calling `deallocate`.
254    fn deallocate(&self, block: DeviceBlock) -> ResourceResult<()>;
255
256    /// CUDA device ordinal this resource serves. Resources are pinned
257    /// to a single device.
258    fn device_ordinal(&self) -> u32;
259
260    /// Bytes currently outstanding (live + retired-but-not-yet-freed).
261    /// Used by tests and by the global budget adaptor.
262    fn bytes_outstanding(&self) -> usize;
263
264    /// Drain any retired-but-not-yet-freed bytes whose underlying
265    /// CUDA work has completed. For synchronous backends this is a
266    /// no-op. For stream-ordered async backends this synchronizes
267    /// the streams that have queued `cuMemFreeAsync` calls and
268    /// re-counts `bytes_outstanding` accordingly.
269    ///
270    /// Callers that need an accurate budget reading after a burst
271    /// of asynchronous deallocations should call this before
272    /// reading `bytes_outstanding`. Calling on a synchronous backend
273    /// is harmless and free.
274    fn reap_pending(&self) -> ResourceResult<()> {
275        Ok(())
276    }
277
278    /// Record that work has been (or is being) submitted on
279    /// `use_stream` that touches `block`'s bytes. Resources that
280    /// participate in cross-stream lifetime tracking (notably the
281    /// stream-ordered async backend) MUST attach a CUDA event from
282    /// `use_stream` to the block; on `deallocate(block)`, the
283    /// block's `alloc_stream` will wait on every recorded event
284    /// before queueing the underlying free.
285    ///
286    /// **The default implementation returns
287    /// [`ResourceError::StreamMisuse`].** This is intentional: a
288    /// silent no-op default would let a launch builder call
289    /// `record_block_use` against a resource that does not
290    /// actually track cross-stream uses (e.g.,
291    /// [`crate::device_runtime::direct::DirectCudaResource`]),
292    /// observe `Ok(())`, queue a kernel on a different stream,
293    /// then drop the block — and quietly hit the cross-stream
294    /// use-after-free that this API exists to prevent. False
295    /// safety is worse than no safety. Resources that cannot
296    /// track cross-stream uses MUST inherit this default;
297    /// callers (notably the future xlog launch builder) MUST
298    /// surface the error rather than masking it.
299    ///
300    /// Override status today:
301    ///   * [`crate::device_runtime::async_resource::AsyncCudaResource`]
302    ///     overrides with real event tracking.
303    ///   * [`crate::device_runtime::logging::LoggingResource`] and
304    ///     [`crate::device_runtime::budget::GlobalDeviceBudget`]
305    ///     forward to their inner resource (so the underlying
306    ///     backend's behavior surfaces unchanged).
307    ///   * [`crate::device_runtime::direct::DirectCudaResource`]
308    ///     does NOT override — it correctly returns
309    ///     `StreamMisuse` and forces callers to either route
310    ///     allocations through `AsyncCudaResource` or take
311    ///     responsibility for cross-stream synchronization
312    ///     themselves.
313    ///
314    /// # Errors
315    ///   * [`ResourceError::StreamMisuse`] from the default impl
316    ///     when the resource cannot track cross-stream uses.
317    ///   * [`ResourceError::UseAfterFree`] if `block` is not the
318    ///     block currently live at `block.ptr` (caller likely
319    ///     handed back a stale [`DeviceBlock`] whose generation
320    ///     no longer matches the live entry).
321    ///   * [`ResourceError::StreamMisuse`] if `use_stream` does
322    ///     not resolve in the resource's stream pool.
323    ///   * [`ResourceError::Driver`] for CUDA driver / event
324    ///     creation failures.
325    ///
326    /// Callers that bypass this API and submit cross-stream work
327    /// directly (raw `cuMemcpyDtoHAsync`, raw `Vec<*mut c_void>`
328    /// kernel launches that the launch builder did not see, etc.)
329    /// are responsible for their own cross-stream synchronization.
330    /// The resource cannot infer arbitrary external CUDA work.
331    fn record_block_use(&self, block: &DeviceBlock, use_stream: StreamId) -> ResourceResult<()> {
332        let _ = (block, use_stream);
333        Err(ResourceError::StreamMisuse(
334            "record_block_use unsupported by this resource (the active backend \
335             does not track cross-stream uses; route allocations through a \
336             stream-ordered backend such as AsyncCudaResource, or take \
337             responsibility for cross-stream synchronization explicitly)"
338                .to_string(),
339        ))
340    }
341
342    /// Whether this resource (and any inner resources it
343    /// composes) actually tracks cross-stream uses via
344    /// `record_block_use`. Used by the launch recorder's
345    /// preflight to fail BEFORE queueing CUDA work, rather than
346    /// after. The default returns `false` to match the trait's
347    /// default `record_block_use` behavior; resources that
348    /// override `record_block_use` to track events MUST override
349    /// this to return `true`. Decorators forward to inner.
350    fn supports_block_use_tracking(&self) -> bool {
351        false
352    }
353
354    /// Pre-launch / pre-copy hook: queue any cross-stream waits
355    /// required for `use_stream` to safely access `block` with
356    /// `access` semantics. MUST be called BEFORE the GPU work is
357    /// enqueued on `use_stream`.
358    ///
359    /// Concretely, on [`Access::Read`] the resource must queue
360    /// `use_stream.wait(&last_write)` if a write on a different
361    /// stream is outstanding. On [`Access::Write`] /
362    /// [`Access::ReadWrite`] the resource must additionally queue
363    /// waits on every outstanding read recorded on a different
364    /// stream — the writer must observe completion of every prior
365    /// reader. Same-stream events are skipped (CUDA stream order
366    /// already covers them).
367    ///
368    /// **The default implementation returns
369    /// [`ResourceError::StreamMisuse`].** Same rationale as
370    /// `record_block_use`: a silent no-op default would let
371    /// callers paired against a non-tracking backend believe the
372    /// dependency edge was queued. Decorators forward; tracking
373    /// backends override.
374    ///
375    /// # Errors
376    ///   * [`ResourceError::StreamMisuse`] from the default impl
377    ///     when the resource cannot track cross-stream uses.
378    ///   * [`ResourceError::UseAfterFree`] if `block` is not the
379    ///     id currently live at `block.ptr`.
380    ///   * [`ResourceError::Driver`] for CUDA driver / event-wait
381    ///     failures.
382    fn prepare_block_use(
383        &self,
384        block: BlockId,
385        use_stream: StreamId,
386        access: Access,
387    ) -> ResourceResult<()> {
388        let _ = (block, use_stream, access);
389        Err(ResourceError::StreamMisuse(
390            "prepare_block_use unsupported by this resource (the active backend \
391             does not track cross-stream uses; route allocations through \
392             AsyncCudaResource or take responsibility for cross-stream \
393             synchronization explicitly)"
394                .to_string(),
395        ))
396    }
397
398    /// Post-launch / post-copy hook: record an event on
399    /// `use_stream` capturing the work just enqueued and update
400    /// `block`'s dependency state.
401    ///
402    /// Concretely, on [`Access::Read`] the new event is appended
403    /// to the block's outstanding-reads list (so future writers
404    /// and the eventual deallocate can wait on it). On
405    /// [`Access::Write`] / [`Access::ReadWrite`] the new event
406    /// **replaces** the block's last-write event and the
407    /// outstanding-reads list is cleared (any prior reader's
408    /// dependency was queued at prepare time and is now subsumed
409    /// by the new write event).
410    ///
411    /// **The default implementation returns
412    /// [`ResourceError::StreamMisuse`].** Same rationale as
413    /// `record_block_use`. Decorators forward; tracking backends
414    /// override.
415    fn finish_block_use(
416        &self,
417        block: BlockId,
418        use_stream: StreamId,
419        access: Access,
420    ) -> ResourceResult<()> {
421        let _ = (block, use_stream, access);
422        Err(ResourceError::StreamMisuse(
423            "finish_block_use unsupported by this resource (the active backend \
424             does not track cross-stream uses; route allocations through \
425             AsyncCudaResource or take responsibility for cross-stream \
426             synchronization explicitly)"
427                .to_string(),
428        ))
429    }
430}
xlog_cuda/device_runtime/resource.rs

xlog_cuda/device_runtime/
resource.rs