xlog_cuda/device_runtime/resource.rs
1//! Core [`DeviceMemoryResource`] trait and supporting types.
2//!
3//! Mirrors RMM's `device_memory_resource` shape so a future optional
4//! RMM backend can satisfy the same trait without requiring callers to
5//! change. Stream-ordered: every alloc/dealloc names a stream; cross-
6//! stream reuse requires explicit event-based synchronization.
7
8use std::fmt;
9use std::sync::atomic::{AtomicU64, Ordering};
10
/// Names one CUDA stream belonging to the runtime's stream pool.
///
/// This is a plain `u32` newtype: resources receive it on every
/// allocate / deallocate call and use it to decide which stream the
/// `cuMemAllocAsync` / `cuMemFreeAsync` work is ordered on. Ids are
/// meant to be handed out by the runtime; although the field is
/// public, callers should not invent values of their own.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub struct StreamId(pub u32);

impl StreamId {
    /// Fallback pool stream (id `0`) for tests and synchronous code
    /// paths that have no other stream context available. Production
    /// code is expected to pass along the real stream it got from the
    /// executor or the kernel launch site instead.
    pub const DEFAULT: Self = Self(0);
}
25
/// Label a caller attaches to an allocation so it can be recognised in
/// allocation log lines.
///
/// The tag holds a `&'static str`, so it is `Copy` and passing it
/// around never allocates.
///
/// NOTE(review): earlier documentation said that short-lived strings
/// are interned by the logging resource and long-lived borrows are not
/// retained. This type only accepts `'static` borrows, so that remark
/// may be stale — confirm against the logging resource.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub struct AllocTag(pub &'static str);

impl AllocTag {
    /// Tag for allocations whose caller supplied no label of its own.
    pub const UNTAGGED: Self = Self("untagged");
}
35
/// Monotonically increasing stamp that tells apart successive
/// allocations which happen to land on the same byte address after a
/// drop / reallocate cycle. Logging and debug-guard resources compare
/// generations to detect use-after-free.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
pub struct Generation(pub u64);

/// Process-wide source of generation numbers; the first value handed
/// out is 1.
static GENERATION_COUNTER: AtomicU64 = AtomicU64::new(1);

impl Generation {
    /// Hand out the next generation number.
    ///
    /// Each call returns a value larger than every value issued before
    /// it, and concurrent callers never receive the same value.
    pub fn next() -> Self {
        // Only the atomicity of the increment matters for uniqueness;
        // no other memory is published through this counter, so
        // relaxed ordering is sufficient.
        let issued = GENERATION_COUNTER.fetch_add(1, Ordering::Relaxed);
        Self(issued)
    }
}
51
/// How a single piece of GPU work touches a block. The kind decides
/// which cross-stream dependency edges the resource queues in
/// [`DeviceMemoryResource::prepare_block_use`] and which events it
/// records in [`DeviceMemoryResource::finish_block_use`].
///
/// * [`Access::Read`] — the work only consumes the block's bytes. It
///   has to wait for any earlier write issued on a different stream.
///   The event recorded afterwards joins the block's
///   outstanding-reads list, so later writers (and the eventual
///   deallocate) can wait on it.
/// * [`Access::Write`] — the work unconditionally overwrites the
///   block's bytes. It has to wait for the block's previous write AND
///   for every outstanding read on a different stream. The event
///   recorded afterwards becomes the block's new last-write event,
///   and the outstanding-reads list is cleared at finish time.
/// * [`Access::ReadWrite`] — both of the above. The wait set is the
///   same as for `Write`, and the recorded event likewise replaces
///   last-write.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Access {
    Read,
    Write,
    ReadWrite,
}

impl Access {
    /// `true` when work with this access kind reads the block's bytes
    /// (`Read` and `ReadWrite`).
    pub fn reads(self) -> bool {
        match self {
            Access::Read | Access::ReadWrite => true,
            Access::Write => false,
        }
    }

    /// `true` when work with this access kind writes the block's bytes
    /// (`Write` and `ReadWrite`).
    pub fn writes(self) -> bool {
        match self {
            Access::Write | Access::ReadWrite => true,
            Access::Read => false,
        }
    }
}
87
88/// Compact identity of a [`DeviceBlock`] suitable for snapshotting
89/// into structures whose lifetime should not be tied to the source
90/// slice's borrow. The fields needed to validate `(ptr, generation)`
91/// against the resource's live map and to resolve `alloc_stream` for
92/// cross-stream waits / dealloc ordering.
93///
94/// Created via [`BlockId::from_block`]. Pure data; no resource
95/// handle, no `Drop`. Cheap to copy.
96#[derive(Clone, Copy, Debug, Eq, PartialEq)]
97pub struct BlockId {
98 pub ptr: u64,
99 pub generation: Generation,
100 pub alloc_stream: StreamId,
101 pub device_ordinal: u32,
102}
103
104impl BlockId {
105 /// Snapshot a [`DeviceBlock`]'s identity. The returned id is
106 /// independent of the original block's borrow lifetime; the
107 /// runtime's generation guard catches stale ids whose backing
108 /// allocation has been recycled.
109 pub fn from_block(block: &DeviceBlock) -> Self {
110 Self {
111 ptr: block.ptr,
112 generation: block.generation,
113 alloc_stream: block.alloc_stream,
114 device_ordinal: block.device_ordinal,
115 }
116 }
117}
118
/// Lifecycle stage of an outstanding [`DeviceBlock`] as the runtime
/// sees it. Adaptors move blocks from one stage to another, and
/// bug-detection resources refuse operations on a block that is in a
/// stage they do not expect.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BlockState {
    /// Handed out by `allocate`. The block may be read or written on
    /// its `alloc_stream`, or on another stream once that stream has
    /// been synchronized with it.
    Live,
    /// Handed back through `deallocate`, but kernels on the owning
    /// stream may still be running against it. The memory must not be
    /// reused before the stream has synchronized.
    Retired,
    /// Parked by `DebugGuardResource` for delayed reuse / canary
    /// validation. It is not handed out again until the quarantine
    /// window has elapsed.
    Quarantined,
    /// The memory has been physically released. Touching the block
    /// after this point is a bug.
    Freed,
}
136
137/// One outstanding device-memory allocation. Owned by the caller until
138/// returned to its originating resource via
139/// [`DeviceMemoryResource::deallocate`].
140///
141/// Carries the metadata required for stream-ordered correctness and
142/// post-mortem debugging: the resource that owns the block, the device
143/// ordinal, the stream the allocation is bound to, byte size, alignment,
144/// caller tag, generation number, and current state.
145#[derive(Debug)]
146pub struct DeviceBlock {
147 /// Raw device pointer (opaque to safe Rust callers).
148 pub ptr: u64,
149 /// CUDA ordinal of the device this block lives on.
150 pub device_ordinal: u32,
151 /// Allocation stream. Reads/writes on a different stream require
152 /// explicit synchronization (event wait or device sync).
153 pub alloc_stream: StreamId,
154 /// Size in bytes. May exceed the caller-requested size when the
155 /// resource rounds up for alignment or pool granularity.
156 pub bytes: usize,
157 /// Alignment in bytes (always ≥ caller request).
158 pub align: usize,
159 /// Caller-supplied tag, surfaced in allocation logs.
160 pub tag: AllocTag,
161 /// Monotonic generation. Reused addresses get fresh generations.
162 pub generation: Generation,
163 /// Current state. Adaptors transition this; tests assert on it.
164 pub state: BlockState,
165}
166
167/// Errors returned by resource implementations. Distinct variants for
168/// the cases stress tests need to pin (out-of-budget vs CUDA driver
169/// failure vs use-after-free etc.).
170#[derive(Debug)]
171pub enum ResourceError {
172 /// The requested allocation would exceed the resource's budget.
173 /// Carries the requested bytes and the remaining budget so tests
174 /// can assert deterministic refusal.
175 OutOfBudget { requested: usize, remaining: usize },
176 /// CUDA driver returned an error. Carries the wrapped message.
177 Driver(String),
178 /// A stream-ordered contract was violated (e.g. dealloc on a
179 /// stream that does not match the alloc stream without an
180 /// intervening sync).
181 StreamMisuse(String),
182 /// A debug-guard or logging adaptor detected a use-after-free or
183 /// double-free. Hard error in debug builds; surfaced upward.
184 UseAfterFree { generation: Generation },
185 /// A debug-guard adaptor detected an out-of-bounds write past a
186 /// canary boundary.
187 OutOfBounds { generation: Generation },
188}
189
190impl fmt::Display for ResourceError {
191 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
192 match self {
193 Self::OutOfBudget {
194 requested,
195 remaining,
196 } => write!(
197 f,
198 "out of budget: requested {} bytes, remaining {} bytes",
199 requested, remaining
200 ),
201 Self::Driver(msg) => write!(f, "CUDA driver error: {}", msg),
202 Self::StreamMisuse(msg) => write!(f, "stream-ordered contract violated: {}", msg),
203 Self::UseAfterFree { generation } => {
204 write!(f, "use-after-free on generation {:?}", generation)
205 }
206 Self::OutOfBounds { generation } => {
207 write!(f, "out-of-bounds write on generation {:?}", generation)
208 }
209 }
210 }
211}
212
213impl std::error::Error for ResourceError {}
214
215pub type ResourceResult<T> = std::result::Result<T, ResourceError>;
216
217/// Stream-ordered device memory resource. Implementations:
218/// * [`crate::device_runtime::direct::DirectCudaResource`] —
219/// cudarc default (non-pooled) backend; **candidate** for the
220/// sanitizer/cert role, **unproven** until the manual Compute Sanitizer
221/// acceptance gate runs on a supported host.
222/// * [`crate::device_runtime::async_resource::AsyncCudaResource`] —
223/// stream-ordered cuMemAllocAsync/cuMemFreeAsync backend;
224/// production default when the context supports async-alloc.
225/// * [`crate::device_runtime::logging::LoggingResource`] —
226/// telemetry decorator over any inner resource.
227/// * [`crate::device_runtime::budget::GlobalDeviceBudget`] —
228/// per-runtime byte-limit decorator over any inner resource.
229/// * `PoolResource` — performance tier; v0.7+ (not implemented).
230/// * `DebugGuardResource` — canary/poison/quarantine; v0.7+
231/// (not implemented).
232///
233/// Implementations must be thread-safe. The runtime composes resources
234/// via decoration (each resource wraps an inner `Box<dyn
235/// DeviceMemoryResource + Send + Sync>`).
236pub trait DeviceMemoryResource: Send + Sync {
237 /// Allocate `bytes` bytes on the resource's device, ordered on
238 /// `stream`. The returned block is in [`BlockState::Live`].
239 fn allocate(
240 &self,
241 bytes: usize,
242 stream: StreamId,
243 tag: AllocTag,
244 ) -> ResourceResult<DeviceBlock>;
245
246 /// Return `block` to the resource. After this call the block's
247 /// state is [`BlockState::Retired`] (or [`BlockState::Quarantined`]
248 /// for debug-guard resources). Reuse of the underlying memory is
249 /// resource-specific but must respect the stream-ordered contract.
250 ///
251 /// `block.alloc_stream` is authoritative for ordering. If the
252 /// caller has touched the memory on a different stream, they must
253 /// have synchronized before calling `deallocate`.
254 fn deallocate(&self, block: DeviceBlock) -> ResourceResult<()>;
255
256 /// CUDA device ordinal this resource serves. Resources are pinned
257 /// to a single device.
258 fn device_ordinal(&self) -> u32;
259
260 /// Bytes currently outstanding (live + retired-but-not-yet-freed).
261 /// Used by tests and by the global budget adaptor.
262 fn bytes_outstanding(&self) -> usize;
263
264 /// Drain any retired-but-not-yet-freed bytes whose underlying
265 /// CUDA work has completed. For synchronous backends this is a
266 /// no-op. For stream-ordered async backends this synchronizes
267 /// the streams that have queued `cuMemFreeAsync` calls and
268 /// re-counts `bytes_outstanding` accordingly.
269 ///
270 /// Callers that need an accurate budget reading after a burst
271 /// of asynchronous deallocations should call this before
272 /// reading `bytes_outstanding`. Calling on a synchronous backend
273 /// is harmless and free.
274 fn reap_pending(&self) -> ResourceResult<()> {
275 Ok(())
276 }
277
278 /// Record that work has been (or is being) submitted on
279 /// `use_stream` that touches `block`'s bytes. Resources that
280 /// participate in cross-stream lifetime tracking (notably the
281 /// stream-ordered async backend) MUST attach a CUDA event from
282 /// `use_stream` to the block; on `deallocate(block)`, the
283 /// block's `alloc_stream` will wait on every recorded event
284 /// before queueing the underlying free.
285 ///
286 /// **The default implementation returns
287 /// [`ResourceError::StreamMisuse`].** This is intentional: a
288 /// silent no-op default would let a launch builder call
289 /// `record_block_use` against a resource that does not
290 /// actually track cross-stream uses (e.g.,
291 /// [`crate::device_runtime::direct::DirectCudaResource`]),
292 /// observe `Ok(())`, queue a kernel on a different stream,
293 /// then drop the block — and quietly hit the cross-stream
294 /// use-after-free that this API exists to prevent. False
295 /// safety is worse than no safety. Resources that cannot
296 /// track cross-stream uses MUST inherit this default;
297 /// callers (notably the future xlog launch builder) MUST
298 /// surface the error rather than masking it.
299 ///
300 /// Override status today:
301 /// * [`crate::device_runtime::async_resource::AsyncCudaResource`]
302 /// overrides with real event tracking.
303 /// * [`crate::device_runtime::logging::LoggingResource`] and
304 /// [`crate::device_runtime::budget::GlobalDeviceBudget`]
305 /// forward to their inner resource (so the underlying
306 /// backend's behavior surfaces unchanged).
307 /// * [`crate::device_runtime::direct::DirectCudaResource`]
308 /// does NOT override — it correctly returns
309 /// `StreamMisuse` and forces callers to either route
310 /// allocations through `AsyncCudaResource` or take
311 /// responsibility for cross-stream synchronization
312 /// themselves.
313 ///
314 /// # Errors
315 /// * [`ResourceError::StreamMisuse`] from the default impl
316 /// when the resource cannot track cross-stream uses.
317 /// * [`ResourceError::UseAfterFree`] if `block` is not the
318 /// block currently live at `block.ptr` (caller likely
319 /// handed back a stale [`DeviceBlock`] whose generation
320 /// no longer matches the live entry).
321 /// * [`ResourceError::StreamMisuse`] if `use_stream` does
322 /// not resolve in the resource's stream pool.
323 /// * [`ResourceError::Driver`] for CUDA driver / event
324 /// creation failures.
325 ///
326 /// Callers that bypass this API and submit cross-stream work
327 /// directly (raw `cuMemcpyDtoHAsync`, raw `Vec<*mut c_void>`
328 /// kernel launches that the launch builder did not see, etc.)
329 /// are responsible for their own cross-stream synchronization.
330 /// The resource cannot infer arbitrary external CUDA work.
331 fn record_block_use(&self, block: &DeviceBlock, use_stream: StreamId) -> ResourceResult<()> {
332 let _ = (block, use_stream);
333 Err(ResourceError::StreamMisuse(
334 "record_block_use unsupported by this resource (the active backend \
335 does not track cross-stream uses; route allocations through a \
336 stream-ordered backend such as AsyncCudaResource, or take \
337 responsibility for cross-stream synchronization explicitly)"
338 .to_string(),
339 ))
340 }
341
342 /// Whether this resource (and any inner resources it
343 /// composes) actually tracks cross-stream uses via
344 /// `record_block_use`. Used by the launch recorder's
345 /// preflight to fail BEFORE queueing CUDA work, rather than
346 /// after. The default returns `false` to match the trait's
347 /// default `record_block_use` behavior; resources that
348 /// override `record_block_use` to track events MUST override
349 /// this to return `true`. Decorators forward to inner.
350 fn supports_block_use_tracking(&self) -> bool {
351 false
352 }
353
354 /// Pre-launch / pre-copy hook: queue any cross-stream waits
355 /// required for `use_stream` to safely access `block` with
356 /// `access` semantics. MUST be called BEFORE the GPU work is
357 /// enqueued on `use_stream`.
358 ///
359 /// Concretely, on [`Access::Read`] the resource must queue
360 /// `use_stream.wait(&last_write)` if a write on a different
361 /// stream is outstanding. On [`Access::Write`] /
362 /// [`Access::ReadWrite`] the resource must additionally queue
363 /// waits on every outstanding read recorded on a different
364 /// stream — the writer must observe completion of every prior
365 /// reader. Same-stream events are skipped (CUDA stream order
366 /// already covers them).
367 ///
368 /// **The default implementation returns
369 /// [`ResourceError::StreamMisuse`].** Same rationale as
370 /// `record_block_use`: a silent no-op default would let
371 /// callers paired against a non-tracking backend believe the
372 /// dependency edge was queued. Decorators forward; tracking
373 /// backends override.
374 ///
375 /// # Errors
376 /// * [`ResourceError::StreamMisuse`] from the default impl
377 /// when the resource cannot track cross-stream uses.
378 /// * [`ResourceError::UseAfterFree`] if `block` is not the
379 /// id currently live at `block.ptr`.
380 /// * [`ResourceError::Driver`] for CUDA driver / event-wait
381 /// failures.
382 fn prepare_block_use(
383 &self,
384 block: BlockId,
385 use_stream: StreamId,
386 access: Access,
387 ) -> ResourceResult<()> {
388 let _ = (block, use_stream, access);
389 Err(ResourceError::StreamMisuse(
390 "prepare_block_use unsupported by this resource (the active backend \
391 does not track cross-stream uses; route allocations through \
392 AsyncCudaResource or take responsibility for cross-stream \
393 synchronization explicitly)"
394 .to_string(),
395 ))
396 }
397
398 /// Post-launch / post-copy hook: record an event on
399 /// `use_stream` capturing the work just enqueued and update
400 /// `block`'s dependency state.
401 ///
402 /// Concretely, on [`Access::Read`] the new event is appended
403 /// to the block's outstanding-reads list (so future writers
404 /// and the eventual deallocate can wait on it). On
405 /// [`Access::Write`] / [`Access::ReadWrite`] the new event
406 /// **replaces** the block's last-write event and the
407 /// outstanding-reads list is cleared (any prior reader's
408 /// dependency was queued at prepare time and is now subsumed
409 /// by the new write event).
410 ///
411 /// **The default implementation returns
412 /// [`ResourceError::StreamMisuse`].** Same rationale as
413 /// `record_block_use`. Decorators forward; tracking backends
414 /// override.
415 fn finish_block_use(
416 &self,
417 block: BlockId,
418 use_stream: StreamId,
419 access: Access,
420 ) -> ResourceResult<()> {
421 let _ = (block, use_stream, access);
422 Err(ResourceError::StreamMisuse(
423 "finish_block_use unsupported by this resource (the active backend \
424 does not track cross-stream uses; route allocations through \
425 AsyncCudaResource or take responsibility for cross-stream \
426 synchronization explicitly)"
427 .to_string(),
428 ))
429 }
430}