xlog_cuda/device_runtime/direct.rs
1//! [`DirectCudaResource`] — cudarc default (non-pooled) allocation
2//! backend.
3//!
4//! Each [`DeviceMemoryResource::allocate`] call goes through cudarc's
5//! `CudaDeviceInner::alloc::<u8>(bytes)`. cudarc itself routes that
6//! through `CudaStream::alloc` against the device's default stream,
7//! which forwards to **`cuMemAllocAsync` on contexts that support
8//! async-alloc** and falls back to a synchronous path otherwise.
9//! There is no `xlog`-level pooling or suballocation in this layer —
10//! every `allocate` is one cudarc call, every `deallocate` drops the
11//! resulting `CudaSlice<u8>` (which in turn invokes `cuMemFreeAsync`
12//! or the synchronous fallback that cudarc selected).
13//!
14//! Earlier revisions described this backend as "raw `cuMemAlloc` /
15//! `cuMemFree`". That was wrong. A genuine raw-driver direct backend
16//! (bypassing cudarc entirely) is a separate work item; until that
17//! exists, this backend is the **non-pooled default** — not a synchronous
18//! `cuMemAlloc`/`cuMemFree` adaptor — and it does not by itself
19//! guarantee that pool suballocation is absent from the underlying
20//! call path on a given host.
21//!
//! **Sanitizer status: unproven.** The motivation for having a
//! non-pooled backend is that pool *suballocation* hides byte-level
//! out-of-bounds access from Compute Sanitizer, so a backend without
//! suballocation is meant to keep such accesses visible. The cudarc default
25//! path forwards to `cuMemAllocAsync`, which on async-alloc hosts is
26//! a stream-ordered allocator; whether that is sufficiently
27//! sanitizer-visible is exactly what the **manual Compute Sanitizer
28//! acceptance gate** is supposed to confirm on a supported host. Do
29//! not describe this backend as "sanitizer-certified" until that
30//! manual gate has produced a captured negative-test pass; until the
31//! gate lands, treat the sanitizer role as "candidate, not certified".
32//!
33//! Stream-ordered semantics: the backend records the caller-supplied
34//! `alloc_stream` on the returned [`DeviceBlock`] but does **not**
35//! attempt to bind the underlying cudarc allocation to that stream —
36//! cudarc allocates against the device's default stream regardless.
37//! Stream-ordered allocation/free that honors a caller-supplied
38//! [`StreamId`] is `AsyncCudaResource`'s responsibility (separate
39//! commit).
40
41use std::collections::HashMap;
42use std::sync::atomic::{AtomicUsize, Ordering};
43use std::sync::{Arc, Mutex};
44
45use cudarc::driver::CudaSlice;
46
47use super::resource::{
48 AllocTag, BlockState, DeviceBlock, DeviceMemoryResource, Generation, ResourceError,
49 ResourceResult, StreamId,
50};
51use crate::CudaDevice;
52
53/// cudarc default (non-pooled) allocation adaptor. Holds the
54/// underlying `CudaSlice<u8>` allocations alive in an internal map so
55/// the runtime returns opaque [`DeviceBlock`]s to callers; on
56/// deallocate the slice is dropped, which invokes whichever cudarc
57/// free path matches the alloc path (`cuMemFreeAsync` on async-alloc
58/// hosts, the synchronous fallback otherwise).
59///
60/// Concurrency: `Send + Sync`. The internal map is protected by a
61/// `Mutex`. Allocate and deallocate are short-running map operations
62/// plus the underlying CUDA call.
63pub struct DirectCudaResource {
64 device: Arc<CudaDevice>,
65 device_ordinal: u32,
66 /// Live + retired-but-not-yet-freed allocations, keyed by raw
67 /// device pointer. Holding the slice keeps `cuMemFree` from
68 /// running until we explicitly drop it on deallocate.
69 live: Mutex<HashMap<u64, CudaSlice<u8>>>,
70 /// Sum of bytes outstanding (live + retired). Updated together
71 /// with the map under the `live` mutex.
72 bytes_outstanding: AtomicUsize,
73}
74
75impl DirectCudaResource {
76 /// Construct a resource bound to `device`. `device_ordinal` is the
77 /// CUDA ordinal for logging / multi-device disambiguation.
78 pub fn new(device: Arc<CudaDevice>, device_ordinal: u32) -> Self {
79 Self {
80 device,
81 device_ordinal,
82 live: Mutex::new(HashMap::new()),
83 bytes_outstanding: AtomicUsize::new(0),
84 }
85 }
86
87 /// Borrow the device handle. Tests and downstream resources use
88 /// this to launch kernels against the same device this resource
89 /// allocates on.
90 pub fn device(&self) -> &Arc<CudaDevice> {
91 &self.device
92 }
93}
94
95impl DeviceMemoryResource for DirectCudaResource {
96 fn allocate(
97 &self,
98 bytes: usize,
99 stream: StreamId,
100 tag: AllocTag,
101 ) -> ResourceResult<DeviceBlock> {
102 if bytes == 0 {
103 // Zero-byte allocations are not legal in CUDA; surface as
104 // a contract error rather than calling cuMemAlloc(0).
105 return Err(ResourceError::Driver(
106 "DirectCudaResource: zero-byte allocation not supported".to_string(),
107 ));
108 }
109
110 // SAFETY: the device handle is valid for the lifetime of
111 // `self`, and `bytes > 0` is checked above. cudarc's
112 // `CudaDeviceInner::alloc::<u8>(bytes)` forwards to
113 // `cuMemAllocAsync` (against the device's default stream)
114 // when the context supports async-alloc, otherwise to the
115 // synchronous fallback. Failure is propagated as
116 // `ResourceError::Driver`.
117 let slice = unsafe {
118 self.device.inner().alloc::<u8>(bytes).map_err(|e| {
119 ResourceError::Driver(format!("cudarc alloc::<u8>({}): {}", bytes, e))
120 })?
121 };
122
123 // Extract the raw device pointer. The "sync" handle returned by
124 // `device_ptr` is intentionally leaked — the slice's lifetime is
125 // managed by the map, not the sync.
126 let (raw_ptr, sync) =
127 <CudaSlice<u8> as cudarc::driver::DevicePtr<u8>>::device_ptr(&slice, slice.stream());
128 std::mem::forget(sync);
129 let ptr = raw_ptr;
130
131 {
132 let mut live = self.live.lock().expect("live map poisoned");
133 // The CUDA driver does not return the same byte address
134 // for two simultaneously live allocations. If our map
135 // already has this pointer, it indicates a bookkeeping
136 // bug or driver behavior we want to surface loudly.
137 // Use `contains_key` then `insert` so a (theoretical)
138 // collision returns `Err` without mutating the map —
139 // a `live.insert(ptr, slice).is_some()` pattern would
140 // replace the existing entry, drop the old slice (which
141 // calls cuMemFree on memory we still believe we own),
142 // and leave the new slice resident in `live` while we
143 // return Err. Avoid that here.
144 if live.contains_key(&ptr) {
145 return Err(ResourceError::Driver(format!(
146 "DirectCudaResource: pointer collision on alloc ({:#x})",
147 ptr
148 )));
149 }
150 live.insert(ptr, slice);
151 }
152 self.bytes_outstanding.fetch_add(bytes, Ordering::Relaxed);
153
154 Ok(DeviceBlock {
155 ptr,
156 device_ordinal: self.device_ordinal,
157 alloc_stream: stream,
158 bytes,
159 align: std::mem::align_of::<u8>(),
160 tag,
161 generation: Generation::next(),
162 state: BlockState::Live,
163 })
164 }
165
166 fn deallocate(&self, block: DeviceBlock) -> ResourceResult<()> {
167 if block.device_ordinal != self.device_ordinal {
168 return Err(ResourceError::Driver(format!(
169 "DirectCudaResource: deallocate on wrong device (block ord {} vs resource ord {})",
170 block.device_ordinal, self.device_ordinal
171 )));
172 }
173
174 let removed = {
175 let mut live = self.live.lock().expect("live map poisoned");
176 live.remove(&block.ptr)
177 };
178 let slice = removed.ok_or(ResourceError::UseAfterFree {
179 generation: block.generation,
180 })?;
181
182 self.bytes_outstanding
183 .fetch_sub(block.bytes, Ordering::Relaxed);
184
185 // Dropping the `CudaSlice<u8>` invokes whichever cudarc free
186 // path matches the alloc path: `cuMemFreeAsync` on the
187 // device's default stream when the context supports
188 // async-alloc, the synchronous fallback otherwise. Either
189 // way the caller-supplied `block.alloc_stream` is **not**
190 // honored here — only `AsyncCudaResource` does that. If the
191 // caller has work queued on a non-default stream that
192 // touches this memory they were responsible for
193 // synchronizing before calling deallocate.
194 drop(slice);
195 Ok(())
196 }
197
198 fn device_ordinal(&self) -> u32 {
199 self.device_ordinal
200 }
201
202 fn bytes_outstanding(&self) -> usize {
203 self.bytes_outstanding.load(Ordering::Relaxed)
204 }
205}
206
#[cfg(test)]
mod tests {
    use super::*;

    /// Open CUDA device 0, or print a skip notice and return `None`
    /// when it cannot be opened. Every test in this module goes through
    /// this helper, so a skipped test is always visible in the output.
    fn try_device() -> Option<Arc<CudaDevice>> {
        match CudaDevice::new(0) {
            Ok(device) => Some(Arc::new(device)),
            Err(_) => {
                eprintln!("Skipping: no CUDA device");
                None
            }
        }
    }

    /// A block can be allocated and released, and the byte accounting
    /// follows both steps.
    #[test]
    fn allocate_then_deallocate_round_trips() {
        let Some(device) = try_device() else {
            return;
        };
        let r = DirectCudaResource::new(device, 0);
        assert_eq!(r.bytes_outstanding(), 0);

        let block = r
            .allocate(4096, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc");
        assert_eq!(block.bytes, 4096);
        assert_eq!(block.state, BlockState::Live);
        assert_eq!(r.bytes_outstanding(), 4096);

        r.deallocate(block).expect("dealloc");
        assert_eq!(r.bytes_outstanding(), 0);
    }

    /// Zero-byte requests are rejected and leave the accounting alone.
    #[test]
    fn zero_byte_allocate_rejects() {
        let Some(device) = try_device() else {
            return;
        };
        let r = DirectCudaResource::new(device, 0);
        let err = r.allocate(0, StreamId::DEFAULT, AllocTag::UNTAGGED);
        assert!(matches!(err, Err(ResourceError::Driver(_))));
        assert_eq!(r.bytes_outstanding(), 0);
    }

    /// Deallocating a pointer the resource never handed out reports
    /// `UseAfterFree` and does not touch the accounting.
    #[test]
    fn deallocate_unknown_block_returns_use_after_free() {
        let Some(device) = try_device() else {
            return;
        };
        let r = DirectCudaResource::new(device, 0);
        let bogus = DeviceBlock {
            ptr: 0xdead_beef,
            device_ordinal: 0,
            alloc_stream: StreamId::DEFAULT,
            bytes: 16,
            align: 1,
            tag: AllocTag::UNTAGGED,
            generation: Generation::next(),
            state: BlockState::Live,
        };
        assert!(matches!(
            r.deallocate(bogus),
            Err(ResourceError::UseAfterFree { .. })
        ));
        assert_eq!(r.bytes_outstanding(), 0);
    }

    /// A block stamped with a different device ordinal is rejected with
    /// a `Driver` error before the live map or the accounting is
    /// touched.
    #[test]
    fn deallocate_wrong_device_is_rejected() {
        let Some(device) = try_device() else {
            return;
        };
        let r = DirectCudaResource::new(device, 0);
        let foreign = DeviceBlock {
            ptr: 0xdead_beef,
            device_ordinal: 1,
            alloc_stream: StreamId::DEFAULT,
            bytes: 16,
            align: 1,
            tag: AllocTag::UNTAGGED,
            generation: Generation::next(),
            state: BlockState::Live,
        };
        assert!(matches!(
            r.deallocate(foreign),
            Err(ResourceError::Driver(_))
        ));
        assert_eq!(r.bytes_outstanding(), 0);
    }

    /// Locks the contract that DirectCudaResource does NOT silently
    /// accept `record_block_use`. If a caller (e.g. the future xlog
    /// launch builder) calls `record_block_use` against a runtime built
    /// around DirectCudaResource, the call must fail loudly with
    /// `StreamMisuse` instead of returning `Ok` while tracking nothing.
    /// False safety here would let downstream code queue cross-stream
    /// kernels and drop blocks whose cross-stream use was never
    /// recorded, reproducing exactly the use-after-free this layer
    /// exists to prevent.
    ///
    /// Implementation note: DirectCudaResource does not override
    /// `record_block_use`; it relies on the trait's default impl, which
    /// returns `StreamMisuse`. If a future change adds a real override,
    /// that override must genuinely track cross-stream uses (similar to
    /// AsyncCudaResource's implementation); anything else regresses
    /// this contract.
    #[test]
    fn record_block_use_rejected_with_stream_misuse() {
        let Some(device) = try_device() else {
            return;
        };
        let r = DirectCudaResource::new(device, 0);
        let block = r
            .allocate(64, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc");
        let err = r.record_block_use(&block, StreamId::DEFAULT);
        match err {
            Err(ResourceError::StreamMisuse(msg)) => {
                assert!(
                    msg.contains("unsupported"),
                    "expected 'unsupported' in StreamMisuse message, got {:?}",
                    msg
                );
            }
            other => panic!(
                "DirectCudaResource::record_block_use must return StreamMisuse \
                 to surface unsupported cross-stream tracking; got {:?}",
                other
            ),
        }
        // The block stays live: a failed record_block_use must NOT have
        // removed the entry or dropped the slice.
        assert_eq!(r.bytes_outstanding(), 64);
        r.deallocate(block).expect("dealloc still works");
    }
}
315}