xlog_cuda/device_runtime/
direct.rs

1//! [`DirectCudaResource`] — cudarc default (non-pooled) allocation
2//! backend.
3//!
4//! Each [`DeviceMemoryResource::allocate`] call goes through cudarc's
5//! `CudaDeviceInner::alloc::<u8>(bytes)`. cudarc itself routes that
6//! through `CudaStream::alloc` against the device's default stream,
7//! which forwards to **`cuMemAllocAsync` on contexts that support
8//! async-alloc** and falls back to a synchronous path otherwise.
9//! There is no `xlog`-level pooling or suballocation in this layer —
10//! every `allocate` is one cudarc call, every `deallocate` drops the
11//! resulting `CudaSlice<u8>` (which in turn invokes `cuMemFreeAsync`
12//! or the synchronous fallback that cudarc selected).
13//!
14//! Earlier revisions described this backend as "raw `cuMemAlloc` /
15//! `cuMemFree`". That was wrong. A genuine raw-driver direct backend
16//! (bypassing cudarc entirely) is a separate work item; until that
17//! exists, this backend is the **non-pooled default** — not a synchronous
18//! `cuMemAlloc`/`cuMemFree` adaptor — and it does not by itself
19//! guarantee that pool suballocation is absent from the underlying
20//! call path on a given host.
21//!
22//! **Sanitizer status: unproven.** The intent of having a non-pooled
23//! backend is that pool *suballocation* hides byte-level
24//! out-of-bounds access from Compute Sanitizer. The cudarc default
25//! path forwards to `cuMemAllocAsync`, which on async-alloc hosts is
26//! a stream-ordered allocator; whether that is sufficiently
27//! sanitizer-visible is exactly what the **manual Compute Sanitizer
28//! acceptance gate** is supposed to confirm on a supported host. Do
29//! not describe this backend as "sanitizer-certified" until that
30//! manual gate has produced a captured negative-test pass; until the
31//! gate lands, treat the sanitizer role as "candidate, not certified".
32//!
33//! Stream-ordered semantics: the backend records the caller-supplied
34//! `alloc_stream` on the returned [`DeviceBlock`] but does **not**
35//! attempt to bind the underlying cudarc allocation to that stream —
36//! cudarc allocates against the device's default stream regardless.
37//! Stream-ordered allocation/free that honors a caller-supplied
38//! [`StreamId`] is `AsyncCudaResource`'s responsibility (separate
39//! commit).
40
41use std::collections::HashMap;
42use std::sync::atomic::{AtomicUsize, Ordering};
43use std::sync::{Arc, Mutex};
44
45use cudarc::driver::CudaSlice;
46
47use super::resource::{
48    AllocTag, BlockState, DeviceBlock, DeviceMemoryResource, Generation, ResourceError,
49    ResourceResult, StreamId,
50};
51use crate::CudaDevice;
52
/// cudarc default (non-pooled) allocation adaptor. Holds the
/// underlying `CudaSlice<u8>` allocations alive in an internal map so
/// the runtime returns opaque [`DeviceBlock`]s to callers; on
/// deallocate the slice is dropped, which invokes whichever cudarc
/// free path matches the alloc path (`cuMemFreeAsync` on async-alloc
/// hosts, the synchronous fallback otherwise).
///
/// Concurrency: `Send + Sync`. The internal map is protected by a
/// `Mutex`. Allocate and deallocate are short-running map operations
/// plus the underlying CUDA call.
///
/// Field order is significant only in that Rust drops fields in
/// declaration order: `device` is released before the slices still
/// held in `live`.
pub struct DirectCudaResource {
    /// Device handle every allocation is made against; also handed
    /// out by [`DirectCudaResource::device`].
    device: Arc<CudaDevice>,
    /// CUDA ordinal supplied at construction. Stamped onto every
    /// returned [`DeviceBlock`] and compared against the block's
    /// ordinal on deallocate.
    device_ordinal: u32,
    /// Allocations currently owned by this resource, keyed by raw
    /// device pointer. `allocate` inserts an entry and `deallocate`
    /// removes it; as written here there is no intermediate
    /// "retired" state. Holding the slice keeps cudarc's free path
    /// from running until the entry is explicitly dropped on
    /// deallocate.
    live: Mutex<HashMap<u64, CudaSlice<u8>>>,
    /// Running total of bytes handed out by `allocate` and not yet
    /// returned through `deallocate`. Adjusted with relaxed atomic
    /// operations and read by `bytes_outstanding()` without taking
    /// the `live` mutex, so a concurrent reader may observe a value
    /// that momentarily lags the map.
    bytes_outstanding: AtomicUsize,
}
74
75impl DirectCudaResource {
76    /// Construct a resource bound to `device`. `device_ordinal` is the
77    /// CUDA ordinal for logging / multi-device disambiguation.
78    pub fn new(device: Arc<CudaDevice>, device_ordinal: u32) -> Self {
79        Self {
80            device,
81            device_ordinal,
82            live: Mutex::new(HashMap::new()),
83            bytes_outstanding: AtomicUsize::new(0),
84        }
85    }
86
87    /// Borrow the device handle. Tests and downstream resources use
88    /// this to launch kernels against the same device this resource
89    /// allocates on.
90    pub fn device(&self) -> &Arc<CudaDevice> {
91        &self.device
92    }
93}
94
95impl DeviceMemoryResource for DirectCudaResource {
96    fn allocate(
97        &self,
98        bytes: usize,
99        stream: StreamId,
100        tag: AllocTag,
101    ) -> ResourceResult<DeviceBlock> {
102        if bytes == 0 {
103            // Zero-byte allocations are not legal in CUDA; surface as
104            // a contract error rather than calling cuMemAlloc(0).
105            return Err(ResourceError::Driver(
106                "DirectCudaResource: zero-byte allocation not supported".to_string(),
107            ));
108        }
109
110        // SAFETY: the device handle is valid for the lifetime of
111        // `self`, and `bytes > 0` is checked above. cudarc's
112        // `CudaDeviceInner::alloc::<u8>(bytes)` forwards to
113        // `cuMemAllocAsync` (against the device's default stream)
114        // when the context supports async-alloc, otherwise to the
115        // synchronous fallback. Failure is propagated as
116        // `ResourceError::Driver`.
117        let slice = unsafe {
118            self.device.inner().alloc::<u8>(bytes).map_err(|e| {
119                ResourceError::Driver(format!("cudarc alloc::<u8>({}): {}", bytes, e))
120            })?
121        };
122
123        // Extract the raw device pointer. The "sync" handle returned by
124        // `device_ptr` is intentionally leaked — the slice's lifetime is
125        // managed by the map, not the sync.
126        let (raw_ptr, sync) =
127            <CudaSlice<u8> as cudarc::driver::DevicePtr<u8>>::device_ptr(&slice, slice.stream());
128        std::mem::forget(sync);
129        let ptr = raw_ptr;
130
131        {
132            let mut live = self.live.lock().expect("live map poisoned");
133            // The CUDA driver does not return the same byte address
134            // for two simultaneously live allocations. If our map
135            // already has this pointer, it indicates a bookkeeping
136            // bug or driver behavior we want to surface loudly.
137            // Use `contains_key` then `insert` so a (theoretical)
138            // collision returns `Err` without mutating the map —
139            // a `live.insert(ptr, slice).is_some()` pattern would
140            // replace the existing entry, drop the old slice (which
141            // calls cuMemFree on memory we still believe we own),
142            // and leave the new slice resident in `live` while we
143            // return Err. Avoid that here.
144            if live.contains_key(&ptr) {
145                return Err(ResourceError::Driver(format!(
146                    "DirectCudaResource: pointer collision on alloc ({:#x})",
147                    ptr
148                )));
149            }
150            live.insert(ptr, slice);
151        }
152        self.bytes_outstanding.fetch_add(bytes, Ordering::Relaxed);
153
154        Ok(DeviceBlock {
155            ptr,
156            device_ordinal: self.device_ordinal,
157            alloc_stream: stream,
158            bytes,
159            align: std::mem::align_of::<u8>(),
160            tag,
161            generation: Generation::next(),
162            state: BlockState::Live,
163        })
164    }
165
166    fn deallocate(&self, block: DeviceBlock) -> ResourceResult<()> {
167        if block.device_ordinal != self.device_ordinal {
168            return Err(ResourceError::Driver(format!(
169                "DirectCudaResource: deallocate on wrong device (block ord {} vs resource ord {})",
170                block.device_ordinal, self.device_ordinal
171            )));
172        }
173
174        let removed = {
175            let mut live = self.live.lock().expect("live map poisoned");
176            live.remove(&block.ptr)
177        };
178        let slice = removed.ok_or(ResourceError::UseAfterFree {
179            generation: block.generation,
180        })?;
181
182        self.bytes_outstanding
183            .fetch_sub(block.bytes, Ordering::Relaxed);
184
185        // Dropping the `CudaSlice<u8>` invokes whichever cudarc free
186        // path matches the alloc path: `cuMemFreeAsync` on the
187        // device's default stream when the context supports
188        // async-alloc, the synchronous fallback otherwise. Either
189        // way the caller-supplied `block.alloc_stream` is **not**
190        // honored here — only `AsyncCudaResource` does that. If the
191        // caller has work queued on a non-default stream that
192        // touches this memory they were responsible for
193        // synchronizing before calling deallocate.
194        drop(slice);
195        Ok(())
196    }
197
198    fn device_ordinal(&self) -> u32 {
199        self.device_ordinal
200    }
201
202    fn bytes_outstanding(&self) -> usize {
203        self.bytes_outstanding.load(Ordering::Relaxed)
204    }
205}
206
#[cfg(test)]
mod tests {
    use super::*;

    /// Try to open CUDA device 0; `None` when construction fails
    /// (for example on a host without a usable GPU).
    fn try_device() -> Option<Arc<CudaDevice>> {
        CudaDevice::new(0).ok().map(Arc::new)
    }

    /// Build a resource on device 0, or print a skip notice naming
    /// `test_name` and return `None`. Every test goes through this so
    /// a run on a GPU-less host is visibly a skip rather than a
    /// silent pass.
    fn resource_or_skip(test_name: &str) -> Option<DirectCudaResource> {
        match try_device() {
            Some(device) => Some(DirectCudaResource::new(device, 0)),
            None => {
                eprintln!("Skipping {test_name}: no CUDA device");
                None
            }
        }
    }

    /// One allocation followed by its deallocation must return the
    /// outstanding-bytes counter to zero.
    #[test]
    fn allocate_then_deallocate_round_trips() {
        let Some(r) = resource_or_skip("allocate_then_deallocate_round_trips") else {
            return;
        };
        assert_eq!(r.bytes_outstanding(), 0);

        let block = r
            .allocate(4096, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc");
        assert_eq!(block.bytes, 4096);
        assert_eq!(block.state, BlockState::Live);
        assert_eq!(r.bytes_outstanding(), 4096);

        r.deallocate(block).expect("dealloc");
        assert_eq!(r.bytes_outstanding(), 0);
    }

    /// A zero-byte request is rejected with a `Driver` error and
    /// leaves the counter untouched.
    #[test]
    fn zero_byte_allocate_rejects() {
        let Some(r) = resource_or_skip("zero_byte_allocate_rejects") else {
            return;
        };
        let err = r.allocate(0, StreamId::DEFAULT, AllocTag::UNTAGGED);
        assert!(matches!(err, Err(ResourceError::Driver(_))));
        assert_eq!(r.bytes_outstanding(), 0);
    }

    /// Deallocating a block this resource never produced reports
    /// `UseAfterFree`.
    #[test]
    fn deallocate_unknown_block_returns_use_after_free() {
        let Some(r) = resource_or_skip("deallocate_unknown_block_returns_use_after_free") else {
            return;
        };
        let bogus = DeviceBlock {
            ptr: 0xdead_beef,
            device_ordinal: 0,
            alloc_stream: StreamId::DEFAULT,
            bytes: 16,
            align: 1,
            tag: AllocTag::UNTAGGED,
            generation: Generation::next(),
            state: BlockState::Live,
        };
        assert!(matches!(
            r.deallocate(bogus),
            Err(ResourceError::UseAfterFree { .. })
        ));
    }

    /// Locks the contract that DirectCudaResource does NOT
    /// silently accept `record_block_use`. If a caller (e.g. the
    /// future xlog launch builder) calls record_block_use against
    /// a runtime built around DirectCudaResource, the call must
    /// fail loudly with StreamMisuse — not return Ok and quietly
    /// fail to track anything. False safety here would let
    /// downstream code queue cross-stream kernels and drop
    /// blocks while the cross-stream use was never recorded,
    /// reproducing exactly the use-after-free this whole layer
    /// exists to prevent.
    ///
    /// Implementation note: DirectCudaResource inherits the
    /// trait's default `record_block_use` impl (which returns
    /// `StreamMisuse`). It does NOT override. If a future change
    /// adds a real override, it must make the override
    /// genuinely track cross-stream uses (similar to
    /// AsyncCudaResource's implementation) — anything else
    /// regresses this contract.
    #[test]
    fn record_block_use_rejected_with_stream_misuse() {
        let Some(r) = resource_or_skip("record_block_use_rejected_with_stream_misuse") else {
            return;
        };
        let block = r
            .allocate(64, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc");
        let err = r.record_block_use(&block, StreamId::DEFAULT);
        match err {
            Err(ResourceError::StreamMisuse(msg)) => {
                assert!(
                    msg.contains("unsupported"),
                    "expected 'unsupported' in StreamMisuse message, got {:?}",
                    msg
                );
            }
            other => panic!(
                "DirectCudaResource::record_block_use must return StreamMisuse \
                 to surface unsupported cross-stream tracking; got {:?}",
                other
            ),
        }
        // The block stays live — a failed record_block_use must
        // NOT have removed the entry or dropped the slice.
        assert_eq!(r.bytes_outstanding(), 64);
        r.deallocate(block).expect("dealloc still works");
    }
}
xlog_cuda/device_runtime/direct.rs

xlog_cuda/device_runtime/
direct.rs