Skip to main content

xlog_cuda_tests/categories/
c17_caching.rs

1//! Category 17: Caching and coherence
2//!
3//! Tests cache behavior and coherence, including cache line access patterns,
4//! cache reuse, cache thrashing scenarios, memory locality, and L2 cache effects.
5
6use crate::harness::{CategoryResult, TestContext, TestResult};
7use std::time::Instant;
8use xlog_core::{ScalarType, Schema};
9
10/// Run all tests in this category.
11pub fn run_all(ctx: &TestContext) -> CategoryResult {
12    let mut results = CategoryResult::new("c17_caching");
13    let start = Instant::now();
14
15    results.add_result(test_cache_line_access(ctx));
16    results.add_result(test_cache_reuse(ctx));
17    results.add_result(test_cache_thrashing(ctx));
18    results.add_result(test_memory_locality(ctx));
19    results.add_result(test_l2_cache_effects(ctx));
20
21    results.set_duration(start.elapsed());
22    results
23}
24
25/// Test 1: Test sizes aligned to cache lines (128 bytes).
26///
27/// GPU cache lines are typically 128 bytes. This test verifies operations
28/// work correctly with data sizes that are aligned to cache line boundaries.
29fn test_cache_line_access(ctx: &TestContext) -> TestResult {
30    let start = Instant::now();
31    let schema = Schema::new(vec![("val".to_string(), ScalarType::U32)]);
32
33    // Cache line is 128 bytes = 32 u32 values
34    const CACHE_LINE_U32S: usize = 32;
35
36    // Test various cache line aligned sizes
37    let test_sizes = [
38        CACHE_LINE_U32S,       // 1 cache line
39        CACHE_LINE_U32S * 2,   // 2 cache lines
40        CACHE_LINE_U32S * 4,   // 4 cache lines
41        CACHE_LINE_U32S * 16,  // 16 cache lines
42        CACHE_LINE_U32S * 64,  // 64 cache lines
43        CACHE_LINE_U32S * 256, // 256 cache lines
44    ];
45
46    for &size in &test_sizes {
47        // Create data aligned to cache line size
48        let data: Vec<u32> = (0..size)
49            .map(|i| ((i * 1103515245 + 12345) % 1000000) as u32)
50            .collect();
51
52        let buffer = match ctx
53            .provider
54            .create_buffer_from_slice::<u32>(&data, schema.clone())
55        {
56            Ok(buf) => buf,
57            Err(e) => {
58                return TestResult::error(
59                    "test_cache_line_access",
60                    start.elapsed(),
61                    format!("Failed to create buffer of size {}: {}", size, e),
62                )
63            }
64        };
65
66        // Sort operation to exercise cache
67        let sorted = match ctx.provider.sort(&buffer, &[0]) {
68            Ok(s) => s,
69            Err(e) => {
70                return TestResult::error(
71                    "test_cache_line_access",
72                    start.elapsed(),
73                    format!("Sort failed for size {}: {}", size, e),
74                )
75            }
76        };
77
78        // Verify sort correctness
79        let sorted_data = match ctx.provider.download_column::<u32>(&sorted, 0) {
80            Ok(d) => d,
81            Err(e) => {
82                return TestResult::error(
83                    "test_cache_line_access",
84                    start.elapsed(),
85                    format!("Download failed for size {}: {}", size, e),
86                )
87            }
88        };
89
90        if sorted_data.len() != size {
91            return TestResult::error(
92                "test_cache_line_access",
93                start.elapsed(),
94                format!(
95                    "Size {}: expected {} rows, got {}",
96                    size,
97                    size,
98                    sorted_data.len()
99                ),
100            );
101        }
102
103        for i in 1..sorted_data.len() {
104            if sorted_data[i] < sorted_data[i - 1] {
105                return TestResult::error(
106                    "test_cache_line_access",
107                    start.elapsed(),
108                    format!(
109                        "Size {}: sort incorrect at index {}: {} < {}",
110                        size,
111                        i,
112                        sorted_data[i],
113                        sorted_data[i - 1]
114                    ),
115                );
116            }
117        }
118
119        // Also test filter with cache-aligned data
120        let mask: Vec<u8> = (0..size).map(|i| if i % 2 == 0 { 1 } else { 0 }).collect();
121        let filtered = match ctx.provider.filter_by_mask(&buffer, &mask) {
122            Ok(f) => f,
123            Err(e) => {
124                return TestResult::error(
125                    "test_cache_line_access",
126                    start.elapsed(),
127                    format!("Filter failed for size {}: {}", size, e),
128                )
129            }
130        };
131
132        let expected_count = (size + 1) / 2;
133        if ctx.device_row_count(&filtered) != expected_count as u64 {
134            return TestResult::error(
135                "test_cache_line_access",
136                start.elapsed(),
137                format!(
138                    "Size {}: filter expected {} rows, got {}",
139                    size,
140                    expected_count,
141                    ctx.device_row_count(&filtered)
142                ),
143            );
144        }
145    }
146
147    // Test non-aligned sizes (to verify no issues with non-aligned access)
148    let non_aligned_sizes = [
149        CACHE_LINE_U32S + 1,
150        CACHE_LINE_U32S * 2 - 1,
151        CACHE_LINE_U32S * 4 + 7,
152        CACHE_LINE_U32S * 10 + 13,
153    ];
154
155    for &size in &non_aligned_sizes {
156        let data: Vec<u32> = (0..size).map(|i| ((i * 31337) % 100000) as u32).collect();
157
158        let buffer = match ctx
159            .provider
160            .create_buffer_from_slice::<u32>(&data, schema.clone())
161        {
162            Ok(buf) => buf,
163            Err(e) => {
164                return TestResult::error(
165                    "test_cache_line_access",
166                    start.elapsed(),
167                    format!(
168                        "Failed to create non-aligned buffer of size {}: {}",
169                        size, e
170                    ),
171                )
172            }
173        };
174
175        let sorted = match ctx.provider.sort(&buffer, &[0]) {
176            Ok(s) => s,
177            Err(e) => {
178                return TestResult::error(
179                    "test_cache_line_access",
180                    start.elapsed(),
181                    format!("Sort failed for non-aligned size {}: {}", size, e),
182                )
183            }
184        };
185
186        let sorted_data = match ctx.provider.download_column::<u32>(&sorted, 0) {
187            Ok(d) => d,
188            Err(e) => {
189                return TestResult::error(
190                    "test_cache_line_access",
191                    start.elapsed(),
192                    format!("Download failed for non-aligned size {}: {}", size, e),
193                )
194            }
195        };
196
197        for i in 1..sorted_data.len() {
198            if sorted_data[i] < sorted_data[i - 1] {
199                return TestResult::error(
200                    "test_cache_line_access",
201                    start.elapsed(),
202                    format!("Non-aligned size {}: sort incorrect at index {}", size, i),
203                );
204            }
205        }
206    }
207
208    if let Err(e) = ctx.sync_and_check() {
209        return TestResult::error(
210            "test_cache_line_access",
211            start.elapsed(),
212            format!("Sync failed: {}", e),
213        );
214    }
215
216    TestResult::passed("test_cache_line_access", start.elapsed())
217}
218
219/// Test 2: Run same operation multiple times to exercise cache reuse.
220///
221/// Tests cache efficiency by running the same operation repeatedly on the
222/// same data, which should benefit from cache warm-up effects.
223fn test_cache_reuse(ctx: &TestContext) -> TestResult {
224    let start = Instant::now();
225    let schema = Schema::new(vec![("val".to_string(), ScalarType::U32)]);
226
227    const SIZE: usize = 10000;
228    const ITERATIONS: usize = 10;
229
230    // Create data that should fit in cache after first access
231    let data: Vec<u32> = (0..SIZE).map(|i| ((i * 17 + 13) % 10000) as u32).collect();
232
233    let buffer = match ctx
234        .provider
235        .create_buffer_from_slice::<u32>(&data, schema.clone())
236    {
237        Ok(buf) => buf,
238        Err(e) => {
239            return TestResult::error(
240                "test_cache_reuse",
241                start.elapsed(),
242                format!("Failed to create buffer: {}", e),
243            )
244        }
245    };
246
247    // Run sort multiple times on same data - should benefit from cache
248    let mut first_result: Option<Vec<u32>> = None;
249
250    for i in 0..ITERATIONS {
251        let sorted = match ctx.provider.sort(&buffer, &[0]) {
252            Ok(s) => s,
253            Err(e) => {
254                return TestResult::error(
255                    "test_cache_reuse",
256                    start.elapsed(),
257                    format!("Iteration {}: sort failed: {}", i, e),
258                )
259            }
260        };
261
262        let result = match ctx.provider.download_column::<u32>(&sorted, 0) {
263            Ok(d) => d,
264            Err(e) => {
265                return TestResult::error(
266                    "test_cache_reuse",
267                    start.elapsed(),
268                    format!("Iteration {}: download failed: {}", i, e),
269                )
270            }
271        };
272
273        // Verify correctness
274        for j in 1..result.len() {
275            if result[j] < result[j - 1] {
276                return TestResult::error(
277                    "test_cache_reuse",
278                    start.elapsed(),
279                    format!("Iteration {}: sort incorrect at index {}", i, j),
280                );
281            }
282        }
283
284        // Verify consistency across iterations
285        match &first_result {
286            Some(first) => {
287                if result != *first {
288                    return TestResult::error(
289                        "test_cache_reuse",
290                        start.elapsed(),
291                        format!("Iteration {}: result differs from first iteration", i),
292                    );
293                }
294            }
295            None => {
296                first_result = Some(result);
297            }
298        }
299    }
300
301    // Test filter cache reuse
302    let mask: Vec<u8> = (0..SIZE).map(|i| if i % 3 == 0 { 1 } else { 0 }).collect();
303    let expected_count = (SIZE + 2) / 3;
304
305    for i in 0..ITERATIONS {
306        let filtered = match ctx.provider.filter_by_mask(&buffer, &mask) {
307            Ok(f) => f,
308            Err(e) => {
309                return TestResult::error(
310                    "test_cache_reuse",
311                    start.elapsed(),
312                    format!("Filter iteration {}: failed: {}", i, e),
313                )
314            }
315        };
316
317        if ctx.device_row_count(&filtered) != expected_count as u64 {
318            return TestResult::error(
319                "test_cache_reuse",
320                start.elapsed(),
321                format!(
322                    "Filter iteration {}: expected {} rows, got {}",
323                    i,
324                    expected_count,
325                    ctx.device_row_count(&filtered)
326                ),
327            );
328        }
329    }
330
331    // Test dedup cache reuse with duplicates
332    let schema2 = Schema::new(vec![
333        ("key".to_string(), ScalarType::U32),
334        ("val".to_string(), ScalarType::U32),
335    ]);
336
337    let keys: Vec<u32> = (0..SIZE).map(|i| (i % 1000) as u32).collect();
338    let vals: Vec<u32> = (0..SIZE as u32).collect();
339
340    let buffer2 = match ctx
341        .provider
342        .create_buffer_from_u32_columns(&[&keys, &vals], schema2)
343    {
344        Ok(buf) => buf,
345        Err(e) => {
346            return TestResult::error(
347                "test_cache_reuse",
348                start.elapsed(),
349                format!("Failed to create buffer2: {}", e),
350            )
351        }
352    };
353
354    for i in 0..ITERATIONS {
355        let deduped = match ctx.provider.dedup(&buffer2, &[0]) {
356            Ok(d) => d,
357            Err(e) => {
358                return TestResult::error(
359                    "test_cache_reuse",
360                    start.elapsed(),
361                    format!("Dedup iteration {}: failed: {}", i, e),
362                )
363            }
364        };
365
366        if ctx.device_row_count(&deduped) != 1000 {
367            return TestResult::error(
368                "test_cache_reuse",
369                start.elapsed(),
370                format!(
371                    "Dedup iteration {}: expected 1000 rows, got {}",
372                    i,
373                    ctx.device_row_count(&deduped)
374                ),
375            );
376        }
377    }
378
379    if let Err(e) = ctx.sync_and_check() {
380        return TestResult::error(
381            "test_cache_reuse",
382            start.elapsed(),
383            format!("Sync failed: {}", e),
384        );
385    }
386
387    TestResult::passed("test_cache_reuse", start.elapsed())
388}
389
390/// Test 3: Large data that exceeds cache capacity to test cache thrashing.
391///
392/// Tests behavior when working set exceeds cache capacity, which causes
393/// cache thrashing and higher memory bandwidth requirements.
394fn test_cache_thrashing(ctx: &TestContext) -> TestResult {
395    let start = Instant::now();
396    let schema = Schema::new(vec![("val".to_string(), ScalarType::U32)]);
397
398    // L2 cache is typically 2-6MB on modern GPUs
399    // 1M u32s = 4MB, 5M u32s = 20MB (definitely exceeds L2)
400    const LARGE_SIZE: usize = 5_000_000;
401    const MEDIUM_SIZE: usize = 1_000_000;
402
403    // Test with large data that exceeds L2 cache
404    let large_data: Vec<u32> = (0..LARGE_SIZE)
405        .map(|i| ((i * 1103515245 + 12345) % 10000000) as u32)
406        .collect();
407
408    let large_buffer = match ctx
409        .provider
410        .create_buffer_from_slice::<u32>(&large_data, schema.clone())
411    {
412        Ok(buf) => buf,
413        Err(e) => {
414            return TestResult::error(
415                "test_cache_thrashing",
416                start.elapsed(),
417                format!("Failed to create large buffer: {}", e),
418            )
419        }
420    };
421
422    // Sort large data (will cause cache thrashing)
423    let sorted = match ctx.provider.sort(&large_buffer, &[0]) {
424        Ok(s) => s,
425        Err(e) => {
426            return TestResult::error(
427                "test_cache_thrashing",
428                start.elapsed(),
429                format!("Large sort failed: {}", e),
430            )
431        }
432    };
433
434    // Verify correctness even with cache thrashing
435    let sorted_data = match ctx.provider.download_column::<u32>(&sorted, 0) {
436        Ok(d) => d,
437        Err(e) => {
438            return TestResult::error(
439                "test_cache_thrashing",
440                start.elapsed(),
441                format!("Large download failed: {}", e),
442            )
443        }
444    };
445
446    if sorted_data.len() != LARGE_SIZE {
447        return TestResult::error(
448            "test_cache_thrashing",
449            start.elapsed(),
450            format!(
451                "Large sort: expected {} rows, got {}",
452                LARGE_SIZE,
453                sorted_data.len()
454            ),
455        );
456    }
457
458    // Spot check sorted order (checking every element is slow)
459    for i in (1..sorted_data.len()).step_by(10000) {
460        if sorted_data[i] < sorted_data[i - 1] {
461            return TestResult::error(
462                "test_cache_thrashing",
463                start.elapsed(),
464                format!(
465                    "Large sort incorrect at index {}: {} < {}",
466                    i,
467                    sorted_data[i],
468                    sorted_data[i - 1]
469                ),
470            );
471        }
472    }
473
474    // Also check first and last segments thoroughly
475    for i in 1..1000.min(sorted_data.len()) {
476        if sorted_data[i] < sorted_data[i - 1] {
477            return TestResult::error(
478                "test_cache_thrashing",
479                start.elapsed(),
480                format!("Large sort incorrect at start index {}", i),
481            );
482        }
483    }
484
485    if sorted_data.len() > 1000 {
486        for i in (sorted_data.len() - 999)..sorted_data.len() {
487            if sorted_data[i] < sorted_data[i - 1] {
488                return TestResult::error(
489                    "test_cache_thrashing",
490                    start.elapsed(),
491                    format!("Large sort incorrect at end index {}", i),
492                );
493            }
494        }
495    }
496
497    // Test medium size that may partially fit in L2
498    let medium_data: Vec<u32> = (0..MEDIUM_SIZE)
499        .map(|i| ((i * 31337) % 1000000) as u32)
500        .collect();
501
502    let medium_buffer = match ctx
503        .provider
504        .create_buffer_from_slice::<u32>(&medium_data, schema.clone())
505    {
506        Ok(buf) => buf,
507        Err(e) => {
508            return TestResult::error(
509                "test_cache_thrashing",
510                start.elapsed(),
511                format!("Failed to create medium buffer: {}", e),
512            )
513        }
514    };
515
516    let sorted_medium = match ctx.provider.sort(&medium_buffer, &[0]) {
517        Ok(s) => s,
518        Err(e) => {
519            return TestResult::error(
520                "test_cache_thrashing",
521                start.elapsed(),
522                format!("Medium sort failed: {}", e),
523            )
524        }
525    };
526
527    let medium_result = match ctx.provider.download_column::<u32>(&sorted_medium, 0) {
528        Ok(d) => d,
529        Err(e) => {
530            return TestResult::error(
531                "test_cache_thrashing",
532                start.elapsed(),
533                format!("Medium download failed: {}", e),
534            )
535        }
536    };
537
538    for i in (1..medium_result.len()).step_by(1000) {
539        if medium_result[i] < medium_result[i - 1] {
540            return TestResult::error(
541                "test_cache_thrashing",
542                start.elapsed(),
543                format!("Medium sort incorrect at index {}", i),
544            );
545        }
546    }
547
548    // Filter on large data
549    let large_mask: Vec<u8> = (0..LARGE_SIZE)
550        .map(|i| if i % 4 == 0 { 1 } else { 0 })
551        .collect();
552    let filtered = match ctx.provider.filter_by_mask(&large_buffer, &large_mask) {
553        Ok(f) => f,
554        Err(e) => {
555            return TestResult::error(
556                "test_cache_thrashing",
557                start.elapsed(),
558                format!("Large filter failed: {}", e),
559            )
560        }
561    };
562
563    let expected_count = (LARGE_SIZE + 3) / 4;
564    if ctx.device_row_count(&filtered) != expected_count as u64 {
565        return TestResult::error(
566            "test_cache_thrashing",
567            start.elapsed(),
568            format!(
569                "Large filter: expected {} rows, got {}",
570                expected_count,
571                ctx.device_row_count(&filtered)
572            ),
573        );
574    }
575
576    if let Err(e) = ctx.sync_and_check() {
577        return TestResult::error(
578            "test_cache_thrashing",
579            start.elapsed(),
580            format!("Sync failed: {}", e),
581        );
582    }
583
584    TestResult::passed("test_cache_thrashing", start.elapsed())
585}
586
587/// Test 4: Test operations with good and bad memory locality.
588///
589/// Tests operations with sequential access patterns (good locality) versus
590/// random access patterns (bad locality) to verify correctness in both cases.
591fn test_memory_locality(ctx: &TestContext) -> TestResult {
592    let start = Instant::now();
593    let schema = Schema::new(vec![("val".to_string(), ScalarType::U32)]);
594
595    const SIZE: usize = 100000;
596
597    // Good locality: sequential data
598    let sequential_data: Vec<u32> = (0..SIZE as u32).collect();
599
600    let seq_buffer = match ctx
601        .provider
602        .create_buffer_from_slice::<u32>(&sequential_data, schema.clone())
603    {
604        Ok(buf) => buf,
605        Err(e) => {
606            return TestResult::error(
607                "test_memory_locality",
608                start.elapsed(),
609                format!("Failed to create sequential buffer: {}", e),
610            )
611        }
612    };
613
614    // Sort sequential data (already sorted - best case)
615    let sorted_seq = match ctx.provider.sort(&seq_buffer, &[0]) {
616        Ok(s) => s,
617        Err(e) => {
618            return TestResult::error(
619                "test_memory_locality",
620                start.elapsed(),
621                format!("Sort sequential failed: {}", e),
622            )
623        }
624    };
625
626    let seq_result = match ctx.provider.download_column::<u32>(&sorted_seq, 0) {
627        Ok(d) => d,
628        Err(e) => {
629            return TestResult::error(
630                "test_memory_locality",
631                start.elapsed(),
632                format!("Download sequential failed: {}", e),
633            )
634        }
635    };
636
637    // Verify sorted
638    for i in 0..seq_result.len() {
639        if seq_result[i] != i as u32 {
640            return TestResult::error(
641                "test_memory_locality",
642                start.elapsed(),
643                format!(
644                    "Sequential sort incorrect at index {}: expected {}, got {}",
645                    i, i, seq_result[i]
646                ),
647            );
648        }
649    }
650
651    // Bad locality: reverse sorted data
652    let reverse_data: Vec<u32> = (0..SIZE as u32).rev().collect();
653
654    let rev_buffer = match ctx
655        .provider
656        .create_buffer_from_slice::<u32>(&reverse_data, schema.clone())
657    {
658        Ok(buf) => buf,
659        Err(e) => {
660            return TestResult::error(
661                "test_memory_locality",
662                start.elapsed(),
663                format!("Failed to create reverse buffer: {}", e),
664            )
665        }
666    };
667
668    let sorted_rev = match ctx.provider.sort(&rev_buffer, &[0]) {
669        Ok(s) => s,
670        Err(e) => {
671            return TestResult::error(
672                "test_memory_locality",
673                start.elapsed(),
674                format!("Sort reverse failed: {}", e),
675            )
676        }
677    };
678
679    let rev_result = match ctx.provider.download_column::<u32>(&sorted_rev, 0) {
680        Ok(d) => d,
681        Err(e) => {
682            return TestResult::error(
683                "test_memory_locality",
684                start.elapsed(),
685                format!("Download reverse failed: {}", e),
686            )
687        }
688    };
689
690    // Verify sorted
691    for i in 0..rev_result.len() {
692        if rev_result[i] != i as u32 {
693            return TestResult::error(
694                "test_memory_locality",
695                start.elapsed(),
696                format!(
697                    "Reverse sort incorrect at index {}: expected {}, got {}",
698                    i, i, rev_result[i]
699                ),
700            );
701        }
702    }
703
704    // Worst locality: random access pattern
705    let random_data: Vec<u32> = (0..SIZE)
706        .map(|i| ((i * 1103515245 + 12345) % SIZE) as u32)
707        .collect();
708
709    let rand_buffer = match ctx
710        .provider
711        .create_buffer_from_slice::<u32>(&random_data, schema.clone())
712    {
713        Ok(buf) => buf,
714        Err(e) => {
715            return TestResult::error(
716                "test_memory_locality",
717                start.elapsed(),
718                format!("Failed to create random buffer: {}", e),
719            )
720        }
721    };
722
723    let sorted_rand = match ctx.provider.sort(&rand_buffer, &[0]) {
724        Ok(s) => s,
725        Err(e) => {
726            return TestResult::error(
727                "test_memory_locality",
728                start.elapsed(),
729                format!("Sort random failed: {}", e),
730            )
731        }
732    };
733
734    let rand_result = match ctx.provider.download_column::<u32>(&sorted_rand, 0) {
735        Ok(d) => d,
736        Err(e) => {
737            return TestResult::error(
738                "test_memory_locality",
739                start.elapsed(),
740                format!("Download random failed: {}", e),
741            )
742        }
743    };
744
745    // Verify sorted order
746    for i in 1..rand_result.len() {
747        if rand_result[i] < rand_result[i - 1] {
748            return TestResult::error(
749                "test_memory_locality",
750                start.elapsed(),
751                format!(
752                    "Random sort incorrect at index {}: {} < {}",
753                    i,
754                    rand_result[i],
755                    rand_result[i - 1]
756                ),
757            );
758        }
759    }
760
761    // Test filter with different locality patterns
762    // Sequential mask pattern (good locality)
763    let seq_mask: Vec<u8> = (0..SIZE)
764        .map(|i| if i < SIZE / 2 { 1 } else { 0 })
765        .collect();
766    let filtered_seq = match ctx.provider.filter_by_mask(&seq_buffer, &seq_mask) {
767        Ok(f) => f,
768        Err(e) => {
769            return TestResult::error(
770                "test_memory_locality",
771                start.elapsed(),
772                format!("Sequential filter failed: {}", e),
773            )
774        }
775    };
776
777    if ctx.device_row_count(&filtered_seq) != (SIZE / 2) as u64 {
778        return TestResult::error(
779            "test_memory_locality",
780            start.elapsed(),
781            format!(
782                "Sequential filter: expected {} rows, got {}",
783                SIZE / 2,
784                ctx.device_row_count(&filtered_seq)
785            ),
786        );
787    }
788
789    // Alternating mask pattern (stride access)
790    let alt_mask: Vec<u8> = (0..SIZE).map(|i| if i % 2 == 0 { 1 } else { 0 }).collect();
791    let filtered_alt = match ctx.provider.filter_by_mask(&seq_buffer, &alt_mask) {
792        Ok(f) => f,
793        Err(e) => {
794            return TestResult::error(
795                "test_memory_locality",
796                start.elapsed(),
797                format!("Alternating filter failed: {}", e),
798            )
799        }
800    };
801
802    let expected_alt = (SIZE + 1) / 2;
803    if ctx.device_row_count(&filtered_alt) != expected_alt as u64 {
804        return TestResult::error(
805            "test_memory_locality",
806            start.elapsed(),
807            format!(
808                "Alternating filter: expected {} rows, got {}",
809                expected_alt,
810                ctx.device_row_count(&filtered_alt)
811            ),
812        );
813    }
814
815    // Sparse mask (bad locality for output)
816    let sparse_mask: Vec<u8> = (0..SIZE)
817        .map(|i| if i % 100 == 0 { 1 } else { 0 })
818        .collect();
819    let filtered_sparse = match ctx.provider.filter_by_mask(&seq_buffer, &sparse_mask) {
820        Ok(f) => f,
821        Err(e) => {
822            return TestResult::error(
823                "test_memory_locality",
824                start.elapsed(),
825                format!("Sparse filter failed: {}", e),
826            )
827        }
828    };
829
830    let expected_sparse = (SIZE + 99) / 100;
831    if ctx.device_row_count(&filtered_sparse) != expected_sparse as u64 {
832        return TestResult::error(
833            "test_memory_locality",
834            start.elapsed(),
835            format!(
836                "Sparse filter: expected {} rows, got {}",
837                expected_sparse,
838                ctx.device_row_count(&filtered_sparse)
839            ),
840        );
841    }
842
843    if let Err(e) = ctx.sync_and_check() {
844        return TestResult::error(
845            "test_memory_locality",
846            start.elapsed(),
847            format!("Sync failed: {}", e),
848        );
849    }
850
851    TestResult::passed("test_memory_locality", start.elapsed())
852}
853
854/// Test 5: Test sizes that fit vs overflow L2 cache.
855///
856/// Tests operations with data sizes that fit within L2 cache (fast) versus
857/// sizes that overflow L2 cache (requires main memory access).
858fn test_l2_cache_effects(ctx: &TestContext) -> TestResult {
859    let start = Instant::now();
860    let schema = Schema::new(vec![("val".to_string(), ScalarType::U32)]);
861
862    // Typical L2 cache sizes: 2-6MB on modern GPUs
863    // Small: definitely fits in L2 (256KB = 64K u32s)
864    // Medium: might fit in L2 (2MB = 512K u32s)
865    // Large: definitely exceeds L2 (20MB = 5M u32s)
866
867    const SMALL_SIZE: usize = 64_000; // ~256KB
868    const MEDIUM_SIZE: usize = 512_000; // ~2MB
869    const LARGE_SIZE: usize = 2_000_000; // ~8MB
870
871    // Test small (L2 resident)
872    let small_data: Vec<u32> = (0..SMALL_SIZE)
873        .map(|i| ((i * 17 + 13) % SMALL_SIZE) as u32)
874        .collect();
875
876    let small_buffer = match ctx
877        .provider
878        .create_buffer_from_slice::<u32>(&small_data, schema.clone())
879    {
880        Ok(buf) => buf,
881        Err(e) => {
882            return TestResult::error(
883                "test_l2_cache_effects",
884                start.elapsed(),
885                format!("Failed to create small buffer: {}", e),
886            )
887        }
888    };
889
890    // Run multiple times to benefit from L2 caching
891    for i in 0..3 {
892        let sorted = match ctx.provider.sort(&small_buffer, &[0]) {
893            Ok(s) => s,
894            Err(e) => {
895                return TestResult::error(
896                    "test_l2_cache_effects",
897                    start.elapsed(),
898                    format!("Small sort iteration {} failed: {}", i, e),
899                )
900            }
901        };
902
903        let result = match ctx.provider.download_column::<u32>(&sorted, 0) {
904            Ok(d) => d,
905            Err(e) => {
906                return TestResult::error(
907                    "test_l2_cache_effects",
908                    start.elapsed(),
909                    format!("Small download iteration {} failed: {}", i, e),
910                )
911            }
912        };
913
914        for j in 1..result.len() {
915            if result[j] < result[j - 1] {
916                return TestResult::error(
917                    "test_l2_cache_effects",
918                    start.elapsed(),
919                    format!("Small sort iteration {} incorrect at index {}", i, j),
920                );
921            }
922        }
923    }
924
925    // Test medium (borderline L2)
926    let medium_data: Vec<u32> = (0..MEDIUM_SIZE)
927        .map(|i| ((i * 31337) % MEDIUM_SIZE) as u32)
928        .collect();
929
930    let medium_buffer = match ctx
931        .provider
932        .create_buffer_from_slice::<u32>(&medium_data, schema.clone())
933    {
934        Ok(buf) => buf,
935        Err(e) => {
936            return TestResult::error(
937                "test_l2_cache_effects",
938                start.elapsed(),
939                format!("Failed to create medium buffer: {}", e),
940            )
941        }
942    };
943
944    let sorted_medium = match ctx.provider.sort(&medium_buffer, &[0]) {
945        Ok(s) => s,
946        Err(e) => {
947            return TestResult::error(
948                "test_l2_cache_effects",
949                start.elapsed(),
950                format!("Medium sort failed: {}", e),
951            )
952        }
953    };
954
955    let medium_result = match ctx.provider.download_column::<u32>(&sorted_medium, 0) {
956        Ok(d) => d,
957        Err(e) => {
958            return TestResult::error(
959                "test_l2_cache_effects",
960                start.elapsed(),
961                format!("Medium download failed: {}", e),
962            )
963        }
964    };
965
966    for i in (1..medium_result.len()).step_by(1000) {
967        if medium_result[i] < medium_result[i - 1] {
968            return TestResult::error(
969                "test_l2_cache_effects",
970                start.elapsed(),
971                format!("Medium sort incorrect at index {}", i),
972            );
973        }
974    }
975
976    // Test large (exceeds L2)
977    let large_data: Vec<u32> = (0..LARGE_SIZE)
978        .map(|i| ((i * 1103515245 + 12345) % LARGE_SIZE) as u32)
979        .collect();
980
981    let large_buffer = match ctx
982        .provider
983        .create_buffer_from_slice::<u32>(&large_data, schema.clone())
984    {
985        Ok(buf) => buf,
986        Err(e) => {
987            return TestResult::error(
988                "test_l2_cache_effects",
989                start.elapsed(),
990                format!("Failed to create large buffer: {}", e),
991            )
992        }
993    };
994
995    let sorted_large = match ctx.provider.sort(&large_buffer, &[0]) {
996        Ok(s) => s,
997        Err(e) => {
998            return TestResult::error(
999                "test_l2_cache_effects",
1000                start.elapsed(),
1001                format!("Large sort failed: {}", e),
1002            )
1003        }
1004    };
1005
1006    let large_result = match ctx.provider.download_column::<u32>(&sorted_large, 0) {
1007        Ok(d) => d,
1008        Err(e) => {
1009            return TestResult::error(
1010                "test_l2_cache_effects",
1011                start.elapsed(),
1012                format!("Large download failed: {}", e),
1013            )
1014        }
1015    };
1016
1017    // Spot check large result
1018    for i in (1..large_result.len()).step_by(10000) {
1019        if large_result[i] < large_result[i - 1] {
1020            return TestResult::error(
1021                "test_l2_cache_effects",
1022                start.elapsed(),
1023                format!("Large sort incorrect at index {}", i),
1024            );
1025        }
1026    }
1027
1028    // Test interleaved operations at different cache levels
1029    // This exercises cache replacement policies
1030    let small2_data: Vec<u32> = (0..SMALL_SIZE).map(|i| (i * 3) as u32).collect();
1031    let small2_buffer = match ctx
1032        .provider
1033        .create_buffer_from_slice::<u32>(&small2_data, schema.clone())
1034    {
1035        Ok(buf) => buf,
1036        Err(e) => {
1037            return TestResult::error(
1038                "test_l2_cache_effects",
1039                start.elapsed(),
1040                format!("Failed to create small2 buffer: {}", e),
1041            )
1042        }
1043    };
1044
1045    // Interleave operations on small and medium buffers
1046    for i in 0..3 {
1047        // Small operation
1048        let small_sorted = match ctx.provider.sort(&small2_buffer, &[0]) {
1049            Ok(s) => s,
1050            Err(e) => {
1051                return TestResult::error(
1052                    "test_l2_cache_effects",
1053                    start.elapsed(),
1054                    format!("Interleaved small sort {} failed: {}", i, e),
1055                )
1056            }
1057        };
1058
1059        // Medium operation (may evict small from L2)
1060        let medium_sorted = match ctx.provider.sort(&medium_buffer, &[0]) {
1061            Ok(s) => s,
1062            Err(e) => {
1063                return TestResult::error(
1064                    "test_l2_cache_effects",
1065                    start.elapsed(),
1066                    format!("Interleaved medium sort {} failed: {}", i, e),
1067                )
1068            }
1069        };
1070
1071        // Verify both completed correctly
1072        if ctx.device_row_count(&small_sorted) != SMALL_SIZE as u64 {
1073            return TestResult::error(
1074                "test_l2_cache_effects",
1075                start.elapsed(),
1076                format!(
1077                    "Interleaved {}: small has {} rows, expected {}",
1078                    i,
1079                    ctx.device_row_count(&small_sorted),
1080                    SMALL_SIZE
1081                ),
1082            );
1083        }
1084
1085        if ctx.device_row_count(&medium_sorted) != MEDIUM_SIZE as u64 {
1086            return TestResult::error(
1087                "test_l2_cache_effects",
1088                start.elapsed(),
1089                format!(
1090                    "Interleaved {}: medium has {} rows, expected {}",
1091                    i,
1092                    ctx.device_row_count(&medium_sorted),
1093                    MEDIUM_SIZE
1094                ),
1095            );
1096        }
1097    }
1098
1099    if let Err(e) = ctx.sync_and_check() {
1100        return TestResult::error(
1101            "test_l2_cache_effects",
1102            start.elapsed(),
1103            format!("Sync failed: {}", e),
1104        );
1105    }
1106
1107    TestResult::passed("test_l2_cache_effects", start.elapsed())
1108}