xlog_stats/stats.rs
1//! Core statistics types for GPU-resident relation metadata.
2//!
3//! This module provides statistics tracking for relations and columns that are
4//! used by the query optimizer and solver heuristics to make informed decisions
5//! about query execution strategies.
6
7use xlog_core::{RelId, ScalarType};
8
9/// GPU-resident relation statistics.
10///
11/// Tracks cardinality, memory usage, access patterns, and column-level statistics
12/// for relations stored on the GPU. These statistics drive optimizer cost models
13/// and solver heuristics for efficient query execution.
14#[derive(Debug, Clone)]
15pub struct RelationStats {
16 /// Unique identifier for the relation
17 pub rel_id: RelId,
18 /// Estimated number of rows in the relation
19 pub cardinality: u64,
20 /// Estimated total size in bytes on GPU
21 pub byte_size: u64,
22 /// Per-column statistics
23 pub column_stats: Vec<ColumnStats>,
24 /// Per-column prefix fan-out statistics for trie-style WCOJ planning.
25 pub prefix_degrees: Vec<PrefixDegreeStats>,
26 /// Per-column key heat/skew summaries for skew-aware WCOJ planning.
27 pub key_heats: Vec<KeyHeatStats>,
28 /// Access heat for LRU-style eviction (exponential moving average)
29 pub heat: f32,
30 /// Unix timestamp of last access
31 pub last_access: u64,
32 /// Whether an index exists for this relation
33 pub has_index: bool,
34}
35
36impl RelationStats {
37 /// Creates new statistics for a relation with default (empty) values.
38 ///
39 /// # Arguments
40 /// * `rel_id` - The unique identifier for the relation
41 ///
42 /// # Returns
43 /// A new `RelationStats` instance with zero cardinality, no columns, and cold heat.
44 pub fn new(rel_id: RelId) -> Self {
45 Self {
46 rel_id,
47 cardinality: 0,
48 byte_size: 0,
49 column_stats: Vec::new(),
50 prefix_degrees: Vec::new(),
51 key_heats: Vec::new(),
52 heat: 0.0,
53 last_access: 0,
54 has_index: false,
55 }
56 }
57
58 /// Updates the cardinality (row count) of the relation.
59 ///
60 /// This should be called after bulk loads, inserts, or when statistics
61 /// are refreshed from the actual GPU-resident data.
62 ///
63 /// # Arguments
64 /// * `rows` - The new cardinality estimate
65 pub fn update_cardinality(&mut self, rows: u64) {
66 self.cardinality = rows;
67 }
68
69 /// Updates the byte size estimate for the relation.
70 ///
71 /// # Arguments
72 /// * `bytes` - The estimated total size in bytes
73 pub fn update_byte_size(&mut self, bytes: u64) {
74 self.byte_size = bytes;
75 }
76
77 /// Records an access to this relation, updating heat and timestamp.
78 ///
79 /// Uses an exponential moving average for heat calculation:
80 /// `heat = heat * 0.9 + 0.1`
81 ///
82 /// This causes frequently accessed relations to maintain high heat
83 /// while infrequently accessed ones cool down over time.
84 pub fn record_access(&mut self) {
85 // Exponential moving average for heat
86 self.heat = self.heat * 0.9 + 0.1;
87 self.last_access = std::time::SystemTime::now()
88 .duration_since(std::time::UNIX_EPOCH)
89 .unwrap_or_default()
90 .as_secs();
91 }
92
93 /// Decays the heat by a multiplicative factor.
94 ///
95 /// This should be called periodically (e.g., during garbage collection
96 /// or memory pressure events) to allow unused relations to cool down.
97 ///
98 /// # Arguments
99 /// * `factor` - Multiplicative decay factor (typically 0.0 to 1.0)
100 pub fn decay_heat(&mut self, factor: f32) {
101 self.heat *= factor;
102 }
103
104 /// Adds column statistics for a new column.
105 ///
106 /// # Arguments
107 /// * `col_stats` - The column statistics to add
108 pub fn add_column(&mut self, col_stats: ColumnStats) {
109 self.column_stats.push(col_stats);
110 }
111
112 /// Gets column statistics by index.
113 ///
114 /// # Arguments
115 /// * `col_idx` - The column index
116 ///
117 /// # Returns
118 /// A reference to the column statistics if found
119 pub fn get_column(&self, col_idx: usize) -> Option<&ColumnStats> {
120 self.column_stats.iter().find(|c| c.col_idx == col_idx)
121 }
122
123 /// Gets mutable column statistics by index.
124 ///
125 /// # Arguments
126 /// * `col_idx` - The column index
127 ///
128 /// # Returns
129 /// A mutable reference to the column statistics if found
130 pub fn get_column_mut(&mut self, col_idx: usize) -> Option<&mut ColumnStats> {
131 self.column_stats.iter_mut().find(|c| c.col_idx == col_idx)
132 }
133
134 /// Adds prefix-degree statistics for a join-key column.
135 ///
136 /// Existing entries for the same column are retained; consumers use the first
137 /// matching entry so snapshots can preserve historical observations.
138 pub fn add_prefix_degree(&mut self, prefix_degree: PrefixDegreeStats) {
139 self.prefix_degrees.push(prefix_degree);
140 }
141
142 /// Gets prefix-degree statistics by column index.
143 pub fn get_prefix_degree(&self, col_idx: usize) -> Option<&PrefixDegreeStats> {
144 self.prefix_degrees.iter().find(|p| p.col_idx == col_idx)
145 }
146
147 /// Adds key-heat statistics for a join-key column.
148 ///
149 /// This is distinct from relation-level [`RelationStats::heat`]: relation heat
150 /// tracks access frequency, while key heat tracks per-key skew for a column.
151 pub fn add_key_heat(&mut self, key_heat: KeyHeatStats) {
152 self.key_heats.push(key_heat);
153 }
154
155 /// Gets key-heat statistics by column index.
156 pub fn get_key_heat(&self, col_idx: usize) -> Option<&KeyHeatStats> {
157 self.key_heats.iter().find(|h| h.col_idx == col_idx)
158 }
159
160 /// Estimates the selectivity for a given predicate cardinality.
161 ///
162 /// # Arguments
163 /// * `estimated_matches` - The estimated number of matching rows
164 ///
165 /// # Returns
166 /// The selectivity as a ratio (0.0 to 1.0)
167 pub fn estimate_selectivity(&self, estimated_matches: u64) -> f64 {
168 if self.cardinality == 0 {
169 return 1.0;
170 }
171 (estimated_matches as f64 / self.cardinality as f64).clamp(0.0, 1.0)
172 }
173}
174
/// Fan-out summary for the trie prefix formed by one relation column.
///
/// WCOJ planners read this as the prefix-degree signal: a low average and a
/// bounded maximum fan-out usually translate into less inner-loop work when a
/// variable order binds the column early.
#[derive(Debug, Clone)]
pub struct PrefixDegreeStats {
    /// Index of the column inside its relation.
    pub col_idx: usize,
    /// Mean number of rows sitting below one distinct prefix key.
    pub avg_degree: f64,
    /// Largest observed fan-out, used as a guard against skew.
    pub max_degree: f64,
}

impl PrefixDegreeStats {
    /// Bundles the given fan-out figures for column `col_idx`.
    ///
    /// The values are stored as given; no validation is applied.
    pub fn new(col_idx: usize, avg_degree: f64, max_degree: f64) -> Self {
        let stats = PrefixDegreeStats {
            col_idx,
            avg_degree,
            max_degree,
        };
        stats
    }
}
200
/// Key-frequency skew summary for one relation column.
///
/// Condenses how unevenly keys are distributed. Values close to zero mean the
/// column is cold/unskewed; larger values flag pivot-heavy keys that a
/// skew-aware WCOJ planner should demote.
#[derive(Debug, Clone)]
pub struct KeyHeatStats {
    /// Index of the column inside its relation.
    pub col_idx: usize,
    /// Heat of the heavy-key tail.
    pub heat: f64,
    /// Multiplicative skew factor of the heaviest keys observed.
    pub skew_factor: f64,
}

impl KeyHeatStats {
    /// Bundles the given heat and skew figures for column `col_idx`.
    ///
    /// The values are stored as given; no validation is applied.
    pub fn new(col_idx: usize, heat: f64, skew_factor: f64) -> Self {
        let stats = KeyHeatStats {
            col_idx,
            heat,
            skew_factor,
        };
        stats
    }
}
226
227/// Per-column statistics for optimizer cost estimation.
228///
229/// Tracks null counts, distinct value estimates, and value ranges for columns.
230/// These statistics enable the optimizer to estimate filter selectivity and
231/// join cardinalities.
232#[derive(Debug, Clone)]
233pub struct ColumnStats {
234 /// Column index within the relation
235 pub col_idx: usize,
236 /// Data type of the column
237 pub dtype: ScalarType,
238 /// Count of null values (for nullable columns)
239 pub null_count: u64,
240 /// HyperLogLog-style distinct value estimate
241 pub distinct_estimate: u64,
242 /// Minimum value (for orderable types, stored as i64)
243 pub min_value: Option<i64>,
244 /// Maximum value (for orderable types, stored as i64)
245 pub max_value: Option<i64>,
246 /// Average value length for variable-length types (e.g., symbols)
247 pub avg_width: Option<f32>,
248}
249
250impl ColumnStats {
251 /// Creates new column statistics with default values.
252 ///
253 /// # Arguments
254 /// * `col_idx` - The column index within the relation
255 /// * `dtype` - The scalar type of the column
256 ///
257 /// # Returns
258 /// A new `ColumnStats` instance with zero counts and no range information.
259 pub fn new(col_idx: usize, dtype: ScalarType) -> Self {
260 Self {
261 col_idx,
262 dtype,
263 null_count: 0,
264 distinct_estimate: 0,
265 min_value: None,
266 max_value: None,
267 avg_width: None,
268 }
269 }
270
271 /// Updates the distinct value estimate.
272 ///
273 /// This should be updated from HyperLogLog or similar cardinality estimation
274 /// algorithms running on the GPU.
275 ///
276 /// # Arguments
277 /// * `estimate` - The new distinct value estimate
278 pub fn update_distinct(&mut self, estimate: u64) {
279 self.distinct_estimate = estimate;
280 }
281
282 /// Updates the value range for this column.
283 ///
284 /// # Arguments
285 /// * `min` - The minimum value (encoded as i64)
286 /// * `max` - The maximum value (encoded as i64)
287 pub fn update_range(&mut self, min: i64, max: i64) {
288 self.min_value = Some(min);
289 self.max_value = Some(max);
290 }
291
292 /// Updates the null count for this column.
293 ///
294 /// # Arguments
295 /// * `count` - The number of null values
296 pub fn update_null_count(&mut self, count: u64) {
297 self.null_count = count;
298 }
299
300 /// Updates the average width for variable-length columns.
301 ///
302 /// # Arguments
303 /// * `width` - The average value width in bytes
304 pub fn update_avg_width(&mut self, width: f32) {
305 self.avg_width = Some(width);
306 }
307
308 /// Estimates selectivity for an equality predicate.
309 ///
310 /// Uses the distinct value count to estimate selectivity. If no distinct
311 /// count is available, returns a default estimate.
312 ///
313 /// # Arguments
314 /// * `total_rows` - The total number of rows in the relation
315 ///
316 /// # Returns
317 /// The estimated selectivity (0.0 to 1.0)
318 pub fn equality_selectivity(&self, total_rows: u64) -> f64 {
319 if self.distinct_estimate == 0 || total_rows == 0 {
320 // Default selectivity when no statistics available
321 return 0.1;
322 }
323 1.0 / self.distinct_estimate as f64
324 }
325
326 /// Estimates selectivity for a range predicate.
327 ///
328 /// Uses min/max values to estimate what fraction of the range is covered.
329 /// Returns a default estimate if range statistics are unavailable.
330 ///
331 /// # Arguments
332 /// * `low` - The lower bound of the range (inclusive)
333 /// * `high` - The upper bound of the range (inclusive)
334 ///
335 /// # Returns
336 /// The estimated selectivity (0.0 to 1.0)
337 pub fn range_selectivity(&self, low: i64, high: i64) -> f64 {
338 match (self.min_value, self.max_value) {
339 (Some(col_min), Some(col_max)) if col_max > col_min => {
340 let col_range = (col_max - col_min) as f64;
341 let effective_low = low.max(col_min);
342 let effective_high = high.min(col_max);
343 if effective_high < effective_low {
344 return 0.0;
345 }
346 let query_range = (effective_high - effective_low) as f64;
347 (query_range / col_range).clamp(0.0, 1.0)
348 }
349 _ => {
350 // Default range selectivity when no statistics available
351 0.25
352 }
353 }
354 }
355
356 /// Returns the storage size per value for this column type.
357 pub fn value_size_bytes(&self) -> usize {
358 self.dtype.size_bytes()
359 }
360}
361
362/// Join selectivity model for estimating join output cardinality.
363///
364/// Tracks information about joins between two relations, including the join
365/// keys and estimated selectivity. This is crucial for the optimizer to
366/// choose between nested-loop, hash, and sort-merge join strategies.
367#[derive(Debug, Clone)]
368pub struct JoinSelectivity {
369 /// Left relation in the join
370 pub left_rel: RelId,
371 /// Right relation in the join
372 pub right_rel: RelId,
373 /// Column indices used as join keys on the left relation
374 pub left_keys: Vec<usize>,
375 /// Column indices used as join keys on the right relation
376 pub right_keys: Vec<usize>,
377 /// Estimated selectivity factor (0.0 to 1.0)
378 pub selectivity: f64,
379 /// Whether this is a primary key to foreign key join
380 pub is_pk_fk: bool,
381 /// Cached join cardinality estimate (if computed)
382 cached_output_estimate: Option<u64>,
383}
384
385impl JoinSelectivity {
386 /// Creates a new join selectivity model between two relations.
387 ///
388 /// Initializes with default selectivity of 1.0 (cross product).
389 ///
390 /// # Arguments
391 /// * `left_rel` - The left relation's ID
392 /// * `right_rel` - The right relation's ID
393 ///
394 /// # Returns
395 /// A new `JoinSelectivity` with default values.
396 pub fn new(left_rel: RelId, right_rel: RelId) -> Self {
397 Self {
398 left_rel,
399 right_rel,
400 left_keys: Vec::new(),
401 right_keys: Vec::new(),
402 selectivity: 1.0,
403 is_pk_fk: false,
404 cached_output_estimate: None,
405 }
406 }
407
408 /// Sets the join keys for both relations.
409 ///
410 /// # Arguments
411 /// * `left_keys` - Column indices on the left relation
412 /// * `right_keys` - Column indices on the right relation
413 pub fn set_keys(&mut self, left_keys: Vec<usize>, right_keys: Vec<usize>) {
414 debug_assert_eq!(
415 left_keys.len(),
416 right_keys.len(),
417 "Join key counts must match"
418 );
419 self.left_keys = left_keys;
420 self.right_keys = right_keys;
421 self.cached_output_estimate = None;
422 }
423
424 /// Sets the selectivity factor.
425 ///
426 /// # Arguments
427 /// * `selectivity` - The selectivity factor (0.0 to 1.0)
428 pub fn set_selectivity(&mut self, selectivity: f64) {
429 self.selectivity = selectivity.clamp(0.0, 1.0);
430 self.cached_output_estimate = None;
431 }
432
433 /// Marks this as a primary key to foreign key join.
434 ///
435 /// PK-FK joins have special selectivity characteristics: the output
436 /// cardinality equals the FK side's cardinality.
437 pub fn mark_pk_fk(&mut self) {
438 self.is_pk_fk = true;
439 }
440
441 /// Estimates the output row count for this join.
442 ///
443 /// For PK-FK joins, returns the cardinality of the FK side.
444 /// For other joins, returns: left_rows * right_rows * selectivity
445 ///
446 /// # Arguments
447 /// * `left_rows` - Cardinality of the left relation
448 /// * `right_rows` - Cardinality of the right relation
449 ///
450 /// # Returns
451 /// The estimated output cardinality (minimum of 1)
452 pub fn estimate_output_rows(&self, left_rows: u64, right_rows: u64) -> u64 {
453 if self.is_pk_fk {
454 // FK side determines cardinality in PK-FK joins
455 // Conventionally, right side is FK
456 return right_rows;
457 }
458 ((left_rows as f64 * right_rows as f64 * self.selectivity) as u64).max(1)
459 }
460
461 /// Estimates selectivity from column statistics.
462 ///
463 /// Uses the "independence assumption" and distinct value counts:
464 /// selectivity = 1 / max(distinct_left, distinct_right)
465 ///
466 /// # Arguments
467 /// * `left_distinct` - Distinct value count for left join key
468 /// * `right_distinct` - Distinct value count for right join key
469 ///
470 /// # Returns
471 /// The estimated selectivity
472 pub fn estimate_selectivity_from_stats(left_distinct: u64, right_distinct: u64) -> f64 {
473 if left_distinct == 0 || right_distinct == 0 {
474 return 1.0;
475 }
476 1.0 / left_distinct.max(right_distinct) as f64
477 }
478
479 /// Updates selectivity based on observed join statistics.
480 ///
481 /// This can be called after query execution to improve future estimates.
482 ///
483 /// # Arguments
484 /// * `left_rows` - Actual left cardinality
485 /// * `right_rows` - Actual right cardinality
486 /// * `output_rows` - Actual output cardinality
487 pub fn update_from_observation(&mut self, left_rows: u64, right_rows: u64, output_rows: u64) {
488 let product = left_rows as f64 * right_rows as f64;
489 if product > 0.0 {
490 self.selectivity = (output_rows as f64 / product).clamp(0.0, 1.0);
491 self.cached_output_estimate = Some(output_rows);
492 }
493 }
494}
495
/// Unit tests for the statistics types in this module.
///
/// Besides the basic constructor and setter checks, these cover the
/// prefix-degree and key-heat accessors (including their first-match lookup
/// rule) and a few boundary cases of the selectivity estimators.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_relation_stats_new() {
        let stats = RelationStats::new(RelId(1));
        assert_eq!(stats.rel_id, RelId(1));
        assert_eq!(stats.cardinality, 0);
        assert_eq!(stats.heat, 0.0);
        assert_eq!(stats.byte_size, 0);
        assert_eq!(stats.last_access, 0);
        assert!(stats.column_stats.is_empty());
        assert!(stats.prefix_degrees.is_empty());
        assert!(stats.key_heats.is_empty());
        assert!(!stats.has_index);
    }

    #[test]
    fn test_relation_stats_update_cardinality() {
        let mut stats = RelationStats::new(RelId(1));
        stats.update_cardinality(1000);
        assert_eq!(stats.cardinality, 1000);
    }

    #[test]
    fn test_relation_stats_update_byte_size() {
        let mut stats = RelationStats::new(RelId(1));
        stats.update_byte_size(4096);
        assert_eq!(stats.byte_size, 4096);
    }

    #[test]
    fn test_relation_stats_update_heat() {
        let mut stats = RelationStats::new(RelId(1));
        assert_eq!(stats.heat, 0.0);

        stats.record_access();
        assert!(stats.heat > 0.0);
        let heat_after_first = stats.heat;
        assert!((heat_after_first - 0.1).abs() < 0.001);

        stats.record_access();
        assert!(stats.heat > heat_after_first);
        // After second access: 0.1 * 0.9 + 0.1 = 0.19
        assert!((stats.heat - 0.19).abs() < 0.001);

        // Verify last_access was set
        assert!(stats.last_access > 0);
    }

    #[test]
    fn test_relation_stats_decay_heat() {
        let mut stats = RelationStats::new(RelId(1));
        stats.record_access();
        stats.record_access();
        let initial_heat = stats.heat;

        stats.decay_heat(0.5);
        assert!((stats.heat - initial_heat * 0.5).abs() < 0.001);
    }

    #[test]
    fn test_relation_stats_column_management() {
        let mut stats = RelationStats::new(RelId(1));
        let col0 = ColumnStats::new(0, ScalarType::U32);
        let col1 = ColumnStats::new(1, ScalarType::I64);

        stats.add_column(col0);
        stats.add_column(col1);

        assert_eq!(stats.column_stats.len(), 2);
        assert!(stats.get_column(0).is_some());
        assert!(stats.get_column(1).is_some());
        assert!(stats.get_column(2).is_none());

        // Test mutable access
        if let Some(col) = stats.get_column_mut(0) {
            col.update_distinct(100);
        }
        assert_eq!(stats.get_column(0).unwrap().distinct_estimate, 100);
    }

    #[test]
    fn test_relation_stats_prefix_degree_lookup() {
        let mut stats = RelationStats::new(RelId(1));
        assert!(stats.get_prefix_degree(0).is_none());

        stats.add_prefix_degree(PrefixDegreeStats::new(0, 2.0, 8.0));
        stats.add_prefix_degree(PrefixDegreeStats::new(1, 4.0, 64.0));
        // A second entry for column 0 is retained, but lookups keep returning
        // the first one.
        stats.add_prefix_degree(PrefixDegreeStats::new(0, 3.0, 9.0));

        assert_eq!(stats.prefix_degrees.len(), 3);
        let first = stats.get_prefix_degree(0).unwrap();
        assert_eq!(first.avg_degree, 2.0);
        assert_eq!(first.max_degree, 8.0);
        assert_eq!(stats.get_prefix_degree(1).unwrap().max_degree, 64.0);
        assert!(stats.get_prefix_degree(2).is_none());
    }

    #[test]
    fn test_relation_stats_key_heat_lookup() {
        let mut stats = RelationStats::new(RelId(1));
        assert!(stats.get_key_heat(0).is_none());

        stats.add_key_heat(KeyHeatStats::new(0, 0.5, 10.0));
        stats.add_key_heat(KeyHeatStats::new(0, 0.9, 20.0));
        stats.add_key_heat(KeyHeatStats::new(3, 0.1, 1.5));

        assert_eq!(stats.key_heats.len(), 3);
        let first = stats.get_key_heat(0).unwrap();
        assert_eq!(first.heat, 0.5);
        assert_eq!(first.skew_factor, 10.0);
        assert_eq!(stats.get_key_heat(3).unwrap().skew_factor, 1.5);
        assert!(stats.get_key_heat(1).is_none());

        // Key heat is independent of the relation-level access heat.
        assert_eq!(stats.heat, 0.0);
    }

    #[test]
    fn test_relation_stats_estimate_selectivity() {
        let mut stats = RelationStats::new(RelId(1));
        stats.update_cardinality(1000);

        // 100 matches out of 1000 = 0.1 selectivity
        let sel = stats.estimate_selectivity(100);
        assert!((sel - 0.1).abs() < 0.001);

        // More matches than rows is clamped to 1.0
        assert_eq!(stats.estimate_selectivity(5000), 1.0);

        // Edge case: zero cardinality
        let empty_stats = RelationStats::new(RelId(2));
        assert_eq!(empty_stats.estimate_selectivity(50), 1.0);
    }

    #[test]
    fn test_prefix_degree_stats_new() {
        let prefix = PrefixDegreeStats::new(2, 1.5, 12.0);
        assert_eq!(prefix.col_idx, 2);
        assert_eq!(prefix.avg_degree, 1.5);
        assert_eq!(prefix.max_degree, 12.0);
    }

    #[test]
    fn test_key_heat_stats_new() {
        let key_heat = KeyHeatStats::new(4, 0.25, 3.0);
        assert_eq!(key_heat.col_idx, 4);
        assert_eq!(key_heat.heat, 0.25);
        assert_eq!(key_heat.skew_factor, 3.0);
    }

    #[test]
    fn test_column_stats_new() {
        let col = ColumnStats::new(0, ScalarType::U32);
        assert_eq!(col.col_idx, 0);
        assert_eq!(col.dtype, ScalarType::U32);
        assert_eq!(col.distinct_estimate, 0);
        assert_eq!(col.null_count, 0);
        assert!(col.min_value.is_none());
        assert!(col.max_value.is_none());
        assert!(col.avg_width.is_none());
    }

    #[test]
    fn test_column_stats_update_distinct() {
        let mut col = ColumnStats::new(0, ScalarType::U32);
        col.update_distinct(500);
        assert_eq!(col.distinct_estimate, 500);
    }

    #[test]
    fn test_column_stats_update_range() {
        let mut col = ColumnStats::new(0, ScalarType::I32);
        col.update_range(-100, 100);
        assert_eq!(col.min_value, Some(-100));
        assert_eq!(col.max_value, Some(100));
    }

    #[test]
    fn test_column_stats_update_null_count() {
        let mut col = ColumnStats::new(0, ScalarType::U32);
        col.update_null_count(42);
        assert_eq!(col.null_count, 42);
    }

    #[test]
    fn test_column_stats_update_avg_width() {
        let mut col = ColumnStats::new(0, ScalarType::Symbol);
        col.update_avg_width(12.5);
        assert_eq!(col.avg_width, Some(12.5));
    }

    #[test]
    fn test_column_stats_equality_selectivity() {
        let mut col = ColumnStats::new(0, ScalarType::U32);
        col.update_distinct(100);

        let sel = col.equality_selectivity(1000);
        assert!((sel - 0.01).abs() < 0.0001); // 1/100 = 0.01

        // Edge case: empty relation falls back to the default
        assert_eq!(col.equality_selectivity(0), 0.1);

        // Edge case: no distinct estimate
        let empty_col = ColumnStats::new(1, ScalarType::U32);
        assert_eq!(empty_col.equality_selectivity(1000), 0.1); // default
    }

    #[test]
    fn test_column_stats_range_selectivity() {
        let mut col = ColumnStats::new(0, ScalarType::I64);
        col.update_range(0, 100);

        // Query for [25, 75] on column with range [0, 100]
        let sel = col.range_selectivity(25, 75);
        assert!((sel - 0.5).abs() < 0.001); // (75-25)/100 = 0.5

        // Query outside range
        let sel_outside = col.range_selectivity(200, 300);
        assert_eq!(sel_outside, 0.0);

        // Query partially overlapping
        let sel_partial = col.range_selectivity(50, 150);
        assert!((sel_partial - 0.5).abs() < 0.001); // (100-50)/100 = 0.5

        // Query covering the whole column range
        assert_eq!(col.range_selectivity(-50, 500), 1.0);

        // Inverted query bounds select nothing
        assert_eq!(col.range_selectivity(75, 25), 0.0);

        // No range stats available
        let empty_col = ColumnStats::new(1, ScalarType::I64);
        assert_eq!(empty_col.range_selectivity(0, 100), 0.25); // default
    }

    #[test]
    fn test_column_stats_value_size() {
        assert_eq!(ColumnStats::new(0, ScalarType::U32).value_size_bytes(), 4);
        assert_eq!(ColumnStats::new(0, ScalarType::U64).value_size_bytes(), 8);
        assert_eq!(ColumnStats::new(0, ScalarType::Bool).value_size_bytes(), 1);
    }

    #[test]
    fn test_join_selectivity_new() {
        let js = JoinSelectivity::new(RelId(1), RelId(2));
        assert_eq!(js.left_rel, RelId(1));
        assert_eq!(js.right_rel, RelId(2));
        assert!(js.left_keys.is_empty());
        assert!(js.right_keys.is_empty());
        assert_eq!(js.selectivity, 1.0);
        assert!(!js.is_pk_fk);
    }

    #[test]
    fn test_join_selectivity_set_keys() {
        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
        js.set_keys(vec![0, 1], vec![0, 1]);
        assert_eq!(js.left_keys, vec![0, 1]);
        assert_eq!(js.right_keys, vec![0, 1]);
    }

    #[test]
    fn test_join_selectivity_set_selectivity() {
        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
        js.set_selectivity(0.01);
        assert!((js.selectivity - 0.01).abs() < 0.0001);

        // Test clamping
        js.set_selectivity(2.0);
        assert_eq!(js.selectivity, 1.0);

        js.set_selectivity(-1.0);
        assert_eq!(js.selectivity, 0.0);
    }

    #[test]
    fn test_join_selectivity_estimate_output_rows() {
        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
        js.set_selectivity(0.01);

        // 1000 * 500 * 0.01 = 5000
        let output = js.estimate_output_rows(1000, 500);
        assert_eq!(output, 5000);

        // Test minimum of 1
        js.set_selectivity(0.0);
        let output_min = js.estimate_output_rows(10, 10);
        assert_eq!(output_min, 1);
    }

    #[test]
    fn test_join_selectivity_pk_fk() {
        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
        js.mark_pk_fk();
        assert!(js.is_pk_fk);

        // PK-FK join: output = FK side cardinality
        let output = js.estimate_output_rows(100, 500);
        assert_eq!(output, 500); // FK side (right) cardinality
    }

    #[test]
    fn test_join_selectivity_estimate_from_stats() {
        // Selectivity = 1 / max(100, 200) = 0.005
        let sel = JoinSelectivity::estimate_selectivity_from_stats(100, 200);
        assert!((sel - 0.005).abs() < 0.0001);

        // Edge case: zero distinct
        let sel_zero = JoinSelectivity::estimate_selectivity_from_stats(0, 100);
        assert_eq!(sel_zero, 1.0);
    }

    #[test]
    fn test_join_selectivity_update_from_observation() {
        let mut js = JoinSelectivity::new(RelId(1), RelId(2));
        js.update_from_observation(1000, 500, 2500);

        // Observed selectivity = 2500 / (1000 * 500) = 0.005
        assert!((js.selectivity - 0.005).abs() < 0.0001);

        // An observation with an empty input carries no information and must
        // leave the selectivity untouched.
        js.update_from_observation(0, 500, 10);
        assert!((js.selectivity - 0.005).abs() < 0.0001);
    }

    #[test]
    fn test_all_scalar_types_column_stats() {
        // Ensure we can create column stats for all scalar types
        let types = [
            ScalarType::U32,
            ScalarType::U64,
            ScalarType::I32,
            ScalarType::I64,
            ScalarType::F32,
            ScalarType::F64,
            ScalarType::Bool,
            ScalarType::Symbol,
        ];

        for (idx, dtype) in types.iter().enumerate() {
            let col = ColumnStats::new(idx, *dtype);
            assert_eq!(col.col_idx, idx);
            assert_eq!(col.dtype, *dtype);
            assert!(col.value_size_bytes() > 0);
        }
    }
}
773}