Skip to main content

xlog_stats/
stats.rs

//! Core statistics types for GPU-resident relation metadata.
//!
//! This module provides statistics tracking for relations and columns that are
//! used by the query optimizer and solver heuristics to make informed decisions
//! about query execution strategies.
6
7use xlog_core::{RelId, ScalarType};
8
9/// GPU-resident relation statistics.
10///
11/// Tracks cardinality, memory usage, access patterns, and column-level statistics
12/// for relations stored on the GPU. These statistics drive optimizer cost models
13/// and solver heuristics for efficient query execution.
14#[derive(Debug, Clone)]
15pub struct RelationStats {
16    /// Unique identifier for the relation
17    pub rel_id: RelId,
18    /// Estimated number of rows in the relation
19    pub cardinality: u64,
20    /// Estimated total size in bytes on GPU
21    pub byte_size: u64,
22    /// Per-column statistics
23    pub column_stats: Vec<ColumnStats>,
24    /// Per-column prefix fan-out statistics for trie-style WCOJ planning.
25    pub prefix_degrees: Vec<PrefixDegreeStats>,
26    /// Per-column key heat/skew summaries for skew-aware WCOJ planning.
27    pub key_heats: Vec<KeyHeatStats>,
28    /// Access heat for LRU-style eviction (exponential moving average)
29    pub heat: f32,
30    /// Unix timestamp of last access
31    pub last_access: u64,
32    /// Whether an index exists for this relation
33    pub has_index: bool,
34}
35
36impl RelationStats {
37    /// Creates new statistics for a relation with default (empty) values.
38    ///
39    /// # Arguments
40    /// * `rel_id` - The unique identifier for the relation
41    ///
42    /// # Returns
43    /// A new `RelationStats` instance with zero cardinality, no columns, and cold heat.
44    pub fn new(rel_id: RelId) -> Self {
45        Self {
46            rel_id,
47            cardinality: 0,
48            byte_size: 0,
49            column_stats: Vec::new(),
50            prefix_degrees: Vec::new(),
51            key_heats: Vec::new(),
52            heat: 0.0,
53            last_access: 0,
54            has_index: false,
55        }
56    }
57
58    /// Updates the cardinality (row count) of the relation.
59    ///
60    /// This should be called after bulk loads, inserts, or when statistics
61    /// are refreshed from the actual GPU-resident data.
62    ///
63    /// # Arguments
64    /// * `rows` - The new cardinality estimate
65    pub fn update_cardinality(&mut self, rows: u64) {
66        self.cardinality = rows;
67    }
68
69    /// Updates the byte size estimate for the relation.
70    ///
71    /// # Arguments
72    /// * `bytes` - The estimated total size in bytes
73    pub fn update_byte_size(&mut self, bytes: u64) {
74        self.byte_size = bytes;
75    }
76
77    /// Records an access to this relation, updating heat and timestamp.
78    ///
79    /// Uses an exponential moving average for heat calculation:
80    /// `heat = heat * 0.9 + 0.1`
81    ///
82    /// This causes frequently accessed relations to maintain high heat
83    /// while infrequently accessed ones cool down over time.
84    pub fn record_access(&mut self) {
85        // Exponential moving average for heat
86        self.heat = self.heat * 0.9 + 0.1;
87        self.last_access = std::time::SystemTime::now()
88            .duration_since(std::time::UNIX_EPOCH)
89            .unwrap_or_default()
90            .as_secs();
91    }
92
93    /// Decays the heat by a multiplicative factor.
94    ///
95    /// This should be called periodically (e.g., during garbage collection
96    /// or memory pressure events) to allow unused relations to cool down.
97    ///
98    /// # Arguments
99    /// * `factor` - Multiplicative decay factor (typically 0.0 to 1.0)
100    pub fn decay_heat(&mut self, factor: f32) {
101        self.heat *= factor;
102    }
103
104    /// Adds column statistics for a new column.
105    ///
106    /// # Arguments
107    /// * `col_stats` - The column statistics to add
108    pub fn add_column(&mut self, col_stats: ColumnStats) {
109        self.column_stats.push(col_stats);
110    }
111
112    /// Gets column statistics by index.
113    ///
114    /// # Arguments
115    /// * `col_idx` - The column index
116    ///
117    /// # Returns
118    /// A reference to the column statistics if found
119    pub fn get_column(&self, col_idx: usize) -> Option<&ColumnStats> {
120        self.column_stats.iter().find(|c| c.col_idx == col_idx)
121    }
122
123    /// Gets mutable column statistics by index.
124    ///
125    /// # Arguments
126    /// * `col_idx` - The column index
127    ///
128    /// # Returns
129    /// A mutable reference to the column statistics if found
130    pub fn get_column_mut(&mut self, col_idx: usize) -> Option<&mut ColumnStats> {
131        self.column_stats.iter_mut().find(|c| c.col_idx == col_idx)
132    }
133
134    /// Adds prefix-degree statistics for a join-key column.
135    ///
136    /// Existing entries for the same column are retained; consumers use the first
137    /// matching entry so snapshots can preserve historical observations.
138    pub fn add_prefix_degree(&mut self, prefix_degree: PrefixDegreeStats) {
139        self.prefix_degrees.push(prefix_degree);
140    }
141
142    /// Gets prefix-degree statistics by column index.
143    pub fn get_prefix_degree(&self, col_idx: usize) -> Option<&PrefixDegreeStats> {
144        self.prefix_degrees.iter().find(|p| p.col_idx == col_idx)
145    }
146
147    /// Adds key-heat statistics for a join-key column.
148    ///
149    /// This is distinct from relation-level [`RelationStats::heat`]: relation heat
150    /// tracks access frequency, while key heat tracks per-key skew for a column.
151    pub fn add_key_heat(&mut self, key_heat: KeyHeatStats) {
152        self.key_heats.push(key_heat);
153    }
154
155    /// Gets key-heat statistics by column index.
156    pub fn get_key_heat(&self, col_idx: usize) -> Option<&KeyHeatStats> {
157        self.key_heats.iter().find(|h| h.col_idx == col_idx)
158    }
159
160    /// Estimates the selectivity for a given predicate cardinality.
161    ///
162    /// # Arguments
163    /// * `estimated_matches` - The estimated number of matching rows
164    ///
165    /// # Returns
166    /// The selectivity as a ratio (0.0 to 1.0)
167    pub fn estimate_selectivity(&self, estimated_matches: u64) -> f64 {
168        if self.cardinality == 0 {
169            return 1.0;
170        }
171        (estimated_matches as f64 / self.cardinality as f64).clamp(0.0, 1.0)
172    }
173}
174
/// Fan-out summary for one relation column when it is used as a trie prefix.
///
/// WCOJ planners read this as their prefix-degree signal: a low average and a
/// bounded maximum fan-out usually mean less inner-loop work for a variable
/// order that binds this column early.
#[derive(Debug, Clone)]
pub struct PrefixDegreeStats {
    /// Index of the column within its relation.
    pub col_idx: usize,
    /// Average number of rows found below one distinct prefix key.
    pub avg_degree: f64,
    /// High-water fan-out, used as a guard against skew.
    pub max_degree: f64,
}

impl PrefixDegreeStats {
    /// Bundles the given fan-out figures for column `col_idx`.
    ///
    /// The values are stored as passed; no validation is performed.
    pub fn new(col_idx: usize, avg_degree: f64, max_degree: f64) -> Self {
        PrefixDegreeStats {
            col_idx,
            avg_degree,
            max_degree,
        }
    }
}
200
/// Key-level heat/skew summary for one relation column.
///
/// Condenses how unevenly key frequencies are distributed. Values near zero
/// mean cold/unskewed; larger values point to pivot-heavy keys that a
/// skew-aware WCOJ planner should demote.
#[derive(Debug, Clone)]
pub struct KeyHeatStats {
    /// Index of the column within its relation.
    pub col_idx: usize,
    /// Heat value of the heavy-key tail.
    pub heat: f64,
    /// Multiplicative skew factor of the heaviest keys observed.
    pub skew_factor: f64,
}

impl KeyHeatStats {
    /// Bundles the given heat and skew figures for column `col_idx`.
    ///
    /// The values are stored as passed; no validation is performed.
    pub fn new(col_idx: usize, heat: f64, skew_factor: f64) -> Self {
        KeyHeatStats {
            col_idx,
            heat,
            skew_factor,
        }
    }
}
226
227/// Per-column statistics for optimizer cost estimation.
228///
229/// Tracks null counts, distinct value estimates, and value ranges for columns.
230/// These statistics enable the optimizer to estimate filter selectivity and
231/// join cardinalities.
232#[derive(Debug, Clone)]
233pub struct ColumnStats {
234    /// Column index within the relation
235    pub col_idx: usize,
236    /// Data type of the column
237    pub dtype: ScalarType,
238    /// Count of null values (for nullable columns)
239    pub null_count: u64,
240    /// HyperLogLog-style distinct value estimate
241    pub distinct_estimate: u64,
242    /// Minimum value (for orderable types, stored as i64)
243    pub min_value: Option<i64>,
244    /// Maximum value (for orderable types, stored as i64)
245    pub max_value: Option<i64>,
246    /// Average value length for variable-length types (e.g., symbols)
247    pub avg_width: Option<f32>,
248}
249
250impl ColumnStats {
251    /// Creates new column statistics with default values.
252    ///
253    /// # Arguments
254    /// * `col_idx` - The column index within the relation
255    /// * `dtype` - The scalar type of the column
256    ///
257    /// # Returns
258    /// A new `ColumnStats` instance with zero counts and no range information.
259    pub fn new(col_idx: usize, dtype: ScalarType) -> Self {
260        Self {
261            col_idx,
262            dtype,
263            null_count: 0,
264            distinct_estimate: 0,
265            min_value: None,
266            max_value: None,
267            avg_width: None,
268        }
269    }
270
271    /// Updates the distinct value estimate.
272    ///
273    /// This should be updated from HyperLogLog or similar cardinality estimation
274    /// algorithms running on the GPU.
275    ///
276    /// # Arguments
277    /// * `estimate` - The new distinct value estimate
278    pub fn update_distinct(&mut self, estimate: u64) {
279        self.distinct_estimate = estimate;
280    }
281
282    /// Updates the value range for this column.
283    ///
284    /// # Arguments
285    /// * `min` - The minimum value (encoded as i64)
286    /// * `max` - The maximum value (encoded as i64)
287    pub fn update_range(&mut self, min: i64, max: i64) {
288        self.min_value = Some(min);
289        self.max_value = Some(max);
290    }
291
292    /// Updates the null count for this column.
293    ///
294    /// # Arguments
295    /// * `count` - The number of null values
296    pub fn update_null_count(&mut self, count: u64) {
297        self.null_count = count;
298    }
299
300    /// Updates the average width for variable-length columns.
301    ///
302    /// # Arguments
303    /// * `width` - The average value width in bytes
304    pub fn update_avg_width(&mut self, width: f32) {
305        self.avg_width = Some(width);
306    }
307
308    /// Estimates selectivity for an equality predicate.
309    ///
310    /// Uses the distinct value count to estimate selectivity. If no distinct
311    /// count is available, returns a default estimate.
312    ///
313    /// # Arguments
314    /// * `total_rows` - The total number of rows in the relation
315    ///
316    /// # Returns
317    /// The estimated selectivity (0.0 to 1.0)
318    pub fn equality_selectivity(&self, total_rows: u64) -> f64 {
319        if self.distinct_estimate == 0 || total_rows == 0 {
320            // Default selectivity when no statistics available
321            return 0.1;
322        }
323        1.0 / self.distinct_estimate as f64
324    }
325
326    /// Estimates selectivity for a range predicate.
327    ///
328    /// Uses min/max values to estimate what fraction of the range is covered.
329    /// Returns a default estimate if range statistics are unavailable.
330    ///
331    /// # Arguments
332    /// * `low` - The lower bound of the range (inclusive)
333    /// * `high` - The upper bound of the range (inclusive)
334    ///
335    /// # Returns
336    /// The estimated selectivity (0.0 to 1.0)
337    pub fn range_selectivity(&self, low: i64, high: i64) -> f64 {
338        match (self.min_value, self.max_value) {
339            (Some(col_min), Some(col_max)) if col_max > col_min => {
340                let col_range = (col_max - col_min) as f64;
341                let effective_low = low.max(col_min);
342                let effective_high = high.min(col_max);
343                if effective_high < effective_low {
344                    return 0.0;
345                }
346                let query_range = (effective_high - effective_low) as f64;
347                (query_range / col_range).clamp(0.0, 1.0)
348            }
349            _ => {
350                // Default range selectivity when no statistics available
351                0.25
352            }
353        }
354    }
355
356    /// Returns the storage size per value for this column type.
357    pub fn value_size_bytes(&self) -> usize {
358        self.dtype.size_bytes()
359    }
360}
361
362/// Join selectivity model for estimating join output cardinality.
363///
364/// Tracks information about joins between two relations, including the join
365/// keys and estimated selectivity. This is crucial for the optimizer to
366/// choose between nested-loop, hash, and sort-merge join strategies.
367#[derive(Debug, Clone)]
368pub struct JoinSelectivity {
369    /// Left relation in the join
370    pub left_rel: RelId,
371    /// Right relation in the join
372    pub right_rel: RelId,
373    /// Column indices used as join keys on the left relation
374    pub left_keys: Vec<usize>,
375    /// Column indices used as join keys on the right relation
376    pub right_keys: Vec<usize>,
377    /// Estimated selectivity factor (0.0 to 1.0)
378    pub selectivity: f64,
379    /// Whether this is a primary key to foreign key join
380    pub is_pk_fk: bool,
381    /// Cached join cardinality estimate (if computed)
382    cached_output_estimate: Option<u64>,
383}
384
385impl JoinSelectivity {
386    /// Creates a new join selectivity model between two relations.
387    ///
388    /// Initializes with default selectivity of 1.0 (cross product).
389    ///
390    /// # Arguments
391    /// * `left_rel` - The left relation's ID
392    /// * `right_rel` - The right relation's ID
393    ///
394    /// # Returns
395    /// A new `JoinSelectivity` with default values.
396    pub fn new(left_rel: RelId, right_rel: RelId) -> Self {
397        Self {
398            left_rel,
399            right_rel,
400            left_keys: Vec::new(),
401            right_keys: Vec::new(),
402            selectivity: 1.0,
403            is_pk_fk: false,
404            cached_output_estimate: None,
405        }
406    }
407
408    /// Sets the join keys for both relations.
409    ///
410    /// # Arguments
411    /// * `left_keys` - Column indices on the left relation
412    /// * `right_keys` - Column indices on the right relation
413    pub fn set_keys(&mut self, left_keys: Vec<usize>, right_keys: Vec<usize>) {
414        debug_assert_eq!(
415            left_keys.len(),
416            right_keys.len(),
417            "Join key counts must match"
418        );
419        self.left_keys = left_keys;
420        self.right_keys = right_keys;
421        self.cached_output_estimate = None;
422    }
423
424    /// Sets the selectivity factor.
425    ///
426    /// # Arguments
427    /// * `selectivity` - The selectivity factor (0.0 to 1.0)
428    pub fn set_selectivity(&mut self, selectivity: f64) {
429        self.selectivity = selectivity.clamp(0.0, 1.0);
430        self.cached_output_estimate = None;
431    }
432
433    /// Marks this as a primary key to foreign key join.
434    ///
435    /// PK-FK joins have special selectivity characteristics: the output
436    /// cardinality equals the FK side's cardinality.
437    pub fn mark_pk_fk(&mut self) {
438        self.is_pk_fk = true;
439    }
440
441    /// Estimates the output row count for this join.
442    ///
443    /// For PK-FK joins, returns the cardinality of the FK side.
444    /// For other joins, returns: left_rows * right_rows * selectivity
445    ///
446    /// # Arguments
447    /// * `left_rows` - Cardinality of the left relation
448    /// * `right_rows` - Cardinality of the right relation
449    ///
450    /// # Returns
451    /// The estimated output cardinality (minimum of 1)
452    pub fn estimate_output_rows(&self, left_rows: u64, right_rows: u64) -> u64 {
453        if self.is_pk_fk {
454            // FK side determines cardinality in PK-FK joins
455            // Conventionally, right side is FK
456            return right_rows;
457        }
458        ((left_rows as f64 * right_rows as f64 * self.selectivity) as u64).max(1)
459    }
460
461    /// Estimates selectivity from column statistics.
462    ///
463    /// Uses the "independence assumption" and distinct value counts:
464    /// selectivity = 1 / max(distinct_left, distinct_right)
465    ///
466    /// # Arguments
467    /// * `left_distinct` - Distinct value count for left join key
468    /// * `right_distinct` - Distinct value count for right join key
469    ///
470    /// # Returns
471    /// The estimated selectivity
472    pub fn estimate_selectivity_from_stats(left_distinct: u64, right_distinct: u64) -> f64 {
473        if left_distinct == 0 || right_distinct == 0 {
474            return 1.0;
475        }
476        1.0 / left_distinct.max(right_distinct) as f64
477    }
478
479    /// Updates selectivity based on observed join statistics.
480    ///
481    /// This can be called after query execution to improve future estimates.
482    ///
483    /// # Arguments
484    /// * `left_rows` - Actual left cardinality
485    /// * `right_rows` - Actual right cardinality
486    /// * `output_rows` - Actual output cardinality
487    pub fn update_from_observation(&mut self, left_rows: u64, right_rows: u64, output_rows: u64) {
488        let product = left_rows as f64 * right_rows as f64;
489        if product > 0.0 {
490            self.selectivity = (output_rows as f64 / product).clamp(0.0, 1.0);
491            self.cached_output_estimate = Some(output_rows);
492        }
493    }
494}
495
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `actual` lies strictly within `tolerance` of `expected`.
    fn assert_close(actual: f64, expected: f64, tolerance: f64) {
        assert!(
            (actual - expected).abs() < tolerance,
            "expected {} within {} of {}",
            actual,
            tolerance,
            expected
        );
    }

    // --- RelationStats -----------------------------------------------------

    #[test]
    fn test_relation_stats_new() {
        let fresh = RelationStats::new(RelId(1));

        assert_eq!(fresh.rel_id, RelId(1));
        assert_eq!(fresh.cardinality, 0);
        assert_eq!(fresh.heat, 0.0);
        assert_eq!(fresh.byte_size, 0);
        assert!(fresh.column_stats.is_empty());
        assert!(!fresh.has_index);
    }

    #[test]
    fn test_relation_stats_update_cardinality() {
        let mut rel = RelationStats::new(RelId(1));
        rel.update_cardinality(1000);
        assert_eq!(rel.cardinality, 1000);
    }

    #[test]
    fn test_relation_stats_update_byte_size() {
        let mut rel = RelationStats::new(RelId(1));
        rel.update_byte_size(4096);
        assert_eq!(rel.byte_size, 4096);
    }

    #[test]
    fn test_relation_stats_update_heat() {
        let mut rel = RelationStats::new(RelId(1));
        assert_eq!(rel.heat, 0.0);

        // First access: 0.0 * 0.9 + 0.1 = 0.1
        rel.record_access();
        let first_heat = rel.heat;
        assert!(first_heat > 0.0);
        assert!((first_heat - 0.1).abs() < 0.001);

        // Second access: 0.1 * 0.9 + 0.1 = 0.19
        rel.record_access();
        assert!(rel.heat > first_heat);
        assert!((rel.heat - 0.19).abs() < 0.001);

        // The access timestamp must have been stamped as well.
        assert!(rel.last_access > 0);
    }

    #[test]
    fn test_relation_stats_decay_heat() {
        let mut rel = RelationStats::new(RelId(1));
        for _ in 0..2 {
            rel.record_access();
        }
        let heat_before = rel.heat;

        rel.decay_heat(0.5);
        assert!((rel.heat - heat_before * 0.5).abs() < 0.001);
    }

    #[test]
    fn test_relation_stats_column_management() {
        let mut rel = RelationStats::new(RelId(1));
        rel.add_column(ColumnStats::new(0, ScalarType::U32));
        rel.add_column(ColumnStats::new(1, ScalarType::I64));

        assert_eq!(rel.column_stats.len(), 2);
        assert!(rel.get_column(0).is_some());
        assert!(rel.get_column(1).is_some());
        assert!(rel.get_column(2).is_none());

        // Mutation through the mutable accessor must be visible afterwards.
        if let Some(first) = rel.get_column_mut(0) {
            first.update_distinct(100);
        }
        assert_eq!(rel.get_column(0).unwrap().distinct_estimate, 100);
    }

    #[test]
    fn test_relation_stats_estimate_selectivity() {
        let mut rel = RelationStats::new(RelId(1));
        rel.update_cardinality(1000);

        // 100 of 1000 rows match.
        assert_close(rel.estimate_selectivity(100), 0.1, 0.001);

        // Zero cardinality falls back to 1.0.
        let empty = RelationStats::new(RelId(2));
        assert_eq!(empty.estimate_selectivity(50), 1.0);
    }

    // --- ColumnStats -------------------------------------------------------

    #[test]
    fn test_column_stats_new() {
        let fresh = ColumnStats::new(0, ScalarType::U32);

        assert_eq!(fresh.col_idx, 0);
        assert_eq!(fresh.dtype, ScalarType::U32);
        assert_eq!(fresh.distinct_estimate, 0);
        assert_eq!(fresh.null_count, 0);
        assert!(fresh.min_value.is_none());
        assert!(fresh.max_value.is_none());
        assert!(fresh.avg_width.is_none());
    }

    #[test]
    fn test_column_stats_update_distinct() {
        let mut column = ColumnStats::new(0, ScalarType::U32);
        column.update_distinct(500);
        assert_eq!(column.distinct_estimate, 500);
    }

    #[test]
    fn test_column_stats_update_range() {
        let mut column = ColumnStats::new(0, ScalarType::I32);
        column.update_range(-100, 100);
        assert_eq!(column.min_value, Some(-100));
        assert_eq!(column.max_value, Some(100));
    }

    #[test]
    fn test_column_stats_update_null_count() {
        let mut column = ColumnStats::new(0, ScalarType::U32);
        column.update_null_count(42);
        assert_eq!(column.null_count, 42);
    }

    #[test]
    fn test_column_stats_update_avg_width() {
        let mut column = ColumnStats::new(0, ScalarType::Symbol);
        column.update_avg_width(12.5);
        assert_eq!(column.avg_width, Some(12.5));
    }

    #[test]
    fn test_column_stats_equality_selectivity() {
        let mut column = ColumnStats::new(0, ScalarType::U32);
        column.update_distinct(100);

        // 1 / 100 distinct values.
        assert_close(column.equality_selectivity(1000), 0.01, 0.0001);

        // Without a distinct estimate the default applies.
        let unknown = ColumnStats::new(1, ScalarType::U32);
        assert_eq!(unknown.equality_selectivity(1000), 0.1);
    }

    #[test]
    fn test_column_stats_range_selectivity() {
        let mut column = ColumnStats::new(0, ScalarType::I64);
        column.update_range(0, 100);

        // [25, 75] inside [0, 100]: (75 - 25) / 100.
        assert_close(column.range_selectivity(25, 75), 0.5, 0.001);

        // Entirely outside the column range.
        assert_eq!(column.range_selectivity(200, 300), 0.0);

        // [50, 150] overlaps only [50, 100]: (100 - 50) / 100.
        assert_close(column.range_selectivity(50, 150), 0.5, 0.001);

        // Without range statistics the default applies.
        let unknown = ColumnStats::new(1, ScalarType::I64);
        assert_eq!(unknown.range_selectivity(0, 100), 0.25);
    }

    #[test]
    fn test_column_stats_value_size() {
        let expectations = [
            (ScalarType::U32, 4),
            (ScalarType::U64, 8),
            (ScalarType::Bool, 1),
        ];
        for (dtype, expected_bytes) in expectations.iter() {
            assert_eq!(
                ColumnStats::new(0, *dtype).value_size_bytes(),
                *expected_bytes
            );
        }
    }

    // --- JoinSelectivity ---------------------------------------------------

    #[test]
    fn test_join_selectivity_new() {
        let join = JoinSelectivity::new(RelId(1), RelId(2));

        assert_eq!(join.left_rel, RelId(1));
        assert_eq!(join.right_rel, RelId(2));
        assert!(join.left_keys.is_empty());
        assert!(join.right_keys.is_empty());
        assert_eq!(join.selectivity, 1.0);
        assert!(!join.is_pk_fk);
    }

    #[test]
    fn test_join_selectivity_set_keys() {
        let mut join = JoinSelectivity::new(RelId(1), RelId(2));
        join.set_keys(vec![0, 1], vec![0, 1]);
        assert_eq!(join.left_keys, vec![0, 1]);
        assert_eq!(join.right_keys, vec![0, 1]);
    }

    #[test]
    fn test_join_selectivity_set_selectivity() {
        let mut join = JoinSelectivity::new(RelId(1), RelId(2));
        join.set_selectivity(0.01);
        assert_close(join.selectivity, 0.01, 0.0001);

        // Out-of-range inputs are clamped to the nearest bound.
        join.set_selectivity(2.0);
        assert_eq!(join.selectivity, 1.0);

        join.set_selectivity(-1.0);
        assert_eq!(join.selectivity, 0.0);
    }

    #[test]
    fn test_join_selectivity_estimate_output_rows() {
        let mut join = JoinSelectivity::new(RelId(1), RelId(2));
        join.set_selectivity(0.01);

        // 1000 * 500 * 0.01
        assert_eq!(join.estimate_output_rows(1000, 500), 5000);

        // A zero selectivity still yields the minimum estimate of 1.
        join.set_selectivity(0.0);
        assert_eq!(join.estimate_output_rows(10, 10), 1);
    }

    #[test]
    fn test_join_selectivity_pk_fk() {
        let mut join = JoinSelectivity::new(RelId(1), RelId(2));
        join.mark_pk_fk();
        assert!(join.is_pk_fk);

        // PK-FK: the FK (right) side determines the output cardinality.
        assert_eq!(join.estimate_output_rows(100, 500), 500);
    }

    #[test]
    fn test_join_selectivity_estimate_from_stats() {
        // 1 / max(100, 200)
        let estimated = JoinSelectivity::estimate_selectivity_from_stats(100, 200);
        assert_close(estimated, 0.005, 0.0001);

        // A zero distinct count falls back to 1.0.
        let fallback = JoinSelectivity::estimate_selectivity_from_stats(0, 100);
        assert_eq!(fallback, 1.0);
    }

    #[test]
    fn test_join_selectivity_update_from_observation() {
        let mut join = JoinSelectivity::new(RelId(1), RelId(2));
        join.update_from_observation(1000, 500, 2500);

        // 2500 / (1000 * 500)
        assert_close(join.selectivity, 0.005, 0.0001);
    }

    #[test]
    fn test_all_scalar_types_column_stats() {
        // Column stats must be constructible for every scalar type.
        let all_types = [
            ScalarType::U32,
            ScalarType::U64,
            ScalarType::I32,
            ScalarType::I64,
            ScalarType::F32,
            ScalarType::F64,
            ScalarType::Bool,
            ScalarType::Symbol,
        ];

        for (position, scalar) in all_types.iter().enumerate() {
            let column = ColumnStats::new(position, *scalar);
            assert_eq!(column.col_idx, position);
            assert_eq!(column.dtype, *scalar);
            assert!(column.value_size_bytes() > 0);
        }
    }
}