// xlog_cuda/kernel_manifest_data.rs

// Single source of truth for CUDA kernel modules.
//
// This file is consumed by both `build.rs` (via include!()) and `lib.rs`
// (via `pub mod kernel_manifest_data`) to avoid the kernel list being
// duplicated in two places.
// NOTE: Use regular comments (//), NOT inner doc comments (//!), because
// include!() in build.rs would interpret //! as documenting the wrong item.

/// Module names matching the .cu filenames (without extension).
///
/// Order matches provider/mod.rs load order. All 25 modules are listed; the
/// `tests` module at the end of this file pins both the count and the
/// entry-by-entry agreement with `KERNEL_MODULES`.
pub const KERNEL_CU_NAMES: &[&str] = &[
    "join",
    "dedup",
    "groupby",
    "scan",
    "sort",
    "filter",
    "set_ops",
    "pack",
    "pir",
    "cnf",
    "cache",
    "weights",
    "circuit",
    "mc_sample",
    "mc_eval",
    "arith",
    "sat",
    "d4",
    "neural",
    "ilp",
    "ilp_credit",
    "ilp_exact",
    "epistemic",
    "wcoj",
    "mc_resident",
];

/// Describes a single CUDA module: the .cu file name, the runtime module name
/// used by cudarc, and the list of kernel function entry points within.
///
/// Every field is a `'static` borrow, so the spec is cheap to copy and can be
/// compared or printed directly in diagnostics and tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct KernelModuleSpec {
    /// Name of the `.cu` source file, without the extension.
    pub cu_name: &'static str,
    /// Runtime module name used by cudarc.
    pub module_name: &'static str,
    /// Kernel function entry points contained in the module.
    pub kernels: &'static [&'static str],
}

47/// All kernel modules with their entry-point function names.
48/// Order and cu_names match `KERNEL_CU_NAMES`.
49pub const KERNEL_MODULES: &[KernelModuleSpec] = &[
50    KernelModuleSpec {
51        cu_name: "join",
52        module_name: "xlog_join",
53        kernels: &[
54            "hash_join_build",
55            "hash_join_probe",
56            "compute_composite_hash",
57            "hash_join_bucket_count_v2",
58            "hash_join_scatter_v2",
59            "hash_join_probe_v2",
60            "hash_join_probe_v2_count_per_row",
61            "hash_join_probe_v2_materialize",
62            "hash_join_total_from_scan",
63            "hash_join_csm_unmatched_mask",
64            "hash_join_semi",
65            "hash_join_anti",
66            "init_hash_table",
67            // Nested-loop inner join production operator (emit-pairs design).
68            "nested_loop_join_inner_u32_1key_pairs",
69            // Sort-merge inner join provider-level operator (emit-pairs design,
70            // caller-asserted pre-sorted inputs).
71            "sort_merge_join_inner_u32_1key_pairs",
72        ],
73    },
74    KernelModuleSpec {
75        cu_name: "dedup",
76        module_name: "xlog_dedup",
77        kernels: &[
78            "mark_duplicates",
79            "mark_unique_columnar",
80            "mark_unique_and_scan_columnar",
81            "compact_rows",
82            "mark_unique_full_row_bytewise",
83            "mark_diff_full_row_typed_sorted",
84            "small_sort_full_row_indices_typed",
85        ],
86    },
87    KernelModuleSpec {
88        cu_name: "groupby",
89        module_name: "xlog_groupby",
90        kernels: &[
91            "detect_group_boundaries",
92            "detect_boundaries",
93            "extract_group_keys",
94            "group_ids_from_boundaries",
95            "group_start_indices",
96            "capture_num_groups",
97            "groupby_count",
98            "groupby_sum",
99            "groupby_sum_u64",
100            "groupby_min",
101            "groupby_min_u64",
102            "groupby_max",
103            "groupby_max_u64",
104            "groupby_logsumexp_max",
105            "groupby_logsumexp_sumexp",
106            "groupby_logsumexp_final",
107        ],
108    },
109    KernelModuleSpec {
110        cu_name: "scan",
111        module_name: "xlog_scan",
112        kernels: &[
113            "block_inclusive_scan",
114            "add_block_offsets",
115            "exclusive_scan_mask",
116            "count_mask",
117            "multiblock_scan_phase1",
118            "multiblock_scan_u32_phase1",
119            "multiblock_scan_phase2",
120            "multiblock_scan_phase3",
121        ],
122    },
123    KernelModuleSpec {
124        cu_name: "sort",
125        module_name: "xlog_sort",
126        kernels: &[
127            "radix_histogram",
128            "radix_scatter",
129            "compute_ranks",
130            "radix_scatter_stable",
131            "compute_digit_prefix_sums",
132            "init_indices",
133            "apply_permutation_u32",
134            "apply_permutation_bytes",
135            "gather_keys_i32_ordered_u32",
136            "gather_keys_f32_ordered_u32",
137            "gather_keys_bool_ordered_u32",
138            "gather_keys_u64_lo_u32",
139            "gather_keys_u64_hi_u32",
140            "gather_keys_i64_lo_u32",
141            "gather_keys_i64_hi_u32",
142            "gather_keys_f64_lo_u32",
143            "gather_keys_f64_hi_u32",
144            // Sort-merge sortedness-detection kernel used by provider-level
145            // callers before invoking the sort-merge join.
146            "check_ascending_sorted_u32",
147        ],
148    },
149    KernelModuleSpec {
150        cu_name: "filter",
151        module_name: "xlog_filter",
152        kernels: &[
153            "filter_compare_u32",
154            "filter_compare_i64",
155            "filter_compare_f64",
156            "filter_compare_i32",
157            "filter_compare_u64",
158            "filter_compare_f32",
159            "filter_compare_u8",
160            "filter_compare_u32_scan_phase1",
161            "filter_compare_f64_scan_phase1",
162            "filter_compare_f32_scan_phase1",
163            "filter_compare_u32_col",
164            "filter_compare_i32_col",
165            "filter_compare_i64_col",
166            "filter_compare_u64_col",
167            "filter_compare_f32_col",
168            "filter_compare_f64_col",
169            "filter_compare_u8_col",
170            "fill_u32_iota",
171            "fill_u32_const",
172            "mark_random_vars",
173            "random_var_to_bit_from_list",
174            "check_random_var_count",
175            "compact_u32_by_mask",
176            "compact_i64_by_mask",
177            "compact_f64_by_mask",
178            "compact_bytes_by_mask",
179            "capture_compact_count",
180            "mask_clamp_rows",
181            "mask_and",
182            "mask_or",
183            "mask_not",
184        ],
185    },
186    KernelModuleSpec {
187        cu_name: "set_ops",
188        module_name: "xlog_set_ops",
189        kernels: &["concat_u32", "concat_bytes", "sorted_diff_mark"],
190    },
191    KernelModuleSpec {
192        cu_name: "pack",
193        module_name: "xlog_pack",
194        kernels: &[
195            "pack_keys",
196            "hash_packed_keys",
197            "pack_and_hash_keys",
198            "pack_and_hash_keys_generic",
199            "pack_keys_aligned",
200            "unpack_column",
201            "unpack_column_counted",
202            "gather_packed_rows",
203            "gather_packed_rows_counted",
204            "scatter_packed_rows",
205            "compare_packed_keys",
206            "pack_bools_to_bitmap",
207        ],
208    },
209    KernelModuleSpec {
210        cu_name: "pir",
211        module_name: "xlog_pir",
212        kernels: &[
213            "pir_pack_keys",
214            "pir_hash_keys",
215            "pir_mark_unique",
216            "pir_find_existing",
217            "pir_mark_new_groups",
218            "pir_build_group_ids",
219            "pir_fill_child_parents",
220            "pir_mark_unique_pairs",
221            "pir_compact_pairs",
222            "pir_count_children",
223            "pir_write_child_offsets",
224            "pir_gather_children",
225            "pir_build_graph_child_counts",
226            "pir_sum_counts",
227            "pir_emit_nodes_and_ids",
228            "pir_update_counts",
229        ],
230    },
231    KernelModuleSpec {
232        cu_name: "cnf",
233        module_name: "xlog_cnf",
234        kernels: &[
235            "cnf_reachability_init",
236            "cnf_reachability_bfs",
237            "cnf_mark_leaf_choice",
238            "cnf_assign_leaf_var",
239            "cnf_assign_choice_var",
240            "cnf_mark_node_vars",
241            "cnf_count_clauses",
242            "cnf_capture_last_counts",
243            "cnf_compute_leaf_choice_totals",
244            "cnf_compute_totals",
245            "cnf_assign_node_var",
246            "cnf_emit_clauses",
247            "cnf_set_clause_end",
248        ],
249    },
250    KernelModuleSpec {
251        cu_name: "cache",
252        module_name: "xlog_cache",
253        kernels: &[
254            "cache_cnf_hash",
255            "cache_lookup_or_insert",
256            "cache_evict_lru",
257            "cache_store_u8",
258            "cache_store_u32",
259            "cache_store_i32",
260            "cache_store_f64",
261            "cache_store_meta",
262        ],
263    },
264    KernelModuleSpec {
265        cu_name: "weights",
266        module_name: "xlog_weights",
267        kernels: &[
268            "weights_fill_leaf",
269            "weights_fill_choice",
270            "weights_count_lift_exact",
271            "weights_set_evidence_from_nodes",
272            "weights_apply_evidence",
273            "weights_map_nodes_to_vars",
274            "weights_force_var_false",
275            "weights_restore_var_false",
276            "weights_force_var_true",
277            "weights_restore_var_true",
278            "weights_copy_slot_to_batch",
279            "weights_apply_query_vars",
280            "weights_restore_query_vars",
281            "weights_apply_query_vars_false_batched",
282            "weights_restore_query_vars_false_batched",
283            "weights_apply_query_vars_true_batched",
284            "weights_restore_query_vars_true_batched",
285        ],
286    },
287    KernelModuleSpec {
288        cu_name: "circuit",
289        module_name: "xlog_circuit",
290        kernels: &[
291            "xgcf_forward_level",
292            "xgcf_backward_level_propagate",
293            "xgcf_backward_level_decision_grad",
294            "xgcf_backward_level_lit_grad",
295            "xgcf_free_var_apply_grad",
296            "xgcf_free_var_reduce_stage",
297            "xgcf_add_scalar",
298            "xgcf_forward_level_cached",
299            "xgcf_eval_all_levels_cached",
300            "xgcf_eval_all_levels_cached_batched",
301            "xgcf_backward_level_propagate_cached",
302            "xgcf_backward_level_decision_grad_cached",
303            "xgcf_backward_level_lit_grad_cached",
304            "xgcf_backward_all_levels_cached",
305            "xgcf_backward_all_levels_cached_batched",
306            "xgcf_free_var_apply_grad_cached",
307            "xgcf_free_var_reduce_stage_cached",
308            "xgcf_add_scalar_cached",
309            "xgcf_set_root_adj_cached_batched",
310            "xgcf_copy_root_cached",
311            "xgcf_copy_root_cached_meta",
312            "xgcf_copy_root_cached_meta_batched",
313        ],
314    },
315    KernelModuleSpec {
316        cu_name: "mc_sample",
317        module_name: "xlog_mc_sample",
318        kernels: &["mc_sample_bernoulli"],
319    },
320    KernelModuleSpec {
321        cu_name: "mc_eval",
322        module_name: "xlog_mc_eval",
323        kernels: &[
324            "mc_eval_mask_var",
325            "mc_eval_mask_ad_choice",
326            "mc_eval_query_evidence_truth",
327            "mc_accumulate_counts",
328        ],
329    },
330    KernelModuleSpec {
331        cu_name: "arith",
332        module_name: "xlog_arith",
333        kernels: &[
334            "arith_binary_i64",
335            "arith_binary_i32",
336            "arith_binary_u64",
337            "arith_binary_u32",
338            "arith_binary_f64",
339            "arith_binary_f32",
340            "arith_abs_i64",
341            "arith_abs_i32",
342            "arith_abs_f64",
343            "arith_abs_f32",
344            "arith_pow_f64",
345            "arith_cast",
346            "arith_fill_const_u32",
347            "arith_fill_const_u64",
348            "arith_fill_const_i64",
349            "arith_fill_const_i32",
350            "arith_fill_const_f64",
351            "arith_fill_const_f32",
352            "arith_fill_const_u8",
353            "arith_select_i64",
354            "arith_select_i32",
355            "arith_select_u64",
356            "arith_select_u32",
357            "arith_select_f64",
358            "arith_select_f32",
359        ],
360    },
361    KernelModuleSpec {
362        cu_name: "sat",
363        module_name: "xlog_sat",
364        kernels: &[
365            "sat_cdcl_solve",
366            "sat_check_model",
367            "sat_proof_mark_needed",
368            "sat_proof_check",
369            "sat_assert_status",
370            "sat_assert_ok",
371            "sat_xgcf_cnf_counts",
372            "sat_xgcf_cnf_emit",
373            "sat_xgcf_cnf_capture_last_counts",
374            "sat_xgcf_cnf_compute_totals",
375            "sat_cnf_write_terminator",
376            "sat_cnf_copy_into",
377            "sat_shift_offsets",
378            "sat_xgcf_write_root_unit_clause",
379            "sat_not_phi_counts",
380            "sat_emit_not_phi",
381        ],
382    },
383    KernelModuleSpec {
384        cu_name: "d4",
385        module_name: "xlog_d4",
386        kernels: &[
387            "d4_validate_cnf",
388            "d4_levelize_counts",
389            "d4_levelize_emit",
390            "d4_frontier_prepare",
391            "d4_frontier_expand",
392            "d4_frontier_prepare_dense",
393            "d4_frontier_expand_dense",
394            "d4_compile_count",
395            "d4_compile_emit",
396            "d4_capture_emit_meta",
397            "d4_support_level",
398            "d4_support_set_root_bits",
399            "d4_smooth_count",
400            "d4_smooth_wrapper_counts",
401            "d4_smooth_wrapper_edge_counts_or",
402            "d4_smooth_wrapper_edge_counts_dec",
403            "d4_smooth_init_nodes",
404            "d4_smooth_emit_level",
405            "d4_smooth_check_edge_cap",
406            "d4_mark_vars_in_clauses",
407            "d4_mark_vars_in_circuit",
408            "d4_build_free_var_mask",
409            "d4_assert_u32_eq",
410            "d4_assert_bitset_var",
411            "d4_assert_dense_var",
412            "d4_assert_leaf_root_and_degree",
413        ],
414    },
415    KernelModuleSpec {
416        cu_name: "neural",
417        module_name: "xlog_neural",
418        kernels: &[
419            "neural_fill_ad_chain_f32",
420            "neural_scatter_ad_chain_grads_f32",
421        ],
422    },
423    KernelModuleSpec {
424        cu_name: "ilp",
425        module_name: "xlog_ilp",
426        kernels: &[
427            "extract_nonzero_indices",
428            "ilp_mark_selected_ids_u32",
429            "ilp_mark_selected_ids_i32",
430            "ilp_mark_selected_ids_i64",
431            "ilp_mark_selected_ids_u64",
432            "ilp_validate_selected_ids_u32",
433            "ilp_validate_selected_ids_i32",
434            "ilp_validate_selected_ids_i64",
435            "ilp_validate_selected_ids_u64",
436            "ilp_broadcast_candidate_flag",
437            "ilp_coo_fill_from_mask",
438            "ilp_csr_histogram",
439            "ilp_reduce_sum_f32",
440            "ilp_reduce_sum_f64",
441        ],
442    },
443    KernelModuleSpec {
444        cu_name: "ilp_credit",
445        module_name: "xlog_ilp_credit",
446        kernels: &[
447            "ilp_coo_fill",
448            "ilp_credit_forward_f32",
449            "ilp_credit_forward_f64",
450            "ilp_credit_backward_f32",
451            "ilp_credit_backward_f64",
452        ],
453    },
454    KernelModuleSpec {
455        cu_name: "ilp_exact",
456        module_name: "xlog_ilp_exact",
457        kernels: &[
458            "ilp_exact_score",
459            "ilp_exact_score_u32",
460            "ilp_exact_score_chain_smem",
461            "ilp_exact_score_chain_smem_u32",
462            "ilp_exact_select_topk",
463        ],
464    },
465    KernelModuleSpec {
466        cu_name: "epistemic",
467        module_name: "xlog_epistemic",
468        kernels: &[
469            "epistemic_generate_candidate_assumptions_u8",
470            "epistemic_propagate_candidates_u8",
471            "epistemic_validate_candidate_bits_u8",
472            "epistemic_populate_model_membership_u8",
473            "epistemic_populate_model_membership_from_tuple_source_u8",
474            "epistemic_populate_model_membership_from_tuple_source_arity1_u8",
475            "epistemic_populate_model_membership_from_tuple_source_arity2_u8",
476            "epistemic_populate_model_membership_from_tuple_source_arity3_u8",
477            "epistemic_populate_model_membership_from_tuple_source_arity_n_u8",
478            "epistemic_validate_world_views_u8",
479            "epistemic_validate_constraints_u8",
480            "epistemic_materialize_accepted_candidates_u8",
481            "epistemic_materialize_final_result_flags_u8",
482            "epistemic_build_final_tuple_row_map_u8",
483            "epistemic_close_final_tuple_rejections_u8",
484            "epistemic_materialize_final_tuple_column_u8",
485        ],
486    },
487    KernelModuleSpec {
488        cu_name: "wcoj",
489        module_name: "xlog_wcoj",
490        kernels: &[
491            "wcoj_build_metadata_mark_boundaries_u32",
492            "wcoj_build_metadata_mark_boundaries_u64",
493            "wcoj_build_metadata_scatter_u32",
494            "wcoj_build_metadata_scatter_u64",
495            "wcoj_triangle_build_hg_work_plan_u32",
496            "wcoj_triangle_count_hg_u32",
497            "wcoj_triangle_groupby_root_count_hg_u32",
498            "wcoj_triangle_groupby_root_sum_hg_u32",
499            "wcoj_triangle_groupby_root_min_hg_u32",
500            "wcoj_triangle_groupby_root_max_hg_u32",
501            "wcoj_triangle_materialize_hg_u32",
502            "wcoj_triangle_build_hg_work_plan_u64",
503            "wcoj_triangle_count_hg_u64",
504            "wcoj_triangle_groupby_root_count_hg_u64",
505            "wcoj_triangle_groupby_root_sum_hg_u64",
506            "wcoj_triangle_groupby_root_min_hg_u64",
507            "wcoj_triangle_groupby_root_max_hg_u64",
508            "wcoj_groupby_root_segment_sum_counts_u32",
509            "wcoj_groupby_root_segment_sum_values_u64",
510            "wcoj_groupby_root_segment_min_values_u64",
511            "wcoj_groupby_root_segment_max_values_u64",
512            "wcoj_triangle_materialize_hg_u64",
513            "wcoj_triangle_count_hg_cached_u32",
514            "wcoj_triangle_materialize_hg_cached_u32",
515            "wcoj_scan_hg_block_counts_u32",
516            "wcoj_compute_total",
517            "wcoj_layout_check_sorted_unique_u32",
518            "wcoj_layout_check_sorted_unique_u64",
519            "wcoj_4cycle_build_e2_work_prefix_u32",
520            "wcoj_4cycle_build_hg_work_plan_u32",
521            "wcoj_4cycle_count_hg_u32",
522            "wcoj_4cycle_groupby_root_count_hg_u32",
523            "wcoj_4cycle_groupby_root_sum_hg_u32",
524            "wcoj_4cycle_groupby_root_min_hg_u32",
525            "wcoj_4cycle_groupby_root_max_hg_u32",
526            "wcoj_4cycle_materialize_hg_u32",
527            "wcoj_4cycle_build_e2_work_prefix_u64",
528            "wcoj_4cycle_build_hg_work_plan_u64",
529            "wcoj_4cycle_count_hg_u64",
530            "wcoj_4cycle_groupby_root_count_hg_u64",
531            "wcoj_4cycle_materialize_hg_u64",
532            // General-arity WCOJ clique kernel family (k=5..8 from
533            // a single C++ template; ABI wrappers below are
534            // template-call-only for source-auditability).
535            "wcoj_clique5_count_hg_u32",
536            "wcoj_clique5_materialize_hg_u32",
537            "wcoj_clique5_count_hg_u64",
538            "wcoj_clique5_materialize_hg_u64",
539            "wcoj_clique6_count_hg_u32",
540            "wcoj_clique6_materialize_hg_u32",
541            "wcoj_clique6_count_hg_u64",
542            "wcoj_clique6_materialize_hg_u64",
543            "wcoj_clique7_count_hg_u32",
544            "wcoj_clique7_materialize_hg_u32",
545            "wcoj_clique7_count_hg_u64",
546            "wcoj_clique7_materialize_hg_u64",
547            "wcoj_clique8_count_hg_u32",
548            "wcoj_clique8_materialize_hg_u32",
549            "wcoj_clique8_count_hg_u64",
550            "wcoj_clique8_materialize_hg_u64",
551            // Aggregate-fused K-clique group-by-root count kernels (u32
552            // width-class, K=5/6).
553            "wcoj_clique5_groupby_root_count_hg_u32",
554            "wcoj_clique6_groupby_root_count_hg_u32",
555            // GPU Free Join level-synchronous frontier engine primitives.
556            "fj_expand_work_prefix_u32",
557            "fj_expand_count_u32",
558            "fj_expand_emit_u32",
559            "fj_probe_refine_u32",
560            // u64 width-class twins (work prefix is
561            // width-agnostic and shared).
562            "fj_expand_count_u64",
563            "fj_expand_emit_u64",
564            "fj_probe_refine_u64",
565            // Factorized count epilogue (width-agnostic).
566            "fj_count_multiplicity",
567            // D3 S3 spike — factorized recursive delta novel-set
568            // pipeline (dense-domain bitmap union–diff).
569            "fj_delta_range_u32",
570            "fj_delta_mark_u32",
571            "fj_delta_subtract_u32",
572            "fj_delta_popcount",
573            "fj_delta_emit_u32",
574            "fj_delta_max_u32",
575            // D3 sparse-domain spike — hash-set novel pipeline.
576            "fj_delta_sparse_estimate",
577            "fj_delta_sparse_load_r",
578            "fj_delta_sparse_insert_candidates",
579            "fj_delta_sparse_mark",
580            "fj_delta_sparse_emit",
581        ],
582    },
583    KernelModuleSpec {
584        cu_name: "mc_resident",
585        module_name: "xlog_mc_resident",
586        kernels: &["mc_resident_engine"],
587    },
588];
589
#[cfg(test)]
mod tests {
    use super::*;

    /// `KERNEL_MODULES` and `KERNEL_CU_NAMES` must list the same modules in
    /// the same order.
    #[test]
    fn kernel_modules_matches_cu_names() {
        assert_eq!(
            KERNEL_MODULES.len(),
            KERNEL_CU_NAMES.len(),
            "KERNEL_MODULES length ({}) != KERNEL_CU_NAMES length ({})",
            KERNEL_MODULES.len(),
            KERNEL_CU_NAMES.len(),
        );
        // The lengths are known to be equal here, so `zip` visits every entry
        // of both lists without dropping a tail.
        for (i, (spec, expected)) in KERNEL_MODULES.iter().zip(KERNEL_CU_NAMES.iter()).enumerate() {
            assert_eq!(
                spec.cu_name, *expected,
                "KERNEL_MODULES[{}].cu_name = {:?}, expected {:?}",
                i, spec.cu_name, expected,
            );
        }
    }

    /// Pins the module count so adding or removing a module is a deliberate,
    /// reviewed change.
    #[test]
    fn kernel_modules_count_is_25() {
        assert_eq!(KERNEL_MODULES.len(), 25);
    }

    /// Every module must name at least one kernel and carry a non-empty
    /// runtime module name.
    #[test]
    fn all_kernel_entries_are_non_empty() {
        for spec in KERNEL_MODULES {
            assert!(
                !spec.kernels.is_empty(),
                "module {:?} has no kernel entries",
                spec.cu_name,
            );
            assert!(
                !spec.module_name.is_empty(),
                "module {:?} has empty module_name",
                spec.cu_name,
            );
        }
    }
}