2 changes: 1 addition & 1 deletion mistralrs-core/build.rs
@@ -7,7 +7,7 @@ fn main() {
     use std::{path::PathBuf, vec};
     println!("cargo:rerun-if-changed=build.rs");
     let build_dir = PathBuf::from(std::env::var("OUT_DIR").unwrap());
-    let lib_files = vec!["src/cuda/sort.cu"];
+    let lib_files = vec!["src/cuda/sort.cu", "src/cuda/indexed_matmul.cu", "src/cuda/fused_moe_optimized.cu"];
     for lib_file in lib_files.iter() {
         println!("cargo:rerun-if-changed={lib_file}");
     }
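The compilation step that consumes `lib_files` falls outside this hunk. For review context, a minimal sketch of what such a build step typically looks like, assuming the `cc` crate driving `nvcc`; the actual build.rs may use a different builder (e.g. `bindgen_cuda`) and different flags:

```rust
// Hypothetical sketch only: compile each CUDA source into a static library
// that the crate links against. Builder choice and flags are assumptions.
fn compile_cuda_kernels(lib_files: &[&str], build_dir: &std::path::Path) {
    let mut builder = cc::Build::new();
    builder
        .cuda(true) // invoke nvcc instead of the host C compiler
        .flag("-O3")
        .out_dir(build_dir);
    for lib_file in lib_files {
        builder.file(*lib_file);
    }
    // Produces a static archive in OUT_DIR and emits the cargo link directives.
    builder.compile("mistralrs_cuda_kernels");
}
```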
128 changes: 128 additions & 0 deletions mistralrs-core/src/cuda/ffi.rs
@@ -114,4 +114,132 @@ extern "C" {
         inplace: bool,
         stream: i64,
     );

+    // Indexed matrix multiplication for MoE
+    pub(crate) fn indexed_matmul_f32(
+        input: *const c_void,
+        expert_weights: *const c_void,
+        routing_weights: *const c_void,
+        expert_indices: *const c_void,
+        output: *mut c_void,
+        num_tokens: i32,
+        hidden_dim: i32,
+        out_dim: i32,
+        num_selected_experts: i32,
+        num_experts: i32,
+        stream: i64,
+    );
+    pub(crate) fn indexed_matmul_f16(
+        input: *const c_void,
+        expert_weights: *const c_void,
+        routing_weights: *const c_void,
+        expert_indices: *const c_void,
+        output: *mut c_void,
+        num_tokens: i32,
+        hidden_dim: i32,
+        out_dim: i32,
+        num_selected_experts: i32,
+        num_experts: i32,
+        stream: i64,
+    );
+    pub(crate) fn indexed_matmul_bf16(
+        input: *const c_void,
+        expert_weights: *const c_void,
+        routing_weights: *const c_void,
+        expert_indices: *const c_void,
+        output: *mut c_void,
+        num_tokens: i32,
+        hidden_dim: i32,
+        out_dim: i32,
+        num_selected_experts: i32,
+        num_experts: i32,
+        stream: i64,
+    );

// Fused MoE forward pass
pub(crate) fn fused_moe_forward_f32(
input: *const c_void,
gate_weights: *const c_void,
up_weights: *const c_void,
down_weights: *const c_void,
routing_weights: *const c_void,
expert_indices: *const c_void,
output: *mut c_void,
num_tokens: i32,
hidden_dim: i32,
intermediate_dim: i32,
num_selected_experts: i32,
num_experts: i32,
activation_type: i32,
stream: i64,
);
pub(crate) fn fused_moe_forward_f16(
input: *const c_void,
gate_weights: *const c_void,
up_weights: *const c_void,
down_weights: *const c_void,
routing_weights: *const c_void,
expert_indices: *const c_void,
output: *mut c_void,
num_tokens: i32,
hidden_dim: i32,
intermediate_dim: i32,
num_selected_experts: i32,
num_experts: i32,
activation_type: i32,
stream: i64,
);
pub(crate) fn fused_moe_forward_bf16(
input: *const c_void,
gate_weights: *const c_void,
up_weights: *const c_void,
down_weights: *const c_void,
routing_weights: *const c_void,
expert_indices: *const c_void,
output: *mut c_void,
num_tokens: i32,
hidden_dim: i32,
intermediate_dim: i32,
num_selected_experts: i32,
num_experts: i32,
activation_type: i32,
stream: i64,
);
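The fused entry points fold the whole expert MLP into a single launch: gate and up projections, activation, elementwise product, down projection, and routing-weighted mixing. Below is a CPU sketch of the presumed per-token computation; the SwiGLU-style expert shape and the `activation_type` encoding (0 = SiLU, 1 = GELU) are guesses, since the authoritative definitions live in the `.cu` sources:

```rust
/// Illustrative guess at the activation encoding; the real mapping is
/// defined by the CUDA kernels.
fn act(kind: i32, v: f32) -> f32 {
    match kind {
        0 => v / (1.0 + (-v).exp()), // SiLU
        1 => 0.5 * v * (1.0 + (0.797_884_6 * (v + 0.044_715 * v * v * v)).tanh()), // GELU (tanh approx)
        _ => v, // identity fallback
    }
}

/// Assumed semantics of fused_moe_forward_*: per token,
///   y = sum_k routing[k] * W_down_e * (act(W_gate_e * x) .* (W_up_e * x))
/// where e = expert_indices[token][k]. Layouts below are assumptions.
fn fused_moe_forward_ref(
    input: &[f32],           // [num_tokens * hidden_dim]
    gate: &[f32],            // [num_experts * intermediate_dim * hidden_dim]
    up: &[f32],              // [num_experts * intermediate_dim * hidden_dim]
    down: &[f32],            // [num_experts * hidden_dim * intermediate_dim]
    routing_weights: &[f32], // [num_tokens * k]
    expert_indices: &[u32],  // [num_tokens * k]
    num_tokens: usize,
    hidden_dim: usize,
    intermediate_dim: usize,
    k: usize, // num_selected_experts
    activation_type: i32,
) -> Vec<f32> {
    let (hd, id) = (hidden_dim, intermediate_dim);
    let mut output = vec![0.0f32; num_tokens * hd];
    for t in 0..num_tokens {
        let x = &input[t * hd..][..hd];
        for s in 0..k {
            let e = expert_indices[t * k + s] as usize;
            let w = routing_weights[t * k + s];
            // h[i] = act(gate_i . x) * (up_i . x) for this expert
            let (ge, ue) = (&gate[e * id * hd..], &up[e * id * hd..]);
            let mut h = vec![0.0f32; id];
            for i in 0..id {
                let (mut g, mut u) = (0.0f32, 0.0f32);
                for j in 0..hd {
                    g += ge[i * hd + j] * x[j];
                    u += ue[i * hd + j] * x[j];
                }
                h[i] = act(activation_type, g) * u;
            }
            // output[t] += w * (down_e . h)
            let de = &down[e * hd * id..];
            for o in 0..hd {
                let mut acc = 0.0f32;
                for i in 0..id {
                    acc += de[o * id + i] * h[i];
                }
                output[t * hd + o] += w * acc;
            }
        }
    }
    output
}
```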

+    // Optimized fused MoE forward pass
+    pub(crate) fn fused_moe_forward_optimized_f32(
+        input: *const c_void,
+        gate_weights: *const c_void,
+        up_weights: *const c_void,
+        down_weights: *const c_void,
+        routing_weights: *const c_void,
+        expert_indices: *const c_void,
+        output: *mut c_void,
+        num_tokens: i32,
+        hidden_dim: i32,
+        intermediate_dim: i32,
+        num_selected_experts: i32,
+        num_experts: i32,
+        activation_type: i32,
+        stream: i64,
+    );

+    // Chunked fused MoE forward pass for large batches
+    pub(crate) fn fused_moe_forward_chunked_f32(
+        input: *const c_void,
+        gate_weights: *const c_void,
+        up_weights: *const c_void,
+        down_weights: *const c_void,
+        routing_weights: *const c_void,
+        expert_indices: *const c_void,
+        output: *mut c_void,
+        num_tokens: i32,
+        hidden_dim: i32,
+        intermediate_dim: i32,
+        num_selected_experts: i32,
+        num_experts: i32,
+        activation_type: i32,
+        chunk_size: i32,
+        stream: i64,
+    );
 }
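The `_optimized` and `_chunked` variants share the fused signature, with the chunked one adding `chunk_size` to bound how many tokens each pass processes, capping intermediate-activation memory for large batches. One plausible way a caller might pick between them; the wrapper and the threshold below are illustrative, not taken from this PR:

```rust
use std::ffi::c_void;

/// Illustrative dispatcher (not the crate's actual logic), assuming the
/// extern declarations above are reachable via `crate::cuda::ffi`.
#[allow(clippy::too_many_arguments)]
pub unsafe fn fused_moe_forward_auto_f32(
    input: *const c_void,
    gate_weights: *const c_void,
    up_weights: *const c_void,
    down_weights: *const c_void,
    routing_weights: *const c_void,
    expert_indices: *const c_void,
    output: *mut c_void,
    num_tokens: i32,
    hidden_dim: i32,
    intermediate_dim: i32,
    num_selected_experts: i32,
    num_experts: i32,
    activation_type: i32,
    stream: i64,
) {
    const CHUNK_SIZE: i32 = 1024; // illustrative threshold, not from the PR
    unsafe {
        if num_tokens <= CHUNK_SIZE {
            // Small batches: single optimized launch.
            crate::cuda::ffi::fused_moe_forward_optimized_f32(
                input, gate_weights, up_weights, down_weights, routing_weights,
                expert_indices, output, num_tokens, hidden_dim, intermediate_dim,
                num_selected_experts, num_experts, activation_type, stream,
            );
        } else {
            // Large batches: let the chunked kernel bound per-chunk work.
            crate::cuda::ffi::fused_moe_forward_chunked_f32(
                input, gate_weights, up_weights, down_weights, routing_weights,
                expert_indices, output, num_tokens, hidden_dim, intermediate_dim,
                num_selected_experts, num_experts, activation_type, CHUNK_SIZE,
                stream,
            );
        }
    }
}
```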