Add example for Phi 3.5 MoE #733

Merged 3 commits on Sep 1, 2024
43 changes: 41 additions & 2 deletions docs/PHI3.5MOE.md
@@ -5,7 +5,7 @@ The Phi 3.5 MoE model is a 16x3.8B parameter decoder-only text-to-text mixture of experts model
- Context length of **128k tokens**
- Trained on **4.9T tokens**
- 16 experts (16x3.8B parameters) with **6.6B active parameters**
- - Expect inference performance of a 7B model: automatic expert offloading coming soon!
+ - Expect inference performance of a 7B model

## About the MoE mechanism
1) Compute router gating logits
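To make the routing concrete, here is a minimal, dependency-free Rust sketch of a standard top-k MoE forward pass for a single token: gating logits, softmax, top-k expert selection, then a gate-weighted sum. The toy router, toy experts, and the top-k value are illustrative stand-ins, not the model's actual implementation (the real experts are full feed-forward networks).

```rust
/// Numerically stable softmax over gating logits.
fn softmax(logits: &[f32]) -> Vec<f32> {
    let max = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = logits.iter().map(|&x| (x - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    exps.into_iter().map(|e| e / sum).collect()
}

/// One MoE layer forward pass for a single token:
/// 1) the router produces one gating logit per expert,
/// 2) softmax turns logits into gate probabilities,
/// 3) the top-k experts by probability are selected,
/// 4) their outputs are summed, weighted by renormalized gates.
fn moe_forward(
    hidden: &[f32],
    router: impl Fn(&[f32]) -> Vec<f32>,
    experts: &[Box<dyn Fn(&[f32]) -> Vec<f32>>],
    top_k: usize,
) -> Vec<f32> {
    let gates = softmax(&router(hidden));
    let mut order: Vec<usize> = (0..gates.len()).collect();
    order.sort_by(|&a, &b| gates[b].partial_cmp(&gates[a]).unwrap());
    let selected = &order[..top_k];
    // Renormalize the selected gate weights so they sum to 1.
    let norm: f32 = selected.iter().map(|&i| gates[i]).sum();
    let mut out = vec![0.0; hidden.len()];
    for &i in selected {
        for (o, e) in out.iter_mut().zip(experts[i](hidden)) {
            *o += gates[i] / norm * e;
        }
    }
    out
}

fn main() {
    // Two toy experts stand in for the model's 16 feed-forward experts.
    let experts: Vec<Box<dyn Fn(&[f32]) -> Vec<f32>>> = vec![
        Box::new(|h: &[f32]| h.to_vec()),
        Box::new(|h: &[f32]| h.iter().map(|x| 2.0 * x).collect()),
    ];
    // Toy router: one logit per expert.
    let router = |h: &[f32]| vec![h.iter().sum::<f32>(), -h.iter().sum::<f32>()];
    println!("{:?}", moe_forward(&[0.5, -1.0, 2.0], router, &experts, 2));
}
```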
@@ -70,4 +70,43 @@ res = runner.send_chat_completion_request(
)
print(res.choices[0].message.content)
print(res.usage)
```

## Rust API
You can find this example [here](../mistralrs/examples/phi3_5_moe/main.rs).

```rust
fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select the Phi 3.5 MoE model
    let loader = NormalLoaderBuilder::new(
        NormalSpecificConfig {
            use_flash_attn: false,
            prompt_batchsize: None,
            topology: None,
        },
        None,
        None,
        Some("microsoft/Phi-3.5-MoE-instruct".to_string()),
    )
    .build(NormalLoaderType::Phi3_5MoE)?;
    // Load the model into a Pipeline
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &best_device()?,
        false,
        DeviceMapMetadata::dummy(),
        Some(IsqType::Q4K),
        None, // No PagedAttention.
    )?;
    // Create the MistralRs, which is a runner
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}
```
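Note the `Some(IsqType::Q4K)` argument: it enables in-situ quantization (ISQ), quantizing the weights to the GGML Q4K format at load time, which substantially reduces the memory footprint of the 16x3.8B weights at a modest accuracy cost. Passing `None` instead keeps the unquantized dtype selected by `ModelDType::Auto`.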
4 changes: 4 additions & 0 deletions mistralrs/Cargo.toml
@@ -104,3 +104,7 @@ required-features = []
[[example]]
name = "topology"
required-features = []

[[example]]
name = "phi3_5_moe"
required-features = []
101 changes: 101 additions & 0 deletions mistralrs/examples/phi3_5_moe/main.rs
@@ -0,0 +1,101 @@
use either::Either;
use indexmap::IndexMap;
use std::sync::Arc;
use tokio::sync::mpsc::channel;

use mistralrs::{
    Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, IsqType, MistralRs,
    MistralRsBuilder, ModelDType, NormalLoaderBuilder, NormalLoaderType, NormalRequest,
    NormalSpecificConfig, Request, RequestMessage, Response, Result, SamplingParams,
    SchedulerConfig, TokenSource,
};

/// Gets the best available device: Metal if compiled with the `metal` feature,
/// otherwise CUDA if available, falling back to CPU.
pub(crate) fn best_device() -> Result<Device> {
    #[cfg(not(feature = "metal"))]
    {
        Device::cuda_if_available(0)
    }
    #[cfg(feature = "metal")]
    {
        Device::new_metal(0)
    }
}

fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select the Phi 3.5 MoE model
    let loader = NormalLoaderBuilder::new(
        NormalSpecificConfig {
            use_flash_attn: false,
            prompt_batchsize: None,
            topology: None,
        },
        None,
        None,
        Some("microsoft/Phi-3.5-MoE-instruct".to_string()),
    )
    .build(NormalLoaderType::Phi3_5MoE)?;
    // Load the model into a Pipeline
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &best_device()?,
        false,
        DeviceMapMetadata::dummy(),
        Some(IsqType::Q4K),
        None, // No PagedAttention.
    )?;
    // Create the MistralRs, which is a runner
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}

fn main() -> anyhow::Result<()> {
    let mistralrs = setup()?;

    // Channel on which responses will be delivered.
    let (tx, mut rx) = channel(10_000);
    let request = Request::Normal(NormalRequest {
        // Chat messages are role/content maps, mirroring the OpenAI schema.
        messages: RequestMessage::Chat(vec![IndexMap::from([
            ("role".to_string(), Either::Left("user".to_string())),
            ("content".to_string(), Either::Left("Hello!".to_string())),
        ])]),
        sampling_params: SamplingParams::default(),
        response: tx,
        return_logprobs: false,
        is_streaming: false,
        id: 0,
        constraint: Constraint::None,
        suffix: None,
        adapters: None,
        tools: None,
        tool_choice: None,
        logits_processors: None,
    });
    mistralrs.get_sender()?.blocking_send(request)?;

    let response = rx.blocking_recv().unwrap();
    match response {
        Response::Done(c) => println!(
            "Text: {}, Prompt T/s: {}, Completion T/s: {}",
            c.choices[0].message.content.as_ref().unwrap(),
            c.usage.avg_prompt_tok_per_sec,
            c.usage.avg_compl_tok_per_sec
        ),
        Response::InternalError(e) => panic!("Internal error: {e}"),
        Response::ValidationError(e) => panic!("Validation error: {e}"),
        Response::ModelError(e, c) => panic!(
            "Model error: {e}. Response: Text: {}, Prompt T/s: {}, Completion T/s: {}",
            c.choices[0].message.content.as_ref().unwrap(),
            c.usage.avg_prompt_tok_per_sec,
            c.usage.avg_compl_tok_per_sec
        ),
        _ => unreachable!(),
    }
    Ok(())
}
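The example above blocks until a single `Response::Done` arrives. For incremental output, the same request can be sent with `is_streaming: true` and the channel drained chunk by chunk. The sketch below is hedged: it assumes a `Response::Chunk` variant whose choices expose `delta.content` and an optional `finish_reason`, as in other mistralrs streaming examples; verify the exact field names against the crate version in use.

```rust
use std::io::{stdout, Write};

use mistralrs::Response;
use tokio::sync::mpsc::Receiver;

/// Drain a streaming response channel, printing text as it arrives.
/// Assumes the request was built with `is_streaming: true` (field and
/// variant names are assumptions; check the crate's `Response` enum).
fn drain_stream(mut rx: Receiver<Response>) {
    while let Some(resp) = rx.blocking_recv() {
        match resp {
            // Each chunk carries an incremental piece of the completion.
            Response::Chunk(chunk) => {
                print!("{}", chunk.choices[0].delta.content);
                stdout().flush().unwrap();
                // The final chunk carries a finish reason.
                if chunk.choices[0].finish_reason.is_some() {
                    println!();
                    break;
                }
            }
            Response::InternalError(e) => panic!("Internal error: {e}"),
            Response::ValidationError(e) => panic!("Validation error: {e}"),
            _ => break, // other variants are not expected while streaming
        }
    }
}
```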