EricLBuehler · EricLBuehler · Dec 14, 2024 · Dec 13, 2024 · Dec 13, 2024 · Dec 14, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -26,8 +26,8 @@ rust-version = "1.82"
 
 [workspace.dependencies]
 anyhow = "1.0.80"
-candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "6800496" }
-candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "6800496" }
+candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "521e33c" }
+candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "521e33c" }
 serde = "1.0.197"
 serde_json = "1.0.114"
 indexmap = { version = "2.2.5", features = ["serde"] }

diff --git a/examples/server/llama_vision.py b/examples/server/llama_vision.py
@@ -54,7 +54,7 @@ def log_response(response: httpx.Response):
             ],
         },
     ],
-    max_tokens=256,
+    # max_tokens=256,
     frequency_penalty=1.0,
     top_p=0.1,
     temperature=0,

diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
 candle-nn.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "6800496", optional = true }
+candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "c0c2b23", optional = true }
 dirs = "5.0.1"
 hf-hub = { version = "0.3.3", package = "candle-hf-hub" }
 thiserror = "1.0.57"

diff --git a/mistralrs-core/src/attention.rs b/mistralrs-core/src/attention.rs
@@ -162,23 +162,46 @@ fn naive_sdpa(
     if mask.is_some_and(|mask| mask.rank() == 2 || (mask.rank() == 3 && mask.dims()[0] == 1))
         && supports_attn_softmax
     {
-        let mut att = MatMul.matmul(q, &k.t()?)?;
+        let n_attn_heads = q.dim(1)?;
+        let bs = q.dim(0)?;
+        let attention_bias = match mask {
+            Some(mask) if mask.rank() == 3 && mask.dims()[0] == 1 => {
+                mask.unsqueeze(0)?.repeat((bs, n_attn_heads, 1, 1))?
+            }
+            Some(mask) if mask.rank() == 3 => mask.unsqueeze(0)?,
+            Some(mask) if mask.rank() == 2 => {
+                mask.unsqueeze(0)?
+                    .unsqueeze(0)?
+                    .repeat((bs, n_attn_heads, 1, 1))?
+            }
+            Some(mask) if mask.rank() == 4 => mask.clone(),
+            _ => candle_core::bail!("unsupported mask {mask:?}"),
+        };
+        let mut att = attention_bias;
+
+        q.matmul_with_alpha_beta(
+            &k.t()?,
+            &mut att,
+            Some((sdpa_params.softmax_scale / sdpa_params.softcap.unwrap_or(1.0)) as f64),
+        )?;
+
+        if let Some(softcap) = sdpa_params.softcap {
+            att = (att.tanh()? * softcap as f64)?;
+        }
+
+        candle_nn::ops::inplace_softmax_last_dim(&mut att)?;
+
+        MatMul.matmul(&att, v)
+    } else if let Some(mask) = mask {
+        let mut att = MatMul.matmul_affine_div(q, &k.t()?, (head_dim as f64).sqrt())?;
         if let Some(softcap) = sdpa_params.softcap {
             att = (att / softcap as f64)?;
             att = att.tanh()?;
             att = (att * softcap as f64)?;
         }
 
-        let mask = match mask {
-            Some(mask) if mask.rank() == 3 && mask.dim(0)? == 1 => mask.squeeze(0)?,
-            Some(mask) if mask.rank() == 2 => mask.clone(),
-            _ => unreachable!(),
-        };
-        candle_nn::ops::inplace_attn_softmax_last_dim(
-            &mut att,
-            &mask,
-            1. / (head_dim as f32).sqrt(),
-        )?;
+        att = att.broadcast_add(mask)?;
+        candle_nn::ops::inplace_softmax_last_dim(&mut att)?;
         MatMul.matmul(&att, v)
     } else {
         let mut att = MatMul.matmul_affine_div(q, &k.t()?, (head_dim as f64).sqrt())?;
@@ -188,10 +211,6 @@ fn naive_sdpa(
             att = (att * softcap as f64)?;
         }
 
-        att = match mask {
-            Some(m) => att.broadcast_add(m)?,
-            None => att,
-        };
         candle_nn::ops::inplace_softmax_last_dim(&mut att)?;
         MatMul.matmul(&att, v)
     }
@@ -291,11 +310,11 @@ impl Sdpa {
                     if let Some(softcap) = sdpa_params.softcap {
                         attention_scores = (attention_scores.tanh()? * softcap as f64)?;
                     }
-                    let attention_probs = candle_nn::ops::softmax_last_dim(&attention_scores)?;
+                    candle_nn::ops::inplace_softmax_last_dim(&mut attention_scores)?;
 
                     let context_layer = cublaslt.batch_matmul(
                         &v.t()?.contiguous()?,
-                        &attention_probs,
+                        &attention_scores,
                         // We save one allocation
                         Some(&q),
                         None,

diff --git a/mistralrs-core/src/pipeline/loaders/vision_loaders.rs b/mistralrs-core/src/pipeline/loaders/vision_loaders.rs
@@ -509,21 +509,72 @@ impl VisionModelLoader for VLlamaLoader {
 }
 
 impl IsqModelLoader for VLlamaLoader {
-    fn isq_layer_regexes(&self, _config: &str) -> Result<Vec<Regex>> {
-        Ok(vec![
-            // Attention
-            Regex::new(r"layers\.(\d+)\.self_attn\.q_proj\.(weight|bias)$")?,
-            Regex::new(r"layers\.(\d+)\.self_attn\.k_proj\.(weight|bias)$")?,
-            Regex::new(r"layers\.(\d+)\.self_attn\.v_proj\.(weight|bias)$")?,
-            Regex::new(r"layers\.(\d+)\.self_attn\.o_proj\.(weight|bias)$")?,
-            // MLP text
-            Regex::new(r"layers\.(\d+)\.mlp\.gate_proj\.(weight|bias)$")?,
-            Regex::new(r"layers\.(\d+)\.mlp\.up_proj\.(weight|bias)$")?,
-            Regex::new(r"layers\.(\d+)\.mlp\.down_proj\.(weight|bias)$")?,
+    fn isq_layer_regexes(&self, config: &str) -> Result<Vec<Regex>> {
+        let config: MLlamaConfig = serde_json::from_str(config)?;
+        let cross_attn_layers = &config.text_config.cross_attention_layers;
+        let transformer_layers =
+            (0..config.text_config.num_hidden_layers).filter(|i| !cross_attn_layers.contains(i));
+        let mut text_regexes = Vec::new();
+        for layer in transformer_layers {
+            text_regexes.extend(vec![
+                // Attention text
+                Regex::new(&format!(
+                    r"language_model.model.layers\.{layer}\.self_attn\.q_proj\.(weight|bias)$"
+                ))?,
+                Regex::new(&format!(
+                    r"language_model.model.layers\.{layer}\.self_attn\.k_proj\.(weight|bias)$"
+                ))?,
+                Regex::new(&format!(
+                    r"language_model.model.layers\.{layer}\.self_attn\.v_proj\.(weight|bias)$"
+                ))?,
+                Regex::new(&format!(
+                    r"language_model.model.layers\.{layer}\.self_attn\.o_proj\.(weight|bias)$"
+                ))?,
+                // MLP text
+                Regex::new(&format!(
+                    r"language_model.model.layers\.{layer}\.mlp\.gate_proj\.(weight|bias)$"
+                ))?,
+                Regex::new(&format!(
+                    r"language_model.model.layers\.{layer}\.mlp\.up_proj\.(weight|bias)$"
+                ))?,
+                Regex::new(&format!(
+                    r"language_model.model.layers\.{layer}\.mlp\.down_proj\.(weight|bias)$"
+                ))?,
+            ]);
+        }
+        let vision_regexes = vec![
+            // Vision attention (transformer)
+            Regex::new(
+                r"vision_model.transformer.layers\.(\d+)\.self_attn\.q_proj\.(weight|bias)$",
+            )?,
+            Regex::new(
+                r"vision_model.transformer.layers\.(\d+)\.self_attn\.k_proj\.(weight|bias)$",
+            )?,
+            Regex::new(
+                r"vision_model.transformer.layers\.(\d+)\.self_attn\.v_proj\.(weight|bias)$",
+            )?,
+            Regex::new(
+                r"vision_model.transformer.layers\.(\d+)\.self_attn\.o_proj\.(weight|bias)$",
+            )?,
+            // Vision attention (global transformer)
+            Regex::new(
+                r"vision_model.global_transformer.layers\.(\d+)\.self_attn\.q_proj\.(weight|bias)$",
+            )?,
+            Regex::new(
+                r"vision_model.global_transformer.layers\.(\d+)\.self_attn\.k_proj\.(weight|bias)$",
+            )?,
+            Regex::new(
+                r"vision_model.global_transformer.layers\.(\d+)\.self_attn\.v_proj\.(weight|bias)$",
+            )?,
+            Regex::new(
+                r"vision_model.global_transformer.layers\.(\d+)\.self_attn\.o_proj\.(weight|bias)$",
+            )?,
             // MLP vision
             Regex::new(r"layers\.(\d+)\.mlp\.fc1\.(weight|bias)$")?,
             Regex::new(r"layers\.(\d+)\.mlp\.fc2\.(weight|bias)$")?,
-        ])
+        ];
+
+        Ok([text_regexes, vision_regexes].concat())
     }
 }
 

diff --git a/mistralrs-core/src/vision_models/mllama/text.rs b/mistralrs-core/src/vision_models/mllama/text.rs
@@ -583,7 +583,7 @@ impl MLlamaTextModel {
                         vb.pp(format!("layers.{i}")),
                         &*mapper,
                         i,
-                        normal_loading_metadata.loading_isq,
+                        false,
                     )?,
                 ))
             } else {
@@ -698,14 +698,14 @@ impl IsqModel for MLlamaTextModel {
         let mut tensors = Vec::new();
         for (i, layer) in self.layers.iter_mut().enumerate() {
             match layer {
-                MLlamaDecoderLayer::CrossAttn(cross) => {
-                    tensors.push((&mut cross.attn.q_proj, Some(i)));
-                    tensors.push((&mut cross.attn.k_proj, Some(i)));
-                    tensors.push((&mut cross.attn.v_proj, Some(i)));
-                    tensors.push((&mut cross.attn.o_proj, Some(i)));
-                    tensors.push((&mut cross.mlp.gate_proj, Some(i)));
-                    tensors.push((&mut cross.mlp.up_proj, Some(i)));
-                    tensors.push((&mut cross.mlp.down_proj, Some(i)));
+                MLlamaDecoderLayer::CrossAttn(_cross) => {
+                    // tensors.push((&mut cross.attn.q_proj, Some(i)));
+                    // tensors.push((&mut cross.attn.k_proj, Some(i)));
+                    // tensors.push((&mut cross.attn.v_proj, Some(i)));
+                    // tensors.push((&mut cross.attn.o_proj, Some(i)));
+                    // tensors.push((&mut cross.mlp.gate_proj, Some(i)));
+                    // tensors.push((&mut cross.mlp.up_proj, Some(i)));
+                    // tensors.push((&mut cross.mlp.down_proj, Some(i)));
                 }
                 MLlamaDecoderLayer::SelfAttn(self_attn) => {
                     tensors.push((&mut self_attn.attn.q_proj, Some(i)));