gary-boon · Claude Opus 4.5 committed
Commit 9056859 · 1 parent: 4ec134b

Fix QKV matrix extraction for Mistral/Devstral architecture


- Use explicit head_dim from model config (Mistral uses 128, not hidden_size/num_heads=160)
- Handle both tuple and tensor outputs from projection hooks
- Properly reshape Q/K/V tensors to [seq_len, num_heads, head_dim]
- Support GQA by expanding K/V heads (8) to match Q heads (32)
- Add warning logs for debugging hook failures

This fixes the "too many indices for tensor of dimension 2" and shape mismatch errors
when extracting QKV matrices from Devstral; the core reshape logic is sketched below.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
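For reference, the reshape logic described in the bullets above boils down to one routine. The following is a minimal sketch, not code from this commit: the helper name `reshape_proj_output` is hypothetical, and the example numbers (32 query heads, 8 KV heads, head_dim 128) are the Devstral figures quoted in the message above.

```python
# Minimal sketch of the hook-side reshape logic; `reshape_proj_output` is a
# hypothetical helper, not part of backend/model_service.py.
import torch

def reshape_proj_output(output, n_heads: int, head_dim: int) -> torch.Tensor:
    """Normalize a q/k/v projection hook output to [seq_len, n_heads, head_dim]."""
    out = output[0] if isinstance(output, tuple) else output  # hooks may see tuples
    out = out.detach().cpu()
    if out.dim() == 3:            # [batch, seq_len, heads * head_dim]
        out = out[0]              # keep the first batch element
    seq_len, hidden = out.shape
    actual_heads = hidden // head_dim              # 8 for Devstral K/V, 32 for Q
    out = out.reshape(seq_len, actual_heads, head_dim)
    if actual_heads != n_heads:                    # GQA: expand K/V heads to match Q
        out = out.repeat_interleave(n_heads // actual_heads, dim=1)
    return out

# Example: a K projection output of shape [1, 42, 1024] becomes [42, 32, 128]
# with n_heads=32 and head_dim=128 (8 KV heads repeated 4x).
```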

Files changed (1)
  1. backend/model_service.py +124 -20
backend/model_service.py CHANGED
@@ -1571,7 +1571,11 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
 
     n_heads = manager.model.config.n_head if hasattr(manager.model.config, 'n_head') else manager.model.config.num_attention_heads
     d_model = manager.model.config.n_embd if hasattr(manager.model.config, 'n_embd') else manager.model.config.hidden_size
-    head_dim = d_model // n_heads
+    # Use explicit head_dim from config if available (Mistral models have this)
+    if hasattr(manager.model.config, 'head_dim'):
+        head_dim = manager.model.config.head_dim
+    else:
+        head_dim = d_model // n_heads
 
     # Generation loop with full instrumentation
     layer_data_by_token = []  # Store layer data for each generated token
@@ -1612,10 +1616,22 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
             try:
                 if layer_idx not in qkv_captures:
                     qkv_captures[layer_idx] = {}
+                # Handle both tuple and tensor outputs
+                if isinstance(output, tuple):
+                    out = output[0]
+                else:
+                    out = output
+                out = out.detach().cpu()
                 # output shape: [batch, seq_len, num_heads * head_dim]
-                qkv_captures[layer_idx]['q'] = output[0].detach().cpu()
-            except Exception:
-                pass
+                # If 3D, take first batch element
+                if out.dim() == 3:
+                    out = out[0]  # [seq_len, num_heads * head_dim]
+                # Reshape to [seq_len, num_heads, head_dim]
+                seq_len = out.shape[0]
+                out = out.reshape(seq_len, n_heads, head_dim)
+                qkv_captures[layer_idx]['q'] = out
+            except Exception as e:
+                logger.warning(f"Q hook error layer {layer_idx}: {e}")
         return hook
 
     def make_separate_k_hook(layer_idx):
@@ -1624,9 +1640,27 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
             try:
                 if layer_idx not in qkv_captures:
                     qkv_captures[layer_idx] = {}
-                qkv_captures[layer_idx]['k'] = output[0].detach().cpu()
-            except Exception:
-                pass
+                # Handle both tuple and tensor outputs
+                if isinstance(output, tuple):
+                    out = output[0]
+                else:
+                    out = output
+                out = out.detach().cpu()
+                # If 3D, take first batch element
+                if out.dim() == 3:
+                    out = out[0]  # [seq_len, kv_heads * head_dim]
+                # For GQA models, K has fewer heads (kv_heads)
+                seq_len = out.shape[0]
+                hidden_size = out.shape[1]
+                actual_kv_heads = hidden_size // head_dim
+                out = out.reshape(seq_len, actual_kv_heads, head_dim)
+                # If GQA, repeat KV heads to match Q heads
+                if actual_kv_heads != n_heads:
+                    repeat_factor = n_heads // actual_kv_heads
+                    out = out.repeat_interleave(repeat_factor, dim=1)
+                qkv_captures[layer_idx]['k'] = out
+            except Exception as e:
+                logger.warning(f"K hook error layer {layer_idx}: {e}")
         return hook
 
     def make_separate_v_hook(layer_idx):
@@ -1635,9 +1669,27 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
             try:
                 if layer_idx not in qkv_captures:
                     qkv_captures[layer_idx] = {}
-                qkv_captures[layer_idx]['v'] = output[0].detach().cpu()
-            except Exception:
-                pass
+                # Handle both tuple and tensor outputs
+                if isinstance(output, tuple):
+                    out = output[0]
+                else:
+                    out = output
+                out = out.detach().cpu()
+                # If 3D, take first batch element
+                if out.dim() == 3:
+                    out = out[0]  # [seq_len, kv_heads * head_dim]
+                # For GQA models, V has fewer heads (kv_heads)
+                seq_len = out.shape[0]
+                hidden_size = out.shape[1]
+                actual_kv_heads = hidden_size // head_dim
+                out = out.reshape(seq_len, actual_kv_heads, head_dim)
+                # If GQA, repeat KV heads to match Q heads
+                if actual_kv_heads != n_heads:
+                    repeat_factor = n_heads // actual_kv_heads
+                    out = out.repeat_interleave(repeat_factor, dim=1)
+                qkv_captures[layer_idx]['v'] = out
+            except Exception as e:
+                logger.warning(f"V hook error layer {layer_idx}: {e}")
         return hook
 
     # Register hooks - support both CodeGen and Mistral/Devstral architectures
@@ -2068,7 +2120,11 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
 
     n_heads = manager.model.config.n_head if hasattr(manager.model.config, 'n_head') else manager.model.config.num_attention_heads
     d_model = manager.model.config.n_embd if hasattr(manager.model.config, 'n_embd') else manager.model.config.hidden_size
-    head_dim = d_model // n_heads
+    # Use explicit head_dim from config if available (Mistral models have this)
+    if hasattr(manager.model.config, 'head_dim'):
+        head_dim = manager.model.config.head_dim
+    else:
+        head_dim = d_model // n_heads
 
     # === STAGE 2: GENERATING ===
     layer_data_by_token = []
@@ -2105,9 +2161,21 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
             try:
                 if layer_idx not in qkv_captures:
                     qkv_captures[layer_idx] = {}
-                qkv_captures[layer_idx]['q'] = output[0].detach().cpu()
-            except Exception:
-                pass
+                # Handle both tuple and tensor outputs
+                if isinstance(output, tuple):
+                    out = output[0]
+                else:
+                    out = output
+                out = out.detach().cpu()
+                # If 3D, take first batch element
+                if out.dim() == 3:
+                    out = out[0]  # [seq_len, num_heads * head_dim]
+                # Reshape to [seq_len, num_heads, head_dim]
+                seq_len = out.shape[0]
+                out = out.reshape(seq_len, n_heads, head_dim)
+                qkv_captures[layer_idx]['q'] = out
+            except Exception as e:
+                logger.warning(f"[Stream] Q hook error layer {layer_idx}: {e}")
         return hook
 
     def make_separate_k_hook(layer_idx):
@@ -2116,9 +2184,27 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
             try:
                 if layer_idx not in qkv_captures:
                     qkv_captures[layer_idx] = {}
-                qkv_captures[layer_idx]['k'] = output[0].detach().cpu()
-            except Exception:
-                pass
+                # Handle both tuple and tensor outputs
+                if isinstance(output, tuple):
+                    out = output[0]
+                else:
+                    out = output
+                out = out.detach().cpu()
+                # If 3D, take first batch element
+                if out.dim() == 3:
+                    out = out[0]  # [seq_len, kv_heads * head_dim]
+                # For GQA models, K has fewer heads (kv_heads)
+                seq_len = out.shape[0]
+                hidden_size = out.shape[1]
+                actual_kv_heads = hidden_size // head_dim
+                out = out.reshape(seq_len, actual_kv_heads, head_dim)
+                # If GQA, repeat KV heads to match Q heads
+                if actual_kv_heads != n_heads:
+                    repeat_factor = n_heads // actual_kv_heads
+                    out = out.repeat_interleave(repeat_factor, dim=1)
+                qkv_captures[layer_idx]['k'] = out
+            except Exception as e:
+                logger.warning(f"[Stream] K hook error layer {layer_idx}: {e}")
         return hook
 
     def make_separate_v_hook(layer_idx):
@@ -2127,9 +2213,27 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
             try:
                 if layer_idx not in qkv_captures:
                     qkv_captures[layer_idx] = {}
-                qkv_captures[layer_idx]['v'] = output[0].detach().cpu()
-            except Exception:
-                pass
+                # Handle both tuple and tensor outputs
+                if isinstance(output, tuple):
+                    out = output[0]
+                else:
+                    out = output
+                out = out.detach().cpu()
+                # If 3D, take first batch element
+                if out.dim() == 3:
+                    out = out[0]  # [seq_len, kv_heads * head_dim]
+                # For GQA models, V has fewer heads (kv_heads)
+                seq_len = out.shape[0]
+                hidden_size = out.shape[1]
+                actual_kv_heads = hidden_size // head_dim
+                out = out.reshape(seq_len, actual_kv_heads, head_dim)
+                # If GQA, repeat KV heads to match Q heads
+                if actual_kv_heads != n_heads:
+                    repeat_factor = n_heads // actual_kv_heads
+                    out = out.repeat_interleave(repeat_factor, dim=1)
+                qkv_captures[layer_idx]['v'] = out
+            except Exception as e:
+                logger.warning(f"[Stream] V hook error layer {layer_idx}: {e}")
        return hook
 
     # Register hooks - support both CodeGen and Mistral/Devstral architectures
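The registration step referenced by the last context line is not shown in this diff. As a rough sketch only, assuming the standard Hugging Face Mistral/Devstral module layout (`model.model.layers[i].self_attn.q_proj` / `k_proj` / `v_proj`), the hooks above would be attached and cleaned up roughly like this; a CodeGen-style path would hook its fused QKV projection instead:

```python
# Sketch, not part of this commit: register the per-layer Q/K/V hooks on a
# Mistral/Devstral-style model and make sure they are removed afterwards.
handles = []
for layer_idx, layer in enumerate(manager.model.model.layers):
    attn = layer.self_attn
    handles.append(attn.q_proj.register_forward_hook(make_separate_q_hook(layer_idx)))
    handles.append(attn.k_proj.register_forward_hook(make_separate_k_hook(layer_idx)))
    handles.append(attn.v_proj.register_forward_hook(make_separate_v_hook(layer_idx)))
try:
    pass  # run the instrumented generation loop; hooks populate qkv_captures
finally:
    for handle in handles:
        handle.remove()  # always detach hooks so later requests start clean
```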