Spaces:
Sleeping
gary-boon
Claude Opus 4.5
committed on
Commit 4ec134b · 1 Parent(s): 3e67ea2
Fix QKV visualization for Mistral/Devstral architecture
- Add separate Q, K, V hook makers for Mistral-style architectures that use separate projections (q_proj, k_proj, v_proj)
- Update layer iteration to use the adapter's _get_layers() method for architecture-agnostic layer access
- Maintain backwards compatibility with CodeGen's combined QKV projection
- Apply the same fix to both the regular and streaming endpoints
This fixes the QKV visualization, which was silently failing for Devstral because the hook registration only supported CodeGen's transformer.h structure.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <[email protected]>
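For readers skimming the diff below: the core of the change is registering PyTorch forward hooks on each layer's attention projections so per-layer Q/K/V activations can be captured for visualization. The following is a minimal, illustrative sketch of that pattern, not the service code itself; `model`, `captures`, and the `model.model.layers` attribute path are assumptions that hold for Mistral/LLaMA-style checkpoints.

```python
import torch

def register_qkv_hooks(model, captures):
    """Attach forward hooks that record per-layer Q/K/V projection outputs.

    `captures` is a caller-supplied dict filled as
    {layer_idx: {'q': tensor, 'k': tensor, 'v': tensor}}.
    """
    handles = []

    def make_hook(layer_idx, key):
        def hook(module, inputs, output):
            # For a separate projection, output is [batch, seq_len, num_heads * head_dim].
            captures.setdefault(layer_idx, {})[key] = output.detach().cpu()
        return hook

    # Mistral/LLaMA-style models expose decoder blocks under model.model.layers,
    # each with self_attn.{q,k,v}_proj. (CodeGen-style models instead use
    # transformer.h with a combined projection; see the actual diff below.)
    layers = getattr(getattr(model, "model", model), "layers", [])
    for idx, layer in enumerate(layers):
        attn = getattr(layer, "self_attn", None)
        if attn is not None and hasattr(attn, "q_proj"):
            handles.append(attn.q_proj.register_forward_hook(make_hook(idx, "q")))
            handles.append(attn.k_proj.register_forward_hook(make_hook(idx, "k")))
            handles.append(attn.v_proj.register_forward_hook(make_hook(idx, "v")))
    return handles

# Usage (illustrative):
#   captures = {}
#   handles = register_qkv_hooks(model, captures)
#   with torch.no_grad():
#       model(**inputs)
#   for h in handles:
#       h.remove()
```

The commit's actual implementation additionally resolves the layer list through the model adapter and falls back to CodeGen's `transformer.h` layout with its combined `qkv_proj`/`c_attn` modules, as shown in the hunks below.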
- backend/model_service.py (+134 / -20)

backend/model_service.py CHANGED
@@ -1582,6 +1582,7 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
     hooks = []

     def make_qkv_hook(layer_idx):
+        """Hook for combined QKV projection (CodeGen/GPT-NeoX style)"""
         def hook(module, input, output):
             try:
                 # output shape: [batch, seq_len, 3 * hidden_size]
@@ -1605,18 +1606,74 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
                 pass
         return hook

- [2 old lines removed; content not shown in this view]
+    def make_separate_q_hook(layer_idx):
+        """Hook for separate Q projection (Mistral/LLaMA style)"""
+        def hook(module, input, output):
+            try:
+                if layer_idx not in qkv_captures:
+                    qkv_captures[layer_idx] = {}
+                # output shape: [batch, seq_len, num_heads * head_dim]
+                qkv_captures[layer_idx]['q'] = output[0].detach().cpu()
+            except Exception:
+                pass
+        return hook
+
+    def make_separate_k_hook(layer_idx):
+        """Hook for separate K projection (Mistral/LLaMA style)"""
+        def hook(module, input, output):
+            try:
+                if layer_idx not in qkv_captures:
+                    qkv_captures[layer_idx] = {}
+                qkv_captures[layer_idx]['k'] = output[0].detach().cpu()
+            except Exception:
+                pass
+        return hook
+
+    def make_separate_v_hook(layer_idx):
+        """Hook for separate V projection (Mistral/LLaMA style)"""
+        def hook(module, input, output):
+            try:
+                if layer_idx not in qkv_captures:
+                    qkv_captures[layer_idx] = {}
+                qkv_captures[layer_idx]['v'] = output[0].detach().cpu()
+            except Exception:
+                pass
+        return hook
+
+    # Register hooks - support both CodeGen and Mistral/Devstral architectures
     try:
- [9 old lines removed; content not shown in this view]
+        # Try to get layers via adapter first (works for all model types)
+        layers = None
+        if manager.adapter:
+            try:
+                layers = manager.adapter._get_layers()
+                logger.info(f"Using adapter to get {len(layers)} layers for QKV hooks")
+            except Exception:
+                pass
+
+        # Fallback for CodeGen if adapter doesn't work
+        if layers is None and hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
+            layers = manager.model.transformer.h
+            logger.info(f"Using transformer.h for {len(layers)} layers for QKV hooks")
+
+        if layers:
+            for layer_idx, layer in enumerate(layers):
+                # Mistral/Devstral: separate Q, K, V projections
+                if hasattr(layer, 'self_attn'):
+                    attn = layer.self_attn
+                    if hasattr(attn, 'q_proj') and hasattr(attn, 'k_proj') and hasattr(attn, 'v_proj'):
+                        hooks.append(attn.q_proj.register_forward_hook(make_separate_q_hook(layer_idx)))
+                        hooks.append(attn.k_proj.register_forward_hook(make_separate_k_hook(layer_idx)))
+                        hooks.append(attn.v_proj.register_forward_hook(make_separate_v_hook(layer_idx)))
+                # CodeGen/GPT-NeoX: combined QKV projection
+                elif hasattr(layer, 'attn'):
+                    if hasattr(layer.attn, 'qkv_proj'):
+                        hooks.append(layer.attn.qkv_proj.register_forward_hook(make_qkv_hook(layer_idx)))
+                    elif hasattr(layer.attn, 'c_attn'):
+                        # GPT-2 style attention
+                        hooks.append(layer.attn.c_attn.register_forward_hook(make_qkv_hook(layer_idx)))
+
+        logger.info(f"Registered {len(hooks)} QKV hooks")
     except Exception as hook_error:
         logger.warning(f"Could not register QKV hooks: {hook_error}")

@@ -2022,6 +2079,7 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
     hooks = []

     def make_qkv_hook(layer_idx):
+        """Hook for combined QKV projection (CodeGen/GPT-NeoX style)"""
         def hook(module, input, output):
             try:
                 if output.dim() != 3:
@@ -2041,16 +2099,72 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
                 pass
         return hook

- [1 old line removed; content not shown in this view]
+    def make_separate_q_hook(layer_idx):
+        """Hook for separate Q projection (Mistral/LLaMA style)"""
+        def hook(module, input, output):
+            try:
+                if layer_idx not in qkv_captures:
+                    qkv_captures[layer_idx] = {}
+                qkv_captures[layer_idx]['q'] = output[0].detach().cpu()
+            except Exception:
+                pass
+        return hook
+
+    def make_separate_k_hook(layer_idx):
+        """Hook for separate K projection (Mistral/LLaMA style)"""
+        def hook(module, input, output):
+            try:
+                if layer_idx not in qkv_captures:
+                    qkv_captures[layer_idx] = {}
+                qkv_captures[layer_idx]['k'] = output[0].detach().cpu()
+            except Exception:
+                pass
+        return hook
+
+    def make_separate_v_hook(layer_idx):
+        """Hook for separate V projection (Mistral/LLaMA style)"""
+        def hook(module, input, output):
+            try:
+                if layer_idx not in qkv_captures:
+                    qkv_captures[layer_idx] = {}
+                qkv_captures[layer_idx]['v'] = output[0].detach().cpu()
+            except Exception:
+                pass
+        return hook
+
+    # Register hooks - support both CodeGen and Mistral/Devstral architectures
     try:
- [8 old lines removed; content not shown in this view]
+        # Try to get layers via adapter first (works for all model types)
+        layers = None
+        if manager.adapter:
+            try:
+                layers = manager.adapter._get_layers()
+                logger.info(f"[Stream] Using adapter to get {len(layers)} layers for QKV hooks")
+            except Exception:
+                pass
+
+        # Fallback for CodeGen if adapter doesn't work
+        if layers is None and hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
+            layers = manager.model.transformer.h
+            logger.info(f"[Stream] Using transformer.h for {len(layers)} layers for QKV hooks")
+
+        if layers:
+            for layer_idx, layer in enumerate(layers):
+                # Mistral/Devstral: separate Q, K, V projections
+                if hasattr(layer, 'self_attn'):
+                    attn = layer.self_attn
+                    if hasattr(attn, 'q_proj') and hasattr(attn, 'k_proj') and hasattr(attn, 'v_proj'):
+                        hooks.append(attn.q_proj.register_forward_hook(make_separate_q_hook(layer_idx)))
+                        hooks.append(attn.k_proj.register_forward_hook(make_separate_k_hook(layer_idx)))
+                        hooks.append(attn.v_proj.register_forward_hook(make_separate_v_hook(layer_idx)))
+                # CodeGen/GPT-NeoX: combined QKV projection
+                elif hasattr(layer, 'attn'):
+                    if hasattr(layer.attn, 'qkv_proj'):
+                        hooks.append(layer.attn.qkv_proj.register_forward_hook(make_qkv_hook(layer_idx)))
+                    elif hasattr(layer.attn, 'c_attn'):
+                        hooks.append(layer.attn.c_attn.register_forward_hook(make_qkv_hook(layer_idx)))
+
+        logger.info(f"[Stream] Registered {len(hooks)} QKV hooks")
     except Exception as hook_error:
         logger.warning(f"Could not register QKV hooks: {hook_error}")

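Aside (not part of the commit): the body of make_qkv_hook is mostly elided in the hunks above; only the `# output shape: [batch, seq_len, 3 * hidden_size]` comment is visible. Below is a hedged sketch of how such a combined projection output could be split into Q, K, V, assuming a plain [Q | K | V] layout along the last dimension; CodeGen's real qkv_proj interleaves heads, so the service's actual hook likely uses a head-aware reshape instead.

```python
import torch

def split_combined_qkv(output: torch.Tensor):
    """Split a combined QKV projection output into Q, K, V tensors.

    Assumes the last dimension is 3 * hidden_size laid out as [Q | K | V].
    """
    hidden = output.shape[-1] // 3
    # Each chunk: [batch, seq_len, hidden_size]
    q, k, v = output.split(hidden, dim=-1)
    return q.detach().cpu(), k.detach().cpu(), v.detach().cpu()

# Example with dummy activations: batch=1, seq_len=4, hidden_size=8
q, k, v = split_combined_qkv(torch.randn(1, 4, 3 * 8))
```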