gary-boon and Claude Opus 4.5 committed
Commit d0b7e29 · 1 Parent(s): a79cb83

Revert QKV visualization fixes - need better approach for data streaming


Reverts commits:
- a79cb83 Add safety checks for missing QKV keys
- decb5ab Limit QKV matrices to top 5 heads per layer
- 9056859 Fix QKV matrix extraction for Mistral/Devstral architecture
- 4ec134b Fix QKV visualization for Mistral/Devstral architecture

The QKV fixes caused response sizes to explode, leading to 504 timeouts.
A better approach (lazy loading, see the sketch below) is needed before re-enabling.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
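
For context on the planned fix: the reverted code inlined Q/K/V matrices for every head of every layer into a single JSON response. At roughly 40 layers × 32 heads × 3 matrices × seq_len × head_dim float values (about 250 million numbers at an assumed seq_len of 512 and head_dim of 128), the payload runs to gigabytes of JSON, which explains the 504s. Below is a minimal sketch of the lazy-loading idea; the endpoint shape, the `_qkv_cache` object, and the session keying are illustrative assumptions, not code that exists in this repo.

```python
# Hypothetical lazy-loading sketch (not part of this commit): keep captured
# tensors server-side and serve one (layer, head) slice per request instead of
# inlining every head's Q/K/V in the main analysis response.
from typing import Any, Dict, List, Optional

# Assumed cache populated by the analysis endpoint: session -> layer -> {'q','k','v'},
# with tensors shaped [seq_len, n_heads, head_dim] as in the reverted hooks.
_qkv_cache: Dict[str, Dict[int, Dict[str, Any]]] = {}

async def get_qkv_matrices(session_id: str, layer_idx: int, head_idx: int) -> Dict[str, Optional[List[List[float]]]]:
    """Return Q/K/V for a single (layer, head) on demand."""
    layer_qkv = _qkv_cache.get(session_id, {}).get(layer_idx, {})
    result: Dict[str, Optional[List[List[float]]]] = {"q": None, "k": None, "v": None}
    for key in ("q", "k", "v"):
        if key in layer_qkv:
            # Convert to float32 first; numpy does not support bfloat16
            result[key] = layer_qkv[key][:, head_idx, :].float().numpy().tolist()
    return result
```

With per-head requests the cache only needs a TTL or per-session eviction to stay bounded, and the main analysis response can keep returning attention weights without any QKV payload.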

Files changed (1)
  1. backend/model_service.py +28 -271
backend/model_service.py CHANGED
@@ -1571,11 +1571,7 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
 
     n_heads = manager.model.config.n_head if hasattr(manager.model.config, 'n_head') else manager.model.config.num_attention_heads
     d_model = manager.model.config.n_embd if hasattr(manager.model.config, 'n_embd') else manager.model.config.hidden_size
-    # Use explicit head_dim from config if available (Mistral models have this)
-    if hasattr(manager.model.config, 'head_dim'):
-        head_dim = manager.model.config.head_dim
-    else:
-        head_dim = d_model // n_heads
+    head_dim = d_model // n_heads
 
     # Generation loop with full instrumentation
     layer_data_by_token = []  # Store layer data for each generated token
@@ -1586,7 +1582,6 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
     hooks = []
 
     def make_qkv_hook(layer_idx):
-        """Hook for combined QKV projection (CodeGen/GPT-NeoX style)"""
         def hook(module, input, output):
             try:
                 # output shape: [batch, seq_len, 3 * hidden_size]
@@ -1610,122 +1605,18 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
                 pass
         return hook
 
-    def make_separate_q_hook(layer_idx):
-        """Hook for separate Q projection (Mistral/LLaMA style)"""
-        def hook(module, input, output):
-            try:
-                if layer_idx not in qkv_captures:
-                    qkv_captures[layer_idx] = {}
-                # Handle both tuple and tensor outputs
-                if isinstance(output, tuple):
-                    out = output[0]
-                else:
-                    out = output
-                out = out.detach().cpu()
-                # output shape: [batch, seq_len, num_heads * head_dim]
-                # If 3D, take first batch element
-                if out.dim() == 3:
-                    out = out[0]  # [seq_len, num_heads * head_dim]
-                # Reshape to [seq_len, num_heads, head_dim]
-                seq_len = out.shape[0]
-                out = out.reshape(seq_len, n_heads, head_dim)
-                qkv_captures[layer_idx]['q'] = out
-            except Exception as e:
-                logger.warning(f"Q hook error layer {layer_idx}: {e}")
-        return hook
-
-    def make_separate_k_hook(layer_idx):
-        """Hook for separate K projection (Mistral/LLaMA style)"""
-        def hook(module, input, output):
-            try:
-                if layer_idx not in qkv_captures:
-                    qkv_captures[layer_idx] = {}
-                # Handle both tuple and tensor outputs
-                if isinstance(output, tuple):
-                    out = output[0]
-                else:
-                    out = output
-                out = out.detach().cpu()
-                # If 3D, take first batch element
-                if out.dim() == 3:
-                    out = out[0]  # [seq_len, kv_heads * head_dim]
-                # For GQA models, K has fewer heads (kv_heads)
-                seq_len = out.shape[0]
-                hidden_size = out.shape[1]
-                actual_kv_heads = hidden_size // head_dim
-                out = out.reshape(seq_len, actual_kv_heads, head_dim)
-                # If GQA, repeat KV heads to match Q heads
-                if actual_kv_heads != n_heads:
-                    repeat_factor = n_heads // actual_kv_heads
-                    out = out.repeat_interleave(repeat_factor, dim=1)
-                qkv_captures[layer_idx]['k'] = out
-            except Exception as e:
-                logger.warning(f"K hook error layer {layer_idx}: {e}")
-        return hook
-
-    def make_separate_v_hook(layer_idx):
-        """Hook for separate V projection (Mistral/LLaMA style)"""
-        def hook(module, input, output):
-            try:
-                if layer_idx not in qkv_captures:
-                    qkv_captures[layer_idx] = {}
-                # Handle both tuple and tensor outputs
-                if isinstance(output, tuple):
-                    out = output[0]
-                else:
-                    out = output
-                out = out.detach().cpu()
-                # If 3D, take first batch element
-                if out.dim() == 3:
-                    out = out[0]  # [seq_len, kv_heads * head_dim]
-                # For GQA models, V has fewer heads (kv_heads)
-                seq_len = out.shape[0]
-                hidden_size = out.shape[1]
-                actual_kv_heads = hidden_size // head_dim
-                out = out.reshape(seq_len, actual_kv_heads, head_dim)
-                # If GQA, repeat KV heads to match Q heads
-                if actual_kv_heads != n_heads:
-                    repeat_factor = n_heads // actual_kv_heads
-                    out = out.repeat_interleave(repeat_factor, dim=1)
-                qkv_captures[layer_idx]['v'] = out
-            except Exception as e:
-                logger.warning(f"V hook error layer {layer_idx}: {e}")
-        return hook
-
-    # Register hooks - support both CodeGen and Mistral/Devstral architectures
+    # Register hooks on all qkv_proj modules (if available)
+    # This is model-specific - CodeGen uses different architecture
     try:
-        # Try to get layers via adapter first (works for all model types)
-        layers = None
-        if manager.adapter:
-            try:
-                layers = manager.adapter._get_layers()
-                logger.info(f"Using adapter to get {len(layers)} layers for QKV hooks")
-            except Exception:
-                pass
-
-        # Fallback for CodeGen if adapter doesn't work
-        if layers is None and hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
-            layers = manager.model.transformer.h
-            logger.info(f"Using transformer.h for {len(layers)} layers for QKV hooks")
-
-        if layers:
-            for layer_idx, layer in enumerate(layers):
-                # Mistral/Devstral: separate Q, K, V projections
-                if hasattr(layer, 'self_attn'):
-                    attn = layer.self_attn
-                    if hasattr(attn, 'q_proj') and hasattr(attn, 'k_proj') and hasattr(attn, 'v_proj'):
-                        hooks.append(attn.q_proj.register_forward_hook(make_separate_q_hook(layer_idx)))
-                        hooks.append(attn.k_proj.register_forward_hook(make_separate_k_hook(layer_idx)))
-                        hooks.append(attn.v_proj.register_forward_hook(make_separate_v_hook(layer_idx)))
-                # CodeGen/GPT-NeoX: combined QKV projection
-                elif hasattr(layer, 'attn'):
-                    if hasattr(layer.attn, 'qkv_proj'):
-                        hooks.append(layer.attn.qkv_proj.register_forward_hook(make_qkv_hook(layer_idx)))
-                    elif hasattr(layer.attn, 'c_attn'):
-                        # GPT-2 style attention
-                        hooks.append(layer.attn.c_attn.register_forward_hook(make_qkv_hook(layer_idx)))
-
-        logger.info(f"Registered {len(hooks)} QKV hooks")
+        if hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
+            for layer_idx, layer in enumerate(manager.model.transformer.h):
+                if hasattr(layer, 'attn') and hasattr(layer.attn, 'qkv_proj'):
+                    hook = layer.attn.qkv_proj.register_forward_hook(make_qkv_hook(layer_idx))
+                    hooks.append(hook)
+                elif hasattr(layer, 'attn') and hasattr(layer.attn, 'c_attn'):
+                    # GPT-2 style attention
+                    hook = layer.attn.c_attn.register_forward_hook(make_qkv_hook(layer_idx))
+                    hooks.append(hook)
     except Exception as hook_error:
         logger.warning(f"Could not register QKV hooks: {hook_error}")
 
@@ -1859,16 +1750,11 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
             k_matrix = None
             v_matrix = None
             if layer_idx in qkv_captures:
-                layer_qkv = qkv_captures[layer_idx]
                 # Q/K/V shape: [seq_len, n_heads, head_dim]
                 # Convert to float32 for numpy (bfloat16 not supported)
-                # Check each key exists (hooks may have failed for some)
-                if 'q' in layer_qkv:
-                    q_matrix = layer_qkv['q'][:, head_idx, :].float().numpy().tolist()
-                if 'k' in layer_qkv:
-                    k_matrix = layer_qkv['k'][:, head_idx, :].float().numpy().tolist()
-                if 'v' in layer_qkv:
-                    v_matrix = layer_qkv['v'][:, head_idx, :].float().numpy().tolist()
+                q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy().tolist()
+                k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy().tolist()
+                v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy().tolist()
 
             critical_heads.append({
                 "head_idx": head_idx,
@@ -1887,14 +1773,6 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
         # Sort by max_weight (return all heads, frontend will decide how many to display)
         critical_heads.sort(key=lambda h: h["max_weight"], reverse=True)
 
-        # Only keep QKV matrices for top 5 heads to avoid massive response sizes
-        # (40 layers × 32 heads × 3 matrices × seq_len × head_dim is too much data)
-        for i, head in enumerate(critical_heads):
-            if i >= 5:  # Keep QKV only for top 5 heads
-                head["q_matrix"] = None
-                head["k_matrix"] = None
-                head["v_matrix"] = None
-
         # Detect layer-level pattern (percentage-based for any layer count)
         layer_pattern = None
         layer_fraction = (layer_idx + 1) / n_layers  # 1-indexed fraction
@@ -2133,11 +2011,7 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
 
     n_heads = manager.model.config.n_head if hasattr(manager.model.config, 'n_head') else manager.model.config.num_attention_heads
     d_model = manager.model.config.n_embd if hasattr(manager.model.config, 'n_embd') else manager.model.config.hidden_size
-    # Use explicit head_dim from config if available (Mistral models have this)
-    if hasattr(manager.model.config, 'head_dim'):
-        head_dim = manager.model.config.head_dim
-    else:
-        head_dim = d_model // n_heads
+    head_dim = d_model // n_heads
 
     # === STAGE 2: GENERATING ===
     layer_data_by_token = []
@@ -2148,7 +2022,6 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
     hooks = []
 
    def make_qkv_hook(layer_idx):
-        """Hook for combined QKV projection (CodeGen/GPT-NeoX style)"""
        def hook(module, input, output):
            try:
                if output.dim() != 3:
@@ -2168,120 +2041,16 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
                 pass
         return hook
 
-    def make_separate_q_hook(layer_idx):
-        """Hook for separate Q projection (Mistral/LLaMA style)"""
-        def hook(module, input, output):
-            try:
-                if layer_idx not in qkv_captures:
-                    qkv_captures[layer_idx] = {}
-                # Handle both tuple and tensor outputs
-                if isinstance(output, tuple):
-                    out = output[0]
-                else:
-                    out = output
-                out = out.detach().cpu()
-                # If 3D, take first batch element
-                if out.dim() == 3:
-                    out = out[0]  # [seq_len, num_heads * head_dim]
-                # Reshape to [seq_len, num_heads, head_dim]
-                seq_len = out.shape[0]
-                out = out.reshape(seq_len, n_heads, head_dim)
-                qkv_captures[layer_idx]['q'] = out
-            except Exception as e:
-                logger.warning(f"[Stream] Q hook error layer {layer_idx}: {e}")
-        return hook
-
-    def make_separate_k_hook(layer_idx):
-        """Hook for separate K projection (Mistral/LLaMA style)"""
-        def hook(module, input, output):
-            try:
-                if layer_idx not in qkv_captures:
-                    qkv_captures[layer_idx] = {}
-                # Handle both tuple and tensor outputs
-                if isinstance(output, tuple):
-                    out = output[0]
-                else:
-                    out = output
-                out = out.detach().cpu()
-                # If 3D, take first batch element
-                if out.dim() == 3:
-                    out = out[0]  # [seq_len, kv_heads * head_dim]
-                # For GQA models, K has fewer heads (kv_heads)
-                seq_len = out.shape[0]
-                hidden_size = out.shape[1]
-                actual_kv_heads = hidden_size // head_dim
-                out = out.reshape(seq_len, actual_kv_heads, head_dim)
-                # If GQA, repeat KV heads to match Q heads
-                if actual_kv_heads != n_heads:
-                    repeat_factor = n_heads // actual_kv_heads
-                    out = out.repeat_interleave(repeat_factor, dim=1)
-                qkv_captures[layer_idx]['k'] = out
-            except Exception as e:
-                logger.warning(f"[Stream] K hook error layer {layer_idx}: {e}")
-        return hook
-
-    def make_separate_v_hook(layer_idx):
-        """Hook for separate V projection (Mistral/LLaMA style)"""
-        def hook(module, input, output):
-            try:
-                if layer_idx not in qkv_captures:
-                    qkv_captures[layer_idx] = {}
-                # Handle both tuple and tensor outputs
-                if isinstance(output, tuple):
-                    out = output[0]
-                else:
-                    out = output
-                out = out.detach().cpu()
-                # If 3D, take first batch element
-                if out.dim() == 3:
-                    out = out[0]  # [seq_len, kv_heads * head_dim]
-                # For GQA models, V has fewer heads (kv_heads)
-                seq_len = out.shape[0]
-                hidden_size = out.shape[1]
-                actual_kv_heads = hidden_size // head_dim
-                out = out.reshape(seq_len, actual_kv_heads, head_dim)
-                # If GQA, repeat KV heads to match Q heads
-                if actual_kv_heads != n_heads:
-                    repeat_factor = n_heads // actual_kv_heads
-                    out = out.repeat_interleave(repeat_factor, dim=1)
-                qkv_captures[layer_idx]['v'] = out
-            except Exception as e:
-                logger.warning(f"[Stream] V hook error layer {layer_idx}: {e}")
-        return hook
-
-    # Register hooks - support both CodeGen and Mistral/Devstral architectures
+    # Register hooks
     try:
-        # Try to get layers via adapter first (works for all model types)
-        layers = None
-        if manager.adapter:
-            try:
-                layers = manager.adapter._get_layers()
-                logger.info(f"[Stream] Using adapter to get {len(layers)} layers for QKV hooks")
-            except Exception:
-                pass
-
-        # Fallback for CodeGen if adapter doesn't work
-        if layers is None and hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
-            layers = manager.model.transformer.h
-            logger.info(f"[Stream] Using transformer.h for {len(layers)} layers for QKV hooks")
-
-        if layers:
-            for layer_idx, layer in enumerate(layers):
-                # Mistral/Devstral: separate Q, K, V projections
-                if hasattr(layer, 'self_attn'):
-                    attn = layer.self_attn
-                    if hasattr(attn, 'q_proj') and hasattr(attn, 'k_proj') and hasattr(attn, 'v_proj'):
-                        hooks.append(attn.q_proj.register_forward_hook(make_separate_q_hook(layer_idx)))
-                        hooks.append(attn.k_proj.register_forward_hook(make_separate_k_hook(layer_idx)))
-                        hooks.append(attn.v_proj.register_forward_hook(make_separate_v_hook(layer_idx)))
-                # CodeGen/GPT-NeoX: combined QKV projection
-                elif hasattr(layer, 'attn'):
-                    if hasattr(layer.attn, 'qkv_proj'):
-                        hooks.append(layer.attn.qkv_proj.register_forward_hook(make_qkv_hook(layer_idx)))
-                    elif hasattr(layer.attn, 'c_attn'):
-                        hooks.append(layer.attn.c_attn.register_forward_hook(make_qkv_hook(layer_idx)))
-
-        logger.info(f"[Stream] Registered {len(hooks)} QKV hooks")
+        if hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
+            for layer_idx, layer in enumerate(manager.model.transformer.h):
+                if hasattr(layer, 'attn') and hasattr(layer.attn, 'qkv_proj'):
+                    hook = layer.attn.qkv_proj.register_forward_hook(make_qkv_hook(layer_idx))
+                    hooks.append(hook)
+                elif hasattr(layer, 'attn') and hasattr(layer.attn, 'c_attn'):
+                    hook = layer.attn.c_attn.register_forward_hook(make_qkv_hook(layer_idx))
+                    hooks.append(hook)
     except Exception as hook_error:
         logger.warning(f"Could not register QKV hooks: {hook_error}")
 
@@ -2419,14 +2188,9 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
             k_matrix = None
             v_matrix = None
             if layer_idx in qkv_captures:
-                layer_qkv = qkv_captures[layer_idx]
-                # Check each key exists (hooks may have failed for some)
-                if 'q' in layer_qkv:
-                    q_matrix = layer_qkv['q'][:, head_idx, :].float().numpy().tolist()
-                if 'k' in layer_qkv:
-                    k_matrix = layer_qkv['k'][:, head_idx, :].float().numpy().tolist()
-                if 'v' in layer_qkv:
-                    v_matrix = layer_qkv['v'][:, head_idx, :].float().numpy().tolist()
+                q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy().tolist()
+                k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy().tolist()
+                v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy().tolist()
 
             critical_heads.append({
                 "head_idx": head_idx,
@@ -2441,13 +2205,6 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
 
         critical_heads.sort(key=lambda h: h["max_weight"], reverse=True)
 
-        # Only keep QKV matrices for top 5 heads to avoid massive response sizes
-        for i, head in enumerate(critical_heads):
-            if i >= 5:
-                head["q_matrix"] = None
-                head["k_matrix"] = None
-                head["v_matrix"] = None
-
        layer_pattern = None
        layer_fraction = (layer_idx + 1) / n_layers
        if layer_idx == 0: