#!/usr/bin/env python3
"""Comprehensive End-to-End Testing for MVP3 Sprint 5.

This module implements complete end-to-end testing scenarios covering:

- Full user workflows from query to execution
- Different query complexity levels
- Error handling and recovery scenarios
- System integration across all components
- Performance and reliability testing
"""

import time
from unittest.mock import patch

import pytest
from fastapi.testclient import TestClient

from app import app, initialize_agent_system
from kg_services.ontology import MCPPrompt, MCPTool, PlannedStep


class TestE2EUserWorkflows:
    """Test complete end-to-end user workflows."""

    @pytest.fixture
    def client(self):
        """Provide test client for API testing."""
        return TestClient(app)

    @pytest.fixture
    def initialized_system(self):
        """Provide fully initialized system components."""
        import app as app_module

        planner, executor = initialize_agent_system()
        # Set global variables for app endpoints
        app_module.planner_agent = planner
        app_module.executor_agent = executor
        return {"planner": planner, "executor": executor, "client": TestClient(app)}

    def test_complete_sentiment_analysis_workflow(self, initialized_system):
        """Test complete workflow: query → plan → input → execute."""
        client = initialized_system["client"]

        # Step 1: Health check to ensure system is ready
        response = client.get("/health")
        assert response.status_code == 200
        assert response.json()["status"] == "healthy"

        # Skip test if system not properly initialized
        if initialized_system["planner"] is None:
            pytest.skip(
                "Agent system not initialized - likely missing data files or API keys"
            )

        # Step 2: Submit sentiment analysis query
        plan_request = {
            "query": "I need to analyze customer sentiment from product reviews",
            "top_k": 3,
        }
        response = client.post("/api/plan/generate", json=plan_request)

        # Should either succeed or fail gracefully
        if response.status_code == 503:
            pytest.skip("Agent system not available - check initialization")

        assert response.status_code == 200
        plan_data = response.json()
        assert plan_data["status"] == "success"
        assert "planned_steps" in plan_data
        assert len(plan_data["planned_steps"]) > 0

        # Step 3: Verify plan structure and content
        first_step = plan_data["planned_steps"][0]
        assert "tool" in first_step
        assert "prompt" in first_step
        assert "relevance_score" in first_step

        # Verify tool information
        tool_info = first_step["tool"]
        assert "name" in tool_info
        assert "description" in tool_info
        assert (
            "sentiment" in tool_info["name"].lower()
            or "sentiment" in tool_info["description"].lower()
        )

        # Verify prompt information
        prompt_info = first_step["prompt"]
        assert "template_string" in prompt_info
        assert "input_variables" in prompt_info

        # Step 4: Check input requirements are clear
        assert len(prompt_info["input_variables"]) > 0
        input_vars = prompt_info["input_variables"]
        template = prompt_info["template_string"]

        # Verify template contains placeholders for variables
        for var in input_vars:
            assert f"{{{{{var}}}}}" in template

        # Step 5: Simulate execution with valid inputs
        if initialized_system["executor"]:
            planned_step = PlannedStep(
                tool=MCPTool(
                    tool_id=tool_info["tool_id"],
                    name=tool_info["name"],
                    description=tool_info["description"],
                    tags=tool_info.get("tags", []),
                    invocation_command_stub=tool_info.get(
                        "invocation_command_stub", ""
                    ),
                ),
                prompt=MCPPrompt(
                    prompt_id=prompt_info["prompt_id"],
                    name=prompt_info["name"],
                    description=prompt_info["description"],
                    target_tool_id=prompt_info.get(
                        "target_tool_id", tool_info["tool_id"]
                    ),
                    template_string=prompt_info["template_string"],
                    input_variables=prompt_info["input_variables"],
                    difficulty_level=prompt_info.get("difficulty_level", "beginner"),
                ),
                relevance_score=first_step["relevance_score"],
            )

            # Create realistic input values
            test_inputs = {}
            for var in input_vars:
                if "text" in var.lower() or "content" in var.lower():
                    test_inputs[var] = (
                        "This product is amazing! I love it so much and would definitely recommend it to others."
                    )
                elif "format" in var.lower():
                    test_inputs[var] = "detailed"
                else:
                    test_inputs[var] = "sample input"

            execution_result = initialized_system["executor"].execute_plan_step(
                planned_step, test_inputs
            )

            # Verify execution results (McpExecutorAgent format)
            assert execution_result["status"] in [
                "simulated_success",
                "simulated_error_timeout",
                "simulated_error_rate_limit",
                "simulated_error_invalid_input",
                "simulated_error_service_unavailable",
                "simulated_error_authentication_failed",
            ]
            assert execution_result["execution_mode"] in ["simulated", "simulated_error"]
            assert "tool_specific_output" in execution_result
            assert execution_result["tool_id_used"] == planned_step.tool.tool_id
            assert execution_result["tool_name_used"] == planned_step.tool.name

    def test_text_summarization_workflow(self, initialized_system):
        """Test complete text summarization workflow."""
        client = initialized_system["client"]

        # Submit summarization query
        plan_request = {
            "query": "I need to summarize a long technical document for my team",
            "top_k": 2,
        }
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()
        assert plan_data["status"] == "success"
        assert len(plan_data["planned_steps"]) > 0

        # Find summarization tool
        summarization_step = None
        for step in plan_data["planned_steps"]:
            tool_name = step["tool"]["name"].lower()
            tool_desc = step["tool"]["description"].lower()
            if "summar" in tool_name or "summar" in tool_desc:
                summarization_step = step
                break

        assert summarization_step is not None, "No summarization tool found in plan"

        # Verify summarization-specific features
        prompt_info = summarization_step["prompt"]
        assert len(prompt_info["input_variables"]) > 0

        # Look for document/text input variable
        text_vars = [
            var
            for var in prompt_info["input_variables"]
            if "text" in var.lower()
            or "document" in var.lower()
            or "content" in var.lower()
        ]
        assert len(text_vars) > 0, "No text input variable found for summarization"

    def test_code_quality_analysis_workflow(self, initialized_system):
        """Test code quality analysis workflow."""
        client = initialized_system["client"]

        # Submit code quality query
        plan_request = {
            "query": "I need to check my Python code for security vulnerabilities and quality issues",
            "top_k": 3,
        }
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()
        assert plan_data["status"] == "success"

        # Look for code-related tools
        code_tools = []
        for step in plan_data["planned_steps"]:
            tool_name = step["tool"]["name"].lower()
            tool_desc = step["tool"]["description"].lower()
            if any(
                keyword in tool_name or keyword in tool_desc
                for keyword in ["code", "quality", "lint", "security", "vulnerability"]
            ):
                code_tools.append(step)

        assert len(code_tools) > 0, "No code-related tools found"

        # Verify code analysis capabilities
        code_step = code_tools[0]
        prompt_info = code_step["prompt"]

        # Look for code input variables
        code_vars = [
            var
            for var in prompt_info["input_variables"]
            if any(
                keyword in var.lower()
                for keyword in ["code", "script", "file", "source"]
            )
        ]

        # Should have code input or be ready to use without input
        assert len(code_vars) > 0 or len(prompt_info["input_variables"]) == 0

    def test_image_captioning_workflow(self, initialized_system):
        """Test image captioning workflow."""
        client = initialized_system["client"]

        # Submit image captioning query
        plan_request = {
            "query": "I need to generate captions for images in my project",
            "top_k": 2,
        }
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()
        assert plan_data["status"] == "success"

        # Look for image-related tools
        image_tools = []
        for step in plan_data["planned_steps"]:
            tool_name = step["tool"]["name"].lower()
            tool_desc = step["tool"]["description"].lower()
            if any(
                keyword in tool_name or keyword in tool_desc
                for keyword in ["image", "caption", "visual", "photo", "picture"]
            ):
                image_tools.append(step)

        assert len(image_tools) > 0, "No image-related tools found"

        # Verify image processing capabilities
        image_step = image_tools[0]
        prompt_info = image_step["prompt"]

        # Should handle image inputs (check for image-related or generic input variables)
        image_vars = [
            var
            for var in prompt_info["input_variables"]
            if any(
                keyword in var.lower()
                for keyword in ["image", "photo", "file", "path", "url"]
            )
        ]
        # Accept either image-specific variables, generic context variables, or no variables
        generic_vars = [
            var
            for var in prompt_info["input_variables"]
            if any(
                keyword in var.lower()
                for keyword in ["context", "input", "data", "content", "element"]
            )
        ]
        assert (
            len(image_vars) > 0
            or len(generic_vars) > 0
            or len(prompt_info["input_variables"]) == 0
        ), f"Prompt has unexpected input variables: {prompt_info['input_variables']}"


class TestE2EQueryScenarios:
    """Test different types of user queries and their handling."""

    @pytest.fixture
    def client(self):
        """Provide test client."""
        # Initialize system for these tests too
        import app as app_module

        planner, executor = initialize_agent_system()
        app_module.planner_agent = planner
        app_module.executor_agent = executor
        return TestClient(app)

    def test_specific_technical_query(self, client):
        """Test highly specific technical query."""
        plan_request = {
            "query": "I need advanced sentiment analysis with emotion detection for customer support ticket classification",
            "top_k": 3,
        }
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()
        assert plan_data["status"] == "success"
        assert len(plan_data["planned_steps"]) > 0

        # Should find highly relevant sentiment analysis tools
        top_step = plan_data["planned_steps"][0]
        assert top_step["relevance_score"] >= 0.7  # High relevance expected

    def test_ambiguous_query(self, client):
        """Test ambiguous query that could match multiple tools."""
        plan_request = {"query": "I need to process some text data", "top_k": 5}
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()
        assert plan_data["status"] == "success"

        # Should return multiple relevant tools with varied scores
        if len(plan_data["planned_steps"]) > 1:
            scores = [step["relevance_score"] for step in plan_data["planned_steps"]]
            assert max(scores) - min(scores) > 0.1  # Score diversity expected

    def test_generic_help_query(self, client):
        """Test very generic query."""
        plan_request = {"query": "What tools do you have available?", "top_k": 3}
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()
        # Should handle gracefully, might return diverse tools or no tools
        assert plan_data["status"] == "success"
plan_data["status"] == "success" def test_unrelated_query(self, client): """Test completely unrelated query.""" plan_request = { "query": "How do I cook pasta? What's the weather like?", "top_k": 3, } response = client.post("/api/plan/generate", json=plan_request) assert response.status_code == 200 plan_data = response.json() # Should handle gracefully, likely with low scores or no results assert plan_data["status"] == "success" if len(plan_data["planned_steps"]) > 0: # Any returned tools should have low relevance scores top_score = max( step["relevance_score"] for step in plan_data["planned_steps"] ) assert top_score < 0.8 # Low relevance expected def test_typo_and_misspelling_query(self, client): """Test query with typos and misspellings.""" plan_request = { "query": "I need sentimnt anaylsis for custmer reveiws", # Intentional typos "top_k": 3, } response = client.post("/api/plan/generate", json=plan_request) assert response.status_code == 200 plan_data = response.json() assert plan_data["status"] == "success" # Should still find sentiment analysis tools despite typos if len(plan_data["planned_steps"]) > 0: sentiment_tools = [] for step in plan_data["planned_steps"]: tool_name = step["tool"]["name"].lower() tool_desc = step["tool"]["description"].lower() if "sentiment" in tool_name or "sentiment" in tool_desc: sentiment_tools.append(step) # Should find at least one sentiment tool assert len(sentiment_tools) > 0, "Failed to find sentiment tools with typos" def test_mixed_language_query(self, client): """Test query with mixed languages or special characters.""" plan_request = { "query": "I need análisis de sentimientos for émotions 🎯 and feedback", "top_k": 3, } response = client.post("/api/plan/generate", json=plan_request) assert response.status_code == 200 plan_data = response.json() assert plan_data["status"] == "success" # Should handle gracefully without errors def test_extremely_long_query(self, client): """Test with extremely long query.""" long_description = " ".join( [ "I need a comprehensive sentiment analysis tool that can process large volumes of customer feedback", "from multiple sources including social media, email surveys, chat logs, and product reviews.", "The tool should be able to detect not just positive, negative, and neutral sentiments,", "but also specific emotions like joy, anger, frustration, satisfaction, and disappointment.", "It should handle multiple languages and be robust against typos and informal language.", "The output should be structured and include confidence scores for each prediction.", ] * 10 ) # Repeat to make it very long plan_request = {"query": long_description, "top_k": 3} response = client.post("/api/plan/generate", json=plan_request) assert response.status_code == 200 plan_data = response.json() assert plan_data["status"] == "success" def test_empty_and_whitespace_queries(self, client): """Test edge cases with empty or whitespace-only queries.""" test_cases = ["", " ", "\n\t\r", " \n "] for query in test_cases: plan_request = {"query": query, "top_k": 3} response = client.post("/api/plan/generate", json=plan_request) assert response.status_code == 200 plan_data = response.json() assert plan_data["status"] == "success" # Should handle gracefully, likely returning no results class TestE2EErrorScenarios: """Test error handling and recovery scenarios.""" @pytest.fixture def client(self): """Provide test client.""" return TestClient(app) def test_malformed_request_handling(self, client): """Test handling of malformed API requests.""" # Missing 
        # Missing required fields
        response = client.post("/api/plan/generate", json={})
        assert response.status_code == 422  # Validation error

        # Invalid data types
        response = client.post(
            "/api/plan/generate",
            json={
                "query": 123,  # Should be string
                "top_k": "invalid",  # Should be int
            },
        )
        assert response.status_code == 422

    def test_invalid_top_k_values(self, client):
        """Test handling of invalid top_k values."""
        test_cases = [
            {"query": "test", "top_k": -1},  # Negative
            {"query": "test", "top_k": 0},  # Zero
            {"query": "test", "top_k": 100},  # Too large
        ]

        for case in test_cases:
            response = client.post("/api/plan/generate", json=case)
            # Should either reject (422) or handle gracefully (200)
            assert response.status_code in [200, 422]

    @patch("agents.executor.random.random")
    def test_execution_error_simulation(self, mock_random, client):
        """Test execution with simulated errors."""
        # Force error simulation
        mock_random.return_value = 0.05  # Below 0.1 threshold, triggers random errors

        # First get a plan
        plan_request = {"query": "test error simulate network timeout", "top_k": 1}
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()

        if len(plan_data["planned_steps"]) > 0:
            # The query should trigger error simulation in the executor
            # This tests that error simulation is working as expected
            assert plan_data["status"] == "success"

    def test_system_resource_constraints(self, client):
        """Test system behavior under resource constraints."""
        # Simulate multiple concurrent requests
        import concurrent.futures

        def make_request():
            plan_request = {
                "query": "sentiment analysis for customer feedback",
                "top_k": 3,
            }
            return client.post("/api/plan/generate", json=plan_request)

        # Submit multiple requests concurrently
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(make_request) for _ in range(20)]
            results = [
                future.result() for future in concurrent.futures.as_completed(futures)
            ]

        # All requests should complete successfully
        for response in results:
            assert response.status_code == 200
            data = response.json()
            assert data["status"] == "success"

    def test_missing_system_dependencies(self, client):
        """Test behavior when system dependencies are missing."""
        # Test health endpoint when system might not be fully initialized
        response = client.get("/health")
        assert response.status_code == 200

        health_data = response.json()
        assert "status" in health_data
        # Should indicate healthy or degraded mode


class TestE2EPerformance:
    """Test system performance and reliability."""

    @pytest.fixture
    def client(self):
        """Provide test client."""
        return TestClient(app)

    @pytest.mark.performance
    def test_response_time_requirements(self, client):
        """Test that responses meet performance requirements."""
        # Health check should be very fast
        start_time = time.time()
        response = client.get("/health")
        health_time = time.time() - start_time

        assert response.status_code == 200
        assert health_time < 0.1  # 100ms

        # Plan generation should be reasonably fast
        start_time = time.time()
        plan_request = {"query": "sentiment analysis for customer reviews", "top_k": 3}
        response = client.post("/api/plan/generate", json=plan_request)
        plan_time = time.time() - start_time

        assert response.status_code == 200
        assert plan_time < 2.0  # 2 seconds max for plan generation

    @pytest.mark.performance
    def test_memory_efficiency(self, client):
        """Test memory usage during operations."""
        import os
        import psutil

        process = psutil.Process(os.getpid())
        initial_memory = process.memory_info().rss / 1024 / 1024  # MB

        # Perform multiple operations
        for i in range(50):
            plan_request = {
                "query": f"test query {i} for sentiment analysis",
                "top_k": 3,
            }
            response = client.post("/api/plan/generate", json=plan_request)
            assert response.status_code == 200

        final_memory = process.memory_info().rss / 1024 / 1024  # MB
        memory_increase = final_memory - initial_memory

        # Memory increase should be reasonable (less than 100MB for 50 requests)
        assert memory_increase < 100

    @pytest.mark.performance
    def test_consistency_across_requests(self, client):
        """Test consistency of responses across multiple requests."""
        query = "sentiment analysis for customer feedback"
        responses = []

        for _ in range(5):
            plan_request = {"query": query, "top_k": 3}
            response = client.post("/api/plan/generate", json=plan_request)
            assert response.status_code == 200
            responses.append(response.json())

        # All responses should have the same structure
        for response in responses:
            assert response["status"] == "success"
            assert "planned_steps" in response
            assert "total_steps" in response

        # Tool suggestions should be consistent for the same query
        if len(responses[0]["planned_steps"]) > 0:
            first_tool = responses[0]["planned_steps"][0]["tool"]["tool_id"]
            for response in responses[1:]:
                if len(response["planned_steps"]) > 0:
                    # First result should generally be the same for identical queries
                    current_tool = response["planned_steps"][0]["tool"]["tool_id"]
                    assert (
                        current_tool == first_tool
                    ), "Inconsistent tool suggestions for identical queries"


class TestE2ESystemIntegration:
    """Test integration between all system components."""

    @pytest.fixture
    def client(self):
        """Provide test client."""
        return TestClient(app)

    def test_full_system_initialization(self, client):
        """Test that the full system initializes correctly."""
        # Test health endpoint
        response = client.get("/health")
        assert response.status_code == 200

        health_data = response.json()
        assert health_data["status"] == "healthy"
        assert health_data["version"] == "0.1.0"

    def test_knowledge_graph_integration(self, client):
        """Test knowledge graph integration works correctly."""
        # Submit query that should match known tools
        plan_request = {"query": "sentiment analysis", "top_k": 3}
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()
        assert plan_data["status"] == "success"

        # Should find tools from the knowledge graph
        assert len(plan_data["planned_steps"]) > 0

        # Verify tool data structure matches expected schema
        for step in plan_data["planned_steps"]:
            tool = step["tool"]
            prompt = step["prompt"]

            # Verify tool structure
            assert "tool_id" in tool
            assert "name" in tool
            assert "description" in tool

            # Verify prompt structure
            assert "prompt_id" in prompt
            assert "name" in prompt
            assert "template_string" in prompt
            assert "input_variables" in prompt

    def test_embedding_service_integration(self, client):
        """Test that embedding service works correctly."""
        # Different queries should get different results
        queries = [
            "sentiment analysis for reviews",
            "text summarization for documents",
            "image caption generation",
            "code quality analysis",
        ]

        results = []
        for query in queries:
            plan_request = {"query": query, "top_k": 2}
            response = client.post("/api/plan/generate", json=plan_request)
            assert response.status_code == 200
            results.append(response.json())

        # Should get relevant results for each query type
        for i, result in enumerate(results):
            assert result["status"] == "success"

            if len(result["planned_steps"]) > 0:
                # Check that results are relevant to the query
                query_lower = queries[i].lower()
                top_step = result["planned_steps"][0]
                tool_name = top_step["tool"]["name"].lower()
                tool_desc = top_step["tool"]["description"].lower()

                # Look for query-relevant keywords in the results. This check is
                # informational only and deliberately not asserted: embedding
                # similarity can surface a relevant tool that shares no surface
                # keywords with the query, so a hard assert would be flaky even
                # though it should generally hold for well-matched queries.
                query_keywords = query_lower.split()
                _keyword_overlap = any(
                    keyword in tool_name or keyword in tool_desc
                    for keyword in query_keywords[:2]  # Check first 2 keywords
                )

    def test_error_propagation_and_handling(self, client):
        """Test that errors are properly propagated and handled."""
        # Test with various edge cases that might cause errors
        edge_cases = [
            {"query": "", "top_k": 1},  # Empty query
            {"query": "x" * 10000, "top_k": 1},  # Very long query
            {"query": "test", "top_k": 1000},  # Very large top_k
        ]

        for case in edge_cases:
            response = client.post("/api/plan/generate", json=case)
            # Should handle gracefully without 500 errors
            assert response.status_code in [200, 422]

            if response.status_code == 200:
                data = response.json()
                assert "status" in data

    def test_api_documentation_endpoints(self, client):
        """Test that API documentation is accessible."""
        # Test OpenAPI documentation
        response = client.get("/docs")
        assert response.status_code == 200

        # Test OpenAPI JSON schema
        response = client.get("/openapi.json")
        assert response.status_code == 200

        openapi_data = response.json()
        assert "openapi" in openapi_data
        assert "info" in openapi_data
        assert "paths" in openapi_data


class TestE2EDataIntegrity:
    """Test data integrity and consistency across the system."""

    @pytest.fixture
    def client(self):
        """Provide test client."""
        return TestClient(app)

    def test_tool_prompt_consistency(self, client):
        """Test that tools and prompts are properly linked."""
        # Get multiple plans to check consistency
        plan_request = {"query": "sentiment analysis for customer feedback", "top_k": 5}
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()
        assert plan_data["status"] == "success"

        # Check tool-prompt relationships
        for step in plan_data["planned_steps"]:
            tool = step["tool"]
            prompt = step["prompt"]

            # Prompt should target the tool it is paired with
            tool_id = tool["tool_id"]
            target_tool_id = prompt.get("target_tool_id")
            if target_tool_id:
                assert (
                    target_tool_id == tool_id
                ), f"Prompt {prompt['prompt_id']} targets {target_tool_id} but is paired with tool {tool_id}"

    def test_relevance_score_validity(self, client):
        """Test that relevance scores are valid and meaningful."""
        plan_request = {"query": "sentiment analysis for customer reviews", "top_k": 5}
        response = client.post("/api/plan/generate", json=plan_request)

        assert response.status_code == 200
        plan_data = response.json()

        if len(plan_data["planned_steps"]) > 1:
            scores = [step["relevance_score"] for step in plan_data["planned_steps"]]

            # Scores should be valid floats between 0 and 1
            for score in scores:
                assert isinstance(score, (int, float))
                assert 0.0 <= score <= 1.0

            # Scores should generally be in descending order (best first)
            for i in range(len(scores) - 1):
                assert (
                    scores[i] >= scores[i + 1]
                ), "Relevance scores should be in descending order"

    def test_data_format_consistency(self, client):
        """Test that data formats are consistent across responses."""
        queries = [
            "sentiment analysis",
            "text summarization",
            "code quality check",
            "image captioning",
        ]

        for query in queries:
            plan_request = {"query": query, "top_k": 2}
            response = client.post("/api/plan/generate", json=plan_request)

            assert response.status_code == 200
            plan_data = response.json()
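            # Each query type must report success before its payload is
            # inspected field by field below.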
assert plan_data["status"] == "success" # Check consistent structure across all responses assert "query" in plan_data assert "planned_steps" in plan_data assert "total_steps" in plan_data # Check consistent step structure for step in plan_data["planned_steps"]: assert "tool" in step assert "prompt" in step assert "relevance_score" in step assert "summary" in step # Check tool structure tool = step["tool"] required_tool_fields = [ "tool_id", "name", "description", "tags", "invocation_command_stub", ] for field in required_tool_fields: assert field in tool # Check prompt structure prompt = step["prompt"] required_prompt_fields = [ "prompt_id", "name", "description", "template_string", "input_variables", ] for field in required_prompt_fields: assert field in prompt