"""Embedding service for semantic similarity in MCP tools and prompts. This module provides sophisticated embedding functionality for tool descriptions, prompt templates, and query matching. It serves as the semantic foundation for the KGraph-MCP system's intelligent tool discovery capabilities. Architecture Overview: The EmbeddingService implements a resilient embedding strategy with multiple fallback layers to ensure the system remains functional even when external APIs are unavailable: 1. Primary: OpenAI text-embedding-3-small API (production quality) 2. Fallback: Deterministic hash-based embeddings (development/testing) 3. Error Handling: Graceful degradation with comprehensive logging Embedding Strategy: OpenAI Embeddings (Preferred): - Uses text-embedding-3-small model (1536 dimensions) - High-quality semantic representations - Optimized for similarity search tasks - Cost-effective for production use Hash-based Embeddings (Fallback): - Deterministic vectors from SHA-256 hashing - Consistent across runs for testing - No external dependencies - Normalized to [0,1] range for compatibility Mathematical Foundation: Embeddings map text into high-dimensional vector spaces where semantic similarity is preserved as geometric proximity. The cosine similarity metric measures the angle between vectors, providing rotation-invariant similarity scoring. Vector Space Properties: - Similar concepts cluster together - Semantic relationships preserved as linear operations - Dimensionality enables nuanced similarity measurement Performance Characteristics: - OpenAI API: ~100-500ms per request (network dependent) - Hash fallback: ~1-10ms per text (local computation) - Batch processing: Significant efficiency gains for multiple texts - Caching: Recommended for production to reduce API costs Usage Patterns: Development: Use hash embeddings for fast iteration Testing: Deterministic embeddings ensure reproducible results Production: OpenAI embeddings for highest quality semantic matching """ import hashlib import logging import math import os from typing import Optional import openai from dotenv import load_dotenv from .ontology import MCPTool, MCPPrompt # Load environment variables from .env file if it exists load_dotenv() # Create logger for this module with structured output logger = logging.getLogger(__name__) class EmbeddingService: """Service for generating embeddings and computing semantic similarity with fallback support. This class provides a robust embedding service that gracefully handles API failures and provides consistent functionality across different deployment environments. It's designed to support both development (with mock embeddings) and production (with real OpenAI embeddings) use cases. Architecture Design: The service follows the Strategy pattern with automatic fallback: - Primary strategy: OpenAI API for high-quality embeddings - Fallback strategy: Hash-based deterministic embeddings - Error recovery: Comprehensive logging and graceful degradation Key Features: 1. Resilient API Integration: Handles network failures gracefully 2. Development Support: Works without API keys for local development 3. Cost Optimization: Efficient text preprocessing and error handling 4. Consistency: Deterministic fallbacks for reproducible testing 5. Observability: Comprehensive logging for debugging and monitoring Example Usage: >>> service = EmbeddingService() >>> >>> # Generate embedding for a query >>> query_vec = service.get_embedding("analyze customer sentiment") >>> >>> # Generate embedding for a tool >>> tool_vec = service.embed_tool_description(sentiment_tool) >>> >>> # Compute similarity >>> similarity = service.compute_similarity(query_vec, tool_vec) >>> print(f"Similarity: {similarity:.3f}") """ def __init__(self, embedding_dim: int = 128) -> None: """Initialize the embedding service with OpenAI API integration. Sets up the OpenAI client if API key is available, otherwise configures for fallback mode. The initialization is designed to be robust and fail gracefully if external dependencies are unavailable. Args: embedding_dim: Dimension for hash-based fallback embeddings (default: 128) Note: OpenAI embeddings use 1536 dimensions regardless Environment Variables: OPENAI_API_KEY: Required for production-quality embeddings If not set, service operates in fallback mode Side Effects: - Configures OpenAI client if API key available - Logs initialization status for debugging - Sets up fallback parameters for hash-based embeddings """ self.embedding_dim = embedding_dim # For hash-based fallback only # Initialize OpenAI client with error handling api_key = os.getenv("OPENAI_API_KEY") if api_key and api_key.strip(): try: self.openai_client: Optional[openai.OpenAI] = openai.OpenAI(api_key=api_key) logger.info("EmbeddingService initialized with OpenAI API integration") except Exception as e: logger.error(f"Failed to initialize OpenAI client: {e}") self.openai_client = None else: self.openai_client = None logger.warning( "OPENAI_API_KEY not found. EmbeddingService will use hash-based fallback embeddings. " "For production quality, set OPENAI_API_KEY environment variable." ) def get_embedding(self, text: str) -> Optional[list[float]]: """Generate an embedding vector for the given text using OpenAI API with fallbacks. This is the primary method for generating high-quality embeddings. It uses OpenAI's text-embedding-3-small model which provides excellent semantic representations optimized for similarity search tasks. Processing Pipeline: 1. Text preprocessing: Clean and normalize input text 2. API call: Request embedding from OpenAI 3. Error handling: Log failures and return None for graceful degradation 4. Result extraction: Return the embedding vector Args: text: The text to embed (will be preprocessed automatically) Examples: "analyze customer sentiment", "tool description text" Returns: List of 1536 floats representing the embedding vector, or None if API call fails The vector is L2-normalized and ready for cosine similarity computation Error Handling: - OpenAI API errors: Logged with details, returns None - Network errors: Logged and handled gracefully - Invalid input: Preprocessed to handle edge cases Performance: - Typical latency: 100-500ms depending on network - Rate limits: Respects OpenAI's rate limiting - Cost: ~$0.00002 per 1K tokens (very cost-effective) Example: >>> embedding = service.get_embedding("analyze customer feedback sentiment") >>> if embedding: ... print(f"Generated {len(embedding)}-dimensional embedding") ... else: ... print("API unavailable, consider using fallback methods") """ if not self.openai_client: logger.debug("OpenAI client not available. Use embed_text() for hash-based fallback.") return None try: # Preprocess text: Clean newlines and whitespace for better API performance cleaned_text = text.replace("\n", " ").strip() # Validate input length (OpenAI has token limits) if not cleaned_text: logger.warning("Empty text provided to get_embedding()") return None # Make API call to OpenAI with optimized model selection response = self.openai_client.embeddings.create( input=cleaned_text, model="text-embedding-3-small" # Balanced performance/cost model ) # Extract embedding from response embedding_vector = response.data[0].embedding logger.debug(f"Generated OpenAI embedding for text: '{cleaned_text[:50]}...'") return embedding_vector except openai.APIError as e: logger.error(f"OpenAI API error when generating embedding: {e}") return None except Exception as e: logger.error(f"Unexpected error when generating embedding: {e}") return None def embed_text(self, text: str) -> list[float]: """Generate a deterministic embedding vector using hash-based approach. This method provides a fallback embedding strategy that doesn't require external APIs. It's particularly useful for development, testing, and scenarios where API access is limited. Algorithm Design: 1. Hash Generation: Use SHA-256 for deterministic, collision-resistant hashing 2. Numeric Conversion: Convert hex pairs to normalized floats [0,1] 3. Dimension Control: Pad or truncate to desired embedding dimension 4. Normalization: Ensure vector components are in [0,1] range Characteristics: - Deterministic: Same input always produces same output - Fast: Pure local computation, no network calls - Consistent: Suitable for testing and development - Limited Semantics: No real semantic understanding Args: text: The text to embed (any string input accepted) Returns: List of floats representing the hash-based embedding vector Dimension matches self.embedding_dim (default: 128) Mathematical Properties: - Range: [0.0, 1.0] for all components - Distribution: Approximately uniform across dimensions - Distance: Hash collisions extremely rare due to SHA-256 Example: >>> embedding = service.embed_text("customer sentiment analysis") >>> print(f"Hash-based embedding: {len(embedding)} dimensions") >>> # Same input will always produce identical output >>> assert embedding == service.embed_text("customer sentiment analysis") """ # Generate deterministic hash: SHA-256 provides excellent distribution text_hash = hashlib.sha256(text.encode('utf-8')).hexdigest() # Convert hex hash to normalized float values embedding = [] for i in range(0, min(len(text_hash), self.embedding_dim * 2), 2): # Process pairs of hex digits for better distribution hex_pair = text_hash[i:i + 2] # Convert to int (0-255) then normalize to [0,1] value = int(hex_pair, 16) / 255.0 embedding.append(value) # Ensure consistent dimensionality through padding or truncation while len(embedding) < self.embedding_dim: embedding.append(0.0) # Pad with zeros if needed # Truncate if hash provided more values than needed final_embedding = embedding[:self.embedding_dim] logger.debug(f"Generated hash-based embedding for text: '{text[:30]}...'") return final_embedding def embed_tool_description(self, tool: MCPTool) -> list[float]: """Generate an embedding for a tool's complete description. Creates a rich embedding by combining multiple tool attributes to maximize semantic information capture. This method optimizes for tool discovery by including both the name and description in the embedding. Combination Strategy: The method concatenates tool.name and tool.description because: 1. Names often contain key terminology (e.g., "Sentiment Analyzer") 2. Descriptions provide context and use cases 3. Combined text gives richer semantic representation 4. Improves matching accuracy for varied query styles Args: tool: The MCPTool instance to embed Returns: Embedding vector representing the tool's semantic meaning Uses the configured embedding strategy (OpenAI or hash-based) Example: >>> tool = MCPTool( ... name="Text Summarizer", ... description="Automatically summarizes long documents" ... ) >>> embedding = service.embed_tool_description(tool) >>> # This embedding captures both "summarizer" and "documents" concepts """ # Combine name and description for comprehensive semantic representation combined_text = f"{tool.name} {tool.description}" # Try OpenAI embedding first, fallback to hash-based openai_embedding = self.get_embedding(combined_text) if openai_embedding is not None: return openai_embedding # Fallback to hash-based embedding return self.embed_text(combined_text) def embed_prompt_description(self, prompt: MCPPrompt) -> list[float]: """Generate an embedding for a prompt's complete description. Similar to embed_tool_description but optimized for prompt content. Combines multiple prompt attributes to create a rich semantic representation that enables effective prompt discovery and matching. Combination Strategy: Concatenates name, description, and use_case because: 1. Names identify the prompt type (e.g., "Basic Sentiment Analysis") 2. Descriptions explain the prompt's purpose 3. Use cases provide context for when to use the prompt 4. Combined text enables multi-faceted semantic matching Args: prompt: The MCPPrompt instance to embed Returns: Embedding vector representing the prompt's semantic meaning Uses the configured embedding strategy (OpenAI or hash-based) Example: >>> prompt = MCPPrompt( ... name="Academic Paper Summary", ... description="Summarizes research papers with key findings", ... use_case="Academic research and literature review" ... ) >>> embedding = service.embed_prompt_description(prompt) >>> # Captures concepts: academic, research, summary, findings """ # Combine multiple fields for comprehensive semantic representation text_parts = [prompt.name, prompt.description] if prompt.use_case: # Include use case if available text_parts.append(prompt.use_case) combined_text = " ".join(text_parts) # Try OpenAI embedding first, fallback to hash-based openai_embedding = self.get_embedding(combined_text) if openai_embedding is not None: return openai_embedding # Fallback to hash-based embedding return self.embed_text(combined_text) def compute_similarity( self, embedding1: list[float], embedding2: list[float] ) -> float: """Compute cosine similarity between two embeddings with normalization. Cosine similarity is the preferred metric for embedding comparison because it measures orientation similarity (semantic relatedness) rather than magnitude. This is ideal for text embeddings where we care about conceptual similarity rather than absolute values. Mathematical Foundation: cosine_similarity(A, B) = (A · B) / (||A|| × ||B||) Where: - A · B is the dot product (measures alignment) - ||A||, ||B|| are vector magnitudes (L2 norms) - Result range: [-1, 1] where 1 = identical, 0 = orthogonal, -1 = opposite Normalization: The method normalizes the cosine similarity from [-1,1] to [0,1] range using the formula: (cosine_sim + 1) / 2. This provides intuitive similarity scores where 0 = no similarity, 1 = maximum similarity. Args: embedding1: First embedding vector (typically query embedding) embedding2: Second embedding vector (typically tool/prompt embedding) Returns: Similarity score between 0.0 and 1.0 - 1.0: Maximum similarity (identical semantic meaning) - 0.5: Neutral similarity (orthogonal vectors) - 0.0: Minimum similarity (opposite semantic meaning) Error Handling: - Different vector lengths: Truncated to minimum length - Zero-magnitude vectors: Returns 0.0 to prevent division by zero - Invalid inputs: Gracefully handled with bounds checking Performance: O(d) where d is vector dimensionality (typically 128-1536) Example: >>> query_emb = service.get_embedding("analyze sentiment") >>> tool_emb = service.embed_tool_description(sentiment_tool) >>> sim = service.compute_similarity(query_emb, tool_emb) >>> print(f"Semantic similarity: {sim:.3f}") # e.g., 0.847 """ # Handle different vector lengths by truncating to minimum min_len = min(len(embedding1), len(embedding2)) if min_len == 0: logger.warning("Empty embedding vector(s) provided to compute_similarity") return 0.0 vec1 = embedding1[:min_len] vec2 = embedding2[:min_len] # Compute dot product (numerator of cosine similarity) dot_product = sum(a * b for a, b in zip(vec1, vec2, strict=False)) # Compute vector magnitudes (denominators of cosine similarity) magnitude1 = math.sqrt(sum(a * a for a in vec1)) magnitude2 = math.sqrt(sum(b * b for b in vec2)) # Handle zero-magnitude vectors (shouldn't happen with real embeddings) if magnitude1 == 0 or magnitude2 == 0: logger.warning("Zero-magnitude vector encountered in similarity computation") return 0.0 # Calculate cosine similarity: range [-1, 1] cosine_sim = dot_product / (magnitude1 * magnitude2) # Normalize to [0, 1] range for intuitive interpretation normalized_similarity = max(0.0, min(1.0, (cosine_sim + 1) / 2)) return normalized_similarity def find_similar_tools( self, query: str, tools: list[MCPTool], top_k: int = 5 ) -> list[MCPTool]: """Find tools most similar to the given query using semantic search. This method implements end-to-end semantic tool discovery by combining query embedding, tool embedding, and similarity ranking. It provides the core functionality for intelligent tool suggestion. Algorithm: 1. Query Processing: Generate embedding for user query 2. Tool Processing: Generate embeddings for all tools 3. Similarity Computation: Calculate similarity scores 4. Ranking: Sort tools by similarity score 5. Selection: Return top-k most similar tools Args: query: The search query text (natural language) tools: List of MCPTool instances to search through top_k: Maximum number of tools to return (default: 5) Returns: List of MCPTool instances ordered by similarity (most similar first) Length will be min(len(tools), top_k) Performance: O(n × d) where n = number of tools, d = embedding dimension Example: >>> tools = kg.get_all_tools() >>> query = "I need to analyze customer feedback sentiment" >>> similar = service.find_similar_tools(query, tools, top_k=3) >>> for tool in similar: ... print(f"Found: {tool.name}") """ # Generate query embedding query_embedding = self.embed_text(query) # Compute similarities for all tools tool_similarities = [] for tool in tools: tool_embedding = self.embed_tool_description(tool) similarity = self.compute_similarity(query_embedding, tool_embedding) tool_similarities.append((tool, similarity)) # Sort by similarity (descending) and return top_k tool_similarities.sort(key=lambda x: x[1], reverse=True) top_tools = [tool for tool, _ in tool_similarities[:top_k]] # Log the search results for debugging if top_tools: top_scores = [sim for _, sim in tool_similarities[:top_k]] logger.debug(f"Found {len(top_tools)} similar tools for query '{query[:30]}...', " f"top similarities: {[f'{s:.3f}' for s in top_scores]}") else: logger.warning(f"No tools found for query: '{query}'") return top_tools