#!/usr/bin/env python3
"""
MCP Image Processing Tool - Gradio Implementation

This MCP server provides image processing capabilities including:
- Image captioning using BLIP models
- Image analysis and description
- Visual content understanding
- OCR capabilities (basic)

Supports MCP protocol via Gradio interface.
"""

import base64
import io
import logging
import os

import gradio as gr
from PIL import Image

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Try to import transformers for BLIP model
try:
    from transformers import BlipForConditionalGeneration, BlipProcessor
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    logger.warning("Transformers not available, using mock image analysis")
    TRANSFORMERS_AVAILABLE = False

# Try to import requests for downloading images
try:
    import requests
    REQUESTS_AVAILABLE = True
except ImportError:
    logger.warning("Requests not available, limited image input support")
    REQUESTS_AVAILABLE = False


class ImageProcessor:
    """Image processing service using BLIP model."""

    def __init__(self):
        """Initialize the image processor."""
        self.processor = None
        self.model = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialize the BLIP model for image captioning."""
        if not TRANSFORMERS_AVAILABLE:
            logger.info("Using mock image processing (transformers not available)")
            return

        try:
            model_name = os.getenv("MODEL_NAME", "Salesforce/blip-image-captioning-base")
            logger.info(f"Loading BLIP model: {model_name}")
            self.processor = BlipProcessor.from_pretrained(model_name)
            self.model = BlipForConditionalGeneration.from_pretrained(model_name)
            logger.info("BLIP model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load BLIP model: {e}")
            self.processor = None
            self.model = None

    def analyze_image(self, image_input: str | Image.Image) -> str:
        """
        Analyze an image and generate a caption/description.
        Args:
            image_input: PIL Image object or base64 string or file path

        Returns:
            String description of the image
        """
        try:
            # Convert input to PIL Image
            image = self._process_image_input(image_input)
            if image is None:
                return "Error: Could not process the image input"

            # Generate caption
            if self.model and self.processor:
                return self._generate_real_caption(image)
            return self._generate_mock_caption(image)

        except Exception as e:
            logger.error(f"Error analyzing image: {e}")
            return f"Error analyzing image: {e!s}"

    def _process_image_input(self, image_input: str | Image.Image) -> Image.Image | None:
        """Convert various image input formats to PIL Image."""
        if isinstance(image_input, Image.Image):
            return image_input

        if isinstance(image_input, str):
            try:
                # Try as base64 first
                if image_input.startswith("data:image"):
                    # Handle data URL
                    header, data = image_input.split(",", 1)
                    image_bytes = base64.b64decode(data)
                    return Image.open(io.BytesIO(image_bytes))
                if len(image_input) > 100 and not image_input.startswith(("http", "/")):
                    # Assume it's base64
                    image_bytes = base64.b64decode(image_input)
                    return Image.open(io.BytesIO(image_bytes))
                if image_input.startswith("http") and REQUESTS_AVAILABLE:
                    # Download from URL
                    response = requests.get(image_input, timeout=10)
                    return Image.open(io.BytesIO(response.content))
                # Try as file path
                return Image.open(image_input)
            except Exception as e:
                logger.error(f"Error processing image input: {e}")
                return None

        return None

    def _generate_real_caption(self, image: Image.Image) -> str:
        """Generate caption using BLIP model."""
        try:
            # Process image
            inputs = self.processor(image, return_tensors="pt")

            # Generate caption
            out = self.model.generate(**inputs, max_length=50, num_beams=5)
            caption = self.processor.decode(out[0], skip_special_tokens=True)

            # Add image analysis details
            width, height = image.size
            mode = image.mode

            result = f"Image Caption: {caption}\n"
            result += f"Image Details: {width}x{height} pixels, {mode} mode"

            return result
        except Exception as e:
            logger.error(f"Error generating real caption: {e}")
            return f"Error generating caption: {e!s}"

    def _generate_mock_caption(self, image: Image.Image) -> str:
        """Generate mock caption when model is not available."""
        try:
            width, height = image.size
            mode = image.mode

            # Simple analysis based on image properties
            aspect_ratio = width / height
            if aspect_ratio > 1.5:
                shape_desc = "landscape"
            elif aspect_ratio < 0.67:
                shape_desc = "portrait"
            else:
                shape_desc = "square"

            mock_caption = f"A {shape_desc} image with {width}x{height} resolution"

            # Add some variety based on image characteristics
            if width * height > 1000000:  # > 1MP
                quality = "high-resolution"
            elif width * height > 500000:  # > 0.5MP
                quality = "medium-resolution"
            else:
                quality = "low-resolution"

            result = f"Image Caption: {mock_caption}\n"
            result += f"Analysis: {quality} {mode} image\n"
            result += f"Dimensions: {width}x{height} pixels"

            return result
        except Exception as e:
            logger.error(f"Error generating mock caption: {e}")
            return f"Error: {e!s}"


# Initialize the image processor
image_processor = ImageProcessor()

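# Minimal direct-usage sketch (comments only): the processor can also be called
# outside Gradio, e.g. from a script or a test. The file path below is
# hypothetical; any local image would do.
#
#   caption = image_processor.analyze_image("sample.jpg")
#   print(caption)
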

def process_image_mcp(image_file) -> str:
    """
    MCP-compatible image processing function.

    Args:
        image_file: Gradio File object or image input

    Returns:
        String with image analysis results
    """
    try:
        if image_file is None:
            return "Error: No image provided"

        # Handle Gradio file input
        if hasattr(image_file, "name"):
            # It's a file object
            image_path = image_file.name
            image = Image.open(image_path)
        else:
            # Direct image input
            image = image_file

        # Process the image
        result = image_processor.analyze_image(image)

        logger.info("Successfully processed image")
        return result

    except Exception as e:
        error_msg = f"Error processing image: {e!s}"
        logger.error(error_msg)
        return error_msg


# Create Gradio interface
def create_gradio_interface():
    """Create and configure the Gradio interface."""
    interface = gr.Interface(
        fn=process_image_mcp,
        inputs=[
            gr.Image(
                label="Upload Image",
                type="filepath",
                sources=["upload", "clipboard"]
            )
        ],
        outputs=[
            gr.Textbox(
                label="Image Analysis Result",
                lines=5,
                show_copy_button=True
            )
        ],
        title="🖼️ MCP Image Processor",
        description="""
        **Image Processing MCP Server**

        Upload an image to get:
        - AI-generated caption and description
        - Technical image details
        - Visual content analysis

        Supports: JPG, PNG, GIF, WebP formats
        """,
        examples=[],
        allow_flagging="never",
        analytics_enabled=False
    )

    return interface


def main():
    """Main function to run the Gradio app."""
    # Get configuration from environment
    port = int(os.getenv("GRADIO_SERVER_PORT", 7860))
    host = os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")

    logger.info(f"Starting MCP Image Processor on {host}:{port}")

    # Create interface
    interface = create_gradio_interface()

    # Launch with MCP server enabled
    interface.launch(
        server_name=host,
        server_port=port,
        share=False,
        debug=False,
        quiet=False,
        show_error=True,
        mcp_server=True,  # expose the function as an MCP tool (requires the gradio[mcp] extra)
    )


if __name__ == "__main__":
    main()
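
# Client-side sketch (comments only): one way to call this endpoint once the
# app is running. It assumes the server is reachable at http://localhost:7860
# and that a recent gradio_client (providing Client and handle_file) is
# installed; the image path is an example placeholder.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://localhost:7860/")
#   result = client.predict(handle_file("example.jpg"), api_name="/predict")
#   print(result)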