Spaces:

aditya-g07
/

RetinaFace_Face_Detection

Sleeping

aditya-g07 committed on Jul 5

Commit

2d98925

1 Parent(s): 6feecf0

Fix Gradio JSON schema error: Simplify interface and use stable version

- Downgrade to Gradio 4.36.0 (stable version without JSON schema issues)
- Completely rewrite app.py with simplified interface structure
- Remove complex API functions that were causing schema parsing errors
- Use straightforward input/output types that Gradio can handle properly
- Maintain core face detection functionality while fixing runtime errors

Files changed (3) hide show

README.md +1 -1
app.py +189 -367
requirements.txt +1 -1

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🔍
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 4.44.1
 app_file: app.py
 pinned: false
 license: apache-2.0

 colorFrom: blue
 colorTo: red
 sdk: gradio
+sdk_version: 4.36.0
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ import tempfile
 import time
 from PIL import Image, ImageDraw
 import json
 # Import RetinaFace model components
 from models.retinaface import RetinaFace
@@ -26,6 +27,47 @@ def load_models():
     global mobilenet_model, resnet_model
     try:
         # Load MobileNet model
         mobilenet_model = RetinaFace(cfg=mobilenet_cfg, phase='test')
         mobilenet_model.load_state_dict(torch.load('mobilenet0.25_Final.pth', map_location=device))
@@ -38,422 +80,202 @@ def load_models():
         resnet_model.eval()
         resnet_model = resnet_model.to(device)
-        print("Models loaded successfully!")
-        return "✅ Models loaded successfully!"
     except Exception as e:
-        error_msg = f"❌ Error loading models: {e}"
-        print(error_msg)
-        return error_msg
-# Model configurations
-mobilenet_cfg = {
-    'name': 'mobilenet0.25',
-    'min_sizes': [[16, 32], [64, 128], [256, 512]],
-    'steps': [8, 16, 32],
-    'variance': [0.1, 0.2],
-    'clip': False,
-    'loc_weight': 2.0,
-    'gpu_train': True,
-    'batch_size': 32,
-    'ngpu': 1,
-    'epoch': 250,
-    'decay1': 190,
-    'decay2': 220,
-    'image_size': 640,
-    'pretrain': False,  # Don't load pretrained weights
-    'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3},
-    'in_channel': 32,
-    'out_channel': 64
-}
-resnet_cfg = {
-    'name': 'Resnet50',
-    'min_sizes': [[16, 32], [64, 128], [256, 512]],
-    'steps': [8, 16, 32],
-    'variance': [0.1, 0.2],
-    'clip': False,
-    'loc_weight': 2.0,
-    'gpu_train': True,
-    'batch_size': 24,
-    'ngpu': 4,
-    'epoch': 100,
-    'decay1': 70,
-    'decay2': 90,
-    'image_size': 840,
-    'pretrain': False,  # Don't load pretrained weights
-    'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3},
-    'in_channel': 256,
-    'out_channel': 256
-}
-def detect_faces_core(image, model, cfg, confidence_threshold=0.02, nms_threshold=0.4):
     """Core face detection function"""
-    start_time = time.time()
-    # Preprocessing
-    img = np.float32(image)
-    im_height, im_width, _ = img.shape
-    scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
-    img -= (104, 117, 123)
-    img = img.transpose(2, 0, 1)
-    img = torch.from_numpy(img).unsqueeze(0)
-    img = img.to(device)
-    scale = scale.to(device)
-    # Forward pass
-    with torch.no_grad():
-        loc, conf, landms = model(img)
-    # Post-processing
-    priorbox = PriorBox(cfg, image_size=(im_height, im_width))
-    priors = priorbox.forward()
-    priors = priors.to(device)
-    prior_data = priors.data
-    boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance'])
-    boxes = boxes * scale / 1
-    boxes = boxes.cpu().numpy()
-    scores = conf.squeeze(0).data.cpu().numpy()[:, 1]
-    landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance'])
-    scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2],
-                           img.shape[3], img.shape[2], img.shape[3], img.shape[2],
-                           img.shape[3], img.shape[2]])
-    scale1 = scale1.to(device)
-    landms = landms * scale1 / 1
-    landms = landms.cpu().numpy()
-    # Ignore low scores
-    inds = np.where(scores > confidence_threshold)[0]
-    boxes = boxes[inds]
-    landms = landms[inds]
-    scores = scores[inds]
-    # Keep top-K before NMS
-    order = scores.argsort()[::-1][:5000]
-    boxes = boxes[order]
-    landms = landms[order]
-    scores = scores[order]
-    # Do NMS
-    dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False)
-    keep = py_cpu_nms(dets, nms_threshold)
-    dets = dets[keep, :]
-    landms = landms[keep]
-    # Format results
-    faces = []
-    for i in range(dets.shape[0]):
-        if dets[i, 4] < confidence_threshold:
-            continue
-        face = {
-            "bbox": {
-                "x1": float(dets[i, 0]),
-                "y1": float(dets[i, 1]),
-                "x2": float(dets[i, 2]),
-                "y2": float(dets[i, 3])
-            },
-            "confidence": float(dets[i, 4]),
-            "landmarks": {
-                "right_eye": [float(landms[i, 0]), float(landms[i, 1])],
-                "left_eye": [float(landms[i, 2]), float(landms[i, 3])],
-                "nose": [float(landms[i, 4]), float(landms[i, 5])],
-                "right_mouth": [float(landms[i, 6]), float(landms[i, 7])],
-                "left_mouth": [float(landms[i, 8]), float(landms[i, 9])]
-            }
-        }
-        faces.append(face)
-    processing_time = time.time() - start_time
-    return faces, processing_time
-def draw_faces_on_image(image, faces):
-    """Draw bounding boxes and landmarks on image"""
-    if isinstance(image, np.ndarray):
-        # Convert numpy array to PIL Image
-        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
-    draw = ImageDraw.Draw(image)
-    for face in faces:
-        bbox = face["bbox"]
-        confidence = face["confidence"]
-        landmarks = face["landmarks"]
-        # Draw bounding box
-        draw.rectangle([bbox["x1"], bbox["y1"], bbox["x2"], bbox["y2"]],
-                      outline="red", width=2)
-        # Draw confidence score
-        draw.text((bbox["x1"], bbox["y1"] - 15),
-                 f'{confidence:.2f}', fill="red")
-        # Draw landmarks
-        for landmark_name, (x, y) in landmarks.items():
-            draw.ellipse([x-2, y-2, x+2, y+2], fill="blue")
-    return image
-def gradio_detect_faces(image, model_type, confidence_threshold, nms_threshold):
-    """Gradio interface function for face detection"""
-    if mobilenet_model is None or resnet_model is None:
-        return None, "❌ Models not loaded. Please wait for models to load.", ""
     try:
-        # Convert PIL to OpenCV format
-        if isinstance(image, Image.Image):
-            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-        # Select model
-        if model_type.lower() == "resnet":
             model = resnet_model
-            cfg = resnet_cfg
-            model_name = "ResNet50"
         else:
             model = mobilenet_model
-            cfg = mobilenet_cfg
-            model_name = "MobileNet"
-        # Detect faces
-        faces, processing_time = detect_faces_core(
-            image, model, cfg, confidence_threshold, nms_threshold
-        )
-        # Draw results on image
-        result_image = draw_faces_on_image(image.copy(), faces)
-        # Create results text
-        results_text = f"🎯 **Detection Results**\n"
-        results_text += f"📱 Model: {model_name}\n"
-        results_text += f"⏱️ Processing Time: {processing_time:.3f}s\n"
-        results_text += f"👥 Faces Detected: {len(faces)}\n\n"
-        for i, face in enumerate(faces):
-            results_text += f"**Face {i+1}:**\n"
-            results_text += f"  Confidence: {face['confidence']:.3f}\n"
-            bbox = face['bbox']
-            results_text += f"  Location: ({bbox['x1']:.0f}, {bbox['y1']:.0f}) - ({bbox['x2']:.0f}, {bbox['y2']:.0f})\n\n"
-        # Create JSON output for API use
-        json_output = {
-            "faces": faces,
-            "processing_time": processing_time,
-            "model_used": model_name.lower(),
-            "total_faces": len(faces)
-        }
-        return result_image, results_text, json.dumps(json_output, indent=2)
-    except Exception as e:
-        error_msg = f"❌ Detection failed: {str(e)}"
-        return None, error_msg, ""
-def api_detect_live(image_base64, model_type="mobilenet", confidence_threshold=0.5, nms_threshold=0.4):
-    """API function for live detection (Thunkable compatible)"""
-    try:
-        # Decode base64 image
-        image_data = base64.b64decode(image_base64)
-        nparr = np.frombuffer(image_data, np.uint8)
-        image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
-        if image is None:
-            return {"error": "Invalid image data"}
-        # Select model
-        if model_type.lower() == "resnet":
-            model = resnet_model
-            cfg = resnet_cfg
-            model_name = "resnet"
-        else:
-            model = mobilenet_model
-            cfg = mobilenet_cfg
-            model_name = "mobilenet"
-        if model is None:
-            return {"error": f"{model_name} model not loaded"}
-        # Detect faces
-        faces, processing_time = detect_faces_core(
-            image, model, cfg, confidence_threshold, nms_threshold
-        )
-        return {
-            "faces": faces,
-            "processing_time": processing_time,
-            "model_used": model_name,
-            "total_faces": len(faces)
-        }
     except Exception as e:
-        return {"error": f"Detection failed: {str(e)}"}
 # Load models on startup
 print("Loading RetinaFace models...")
-load_status = load_models()
-# Create Gradio interface
-with gr.Blocks(title="RetinaFace Face Detection API", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🔥 RetinaFace Face Detection API
-    **Real-time face detection using RetinaFace with MobileNet and ResNet backbones**
-    - 📱 **Mobile App Ready**: Compatible with Thunkable and other mobile frameworks
-    - ⚡ **Dual Models**: MobileNet (fast) and ResNet (accurate)
-    - 🎯 **High Accuracy**: Detects faces with bounding boxes and 5-point landmarks
-    - 🌐 **API Endpoints**: Use `/api/predict` for programmatic access
-    """)
-    with gr.Row():
-        gr.Markdown(f"**Status**: {load_status}")
-    with gr.Tab("🖼️ Image Detection"):
         with gr.Row():
             with gr.Column():
                 input_image = gr.Image(type="pil", label="Upload Image")
                 model_choice = gr.Dropdown(
                     choices=["mobilenet", "resnet"],
                     value="mobilenet",
-                    label="Model Type"
                 )
-                confidence_slider = gr.Slider(
                     minimum=0.1, maximum=1.0, value=0.5, step=0.1,
-                    label="Confidence Threshold"
                 )
-                nms_slider = gr.Slider(
                     minimum=0.1, maximum=1.0, value=0.4, step=0.1,
                     label="NMS Threshold"
                 )
                 detect_btn = gr.Button("🔍 Detect Faces", variant="primary")
             with gr.Column():
-                output_image = gr.Image(label="Detection Results")
-                results_text = gr.Markdown(label="Results")
         detect_btn.click(
-            fn=gradio_detect_faces,
-            inputs=[input_image, model_choice, confidence_slider, nms_slider],
-            outputs=[output_image, results_text]
         )
-    with gr.Tab("🔗 API Documentation"):
-        gr.Markdown("""
-        ## API Endpoints for Thunkable Integration
-        ### 1. Live Detection Endpoint
-        ```
-        POST /api/predict
-        ```
-        **Request Body (JSON):**
-        ```json
-        {
-            "data": [
-                "base64_encoded_image_string",
-                "mobilenet",
-                0.5,
-                0.4
-            ]
-        }
-        ```
-        **Response:**
-        ```json
-        {
-            "data": [
-                {
-                    "faces": [...],
-                    "processing_time": 0.1,
-                    "model_used": "mobilenet",
-                    "total_faces": 2
-                }
-            ]
-        }
-        ```
-        ### 2. Thunkable Integration Example
-        **Web API Component Setup:**
-        - URL: `https://your-space-name.hf.space/api/predict`
-        - Method: `POST`
-        - Headers: `Content-Type: application/json`
-        - Body:
         ```json
         {
-            "data": [
-                "{{base64_image}}",
-                "mobilenet",
-                0.5,
-                0.4
-            ]
         }
         ```
-        ### 3. Model Performance
-        | Model | Speed | Accuracy | Best For |
-        |-------|-------|----------|----------|
-        | MobileNet | ⚡ Fast | 🎯 Good | Real-time mobile apps |
-        | ResNet50 | 🐌 Slower | 🎯🎯 High | High-accuracy applications |
-        ### 4. Response Format
-        Each detected face includes:
-        - **bbox**: Bounding box coordinates (x1, y1, x2, y2)
-        - **confidence**: Detection confidence score (0-1)
-        - **landmarks**: 5-point facial landmarks (eyes, nose, mouth corners)
         """)
-    with gr.Tab("📊 API Testing"):
-        gr.Markdown("### Test the API with base64 encoded images")
-        with gr.Row():
-            with gr.Column():
-                test_image_b64 = gr.Textbox(
-                    label="Base64 Encoded Image",
-                    placeholder="Paste base64 encoded image here...",
-                    lines=3
-                )
-                test_model = gr.Dropdown(
-                    choices=["mobilenet", "resnet"],
-                    value="mobilenet",
-                    label="Model"
-                )
-                test_conf = gr.Number(value=0.5, label="Confidence")
-                test_nms = gr.Number(value=0.4, label="NMS Threshold")
-                test_btn = gr.Button("🧪 Test API", variant="secondary")
-            with gr.Column():
-                api_output = gr.JSON(label="API Response")
-        def test_api_function(image_b64, model, conf, nms):
-            if not image_b64.strip():
-                return {"error": "Please provide base64 encoded image"}
-            # Remove data URL prefix if present
-            if image_b64.startswith('data:image'):
-                image_b64 = image_b64.split(',')[1]
-            result = api_detect_live(image_b64, model, conf, nms)
-            return result
-        test_btn.click(
-            fn=test_api_function,
-            inputs=[test_image_b64, test_model, test_conf, test_nms],
-            outputs=[api_output]
-        )
-# Custom API function for external calls
-def predict_api(image_base64, model_type="mobilenet", confidence_threshold=0.5, nms_threshold=0.4):
-    """API prediction function that matches Gradio's expected format"""
-    result = api_detect_live(image_base64, model_type, confidence_threshold, nms_threshold)
-    return result
-# Launch the app
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=True,
-        show_error=True
     )

 import time
 from PIL import Image, ImageDraw
 import json
+import io
 # Import RetinaFace model components
 from models.retinaface import RetinaFace
     global mobilenet_model, resnet_model
     try:
+        # Model configurations
+        mobilenet_cfg = {
+            'name': 'mobilenet0.25',
+            'min_sizes': [[16, 32], [64, 128], [256, 512]],
+            'steps': [8, 16, 32],
+            'variance': [0.1, 0.2],
+            'clip': False,
+            'loc_weight': 2.0,
+            'gpu_train': True,
+            'batch_size': 32,
+            'ngpu': 1,
+            'epoch': 250,
+            'decay1': 190,
+            'decay2': 220,
+            'image_size': 640,
+            'pretrain': False,
+            'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3},
+            'in_channel': 32,
+            'out_channel': 64
+        }
+        resnet_cfg = {
+            'name': 'Resnet50',
+            'min_sizes': [[16, 32], [64, 128], [256, 512]],
+            'steps': [8, 16, 32],
+            'variance': [0.1, 0.2],
+            'clip': False,
+            'loc_weight': 2.0,
+            'gpu_train': True,
+            'batch_size': 24,
+            'ngpu': 4,
+            'epoch': 100,
+            'decay1': 70,
+            'decay2': 90,
+            'image_size': 840,
+            'pretrain': False,
+            'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3},
+            'in_channel': 256,
+            'out_channel': 256
+        }
         # Load MobileNet model
         mobilenet_model = RetinaFace(cfg=mobilenet_cfg, phase='test')
         mobilenet_model.load_state_dict(torch.load('mobilenet0.25_Final.pth', map_location=device))
         resnet_model.eval()
         resnet_model = resnet_model.to(device)
+        print("✅ Models loaded successfully!")
+        return True
     except Exception as e:
+        print(f"❌ Error loading models: {e}")
+        return False
+def detect_faces(image, model_type="mobilenet", confidence_threshold=0.5, nms_threshold=0.4):
     """Core face detection function"""
     try:
+        start_time = time.time()
+        # Choose model
+        if model_type == "resnet":
             model = resnet_model
+            cfg = {
+                'min_sizes': [[16, 32], [64, 128], [256, 512]],
+                'steps': [8, 16, 32],
+                'variance': [0.1, 0.2],
+                'clip': False,
+                'image_size': 840
+            }
         else:
             model = mobilenet_model
+            cfg = {
+                'min_sizes': [[16, 32], [64, 128], [256, 512]],
+                'steps': [8, 16, 32],
+                'variance': [0.1, 0.2],
+                'clip': False,
+                'image_size': 640
+            }
+        if model is None:
+            return None, "Models not loaded"
+        # Convert PIL to numpy array
+        if isinstance(image, Image.Image):
+            image = np.array(image)
+        # Preprocessing
+        img = np.float32(image)
+        im_height, im_width, _ = img.shape
+        scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
+        img -= (104, 117, 123)
+        img = img.transpose(2, 0, 1)
+        img = torch.from_numpy(img).unsqueeze(0)
+        img = img.to(device)
+        scale = scale.to(device)
+        # Forward pass
+        with torch.no_grad():
+            loc, conf, landms = model(img)
+        # Generate priors
+        priorbox = PriorBox(cfg, image_size=(im_height, im_width))
+        priors = priorbox.forward()
+        priors = priors.to(device)
+        prior_data = priors.data
+        boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance'])
+        boxes = boxes * scale
+        boxes = boxes.cpu().numpy()
+        scores = conf.squeeze(0).data.cpu().numpy()[:, 1]
+        landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance'])
+        scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2],
+                               img.shape[3], img.shape[2], img.shape[3], img.shape[2],
+                               img.shape[3], img.shape[2]])
+        scale1 = scale1.to(device)
+        landms = landms * scale1
+        landms = landms.cpu().numpy()
+        # Ignore low scores
+        inds = np.where(scores > confidence_threshold)[0]
+        boxes = boxes[inds]
+        landms = landms[inds]
+        scores = scores[inds]
+        # Keep top-K before NMS
+        order = scores.argsort()[::-1][:5000]
+        boxes = boxes[order]
+        landms = landms[order]
+        scores = scores[order]
+        # Apply NMS
+        dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False)
+        keep = py_cpu_nms(dets, nms_threshold)
+        dets = dets[keep, :]
+        landms = landms[keep]
+        # Draw results
+        result_image = Image.fromarray(image)
+        draw = ImageDraw.Draw(result_image)
+        faces = []
+        for b, landmarks in zip(dets, landms):
+            if b[4] < confidence_threshold:
+                continue
+            # Draw bounding box
+            draw.rectangle([b[0], b[1], b[2], b[3]], outline="red", width=2)
+            # Draw confidence score
+            draw.text((b[0], b[1] - 15), f'{b[4]:.2f}', fill="red")
+            # Draw landmarks
+            for i in range(0, 10, 2):
+                draw.ellipse([landmarks[i]-2, landmarks[i+1]-2, landmarks[i]+2, landmarks[i+1]+2], fill="blue")
+            faces.append({
+                "bbox": {"x1": float(b[0]), "y1": float(b[1]), "x2": float(b[2]), "y2": float(b[3])},
+                "confidence": float(b[4]),
+                "landmarks": {
+                    "left_eye": [float(landmarks[0]), float(landmarks[1])],
+                    "right_eye": [float(landmarks[2]), float(landmarks[3])],
+                    "nose": [float(landmarks[4]), float(landmarks[5])],
+                    "left_mouth": [float(landmarks[6]), float(landmarks[7])],
+                    "right_mouth": [float(landmarks[8]), float(landmarks[9])]
+                }
+            })
+        processing_time = time.time() - start_time
+        result_text = f"""
+        **Detection Results:**
+        - **Faces Detected:** {len(faces)}
+        - **Model Used:** {model_type}
+        - **Processing Time:** {processing_time:.3f}s
+        - **Confidence Threshold:** {confidence_threshold}
+        - **NMS Threshold:** {nms_threshold}
+        """
+        return result_image, result_text
     except Exception as e:
+        return None, f"Error: {str(e)}"
 # Load models on startup
 print("Loading RetinaFace models...")
+model_loaded = load_models()
+# Create simple Gradio interface
+def create_interface():
+    with gr.Blocks(title="RetinaFace Face Detection") as demo:
+        gr.Markdown("# 🔥 RetinaFace Face Detection API")
+        gr.Markdown("Real-time face detection using RetinaFace with MobileNet and ResNet backbones")
+        if model_loaded:
+            gr.Markdown("✅ **Status**: Models loaded successfully!")
+        else:
+            gr.Markdown("❌ **Status**: Error loading models")
         with gr.Row():
             with gr.Column():
                 input_image = gr.Image(type="pil", label="Upload Image")
                 model_choice = gr.Dropdown(
                     choices=["mobilenet", "resnet"],
                     value="mobilenet",
+                    label="Model"
                 )
+                confidence = gr.Slider(
                     minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+                    label="Confidence"
                 )
+                nms = gr.Slider(
                     minimum=0.1, maximum=1.0, value=0.4, step=0.1,
                     label="NMS Threshold"
                 )
                 detect_btn = gr.Button("🔍 Detect Faces", variant="primary")
             with gr.Column():
+                output_image = gr.Image(label="Results")
+                output_text = gr.Markdown()
         detect_btn.click(
+            fn=detect_faces,
+            inputs=[input_image, model_choice, confidence, nms],
+            outputs=[output_image, output_text]
         )
+        gr.Markdown("""
+        ## API Usage
+        Use `/api/predict` endpoint with:
         ```json
         {
+            "data": [image, "mobilenet", 0.5, 0.4]
         }
         ```
         """)
+    return demo
+# Create and launch the interface
+demo = create_interface()
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        share=True
     )

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-gradio==4.44.1
 torch==2.0.1
 torchvision==0.15.2
 opencv-python==4.8.1.78

+gradio==4.36.0
 torch==2.0.1
 torchvision==0.15.2
 opencv-python==4.8.1.78