# src/models.py  —  Enterprise Lens V4
# ════════════════════════════════════════════════════════════════════
# Face Lane  : InsightFace SCRFD-10GF + ArcFace-R100 (buffalo_l)
#              + AdaFace IR-50 (MS1MV2, opt-in via ENABLE_ADAFACE=1) fused → 1024-D vector
#              • det_size=(1280,1280) — catches small/group faces
#              • Quality gate: det_score ≥ 0.35, face_px ≥ 20
#              • Multi-scale: runs detection at 3 scales + a horizontal-flip pass, merges
#              • Stores one 1024-D vector PER face
#              • Each vector carries base64 face-crop thumbnail
#              • face_quality_score + face_width_px in metadata
#
# Object Lane: SigLIP + DINOv2 fused 1536-D (unchanged from V3)
# ════════════════════════════════════════════════════════════════════

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import asyncio
import base64
import functools
import hashlib
import io
import threading
import traceback

import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoProcessor
from ultralytics import YOLO

# ── InsightFace ───────────────────────────────────────────────────
# InsightFace is an optional dependency: when the import fails the module
# still loads, INSIGHTFACE_AVAILABLE is False, and install hints are printed.
try:
    import insightface
    from insightface.app import FaceAnalysis
    INSIGHTFACE_AVAILABLE = True
except ImportError:
    INSIGHTFACE_AVAILABLE = False
    print("⚠️  insightface not installed — face lane disabled")
    print("    Run: pip install insightface onnxruntime-silicon  (mac)")
    print("         pip install insightface onnxruntime          (linux/win)")

# ── AdaFace ──────────────────────────────────────────────────────
# Disabled by default — enable by setting ENABLE_ADAFACE=1 env var.
# When disabled: ArcFace(512) + zeros(512) = 1024-D (fully functional).
# NOTE(review): this flag is a hard-coded False and is not read or updated
# anywhere in this file; the env var is read in AIModelManager._load_adaface.
# Confirm whether other modules import it before removing.
ADAFACE_WEIGHTS_AVAILABLE = False  # controlled by ENABLE_ADAFACE env var

# ── Constants ─────────────────────────────────────────────────────
YOLO_PERSON_CLASS_ID  = 0       # YOLO class id treated as "person" in the object lane
MIN_FACE_SIZE         = 20      # lowered: 40 missed small faces in group photos
MAX_FACES_PER_IMAGE   = 12      # slightly higher cap for group photos
MAX_CROPS             = 6       # max YOLO object crops per image
MAX_IMAGE_SIZE        = 640     # object lane longest edge
DET_SIZE_PRIMARY      = (1280, 1280)  # V4: 1280 for small-face detection
# NOTE(review): DET_SIZE_SECONDARY is not referenced in this file; the scale
# pyramid actually used is DET_SCALES below.
DET_SIZE_SECONDARY    = (640, 640)    # fallback / 2nd scale
FACE_CROP_THUMB_SIZE  = 112     # face thumbnail for Pinecone metadata
FACE_CROP_QUALITY     = 80      # JPEG quality for thumbnails
FACE_QUALITY_GATE     = 0.35    # lowered from 0.60 — accepts sunglasses, angles, smiles
# Multi-scale pyramid — tried in order, results merged with IoU dedup
DET_SCALES            = [(1280, 1280), (960, 960), (640, 640)]
IOU_DEDUP_THRESHOLD   = 0.45    # suppress duplicate detections across scales
FACE_DIM              = 512     # ArcFace embedding dimension
ADAFACE_DIM           = 512     # AdaFace embedding dimension
FUSED_FACE_DIM        = 1024    # ArcFace + AdaFace concatenated


# ════════════════════════════════════════════════════════════════
#  Utility functions
# ════════════════════════════════════════════════════════════════

def _resize_pil(img: Image.Image, max_side: int = MAX_IMAGE_SIZE) -> Image.Image:
    """
    Downscale *img* so its longest edge is at most *max_side* pixels.

    Images that already fit are returned unchanged (same object). The
    aspect ratio is preserved; each output dimension is clamped to at
    least 1 px so that extremely elongated images (where truncation would
    otherwise yield a zero-sized edge) can still be resized.
    """
    w, h = img.size
    longest = max(w, h)
    if longest <= max_side:
        return img
    scale = max_side / longest
    # int() truncates, so the short side of a very thin image can hit 0.
    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
    return img.resize(new_size, Image.LANCZOS)


def _img_hash(image_path: str) -> str:
    """
    Return the MD5 hex digest of the first 64 KiB of the file at *image_path*.

    Only the leading 65536 bytes are read, so files that share that prefix
    produce the same digest.
    """
    with open(image_path, "rb") as fh:
        head = fh.read(65536)
    return hashlib.md5(head).hexdigest()


def _crop_to_b64(
    img_bgr: np.ndarray,
    x1: int, y1: int, x2: int, y2: int,
    thumb_size: int = FACE_CROP_THUMB_SIZE,
) -> str:
    """
    Build a square JPEG thumbnail of a face region and return it base64-encoded.

    The box (x1, y1)-(x2, y2) is grown by 20% of its width/height on each
    side and clipped to the image bounds. The crop is converted from BGR to
    RGB, resized to thumb_size × thumb_size and encoded as JPEG at
    FACE_CROP_QUALITY. Returns "" when the clipped region is empty.
    """
    img_h, img_w = img_bgr.shape[:2]
    margin_x = int((x2 - x1) * 0.20)
    margin_y = int((y2 - y1) * 0.20)

    left = max(0, x1 - margin_x)
    top = max(0, y1 - margin_y)
    right = min(img_w, x2 + margin_x)
    bottom = min(img_h, y2 + margin_y)

    region = img_bgr[top:bottom, left:right]
    if region.size == 0:
        return ""

    # Reverse the channel axis: BGR → RGB
    thumb = Image.fromarray(region[:, :, ::-1]).resize(
        (thumb_size, thumb_size), Image.LANCZOS)
    out = io.BytesIO()
    thumb.save(out, format="JPEG", quality=FACE_CROP_QUALITY)
    encoded = base64.b64encode(out.getvalue())
    return encoded.decode()


def _face_crop_for_adaface(
    img_bgr: np.ndarray,
    x1: int, y1: int, x2: int, y2: int,
) -> np.ndarray:
    """
    Prepare a face region as AdaFace IR-50 input.

    The box is padded by 10% of its width/height on each side, clipped to
    the image, converted BGR → RGB and resized to 112×112.

    Returns a float32 array of shape (3, 112, 112) with values in [-1, 1],
    or None when the clipped region is empty.
    """
    img_h, img_w = img_bgr.shape[:2]
    margin_x = int((x2 - x1) * 0.10)
    margin_y = int((y2 - y1) * 0.10)

    left = max(0, x1 - margin_x)
    top = max(0, y1 - margin_y)
    right = min(img_w, x2 + margin_x)
    bottom = min(img_h, y2 + margin_y)

    region = img_bgr[top:bottom, left:right]
    if region.size == 0:
        return None

    rgb_region = region[:, :, ::-1].copy()            # BGR → RGB
    resized = Image.fromarray(rgb_region).resize((112, 112), Image.LANCZOS)
    pixels = np.array(resized, dtype=np.float32) / 255.0
    pixels = (pixels - 0.5) / 0.5                     # [0, 1] → [-1, 1]
    return pixels.transpose(2, 0, 1)                  # HWC → CHW


def _clahe_enhance(bgr: np.ndarray) -> np.ndarray:
    """
    Apply CLAHE to the lightness channel of a BGR image.

    The image is converted to LAB, the L channel is equalised
    (clipLimit=2.0, 8×8 tiles), and the result is converted back to BGR.
    Intended to help detection on dark, washed-out or low-contrast photos.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB))
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    merged = cv2.merge([equalizer.apply(lightness), chan_a, chan_b])
    return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)


def _iou(box_a: list, box_b: list) -> float:
    """
    Intersection-over-union of two axis-aligned boxes given as [x1, y1, x2, y2].

    Returns 0.0 when the boxes do not overlap (including boxes that only
    touch along an edge).
    """
    overlap_w = max(0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    overlap_h = max(0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0

    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union


def _dedup_faces(faces_list: list, iou_thresh: float = IOU_DEDUP_THRESHOLD) -> list:
    """
    Remove duplicate detections across scales/flips, keeping the highest det_score.

    Faces are visited in descending det_score order; a face is dropped when
    its integer-truncated bbox overlaps an already-kept face with
    IoU > *iou_thresh*.

    Args:
        faces_list: objects exposing `.det_score` and a numpy `.bbox`
            whose first four entries are [x1, y1, x2, y2].
        iou_thresh: overlap above which two detections count as the same face.

    Returns:
        The surviving face objects, best score first. Empty input → [].
    """
    if not faces_list:
        return []

    ranked = sorted(faces_list, key=lambda f: float(f.det_score), reverse=True)
    kept = []
    kept_boxes = []   # integer boxes of `kept`, converted once instead of per comparison
    for face in ranked:
        box = face.bbox.astype(int)[:4].tolist()
        if any(_iou(box, other) > iou_thresh for other in kept_boxes):
            continue
        kept.append(face)
        kept_boxes.append(box)
    return kept

# ════════════════════════════════════════════════════════════════
#  AIModelManager — V4
# ════════════════════════════════════════════════════════════════

class AIModelManager:
    def __init__(self):
        """
        Load all models for both lanes and print a readiness summary.

        Object lane: SigLIP + DINOv2 (cast to half precision on CUDA) and
        the YOLO11n-seg segmenter. Face lane: InsightFace buffalo_l, plus
        AdaFace when `_load_adaface` succeeds.

        A missing or failing InsightFace install leaves `self.face_app` as
        None instead of raising; errors while loading the object-lane
        models are not caught here and propagate to the caller.
        """
        self.device = (
            "cuda" if torch.cuda.is_available()
            else ("mps" if torch.backends.mps.is_available() else "cpu")
        )
        print(f"🚀 Loading models onto: {self.device.upper()}...")

        # ── Object Lane: SigLIP + DINOv2 (unchanged) ─────────────
        print("📦 Loading SigLIP...")
        self.siglip_processor = AutoProcessor.from_pretrained(
            "google/siglip-base-patch16-224", use_fast=True)
        self.siglip_model = AutoModel.from_pretrained(
            "google/siglip-base-patch16-224").to(self.device).eval()

        print("📦 Loading DINOv2...")
        self.dinov2_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        self.dinov2_model = AutoModel.from_pretrained(
            "facebook/dinov2-base").to(self.device).eval()

        if self.device == "cuda":
            self.siglip_model = self.siglip_model.half()
            self.dinov2_model = self.dinov2_model.half()

        # ── YOLO for object segmentation ─────────────────────────
        print("📦 Loading YOLO11n-seg...")
        self.yolo = YOLO("yolo11n-seg.pt")

        # ── Face Lane: InsightFace SCRFD + ArcFace-R100 ───────────
        # V4: ALWAYS use buffalo_l (SCRFD-10GF + ArcFace-R100)
        #     even on CPU — accuracy matters more than speed here.
        self.face_app = None
        if INSIGHTFACE_AVAILABLE:
            try:
                print("📦 Loading InsightFace buffalo_l (SCRFD-10GF + ArcFace-R100)...")
                self.face_app = FaceAnalysis(
                    name="buffalo_l",
                    providers=(
                        ["CUDAExecutionProvider", "CPUExecutionProvider"]
                        if self.device == "cuda"
                        else ["CPUExecutionProvider"]
                    ),
                )
                self.face_app.prepare(
                    ctx_id=0 if self.device == "cuda" else -1,
                    det_size=DET_SIZE_PRIMARY,   # 1280×1280 — key for small faces
                )
                # Warmup: one inference on a blank image
                test_img = np.zeros((112, 112, 3), dtype=np.uint8)
                self.face_app.get(test_img)
                print("✅ InsightFace buffalo_l loaded — SCRFD+ArcFace face lane ACTIVE")
                print(f"   det_size={DET_SIZE_PRIMARY} | quality_gate={FACE_QUALITY_GATE}")
            except Exception as e:
                # Best-effort: a broken face lane must not stop the object lane.
                print(f"❌ InsightFace init FAILED: {e}")
                print(traceback.format_exc())
                self.face_app = None
        else:
            print("❌ InsightFace NOT installed")

        # ── AdaFace IR-50 — fused with ArcFace → 1024-D face vector ──
        # _load_adaface sets self.adaface_model (None when disabled/failed).
        self.adaface_model = None
        self._load_adaface()

        # Lock serialising calls into the InsightFace/ONNX session
        self._face_lock     = threading.Lock()
        # Result cache for process_image, bounded by _cache_maxsize entries
        self._cache         = {}
        self._cache_maxsize = 128

        # ── Readiness summary ────────────────────────────────────
        # The status glyphs were previously written as mangled escapes
        # ("u2705", "u26a0ufe0f", "u2014") and printed literally.
        adaface_status = (
            "FULL FUSION ✅" if self.adaface_model
            else "ZERO-PADDED ⚠️  (AdaFace disabled or unavailable)"
        )
        # Report what was actually loaded rather than assuming success.
        insightface_status = (
            "buffalo_l (SCRFD-10GF + ArcFace-R100)" if self.face_app is not None
            else "NOT LOADED ❌  (face lane disabled)"
        )
        print("")
        print("✅ Enterprise Lens V4 — Models Ready")
        print(f"   Device            : {self.device.upper()}")
        print(f"   InsightFace       : {insightface_status}")
        print(f"   AdaFace           : {adaface_status}")
        print(f"   Face vector dim   : {FUSED_FACE_DIM}  <- enterprise-faces MUST be {FUSED_FACE_DIM}-D")
        print("   Object vector dim : 1536  <- enterprise-objects MUST be 1536-D")
        print(f"   Quality gate      : det_score >= {FACE_QUALITY_GATE}, face_px >= {MIN_FACE_SIZE}")
        print(f"   Detection size    : {DET_SIZE_PRIMARY}")
        print("")

    def _load_adaface(self):
        """
        Optionally load AdaFace IR-50 (MS1MV2) into `self.adaface_model`.

        Loading only happens when the ENABLE_ADAFACE env var is exactly "1".
        Otherwise — or when any step of the download/load/smoke-test fails —
        `self.adaface_model` is set to None and face vectors are built as
        ArcFace(512) + zeros(512) = 1024-D.

        When enabled, the repo files are fetched into a local cache
        directory with `hf_hub_download` (using HF_TOKEN if set), the model
        is loaded with `trust_remote_code=True`, moved to `self.device`, and
        run once on a zero tensor to verify the embedding width equals
        ADAFACE_DIM.

        Never raises: every failure is printed and results in the
        zero-padded fallback.
        """
        enable = os.getenv("ENABLE_ADAFACE", "0").strip() == "1"
        hf_token_present = bool(os.getenv("HF_TOKEN", "").strip())
        print(f"   ENABLE_ADAFACE={os.getenv('ENABLE_ADAFACE', 'NOT SET')}")
        print(f"   HF_TOKEN present={'YES' if hf_token_present else 'NO (not set or empty)'}")
        if not enable:
            print("⚠️  AdaFace disabled (ENABLE_ADAFACE != 1) — using ArcFace zero-padded 1024-D")
            self.adaface_model = None
            return

        import sys
        HF_TOKEN   = os.getenv("HF_TOKEN", None)
        REPO_ID    = "minchul/cvlface_adaface_ir50_ms1mv2"
        CACHE_PATH = os.path.expanduser("~/.cvlface_cache/minchul/cvlface_adaface_ir50_ms1mv2")
        try:
            from huggingface_hub import hf_hub_download
            print("📦 Loading AdaFace IR-50 MS1MV2...")
            os.makedirs(CACHE_PATH, exist_ok=True)

            # files.txt lists the extra files the repo's remote code needs.
            hf_hub_download(repo_id=REPO_ID, filename="files.txt",
                token=HF_TOKEN, local_dir=CACHE_PATH, local_dir_use_symlinks=False)
            with open(os.path.join(CACHE_PATH, "files.txt")) as f:
                extra = [x.strip() for x in f.read().split("\n") if x.strip()]

            # Download anything not already present in the cache directory.
            for fname in extra + ["config.json", "wrapper.py", "model.safetensors"]:
                fpath = os.path.join(CACHE_PATH, fname)
                if not os.path.exists(fpath):
                    hf_hub_download(repo_id=REPO_ID, filename=fname,
                        token=HF_TOKEN, local_dir=CACHE_PATH, local_dir_use_symlinks=False)

            # Load from inside the cache dir with it on sys.path, then always
            # restore both. NOTE(review): presumably the repo's remote code
            # resolves files/modules relative to cwd — confirm.
            cwd = os.getcwd()
            os.chdir(CACHE_PATH)
            sys.path.insert(0, CACHE_PATH)
            try:
                from transformers import AutoModel as _HF_AutoModel
                model = _HF_AutoModel.from_pretrained(
                    CACHE_PATH, trust_remote_code=True, token=HF_TOKEN)
            finally:
                os.chdir(cwd)
                if CACHE_PATH in sys.path:
                    sys.path.remove(CACHE_PATH)

            # Smoke test: one forward pass on a blank 112×112 input.
            model = model.to(self.device).eval()
            with torch.no_grad():
                out = model(torch.zeros(1, 3, 112, 112).to(self.device))
            emb = out if isinstance(out, torch.Tensor) else out.embedding

            # Explicit check instead of `assert`, which is stripped under
            # `python -O` and would let a wrong-width model through.
            if emb.shape[-1] != ADAFACE_DIM:
                raise ValueError(
                    f"AdaFace output dim {emb.shape[-1]} != expected {ADAFACE_DIM}")

            self.adaface_model = model
            print("✅ AdaFace IR-50 loaded — 1024-D FULL FUSION active")
        except Exception as e:
            # Deliberate best-effort: any failure falls back to zero padding.
            print(f"⚠️  AdaFace load failed: {e} — falling back to zero-padded 1024-D")
            self.adaface_model = None

        # ── Object Lane: batched SigLIP + DINOv2 embedding ───────────
    def _embed_crops_batch(self, crops: list) -> list:
        """
        Embed a batch of PIL images with SigLIP and DINOv2.

        Each model's per-image vector is L2-normalised, the two are
        concatenated, and the result is L2-normalised again. Returns one
        numpy array per input crop, in order; an empty input returns [].
        """
        if not crops:
            return []

        on_cuda = self.device == "cuda"

        def _prepare(batch):
            # Move tensors to the target device; on CUDA cast float32 → half
            # (the models are held in half precision there).
            moved = {}
            for name, tensor in batch.items():
                tensor = tensor.to(self.device)
                if on_cuda and tensor.dtype == torch.float32:
                    tensor = tensor.half()
                moved[name] = tensor
            return moved

        with torch.no_grad():
            # ── SigLIP ──
            sig_inputs = _prepare(
                self.siglip_processor(images=crops, return_tensors="pt", padding=True))
            feats = self.siglip_model.get_image_features(**sig_inputs)

            # The return type differs across transformers versions:
            # unwrap in the same priority order until a tensor is in hand.
            for attr in ("image_embeds", "pooler_output"):
                if hasattr(feats, attr):
                    feats = getattr(feats, attr)
                    break
            else:
                if hasattr(feats, "last_hidden_state"):
                    feats = feats.last_hidden_state[:, 0, :]
                elif isinstance(feats, tuple):
                    feats = feats[0]
            if not isinstance(feats, torch.Tensor):
                feats = feats[0]
            sig_vecs = F.normalize(feats.float(), p=2, dim=1).cpu()

            # ── DINOv2: first token of the last hidden state ──
            dino_inputs = _prepare(
                self.dinov2_processor(images=crops, return_tensors="pt"))
            dino_hidden = self.dinov2_model(**dino_inputs).last_hidden_state
            dino_vecs = F.normalize(dino_hidden[:, 0, :].float(), p=2, dim=1).cpu()

            # ── Fuse ──
            joined = torch.cat([sig_vecs, dino_vecs], dim=1)
            fused = F.normalize(joined, p=2, dim=1)

        return [fused[i].numpy() for i, _ in enumerate(crops)]

    # ── AdaFace embedding for a single face crop ─────────────────
    def _adaface_embed(self, face_arr_chw: np.ndarray) -> np.ndarray:
        """
        Run the AdaFace model on one preprocessed face crop.

        Args:
            face_arr_chw: CHW float32 array normalised to [-1, 1]
                (as produced by `_face_crop_for_adaface`), or None.

        Returns:
            L2-normalised float32 embedding as a 1-D numpy array, or None
            when no model is loaded, the input is None, or inference fails
            (the error is printed, not raised).

        The model may return a tensor directly or an object with an
        `.embedding` attribute — both cases are handled.
        """
        if self.adaface_model is None or face_arr_chw is None:
            return None
        try:
            batch = torch.from_numpy(face_arr_chw).unsqueeze(0)  # add batch dim

            # Cast the input to whatever dtype the model's weights use.
            # Unconditionally calling .half() on CUDA fails with a dtype
            # mismatch when the model was loaded in float32.
            params = getattr(self.adaface_model, "parameters", None)
            first_param = next(params(), None) if callable(params) else None
            target_dtype = first_param.dtype if first_param is not None else batch.dtype
            batch = batch.to(device=self.device, dtype=target_dtype)

            with torch.no_grad():
                out = self.adaface_model(batch)
            # Handle both raw tensor and object-with-embedding outputs
            emb = out if isinstance(out, torch.Tensor) else out.embedding
            emb = F.normalize(emb.float(), p=2, dim=1)
            return emb[0].cpu().numpy()
        except Exception as e:
            # Best-effort: the caller falls back to zero padding on None.
            print(f"⚠️  AdaFace inference error: {e}")
            return None

    # ── V4 Face detection + dual encoding ────────────────────────
    def _detect_and_encode_faces(self, img_np: np.ndarray) -> list:
        """
        Detect faces with InsightFace and encode each accepted face.

        Detection runs once per entry in DET_SCALES plus once on a
        horizontally flipped copy; detections are mapped back to the
        original coordinate frame and merged by `_dedup_faces`.

        For each face that passes the gates:
          - ArcFace embedding (512-D, from InsightFace), L2-normalised
          - AdaFace embedding (512-D) when a model is loaded, else zeros
          - Concatenate + L2-normalise → FUSED_FACE_DIM (1024-D) vector
          - Quality gates: width and height ≥ MIN_FACE_SIZE,
            det_score ≥ FACE_QUALITY_GATE
          - Base64 JPEG thumbnail stored for the UI

        At most MAX_FACES_PER_IMAGE faces are returned.

        Args:
            img_np: H×W×3 RGB image. Non-uint8 input is multiplied by 255
                (assumes floats in [0, 1] — TODO confirm with callers).

        Returns:
            List of dicts with keys: type, vector, vec_dim, face_idx,
            bbox ([x, y, w, h]), face_crop, det_score, face_quality,
            face_width_px. Returns [] when InsightFace is not loaded or on
            any unexpected error (the error is printed, not raised).
        """
        if self.face_app is None:
            print("⚠️  face_app is None — InsightFace not loaded")
            return []

        try:
            # InsightFace expects BGR
            if img_np.dtype != np.uint8:
                img_np = (img_np * 255).astype(np.uint8)
            bgr = img_np[:, :, ::-1].copy() if img_np.shape[2] == 3 else img_np.copy()
            H, W = bgr.shape[:2]

            # CLAHE helps with dark/overexposed/low-contrast photos.
            # Detection uses the enhanced image; crops use the original.
            bgr_enhanced = _clahe_enhance(bgr)

            all_raw_faces = self._run_detection_passes(bgr_enhanced)

            # Deduplicate across scales and flip
            faces = _dedup_faces(all_raw_faces)
            print(f"   Raw detections: {len(all_raw_faces)} → after dedup: {len(faces)}")

            results  = []
            accepted = 0

            for idx, face in enumerate(faces):
                if accepted >= MAX_FACES_PER_IMAGE:
                    break

                # ── Bounding box, clipped to the image ────────────
                x1, y1, x2, y2 = face.bbox.astype(int)
                x1 = max(0, x1); y1 = max(0, y1)
                x2 = min(W, x2); y2 = min(H, y2)
                w, h = x2 - x1, y2 - y1
                if w <= 0 or h <= 0:
                    continue

                # ── Quality gate 1: minimum size ──────────────────
                if w < MIN_FACE_SIZE or h < MIN_FACE_SIZE:
                    print(f"   Face {idx}: SKIP — too small ({w}×{h}px)")
                    continue

                # ── Quality gate 2: detection confidence ──────────
                det_score = float(face.det_score) if hasattr(face, "det_score") else 1.0
                if det_score < FACE_QUALITY_GATE:
                    print(f"   Face {idx}: SKIP — low det_score ({det_score:.3f})")
                    continue

                # ── ArcFace embedding (from InsightFace) ──────────
                if face.embedding is None:
                    continue
                arcface_vec = face.embedding.astype(np.float32)
                n = np.linalg.norm(arcface_vec)
                if n > 0:
                    arcface_vec = arcface_vec / n

                # ── AdaFace embedding (None when unavailable) ─────
                face_chw    = _face_crop_for_adaface(bgr, x1, y1, x2, y2)
                adaface_vec = self._adaface_embed(face_chw)

                # ── Fuse: ArcFace + AdaFace → 1024-D ─────────────
                # ALWAYS output FUSED_FACE_DIM (1024) so the vector index
                # dimension never mismatches, regardless of AdaFace status.
                if adaface_vec is not None:
                    fused_raw = np.concatenate([arcface_vec, adaface_vec])
                else:
                    # Zero padding keeps the width; the ArcFace half still
                    # carries the identity signal.
                    print("   ⚠️  AdaFace unavailable — padding to 1024-D")
                    fused_raw = np.concatenate([arcface_vec,
                                                np.zeros(ADAFACE_DIM, dtype=np.float32)])
                n2 = np.linalg.norm(fused_raw)
                final_vec = (fused_raw / n2) if n2 > 0 else fused_raw
                vec_dim   = FUSED_FACE_DIM   # always 1024

                # ── Face crop thumbnail for UI ─────────────────────
                face_crop_b64 = _crop_to_b64(bgr, x1, y1, x2, y2)

                results.append({
                    "type":           "face",
                    "vector":         final_vec,
                    "vec_dim":        vec_dim,
                    "face_idx":       accepted,
                    "bbox":           [int(x1), int(y1), int(w), int(h)],
                    "face_crop":      face_crop_b64,
                    "det_score":      det_score,
                    "face_quality":   det_score,          # alias for metadata
                    "face_width_px":  int(w),
                })
                accepted += 1
                print(f"   Face {idx}: ACCEPTED — {w}×{h}px | "
                      f"det={det_score:.3f} | dim={vec_dim}")

            print(f"👤 {accepted} face(s) passed quality gate")
            return results

        except Exception as e:
            print(f"🟠 InsightFace error: {e}")
            print(traceback.format_exc()[-600:])
            return []

    def _run_detection_passes(self, bgr_enhanced: np.ndarray) -> list:
        """Run the detector at every DET_SCALES entry plus one flipped pass; return raw faces in original coordinates."""
        H, W = bgr_enhanced.shape[:2]
        raw_faces = []

        for scale in DET_SCALES:
            # Never upscale: each target dimension is capped at the source size.
            scale_w = min(W, scale[0])
            scale_h = min(H, scale[1])
            if scale_w == W and scale_h == H:
                bgr_scaled = bgr_enhanced
            else:
                bgr_scaled = cv2.resize(bgr_enhanced, (scale_w, scale_h))

            print(f"🔍 SCRFD detection at {scale_w}×{scale_h}...")
            try:
                # input_size is shared detector state: set it under the same
                # lock as the inference call so concurrent callers cannot
                # swap the scale between the assignment and get().
                with self._face_lock:
                    self.face_app.det_model.input_size = scale
                    faces_at_scale = self.face_app.get(bgr_scaled)
                # Scale bboxes back to original dimensions
                sx = W / scale_w
                sy = H / scale_h
                if sx != 1.0 or sy != 1.0:
                    for f in faces_at_scale:
                        f.bbox[0] *= sx; f.bbox[1] *= sy
                        f.bbox[2] *= sx; f.bbox[3] *= sy
                raw_faces.extend(faces_at_scale)
            except Exception as e:
                # One failed scale must not abort the others — but say so.
                print(f"   ⚠️  detection at {scale_w}×{scale_h} failed: {e}")

        # Horizontal flip pass — catches profile/turned faces
        bgr_flip = cv2.flip(bgr_enhanced, 1)
        try:
            with self._face_lock:
                self.face_app.det_model.input_size = DET_SIZE_PRIMARY
                faces_flip = self.face_app.get(bgr_flip)
            # Mirror bboxes back to original orientation
            for f in faces_flip:
                x1, _, x2, _ = f.bbox
                f.bbox[0] = W - x2
                f.bbox[2] = W - x1
            raw_faces.extend(faces_flip)
        except Exception as e:
            print(f"   ⚠️  flipped detection pass failed: {e}")

        # Restore primary det_size
        with self._face_lock:
            self.face_app.det_model.input_size = DET_SIZE_PRIMARY

        return raw_faces

    # ── Main process_image ────────────────────────────────────────
    def process_image(
        self,
        image_path: str,
        is_query:     bool = False,
        detect_faces: bool = True,
    ) -> list:
        """
        Full pipeline for one image file.

        Args:
            image_path: path of the image to process.
            is_query: only affects the cache key in this method.
            detect_faces: run the face lane when True and InsightFace is loaded.

        Returns:
            List of vector dicts, faces first:
              Face:   {type, vector (1024-D), vec_dim, face_idx, bbox,
                       face_crop, det_score, face_quality, face_width_px}
              Object: {type, vector (1536-D)}
            The first object vector is always the whole image; the rest are
            YOLO crops (at most MAX_CROPS). The returned list is the cached
            object itself — callers should not mutate it.

        Results are cached per (file hash, detect_faces, is_query), with the
        oldest-inserted entry evicted once `_cache_maxsize` is reached.
        Errors opening the file or running the object-lane models propagate.
        """
        cache_key = f"{_img_hash(image_path)}_{detect_faces}_{is_query}"
        cached = self._cache.get(cache_key)
        if cached is not None:
            print("⚡ Cache hit")
            return cached

        extracted = []
        # convert() returns an independent, fully loaded image, so the
        # source file handle can be closed right away.
        with Image.open(image_path) as src:
            original_pil = src.convert("RGB")
        img_np      = np.array(original_pil)     # RGB uint8
        faces_found = False

        # ════════════════════════════════════════════════════════
        # FACE LANE
        # Multi-scale + CLAHE + flip are all handled inside
        # _detect_and_encode_faces; pass the full-resolution image.
        # ════════════════════════════════════════════════════════
        if detect_faces and self.face_app is not None:
            face_results = self._detect_and_encode_faces(img_np)
            if face_results:
                faces_found = True
                extracted.extend(face_results)

        # ════════════════════════════════════════════════════════
        # OBJECT LANE
        # Always runs — even when faces are found.
        # PERSON-class YOLO crops are skipped when faces were found
        # to avoid double-counting people.
        # ════════════════════════════════════════════════════════
        crops_pil = [_resize_pil(original_pil, MAX_IMAGE_SIZE)]   # full image
        max_total = MAX_CROPS + 1                                 # + the full image
        # NOTE(review): YOLO decodes the file itself while crops are cut from
        # the PIL decode above; if the two decoders treat EXIF orientation
        # differently the coordinates would not line up — confirm.
        yolo_results = self.yolo(image_path, conf=0.5, verbose=False)

        for r in yolo_results:
            if r.masks is not None:
                for seg_idx, mask_xy in enumerate(r.masks.xy):
                    cls_id = int(r.boxes.cls[seg_idx].item())
                    if faces_found and cls_id == YOLO_PERSON_CLASS_ID:
                        continue
                    polygon = np.array(mask_xy, dtype=np.int32)
                    if len(polygon) < 3:
                        continue
                    x, y, w, h = cv2.boundingRect(polygon)
                    if w < 30 or h < 30:
                        continue
                    crops_pil.append(original_pil.crop((x, y, x + w, y + h)))
                    if len(crops_pil) >= max_total:
                        break
            elif r.boxes is not None:
                for box in r.boxes:
                    cls_id = int(box.cls.item())
                    if faces_found and cls_id == YOLO_PERSON_CLASS_ID:
                        continue
                    x1, y1, x2, y2 = box.xyxy[0].tolist()
                    if (x2 - x1) < 30 or (y2 - y1) < 30:
                        continue
                    crops_pil.append(original_pil.crop((x1, y1, x2, y2)))
                    # Enforce the same cap as the mask branch; previously a
                    # boxes-only result could exceed MAX_CROPS.
                    if len(crops_pil) >= max_total:
                        break
            if len(crops_pil) >= max_total:
                break

        crops = [_resize_pil(c, MAX_IMAGE_SIZE) for c in crops_pil]
        print(f"🧠 Embedding {len(crops)} object crop(s)...")
        for vec in self._embed_crops_batch(crops):
            extracted.append({"type": "object", "vector": vec})

        # Cache: evict oldest-inserted entries until there is room.
        # pop(..., None) and the except clause tolerate another thread
        # evicting or inserting at the same moment.
        while len(self._cache) >= self._cache_maxsize:
            try:
                self._cache.pop(next(iter(self._cache)), None)
            except (StopIteration, RuntimeError):
                break
        self._cache[cache_key] = extracted
        return extracted

    async def process_image_async(
        self,
        image_path:   str,
        is_query:     bool = False,
        detect_faces: bool = True,
    ) -> list:
        """
        Awaitable wrapper around `process_image`.

        Runs the blocking pipeline in the event loop's default executor so
        the loop stays responsive. Arguments and return value are the same
        as for `process_image`; exceptions raised there propagate to the
        awaiter.
        """
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine.
        loop = asyncio.get_running_loop()
        call = functools.partial(self.process_image, image_path, is_query, detect_faces)
        return await loop.run_in_executor(None, call)