Spaces:

MogensR
/

VideoBackgroundReplacer2

Configuration error

App Files Files Community

MogensR committed on Sep 17, 2025

Commit

b610bfd

1 Parent(s): 0fb1268

ohoy

Browse files

Files changed (3) hide show

integrated_pipeline.py +421 -0
two_stage_pipeline.py +388 -0
ui_core_functionality.py +1 -1

integrated_pipeline.py ADDED Viewed

	@@ -0,0 +1,421 @@

+#!/usr/bin/env python3
+"""
+integrated_pipeline.py - Two-stage pipeline with fallback compatibility
+- Stage 1: SAM2 -> lossless mask stream + metadata, then unload SAM2
+- Stage 2: Read masks -> MatAnyone -> composite -> final output
+- Maintains compatibility with existing UI calls
+"""
+import os
+import sys
+import gc
+import json
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Dict, Any, Optional, Tuple
+import numpy as np
+import cv2
+# Add the parent directory to Python path for imports.
+# The directory holding this file is derived from __file__; its parent is
+# appended to sys.path, i.e. it is searched after every existing entry.
+current_dir = Path(__file__).parent
+parent_dir = current_dir.parent
+sys.path.append(str(parent_dir))
+class TwoStageProcessor:
+    def __init__(self, temp_dir: Optional[str] = None):
+        self.temp_dir = Path(temp_dir) if temp_dir else Path(tempfile.mkdtemp())
+        self.temp_dir.mkdir(exist_ok=True)
+        # Stage outputs
+        self.masks_path = self.temp_dir / "masks.mkv"
+        self.metadata_path = self.temp_dir / "meta.json"
+    def process_video(self, input_video: str, background_video: str,
+                     click_points: list, output_path: str,
+                     use_matanyone: bool = True, progress_callback=None) -> bool:
+        """
+        Main entry point - maintains compatibility with existing UI
+        """
+        try:
+            # Stage 1: Generate masks
+            if progress_callback:
+                progress_callback("Stage 1: Generating masks with SAM2...")
+            if not self._stage1_generate_masks(input_video, click_points, progress_callback):
+                return False
+            # Stage 2: Process and composite
+            if progress_callback:
+                progress_callback("Stage 2: Processing and compositing...")
+            return self._stage2_composite(input_video, background_video,
+                                        output_path, use_matanyone, progress_callback)
+        except Exception as e:
+            print(f"Two-stage processing failed: {e}")
+            return False
+    def _stage1_generate_masks(self, input_video: str, click_points: list,
+                              progress_callback=None) -> bool:
+        """Stage 1: SAM2 mask generation with complete memory cleanup"""
+        try:
+            # Import SAM2 only when needed
+            print("Loading SAM2...")
+            import torch
+            from sam2.build_sam import build_sam2_video_predictor
+            # Initialize SAM2
+            checkpoint = "checkpoints/sam2.1_hiera_large.pt"
+            model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
+            if not os.path.exists(checkpoint):
+                print(f"SAM2 checkpoint not found: {checkpoint}")
+                return False
+            predictor = build_sam2_video_predictor(model_cfg, checkpoint)
+            # Get video info
+            cap = cv2.VideoCapture(input_video)
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+            cap.release()
+            # Save metadata
+            metadata = {
+                "fps": fps,
+                "frame_count": frame_count,
+                "width": width,
+                "height": height,
+                "click_points": click_points
+            }
+            with open(self.metadata_path, 'w') as f:
+                json.dump(metadata, f, indent=2)
+            # Initialize inference state
+            inference_state = predictor.init_state(video_path=input_video)
+            # Add prompts
+            for i, point in enumerate(click_points):
+                x, y = point
+                predictor.add_new_points_or_box(
+                    inference_state=inference_state,
+                    frame_idx=0,
+                    obj_id=i,
+                    points=np.array([[x, y]], dtype=np.float32),
+                    labels=np.array([1], np.int32),
+                )
+            # Setup FFmpeg for lossless mask encoding
+            ffmpeg_cmd = [
+                'ffmpeg', '-y', '-f', 'rawvideo',
+                '-pix_fmt', 'gray', '-s', f'{width}x{height}',
+                '-r', str(fps), '-i', '-',
+                '-c:v', 'ffv1', '-level', '3', '-pix_fmt', 'gray',
+                str(self.masks_path)
+            ]
+            ffmpeg_process = subprocess.Popen(
+                ffmpeg_cmd, stdin=subprocess.PIPE,
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE
+            )
+            # Generate and stream masks
+            print(f"Processing {frame_count} frames...")
+            for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
+                if progress_callback:
+                    progress = (out_frame_idx + 1) / frame_count * 50  # 50% of total progress for stage 1
+                    progress_callback(f"Generating masks... Frame {out_frame_idx + 1}/{frame_count}", progress)
+                # Combine masks from all objects
+                combined_mask = np.zeros((height, width), dtype=np.uint8)
+                for obj_id in out_obj_ids:
+                    mask = (out_mask_logits[obj_id] > 0.0).squeeze()
+                    combined_mask = np.logical_or(combined_mask, mask).astype(np.uint8) * 255
+                # Write to FFmpeg
+                ffmpeg_process.stdin.write(combined_mask.tobytes())
+            # Finalize FFmpeg
+            ffmpeg_process.stdin.close()
+            ffmpeg_process.wait()
+            if ffmpeg_process.returncode != 0:
+                error = ffmpeg_process.stderr.read().decode()
+                print(f"FFmpeg error: {error}")
+                return False
+            print("Stage 1 complete: Masks saved")
+            # CRITICAL: Complete memory cleanup
+            del predictor
+            del inference_state
+            if 'torch' in locals():
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    torch.cuda.synchronize()
+            # Force garbage collection
+            gc.collect()
+            # Clear SAM2 from sys.modules to prevent memory leaks
+            modules_to_clear = [mod for mod in sys.modules.keys() if 'sam2' in mod.lower()]
+            for mod in modules_to_clear:
+                del sys.modules[mod]
+            print("SAM2 completely unloaded from memory")
+            return True
+        except Exception as e:
+            print(f"Stage 1 failed: {e}")
+            return False
+    def _stage2_composite(self, input_video: str, background_video: str,
+                         output_path: str, use_matanyone: bool, progress_callback=None) -> bool:
+        """Stage 2: Read masks, refine with MatAnyone, and composite"""
+        try:
+            # Load metadata
+            with open(self.metadata_path, 'r') as f:
+                metadata = json.load(f)
+            frame_count = metadata["frame_count"]
+            # Read masks back from lossless stream
+            masks = self._read_mask_stream()
+            if masks is None:
+                return False
+            # Optional MatAnyone refinement
+            if use_matanyone:
+                if progress_callback:
+                    progress_callback("Refining masks with MatAnyone...")
+                masks = self._refine_with_matanyone(input_video, masks, progress_callback)
+                if masks is None:
+                    return False
+            # Final composition
+            if progress_callback:
+                progress_callback("Compositing final video...")
+            return self._composite_final_video(input_video, background_video,
+                                             masks, output_path, metadata, progress_callback)
+        except Exception as e:
+            print(f"Stage 2 failed: {e}")
+            return False
+    def _read_mask_stream(self) -> Optional[list]:
+        """Read masks from the lossless FFV1 stream"""
+        try:
+            # Load metadata for dimensions
+            with open(self.metadata_path, 'r') as f:
+                metadata = json.load(f)
+            width = metadata["width"]
+            height = metadata["height"]
+            frame_count = metadata["frame_count"]
+            # Use FFmpeg to decode masks
+            ffmpeg_cmd = [
+                'ffmpeg', '-i', str(self.masks_path),
+                '-f', 'rawvideo', '-pix_fmt', 'gray', '-'
+            ]
+            process = subprocess.Popen(
+                ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            masks = []
+            frame_size = width * height
+            for frame_idx in range(frame_count):
+                frame_data = process.stdout.read(frame_size)
+                if len(frame_data) != frame_size:
+                    print(f"Unexpected frame size at frame {frame_idx}")
+                    break
+                mask = np.frombuffer(frame_data, dtype=np.uint8).reshape((height, width))
+                masks.append(mask)
+            process.stdout.close()
+            process.wait()
+            if process.returncode != 0:
+                error = process.stderr.read().decode()
+                print(f"FFmpeg decode error: {error}")
+                return None
+            print(f"Successfully read {len(masks)} masks from stream")
+            return masks
+        except Exception as e:
+            print(f"Failed to read mask stream: {e}")
+            return None
+    def _refine_with_matanyone(self, input_video: str, masks: list, progress_callback=None) -> Optional[list]:
+        """Apply MatAnyone refinement to masks"""
+        try:
+            # Import MatAnyone only when needed
+            from matanyone.mat_anywhere import matting_inference_video
+            # Create temp directory for MatAnyone
+            matanyone_temp = self.temp_dir / "matanyone"
+            matanyone_temp.mkdir(exist_ok=True)
+            # Save masks as individual frames for MatAnyone
+            mask_dir = matanyone_temp / "masks"
+            mask_dir.mkdir(exist_ok=True)
+            for i, mask in enumerate(masks):
+                cv2.imwrite(str(mask_dir / f"mask_{i:06d}.png"), mask)
+            # Run MatAnyone
+            refined_masks_dir = matanyone_temp / "refined"
+            refined_masks_dir.mkdir(exist_ok=True)
+            success = matting_inference_video(
+                video_path=input_video,
+                mask_dir=str(mask_dir),
+                output_dir=str(refined_masks_dir),
+                progress_callback=progress_callback
+            )
+            if not success:
+                print("MatAnyone refinement failed, using original masks")
+                return masks
+            # Load refined masks
+            refined_masks = []
+            for i in range(len(masks)):
+                refined_path = refined_masks_dir / f"refined_{i:06d}.png"
+                if refined_path.exists():
+                    refined_mask = cv2.imread(str(refined_path), cv2.IMREAD_GRAYSCALE)
+                    refined_masks.append(refined_mask)
+                else:
+                    refined_masks.append(masks[i])  # Fallback to original
+            return refined_masks
+        except Exception as e:
+            print(f"MatAnyone refinement failed: {e}, using original masks")
+            return masks
+    def _composite_final_video(self, input_video: str, background_video: str,
+                              masks: list, output_path: str, metadata: Dict[str, Any],
+                              progress_callback=None) -> bool:
+        """Create final composite video"""
+        try:
+            # Setup video capture
+            fg_cap = cv2.VideoCapture(input_video)
+            bg_cap = cv2.VideoCapture(background_video)
+            fps = metadata["fps"]
+            width = metadata["width"]
+            height = metadata["height"]
+            # Setup output writer
+            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+            frame_idx = 0
+            total_frames = len(masks)
+            while frame_idx < total_frames:
+                # Read frames
+                ret_fg, fg_frame = fg_cap.read()
+                ret_bg, bg_frame = bg_cap.read()
+                if not ret_fg:
+                    break
+                if not ret_bg:
+                    # Loop background if shorter
+                    bg_cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
+                    ret_bg, bg_frame = bg_cap.read()
+                if not ret_bg:
+                    print("No background frame available")
+                    break
+                # Resize background to match foreground
+                bg_frame = cv2.resize(bg_frame, (width, height))
+                # Get mask
+                mask = masks[frame_idx]
+                mask_norm = mask.astype(np.float32) / 255.0
+                mask_3ch = np.stack([mask_norm, mask_norm, mask_norm], axis=-1)
+                # Composite
+                composite = (fg_frame * mask_3ch + bg_frame * (1 - mask_3ch)).astype(np.uint8)
+                out.write(composite)
+                frame_idx += 1
+                if progress_callback and frame_idx % 10 == 0:
+                    progress = 50 + (frame_idx / total_frames) * 50  # 50-100% for stage 2
+                    progress_callback(f"Compositing... Frame {frame_idx}/{total_frames}", progress)
+            # Cleanup
+            fg_cap.release()
+            bg_cap.release()
+            out.release()
+            print(f"Final video saved to: {output_path}")
+            return True
+        except Exception as e:
+            print(f"Final composition failed: {e}")
+            return False
+    def cleanup(self):
+        """Clean up temporary files"""
+        try:
+            if self.temp_dir.exists():
+                import shutil
+                shutil.rmtree(self.temp_dir)
+        except Exception as e:
+            print(f"Cleanup failed: {e}")
+# Compatibility wrapper for existing UI
+def process_video_two_stage(input_video: str, background_video: str,
+                           click_points: list, output_path: str,
+                           use_matanyone: bool = True, progress_callback=None) -> bool:
+    """
+    Drop-in replacement for existing process_video function
+    """
+    processor = TwoStageProcessor()
+    try:
+        result = processor.process_video(
+            input_video, background_video, click_points,
+            output_path, use_matanyone, progress_callback
+        )
+        return result
+    finally:
+        processor.cleanup()
+if __name__ == "__main__":
+    # Test the pipeline
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True)
+    parser.add_argument("--background", required=True)
+    parser.add_argument("--output", required=True)
+    parser.add_argument("--clicks", required=True, help="JSON string of click points")
+    parser.add_argument("--no-matanyone", action="store_true")
+    args = parser.parse_args()
+    click_points = json.loads(args.clicks)
+    use_matanyone = not args.no_matanyone
+    success = process_video_two_stage(
+        args.input, args.background, click_points,
+        args.output, use_matanyone,
+        lambda msg, prog=None: print(f"Progress: {msg} ({prog}%)" if prog else msg)
+    )
+    print("Processing completed!" if success else "Processing failed!")

two_stage_pipeline.py ADDED Viewed

	@@ -0,0 +1,388 @@

+#!/usr/bin/env python3
+"""
+two_stage_pipeline.py — Ephemeral SAM2 stage + MatAnyone stage
+- Stage 1: SAM2 -> lossless mask stream (FFV1 .mkv) + meta.json, then unload SAM2
+- Stage 2: read mask stream -> (optional) MatAnyone refine -> composite -> mux audio
+"""
+import os, sys, gc, json, cv2, time, uuid, torch, shutil, logging, subprocess, threading
+import numpy as np
+from pathlib import Path
+from typing import Optional, Callable, Tuple, Dict, Any
+from PIL import Image
+# Module logger. A stream handler is attached only when the logger has none
+# yet, so executing this setup again does not duplicate the output.
+logger = logging.getLogger("backgroundfx_pro.two_stage")
+if not logger.handlers:
+    h = logging.StreamHandler()
+    h.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s: %(message)s"))
+    logger.addHandler(h)
+logger.setLevel(logging.INFO)
+# ---------------------------
+# Env & CUDA helpers
+# ---------------------------
+def setup_env():
+    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF","expandable_segments:True,max_split_size_mb:256,garbage_collection_threshold:0.7")
+    os.environ.setdefault("OMP_NUM_THREADS","1")
+    os.environ.setdefault("OPENBLAS_NUM_THREADS","1")
+    os.environ.setdefault("MKL_NUM_THREADS","1")
+    torch.set_grad_enabled(False)
+    try:
+        torch.backends.cudnn.benchmark = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        torch.set_float32_matmul_precision("high")
+    except Exception:
+        pass
+    if torch.cuda.is_available():
+        try:
+            torch.cuda.set_per_process_memory_fraction(float(os.getenv("CUDA_MEMORY_FRACTION","0.88")))
+        except Exception:
+            pass
+def free_cuda():
+    if torch.cuda.is_available():
+        torch.cuda.ipc_collect()
+        torch.cuda.empty_cache()
+def unload_sam2_modules():
+    """Aggressively unload SAM2 python modules to reduce RSS."""
+    try:
+        import importlib
+        mods = [m for m in list(sys.modules) if m.startswith("sam2")]
+        for m in mods:
+            sys.modules.pop(m, None)
+        importlib.invalidate_caches()
+        gc.collect()
+        free_cuda()
+        logger.info("SAM2 modules unloaded.")
+    except Exception as e:
+        logger.warning(f"Unloading SAM2 modules: {e}")
+# ---------------------------
+# Video probing
+# ---------------------------
+def probe_video(path:str) -> Tuple[int,int,float,int]:
+    cap = cv2.VideoCapture(path)
+    if not cap.isOpened():
+        raise RuntimeError(f"Cannot open video: {path}")
+    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+    w   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    h   = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    n   = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    cap.release()
+    return w,h,float(fps),n
+# ---------------------------
+# FFmpeg mask writers/readers
+# ---------------------------
+class MaskFFV1Writer:
+    """Write uint8 binary/gray masks to FFV1 lossless .mkv via pipe."""
+    def __init__(self, path:str, w:int, h:int, fps:float):
+        self.path = path
+        self.w, self.h, self.fps = w,h,fps
+        self.proc = None
+    def __enter__(self):
+        cmd = [
+            "ffmpeg","-y","-hide_banner","-loglevel","error",
+            "-f","rawvideo","-pix_fmt","gray","-s",f"{self.w}x{self.h}","-r",f"{self.fps}",
+            "-i","-",
+            "-c:v","ffv1","-level","3","-g","1", self.path
+        ]
+        self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE)
+        return self
+    def write(self, mask_u8: np.ndarray):
+        # Expect HxW uint8 (0/255). Ensure contiguous.
+        if mask_u8.dtype != np.uint8:
+            mask_u8 = mask_u8.astype(np.uint8)
+        self.proc.stdin.write(mask_u8.tobytes())
+    def __exit__(self, exc_type, exc, tb):
+        if self.proc:
+            try:
+                self.proc.stdin.flush()
+                self.proc.stdin.close()
+                self.proc.wait(timeout=120)
+            except Exception:
+                self.proc.kill()
+class MaskFFV1Reader:
+    """Read uint8 masks from FFV1 .mkv via pipe."""
+    def __init__(self, path:str, w:int, h:int):
+        self.path = path
+        self.w,self.h = w,h
+        self.proc = None
+        self.frame_bytes = w*h
+    def __enter__(self):
+        cmd = [
+            "ffmpeg","-hide_banner","-loglevel","error","-i", self.path,
+            "-f","rawvideo","-pix_fmt","gray","-"
+        ]
+        self.proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+        return self
+    def read(self) -> Optional[np.ndarray]:
+        buf = self.proc.stdout.read(self.frame_bytes)
+        if not buf or len(buf) < self.frame_bytes:
+            return None
+        return np.frombuffer(buf, dtype=np.uint8).reshape(self.h, self.w)
+    def __exit__(self, exc_type, exc, tb):
+        if self.proc:
+            try:
+                self.proc.stdout.close()
+                self.proc.wait(timeout=30)
+            except Exception:
+                self.proc.kill()
+# Fallback: PNG sequence (disk heavy but simple & robust)
+class MaskPNGWriter:
+    def __init__(self, dirpath: Path):
+        self.dir = dirpath; self.dir.mkdir(parents=True, exist_ok=True); self.idx=0
+    def write(self, mask_u8: np.ndarray):
+        cv2.imwrite(str(self.dir / f"{self.idx:06d}.png"), mask_u8)
+        self.idx+=1
+class MaskPNGReader:
+    def __init__(self, dirpath: Path):
+        self.dir=dirpath; self.idx=0
+    def read(self) -> Optional[np.ndarray]:
+        p = self.dir / f"{self.idx:06d}.png"
+        if not p.exists(): return None
+        img = cv2.imread(str(p), cv2.IMREAD_GRAYSCALE)
+        self.idx+=1
+        return img
+# ---------------------------
+# Stage 1 — SAM2 → mask dump
+# ---------------------------
+def stage1_dump_masks(video_path:str, out_dir:Path, obj_point:Tuple[int,int]=None) -> Dict[str,Any]:
+    """
+    Run only SAM2, save masks as FFV1 (preferred) or PNG sequence + meta.json.
+    Returns meta dict.
+    """
+    setup_env()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    w,h,fps,n = probe_video(video_path)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    meta = {"video":video_path, "width":w,"height":h,"fps":fps,"frames":n, "storage":None}
+    logger.info(f"[Stage1] {w}x{h}@{fps:.2f} | frames={n}")
+    # Load SAM2 (your wrapper)
+    from models.sam2_loader import SAM2Predictor
+    predictor = SAM2Predictor(device=device)
+    state = predictor.init_state(video_path=video_path)
+    # Prompt: center positive if not provided
+    if obj_point is None:
+        obj_point = (w//2, h//2)
+    pts = np.array([[obj_point[0], obj_point[1]]], dtype=np.float32)
+    labels = np.array([1], dtype=np.int32)
+    ann_obj_id = 1
+    with torch.inference_mode():
+        predictor.add_new_points(state, 0, ann_obj_id, pts, labels)
+    # Preferred: FFV1 mask stream
+    mask_mkv = out_dir / "mask.mkv"
+    use_png = False
+    try:
+        with MaskFFV1Writer(str(mask_mkv), w, h, fps) as writer, \
+             torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16 if device.type=="cuda" else None):
+            for _, out_ids, out_logits in predictor.propagate_in_video(state):
+                # pick ann_obj_id
+                i = None
+                if isinstance(out_ids, torch.Tensor):
+                    nz = (out_ids == ann_obj_id).nonzero(as_tuple=False)
+                    if nz.numel() > 0: i = nz[0].item()
+                else:
+                    ids = list(out_ids);  i = ids.index(ann_obj_id) if ann_obj_id in ids else None
+                if i is None:
+                    # write empty
+                    writer.write(np.zeros((h,w), np.uint8))
+                    continue
+                mask = (out_logits[i] > 0).detach()
+                mask_u8 = (mask.float().mul_(255).to("cpu", non_blocking=True).numpy()).astype(np.uint8)
+                writer.write(mask_u8)
+        meta["storage"] = "ffv1"
+        meta["mask_path"] = str(mask_mkv)
+        logger.info("[Stage1] Masks saved as FFV1 .mkv")
+    except Exception as e:
+        logger.warning(f"FFV1 writer failed ({e}), falling back to PNG sequence.")
+        png_dir = out_dir / "masks_png"
+        wr = MaskPNGWriter(png_dir)
+        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16 if device.type=="cuda" else None):
+            for _, out_ids, out_logits in predictor.propagate_in_video(state):
+                i = None
+                if isinstance(out_ids, torch.Tensor):
+                    nz = (out_ids == ann_obj_id).nonzero(as_tuple=False)
+                    if nz.numel() > 0: i = nz[0].item()
+                else:
+                    ids = list(out_ids);  i = ids.index(ann_obj_id) if ann_obj_id in ids else None
+                if i is None:
+                    wr.write(np.zeros((h,w), np.uint8)); continue
+                mask = (out_logits[i] > 0).detach()
+                wr.write((mask.float().mul_(255).to("cpu").numpy()).astype(np.uint8))
+        meta["storage"] = "png"
+        meta["mask_path"] = str(png_dir)
+    # Persist meta
+    with open(out_dir / "meta.json","w") as f:
+        json.dump(meta, f)
+    # Unload SAM2 completely
+    del predictor, state
+    free_cuda(); unload_sam2_modules()
+    return meta
+# ---------------------------
+# Stage 2 — refine + compose
+# ---------------------------
+def stage2_refine_and_compose(video_path:str, mask_dir:Path, background_image:Image.Image,
+                              out_path:str, use_matany:bool=True) -> str:
+    w,h,fps,n = probe_video(video_path)
+    bg = background_image.resize((w,h), Image.LANCZOS)
+    bg_np = np.array(bg).astype(np.float32)
+    # Read meta
+    with open(mask_dir / "meta.json","r") as f:
+        meta = json.load(f)
+    storage = meta["storage"]; mask_path = meta["mask_path"]
+    # Optional MatAnyone
+    session = None
+    if use_matany:
+        try:
+            from models.matanyone_loader import MatAnyoneSession as _M
+        except Exception:
+            try:
+                from models.matanyone_loader import MatAnyoneLoader as _M
+            except Exception:
+                _M = None
+        if _M:
+            session = _M(device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+            if hasattr(session,"model") and session.model is not None:
+                session.model.eval()
+    # Open video + writer
+    cap = cv2.VideoCapture(video_path)
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    tmp_out = str(Path(out_path).with_suffix(".noaudio.mp4"))
+    writer = cv2.VideoWriter(tmp_out, fourcc, fps, (w,h))
+    # Open mask reader
+    if storage == "ffv1":
+        mreader = MaskFFV1Reader(mask_path, w, h)
+        mreader.__enter__()
+        read_mask = lambda : mreader.read()
+    else:
+        mreader = MaskPNGReader(Path(mask_path))
+        read_mask = lambda : mreader.read()
+    i = 0
+    try:
+        while True:
+            ok, frame_bgr = cap.read()
+            if not ok: break
+            mask_u8 = read_mask()
+            if mask_u8 is None:
+                # out of masks; write original
+                writer.write(frame_bgr); i+=1; continue
+            # Optional refine
+            if session is not None:
+                try:
+                    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+                    # Provide a float mask 0..1 to session; adapt if your API differs
+                    mask_f = (mask_u8.astype(np.float32) / 255.0)
+                    if hasattr(session,"refine_mask"):
+                        mask_refined = session.refine_mask(frame_rgb, mask_f)
+                    elif hasattr(session,"process_frame"):
+                        mask_refined = session.process_frame(frame_rgb, mask_f)
+                    else:
+                        mask_refined = mask_f
+                    if isinstance(mask_refined, torch.Tensor):
+                        mask_u8 = (mask_refined.detach().clamp(0,1).mul(255).to("cpu").numpy()).astype(np.uint8)
+                    elif isinstance(mask_refined, np.ndarray):
+                        mask_u8 = (np.clip(mask_refined,0,1)*255).astype(np.uint8)
+                except Exception as e:
+                    logger.debug(f"MatAnyone refine failed @frame {i}: {e}")
+            # Composite
+            m = (mask_u8.astype(np.float32)/255.0)[...,None]  # HxWx1
+            fr = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB).astype(np.float32)
+            comp = fr*m + bg_np*(1.0-m)
+            comp_bgr = cv2.cvtColor(comp.astype(np.uint8), cv2.COLOR_RGB2BGR)
+            writer.write(comp_bgr)
+            if i % 50 == 0:
+                logger.info(f"[Stage2] frame {i}/{n}")
+            i += 1
+    finally:
+        cap.release(); writer.release()
+        if isinstance(mreader, MaskFFV1Reader):
+            mreader.__exit__(None,None,None)
+    # Mux audio
+    final_out = str(Path(out_path))
+    cmd = [
+        "ffmpeg","-y","-hide_banner","-loglevel","error",
+        "-i", tmp_out, "-i", video_path,
+        "-map","0:v:0","-map","1:a:0","-c:v","copy","-c:a","aac","-shortest", final_out
+    ]
+    try:
+        r = subprocess.run(cmd, capture_output=True, text=True, timeout=180)
+        if r.returncode != 0:
+            logger.warning(f"Audio mux failed: {r.stderr.strip()}")
+            shutil.move(tmp_out, final_out)
+        else:
+            os.remove(tmp_out)
+    except Exception:
+        shutil.move(tmp_out, final_out)
+    return final_out
+# ---------------------------
+# Orchestrator
+# ---------------------------
+def process_two_stage(
+    video_path:str,
+    background_image: Image.Image,
+    workdir: Optional[Path]=None,
+    progress: Optional[Callable[[str,float],None]] = None,
+    use_matany: bool = True,
+) -> str:
+    setup_env()
+    if workdir is None:
+        workdir = Path.cwd()/ "tmp" / f"job_{uuid.uuid4().hex[:8]}"
+    workdir.mkdir(parents=True, exist_ok=True)
+    # Stage 1
+    if progress: progress("Stage 1: SAM2 mask pass", 0.05)
+    mask_dir = workdir / "sam2_masks"
+    meta = stage1_dump_masks(video_path, mask_dir)
+    if progress: progress("Stage 1 complete", 0.45)
+    # Stage 2
+    if progress: progress("Stage 2: refine + compose", 0.50)
+    out_path = workdir / f"final_{int(time.time())}.mp4"
+    final_video = stage2_refine_and_compose(video_path, mask_dir, background_image, str(out_path), use_matany=use_matany)
+    if progress: progress("Done", 1.0)
+    logger.info(f"Output: {final_video}")
+    return final_video
+# ---------------------------
+# CLI
+# ---------------------------
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Two-stage BackgroundFX Pro")
+    parser.add_argument("--video", required=True)
+    parser.add_argument("--background", required=True)
+    parser.add_argument("--outdir", default=None)
+    parser.add_argument("--no-matany", action="store_true")
+    args = parser.parse_args()
+    bg = Image.open(args.background).convert("RGB")
+    out = process_two_stage(args.video, bg, Path(args.outdir) if args.outdir else None, use_matany=not args.no_matany)
+    print(out)

ui_core_functionality.py CHANGED Viewed

@@ -451,7 +451,7 @@ def process_video_pipeline(
     """Process video using the hardened pipeline"""
     try:
         # Lazy import to avoid startup issues
-        from pipeline import process as pipeline_process
         logger.info(f"🎬 Starting pipeline processing in {job_dir}")
         progress_tracker.update("Initializing pipeline...")

     """Process video using the hardened pipeline"""
     try:
         # Lazy import to avoid startup issues
+        from two_stage_pipeline import process as pipeline_process
         logger.info(f"🎬 Starting pipeline processing in {job_dir}")
         progress_tracker.update("Initializing pipeline...")