Spaces:

matthartman
/

red-lab-91

Runtime error

App Files Files Community

matthartman committed on Nov 6

Commit

a9d4b99

verified ·

1 Parent(s): 8074ca3

Deploy Gradio app with multiple files

Browse files

Files changed (20) hide show

app.py +675 -0
generate.py +235 -0
requirements.txt +29 -0
wan/__init__.py +7 -0
wan/animate.py +663 -0
wan/configs/__init__.py +50 -0
wan/configs/shared_config.py +20 -0
wan/configs/wan_animate_14B.py +40 -0
wan/configs/wan_i2v_A14B.py +37 -0
wan/configs/wan_s2v_14B.py +59 -0
wan/configs/wan_t2v_A14B.py +37 -0
wan/configs/wan_ti2v_5B.py +36 -0
wan/distributed/__init__.py +1 -0
wan/distributed/fsdp.py +45 -0
wan/distributed/sequence_parallel.py +176 -0
wan/distributed/ulysses.py +47 -0
wan/distributed/util.py +51 -0
wan/image2video.py +431 -0
wan/modules/__init__.py +19 -0
wan/modules/animate/__init__.py +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,675 @@

+import spaces
+from huggingface_hub import snapshot_download, hf_hub_download
+import os
+import subprocess
+import importlib, site
+from PIL import Image
+import uuid
+import shutil
+import time
+import cv2
+import json
+import gradio as gr
+import sys
+import gc
+# Absolute directory that contains this file.
+BASE = os.path.dirname(os.path.abspath(__file__))
+# Folder holding the preprocessing scripts; it is appended to sys.path,
+# presumably so `preprocess_data` can be imported as a top-level module.
+PREPROCESS_DIR = os.path.join(BASE, "wan", "modules", "animate", "preprocess")
+sys.path.append(PREPROCESS_DIR)
+# Re-discover all .pth/.egg-link files
+for sitedir in site.getsitepackages():
+    site.addsitedir(sitedir)
+# Clear caches so importlib will pick up new modules
+importlib.invalidate_caches()
+def sh(cmd):
+    """Run ``cmd`` through the system shell; raise CalledProcessError on a non-zero exit."""
+    subprocess.check_call(cmd, shell=True)
+# Runtime dependency setup: install flash-attn with pip and download the
+# alexnasa/sam2_C repository into the working directory. This runs at import
+# time, before the modules that need these packages are imported.
+try:
+    sh("pip install flash-attn --no-build-isolation")
+    # Earlier approach (building sam2 from source on the GPU), kept for reference:
+    # print("Attempting to download and build sam2...")
+    # print("download sam")
+    # sam_dir = snapshot_download(repo_id="alexnasa/sam2")
+    # @spaces.GPU(duration=500)
+    # def install_sam():
+    #     os.environ["TORCH_CUDA_ARCH_LIST"] = "9.0"
+    #     sh(f"cd {sam_dir} && python setup.py build_ext --inplace && pip install -e .")
+    # print("install sam")
+    # install_sam()
+    print("Attempting to download")
+    print("download sam")
+    snapshot_download(repo_id="alexnasa/sam2_C", local_dir=f"{os.getcwd()}" )
+    # tell Python to re-scan site-packages now that the egg-link exists
+    # (importlib and site are already imported at the top of the file)
+    site.addsitedir(site.getsitepackages()[0])
+    importlib.invalidate_caches()
+    print("sam2 installed successfully.")
+except Exception as e:
+    # Report the underlying failure and chain it, so the logs show which step
+    # (pip install or download) actually broke instead of only a generic message.
+    print(f"Dependency setup failed: {e!r}")
+    raise gr.Error("sam2 installation failed") from e
+# These imports come after the dependency setup above on purpose: they are
+# only resolvable once the packages installed/downloaded there are in place.
+import torch
+from generate import generate, load_model
+from preprocess_data import run as run_preprocess
+from preprocess_data import load_preprocess_models
+print(f"Torch version: {torch.__version__}")
+# Root folder for per-session working directories (one sub-folder per session id).
+os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/processed_results"
+# Fetch the model checkpoint and build the pipeline once, at import time.
+snapshot_download(repo_id="Wan-AI/Wan2.2-Animate-14B", local_dir="./Wan2.2-Animate-14B")
+wan_animate = load_model(True)
+# Maps the radio-button label shown in the UI to the boolean "replace" flag
+# passed to preprocessing and generation.
+rc_mapping = {
+    "Video → Ref Image" : False,
+    "Video ← Ref Image" : True
+}
+def preprocess_video(input_video_path, duration, session_id=None):
+    """
+    Copy a trimmed version of the input video into the session's work folder.
+    Args:
+        input_video_path (str): Path to the uploaded video.
+        duration: Maximum clip length in seconds, forwarded to clip_and_set_fps.
+        session_id (str, optional): Session folder name; a random hex id is
+                                    generated when omitted.
+    Returns:
+        str: Path of the trimmed clip (``input_video.mp4`` in the session folder).
+    """
+    sid = uuid.uuid4().hex if session_id is None else session_id
+    session_dir = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
+    os.makedirs(session_dir, exist_ok=True)
+    trimmed_path = os.path.join(session_dir, 'input_video.mp4')
+    clip_and_set_fps(input_video_path, trimmed_path, duration_s=duration)
+    return trimmed_path
+def extract_audio_from_video_ffmpeg(video_path, output_wav_path, sample_rate=None):
+    """
+    Extracts the audio track from a video file and saves it as a WAV file.
+    Args:
+        video_path (str): Path to the input video file.
+        output_wav_path (str): Path to save the extracted WAV file.
+        sample_rate (int, optional): Output sample rate (e.g., 16000).
+                                     If None, keep the original.
+    Returns:
+        bool: True when ffmpeg succeeded, False when it exited with an error.
+    """
+    cmd = [
+        'ffmpeg',
+        '-i', video_path,             # Input video
+        '-vn',                        # Disable video
+        '-acodec', 'pcm_s16le',       # 16-bit PCM (WAV format)
+        '-ac', '1',                   # Mono channel (use '2' for stereo)
+        '-y',                         # Overwrite output
+        '-loglevel', 'error'          # Cleaner output
+    ]
+    # Only add the sample rate option if explicitly specified
+    if sample_rate is not None:
+        cmd.extend(['-ar', str(sample_rate)])
+    cmd.append(output_wav_path)
+    try:
+        subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        # Best effort: callers treat False as "no usable audio". Log why ffmpeg
+        # failed instead of silently discarding its error output.
+        print(f"ffmpeg audio extraction failed ({e.returncode}): {e.stderr.strip()}")
+        return False
+    return True
+def combine_video_and_audio_ffmpeg(video_path, audio_path, output_video_path):
+    """
+    Mux a silent MP4 and a WAV track into one MP4 with sound.
+    The video stream is copied as-is, the audio is encoded to AAC, and the
+    output ends with the shorter of the two inputs.
+    Args:
+        video_path (str): Path to the silent video file.
+        audio_path (str): Path to the WAV audio file.
+        output_video_path (str): Path to save the output MP4 with audio.
+    Raises:
+        RuntimeError: If ffmpeg exits with a non-zero status.
+    """
+    inputs = ['-i', video_path, '-i', audio_path]
+    codecs = ['-c:v', 'copy', '-c:a', 'aac']      # copy video, AAC audio
+    flags = ['-shortest', '-y', '-loglevel', 'error']
+    ffmpeg_args = ['ffmpeg', *inputs, *codecs, *flags, output_video_path]
+    try:
+        subprocess.run(ffmpeg_args, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(f"ffmpeg failed ({e.returncode}): {e.stderr.strip()}")
+def clip_and_set_fps(input_video_path, output_video_path, duration_s=2, target_fps=30):
+    """
+    Cut the input to its first ``duration_s`` seconds and re-encode it as
+    H.264/AAC MP4, without resizing.
+    - When target_fps is None no fps filter is applied (original FPS is kept).
+    - Otherwise an ``fps=<target_fps>`` video filter is added.
+    Raises:
+        RuntimeError: If ffmpeg exits with a non-zero status.
+    """
+    fps_filter = ["-vf", f"fps={target_fps}"] if target_fps is not None else []
+    cmd = [
+        "ffmpeg", "-nostdin", "-hide_banner", "-y",
+        "-i", input_video_path,
+        "-t", str(duration_s),
+        *fps_filter,
+        "-c:v", "libx264",
+        "-pix_fmt", "yuv420p",
+        "-preset", "veryfast",
+        "-crf", "18",
+        "-c:a", "aac",          # use aac so MP4 stays compatible
+        "-movflags", "+faststart",
+        output_video_path,
+    ]
+    try:
+        subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(f"ffmpeg failed ({e.returncode}): {e.stderr.strip()}")
+def is_portrait(video_file):
+    """
+    Report whether a video is taller than it is wide.
+    Args:
+        video_file (str): Path to the video file.
+    Returns:
+        bool: True when the frame width is smaller than the frame height.
+              False is returned (after a Gradio warning) when the file
+              cannot be opened.
+    """
+    cap = cv2.VideoCapture(video_file)
+    try:
+        if not cap.isOpened():
+            gr.Warning("Cannot open video file")
+            # Nothing to measure on an unopened capture; treat it as landscape.
+            return False
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    finally:
+        # Always free the capture handle, even if a property query raises.
+        cap.release()
+    return width < height
+def calculate_time_required(max_duration_s, rc_bool):
+    """
+    Return the GPU time budget, in seconds, for a clip of ``max_duration_s``.
+    Args:
+        max_duration_s: Clip length in seconds (the UI offers 2, 4, 6, 8, 10).
+        rc_bool: Replace/retarget flag; accepted for signature parity with the
+                 callers but not used in the calculation.
+    Returns:
+        int: Budget of the smallest known tier that covers the clip length.
+             Lengths beyond the last tier get the largest budget, so the
+             function never returns None.
+    """
+    # (clip length in seconds, GPU seconds) pairs, shortest clip first.
+    budgets = ((2, 120), (4, 180), (6, 260), (8, 330), (10, 340))
+    for clip_s, gpu_s in budgets:
+        if max_duration_s <= clip_s:
+            return gpu_s
+    # Longer than every known tier: fall back to the largest budget.
+    return budgets[-1][1]
+def get_display_time_required(max_duration_s, rc_bool):
+    """Return the GPU time shown to the user: the reserved budget minus a 30 s margin."""
+    # the 30 seconds extra is just for safety in case of an unexpected slow down
+    safety_margin_s = 30
+    return calculate_time_required(max_duration_s, rc_bool) - safety_margin_s
+def update_time_required(max_duration_s, rc_str):
+    """Build the Gradio update that refreshes the "Zero GPU Required" text.
+    ``rc_str`` is the radio label; it is translated to the boolean flag via rc_mapping.
+    """
+    seconds = get_display_time_required(max_duration_s, rc_mapping[rc_str])
+    minutes = seconds / 60
+    return gr.update(value=f"⌚ Zero GPU Required: ~{seconds}.0s ({minutes:.1f} mins)")
+def get_duration(input_video, max_duration_s, edited_frame, rc_bool, session_id, progress):
+    """Return the GPU seconds to reserve for one `_animate` call.
+    Passed as `duration=` to the `spaces.GPU` decorator on `_animate`, so it
+    takes the same parameters; only max_duration_s and rc_bool are used.
+    """
+    return calculate_time_required(max_duration_s, rc_bool)
+@spaces.GPU(duration=get_duration)
+def _animate(input_video, max_duration_s, edited_frame, rc_bool, session_id = None, progress=gr.Progress(track_tqdm=True),):
+    """
+    Run preprocessing and video generation for one request on the GPU.
+    Args:
+        input_video (str): Path to the (already trimmed) driving video.
+        max_duration_s: Clip length; only used by the GPU duration callback.
+        edited_frame (str): Path to the reference image.
+        rc_bool (bool): True selects the "replace_flag" preprocessing tag,
+                        False selects "retarget_flag".
+        session_id (str, optional): Session folder name; random when omitted.
+        progress: Gradio progress tracker.
+    Returns:
+        str: Path of the generated ``result.mp4`` in the session folder.
+    """
+    sid = uuid.uuid4().hex if session_id is None else session_id
+    session_dir = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
+    os.makedirs(session_dir, exist_ok=True)
+    preprocess_dir = os.path.join(session_dir, "preprocess_dir")
+    os.makedirs(preprocess_dir, exist_ok=True)
+    result_path = os.path.join(session_dir, 'result.mp4')
+    # --- Measure preprocess time ---
+    preprocess_started = time.time()
+    # Pick the working resolution from the clip orientation.
+    w, h = (480, 832) if is_portrait(input_video) else (832, 480)
+    tag_string = "replace_flag" if rc_bool else "retarget_flag"
+    preprocess_model = load_preprocess_models()
+    run_preprocess(preprocess_model, input_video, edited_frame, preprocess_dir, w, h, tag_string)
+    preprocess_time = time.time() - preprocess_started
+    print(f"Preprocess took {preprocess_time:.2f} seconds")
+    # --- Measure generate time ---
+    generate_started = time.time()
+    generate(wan_animate, preprocess_dir, result_path, rc_bool)
+    generate_time = time.time() - generate_started
+    print(f"Generate took {generate_time:.2f} seconds")
+    # --- Optional total time ---
+    print(f"Total time: {preprocess_time + generate_time:.2f} seconds")
+    # Release Python garbage and cached CUDA memory before returning.
+    gc.collect()
+    torch.cuda.empty_cache()
+    return result_path
+def animate_scene(input_video, max_duration_s, edited_frame, rc_str, use_ai_image, ai_prompt, session_id = None, progress=gr.Progress(track_tqdm=True),):
+    """
+    Gradio entry point: validate the inputs, run the animation and return
+    every video shown in the UI.
+    Args:
+        input_video (str): Path to the uploaded driving video.
+        max_duration_s: Maximum clip length in seconds.
+        edited_frame (str): Path to the uploaded reference image (may be empty
+                            when AI generation is enabled).
+        rc_str (str): Radio label, translated to the replace flag via rc_mapping.
+        use_ai_image (bool): Generate the reference image from ``ai_prompt``.
+        ai_prompt (str): Prompt for the AI-generated reference image.
+        session_id (str, optional): Session folder name; random when omitted.
+        progress: Gradio progress tracker, forwarded to ``_animate``.
+    Returns:
+        tuple: (final video, pose video, background video, mask video,
+                face video) paths.
+    Raises:
+        gr.Error: If a required input is missing.
+    """
+    if not input_video:
+        raise gr.Error("Please provide a video")
+    if not use_ai_image and not edited_frame:
+        raise gr.Error("Please provide an image or enable AI generation")
+    if use_ai_image and not ai_prompt:
+        raise gr.Error("Please provide a prompt for AI image generation")
+    if session_id is None:
+        session_id = uuid.uuid4().hex
+    input_video = preprocess_video(input_video, max_duration_s, session_id)
+    rc_bool = rc_mapping[rc_str]
+    output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
+    os.makedirs(output_dir, exist_ok=True)
+    input_audio_path = os.path.join(output_dir, 'input_audio.wav')
+    # False when the clip has no extractable audio; the result then stays silent.
+    audio_extracted = extract_audio_from_video_ffmpeg(input_video, input_audio_path)
+    edited_frame_png = os.path.join(output_dir, 'edited_frame.png')
+    if use_ai_image:
+        # Generate image using AI model
+        generate_ai_image(ai_prompt, session_id).save(edited_frame_png)
+    else:
+        # Re-save the upload as PNG; the context manager closes the source file.
+        with Image.open(edited_frame) as edited_frame_img:
+            edited_frame_img.save(edited_frame_png)
+    print(f'{session_id} inference started')
+    output_video_path = _animate(input_video, max_duration_s, edited_frame_png, rc_bool, session_id, progress)
+    final_video_path = os.path.join(output_dir, 'final_result.mp4')
+    preprocess_dir = os.path.join(output_dir, "preprocess_dir")
+    pose_video =  os.path.join(preprocess_dir, 'src_pose.mp4')
+    if rc_bool:
+        mask_video = os.path.join(preprocess_dir, 'src_mask.mp4')
+        bg_video = os.path.join(preprocess_dir, 'src_bg.mp4')
+        face_video = os.path.join(preprocess_dir, 'src_face.mp4')
+    else:
+        # Outside replace mode the pose video is shown in all preview slots.
+        mask_video = os.path.join(preprocess_dir, 'src_pose.mp4')
+        bg_video = os.path.join(preprocess_dir, 'src_pose.mp4')
+        face_video = os.path.join(preprocess_dir, 'src_pose.mp4')
+    if audio_extracted:
+        combine_video_and_audio_ffmpeg(output_video_path, input_audio_path, final_video_path)
+    else:
+        final_video_path = output_video_path
+    print(f"task for {session_id} finalised")
+    return final_video_path, pose_video, bg_video, mask_video, face_video
+# Custom stylesheet handed to gr.Blocks(css=...): layout containers, the
+# animated gradient button and a pill-shaped toggle.
+css = """
+    #col-container {
+        margin: 0 auto;
+        max-width: 1600px;
+    }
+    #step-column {
+        padding: 10px;
+        border-radius: 8px;
+        box-shadow: var(--card-shadow);
+        margin: 10px;
+    }
+    #col-showcase {
+        margin: 0 auto;
+        max-width: 1100px;
+    }
+    .button-gradient {
+        background: linear-gradient(45deg, rgb(255, 65, 108), rgb(255, 75, 43), rgb(255, 155, 0), rgb(255, 65, 108)) 0% 0% / 400% 400%;
+        border: none;
+        padding: 14px 28px;
+        font-size: 16px;
+        font-weight: bold;
+        color: white;
+        border-radius: 10px;
+        cursor: pointer;
+        transition: 0.3s ease-in-out;
+        animation: 2s linear 0s infinite normal none running gradientAnimation;
+        box-shadow: rgba(255, 65, 108, 0.6) 0px 4px 10px;
+    }
+    .toggle-container {
+    display: inline-flex;
+    background-color: #ffd6ff;  /* light pink background */
+    border-radius: 9999px;
+    padding: 4px;
+    position: relative;
+    width: fit-content;
+    font-family: sans-serif;
+    }
+    .toggle-container input[type="radio"] {
+    display: none;
+    }
+    .toggle-container label {
+    position: relative;
+    z-index: 2;
+    flex: 1;
+    text-align: center;
+    font-weight: 700;
+    color: #4b2ab5; /* dark purple text for unselected */
+    padding: 6px 22px;
+    border-radius: 9999px;
+    cursor: pointer;
+    transition: color 0.25s ease;
+    }
+    /* Moving highlight */
+    .toggle-highlight {
+    position: absolute;
+    top: 4px;
+    left: 4px;
+    width: calc(50% - 4px);
+    height: calc(100% - 8px);
+    background-color: #4b2ab5; /* dark purple background */
+    border-radius: 9999px;
+    transition: transform 0.25s ease;
+    z-index: 1;
+    }
+    /* When "True" is checked */
+    #true:checked ~ label[for="true"] {
+    color: #ffd6ff; /* light pink text */
+    }
+    /* When "False" is checked */
+    #false:checked ~ label[for="false"] {
+    color: #ffd6ff; /* light pink text */
+    }
+    /* Move highlight to right side when False is checked */
+    #false:checked ~ .toggle-highlight {
+    transform: translateX(100%);
+    }
+    """
+def log_change(log_source, session_id, meta_data = None):
+    """Print which UI input a session changed, plus the new value when one is given."""
+    # A falsy meta_data (None, 0, "") is treated as "no value to report".
+    suffix = f' with {meta_data}' if meta_data else ''
+    print(f'{session_id} changed {log_source}{suffix}')
+def generate_ai_image(prompt, session_id):
+    """
+    Generate a reference image from a text prompt (not implemented yet).
+    Placeholder: the function currently always raises, so enabling
+    "Generate Image with AI" in the UI ends in an error message.
+    Args:
+        prompt (str): Text description of the desired image.
+        session_id (str): Id of the requesting session (currently unused).
+    Raises:
+        gr.Error: Always, until an image model is wired in.
+    """
+    # TODO: Implement actual AI image generation
+    # Example using a hypothetical image generation model:
+    # from diffusers import StableDiffusionPipeline
+    # pipe = StableDiffusionPipeline.from_pretrained("model_name")
+    # image = pipe(prompt).images[0]
+    # For now, return a placeholder
+    raise gr.Error("AI image generation not yet implemented. Please upload an image instead.")
+def start_session(request: gr.Request):
+    """Return the request's session hash; stored in `session_state` on page load."""
+    return request.session_hash
+def cleanup(request: gr.Request):
+    """Delete the working folder of the session that just disconnected."""
+    sid = request.session_hash
+    if not sid:
+        return
+    print(f"{sid} left")
+    session_dir = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
+    # ignore_errors: the folder may never have been created for this session.
+    shutil.rmtree(session_dir, ignore_errors=True)
+# UI definition: three step columns (video, reference image, result), a row of
+# end-to-end examples, then the event wiring.
+with gr.Blocks(css=css, title="Wan 2.2 Animate --replace", theme=gr.themes.Ocean()) as demo:
+    # Holds the per-browser session hash, filled in on page load.
+    session_state = gr.State()
+    demo.load(start_session, outputs=[session_state])
+    with gr.Column(elem_id="col-container"):
+        with gr.Row():
+            gr.HTML(
+                """
+                <div style="text-align: center;">
+                    <p style="font-size:16px; display: inline; margin: 0;">
+                        <strong>Wan2.2-Animate-14B </strong>
+                    </p>
+                    <a href="https://huggingface.co/Wan-AI/Wan2.2-Animate-14B" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
+                        [Model]
+                    </a>
+                    <p style="font-size:16px; display: inline; margin: 0;">
+                        -- HF Space By:
+                    </p>
+                    <a href="https://huggingface.co/alexnasa" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
+                        <img src="https://img.shields.io/badge/🤗-Follow Me-yellow.svg">
+                    </a>
+                </div>
+                """
+            )
+        with gr.Row():
+            # Step 1: driving video.
+            with gr.Column(elem_id="step-column"):
+                gr.HTML("""
+                <div>
+                    <span style="font-size: 24px;">1. Upload a Video</span><br>
+                </div>
+                """)
+                input_video = gr.Video(label="Input Video", height=512)
+                # Hidden slider: the clip length stays at its default of 2 s
+                # unless an example row sets it.
+                max_duration_slider = gr.Slider(2, 10, 2, step=2, label="Max Duration", visible=False)
+                gr.Examples(
+                    examples=[
+                        [
+                            "./examples/martialart.mp4",
+                        ],
+                        [
+                            "./examples/test_example.mp4",
+                        ],
+                    ],
+                    inputs=[input_video],
+                    cache_examples=False,
+                )
+            # Step 2: reference image, uploaded or (placeholder) AI-generated.
+            with gr.Column(elem_id="step-column"):
+                gr.HTML("""
+                <div>
+                    <span style="font-size: 24px;">2. Upload or Generate Ref Image</span><br>
+                </div>
+                """)
+                use_ai_image = gr.Checkbox(label="Generate Image with AI", value=False)
+                with gr.Group() as upload_group:
+                    edited_frame = gr.Image(label="Ref Image", type="filepath", height=512)
+                with gr.Group(visible=False) as ai_group:
+                    ai_prompt = gr.Textbox(label="AI Image Prompt", placeholder="Describe the image you want to generate...")
+                    # NOTE: generate_btn and ai_generated_preview have no event
+                    # handlers attached anywhere in this block.
+                    generate_btn = gr.Button("Generate Image", variant="secondary")
+                    ai_generated_preview = gr.Image(label="Generated Preview", type="pil", height=512)
+                default_replace_string = "Video ← Ref Image"
+                replace_character_string = gr.Radio(
+                    ["Video → Ref Image", "Video ← Ref Image"], value=default_replace_string, show_label=False
+                )
+                def toggle_image_input(use_ai):
+                    """Show the upload group or the AI prompt group, never both."""
+                    return gr.update(visible=not use_ai), gr.update(visible=use_ai)
+                use_ai_image.change(
+                    toggle_image_input,
+                    inputs=[use_ai_image],
+                    outputs=[upload_group, ai_group]
+                )
+                gr.Examples(
+                    examples=[
+                        [
+                            "./examples/ali.png",
+                        ],
+                        [
+                            "./examples/amber.png",
+                        ],
+                        [
+                            "./examples/ella.png",
+                        ],
+                        [
+                            "./examples/sydney.png",
+                        ],
+                    ],
+                    inputs=[edited_frame],
+                    cache_examples=False,
+                )
+            # Step 3: run button and outputs.
+            with gr.Column(elem_id="step-column"):
+                gr.HTML("""
+                <div>
+                    <span style="font-size: 24px;">3. Wan Animate it!</span><br>
+                </div>
+                """)
+                output_video = gr.Video(label="Edited Video", height=512)
+                # Initial estimate for the default 2 s clip. The radio label is
+                # translated to the boolean flag first, as update_time_required does.
+                duration_s = get_display_time_required(2, rc_mapping[default_replace_string])
+                duration_m = duration_s / 60
+                time_required_label = f"⌚ Zero GPU Required: ~{duration_s}.0s ({duration_m:.1f} mins)"
+                time_required = gr.Text(value=time_required_label, show_label=False, visible=False)
+                action_button = gr.Button("Wan Animate 🦆", variant='primary', elem_classes="button-gradient")
+                with gr.Accordion("Preprocessed Data", open=False, visible=True):
+                    with gr.Row():
+                        pose_video = gr.Video(label="Pose Video")
+                        bg_video = gr.Video(label="Background Video")
+                        face_video = gr.Video(label="Face Video")
+                        mask_video = gr.Video(label="Mask Video")
+        with gr.Row():
+            with gr.Column(elem_id="col-showcase"):
+                # End-to-end examples. Each row is
+                # [video, duration, ref image, mode label, use_ai_image, ai_prompt];
+                # no session id is passed, so animate_scene generates one.
+                gr.Examples(
+                    examples=[
+                        [
+                            "./examples/okay.mp4",
+                            2,
+                            "./examples/amber.png",
+                            "Video ← Ref Image",
+                            False,
+                            ""
+                        ],
+                        [
+                            "./examples/superman.mp4",
+                            2,
+                            "./examples/superman.png",
+                            "Video ← Ref Image",
+                            False,
+                            ""
+                        ],
+                        [
+                            "./examples/test_example.mp4",
+                            2,
+                            "./examples/ella.png",
+                            "Video ← Ref Image",
+                            False,
+                            ""
+                        ],
+                        [
+                            "./examples/paul.mp4",
+                            2,
+                            "./examples/man.png",
+                            "Video → Ref Image",
+                            False,
+                            ""
+                        ],
+                        [
+                            "./examples/desi.mp4",
+                            2,
+                            "./examples/desi.png",
+                            "Video ← Ref Image",
+                            False,
+                            ""
+                        ],
+                    ],
+                    inputs=[input_video, max_duration_slider, edited_frame, replace_character_string, use_ai_image, ai_prompt],
+                    outputs=[output_video, pose_video, bg_video, mask_video, face_video],
+                    fn=animate_scene,
+                    cache_examples=True,
+                )
+    # Event wiring. The output order matches animate_scene's return tuple.
+    action_button.click(fn=animate_scene, inputs=[input_video, max_duration_slider, edited_frame, replace_character_string, use_ai_image, ai_prompt, session_state], outputs=[output_video, pose_video, bg_video, mask_video, face_video])
+    replace_character_string.change(update_time_required, inputs=[max_duration_slider, replace_character_string], outputs=[time_required])
+    max_duration_slider.change(log_change, inputs=[gr.State("slider"), session_state, max_duration_slider]).then(update_time_required, inputs=[max_duration_slider, replace_character_string], outputs=[time_required])
+    input_video.change(log_change, inputs=[gr.State("video"), session_state])
+    edited_frame.change(log_change, inputs=[gr.State("ref image"), session_state])
+# Script entry point: enable the request queue, register the per-session
+# cleanup for when a client disconnects, then start the server.
+if __name__ == "__main__":
+    demo.queue()
+    demo.unload(cleanup)
+    demo.launch(ssr_mode=False, share=True)

generate.py ADDED Viewed

	@@ -0,0 +1,235 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import argparse
+import logging
+import os
+import sys
+import warnings
+from datetime import datetime
+warnings.filterwarnings('ignore')
+import random
+import torch
+import torch.distributed as dist
+from PIL import Image
+import wan
+from wan.configs import MAX_AREA_CONFIGS, SIZE_CONFIGS, SUPPORTED_SIZES, WAN_CONFIGS
+from wan.distributed.util import init_distributed_group
+from wan.utils.prompt_extend import DashScopePromptExpander, QwenPromptExpander
+from wan.utils.utils import merge_video_audio, save_video, str2bool
+# Per-task fallback inputs. _validate_args fills in any of these values that
+# the caller left as None; the key is the task name.
+EXAMPLE_PROMPT = {
+    "t2v-A14B": {
+        "prompt":
+            "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage.",
+    },
+    "i2v-A14B": {
+        "prompt":
+            "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside.",
+        "image":
+            "examples/i2v_input.JPG",
+    },
+    "ti2v-5B": {
+        "prompt":
+            "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage.",
+    },
+    "animate-14B": {
+        "prompt": "视频中的人在做动作",
+        "video": "",
+        "pose": "",
+        "mask": "",
+    },
+    "s2v-14B": {
+        "prompt":
+            "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside.",
+        "image":
+            "examples/i2v_input.JPG",
+        "audio":
+            "examples/talk.wav",
+        "tts_prompt_audio":
+            "examples/zero_shot_prompt.wav",
+        "tts_prompt_text":
+            "希望你以后能够做的比我还好呦。",
+        "tts_text":
+            "收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。"
+    },
+}
+def _validate_args(args):
+    """
+    Check ``args`` and fill every unset option in place.
+    Missing prompt/image/audio/TTS values come from EXAMPLE_PROMPT for the
+    task; missing sampling options come from the task's WAN_CONFIGS entry;
+    a negative base_seed is replaced by a random one.
+    Raises:
+        AssertionError: On a missing checkpoint dir, an unknown task, a missing
+                        i2v image, or an unsupported size (non-s2v tasks).
+    """
+    # Basic check
+    assert args.ckpt_dir is not None, "Please specify the checkpoint directory."
+    assert args.task in WAN_CONFIGS, f"Unsupport task: {args.task}"
+    assert args.task in EXAMPLE_PROMPT, f"Unsupport task: {args.task}"
+    example = EXAMPLE_PROMPT[args.task]
+    if args.prompt is None:
+        args.prompt = example["prompt"]
+    if args.image is None and "image" in example:
+        args.image = example["image"]
+    if args.audio is None and args.enable_tts is False and "audio" in example:
+        args.audio = example["audio"]
+    if (args.tts_prompt_audio is None or args.tts_text is None) and args.enable_tts is True and "audio" in example:
+        # All three TTS inputs are taken from the example together.
+        for tts_key in ("tts_prompt_audio", "tts_prompt_text", "tts_text"):
+            setattr(args, tts_key, example[tts_key])
+    if args.task == "i2v-A14B":
+        assert args.image is not None, "Please specify the image path for i2v."
+    cfg = WAN_CONFIGS[args.task]
+    # Sampling options left as None fall back to the task configuration.
+    for option in ("sample_steps", "sample_shift", "sample_guide_scale", "frame_num"):
+        if getattr(args, option) is None:
+            setattr(args, option, getattr(cfg, option))
+    if args.base_seed < 0:
+        args.base_seed = random.randint(0, sys.maxsize)
+    # Size check
+    if 's2v' not in args.task:
+        supported_sizes = SUPPORTED_SIZES[args.task]
+        assert args.size in supported_sizes, f"Unsupport size {args.size} for task {args.task}, supported sizes are: {', '.join(SUPPORTED_SIZES[args.task])}"
+class _Args:
+    """Empty attribute container that _parse_args fills with option values."""
+    pass
+def _parse_args():
+    """Return an _Args object holding the fixed generation options, after validation.
+    Nothing is read from the command line; every value is hard-coded here and
+    options left as None are filled in by _validate_args.
+    """
+    defaults = {
+        # core generation options
+        "task": "animate-14B",
+        # "size": "1280*720",
+        "size": "720*1280",
+        "frame_num": None,
+        "ckpt_dir": "./Wan2.2-Animate-14B/",
+        "offload_model": False,
+        "ulysses_size": 1,
+        "t5_fsdp": False,
+        "t5_cpu": False,
+        "dit_fsdp": False,
+        "prompt": None,
+        "use_prompt_extend": False,
+        "prompt_extend_method": "local_qwen",     # ["dashscope", "local_qwen"]
+        "prompt_extend_model": None,
+        "prompt_extend_target_lang": "zh",        # ["zh", "en"]
+        "base_seed": 1234,
+        "image": None,
+        "sample_solver": "unipc",                 # ['unipc', 'dpm++']
+        "sample_steps": None,
+        "sample_shift": None,
+        "sample_guide_scale": None,
+        "convert_model_dtype": True,
+        # animate
+        "refert_num": 1,
+        # s2v-only
+        "num_clip": None,
+        "audio": None,
+        "enable_tts": False,
+        "tts_prompt_audio": None,
+        "tts_prompt_text": None,
+        "tts_text": None,
+        "pose_video": None,
+        "start_from_ref": False,
+        "infer_frames": 80,
+    }
+    args = _Args()
+    for option, value in defaults.items():
+        setattr(args, option, value)
+    _validate_args(args)
+    return args
+def _init_logging(rank):
+    """Configure root logging: INFO to stdout on rank 0, ERROR only on other ranks."""
+    if rank != 0:
+        logging.basicConfig(level=logging.ERROR)
+        return
+    # set format
+    stdout_handler = logging.StreamHandler(stream=sys.stdout)
+    logging.basicConfig(
+        level=logging.INFO,
+        format="[%(asctime)s] %(levelname)s: %(message)s",
+        handlers=[stdout_handler])
+def load_model(use_relighting_lora = False):
+    """
+    Build the WanAnimate pipeline from the local checkpoint directory.
+    Uses the "animate-14B" configuration on device 0 / rank 0 with FSDP and
+    sequence parallelism switched off.
+    Args:
+        use_relighting_lora (bool): Forwarded unchanged to wan.WanAnimate.
+    Returns:
+        The constructed wan.WanAnimate instance.
+    """
+    cfg = WAN_CONFIGS["animate-14B"]
+    return wan.WanAnimate(
+        config=cfg,
+        checkpoint_dir="./Wan2.2-Animate-14B/",
+        device_id=0,
+        rank=0,
+        t5_fsdp=False,
+        dit_fsdp=False,
+        use_sp=False,
+        t5_cpu=False,
+        convert_model_dtype=False,
+        use_relighting_lora=use_relighting_lora
+    )
+def generate(wan_animate, preprocess_dir, save_file, replace_flag = False):
+    """
+    Generate the animated video from preprocessed inputs and save it.
+    Args:
+        wan_animate: Pipeline object returned by load_model.
+        preprocess_dir (str): Folder with the preprocessing outputs, passed to
+                              the pipeline as ``src_root_path``.
+        save_file (str): Destination path of the rendered video.
+        replace_flag (bool): Forwarded to the pipeline's ``replace_flag``.
+    The video is written only on rank 0 (RANK environment variable, default 0).
+    """
+    args = _parse_args()
+    rank = int(os.getenv("RANK", 0))
+    _init_logging(rank)
+    cfg = WAN_CONFIGS[args.task]
+    logging.info(f"Input prompt: {args.prompt}")
+    if args.image is not None:
+        # The image is not forwarded to the pipeline; opening it here only
+        # verifies that the configured file can be read.
+        Image.open(args.image).convert("RGB")
+        logging.info(f"Input image: {args.image}")
+    print(f'rank:{rank}')
+    logging.info("Generating video ...")
+    video = wan_animate.generate(
+        src_root_path=preprocess_dir,
+        replace_flag=replace_flag,
+        refert_num = args.refert_num,
+        clip_len=args.frame_num,
+        shift=args.sample_shift,
+        sample_solver=args.sample_solver,
+        sampling_steps=args.sample_steps,
+        guide_scale=args.sample_guide_scale,
+        seed=args.base_seed,
+        offload_model=args.offload_model)
+    if rank == 0:
+        save_video(
+            tensor=video[None],
+            save_file=save_file,
+            fps=cfg.sample_fps,
+            nrow=1,
+            normalize=True,
+            value_range=(-1, 1))
+        # if "s2v" in args.task:
+        #     if args.enable_tts is False:
+        #         merge_video_audio(video_path=args.save_file, audio_path=args.audio)
+        #     else:
+        #         merge_video_audio(video_path=args.save_file, audio_path="tts.wav")
+    # Drop the tensor reference and wait for pending CUDA work before returning.
+    del video
+    torch.cuda.synchronize()
+    if dist.is_initialized():
+        dist.barrier()
+        dist.destroy_process_group()
+    logging.info("Finished.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,29 @@

+decord
+peft
+pandas
+matplotlib
+loguru
+sentencepiece
+dashscope
+ftfy
+diffusers
+opencv-python
+moviepy
+torchvision==0.23.0
+torchaudio==2.8.0
+transformers
+tokenizers
+accelerate
+tqdm
+imageio[ffmpeg]
+easydict
+imageio-ffmpeg
+numpy>=1.23.5,<2
+hydra-core
+iopath
+pytest
+pillow
+librosa
+fvcore
+onnxruntime-gpu
+flash-attn-3 @ https://huggingface.co/alexnasa/flash-attn-3/resolve/main/128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl

wan/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from . import configs, distributed, modules
+from .image2video import WanI2V
+from .speech2video import WanS2V
+from .text2video import WanT2V
+from .textimage2video import WanTI2V
+from .animate import WanAnimate

wan/animate.py ADDED Viewed

	@@ -0,0 +1,663 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import logging
+import math
+import os
+import cv2
+import types
+from copy import deepcopy
+from functools import partial
+from einops import rearrange
+import numpy as np
+import torch
+import torch.distributed as dist
+from peft import set_peft_model_state_dict
+from decord import VideoReader
+from tqdm import tqdm
+import torch.nn.functional as F
+from .distributed.fsdp import shard_model
+from .distributed.sequence_parallel import sp_attn_forward, sp_dit_forward
+from .distributed.util import get_world_size
+from .modules.animate import WanAnimateModel
+from .modules.animate import CLIPModel
+from .modules.t5 import T5EncoderModel
+from .modules.vae2_1 import Wan2_1_VAE
+from .modules.animate.animate_utils import TensorList, get_loraconfig
+from .utils.fm_solvers import (
+    FlowDPMSolverMultistepScheduler,
+    get_sampling_sigmas,
+    retrieve_timesteps,
+)
+from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
+class WanAnimate:
+    def __init__(
+        self,
+        config,
+        checkpoint_dir,
+        device_id=0,
+        rank=0,
+        t5_fsdp=False,
+        dit_fsdp=False,
+        use_sp=False,
+        t5_cpu=False,
+        init_on_cpu=True,
+        convert_model_dtype=False,
+        use_relighting_lora=False
+    ):
+        r"""
+        Initializes the generation model components (T5 text encoder, CLIP,
+        VAE and the Wan-Animate DiT noise model).
+        Args:
+            config (EasyDict):
+                Object containing model parameters initialized from config.py
+            checkpoint_dir (`str`):
+                Path to directory containing model checkpoints
+            device_id (`int`,  *optional*, defaults to 0):
+                Id of target GPU device
+            rank (`int`,  *optional*, defaults to 0):
+                Process rank. `generate` only returns the video on rank 0.
+            t5_fsdp (`bool`, *optional*, defaults to False):
+                Enable FSDP sharding for T5 model
+            dit_fsdp (`bool`, *optional*, defaults to False):
+                Enable FSDP sharding for DiT model
+            use_sp (`bool`, *optional*, defaults to False):
+                Enable distribution strategy of sequence parallel.
+            t5_cpu (`bool`, *optional*, defaults to False):
+                Whether to place T5 model on CPU. Only works without t5_fsdp.
+            init_on_cpu (`bool`, *optional*, defaults to True):
+                Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
+            convert_model_dtype (`bool`, *optional*, defaults to False):
+                Convert DiT model parameters dtype to 'config.param_dtype'.
+                Only works without FSDP.
+            use_relighting_lora (`bool`, *optional*, defaults to False):
+                Whether to use relighting lora for character replacement.
+        """
+        self.device = torch.device(f"cuda:{device_id}")
+        self.config = config
+        self.rank = rank
+        self.t5_cpu = t5_cpu
+        self.init_on_cpu = init_on_cpu
+        self.num_train_timesteps = config.num_train_timesteps
+        self.param_dtype = config.param_dtype
+        # Any sharding / sequence-parallel option overrides init_on_cpu.
+        if t5_fsdp or dit_fsdp or use_sp:
+            self.init_on_cpu = False
+        shard_fn = partial(shard_model, device_id=device_id)
+        # The text encoder is always constructed with a CPU device; generate()
+        # moves its model to self.device on demand unless t5_cpu is set.
+        self.text_encoder = T5EncoderModel(
+            text_len=config.text_len,
+            dtype=config.t5_dtype,
+            device=torch.device('cpu'),
+            checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
+            tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
+            shard_fn=shard_fn if t5_fsdp else None,
+        )
+        # CLIP and the VAE are created directly on the target device.
+        self.clip = CLIPModel(
+            dtype=torch.float16,
+            device=self.device,
+            checkpoint_path=os.path.join(checkpoint_dir,
+                                         config.clip_checkpoint),
+            tokenizer_path=os.path.join(checkpoint_dir, config.clip_tokenizer))
+        self.vae = Wan2_1_VAE(
+            vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
+            device=self.device)
+        logging.info(f"Creating WanAnimate from {checkpoint_dir}")
+        # Without FSDP the DiT weights are loaded with device_map=self.device;
+        # with FSDP no device_map is passed and _configure_model shards the model.
+        if not dit_fsdp:
+            self.noise_model = WanAnimateModel.from_pretrained(
+                checkpoint_dir,
+                torch_dtype=self.param_dtype,
+                device_map=self.device)
+        else:
+            self.noise_model = WanAnimateModel.from_pretrained(
+                checkpoint_dir, torch_dtype=self.param_dtype)
+        self.noise_model = self._configure_model(
+            model=self.noise_model,
+            use_sp=use_sp,
+            dit_fsdp=dit_fsdp,
+            shard_fn=shard_fn,
+            convert_model_dtype=convert_model_dtype,
+            use_lora=use_relighting_lora,
+            checkpoint_dir=checkpoint_dir,
+            config=config
+            )
+        # self.noise_model = torch.compile(self.noise_model)
+        # sp_size is later used by generate() to round seq_len up to a multiple of it.
+        if use_sp:
+            self.sp_size = get_world_size()
+        else:
+            self.sp_size = 1
+        # Default prompts used by generate() when the caller passes empty strings.
+        self.sample_neg_prompt = config.sample_neg_prompt
+        self.sample_prompt = config.prompt
+    def _configure_model(self, model, use_sp, dit_fsdp, shard_fn,
+                         convert_model_dtype, use_lora, checkpoint_dir, config):
+        """
+        Configures a model object. This includes setting evaluation modes,
+        applying distributed parallel strategy, and handling device placement.
+        Args:
+            model (torch.nn.Module):
+                The model instance to configure.
+            use_sp (`bool`):
+                Enable distribution strategy of sequence parallel.
+            dit_fsdp (`bool`):
+                Enable FSDP sharding for DiT model.
+            shard_fn (callable):
+                The function to apply FSDP sharding.
+            convert_model_dtype (`bool`):
+                Convert DiT model parameters dtype to 'config.param_dtype'.
+                Only works without FSDP.
+        Returns:
+            torch.nn.Module:
+                The configured model.
+        """
+        model.eval().requires_grad_(False)
+        if use_sp:
+            for block in model.blocks:
+                block.self_attn.forward = types.MethodType(
+                    sp_attn_forward, block.self_attn)
+            model.use_context_parallel = True
+        if dist.is_initialized():
+            dist.barrier()
+        if use_lora:
+            logging.info("Loading Relighting Lora. ")
+            lora_config = get_loraconfig(
+                transformer=model,
+                rank=128,
+                alpha=128
+            )
+            model.add_adapter(lora_config)
+            lora_path = os.path.join(checkpoint_dir, config.lora_checkpoint)
+            peft_state_dict = torch.load(lora_path)["state_dict"]
+            set_peft_model_state_dict(model, peft_state_dict)
+        if dit_fsdp:
+            model = shard_fn(model, use_lora=use_lora)
+        else:
+            if convert_model_dtype:
+                model.to(self.param_dtype)
+            if not self.init_on_cpu:
+                model.to(self.device)
+        return model
+    def inputs_padding(self, array, target_len):
+        idx = 0
+        flip = False
+        target_array = []
+        while len(target_array) < target_len:
+            target_array.append(deepcopy(array[idx]))
+            if flip:
+                idx -= 1
+            else:
+                idx += 1
+            if idx == 0 or idx == len(array) - 1:
+                flip = not flip
+        return target_array[:target_len]
+    def get_valid_len(self, real_len, clip_len=81, overlap=1):
+        real_clip_len = clip_len - overlap
+        last_clip_num = (real_len - overlap) % real_clip_len
+        if last_clip_num == 0:
+            extra = 0
+        else:
+            extra = real_clip_len - last_clip_num
+        target_len = real_len + extra
+        return target_len
+    def get_i2v_mask(self, lat_t, lat_h, lat_w, mask_len=1, mask_pixel_values=None, device="cuda"):
+        if mask_pixel_values is None:
+            msk = torch.zeros(1, (lat_t-1) * 4 + 1, lat_h, lat_w, device=device)
+        else:
+            msk = mask_pixel_values.clone()
+        msk[:, :mask_len] = 1
+        msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
+        msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
+        msk = msk.transpose(1, 2)[0]
+        return msk
+    def padding_resize(self, img_ori, height=512, width=512, padding_color=(0, 0, 0), interpolation=cv2.INTER_LINEAR):
+        ori_height = img_ori.shape[0]
+        ori_width = img_ori.shape[1]
+        channel = img_ori.shape[2]
+        img_pad = np.zeros((height, width, channel))
+        if channel == 1:
+            img_pad[:, :, 0] = padding_color[0]
+        else:
+            img_pad[:, :, 0] = padding_color[0]
+            img_pad[:, :, 1] = padding_color[1]
+            img_pad[:, :, 2] = padding_color[2]
+        if (ori_height / ori_width) > (height / width):
+            new_width = int(height / ori_height * ori_width)
+            img = cv2.resize(img_ori, (new_width, height), interpolation=interpolation)
+            padding = int((width - new_width) / 2)
+            if len(img.shape) == 2:
+                img = img[:, :, np.newaxis]
+            img_pad[:, padding: padding + new_width, :] = img
+        else:
+            new_height = int(width / ori_width * ori_height)
+            img = cv2.resize(img_ori, (width, new_height), interpolation=interpolation)
+            padding = int((height - new_height) / 2)
+            if len(img.shape) == 2:
+                img = img[:, :, np.newaxis]
+            img_pad[padding: padding + new_height, :, :] = img
+        img_pad = np.uint8(img_pad)
+        return img_pad
+    def prepare_source(self, src_pose_path, src_face_path, src_ref_path):
+        pose_video_reader = VideoReader(src_pose_path)
+        pose_len = len(pose_video_reader)
+        pose_idxs = list(range(pose_len))
+        cond_images = pose_video_reader.get_batch(pose_idxs).asnumpy()
+        face_video_reader = VideoReader(src_face_path)
+        face_len = len(face_video_reader)
+        face_idxs = list(range(face_len))
+        face_images = face_video_reader.get_batch(face_idxs).asnumpy()
+        height, width = cond_images[0].shape[:2]
+        refer_images = cv2.imread(src_ref_path)[..., ::-1]
+        refer_images = self.padding_resize(refer_images, height=height, width=width)
+        return cond_images, face_images, refer_images
+    def prepare_source_for_replace(self, src_bg_path, src_mask_path):
+        bg_video_reader = VideoReader(src_bg_path)
+        bg_len = len(bg_video_reader)
+        bg_idxs = list(range(bg_len))
+        bg_images = bg_video_reader.get_batch(bg_idxs).asnumpy()
+        mask_video_reader = VideoReader(src_mask_path)
+        mask_len = len(mask_video_reader)
+        mask_idxs = list(range(mask_len))
+        mask_images = mask_video_reader.get_batch(mask_idxs).asnumpy()
+        mask_images = mask_images[:, :, :, 0] / 255
+        return bg_images, mask_images
+    def generate(
+        self,
+        src_root_path,
+        replace_flag=False,
+        clip_len=77,
+        refert_num=1,
+        shift=5.0,
+        sample_solver='dpm++',
+        sampling_steps=20,
+        guide_scale=1,
+        input_prompt="",
+        n_prompt="",
+        seed=-1,
+        offload_model=True,
+    ):
+        r"""
+        Generates video frames from input image using diffusion process.
+        Args:
+            src_root_path ('str'):
+                Process output path
+            replace_flag (`bool`, *optional*, defaults to False):
+                Whether to use character replace.
+            clip_len (`int`, *optional*, defaults to 77):
+                How many frames to generate per clips. The number should be 4n+1
+            refert_num (`int`, *optional*, defaults to 1):
+                How many frames used for temporal guidance. Recommended to be 1 or 5.
+            shift (`float`, *optional*, defaults to 5.0):
+                Noise schedule shift parameter.
+            sample_solver (`str`, *optional*, defaults to 'dpm++'):
+                Solver used to sample the video.
+            sampling_steps (`int`, *optional*, defaults to 20):
+                Number of diffusion sampling steps. Higher values improve quality but slow generation
+            guide_scale (`float` or tuple[`float`], *optional*, defaults 1.0):
+                Classifier-free guidance scale. We only use it for expression control.
+                In most cases, it's not necessary and faster generation can be achieved without it.
+                When expression adjustments are needed, you may consider using this feature.
+            input_prompt (`str`):
+                Text prompt for content generation. We don't recommend custom prompts (although they work)
+            n_prompt (`str`, *optional*, defaults to ""):
+                Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
+            seed (`int`, *optional*, defaults to -1):
+                Random seed for noise generation. If -1, use random seed
+            offload_model (`bool`, *optional*, defaults to True):
+                If True, offloads models to CPU during generation to save VRAM
+        Returns:
+            torch.Tensor:
+                Generated video frames tensor. Dimensions: (C, N, H, W) where:
+                - C: Color channels (3 for RGB)
+                - N: Number of frames
+                - H: Frame height
+                - W: Frame width
+        """
+        assert refert_num == 1 or refert_num == 5, "refert_num should be 1 or 5."
+        seed_g = torch.Generator(device=self.device)
+        seed_g.manual_seed(seed)
+        if n_prompt == "":
+            n_prompt = self.sample_neg_prompt
+        if input_prompt == "":
+            input_prompt = self.sample_prompt
+        src_pose_path = os.path.join(src_root_path, "src_pose.mp4")
+        src_face_path = os.path.join(src_root_path, "src_face.mp4")
+        src_ref_path = os.path.join(src_root_path, "src_ref.png")
+        cond_images, face_images, refer_images = self.prepare_source(src_pose_path=src_pose_path, src_face_path=src_face_path, src_ref_path=src_ref_path)
+        if not self.t5_cpu:
+            self.text_encoder.model.to(self.device)
+            context = self.text_encoder([input_prompt], self.device)
+            context_null = self.text_encoder([n_prompt], self.device)
+            if offload_model:
+                self.text_encoder.model.cpu()
+        else:
+            context = self.text_encoder([input_prompt], torch.device('cpu'))
+            context_null = self.text_encoder([n_prompt], torch.device('cpu'))
+            context = [t.to(self.device) for t in context]
+            context_null = [t.to(self.device) for t in context_null]
+        real_frame_len = len(cond_images)
+        target_len = self.get_valid_len(real_frame_len, clip_len, overlap=refert_num)
+        logging.info('real frames: {} target frames: {}'.format(real_frame_len, target_len))
+        cond_images = self.inputs_padding(cond_images, target_len)
+        face_images = self.inputs_padding(face_images, target_len)
+        if replace_flag:
+            src_bg_path = os.path.join(src_root_path, "src_bg.mp4")
+            src_mask_path = os.path.join(src_root_path, "src_mask.mp4")
+            bg_images, mask_images = self.prepare_source_for_replace(src_bg_path, src_mask_path)
+            bg_images = self.inputs_padding(bg_images, target_len)
+            mask_images = self.inputs_padding(mask_images, target_len)
+            self.noise_model.enable_adapters()
+        else:
+            self.noise_model.disable_adapters()
+        height, width = refer_images.shape[:2]
+        start = 0
+        end = clip_len
+        all_out_frames = []
+        total_iterations = ((len(cond_images) - 1) // clip_len + 1) * sampling_steps
+        with tqdm(total=total_iterations) as pbar:
+            while True:
+                if start + refert_num >= len(cond_images):
+                    break
+                if start == 0:
+                    mask_reft_len = 0
+                else:
+                    mask_reft_len = refert_num
+                batch = {
+                            "conditioning_pixel_values": torch.zeros(1, 3, clip_len, height, width),
+                            "bg_pixel_values": torch.zeros(1, 3, clip_len, height, width),
+                            "mask_pixel_values": torch.zeros(1, 1, clip_len, height, width),
+                            "face_pixel_values": torch.zeros(1, 3, clip_len, 512, 512),
+                            "refer_pixel_values": torch.zeros(1, 3, height, width),
+                            "refer_t_pixel_values": torch.zeros(refert_num, 3, height, width)
+                        }
+                batch["conditioning_pixel_values"] = rearrange(
+                    torch.tensor(np.stack(cond_images[start:end]) / 127.5 - 1),
+                    "t h w c -> 1 c t h w",
+                )
+                batch["face_pixel_values"] = rearrange(
+                    torch.tensor(np.stack(face_images[start:end]) / 127.5 - 1),
+                    "t h w c -> 1 c t h w",
+                )
+                batch["refer_pixel_values"] = rearrange(
+                    torch.tensor(refer_images / 127.5 - 1), "h w c -> 1 c h w"
+                )
+                if start > 0:
+                    batch["refer_t_pixel_values"] = rearrange(
+                        out_frames[0, :, -refert_num:].clone().detach(),
+                        "c t h w -> t c h w",
+                    )
+                batch["refer_t_pixel_values"] = rearrange(batch["refer_t_pixel_values"],
+                                                "t c h w -> 1 c t h w",
+                                                )
+                if replace_flag:
+                    batch["bg_pixel_values"] = rearrange(
+                        torch.tensor(np.stack(bg_images[start:end]) / 127.5 - 1),
+                        "t h w c -> 1 c t h w",
+                    )
+                    batch["mask_pixel_values"] = rearrange(
+                        torch.tensor(np.stack(mask_images[start:end])[:, :, :, None]),
+                        "t h w c -> 1 t c h w",
+                    )
+                for key, value in batch.items():
+                    if isinstance(value, torch.Tensor):
+                        batch[key] = value.to(device=self.device, dtype=torch.bfloat16)
+                ref_pixel_values = batch["refer_pixel_values"]
+                refer_t_pixel_values = batch["refer_t_pixel_values"]
+                conditioning_pixel_values = batch["conditioning_pixel_values"]
+                face_pixel_values = batch["face_pixel_values"]
+                B, _, H, W = ref_pixel_values.shape
+                T = clip_len
+                lat_h = H // 8
+                lat_w = W // 8
+                lat_t = T // 4 + 1
+                target_shape = [lat_t + 1, lat_h, lat_w]
+                noise = [
+                    torch.randn(
+                        16,
+                        target_shape[0],
+                        target_shape[1],
+                        target_shape[2],
+                        dtype=torch.float32,
+                        device=self.device,
+                        generator=seed_g,
+                    )
+                ]
+                max_seq_len = int(math.ceil(np.prod(target_shape) // 4 / self.sp_size)) * self.sp_size
+                if max_seq_len % self.sp_size != 0:
+                    raise ValueError(f"max_seq_len {max_seq_len} is not divisible by sp_size {self.sp_size}")
+                with (
+                    torch.autocast(device_type=str(self.device), dtype=torch.bfloat16, enabled=True),
+                    torch.no_grad()
+                ):
+                    if sample_solver == 'unipc':
+                        sample_scheduler = FlowUniPCMultistepScheduler(
+                            num_train_timesteps=self.num_train_timesteps,
+                            shift=1,
+                            use_dynamic_shifting=False)
+                        sample_scheduler.set_timesteps(
+                            sampling_steps, device=self.device, shift=shift)
+                        timesteps = sample_scheduler.timesteps
+                    elif sample_solver == 'dpm++':
+                        sample_scheduler = FlowDPMSolverMultistepScheduler(
+                            num_train_timesteps=self.num_train_timesteps,
+                            shift=1,
+                            use_dynamic_shifting=False)
+                        sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
+                        timesteps, _ = retrieve_timesteps(
+                            sample_scheduler,
+                            device=self.device,
+                            sigmas=sampling_sigmas)
+                    else:
+                        raise NotImplementedError("Unsupported solver.")
+                    latents = noise
+                    pose_latents_no_ref =  self.vae.encode(conditioning_pixel_values.to(torch.bfloat16))
+                    pose_latents_no_ref = torch.stack(pose_latents_no_ref)
+                    pose_latents = torch.cat([pose_latents_no_ref], dim=2)
+                    ref_pixel_values = rearrange(ref_pixel_values, "t c h w -> 1 c t h w")
+                    ref_latents =  self.vae.encode(ref_pixel_values.to(torch.bfloat16))
+                    ref_latents = torch.stack(ref_latents)
+                    mask_ref = self.get_i2v_mask(1, lat_h, lat_w, 1, device=self.device)
+                    y_ref = torch.concat([mask_ref, ref_latents[0]]).to(dtype=torch.bfloat16, device=self.device)
+                    img = ref_pixel_values[0, :, 0]
+                    clip_context = self.clip.visual([img[:, None, :, :]]).to(dtype=torch.bfloat16, device=self.device)
+                    if mask_reft_len > 0:
+                        if replace_flag:
+                            bg_pixel_values = batch["bg_pixel_values"]
+                            y_reft = self.vae.encode(
+                                [
+                                    torch.concat([refer_t_pixel_values[0, :, :mask_reft_len], bg_pixel_values[0, :, mask_reft_len:]], dim=1).to(self.device)
+                                ]
+                            )[0]
+                            mask_pixel_values = 1 - batch["mask_pixel_values"]
+                            mask_pixel_values = rearrange(mask_pixel_values, "b t c h w -> (b t) c h w")
+                            mask_pixel_values = F.interpolate(mask_pixel_values, size=(H//8, W//8), mode='nearest')
+                            mask_pixel_values = rearrange(mask_pixel_values, "(b t) c h w -> b t c h w", b=1)[:,:,0]
+                            msk_reft = self.get_i2v_mask(lat_t, lat_h, lat_w, mask_reft_len, mask_pixel_values=mask_pixel_values, device=self.device)
+                        else:
+                            y_reft = self.vae.encode(
+                                [
+                                    torch.concat(
+                                        [
+                                            torch.nn.functional.interpolate(refer_t_pixel_values[0, :, :mask_reft_len].cpu(),
+                                                                            size=(H, W), mode="bicubic"),
+                                            torch.zeros(3, T - mask_reft_len, H, W),
+                                        ],
+                                        dim=1,
+                                    ).to(self.device)
+                                ]
+                            )[0]
+                            msk_reft = self.get_i2v_mask(lat_t, lat_h, lat_w, mask_reft_len, device=self.device)
+                    else:
+                        if replace_flag:
+                            bg_pixel_values = batch["bg_pixel_values"]
+                            mask_pixel_values = 1 - batch["mask_pixel_values"]
+                            mask_pixel_values = rearrange(mask_pixel_values, "b t c h w -> (b t) c h w")
+                            mask_pixel_values = F.interpolate(mask_pixel_values, size=(H//8, W//8), mode='nearest')
+                            mask_pixel_values = rearrange(mask_pixel_values, "(b t) c h w -> b t c h w", b=1)[:,:,0]
+                            y_reft = self.vae.encode(
+                                [
+                                    torch.concat(
+                                        [
+                                            bg_pixel_values[0],
+                                        ],
+                                        dim=1,
+                                    ).to(self.device)
+                                ]
+                            )[0]
+                            msk_reft = self.get_i2v_mask(lat_t, lat_h, lat_w, mask_reft_len, mask_pixel_values=mask_pixel_values, device=self.device)
+                        else:
+                            y_reft = self.vae.encode(
+                                [
+                                    torch.concat(
+                                        [
+                                            torch.zeros(3, T - mask_reft_len, H, W),
+                                        ],
+                                        dim=1,
+                                    ).to(self.device)
+                                ]
+                            )[0]
+                            msk_reft = self.get_i2v_mask(lat_t, lat_h, lat_w, mask_reft_len, device=self.device)
+                    y_reft = torch.concat([msk_reft, y_reft]).to(dtype=torch.bfloat16, device=self.device)
+                    y = torch.concat([y_ref, y_reft], dim=1)
+                    arg_c = {
+                        "context": context,
+                        "seq_len": max_seq_len,
+                        "clip_fea": clip_context.to(dtype=torch.bfloat16, device=self.device),
+                        "y": [y],
+                        "pose_latents": pose_latents,
+                        "face_pixel_values": face_pixel_values,
+                    }
+                    if guide_scale > 1:
+                        face_pixel_values_uncond = face_pixel_values * 0 - 1
+                        arg_null = {
+                            "context": context_null,
+                            "seq_len": max_seq_len,
+                            "clip_fea": clip_context.to(dtype=torch.bfloat16, device=self.device),
+                            "y": [y],
+                            "pose_latents": pose_latents,
+                            "face_pixel_values": face_pixel_values_uncond,
+                        }
+                    for i, t in enumerate(timesteps):
+                        latent_model_input = latents
+                        timestep = [t]
+                        timestep = torch.stack(timestep)
+                        noise_pred_cond = TensorList(
+                            self.noise_model(TensorList(latent_model_input), t=timestep, **arg_c)
+                        )
+                        if guide_scale > 1:
+                            noise_pred_uncond = TensorList(
+                                self.noise_model(
+                                    TensorList(latent_model_input), t=timestep, **arg_null
+                                )
+                            )
+                            noise_pred = noise_pred_uncond + guide_scale * (
+                                noise_pred_cond - noise_pred_uncond
+                            )
+                        else:
+                            noise_pred = noise_pred_cond
+                        temp_x0 = sample_scheduler.step(
+                            noise_pred[0].unsqueeze(0),
+                            t,
+                            latents[0].unsqueeze(0),
+                            return_dict=False,
+                            generator=seed_g,
+                        )[0]
+                        latents[0] = temp_x0.squeeze(0)
+                        x0 = latents
+                        if pbar is not None:
+                            pbar.update(1)
+                    x0 = [x.to(dtype=torch.float32) for x in x0]
+                    out_frames = torch.stack(self.vae.decode([x0[0][:, 1:]]))
+                    if start != 0:
+                        out_frames = out_frames[:, :, refert_num:]
+                    all_out_frames.append(out_frames.cpu())
+                    start += clip_len - refert_num
+                    end += clip_len - refert_num
+        videos = torch.cat(all_out_frames, dim=2)[:, :, :real_frame_len]
+        return videos[0] if self.rank == 0 else None

wan/configs/__init__.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import copy
+import os
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+from .wan_i2v_A14B import i2v_A14B
+from .wan_s2v_14B import s2v_14B
+from .wan_t2v_A14B import t2v_A14B
+from .wan_ti2v_5B import ti2v_5B
+from .wan_animate_14B import animate_14B
+# Maps each supported task name to the configuration object imported above.
+WAN_CONFIGS = {
+    't2v-A14B': t2v_A14B,
+    'i2v-A14B': i2v_A14B,
+    'ti2v-5B': ti2v_5B,
+    'animate-14B': animate_14B,
+    's2v-14B': s2v_14B,
+}
+SIZE_CONFIGS = {
+    '720*1280': (720, 1280),
+    '1280*720': (1280, 720),
+    '480*832': (480, 832),
+    '832*480': (832, 480),
+    '704*1280': (704, 1280),
+    '1280*704': (1280, 704),
+    '1024*704': (1024, 704),
+    '704*1024': (704, 1024),
+}
+MAX_AREA_CONFIGS = {
+    '720*1280': 720 * 1280,
+    '1280*720': 1280 * 720,
+    '480*832': 480 * 832,
+    '832*480': 832 * 480,
+    '704*1280': 704 * 1280,
+    '1280*704': 1280 * 704,
+    '1024*704': 1024 * 704,
+    '704*1024': 704 * 1024,
+}
+SUPPORTED_SIZES = {
+    't2v-A14B': ('720*1280', '1280*720', '480*832', '832*480'),
+    'i2v-A14B': ('720*1280', '1280*720', '480*832', '832*480'),
+    'ti2v-5B': ('704*1280', '1280*704'),
+    's2v-14B': ('720*1280', '1280*720', '480*832', '832*480', '1024*704',
+                '704*1024', '704*1280', '1280*704'),
+    'animate-14B': ('720*1280', '1280*720')
+}

wan/configs/shared_config.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+from easydict import EasyDict
+#------------------------ Wan shared config ------------------------#
+# Defaults shared by the per-model configs, which copy them in via `.update(wan_shared_cfg)`.
+wan_shared_cfg = EasyDict()
+# t5
+wan_shared_cfg.t5_model = 'umt5_xxl'
+wan_shared_cfg.t5_dtype = torch.bfloat16
+wan_shared_cfg.text_len = 512
+# transformer
+wan_shared_cfg.param_dtype = torch.bfloat16
+# inference
+wan_shared_cfg.num_train_timesteps = 1000
+wan_shared_cfg.sample_fps = 16
+# Default negative prompt (Chinese). Rough English gloss: vivid tones, overexposed,
+# static, blurry details, subtitles, style, artwork, painting, picture, still,
+# overall gray, worst quality, low quality, JPEG compression artifacts, ugly,
+# incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed,
+# disfigured, malformed limbs, fused fingers, motionless frame, cluttered
+# background, three legs, many people in the background, walking backwards.
+wan_shared_cfg.sample_neg_prompt = '色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走'
+wan_shared_cfg.frame_num = 81

wan/configs/wan_animate_14B.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from easydict import EasyDict
+from .shared_config import wan_shared_cfg
+#------------------------ Wan animate 14B ------------------------#
+# Configuration for the Wan animate 14B task.
+animate_14B = EasyDict(__name__='Config: Wan animate 14B')
+# Start from the shared defaults; assignments below add to or replace them.
+animate_14B.update(wan_shared_cfg)
+# checkpoint / tokenizer names for the T5 and CLIP encoders and the LoRA
+animate_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
+animate_14B.t5_tokenizer = 'google/umt5-xxl'
+animate_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
+animate_14B.clip_tokenizer = 'xlm-roberta-large'
+animate_14B.lora_checkpoint = 'relighting_lora.ckpt'
+# vae
+animate_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
+animate_14B.vae_stride = (4, 8, 8)
+# transformer
+animate_14B.patch_size = (1, 2, 2)
+animate_14B.dim = 5120
+animate_14B.ffn_dim = 13824
+animate_14B.freq_dim = 256
+animate_14B.num_heads = 40
+animate_14B.num_layers = 40
+animate_14B.window_size = (-1, -1)
+animate_14B.qk_norm = True
+animate_14B.cross_attn_norm = True
+animate_14B.eps = 1e-6
+animate_14B.use_face_encoder = True
+animate_14B.motion_encoder_dim = 512
+# inference
+animate_14B.sample_shift = 5.0
+animate_14B.sample_steps = 5
+animate_14B.sample_guide_scale = 1.0
+# frame_num and sample_fps are also set in the shared config; the values
+# assigned here are the ones that remain on this config.
+animate_14B.frame_num = 77
+animate_14B.sample_fps = 30
+# Default prompt (Chinese; roughly "the person in the video is performing an
+# action").
+animate_14B.prompt = '视频中的人在做动作'

wan/configs/wan_i2v_A14B.py ADDED Viewed

	@@ -0,0 +1,37 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+from easydict import EasyDict
+from .shared_config import wan_shared_cfg
+#------------------------ Wan I2V A14B ------------------------#
+# Configuration for the Wan image-to-video (I2V) A14B task.
+i2v_A14B = EasyDict(__name__='Config: Wan I2V A14B')
+# Start from the shared defaults; assignments below add to or replace them.
+i2v_A14B.update(wan_shared_cfg)
+# t5 checkpoint and tokenizer names
+i2v_A14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
+i2v_A14B.t5_tokenizer = 'google/umt5-xxl'
+# vae
+i2v_A14B.vae_checkpoint = 'Wan2.1_VAE.pth'
+i2v_A14B.vae_stride = (4, 8, 8)
+# transformer
+i2v_A14B.patch_size = (1, 2, 2)
+i2v_A14B.dim = 5120
+i2v_A14B.ffn_dim = 13824
+i2v_A14B.freq_dim = 256
+i2v_A14B.num_heads = 40
+i2v_A14B.num_layers = 40
+i2v_A14B.window_size = (-1, -1)
+i2v_A14B.qk_norm = True
+i2v_A14B.cross_attn_norm = True
+i2v_A14B.eps = 1e-6
+# Subfolder names of the two transformer checkpoints (low / high noise).
+i2v_A14B.low_noise_checkpoint = 'low_noise_model'
+i2v_A14B.high_noise_checkpoint = 'high_noise_model'
+# inference
+i2v_A14B.sample_shift = 5.0
+i2v_A14B.sample_steps = 40
+# Fraction of num_train_timesteps; WanI2V uses the high-noise model for
+# timesteps at or above boundary * num_train_timesteps.
+i2v_A14B.boundary = 0.900
+i2v_A14B.sample_guide_scale = (3.5, 3.5)  # low noise, high noise

wan/configs/wan_s2v_14B.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from easydict import EasyDict
+from .shared_config import wan_shared_cfg
+#------------------------ Wan S2V 14B ------------------------#
+# Configuration for the Wan S2V 14B task.
+s2v_14B = EasyDict(__name__='Config: Wan S2V 14B')
+# Start from the shared defaults; assignments below add to or replace them.
+s2v_14B.update(wan_shared_cfg)
+# t5
+s2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
+s2v_14B.t5_tokenizer = 'google/umt5-xxl'
+# vae
+s2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
+s2v_14B.vae_stride = (4, 8, 8)
+# wav2vec
+s2v_14B.wav2vec = "wav2vec2-large-xlsr-53-english"
+# NOTE(review): num_heads sits under the wav2vec heading but has the same
+# value as transformer.num_heads below; confirm which component reads it.
+s2v_14B.num_heads = 40
+# transformer (nested config, kept separate from the top-level keys)
+s2v_14B.transformer = EasyDict(
+    __name__="Config: Transformer config for WanModel_S2V")
+s2v_14B.transformer.patch_size = (1, 2, 2)
+s2v_14B.transformer.dim = 5120
+s2v_14B.transformer.ffn_dim = 13824
+s2v_14B.transformer.freq_dim = 256
+s2v_14B.transformer.num_heads = 40
+s2v_14B.transformer.num_layers = 40
+s2v_14B.transformer.window_size = (-1, -1)
+s2v_14B.transformer.qk_norm = True
+s2v_14B.transformer.cross_attn_norm = True
+s2v_14B.transformer.eps = 1e-6
+s2v_14B.transformer.enable_adain = True
+s2v_14B.transformer.adain_mode = "attn_norm"
+# Indices in the range of the 40 layers above; presumably the layers that
+# receive audio features - TODO confirm against the model code.
+s2v_14B.transformer.audio_inject_layers = [
+    0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39
+]
+s2v_14B.transformer.zero_init = True
+s2v_14B.transformer.zero_timestep = True
+s2v_14B.transformer.enable_motioner = False
+s2v_14B.transformer.add_last_motion = True
+s2v_14B.transformer.trainable_token = False
+s2v_14B.transformer.enable_tsm = False
+s2v_14B.transformer.enable_framepack = True
+# NOTE(review): 'padd' is a runtime string; check the consumer before
+# treating the spelling as a typo.
+s2v_14B.transformer.framepack_drop_mode = 'padd'
+s2v_14B.transformer.audio_dim = 1024
+s2v_14B.transformer.motion_frames = 73
+s2v_14B.transformer.cond_dim = 16
+# inference
+# Task-specific negative prompt (Chinese text); replaces the value copied
+# from the shared config.
+s2v_14B.sample_neg_prompt = "画面模糊，最差质量，画面模糊，细节模糊不清，情绪激动剧烈，手快速抖动，字幕，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
+s2v_14B.drop_first_motion = True
+s2v_14B.sample_shift = 3
+s2v_14B.sample_steps = 40
+s2v_14B.sample_guide_scale = 4.5

wan/configs/wan_t2v_A14B.py ADDED Viewed

	@@ -0,0 +1,37 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from easydict import EasyDict
+from .shared_config import wan_shared_cfg
+#------------------------ Wan T2V A14B ------------------------#
+# Configuration for the Wan text-to-video (T2V) A14B task.
+t2v_A14B = EasyDict(__name__='Config: Wan T2V A14B')
+# Start from the shared defaults; assignments below add to or replace them.
+t2v_A14B.update(wan_shared_cfg)
+# t5
+t2v_A14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
+t2v_A14B.t5_tokenizer = 'google/umt5-xxl'
+# vae
+t2v_A14B.vae_checkpoint = 'Wan2.1_VAE.pth'
+t2v_A14B.vae_stride = (4, 8, 8)
+# transformer
+t2v_A14B.patch_size = (1, 2, 2)
+t2v_A14B.dim = 5120
+t2v_A14B.ffn_dim = 13824
+t2v_A14B.freq_dim = 256
+t2v_A14B.num_heads = 40
+t2v_A14B.num_layers = 40
+t2v_A14B.window_size = (-1, -1)
+t2v_A14B.qk_norm = True
+t2v_A14B.cross_attn_norm = True
+t2v_A14B.eps = 1e-6
+# Subfolder names of the two transformer checkpoints (low / high noise).
+t2v_A14B.low_noise_checkpoint = 'low_noise_model'
+t2v_A14B.high_noise_checkpoint = 'high_noise_model'
+# inference
+t2v_A14B.sample_shift = 12.0
+t2v_A14B.sample_steps = 40
+# Presumably the fraction of num_train_timesteps at which sampling switches
+# between the two checkpoints - TODO confirm against the T2V pipeline.
+t2v_A14B.boundary = 0.875
+t2v_A14B.sample_guide_scale = (3.0, 4.0)  # low noise, high noise

wan/configs/wan_ti2v_5B.py ADDED Viewed

	@@ -0,0 +1,36 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from easydict import EasyDict
+from .shared_config import wan_shared_cfg
+#------------------------ Wan TI2V 5B ------------------------#
+# Configuration for the Wan TI2V 5B task.
+ti2v_5B = EasyDict(__name__='Config: Wan TI2V 5B')
+# Start from the shared defaults; assignments below add to or replace them.
+ti2v_5B.update(wan_shared_cfg)
+# t5
+ti2v_5B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
+ti2v_5B.t5_tokenizer = 'google/umt5-xxl'
+# vae (this config names the Wan2.2 VAE file and a (4, 16, 16) stride)
+ti2v_5B.vae_checkpoint = 'Wan2.2_VAE.pth'
+ti2v_5B.vae_stride = (4, 16, 16)
+# transformer
+ti2v_5B.patch_size = (1, 2, 2)
+ti2v_5B.dim = 3072
+ti2v_5B.ffn_dim = 14336
+ti2v_5B.freq_dim = 256
+ti2v_5B.num_heads = 24
+ti2v_5B.num_layers = 30
+ti2v_5B.window_size = (-1, -1)
+ti2v_5B.qk_norm = True
+ti2v_5B.cross_attn_norm = True
+ti2v_5B.eps = 1e-6
+# inference
+# sample_fps and frame_num are also set in the shared config; the values
+# assigned here are the ones that remain on this config.
+ti2v_5B.sample_fps = 24
+ti2v_5B.sample_shift = 5.0
+ti2v_5B.sample_steps = 50
+ti2v_5B.sample_guide_scale = 5.0
+ti2v_5B.frame_num = 121

wan/distributed/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.

wan/distributed/fsdp.py ADDED Viewed

	@@ -0,0 +1,45 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import gc
+from functools import partial
+import torch
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
+from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
+from torch.distributed.utils import _free_storage
+def shard_model(
+    model,
+    device_id,
+    param_dtype=torch.bfloat16,
+    reduce_dtype=torch.float32,
+    buffer_dtype=torch.float32,
+    process_group=None,
+    sharding_strategy=ShardingStrategy.FULL_SHARD,
+    sync_module_states=True,
+    use_lora=False
+):
+    model = FSDP(
+        module=model,
+        process_group=process_group,
+        sharding_strategy=sharding_strategy,
+        auto_wrap_policy=partial(
+            lambda_auto_wrap_policy, lambda_fn=lambda m: m in model.blocks),
+        mixed_precision=MixedPrecision(
+            param_dtype=param_dtype,
+            reduce_dtype=reduce_dtype,
+            buffer_dtype=buffer_dtype),
+        device_id=device_id,
+        sync_module_states=sync_module_states,
+        use_orig_params=True if use_lora else False)
+    return model
+def free_model(model):
+    for m in model.modules():
+        if isinstance(m, FSDP):
+            _free_storage(m._handle.flat_param.data)
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()

wan/distributed/sequence_parallel.py ADDED Viewed

	@@ -0,0 +1,176 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+import torch.cuda.amp as amp
+from ..modules.model import sinusoidal_embedding_1d
+from .ulysses import distributed_attention
+from .util import gather_forward, get_rank, get_world_size
+def pad_freqs(original_tensor, target_len):
+    seq_len, s1, s2 = original_tensor.shape
+    pad_size = target_len - seq_len
+    padding_tensor = torch.ones(
+        pad_size,
+        s1,
+        s2,
+        dtype=original_tensor.dtype,
+        device=original_tensor.device)
+    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
+    return padded_tensor
+@torch.amp.autocast('cuda', enabled=False)
+def rope_apply(x, grid_sizes, freqs):
+    """
+    Apply rotary position embedding to this rank's slice of the sequence.
+    Runs with CUDA autocast disabled; the rotation is done in float64 and the
+    stacked result is returned as float32.
+    x:          [B, L, N, C].
+    grid_sizes: [B, 3].
+    freqs:      [M, C // 2].
+    """
+    # s: local sequence length, n: heads, c: number of complex pairs per head
+    s, n, c = x.size(1), x.size(2), x.size(3) // 2
+    # split freqs into three groups along dim 1, one per grid axis (f, h, w)
+    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
+    # loop over samples
+    output = []
+    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
+        seq_len = f * h * w
+        # view consecutive channel pairs of x as complex numbers
+        x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
+            s, n, -1, 2))
+        # build the per-position multipliers for the full (f, h, w) grid
+        freqs_i = torch.cat([
+            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
+            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
+            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
+        ],
+                            dim=-1).reshape(seq_len, 1, -1)
+        # apply rotary embedding
+        sp_size = get_world_size()
+        sp_rank = get_rank()
+        # pad the multipliers with ones up to s * sp_size rows, then take the
+        # s rows that belong to this rank
+        freqs_i = pad_freqs(freqs_i, s * sp_size)
+        s_per_rank = s
+        freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
+                                                       s_per_rank), :, :]
+        x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
+        # s equals x.size(1), so x[i, s:] is empty and this concat adds no rows
+        x_i = torch.cat([x_i, x[i, s:]])
+        # append to collection
+        output.append(x_i)
+    return torch.stack(output).float()
+def sp_dit_forward(
+    self,
+    x,
+    t,
+    context,
+    seq_len,
+    y=None,
+):
+    """
+    x:              A list of videos each with shape [C, T, H, W].
+    t:              [B].
+    context:        A list of text embeddings each with shape [L, C].
+    """
+    if self.model_type == 'i2v':
+        assert y is not None
+    # params
+    device = self.patch_embedding.weight.device
+    if self.freqs.device != device:
+        self.freqs = self.freqs.to(device)
+    if y is not None:
+        x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
+    # embeddings
+    x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
+    grid_sizes = torch.stack(
+        [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
+    x = [u.flatten(2).transpose(1, 2) for u in x]
+    seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
+    assert seq_lens.max() <= seq_len
+    x = torch.cat([
+        torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
+        for u in x
+    ])
+    # time embeddings
+    if t.dim() == 1:
+        t = t.expand(t.size(0), seq_len)
+    with torch.amp.autocast('cuda', dtype=torch.float32):
+        bt = t.size(0)
+        t = t.flatten()
+        e = self.time_embedding(
+            sinusoidal_embedding_1d(self.freq_dim,
+                                    t).unflatten(0, (bt, seq_len)).float())
+        e0 = self.time_projection(e).unflatten(2, (6, self.dim))
+        assert e.dtype == torch.float32 and e0.dtype == torch.float32
+    # context
+    context_lens = None
+    context = self.text_embedding(
+        torch.stack([
+            torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
+            for u in context
+        ]))
+    # Context Parallel
+    x = torch.chunk(x, get_world_size(), dim=1)[get_rank()]
+    e = torch.chunk(e, get_world_size(), dim=1)[get_rank()]
+    e0 = torch.chunk(e0, get_world_size(), dim=1)[get_rank()]
+    # arguments
+    kwargs = dict(
+        e=e0,
+        seq_lens=seq_lens,
+        grid_sizes=grid_sizes,
+        freqs=self.freqs,
+        context=context,
+        context_lens=context_lens)
+    for block in self.blocks:
+        x = block(x, **kwargs)
+    # head
+    x = self.head(x, e)
+    # Context Parallel
+    x = gather_forward(x, dim=1)
+    # unpatchify
+    x = self.unpatchify(x, grid_sizes)
+    return [u.float() for u in x]
+def sp_attn_forward(self, x, seq_lens, grid_sizes, freqs, dtype=torch.bfloat16):
+    b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
+    half_dtypes = (torch.float16, torch.bfloat16)
+    def half(x):
+        return x if x.dtype in half_dtypes else x.to(dtype)
+    # query, key, value function
+    def qkv_fn(x):
+        q = self.norm_q(self.q(x)).view(b, s, n, d)
+        k = self.norm_k(self.k(x)).view(b, s, n, d)
+        v = self.v(x).view(b, s, n, d)
+        return q, k, v
+    q, k, v = qkv_fn(x)
+    q = rope_apply(q, grid_sizes, freqs)
+    k = rope_apply(k, grid_sizes, freqs)
+    x = distributed_attention(
+        half(q),
+        half(k),
+        half(v),
+        seq_lens,
+        window_size=self.window_size,
+    )
+    # output
+    x = x.flatten(2)
+    x = self.o(x)
+    return x

wan/distributed/ulysses.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+import torch.distributed as dist
+from ..modules.attention import flash_attention
+from .util import all_to_all
+def distributed_attention(
+        q,
+        k,
+        v,
+        seq_lens,
+        window_size=(-1, -1),
+):
+    """
+    Performs distributed attention based on DeepSpeed Ulysses attention mechanism.
+    please refer to https://arxiv.org/pdf/2309.14509
+    Args:
+        q:           [B, Lq // p, Nq, C1].
+        k:           [B, Lk // p, Nk, C1].
+        v:           [B, Lk // p, Nk, C2]. Nq must be divisible by Nk.
+        seq_lens:    [B], length of each sequence in batch
+        window_size: (left right). If not (-1, -1), apply sliding window local attention.
+    """
+    if not dist.is_initialized():
+        raise ValueError("distributed group should be initialized.")
+    b = q.shape[0]
+    # gather q/k/v sequence
+    q = all_to_all(q, scatter_dim=2, gather_dim=1)
+    k = all_to_all(k, scatter_dim=2, gather_dim=1)
+    v = all_to_all(v, scatter_dim=2, gather_dim=1)
+    # apply attention
+    x = flash_attention(
+        q,
+        k,
+        v,
+        k_lens=seq_lens,
+        window_size=window_size,
+    )
+    # scatter q/k/v sequence
+    x = all_to_all(x, scatter_dim=1, gather_dim=2)
+    return x

wan/distributed/util.py ADDED Viewed

	@@ -0,0 +1,51 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+import torch.distributed as dist
+def init_distributed_group():
+    r"""Initialize the default process group with the NCCL backend.
+    Does nothing when a process group is already initialized.
+    """
+    if not dist.is_initialized():
+        dist.init_process_group(backend='nccl')
+def get_rank():
+    """Return ``torch.distributed.get_rank()`` for the default process group."""
+    return dist.get_rank()
+def get_world_size():
+    """Return ``torch.distributed.get_world_size()`` for the default process group."""
+    return dist.get_world_size()
+def all_to_all(x, scatter_dim, gather_dim, group=None, **kwargs):
+    """
+    `scatter` along one dimension and `gather` along another.
+    """
+    world_size = get_world_size()
+    if world_size > 1:
+        inputs = [u.contiguous() for u in x.chunk(world_size, dim=scatter_dim)]
+        outputs = [torch.empty_like(u) for u in inputs]
+        dist.all_to_all(outputs, inputs, group=group, **kwargs)
+        x = torch.cat(outputs, dim=gather_dim).contiguous()
+    return x
+def all_gather(tensor):
+    world_size = dist.get_world_size()
+    if world_size == 1:
+        return [tensor]
+    tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
+    torch.distributed.all_gather(tensor_list, tensor)
+    return tensor_list
+def gather_forward(input, dim):
+    # skip if world_size == 1
+    world_size = dist.get_world_size()
+    if world_size == 1:
+        return input
+    # gather sequence
+    output = all_gather(input)
+    return torch.cat(output, dim=dim).contiguous()

wan/image2video.py ADDED Viewed

	@@ -0,0 +1,431 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import gc
+import logging
+import math
+import os
+import random
+import sys
+import types
+from contextlib import contextmanager
+from functools import partial
+import numpy as np
+import torch
+import torch.cuda.amp as amp
+import torch.distributed as dist
+import torchvision.transforms.functional as TF
+from tqdm import tqdm
+from .distributed.fsdp import shard_model
+from .distributed.sequence_parallel import sp_attn_forward, sp_dit_forward
+from .distributed.util import get_world_size
+from .modules.model import WanModel
+from .modules.t5 import T5EncoderModel
+from .modules.vae2_1 import Wan2_1_VAE
+from .utils.fm_solvers import (
+    FlowDPMSolverMultistepScheduler,
+    get_sampling_sigmas,
+    retrieve_timesteps,
+)
+from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
+class WanI2V:
+    def __init__(
+        self,
+        config,
+        checkpoint_dir,
+        device_id=0,
+        rank=0,
+        t5_fsdp=False,
+        dit_fsdp=False,
+        use_sp=False,
+        t5_cpu=False,
+        init_on_cpu=True,
+        convert_model_dtype=False,
+    ):
+        r"""
+        Initializes the image-to-video generation model components.
+        Args:
+            config (EasyDict):
+                Object containing model parameters initialized from config.py
+            checkpoint_dir (`str`):
+                Path to directory containing model checkpoints
+            device_id (`int`,  *optional*, defaults to 0):
+                Id of target GPU device
+            rank (`int`,  *optional*, defaults to 0):
+                Process rank for distributed training
+            t5_fsdp (`bool`, *optional*, defaults to False):
+                Enable FSDP sharding for T5 model
+            dit_fsdp (`bool`, *optional*, defaults to False):
+                Enable FSDP sharding for DiT model
+            use_sp (`bool`, *optional*, defaults to False):
+                Enable distribution strategy of sequence parallel.
+            t5_cpu (`bool`, *optional*, defaults to False):
+                Whether to place T5 model on CPU. Only works without t5_fsdp.
+            init_on_cpu (`bool`, *optional*, defaults to True):
+                Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
+            convert_model_dtype (`bool`, *optional*, defaults to False):
+                Convert DiT model parameters dtype to 'config.param_dtype'.
+                Only works without FSDP.
+        """
+        self.device = torch.device(f"cuda:{device_id}")
+        self.config = config
+        self.rank = rank
+        self.t5_cpu = t5_cpu
+        self.init_on_cpu = init_on_cpu
+        self.num_train_timesteps = config.num_train_timesteps
+        self.boundary = config.boundary
+        self.param_dtype = config.param_dtype
+        # Any FSDP or sequence-parallel mode turns CPU-side initialization off.
+        if t5_fsdp or dit_fsdp or use_sp:
+            self.init_on_cpu = False
+        # Pre-bind the device so the sharding function only needs the module.
+        shard_fn = partial(shard_model, device_id=device_id)
+        # The text encoder is always constructed on CPU here; generate() moves
+        # its model to self.device when t5_cpu is False.
+        self.text_encoder = T5EncoderModel(
+            text_len=config.text_len,
+            dtype=config.t5_dtype,
+            device=torch.device('cpu'),
+            checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
+            tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
+            shard_fn=shard_fn if t5_fsdp else None,
+        )
+        self.vae_stride = config.vae_stride
+        self.patch_size = config.patch_size
+        self.vae = Wan2_1_VAE(
+            vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
+            device=self.device)
+        logging.info(f"Creating WanModel from {checkpoint_dir}")
+        # Two transformers are loaded from separate subfolders and configured
+        # the same way; generate() picks one per timestep.
+        self.low_noise_model = WanModel.from_pretrained(
+            checkpoint_dir, subfolder=config.low_noise_checkpoint)
+        self.low_noise_model = self._configure_model(
+            model=self.low_noise_model,
+            use_sp=use_sp,
+            dit_fsdp=dit_fsdp,
+            shard_fn=shard_fn,
+            convert_model_dtype=convert_model_dtype)
+        self.high_noise_model = WanModel.from_pretrained(
+            checkpoint_dir, subfolder=config.high_noise_checkpoint)
+        self.high_noise_model = self._configure_model(
+            model=self.high_noise_model,
+            use_sp=use_sp,
+            dit_fsdp=dit_fsdp,
+            shard_fn=shard_fn,
+            convert_model_dtype=convert_model_dtype)
+        # sp_size is used by generate() to round the sequence length up to a
+        # multiple of the sequence-parallel world size.
+        if use_sp:
+            self.sp_size = get_world_size()
+        else:
+            self.sp_size = 1
+        self.sample_neg_prompt = config.sample_neg_prompt
+    def _configure_model(self, model, use_sp, dit_fsdp, shard_fn,
+                         convert_model_dtype):
+        """
+        Configures a model object. This includes setting evaluation modes,
+        applying distributed parallel strategy, and handling device placement.
+        Args:
+            model (torch.nn.Module):
+                The model instance to configure.
+            use_sp (`bool`):
+                Enable distribution strategy of sequence parallel.
+            dit_fsdp (`bool`):
+                Enable FSDP sharding for DiT model.
+            shard_fn (callable):
+                The function to apply FSDP sharding.
+            convert_model_dtype (`bool`):
+                Convert DiT model parameters dtype to 'config.param_dtype'.
+                Only works without FSDP.
+        Returns:
+            torch.nn.Module:
+                The configured model.
+        """
+        model.eval().requires_grad_(False)
+        if use_sp:
+            for block in model.blocks:
+                block.self_attn.forward = types.MethodType(
+                    sp_attn_forward, block.self_attn)
+            model.forward = types.MethodType(sp_dit_forward, model)
+        if dist.is_initialized():
+            dist.barrier()
+        if dit_fsdp:
+            model = shard_fn(model)
+        else:
+            if convert_model_dtype:
+                model.to(self.param_dtype)
+            if not self.init_on_cpu:
+                model.to(self.device)
+        return model
+    def _prepare_model_for_timestep(self, t, boundary, offload_model):
+        r"""
+        Prepares and returns the required model for the current timestep.
+        Args:
+            t (torch.Tensor):
+                current timestep.
+            boundary (`int`):
+                The timestep threshold. If `t` is at or above this value,
+                the `high_noise_model` is considered as the required model.
+            offload_model (`bool`):
+                A flag intended to control the offloading behavior.
+        Returns:
+            torch.nn.Module:
+                The active model on the target device for the current timestep.
+        """
+        if t.item() >= boundary:
+            required_model_name = 'high_noise_model'
+            offload_model_name = 'low_noise_model'
+        else:
+            required_model_name = 'low_noise_model'
+            offload_model_name = 'high_noise_model'
+        if offload_model or self.init_on_cpu:
+            if next(getattr(
+                    self,
+                    offload_model_name).parameters()).device.type == 'cuda':
+                getattr(self, offload_model_name).to('cpu')
+            if next(getattr(
+                    self,
+                    required_model_name).parameters()).device.type == 'cpu':
+                getattr(self, required_model_name).to(self.device)
+        return getattr(self, required_model_name)
+    def generate(self,
+                 input_prompt,
+                 img,
+                 max_area=720 * 1280,
+                 frame_num=81,
+                 shift=5.0,
+                 sample_solver='unipc',
+                 sampling_steps=40,
+                 guide_scale=5.0,
+                 n_prompt="",
+                 seed=-1,
+                 offload_model=True):
+        r"""
+        Generates video frames from input image and text prompt using diffusion process.
+        Args:
+            input_prompt (`str`):
+                Text prompt for content generation.
+            img (PIL.Image.Image):
+                Input image tensor. Shape: [3, H, W]
+            max_area (`int`, *optional*, defaults to 720*1280):
+                Maximum pixel area for latent space calculation. Controls video resolution scaling
+            frame_num (`int`, *optional*, defaults to 81):
+                How many frames to sample from a video. The number should be 4n+1
+            shift (`float`, *optional*, defaults to 5.0):
+                Noise schedule shift parameter. Affects temporal dynamics
+                [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
+            sample_solver (`str`, *optional*, defaults to 'unipc'):
+                Solver used to sample the video.
+            sampling_steps (`int`, *optional*, defaults to 40):
+                Number of diffusion sampling steps. Higher values improve quality but slow generation
+            guide_scale (`float` or tuple[`float`], *optional*, defaults 5.0):
+                Classifier-free guidance scale. Controls prompt adherence vs. creativity.
+                If tuple, the first guide_scale will be used for low noise model and
+                the second guide_scale will be used for high noise model.
+            n_prompt (`str`, *optional*, defaults to ""):
+                Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
+            seed (`int`, *optional*, defaults to -1):
+                Random seed for noise generation. If -1, use random seed
+            offload_model (`bool`, *optional*, defaults to True):
+                If True, offloads models to CPU during generation to save VRAM
+        Returns:
+            torch.Tensor:
+                Generated video frames tensor. Dimensions: (C, N H, W) where:
+                - C: Color channels (3 for RGB)
+                - N: Number of frames (81)
+                - H: Frame height (from max_area)
+                - W: Frame width from max_area)
+        """
+        # preprocess
+        guide_scale = (guide_scale, guide_scale) if isinstance(
+            guide_scale, float) else guide_scale
+        img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(self.device)
+        F = frame_num
+        h, w = img.shape[1:]
+        aspect_ratio = h / w
+        lat_h = round(
+            np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] //
+            self.patch_size[1] * self.patch_size[1])
+        lat_w = round(
+            np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] //
+            self.patch_size[2] * self.patch_size[2])
+        h = lat_h * self.vae_stride[1]
+        w = lat_w * self.vae_stride[2]
+        max_seq_len = ((F - 1) // self.vae_stride[0] + 1) * lat_h * lat_w // (
+            self.patch_size[1] * self.patch_size[2])
+        max_seq_len = int(math.ceil(max_seq_len / self.sp_size)) * self.sp_size
+        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
+        seed_g = torch.Generator(device=self.device)
+        seed_g.manual_seed(seed)
+        noise = torch.randn(
+            16,
+            (F - 1) // self.vae_stride[0] + 1,
+            lat_h,
+            lat_w,
+            dtype=torch.float32,
+            generator=seed_g,
+            device=self.device)
+        msk = torch.ones(1, F, lat_h, lat_w, device=self.device)
+        msk[:, 1:] = 0
+        msk = torch.concat([
+            torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]
+        ],
+                           dim=1)
+        msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
+        msk = msk.transpose(1, 2)[0]
+        if n_prompt == "":
+            n_prompt = self.sample_neg_prompt
+        # preprocess
+        if not self.t5_cpu:
+            self.text_encoder.model.to(self.device)
+            context = self.text_encoder([input_prompt], self.device)
+            context_null = self.text_encoder([n_prompt], self.device)
+            if offload_model:
+                self.text_encoder.model.cpu()
+        else:
+            context = self.text_encoder([input_prompt], torch.device('cpu'))
+            context_null = self.text_encoder([n_prompt], torch.device('cpu'))
+            context = [t.to(self.device) for t in context]
+            context_null = [t.to(self.device) for t in context_null]
+        y = self.vae.encode([
+            torch.concat([
+                torch.nn.functional.interpolate(
+                    img[None].cpu(), size=(h, w), mode='bicubic').transpose(
+                        0, 1),
+                torch.zeros(3, F - 1, h, w)
+            ],
+                         dim=1).to(self.device)
+        ])[0]
+        y = torch.concat([msk, y])
+        @contextmanager
+        def noop_no_sync():
+            yield
+        no_sync_low_noise = getattr(self.low_noise_model, 'no_sync',
+                                    noop_no_sync)
+        no_sync_high_noise = getattr(self.high_noise_model, 'no_sync',
+                                     noop_no_sync)
+        # evaluation mode
+        with (
+                torch.amp.autocast('cuda', dtype=self.param_dtype),
+                torch.no_grad(),
+                no_sync_low_noise(),
+                no_sync_high_noise(),
+        ):
+            boundary = self.boundary * self.num_train_timesteps
+            if sample_solver == 'unipc':
+                sample_scheduler = FlowUniPCMultistepScheduler(
+                    num_train_timesteps=self.num_train_timesteps,
+                    shift=1,
+                    use_dynamic_shifting=False)
+                sample_scheduler.set_timesteps(
+                    sampling_steps, device=self.device, shift=shift)
+                timesteps = sample_scheduler.timesteps
+            elif sample_solver == 'dpm++':
+                sample_scheduler = FlowDPMSolverMultistepScheduler(
+                    num_train_timesteps=self.num_train_timesteps,
+                    shift=1,
+                    use_dynamic_shifting=False)
+                sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
+                timesteps, _ = retrieve_timesteps(
+                    sample_scheduler,
+                    device=self.device,
+                    sigmas=sampling_sigmas)
+            else:
+                raise NotImplementedError("Unsupported solver.")
+            # sample videos
+            latent = noise
+            arg_c = {
+                'context': [context[0]],
+                'seq_len': max_seq_len,
+                'y': [y],
+            }
+            arg_null = {
+                'context': context_null,
+                'seq_len': max_seq_len,
+                'y': [y],
+            }
+            if offload_model:
+                torch.cuda.empty_cache()
+            for _, t in enumerate(tqdm(timesteps)):
+                latent_model_input = [latent.to(self.device)]
+                timestep = [t]
+                timestep = torch.stack(timestep).to(self.device)
+                model = self._prepare_model_for_timestep(
+                    t, boundary, offload_model)
+                sample_guide_scale = guide_scale[1] if t.item(
+                ) >= boundary else guide_scale[0]
+                noise_pred_cond = model(
+                    latent_model_input, t=timestep, **arg_c)[0]
+                if offload_model:
+                    torch.cuda.empty_cache()
+                noise_pred_uncond = model(
+                    latent_model_input, t=timestep, **arg_null)[0]
+                if offload_model:
+                    torch.cuda.empty_cache()
+                noise_pred = noise_pred_uncond + sample_guide_scale * (
+                    noise_pred_cond - noise_pred_uncond)
+                temp_x0 = sample_scheduler.step(
+                    noise_pred.unsqueeze(0),
+                    t,
+                    latent.unsqueeze(0),
+                    return_dict=False,
+                    generator=seed_g)[0]
+                latent = temp_x0.squeeze(0)
+                x0 = [latent]
+                del latent_model_input, timestep
+            if offload_model:
+                self.low_noise_model.cpu()
+                self.high_noise_model.cpu()
+                torch.cuda.empty_cache()
+            if self.rank == 0:
+                videos = self.vae.decode(x0)
+        del noise, latent, x0
+        del sample_scheduler
+        if offload_model:
+            gc.collect()
+            torch.cuda.synchronize()
+        if dist.is_initialized():
+            dist.barrier()
+        return videos[0] if self.rank == 0 else None

wan/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from .attention import flash_attention
+from .model import WanModel
+from .t5 import T5Decoder, T5Encoder, T5EncoderModel, T5Model
+from .tokenizers import HuggingfaceTokenizer
+from .vae2_1 import Wan2_1_VAE
+from .vae2_2 import Wan2_2_VAE
+__all__ = [  # public names of the wan.modules package; each one is imported above
+    'Wan2_1_VAE',  # from .vae2_1
+    'Wan2_2_VAE',  # from .vae2_2
+    'WanModel',  # from .model
+    'T5Model',  # from .t5
+    'T5Encoder',  # from .t5
+    'T5Decoder',  # from .t5
+    'T5EncoderModel',  # from .t5
+    'HuggingfaceTokenizer',  # from .tokenizers
+    'flash_attention',  # from .attention
+]

wan/modules/animate/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from .model_animate import WanAnimateModel
+from .clip import CLIPModel
+__all__ = ['WanAnimateModel', 'CLIPModel']  # public names of wan.modules.animate; both are imported above