Spaces:

Aleksandar
/

PartEdit

Running on Zero

File size: 100,657 Bytes

# Based on stable_diffusion_reference.py
# Based on https://github.com/RoyiRa/prompt-to-prompt-with-sdxl
from __future__ import annotations

import abc
import typing
from collections.abc import Iterable
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import einops
import numpy as np
import torch
import torch.nn.functional as F
from diffusers import AutoencoderKL, StableDiffusionXLPipeline, UNet2DConditionModel
from diffusers import __version__ as diffusers_version
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.models.attention_processor import AttnProcessor2_0
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
    rescale_noise_cfg,
)
from diffusers.pipelines.stable_diffusion.safety_checker import (
    StableDiffusionSafetyChecker,
)

from diffusers.pipelines.stable_diffusion_xl.pipeline_output import (
    StableDiffusionXLPipelineOutput,
)
from diffusers.utils import (
    USE_PEFT_BACKEND,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)
from diffusers.utils.import_utils import is_invisible_watermark_available
from packaging import version
from PIL import Image
from safetensors.torch import load_file
from torchvision.transforms import ToPILImage, ToTensor
from torchvision.utils import make_grid
from transformers import CLIPImageProcessor

if is_invisible_watermark_available():
    from diffusers.pipelines.stable_diffusion_xl.watermark import (
        StableDiffusionXLWatermarker,
    )


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# Try the full import set first; the error message below records that it was verified on diffusers >= 0.29.1.
try:
    from diffusers import LEditsPPPipelineStableDiffusionXL, EulerDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler
except ImportError as e:
    logger.error("DPMSolverMultistepScheduler or LEditsPPPipelineStableDiffusionXL not found. Verified on >= 0.29.1")
    # Fallback binds only the two schedulers below.
    # NOTE(review): DPMSolverMultistepScheduler and LEditsPPPipelineStableDiffusionXL stay undefined on
    # this path, so any later use of them (e.g. the "leditspp" scheduler branch of
    # PartEditPipeline.default_pipeline) would raise NameError.
    from diffusers import DDIMScheduler, EulerDiscreteScheduler

if typing.TYPE_CHECKING:
    from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
    from transformers import (
        CLIPTextModel,
        CLIPTextModelWithProjection,
        CLIPTokenizer,
        CLIPVisionModelWithProjection,
    )
    from diffusers.models.attention import Attention
    from diffusers.schedulers import KarrasDiffusionSchedulers


# Original implementation from
# Updated to reflect
class PartEditPipeline(StableDiffusionXLPipeline):
    r"""
    PartEditPipeline for text-to-image generation using Stable Diffusion XL with SD1.5 NSFW checker.

    This model inherits from [`StableDiffusionXLPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder. Stable Diffusion XL uses the text portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        text_encoder_2 ([` CLIPTextModelWithProjection`]):
            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
            specifically the
            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
            variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`CLIPTokenizer`):
            Second Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
            `stabilityai/stable-diffusion-xl-base-1-0`.
        add_watermarker (`bool`, *optional*):
            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
            watermarker will be used.
    """

    # NOTE(review): the second entry is the single string "add_watermarker, safety_checker", not two
    # separate names — presumably a misplaced quote. This list is presumably read by the diffusers base
    # pipeline to decide which components may be omitted; verify the effect there before splitting it.
    _optional_components = ["feature_extractor", "add_watermarker, safety_checker"]

    # Added back from stable_diffusion_reference.py with safety_check to instantiate the NSFW checker from SD1.5
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        text_encoder_2: CLIPTextModelWithProjection,
        tokenizer: CLIPTokenizer,
        tokenizer_2: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        image_encoder: CLIPVisionModelWithProjection = None,
        feature_extractor: CLIPImageProcessor = None,
        force_zeros_for_empty_prompt: bool = True,
        add_watermarker: Optional[bool] = None,
        safety_checker: Optional[StableDiffusionSafetyChecker] = None,
    ):
        """Initialise the SDXL parent pipeline and register an optional safety checker.

        All arguments except `safety_checker` are forwarded unchanged to
        `StableDiffusionXLPipeline.__init__`. `safety_checker` is registered as an extra
        module afterwards (it may be `None`).

        Raises:
            AssertionError: if `safety_checker` is given but is not a
                `StableDiffusionSafetyChecker`, or if it is given without a `feature_extractor`.
        """
        checker_requested = safety_checker is not None
        if checker_requested:
            # The NSFW checker needs both the right model type and an image pre-processor.
            assert isinstance(safety_checker, StableDiffusionSafetyChecker), f"Expected safety_checker to be of type StableDiffusionSafetyChecker, got {type(safety_checker)}"
            assert feature_extractor is not None, "Feature Extractor must be present to use the NSFW checker"

        # Everything the SDXL parent constructor understands, passed through as-is.
        parent_kwargs = dict(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            tokenizer=tokenizer,
            tokenizer_2=tokenizer_2,
            unet=unet,
            scheduler=scheduler,
            image_encoder=image_encoder,
            feature_extractor=feature_extractor,
            force_zeros_for_empty_prompt=force_zeros_for_empty_prompt,
            add_watermarker=add_watermarker,
        )
        super().__init__(**parent_kwargs)

        # The safety checker is not a parent argument, so it is registered separately.
        self.register_modules(safety_checker=safety_checker)

    @staticmethod
    def default_pipeline(device, precision=torch.float16, scheduler_type: str = "euler", load_safety: bool = False) -> Tuple[StableDiffusionXLPipeline, PartEditPipeline]:
        """Build a stock SDXL pipeline and a `PartEditPipeline` that reuses its components.

        Args:
            device: Device both pipelines are moved to before being returned. It is also
                forwarded to the `from_pretrained` calls below exactly as before.
            precision: `torch_dtype` used when loading the VAE, SDXL weights, schedulers
                and (optionally) the safety checker.
            scheduler_type: Case/whitespace-insensitive scheduler selector for the
                PartEdit pipeline: `"ddim"` / `"editfriendly"` -> `DDIMScheduler`;
                `"leditspp"` (or `"ledits"`) -> `DPMSolverMultistepScheduler` with
                `sde-dpmsolver++`; anything else -> `EulerDiscreteScheduler`.
            load_safety: When True, also load the SD1.5 safety checker and its feature
                extractor and attach them to the PartEdit pipeline.

        Returns:
            `(default_pipe, pipeline)`: the plain `StableDiffusionXLPipeline` (always built
            with a `DDIMScheduler`) and the `PartEditPipeline`, both moved to `device`.
            The PartEdit pipeline is constructed from the same VAE, tokenizers, text
            encoders, UNet and image encoder objects as `default_pipe`.
        """
        sdxl_repo = "stabilityai/stable-diffusion-xl-base-1.0"
        safety_repo = "benjamin-paine/stable-diffusion-v1-5"  # alternative: runwayml/stable-diffusion-v1-5

        # Normalise once instead of per comparison.
        scheduler_key = scheduler_type.strip().lower()
        if scheduler_key in ("ddim", "editfriendly"):
            scheduler = DDIMScheduler.from_pretrained(sdxl_repo, subfolder="scheduler", torch_dtype=precision)  # Edit Friendly DDPM
        elif scheduler_key in ("leditspp", "ledits"):
            # Fix: the previous `in "leditspp"` was a *substring* test against a string, so
            # "", "e", "edit", "pp", ... all selected this branch. Exact membership is used
            # now; "ledits" is kept as an alias because the old test happened to accept it.
            # DPMSolverMultistepScheduler is only bound if the guarded import at the top of
            # the file succeeded.
            scheduler = DPMSolverMultistepScheduler.from_pretrained(
                sdxl_repo, subfolder="scheduler", algorithm_type="sde-dpmsolver++", solver_order=2
            )  # LEdits
        else:
            scheduler = EulerDiscreteScheduler.from_pretrained(sdxl_repo, subfolder="scheduler", torch_dtype=precision)

        vae = AutoencoderKL.from_pretrained(
            "madebyollin/sdxl-vae-fp16-fix",
            torch_dtype=precision,
            use_safetensors=True,
            resume_download=None,
        )
        # The reference pipeline always runs DDIM, independent of `scheduler_type`.
        default_pipe = StableDiffusionXLPipeline.from_pretrained(
            sdxl_repo,
            device=device,
            vae=vae,
            resume_download=None,
            scheduler=DDIMScheduler.from_pretrained(sdxl_repo, subfolder="scheduler", torch_dtype=precision),
            torch_dtype=precision,
        )

        # Safety checker and feature extractor are loaded together or not at all.
        safety_checker = None
        feature_extractor = None
        if load_safety:
            safety_checker = StableDiffusionSafetyChecker.from_pretrained(
                safety_repo,
                device_map=device,
                torch_dtype=precision,
                subfolder="safety_checker",
            )
            feature_extractor = CLIPImageProcessor.from_pretrained(
                safety_repo,
                subfolder="feature_extractor",
                device_map=device,
            )

        pipeline: PartEditPipeline = PartEditPipeline(
            vae=vae,
            tokenizer=default_pipe.tokenizer,
            tokenizer_2=default_pipe.tokenizer_2,
            text_encoder=default_pipe.text_encoder,
            text_encoder_2=default_pipe.text_encoder_2,
            unet=default_pipe.unet,
            scheduler=scheduler,
            image_encoder=default_pipe.image_encoder,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
        return default_pipe.to(device), pipeline.to(device)

    def check_inputs(
        self,
        prompt,
        prompt_2,
        height,
        width,
        callback_steps,
        negative_prompt=None,
        negative_prompt_2=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        pooled_prompt_embeds=None,
        negative_pooled_prompt_embeds=None,
        ip_adapter_image=None,
        ip_adapter_image_embeds=None,
        callback_on_step_end_tensor_inputs=None,
        # PartEdit stuff
        embedding_opt: Optional[torch.FloatTensor] = None,
    ):
        """Validate the call arguments: parent SDXL checks first, then PartEdit checks.

        The standard arguments are delegated to the parent class's `check_inputs`. After
        that, `embedding_opt` (if given) must be a 2-D fp32/fp16 tensor whose last
        dimension is 2048, and `self.controller.extra_kwargs` must carry a `Binarization`
        `th_strategy` and a `PaddingStrategy` `pad_strategy`. When the strategy is
        `Binarization.PROVIDED_MASK`, `mask_edit` must be present too.

        Raises:
            AssertionError: if any PartEdit-specific check fails. Whatever the parent
                `check_inputs` raises propagates unchanged.
        """
        # The ip-adapter arguments are only forwarded on diffusers >= 0.27.0.
        ip_adapter_kwargs = {}
        if version.parse(diffusers_version) >= version.parse("0.27.0"):
            ip_adapter_kwargs = {
                "ip_adapter_image": ip_adapter_image,
                "ip_adapter_image_embeds": ip_adapter_image_embeds,
            }

        # Delegate the generic SDXL validation to the parent class.
        super().check_inputs(
            prompt,
            prompt_2,
            height,
            width,
            callback_steps,
            negative_prompt,
            negative_prompt_2,
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            **ip_adapter_kwargs,
        )

        # PartEdit: shape / width / dtype of the optimised embedding.
        if embedding_opt is not None:
            allowed_dtypes = (torch.float32, torch.float16)
            assert embedding_opt.ndim == 2, f"Embedding should be of shape (2, features), got {embedding_opt.shape}"
            assert embedding_opt.shape[-1] == 2048, f"SDXL Embedding should have 2048 features, got {embedding_opt.shape[1]}"
            assert embedding_opt.dtype in allowed_dtypes, f"Embedding should be of type fp32/fp16, got {embedding_opt.dtype}"

        # PartEdit: the attention controller and its options must already be attached.
        assert hasattr(self, "controller"), "Controller should be present"
        assert hasattr(self.controller, "extra_kwargs"), "Controller should have extra_kwargs"

        controller_opts: DotDictExtra = self.controller.extra_kwargs
        strategy: Binarization = controller_opts.th_strategy

        assert isinstance(strategy, Binarization), f"Expected strategy to be of type Binarization, got {type(strategy)}"
        assert hasattr(controller_opts, "pad_strategy"), "Controller should have pad_strategy"
        assert isinstance(controller_opts.pad_strategy, PaddingStrategy), f"Expected pad_strategy to be of type PaddingStrategy, got {type(self.controller.extra_kwargs.pad_strategy)}"

        # A user-provided mask is required only for the PROVIDED_MASK strategy.
        if strategy == Binarization.PROVIDED_MASK:
            assert hasattr(controller_opts, "mask_edit"), "Mask should be present in extra_kwargs"

    def _aggregate_and_get_attention_maps_per_token(self, with_softmax, select: int = 0, res: int = 32):
        attention_maps = self.controller.aggregate_attention(
            res=res,
            from_where=("up", "down", "mid"),
            batch_size=self.controller.batch_size,
            is_cross=True,
            select=select,
        )
        attention_maps_list = self._get_attention_maps_list(attention_maps=attention_maps, with_softmax=with_softmax)
        return attention_maps_list

    @staticmethod
    def _get_attention_maps_list(attention_maps: torch.Tensor, with_softmax) -> List[torch.Tensor]:
        attention_maps *= 100

        if with_softmax:
            attention_maps = torch.nn.functional.softmax(attention_maps, dim=-1)

        attention_maps_list = [attention_maps[:, :, i] for i in range(attention_maps.shape[2])]
        return attention_maps_list

    @torch.inference_mode()  # if this gives problems change back to @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
        prompt_2: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        denoising_end: Optional[float] = None,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt_2: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        original_size: Optional[Tuple[int, int]] = None,
        crops_coords_top_left: Tuple[int, int] = (0, 0),
        target_size: Optional[Tuple[int, int]] = None,
        negative_original_size: Optional[Tuple[int, int]] = None,
        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
        negative_target_size: Optional[Tuple[int, int]] = None,
        attn_res=None,
        callback_on_step_end: Optional[Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        # PartEdit
        embedding_opt: Optional[Union[torch.FloatTensor, str]] = None,
        extra_kwargs: Optional[Union[dict, DotDictExtra]] = None,  # All params, check DotDictExtra
        uncond_embeds: Optional[torch.FloatTensor] = None,  # Unconditional embeddings from Null text inversion
        latents_list=None,
        zs=None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

                The keyword arguments to configure the edit are:
                - edit_type (`str`). The edit type to apply. Can be either of `replace`, `refine`, `reweight`.
                - n_cross_replace (`int`): Number of diffusion steps in which cross attention should be replaced
                - n_self_replace (`int`): Number of diffusion steps in which self attention should be replaced
                - local_blend_words(`List[str]`, *optional*, default to `None`): Determines which area should be
                  changed. If None, then the whole image can be changed.
                - equalizer_words(`List[str]`, *optional*, default to `None`): Required for edit type `reweight`.
                  Determines which words should be enhanced.
                - equalizer_strengths (`List[float]`, *optional*, default to `None`) Required for edit type `reweight`.
                  Determines which how much the words in `equalizer_words` should be enhanced.

            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
                using zero terminal SNR.
        PartEdit Parameters:
            embedding_opt (`Union[torch.FloatTensor, str]`, *optional*): The embedding to be inserted in the prompt. The embedding
                will be inserted as third batch dimension.
            extra_kwargs (`dict`, *optional*): A dictionary with extra parameters to be passed to the pipeline.
                - Check `pipe.part_edit_available_params()` for the available parameters.
        Returns:
            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        """

        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        original_size = original_size or (height, width)
        target_size = target_size or (height, width)

        # PartEdit setup
        extra_kwargs = DotDictExtra() if extra_kwargs is None else DotDictExtra(extra_kwargs)
        prompt = prompt + [prompt[0]] if prompt[0] != prompt[-1] else prompt  # Add required extra batch if not present
        extra_kwargs.batch_indx = len(prompt) - 1 if extra_kwargs.batch_indx == -1 else extra_kwargs.batch_indx
        add_extra_step = extra_kwargs.add_extra_step

        if attn_res is None:
            attn_res = int(np.ceil(width / 32)), int(np.ceil(height / 32))
        self.attn_res = attn_res
        # _prompts = prompt if embedding_opt is None else prompt + [prompt[-1]]
        if hasattr(self, "controller"):
            self.controller.reset()

        self.controller = create_controller(
            prompt,
            cross_attention_kwargs,
            num_inference_steps,
            tokenizer=self.tokenizer,
            device=self.device,
            attn_res=self.attn_res,
            extra_kwargs=extra_kwargs,
        )
        assert self.controller is not None
        assert issubclass(type(self.controller), AttentionControl)
        self.register_attention_control(
            self.controller,
        )  # add attention controller

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            prompt_2,
            height,
            width,
            callback_steps,
            negative_prompt,
            negative_prompt_2,
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]
        # batch_size = batch_size + 1 if embedding_opt is not None else batch_size

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        text_encoder_lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = self.encode_prompt(
            prompt=prompt,
            prompt_2=prompt_2,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            negative_prompt_2=negative_prompt_2,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )
        latents[1] = latents[0]

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Prepare added time ids & embeddings
        add_text_embeds = pooled_prompt_embeds
        add_time_ids = self._get_add_time_ids(
            original_size,
            crops_coords_top_left,
            target_size,
            dtype=prompt_embeds.dtype,
            text_encoder_projection_dim=self.text_encoder_2.config.projection_dim,  # if none should be changed to enc1
        )
        if negative_original_size is not None and negative_target_size is not None:
            negative_add_time_ids = self._get_add_time_ids(
                negative_original_size,
                negative_crops_coords_top_left,
                negative_target_size,
                dtype=prompt_embeds.dtype,
            )
        else:
            negative_add_time_ids = add_time_ids

        # PartEdit:
        prompt_embeds = self.process_embeddings(embedding_opt, prompt_embeds, self.controller.pad_strategy)
        self.prompt_embeds = prompt_embeds

        if do_classifier_free_guidance:
            _og_prompt_embeds = prompt_embeds.clone()
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)

        prompt_embeds = prompt_embeds.to(device)
        add_text_embeds = add_text_embeds.to(device)
        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

        # 8. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        # 7.1 Apply denoising_end
        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
            discrete_timestep_cutoff = int(round(self.scheduler.config.num_train_timesteps - (denoising_end * self.scheduler.config.num_train_timesteps)))
            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
            timesteps = timesteps[:num_inference_steps]
        # PartEdit
        if hasattr(self, "debug_list"):  # if its disabled and there was a list
            del self.debug_list
        if extra_kwargs.debug_vis:
            self.debug_list = []
        if add_extra_step:
            num_inference_steps += 1
            timesteps = torch.cat([timesteps[[0]], timesteps], dim=-1)
            _latents = latents.clone()

        self._num_timesteps = len(timesteps)  # Same as in SDXL
        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # if i in range(50):
                #     latents[0] = latents_list[i]
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # NOTE(Alex): Null text inversion usage
                if uncond_embeds is not None:
                    # if callback_on_step_end is not None and self.warn_once_callback:
                    #     self.warn_once_callback = False
                    #     logger.warning("Callback on step end is not supported with Null text inversion - Know what you are doing!")
                    _indx_to_use = i if i < len(uncond_embeds) else len(uncond_embeds) - 1  # use last if we have extra steps
                    # _og_prompt_embeds
                    curr = uncond_embeds[_indx_to_use].to(dtype=prompt_embeds.dtype).to(device).repeat(_og_prompt_embeds.shape[0], 1, 1)
                    prompt_embeds = torch.cat([curr, _og_prompt_embeds], dim=0)  # For now not changing the pooled prompt embeds
                    # if prompt_embeds.shape != (2, 77, 2048):
                    #     print(f"Prompt Embeds should be of shape (2, 77, 2048), got {prompt_embeds.shape}")

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    added_cond_kwargs=added_cond_kwargs,
                ).sample

                if add_extra_step:  # PartEdit
                    latents = _latents.clone()
                    add_extra_step = False
                    progress_bar.update()
                    self.scheduler._init_step_index(t)
                    continue  # we just wanted the unet, not to do the step

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    # gs = torch.tensor([guidance_scale] * len(noise_pred_uncond),
                    #                   device=noise_pred.device, dtype= noise_pred.dtype).view(-1, 1, 1, 1)
                    # gs[0] = 7.5
                    # our_gs = torch.FloatTensor([1.0, guidance_scale, 1.0]).view(-1, 1, 1, 1).to(latents.device, dtype=latents.dtype)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                if do_classifier_free_guidance and guidance_rescale > 0.0:
                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

                # compute the previous noisy sample x_t -> x_t-1 # synth
                latents = self.scheduler.step(
                    noise_pred, t, latents, **extra_step_kwargs
                )
                # inv
                # latents = self.scheduler.step(noise_pred, t, latents, variance_noise=zs[i], **extra_step_kwargs)

                if extra_kwargs.debug_vis:  # PartEdit
                    # Could be removed, with .prev_sample above
                    self.debug_list.append(latents.pred_original_sample.cpu())

                latents = latents.prev_sample  # Needed here because of logging above

                # step callback
                latents = self.controller.step_callback(latents)

                # Note(Alex): Copied from SDXL
                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
                    negative_pooled_prompt_embeds = callback_outputs.pop("negative_pooled_prompt_embeds", negative_pooled_prompt_embeds)
                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
                if embedding_opt is not None:  # PartEdit
                    us_dx = 0
                    if i == 0 and us_dx != 0:
                        print(f'Using lantents[{us_dx}] instead of latents[0]')
                    latents[-1:] = latents[us_dx]  # always tie the diff process
                # if embedding_opt is not None and callback_on_step_end is not None and \
                # callback_on_step_end.reversed_latents is not None:
                #     latents[-1:] = callback_on_step_end.reversed_latents[i]

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

        # 8. Post-processing
        if output_type == "latent":
            image = latents
        else:
            self.final_map = self.controller.visualize_final_map(False)
            # Added to support lower VRAM gpus
            self.controller.offload_stores(torch.device("cpu"))
            image = self.latent2image(latents, device, output_type, force_upcast=False)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return image

        self.grid = self.visualize_maps()
        # Disable editing in case of
        self.unregister_attention_control()

        # Did not add NSFW output as it is not part of XLPipelineOuput
        return StableDiffusionXLPipelineOutput(images=image)

    @torch.no_grad()
    def latent2image(
        self: PartEditPipeline,
        latents: torch.Tensor,
        device: torch.device,
        output_type: str = "pil",  # ['latent', 'pt', 'np', 'pil']
        force_upcast: bool = False,
    ) -> Union[torch.Tensor, np.ndarray, Image.Image]:
        """Decode latents with the VAE and post-process the result into images.

        Args:
            latents: Latents to decode. They are moved to ``device``, cast to the
                VAE's parameter dtype and divided by ``vae.config.scaling_factor``.
            device: Device used for decoding and for the safety checker.
            output_type: Output format forwarded to ``image_processor.postprocess``.
            force_upcast: Upcast the VAE even if its dtype/config would not
                require it. In that case the VAE is NOT cast back afterwards.

        Returns:
            The post-processed images; for the "pt" and "latent" output types the
            result is moved to the CPU before being returned.
        """
        # The VAE overflows in float16, so decode in float32 when the config asks for it (or when forced).
        needs_upcasting = (self.vae.dtype == torch.float16 and self.vae.config.force_upcast) or force_upcast
        latents = latents.to(device)
        if needs_upcasting:
            self.upcast_vae()
        # Follow whatever dtype the VAE weights have at this point.
        latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        # cast back to fp16 if needed (a forced upcast is left in place)
        if needs_upcasting and not force_upcast:
            self.vae.to(dtype=torch.float16)
        image, has_nsfw_concept = self.run_safety_checker(image, device, latents.dtype)

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            # Flagged images are passed through without denormalization.
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
            if not all(do_denormalize):
                logger.warning(
                    "NSFW detected in the following images: %s",
                    ", ".join([f"image {i + 1}" for i, has_nsfw in enumerate(has_nsfw_concept) if has_nsfw]),
                )
        if self.watermark is not None:
            image = self.watermark.apply_watermark(image)
        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
        if output_type in ["pt", "latent"]:
            image = image.cpu()
        return image

    def run_safety_checker(self, image: Union[np.ndarray, torch.Tensor], device: torch.device, dtype: type):
        """Run the optional safety checker over decoded images.

        Returns ``(image, has_nsfw_concept)``. When no safety checker is
        configured the image is returned untouched and the flag is ``None``.
        """
        if self.safety_checker is None:
            return image, None

        # The feature extractor is fed PIL images, whichever form the input has.
        if torch.is_tensor(image):
            pil_images = self.image_processor.postprocess(image, output_type="pil")
        else:
            pil_images = self.image_processor.numpy_to_pil(image)

        clip_features = self.feature_extractor(pil_images, return_tensors="pt").to(device)
        clip_pixels = clip_features.pixel_values.to(dtype)
        image, has_nsfw_concept = self.safety_checker(images=image, clip_input=clip_pixels)
        return image, has_nsfw_concept

    def register_attention_control(self, controller):
        """Install a ``PartEditCrossAttnProcessor`` on every UNet attention layer.

        Each processor name found in ``self.unet.attn_processors`` that starts
        with ``mid_block``, ``up_blocks`` or ``down_blocks`` gets a processor
        tagged with its location ("mid", "up" or "down"); other names are
        skipped. The number of installed processors is stored on
        ``controller.num_att_layers``.

        Args:
            controller: Object handed to every processor; its ``num_att_layers``
                attribute is overwritten here.
        """
        attn_procs = {}
        self.attn_names = {}  # Name => Idx (reset here; not populated by this method)
        for name in self.unet.attn_processors:
            if name.startswith("mid_block"):
                place_in_unet = "mid"
            elif name.startswith("up_blocks"):
                place_in_unet = "up"
            elif name.startswith("down_blocks"):
                place_in_unet = "down"
            else:
                continue
            attn_procs[name] = PartEditCrossAttnProcessor(controller=controller, place_in_unet=place_in_unet)

        self.unet.set_attn_processor(attn_procs)
        # Counts every replaced processor, self- and cross-attention alike.
        controller.num_att_layers = len(attn_procs)

    def unregister_attention_control(self):
        """Restore the stock attention processor and drop the attached controller.

        If the controller exposes ``last_otsu``, its last entry is kept on
        ``self.last_otsu_value`` before the controller attribute is deleted.
        """
        # AttnProcessor2_0 is the PyTorch >= 2.0 processor
        self.unet.set_attn_processor(AttnProcessor2_0())

        controller = getattr(self, "controller", None)
        if controller is None:
            return
        if hasattr(controller, "last_otsu"):
            self.last_otsu_value = controller.last_otsu[-1]
        del self.controller

    def available_params(self) -> str:
        """Return help text for the pipeline-level arguments followed by the `extra_kwargs` help."""
        # Built line by line so the exact whitespace of the help text is explicit.
        header_lines = (
            "",
            "        Pipeline Parameters: ",
            "            embedding_opt (`Union[torch.FloatTensor, str]`, *optional*): The embedding to be inserted in the prompt. The embedding",
            "                will be inserted as third batch dimension.",
            "            extra_kwargs (`dict`, *optional*): A dictionary with extra parameters to be passed to the pipeline. ",
            "                - Check `pipe.part_edit_available_params()` for the available parameters.",
            "        ",
        )
        return "\n".join(header_lines) + "\n" + self.part_edit_available_params()

    def process_embeddings(
        self,
        embedding_opt: Optional[Union[torch.FloatTensor, str]],
        prompt_embeds: torch.FloatTensor,
        padd_strategy: PaddingStrategy,
    ) -> torch.Tensor:
        """Method-style entry point that forwards to the module-level ``process_embeddings``."""
        # Inside a method body the bare name resolves to the module-level function, not to this method.
        updated_embeds = process_embeddings(embedding_opt, prompt_embeds, padd_strategy)
        return updated_embeds

    def part_edit_available_params(self) -> str:
        """Return the explanation text for the `extra_kwargs` options, as produced by ``DotDictExtra.explain``."""
        default_kwargs = DotDictExtra()
        return default_kwargs.explain()

    # def run_sa

    def visualize_maps(self, make_grid_kwargs: dict = None):
        """Return the attention-map visualization from the controller, or the cached grid.

        With a live controller the visualization is built by
        ``controller.visualize_maps_agg``; otherwise ``self.grid`` is returned
        when it exists, else ``None``.
        """
        controller = getattr(self, "controller", None)
        if controller is not None:
            return controller.visualize_maps_agg(
                controller.use_agg_store,
                make_grid_kwargs=make_grid_kwargs,
            )
        return getattr(self, "grid", None)

    def visualize_map_across_time(self):
        """Return the single-mask visualization of the attention maps.

        The cached ``self.final_map`` is preferred. Without a cache the map is
        requested from the controller; if no controller is attached either
        (it is deleted by ``unregister_attention_control``), ``None`` is
        returned instead of raising ``AttributeError``.
        """
        cached_map = getattr(self, "final_map", None)
        if cached_map is not None:
            return cached_map

        controller = getattr(self, "controller", None)
        if controller is None:
            return None
        return controller.visualize_final_map(controller.use_agg_store)

def process_embeddings(
    embedding_opt: Optional[Union[torch.Tensor, str]],
    prompt_embeds: torch.Tensor,
    padd_strategy: PaddingStrategy,
) -> torch.Tensor:
    """Write optimized token embeddings into the last batch entry of ``prompt_embeds``.

    Args:
        embedding_opt: ``None`` (no-op), a tensor, a file path, or a list of
            file paths whose loaded tensors are concatenated along dim 0.
            Paths containing "safetensors" are read with ``load_file`` (key
            "embedding"); any other path goes through ``torch.load``.
        prompt_embeds: Prompt embeddings. They are modified IN PLACE (last
            batch entry), except in the norm+scale branch, which returns a
            new tensor.
        padd_strategy: Decides what fills the token positions after the
            inserted embeddings (``context`` leaves them untouched).

    Returns:
        The updated prompt embeddings.

    Raises:
        ValueError: if the padding strategy is not recognized.
    """
    if embedding_opt is None:
        return prompt_embeds
    assert isinstance(padd_strategy, PaddingStrategy), f"padd_strategy must be of type PaddingStrategy, got {type(padd_strategy)}"

    def _read_embedding(path):
        # NOTE(review): torch.load unpickles arbitrary objects -- only point this at trusted files.
        if "safetensors" in path:
            return load_file(path)["embedding"]
        return torch.load(path)

    if isinstance(embedding_opt, str):
        embedding_opt = _read_embedding(embedding_opt)
    elif isinstance(embedding_opt, list):
        embedding_opt = torch.cat([_read_embedding(path) for path in embedding_opt], dim=0)
        print(f'Embedding Opt shape: {embedding_opt.shape=}')

    embedding_opt = embedding_opt.to(device=prompt_embeds.device, dtype=prompt_embeds.dtype)
    if embedding_opt.ndim == 2:
        # add the missing batch dimension
        embedding_opt = embedding_opt[None]
    num_embeds = embedding_opt.shape[1]  # BG + Num of classes
    prompt_embeds[-1:, :num_embeds, :] = embedding_opt[:, :num_embeds, :]

    if PaddingStrategy.context == padd_strategy:
        # keep the original context tokens after the inserted ones
        return prompt_embeds
    if not hasattr(padd_strategy, "norm") or not hasattr(padd_strategy, "scale"):
        raise ValueError(f"PaddingStrategy with {padd_strategy} not recognized")
    use_norm, use_scale = padd_strategy.norm, padd_strategy.scale

    # Pick the value that fills every position after the inserted tokens.
    if padd_strategy == PaddingStrategy.BG:
        pad_value = embedding_opt[:, :1, :]
    elif padd_strategy == PaddingStrategy.EOS:
        pad_value = prompt_embeds[-1:, -1:, :]
    elif padd_strategy == PaddingStrategy.ZERO:
        pad_value = 0.0
    elif padd_strategy == PaddingStrategy.SOT_E:
        pad_value = prompt_embeds[-1:, :1, :]
    else:
        raise ValueError(f"{padd_strategy} not recognized")
    prompt_embeds[-1:, num_embeds:, :] = pad_value

    # Not recommended
    if use_norm:
        prompt_embeds[-1:, :, :] = F.normalize(prompt_embeds[-1:, :, :], p=2, dim=-1)
    if not use_scale:
        return prompt_embeds

    eps = 1e-8
    # Value range of the first batch entry is used as the reference range.
    ref_min, ref_max = prompt_embeds[:1].min(), prompt_embeds[:1].max()
    if use_norm:
        return (prompt_embeds - ref_min) / (ref_max - ref_min + eps)

    # Rescale only the padded tail: first to [0, 1], then to the reference range.
    tail_min = prompt_embeds[-1:, num_embeds:, :].min()
    tail_max = prompt_embeds[-1:, num_embeds:, :].max()
    prompt_embeds[-1:, num_embeds:, :] = (prompt_embeds[-1:, num_embeds:, :] - tail_min) / (tail_max - tail_min + eps)
    prompt_embeds[-1:, num_embeds:, :] = prompt_embeds[-1:, num_embeds:, :] * (ref_max - ref_min + eps) + ref_min
    return prompt_embeds

# Depends on layers used to train with:
# indices 24..59 first, followed by 0..3 (the order is part of the definition).
LAYERS_TO_USE = [*range(24, 60), *range(4)]


class Binarization(Enum):
    """Controls the binarization of attn maps
    in case of use_otsu lower_binarize and upper_binarize are multipliers of otsu threshold

    Each member is declared as a tuple; the first element becomes the enum
    value and the remaining ones are stored as attributes on the member.

    args:
        strategy: str: name of the strategy
        enabled: bool: if binarization is enabled
        lower_binarize: float: lower threshold for binarization
        upper_binarize: float: upper threshold for binarization
        use_otsu: bool: if otsu is used for binarization
    """

    P2P = "p2p", False, 0.5, 0.5, False  # Baseline
    PROVIDED_MASK = "mask", True, 0.5, 0.5, False
    BINARY_0_5 = "binary_0.5", True, 0.5, 0.5, False
    BINARY_OTSU = "binary_otsu", True, 1.0, 1.0, True
    PARTEDIT = "partedit", True, 0.5, 1.5, True
    DISABLED = "disabled", False, 0.5, 0.5, False

    def __new__(
        cls,
        strategy: str,
        enabled: bool,
        lower_binarize: float,
        upper_binarize: float,
        use_otsu: bool,
    ) -> "Binarization":
        obj = object.__new__(cls)
        obj._value_ = strategy
        obj.enabled = enabled
        obj.lower_binarize = lower_binarize
        obj.upper_binarize = upper_binarize
        obj.use_otsu = use_otsu
        # Sanity checks on the member declarations above.
        assert isinstance(obj.enabled, bool), "enabled should be of type bool"
        assert isinstance(obj.lower_binarize, float), "lower_binarize should be of type float"
        assert isinstance(obj.upper_binarize, float), "upper_binarize should be of type float"
        assert isinstance(obj.use_otsu, bool), "use_otsu should be of type bool"
        return obj

    def __eq__(self, other: Optional[Union[Binarization, str]] = None) -> bool:
        """Case-insensitive comparison against another member or a value string."""
        if isinstance(other, Binarization):
            return self.value.lower() == other.value.lower()
        if isinstance(other, str):
            return self.value.lower() == other.lower()
        # Anything else (None, numbers, ...) never compares equal.
        return False

    def __hash__(self) -> int:
        # Defining __eq__ alone would set __hash__ to None and make members
        # unusable as dict keys / set items. Hash the same normalized value
        # that __eq__ compares so equal members hash equally.
        return hash(self.value.lower())

    @staticmethod
    def available_strategies() -> List[str]:
        """Return the member names in declaration order."""
        return [strategy.name for strategy in Binarization]

    def __str__(self) -> str:
        return f"Binarization: {self.name} (Enabled: {self.enabled} Lower: {self.lower_binarize} Upper: {self.upper_binarize} Otsu: {self.use_otsu})"

    @staticmethod
    def from_string(
        strategy: str,
        enabled: Optional[bool] = None,
        lower_binarize: Optional[float] = None,
        upper_binarize: Optional[float] = None,
        use_otsu: Optional[bool] = None,
    ) -> Binarization:
        """Look up a member by its NAME (case-insensitive, surrounding whitespace ignored).

        Any override that is not ``None`` is written onto the returned member.
        NOTE: members are singletons, so such overrides are visible to every
        other user of that member for the rest of the process.

        Raises:
            ValueError: if no member has the given name.
        """
        strategy = strategy.strip().lower()
        for _strategy in Binarization:
            if _strategy.name.lower() != strategy:
                continue
            if enabled is not None:
                _strategy.enabled = enabled
            if lower_binarize is not None:
                _strategy.lower_binarize = lower_binarize
            if upper_binarize is not None:
                _strategy.upper_binarize = upper_binarize
            if use_otsu is not None:
                _strategy.use_otsu = use_otsu
            return _strategy
        raise ValueError(f"binarization_strategy={strategy} not recognized")


class PaddingStrategy(Enum):
    """How the token positions after the inserted embeddings are filled.

    Each member is declared as ``(value, norm, scale)``; the first element
    becomes the enum value, ``norm`` and ``scale`` are stored as attributes.
    """

    # Default
    BG = "BG", False, False
    # Others added just for experimentation reasons
    context = "context", False, False
    EOS = "EoS", False, False
    ZERO = "zero", False, False
    SOT_E = "SoT_E", False, False

    def __new__(cls, strategy: str, norm: bool, scale: bool) -> "PaddingStrategy":
        obj = object.__new__(cls)
        obj._value_ = strategy
        obj.norm = norm
        obj.scale = scale
        return obj

    # compare based on value
    def __eq__(self, other: Optional[Union[PaddingStrategy, str]] = None) -> bool:
        """Case-insensitive comparison against another member or a value string."""
        if isinstance(other, PaddingStrategy):
            return self.value.lower() == other.value.lower()
        if isinstance(other, str):
            return self.value.lower() == other.lower()
        # Anything else (None, numbers, ...) never compares equal.
        return False

    def __hash__(self) -> int:
        # Defining __eq__ alone would set __hash__ to None and make members
        # unhashable; hash the same normalized value that __eq__ compares.
        return hash(self.value.lower())

    @staticmethod
    def available_strategies() -> List[str]:
        """Return the member names in declaration order."""
        return [strategy.name for strategy in PaddingStrategy]

    def __str__(self) -> str:
        return f"PaddStrategy: {self.name} Norm: {self.norm} Scale: {self.scale}"

    @staticmethod
    def from_string(strategy_str, norm: Optional[bool] = False, scale: Optional[bool] = False) -> "PaddingStrategy":
        """Look up a member by its NAME (case-insensitive).

        ``norm``/``scale`` are written onto the returned member unless they
        are ``None``; with the defaults this resets both flags to ``False``.
        NOTE: members are singletons, so the flags are shared process-wide.

        Raises:
            ValueError: if no member has the given name.
        """
        wanted = strategy_str.lower()
        for strategy in PaddingStrategy:
            if strategy.name.lower() != wanted:
                continue
            if norm is not None:
                strategy.norm = norm
            if scale is not None:
                strategy.scale = scale
            return strategy
        # Report the requested string, not the last member the loop visited.
        raise ValueError(f"padd_strategy={strategy_str} not recognized")


class DotDictExtra(dict):
    """
    Dictionary with dot.notation access that holds the `extra_kwargs` options.

    Attribute reads that miss fall back to ``dict.get`` (so unknown keys read as
    ``None``), attribute writes/deletes go to the dictionary itself. Every key of
    ``_default`` that was not supplied is filled in on construction.
    """

    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

    _layers_to_use = LAYERS_TO_USE  # Training parameter, not exposed directly
    _enable_non_agg_storing = False  # Useful for visualization but very VRAM heavy! ~35GB without offload 14GB with offload
    _cpu_offload = False  # Lowers VRAM but slows down drastically, hidden
    _default = {
        "th_strategy": Binarization.PARTEDIT,
        "pad_strategy": PaddingStrategy.BG,
        "omega": 1.5,  # values should be between 0.25 and 2.0
        "use_agg_store": False,
        "edit_mask": None,
        "edit_steps": 50,  # End at this step
        "start_editing_at": 0,  # Recommended, but exposed in case of wanting to change
        "use_layer_subset_idx": None,  # In case we want to use specific layers, NOTE: order not aligned with UNet layers
        "add_extra_step": False,
        "batch_indx": -1,  # assume last batch
        "blend_layers": None,
        "force_cross_attn": False,  # Force cross attention to maps
        # Optimization stuff
        "VRAM_low": True,  # Leave on by default, except if causing errors
        "grounding": None,
    }
    _default_explanations = {
        "th_strategy": "Binarization strategy for attention maps",
        "pad_strategy": "Padding strategy for the added tokens",
        "omega": "Omega value for the PartEdit",
        "use_agg_store": "If the attention maps should be aggregated",
        "add_extra_step": "If extra 0 step should be added to the diffusion process",
        "edit_mask": "Mask for the edit when using ProvidedMask strategy",
        "edit_steps": "Number of edit steps",
        "start_editing_at": "Step at which the edit should start",
        "use_layer_subset_idx": "Sublayers to use, recommended 0-8 if really needed to use some",
        "VRAM_low": "Recommended to not change",
        "force_cross_attn": "Force cross attention to use OPT token maps",
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Fill in every option the caller did not provide.
        for key, value in self._default.items():
            self.setdefault(key, value)

        # Extra changes to Binarization, PaddingStrategy: string shorthands become enum members
        if isinstance(self["th_strategy"], str):
            self["th_strategy"] = Binarization.from_string(self["th_strategy"])
        if isinstance(self["pad_strategy"], str):
            self["pad_strategy"] = PaddingStrategy.from_string(self["pad_strategy"])
        # The optional extra step extends the step at which editing ends.
        self["edit_steps"] = self["edit_steps"] + self["add_extra_step"]

        edit_mask = self.edit_mask
        if edit_mask is not None:
            if isinstance(edit_mask, str):
                edit_mask = self._read_mask_file(edit_mask)
            self.edit_mask = self._as_unit_mask(edit_mask)

        # same as above, but without the load-from-path step
        grounding = self.grounding
        if grounding is not None:
            self.grounding = self._as_unit_mask(grounding)

        assert isinstance(self.th_strategy, Binarization), "th_strategy should be of type Binarization"
        assert isinstance(self.pad_strategy, PaddingStrategy), "pad_strategy should be of type PaddingStrategy"

    @staticmethod
    def _read_mask_file(path):
        """Load a mask from disk: safetensors / torch checkpoint (key "edit_mask") or any PIL-readable image."""
        if path.endswith(".safetensors"):
            return load_file(path)["edit_mask"]
        if path.endswith(".pt"):
            # NOTE(review): torch.load unpickles arbitrary objects -- only use trusted files.
            return torch.load(path)["edit_mask"]
        return Image.open(path)

    @staticmethod
    def _as_unit_mask(mask):
        """Convert a PIL image / numpy array / tensor to a 4-D tensor whose maximum is at most 1."""
        if isinstance(mask, Image.Image):
            mask = ToTensor()(mask.convert("L"))
        elif isinstance(mask, np.ndarray):
            mask = torch.from_numpy(mask).unsqueeze(0)

        # Add leading singleton dimensions until the mask is 4-D.
        if mask.ndim == 2:
            mask = mask[None, None, ...]
        elif mask.ndim == 3:
            mask = mask[None, ...]

        if mask.max() > 1.0:
            mask = mask / mask.max()
        return mask

    def th_from_str(self, strategy: str):
        """Resolve a binarization strategy name through ``Binarization.from_string``."""
        return Binarization.from_string(strategy)

    @staticmethod
    def explain() -> str:
        """Returns a string with all the explanations of the parameters"""
        hidden = "Recommended to not change"
        explanations = DotDictExtra._default_explanations
        lines = []
        for key in DotDictExtra._default:
            # Options without an explanation, or explicitly marked hidden, are left out.
            text = explanations.get(key, hidden)
            if text != hidden:
                lines.append(f"{key}: {text}")
        return "\n".join(lines)


def pack_interpolate_unpack(att, size, interpolation_mode, unwrap_last_dim=True, rewrap=False):
    """Resize flattened attention maps to ``size`` with ``F.interpolate``.

    The last dimension is treated as a token axis when its length is 77 or 1;
    otherwise it is treated as the flattened (square) spatial axis.

    Args:
        att: Attention tensor. With a token axis the second-to-last dimension
            is unflattened into ``sq x sq``; otherwise the last one is.
        size: Target ``(h, w)``; nothing is interpolated if it already matches.
        interpolation_mode: ``mode`` argument for ``F.interpolate``.
        unwrap_last_dim: Unflatten the spatial axis before resizing.
        rewrap: Flatten the spatial axes again before returning.

    Returns (per the original notes):
        rewrap True:  B x heads x D, or B x heads x D x N with a token axis
        rewrap False: B x heads x H x W, or B x N x heads x H x W with a token axis
    """
    token_dim = att.shape[-1]
    has_last_dim = token_dim in [77, 1]

    if unwrap_last_dim:
        if has_last_dim:
            side = int(att.shape[-2] ** 0.5)
            # B x H x W x D => B x D x H x W
            att = att.reshape(att.shape[0], side, side, -1).permute(0, 3, 1, 2)
        else:
            side = int(token_dim ** 0.5)
            # B x ... x (H*W) => B x ... x H x W
            att = att.reshape(*att.shape[:-1], side, side)

    att = att.unsqueeze(-3)  # add a channel dimension
    if att.shape[-2:] != size:
        # Collapse all leading dims so interpolate sees a plain N x C x H x W batch.
        packed, packed_shapes = einops.pack(att, "* c h w")
        resized = F.interpolate(packed, size=size, mode=interpolation_mode)
        att = torch.stack(einops.unpack(resized, packed_shapes, "* c h w"))

    if not rewrap:
        return att

    batch = att.shape[0]
    flat_pixels = att.shape[-1] * att.shape[-1]
    if has_last_dim:
        return att.reshape(batch, -1, flat_pixels, token_dim)
    return att.reshape(batch, -1, flat_pixels)


@torch.no_grad()
def threshold_otsu(image: torch.Tensor = None, nbins=256, hist=None):
    """Return threshold value based on Otsu's method using PyTorch.
    This is a reimplementation from scikit-image
    https://github.com/scikit-image/scikit-image/blob/b76ff13478a5123e4d8b422586aaa54c791f2604/skimage/filters/thresholding.py#L336

    Args:
    image: torch.Tensor
        Grayscale input image. May be None when ``hist`` is given.
    nbins: int
        Number of bins used to calculate histogram.
    hist: torch.Tensor or tuple
        Histogram of the input image. If None, it will be calculated using the input image.
    Returns
    -------
    threshold : float
        Upper threshold value. All pixels with an intensity higher than
        this value are assumed to be foreground.
    Raises
    ------
    ValueError
        If neither ``image`` nor ``hist`` is given, or if ``image`` has more
        than two dimensions and a last dimension of 3 or 4 (looks like RGB/RGBA).
    """
    if image is None and hist is None:
        raise ValueError("Either image or hist must be provided.")

    if image is not None:
        if image.dim() > 2 and image.shape[-1] in (3, 4):
            raise ValueError(f"threshold_otsu is expected to work correctly only for grayscale images; image shape {image.shape} looks like that of an RGB image.")

        # Check if the image has more than one intensity value; if not, return that value
        # (reshape instead of view so non-contiguous inputs work too).
        first_pixel = image.reshape(-1)[0]
        if torch.all(image == first_pixel):
            return first_pixel.item()

    # nbins stays a plain integer: the histogram helpers only need a bin count.
    counts, bin_centers = _validate_image_histogram(image, hist, int(nbins))

    # class probabilities for all possible thresholds
    weight1 = torch.cumsum(counts, dim=0)
    weight2 = torch.cumsum(counts.flip(dims=[0]), dim=0).flip(dims=[0])
    # class means for all possible thresholds
    mean1 = torch.cumsum(counts * bin_centers, dim=0) / weight1
    mean2 = (torch.cumsum((counts * bin_centers).flip(dims=[0]), dim=0).flip(dims=[0])) / weight2

    # Clip ends to align class 1 and class 2 variables:
    # The last value of ``weight1``/``mean1`` should pair with zero values in
    # ``weight2``/``mean2``, which do not exist.
    variance12 = weight1[:-1] * weight2[1:] * (mean1[:-1] - mean2[1:]) ** 2

    # The threshold is the bin center that maximizes the between-class variance.
    idx = torch.argmax(variance12)
    threshold = bin_centers[idx]

    return threshold.item()


def _validate_image_histogram(image: torch.Tensor, hist, nbins):
    """Helper function to validate and compute histogram if necessary.

    Args:
        image: Image to build the histogram from; only used when ``hist`` is None.
        hist: Either a ``(counts, bin_centers)`` 2-tuple, or just the counts
            (bin centers are then spread evenly over [0, 1]).
        nbins: Number of bins for the computed histogram.

    Returns:
        ``(counts, bin_centers)`` as tensors.

    Raises:
        ValueError: if both ``image`` and ``hist`` are None.
    """
    if hist is not None:
        if isinstance(hist, tuple) and len(hist) == 2:
            counts, bin_centers = hist
            if not (isinstance(counts, torch.Tensor) and isinstance(bin_centers, torch.Tensor)):
                # as_tensor converts lists/arrays and passes existing tensors through
                # without the copy-construct warning torch.tensor() would emit.
                counts = torch.as_tensor(counts)
                bin_centers = torch.as_tensor(bin_centers)
        else:
            counts = torch.as_tensor(hist)
            # Keep the centers next to the counts so later arithmetic does not mix devices.
            bin_centers = torch.linspace(0, 1, len(counts), device=counts.device)
        return counts, bin_centers

    if image is None:
        raise ValueError("Either image or hist must be provided.")
    image = image.to(torch.float32)
    counts, bin_edges = histogram(image, nbins)
    # Midpoint of each pair of neighbouring edges.
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    return counts, bin_centers


def histogram(xs: torch.Tensor, bins):
    """Return ``(counts, bin_edges)`` over the value range of ``xs``.

    Like torch.histogram, but works with cuda
    https://github.com/pytorch/pytorch/issues/69519#issuecomment-1183866843
    """
    lowest = xs.min()
    highest = xs.max()
    bin_counts = torch.histc(xs, bins, min=lowest, max=highest)
    bin_edges = torch.linspace(lowest, highest, bins + 1, device=xs.device)
    return bin_counts.to(xs.device), bin_edges


# Modification of the original from
# https://github.com/google/prompt-to-prompt/blob/9c472e44aa1b607da59fea94820f7be9480ec545/prompt-to-prompt_stable.ipynb
def aggregate_attention(
    attention_store: AttentionStore,
    res: int,
    batch_size: int,
    from_where: List[str],
    is_cross: bool,
    upsample_everything: int = None,
    return_all_layers: bool = False,
    use_same_layers_as_train: bool = False,
    train_layers: Optional[list[int]] = None,
    use_layer_subset_idx: list[int] = None,
    use_step_store: bool = False,
):
    """Collect the stored attention maps of resolution ``res`` and average them over layers.

    Args:
        attention_store: Provides ``get_average_attention(use_step_store)``, a
            mapping from "<location>_cross"/"<location>_self" to lists of maps.
        res: Spatial side length; only maps with ``res**2`` pixels are kept.
        batch_size: Leading dimension used when reshaping each map.
        from_where: Locations to read (the key prefixes).
        is_cross: Read cross-attention maps if True, self-attention otherwise.
        upsample_everything: If truthy, every map is first resized to ``res``.
        return_all_layers: Return the stacked per-layer maps instead of their mean.
        use_same_layers_as_train: For cross-attention, resize every map and
            (when ``train_layers`` is given) reorder/select layers by it.
        train_layers: Layer indices applied after collection.
        use_layer_subset_idx: Further selection applied after ``train_layers``.
        use_step_store: Forwarded to ``get_average_attention``.

    Returns:
        The stacked maps (layers first) or their average over the layer axis.
    """
    attention_maps = attention_store.get_average_attention(use_step_store)
    num_pixels = res**2
    kind = "cross" if is_cross else "self"
    resize_first = upsample_everything or (use_same_layers_as_train and is_cross)

    collected = []
    for location in from_where:
        for item in attention_maps[f"{location}_{kind}"]:
            if resize_first:
                item = pack_interpolate_unpack(item, (res, res), "bilinear", rewrap=True)
            if item.shape[-2] != num_pixels:
                continue
            # leading axis of size 1 = layer axis, concatenated below
            collected.append(item.reshape(batch_size, -1, res, res, item.shape[-1])[None])

    if is_cross and use_same_layers_as_train and train_layers is not None:
        collected = [collected[i] for i in train_layers]
        if use_layer_subset_idx is not None:  # after correct ordering
            collected = [collected[i] for i in use_layer_subset_idx]

    layer_axis = 0
    stacked = torch.cat(collected, dim=layer_axis)
    if return_all_layers:
        return stacked
    return stacked.sum(layer_axis) / stacked.shape[layer_axis]


def min_max_norm(a, _min=None, _max=None, eps=1e-6):
    """Rescale ``a`` by ``(a - min) / (max - min + eps)``.

    ``_min``/``_max`` default to the minimum/maximum of ``a`` itself; ``eps``
    keeps the denominator away from zero for constant inputs.
    """
    if _max is None:
        _max = a.max()
    if _min is None:
        _min = a.min()
    value_range = _max - _min + eps
    return (a - _min) / value_range


# Copied from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L209
class LocalBlend:
    """Local blending on the latent level, driven by cross-attention maps.

    A mask is built from the attention given to the selected words and only the
    masked region of each latent is allowed to differ from the first latent.
    """

    # NOTE(Alex): Copied over for LocalBlend
    def __init__(
        self,
        prompts: List[str],
        words: List[List[str]],
        tokenizer,
        device,
        threshold=0.3,
        attn_res=None,
    ):
        """
        Args:
            prompts: One prompt per batch entry.
            words: For every prompt, the word (or list of words) to blend on.
            tokenizer: Passed to ``get_word_inds`` to locate the words' token indices.
            device: Device for the word-selection tensor.
            threshold: Cut-off applied to the normalized attention mask.
            attn_res: ``(h, w)`` of the attention maps that are used.
        """
        self.max_num_words = 77
        self.attn_res = attn_res

        alpha_layers = torch.zeros(len(prompts), 1, 1, 1, 1, self.max_num_words)
        for i, (prompt, words_) in enumerate(zip(prompts, words)):
            if isinstance(words_, str):
                words_ = [words_]
            for word in words_:
                ind = get_word_inds(prompt, word, tokenizer)
                alpha_layers[i, :, :, :, :, ind] = 1
        self.alpha_layers = alpha_layers.to(device)  # a one-hot vector where the 1s are the words we modify (source and target)
        self.threshold = threshold

    def __call__(self, x_t, attention_store):
        """Blend ``x_t`` so that it only differs from ``x_t[:1]`` inside the attention mask.

        Args:
            x_t: Latents, batch first.
            attention_store: Mapping with "down_cross", "mid_cross" and "up_cross"
                lists of attention maps.

        Returns:
            The blended latents, in the dtype of ``x_t``.
        """
        # note that this code works on the latent level!
        k = 1
        # Only maps whose second dim equals attn_res[0] * attn_res[1] are used.
        # NOTE(Alex): This would require activating saving of the attention maps (change in DotDictExtra _enable_non_agg_storing)
        # NOTE(Alex): Alternative is to use aggregate masks like in other examples
        num_pixels = self.attn_res[0] * self.attn_res[1]
        stored = attention_store["down_cross"] + attention_store["mid_cross"] + attention_store["up_cross"]
        maps = [
            item.reshape(
                self.alpha_layers.shape[0],
                -1,
                1,
                self.attn_res[0],
                self.attn_res[1],
                self.max_num_words,
            )
            for item in stored
            if item.shape[1] == num_pixels
        ]
        maps = torch.cat(maps, dim=1)
        # since alpha_layers is all 0s except where we edit, the product zeroes out all but what we change.
        # Then, the sum adds the values of the original and what we edit.
        # Then, we average across dim=1, which is the number of layers.
        maps = (maps * self.alpha_layers).sum(-1).mean(1)

        mask = F.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k))
        mask = F.interpolate(mask, size=(x_t.shape[2:]))
        # Normalize every mask by its own spatial maximum before thresholding.
        mask = mask / mask.max(2, keepdims=True)[0].max(3, keepdims=True)[0]
        mask = mask.gt(self.threshold)

        # Union of the first prompt's mask with the masks of the others.
        mask = mask[:1] + mask[1:]
        # Follow the latents' dtype: a hard-coded float16 mask would promote
        # bfloat16 latents to float32 in the blend below.
        mask = mask.to(x_t.dtype)
        if mask.shape[0] < x_t.shape[0]:  # PartEdit
            # concat last mask again
            mask = torch.cat([mask, mask[-1:]], dim=0)

        # NOTE(Alex): an alternative (not enabled) is to blend with the provided edit mask or the
        # aggregated attention maps of the AttentionStore: x_t[1:] = mask * x_t[1:] + (1 - mask) * x_t[0]

        # The mask is applied to the difference between the first latent and each latent,
        # so only the masked cells keep their own values.
        x_t = x_t[:1] + mask * (x_t - x_t[:1])
        return x_t


# Copied from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L129
class AttentionControl(abc.ABC):
    """Base class for controllers that are handed every attention map.

    ``__call__`` passes the second half of the leading dimension of each map to
    ``forward`` and maintains the layer / step counters; subclasses implement
    ``forward`` to store or edit the maps.
    """

    def step_callback(self, x_t):
        """Hook for post-processing ``x_t``; the base implementation returns it unchanged."""
        return x_t

    def between_steps(self):
        """Hook run by ``__call__`` each time the layer counter wraps around; no-op here."""
        return

    @property
    def num_uncond_att_layers(self):
        """Number of initial layer calls per step that ``__call__`` leaves untouched (0)."""
        return 0

    @abc.abstractmethod
    def forward(self, attn, is_cross: bool, place_in_unet: str, store: bool = True):
        """Process one attention map and return the (possibly modified) map."""
        raise NotImplementedError

    def __call__(self, attn, is_cross: bool, place_in_unet: str, store: bool = True):
        """Route one attention map through ``forward`` and advance the counters.

        Args:
            attn: Attention map; it is modified in place and also returned.
            is_cross: Forwarded to ``forward``.
            place_in_unet: Forwarded to ``forward``.
            store: Forwarded to ``forward``.

        Returns:
            The same ``attn`` object that was passed in.
        """
        if self.cur_att_layer >= self.num_uncond_att_layers:
            h = attn.shape[0]
            # Only the second half of the leading dimension goes through ``forward``
            # (presumably the conditional half under classifier-free guidance - TODO confirm).
            attn[h // 2 :] = self.forward(attn[h // 2 :], is_cross, place_in_unet, store)
        self.cur_att_layer += 1
        if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
            # All layers of this step were seen: wrap the layer counter and move to the next step.
            self.cur_att_layer = 0
            self.cur_step += 1
            self.between_steps()
        return attn

    def reset(self):
        """Reset the step / layer counters and re-enable editing."""
        self.cur_step = 0
        self.cur_att_layer = 0
        self.allow_edit_control = True

    def __init__(self, attn_res=None, extra_kwargs: DotDictExtra = None):
        """Read the controller options from ``extra_kwargs``.

        Args:
            attn_res: Stored as ``self.attn_res``.
            extra_kwargs: Mapping of options read with ``.get``; ``None`` is
                treated as an empty mapping so every option takes its default.
        """
        # The signature defaults to None; without this fallback the first ``.get``
        # below would raise AttributeError.
        if extra_kwargs is None:
            extra_kwargs = {}
        # PartEdit
        self.extra_kwargs = extra_kwargs
        self.index_inside_batch = extra_kwargs.get("index_inside_batch", 1)  # Default is one in our prior setting!
        if not isinstance(self.index_inside_batch, list):
            self.index_inside_batch = [self.index_inside_batch]
        self.layers_to_use = extra_kwargs.get("_layers_to_use", LAYERS_TO_USE)  # Training parameter, not exposed directly
        # Params
        self.th_strategy: Binarization = extra_kwargs.get("th_strategy", Binarization.P2P)
        self.pad_strategy: PaddingStrategy = extra_kwargs.get("pad_strategy", PaddingStrategy.BG)
        self.omega: float = extra_kwargs.get("omega", 1.0)
        self.use_agg_store: bool = extra_kwargs.get("use_agg_store", False)
        self.edit_mask: Optional[torch.Tensor] = extra_kwargs.get("edit_mask", None)  # edit_mask_t
        self.edit_steps: int = extra_kwargs.get("edit_steps", 50)  # NOTE(Alex): This is the end step, IMPORTANT
        self.blend_layers: Optional[List] = None
        self.start_editing_at: int = extra_kwargs.get("start_editing_at", 0)
        self.use_layer_subset_idx: Optional[list[int]] = extra_kwargs.get("use_layer_subset_idx", None)
        self.batch_indx: int = extra_kwargs.get("batch_indx", 0)
        self.VRAM_low: bool = extra_kwargs.get("VRAM_low", False)
        self.allow_edit_control = True
        # Old
        self.cur_step: int = 0
        self.num_att_layers: int = -1  # placeholder; not assigned anywhere else in this class
        self.cur_att_layer: int = 0
        self.attn_res: int = attn_res

    def get_maps_agg(self, resized_res, device):
        """Return the current mask; the base class has none and returns ``None``."""
        return None

    def _editing_allowed(self):
        """Return the ``allow_edit_control`` flag."""
        return self.allow_edit_control  # TODO(Alex): Maybe make this only param, instead of unregister attn control?


# Copied from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L166
class EmptyControl(AttentionControl):
    """Controller that neither stores nor edits attention maps."""

    def forward(self, attn, is_cross: bool, place_in_unet: str, store: bool = True):
        """Return ``attn`` as received; all other arguments are ignored."""
        return attn


# Modified from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L171
class AttentionStore(AttentionControl):
    """Controller that records attention maps and derives one mask per step.

    ``step_store`` collects the maps of the step in progress, ``attention_store``
    accumulates them across steps, and ``mask_storage_step`` / ``mask_storage_agg``
    hold one mask per step index (computed from the step store or from the
    accumulated store, depending on ``use_agg_store``).
    """

    @staticmethod
    def get_empty_store():
        """Return a fresh dict with an empty list for every storage key."""
        return {
            "down_cross": [],
            "mid_cross": [],
            "up_cross": [],
            "down_self": [],
            "mid_self": [],
            "up_self": [],
            "opt_cross": [],
            "opt_bg_cross": [],
        }

    def maybe_offload(self, attn_device, attn_dtype):
        """Return the (device, dtype) that stored maps should use.

        When ``extra_kwargs["_cpu_offload"]`` is truthy this is (cpu, float32);
        otherwise the given pair is returned unchanged.
        """
        if self.extra_kwargs.get("_cpu_offload", False):
            attn_device, attn_dtype = torch.device("cpu"), torch.float32
        return attn_device, attn_dtype

    def forward(self, attn, is_cross: bool, place_in_unet: str, store: bool = True):
        """Record ``attn`` in ``step_store`` and return it unmodified.

        Args:
            attn: Attention map handed over by ``AttentionControl.__call__``.
            is_cross: Whether the map comes from a cross-attention layer.
            place_in_unet: Prefix of the storage key (``"{place_in_unet}_cross"`` / ``"_self"``).
            store: When False nothing is recorded.

        Returns:
            ``attn`` itself.
        """
        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
        _device, _dtype = self.maybe_offload(attn.device, attn.dtype)
        if store and self.batch_indx is not None and is_cross:
            # We always store for our method
            _dim = attn.shape[0] // self.num_prompt
            # Slice out the rows of prompt ``batch_indx``, keep only the token columns in
            # ``index_inside_batch`` and sum over the leading dimension.
            _val = attn[_dim * self.batch_indx : _dim * (self.batch_indx + 1), ..., self.index_inside_batch].sum(0, keepdim=True).to(_device, _dtype)
            if _val.shape[-1] != 1:
                # Several token columns: min-max normalise each one separately
                # (sharing one global maximum), then merge them by summation.
                _max = _val.max()
                for i in range(_val.shape[-1]):
                    _val[..., i] = min_max_norm(_val[..., i], _max=_max)
                _val = _val.sum(-1, keepdim=True)
            self.step_store["opt_cross"].append(_val)
        # Avoid memory overhead: the size gate is checked *before* copying, so maps that
        # would be discarded are never cloned or transferred.
        if self.extra_kwargs.get("_enable_non_agg_storing", False) and store and attn.shape[1] <= 32**2:
            self.step_store[key].append(attn.clone().detach().to(_device, _dtype, non_blocking=True))
        return attn

    def offload_stores(self, device):
        """Created for low VRAM usage, where we want to do this before Decoder.

        Moves every tensor held in ``step_store`` and ``attention_store`` to
        ``device`` and then empties the CUDA cache.
        """
        for key in self.step_store:
            self.step_store[key] = [a.to(device) for a in self.step_store[key]]
        for key in self.attention_store:
            self.attention_store[key] = [a.to(device) for a in self.attention_store[key]]
        torch.cuda.empty_cache()

    @torch.no_grad()
    def calculate_mask_t_res(self, use_step_store: bool = False):
        """Aggregate the stored ``opt`` maps into a normalised (optionally thresholded) mask.

        Args:
            use_step_store: Forwarded to ``aggregate_attention``.

        Returns:
            The min-max normalised map. For the ``P2P`` and ``PROVIDED_MASK``
            strategies it is returned as is; otherwise values below the lower
            threshold are set to 0 and values at or above the upper threshold to 1.

        Side effects:
            Appends the Otsu threshold of the normalised map to ``self.last_otsu``.
        """
        mask_t_res = aggregate_attention(
            self,
            res=1024,
            from_where=["opt"],
            batch_size=1,
            is_cross=True,
            upsample_everything=False,
            return_all_layers=False,  # Removed sum in this function
            use_same_layers_as_train=True,
            train_layers=self.layers_to_use,
            use_step_store=use_step_store,
            use_layer_subset_idx=self.use_layer_subset_idx,
        )[..., 0]

        strategy: Binarization = self.th_strategy

        mask_t_res = min_max_norm(mask_t_res)

        upper_threshold = strategy.upper_binarize
        lower_threshold = strategy.lower_binarize
        use_otsu = strategy.use_otsu
        tt = threshold_otsu(mask_t_res)  # NOTE(Alex): Moved outside, for Inversion Low confidence region copy
        if not hasattr(self, "last_otsu") or self.last_otsu == []:
            self.last_otsu = [tt]
        else:
            self.last_otsu.append(tt)
        if use_otsu:
            # The strategy's thresholds act as multipliers of the Otsu threshold.
            upper_threshold, lower_threshold = (
                tt * upper_threshold,
                tt * lower_threshold,
            )

        if strategy == Binarization.PARTEDIT:
            upper_threshold = self.omega * tt  # Assuming we are not changing upper in PartEdit

        if strategy in [Binarization.P2P, Binarization.PROVIDED_MASK]:
            return mask_t_res

        mask_t_res[mask_t_res < lower_threshold] = 0
        mask_t_res[mask_t_res >= upper_threshold] = 1.0

        return mask_t_res

    def has_maps(self) -> bool:
        """Return True once at least one per-step or aggregated mask has been stored."""
        return len(self.mask_storage_step) > 0 or len(self.mask_storage_agg) > 0

    def _store_agg_map(self) -> None:
        """Compute the mask for ``cur_step`` and keep it on the CPU in the matching storage."""
        if self.use_agg_store:
            self.mask_storage_agg[self.cur_step] = self.calculate_mask_t_res().cpu()
        else:
            self.mask_storage_step[self.cur_step] = self.calculate_mask_t_res(True).cpu()

    def between_steps(self):
        """Fold ``step_store`` into ``attention_store``, store the step's mask, start a new step store."""
        no_items = len(self.attention_store) == 0
        if no_items:
            # First step: adopt the step store as the accumulator.
            self.attention_store = self.step_store
        else:
            for key in self.attention_store:
                for i in range(len(self.attention_store[key])):
                    self.attention_store[key][i] += self.step_store[key][i]

        # Must run before the step store is cleared: the mask may be computed from it.
        self._store_agg_map()
        if not no_items:
            # only in this case, otherwise we are just assigning it
            for key in self.step_store:
                # Clear the list while maintaining the dictionary structure
                del self.step_store[key][:]

        self.step_store = self.get_empty_store()

    def get_maps_agg(self, res, device, use_agg_store: Optional[bool] = None, keepshape: bool = False):
        """Return the most recently stored mask, resized to ``res`` x ``res``.

        Args:
            res: Target side length for the bilinear resize.
            device: Device the mask is moved to before resizing.
            use_agg_store: Which storage to read; ``None`` falls back to ``self.use_agg_store``.
            keepshape: When False the result is reshaped to ``(1, res * res, 1)``.

        Raises:
            IndexError: If the selected storage is empty.
        """
        if use_agg_store is None:
            use_agg_store = self.use_agg_store
        _store = self.mask_storage_agg if use_agg_store else self.mask_storage_step
        last_idx = sorted(_store.keys())[-1]
        mask_t_res = _store[last_idx].to(device)  # Should be 1 1 H W
        mask_t_res = F.interpolate(mask_t_res, (res, res), mode="bilinear")
        if not keepshape:
            mask_t_res = mask_t_res.reshape(1, -1, 1)
        return mask_t_res

    def visualize_maps_agg(self, use_agg_store: bool, make_grid_kwargs: dict = None):
        """Return a PIL image with all stored masks arranged in a grid (10 per row by default)."""
        _store = self.mask_storage_agg if use_agg_store else self.mask_storage_step
        if make_grid_kwargs is None:
            make_grid_kwargs = {"nrow": 10}
        return ToPILImage()(make_grid(torch.cat(list(_store.values())), **make_grid_kwargs))

    def visualize_one_map(self, use_agg_store: bool, idx: int):
        """Return the mask stored under step ``idx`` as a PIL image.

        NOTE(review): ``get_maps_agg`` describes stored masks as 1 x 1 x H x W;
        confirm that ``ToPILImage`` accepts that rank here.
        """
        _store = self.mask_storage_agg if use_agg_store else self.mask_storage_step
        return ToPILImage()(_store[idx])

    def visualize_final_map(self, use_agg_store: bool):
        """Return the mean over all stored masks as a PIL image.

        Args:
            use_agg_store (bool): If True, it will use the agg store, otherwise the step store

        Returns:
            [PIL.Image]: The attention map averaged over the stored steps
        """
        _store = self.mask_storage_agg if use_agg_store else self.mask_storage_step
        return ToPILImage()(torch.cat(list(_store.values())).mean(0))

    def get_average_attention(self, step: bool = False):
        """Return the chosen store with every entry divided by ``cur_step``.

        Args:
            step: When True read ``step_store`` instead of ``attention_store``.
        """
        _store = self.attention_store if not step else self.step_store
        average_attention = {key: [item / self.cur_step for item in _store[key]] for key in _store}
        return average_attention

    def reset(self):
        """Reset the counters and drop every stored attention map and Otsu threshold."""
        super(AttentionStore, self).reset()
        for key in self.step_store:
            del self.step_store[key][:]
        for key in self.attention_store:
            del self.attention_store[key][:]
        self.step_store = self.get_empty_store()
        self.attention_store = {}
        self.last_otsu = []

    def __init__(
        self,
        num_prompt: int,
        attn_res=None,
        extra_kwargs: DotDictExtra = None,
    ):
        """Create empty stores.

        Args:
            num_prompt: Number of prompts in the batch; used to slice per-prompt maps in ``forward``.
            attn_res: Forwarded to ``AttentionControl``.
            extra_kwargs: Forwarded to ``AttentionControl``.
        """
        super(AttentionStore, self).__init__(attn_res, extra_kwargs)

        self.num_prompt = num_prompt
        self.mask_storage_step = {}
        self.mask_storage_agg = {}
        if self.batch_indx is not None:
            assert num_prompt > 0, "num_prompt must be greater than 0 if batch_indx is not None"
        self.step_store = self.get_empty_store()
        self.attention_store = {}
        self.last_otsu = []


# Copied from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L246
class AttentionControlEdit(AttentionStore, abc.ABC):
    """Attention store that also replaces attention of the edited prompts.

    The first batch entry is treated as the base prompt; the attention of every
    other entry is blended with (cross-attention) or replaced by (self-attention)
    attention derived from the base entry. Subclasses define how the base
    cross-attention is mapped onto the edited prompts.
    """

    def step_callback(self, x_t):
        """Apply ``local_blend`` to ``x_t`` when one is configured; otherwise return ``x_t`` unchanged."""
        if self.local_blend is not None:
            # x_t = self.local_blend(x_t, self.attention_store) # TODO: Check if there is more memory efficient way
            x_t = self.local_blend(x_t, self)
        return x_t

    def replace_self_attention(self, attn_base, att_replace):
        """Return the base self-attention broadcast over the edited prompts for small maps.

        Maps whose ``shape[2]`` exceeds ``attn_res[0] ** 2`` are left as ``att_replace``.
        """
        if att_replace.shape[2] <= self.attn_res[0] ** 2:
            return attn_base.unsqueeze(0).expand(att_replace.shape[0], *attn_base.shape)
        else:
            return att_replace

    @abc.abstractmethod
    def replace_cross_attention(self, attn_base, att_replace):
        """Return the cross-attention to inject into the edited prompts."""
        raise NotImplementedError

    def forward(self, attn, is_cross: bool, place_in_unet: str, store: bool = True):
        """Store ``attn`` (via ``AttentionStore.forward``) and apply the attention edit.

        Cross-attention is always edited; self-attention only while ``cur_step``
        lies inside ``num_self_replace``.

        Returns:
            The attention map, with the same leading dimension as the input.
        """
        super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet, store)
        if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]):
            h = attn.shape[0] // (self.batch_size)
            try:
                # Split the leading dimension into (prompt, h).
                attn = attn.reshape(self.batch_size, h, *attn.shape[1:])
            except RuntimeError:
                logger.error(f"Batch size: {self.batch_size}, h: {h}, attn.shape: {attn.shape}")
                raise

            attn_base, attn_replace = attn[0], attn[1:]
            if is_cross:
                # Per-token weights for the current step: 1 -> take the replacement, 0 -> keep the edited prompt's own attention.
                alpha_words = self.cross_replace_alpha[self.cur_step].to(attn_base.device)
                attn_replace_new = self.replace_cross_attention(attn_base, attn_replace) * alpha_words + (1 - alpha_words) * attn_replace

                attn[1:] = attn_replace_new
                if self.has_maps() and self.extra_kwargs.get("force_cross_attn", False):  # and self.cur_step <= 51:
                    mask_t_res = self.get_maps_agg(
                        res=int(attn_base.shape[1] ** 0.5),
                        device=attn_base.device,
                        use_agg_store=self.use_agg_store,  # Agg is across time, Step is last step without time agg
                        keepshape=False,
                    ).repeat(h, 1, 1)
                    # First (flattened) position whose step-0 alpha is zero.
                    zero_index = torch.argmax(torch.eq(self.cross_replace_alpha[0], 0).to(mask_t_res.dtype)).item()
                    mean_curr = attn[1:2, ..., zero_index].mean()
                    ratio_to_mean = mean_curr / mask_t_res[..., 0].mean()
                    # Stronger boost where the mask exceeds the latest Otsu threshold, 0.5 elsewhere.
                    extra_mask = torch.where(mask_t_res[..., 0] > self.last_otsu[-1], ratio_to_mean * 2, 0.5)

                    attn[1:2, ..., zero_index : zero_index + 1] += mask_t_res[None] * extra_mask[None, ..., None]  # * ratio_to_mean # * 2
            else:
                attn[1:] = self.replace_self_attention(attn_base, attn_replace)
            # Merge (prompt, h) back into a single leading dimension.
            attn = attn.reshape(self.batch_size * h, *attn.shape[2:])
        return attn

    def __init__(
        self,
        prompts: list[str],
        num_steps: int,
        cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
        self_replace_steps: Union[float, Tuple[float, float]],
        local_blend: Optional[LocalBlend],
        tokenizer,
        device: torch.device,
        attn_res=None,
        extra_kwargs: DotDictExtra = None,
    ):
        """Set up the replacement schedules.

        Args:
            prompts: Base prompt followed by the edited prompts.
            num_steps: Number of sampling steps the fractions below refer to.
            cross_replace_steps: Fraction(s) of the steps during which cross-attention
                is replaced; forwarded to ``get_time_words_attention_alpha``.
            self_replace_steps: Either an end fraction (start is 0) or a
                ``(start, end)`` pair of fractions for self-attention replacement.
            local_blend: Optional blending callable used by ``step_callback``.
            tokenizer: Tokenizer used to build the per-token schedule.
            device: Device the schedule tensor is moved to.
            attn_res: Forwarded to ``AttentionStore``.
            extra_kwargs: Forwarded to ``AttentionStore``.
        """
        super(AttentionControlEdit, self).__init__(
            attn_res=attn_res,
            num_prompt=len(prompts),
            extra_kwargs=extra_kwargs,
        )
        # add tokenizer and device here

        self.tokenizer = tokenizer
        self.device = device

        self.batch_size = len(prompts)
        self.cross_replace_alpha = get_time_words_attention_alpha(prompts, num_steps, cross_replace_steps, self.tokenizer).to(self.device)
        # A bare number (int as well as float, e.g. ``1``) is an end fraction starting at 0.
        if isinstance(self_replace_steps, (int, float)):
            self_replace_steps = 0, self_replace_steps
        self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1])
        self.local_blend = local_blend


# Copied from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L307
class AttentionReplace(AttentionControlEdit):
    """Edit that maps the base prompt's cross-attention onto each edited prompt through a token mapper."""

    def replace_cross_attention(self, attn_base, att_replace):
        """Project the base cross-attention through the token mapper.

        Args:
            attn_base: Base-prompt attention, contracted over its last (token) axis.
            att_replace: Unused; present for interface compatibility.

        Returns:
            One projected copy of ``attn_base`` per edited prompt.
        """
        # The mapper is created as float16; align both device and dtype with the
        # attention map so the contraction also works when attention is not float16.
        mapper = self.mapper.to(device=attn_base.device, dtype=attn_base.dtype)
        return torch.einsum("hpw,bwn->bhpn", attn_base, mapper)

    def __init__(
        self,
        prompts,
        num_steps: int,
        cross_replace_steps: float,
        self_replace_steps: float,
        local_blend: Optional[LocalBlend] = None,
        tokenizer=None,
        device=None,
        attn_res=None,
        extra_kwargs: DotDictExtra = None,
    ):
        """Initialise the edit controller and build the token mapper for ``prompts``."""
        super(AttentionReplace, self).__init__(
            prompts,
            num_steps,
            cross_replace_steps,
            self_replace_steps,
            local_blend,
            tokenizer,
            device,
            attn_res,
            extra_kwargs,
        )
        self.mapper = get_replacement_mapper(prompts, self.tokenizer).to(self.device)


# Copied from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L328
class AttentionRefine(AttentionControlEdit):
    """Edit that re-indexes the base cross-attention with a refinement mapper and blends it per token."""

    def replace_cross_attention(self, attn_base, att_replace):
        """Blend re-indexed base attention with the edited prompts' own attention.

        ``self.alphas`` weights the re-indexed base attention; ``1 - self.alphas``
        weights ``att_replace``.
        """
        # Gather base attention at the mapped token positions, then move the
        # edited-prompt axis to the front.
        gathered = attn_base[:, :, self.mapper].permute(2, 0, 1, 3)
        weights = self.alphas
        return gathered * weights + att_replace * (1 - weights)

    def __init__(
        self,
        prompts,
        num_steps: int,
        cross_replace_steps: float,
        self_replace_steps: float,
        local_blend: Optional[LocalBlend] = None,
        tokenizer=None,
        device=None,
        attn_res=None,
        extra_kwargs: DotDictExtra = None,
    ):
        """Initialise the edit controller and build the refinement mapper and its weights."""
        super().__init__(
            prompts=prompts,
            num_steps=num_steps,
            cross_replace_steps=cross_replace_steps,
            self_replace_steps=self_replace_steps,
            local_blend=local_blend,
            tokenizer=tokenizer,
            device=device,
            attn_res=attn_res,
            extra_kwargs=extra_kwargs,
        )
        token_map, token_weights = get_refinement_mapper(prompts, self.tokenizer)
        self.mapper = token_map.to(self.device)
        token_weights = token_weights.to(self.device)
        # Insert two singleton axes in the middle so the weights broadcast over the attention map.
        n_rows, n_tokens = token_weights.shape[0], token_weights.shape[1]
        self.alphas = token_weights.reshape(n_rows, 1, 1, n_tokens)


# Copied from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L353
class AttentionReweight(AttentionControlEdit):
    """Edit that rescales the base cross-attention per token with an equalizer."""

    def replace_cross_attention(self, attn_base: torch.Tensor, att_replace: torch.Tensor):
        """Scale the base attention (optionally pre-processed by ``prev_controller``) by the equalizer."""
        source = attn_base
        if self.prev_controller is not None:
            # Chain: let the previous controller transform the base attention first.
            source = self.prev_controller.replace_cross_attention(source, att_replace)
        per_token_scale = self.equalizer[:, None, None, :]
        return source[None, :, :, :] * per_token_scale

    def __init__(
        self,
        prompts: list[str],
        num_steps: int,
        cross_replace_steps: float,
        self_replace_steps: float,
        equalizer,
        local_blend: Optional[LocalBlend] = None,
        controller: Optional[AttentionControlEdit] = None,
        tokenizer=None,
        device=None,
        attn_res=None,
        extra_kwargs: DotDictExtra = None,
    ):
        """Initialise the edit controller, keep the equalizer on ``device`` and remember ``controller``."""
        super().__init__(
            prompts=prompts,
            num_steps=num_steps,
            cross_replace_steps=cross_replace_steps,
            self_replace_steps=self_replace_steps,
            local_blend=local_blend,
            tokenizer=tokenizer,
            device=device,
            attn_res=attn_res,
            extra_kwargs=extra_kwargs,
        )
        self.prev_controller = controller
        self.equalizer = equalizer.to(self.device)


class PartEditCrossAttnProcessor:
    """Attention processor that reports attention maps to a controller and blends hidden states with its mask.

    Modified from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L11
    """

    def __init__(
        self,
        controller: AttentionStore,
        place_in_unet,
        store_this_layer: bool = True,
    ):
        """Bind the processor to a controller.

        Args:
            controller: Controller that receives every attention map of this layer.
            place_in_unet: Location label forwarded to the controller.
            store_this_layer: Forwarded to the controller as its ``store`` flag.
        """
        super().__init__()
        assert isinstance(controller, AttentionControl), f"{controller} isn't subclass of AttentionControl"
        self.controller = controller
        self.place_in_unet = place_in_unet
        self.store_this_layer = store_this_layer

    def has_maps(self) -> bool:
        """Return True when the controller holds a stored mask or a user-provided ``edit_mask``."""
        return len(self.controller.mask_storage_step) > 0 or len(self.controller.mask_storage_agg) > 0 or self.controller.edit_mask is not None

    def condition_for_editing(self) -> bool:
        """Return whether the controller's thresholding strategy enables editing."""
        # If we have a given mask
        # If we are using PartEdit
        return self.controller.th_strategy.enabled

    def __call__(
        self,
        attn: Attention,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
    ):
        """Compute attention, hand the probabilities to the controller and optionally blend the output.

        Args:
            attn: Attention module providing the projections and helper methods.
            hidden_states: Input of shape (batch, sequence, channels).
            encoder_hidden_states: Conditioning states; ``None`` means self-attention.
            attention_mask: Optional attention mask, prepared via ``attn.prepare_attention_mask``.

        Returns:
            The attention output, blended with the controller's mask when editing is active.
        """
        controller = self.controller
        batch_size, sequence_length, _ = hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        query = attn.to_q(hidden_states)

        is_cross = encoder_hidden_states is not None
        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        # ``batch_indx`` may be None (the store treats that as "no prompt selected"),
        # so it is checked before the numeric comparison.
        batch_indx = controller.batch_indx
        if controller._editing_allowed() and batch_indx is not None and batch_indx > 0:
            # Set the negative/positive of the batch index to the zero image
            _bs = controller.batch_size
            query[[batch_indx, batch_indx + _bs]] = query[[0, _bs]]

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)

        # The controller stores / edits ``attention_probs`` in place.
        controller(attention_probs, is_cross, self.place_in_unet, self.store_this_layer)

        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        should_edit = (
            controller._editing_allowed()  # allow_edit_control
            and self.has_maps()
            and self.condition_for_editing()
            and controller.cur_step > controller.start_editing_at
            and controller.cur_step < controller.edit_steps
        )
        if should_edit:
            # Side length of the feature map, derived from the sequence length.
            res = int(np.sqrt(hidden_states.shape[1]))
            if controller.th_strategy == Binarization.PROVIDED_MASK:
                mask_t_res = controller.edit_mask.to(hidden_states.device)
                # resize to res
                mask_t_res = F.interpolate(mask_t_res, (res, res), mode="bilinear").reshape(1, -1, 1)
            else:
                mask_t_res = controller.get_maps_agg(
                    res=res,
                    device=hidden_states.device,
                    use_agg_store=controller.use_agg_store,  # Agg is across time, Step is last step without time agg
                )  # provide in cross_attention_kwargs in pipeline
                # Note: Additional blending with grounding
                _extra_grounding = controller.extra_kwargs.get("grounding", None)
                if _extra_grounding is not None:
                    mask_t_res = mask_t_res * F.interpolate(_extra_grounding, (res, res), mode="bilinear").reshape(1, -1, 1).to(hidden_states.device)

            # Batch rows: entries 0 / 1 and their counterparts offset by ``batch_size``
            # (presumably the unconditional and conditional halves - TODO confirm).
            b1_u = 0
            b1_c = controller.batch_size
            b2_u = 1
            b2_c = controller.batch_size + 1
            # Inside the mask keep entry 1, outside copy entry 0.
            hidden_states[b2_u] = (1 - mask_t_res) * hidden_states[b1_u] + mask_t_res * hidden_states[b2_u]
            hidden_states[b2_c] = (1 - mask_t_res) * hidden_states[b1_c] + mask_t_res * hidden_states[b2_c]

        return hidden_states


# Adapted from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L48
def create_controller(
    prompts: List[str],
    cross_attention_kwargs: Dict,
    num_inference_steps: int,
    tokenizer,
    device: torch.device,
    attn_res: Tuple[int, int],
    extra_kwargs: dict,
) -> AttentionControl:
    """Build the attention controller described by ``cross_attention_kwargs``.

    Recognised keys: ``edit_type`` (``"replace"`` (default), ``"refine"`` or
    ``"reweight"``), ``local_blend_words``, ``equalizer_words``,
    ``equalizer_strengths``, ``n_cross_replace`` (default 0.4) and
    ``n_self_replace`` (default 0.4).

    A ``LocalBlend`` is attached whenever ``local_blend_words`` is not ``None``,
    uniformly for all three edit types.

    Args:
        prompts: Base prompt followed by the edited prompts.
        cross_attention_kwargs: Edit configuration (see above).
        num_inference_steps: Number of sampling steps.
        tokenizer: Tokenizer forwarded to the controller and helpers.
        device: Device forwarded to the controller and helpers.
        attn_res: Attention resolution forwarded to the controller.
        extra_kwargs: Extra options forwarded to the controller.

    Returns:
        The configured controller.

    Raises:
        ValueError: If ``edit_type`` is not one of the recognised values.
        AssertionError: If ``edit_type`` is ``"reweight"`` and the equalizer
            words / strengths are missing or differ in length.
    """
    edit_type = cross_attention_kwargs.get("edit_type", "replace")
    # Validate first so an unknown edit type never triggers any construction work.
    if edit_type not in ("replace", "refine", "reweight"):
        raise ValueError(f"Edit type {edit_type} not recognized. Use one of: replace, refine, reweight.")

    local_blend_words = cross_attention_kwargs.get("local_blend_words")
    equalizer_words = cross_attention_kwargs.get("equalizer_words")
    equalizer_strengths = cross_attention_kwargs.get("equalizer_strengths")
    n_cross_replace = cross_attention_kwargs.get("n_cross_replace", 0.4)
    n_self_replace = cross_attention_kwargs.get("n_self_replace", 0.4)

    controller_kwargs = {
        "tokenizer": tokenizer,
        "device": device,
        "attn_res": attn_res,
        "extra_kwargs": extra_kwargs,
    }

    if edit_type == "reweight":
        assert equalizer_words is not None and equalizer_strengths is not None, "To use reweight edit, please specify equalizer_words and equalizer_strengths."
        assert len(equalizer_words) == len(equalizer_strengths), "equalizer_words and equalizer_strengths must be of same length."
        # The equalizer is built for the first edited prompt.
        controller_kwargs["equalizer"] = get_equalizer(prompts[1], equalizer_words, equalizer_strengths, tokenizer=tokenizer)

    local_blend = None
    if local_blend_words is not None:
        local_blend = LocalBlend(
            prompts,
            local_blend_words,
            tokenizer=tokenizer,
            device=device,
            attn_res=attn_res,
        )

    if edit_type == "replace":
        controller_cls = AttentionReplace
    elif edit_type == "refine":
        controller_cls = AttentionRefine
    else:
        controller_cls = AttentionReweight

    return controller_cls(
        prompts,
        num_inference_steps,
        n_cross_replace,
        n_self_replace,
        local_blend=local_blend,
        **controller_kwargs,
    )


# Copied from https://github.com/RoyiRa/prompt-to-prompt-with-sdxl/blob/e579861f06962b697b37f3c6dd4813c2acdd55bd/processors.py#L380-L596
### util functions for all Edits


def update_alpha_time_word(
    alpha,
    bounds: Union[float, Tuple[float, float]],
    prompt_ind: int,
    word_inds: Optional[torch.Tensor] = None,
):
    """Set the replacement schedule of one prompt (optionally only some tokens) in place.

    Args:
        alpha: Tensor indexed as ``[step, prompt, token]``; modified in place.
        bounds: Either an end fraction (start is 0) or a ``(start, end)`` pair of
            fractions of ``alpha.shape[0]``. A bare int such as ``1`` is accepted too.
        prompt_ind: Index along the second axis to update.
        word_inds: Token indices to update; ``None`` selects every token.

    Returns:
        ``alpha`` itself, with 1 inside ``[start, end)`` and 0 outside for the
        selected prompt / tokens.
    """
    # Accept ints as well as floats so that e.g. ``bounds=1`` means "all steps".
    if isinstance(bounds, (int, float)):
        bounds = 0, bounds
    num_steps = alpha.shape[0]
    start, end = int(bounds[0] * num_steps), int(bounds[1] * num_steps)
    if word_inds is None:
        word_inds = torch.arange(alpha.shape[2])
    alpha[:start, prompt_ind, word_inds] = 0
    alpha[start:end, prompt_ind, word_inds] = 1
    alpha[end:, prompt_ind, word_inds] = 0
    return alpha


def get_time_words_attention_alpha(
    prompts,
    num_steps,
    cross_replace_steps: Union[float, Dict[str, Tuple[float, float]]],
    tokenizer,
    max_num_words=77,
):
    """Build the per-step, per-token cross-attention replacement schedule.

    Args:
        prompts: Base prompt followed by the edited prompts.
        num_steps: Number of sampling steps; the schedule has ``num_steps + 1`` rows.
        cross_replace_steps: Either bounds applied to every token, or a dict mapping
            words to their own bounds (key ``"default_"`` covers all other tokens and
            defaults to ``(0.0, 1.0)``). The caller's dict is not modified.
        tokenizer: Tokenizer used to locate the tokens of each keyed word.
        max_num_words: Number of token positions.

    Returns:
        Tensor of shape ``(num_steps + 1, len(prompts) - 1, 1, 1, max_num_words)``.
    """
    if isinstance(cross_replace_steps, dict):
        # Work on a shallow copy so the default entry is never written into the caller's dict.
        cross_replace_steps = dict(cross_replace_steps)
    else:
        cross_replace_steps = {"default_": cross_replace_steps}
    cross_replace_steps.setdefault("default_", (0.0, 1.0))

    num_edited = len(prompts) - 1
    alpha_time_words = torch.zeros(num_steps + 1, num_edited, max_num_words)
    # First apply the default bounds to every token of every edited prompt ...
    for i in range(num_edited):
        alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"], i)
    # ... then override the tokens of explicitly listed words.
    for key, item in cross_replace_steps.items():
        if key != "default_":
            inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
            for i, ind in enumerate(inds):
                if len(ind) > 0:
                    alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
    alpha_time_words = alpha_time_words.reshape(num_steps + 1, num_edited, 1, 1, max_num_words)
    return alpha_time_words


### util functions for LocalBlend and ReplacementEdit
def get_word_inds(text: str, word_place: Union[int, str], tokenizer):
    """Return the token positions that belong to a word of ``text``.

    Args:
        text: Prompt; words are obtained by splitting on single spaces.
        word_place: Either a word (every occurrence is selected) or the index of
            a word in the split prompt.
        tokenizer: Object with ``encode(text)`` and ``decode([token_id])``; the first
            and last encoded tokens are skipped.

    Returns:
        Integer numpy array of 1-based token positions (position 0 is the skipped
        first token). Empty when the word does not occur.
    """
    split_text = text.split(" ")
    if isinstance(word_place, str):
        word_place = [i for i, word in enumerate(split_text) if word_place == word]
    elif isinstance(word_place, int):
        word_place = [word_place]
    out = []
    if len(word_place) > 0:
        words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
        cur_len, ptr = 0, 0

        for i, piece in enumerate(words_encode):
            # Decoded pieces can cover the last word early; stop instead of
            # indexing past the end of ``split_text``.
            if ptr >= len(split_text):
                break
            cur_len += len(piece)
            if ptr in word_place:
                out.append(i + 1)
            # Move to the next word once the accumulated pieces cover the current one.
            if cur_len >= len(split_text[ptr]):
                ptr += 1
                cur_len = 0
    # Fixed integer dtype so that an empty result is still usable as an index.
    return np.array(out, dtype=np.int64)


### util functions for ReplacementEdit
def get_replacement_mapper_(x: str, y: str, tokenizer, max_len=77):
    """Build the token-to-token mapping matrix between two prompts of equal word count.

    Args:
        x: Source prompt.
        y: Target prompt; must have as many space-separated words as ``x``.
        tokenizer: Forwarded to ``get_word_inds`` for every word that differs.
        max_len: Side length of the square mapping matrix.

    Returns:
        A ``(max_len, max_len)`` float16 tensor; rows are source token positions,
        columns are target token positions.

    Raises:
        ValueError: If the prompts differ in word count.
    """
    src_words, dst_words = x.split(" "), y.split(" ")
    if len(src_words) != len(dst_words):
        raise ValueError(
            "attention replacement edit can only be applied on prompts with the same length"
            f" but prompt A has {len(src_words)} words and prompt B has {len(dst_words)} words."
        )

    # Word positions where the prompts differ, and the token positions of those words.
    changed = [pos for pos, (src, dst) in enumerate(zip(src_words, dst_words)) if dst != src]
    src_tokens = [get_word_inds(x, pos, tokenizer) for pos in changed]
    dst_tokens = [get_word_inds(y, pos, tokenizer) for pos in changed]

    mapper = np.zeros((max_len, max_len))
    row = col = 0
    nxt = 0
    total = len(src_tokens)
    while row < max_len and col < max_len:
        pending = nxt < total
        if pending and src_tokens[nxt][0] == row:
            # Reached the next replaced word: connect its source tokens to its target tokens.
            src_ids, dst_ids = src_tokens[nxt], dst_tokens[nxt]
            if len(src_ids) == len(dst_ids):
                mapper[src_ids, dst_ids] = 1
            else:
                # Different token counts: every target token gets an equal share.
                share = 1 / len(dst_ids)
                for dst_id in dst_ids:
                    mapper[src_ids, dst_id] = share
            nxt += 1
            row += len(src_ids)
            col += len(dst_ids)
        else:
            # Unchanged token. While replacements are still pending the row cursor is
            # used; afterwards the entry goes on the diagonal at the column cursor.
            mapper[row if pending else col, col] = 1
            row += 1
            col += 1

    return torch.from_numpy(mapper).to(torch.float16)


def get_replacement_mapper(prompts, tokenizer, max_len=77):
    """Stack replacement mappers between the first prompt and each later one.

    Args:
        prompts: Sequence of prompts; ``prompts[0]`` is the source prompt and
            every following entry is an edited prompt.
        tokenizer: Tokenizer forwarded to ``get_replacement_mapper_``.
        max_len: Side length of each mapping matrix.

    Returns:
        The per-prompt matrices from ``get_replacement_mapper_`` stacked along
        a new leading dimension (one entry per edited prompt).
    """
    source_prompt = prompts[0]
    return torch.stack(
        [
            get_replacement_mapper_(source_prompt, edited_prompt, tokenizer, max_len)
            for edited_prompt in prompts[1:]
        ]
    )


### util functions for ReweightEdit
def get_equalizer(
    text: str,
    word_select: Union[int, str, Tuple[Union[int, str], ...]],
    values: Union[List[float], Tuple[float, ...]],
    tokenizer,
    max_len: int = 77,
):
    """Build the per-token scaling table used by the reweight edit.

    The table starts as all ones with one row per entry of ``values``. For the
    ``i``-th selected word, the columns at that word's token indices (as
    returned by ``get_word_inds``) are set to ``values[i]`` in every row.

    Args:
        text: Prompt in which the selected words are located.
        word_select: A single word selector (``int`` or ``str``) or a sequence
            of them; each one is forwarded to ``get_word_inds``.
        values: Scaling factors; ``values[i]`` is applied to the ``i``-th
            selected word, so it needs at least as many entries as
            ``word_select``.
        tokenizer: Tokenizer forwarded to ``get_word_inds``.
        max_len: Number of token positions (columns) in the table. Defaults to
            77, the width that was previously hard-coded.

    Returns:
        A float32 tensor of shape ``(len(values), max_len)``.

    Raises:
        IndexError: If more words are selected than there are ``values``.
    """
    if isinstance(word_select, (int, str)):
        word_select = (word_select,)
    equalizer = torch.ones(len(values), max_len)
    values = torch.tensor(values, dtype=torch.float32)
    for i, word in enumerate(word_select):
        inds = get_word_inds(text, word, tokenizer)
        # ``values[i]`` is already a float32 scalar tensor and broadcasts over
        # the selected columns; no legacy ``torch.FloatTensor`` wrapper needed.
        equalizer[:, inds] = values[i]
    return equalizer


### util functions for RefinementEdit
class ScoreParams:
    """Scoring constants used by the global sequence alignment.

    Attributes are stored exactly as given: ``gap`` is the score added for a
    gap step, ``match`` the score for equal items and ``mismatch`` the score
    for differing items.
    """

    def __init__(self, gap, match, mismatch):
        self.gap, self.match, self.mismatch = gap, match, mismatch

    def mis_match_char(self, x, y):
        """Return ``mismatch`` when ``x != y``, otherwise ``match``."""
        return self.mismatch if x != y else self.match


def get_matrix(size_x, size_y, gap):
    """Create the initial score matrix for a global alignment.

    Args:
        size_x: Length of the first sequence (rows, excluding the border).
        size_y: Length of the second sequence (columns, excluding the border).
        gap: Score added per gap step.

    Returns:
        An ``int32`` array of shape ``(size_x + 1, size_y + 1)``. The first row
        and first column hold ``k * gap`` at distance ``k`` from the origin;
        all other cells are zero.
    """
    scores = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
    scores[0, 1:] = gap * np.arange(1, size_y + 1)
    scores[1:, 0] = gap * np.arange(1, size_x + 1)
    return scores


def get_traceback_matrix(size_x, size_y):
    """Create the initial traceback matrix for a global alignment.

    Returns:
        An ``int32`` array of shape ``(size_x + 1, size_y + 1)`` where the
        first row is filled with 1, the first column with 2, the origin cell
        with 4, and every interior cell with 0.
    """
    directions = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
    directions[1:, 0] = 2  # first column
    directions[0, 1:] = 1  # first row
    directions[0, 0] = 4  # origin marker
    return directions


def global_align(x, y, score):
    """Fill the score and traceback matrices for aligning ``x`` with ``y``.

    Each interior cell takes the best of three candidates: coming from the
    left or from above (both add ``score.gap``) or from the diagonal (adds
    ``score.mis_match_char`` of the two items). The traceback cell records the
    move as 1 (left), 2 (up) or 3 (diagonal); on ties left wins over up, and
    up wins over diagonal.

    Args:
        x: First sequence.
        y: Second sequence.
        score: Object providing ``gap`` and ``mis_match_char(a, b)``.

    Returns:
        Tuple ``(matrix, trace_back)`` of arrays with shape
        ``(len(x) + 1, len(y) + 1)``.
    """
    n_x, n_y = len(x), len(y)
    scores = get_matrix(n_x, n_y, score.gap)
    moves = get_traceback_matrix(n_x, n_y)
    for row in range(1, n_x + 1):
        for col in range(1, n_y + 1):
            from_left = scores[row, col - 1] + score.gap
            from_up = scores[row - 1, col] + score.gap
            from_diag = scores[row - 1, col - 1] + score.mis_match_char(x[row - 1], y[col - 1])
            scores[row, col] = max(from_left, from_up, from_diag)
            # Compare against the value as stored in the matrix, checking the
            # candidates in left, up, diagonal order.
            stored = scores[row, col]
            if stored == from_left:
                move = 1
            elif stored == from_up:
                move = 2
            else:
                move = 3
            moves[row, col] = move
    return scores, moves


def get_aligned_sequences(x, y, trace_back):
    """Walk the traceback matrix from the bottom-right cell toward the origin.

    Args:
        x: First sequence.
        y: Second sequence.
        trace_back: Matrix of move codes indexed as ``trace_back[i, j]``:
            3 = diagonal, 1 = left, 2 = up, 4 = stop.

    Returns:
        Tuple ``(x_seq, y_seq, mapper_y_to_x)``. ``x_seq`` and ``y_seq`` are
        the aligned items with ``"-"`` marking gaps, listed in the order they
        are visited (from the end of the sequences toward the start).
        ``mapper_y_to_x`` is an ``int64`` tensor of ``(y_index, x_index)``
        pairs in ascending ``y`` order, with ``x_index == -1`` for items of
        ``y`` that are aligned to a gap.
    """
    aligned_x, aligned_y = [], []
    pairs = []
    row, col = len(x), len(y)
    while row > 0 or col > 0:
        move = trace_back[row, col]
        if move == 3:
            # Diagonal: both items are consumed and paired with each other.
            aligned_x.append(x[row - 1])
            aligned_y.append(y[col - 1])
            row -= 1
            col -= 1
            pairs.append((col, row))
        elif move == 1:
            # Left: the item of y is aligned to a gap in x.
            aligned_x.append("-")
            aligned_y.append(y[col - 1])
            col -= 1
            pairs.append((col, -1))
        elif move == 2:
            # Up: the item of x is aligned to a gap in y; no pair is recorded.
            aligned_x.append(x[row - 1])
            aligned_y.append("-")
            row -= 1
        elif move == 4:
            break
    # Only the index pairs are put back into forward order.
    pairs.reverse()
    return aligned_x, aligned_y, torch.tensor(pairs, dtype=torch.int64)


def get_mapper(x: str, y: str, tokenizer, max_len=77):
    """Align the token ids of two prompts and derive a target-to-source map.

    Both prompts are encoded with ``tokenizer.encode`` and globally aligned
    with scores gap=0, match=1, mismatch=-1.

    Args:
        x: Source prompt.
        y: Target prompt.
        tokenizer: Object providing ``encode(text)``.
        max_len: Length of the returned vectors.

    Returns:
        Tuple ``(mapper, alphas)``:
        ``mapper`` is an ``int64`` tensor of length ``max_len``; its leading
        entries are the aligned source index for each target token (``-1``
        when there is none) and the remainder counts upward from
        ``len(target tokens)``.
        ``alphas`` is a float tensor of length ``max_len`` that is 0 where the
        aligned source index is ``-1`` and 1 everywhere else.
    """
    source_ids = tokenizer.encode(x)
    target_ids = tokenizer.encode(y)
    scoring = ScoreParams(0, 1, -1)
    _, trace_back = global_align(source_ids, target_ids, scoring)
    # Only the (target_index, source_index) pairs are needed here.
    *_, mapper_base = get_aligned_sequences(source_ids, target_ids, trace_back)

    n_aligned = mapper_base.shape[0]
    source_positions = mapper_base[:, 1]

    alphas = torch.ones(max_len)
    alphas[:n_aligned] = source_positions.ne(-1).float()

    mapper = torch.zeros(max_len, dtype=torch.int64)
    mapper[:n_aligned] = source_positions
    # Positions past the aligned part continue counting from len(target_ids).
    mapper[n_aligned:] = len(target_ids) + torch.arange(max_len - len(target_ids))
    return mapper, alphas


def get_refinement_mapper(prompts, tokenizer, max_len=77):
    """Stack refinement mappers between the first prompt and each later one.

    Args:
        prompts: Sequence of prompts; ``prompts[0]`` is the source prompt and
            every following entry is an edited prompt.
        tokenizer: Tokenizer forwarded to ``get_mapper``.
        max_len: Length of each mapper / alpha vector.

    Returns:
        Tuple ``(mappers, alphas)`` with the per-prompt results of
        ``get_mapper`` stacked along a new leading dimension.
    """
    source_prompt = prompts[0]
    results = [
        get_mapper(source_prompt, edited_prompt, tokenizer, max_len)
        for edited_prompt in prompts[1:]
    ]
    mappers = [mapper for mapper, _ in results]
    alphas = [alpha for _, alpha in results]
    return torch.stack(mappers), torch.stack(alphas)