LibreFlux-IP-Adapter / flux_ip_adapter.py

Upload folder using huggingface_hub

7feb367 verified 7 months ago

13.6 kB

	from itertools import chain
	import torch
	from torch import nn
	from diffusers.models.attention_processor import (
	Attention,
	AttentionProcessor,
	)

	from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel
	import torch.nn.functional as F
	from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
	from diffusers.models.attention_processor import Attention
	import inspect
	from functools import partial
	from diffusers.models.normalization import RMSNorm
	from typing import Any, Dict, List, Optional, Union
	import torch
	import torch.nn as nn


	class IPFluxAttnProcessor2_0(nn.Module):
	"""Attention processor used typically in processing the SD3-like self-attention projections."""

	def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4, num_heads=0):
	super().__init__()

	self.hidden_size = hidden_size
	self.cross_attention_dim = cross_attention_dim
	self.scale = scale
	self.num_tokens = num_tokens

	self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
	self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)

	self.norm_added_k = RMSNorm(128, eps=1e-5, elementwise_affine=False)

	def __call__(
	self,
	attn,
	hidden_states: torch.FloatTensor,
	encoder_hidden_states: torch.FloatTensor = None,
	ip_encoder_hidden_states: torch.FloatTensor = None,
	attention_mask: Optional[torch.FloatTensor] = None,
	image_rotary_emb: Optional[torch.Tensor] = None,
	layer_scale: Optional[torch.Tensor] = None,
	) -> torch.FloatTensor:
	batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape

	ip_hidden_states = ip_encoder_hidden_states

	# `sample` projections.
	query = attn.to_q(hidden_states)
	key = attn.to_k(hidden_states)
	value = attn.to_v(hidden_states)

	inner_dim = key.shape[-1]
	head_dim = inner_dim // attn.heads

	query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
	key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
	value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

	if attn.norm_q is not None:
	query = attn.norm_q(query)
	if attn.norm_k is not None:
	key = attn.norm_k(key)

	# handle IP attention FIRST


	# for ip-adapter
	if ip_hidden_states != None:
	ip_key = self.to_k_ip(ip_hidden_states)
	ip_value = self.to_v_ip(ip_hidden_states)

	# reshaping to match query shape
	ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
	ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

	ip_key = self.norm_added_k(ip_key)


	# Using flux stype attention here
	ip_hidden_states = F.scaled_dot_product_attention(
	query,
	ip_key,
	ip_value,
	dropout_p=0.0,
	is_causal=False,
	attn_mask=None,
	)

	# reshaping ip_hidden_states in the same way as hidden_states
	ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(
	batch_size, -1, attn.heads * head_dim
	)

	# the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
	if encoder_hidden_states is not None:
	# `context` projections.
	encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
	encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)

	encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

	encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
	batch_size, -1, attn.heads, head_dim
	).transpose(1, 2)
	encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
	batch_size, -1, attn.heads, head_dim
	).transpose(1, 2)
	encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
	batch_size, -1, attn.heads, head_dim
	).transpose(1, 2)

	if attn.norm_added_q is not None:
	encoder_hidden_states_query_proj = attn.norm_added_q(
	encoder_hidden_states_query_proj
	)
	if attn.norm_added_k is not None:
	encoder_hidden_states_key_proj = attn.norm_added_k(
	encoder_hidden_states_key_proj
	)

	# attention
	query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
	key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
	value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

	if image_rotary_emb is not None:
	from diffusers.models.embeddings import apply_rotary_emb
	query = apply_rotary_emb(query, image_rotary_emb)

	key = apply_rotary_emb(key, image_rotary_emb)

	if attention_mask is not None:
	attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
	attention_mask = (attention_mask > 0).bool()
	attention_mask = attention_mask.to(
	device=hidden_states.device, dtype=query.dtype
	)
	original_hidden_states = hidden_states

	hidden_states = F.scaled_dot_product_attention(
	query,
	key,
	value,
	dropout_p=0.0,
	is_causal=False,
	attn_mask=attention_mask,
	)

	hidden_states = hidden_states.transpose(1, 2).reshape(
	batch_size, -1, attn.heads * head_dim
	)
	hidden_states = hidden_states.to(query.dtype)


	layer_scale = layer_scale.view(-1, 1, 1)

	if encoder_hidden_states is not None:

	encoder_hidden_states, hidden_states = (
	hidden_states[:, : encoder_hidden_states.shape[1]],
	hidden_states[:, encoder_hidden_states.shape[1] :],
	)

	# Final injection of ip addapter hidden_states
	if ip_hidden_states != None:
	hidden_states = hidden_states + (self.scale * layer_scale) * ip_hidden_states

	# linear proj
	hidden_states = attn.to_out[0](hidden_states)
	# dropout
	hidden_states = attn.to_out[1](hidden_states)
	encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

	return hidden_states, encoder_hidden_states

	else:

	# Final injection of ip addapter hidden_states
	if ip_hidden_states != None:
	hidden_states = hidden_states + (self.scale * layer_scale) * ip_hidden_states

	if attn.to_out is not None:
	hidden_states = attn.to_out[0](hidden_states)
	hidden_states = attn.to_out[1](hidden_states)

	return hidden_states


	class ImageProjModel(nn.Module):
	    """Project one embedding vector per sample into a sequence of prompt tokens.

	    The input of width ``clip_dim`` goes through a two-layer MLP
	    (``clip_dim -> 2 * clip_dim -> cross_attention_dim * num_tokens``) with a
	    GELU in between, is reshaped to ``num_tokens`` tokens of width
	    ``cross_attention_dim``, and is then layer-normalized per token.

	    Args:
	        clip_dim: Width of the incoming embedding.
	        cross_attention_dim: Width of each produced token.
	        num_tokens: Number of tokens produced per sample.
	    """

	    def __init__(self, clip_dim=768, cross_attention_dim=4096, num_tokens=16):
	        super().__init__()

	        self.num_tokens = num_tokens
	        self.cross_attention_dim = cross_attention_dim
	        self.clip_dim = clip_dim

	        self.proj = torch.nn.Sequential(
	            torch.nn.Linear(clip_dim, clip_dim * 2),
	            torch.nn.GELU(),
	            # The output is later reshaped to (batch, num_tokens, cross_attention_dim),
	            # so this layer must emit cross_attention_dim * num_tokens features.
	            torch.nn.Linear(clip_dim * 2, cross_attention_dim * num_tokens),
	        )
	        self.norm = torch.nn.LayerNorm(cross_attention_dim)

	    def forward(self, input):
	        """Return tokens of shape ``(input.shape[0], num_tokens, cross_attention_dim)``."""
	        raw_proj = self.proj(input)
	        reshaped_proj = raw_proj.reshape(input.shape[0], self.num_tokens, self.cross_attention_dim)
	        return self.norm(reshaped_proj)


	class LibreFluxIPAdapter(nn.Module):
	    """Wrap a transformer with IP-Adapter attention processors and an image projector.

	    On construction every ``Attention`` module whose name starts with
	    ``transformer_blocks`` or ``single_transformer_blocks`` gets an
	    ``IPFluxAttnProcessor2_0`` as its processor. ``forward`` projects the
	    reference image embedding and hands it to the transformer through
	    ``joint_attention_kwargs``.

	    Args:
	        transformer: Model to wrap; must expose ``named_modules()`` and
	            ``transformer_blocks[0].attn``.
	        image_proj_model: Module mapping ``ref_image`` to image-prompt tokens.
	        checkpoint: Optional path passed to ``load_from_checkpoint``.
	    """

	    def __init__(self, transformer, image_proj_model, checkpoint=None):
	        super().__init__()
	        self.transformer = transformer
	        self.image_proj_model = image_proj_model

	        # Both the double ("transformer_blocks") and the single
	        # ("single_transformer_blocks") attention modules are wrapped.
	        block_prefixes = ('transformer_blocks', 'single_transformer_blocks')
	        self.culled_transformer_blocks = {
	            name: module
	            for name, module in self.transformer.named_modules()
	            if isinstance(module, Attention) and name.startswith(block_prefixes)
	        }

	        # Apply the adapter to the culled blocks
	        self.wrap_attention_blocks()

	        if checkpoint:
	            self.load_from_checkpoint(checkpoint)

	    def wrap_attention_blocks(self, scale=1.0, num_tokens=16):
	        """Inject the IP-Adapter processors into the collected attention modules.

	        Args:
	            scale: Static IP-attention weight given to every processor.
	            num_tokens: Token count given to every processor.
	        """
	        sample_attn = self.transformer.transformer_blocks[0].attn

	        hidden_size = sample_attn.inner_dim
	        num_heads = sample_attn.heads
	        # The processors consume the projector's tokens, so take their width
	        # from the projector; 4096 is the previously hard-coded value.
	        cross_attention_dim = getattr(self.image_proj_model, "cross_attention_dim", None) or 4096

	        processor_list = []
	        for module in self.culled_transformer_blocks.values():
	            module.processor = IPFluxAttnProcessor2_0(
	                hidden_size=hidden_size,
	                cross_attention_dim=cross_attention_dim,
	                num_heads=num_heads,
	                scale=scale,
	                num_tokens=num_tokens,
	            )
	            processor_list.append(module.processor)
	        lay_count = len(processor_list)
	        print (f"Added Attention IP Wrapper to {lay_count} layers")

	        # Store adapters as a module list for saving/loading
	        self.adapter_modules = torch.nn.ModuleList(processor_list)

	    def parameters(self, recurse=True):
	        """Iterate over the adapter parameters only (processors + image projector).

	        This intentionally overrides ``nn.Module.parameters`` so the wrapped
	        transformer's own weights are not included. ``recurse`` is accepted for
	        signature compatibility and forwarded to the underlying modules.
	        """
	        processor_params = (
	            module.processor.parameters(recurse)
	            for module in self.culled_transformer_blocks.values()
	        )
	        return chain(chain.from_iterable(processor_params), self.image_proj_model.parameters(recurse))

	    def forward(self, ref_image, *args, layer_scale=None, **kwargs):
	        """Project ``ref_image`` and run the wrapped transformer.

	        Args:
	            ref_image: Input for ``image_proj_model``, or ``None`` to run without
	                image-prompt tokens.
	            *args: Positional arguments forwarded to the transformer.
	            layer_scale: Tensor (or number) weighting the IP attention;
	                defaults to a one-element tensor holding 1.0.
	            **kwargs: Keyword arguments forwarded to the transformer. Any
	                ``joint_attention_kwargs`` dict is copied, not modified in place.

	        Returns:
	            Whatever the wrapped transformer returns.
	        """
	        reference_param = next(self.image_proj_model.parameters())

	        ip_encoder_hidden_states = None
	        if ref_image is not None:
	            ip_encoder_hidden_states = self.image_proj_model(ref_image)

	        if layer_scale is None:
	            layer_scale = torch.ones(1)
	        elif not torch.is_tensor(layer_scale):
	            layer_scale = torch.as_tensor(layer_scale)
	        layer_scale = layer_scale.to(dtype=reference_param.dtype, device=reference_param.device)

	        # Work on a copy so the caller's dict is left untouched; this also
	        # tolerates an explicit `joint_attention_kwargs=None`.
	        joint_attention_kwargs = dict(kwargs.get('joint_attention_kwargs') or {})
	        # NOTE(review): IPFluxAttnProcessor2_0.__call__ names these inputs
	        # `layer_scale` / `ip_encoder_hidden_states`; the mapping from the keys
	        # below presumably happens inside the transformer - confirm.
	        joint_attention_kwargs['ip_layer_scale'] = layer_scale
	        joint_attention_kwargs['ip_hidden_states'] = ip_encoder_hidden_states
	        kwargs['joint_attention_kwargs'] = joint_attention_kwargs

	        return self.transformer(*args, **kwargs)

	    def save_pretrained(self, ckpt_path):
	        """Save projector and adapter weights to ``ckpt_path`` with ``torch.save``."""
	        state_dict = {
	            "image_proj": self.image_proj_model.state_dict(),
	            "ip_adapter": self.adapter_modules.state_dict(),
	        }
	        torch.save(state_dict, ckpt_path)

	    @staticmethod
	    def _weight_checksum(module):
	        """Return the sum over all parameter values of ``module`` as a 0-d tensor."""
	        return torch.sum(torch.stack([torch.sum(p) for p in module.parameters()]))

	    def load_from_checkpoint(self, ckpt_path):
	        """Load projector and adapter weights written by ``save_pretrained``.

	        Both state dicts are loaded with ``strict=True``. Afterwards the summed
	        weights are compared with their pre-load values.

	        Raises:
	            AssertionError: If either checksum is unchanged after loading.
	        """
	        # Calculate original checksums
	        orig_ip_proj_sum = self._weight_checksum(self.image_proj_model)
	        orig_adapter_sum = self._weight_checksum(self.adapter_modules)

	        # SECURITY: torch.load unpickles the file; only load checkpoints from a
	        # trusted source (or consider `weights_only=True`).
	        state_dict = torch.load(ckpt_path, map_location="cpu")

	        # Load state dict for image_proj_model and adapter_modules
	        self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
	        self.adapter_modules.load_state_dict(state_dict["ip_adapter"], strict=True)

	        # Calculate new checksums
	        new_ip_proj_sum = self._weight_checksum(self.image_proj_model)
	        new_adapter_sum = self._weight_checksum(self.adapter_modules)

	        # Verify if the weights have changed.
	        # NOTE(review): loading weights identical to the current ones also trips
	        # these checks, and they are skipped entirely under `python -O`.
	        assert orig_ip_proj_sum != new_ip_proj_sum, "Weights of image_proj_model did not change!"
	        assert orig_adapter_sum != new_adapter_sum, "Weights of adapter_modules did not change!"

	        print(f"Successfully loaded weights from checkpoint {ckpt_path}")

	    @property
	    def dtype(self):
	        """Dtype of the first parameter of ``image_proj_model``."""
	        return next(self.image_proj_model.parameters()).dtype