Spaces: Build error
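
The Space is a small image-variation app: when the user gives no prompt, BLIP captions the uploaded image, and a Canny-edge ControlNet on top of Stable Diffusion 2.1 then generates four variations that follow the edges of the original image. The app code: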
```python
import cv2
import numpy as np
import torch
from PIL import Image
from diffusers import (
    AutoencoderKL,
    ControlNetModel,
    StableDiffusionControlNetPipeline,
    UniPCMultistepScheduler,
)
from transformers import BlipProcessor, BlipForConditionalGeneration

# The fp16 weights below assume a CUDA device; on CPU, float32 would be needed
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# BLIP for image captioning (used when the user supplies no prompt)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    torch_dtype=torch.float16,
).to(device)

# ControlNet for image-variation generation conditioned on Canny edge detection
controlnet = ControlNetModel.from_pretrained(
    "thibaud/controlnet-sd21-canny-diffusers",
    torch_dtype=torch.float16,
)
vae = AutoencoderKL.from_pretrained(
    "stabilityai/sd-vae-ft-mse",
    torch_dtype=torch.float16,
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base",
    controlnet=controlnet,
    vae=vae,
    # Note: the original also passed revision="fp16"; newer diffusers expects
    # variant="fp16" for half-precision weight files, and torch_dtype alone
    # already casts the weights, so it is dropped here.
    torch_dtype=torch.float16,
).to(device)  # a single .to(device) moves the whole pipeline, VAE included
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# xformers is optional; guard the call so a missing install doesn't crash the app
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception:
    pass

def pre_process_image(image):
    """Turn the input into a 3-channel Canny edge map for ControlNet conditioning."""
    image = np.array(image)
    low_threshold = 100
    high_threshold = 200
    image = cv2.Canny(image, low_threshold, high_threshold)
    image = image[:, :, None]  # HxW -> HxWx1
    image = np.concatenate([image, image, image], axis=2)  # replicate to 3 channels
    return Image.fromarray(image)

def image_variations(image, input_prompt):
    canny_image = pre_process_image(image)
    if input_prompt:
        prompt = input_prompt
    else:
        # No prompt supplied: caption the image with BLIP and use that as the prompt
        inputs = processor(image, return_tensors="pt").to(device, torch.float16)
        out = model.generate(**inputs)
        prompt = processor.decode(out[0], skip_special_tokens=True)
        print(f"BLIP captioning: {prompt}")
    output_images = pipe(
        [prompt] * 4,  # four variations per request
        canny_image,
        negative_prompt=[
            "distorted, noisy, lowres, bad anatomy, worst quality, low quality, "
            "bad eyes, rough face, unclear face"
        ] * 4,
        num_inference_steps=25,
    ).images
    return output_images, canny_image
```
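
For a quick local smoke test of `image_variations` outside the Space, something along these lines should work; the file names are placeholders, not taken from the original app:

```python
from PIL import Image

source = Image.open("input.jpg").convert("RGB")  # placeholder input path

# Empty prompt -> the BLIP branch generates the caption automatically
variations, canny = image_variations(source, "")

canny.save("canny.png")
for i, img in enumerate(variations):
    img.save(f"variation_{i}.png")
```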
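
The snippet defines no UI, but the `image_variations(image, input_prompt) -> (images, canny_image)` signature maps directly onto a Gradio interface, which is how Spaces typically expose such a function. A minimal sketch, assuming Gradio is installed; the component labels and layout are my own, not recovered from the Space:

```python
import gradio as gr

demo = gr.Interface(
    fn=image_variations,
    inputs=[
        gr.Image(type="pil", label="Input image"),
        gr.Textbox(label="Prompt (leave empty to auto-caption with BLIP)"),
    ],
    outputs=[
        gr.Gallery(label="Variations"),       # list of generated images
        gr.Image(label="Canny edge map"),     # the conditioning image
    ],
)

if __name__ == "__main__":
    demo.launch()
```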