from base64 import b64encode

import numpy
import torch
from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
from huggingface_hub import notebook_login

# For video display:
from IPython.display import HTML
from matplotlib import pyplot as plt
from pathlib import Path
from PIL import Image
from torch import autocast
from torchvision import transforms as tfms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer, logging

import os
# Pick the best available device: CUDA, then Apple's MPS, then CPU.
torch_device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
# torch_device = "cpu"  # uncomment to force CPU
# Load the autoencoder, which will be used to decode the latents into image space.
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")

# Load the tokenizer and text encoder to tokenize and encode the prompt text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

# The UNet model for denoising the latents.
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")

# The noise scheduler.
scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
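# A quick look at what the scheduler defines (a sketch; 30 steps is an arbitrary
# example, and the exact sigma layout may vary between diffusers versions).
# set_timesteps picks which of the 1000 training timesteps to visit at inference,
# along with the noise level (sigma) for each one.
scheduler.set_timesteps(30)
print(scheduler.timesteps.shape)  # torch.Size([30])
print(scheduler.sigmas.shape)     # torch.Size([31]) -- includes a final sigma of zero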
# To the GPU we go!
vae = vae.to(torch_device)
text_encoder = text_encoder.to(torch_device)
unet = unet.to(torch_device)
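# A minimal sanity check that the text pipeline works (a sketch; the prompt is
# an arbitrary example). CLIP pads every prompt to 77 tokens and the encoder
# maps each token to a 768-dimensional embedding.
prompt = "A picture of a puppy"
text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
with torch.no_grad():
    output_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
print(output_embeddings.shape)  # torch.Size([1, 77, 768])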
# Grab the token and position embedding layers from inside the CLIP text encoder.
token_emb_layer = text_encoder.text_model.embeddings.token_embedding
pos_emb_layer = text_encoder.text_model.embeddings.position_embedding

# Position embeddings for the 77 token positions CLIP uses.
position_ids = text_encoder.text_model.embeddings.position_ids[:, :77]
position_embeddings = pos_emb_layer(position_ids)
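# A sketch of how these two pieces fit together (reusing text_input from the
# sanity check above): the embeddings module's output should equal the token
# embeddings plus the position embeddings.
input_ids = text_input.input_ids.to(torch_device)
token_embeddings = token_emb_layer(input_ids)              # (1, 77, 768)
input_embeddings = token_embeddings + position_embeddings  # broadcasts over the batch
full_embeddings = text_encoder.text_model.embeddings(input_ids)
print(torch.allclose(input_embeddings, full_embeddings))   # True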