AshBlanc committed on
Commit 520aa4f · verified · 1 Parent(s): 9da0f47

Update app.py

Files changed (1)
  1. app.py +238 -45
app.py CHANGED
@@ -1,66 +1,259 @@
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from diffusers import StableVideoDiffusionPipeline
import torch
import tempfile
import os

- # Import music generation from separate file
- from music import generate_music

- # Load SmolLM-3B model once at startup
- tokenizer = AutoTokenizer.from_pretrained("cognitivecomputations/smolllm-3b")
- model = AutoModelForCausalLM.from_pretrained("cognitivecomputations/smolllm-3b")
-
- # Load Wan2.1 1.3B video model once at startup
- video_pipe = StableVideoDiffusionPipeline.from_pretrained("ByteDance/Wan-2-1-1-3B", torch_dtype=torch.float16).to("cuda")
video_pipe.enable_model_cpu_offload()

  def generate_scenes_with_smol(script, style):
-     prompt = f"Break this {style.lower()} script into cinematic scenes with camera angles, characters, and mood.\nScript: {script}\nScene Breakdown:"
-     inputs = tokenizer(prompt, return_tensors="pt")
-     outputs = model.generate(**inputs, max_new_tokens=512)
-     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     scenes = [{"scene_id": i+1, "description": scene.strip()} for i, scene in enumerate(decoded.split("\n")) if scene.strip()]
-     return scenes

- def generate_video_with_wan(prompt):
-     video = video_pipe(prompt, num_frames=24, height=512, width=512).frames[0] # Assuming single output
-     tmp_dir = tempfile.mkdtemp()
-     output_path = os.path.join(tmp_dir, "scene.mp4")
-     video[0].save(output_path, fps=8) # Save as GIF or MP4 depending on format support
-     return output_path

def process_script(script, style, want_music):
    scenes = generate_scenes_with_smol(script, style)
    video_clips = []
-
-     for scene in scenes:
        text_prompt = scene['description']
-         video_path = generate_video_with_wan(text_prompt)
-         video_clips.append((scene['description'], video_path))
-
-     music_path = generate_music(script) if want_music else None
    return video_clips, music_path

- with gr.Blocks() as app:
-     gr.Markdown("# 🎮 Vividly MVP AI Video Creator")
-
    with gr.Row():
-         script_input = gr.Textbox(label="Video Script", lines=6)
-         style_input = gr.Dropdown(["Cinematic", "Vlog", "Explainer"], value="Cinematic", label="Video Style")
-         music_toggle = gr.Checkbox(label="Generate background music")
-
-     submit_btn = gr.Button("Generate")
-
-     video_outputs = gr.Video(label="Scene-wise Video Clips", interactive=False, visible=False)
-     music_player = gr.Audio(label="Generated Music", visible=False)
-
-     def wrap(script, style, music):
-         scenes, music_path = process_script(script, style, music)
-         first_video = scenes[0][1] if scenes else None
-         return gr.update(value=first_video, visible=True), gr.update(value=music_path, visible=music is True)
-
-     submit_btn.click(wrap, inputs=[script_input, style_input, music_toggle], outputs=[video_outputs, music_player])

- app.launch()
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from diffusers import StableVideoDiffusionPipeline
+ from audiocraft.models import MusicGen
+ from audiocraft.data.audio import audio_write
import torch
import tempfile
import os
+ import cv2
+ import numpy as np
+ from PIL import Image

+ # Load SmolLM2-1.7B model (correct model name and size)
+ print("Loading text generation model...")
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
+ model = AutoModelForCausalLM.from_pretrained(
+     "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+     torch_dtype=torch.float16,
+     device_map="auto"
+ )

+ # Load Stable Video Diffusion model (correct model name)
+ print("Loading video generation model...")
+ video_pipe = StableVideoDiffusionPipeline.from_pretrained(
+     "stabilityai/stable-video-diffusion-img2vid-xt",
+     torch_dtype=torch.float16,
+     variant="fp16"
+ )
+ if torch.cuda.is_available():
+     video_pipe = video_pipe.to("cuda")
32
+ video_pipe.enable_vae_slicing()
33
+
34
+ # Load MusicGen model
35
+ print("Loading music generation model...")
36
+ music_model = MusicGen.get_pretrained('facebook/musicgen-small')
37
+ music_model.set_generation_params(duration=8) # 8 seconds music
38
+
39
+ def generate_music(prompt: str):
40
+ """Generate background music from text prompt"""
41
+ try:
42
+ wav = music_model.generate([prompt], progress=True)
43
+ tmp_dir = tempfile.mkdtemp()
44
+ out_path = os.path.join(tmp_dir, "music")
45
+ audio_write(out_path, wav[0].cpu(), music_model.sample_rate, format="mp3")
46
+ return f"{out_path}.mp3"
47
+ except Exception as e:
48
+ print(f"Music generation error: {e}")
49
+ return None
50
 
51
  def generate_scenes_with_smol(script, style):
52
+ """Generate scene descriptions using SmolLM2"""
53
+ try:
54
+ prompt = f"""<|im_start|>system
55
+ You are a professional video director. Break down scripts into detailed cinematic scenes.
56
+ <|im_end|>
57
+ <|im_start|>user
58
+ Break this {style.lower()} script into 3-5 cinematic scenes with camera angles, characters, and mood.
59
+
60
+ Script: {script}
61
 
62
+ Format each scene as:
63
+ Scene X: [Detailed visual description with camera angle, lighting, characters, and action]
64
+ <|im_end|>
65
+ <|im_start|>assistant"""
66
+
67
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
68
+ if torch.cuda.is_available():
69
+ inputs = {k: v.to("cuda") for k, v in inputs.items()}
70
+
71
+ with torch.no_grad():
72
+ outputs = model.generate(
73
+ **inputs,
74
+ max_new_tokens=512,
75
+ temperature=0.7,
76
+ do_sample=True,
77
+ pad_token_id=tokenizer.eos_token_id
78
+ )
79
+
80
+ decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
81
+ # Extract only the assistant's response
82
+ response = decoded.split("<|im_start|>assistant")[-1].strip()
83
+
84
+ # Parse scenes
85
+ scenes = []
86
+ lines = response.split('\n')
87
+ for i, line in enumerate(lines):
88
+ if line.strip() and ('Scene' in line or len(line.strip()) > 20):
89
+ scenes.append({
90
+ "scene_id": len(scenes) + 1,
91
+ "description": line.strip()
92
+ })
93
+
94
+ # Ensure we have at least one scene
95
+ if not scenes:
96
+ scenes = [{"scene_id": 1, "description": f"A {style.lower()} scene: {script[:100]}..."}]
97
+
98
+ return scenes[:5] # Limit to 5 scenes max
99
+ except Exception as e:
100
+ print(f"Scene generation error: {e}")
101
+ return [{"scene_id": 1, "description": f"A {style.lower()} scene based on the script"}]
102
+
103
+ def create_initial_image(prompt, width=1024, height=576):
104
+ """Create a simple initial image for SVD (since it requires an input image)"""
105
+ # Create a simple gradient or solid color image as starting point
106
+ # In practice, you'd want to use a text-to-image model like Stable Diffusion
107
+ img = np.random.randint(50, 200, (height, width, 3), dtype=np.uint8)
108
+ img = Image.fromarray(img)
109
+ return img
110
+
111
+ def generate_video_with_svd(prompt):
112
+ """Generate video using Stable Video Diffusion"""
113
+ try:
114
+ # Create initial image (in practice, use a text-to-image model)
115
+ initial_image = create_initial_image(prompt)
116
+
117
+ # Generate video frames
118
+ frames = video_pipe(
119
+ image=initial_image,
120
+ decode_chunk_size=2,
121
+ generator=torch.manual_seed(42),
122
+ motion_bucket_id=127,
123
+ noise_aug_strength=0.02,
124
+ ).frames[0]
125
+
126
+ # Save as video file
127
+ tmp_dir = tempfile.mkdtemp()
128
+ output_path = os.path.join(tmp_dir, "scene.mp4")
129
+
130
+ # Convert PIL images to video using OpenCV
131
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
132
+ fps = 6 # SVD typically generates 6 fps
133
+ height, width = frames[0].size[1], frames[0].size[0]
134
+
135
+ out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
136
+
137
+ for frame in frames:
138
+ frame_array = np.array(frame)
139
+ frame_bgr = cv2.cvtColor(frame_array, cv2.COLOR_RGB2BGR)
140
+ out.write(frame_bgr)
141
+
142
+ out.release()
143
+ return output_path
144
+
145
+ except Exception as e:
146
+ print(f"Video generation error: {e}")
147
+ # Return a placeholder or None
148
+ return None
149
 
150
  def process_script(script, style, want_music):
+     """Main processing function"""
+     if not script.strip():
+         return [], None
+
+     print("Generating scenes...")
    scenes = generate_scenes_with_smol(script, style)
+
+     print("Generating videos...")
    video_clips = []
+     for i, scene in enumerate(scenes):
+         print(f"Processing scene {i+1}/{len(scenes)}")
        text_prompt = scene['description']
+         video_path = generate_video_with_svd(text_prompt)
+         if video_path:
+             video_clips.append((scene['description'], video_path))
+
+     music_path = None
+     if want_music:
+         print("Generating music...")
+         music_prompt = f"Background music for {style.lower()} video: {script[:100]}"
+         music_path = generate_music(music_prompt)
+
    return video_clips, music_path
 
175
+ # Gradio Interface
176
+ with gr.Blocks(title="Vividly MVP", theme=gr.themes.Soft()) as app:
177
+ gr.Markdown("# 🎬 Vividly MVP – AI Video Creator")
178
+ gr.Markdown("Transform your script into cinematic scenes with AI-generated videos and music!")
179
+
180
  with gr.Row():
181
+ with gr.Column(scale=2):
182
+ script_input = gr.Textbox(
183
+ label="Video Script",
184
+ lines=6,
185
+ placeholder="Enter your video script here..."
186
+ )
187
+ with gr.Column(scale=1):
188
+ style_input = gr.Dropdown(
189
+ ["Cinematic", "Vlog", "Explainer", "Documentary"],
190
+ value="Cinematic",
191
+ label="Video Style"
192
+ )
193
+ music_toggle = gr.Checkbox(label="Generate background music", value=True)
194
+ submit_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
195
+
196
+ with gr.Row():
197
+ with gr.Column():
198
+ video_outputs = gr.Video(
199
+ label="Generated Video Clip",
200
+ interactive=False,
201
+ visible=False
202
+ )
203
+ with gr.Column():
204
+ music_player = gr.Audio(
205
+ label="Generated Background Music",
206
+ visible=False
207
+ )
208
+
209
+ scene_gallery = gr.Gallery(
210
+ label="Scene Descriptions",
211
+ visible=False,
212
+ columns=1,
213
+ height="auto"
214
+ )
215
+
216
+ def wrap_processing(script, style, music):
217
+ if not script.strip():
218
+ return (
219
+ gr.update(visible=False),
220
+ gr.update(visible=False),
221
+ gr.update(visible=False)
222
+ )
223
+
224
+ try:
225
+ scenes, music_path = process_script(script, style, music)
226
+
227
+ # Show first video if available
228
+ first_video = scenes[0][1] if scenes else None
229
+
230
+ # Create scene descriptions for gallery
231
+ scene_descriptions = [scene[0] for scene in scenes] if scenes else []
232
+
233
+ return (
234
+ gr.update(value=first_video, visible=bool(first_video)),
235
+ gr.update(value=music_path, visible=bool(music_path)),
236
+ gr.update(value=scene_descriptions, visible=bool(scene_descriptions))
237
+ )
238
+ except Exception as e:
239
+ print(f"Processing error: {e}")
240
+ return (
241
+ gr.update(visible=False),
242
+ gr.update(visible=False),
243
+ gr.update(visible=False)
244
+ )
245
+
246
+ submit_btn.click(
247
+ wrap_processing,
248
+ inputs=[script_input, style_input, music_toggle],
249
+ outputs=[video_outputs, music_player, scene_gallery]
250
+ )
251
 
252
+ if __name__ == "__main__":
253
+ print("Starting Vividly MVP...")
254
+ app.launch(
255
+ server_name="0.0.0.0",
256
+ server_port=7860,
257
+ share=False,
258
+ debug=True
259
+ )
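
The commit's own comments mark create_initial_image as a stopgap: it feeds random noise to Stable Video Diffusion and notes that a text-to-image model should produce the conditioning frame instead. A minimal sketch of that swap, assuming diffusers' AutoPipelineForText2Image and the stabilityai/sd-turbo checkpoint (neither is part of this commit):

import torch
from diffusers import AutoPipelineForText2Image

# Assumed checkpoint; any Stable Diffusion text-to-image model would work here.
_t2i_device = "cuda" if torch.cuda.is_available() else "cpu"
_t2i_dtype = torch.float16 if _t2i_device == "cuda" else torch.float32
t2i_pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sd-turbo", torch_dtype=_t2i_dtype
).to(_t2i_device)

def create_initial_image(prompt, width=1024, height=576):
    """Generate the SVD conditioning frame from the scene description."""
    image = t2i_pipe(
        prompt,
        num_inference_steps=1,  # sd-turbo is distilled for 1-4 steps
        guidance_scale=0.0,     # turbo checkpoints run without classifier-free guidance
    ).images[0]
    # stable-video-diffusion-img2vid-xt expects 1024x576 conditioning frames.
    return image.resize((width, height))

With a conditioning frame that actually reflects the scene text, generate_video_with_svd needs no other changes. Separately, the OpenCV writer loop there could be replaced by the frame-export helper diffusers ships, assuming a release recent enough for export_to_video to accept an fps argument:

from diffusers.utils import export_to_video

# Inside generate_video_with_svd, once `frames` is produced:
output_path = export_to_video(frames, output_path, fps=6)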