import base64
import io
import os
import tempfile

import cv2
import openai
import requests
import streamlit as st
from moviepy.editor import VideoFileClip, AudioFileClip  # moviepy 1.x API
## 1. Turn video into frames
def video_to_frames(video_file):
    # Save the uploaded video file to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        tmpfile.write(video_file.read())
        video_filename = tmpfile.name

    video_duration = VideoFileClip(video_filename).duration

    video = cv2.VideoCapture(video_filename)
    base64Frame = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode('.jpg', frame)
        base64Frame.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()
    print(len(base64Frame), "frames read.")
    return base64Frame, video_filename, video_duration
## 2. Generate stories based on frames with gpt4v
def frames_to_story(base64Frames, prompt, api_key):
    # Send the prompt plus every 50th frame (resized to 768 px) to GPT-4V
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                prompt,
                *map(lambda x: {"image": x, "resize": 768}, base64Frames[0::50]),
            ],
        },
    ]
    params = {
        "model": "gpt-4-vision-preview",
        "messages": PROMPT_MESSAGES,
        "api_key": api_key,
        "headers": {"Openai-Version": "2020-11-07"},
        "max_tokens": 500,
    }
    # Uses the pre-1.0 openai SDK interface (openai.ChatCompletion)
    result = openai.ChatCompletion.create(**params)
    print(result.choices[0].message.content)
    return result.choices[0].message.content
## 3. Generate voiceover from stories
def text_to_audio(text, api_key, voice):
    response = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers={
            "Authorization": f"Bearer {api_key}",
        },
        json={
            "model": "tts-1",
            "input": text,
            "voice": voice,
        },
    )
    if response.status_code != 200:
        raise Exception(f"Request failed with status code {response.status_code}")

    # Keep a copy of the audio in memory and write it to a temporary file for moviepy
    audio_bytes_io = io.BytesIO(response.content)
    # The speech endpoint returns MP3 audio by default
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmpfile:
        tmpfile.write(response.content)
        audio_filename = tmpfile.name

    return audio_filename, audio_bytes_io
## 4. Merge videos & audio
def merge_audio_video(video_filename, audio_filename, output_filename):
    print("Merging audio and video ...")
    video_clip = VideoFileClip(video_filename)
    audio_clip = AudioFileClip(audio_filename)
    final_clip = video_clip.set_audio(audio_clip)
    final_clip.write_videofile(output_filename, codec='libx264', audio_codec="aac")
    video_clip.close()
    audio_clip.close()
    return output_filename
## 5. Streamlit UI
def main():
    st.set_page_config(page_title="AI Voiceover", page_icon="🔮")
    st.title("GPT4V AI Voiceover 🎥🔮")

    openai_key = st.text_input("Enter your OpenAI API key")
    if not openai_key:
        st.error("Please enter your OpenAI API key.")
        return

    uploaded_file = st.file_uploader("Select a video file", type=["mp4", "avi"])
    option = st.selectbox(
        'Choose the voice you want',
        ('Female Voice', 'Male Voice'))
    classify = 'alloy' if option == 'Male Voice' else 'nova'

    if uploaded_file is not None:
        st.video(uploaded_file)
        p = 'Generate a short voiceover script for the video, matching the content with the video scenes. The style should be...'
        prompt = st.text_area("Prompt", value=p)

    if st.button("START PROCESSING", type="primary") and uploaded_file is not None:
        with st.spinner("Video is being processed..."):
            base64Frame, video_filename, video_duration = video_to_frames(uploaded_file)
            est_word_count = video_duration * 4
            final_prompt = f"{prompt}(This video is ONLY {video_duration} seconds long. So make sure the voiceover MUST be able to be explained in less than {est_word_count} words. Ignore and don't generate anything else than the script that you'll use to voice over the video.)"
            text = frames_to_story(base64Frame, final_prompt, openai_key)
            st.write(text)
            audio_filename, audio_bytes_io = text_to_audio(text, openai_key, classify)
            output_video_filename = os.path.splitext(video_filename)[0] + "_output.mp4"
            final_video_filename = merge_audio_video(video_filename, audio_filename, output_video_filename)
            st.video(final_video_filename)
            # Clean up the temporary files
            os.unlink(video_filename)
            os.unlink(audio_filename)
            os.unlink(final_video_filename)


if __name__ == "__main__":
    main()
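
To run the app itself, save the listing above as app.py, install streamlit, opencv-python, moviepy (1.x), requests and an openai SDK older than 1.0 (openai.ChatCompletion was removed in 1.0), then start it with streamlit run app.py. For quick experimentation without the browser UI, the four helpers can also be chained directly; the sketch below is only illustrative and assumes a local sample.mp4 and an OPENAI_API_KEY environment variable, neither of which comes from the original app.

# Minimal non-UI sketch of the same pipeline (assumes the functions above are
# in scope, a sample.mp4 exists, and OPENAI_API_KEY is set; these are
# assumptions for illustration, not part of the original app).
import os

api_key = os.environ["OPENAI_API_KEY"]
with open("sample.mp4", "rb") as f:
    frames, video_path, duration = video_to_frames(f)

script = frames_to_story(frames, "Write a short voiceover script for this video.", api_key)
audio_path, _ = text_to_audio(script, api_key, "nova")
merge_audio_video(video_path, audio_path, "sample_output.mp4")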