| | import gradio as gr |
| | import torch |
| | import torchaudio |
| | import whisper |
| | import cv2 |
| | import numpy as np |
| | from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip |
| | from transformers import pipeline, AutoTokenizer, AutoModel |
| | import tempfile |
| | import os |
| | import json |
| | from datetime import timedelta |
| | import librosa |
| | from scipy.signal import find_peaks |
| | import tensorflow as tf |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.metrics.pairwise import cosine_similarity |
| | import spacy |
| | import nltk |
| | from googletrans import Translator |
| | import warnings |
| | warnings.filterwarnings("ignore") |
| |
|
class ZenVisionModel:
    """
    ZenVision - advanced AI subtitle generation model.

    Developed by the ZenVision team. Combines several AI components
    (Whisper transcription, transformer sentiment/emotion classifiers,
    optional spaCy entity recognition, Google Translate) with MoviePy
    rendering. ``process_video`` is the end-to-end entry point used by the UI.
    """

    # Subtitle colour per emotion label; unknown labels fall back to white.
    _EMOTION_COLORS = {
        'joy': 'yellow',
        'sadness': 'blue',
        'anger': 'red',
        'fear': 'purple',
        'surprise': 'orange',
        'disgust': 'green',
        'neutral': 'white'
    }
    _DEFAULT_COLOR = 'white'

    # Layout limits applied to every subtitle cue.
    _MAX_CHARS_PER_LINE = 42
    _MAX_LINES = 2

    # googletrans has no bare "zh" code (it uses "zh-cn"/"zh-tw"); without this
    # alias the UI's "zh" choice would fail and fall back to untranslated text.
    _GOOGLE_LANGUAGE_ALIASES = {'zh': 'zh-cn'}

    def __init__(self):
        """Pick the compute device (CUDA when available) and load all models."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Inicializando ZenVision en {self.device}")

        self.load_models()

    def load_models(self):
        """Load every AI model used by the pipeline onto ``self.device``.

        Sets ``whisper_model``, ``translator``, ``sentiment_analyzer``,
        ``emotion_detector``, ``bert_tokenizer``, ``bert_model``,
        ``google_translator`` and ``nlp`` (``None`` when the spaCy model
        cannot be loaded).
        """
        print("📦 Cargando modelos de IA...")

        # transformers pipelines take a GPU index, or -1 for CPU.
        pipeline_device = 0 if self.device == "cuda" else -1

        # Speech-to-text.
        self.whisper_model = whisper.load_model("large-v2")

        # NOTE(review): `translator`, `bert_tokenizer` and `bert_model` are not
        # referenced by any other method in this class; they are kept so the
        # instance attributes remain available to external callers.
        self.translator = pipeline("translation",
                                   model="Helsinki-NLP/opus-mt-en-mul",
                                   device=pipeline_device)

        # Per-segment sentiment classification.
        self.sentiment_analyzer = pipeline("sentiment-analysis",
                                           model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                                           device=pipeline_device)

        # Per-segment emotion classification (drives subtitle colours).
        self.emotion_detector = pipeline("text-classification",
                                         model="j-hartmann/emotion-english-distilroberta-base",
                                         device=pipeline_device)

        self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        self.bert_model = AutoModel.from_pretrained("bert-base-multilingual-cased")

        # Translation backend actually used by intelligent_translation().
        self.google_translator = Translator()

        # spaCy is optional: entity extraction is skipped when it is missing.
        # `except Exception` (not a bare except) so Ctrl-C still interrupts.
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except Exception:
            print("⚠️ Modelo spacy no encontrado, usando funcionalidad básica")
            self.nlp = None

        print("✅ Todos los modelos cargados exitosamente")

    def extract_audio_features(self, video_path):
        """Extract the audio track of a video and compute audio features.

        Args:
            video_path: Path of the video file to read.

        Returns:
            dict with keys ``audio_data`` (samples loaded at 16 kHz),
            ``sample_rate``, ``mfccs``, ``spectral_centroids``, ``chroma``,
            ``intervals`` (non-silent intervals from ``librosa.effects.split``)
            and ``duration`` in seconds.

        Raises:
            ValueError: If the video has no audio track.
        """
        print("🎵 Extrayendo características de audio...")

        # mkstemp creates the file atomically; tempfile.mktemp is deprecated
        # because another process can claim the name before we use it.
        fd, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        video = None
        try:
            video = VideoFileClip(video_path)
            if video.audio is None:
                raise ValueError("El video no contiene una pista de audio")
            video.audio.write_audiofile(audio_path, verbose=False, logger=None)

            # Resample to 16 kHz for the downstream transcription step.
            y, sr = librosa.load(audio_path, sr=16000)
        finally:
            # Always release the clip and the temporary WAV, even on failure.
            if video is not None:
                video.close()
            if os.path.exists(audio_path):
                os.remove(audio_path)

        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)

        # Intervals of non-silent audio (anything within 20 dB of the peak).
        intervals = librosa.effects.split(y, top_db=20)

        return {
            'audio_data': y,
            'sample_rate': sr,
            'mfccs': mfccs,
            'spectral_centroids': spectral_centroids,
            'chroma': chroma,
            'intervals': intervals,
            'duration': len(y) / sr
        }

    def advanced_transcription(self, audio_features):
        """Transcribe audio with Whisper and annotate each segment.

        Args:
            audio_features: dict as returned by ``extract_audio_features``;
                only ``audio_data`` is read.

        Returns:
            dict with ``language`` (as reported by Whisper), ``segments``
            (list of dicts with ``start``, ``end``, ``text``, ``confidence``,
            ``sentiment``, ``emotion``, ``entities``, ``words``) and
            ``full_text``.
        """
        print("🎤 Realizando transcripción avanzada...")

        # language=None asks Whisper to detect the spoken language itself;
        # "auto" is not a valid Whisper language code.
        result = self.whisper_model.transcribe(
            audio_features['audio_data'],
            language=None,
            word_timestamps=True,
            verbose=False
        )

        segments = []
        for segment in result['segments']:
            text = segment['text']

            # Each pipeline returns a list with one result per input text.
            sentiment = self.sentiment_analyzer(text)[0]
            emotion = self.emotion_detector(text)[0]

            # Named entities are only available when spaCy loaded.
            entities = []
            if self.nlp:
                doc = self.nlp(text)
                entities = [(ent.text, ent.label_) for ent in doc.ents]

            segments.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': text,
                'confidence': segment.get('avg_logprob', 0),
                'sentiment': sentiment,
                'emotion': emotion,
                'entities': entities,
                'words': segment.get('words', [])
            })

        return {
            'language': result['language'],
            'segments': segments,
            'full_text': result['text']
        }

    def intelligent_translation(self, transcription, target_language):
        """Translate every segment and restore the casing of named entities.

        Args:
            transcription: dict as returned by ``advanced_transcription``.
            target_language: Destination language code (e.g. ``"es"``).

        Returns:
            list of segment dicts, each a copy of the input segment extended
            with ``translated_text`` and ``original_text``. When translation
            of a segment fails, its original text is used instead.
        """
        print(f"🌍 Traduciendo a {target_language}...")

        dest = self._GOOGLE_LANGUAGE_ALIASES.get(target_language, target_language)
        translated_segments = []

        for segment in transcription['segments']:
            original_text = segment['text']

            # Best effort: a failed request must not abort the whole video,
            # but report it instead of swallowing it silently.
            try:
                google_translation = self.google_translator.translate(
                    original_text,
                    dest=dest
                ).text
            except Exception as exc:
                print(f"⚠️ Traducción fallida, se conserva el texto original: {exc}")
                google_translation = original_text

            # Restore the original casing of people/organisation/place names
            # that came back lower-cased from the translator.
            final_translation = google_translation
            for entity_text, entity_type in segment.get('entities') or []:
                if entity_type in ('PERSON', 'ORG', 'GPE'):
                    final_translation = final_translation.replace(
                        entity_text.lower(), entity_text
                    )

            translated_segments.append({
                **segment,
                'translated_text': final_translation,
                'original_text': original_text
            })

        return translated_segments

    @staticmethod
    def _wrap_text(text, max_chars, max_lines):
        """Greedily wrap ``text`` into at most ``max_lines`` lines.

        Words are never split; a single word longer than ``max_chars`` gets a
        line of its own. Words that do not fit in ``max_lines`` are dropped.
        """
        lines = []
        current_line = ""

        for word in text.split():
            candidate = f"{current_line} {word}" if current_line else word
            if current_line and len(candidate) > max_chars:
                lines.append(current_line)
                current_line = word
            else:
                current_line = candidate

            if len(lines) >= max_lines:
                break

        if current_line:
            lines.append(current_line)

        return "\n".join(lines[:max_lines])

    def generate_smart_subtitles(self, segments, video_duration, include_emotions=True):
        """Build display-ready subtitle cues from transcription segments.

        Args:
            segments: Segment dicts; ``translated_text`` is preferred over
                ``text`` when present.
            video_duration: Video length in seconds. Cue end times are clamped
                to it so a cue never runs past the end of the video. A falsy
                value disables clamping.
            include_emotions: When False, every cue is coloured white instead
                of using the emotion colour.

        Returns:
            list of dicts with ``start``, ``end``, ``text`` (at most two lines
            of 42 characters), ``emotion``, ``color`` and ``confidence``.
        """
        print("📝 Generando subtítulos inteligentes...")

        subtitles = []

        for segment in segments:
            text = segment.get('translated_text', segment['text'])
            subtitle_text = self._wrap_text(
                text, self._MAX_CHARS_PER_LINE, self._MAX_LINES
            )

            end = segment['end']
            if video_duration:
                end = min(end, video_duration)

            emotion_label = (segment.get('emotion') or {}).get('label', 'neutral')
            if include_emotions:
                color = self.get_emotion_color(emotion_label)
            else:
                color = self._DEFAULT_COLOR

            subtitles.append({
                'start': segment['start'],
                'end': end,
                'text': subtitle_text,
                'emotion': emotion_label,
                'color': color,
                'confidence': segment['confidence']
            })

        return subtitles

    def get_emotion_color(self, emotion):
        """Return the subtitle colour for an emotion label.

        The lookup is case-insensitive; unknown or empty labels give white.
        """
        if not emotion:
            return self._DEFAULT_COLOR
        return self._EMOTION_COLORS.get(emotion.lower(), self._DEFAULT_COLOR)

    def create_subtitle_video(self, video_path, subtitles, output_path):
        """Render ``subtitles`` onto the video and write it to ``output_path``.

        Cues with empty text or a non-positive duration are skipped because
        there is nothing to render for them.

        Returns:
            ``output_path``.
        """
        print("🎬 Creando video con subtítulos...")

        video = VideoFileClip(video_path)
        subtitle_clips = []
        final_video = None

        try:
            for subtitle in subtitles:
                duration = subtitle['end'] - subtitle['start']
                if duration <= 0 or not subtitle['text'].strip():
                    continue

                txt_clip = TextClip(
                    subtitle['text'],
                    fontsize=24,
                    font='Arial-Bold',
                    color=subtitle['color'],
                    stroke_color='black',
                    stroke_width=2
                ).set_position(('center', 'bottom')).set_duration(
                    duration
                ).set_start(subtitle['start'])

                subtitle_clips.append(txt_clip)

            final_video = CompositeVideoClip([video] + subtitle_clips)
            final_video.write_videofile(
                output_path,
                codec='libx264',
                audio_codec='aac',
                verbose=False,
                logger=None
            )
        finally:
            # Release every clip even when rendering fails.
            if final_video is not None:
                final_video.close()
            for clip in subtitle_clips:
                clip.close()
            video.close()

        return output_path

    def export_subtitle_formats(self, subtitles, base_path):
        """Write the subtitles as SRT, WebVTT and JSON files.

        Args:
            subtitles: Cues as returned by ``generate_smart_subtitles``.
            base_path: Output path without extension; ``.srt``, ``.vtt`` and
                ``.json`` are appended.

        Returns:
            dict mapping ``'srt'``, ``'vtt'`` and ``'json'`` to the file paths.
        """
        formats = {}

        # SRT: numbered cues, comma before the milliseconds.
        srt_path = f"{base_path}.srt"
        with open(srt_path, 'w', encoding='utf-8') as f:
            for i, sub in enumerate(subtitles, 1):
                start_time = self.seconds_to_srt_time(sub['start'])
                end_time = self.seconds_to_srt_time(sub['end'])
                f.write(f"{i}\n{start_time} --> {end_time}\n{sub['text']}\n\n")
        formats['srt'] = srt_path

        # WebVTT: header line, unnumbered cues, dot before the milliseconds.
        vtt_path = f"{base_path}.vtt"
        with open(vtt_path, 'w', encoding='utf-8') as f:
            f.write("WEBVTT\n\n")
            for sub in subtitles:
                start_time = self.seconds_to_vtt_time(sub['start'])
                end_time = self.seconds_to_vtt_time(sub['end'])
                f.write(f"{start_time} --> {end_time}\n{sub['text']}\n\n")
        formats['vtt'] = vtt_path

        # JSON: the cue dicts verbatim, including emotion/colour metadata.
        json_path = f"{base_path}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(subtitles, f, indent=2, ensure_ascii=False)
        formats['json'] = json_path

        return formats

    @staticmethod
    def _format_timestamp(seconds, millis_separator):
        """Format ``seconds`` as ``HH:MM:SS<sep>mmm``.

        Works in integer milliseconds so float error cannot drop a
        millisecond (1.001 s must not render as ``...,000``) and so rounding
        carries correctly into seconds/minutes. Negative input clamps to 0.
        """
        total_ms = max(0, int(round(float(seconds) * 1000)))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}{millis_separator}{millis:03d}"

    def seconds_to_srt_time(self, seconds):
        """Convert seconds to an SRT timestamp (``HH:MM:SS,mmm``)."""
        return self._format_timestamp(seconds, ",")

    def seconds_to_vtt_time(self, seconds):
        """Convert seconds to a WebVTT timestamp (``HH:MM:SS.mmm``)."""
        return self._format_timestamp(seconds, ".")

    @staticmethod
    def _build_status_message(stats):
        """Build the multi-line summary shown in the UI status box."""
        lines = [
            "✅ Procesamiento completado con ZenVision!",
            "",
            "📊 Estadísticas:",
            f"• Idioma detectado: {stats['language_detected']}",
            f"• Segmentos generados: {stats['total_segments']}",
            f"• Duración: {stats['duration']:.1f}s",
            f"• Confianza promedio: {stats['avg_confidence']:.2f}",
            f"• Emociones detectadas: {stats['emotions_detected']}",
            "",
            "🎯 Tecnologías utilizadas:",
            "• Whisper Large-v2 (Transcripción)",
            "• BERT Multilingual (Embeddings)",
            "• RoBERTa (Análisis de sentimientos)",
            "• DistilRoBERTa (Detección de emociones)",
            "• Google Translate (Traducción)",
            "• OpenCV + MoviePy (Procesamiento de video)",
            "• Librosa (Análisis de audio)",
            "• spaCy (NLP avanzado)",
            "",
        ]
        return "\n".join(lines)

    def process_video(self, video_file, target_language="es", include_emotions=True):
        """Run the full pipeline: audio -> transcript -> subtitles -> video.

        Args:
            video_file: Path to the video (``str``/``os.PathLike``, which is
                what a path-based upload component passes) or a file-like
                object exposing the path as ``.name``.
            target_language: Language code for the subtitles; translation is
                skipped when it equals the detected language.
            include_emotions: Whether subtitles are coloured by emotion.

        Returns:
            ``(output_video_path, srt_path, status_message)``. On failure, or
            when no video was supplied, the first two items are ``None`` and
            the message describes the problem; exceptions are not propagated.
        """
        if video_file is None:
            return None, None, "Por favor sube un video"

        output_video_path = None
        try:
            print("🎯 Iniciando procesamiento con ZenVision...")

            # Accept both a plain path and a tempfile-like wrapper.
            if isinstance(video_file, (str, os.PathLike)):
                video_path = os.fspath(video_file)
            else:
                video_path = video_file.name

            audio_features = self.extract_audio_features(video_path)

            transcription = self.advanced_transcription(audio_features)

            # Only translate when the target differs from the spoken language.
            if target_language != transcription['language']:
                segments = self.intelligent_translation(transcription, target_language)
            else:
                segments = transcription['segments']

            subtitles = self.generate_smart_subtitles(
                segments,
                audio_features['duration'],
                include_emotions=include_emotions
            )

            # mkstemp/mkdtemp instead of the deprecated, race-prone mktemp.
            fd, output_video_path = tempfile.mkstemp(suffix=".mp4")
            os.close(fd)
            self.create_subtitle_video(video_path, subtitles, output_video_path)

            subtitle_base_path = os.path.join(
                tempfile.mkdtemp(prefix="zenvision_"), "subtitles"
            )
            subtitle_formats = self.export_subtitle_formats(subtitles, subtitle_base_path)

            confidences = [s['confidence'] for s in segments]
            stats = {
                'language_detected': transcription['language'],
                'total_segments': len(subtitles),
                'duration': audio_features['duration'],
                # np.mean([]) is NaN plus a warning; report 0.0 for no speech.
                'avg_confidence': float(np.mean(confidences)) if confidences else 0.0,
                'emotions_detected': len({s['emotion']['label'] for s in segments})
            }

            status_msg = self._build_status_message(stats)

            return output_video_path, subtitle_formats['srt'], status_msg

        except Exception as e:
            # UI boundary: report the error as text instead of crashing the
            # app, and do not leave a half-written output video behind.
            if output_video_path is not None:
                try:
                    os.remove(output_video_path)
                except OSError:
                    pass
            return None, None, f"❌ Error en ZenVision: {str(e)}"
| |
|
| | |
# Build the single shared model instance at import time; the UI callback
# below is bound to it.
print("🚀 Inicializando ZenVision Model...")
zenvision = ZenVisionModel()

# (label, value) pairs offered in the target-language dropdown.
_LANGUAGE_CHOICES = [
    ("Español", "es"),
    ("English", "en"),
    ("Français", "fr"),
    ("Deutsch", "de"),
    ("Italiano", "it"),
    ("Português", "pt"),
    ("中文", "zh"),
    ("日本語", "ja"),
    ("한국어", "ko"),
    ("Русский", "ru")
]

# Page banner shown at the top of the app.
_HEADER_HTML = """
    <div style="text-align: center; padding: 20px;">
        <h1>🎬 ZenVision AI Subtitle Generator</h1>
        <p style="font-size: 18px; color: #666;">
            Modelo avanzado de subtitulado automático con IA<br>
            <strong>Desarrollado por el equipo ZenVision</strong>
        </p>
        <p style="font-size: 14px; color: #888;">
            Modelo de 3GB+ • Whisper • BERT • RoBERTa • OpenCV • Librosa • spaCy
        </p>
    </div>
    """

# Feature cards shown below the results.
_FEATURES_HTML = """
    <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 20px 0;">
        <div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
            <h4>🎤 Transcripción Avanzada</h4>
            <p>Whisper Large-v2 con timestamps precisos y detección automática de idioma</p>
        </div>
        <div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
            <h4>🌍 Traducción Inteligente</h4>
            <p>Google Translate + preservación de entidades nombradas</p>
        </div>
        <div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
            <h4>😊 Análisis Emocional</h4>
            <p>Detección de emociones y sentimientos con colores adaptativos</p>
        </div>
        <div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
            <h4>📝 Múltiples Formatos</h4>
            <p>Exportación en SRT, VTT y JSON con metadatos completos</p>
        </div>
    </div>
    """

# Declarative UI: inputs on the left, results on the right, status below.
with gr.Blocks(title="ZenVision - AI Subtitle Generator", theme=gr.themes.Soft()) as demo:
    gr.HTML(_HEADER_HTML)

    with gr.Row():
        # Left column: upload and processing options.
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Entrada")
            video_input = gr.Video(label="Subir Video", height=300)

            with gr.Row():
                language_dropdown = gr.Dropdown(
                    choices=_LANGUAGE_CHOICES,
                    value="es",
                    label="Idioma de destino"
                )

                emotions_checkbox = gr.Checkbox(
                    label="Incluir análisis de emociones",
                    value=True
                )

            process_btn = gr.Button(
                "🚀 Procesar con ZenVision",
                variant="primary",
                size="lg"
            )

        # Right column: rendered video and downloadable subtitle file.
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Resultados")
            video_output = gr.Video(label="Video con Subtítulos", height=300)
            subtitle_file = gr.File(label="Archivo de Subtítulos (.srt)")

    with gr.Row():
        status_output = gr.Textbox(
            label="Estado del Procesamiento",
            lines=15,
            interactive=False
        )

    gr.Markdown("### 🎯 Características de ZenVision")
    gr.HTML(_FEATURES_HTML)

    # Wire the button: (video, language, emotions flag) -> (video, srt, status).
    process_btn.click(
        zenvision.process_video,
        [video_input, language_dropdown, emotions_checkbox],
        [video_output, subtitle_file, status_output]
    )

if __name__ == "__main__":
    # NOTE(review): binds on all interfaces and requests a public share link;
    # confirm that this exposure is intended for the deployment target.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo.launch(**launch_options)