Spaces:

aditya2001
/

VidSimplify

Running

App Files Files Community

Adityahulk committed 11 days ago

Commit

3ccc955

1 Parent(s): 1110dbd

adding pdf parsing logic correctly

Browse files

Files changed (5) hide show

manimator/agents/reflexion_agent.py +85 -7
manimator/api/animation_generation.py +80 -10
manimator/services/voiceover.py +23 -39
manimator/utils/content_preprocessor.py +172 -0
requirements.txt +2 -1

manimator/agents/reflexion_agent.py CHANGED Viewed

@@ -21,6 +21,7 @@ import litellm
 from ..utils.system_prompts import get_system_prompt
 from ..utils.code_postprocessor import post_process_code
 from ..utils.code_validator import CodeValidator
 logger = logging.getLogger(__name__)
@@ -238,8 +239,78 @@ class ReflexionAgent:
 YOU MUST APPLY THESE LESSONS IN YOUR CODE! Do not repeat these mistakes.
 """
-        # Build user message
-        user_message = f"""Create a video about:
 {goal}
@@ -290,11 +361,18 @@ self.play(items[2].animate.scale(1.1).set_color(GREEN))
         ]
         try:
-            response = litellm.completion(
-                model=self.actor_model,
-                messages=messages,
-                num_retries=2
-            )
             content = response.choices[0].message.content
             code = self._extract_code(content)

 from ..utils.system_prompts import get_system_prompt
 from ..utils.code_postprocessor import post_process_code
 from ..utils.code_validator import CodeValidator
+from ..utils.content_preprocessor import preprocess_long_content, get_script_mode_prompt_for_long_content
 logger = logging.getLogger(__name__)
 YOU MUST APPLY THESE LESSONS IN YOUR CODE! Do not repeat these mistakes.
 """
+        # Detect if input is a ready-made script (long content) vs short prompt
+        word_count = len(goal.split())
+        is_script_mode = word_count > 200
+        # For very long content, preprocess into sections
+        processed_goal = goal
+        section_count = 0
+        if word_count > 1000:
+            processed_goal, section_count = preprocess_long_content(goal)
+        if section_count > 0:
+            # Very long content - use sectioned prompt
+            logger.info(f"📝 LONG DOCUMENT MODE: {word_count} words -> {section_count} sections")
+            user_message = get_script_mode_prompt_for_long_content(processed_goal, section_count)
+        elif is_script_mode:
+            logger.info(f"📝 SCRIPT MODE: Input has {word_count} words - treating as ready-made script")
+            user_message = f"""# 🎬 SCRIPT MODE - ANIMATE THE USER'S CONTENT
+## IMPORTANT: The user has provided their COMPLETE script/content below.
+This is NOT a topic to research - this IS the exact narration/content they want animated.
+## YOUR TASK:
+1. **Use the content below AS the voiceover text** - split it into logical sections
+2. **Create beautiful animations that MATCH each section** of their content
+3. **Do NOT rewrite, summarize, or generate new information** - animate THEIR words
+4. **Every paragraph/section should become a voiceover block** with matching visuals
+5. **Create visualizations that illustrate what THEIR text describes**
+## USER'S SCRIPT TO ANIMATE:
+---
+{goal}
+---
+# ============================================================================
+# 🚨 CRITICAL REQUIREMENTS - YOU MUST FOLLOW THESE
+# ============================================================================
+## SCREEN BOUNDARIES (CRITICAL!)
+- **ALL content MUST stay on screen** - nothing should be cut off
+- For any VGroup with 4+ items: USE `group.scale_to_fit_height(config.frame_height - 2.5)`
+- Maximum 4-5 items visible at once, use smaller fonts (28-32pt) for lists
+- Always leave margins: top 1.0, bottom 0.8, sides 0.5
+## DYNAMIC ANIMATIONS (CRITICAL!)
+- **NEVER use only Write()** - mix at least 4 different animation types
+- **MUST use LaggedStart** for any list of items: `LaggedStart(*[FadeIn(x, shift=RIGHT) for x in items], lag_ratio=0.2)`
+- **MUST include emphasis animations**: `Indicate()`, `Circumscribe()`, `Flash()` on key elements
+- **Use motion during voiceover**: `obj.animate.scale(1.05)` while explaining
+- **Creative transitions**: `FadeOut(old, shift=LEFT), FadeIn(new, shift=RIGHT)`
+## VOICEOVER STRUCTURE FOR SCRIPT MODE:
+Use their content directly in voiceover blocks:
+```python
+# Section 1 - use their first paragraph/section
+with self.voiceover(text="[First section of their content here]") as tracker:
+    # Create animations that ILLUSTRATE what this section describes
+# Section 2 - use their next paragraph/section
+with self.voiceover(text="[Next section of their content here]") as tracker:
+    # Create animations that ILLUSTRATE what this section describes
+```
+## NO STATIC/BORING MOMENTS
+- NEVER have blank screens - always show something
+- NEVER use `self.wait()` longer than 0.5s without animation
+- Every section should have at least one emphasis animation
+- Objects should move and transform, not just appear
+"""
+        else:
+            logger.info(f"📝 GENERATION MODE: Input has {word_count} words - LLM will generate content")
+            # Short prompt - LLM generates content (existing behavior)
+            user_message = f"""Create a video about:
 {goal}
         ]
         try:
+            # Only set max_tokens for long documents where we need extended output
+            # For short prompts, let the model use its default behavior to avoid errors
+            kwargs = {
+                "model": self.actor_model,
+                "messages": messages,
+                "num_retries": 2
+            }
+            if section_count > 0:
+                kwargs["max_tokens"] = 12000  # Increased limit for long docs
+            response = litellm.completion(**kwargs)
             content = response.choices[0].message.content
             code = self._extract_code(content)

manimator/api/animation_generation.py CHANGED Viewed

@@ -8,6 +8,7 @@ from ..utils.code_postprocessor import post_process_code
 from ..utils.code_validator import CodeValidator
 from ..utils.code_fixer import CodeFixer
 from ..inputs.processor import InputProcessor
 logger = logging.getLogger(__name__)
@@ -65,6 +66,48 @@ def _generate_legacy(prompt: str, category: str, max_attempts: int = 3) -> str:
             # Get dynamic system prompt based on category
             system_prompt = get_system_prompt(category)
             messages = [
                 {
                     "role": "system",
@@ -72,26 +115,53 @@ def _generate_legacy(prompt: str, category: str, max_attempts: int = 3) -> str:
                 },
                 {
                     "role": "user",
-                    "content": f"Create a video about:\n\n{prompt}\n\n NOTE!!!:\n1. NO BLANK SCREENS: Keep the screen populated. If a voiceover is playing, show something.\n2. NO OVERLAPS: Ensure text and objects do not overlap. Use `next_to` and `arrange`.\n3. CLEAN TRANSITIONS: Fade out old content before showing new content, but don't leave the screen empty for long.\n4. VARIED ANIMATIONS: Use a mix of Write, FadeIn, GrowFromCenter, etc.\n5. STAY ON SCREEN: Ensure all text and objects are within the screen boundaries. Use .scale_to_fit_width(config.frame_width - 1) for large groups.",
                 },
             ]
             logger.info(f"Generating code (attempt {attempt + 1}/{max_attempts}) with model {model}")
-            response = litellm.completion(
-                model=model,
-                messages=messages,
-                num_retries=2
-            )
             raw_code = response.choices[0].message.content
-            # Extract code if wrapped in markdown
-            if "```python" in raw_code:
-                import re
-                match = re.search(r"```python\n(.*?)```", raw_code, re.DOTALL)
                 if match:
                     raw_code = match.group(1).strip()
             # Post-process the code to fix common issues
             processed_code = post_process_code(raw_code)

 from ..utils.code_validator import CodeValidator
 from ..utils.code_fixer import CodeFixer
 from ..inputs.processor import InputProcessor
+from ..utils.content_preprocessor import preprocess_long_content, get_script_mode_prompt_for_long_content
 logger = logging.getLogger(__name__)
             # Get dynamic system prompt based on category
             system_prompt = get_system_prompt(category)
+            # Detect if input is a ready-made script (long content) vs short prompt
+            word_count = len(prompt.split())
+            is_script_mode = word_count > 200
+            # For very long content, preprocess into sections
+            processed_prompt = prompt
+            section_count = 0
+            if word_count > 1000:
+                processed_prompt, section_count = preprocess_long_content(prompt)
+            if section_count > 0:
+                # Very long content - use sectioned prompt
+                logger.info(f"📝 LONG DOCUMENT MODE (Legacy): {word_count} words -> {section_count} sections")
+                user_content = get_script_mode_prompt_for_long_content(processed_prompt, section_count)
+            elif is_script_mode:
+                logger.info(f"📝 SCRIPT MODE (Legacy): Input has {word_count} words - treating as ready-made script")
+                user_content = f"""# 🎬 SCRIPT MODE - ANIMATE THE USER'S CONTENT
+## IMPORTANT: The user has provided their COMPLETE script/content below.
+This is NOT a topic to research - this IS the exact narration/content they want animated.
+## YOUR TASK:
+1. **Use the content below AS the voiceover text** - split it into logical sections
+2. **Create beautiful animations that MATCH each section** of their content
+3. **Do NOT rewrite, summarize, or generate new information** - animate THEIR words
+4. **Every paragraph/section should become a voiceover block** with matching visuals
+## USER'S SCRIPT TO ANIMATE:
+---
+{prompt}
+---
+NOTE!!!:
+1. NO BLANK SCREENS: Keep the screen populated. If a voiceover is playing, show something.
+2. NO OVERLAPS: Ensure text and objects do not overlap. Use `next_to` and `arrange`.
+3. CLEAN TRANSITIONS: Fade out old content before showing new content, but don't leave the screen empty for long.
+4. VARIED ANIMATIONS: Use a mix of Write, FadeIn, GrowFromCenter, etc.
+5. STAY ON SCREEN: Ensure all text and objects are within the screen boundaries. Use .scale_to_fit_width(config.frame_width - 1) for large groups."""
+            else:
+                logger.info(f"📝 GENERATION MODE (Legacy): Input has {word_count} words - LLM will generate content")
+                user_content = f"Create a video about:\n\n{prompt}\n\n NOTE!!!:\n1. NO BLANK SCREENS: Keep the screen populated. If a voiceover is playing, show something.\n2. NO OVERLAPS: Ensure text and objects do not overlap. Use `next_to` and `arrange`.\n3. CLEAN TRANSITIONS: Fade out old content before showing new content, but don't leave the screen empty for long.\n4. VARIED ANIMATIONS: Use a mix of Write, FadeIn, GrowFromCenter, etc.\n5. STAY ON SCREEN: Ensure all text and objects are within the screen boundaries. Use .scale_to_fit_width(config.frame_width - 1) for large groups."
             messages = [
                 {
                     "role": "system",
                 },
                 {
                     "role": "user",
+                    "content": user_content,
                 },
             ]
             logger.info(f"Generating code (attempt {attempt + 1}/{max_attempts}) with model {model}")
+            # Only set max_tokens for long documents
+            kwargs = {
+                "model": model,
+                "messages": messages,
+                "num_retries": 2
+            }
+            if section_count > 0:
+                kwargs["max_tokens"] = 12000
+            response = litellm.completion(**kwargs)
             raw_code = response.choices[0].message.content
+            # Extract code if wrapped in markdown (handle various formats)
+            import re
+            # Try different markdown patterns
+            code_patterns = [
+                r'```python\n(.*?)```',     # Standard: ```python ... ```
+                r'````python\n(.*?)````',   # Quad backticks
+                r'```py\n(.*?)```',         # ```py
+                r'```\n(.*?)```',           # Just backticks without language
+            ]
+            for pattern in code_patterns:
+                match = re.search(pattern, raw_code, re.DOTALL)
                 if match:
                     raw_code = match.group(1).strip()
+                    break
+            # If still has backticks, try to clean up
+            if raw_code.startswith('```'):
+                lines = raw_code.split('\n')
+                # Remove first line if it's just ```python or similar
+                if lines[0].strip().startswith('```'):
+                    lines = lines[1:]
+                # Remove last line if it's just ```
+                if lines and lines[-1].strip() == '```':
+                    lines = lines[:-1]
+                raw_code = '\n'.join(lines)
             # Post-process the code to fix common issues
             processed_code = post_process_code(raw_code)

manimator/services/voiceover.py CHANGED Viewed

@@ -144,19 +144,33 @@ class SimpleElevenLabsService:
             logger.info(f"Generating Edge TTS ({edge_voice}) for: {text[:30]}...")
-            # Edge-tts is async, so we need to run it in an event loop
             async def _generate():
                 communicate = edge_tts.Communicate(text, edge_voice)
                 await communicate.save(str(output_path))
-            # Run the async function
             try:
-                loop = asyncio.get_event_loop()
-            except RuntimeError:
-                loop = asyncio.new_event_loop()
-                asyncio.set_event_loop(loop)
-            loop.run_until_complete(_generate())
             # Verify file was created successfully
             if output_path.exists() and output_path.stat().st_size > 0:
@@ -167,35 +181,5 @@ class SimpleElevenLabsService:
             return output_path
         except Exception as e:
-            logger.error(f"Edge TTS failed: {str(e)}. Falling back to gTTS.")
-            return self._generate_with_gtts(text)
-    def _generate_with_gtts(self, text: str) -> Path:
-        """
-        Last resort fallback using Google Text-to-Speech.
-        """
-        try:
-            from gtts import gTTS
-            # Use absolute path for gTTS cache (important for containerized environments)
-            gtts_cache_dir = BASE_DIR / "media" / "voiceover" / "gtts"
-            gtts_cache_dir.mkdir(parents=True, exist_ok=True)
-            content_hash = hashlib.md5(text.encode("utf-8")).hexdigest()
-            output_path = gtts_cache_dir / f"{content_hash}.mp3"
-            if output_path.exists() and output_path.stat().st_size > 0:
-                logger.info(f"Using cached gTTS voiceover for hash {content_hash}")
-                return output_path
-            logger.info(f"Generating gTTS fallback for: {text[:30]}...")
-            tts = gTTS(text=text, lang='en')
-            tts.save(str(output_path))
-            logger.info(f"gTTS voiceover saved to {output_path}")
-            return output_path
-        except Exception as e:
-            logger.error(f"gTTS fallback failed: {str(e)}")
-            raise RuntimeError(f"All TTS methods failed: {str(e)}")

             logger.info(f"Generating Edge TTS ({edge_voice}) for: {text[:30]}...")
+            # Edge-tts is async, handle event loop properly for Streamlit/Flask contexts
             async def _generate():
                 communicate = edge_tts.Communicate(text, edge_voice)
                 await communicate.save(str(output_path))
+            # Try to use nest_asyncio for Streamlit/Jupyter compatibility
             try:
+                import nest_asyncio
+                nest_asyncio.apply()
+            except ImportError:
+                pass  # nest_asyncio not available, continue anyway
+            # Run the async function with proper event loop handling
+            try:
+                # Try asyncio.run() first (Python 3.7+, creates new loop)
+                asyncio.run(_generate())
+            except RuntimeError as e:
+                # If there's already an event loop running (e.g., in Streamlit/Jupyter)
+                if "cannot be called from a running event loop" in str(e) or "There is no current event loop" in str(e):
+                    loop = asyncio.new_event_loop()
+                    asyncio.set_event_loop(loop)
+                    try:
+                        loop.run_until_complete(_generate())
+                    finally:
+                        loop.close()
+                else:
+                    raise
             # Verify file was created successfully
             if output_path.exists() and output_path.stat().st_size > 0:
             return output_path
         except Exception as e:
+            logger.error(f"Edge TTS failed: {str(e)}")
+            raise RuntimeError(f"Edge TTS voiceover generation failed: {str(e)}")

manimator/utils/content_preprocessor.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""
+Content Preprocessor for Long Inputs
+Handles very long content (PDFs, large text) by:
+1. Chunking content into logical sections
+2. Numbering sections for explicit coverage
+3. Ensuring proportional representation in the video
+"""
+import logging
+import re
+from typing import List, Tuple
+logger = logging.getLogger(__name__)
def chunk_content(content: str, max_words_per_chunk: int = 150) -> List[str]:
    """
    Split content into logical chunks based on paragraphs and sentences.

    Paragraphs (separated by blank lines) are packed greedily into chunks of
    at most ``max_words_per_chunk`` words. A paragraph that is too long on
    its own is split on sentence boundaries, and a single sentence that is
    still too long (e.g. PDF text without punctuation) is hard-split on word
    boundaries so that no chunk exceeds the limit.

    Args:
        content: The full text content
        max_words_per_chunk: Maximum words per chunk; must be >= 1

    Returns:
        List of content chunks (empty if content has no non-blank text)

    Raises:
        ValueError: If max_words_per_chunk is smaller than 1
    """
    if max_words_per_chunk < 1:
        raise ValueError("max_words_per_chunk must be a positive integer")

    # Paragraphs are separated by one or more blank (whitespace-only) lines.
    paragraphs = [
        p.strip() for p in re.split(r'\n\s*\n', content.strip()) if p.strip()
    ]

    chunks: List[str] = []
    pending: List[str] = []  # pieces collected for the chunk being built
    pending_words = 0

    def flush() -> None:
        """Commit the pieces collected so far as one chunk."""
        nonlocal pending, pending_words
        if pending:
            chunks.append(' '.join(pending))
        pending = []
        pending_words = 0

    def add(piece: str, piece_words: int) -> None:
        """Append a piece, starting a new chunk first if it would overflow."""
        nonlocal pending_words
        if pending and pending_words + piece_words > max_words_per_chunk:
            flush()
        pending.append(piece)
        pending_words += piece_words

    for para in paragraphs:
        para_words = len(para.split())
        if para_words <= max_words_per_chunk:
            # Normal paragraph - pack it together with its neighbours.
            add(para, para_words)
            continue

        # Oversized paragraph: it never shares a chunk with other paragraphs,
        # so commit what came before and split it by sentences.
        flush()
        for sentence in re.split(r'(?<=[.!?])\s+', para):
            words = sentence.split()
            if len(words) <= max_words_per_chunk:
                add(sentence, len(words))
                continue

            # Oversized sentence: fall back to fixed-size word windows so a
            # run of unpunctuated text cannot produce one enormous chunk.
            flush()
            for start in range(0, len(words), max_words_per_chunk):
                window = words[start:start + max_words_per_chunk]
                add(' '.join(window), len(window))
        flush()

    # Don't forget the last chunk
    flush()
    return chunks
def preprocess_long_content(content: str, min_words: int = 1000) -> Tuple[str, int]:
    """
    Preprocess long content by chunking and adding section markers.

    For very long content (more than ``min_words`` words), this creates a
    structured format with numbered sections that the LLM MUST cover
    proportionally. Shorter content is returned untouched with a section
    count of 0, which callers use as the "not sectioned" signal.

    Args:
        content: The raw content from PDF/text input
        min_words: Word count above which the content is sectioned
            (default 1000)

    Returns:
        Tuple of (processed_content, section_count)
    """
    word_count = len(content.split())

    # For shorter content, return as-is
    if word_count <= min_words:
        return content, 0

    logger.info(f"📄 Preprocessing very long content: {word_count} words")

    # Longer content = smaller chunks to ensure coverage.
    # Each entry is (word-count floor, max words per chunk), checked in order.
    chunk_size_tiers = ((5000, 120), (3000, 150), (2000, 180))
    max_words = next(
        (size for floor, size in chunk_size_tiers if word_count > floor),
        200,
    )

    chunks = chunk_content(content, max_words_per_chunk=max_words)
    section_count = len(chunks)
    if section_count == 0:
        # Nothing to section (e.g. a negative min_words with blank content);
        # bail out rather than divide by zero in the average below.
        return content, 0

    logger.info(f"📄 Split into {section_count} sections (avg ~{word_count // section_count} words each)")

    # Create structured content with numbered sections
    structured_parts = [
        f"# STRUCTURED CONTENT ({section_count} SECTIONS)",
        "# YOU MUST CREATE A VOICEOVER BLOCK FOR EACH SECTION BELOW",
        f"# Video should cover ALL {section_count} sections proportionally",
        "",
    ]
    for i, chunk in enumerate(chunks, 1):
        structured_parts.extend(
            (f"=== SECTION {i} OF {section_count} ===", chunk, "")
        )

    return '\n'.join(structured_parts), section_count
def get_script_mode_prompt_for_long_content(
    goal: str, section_count: int, *, max_sections: int = 12
) -> str:
    """
    Generate the user prompt for very long (chunked) content.

    This prompt explicitly instructs the LLM to cover ALL sections
    with DETAILED, HIGH-QUALITY animations - not rushed content.

    Args:
        goal: The (already sectioned) content to embed in the prompt
        section_count: Number of sections the content was split into
        max_sections: Upper bound on the number of sections requested from
            the LLM (default 12)

    Returns:
        The complete user prompt text
    """
    seconds_per_section = 30  # midpoint of the 20-40 second target below

    # Cap sections to a reasonable number for quality, but never ask for
    # fewer than one - a zero or negative count would otherwise yield
    # "CREATE 0 DISTINCT SECTIONS" and a 0-second video.
    effective_sections = max(1, min(section_count, max_sections))
    total_seconds = effective_sections * seconds_per_section

    return f"""Create a DETAILED animated video from this document.
CONTENT TO ANIMATE:
{goal}
CRITICAL REQUIREMENTS:
1. CREATE {effective_sections} DISTINCT SECTIONS - each with its own voiceover block
2. EACH SECTION MUST BE 20-40 SECONDS with rich animations
3. USE VARIED ANIMATIONS: FadeIn, Write, GrowFromCenter, LaggedStart, Indicate, Circumscribe
4. DO NOT RUSH - build visuals progressively in each section
5. CLEAN TRANSITIONS between sections using FadeOut before new content
6. USE THE ACTUAL TEXT from each section as voiceover content
DO NOT:
- Create only 1-2 voiceover blocks
- Rush through in 5 seconds
- Skip middle content
- Use only Write() for everything
VIDEO DURATION: Approximately {total_seconds} seconds total
Each section should have:
- A title/header animation
- Multiple visual elements built progressively
- Emphasis animations (Indicate, Circumscribe)
- Clean transition to next section
"""

requirements.txt CHANGED Viewed

@@ -12,4 +12,5 @@ requests
 beautifulsoup4>=4.12.0
 lxml>=4.9.0
 readability-lxml>=0.8.1
-edge-tts>=6.1.0

 beautifulsoup4>=4.12.0
 lxml>=4.9.0
 readability-lxml>=0.8.1
+edge-tts>=6.1.0
+nest_asyncio>=1.5.0