Spaces:
Build error
Build error
| import gradio as gr | |
| import requests | |
| import os | |
| import base64 | |
| import json | |
| import re | |
| from pathlib import Path | |
| from typing import List, Dict, Optional, Tuple | |
| import zipfile | |
| import io | |
| from datetime import datetime | |
| import math | |
| from utils import ( | |
| clean_code_content, | |
| get_file_language, | |
| estimate_tokens, | |
| create_chunked_output | |
| ) | |
| from models import ( | |
| process_github_repo, | |
| process_huggingface_repo, | |
| download_repo_as_zip | |
| ) | |
| from config import ( | |
| SUPPORTED_EXTENSIONS, | |
| MAX_FILE_SIZE, | |
| MAX_TOTAL_SIZE, | |
| CHUNK_SIZE, | |
| GITHUB_API_BASE, | |
| HF_API_BASE | |
| ) | |
# CSS for better UI
# Injected into gr.Blocks(css=...) in create_interface(). Defines the page
# container, a progress bar, and the colored status panels; .file-stats is the
# class emitted by generate_stats() below.
css = """
.container {
    max-width: 1200px;
    margin: 0 auto;
}
.progress-bar {
    height: 20px;
    background: linear-gradient(90deg, #4CAF50, #45a049);
    border-radius: 10px;
    transition: width 0.3s ease;
}
.file-stats {
    background: #f0f0f0;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
.warning {
    background: #fff3cd;
    border: 1px solid #ffeaa7;
    padding: 10px;
    border-radius: 5px;
    color: #856404;
}
.error {
    background: #f8d7da;
    border: 1px solid #f5c6cb;
    padding: 10px;
    border-radius: 5px;
    color: #721c24;
}
.success {
    background: #d4edda;
    border: 1px solid #c3e6cb;
    padding: 10px;
    border-radius: 5px;
    color: #155724;
}
"""
def validate_repo_url(url: str) -> Tuple[str, str]:
    """Classify a repository URL and extract its owner/name path.

    Returns:
        A tuple ("github" | "huggingface", "owner/name").

    Raises:
        ValueError: if the URL matches neither platform.
    """
    url = url.strip()
    # GitHub patterns are tried first, mirroring the priority of the
    # dispatch in process_repository(). Trailing ".git" and "/" are ignored.
    platform_patterns = (
        ("github", (
            r'github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$',
            r'api\.github\.com/repos/([^/]+)/([^/]+)',
        )),
        ("huggingface", (
            r'huggingface\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
            r'hf\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
        )),
    )
    for platform, patterns in platform_patterns:
        for pattern in patterns:
            found = re.search(pattern, url)
            if found:
                return platform, f"{found.group(1)}/{found.group(2)}"
    raise ValueError("Invalid repository URL. Please provide a valid GitHub or Hugging Face repository URL.")
def process_repository(
    repo_url: str,
    token: str = "",
    include_patterns: str = "",
    exclude_patterns: str = "",
    max_file_size_mb: int = 10,
    chunk_size: int = 50000,
    include_metadata: bool = True,
    remove_comments: bool = False,
    progress=gr.Progress()
) -> Tuple[str, str, str]:
    """Fetch a repository and consolidate its files into one text dump.

    Args:
        repo_url: GitHub or Hugging Face repository URL.
        token: Optional access token for private repositories.
        include_patterns: Comma-separated glob patterns to include ("" = all).
        exclude_patterns: Comma-separated glob patterns to exclude.
        max_file_size_mb: Files larger than this (in MB) are skipped upstream.
        chunk_size: Insert a chunk-boundary marker whenever the running
            output would exceed this many characters.
        include_metadata: Prepend a repository header and append a summary.
        remove_comments: Strip comments from code files via clean_code_content.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (consolidated_text, stats_html, status) where status is "success",
        "error", or "" when no files matched.
    """
    try:
        # Validate URL and resolve platform + "owner/name" path
        repo_type, repo_path = validate_repo_url(repo_url)

        # Parse include/exclude patterns (empty string -> no filtering)
        include_list = [p.strip() for p in include_patterns.split(",") if p.strip()] if include_patterns else []
        exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()] if exclude_patterns else []

        progress(0.1, desc="Fetching repository information...")

        # Dispatch to the platform-specific fetcher; both share a signature.
        fetcher = process_github_repo if repo_type == "github" else process_huggingface_repo
        files_data, repo_info = fetcher(
            repo_path,
            token,
            include_list,
            exclude_list,
            max_file_size_mb * 1024 * 1024
        )

        if not files_data:
            return "", "⚠️ No files found matching the criteria.", ""

        progress(0.3, desc="Processing files...")

        total_files = len(files_data)
        processed_files = 0
        total_tokens = 0
        total_chars = 0

        # Optional repository header
        header_lines = []
        if include_metadata:
            header_lines.append("=" * 80)
            header_lines.append(f"REPOSITORY: {repo_info.get('full_name', repo_path)}")
            header_lines.append(f"DESCRIPTION: {repo_info.get('description', 'No description')}")
            header_lines.append(f"URL: {repo_url}")
            header_lines.append(f"PROCESSED: {datetime.now().isoformat()}")
            header_lines.append(f"TOTAL FILES: {total_files}")
            header_lines.append("=" * 80)
            header_lines.append("")

        content_parts = ["\n".join(header_lines)]
        current_chunk_len = len(content_parts[0])

        for i, (file_path, content, file_size) in enumerate(files_data):
            progress(0.3 + (0.5 * i / total_files), desc=f"Processing file {i+1}/{total_files}")

            # Clean content if requested
            if remove_comments:
                content = clean_code_content(content, file_path)

            # Per-file header
            file_header = f"\n{'-' * 60}\n"
            file_header += f"FILE: {file_path}\n"
            file_header += f"SIZE: {file_size:,} bytes\n"
            file_header += f"LANGUAGE: {get_file_language(file_path)}\n"
            file_header += f"{'-' * 60}\n\n"

            file_content = file_header + content + "\n\n"

            # BUG FIX: the original `yield`-ed a partial result here, which
            # silently turned the whole function into a generator — every
            # `return value` (including the error path) then produced no
            # output for the Gradio handler. It also rebuilt content_parts
            # with a duplicated file_header. Instead of streaming, mark the
            # chunk boundary inline and keep accumulating a single string.
            if processed_files > 0 and current_chunk_len + len(file_content) > chunk_size:
                content_parts.append(f"\n{'=' * 80}\nCHUNK BOUNDARY (chunk size limit reached)\n{'=' * 80}\n")
                current_chunk_len = 0

            content_parts.append(file_content)
            current_chunk_len += len(file_content)
            processed_files += 1
            total_chars += len(content)
            total_tokens += estimate_tokens(content)

        progress(0.9, desc="Finalizing...")

        final_content = "\n".join(content_parts)

        # Optional summary footer
        if include_metadata:
            footer = f"\n{'=' * 80}\n"
            footer += f"SUMMARY:\n"
            footer += f"- Files processed: {processed_files}\n"
            footer += f"- Total characters: {total_chars:,}\n"
            footer += f"- Estimated tokens: {total_tokens:,}\n"
            footer += f"- Repository: {repo_info.get('full_name', repo_path)}\n"
            footer += f"{'=' * 80}\n"
            final_content += footer

        progress(1.0, desc="Complete!")
        return final_content, generate_stats(processed_files, total_tokens, total_chars, total_files), "success"

    except Exception as e:
        # Surface any failure (bad URL, API error, decode error) as a
        # user-visible message instead of crashing the UI.
        error_msg = f"❌ Error: {str(e)}"
        return "", error_msg, "error"
def generate_stats(files_processed: int, tokens: int, chars: int, total_files: int) -> str:
    """Return an HTML panel summarizing the processing run.

    Styled by the .file-stats rule in the module-level CSS.
    """
    # max(..., 1) guards the average against division by zero when
    # nothing has been processed yet.
    avg_tokens = tokens // max(files_processed, 1)
    return f"""
    <div class="file-stats">
        <h3>📊 Processing Statistics</h3>
        <p><strong>Files Processed:</strong> {files_processed:,} / {total_files:,}</p>
        <p><strong>Total Characters:</strong> {chars:,}</p>
        <p><strong>Estimated Tokens:</strong> {tokens:,}</p>
        <p><strong>Average Tokens per File:</strong> {avg_tokens:,}</p>
    </div>
    """
def download_repo_locally(repo_url: str, token: str = "") -> str:
    """Download the repository as a ZIP archive via download_repo_as_zip.

    Failures are reported as a plain "Error downloading repository: ..."
    string rather than raised, preserving the original best-effort contract.
    """
    try:
        repo_type, repo_path = validate_repo_url(repo_url)
        host = "github.com" if repo_type == "github" else "huggingface.co"
        return download_repo_as_zip(f"{host}/{repo_path}", token)
    except Exception as e:
        return f"Error downloading repository: {str(e)}"
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI (unlaunched).

    Layout: a two-column input row (URL/token + advanced options on the
    left, static info on the right), followed by the output widgets, the
    event wiring, and a set of examples.
    """
    with gr.Blocks(
        title="Repo-to-Text Converter",
        theme=gr.themes.Soft(),
        css=css
    ) as demo:
        gr.Markdown("""
        # 🔄 Repository to Text Converter
        Convert GitHub or Hugging Face repositories into formatted text files perfect for LLM training.
        **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
        """)
        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.Markdown("## 📥 Repository Input")
                repo_url = gr.Textbox(
                    label="Repository URL",
                    placeholder="https://github.com/username/repo or https://huggingface.co/username/repo",
                    lines=2
                )
                token = gr.Textbox(
                    label="Access Token (Optional)",
                    placeholder="GitHub token or Hugging Face token for private repos",
                    type="password"
                )
                with gr.Accordion("🔧 Advanced Options", open=False):
                    include_patterns = gr.Textbox(
                        label="Include Patterns (comma-separated)",
                        placeholder="*.py,*.md,src/**/*.py",
                        info="Only include files matching these patterns"
                    )
                    exclude_patterns = gr.Textbox(
                        label="Exclude Patterns (comma-separated)",
                        placeholder="*.git*,*.log,node_modules/**",
                        value="*.git*,*.log,node_modules/**,__pycache__/**,.DS_Store"
                    )
                    max_file_size = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="Max File Size (MB)",
                        info="Files larger than this will be skipped"
                    )
                    chunk_size = gr.Slider(
                        minimum=1000,
                        maximum=100000,
                        value=50000,
                        step=1000,
                        label="Chunk Size (characters)",
                        info="Split output into chunks of this size"
                    )
                    include_metadata = gr.Checkbox(
                        value=True,
                        label="Include Metadata",
                        info="Add repository information and statistics"
                    )
                    remove_comments = gr.Checkbox(
                        value=False,
                        label="Remove Comments",
                        info="Strip comments from code files (experimental)"
                    )
                process_btn = gr.Button(
                    "🚀 Process Repository",
                    variant="primary",
                    size="lg"
                )
                download_btn = gr.Button(
                    "⬇️ Download as ZIP",
                    variant="secondary"
                )
            with gr.Column(scale=1):
                # Info section (static help text only, no widgets)
                gr.Markdown("## ℹ️ Information")
                gr.Markdown("""
                ### Supported Platforms:
                - ✅ GitHub (public and private)
                - ✅ Hugging Face (public and private)
                ### Supported File Types:
                - Code files (.py, .js, .java, .cpp, etc.)
                - Documentation (.md, .txt, .rst)
                - Configuration files (.json, .yaml, .toml)
                - And many more!
                ### Features:
                - 📄 Chunked output for large repos
                - 📊 Token estimation
                - 🎯 Pattern-based file filtering
                - 🧹 Optional comment removal
                """)
        # Output section
        gr.Markdown("## 📤 Output")
        with gr.Row():
            stats_display = gr.HTML(label="Statistics")
        output_text = gr.Textbox(
            label="Generated Text",
            lines=20,
            max_lines=50,
            show_copy_button=True,
            interactive=True
        )
        status_display = gr.HTML()
        # Event handlers: input component order must match
        # process_repository's positional parameters.
        process_btn.click(
            fn=process_repository,
            inputs=[
                repo_url,
                token,
                include_patterns,
                exclude_patterns,
                max_file_size,
                chunk_size,
                include_metadata,
                remove_comments
            ],
            outputs=[output_text, stats_display, status_display]
        )
        # NOTE(review): this gr.File is created outside the layout, so it is
        # not visible on the page — confirm the download actually surfaces.
        download_btn.click(
            fn=download_repo_locally,
            inputs=[repo_url, token],
            outputs=gr.File(label="Downloaded Repository")
        )
        # Examples (prefill only; no fn attached, so clicking does not run)
        gr.Markdown("## 🎯 Examples")
        gr.Examples(
            examples=[
                [
                    "https://github.com/gradio-app/gradio",
                    "",
                    "*.py,*.md",
                    "",
                    10,
                    50000,
                    True,
                    False
                ],
                [
                    "https://huggingface.co/huggingface/transformers",
                    "",
                    "*.py,*.md,*.rst",
                    "tests/**,docs/**",
                    5,
                    30000,
                    True,
                    False
                ]
            ],
            inputs=[
                repo_url,
                token,
                include_patterns,
                exclude_patterns,
                max_file_size,
                chunk_size,
                include_metadata,
                remove_comments
            ]
        )
    return demo
if __name__ == "__main__":
    demo = create_interface()
    # BUG FIX: `show_tips` was removed from launch() in Gradio 4.x, so
    # passing it raised TypeError at startup (the "Build error" this Space
    # reported). Drop it; keep the remaining options.
    demo.launch(
        share=True,
        show_error=True
    )