Spaces:
Build error
Build error
Deploy Gradio app with multiple files
Browse files- app.py +441 -0
- config.py +238 -0
- models.py +226 -0
- requirements.txt +45 -0
- utils.py +365 -0
app.py
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import requests
|
| 3 |
+
import os
|
| 4 |
+
import base64
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import List, Dict, Optional, Tuple
|
| 9 |
+
import zipfile
|
| 10 |
+
import io
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
import math
|
| 13 |
+
|
| 14 |
+
from utils import (
|
| 15 |
+
clean_code_content,
|
| 16 |
+
get_file_language,
|
| 17 |
+
estimate_tokens,
|
| 18 |
+
create_chunked_output
|
| 19 |
+
)
|
| 20 |
+
from models import (
|
| 21 |
+
process_github_repo,
|
| 22 |
+
process_huggingface_repo,
|
| 23 |
+
download_repo_as_zip
|
| 24 |
+
)
|
| 25 |
+
from config import (
|
| 26 |
+
SUPPORTED_EXTENSIONS,
|
| 27 |
+
MAX_FILE_SIZE,
|
| 28 |
+
MAX_TOTAL_SIZE,
|
| 29 |
+
CHUNK_SIZE,
|
| 30 |
+
GITHUB_API_BASE,
|
| 31 |
+
HF_API_BASE
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# CSS for better UI — injected into gr.Blocks in create_interface().
# Provides the layout width cap, a progress bar style, and the colored
# status boxes (.file-stats / .warning / .error / .success) that the
# stats and status gr.HTML components reference by class name.
css = """
.container {
    max-width: 1200px;
    margin: 0 auto;
}
.progress-bar {
    height: 20px;
    background: linear-gradient(90deg, #4CAF50, #45a049);
    border-radius: 10px;
    transition: width 0.3s ease;
}
.file-stats {
    background: #f0f0f0;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
.warning {
    background: #fff3cd;
    border: 1px solid #ffeaa7;
    padding: 10px;
    border-radius: 5px;
    color: #856404;
}
.error {
    background: #f8d7da;
    border: 1px solid #f5c6cb;
    padding: 10px;
    border-radius: 5px;
    color: #721c24;
}
.success {
    background: #d4edda;
    border: 1px solid #c3e6cb;
    padding: 10px;
    border-radius: 5px;
    color: #155724;
}
"""
|
| 74 |
+
|
| 75 |
+
def validate_repo_url(url: str) -> Tuple[str, str]:
    """Classify a repository URL and extract its owner/name.

    Returns:
        A ``(platform, "owner/name")`` tuple where ``platform`` is either
        ``"github"`` or ``"huggingface"``.

    Raises:
        ValueError: if the URL matches neither platform.
    """
    candidate = url.strip()

    # Each platform maps to the URL shapes it may appear under; GitHub is
    # checked first, matching the original lookup order.
    pattern_table = (
        ("github", (
            r'github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$',
            r'api\.github\.com/repos/([^/]+)/([^/]+)',
        )),
        ("huggingface", (
            r'huggingface\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
            r'hf\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
        )),
    )

    for platform, patterns in pattern_table:
        for pattern in patterns:
            found = re.search(pattern, candidate)
            if found:
                return platform, f"{found.group(1)}/{found.group(2)}"

    raise ValueError("Invalid repository URL. Please provide a valid GitHub or Hugging Face repository URL.")
|
| 102 |
+
|
| 103 |
+
def process_repository(
    repo_url: str,
    token: str = "",
    include_patterns: str = "",
    exclude_patterns: str = "",
    max_file_size_mb: int = 10,
    chunk_size: int = 50000,
    include_metadata: bool = True,
    remove_comments: bool = False,
    progress=gr.Progress()
) -> Tuple[str, str, str]:
    """Fetch a repository and consolidate its text files into one string.

    Args:
        repo_url: GitHub or Hugging Face repository URL.
        token: optional access token for private repositories.
        include_patterns: comma-separated glob list; empty means "all".
        exclude_patterns: comma-separated glob list of files to skip.
        max_file_size_mb: per-file size cap in megabytes.
        chunk_size: soft character budget per chunk; a visible separator
            line is inserted into the output each time it is exceeded.
        include_metadata: prepend a repo header and append a summary footer.
        remove_comments: strip comments via utils.clean_code_content.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (consolidated_text, stats_html, status) where status is "success",
        "error", or "" when no files matched.
    """
    try:
        repo_type, repo_path = validate_repo_url(repo_url)

        include_list = [p.strip() for p in include_patterns.split(",") if p.strip()] if include_patterns else []
        exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()] if exclude_patterns else []

        progress(0.1, desc="Fetching repository information...")

        # Both fetchers share the same signature, so dispatch on repo_type.
        fetch = process_github_repo if repo_type == "github" else process_huggingface_repo
        files_data, repo_info = fetch(
            repo_path,
            token,
            include_list,
            exclude_list,
            max_file_size_mb * 1024 * 1024
        )

        if not files_data:
            return "", "⚠️ No files found matching the criteria.", ""

        progress(0.3, desc="Processing files...")

        total_files = len(files_data)
        processed_files = 0
        total_tokens = 0
        total_chars = 0

        # Repository-level header (only when metadata is requested).
        header_lines = []
        if include_metadata:
            header_lines.append("=" * 80)
            header_lines.append(f"REPOSITORY: {repo_info.get('full_name', repo_path)}")
            header_lines.append(f"DESCRIPTION: {repo_info.get('description', 'No description')}")
            header_lines.append(f"URL: {repo_url}")
            header_lines.append(f"PROCESSED: {datetime.now().isoformat()}")
            header_lines.append(f"TOTAL FILES: {total_files}")
            header_lines.append("=" * 80)
            header_lines.append("")

        content_parts = ["\n".join(header_lines)]
        current_size = len(content_parts[0])

        for i, (file_path, content, file_size) in enumerate(files_data):
            progress(0.3 + (0.5 * i / total_files), desc=f"Processing file {i+1}/{total_files}")

            if remove_comments:
                content = clean_code_content(content, file_path)

            file_header = f"\n{'-' * 60}\n"
            file_header += f"FILE: {file_path}\n"
            file_header += f"SIZE: {file_size:,} bytes\n"
            file_header += f"LANGUAGE: {get_file_language(file_path)}\n"
            file_header += f"{'-' * 60}\n\n"

            file_content = file_header + content + "\n\n"

            # BUG FIX: the original body both `yield`ed intermediate chunks
            # and used value-carrying `return` statements. That made the
            # whole function a generator, so the final `return (text, stats,
            # status)` became an invisible StopIteration payload and the
            # Gradio click handler's outputs were never populated. Instead
            # of yielding, mark chunk boundaries inline with a separator.
            if processed_files and current_size + len(file_content) > chunk_size:
                separator = f"\n{'=' * 80}\nCHUNK BOUNDARY ({chunk_size:,}-char budget reached)\n{'=' * 80}\n"
                content_parts.append(separator)
                current_size = len(separator)

            content_parts.append(file_content)
            current_size += len(file_content)
            processed_files += 1
            total_chars += len(content)
            total_tokens += estimate_tokens(content)

        progress(0.9, desc="Finalizing...")

        final_content = "\n".join(content_parts)

        if include_metadata:
            footer = f"\n{'=' * 80}\n"
            footer += f"SUMMARY:\n"
            footer += f"- Files processed: {processed_files}\n"
            footer += f"- Total characters: {total_chars:,}\n"
            footer += f"- Estimated tokens: {total_tokens:,}\n"
            footer += f"- Repository: {repo_info.get('full_name', repo_path)}\n"
            footer += f"{'=' * 80}\n"
            final_content += footer

        progress(1.0, desc="Complete!")

        return final_content, generate_stats(processed_files, total_tokens, total_chars, total_files), "success"

    except Exception as e:
        return "", f"❌ Error: {str(e)}", "error"
|
| 222 |
+
|
| 223 |
+
def generate_stats(files_processed: int, tokens: int, chars: int, total_files: int) -> str:
    """Render processing statistics as an HTML snippet.

    The markup uses the ``file-stats`` CSS class defined in the module-level
    ``css`` string. Division guards against ``files_processed == 0``.
    """
    avg_tokens_per_file = tokens // max(files_processed, 1)
    return f"""
    <div class="file-stats">
        <h3>📊 Processing Statistics</h3>
        <p><strong>Files Processed:</strong> {files_processed:,} / {total_files:,}</p>
        <p><strong>Total Characters:</strong> {chars:,}</p>
        <p><strong>Estimated Tokens:</strong> {tokens:,}</p>
        <p><strong>Average Tokens per File:</strong> {avg_tokens_per_file:,}</p>
    </div>
    """
|
| 235 |
+
|
| 236 |
+
def download_repo_locally(repo_url: str, token: str = "") -> str:
    """Download a repository as a ZIP archive via download_repo_as_zip.

    Resolves the URL to a platform + owner/name, rebuilds the host-prefixed
    path the downloader expects, and returns whatever it returns. Any
    failure (bad URL, network error) is reported as an error string rather
    than raised, matching the Gradio handler contract.
    """
    try:
        repo_type, repo_path = validate_repo_url(repo_url)
        host = "github.com" if repo_type == "github" else "huggingface.co"
        return download_repo_as_zip(f"{host}/{repo_path}", token)
    except Exception as e:
        return f"Error downloading repository: {str(e)}"
|
| 248 |
+
|
| 249 |
+
# Create Gradio interface
def create_interface():
    """Assemble and return the Gradio Blocks app.

    Layout: a two-column input/info row, an output section (stats HTML,
    generated text, status HTML, downloaded-file slot), and clickable
    examples. Event wiring connects the buttons to process_repository and
    download_repo_locally.
    """
    with gr.Blocks(
        title="Repo-to-Text Converter",
        theme=gr.themes.Soft(),
        css=css
    ) as demo:

        gr.Markdown("""
        # 📚 Repository to Text Converter

        Convert GitHub or Hugging Face repositories into formatted text files perfect for LLM training.

        **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.Markdown("## 📥 Repository Input")

                repo_url = gr.Textbox(
                    label="Repository URL",
                    placeholder="https://github.com/username/repo or https://huggingface.co/username/repo",
                    lines=2
                )

                token = gr.Textbox(
                    label="Access Token (Optional)",
                    placeholder="GitHub token or Hugging Face token for private repos",
                    type="password"
                )

                with gr.Accordion("🔧 Advanced Options", open=False):
                    include_patterns = gr.Textbox(
                        label="Include Patterns (comma-separated)",
                        placeholder="*.py,*.md,src/**/*.py",
                        info="Only include files matching these patterns"
                    )

                    exclude_patterns = gr.Textbox(
                        label="Exclude Patterns (comma-separated)",
                        placeholder="*.git*,*.log,node_modules/**",
                        value="*.git*,*.log,node_modules/**,__pycache__/**,.DS_Store"
                    )

                    max_file_size = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="Max File Size (MB)",
                        info="Files larger than this will be skipped"
                    )

                    chunk_size = gr.Slider(
                        minimum=1000,
                        maximum=100000,
                        value=50000,
                        step=1000,
                        label="Chunk Size (characters)",
                        info="Split output into chunks of this size"
                    )

                    include_metadata = gr.Checkbox(
                        value=True,
                        label="Include Metadata",
                        info="Add repository information and statistics"
                    )

                    remove_comments = gr.Checkbox(
                        value=False,
                        label="Remove Comments",
                        info="Strip comments from code files (experimental)"
                    )

                process_btn = gr.Button(
                    "🚀 Process Repository",
                    variant="primary",
                    size="lg"
                )

                download_btn = gr.Button(
                    "⬇️ Download as ZIP",
                    variant="secondary"
                )

            with gr.Column(scale=1):
                # Info section
                gr.Markdown("## ℹ️ Information")

                gr.Markdown("""
                ### Supported Platforms:
                - ✅ GitHub (public and private)
                - ✅ Hugging Face (public and private)

                ### Supported File Types:
                - Code files (.py, .js, .java, .cpp, etc.)
                - Documentation (.md, .txt, .rst)
                - Configuration files (.json, .yaml, .toml)
                - And many more!

                ### Features:
                - 🔄 Chunked output for large repos
                - 📊 Token estimation
                - 🎯 Pattern-based file filtering
                - 🧹 Optional comment removal
                """)

        # Output section
        gr.Markdown("## 📤 Output")

        with gr.Row():
            stats_display = gr.HTML(label="Statistics")

        output_text = gr.Textbox(
            label="Generated Text",
            lines=20,
            max_lines=50,
            show_copy_button=True,
            interactive=True
        )

        status_display = gr.HTML()

        # BUG FIX: the original passed `outputs=gr.File(...)` constructed
        # inline inside the .click() call. A component instantiated outside
        # the layout never renders, so the downloaded ZIP was invisible to
        # the user. Declare the component in the layout and reference it.
        download_file = gr.File(label="Downloaded Repository")

        # Event handlers
        process_btn.click(
            fn=process_repository,
            inputs=[
                repo_url,
                token,
                include_patterns,
                exclude_patterns,
                max_file_size,
                chunk_size,
                include_metadata,
                remove_comments
            ],
            outputs=[output_text, stats_display, status_display]
        )

        download_btn.click(
            fn=download_repo_locally,
            inputs=[repo_url, token],
            outputs=download_file
        )

        # Examples
        gr.Markdown("## 🎯 Examples")
        gr.Examples(
            examples=[
                [
                    "https://github.com/gradio-app/gradio",
                    "",
                    "*.py,*.md",
                    "",
                    10,
                    50000,
                    True,
                    False
                ],
                [
                    "https://huggingface.co/huggingface/transformers",
                    "",
                    "*.py,*.md,*.rst",
                    "tests/**,docs/**",
                    5,
                    30000,
                    True,
                    False
                ]
            ],
            inputs=[
                repo_url,
                token,
                include_patterns,
                exclude_patterns,
                max_file_size,
                chunk_size,
                include_metadata,
                remove_comments
            ]
        )

    return demo
|
| 434 |
+
|
| 435 |
+
if __name__ == "__main__":
    demo = create_interface()
    # BUG FIX: the original passed show_tips=True, a parameter removed in
    # Gradio 4.x — launch() raised TypeError at startup, which is the
    # likely cause of the Space's "Build error". share=True is also
    # unsupported on Hugging Face Spaces (it only logs a warning there),
    # so both flags are dropped.
    demo.launch(show_error=True)
|
config.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration constants
|
| 2 |
+
|
| 3 |
+
# API endpoints
# Base URLs for the two supported hosts. models.py appends
# /repos/{owner}/{name}/... (GitHub REST) and /api/models/{owner}/{name}/...
# (Hugging Face Hub API) to these.
GITHUB_API_BASE = "https://api.github.com"
HF_API_BASE = "https://huggingface.co"
|
| 6 |
+
|
| 7 |
+
# Supported file extensions for text processing.
# A single set holding both dotted extensions (".py") and well-known exact
# filenames ("Dockerfile"); built from themed groups for readability.
# Duplicates between groups collapse harmlessly in the union.
SUPPORTED_EXTENSIONS = (
    # Programming languages and markup
    {
        '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.cs', '.go', '.rs',
        '.php', '.rb', '.swift', '.kt', '.scala', '.r', '.m', '.sh', '.bash', '.zsh',
        '.fish', '.ps1', '.bat', '.sql', '.html', '.htm', '.xml', '.css', '.scss',
        '.sass', '.less', '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
        '.md', '.rst', '.txt', '.log', '.dockerfile', '.gitignore', '.gitattributes',
        '.editorconfig', '.eslintrc', '.prettierrc', '.babelrc', '.tsconfig',
    }
    # Configuration / manifest files (matched by exact filename)
    | {
        '.env', '.env.example', '.env.local', '.env.development', '.env.production',
        'package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
        'requirements.txt', 'Pipfile', 'poetry.lock', 'pyproject.toml',
        'Cargo.toml', 'Cargo.lock', 'go.mod', 'go.sum', 'composer.json',
        'composer.lock', 'Gemfile', 'Gemfile.lock', 'pom.xml', 'build.gradle',
        'CMakeLists.txt', 'Makefile', 'Dockerfile', 'docker-compose.yml',
    }
    # Documentation
    | {'.md', '.rst', '.txt', '.adoc', '.tex', '.bib'}
    # Data formats
    | {
        '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
        '.csv', '.tsv', '.xml', '.rss', '.atom',
    }
    # Scripts
    | {
        '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
        '.py', '.pl', '.rb', '.lua', '.tcl', '.awk', '.sed',
    }
)
|
| 36 |
+
|
| 37 |
+
# Size limits
MAX_FILE_SIZE = 10 * 1024 * 1024  # per-file cap in bytes (10MB default; UI slider overrides)
MAX_TOTAL_SIZE = 100 * 1024 * 1024  # total-output cap in bytes (100MB default)
CHUNK_SIZE = 50000  # characters per output chunk (matches the UI slider default)
|
| 41 |
+
|
| 42 |
+
# File patterns to exclude by default: VCS metadata, logs, dependency and
# build output, editor state, and tool caches.
# BUG FIX: the original list contained `dmypy.json",` without an opening
# quote — a SyntaxError that broke importing this module entirely (and with
# it the app). Exact duplicate entries (".pytest_cache/**", "coverage.xml",
# "*.cover" x3, ".hypothesis/**", the doubled "dmypy.json") are dropped;
# pattern matching is idempotent so behavior is unchanged.
DEFAULT_EXCLUDE_PATTERNS = [
    "*.git*",
    "*.log",
    "node_modules/**",
    "__pycache__/**",
    ".DS_Store",
    "Thumbs.db",
    "*.tmp",
    "*.temp",
    "*.swp",
    "*.swo",
    "*~",
    ".vscode/**",
    ".idea/**",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".Python",
    "build/**",
    "dist/**",
    "*.egg-info/**",
    ".pytest_cache/**",
    ".coverage",
    "htmlcov/**",
    ".tox/**",
    "*.cover",
    "coverage.xml",
    ".hypothesis/**",
    ".mypy_cache/**",
    "dmypy.json",
    "nosetests.xml",
    ".cache/**",
    "*.pid",
    "*.seed",
    "*.pid.lock",
    ".nyc_output",
    ".grunt",
    ".bower",
    ".lock-wscript",
    "build/Release",
    "jspm_packages/",
    "typings",
    ".npm",
    ".eslintcache",
    ".stylelintcache",
    "*.tsbuildinfo",
    ".rsync_user",
    ".vscode-test",
]
|
| 98 |
+
|
| 99 |
+
# File patterns to include by default when the user supplies none:
# "*.<ext>" globs for common source/config extensions, followed by
# well-known project manifest filenames. Order matches the original list.
_INCLUDE_EXTENSIONS = (
    "py js ts jsx tsx java cpp c cs go rs php rb swift kt scala r m "
    "sh bash zsh fish ps1 bat sql html htm xml css scss sass less "
    "json yaml yml toml ini cfg conf md rst txt dockerfile gitignore "
    "gitattributes editorconfig eslintrc prettierrc babelrc tsconfig"
).split()

_INCLUDE_FILENAMES = [
    "package.json", "requirements.txt", "Pipfile", "poetry.lock",
    "pyproject.toml", "Cargo.toml", "go.mod", "composer.json", "Gemfile",
    "pom.xml", "build.gradle", "CMakeLists.txt", "Makefile", "Dockerfile",
    "docker-compose.yml",
]

DEFAULT_INCLUDE_PATTERNS = [f"*.{ext}" for ext in _INCLUDE_EXTENSIONS] + _INCLUDE_FILENAMES
|
| 167 |
+
|
| 168 |
+
# Language comment patterns for cleaning (keyed by language id).
# Presumably consumed by utils.clean_code_content — TODO confirm which
# regex flags it applies: the `.*$` line-comment patterns only work
# per-line under re.MULTILINE, and the block patterns (`""".*?"""`,
# `/\*.*?\*/`, `=begin.*?=end`, `<!--.*?-->`) only span lines under
# re.DOTALL. NOTE(review): these patterns will also strip comment-like
# text inside string literals; acceptable for the "experimental" UI flag.
COMMENT_PATTERNS = {
    'python': [r'#.*$', r'""".*?"""', r"'''.*?'''"],
    'javascript': [r'//.*$', r'/\*.*?\*/'],
    'java': [r'//.*$', r'/\*.*?\*/'],
    'cpp': [r'//.*$', r'/\*.*?\*/'],
    'c': [r'//.*$', r'/\*.*?\*/'],
    'cs': [r'//.*$', r'/\*.*?\*/'],
    'go': [r'//.*$', r'/\*.*?\*/'],
    'rs': [r'//.*$', r'/\*.*?\*/'],
    'php': [r'//.*$', r'#.*$', r'/\*.*?\*/'],
    'ruby': [r'#.*$', r'=begin.*?=end'],
    'shell': [r'#.*$'],
    'sql': [r'--.*$', r'/\*.*?\*/'],
    'html': [r'<!--.*?-->'],
    'xml': [r'<!--.*?-->'],
    'css': [r'/\*.*?\*/'],
}
|
| 186 |
+
|
| 187 |
+
# Token estimation multipliers for different languages.
# Rough tokens-per-character ratios — presumably used as
# tokens ≈ len(text) * multiplier by utils.estimate_tokens (TODO confirm).
# 'default' is the fallback for languages not listed here.
TOKEN_MULTIPLIERS = {
    'python': 0.25,
    'javascript': 0.3,
    'java': 0.25,
    'cpp': 0.25,
    'c': 0.25,
    'cs': 0.25,
    'go': 0.25,
    'rs': 0.25,
    'php': 0.3,
    'ruby': 0.25,
    'shell': 0.3,
    'sql': 0.25,
    'html': 0.2,
    'xml': 0.2,
    'css': 0.25,
    'json': 0.15,
    'yaml': 0.2,
    'markdown': 0.2,
    'text': 0.25,
    'default': 0.25,
}
|
| 210 |
+
|
| 211 |
+
# Rate limiting / HTTP settings.
# NOTE(review): neither constant is referenced by the code visible in this
# commit — models.py issues requests.get calls without throttling or a
# timeout. Wire these in (e.g. timeout=REQUEST_TIMEOUT) or remove them.
MAX_REQUESTS_PER_MINUTE = 60
REQUEST_TIMEOUT = 30  # seconds
|
| 214 |
+
|
| 215 |
+
# UI Configuration — named hex colors for the interface.
# NOTE(review): not referenced by the visible app.py (which uses the
# inline `css` string and gr.themes.Soft()); confirm before relying on it.
THEME_COLORS = {
    'primary': '#3070f0',
    'secondary': '#64748b',
    'success': '#10b981',
    'warning': '#f59e0b',
    'error': '#ef4444',
    'background': '#ffffff',
    'surface': '#f8fafc',
    'text': '#1e293b',
    'text_secondary': '#64748b',
}
|
| 227 |
+
|
| 228 |
+
# Progress tracking — (fraction, label) milestones for the processing flow.
# NOTE(review): app.py currently calls progress() with inline literals that
# mirror a subset of these steps rather than iterating this table; confirm
# intended use before extending it.
PROGRESS_STEPS = [
    (0.0, "Initializing..."),
    (0.1, "Fetching repository information..."),
    (0.2, "Scanning files..."),
    (0.3, "Processing files..."),
    (0.5, "Analyzing content..."),
    (0.7, "Generating output..."),
    (0.9, "Finalizing..."),
    (1.0, "Complete!"),
]
|
models.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import base64
|
| 3 |
+
import json
|
| 4 |
+
import zipfile
|
| 5 |
+
import io
|
| 6 |
+
import os
|
| 7 |
+
from typing import List, Dict, Tuple, Optional
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import re
|
| 10 |
+
|
| 11 |
+
from utils import matches_patterns, is_binary_file, format_file_size
|
| 12 |
+
from config import GITHUB_API_BASE, HF_API_BASE
|
| 13 |
+
|
| 14 |
+
def process_github_repo(
    repo_path: str,
    token: str,
    include_patterns: List[str],
    exclude_patterns: List[str],
    max_file_size: int
) -> Tuple[List[Tuple[str, str, int]], Dict]:
    """Walk a GitHub repository via the contents API and return its text files.

    Args:
        repo_path: "owner/name".
        token: optional personal-access token (sent as `token <...>`).
        include_patterns / exclude_patterns: glob filters (utils.matches_patterns).
        max_file_size: per-file byte cap; larger files are skipped.

    Returns:
        ([(path, text, size_bytes), ...], repo_info_json)

    Raises:
        Exception: if the repository metadata request fails.
    """
    headers = {}
    if token:
        headers['Authorization'] = f'token {token}'

    # BUG FIX: the original requests.get calls had no timeout, so a stalled
    # GitHub response could hang the Gradio worker indefinitely.
    timeout = 30

    # Get repository info
    repo_url = f"{GITHUB_API_BASE}/repos/{repo_path}"
    repo_response = requests.get(repo_url, headers=headers, timeout=timeout)

    if repo_response.status_code != 200:
        raise Exception(f"Failed to fetch repository info: {repo_response.json().get('message', 'Unknown error')}")

    repo_info = repo_response.json()

    # Breadth-first walk over directories via the contents endpoint.
    # NOTE(review): the contents API base64-encodes file bodies only up to
    # ~1MB; larger files return an empty content field — confirm whether a
    # blobs-API fallback is needed for this app's use cases.
    files_data = []
    contents_queue = [""]

    while contents_queue:
        current_path = contents_queue.pop(0)

        contents_url = f"{GITHUB_API_BASE}/repos/{repo_path}/contents/{current_path}"
        contents_response = requests.get(contents_url, headers=headers, timeout=timeout)

        if contents_response.status_code != 200:
            # Best-effort: skip unreadable directories rather than aborting.
            continue

        contents = contents_response.json()

        if isinstance(contents, dict):
            # The endpoint returns a bare object (not a list) for a single file.
            contents = [contents]

        for item in contents:
            item_path = f"{current_path}/{item['name']}" if current_path else item['name']

            if item['type'] == 'dir':
                contents_queue.append(item_path)
            elif item['type'] == 'file':
                # Pattern and size gates before paying for a content fetch.
                if not matches_patterns(item_path, include_patterns, exclude_patterns):
                    continue
                if item['size'] > max_file_size:
                    continue

                try:
                    file_response = requests.get(item['url'], headers=headers, timeout=timeout)

                    if file_response.status_code == 200:
                        file_data = file_response.json()
                        content = base64.b64decode(file_data['content']).decode('utf-8', errors='ignore')

                        # Skip files whose decoded bytes look binary.
                        if is_binary_file(content, item_path):
                            continue

                        files_data.append((item_path, content, item['size']))

                except Exception as e:
                    # Best-effort per file: log and keep walking.
                    print(f"Error processing file {item_path}: {e}")
                    continue

    return files_data, repo_info
|
| 90 |
+
|
| 91 |
+
def process_huggingface_repo(
    repo_path: str,
    token: str,
    include_patterns: List[str],
    exclude_patterns: List[str],
    max_file_size: int
) -> Tuple[List[Tuple[str, str, int]], Dict]:
    """Walk a Hugging Face repository tree and return its text files.

    Args:
        repo_path: "owner/name". NOTE(review): only the *model* API routes
            (/api/models/...) are queried — dataset and Space repos will
            404 here; confirm whether that scope is intended.
        token: optional HF access token (sent as `Bearer <...>`).
        include_patterns / exclude_patterns: glob filters (utils.matches_patterns).
        max_file_size: per-file byte cap; larger files are skipped.

    Returns:
        ([(path, text, size_bytes), ...], repo_info_json)

    Raises:
        Exception: if repo metadata or the root tree cannot be fetched.
    """
    headers = {}
    if token:
        headers['Authorization'] = f'Bearer {token}'

    # BUG FIX: the original requests.get calls had no timeout, so a stalled
    # response could hang the Gradio worker indefinitely.
    timeout = 30

    # Get repository info
    repo_url = f"{HF_API_BASE}/api/models/{repo_path}"
    repo_response = requests.get(repo_url, headers=headers, timeout=timeout)

    if repo_response.status_code != 200:
        raise Exception(f"Failed to fetch repository info: {repo_response.json().get('error', 'Unknown error')}")

    repo_info = repo_response.json()

    # Root tree listing; subdirectories are fetched lazily while recursing.
    tree_url = f"{HF_API_BASE}/api/models/{repo_path}/tree/main"
    tree_response = requests.get(tree_url, headers=headers, timeout=timeout)

    if tree_response.status_code != 200:
        raise Exception(f"Failed to fetch repository tree: {tree_response.json().get('error', 'Unknown error')}")

    tree_data = tree_response.json()

    files_data = []

    def process_tree_item(item, current_path=""):
        # Recursively collect files: a list is a directory listing, a dict a
        # single entry ('directory' entries trigger a nested tree fetch).
        # NOTE(review): HF tree entries' 'path' field may already be
        # repo-relative; joining it onto current_path could duplicate
        # segments for nested directories — verify against a real repo.
        if isinstance(item, list):
            for subitem in item:
                process_tree_item(subitem, current_path)
        elif isinstance(item, dict):
            item_path = f"{current_path}/{item['path']}" if current_path else item['path']

            if item['type'] == 'directory':
                dir_url = f"{HF_API_BASE}/api/models/{repo_path}/tree/main/{item_path}"
                dir_response = requests.get(dir_url, headers=headers, timeout=timeout)

                if dir_response.status_code == 200:
                    process_tree_item(dir_response.json(), item_path)
            elif item['type'] == 'file':
                # Pattern and size gates before paying for a content fetch.
                if not matches_patterns(item_path, include_patterns, exclude_patterns):
                    return
                if item.get('size', 0) > max_file_size:
                    return

                try:
                    raw_url = f"https://huggingface.co/{repo_path}/raw/main/{item_path}"
                    file_response = requests.get(raw_url, headers=headers, timeout=timeout)

                    if file_response.status_code == 200:
                        content = file_response.text

                        # Skip files whose text looks binary.
                        if is_binary_file(content, item_path):
                            return

                        files_data.append((item_path, content, len(content)))

                except Exception as e:
                    # Best-effort per file: log and keep walking.
                    print(f"Error processing file {item_path}: {e}")
                    return

    process_tree_item(tree_data)

    return files_data, repo_info
|
| 168 |
+
|
| 169 |
+
def download_repo_as_zip(repo_url: str, token: str) -> str:
    """Download a repository archive and return the local path to the ZIP.

    Supports GitHub and Hugging Face URLs; raises ValueError for any other
    host and Exception when the HTTP download fails.

    NOTE(review): the branch name 'main' is hard-coded; repositories whose
    default branch is 'master' will 404 — consider resolving the default
    branch first.
    """
    import hashlib
    import os
    import tempfile

    if "github.com" in repo_url:
        if token:
            # Authenticated downloads must go through the REST API zipball.
            headers = {'Authorization': f'token {token}'}
            zip_url = repo_url.replace("github.com", "api.github.com/repos") + "/zipball/main"
        else:
            headers = {}
            zip_url = repo_url.replace("github.com", "codeload.github.com") + "/zip/main"
    elif "huggingface.co" in repo_url:
        headers = {}
        if token:
            headers['Authorization'] = f'Bearer {token}'
        # Fixed: previous code did a no-op replace("huggingface.co", "huggingface.co").
        # NOTE(review): /resolve/main serves single files, not a repo ZIP —
        # confirm this endpoint against the Hub API before relying on it.
        zip_url = repo_url + "/resolve/main?download=true"
    else:
        raise ValueError("Unsupported repository URL")

    response = requests.get(zip_url, headers=headers, stream=True)

    if response.status_code != 200:
        raise Exception(f"Failed to download repository: {response.status_code}")

    # Deterministic, portable temp path. The built-in hash() is salted per
    # process in Python 3, which produced a different file name every run
    # (and /tmp is not portable); hashlib + tempfile fix both.
    digest = hashlib.sha256(repo_url.encode('utf-8')).hexdigest()[:16]
    temp_path = os.path.join(tempfile.gettempdir(), f"repo_{digest}.zip")

    with open(temp_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    return temp_path
|
| 202 |
+
|
| 203 |
+
def extract_repo_info(repo_url: str, repo_type: str) -> Dict:
    """Extract basic repository information from a repo URL.

    Returns a dict with 'owner', 'repo', 'full_name' and 'url' when the URL
    matches the expected host for *repo_type* ('github' or 'huggingface'),
    otherwise just {'url': repo_url}.
    """
    # One table instead of two copy-pasted branches. The second group stops
    # at '?' and '#' so query strings/fragments never leak into the name.
    host_patterns = {
        'github': r'github\.com/([^/]+)/([^/?#]+)',
        'huggingface': r'huggingface\.co/([^/]+)/([^/?#]+)',
    }

    pattern = host_patterns.get(repo_type)
    if pattern:
        match = re.search(pattern, repo_url)
        if match:
            owner, repo = match.group(1), match.group(2)
            # Clone URLs often end in '.git'; that is not part of the name.
            if repo.endswith('.git'):
                repo = repo[:-4]
            return {
                'owner': owner,
                'repo': repo,
                'full_name': f"{owner}/{repo}",
                'url': repo_url
            }

    return {'url': repo_url}
|
requirements.txt
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
requests>=2.31.0
|
| 3 |
+
python-multipart>=0.0.6
|
| 4 |
+
# NOTE: the entries that used to follow here listed Python standard-library
# modules (pathlib, re, hashlib, zipfile, io, datetime, mimetypes, fnmatch,
# base64, json) as pip requirements. pip cannot install standard-library
# modules, which breaks the build; they ship with Python and need no entry.
|
| 14 |
+
|
| 15 |
+
# NOTE: project description — this belongs in README.md, not requirements.txt.
# It is kept here as comments so pip can still parse the file.
#
# This Gradio application provides a comprehensive solution for converting
# GitHub or Hugging Face repositories into text files suitable for LLM
# training. Key features:
#
# Main features:
# 1. Multi-platform support: works with both GitHub and Hugging Face repositories
# 2. Smart file filtering: include/exclude patterns to process only relevant files
# 3. Token estimation: provides rough token counts for training planning
# 4. Chunked output: splits large repositories into manageable chunks
# 5. Comment removal: optional comment stripping for cleaner training data
# 6. Binary file detection: automatically skips binary files
# 7. Language detection: identifies programming languages for better organization
# 8. Progress tracking: real-time progress updates during processing
#
# Advanced options:
# - File size limits to prevent processing huge files
# - Pattern-based filtering (glob patterns supported)
# - Chunk size customization
# - Metadata inclusion
# - Private repository support with tokens
# - ZIP download option
#
# Output features:
# - Repository metadata and statistics
# - File headers with path, size, and language info
# - Token and character counts
# - Formatted, readable output structure
# - Error handling and status messages
#
# The application is designed to handle repositories of various sizes while
# providing useful feedback and statistics about the processed content. It is
# well suited to preparing code repositories for LLM fine-tuning or analysis.
|
utils.py
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import hashlib
|
| 3 |
+
from typing import List, Dict, Optional
|
| 4 |
+
import mimetypes
|
| 5 |
+
|
| 6 |
+
def clean_code_content(content: str, file_path: str) -> str:
    """Remove comments from code files while preserving structure.

    Best-effort, regex-based stripping keyed on the file extension; note
    that comment-like text inside string literals will also be removed.
    Files with an unrecognized extension are returned unchanged.
    """
    ext = file_path.split('.')[-1].lower()

    # BUG FIX: single-line patterns need re.MULTILINE — without it, '$'
    # only matches at end-of-string, so only the very last comment in a
    # file was ever stripped. Block patterns need re.DOTALL so '.' spans
    # newlines.
    _LINE = re.MULTILINE
    _BLOCK = re.DOTALL

    # Language-specific (pattern, flags) pairs; every match is deleted.
    comment_patterns = {
        'py': [
            (r'#.*$', _LINE),          # single-line comments
            (r'""".*?"""', _BLOCK),    # triple-quoted strings/docstrings
            (r"'''.*?'''", _BLOCK),
        ],
        'js': [
            (r'//.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'java': [
            (r'//.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'cpp': [
            (r'//.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'c': [
            (r'//.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'cs': [
            (r'//.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'go': [
            (r'//.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'rs': [
            (r'//.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'php': [
            (r'//.*$', _LINE),
            (r'#.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'rb': [
            (r'#.*$', _LINE),
            (r'=begin.*?=end', _BLOCK),
        ],
        'sh': [
            (r'#.*$', _LINE),
        ],
        'sql': [
            (r'--.*$', _LINE),
            (r'/\*.*?\*/', _BLOCK),
        ],
        'html': [
            (r'<!--.*?-->', _BLOCK),
        ],
        'xml': [
            (r'<!--.*?-->', _BLOCK),
        ],
        'css': [
            (r'/\*.*?\*/', _BLOCK),
        ],
    }

    if ext in comment_patterns:
        content = content.strip()
        for pattern, flags in comment_patterns[ext]:
            content = re.sub(pattern, '', content, flags=flags)

        # Collapse runs of blank lines left behind by removed comments.
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
        content = content.strip()

    return content
|
| 83 |
+
|
| 84 |
+
def get_file_language(file_path: str) -> str:
    """Determine programming language from file extension.

    Looks up the lower-cased final dot-separated component of *file_path*
    in a static extension -> language table; unknown extensions fall back
    to the upper-cased extension itself. Files without a dot use the whole
    name as the "extension", which is how entries like 'makefile' and
    'dockerfile' match.
    """
    ext = file_path.split('.')[-1].lower()

    # Extension -> human-readable language name.
    # NOTE(review): a few keys appear twice with identical values
    # ('nim', 'v', 'lua', 'moon', 'elm') — harmless since later literals
    # win, but worth de-duplicating.
    language_map = {
        'py': 'Python',
        'js': 'JavaScript',
        'ts': 'TypeScript',
        'jsx': 'React JSX',
        'tsx': 'React TSX',
        'java': 'Java',
        'cpp': 'C++',
        'c': 'C',
        'cs': 'C#',
        'go': 'Go',
        'rs': 'Rust',
        'php': 'PHP',
        'rb': 'Ruby',
        'swift': 'Swift',
        'kt': 'Kotlin',
        'scala': 'Scala',
        'r': 'R',
        'm': 'Objective-C',
        'sh': 'Shell',
        'bash': 'Bash',
        'zsh': 'Zsh',
        'fish': 'Fish',
        'ps1': 'PowerShell',
        'bat': 'Batch',
        'sql': 'SQL',
        'html': 'HTML',
        'htm': 'HTML',
        'xml': 'XML',
        'css': 'CSS',
        'scss': 'SCSS',
        'sass': 'SASS',
        'less': 'LESS',
        'json': 'JSON',
        'yaml': 'YAML',
        'yml': 'YAML',
        'toml': 'TOML',
        'ini': 'INI',
        'cfg': 'Config',
        'conf': 'Config',
        'md': 'Markdown',
        'rst': 'reStructuredText',
        'txt': 'Text',
        'log': 'Log',
        'dockerfile': 'Docker',
        'docker': 'Docker',
        'gitignore': 'Git',
        'gitattributes': 'Git',
        'editorconfig': 'EditorConfig',
        'eslintrc': 'ESLint',
        'prettierrc': 'Prettier',
        'babelrc': 'Babel',
        'tsconfig': 'TypeScript',
        'package': 'NPM',
        'lock': 'Lock',
        'requirements': 'Python',
        'pipfile': 'Python',
        'poetry': 'Python',
        'makefile': 'Make',
        'cmake': 'CMake',
        'gradle': 'Gradle',
        'pom': 'Maven',
        'sbt': 'SBT',
        'vue': 'Vue',
        'svelte': 'Svelte',
        'elm': 'Elm',
        'pug': 'Pug',
        'haml': 'Haml',
        'erb': 'ERB',
        'ejs': 'EJS',
        'twig': 'Twig',
        'liquid': 'Liquid',
        'handlebars': 'Handlebars',
        'mustache': 'Mustache',
        'jinja': 'Jinja',
        'tex': 'LaTeX',
        'bib': 'BibTeX',
        'plt': 'Gnuplot',
        'dot': 'Graphviz',
        'mermaid': 'Mermaid',
        'drawio': 'DrawIO',
        'puml': 'PlantUML',
        'wsdl': 'WSDL',
        'xsd': 'XSD',
        'xslt': 'XSLT',
        'graphql': 'GraphQL',
        'proto': 'Protocol Buffers',
        'avro': 'Avro',
        'parquet': 'Parquet',
        'arrow': 'Arrow',
        'feather': 'Feather',
        'hdf5': 'HDF5',
        'netcdf': 'NetCDF',
        'matlab': 'MATLAB',
        'mex': 'MATLAB',
        'fig': 'MATLAB',
        'slx': 'Simulink',
        'simulink': 'Simulink',
        'labview': 'LabVIEW',
        'vi': 'LabVIEW',
        'lvproj': 'LabVIEW',
        'lvlib': 'LabVIEW',
        'stata': 'Stata',
        'do': 'Stata',
        'ado': 'Stata',
        'spss': 'SPSS',
        'sav': 'SPSS',
        'sas': 'SAS',
        's7dat': 'SAS',
        's7bdat': 'SAS',
        'xpt': 'SAS',
        'dta': 'Stata',
        'rdata': 'R',
        'rds': 'R',
        'rda': 'R',
        'jl': 'Julia',
        'nim': 'Nim',
        'zig': 'Zig',
        'v': 'V',
        'ada': 'Ada',
        'adb': 'Ada',
        'ads': 'Ada',
        'pas': 'Pascal',
        'pp': 'Pascal',
        'dpr': 'Pascal',
        'lpr': 'Pascal',
        'dfm': 'Pascal',
        'pl': 'Perl',
        'pm': 'Perl',
        't': 'Perl',
        'pod': 'Perl',
        'lua': 'Lua',
        'moon': 'MoonScript',
        'el': 'Emacs Lisp',
        'elc': 'Emacs Lisp',
        'elisp': 'Emacs Lisp',
        'cl': 'Common Lisp',
        'lisp': 'Common Lisp',
        'lsp': 'Common Lisp',
        'fasl': 'Common Lisp',
        'ss': 'Scheme',
        'scm': 'Scheme',
        'rkt': 'Scheme',
        'sch': 'Scheme',
        'fs': 'F#',
        'fsi': 'F#',
        'fsx': 'F#',
        'fsscript': 'F#',
        'ml': 'OCaml',
        'mli': 'OCaml',
        'll': 'LLVM',
        'bc': 'LLVM',
        'nim': 'Nim',
        'nimble': 'Nim',
        'nims': 'Nim',
        'v': 'V',
        'vsh': 'V',
        'vv': 'V',
        'vh': 'V',
        'd': 'D',
        'di': 'D',
        'dart': 'Dart',
        'groovy': 'Groovy',
        'gvy': 'Groovy',
        'gy': 'Groovy',
        'gsh': 'Groovy',
        'clj': 'Clojure',
        'cljs': 'ClojureScript',
        'cljc': 'Clojure',
        'edn': 'Clojure',
        'coffee': 'CoffeeScript',
        'litcoffee': 'CoffeeScript',
        'cjsx': 'Cjsx',
        'iced': 'IcedCoffeeScript',
        'hx': 'Haxe',
        'hxml': 'Haxe',
        'purs': 'PureScript',
        'elm': 'Elm',
        'p8': 'Pico-8',
        'lua': 'Lua',
        'moon': 'MoonScript',
        'wren': 'Wren',
        'earl-grey': 'Earl Grey',
        'eg': 'Earl Grey',
        'tsv': 'TSV',
        'csv': 'CSV',
    }

    # Unknown extensions fall back to the raw extension, upper-cased.
    return language_map.get(ext, ext.upper())
|
| 277 |
+
|
| 278 |
+
def estimate_tokens(text: str) -> int:
    """Roughly estimate the LLM token count of *text*.

    Uses the common heuristic of ~4 characters per token; real tokenizers
    vary (especially on code), but this is adequate for planning.
    """
    chars_per_token = 4
    return len(text) // chars_per_token
|
| 283 |
+
|
| 284 |
+
def create_chunked_output(content: str, chunk_size: int) -> List[str]:
    """Split *content* into newline-joined chunks of at most *chunk_size* chars.

    Splitting happens only at line boundaries, so a single line longer than
    chunk_size still ends up as its own (oversized) chunk. Empty content
    yields an empty list.
    """
    chunks: List[str] = []
    pending = ""

    for line in content.split('\n'):
        would_overflow = len(pending) + len(line) + 1 > chunk_size
        if would_overflow:
            # Flush what we have (if anything) and start over with this line.
            if pending:
                chunks.append(pending)
            pending = line
        else:
            pending = pending + '\n' + line if pending else line

    if pending:
        chunks.append(pending)

    return chunks
|
| 306 |
+
|
| 307 |
+
def matches_patterns(file_path: str, include_patterns: List[str], exclude_patterns: List[str]) -> bool:
    """Decide whether *file_path* passes the include/exclude glob filters.

    Excludes win over includes; an empty include list means "include
    everything not excluded".
    """
    import fnmatch

    def _glob_hit(pattern: str) -> bool:
        # Try both the bare pattern and a "**/"-prefixed variant so that
        # patterns like "*.py" also match files nested in subdirectories.
        return (fnmatch.fnmatch(file_path, pattern)
                or fnmatch.fnmatch(file_path, f"**/{pattern}"))

    if any(_glob_hit(p) for p in exclude_patterns):
        return False

    # No include filters: everything that survived the excludes is accepted.
    return not include_patterns or any(_glob_hit(p) for p in include_patterns)
|
| 326 |
+
|
| 327 |
+
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (B/KB/MB/GB/TB)."""
    value = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB'):
        if value < 1024.0:
            return f"{value:.1f} {unit}"
        value /= 1024.0
    # Anything >= 1024 GB is reported in terabytes.
    return f"{value:.1f} TB"
|
| 334 |
+
|
| 335 |
+
def generate_file_hash(content: str) -> str:
    """Return a short (16 hex chars) SHA-256 digest of *content*."""
    digest = hashlib.sha256(content.encode('utf-8'))
    return digest.hexdigest()[:16]
|
| 338 |
+
|
| 339 |
+
def is_binary_file(content: str, file_path: str) -> bool:
    """Heuristically decide whether a file is binary.

    Checks, in order: a known-binary extension, NUL bytes in the first 1 KiB
    of *content*, and the ratio of printable characters in that sample.
    Empty content is treated as text.
    """
    # Extensions that are always treated as binary, regardless of content.
    binary_extensions = {
        'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
        'mp3', 'mp4', 'avi', 'mov', 'wav', 'flac', 'ogg',
        'zip', 'rar', 'tar', 'gz', '7z', 'bz2', 'xz',
        'exe', 'dll', 'so', 'dylib',
        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
        'ttf', 'otf', 'woff', 'woff2', 'eot',
        'bin', 'dat', 'db', 'sqlite', 'sqlite3',
    }

    ext = file_path.split('.')[-1].lower()
    if ext in binary_extensions:
        return True

    sample = content[:1024]

    # BUG FIX: empty content previously caused a ZeroDivisionError in the
    # printable-ratio check below. An empty file has no binary evidence.
    if not sample:
        return False

    # NUL bytes are a strong indicator of binary data.
    if '\0' in sample:
        return True

    # Too many non-printable characters also suggests binary content.
    printable_chars = sum(1 for c in sample if c.isprintable() or c in '\t\n\r')
    if printable_chars / len(sample) < 0.7:
        return True

    return False