Spaces:
Build error
Build error
| import re | |
| import hashlib | |
| from typing import List, Dict, Optional | |
| import mimetypes | |
def clean_code_content(content: str, file_path: str) -> str:
    """Strip comments from source text and collapse runs of blank lines.

    The comment syntax is chosen from the text after the last ``.`` in
    *file_path* (case-insensitive). Files with an unrecognised extension
    only get the whitespace normalisation.

    This is a regex heuristic, not a parser: comment markers that occur
    inside string literals are removed too, and for Python every
    triple-quoted string (docstring or not) is dropped.

    Args:
        content: Full text of the file.
        file_path: Path or file name, used only to pick the comment syntax.

    Returns:
        The text with comments removed (for recognised extensions), three or
        more consecutive line breaks reduced to a single blank line, and
        leading/trailing whitespace stripped.
    """
    ext = file_path.split('.')[-1].lower()

    # (pattern, flags) pairs. Line-comment patterns need MULTILINE so that
    # `$` anchors at the end of *every* line rather than only at the end of
    # the whole text; block patterns need DOTALL so `.` can span lines.
    hash_line = (r'#.*$', re.MULTILINE)
    slash_line = (r'//.*$', re.MULTILINE)
    dash_line = (r'--.*$', re.MULTILINE)
    c_block = (r'/\*.*?\*/', re.DOTALL)
    markup_block = (r'<!--.*?-->', re.DOTALL)

    # Block patterns are listed before line patterns on purpose: removing
    # `// ...` first could cut off the closing `*/` of a one-line block
    # comment (e.g. `/* see http://x */`), after which the lazy block regex
    # would run on to the next `*/` and swallow real code.
    c_family = [c_block, slash_line]
    comment_patterns = {
        'py': [
            (r'""".*?"""', re.DOTALL),
            (r"'''.*?'''", re.DOTALL),
            hash_line,
        ],
        'js': c_family,
        'java': c_family,
        'cpp': c_family,
        'c': c_family,
        'cs': c_family,
        'go': c_family,
        'rs': c_family,
        'php': [c_block, slash_line, hash_line],
        'rb': [(r'=begin.*?=end', re.DOTALL), hash_line],
        'sh': [hash_line],
        'sql': [c_block, dash_line],
        'html': [markup_block],
        'xml': [markup_block],
        'css': [c_block],
    }

    if ext in comment_patterns:
        content = content.strip()
        for pattern, flags in comment_patterns[ext]:
            content = re.sub(pattern, '', content, flags=flags)

    # Clean up extra whitespace: squeeze 3+ line breaks (possibly separated
    # by whitespace-only lines) down to exactly one blank line.
    content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
    return content.strip()
# Lower-cased extension (or bare file name for dot-less files such as
# "Makefile") -> display name. Built once at import time instead of on
# every call; each key appears exactly once.
_LANGUAGE_BY_EXTENSION = {
    'py': 'Python',
    'js': 'JavaScript',
    'ts': 'TypeScript',
    'jsx': 'React JSX',
    'tsx': 'React TSX',
    'java': 'Java',
    'cpp': 'C++',
    'c': 'C',
    'cs': 'C#',
    'go': 'Go',
    'rs': 'Rust',
    'php': 'PHP',
    'rb': 'Ruby',
    'swift': 'Swift',
    'kt': 'Kotlin',
    'scala': 'Scala',
    'r': 'R',
    'm': 'Objective-C',
    'sh': 'Shell',
    'bash': 'Bash',
    'zsh': 'Zsh',
    'fish': 'Fish',
    'ps1': 'PowerShell',
    'bat': 'Batch',
    'sql': 'SQL',
    'html': 'HTML',
    'htm': 'HTML',
    'xml': 'XML',
    'css': 'CSS',
    'scss': 'SCSS',
    'sass': 'SASS',
    'less': 'LESS',
    'json': 'JSON',
    'yaml': 'YAML',
    'yml': 'YAML',
    'toml': 'TOML',
    'ini': 'INI',
    'cfg': 'Config',
    'conf': 'Config',
    'md': 'Markdown',
    'rst': 'reStructuredText',
    'txt': 'Text',
    'log': 'Log',
    'dockerfile': 'Docker',
    'docker': 'Docker',
    'gitignore': 'Git',
    'gitattributes': 'Git',
    'editorconfig': 'EditorConfig',
    'eslintrc': 'ESLint',
    'prettierrc': 'Prettier',
    'babelrc': 'Babel',
    'tsconfig': 'TypeScript',
    'package': 'NPM',
    'lock': 'Lock',
    'requirements': 'Python',
    'pipfile': 'Python',
    'poetry': 'Python',
    'makefile': 'Make',
    'cmake': 'CMake',
    'gradle': 'Gradle',
    'pom': 'Maven',
    'sbt': 'SBT',
    'vue': 'Vue',
    'svelte': 'Svelte',
    'elm': 'Elm',
    'pug': 'Pug',
    'haml': 'Haml',
    'erb': 'ERB',
    'ejs': 'EJS',
    'twig': 'Twig',
    'liquid': 'Liquid',
    'handlebars': 'Handlebars',
    'mustache': 'Mustache',
    'jinja': 'Jinja',
    'tex': 'LaTeX',
    'bib': 'BibTeX',
    'plt': 'Gnuplot',
    'dot': 'Graphviz',
    'mermaid': 'Mermaid',
    'drawio': 'DrawIO',
    'puml': 'PlantUML',
    'wsdl': 'WSDL',
    'xsd': 'XSD',
    'xslt': 'XSLT',
    'graphql': 'GraphQL',
    'proto': 'Protocol Buffers',
    'avro': 'Avro',
    'parquet': 'Parquet',
    'arrow': 'Arrow',
    'feather': 'Feather',
    'hdf5': 'HDF5',
    'netcdf': 'NetCDF',
    'matlab': 'MATLAB',
    'mex': 'MATLAB',
    'fig': 'MATLAB',
    'slx': 'Simulink',
    'simulink': 'Simulink',
    'labview': 'LabVIEW',
    'vi': 'LabVIEW',
    'lvproj': 'LabVIEW',
    'lvlib': 'LabVIEW',
    'stata': 'Stata',
    'do': 'Stata',
    'ado': 'Stata',
    'spss': 'SPSS',
    'sav': 'SPSS',
    'sas': 'SAS',
    's7dat': 'SAS',
    's7bdat': 'SAS',
    'xpt': 'SAS',
    'dta': 'Stata',
    'rdata': 'R',
    'rds': 'R',
    'rda': 'R',
    'jl': 'Julia',
    'nim': 'Nim',
    'zig': 'Zig',
    'v': 'V',
    'ada': 'Ada',
    'adb': 'Ada',
    'ads': 'Ada',
    'pas': 'Pascal',
    'pp': 'Pascal',
    'dpr': 'Pascal',
    'lpr': 'Pascal',
    'dfm': 'Pascal',
    'pl': 'Perl',
    'pm': 'Perl',
    't': 'Perl',
    'pod': 'Perl',
    'lua': 'Lua',
    'moon': 'MoonScript',
    'el': 'Emacs Lisp',
    'elc': 'Emacs Lisp',
    'elisp': 'Emacs Lisp',
    'cl': 'Common Lisp',
    'lisp': 'Common Lisp',
    'lsp': 'Common Lisp',
    'fasl': 'Common Lisp',
    'ss': 'Scheme',
    'scm': 'Scheme',
    'rkt': 'Scheme',
    'sch': 'Scheme',
    'fs': 'F#',
    'fsi': 'F#',
    'fsx': 'F#',
    'fsscript': 'F#',
    'ml': 'OCaml',
    'mli': 'OCaml',
    'll': 'LLVM',
    'bc': 'LLVM',
    'nimble': 'Nim',
    'nims': 'Nim',
    'vsh': 'V',
    'vv': 'V',
    'vh': 'V',
    'd': 'D',
    'di': 'D',
    'dart': 'Dart',
    'groovy': 'Groovy',
    'gvy': 'Groovy',
    'gy': 'Groovy',
    'gsh': 'Groovy',
    'clj': 'Clojure',
    'cljs': 'ClojureScript',
    'cljc': 'Clojure',
    'edn': 'Clojure',
    'coffee': 'CoffeeScript',
    'litcoffee': 'CoffeeScript',
    'cjsx': 'Cjsx',
    'iced': 'IcedCoffeeScript',
    'hx': 'Haxe',
    'hxml': 'Haxe',
    'purs': 'PureScript',
    'p8': 'Pico-8',
    'wren': 'Wren',
    'earl-grey': 'Earl Grey',
    'eg': 'Earl Grey',
    'tsv': 'TSV',
    'csv': 'CSV',
}


def get_file_language(file_path: str) -> str:
    """Determine a display name for a file's language from its name.

    Only the final path component is inspected, so dots in directory names
    do not leak into the result. The text after the last ``.`` of that
    component (or the whole component when it has no dot, e.g. ``Makefile``)
    is lower-cased and looked up in the extension table.

    Args:
        file_path: Path or bare file name; ``/`` and ``\\`` are both treated
            as separators.

    Returns:
        The mapped language name, or the upper-cased extension when the
        extension is not in the table.
    """
    # Drop directory components first: "src/Makefile" must resolve via
    # "makefile", not via the bogus extension "src/makefile".
    base_name = file_path.replace('\\', '/').rsplit('/', 1)[-1]
    ext = base_name.split('.')[-1].lower()
    return _LANGUAGE_BY_EXTENSION.get(ext, ext.upper())
def estimate_tokens(text: str) -> int:
    """Return a rough token estimate for *text*.

    Uses the rule of thumb of about four characters per token (reasonable
    for English prose, looser for code) and rounds down.
    """
    chars_per_token = 4
    whole_tokens, _leftover = divmod(len(text), chars_per_token)
    return whole_tokens
def create_chunked_output(content: str, chunk_size: int) -> List[str]:
    """Group the lines of *content* into chunks of at most *chunk_size* chars.

    Lines are never split: a single line longer than *chunk_size* becomes a
    chunk of its own. Lines inside a chunk are joined with ``'\\n'``; the
    line break between two chunks is not kept. Empty lines that would start
    a chunk are skipped, and empty content yields an empty list.
    """
    pieces: List[str] = []
    buffer = ""

    for row in content.split('\n'):
        # +1 accounts for the newline that would join `row` to the buffer.
        fits = len(buffer) + len(row) + 1 <= chunk_size

        if buffer and fits:
            buffer = f"{buffer}\n{row}"
            continue

        if buffer:
            # Buffer is full: emit it before starting over with this row.
            pieces.append(buffer)
        buffer = row

    if buffer:
        pieces.append(buffer)
    return pieces
def matches_patterns(file_path: str, include_patterns: List[str], exclude_patterns: List[str]) -> bool:
    """Decide whether *file_path* passes the include/exclude glob filters.

    Each pattern is tried both as given and prefixed with ``**/``. Exclusion
    wins: a path matching any exclude pattern is rejected. Otherwise the
    path is accepted when there are no include patterns, or when it matches
    at least one of them.
    """
    from fnmatch import fnmatch

    def hits_any(globs: List[str]) -> bool:
        return any(
            fnmatch(file_path, glob) or fnmatch(file_path, f"**/{glob}")
            for glob in globs
        )

    # Exclusions are checked first so they override any include match.
    if hits_any(exclude_patterns):
        return False

    # An empty include list means "everything not excluded".
    return not include_patterns or hits_any(include_patterns)
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a one-decimal string with a B/KB/MB/GB/TB unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached; anything bigger is still reported in TB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    amount = size_bytes
    step = 0

    # Stop at the last unit even if the amount is still >= 1024.
    while step < len(units) - 1 and not amount < 1024.0:
        amount /= 1024.0
        step += 1

    return f"{amount:.1f} {units[step]}"
def generate_file_hash(content: str) -> str:
    """Return a short fingerprint of *content*.

    The text is encoded with ``str.encode()``'s default (UTF-8), hashed with
    SHA-256, and the first 16 hexadecimal characters of the digest are
    returned.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode())
    full_hex = hasher.hexdigest()
    return full_hex[:16]
# Extensions treated as binary regardless of content.
_BINARY_EXTENSIONS = frozenset({
    'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
    'mp3', 'mp4', 'avi', 'mov', 'wav', 'flac', 'ogg',
    'zip', 'rar', 'tar', 'gz', '7z', 'bz2', 'xz',
    'exe', 'dll', 'so', 'dylib',
    'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
    'ttf', 'otf', 'woff', 'woff2', 'eot',
    'bin', 'dat', 'db', 'sqlite', 'sqlite3',
})


def is_binary_file(content: str, file_path: str) -> bool:
    """Heuristically decide whether a file should be treated as binary.

    Checks, in order:
      1. the text after the last ``.`` in *file_path* (case-insensitive)
         against a fixed set of binary extensions;
      2. whether the first 1024 characters of *content* contain a NUL;
      3. whether fewer than 70% of those characters are printable
         (tab, newline and carriage return count as printable).

    Args:
        content: Decoded file content; only the first 1024 characters are
            inspected.
        file_path: Path or file name, used only for the extension check.

    Returns:
        True if any check flags the file as binary, otherwise False.
        Empty content with a non-binary extension is reported as not binary.
    """
    # Check file extension first
    ext = file_path.split('.')[-1].lower()
    if ext in _BINARY_EXTENSIONS:
        return True

    sample = content[:1024]
    if not sample:
        # Nothing to inspect; also avoids dividing by zero in the ratio below.
        return False

    # A NUL character is a strong indicator of binary data.
    if '\0' in sample:
        return True

    # Too many non-printable characters also suggests binary content.
    printable_chars = sum(1 for c in sample if c.isprintable() or c in '\t\n\r')
    return printable_chars / len(sample) < 0.7