"""Helpers for cleaning, classifying, chunking and filtering source-file content."""

import fnmatch
import hashlib
import mimetypes
import re
from typing import Dict, List, Optional

# --- Comment-stripping building blocks -------------------------------------
# Line-comment fragments use ``[^\n]*`` instead of ``.*$`` so they can be
# combined with block-comment fragments under a single ``re.DOTALL`` regex.
_HASH_LINE = r'#[^\n]*'
_SLASH_LINE = r'//[^\n]*'
_DASH_LINE = r'--[^\n]*'
_C_BLOCK = r'/\*.*?\*/'
_PY_TRIPLE_DOUBLE = r'""".*?"""'
_PY_TRIPLE_SINGLE = r"'''.*?'''"
_RUBY_BLOCK = r'=begin.*?=end'
_MARKUP_COMMENT = r'<!--.*?-->'


def _compile_comment_regex(*alternatives: str) -> "re.Pattern":
    """Combine comment fragments into one left-to-right alternation."""
    return re.compile('|'.join(alternatives), re.DOTALL)


_C_STYLE_COMMENTS = _compile_comment_regex(_SLASH_LINE, _C_BLOCK)
_MARKUP_COMMENTS = _compile_comment_regex(_MARKUP_COMMENT)

# Extension -> compiled regex matching every comment form of that language.
_COMMENT_PATTERNS = {
    'py': _compile_comment_regex(_HASH_LINE, _PY_TRIPLE_DOUBLE, _PY_TRIPLE_SINGLE),
    'js': _C_STYLE_COMMENTS,
    'java': _C_STYLE_COMMENTS,
    'cpp': _C_STYLE_COMMENTS,
    'c': _C_STYLE_COMMENTS,
    'cs': _C_STYLE_COMMENTS,
    'go': _C_STYLE_COMMENTS,
    'rs': _C_STYLE_COMMENTS,
    'php': _compile_comment_regex(_SLASH_LINE, _HASH_LINE, _C_BLOCK),
    'rb': _compile_comment_regex(_HASH_LINE, _RUBY_BLOCK),
    'sh': _compile_comment_regex(_HASH_LINE),
    'sql': _compile_comment_regex(_DASH_LINE, _C_BLOCK),
    'html': _MARKUP_COMMENTS,
    'xml': _MARKUP_COMMENTS,
    'css': _compile_comment_regex(_C_BLOCK),
}

# Three or more newlines separated only by whitespace.
_EXCESS_BLANK_LINES = re.compile(r'\n\s*\n\s*\n')

# Extension (or extension-less file name), lower-cased -> display name.
_LANGUAGE_MAP = {
    'py': 'Python',
    'js': 'JavaScript',
    'ts': 'TypeScript',
    'jsx': 'React JSX',
    'tsx': 'React TSX',
    'java': 'Java',
    'cpp': 'C++',
    'c': 'C',
    'cs': 'C#',
    'go': 'Go',
    'rs': 'Rust',
    'php': 'PHP',
    'rb': 'Ruby',
    'swift': 'Swift',
    'kt': 'Kotlin',
    'scala': 'Scala',
    'r': 'R',
    'm': 'Objective-C',
    'sh': 'Shell',
    'bash': 'Bash',
    'zsh': 'Zsh',
    'fish': 'Fish',
    'ps1': 'PowerShell',
    'bat': 'Batch',
    'sql': 'SQL',
    'html': 'HTML',
    'htm': 'HTML',
    'xml': 'XML',
    'css': 'CSS',
    'scss': 'SCSS',
    'sass': 'SASS',
    'less': 'LESS',
    'json': 'JSON',
    'yaml': 'YAML',
    'yml': 'YAML',
    'toml': 'TOML',
    'ini': 'INI',
    'cfg': 'Config',
    'conf': 'Config',
    'md': 'Markdown',
    'rst': 'reStructuredText',
    'txt': 'Text',
    'log': 'Log',
    'dockerfile': 'Docker',
    'docker': 'Docker',
    'gitignore': 'Git',
    'gitattributes': 'Git',
    'editorconfig': 'EditorConfig',
    'eslintrc': 'ESLint',
    'prettierrc': 'Prettier',
    'babelrc': 'Babel',
    'tsconfig': 'TypeScript',
    'package': 'NPM',
    'lock': 'Lock',
    'requirements': 'Python',
    'pipfile': 'Python',
    'poetry': 'Python',
    'makefile': 'Make',
    'cmake': 'CMake',
    'gradle': 'Gradle',
    'pom': 'Maven',
    'sbt': 'SBT',
    'vue': 'Vue',
    'svelte': 'Svelte',
    'elm': 'Elm',
    'pug': 'Pug',
    'haml': 'Haml',
    'erb': 'ERB',
    'ejs': 'EJS',
    'twig': 'Twig',
    'liquid': 'Liquid',
    'handlebars': 'Handlebars',
    'mustache': 'Mustache',
    'jinja': 'Jinja',
    'tex': 'LaTeX',
    'bib': 'BibTeX',
    'plt': 'Gnuplot',
    'dot': 'Graphviz',
    'mermaid': 'Mermaid',
    'drawio': 'DrawIO',
    'puml': 'PlantUML',
    'wsdl': 'WSDL',
    'xsd': 'XSD',
    'xslt': 'XSLT',
    'graphql': 'GraphQL',
    'proto': 'Protocol Buffers',
    'avro': 'Avro',
    'parquet': 'Parquet',
    'arrow': 'Arrow',
    'feather': 'Feather',
    'hdf5': 'HDF5',
    'netcdf': 'NetCDF',
    'matlab': 'MATLAB',
    'mex': 'MATLAB',
    'fig': 'MATLAB',
    'slx': 'Simulink',
    'simulink': 'Simulink',
    'labview': 'LabVIEW',
    'vi': 'LabVIEW',
    'lvproj': 'LabVIEW',
    'lvlib': 'LabVIEW',
    'stata': 'Stata',
    'do': 'Stata',
    'ado': 'Stata',
    'spss': 'SPSS',
    'sav': 'SPSS',
    'sas': 'SAS',
    's7dat': 'SAS',
    's7bdat': 'SAS',
    'xpt': 'SAS',
    'dta': 'Stata',
    'rdata': 'R',
    'rds': 'R',
    'rda': 'R',
    'jl': 'Julia',
    'nim': 'Nim',
    'zig': 'Zig',
    'v': 'V',
    'ada': 'Ada',
    'adb': 'Ada',
    'ads': 'Ada',
    'pas': 'Pascal',
    'pp': 'Pascal',
    'dpr': 'Pascal',
    'lpr': 'Pascal',
    'dfm': 'Pascal',
    'pl': 'Perl',
    'pm': 'Perl',
    't': 'Perl',
    'pod': 'Perl',
    'lua': 'Lua',
    'moon': 'MoonScript',
    'el': 'Emacs Lisp',
    'elc': 'Emacs Lisp',
    'elisp': 'Emacs Lisp',
    'cl': 'Common Lisp',
    'lisp': 'Common Lisp',
    'lsp': 'Common Lisp',
    'fasl': 'Common Lisp',
    'ss': 'Scheme',
    'scm': 'Scheme',
    'rkt': 'Scheme',
    'sch': 'Scheme',
    'fs': 'F#',
    'fsi': 'F#',
    'fsx': 'F#',
    'fsscript': 'F#',
    'ml': 'OCaml',
    'mli': 'OCaml',
    'll': 'LLVM',
    'bc': 'LLVM',
    'nimble': 'Nim',
    'nims': 'Nim',
    'vsh': 'V',
    'vv': 'V',
    'vh': 'V',
    'd': 'D',
    'di': 'D',
    'dart': 'Dart',
    'groovy': 'Groovy',
    'gvy': 'Groovy',
    'gy': 'Groovy',
    'gsh': 'Groovy',
    'clj': 'Clojure',
    'cljs': 'ClojureScript',
    'cljc': 'Clojure',
    'edn': 'Clojure',
    'coffee': 'CoffeeScript',
    'litcoffee': 'CoffeeScript',
    'cjsx': 'Cjsx',
    'iced': 'IcedCoffeeScript',
    'hx': 'Haxe',
    'hxml': 'Haxe',
    'purs': 'PureScript',
    'p8': 'Pico-8',
    'wren': 'Wren',
    'earl-grey': 'Earl Grey',
    'eg': 'Earl Grey',
    'tsv': 'TSV',
    'csv': 'CSV',
}

# Extensions treated as binary without inspecting the content.
_BINARY_EXTENSIONS = frozenset({
    'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
    'mp3', 'mp4', 'avi', 'mov', 'wav', 'flac', 'ogg',
    'zip', 'rar', 'tar', 'gz', '7z', 'bz2', 'xz',
    'exe', 'dll', 'so', 'dylib',
    'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
    'ttf', 'otf', 'woff', 'woff2', 'eot',
    'bin', 'dat', 'db', 'sqlite', 'sqlite3',
})

# Number of leading characters inspected by the binary-content heuristics.
_BINARY_SAMPLE_SIZE = 1024


def _get_extension(file_path: str) -> str:
    """Return the lower-cased text after the last dot of the file *name*.

    Directory components (``/`` or ``\\`` separated) are discarded first so
    that dots in directory names are never mistaken for an extension. A name
    without a dot is returned whole, which lets names such as ``Makefile``
    act as their own lookup key.
    """
    file_name = re.split(r'[\\/]', file_path)[-1]
    return file_name.rsplit('.', 1)[-1].lower()


def clean_code_content(content: str, file_path: str) -> str:
    """Remove comments from code files while preserving structure.

    The comment syntax is chosen from the extension of ``file_path``. Content
    with an unrecognised extension is returned completely unchanged. For a
    recognised extension the content is stripped of surrounding whitespace,
    every comment is removed in a single left-to-right pass (so whichever
    comment opener appears first wins), runs of blank lines are collapsed to
    one blank line, and the result is stripped again.

    This is a regex heuristic, not a parser: comment markers inside string
    literals are treated as comments, and for Python every triple-quoted
    string (docstrings included) is removed.

    Args:
        content: Text of the file.
        file_path: Path or name of the file; only its extension is used.

    Returns:
        The content with comments removed, or the original content when the
        extension has no known comment syntax.
    """
    comment_regex = _COMMENT_PATTERNS.get(_get_extension(file_path))
    if comment_regex is None:
        return content

    without_comments = comment_regex.sub('', content.strip())
    # Clean up extra whitespace left behind by removed comments.
    collapsed = _EXCESS_BLANK_LINES.sub('\n\n', without_comments)
    return collapsed.strip()


def get_file_language(file_path: str) -> str:
    """Determine programming language from file extension.

    Args:
        file_path: Path or name of the file.

    Returns:
        The display name registered for the file's extension (or for the
        whole lower-cased file name when it has no dot, e.g. ``Makefile``).
        Unknown extensions fall back to the extension in upper case.
    """
    ext = _get_extension(file_path)
    return _LANGUAGE_MAP.get(ext, ext.upper())


def estimate_tokens(text: str) -> int:
    """Estimate token count (rough approximation).

    Uses the heuristic of roughly four characters per token. The ratio
    varies more for code than for English prose.
    """
    return len(text) // 4


def create_chunked_output(content: str, chunk_size: int) -> List[str]:
    """Split content into chunks of at most ``chunk_size`` characters.

    Splitting only happens at line boundaries. A single line longer than
    ``chunk_size`` is never cut and becomes an oversized chunk of its own.
    Because an empty chunk is indistinguishable from "no chunk started",
    empty lines at the very start of a chunk are dropped and empty content
    yields an empty list.

    Args:
        content: Text to split.
        chunk_size: Maximum chunk length in characters, newlines included.

    Returns:
        The chunks in order, each without a trailing newline.
    """
    chunks = []
    pending = ""

    for line in content.split('\n'):
        # +1 accounts for the newline that would join ``line`` to ``pending``.
        if len(pending) + len(line) + 1 > chunk_size:
            if pending:
                chunks.append(pending)
            pending = line
        elif pending:
            pending += '\n' + line
        else:
            pending = line

    if pending:
        chunks.append(pending)
    return chunks


def _matches_any(file_path: str, patterns: List[str]) -> bool:
    """Return True if the path matches any pattern, bare or ``**/``-prefixed."""
    return any(
        fnmatch.fnmatch(file_path, pattern)
        or fnmatch.fnmatch(file_path, f"**/{pattern}")
        for pattern in patterns
    )


def matches_patterns(file_path: str, include_patterns: List[str], exclude_patterns: List[str]) -> bool:
    """Check if file matches include/exclude patterns.

    Patterns use ``fnmatch`` shell-style wildcards; each is tried both as
    given and with a ``**/`` prefix. Exclusion takes precedence over
    inclusion.

    Args:
        file_path: Path to test.
        include_patterns: Patterns a file must match to be accepted; an
            empty list accepts every file that is not excluded.
        exclude_patterns: Patterns that reject a file outright.

    Returns:
        True if the file should be included, False otherwise.
    """
    # Exclusions are checked first so they always win.
    if _matches_any(file_path, exclude_patterns):
        return False
    # No include patterns means "include everything else".
    if not include_patterns:
        return True
    return _matches_any(file_path, include_patterns)


def format_file_size(size_bytes: int) -> str:
    """Format file size in human readable format.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, and is rendered with one decimal place,
    e.g. ``"512.0 B"`` or ``"1.5 KB"``.
    """
    size = size_bytes
    for unit in ('B', 'KB', 'MB', 'GB'):
        if size < 1024.0:
            return f"{size:.1f} {unit}"
        size /= 1024.0
    return f"{size:.1f} TB"


def generate_file_hash(content: str) -> str:
    """Generate a short SHA-256 based hash of file content.

    Returns:
        The first 16 hexadecimal characters of the SHA-256 digest of the
        UTF-8 encoded content.
    """
    digest = hashlib.sha256(content.encode()).hexdigest()
    return digest[:16]


def is_binary_file(content: str, file_path: str) -> bool:
    """Check if file is binary.

    A file is considered binary when its extension is a known binary type,
    or when the first 1024 characters of ``content`` contain a null
    character or are less than 70% printable (tab, newline and carriage
    return count as printable). Empty content is treated as text.

    Args:
        content: Decoded text of the file.
        file_path: Path or name of the file; only its extension is used.

    Returns:
        True if the file looks binary, False otherwise.
    """
    # Check file extension first.
    if _get_extension(file_path) in _BINARY_EXTENSIONS:
        return True

    sample = content[:_BINARY_SAMPLE_SIZE]
    if not sample:
        # Nothing to inspect; also avoids dividing by zero below.
        return False

    # A null character is a strong indicator of binary data.
    if '\0' in sample:
        return True

    # Too many non-printable characters also indicates binary data.
    printable_count = sum(1 for ch in sample if ch.isprintable() or ch in '\t\n\r')
    return printable_count / len(sample) < 0.7