my-app-565 / utils.py
AiCoderv2's picture
Deploy Gradio app with multiple files
d79890b verified
import re
import hashlib
from typing import List, Dict, Optional
import mimetypes
# Shared comment patterns. Line-comment patterns need re.MULTILINE so that
# `$` matches at the end of every line, not only at the end of the input.
_BLOCK_COMMENT = re.compile(r'/\*.*?\*/', re.DOTALL)
_SLASH_COMMENT = re.compile(r'//.*$', re.MULTILINE)
_HASH_COMMENT = re.compile(r'#.*$', re.MULTILINE)
_DASH_COMMENT = re.compile(r'--.*$', re.MULTILINE)
_MARKUP_COMMENT = re.compile(r'<!--.*?-->', re.DOTALL)

# Block patterns are listed before line patterns: stripping a line comment
# first can swallow the closing delimiter of a block that shares its line
# (e.g. `/* see http://x */`), leaving the block pattern to over-match.
_C_STYLE_COMMENTS = (_BLOCK_COMMENT, _SLASH_COMMENT)

_COMMENT_PATTERNS = {
    'py': (
        re.compile(r'""".*?"""', re.DOTALL),  # Triple double quotes
        re.compile(r"'''.*?'''", re.DOTALL),  # Triple single quotes
        _HASH_COMMENT,
    ),
    'js': _C_STYLE_COMMENTS,
    'java': _C_STYLE_COMMENTS,
    'cpp': _C_STYLE_COMMENTS,
    'c': _C_STYLE_COMMENTS,
    'cs': _C_STYLE_COMMENTS,
    'go': _C_STYLE_COMMENTS,
    'rs': _C_STYLE_COMMENTS,
    'php': (_BLOCK_COMMENT, _SLASH_COMMENT, _HASH_COMMENT),
    'rb': (re.compile(r'=begin.*?=end', re.DOTALL), _HASH_COMMENT),
    'sh': (_HASH_COMMENT,),
    'sql': (_BLOCK_COMMENT, _DASH_COMMENT),
    'html': (_MARKUP_COMMENT,),
    'xml': (_MARKUP_COMMENT,),
    'css': (_BLOCK_COMMENT,),
}


def clean_code_content(content: str, file_path: str) -> str:
    """Remove comments from code files while preserving structure.

    The comment syntax is chosen from the text after the last '.' in
    ``file_path`` (case-insensitive). Files with an unrecognised extension
    only get the whitespace clean-up.

    This is a regex heuristic, not a parser: comment markers inside string
    literals are removed too, and for Python every triple-quoted string
    (docstrings and multi-line string data alike) is dropped.

    Args:
        content: Full text of the file.
        file_path: Path or name of the file; only its extension is used.

    Returns:
        The text with comments removed, runs of blank lines collapsed to a
        single blank line, and leading/trailing whitespace stripped.
    """
    ext = file_path.split('.')[-1].lower()

    patterns = _COMMENT_PATTERNS.get(ext)
    if patterns is not None:
        content = content.strip()
        for pattern in patterns:
            content = pattern.sub('', content)

    # Clean up extra whitespace
    content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
    return content.strip()
# Lower-cased extension (or extension-less file name) -> display name.
_LANGUAGE_BY_EXTENSION = {
    'py': 'Python',
    'js': 'JavaScript',
    'ts': 'TypeScript',
    'jsx': 'React JSX',
    'tsx': 'React TSX',
    'java': 'Java',
    'cpp': 'C++',
    'c': 'C',
    'cs': 'C#',
    'go': 'Go',
    'rs': 'Rust',
    'php': 'PHP',
    'rb': 'Ruby',
    'swift': 'Swift',
    'kt': 'Kotlin',
    'scala': 'Scala',
    'r': 'R',
    'm': 'Objective-C',
    'sh': 'Shell',
    'bash': 'Bash',
    'zsh': 'Zsh',
    'fish': 'Fish',
    'ps1': 'PowerShell',
    'bat': 'Batch',
    'sql': 'SQL',
    'html': 'HTML',
    'htm': 'HTML',
    'xml': 'XML',
    'css': 'CSS',
    'scss': 'SCSS',
    'sass': 'SASS',
    'less': 'LESS',
    'json': 'JSON',
    'yaml': 'YAML',
    'yml': 'YAML',
    'toml': 'TOML',
    'ini': 'INI',
    'cfg': 'Config',
    'conf': 'Config',
    'md': 'Markdown',
    'rst': 'reStructuredText',
    'txt': 'Text',
    'log': 'Log',
    'dockerfile': 'Docker',
    'docker': 'Docker',
    'gitignore': 'Git',
    'gitattributes': 'Git',
    'editorconfig': 'EditorConfig',
    'eslintrc': 'ESLint',
    'prettierrc': 'Prettier',
    'babelrc': 'Babel',
    'tsconfig': 'TypeScript',
    'package': 'NPM',
    'lock': 'Lock',
    'requirements': 'Python',
    'pipfile': 'Python',
    'poetry': 'Python',
    'makefile': 'Make',
    'cmake': 'CMake',
    'gradle': 'Gradle',
    'pom': 'Maven',
    'sbt': 'SBT',
    'vue': 'Vue',
    'svelte': 'Svelte',
    'elm': 'Elm',
    'pug': 'Pug',
    'haml': 'Haml',
    'erb': 'ERB',
    'ejs': 'EJS',
    'twig': 'Twig',
    'liquid': 'Liquid',
    'handlebars': 'Handlebars',
    'mustache': 'Mustache',
    'jinja': 'Jinja',
    'tex': 'LaTeX',
    'bib': 'BibTeX',
    'plt': 'Gnuplot',
    'dot': 'Graphviz',
    'mermaid': 'Mermaid',
    'drawio': 'DrawIO',
    'puml': 'PlantUML',
    'wsdl': 'WSDL',
    'xsd': 'XSD',
    'xslt': 'XSLT',
    'graphql': 'GraphQL',
    'proto': 'Protocol Buffers',
    'avro': 'Avro',
    'parquet': 'Parquet',
    'arrow': 'Arrow',
    'feather': 'Feather',
    'hdf5': 'HDF5',
    'netcdf': 'NetCDF',
    'matlab': 'MATLAB',
    'mex': 'MATLAB',
    'fig': 'MATLAB',
    'slx': 'Simulink',
    'simulink': 'Simulink',
    'labview': 'LabVIEW',
    'vi': 'LabVIEW',
    'lvproj': 'LabVIEW',
    'lvlib': 'LabVIEW',
    'stata': 'Stata',
    'do': 'Stata',
    'ado': 'Stata',
    'spss': 'SPSS',
    'sav': 'SPSS',
    'sas': 'SAS',
    's7dat': 'SAS',
    's7bdat': 'SAS',
    'xpt': 'SAS',
    'dta': 'Stata',
    'rdata': 'R',
    'rds': 'R',
    'rda': 'R',
    'jl': 'Julia',
    'nim': 'Nim',
    'zig': 'Zig',
    'v': 'V',
    'ada': 'Ada',
    'adb': 'Ada',
    'ads': 'Ada',
    'pas': 'Pascal',
    'pp': 'Pascal',
    'dpr': 'Pascal',
    'lpr': 'Pascal',
    'dfm': 'Pascal',
    'pl': 'Perl',
    'pm': 'Perl',
    't': 'Perl',
    'pod': 'Perl',
    'lua': 'Lua',
    'moon': 'MoonScript',
    'el': 'Emacs Lisp',
    'elc': 'Emacs Lisp',
    'elisp': 'Emacs Lisp',
    'cl': 'Common Lisp',
    'lisp': 'Common Lisp',
    'lsp': 'Common Lisp',
    'fasl': 'Common Lisp',
    'ss': 'Scheme',
    'scm': 'Scheme',
    'rkt': 'Scheme',
    'sch': 'Scheme',
    'fs': 'F#',
    'fsi': 'F#',
    'fsx': 'F#',
    'fsscript': 'F#',
    'ml': 'OCaml',
    'mli': 'OCaml',
    'll': 'LLVM',
    'bc': 'LLVM',
    'nimble': 'Nim',
    'nims': 'Nim',
    'vsh': 'V',
    'vv': 'V',
    'vh': 'V',
    'd': 'D',
    'di': 'D',
    'dart': 'Dart',
    'groovy': 'Groovy',
    'gvy': 'Groovy',
    'gy': 'Groovy',
    'gsh': 'Groovy',
    'clj': 'Clojure',
    'cljs': 'ClojureScript',
    'cljc': 'Clojure',
    'edn': 'Clojure',
    'coffee': 'CoffeeScript',
    'litcoffee': 'CoffeeScript',
    'cjsx': 'Cjsx',
    'iced': 'IcedCoffeeScript',
    'hx': 'Haxe',
    'hxml': 'Haxe',
    'purs': 'PureScript',
    'p8': 'Pico-8',
    'wren': 'Wren',
    'earl-grey': 'Earl Grey',
    'eg': 'Earl Grey',
    'tsv': 'TSV',
    'csv': 'CSV',
}


def get_file_language(file_path: str) -> str:
    """Determine programming language from file extension.

    Only the final path component is inspected ('/' and '\\' are both treated
    as separators), so dots in directory names do not affect the result. The
    key is the lower-cased text after the last '.' of that component; a name
    without a dot is looked up as a whole (e.g. ``Makefile`` -> ``Make``).

    Args:
        file_path: Path or bare name of the file.

    Returns:
        The display name from the lookup table, or the upper-cased key when
        the table has no entry for it.
    """
    # Without this step "my.project/README" would yield "PROJECT/README".
    file_name = file_path.replace('\\', '/').rsplit('/', 1)[-1]
    ext = file_name.split('.')[-1].lower()
    return _LANGUAGE_BY_EXTENSION.get(ext, ext.upper())
def estimate_tokens(text: str) -> int:
    """Return a rough token count for ``text``.

    Uses a flat ratio of four characters per token, rounding down. The ratio
    is a heuristic for English prose; for code the true figure varies more.
    """
    chars_per_token = 4
    char_count = len(text)
    return char_count // chars_per_token
def create_chunked_output(content: str, chunk_size: int) -> List[str]:
    """Group the lines of ``content`` into chunks of about ``chunk_size`` chars.

    Lines are never split: a line is appended to the chunk being built while
    the chunk, a joining newline and the line together stay within
    ``chunk_size``; otherwise the chunk is emitted and the line starts a new
    one. A single line longer than ``chunk_size`` therefore becomes an
    oversized chunk of its own. An empty pending chunk is simply replaced by
    the next line, so blank lines at the start of a chunk are not kept.

    Returns:
        The non-empty chunks in order (an empty list for empty content).
    """
    pieces = []
    pending = ""
    for row in content.split('\n'):
        fits = len(pending) + len(row) + 1 <= chunk_size
        if pending and fits:
            pending = f"{pending}\n{row}"
            continue
        # Reaching here with text pending means the row did not fit.
        if pending:
            pieces.append(pending)
        pending = row

    if pending:
        pieces.append(pending)
    return pieces
def matches_patterns(file_path: str, include_patterns: List[str], exclude_patterns: List[str]) -> bool:
    """Decide whether ``file_path`` passes the include/exclude filters.

    Each pattern is tried both as given and prefixed with ``**/``, using
    ``fnmatch.fnmatch``. Exclusions win: any exclude hit rejects the path.
    Otherwise the path is accepted when there are no include patterns, or
    when at least one include pattern matches.
    """
    import fnmatch

    def any_hit(globs):
        # True if the path matches one of the globs, bare or "**/"-prefixed.
        return any(
            fnmatch.fnmatch(file_path, glob) or fnmatch.fnmatch(file_path, f"**/{glob}")
            for glob in globs
        )

    if any_hit(exclude_patterns):
        return False
    return not include_patterns or any_hit(include_patterns)
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, then formatted with one decimal place, e.g.
    ``1536`` -> ``"1.5 KB"``.
    """
    labels = ('B', 'KB', 'MB', 'GB', 'TB')
    amount = size_bytes
    for label in labels[:-1]:
        if amount < 1024.0:
            break
        amount /= 1024.0
    else:
        # Never dropped below 1024: report in the largest unit.
        label = labels[-1]
    return f"{amount:.1f} {label}"
def generate_file_hash(content: str) -> str:
    """Return a short fingerprint of ``content``.

    The text is encoded with ``str.encode()`` defaults, hashed with SHA-256,
    and the first 16 hexadecimal characters of the digest are returned.
    """
    digest = hashlib.sha256(content.encode())
    hex_digest = digest.hexdigest()
    return hex_digest[:16]
def is_binary_file(content: str, file_path: str) -> bool:
    """Check if file is binary.

    The decision is made in order:
      1. the text after the last '.' in ``file_path`` (case-insensitive) is
         in a fixed set of binary extensions -> binary;
      2. the content is empty -> not binary;
      3. the first 1024 characters contain a NUL character -> binary;
      4. fewer than 70% of the first 1024 characters are printable
         (tab, newline and carriage return count as printable) -> binary.

    Args:
        content: Decoded text of the file.
        file_path: Path or name of the file; only its extension is used.

    Returns:
        True if the file is judged to be binary, otherwise False.
    """
    # Check file extension first
    binary_extensions = {
        'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
        'mp3', 'mp4', 'avi', 'mov', 'wav', 'flac', 'ogg',
        'zip', 'rar', 'tar', 'gz', '7z', 'bz2', 'xz',
        'exe', 'dll', 'so', 'dylib',
        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
        'ttf', 'otf', 'woff', 'woff2', 'eot',
        'bin', 'dat', 'db', 'sqlite', 'sqlite3',
    }
    ext = file_path.split('.')[-1].lower()
    if ext in binary_extensions:
        return True

    sample = content[:1024]
    if not sample:
        # Nothing to inspect; the ratio below would otherwise divide by zero.
        return False

    # Check content for null bytes (indicator of binary)
    if '\0' in sample:
        return True

    # Check if content has too many non-printable characters
    printable_chars = sum(1 for c in sample if c.isprintable() or c in '\t\n\r')
    return printable_chars / len(sample) < 0.7