Spaces:

AiCoderv2
/

my-app-565

Build error

App Files Files Community

my-app-565 / utils.py

AiCoderv2

Deploy Gradio app with multiple files

d79890b verified 3 months ago

raw

history blame contribute delete

10.2 kB

	import re
	import hashlib
	from typing import List, Dict, Optional
	import mimetypes

	def clean_code_content(content: str, file_path: str) -> str:
	    """Remove comments from source text using per-language regex heuristics.

	    The language is chosen from the text after the last ``.`` in
	    ``file_path`` (lower-cased). For a recognised extension the content is
	    stripped of surrounding whitespace, every comment pattern registered for
	    that language is deleted, runs of blank lines are collapsed to a single
	    blank line, and the result is stripped again.

	    Args:
	        content: Full text of the file.
	        file_path: Path or file name used only to pick the language.

	    Returns:
	        The text with comments removed, or ``content`` unchanged when the
	        extension has no registered patterns.

	    Note:
	        This is purely regex based and does not tokenize the source, so
	        comment markers that appear inside string literals (for example
	        ``#`` in a Python string or ``//`` in a URL) are removed as well,
	        and for Python every triple-quoted string is dropped, including
	        docstrings.
	    """
	    ext = file_path.split('.')[-1].lower()

	    # Reusable (pattern, flags) rules. Line comments need MULTILINE so that
	    # ``$`` anchors at every line end instead of only at the end of the
	    # text; block comments need DOTALL so that ``.`` can span newlines.
	    hash_line = (r'#.*$', re.MULTILINE)
	    slash_line = (r'//.*$', re.MULTILINE)
	    dash_line = (r'--.*$', re.MULTILINE)
	    c_block = (r'/\*.*?\*/', re.DOTALL)
	    markup_block = (r'<!--.*?-->', re.DOTALL)
	    c_style = [slash_line, c_block]

	    # Language-specific comment patterns, applied in list order.
	    comment_patterns = {
	        'py': [
	            hash_line,
	            (r'""".*?"""', re.DOTALL),
	            (r"'''.*?'''", re.DOTALL),
	        ],
	        'js': c_style,
	        'java': c_style,
	        'cpp': c_style,
	        'c': c_style,
	        'cs': c_style,
	        'go': c_style,
	        'rs': c_style,
	        'php': [slash_line, hash_line, c_block],
	        'rb': [hash_line, (r'=begin.*?=end', re.DOTALL)],
	        'sh': [hash_line],
	        'sql': [dash_line, c_block],
	        'html': [markup_block],
	        'xml': [markup_block],
	        'css': [c_block],
	    }

	    rules = comment_patterns.get(ext)
	    if rules is None:
	        # Unknown language: hand the text back untouched.
	        return content

	    content = content.strip()
	    for pattern, flags in rules:
	        content = re.sub(pattern, '', content, flags=flags)

	    # Collapse three or more consecutive (possibly whitespace-only) line
	    # breaks, typically left behind by removed comments, into one blank line.
	    content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
	    return content.strip()

	def get_file_language(file_path: str) -> str:
	    """Map a file path to a human-readable language or format name.

	    Only the final path component is inspected (both ``/`` and ``\\`` are
	    treated as separators), so dots or names in parent directories never
	    leak into the lookup key. The key is the text after the last ``.`` of
	    that component, lower-cased; a component without a dot is used whole,
	    which is how names such as ``Makefile`` or ``Dockerfile`` are resolved.

	    Args:
	        file_path: Path or bare file name.

	    Returns:
	        The mapped display name, or the upper-cased lookup key when the
	        key is not in the table.
	    """
	    # Work on the basename so that "my.dir/Makefile" yields "makefile"
	    # rather than "dir/makefile".
	    file_name = file_path.replace('\\', '/').rsplit('/', 1)[-1]
	    ext = file_name.split('.')[-1].lower()

	    language_map = {
	        'py': 'Python',
	        'js': 'JavaScript',
	        'ts': 'TypeScript',
	        'jsx': 'React JSX',
	        'tsx': 'React TSX',
	        'java': 'Java',
	        'cpp': 'C++',
	        'c': 'C',
	        'cs': 'C#',
	        'go': 'Go',
	        'rs': 'Rust',
	        'php': 'PHP',
	        'rb': 'Ruby',
	        'swift': 'Swift',
	        'kt': 'Kotlin',
	        'scala': 'Scala',
	        'r': 'R',
	        'm': 'Objective-C',
	        'sh': 'Shell',
	        'bash': 'Bash',
	        'zsh': 'Zsh',
	        'fish': 'Fish',
	        'ps1': 'PowerShell',
	        'bat': 'Batch',
	        'sql': 'SQL',
	        'html': 'HTML',
	        'htm': 'HTML',
	        'xml': 'XML',
	        'css': 'CSS',
	        'scss': 'SCSS',
	        'sass': 'SASS',
	        'less': 'LESS',
	        'json': 'JSON',
	        'yaml': 'YAML',
	        'yml': 'YAML',
	        'toml': 'TOML',
	        'ini': 'INI',
	        'cfg': 'Config',
	        'conf': 'Config',
	        'md': 'Markdown',
	        'rst': 'reStructuredText',
	        'txt': 'Text',
	        'log': 'Log',
	        'dockerfile': 'Docker',
	        'docker': 'Docker',
	        'gitignore': 'Git',
	        'gitattributes': 'Git',
	        'editorconfig': 'EditorConfig',
	        'eslintrc': 'ESLint',
	        'prettierrc': 'Prettier',
	        'babelrc': 'Babel',
	        'tsconfig': 'TypeScript',
	        'package': 'NPM',
	        'lock': 'Lock',
	        'requirements': 'Python',
	        'pipfile': 'Python',
	        'poetry': 'Python',
	        'makefile': 'Make',
	        'cmake': 'CMake',
	        'gradle': 'Gradle',
	        'pom': 'Maven',
	        'sbt': 'SBT',
	        'vue': 'Vue',
	        'svelte': 'Svelte',
	        'elm': 'Elm',
	        'pug': 'Pug',
	        'haml': 'Haml',
	        'erb': 'ERB',
	        'ejs': 'EJS',
	        'twig': 'Twig',
	        'liquid': 'Liquid',
	        'handlebars': 'Handlebars',
	        'mustache': 'Mustache',
	        'jinja': 'Jinja',
	        'tex': 'LaTeX',
	        'bib': 'BibTeX',
	        'plt': 'Gnuplot',
	        'dot': 'Graphviz',
	        'mermaid': 'Mermaid',
	        'drawio': 'DrawIO',
	        'puml': 'PlantUML',
	        'wsdl': 'WSDL',
	        'xsd': 'XSD',
	        'xslt': 'XSLT',
	        'graphql': 'GraphQL',
	        'proto': 'Protocol Buffers',
	        'avro': 'Avro',
	        'parquet': 'Parquet',
	        'arrow': 'Arrow',
	        'feather': 'Feather',
	        'hdf5': 'HDF5',
	        'netcdf': 'NetCDF',
	        'matlab': 'MATLAB',
	        'mex': 'MATLAB',
	        'fig': 'MATLAB',
	        'slx': 'Simulink',
	        'simulink': 'Simulink',
	        'labview': 'LabVIEW',
	        'vi': 'LabVIEW',
	        'lvproj': 'LabVIEW',
	        'lvlib': 'LabVIEW',
	        'stata': 'Stata',
	        'do': 'Stata',
	        'ado': 'Stata',
	        'spss': 'SPSS',
	        'sav': 'SPSS',
	        'sas': 'SAS',
	        # Existing keys kept for backward compatibility; the actual SAS
	        # dataset extension is "sas7bdat", added next to them.
	        's7dat': 'SAS',
	        's7bdat': 'SAS',
	        'sas7bdat': 'SAS',
	        'xpt': 'SAS',
	        'dta': 'Stata',
	        'rdata': 'R',
	        'rds': 'R',
	        'rda': 'R',
	        'jl': 'Julia',
	        'nim': 'Nim',
	        'zig': 'Zig',
	        'v': 'V',
	        'ada': 'Ada',
	        'adb': 'Ada',
	        'ads': 'Ada',
	        'pas': 'Pascal',
	        'pp': 'Pascal',
	        'dpr': 'Pascal',
	        'lpr': 'Pascal',
	        'dfm': 'Pascal',
	        'pl': 'Perl',
	        'pm': 'Perl',
	        't': 'Perl',
	        'pod': 'Perl',
	        'lua': 'Lua',
	        'moon': 'MoonScript',
	        'el': 'Emacs Lisp',
	        'elc': 'Emacs Lisp',
	        'elisp': 'Emacs Lisp',
	        'cl': 'Common Lisp',
	        'lisp': 'Common Lisp',
	        'lsp': 'Common Lisp',
	        'fasl': 'Common Lisp',
	        'ss': 'Scheme',
	        'scm': 'Scheme',
	        'rkt': 'Scheme',
	        'sch': 'Scheme',
	        'fs': 'F#',
	        'fsi': 'F#',
	        'fsx': 'F#',
	        'fsscript': 'F#',
	        'ml': 'OCaml',
	        'mli': 'OCaml',
	        'll': 'LLVM',
	        'bc': 'LLVM',
	        'nimble': 'Nim',
	        'nims': 'Nim',
	        'vsh': 'V',
	        'vv': 'V',
	        'vh': 'V',
	        'd': 'D',
	        'di': 'D',
	        'dart': 'Dart',
	        'groovy': 'Groovy',
	        'gvy': 'Groovy',
	        'gy': 'Groovy',
	        'gsh': 'Groovy',
	        'clj': 'Clojure',
	        'cljs': 'ClojureScript',
	        'cljc': 'Clojure',
	        'edn': 'Clojure',
	        'coffee': 'CoffeeScript',
	        'litcoffee': 'CoffeeScript',
	        'cjsx': 'Cjsx',
	        'iced': 'IcedCoffeeScript',
	        'hx': 'Haxe',
	        'hxml': 'Haxe',
	        'purs': 'PureScript',
	        'p8': 'Pico-8',
	        'wren': 'Wren',
	        'earl-grey': 'Earl Grey',
	        'eg': 'Earl Grey',
	        'tsv': 'TSV',
	        'csv': 'CSV',
	    }

	    # Unknown keys fall back to the key itself in upper case.
	    return language_map.get(ext, ext.upper())

	def estimate_tokens(text: str) -> int:
	    """Return a rough token count for ``text``.

	    The estimate assumes a flat ratio of four characters per token and
	    rounds down, so it is only an approximation (code in particular can
	    deviate from this ratio).
	    """
	    chars_per_token = 4
	    char_count = len(text)
	    return char_count // chars_per_token

	def create_chunked_output(content: str, chunk_size: int) -> List[str]:
	    """Group the lines of ``content`` into chunks of roughly ``chunk_size`` characters.

	    Lines are never split: consecutive lines are joined with a newline
	    for as long as the buffered text plus the next line plus one separator
	    character stays within ``chunk_size``. When the next line would not
	    fit, the buffered text is emitted and that line starts a new buffer.

	    Consequences of this scheme that callers should be aware of:
	      * a single line longer than ``chunk_size`` becomes a chunk of its
	        own that exceeds the limit;
	      * an empty buffer is treated as "nothing buffered", so blank lines
	        at the start of a chunk are dropped and empty chunks are never
	        emitted.

	    Args:
	        content: Text to split.
	        chunk_size: Target maximum number of characters per chunk.

	    Returns:
	        The chunks in their original order.
	    """
	    pieces: List[str] = []
	    buffer = ""

	    for row in content.split('\n'):
	        # The +1 accounts for the newline that would join buffer and row.
	        fits = len(buffer) + len(row) + 1 <= chunk_size

	        if fits and buffer:
	            buffer = buffer + '\n' + row
	            continue

	        # Either the row overflows or nothing is buffered yet: emit whatever
	        # is pending (if anything) and restart the buffer with this row.
	        if buffer and not fits:
	            pieces.append(buffer)
	        buffer = row

	    if buffer:
	        pieces.append(buffer)

	    return pieces

	def matches_patterns(file_path: str, include_patterns: List[str], exclude_patterns: List[str]) -> bool:
	    """Decide whether ``file_path`` passes the include/exclude glob filters.

	    Each pattern is tried both as given and prefixed with ``**/``, using
	    ``fnmatch`` semantics. Exclusion wins: a path matching any exclude
	    pattern is rejected. Otherwise the path is accepted when there are no
	    include patterns, or when it matches at least one of them.
	    """
	    import fnmatch

	    def hits_any(globs: List[str]) -> bool:
	        # True when the path matches a glob directly or under any prefix.
	        return any(
	            fnmatch.fnmatch(file_path, glob) or fnmatch.fnmatch(file_path, f"**/{glob}")
	            for glob in globs
	        )

	    # Exclusions are evaluated first and take precedence.
	    if hits_any(exclude_patterns):
	        return False

	    # An empty include list means "everything not excluded".
	    return not include_patterns or hits_any(include_patterns)

	def format_file_size(size_bytes: int) -> str:
	    """Render a byte count as a short human-readable string.

	    The value is divided by 1024 until it drops below 1024 or the largest
	    unit (TB) is reached, then formatted with one decimal place followed
	    by the unit, e.g. ``"1.5 KB"``.
	    """
	    labels = ('B', 'KB', 'MB', 'GB', 'TB')
	    last = len(labels) - 1

	    magnitude = size_bytes
	    position = 0
	    # Step up one unit at a time; TB is the cap, so stop there regardless
	    # of how large the remaining magnitude is.
	    while position < last and not magnitude < 1024.0:
	        magnitude /= 1024.0
	        position += 1

	    return f"{magnitude:.1f} {labels[position]}"

	def generate_file_hash(content: str) -> str:
	    """Return a short fingerprint of ``content``.

	    The text is encoded with ``str.encode()`` defaults, hashed with
	    SHA-256, and the first 16 hexadecimal characters of the digest are
	    returned (not the full 64-character digest).
	    """
	    hasher = hashlib.sha256()
	    hasher.update(content.encode())
	    full_hex = hasher.hexdigest()
	    return full_hex[:16]

	def is_binary_file(content: str, file_path: str) -> bool:
	    """Heuristically decide whether a file should be treated as binary.

	    The checks run in order and the first one that applies wins:
	      1. the text after the last ``.`` in ``file_path`` (lower-cased) is a
	         known binary extension;
	      2. the first 1024 characters of ``content`` contain a NUL character;
	      3. fewer than 70% of those characters are printable (tab, newline
	         and carriage return count as printable).

	    Args:
	        content: Decoded file content.
	        file_path: Path or file name, used only for the extension check.

	    Returns:
	        ``True`` when the file looks binary, ``False`` otherwise. Empty
	        content with a non-binary extension is reported as not binary.
	    """
	    # Check file extension first
	    binary_extensions = {
	        'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
	        'mp3', 'mp4', 'avi', 'mov', 'wav', 'flac', 'ogg',
	        'zip', 'rar', 'tar', 'gz', '7z', 'bz2', 'xz',
	        'exe', 'dll', 'so', 'dylib',
	        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
	        'ttf', 'otf', 'woff', 'woff2', 'eot',
	        'bin', 'dat', 'db', 'sqlite', 'sqlite3',
	    }

	    ext = file_path.split('.')[-1].lower()
	    if ext in binary_extensions:
	        return True

	    # Only the head of the file is inspected; slice it once and reuse it.
	    sample = content[:1024]

	    # An empty sample has no evidence of binary data, and the ratio below
	    # would otherwise divide by zero.
	    if not sample:
	        return False

	    # Check content for null bytes (indicator of binary)
	    if '\0' in sample:
	        return True

	    # Check if content has too many non-printable characters
	    printable_chars = sum(1 for c in sample if c.isprintable() or c in '\t\n\r')
	    return printable_chars / len(sample) < 0.7