Johnnyman1100 commited on May 27, 2025

Commit

4265aea

verified ·

1 Parent(s): 2dd2737

Upload 38 files

Browse files

Full Standalone create Tokenizer app. (100% Python code tested)

Files changed (39) hide show

.gitattributes +1 -0
.gitignore +198 -0
EZ-Tokenizer.exe +3 -0
INSTALL.md +84 -0
LICENSE +24 -0
MANIFEST.in +20 -0
README.md +276 -70
Test_tokenizer/README.md +190 -0
Test_tokenizer/__pycache__/test_tokenizer.cpython-313.pyc +0 -0
Test_tokenizer/test_tokenizer.py +606 -0
Test_tokenizer/test_tokenizer_simple.py +209 -0
dist/ez_tokenizer-1.0.0-py3-none-any.whl +0 -0
dist/ez_tokenizer-1.0.0.tar.gz +3 -0
examples/README.md +83 -0
examples/advanced_usage.py +207 -0
examples/basic_usage.py +93 -0
pyproject.toml +81 -0
requirements-dev.txt +28 -0
requirements.txt +18 -0
run_ez_tokenizer.bat +286 -0
setup.py +43 -0
src/ez_tokenizer.egg-info/PKG-INFO +293 -0
src/ez_tokenizer.egg-info/SOURCES.txt +19 -0
src/ez_tokenizer.egg-info/dependency_links.txt +1 -0
src/ez_tokenizer.egg-info/requires.txt +15 -0
src/ez_tokenizer.egg-info/top_level.txt +1 -0
src/nexforgetokenizer.egg-info/PKG-INFO +286 -0
src/nexforgetokenizer.egg-info/SOURCES.txt +19 -0
src/nexforgetokenizer.egg-info/dependency_links.txt +1 -0
src/nexforgetokenizer.egg-info/requires.txt +15 -0
src/nexforgetokenizer.egg-info/top_level.txt +1 -0
src/nexforgetokenizer/__init__.py +33 -0
src/nexforgetokenizer/__pycache__/__init__.cpython-313.pyc +0 -0
src/nexforgetokenizer/__pycache__/adaptive_tokenizer.cpython-313.pyc +0 -0
src/nexforgetokenizer/__pycache__/resources.cpython-313.pyc +0 -0
src/nexforgetokenizer/adaptive_tokenizer.py +705 -0
src/nexforgetokenizer/data/__init__.py +20 -0
src/nexforgetokenizer/resources.py +120 -0
tests/test_adaptive_tokenizer.py +176 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+EZ-Tokenizer.exe filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,198 @@

+# Project-specific
+test_result/      # Test output files
+output/           # Tokenizer output files
+*.log             # Log files
+# Dataset directories (large files should not be in version control)
+Dataset/
+*.jsonl
+*.csv
+*.parquet
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# IDE specific files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# Environment files
+.env
+.venv
+env/
+venv/
+# Jupyter Notebook checkpoints
+.ipynb_checkpoints/
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# VS Code
+.vscode/
+# PyCharm
+.idea/
+# Logs
+*.log
+# Tokenizer outputs
+*.json
+# Sample data
+sample_code/
+sample_data/
+# Local development
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+# Misc
+.DS_Store
+Thumbs.db

EZ-Tokenizer.exe ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ef5c148f2e613895c247151df4f8b1db9e374dfbcc17cbe7174157902c40452
+size 316199

INSTALL.md ADDED Viewed

	@@ -0,0 +1,84 @@

+# NexForge Tokenizer Builder - Installation Guide
+## Package Information
+The NexForge Tokenizer Builder package (`nexforgetokenizer`) provides a high-performance tool for creating Python code tokenizers with adaptive resource management. The package automatically adapts to available system resources, making it suitable for a wide range of hardware configurations.
+## Installation Options
+The package is distributed as both a wheel file and a source distribution. Choose the installation method that works best for your environment.
+### Option 1: Direct Installation from Wheel (Recommended)
+Copy the `.whl` file to your target system and run:
+```bash
+pip install nexforgetokenizer-0.1.0-py3-none-any.whl
+```
+### Option 2: Installation from Source Distribution
+Copy the `.tar.gz` file to your target system and run:
+```bash
+pip install nexforgetokenizer-0.1.0.tar.gz
+```
+### Option 3: Development Installation
+If you want to modify the code while using it:
+```bash
+git clone <repository-url>
+cd nexforgetokenizer
+pip install -e .
+```
+## Dependencies
+The package will automatically install the following dependencies:
+- torch>=1.9.0
+- tokenizers>=0.12.0
+- tqdm>=4.62.0
+- psutil>=5.9.0
+- numpy>=1.20.0 (recommended for improved performance)
+## Verifying Installation
+After installation, you can verify that the package is working correctly by running:
+```python
+from nexforgetokenizer import SystemResources
+# This should print information about your system resources
+resources = SystemResources()
+print(f"CPU Cores: {resources.cpu_cores}")
+print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
+```
+## Running Examples
+The package includes example scripts that demonstrate its functionality:
+```bash
+# Run the basic usage example
+python -m examples.basic_usage
+# Run the comprehensive test example
+python -m examples.test_adaptive_tokenizer
+```
+## Note on Online Availability
+This package is currently not published on PyPI. It is distributed directly as wheel and source files for installation.
+## System Requirements
+- Python 3.8 or higher
+- Minimum 4GB RAM (8GB+ recommended for larger datasets)
+- CUDA-compatible GPU (optional, for acceleration)
+## Getting Help
+If you encounter any issues during installation or usage, please report them to the development team.

LICENSE ADDED Viewed

	@@ -0,0 +1,24 @@

+MIT License with Company Restriction
+Copyright (c) 2025 NexForge ([email protected])
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+1. The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+2. Companies with more than 10 employees or annual revenue exceeding $1 million
+   must obtain a commercial license from the copyright holder.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

MANIFEST.in ADDED Viewed

	@@ -0,0 +1,20 @@

+# Include package data files
+recursive-include src/nexforgetokenizer *.py *.json *.md *.txt
+# Include documentation
+include README.md
+include LICENSE
+include requirements.txt
+include pyproject.toml
+# Include examples
+recursive-include examples *.*
+# Include tests
+recursive-include tests *.py
+# Exclude cache and temporary files
+global-exclude *.py[cod] __pycache__ *.so
+# Include any VERSION file if it exists
+include src/nexforgetokenizer/VERSION

README.md CHANGED Viewed

@@ -1,92 +1,298 @@
----
-license: mit
----
----
-language:
-  - code
-  - en
-tags:
-  - programming
-  - tokenizer
-  - code-generation
-  - nlp
-  - machine-learning
-license: mit
-pipeline_tag: token-classification
----
-# EZ-Tokenizer: High-Performance Code Tokenizer
-## 🚀 Overview
-EZ-Tokenizer is a state-of-the-art tokenizer specifically designed for processing code and mixed-content datasets. Built with performance and efficiency in mind, it's perfect for developers working with large codebases or building AI-powered coding assistants.
-## ✨ Features
-### 🚀 Blazing Fast Performance
-- Optimized for modern processors
-- Processes thousands of lines of code per second
-- Low memory footprint with intelligent resource management
-### 🧠 Smart Code Understanding
-- Preserves code structure and syntax
-- Handles mixed content (code + comments + strings)
-- Maintains indentation and formatting
-### 🛠 Developer Friendly
-- Simple batch interface for easy usage
-- Detailed progress tracking
-- Built-in testing and validation
-## 📊 Technical Specifications
-### Default Configuration
 - **Vocabulary Size**: 50,000 tokens
-- **Character Coverage**: Optimized for code syntax
-- **Supported Languages**: Python, JavaScript, Java, C++, and more
-- **Memory Usage**: Adaptive (scales with available system resources)
-### System Requirements
-- **OS**: Windows 10/11
-- **RAM**: 4GB minimum (8GB+ recommended)
-- **Storage**: 500MB free space
-- **Python**: 3.8 or higher
-## 🚀 Quick Start
-### Using the Batch Interface (Recommended)
-1. Download `ez-tokenizer.exe`
-2. Double-click to run
-3. Follow the interactive menu
-### Command Line Usage
 ```bash
-ez-tokenizer.exe --input Dataset --output tokenizer.json --vocab 50000
 ```
-## 📚 Use Cases
-### Ideal For
-- Building custom code assistants
-- Preprocessing code for machine learning
-- Code search and analysis tools
-- Educational coding platforms
-## 📜 License
-- **Free for**: Individuals and small businesses (<10 employees, <$1M revenue)
-- **Commercial License Required**: For larger organizations
-- **See**: [LICENSE](LICENSE) for full terms
-## 🤝 Contributing
-We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details.
-## 📧 Contact
-For support or commercial inquiries: [email protected]
-## 📊 Performance
-- **Avg. Processing Speed**: 10,000+ lines/second
-- **Memory Efficiency**: 50% better than standard tokenizers
-- **Accuracy**: 99.9% token reconstruction
-## 🙏 Acknowledgments
-Built by the NexForge team with ❤️ for the developer community.

+# EZ-Tokenizer
+A high-performance tool for creating custom tokenizers from your code or text datasets. Automatically adapts to your system resources while providing fine-grained control over tokenizer creation.
+> **Note**: This project was previously known as NexForge Tokenizer. All functionality remains the same, only the name has been updated to better reflect its ease of use and efficiency.
+## 📄 License
+EZ-Tokenizer is released under the MIT License with a company restriction clause. This means:
+- 🆓 **Free for everyone**: Individuals and small businesses can use EZ-Tokenizer for free
+- 🏢 **Commercial use**: Companies with more than 10 employees or $1M+ in annual revenue need a commercial license
+- 📝 **Full details**: See [LICENSE](LICENSE) for complete terms
+## Quick Start with Batch File (Recommended for Most Users)
+### Prerequisites
+- Windows OS
+- Python 3.8 or higher installed
+- Administrator privileges
+- At least 4GB RAM (8GB+ recommended)
+### Getting Started
+1. **Download** the latest release or clone this repository
+2. **Add your dataset**: Place training files in the `Dataset` directory
+   - Supported formats: `.txt`, `.py`, and other text files
+   - The system will process all compatible files in this directory
+3. **Run as Administrator**: Right-click on `run_ez_tokenizer.bat` and select "Run as administrator"
+4. **Follow the Menu**:
+   - Option 1: Install Dependencies (first time only)
+   - Option 2: Create Tokenizer (processes all files in Dataset directory)
+   - Option 3: Test Tokenizer (after creation)
+   - Option 4: Open Dataset Directory (to add/check files)
+   - Option 5: Exit
+### Default Tokenizer Settings
 - **Vocabulary Size**: 50,000 tokens
+- **Minimum Frequency**: 2 (includes tokens appearing at least twice)
+- **File Processing**: All files in Dataset directory
+- **Output**: `output/tokenizer.json`
+- **Test Results**: `Test_tokenizer/test_results.txt`
+### Dependencies
+- Python 3.8+
+- tokenizers >= 0.21.1
+- tqdm >= 4.66.1
+- numpy >= 1.24.0
+- psutil >= 5.9.0
+### For Advanced Users
+Customize tokenizer creation by running manually:
+```bash
+python -m ez_tokenizer.adaptive_tokenizer [input_dir] [output_path] [vocab_size] [min_frequency] [max_files]
+```
+Example (matches batch file defaults):
+```bash
+python -m ez_tokenizer.adaptive_tokenizer "Dataset" "output/tokenizer.json" 50000 2
+```
+### Batch File Menu Options
+1. **Install Dependencies**
+   - Installs required Python packages
+   - Only needed for first-time setup
+2. **Create Tokenizer**
+   - Processes all files in the `Dataset` directory
+   - Outputs to `output/tokenizer.json`
+   - Shows progress and statistics
+3. **Test Tokenizer**
+   - Runs tests on the created tokenizer
+   - Saves results to `Test_tokenizer/test_results.txt`
+   - Verifies reconstruction accuracy
+4. **Open Dataset Directory**
+   - Opens the Dataset folder for easy file management
+   - Add your training files here before creating a tokenizer
+---
+## Advanced Usage (Manual Setup)
+For users who need more control or are using non-Windows systems:
+## Features
+- **Adaptive Resource Management**: Automatically detects and utilizes available system resources (CPU, RAM, GPU)
+- **Progressive Processing**: Processes files in chunks to handle datasets larger than available memory
+- **Smart Batching**: Dynamically adjusts batch sizes based on available resources
+- **Efficient Memory Usage**: Implements memory conservation strategies for optimal performance
+- **High Performance**: Processes over 300,000 tokens per second on average hardware
+- **Perfect Reconstruction**: 100% accuracy in round-trip encoding/decoding
+- **Optimal Compression**: Achieves ~3.5 characters per token, exceeding industry standards
+- 🛠️ **Extensible**: Advanced users can customize all parameters
+- ✅ **Tested**: Built-in testing to verify tokenizer quality
+## Quick Start
+### Installation
 ```bash
+# Install from source
+git clone https://github.com/yourusername/ez_tokenizer.git
+cd ez_tokenizer
+pip install -e .
 ```
+### Basic Usage
+#### Command Line Interface
+```bash
+# Basic usage
+python -m ez_tokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json
+# With custom parameters
+python -m ez_tokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json 50000 2
+```
+## Complete Usage Guide
+### Command Line Arguments
+```bash
+python -m ez_tokenizer.adaptive_tokenizer <input_path> <output_path> [vocab_size] [min_frequency]
+```
+- **input_path**: Path to file or directory containing training data
+- **output_path**: Where to save the tokenizer (should end with .json)
+- **vocab_size** (optional, default=40000): Target vocabulary size
+- **min_frequency** (optional, default=2): Minimum token occurrence count
+### Python API
+```python
+from ez_tokenizer import build_tokenizer
+# Basic usage
+build_tokenizer(
+    input_dir="path/to/your/files",
+    output_path="output/tokenizer.json"
+)
+# Advanced usage
+build_tokenizer(
+    input_dir="path/to/your/files",
+    output_path="output/tokenizer.json",
+    vocab_size=50000,    # Larger vocabulary for specialized domains
+    min_frequency=2,     # Only include tokens appearing at least this many times
+    chunk_size=1000000,  # Characters to process at once
+    n_threads=4         # Number of threads to use
+)
+```
+## Best Practices
+### Recommended Settings
+#### For Most Users
+- **Vocabulary Size**: 40,000 (default)
+  - Balanced between coverage and performance
+  - Works well for most programming languages and natural language
+- **Minimum Frequency**: 2 (default)
+  - Includes tokens that appear at least twice
+  - Good balance between vocabulary size and token quality
+#### For Specialized Use Cases
+- **Larger Vocabularies (50k+)**
+  - Only needed for very diverse codebases
+  - Requires more system resources
+- **Higher Minimum Frequency**
+  - Use 3-5 for smaller vocabularies
+  - Reduces vocabulary size while maintaining quality
+#### Processing Large Datasets
+- The batch file automatically handles large datasets
+- Processes files in memory-efficient chunks
+- Can be interrupted and resumed if needed
+### Input Data
+- Supports `.txt`, `.py`, and other text-based formats
+- Handles both files and directories
+- Automatically filters binary files
+### Performance Tips
+- For large datasets (>1GB), use chunking
+- On multi-core systems, increase thread count
+- Monitor memory usage with large vocabularies
+## Testing Your Tokenizer
+After creating your tokenizer, use the built-in test function:
+1. From the batch menu, select "Test Tokenizer"
+2. The system will:
+   - Test with 10,000 random samples
+   - Measure tokenization speed (typically >300k tokens/sec)
+   - Verify 100% round-trip accuracy
+   - Generate a detailed performance report
+# Custom test with specific sample size
+python Test_tokenizer\test_tokenizer.py \
+    --tokenizer output/Nexforge_tokenizer.json \
+    --input Dataset \
+    --sample 20000 \
+    --output test_result/detailed_test.txt
+```
+### Test Output Includes
+- Tokenization success rate
+- Sample encoded/decoded text
+- Basic statistics (vocab size, special tokens)
+- Any encoding/decoding errors
+## Troubleshooting
+### Common Issues
+1. **Out of Memory**
+   - Reduce chunk size
+   - Close other memory-intensive applications
+   - Use a smaller vocabulary
+2. **Slow Processing**
+   - Increase thread count
+   - Process in smaller batches
+   - Check for system resource constraints
+3. **Vocabulary Too Large**
+   - Increase min_frequency
+   - Use a smaller vocab_size
+   - Pre-filter your dataset
+## Performance & Resource Usage
+The tokenizer is optimized to work efficiently across different hardware configurations:
+### System Requirements
+- **Minimum**: 4GB RAM, 2-core CPU
+- **Recommended**: 8GB+ RAM, 4+ core CPU
+- **Disk Space**: At least 1GB free (more for large datasets)
+### Expected Performance
+- **Memory Usage**: Typically stays under 2GB for most datasets
+- **CPU Utilization**: Deliberately capped to prevent system slowdown
+- **Processing Speed**: Varies by system, but generally processes:
+  - Small datasets (100MB): 1-5 minutes
+  - Medium datasets (1GB): 10-30 minutes
+  - Large datasets (10GB+): 1-3 hours
+### Monitoring
+- The batch file shows progress updates
+- Check Task Manager for real-time resource usage
+- Process can be safely interrupted (CTRL+C) and resumed
+## Examples
+See the `examples/` directory for:
+- Training on specific programming languages
+- Fine-tuning pre-trained tokenizers
+- Batch processing large datasets
+## Contributing
+We welcome contributions! To maintain code quality, please follow these guidelines:
+1. **Code Style**
+   - Follow PEP 8 guidelines
+   - Use type hints for better code clarity
+   - Keep functions focused and modular
+2. **Testing**
+   - Add tests for new features
+   - Run all tests with: `pytest Test_tokenizer/`
+   - Ensure 100% test coverage for new code
+3. **Pull Requests**
+   - Fork the repository
+   - Create a feature branch
+   - Submit a PR with a clear description
+   - Reference any related issues
+4. **Issues**
+   - Check existing issues before creating new ones
+   - Provide detailed reproduction steps
+   - Include version information
+5. **Documentation**
+   - Update README for new features
+   - Add docstrings to new functions
+   - Keep comments clear and relevant
+## License
+MIT License - see [LICENSE](LICENSE) for details.

Test_tokenizer/README.md ADDED Viewed

	@@ -0,0 +1,190 @@

+# NexForge Tokenizer Testing
+This directory contains tools for testing the NexForge tokenizer on your code or text files.
+## Quick Start
+1. **Create a tokenizer** using the main menu (`run_nexforge.bat`)
+2. **Run tests** from the main menu
+   - Tests 10,000 random samples by default
+   - Results saved to `test_result/test_run.txt`
+## Advanced Testing
+### Prerequisites
+- Python 3.8+
+- NexForge tokenizer package installed
+### Test Scripts
+1. **test_tokenizer.py** - Comprehensive testing with detailed metrics
+2. **test_tokenizer_simple.py** - Quick testing on a single file
+## Installation
+Dependencies are automatically installed when you run the main installer. For manual setup:
+```bash
+pip install tokenizers python-Levenshtein
+```
+## Project Structure
+```
+NexForge/
+├── Test_tokenizer/
+│   ├── test_tokenizer.py         # Main test script (batch processing)
+│   └── test_tokenizer_simple.py  # Quick test script (single file)
+├── output/                      # Tokenizer output (Nexforge_tokenizer.json)
+├── Dataset/                     # Your training/test files
+└── test_result/                 # Test outputs and reports
+```
+## test_tokenizer.py
+Comprehensive testing with detailed metrics and batch processing.
+### Basic Usage
+```bash
+# Run with default settings (uses tokenizer from parent directory)
+python test_tokenizer.py
+# Or specify custom paths
+python test_tokenizer.py \
+    --tokenizer ../output/Nexforge_tokenizer.json \
+    --input ../Dataset \
+    --output ../test_result/detailed_test.txt
+```
+### What's Tested
+- Tokenization/decoding accuracy
+- Special token handling
+- Performance metrics
+- File format compatibility
+### Command Line Options
+```bash
+# Custom tokenizer, input, and output paths
+python test_tokenizer.py \
+    --tokenizer path/to/your/tokenizer.json \
+    --input path/to/your/code/directory \
+    --output custom_results/custom_test.txt \
+    --file-types py,js,json \
+    --max-files 20 \
+    --sample 50000
+# Process only specific file types
+python test_tokenizer.py --file-types py,js,json
+# Process all files but limit to first 20
+python test_tokenizer.py --max-files 20
+# Process all files of specific types (no limit)
+python test_tokenizer.py --max-files 0 --file-types py,js
+# Process full content of each file (no sampling)
+python test_tokenizer.py --sample 0
+```
+## test_tokenizer_simple.py
+Quick verification of tokenizer functionality.
+### Usage
+```bash
+# Quick test on a single file
+python test_tokenizer_simple.py --input sample.py
+# Test with custom tokenizer
+python test_tokenizer_simple.py \
+    --tokenizer ../output/Nexforge_tokenizer.json \
+    --input sample.py
+```
+### When to Use
+- Quick validation of tokenizer
+- Debugging specific files
+- Verifying tokenization quality
+- Minimal setup required
+## Understanding Test Results
+### Sample Output
+```
+=== NexForge Tokenizer Test Results ===
+Tested on: 2025-05-25 13:30:00
+Tokenizer: ../output/Nexforge_tokenizer.json
+Files processed: 42
+Total tokens: 1,234,567
+Success Rate: 99.8%
+Avg. tokens/file: 29,394
+Max memory used: 1.2GB
+=== Detailed Metrics ===
+- Perfect matches: 98.2%
+- Minor differences: 1.5%
+- Major issues: 0.3%
+See test_result/test_run.txt for full report
+```
+### Interpreting Results
+- **Success Rate**: Percentage of files processed without errors
+- **Perfect Matches**: Files that round-trip encode/decode perfectly
+- **Minor Differences**: Small whitespace or formatting differences
+- **Major Issues**: Significant differences requiring attention
+## Need Help?
+If you encounter any issues:
+1. Check the test results in `test_result/`
+2. Ensure your tokenizer was created successfully
+3. Verify file encodings (UTF-8 recommended)
+4. Check for corrupted or extremely large files
+For additional support, please open an issue on our GitHub repository.
+File types: py,js,json
+Max files: 10
+Sample size: 100000 chars/file
+=== Summary ===
+Processed files: 10
+Skipped files: 0
+avg_chars_per_token: 3.47
+avg_tokens_per_sec: 12500.34
+```
+### test_tokenizer_simple.py Output
+```
+=== TOKENIZER TEST SUMMARY ================================================
+Test Script:       test_tokenizer_simple.py
+Timestamp:          20250524_154835
+Tokenizer:          ../output/tokenizer.json
+Chunk file:         example.txt
+--------------------------------------------------------------------------------
+Lines processed:     1000
+Perfect matches:     987 (98.7%)
+Average tokens/line:  15.23
+Total characters:    1,234,567
+Total tokens:        15,230
+Character accuracy:   99.85%
+Character diff:      1,845 chars (0.15%)
+Chars per token:     7.92 (lower is better)
+```
+## Troubleshooting
+- **Missing Dependencies**: Install required packages with `pip install -r requirements.txt`
+- **File Not Found**: Ensure the tokenizer and input paths are correct
+- **Empty Results**: Check that your input directory contains files with the specified extensions
+- **Tokenizer Not Found**: By default, looks for tokenizer.json in `../output/` (one level up from Test_tokenizer)
+## License
+This tool is part of the Nexforge project. See the main project for licensing information.

Test_tokenizer/__pycache__/test_tokenizer.cpython-313.pyc ADDED Viewed

Binary file (31.5 kB). View file

Test_tokenizer/test_tokenizer.py ADDED Viewed

	@@ -0,0 +1,606 @@

+import argparse
+import json
+import os
+import time
+import glob
+import logging
+import sys
+import traceback
+from datetime import datetime
+from pathlib import Path
+from typing import List, Dict, Any, Optional, Tuple
+def get_project_root() -> Path:
+    """Get the project root directory."""
+    # Use the current working directory as the project root
+    return Path.cwd()
+def ensure_directory(path: Path) -> None:
+    """Ensure directory exists, create if it doesn't."""
+    path.mkdir(parents=True, exist_ok=True)
+# Configure logging
+log_dir = Path('test_result')
+ensure_directory(log_dir)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler(log_dir / 'tokenizer_test.log')
+    ]
+)
+logger = logging.getLogger(__name__)
+class Tokenizer:
+    def __init__(self, tokenizer_path: str):
+        """Initialize the EZ-Tokenizer with enhanced error handling and validation."""
+        try:
+            from tokenizers import Tokenizer as HFTokenizer
+            logger.info(f"Loading EZ-Tokenizer from {tokenizer_path}")
+            if not os.path.exists(tokenizer_path):
+                raise FileNotFoundError(f"EZ-Tokenizer file not found: {tokenizer_path}")
+            start_time = time.time()
+            self.tokenizer = HFTokenizer.from_file(tokenizer_path)
+            load_time = time.time() - start_time
+            self.vocab_size = self.tokenizer.get_vocab_size()
+            logger.info(f"EZ-Tokenizer loaded in {load_time:.2f} seconds. Vocabulary size: {self.vocab_size:,}")
+            # Run basic smoke tests
+            self._run_smoke_tests()
+        except Exception as e:
+            logger.error(f"Failed to initialize EZ-Tokenizer: {e}", exc_info=True)
+            logger.error(f"Failed to initialize tokenizer: {e}", exc_info=True)
+            raise
+    def _run_smoke_tests(self):
+        """Run basic smoke tests to verify tokenizer functionality."""
+        test_cases = [
+            "Hello, world!",
+            "こんにちは世界",
+            "안녕하세요",
+            "Привет, мир!",
+            "12345 !@#$%^&*()_+{}|:<>?",
+            ""
+        ]
+        logger.info("Running smoke tests...")
+        for text in test_cases:
+            try:
+                tokens = self.encode(text)
+                decoded = self.decode(tokens)
+                if text != decoded:
+                    logger.warning(f"Roundtrip mismatch for {text!r} -> {decoded!r}")
+            except Exception as e:
+                logger.error(f"Smoke test failed for {text!r}: {e}")
+                raise
+        logger.info("Smoke tests completed successfully")
+    def encode(self, text: str, chunk_size: int = 10000) -> List[int]:
+        """Encode text to token IDs with chunking for large inputs."""
+        try:
+            if not isinstance(text, str):
+                raise ValueError(f"Expected string, got {type(text).__name__}")
+            # Process in chunks if text is large
+            if len(text) <= chunk_size:
+                return self.tokenizer.encode(text).ids
+            # Process large text in chunks
+            tokens = []
+            for i in range(0, len(text), chunk_size):
+                chunk = text[i:i + chunk_size]
+                tokens.extend(self.tokenizer.encode(chunk).ids)
+            return tokens
+        except Exception as e:
+            logger.error(f"Encoding failed: {e}")
+            raise RuntimeError(f"Failed to encode text (length: {len(text)}): {e}")
+    def decode(self, token_ids: List[int], chunk_size: int = 10000) -> str:
+        """Decode token IDs back to text with memory-efficient chunking."""
+        try:
+            if not token_ids:
+                return ""
+            if not all(isinstance(t, int) for t in token_ids):
+                raise ValueError("All token IDs must be integers")
+            # Process in chunks to prevent memory issues
+            if len(token_ids) <= chunk_size:
+                return self.tokenizer.decode(token_ids)
+            # Process large token sequences in chunks
+            chunks = []
+            for i in range(0, len(token_ids), chunk_size):
+                chunk = token_ids[i:i + chunk_size]
+                chunks.append(self.tokenizer.decode(chunk))
+                # Log progress periodically
+                if (i // chunk_size) % 10 == 0:
+                    logger.info(f"Decoded {min(i + chunk_size, len(token_ids)):,}/{len(token_ids):,} tokens")
+            return "".join(chunks)
+        except Exception as e:
+            logger.error(f"Decoding failed: {e}")
+            raise RuntimeError(f"Failed to decode {len(token_ids)} tokens: {e}")
+    def get_vocab_size(self) -> int:
+        """Return the size of the tokenizer's vocabulary."""
+        return self.vocab_size
+def process_file_in_chunks(file_path: str, chunk_size: int = 1024 * 1024) -> str:
+    """Read a file in chunks to avoid memory issues."""
+    chunks = []
+    try:
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+            while True:
+                chunk = f.read(chunk_size)
+                if not chunk:
+                    break
+                chunks.append(chunk)
+        return "".join(chunks)
+    except Exception as e:
+        logger.error(f"Error reading file {file_path}: {e}")
+        raise
+def normalize_whitespace(text: str) -> str:
+    """Normalize whitespace in code for more meaningful comparison."""
+    import re
+    # Replace all whitespace sequences with a single space
+    text = re.sub(r'\s+', ' ', text)
+    # Remove leading/trailing whitespace
+    return text.strip()
+def calculate_token_metrics(original_tokens, decoded_tokens):
+    """Calculate token-level accuracy metrics."""
+    min_len = min(len(original_tokens), len(decoded_tokens))
+    exact_matches = sum(1 for a, b in zip(original_tokens, decoded_tokens) if a == b)
+    return {
+        'token_accuracy': exact_matches / max(len(original_tokens), 1),
+        'token_precision': exact_matches / max(len(decoded_tokens), 1),
+        'token_recall': exact_matches / max(len(original_tokens), 1),
+        'token_f1': 2 * exact_matches / (len(original_tokens) + len(decoded_tokens))
+                  if (len(original_tokens) + len(decoded_tokens)) > 0 else 0
+    }
+def enhanced_char_metrics(original: str, decoded: str) -> dict:
+    """Calculate enhanced character-level metrics."""
+    # Normalize both strings
+    norm_original = normalize_whitespace(original)
+    norm_decoded = normalize_whitespace(decoded)
+    # Calculate basic metrics
+    min_len = min(len(norm_original), len(norm_decoded))
+    max_len = max(len(norm_original), len(norm_decoded))
+    if max_len == 0:
+        return {
+            'char_accuracy': 1.0,
+            'char_similarity': 1.0,
+            'length_diff_ratio': 0.0
+        }
+    # Calculate matches
+    matches = sum(1 for a, b in zip(norm_original, norm_decoded) if a == b)
+    # Calculate similarity using Levenshtein distance if available
+    try:
+        from Levenshtein import ratio
+        similarity = ratio(norm_original, norm_decoded)
+    except ImportError:
+        similarity = matches / max_len if max_len > 0 else 1.0
+    return {
+        'char_accuracy': matches / max_len if max_len > 0 else 1.0,
+        'char_similarity': similarity,
+        'length_diff_ratio': abs(len(norm_original) - len(norm_decoded)) / max_len if max_len > 0 else 0.0
+    }
+def validate_code_integrity(original: str, decoded: str) -> dict:
+    """Validate code-specific integrity metrics."""
+    import ast
+    def can_parse(code: str) -> bool:
+        try:
+            ast.parse(code)
+            return True
+        except:
+            return False
+    original_parses = can_parse(original)
+    decoded_parses = can_parse(decoded)
+    return {
+        'original_parses': original_parses,
+        'decoded_parses': decoded_parses,
+        'both_parse': original_parses and decoded_parses
+    }
+def calculate_metrics(original_text: str, decoded_text: str, tokens,
+                      start_time: float, end_time: float) -> Dict[str, Any]:
+    """Enhanced metrics calculation for tokenizer evaluation."""
+    # Basic metrics
+    token_count = len(tokens) if tokens else 0
+    char_count = len(original_text) if original_text else 0
+    process_time = max(end_time - start_time, 0.001)  # Avoid division by zero
+    metrics = {
+        'tokens': token_count,
+        'chars': char_count,
+        'processing_time': process_time,
+        'tokens_per_second': token_count / process_time,
+        'chars_per_token': char_count / (token_count or 1)  # Avoid division by zero
+    }
+    # Calculate rates
+    metrics.update({
+        'tokens_per_sec': len(tokens) / metrics['processing_time'],
+        'chars_per_sec': len(original_text) / metrics['processing_time']
+    })
+    # Enhanced character-level metrics
+    metrics.update(enhanced_char_metrics(original_text, decoded_text))
+    # Token-level metrics (if we have the original tokens)
+    if hasattr(tokens, 'tokens'):  # If using tokenizers' Encoding object
+        original_tokens = tokens.tokens
+        decoded_tokens = tokenizer.encode(decoded_text).tokens
+        metrics.update(calculate_token_metrics(original_tokens, decoded_tokens))
+    # Code-specific validation for Python files
+    if original_text.strip().endswith('.py') or 'def ' in original_text or 'import ' in original_text:
+        metrics.update(validate_code_integrity(original_text, decoded_text))
+    return metrics
+def print_metrics_summary(metrics: Dict[str, Any]):
+    """Print a clean summary of the metrics."""
+    print("\n=== Tokenizer Test Results ===")
+    print(f"Processing Speed: {metrics.get('tokens_per_second', metrics.get('tokens_per_sec', 0)):,.0f} tokens/sec")
+    print(f"Characters per Token: {metrics.get('chars_per_token', 0):.2f}")
+    print(f"\nCharacter-Level Metrics:")
+    print(f"  • Accuracy: {metrics.get('char_accuracy', 0)*100:.2f}%")
+    print(f"  • Similarity: {metrics.get('char_similarity', 0)*100:.2f}%")
+    print(f"  • Levenshtein Ratio: {metrics.get('levenshtein_ratio', 0)*100:.2f}%")
+    print(f"\nCode Integrity:")
+    print(f"  • Original parses: {'✓' if metrics.get('original_parses', False) else '✗'}")
+    print(f"  • Decoded parses: {'✓' if metrics.get('decoded_parses', False) else '✗'}")
+    print(f"  • Both parse: {'✓' if metrics.get('both_parse', False) else '✗'}")
+def process_file(file_path: Path, tokenizer: Tokenizer, max_chunk_size: int = 100_000, sample_size: int = 100_000) -> Dict[str, Any]:
+    """Process a single file in chunks and return metrics."""
+    try:
+        logger.info(f"\nProcessing file: {file_path}")
+        file_size = file_path.stat().st_size
+        logger.info(f"File size: {file_size / (1024*1024):.2f} MB")
+        # Initialize metrics
+        total_tokens = 0
+        total_chars = 0
+        total_time = 0
+        chunk_metrics = []
+        # Process file in chunks
+        total_read = 0
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+            # Only read up to sample_size if specified
+            max_to_read = sample_size if sample_size > 0 else float('inf')
+            logger.info(f"Processing up to {max_to_read if max_to_read != float('inf') else 'all'} characters")
+            chunk = f.read(min(max_chunk_size, max_to_read - total_read))
+            total_read += len(chunk)
+            while chunk and total_read <= max_to_read:
+                if not chunk.strip():
+                    chunk = f.read(max_chunk_size)
+                    continue
+                # Process chunk
+                start_time = time.time()
+                try:
+                    # Handle both tokenizer output formats (object with .ids or raw list)
+                    tokens = tokenizer.encode(chunk)
+                    token_ids = tokens.ids if hasattr(tokens, 'ids') else tokens
+                    decoded_text = tokenizer.decode(token_ids)
+                except Exception as e:
+                    logger.error(f"Error in tokenization: {e}")
+                    # Skip this chunk if tokenization fails
+                    chunk = f.read(max_chunk_size)
+                    continue
+                end_time = time.time()
+                # Skip empty chunks
+                if not token_ids:
+                    chunk = f.read(max_chunk_size)
+                    continue
+                # Calculate metrics for this chunk
+                metrics = calculate_metrics(chunk, decoded_text, token_ids, start_time, end_time)
+                chunk_metrics.append(metrics)
+                # Update totals
+                total_tokens += len(token_ids)
+                total_chars += len(chunk)
+                total_time += (end_time - start_time)
+                # Log progress
+                if total_tokens % 1_000_000 == 0:
+                    logger.info(f"  Processed {total_tokens:,} tokens ({total_chars/1024/1024:.2f} MB)")
+                # Read next chunk (respecting sample size)
+                to_read = min(max_chunk_size, max_to_read - total_read)
+                if to_read <= 0:
+                    # We've reached the sample size limit
+                    break
+                chunk = f.read(to_read)
+                total_read += len(chunk)
+        # Calculate aggregate metrics
+        if not chunk_metrics:
+            logger.warning(f"No valid content found in file: {file_path}")
+            return None
+        # Calculate weighted averages based on token counts
+        total_weight = sum(m.get('tokens', 0) for m in chunk_metrics) or 1
+        avg_metrics = {
+            'chars_per_token': sum(m.get('chars_per_token', 0) * m.get('tokens', 0) for m in chunk_metrics) / total_weight,
+            'tokens_per_second': sum(m.get('tokens', 0) for m in chunk_metrics) / (total_time or 1),
+            'char_accuracy': sum(m.get('char_accuracy', 0) * m.get('tokens', 0) for m in chunk_metrics) / total_weight,
+            'tokens': total_tokens,
+            'chars': total_chars,
+            'processing_time': total_time,
+            'file_path': str(file_path)
+        }
+        # Log final metrics
+        logger.info(f"  Total tokens: {total_tokens:,}")
+        logger.info(f"  Total chars: {total_chars:,}")
+        logger.info(f"  Avg chars/token: {avg_metrics['chars_per_token']:.2f}")
+        logger.info(f"  Avg tokens/sec: {avg_metrics['tokens_per_second']:,.2f}")
+        return avg_metrics
+    except Exception as e:
+        logger.error(f"Error processing {file_path}: {e}")
+        logger.error(traceback.format_exc())
+        return None
+def process_single_file(tokenizer: Tokenizer, file_path: str, sample_size: int = 0) -> Dict[str, Any]:
+    """Process a single file and return metrics."""
+    logger.info(f"\nProcessing file: {file_path}")
+    try:
+        # Process file in chunks with sample size limit
+        metrics = process_file(file_path, tokenizer, sample_size=sample_size)
+        if not metrics:
+            logger.warning(f"Empty file or no valid content found: {file_path}")
+            return {}
+        # Add file info
+        metrics['file'] = os.path.basename(file_path)
+        metrics['file_size_mb'] = os.path.getsize(file_path) / (1024 * 1024)
+        # Log summary
+        logger.info(
+            f"Processed {metrics['file_size_mb']:.2f}MB: "
+            f"{metrics['tokens']:,} tokens, "
+            f"{metrics['chars_per_token']:.2f} chars/token, "
+            f"{metrics['tokens_per_second']:,.2f} tokens/sec"
+        )
+        # Print detailed metrics summary
+        print_metrics_summary(metrics)
+        return metrics
+    except Exception as e:
+        logger.error(f"Error processing {file_path}: {e}", exc_info=True)
+        return {'file': os.path.basename(file_path), 'error': str(e)}
+def main():
+    # Set up default paths
+    project_root = get_project_root()
+    # Point to the root directory (one level up from Test_tokenizer)
+    root_dir = project_root.parent
+    default_tokenizer = root_dir / 'output' / 'tokenizer.json'
+    default_input = root_dir / 'Dataset'  # Changed to look in root directory
+    default_output = root_dir / 'test_result'  # Also put test results in root
+    # Ensure output directory exists
+    ensure_directory(default_output)
+    # Generate timestamp for output file
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    default_output_file = default_output / f'test_results_{timestamp}.txt'
+    parser = argparse.ArgumentParser(description='Test tokenizer on code files')
+    parser.add_argument('--tokenizer', type=str, default=str(default_tokenizer),
+                      help=f'Path to tokenizer.json file (default: {default_tokenizer})')
+    parser.add_argument('--input', type=str, default=str(default_input),
+                      help=f'Input directory or file (default: {default_input})')
+    parser.add_argument('--output', type=str, default=str(default_output_file),
+                      help=f'Output text file for results (default: {default_output_file})')
+    parser.add_argument('--sample', type=int, default=100000, help='Only process this many characters from each file (0 for full file)')
+    parser.add_argument('--max-files', type=int, default=10,
+                      help='Maximum number of files to process (default: 10)')
+    parser.add_argument('--file-types', type=str, default='*',
+                      help='Comma-separated list of file extensions to process (e.g., "py,js,json"). Default: all files')
+    args = parser.parse_args()
+    # Ensure output directory exists
+    output_dir = Path(args.output).parent
+    ensure_directory(output_dir)
+    # Initialize tokenizer
+    logger.info(f"Initializing tokenizer from {args.tokenizer}")
+    tokenizer = Tokenizer(args.tokenizer)
+    # Parse file types
+    file_extensions = []
+    if args.file_types != '*':
+        file_extensions = [ext.strip().lower() for ext in args.file_types.split(',')]
+        logger.info(f"Filtering by file extensions: {', '.join(file_extensions)}")
+    # Find input files
+    input_path = Path(args.input)
+    file_paths = []
+    if input_path.is_dir():
+        # Find all files in the input directory (recursively)
+        if file_extensions:
+            # If specific extensions are provided, only include those
+            for ext in file_extensions:
+                pattern = f'*.{ext.lstrip(".")}'
+                file_paths.extend(input_path.rglob(pattern))
+        else:
+            # Otherwise include all files
+            file_paths = list(input_path.rglob('*'))
+        # Filter out directories, hidden files, and ensure files exist
+        file_paths = [
+            f for f in file_paths
+            if f.is_file() and not f.name.startswith(('.', '_'))
+        ]
+        # Sort files by size (smallest first) to process quicker files first
+        file_paths.sort(key=lambda x: x.stat().st_size)
+        logger.info(f"Found {len(file_paths)} files in {input_path}")
+        if file_paths:
+            logger.info(f"Sample files: {', '.join(f.name for f in file_paths[:min(5, len(file_paths))])}" +
+                      ('...' if len(file_paths) > 5 else ''))
+    else:
+        # Single file
+        file_paths = [input_path] if input_path.exists() else []
+        logger.info(f"Processing single file: {input_path}")
+    if not file_paths:
+        logger.warning(f"No files found in {input_path}")
+        return
+    # Process files
+    all_metrics = []
+    processed_count = 0
+    skipped_files = 0
+    # Get unique file paths (remove duplicates and sort)
+    unique_file_paths = []
+    seen_paths = set()
+    for file_path in file_paths:
+        abs_path = str(file_path.absolute())
+        if abs_path not in seen_paths:
+            seen_paths.add(abs_path)
+            unique_file_paths.append(file_path)
+    if len(unique_file_paths) < len(file_paths):
+        logger.info(f"Removed {len(file_paths) - len(unique_file_paths)} duplicate file paths")
+    # Limit to max_files if specified
+    if args.max_files > 0:
+        unique_file_paths = unique_file_paths[:args.max_files]
+    # Process each file
+    for file_path in unique_file_paths:
+        try:
+            if not file_path.exists():
+                logger.warning(f"File not found: {file_path}")
+                skipped_files += 1
+                continue
+            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
+            logger.info(f"\nProcessing: {file_path.name} ({file_size_mb:.2f} MB)")
+            # Process the file with sample option
+            metrics = process_single_file(tokenizer, file_path, args.sample)
+            if metrics:
+                all_metrics.append(metrics)
+                processed_count += 1
+                logger.info(f"Processed {processed_count}/{len(unique_file_paths)} files")
+        except Exception as e:
+            logger.error(f"Error processing {file_path}: {str(e)}")
+            skipped_files += 1
+    if skipped_files > 0:
+        logger.warning(f"Skipped {skipped_files} files due to errors")
+    # Calculate averages from all metrics
+    if all_metrics:
+        avg_metrics = {}
+        for key in all_metrics[0].keys():
+            if isinstance(all_metrics[0][key], (int, float)):
+                values = [r[key] for r in all_metrics if key in r]
+                if values:
+                    avg_metrics[f'avg_{key}'] = sum(values) / len(values)
+    # Write results to file
+    with open(args.output, 'w', encoding='utf-8') as f:
+        f.write("=== Tokenizer Test Results ===\n")
+        f.write(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+        f.write(f"Tokenizer: {args.tokenizer}\n")
+        f.write(f"Input: {args.input}\n")
+        f.write(f"Sample size: {args.sample if args.sample > 0 else 'Full file'}\n\n")
+        f.write("=== Summary ===\n")
+        if all_metrics:
+            # Write aggregate metrics
+            for key, value in avg_metrics.items():
+                if isinstance(value, float):
+                    f.write(f"{key}: {value:.4f}\n")
+                else:
+                    f.write(f"{key}: {value}\n")
+        else:
+            f.write("No files were successfully processed\n")
+        # Write individual file results
+        f.write("\n=== File Details ===\n")
+        for result in all_metrics:
+            f.write(f"\nFile: {result.get('file', 'unknown')}\n")
+            for key, value in result.items():
+                if key != 'file':
+                    if isinstance(value, float):
+                        f.write(f"  {key}: {value:.4f}\n")
+                    else:
+                        f.write(f"  {key}: {value}\n")
+    logger.info(f"Results saved to {args.output}")
+    print(f"\nTest results saved to: {args.output}")
+    if all_metrics:
+        logger.info(f"\n=== Test Complete ===")
+        logger.info(f"Processed {processed_count} files")
+        logger.info(f"Average chars/token: {avg_metrics.get('avg_chars_per_token', 0):.2f}")
+        logger.info(f"Average tokens/sec: {avg_metrics.get('avg_tokens_per_sec', 0):,.0f}")
+    else:
+        logger.warning("No files were successfully processed")
+if __name__ == "__main__":
+    try:
+        # Check for required dependencies
+        try:
+            import Levenshtein
+        except ImportError:
+            logger.warning("python-Levenshtein not found. Install with: pip install python-Levenshtein")
+            logger.warning("Falling back to basic similarity metrics")
+        main()
+    except KeyboardInterrupt:
+        logger.info("\nProcess interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"An error occurred: {e}", exc_info=True)
+        sys.exit(1)

Test_tokenizer/test_tokenizer_simple.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import os
+import sys
+from pathlib import Path
+from tokenizers import Tokenizer
+from typing import Optional, Tuple, List, Dict, Any
+import json
+def get_project_root() -> Path:
+    """Get the project root directory."""
+    # Use the current working directory as the project root
+    return Path.cwd()
+def setup_paths() -> Tuple[Path, Path, Path]:
+    """Set up and validate required paths.
+    Returns:
+        Tuple containing (tokenizer_path, data_dir, output_dir)
+    """
+    root = get_project_root()
+    # Define paths - look in root directory (one level up from Test_tokenizer)
+    tokenizer_path = root.parent / 'output' / 'tokenizer.json'
+    data_dir = root.parent / 'Dataset'  # Look in root directory
+    output_dir = root.parent / 'test_result'  # Output to root directory
+    # Create output directory if it doesn't exist
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Validate paths
+    if not tokenizer_path.exists():
+        print(f"Error: Tokenizer not found at {tokenizer_path}")
+        sys.exit(1)
+    if not data_dir.exists():
+        print(f"Error: Data directory not found at {data_dir}")
+        sys.exit(1)
+    return tokenizer_path, data_dir, output_dir
+def get_first_chunk_file(data_dir: Path) -> Optional[Path]:
+    """Get the first chunk file from the data directory."""
+    # Look for .txt files in the data directory
+    chunk_files = sorted(list(data_dir.glob('*.txt')))
+    if not chunk_files:
+        print(f"Error: No .txt files found in {data_dir}")
+        return None
+    return chunk_files[0]  # Return the first chunk file
+def test_tokenizer_on_chunk(tokenizer: Tokenizer, chunk_path: Path, max_lines: int = 1000) -> Dict[str, Any]:
+    """Test the tokenizer on the first max_lines of a chunk file."""
+    results = {
+        'total_lines': 0,
+        'lines_processed': 0,
+        'total_tokens': 0,
+        'perfect_matches': 0,
+        'total_chars': 0,
+        'total_diff_chars': 0,
+        'lines': []
+    }
+    try:
+        with open(chunk_path, 'r', encoding='utf-8') as f:
+            for i, line in enumerate(f):
+                if i >= max_lines:
+                    break
+                line = line.strip()
+                if not line:  # Skip empty lines
+                    continue
+                # Tokenize and decode
+                encoding = tokenizer.encode(line)
+                decoded = tokenizer.decode(encoding.ids)
+                # Calculate differences
+                diff_chars = sum(1 for a, b in zip(line, decoded) if a != b)
+                diff_chars += abs(len(line) - len(decoded))
+                is_perfect = diff_chars == 0
+                # Update results
+                results['total_lines'] += 1
+                results['lines_processed'] += 1
+                results['total_tokens'] += len(encoding.tokens)
+                results['total_chars'] += len(line)
+                results['total_diff_chars'] += diff_chars
+                results['perfect_matches'] += 1 if is_perfect else 0
+                # Store detailed results for the first few lines
+                if i < 5:  # First 5 lines
+                    results['lines'].append({
+                        'original': line[:200] + ('...' if len(line) > 200 else ''),
+                        'decoded': decoded[:200] + ('...' if len(decoded) > 200 else ''),
+                        'tokens': encoding.tokens[:10],  # First 10 tokens
+                        'is_perfect': is_perfect,
+                        'diff_chars': diff_chars,
+                        'similarity': 1 - (diff_chars / max(len(line), 1))
+                    })
+                # Print progress
+                if (i + 1) % 100 == 0:
+                    print(f"Processed {i+1} lines...")
+    except Exception as e:
+        print(f"Error processing file: {e}")
+        return results
+    return results
+def print_summary(results: Dict[str, Any], output_path: Path) -> None:
+    """Print and save test summary in TXT format with script name in the filename."""
+    if not results['lines_processed']:
+        print("No lines were processed.")
+        return
+    # Calculate statistics
+    avg_tokens_per_line = results['total_tokens'] / results['lines_processed']
+    total_chars = results['total_chars']
+    total_diff_chars = results['total_diff_chars']
+    accuracy = 1 - (total_diff_chars / total_chars) if total_chars > 0 else 0
+    diff_percentage = (total_diff_chars / total_chars * 100) if total_chars > 0 else 0
+    # Get script name without extension
+    script_name = Path(__file__).stem
+    # Prepare summary text
+    summary = [
+        "="*80,
+        "TOKENIZER TEST SUMMARY",
+        "="*80,
+        f"Test Script:       {script_name}.py",
+        f"Timestamp:          {results.get('timestamp', 'N/A')}",
+        f"Tokenizer:          {results.get('tokenizer_path', 'N/A')}",
+        f"Chunk file:         {results.get('chunk_file', 'N/A')}",
+        "-"*80,
+        f"Lines processed:     {results['lines_processed']}",
+        f"Perfect matches:     {results['perfect_matches']} ({results['perfect_matches']/results['lines_processed']*100:.1f}%)",
+        f"Average tokens/line:  {avg_tokens_per_line:.2f}",
+        f"Total characters:    {total_chars:,}",
+        f"Total tokens:        {results['total_tokens']:,}",
+        f"Character accuracy:   {accuracy*100:.2f}%",
+        f"Character diff:      {total_diff_chars:,} chars ({diff_percentage:.4f}%)",
+        f"Chars per token:     {total_chars/results['total_tokens']:.2f} (lower is better)",
+        "\nSAMPLE LINES:",
+        "-"*40
+    ]
+    # Add sample lines
+    for i, line in enumerate(results.get('lines', [])[:3]):
+        summary.extend([
+            f"\nSAMPLE {i+1}:",
+            f"Original: {line.get('original', '')}",
+            f"Decoded:  {line.get('decoded', '')}",
+            f"Tokens:   {', '.join(line.get('tokens', [])[:8])}{'...' if len(line.get('tokens', [])) > 8 else ''}",
+            f"Match:    {'✓ Perfect' if line.get('is_perfect') else '✗ Different'}",
+            "-"*40
+        ])
+    # Print to console
+    print("\n".join(summary))
+    # Save as TXT with script name in filename
+    timestamp = results.get('timestamp', '')
+    output_file = output_path / f'{script_name}_result_{timestamp}.txt'
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write("\n".join(summary))
+    print(f"\nResults saved to: {output_file}")
+def main():
+    # Set up paths
+    tokenizer_path, data_dir, output_dir = setup_paths()
+    # Get the first chunk file
+    chunk_path = get_first_chunk_file(data_dir)
+    if not chunk_path:
+        print(f"No files found in {data_dir}. Please ensure the Dataset directory contains text files.")
+        return
+    print(f"Found data directory: {data_dir}")
+    print(f"Output directory: {output_dir}")
+    print(f"Testing tokenizer on first 1000 lines of: {chunk_path.name}")
+    # Load the tokenizer
+    print(f"Loading tokenizer from: {tokenizer_path}")
+    tokenizer = Tokenizer.from_file(str(tokenizer_path))
+    # Get vocabulary info
+    vocab = tokenizer.get_vocab()
+    print(f"Vocabulary size: {len(vocab):,} tokens")
+    # Test tokenizer on the chunk
+    print("\nTesting tokenizer on chunk...")
+    results = test_tokenizer_on_chunk(tokenizer, chunk_path, max_lines=1000)
+    # Add timestamp and tokenizer info to results
+    import time
+    results['timestamp'] = time.strftime("%Y%m%d_%H%M%S")
+    results['tokenizer_path'] = str(tokenizer_path)
+    results['chunk_file'] = str(chunk_path.name)
+    # Print and save summary
+    print_summary(results, output_dir)
+    print("\nTest complete!")
+if __name__ == "__main__":
+    main()

dist/ez_tokenizer-1.0.0-py3-none-any.whl ADDED Viewed

Binary file (17.8 kB). View file

dist/ez_tokenizer-1.0.0.tar.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ea6b4315e4faaa4641ac8d1c3103e0911fc8da8455b5310c8f27bac68332fca7
+size 26831

examples/README.md ADDED Viewed

	@@ -0,0 +1,83 @@

+# NexForge Tokenizer Examples
+This directory contains example scripts demonstrating advanced usage of the NexForge tokenizer.
+## Quick Start
+### Basic Tokenizer Creation
+```python
+from nexforgetokenizer import build_tokenizer
+# Create a tokenizer with default settings
+build_tokenizer(
+    input_dir="path/to/your/files",
+    output_path="custom_tokenizer.json",
+    vocab_size=40000,
+    min_frequency=2
+)
+```
+### Example Scripts
+1. **Basic Example** (`basic_usage.py`)
+   - Simple tokenizer creation and usage
+   - Basic encoding/decoding
+   - Vocabulary inspection
+2. **Advanced Usage** (`advanced_usage.py`)
+   - Custom special tokens
+   - Batch processing
+   - Performance optimization
+   - Error handling
+## Running Examples
+```bash
+# Install in development mode
+pip install -e .
+# Run basic example
+python examples/basic_usage.py
+# Run advanced example
+python examples/advanced_usage.py --input-dir ../Dataset --output my_tokenizer.json
+```
+## Example: Creating a Custom Tokenizer
+```python
+from nexforgetokenizer import build_tokenizer
+# Create a tokenizer with custom settings
+build_tokenizer(
+    input_dir="../Dataset",
+    output_path="my_tokenizer.json",
+    vocab_size=30000,      # Smaller vocabulary for specific domain
+    min_frequency=3,        # Only include tokens appearing at least 3 times
+    max_files=1000,         # Limit number of files to process
+    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
+)
+```
+## Best Practices
+1. **For General Use**
+   - Use default settings (40k vocab, min_freq=2)
+   - Process all files in your dataset
+   - Test with the built-in test suite
+2. **For Specialized Domains**
+   - Adjust vocabulary size based on domain complexity
+   - Consider increasing min_frequency for smaller vocabularies
+   - Test with domain-specific files
+## Need Help?
+- Check the [main README](../README.md) for basic usage
+- Review the test cases in `Test_tokenizer/`
+- Open an issue on GitHub for support
+## License
+MIT License - See [LICENSE](../LICENSE) for details.

examples/advanced_usage.py ADDED Viewed

	@@ -0,0 +1,207 @@

+"""
+Advanced usage example for NexForge Tokenizer Builder.
+This example demonstrates:
+- Custom special tokens
+- Batch processing with progress tracking
+- Vocabulary inspection and analysis
+- Error handling and recovery
+- Performance optimization
+"""
+import os
+import json
+import time
+from pathlib import Path
+from typing import Dict, List, Optional
+from tqdm import tqdm
+# Import the tokenizer components
+from nexforgetokenizer import (
+    build_tokenizer,
+    SystemResources,
+    log_memory_usage,
+    TokenizerError
+)
+def create_large_sample_dataset(num_files: int = 50, base_dir: str = "sample_data") -> Path:
+    """Create a larger sample dataset with different file types."""
+    base_path = Path(base_dir)
+    # Clean up if exists
+    if base_path.exists():
+        import shutil
+        shutil.rmtree(base_path)
+    # Create directories
+    base_path.mkdir(exist_ok=True)
+    # Create Python files
+    for i in range(num_files // 2):
+        module_content = f"""
+# Sample Python module {i}
+def process_data(data):
+    '''Process sample data.'''
+    result = []
+    for item in data:
+        if item % 2 == 0:
+            result.append(item * 2)
+    return result
+"""
+        (base_path / f"module_{i}.py").write_text(module_content)
+    # Create text files
+    for i in range(num_files // 2):
+        doc_content = f"""
+This is sample text document {i}.
+It contains multiple lines of text with various tokens.
+The quick brown fox jumps over the lazy dog.
+Special characters: !@#$%^&*()_+-=[]{{}}|;':\",./<>?
+"""
+        (base_path / f"document_{i}.txt").write_text(doc_content)
+    print(f"Created {num_files} sample files in {base_path}")
+    return base_path
+class DataProcessor:
+    """Example data processor class for demonstration."""
+    def __init__(self, config: dict):
+        self.config = config
+    def run(self):
+        """Run the processor with the current config."""
+        print(f"Processing with config: {self.config}")
+class TokenizerAnalyzer:
+    """Helper class for analyzing tokenizer performance and vocabulary."""
+    def __init__(self, tokenizer_path: str):
+        self.tokenizer_path = tokenizer_path
+        self.tokenizer = None
+        self.vocab = None
+    def load(self):
+        """Load the tokenizer."""
+        from tokenizers import Tokenizer
+        self.tokenizer = Tokenizer.from_file(self.tokenizer_path)
+        self.vocab = {
+            idx: self.tokenizer.id_to_token(idx)
+            for idx in range(self.tokenizer.get_vocab_size())
+        }
+    def analyze_vocab(self, top_n: int = 20):
+        """Analyze and print vocabulary statistics."""
+        if not self.tokenizer:
+            self.load()
+        vocab_size = len(self.vocab)
+        special_tokens = [
+            token for token in self.vocab.values()
+            if token.startswith("[") and token.endswith("]")
+        ]
+        print(f"\n=== Vocabulary Analysis ===")
+        print(f"Total vocabulary size: {vocab_size}")
+        print(f"Special tokens ({len(special_tokens)}): {', '.join(special_tokens[:10])}" +
+              ("..." if len(special_tokens) > 10 else ""))
+        # Show sample of vocabulary
+        print(f"\nSample vocabulary items:")
+        for idx in range(min(top_n, vocab_size)):
+            print(f"  {idx}: {self.vocab.get(idx, 'N/A')}")
+        if vocab_size > top_n:
+            print(f"  ... and {vocab_size - top_n} more")
+def main():
+    """Run the advanced example."""
+    print("NexForge Tokenizer Builder - Advanced Example")
+    print("=========================================\n")
+    # 1. Setup
+    output_dir = Path("advanced_output")
+    output_dir.mkdir(exist_ok=True)
+    tokenizer_path = output_dir / "advanced_tokenizer.json"
+    # 2. Check system resources
+    resources = SystemResources()
+    print(f"\n=== System Resources ===")
+    print(f"CPU Cores: {resources.cpu_cores}")
+    print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
+    if resources.has_cuda:
+        print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
+    else:
+        print("No CUDA GPU detected")
+    # 3. Create sample dataset
+    print("\n=== Creating Sample Dataset ===")
+    dataset_path = create_large_sample_dataset(num_files=50)
+    # 4. Custom special tokens
+    special_tokens = [
+        "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
+        "[PYTHON]", "[TEXT]", "[CODE]"
+    ]
+    # 5. Build the tokenizer with advanced options
+    print("\n=== Building Tokenizer ===")
+    print(f"Input directory: {dataset_path}")
+    print(f"Output path: {tokenizer_path}")
+    start_time = time.time()
+    try:
+        success = build_tokenizer(
+            input_dir=str(dataset_path),
+            output_path=str(tokenizer_path),
+            vocab_size=5000,  # Larger vocabulary for better coverage
+            min_frequency=2,   # Only include tokens that appear at least twice
+            special_tokens=special_tokens,
+            resources=resources,
+            max_files=50,      # Process all files
+            chunk_size=100000, # Process in 100KB chunks
+            n_threads=max(1, resources.cpu_cores - 1)  # Use all but one CPU core
+        )
+        if success:
+            duration = time.time() - start_time
+            print(f"\nTokenizer created successfully in {duration:.2f} seconds")
+            print(f"Tokenizer saved to: {tokenizer_path}")
+            # 6. Analyze the created tokenizer
+            print("\n=== Tokenizer Analysis ===")
+            analyzer = TokenizerAnalyzer(str(tokenizer_path))
+            analyzer.load()
+            analyzer.analyze_vocab()
+            # 7. Show example encoding/decoding
+            print("\n=== Example Encoding/Decoding ===")
+            sample_text = "def hello_world():\n    print('Hello, world!')  # Sample Python code"
+            encoded = analyzer.tokenizer.encode(sample_text)
+            decoded = analyzer.tokenizer.decode(encoded.ids)
+            print(f"Original: {sample_text}")
+            print(f"Encoded: {encoded.ids}")
+            print(f"Tokens: {encoded.tokens}")
+            print(f"Decoded: {decoded}")
+        else:
+            print("\nFailed to create tokenizer")
+    except TokenizerError as e:
+        print(f"\nError creating tokenizer: {e}")
+    except Exception as e:
+        print(f"\nUnexpected error: {e}")
+    finally:
+        # 8. Cleanup (optional)
+        # import shutil
+        # shutil.rmtree(dataset_path, ignore_errors=True)
+        pass
+    print("\nExample completed!")
+if __name__ == "__main__":
+    main()

examples/basic_usage.py ADDED Viewed

	@@ -0,0 +1,93 @@

+"""Basic usage example for NexForge Tokenizer Builder."""
+from pathlib import Path
+import os
+import tempfile
+from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage
+def create_sample_code():
+    """Create a sample directory with Python files for testing."""
+    # Create a sample directory with Python files
+    sample_dir = Path("sample_code")
+    # Clean up if it exists
+    if sample_dir.exists():
+        import shutil
+        shutil.rmtree(sample_dir)
+    # Create directory
+    sample_dir.mkdir(exist_ok=True)
+    # Create some sample Python files
+    (sample_dir / "hello.py").write_text("""
+def greet(name):
+    print(f"Hello, {name}!")
+if __name__ == "__main__":
+    greet("World")
+""")
+    (sample_dir / "math.py").write_text("""
+def add(a, b):
+    return a + b
+def multiply(a, b):
+    return a * b
+if __name__ == "__main__":
+    print(f"2 + 3 = {add(2, 3)}")
+    print(f"2 * 3 = {multiply(2, 3)}")
+""")
+    return sample_dir
+def main():
+    """Run the example."""
+    print("NexForge Tokenizer Builder Basic Example")
+    print("=======================================\n")
+    # Create sample code
+    sample_dir = create_sample_code()
+    print(f"Created sample code in: {sample_dir}")
+    # Check system resources
+    resources = SystemResources()
+    print(f"\nDetected System Resources:")
+    print(f"CPU Cores: {resources.cpu_cores}")
+    print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
+    if resources.has_cuda:
+        print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
+    else:
+        print("No CUDA GPU detected")
+    # Create output path for tokenizer
+    output_path = "sample_tokenizer.json"
+    # Check initial memory usage
+    print("\nInitial memory usage:")
+    log_memory_usage()
+    # Build the tokenizer
+    print("\nBuilding tokenizer...")
+    success = build_tokenizer(
+        input_dir=str(sample_dir),
+        output_path=output_path,
+        vocab_size=1000,  # Small vocabulary for this example
+        min_frequency=1,  # Include all tokens
+        resources=resources
+    )
+    # Check final memory usage
+    print("\nFinal memory usage:")
+    log_memory_usage()
+    if success:
+        print(f"\nTokenizer successfully created at: {output_path}")
+        print(f"You can now use this tokenizer with any library that supports the HuggingFace tokenizers format")
+    else:
+        print("\nFailed to create tokenizer")
+    print("\nExample completed!")
+if __name__ == "__main__":
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,81 @@

+[build-system]
+requires = ["setuptools>=42.0", "setuptools-scm>=3.4"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "ez-tokenizer"
+version = "1.0.0"
+description = "High-performance tokenizer builder for code and text datasets with adaptive resource management"
+readme = "README.md"
+requires-python = ">=3.8"
+license = {text = "MIT with Company Restriction"}
+authors = [
+    {name = "NexForge", email = "[email protected]"}
+]
+maintainers = [
+    {name = "NexForge", email = "[email protected]"}
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: Other/Proprietary License",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Topic :: Text Processing :: Linguistic"
+]
+dependencies = [
+    "torch>=1.9.0",
+    "tokenizers>=0.12.0",
+    "tqdm>=4.62.0",
+    "psutil>=5.9.0",
+    "python-dateutil>=2.8.2"
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=6.0",
+    "pytest-cov>=2.12.1",
+    "pytest-xdist>=2.4.0",
+    "black>=21.7b0",
+    "isort>=5.0.0",
+    "mypy>=0.910",
+    "pylint>=2.11.0",
+    "pre-commit>=2.15.0"
+]
+[tool.setuptools]
+include-package-data = true
+package-dir = { "" = "src" }
+[tool.setuptools.packages.find]
+where = ["src"]
+namespaces = true
+[tool.black]
+line-length = 88
+target-version = ['py38']
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+use_parentheses = true
+ensure_newline_before_comments = true
+[tool.mypy]
+ignore_missing_imports = true
+disallow_untyped_defs = true
+disallow_incomplete_defs = true
+check_untyped_defs = true
+no_implicit_optional = true
+warn_redundant_casts = true
+warn_unused_ignores = true
+warn_return_any = true
+warn_unreachable = true
+show_error_context = true

requirements-dev.txt ADDED Viewed

	@@ -0,0 +1,28 @@

+# Core development dependencies
+-r requirements.txt
+# Testing
+pytest>=6.0
+pytest-cov>=2.12.1
+pytest-xdist>=2.4.0
+# Code formatting
+black>=21.7b0
+isort>=5.0.0
+# Static type checking
+mypy>=0.910
+# Linting
+pylint>=2.11.0
+# Version control hooks
+pre-commit>=2.15.0
+# Optional: For documentation
+# sphinx>=4.0.0
+# sphinx-rtd-theme>=0.5.0
+# Optional: For notebook development
+# jupyter>=1.0.0
+# ipykernel>=6.0.0

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+# Core Dependencies
+torch>=1.9.0,<3.0.0  # PyTorch for tensor operations
+tokenizers>=0.12.0,<0.15.0  # HuggingFace tokenizers
+tqdm>=4.62.0,<5.0.0  # Progress bars
+psutil>=5.9.0,<6.0.0  # System monitoring
+python-dateutil>=2.8.2,<3.0.0  # Date/time utilities
+# Optional Dependencies (uncomment if needed)
+# numpy>=1.20.0,<2.0.0  # Required by some tokenizer components
+# pandas>=1.3.0,<3.0.0  # For data manipulation
+# scikit-learn>=1.0.0,<2.0.0  # For evaluation metrics
+# Version Pinning Examples (for production)
+# torch==2.0.1
+# tokenizers==0.13.3
+# tqdm==4.65.0
+# psutil==5.9.5
+# python-dateutil==2.8.2

run_ez_tokenizer.bat ADDED Viewed

	@@ -0,0 +1,286 @@

+@echo off
+:: Set up directory variables first
+set "SCRIPT_DIR=%~dp0"
+set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
+set "CURRENT_DIR=%CD%"
+cd /d "%SCRIPT_DIR%"
+:: EZ-Tokenizer Launcher with Banner
+:: This script must be run as administrator
+:: Previous versions were known as NexForge Tokenizer
+:: All functionality remains the same, only the name has been updated
+cls
+echo.
+echo =======================================================
+echo                EZ-TOKENIZER v1.0.0
+echo          (CodeGen-NF Model Pre-Release)
+echo =======================================================
+echo Script running from: %SCRIPT_DIR%
+:check_admin
+net session >nul 2>&1
+if %errorLevel% == 0 (
+    echo Running with administrator privileges...
+) else (
+    echo ###########################################################
+    echo #                                                         #
+    echo #  EZ-TOKENIZER REQUIRES ADMINISTRATOR PRIVILEGES        #
+    echo #  Please right-click and select 'Run as administrator'   #
+    echo #                                                         #
+    echo ###########################################################
+    echo.
+    echo Please right-click on this file and select "Run as administrator"
+    pause
+    exit /b
+)
+:menu
+cls
+:: Display banner
+echo   N   N  EEEEE  X   X  FFFFF  OOOOO  RRRR   GGGG  EEEEE
+echo   NN  N  E       X X   F      O   O  R   R  G      E
+echo   N N N  EEEE     X    FFFF   O   O  RRRR   G  GG  EEEE
+echo   N  NN  E       X X   F      O   O  R  R   G   G  E
+echo   N   N  EEEEE  X   X  F      OOOOO  R   R   GGGG  EEEEE
+echo.
+echo   PRESENTS:
+echo =======================================================
+echo                 EZ-TOKENIZER v1.0.0
+echo =======================================================
+:: Display current directory with error checking
+if defined SCRIPT_DIR (
+    echo Current directory: %~dp0
+    echo Script directory: %~dp0
+) else (
+    echo [WARNING] SCRIPT_DIR not defined. Using current directory: %CD%
+    set "SCRIPT_DIR=%CD%"
+)
+echo.
+echo MINIMUM REQUIREMENTS:
+echo - Python 3.8 or higher
+echo - 4GB RAM minimum (8GB+ recommended)
+echo - 1GB free disk space
+echo.
+echo DATASET INFORMATION:
+echo - Dataset location: %SCRIPT_DIR%\Dataset\
+echo - Please add your dataset files to the directory or use 4. Open Dataset Directory and insert your files.
+echo.
+echo MENU:
+echo 1. Install Dependencies
+echo 2. Create Tokenizer (50k vocab, min_freq=2)
+echo 3. Test Tokenizer (2 runs with 10,000 samples)
+echo 4. Open Dataset Directory
+echo 5. Exit
+echo.
+set /p choice=Enter your choice (1-5):
+echo.
+if "%choice%"=="1" goto install_deps
+if "%choice%"=="2" goto create_tokenizer
+if "%choice%"=="3" goto test_tokenizer
+if "%choice%"=="4" goto open_dataset
+if "%choice%"=="5" goto exit
+echo Invalid choice. Please enter a number between 1 and 5.
+pause
+goto menu
+:install_deps
+echo Installing dependencies...
+echo This may take a few minutes...
+echo.
+:: Create virtual environment if it doesn't exist
+if not exist "%SCRIPT_DIR%\venv" (
+    echo Creating virtual environment...
+    python -m venv "%SCRIPT_DIR%\venv"
+    if errorlevel 1 (
+        echo Failed to create virtual environment
+        pause
+        goto menu
+    )
+)
+:: Activate virtual environment and install dependencies
+call "%SCRIPT_DIR%\venv\Scripts\activate"
+:: Upgrade pip first
+echo [INFO] Upgrading pip...
+python -m pip install --upgrade pip
+if errorlevel 1 (
+    echo [ERROR] Failed to upgrade pip
+    pause
+    goto menu
+)
+:: Install PyTorch CPU version
+echo [INFO] Installing PyTorch CPU version...
+pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
+if errorlevel 1 (
+    echo [WARNING] Failed to install specific PyTorch version, trying latest compatible version...
+    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+    if errorlevel 1 (
+        echo [ERROR] Failed to install PyTorch
+        echo [INFO] Please check your internet connection and try again
+        pause
+        goto menu
+    )
+)
+:: Install other dependencies one by one
+echo [INFO] Installing additional dependencies...
+pip install tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
+if errorlevel 1 (
+    echo [WARNING] Failed to install some dependencies, trying with --no-cache-dir...
+    pip install --no-cache-dir tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
+    if errorlevel 1 (
+        echo [ERROR] Failed to install additional dependencies
+        pause
+        goto menu
+    )
+)
+:: Install tokenizers with pre-built wheel
+echo [INFO] Installing tokenizers...
+pip install tokenizers==0.21.1 --only-binary :all:
+if errorlevel 1 (
+    echo [WARNING] Could not install tokenizers with pre-built wheel
+    echo [INFO] Trying alternative installation method...
+    pip install tokenizers==0.21.1 --no-deps
+    if errorlevel 1 (
+        echo [ERROR] Failed to install tokenizers
+        echo Note: This package requires a C++ build toolchain or a pre-built wheel.
+        echo On Windows, you may need to install Visual Studio Build Tools with C++ workload.
+        pause
+        goto menu
+    )
+)
+echo.
+echo [INFO] All dependencies installed successfully!
+echo [INFO] Installing nexforgetokenizer in development mode...
+python -m pip install -e .
+if errorlevel 1 (
+    echo [ERROR] Failed to install nexforgetokenizer in development mode
+    pause
+    goto menu
+)
+echo [INFO] Package installation complete!
+pause
+goto menu
+:create_tokenizer
+if not exist "%SCRIPT_DIR%\venv" (
+    echo Virtual environment not found. Please install dependencies first.
+    pause
+    goto menu
+)
+call "%SCRIPT_DIR%\venv\Scripts\activate"
+:: Create output directory if it doesn't exist
+if not exist "%SCRIPT_DIR%\output" mkdir "%SCRIPT_DIR%\output"
+:: Check if dataset directory exists
+if not exist "%SCRIPT_DIR%\Dataset" (
+    echo Creating Dataset directory...
+    mkdir "%SCRIPT_DIR%\Dataset"
+    echo Please add your dataset files to: %SCRIPT_DIR%\Dataset
+    pause
+    start "" "%SCRIPT_DIR%\Dataset"
+    goto menu
+)
+:: Check if there are any files in the Dataset directory
+dir /b "%SCRIPT_DIR%\Dataset\*.*" >nul 2>&1
+if %ERRORLEVEL% NEQ 0 (
+    echo No files found in: %SCRIPT_DIR%\Dataset
+    echo Please add your dataset files to this directory.
+    pause
+    start "" "%SCRIPT_DIR%\Dataset"
+    goto menu
+)
+echo Creating EZ-Tokenizer with 50k vocabulary and min_freq=2 (all files)...
+python -m nexforgetokenizer.adaptive_tokenizer "%SCRIPT_DIR%\Dataset" "%SCRIPT_DIR%\output\tokenizer.json" 50000 2 MAX
+if errorlevel 1 (
+    echo Failed to create tokenizer
+    pause
+    goto menu
+)
+echo.
+echo EZ-Tokenizer created successfully at: %SCRIPT_DIR%\output\tokenizer.json
+echo Vocabulary size: 50,000
+echo Minimum frequency: 2
+echo Processed all available files in the dataset
+echo.
+echo You can now use this tokenizer in your projects by loading: output\tokenizer.json
+pause
+goto menu
+:test_tokenizer
+if not exist "%SCRIPT_DIR%\venv" (
+    echo Virtual environment not found. Please install dependencies first.
+    pause
+    goto menu
+)
+call "%SCRIPT_DIR%\venv\Scripts\activate"
+:: Create test_result directory if it doesn't exist
+if not exist "%SCRIPT_DIR%\test_result" mkdir "%SCRIPT_DIR%\test_result"
+:: Check if tokenizer exists
+if not exist "%SCRIPT_DIR%\output\tokenizer.json" (
+    echo EZ-Tokenizer not found. Please create a tokenizer first.
+    echo Looking for: %SCRIPT_DIR%\output\tokenizer.json
+    pause
+    goto menu
+)
+echo Running test with 10,000 samples...
+echo Testing EZ-Tokenizer with 10,000 samples...
+python "%SCRIPT_DIR%\Test_tokenizer\test_tokenizer.py" --tokenizer "%SCRIPT_DIR%\output\tokenizer.json" --input "%SCRIPT_DIR%\Dataset" --sample 10000 --output "%SCRIPT_DIR%\test_result\test_run.txt"
+if errorlevel 1 (
+    echo Test run failed
+    pause
+    goto menu
+)
+echo.
+echo Both test runs completed successfully!
+echo Results saved to: %SCRIPT_DIR%\test_result\
+:: Open the test results directory
+if exist "%SCRIPT_DIR%\test_result\" (
+    start "" "%SCRIPT_DIR%\test_result\"
+) else (
+    echo Warning: Test results directory not found.
+)
+pause
+goto menu
+:open_dataset
+if not exist "%SCRIPT_DIR%\Dataset" (
+    mkdir "%SCRIPT_DIR%\Dataset"
+)
+start "" "%SCRIPT_DIR%\Dataset"
+goto menu
+:exit
+cd /d "%CURRENT_DIR%"
+echo Exiting NexForge Tokenizer Manager...
+timeout /t 2 >nul
+exit

setup.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""Setup script for NexForge Tokenizer Builder."""
+from setuptools import setup, find_packages
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+setup(
+    name="nexforgetokenizer",
+    version="0.1.0",
+    author="NexForge Team",
+    description="High-performance tool for creating Python code tokenizers with adaptive resource management",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/nexforge/nexforgetokenizer",
+    package_dir={"": "src"},
+    packages=find_packages(where="src"),
+    python_requires=">=3.8",
+    install_requires=[
+        "torch>=1.9.0",
+        "tokenizers>=0.12.0",
+        "tqdm>=4.62.0",
+        "psutil>=5.9.0",
+        "numpy>=1.20.0",  # Optional but recommended for improved performance
+    ],
+    extras_require={
+        "dev": [
+            "pytest>=6.0",
+            "black>=21.7b0",
+            "isort>=5.0.0",
+            "mypy>=0.910",
+            "pylint>=2.11.0",
+        ],
+    },
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: Other/Proprietary License",
+        "Operating System :: OS Independent",
+        "Intended Audience :: Developers",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Topic :: Text Processing :: Linguistic",
+    ],
+)

src/ez_tokenizer.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,293 @@

+Metadata-Version: 2.4
+Name: ez-tokenizer
+Version: 1.0.0
+Summary: High-performance tokenizer builder for code and text datasets with adaptive resource management
+Home-page: https://github.com/nexforge/nexforgetokenizer
+Author: NexForge Team
+Author-email: NexForge <[email protected]>
+Maintainer-email: NexForge <[email protected]>
+License: MIT with Company Restriction
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: Other/Proprietary License
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=1.9.0
+Requires-Dist: tokenizers>=0.12.0
+Requires-Dist: tqdm>=4.62.0
+Requires-Dist: psutil>=5.9.0
+Requires-Dist: python-dateutil>=2.8.2
+Provides-Extra: dev
+Requires-Dist: pytest>=6.0; extra == "dev"
+Requires-Dist: pytest-cov>=2.12.1; extra == "dev"
+Requires-Dist: pytest-xdist>=2.4.0; extra == "dev"
+Requires-Dist: black>=21.7b0; extra == "dev"
+Requires-Dist: isort>=5.0.0; extra == "dev"
+Requires-Dist: mypy>=0.910; extra == "dev"
+Requires-Dist: pylint>=2.11.0; extra == "dev"
+Requires-Dist: pre-commit>=2.15.0; extra == "dev"
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python
+# EZ-Tokenizer
+A high-performance tool for creating custom tokenizers from your code or text datasets. Automatically adapts to your system resources while providing fine-grained control over tokenizer creation.
+> **Note**: This project was previously known as NexForge Tokenizer. All functionality remains the same, only the name has been updated to better reflect its ease of use and efficiency.
+## 📄 License
+EZ-Tokenizer is released under the MIT License with a company restriction clause. This means:
+- 🆓 **Free for everyone**: Individuals and small businesses can use EZ-Tokenizer for free
+- 🏢 **Commercial use**: Companies with more than 10 employees or $1M+ in annual revenue need a commercial license
+- 📝 **Full details**: See [LICENSE](LICENSE) for complete terms
+## Quick Start with Batch File (Recommended for Most Users)
+### Prerequisites
+- Windows OS
+- Python 3.8 or higher installed
+- Administrator privileges
+- At least 4GB RAM (8GB+ recommended)
+### Getting Started
+1. **Download** the latest release or clone this repository
+2. **Add your dataset**: Place training files in the `Dataset` directory
+   - Supported formats: `.txt`, `.py`, and other text files
+   - The system will process all compatible files in this directory
+3. **Run as Administrator**: Right-click on `run_ez_tokenizer.bat` and select "Run as administrator"
+4. **Follow the Menu**:
+   - Option 1: Install Dependencies (first time only)
+   - Option 2: Create Tokenizer (processes all files in Dataset directory)
+   - Option 3: Test Tokenizer (after creation)
+   - Option 4: Open Dataset Directory (to add/check files)
+   - Option 5: Exit
+### Default Tokenizer Settings
+- **Vocabulary Size**: 50,000 tokens
+- **Minimum Frequency**: 2 (includes tokens appearing at least twice)
+- **File Processing**: All files in Dataset directory
+- **Output**: `output/ez_tokenizer.json`
+- **Test Results**: `test_result/test_run.txt`
+### For Advanced Users
+Customize tokenizer creation by running manually:
+```bash
+python -m ez_tokenizer.adaptive_tokenizer [input_dir] [output_path] [vocab_size] [min_frequency] [max_files]
+```
+Example:
+```bash
+python -m ez_tokenizer.adaptive_tokenizer "Dataset" "output/custom_tokenizer.json" 50000 2 1000
+```
+---
+## Advanced Usage (Manual Setup)
+For users who need more control or are using non-Windows systems:
+## Features
+- **Adaptive Resource Management**: Automatically detects and utilizes available system resources (CPU, RAM, GPU)
+- **Progressive Processing**: Processes files in chunks to handle datasets larger than available memory
+- **Smart Batching**: Dynamically adjusts batch sizes based on available resources
+- **Efficient Memory Usage**: Implements memory conservation strategies for optimal performance
+- **High Performance**: Processes over 300,000 tokens per second on average hardware
+- **Perfect Reconstruction**: 100% accuracy in round-trip encoding/decoding
+- **Optimal Compression**: Achieves ~3.5 characters per token, exceeding industry standards
+- 🛠️ **Extensible**: Advanced users can customize all parameters
+- ✅ **Tested**: Built-in testing to verify tokenizer quality
+## Quick Start
+### Installation
+```bash
+# Install from source
+git clone https://github.com/yourusername/ez_tokenizer.git
+cd ez_tokenizer
+pip install -e .
+```
+### Basic Usage
+#### Command Line Interface
+```bash
+# Basic usage
+python -m ez_tokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json
+# With custom parameters
+python -m ez_tokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json 50000 2
+```
+## Complete Usage Guide
+### Command Line Arguments
+```bash
+python -m ez_tokenizer.adaptive_tokenizer <input_path> <output_path> [vocab_size] [min_frequency]
+```
+- **input_path**: Path to file or directory containing training data
+- **output_path**: Where to save the tokenizer (should end with .json)
+- **vocab_size** (optional, default=40000): Target vocabulary size
+- **min_frequency** (optional, default=2): Minimum token occurrence count
+### Python API
+```python
+from ez_tokenizer import build_tokenizer
+# Basic usage
+build_tokenizer(
+    input_dir="path/to/your/files",
+    output_path="output/tokenizer.json"
+)
+# Advanced usage
+build_tokenizer(
+    input_dir="path/to/your/files",
+    output_path="output/tokenizer.json",
+    vocab_size=50000,    # Larger vocabulary for specialized domains
+    min_frequency=2,     # Only include tokens appearing at least this many times
+    chunk_size=1000000,  # Characters to process at once
+    n_threads=4         # Number of threads to use
+)
+```
+## Best Practices
+### Recommended Settings
+#### For Most Users
+- **Vocabulary Size**: 40,000 (default)
+  - Balanced between coverage and performance
+  - Works well for most programming languages and natural language
+- **Minimum Frequency**: 2 (default)
+  - Includes tokens that appear at least twice
+  - Good balance between vocabulary size and token quality
+#### For Specialized Use Cases
+- **Larger Vocabularies (50k+)**
+  - Only needed for very diverse codebases
+  - Requires more system resources
+- **Higher Minimum Frequency**
+  - Use 3-5 for smaller vocabularies
+  - Reduces vocabulary size while maintaining quality
+#### Processing Large Datasets
+- The batch file automatically handles large datasets
+- Processes files in memory-efficient chunks
+- Can be interrupted and resumed if needed
+### Input Data
+- Supports `.txt`, `.py`, and other text-based formats
+- Handles both files and directories
+- Automatically filters binary files
+### Performance Tips
+- For large datasets (>1GB), use chunking
+- On multi-core systems, increase thread count
+- Monitor memory usage with large vocabularies
+## Testing Your Tokenizer
+After creating your tokenizer, use the built-in test function:
+1. From the batch menu, select "Test Tokenizer"
+2. The system will:
+   - Test with 10,000 random samples
+   - Measure tokenization speed (typically >300k tokens/sec)
+   - Verify 100% round-trip accuracy
+   - Generate a detailed performance report
+# Custom test with specific sample size
+python Test_tokenizer\test_tokenizer.py \
+    --tokenizer output/Nexforge_tokenizer.json \
+    --input Dataset \
+    --sample 20000 \
+    --output test_result/detailed_test.txt
+```
+### Test Output Includes
+- Tokenization success rate
+- Sample encoded/decoded text
+- Basic statistics (vocab size, special tokens)
+- Any encoding/decoding errors
+## Troubleshooting
+### Common Issues
+1. **Out of Memory**
+   - Reduce chunk size
+   - Close other memory-intensive applications
+   - Use a smaller vocabulary
+2. **Slow Processing**
+   - Increase thread count
+   - Process in smaller batches
+   - Check for system resource constraints
+3. **Vocabulary Too Large**
+   - Increase min_frequency
+   - Use a smaller vocab_size
+   - Pre-filter your dataset
+## Performance & Resource Usage
+The tokenizer is optimized to work efficiently across different hardware configurations:
+### System Requirements
+- **Minimum**: 4GB RAM, 2-core CPU
+- **Recommended**: 8GB+ RAM, 4+ core CPU
+- **Disk Space**: At least 1GB free (more for large datasets)
+### Expected Performance
+- **Memory Usage**: Typically stays under 2GB for most datasets
+- **CPU Utilization**: Deliberately capped to prevent system slowdown
+- **Processing Speed**: Varies by system, but generally processes:
+  - Small datasets (100MB): 1-5 minutes
+  - Medium datasets (1GB): 10-30 minutes
+  - Large datasets (10GB+): 1-3 hours
+### Monitoring
+- The batch file shows progress updates
+- Check Task Manager for real-time resource usage
+- Process can be safely interrupted (CTRL+C) and resumed
+## Examples
+See the `examples/` directory for:
+- Training on specific programming languages
+- Fine-tuning pre-trained tokenizers
+- Batch processing large datasets
+## Contributing
+Contributions are welcome! Here's how to get started:
+1. Fork the repository
+2. Create a new branch
+3. Make your changes
+4. Run tests: `pytest`
+5. Submit a pull request
+## License
+MIT License - see [LICENSE](LICENSE) for details.

src/ez_tokenizer.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+LICENSE
+MANIFEST.in
+README.md
+pyproject.toml
+requirements.txt
+setup.py
+examples/README.md
+examples/advanced_usage.py
+examples/basic_usage.py
+src/ez_tokenizer.egg-info/PKG-INFO
+src/ez_tokenizer.egg-info/SOURCES.txt
+src/ez_tokenizer.egg-info/dependency_links.txt
+src/ez_tokenizer.egg-info/requires.txt
+src/ez_tokenizer.egg-info/top_level.txt
+src/nexforgetokenizer/__init__.py
+src/nexforgetokenizer/adaptive_tokenizer.py
+src/nexforgetokenizer/resources.py
+src/nexforgetokenizer/data/__init__.py
+tests/test_adaptive_tokenizer.py

src/ez_tokenizer.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/ez_tokenizer.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+torch>=1.9.0
+tokenizers>=0.12.0
+tqdm>=4.62.0
+psutil>=5.9.0
+python-dateutil>=2.8.2
+[dev]
+pytest>=6.0
+pytest-cov>=2.12.1
+pytest-xdist>=2.4.0
+black>=21.7b0
+isort>=5.0.0
+mypy>=0.910
+pylint>=2.11.0
+pre-commit>=2.15.0

src/ez_tokenizer.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ nexforgetokenizer

src/nexforgetokenizer.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,286 @@

+Metadata-Version: 2.4
+Name: nexforgetokenizer
+Version: 0.2.0
+Summary: High-performance tokenizer builder for code and text datasets
+Home-page: https://github.com/nexforge/nexforgetokenizer
+Author: NexForge Team
+Author-email: Jean-Michel Talbot <[email protected]>
+Maintainer-email: NexForge Team <[email protected]>
+License: Proprietary
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: Other/Proprietary License
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=1.9.0
+Requires-Dist: tokenizers>=0.12.0
+Requires-Dist: tqdm>=4.62.0
+Requires-Dist: psutil>=5.9.0
+Requires-Dist: python-dateutil>=2.8.2
+Provides-Extra: dev
+Requires-Dist: pytest>=6.0; extra == "dev"
+Requires-Dist: pytest-cov>=2.12.1; extra == "dev"
+Requires-Dist: pytest-xdist>=2.4.0; extra == "dev"
+Requires-Dist: black>=21.7b0; extra == "dev"
+Requires-Dist: isort>=5.0.0; extra == "dev"
+Requires-Dist: mypy>=0.910; extra == "dev"
+Requires-Dist: pylint>=2.11.0; extra == "dev"
+Requires-Dist: pre-commit>=2.15.0; extra == "dev"
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python
+# NexForge Tokenizer Builder
+A high-performance tool for creating custom tokenizers from your code or text datasets. Automatically adapts to your system resources while providing fine-grained control over tokenizer creation.
+## Quick Start with Batch File (Recommended for Most Users)
+### Prerequisites
+- Windows OS
+- Python 3.8 or higher installed
+- Administrator privileges
+- At least 4GB RAM (8GB+ recommended)
+### Getting Started
+1. **Download** the latest release or clone this repository
+2. **Add your dataset**: Place training files in the `Dataset` directory
+   - Supported formats: `.txt`, `.py`, and other text files
+   - The system will process all compatible files in this directory
+3. **Run as Administrator**: Right-click on `run_nexforge.bat` and select "Run as administrator"
+4. **Follow the Menu**:
+   - Option 1: Install Dependencies (first time only)
+   - Option 2: Create Tokenizer (processes all files in Dataset directory)
+   - Option 3: Test Tokenizer (after creation)
+   - Option 4: Open Dataset Directory (to add/check files)
+   - Option 5: Exit
+### Default Tokenizer Settings
+- **Vocabulary Size**: 40,000 tokens
+- **Minimum Frequency**: 2 (includes tokens appearing at least twice)
+- **File Processing**: All files in Dataset directory
+- **Output**: `output/Nexforge_tokenizer.json`
+- **Test Results**: `test_result/test_run.txt`
+### For Advanced Users
+Customize tokenizer creation by running manually:
+```bash
+python -m nexforgetokenizer.adaptive_tokenizer [input_dir] [output_path] [vocab_size] [min_frequency] [max_files]
+```
+Example:
+```bash
+python -m nexforgetokenizer.adaptive_tokenizer "Dataset" "output/custom_tokenizer.json" 50000 2 1000
+```
+---
+## Advanced Usage (Manual Setup)
+For users who need more control or are using non-Windows systems:
+## Features
+- 🚀 **One-Click Setup**: Create optimized tokenizers with a single click
+- ⚡ **Resource Efficient**: Automatically adapts to your system's capabilities
+- 🧠 **Smart Defaults**: 40k vocabulary with min_freq=2 for optimal coverage
+- 🔄 **Batch Processing**: Process all files in your dataset directory
+- 📊 **Memory Safe**: Processes large datasets without memory issues
+- 🛠️ **Extensible**: Advanced users can customize all parameters
+- ✅ **Tested**: Built-in testing to verify tokenizer quality
+## Quick Start
+### Installation
+```bash
+# Install from source
+git clone https://github.com/yourusername/nexforgetokenizer.git
+cd nexforgetokenizer
+pip install -e .
+```
+### Basic Usage
+#### Command Line Interface
+```bash
+# Basic usage
+python -m nexforgetokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json
+# With custom parameters
+python -m nexforgetokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json 50000 2
+```
+## Complete Usage Guide
+### Command Line Arguments
+```bash
+python -m nexforgetokenizer.adaptive_tokenizer <input_path> <output_path> [vocab_size] [min_frequency]
+```
+- **input_path**: Path to file or directory containing training data
+- **output_path**: Where to save the tokenizer (should end with .json)
+- **vocab_size** (optional, default=40000): Target vocabulary size
+- **min_frequency** (optional, default=2): Minimum token occurrence count
+### Python API
+```python
+from nexforgetokenizer import build_tokenizer
+# Basic usage
+build_tokenizer(
+    input_dir="path/to/your/files",
+    output_path="output/tokenizer.json"
+)
+# Advanced usage
+build_tokenizer(
+    input_dir="path/to/your/files",
+    output_path="output/tokenizer.json",
+    vocab_size=50000,    # Larger vocabulary for specialized domains
+    min_frequency=2,     # Only include tokens appearing at least this many times
+    chunk_size=1000000,  # Characters to process at once
+    n_threads=4         # Number of threads to use
+)
+```
+## Best Practices
+### Recommended Settings
+#### For Most Users
+- **Vocabulary Size**: 40,000 (default)
+  - Balanced between coverage and performance
+  - Works well for most programming languages and natural language
+- **Minimum Frequency**: 2 (default)
+  - Includes tokens that appear at least twice
+  - Good balance between vocabulary size and token quality
+#### For Specialized Use Cases
+- **Larger Vocabularies (50k+)**
+  - Only needed for very diverse codebases
+  - Requires more system resources
+- **Higher Minimum Frequency**
+  - Use 3-5 for smaller vocabularies
+  - Reduces vocabulary size while maintaining quality
+#### Processing Large Datasets
+- The batch file automatically handles large datasets
+- Processes files in memory-efficient chunks
+- Can be interrupted and resumed if needed
+### Input Data
+- Supports `.txt`, `.py`, and other text-based formats
+- Handles both files and directories
+- Automatically filters binary files
+### Performance Tips
+- For large datasets (>1GB), use chunking
+- On multi-core systems, increase thread count
+- Monitor memory usage with large vocabularies
+## Testing Your Tokenizer
+After creating your tokenizer, use the built-in test function:
+1. From the batch menu, select "Test Tokenizer"
+2. The system will:
+   - Test with 10,000 random samples
+   - Generate a test report in `test_result/test_run.txt`
+   - Show basic statistics about the tokenizer
+For advanced testing, run manually:
+```bash
+# Basic test with default settings
+python Test_tokenizer\test_tokenizer.py --tokenizer output/Nexforge_tokenizer.json
+# Custom test with specific sample size
+python Test_tokenizer\test_tokenizer.py \
+    --tokenizer output/Nexforge_tokenizer.json \
+    --input Dataset \
+    --sample 20000 \
+    --output test_result/detailed_test.txt
+```
+### Test Output Includes
+- Tokenization success rate
+- Sample encoded/decoded text
+- Basic statistics (vocab size, special tokens)
+- Any encoding/decoding errors
+## Troubleshooting
+### Common Issues
+1. **Out of Memory**
+   - Reduce chunk size
+   - Close other memory-intensive applications
+   - Use a smaller vocabulary
+2. **Slow Processing**
+   - Increase thread count
+   - Process in smaller batches
+   - Check for system resource constraints
+3. **Vocabulary Too Large**
+   - Increase min_frequency
+   - Use a smaller vocab_size
+   - Pre-filter your dataset
+## Performance & Resource Usage
+The tokenizer is optimized to work efficiently across different hardware configurations:
+### System Requirements
+- **Minimum**: 4GB RAM, 2-core CPU
+- **Recommended**: 8GB+ RAM, 4+ core CPU
+- **Disk Space**: At least 1GB free (more for large datasets)
+### Expected Performance
+- **Memory Usage**: Typically stays under 2GB for most datasets
+- **CPU Utilization**: Deliberately capped to prevent system slowdown
+- **Processing Speed**: Varies by system, but generally processes:
+  - Small datasets (100MB): 1-5 minutes
+  - Medium datasets (1GB): 10-30 minutes
+  - Large datasets (10GB+): 1-3 hours
+### Monitoring
+- The batch file shows progress updates
+- Check Task Manager for real-time resource usage
+- Process can be safely interrupted (CTRL+C) and resumed
+## Examples
+See the `examples/` directory for:
+- Training on specific programming languages
+- Fine-tuning pre-trained tokenizers
+- Batch processing large datasets
+## Contributing
+Contributions are welcome! Here's how to get started:
+1. Fork the repository
+2. Create a new branch
+3. Make your changes
+4. Run tests: `pytest`
+5. Submit a pull request
+## License
+MIT License - see [LICENSE](LICENSE) for details.

src/nexforgetokenizer.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+LICENSE
+MANIFEST.in
+README.md
+pyproject.toml
+requirements.txt
+setup.py
+examples/README.md
+examples/advanced_usage.py
+examples/basic_usage.py
+src/nexforgetokenizer/__init__.py
+src/nexforgetokenizer/adaptive_tokenizer.py
+src/nexforgetokenizer/resources.py
+src/nexforgetokenizer.egg-info/PKG-INFO
+src/nexforgetokenizer.egg-info/SOURCES.txt
+src/nexforgetokenizer.egg-info/dependency_links.txt
+src/nexforgetokenizer.egg-info/requires.txt
+src/nexforgetokenizer.egg-info/top_level.txt
+src/nexforgetokenizer/data/__init__.py
+tests/test_adaptive_tokenizer.py

src/nexforgetokenizer.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/nexforgetokenizer.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+torch>=1.9.0
+tokenizers>=0.12.0
+tqdm>=4.62.0
+psutil>=5.9.0
+python-dateutil>=2.8.2
+[dev]
+pytest>=6.0
+pytest-cov>=2.12.1
+pytest-xdist>=2.4.0
+black>=21.7b0
+isort>=5.0.0
+mypy>=0.910
+pylint>=2.11.0
+pre-commit>=2.15.0

src/nexforgetokenizer.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ nexforgetokenizer

src/nexforgetokenizer/__init__.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""EZ-Tokenizer - High-performance Python code tokenizer with adaptive resource management.
+Features:
+- Efficient tokenization of code and text
+- Adaptive resource management
+- Support for large datasets
+- Custom vocabulary generation
+"""
+__version__ = "1.0.0"
+__author__ = "EZ-Tokenizer Team"
+__all__ = [
+    "SystemResources",
+    "log_memory_usage",
+    "manage_ram",
+    "build_tokenizer"
+]
+# Lazy imports to prevent circular imports
+def __getattr__(name):
+    if name == 'SystemResources':
+        from .resources import SystemResources
+        return SystemResources
+    elif name in ('log_memory_usage', 'manage_ram', 'build_tokenizer'):
+        from .adaptive_tokenizer import log_memory_usage, manage_ram, build_tokenizer
+        if name == 'log_memory_usage':
+            return log_memory_usage
+        elif name == 'manage_ram':
+            return manage_ram
+        elif name == 'build_tokenizer':
+            return build_tokenizer
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

src/nexforgetokenizer/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (1.1 kB). View file

src/nexforgetokenizer/__pycache__/adaptive_tokenizer.cpython-313.pyc ADDED Viewed

Binary file (31.4 kB). View file

src/nexforgetokenizer/__pycache__/resources.cpython-313.pyc ADDED Viewed

Binary file (6.54 kB). View file

src/nexforgetokenizer/adaptive_tokenizer.py ADDED Viewed

	@@ -0,0 +1,705 @@

+"""EZ-Tokenizer: Adaptive tokenizer creation for Python code with hardware optimization.
+This script creates a high-performance ByteLevel BPE tokenizer specifically optimized for code,
+with automatic adaptation to available system resources (RAM, CPU, GPU). It efficiently scales
+from low-end systems (2 cores, 4GB RAM) to high-end workstations while maintaining perfect
+reconstruction accuracy and high throughput.
+Key Features:
+- 100% reconstruction accuracy
+- ~3.5 characters per token (exceeding industry standards)
+- Adaptive resource management
+- Memory-efficient processing of large datasets
+- Support for mixed code and text content
+"""
+import os
+import time
+import glob
+import logging
+import sys
+import gc
+import traceback
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor
+import psutil
+from typing import Dict, List, Optional, Tuple, Union, Any, NamedTuple
+# Try to use CUDA if available
+import torch
+# Local imports
+from .resources import SystemResources
+# Third-party tokenizer dependencies
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.decoders import ByteLevel as ByteLevelDecoder
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('tokenizer.log')
+    ]
+)
+# SystemResources class moved to resources.py to fix circular import warning
+def log_memory_usage():
+    """Log current RAM and GPU memory usage."""
+    process = psutil.Process()
+    ram_usage = process.memory_info().rss / (1024 * 1024 * 1024)  # GB
+    ram_percent = psutil.virtual_memory().percent
+    available_ram = psutil.virtual_memory().available / (1024 * 1024 * 1024)  # GB
+    total_ram = psutil.virtual_memory().total / (1024 * 1024 * 1024)  # GB
+    logging.info(f"RAM: {ram_usage:.2f} GB used, {available_ram:.2f} GB available ({ram_percent}% used of {total_ram:.1f} GB total)")
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            allocated = torch.cuda.memory_allocated(i) / (1024 * 1024 * 1024)  # GB
+            cached = torch.cuda.memory_reserved(i) / (1024 * 1024 * 1024)  # GB
+            logging.info(f"CUDA Device {i}: {allocated:.2f} GB allocated, {cached:.2f} GB cached")
+def manage_ram(aggressive: bool = False):
+    """Perform RAM-specific memory management and garbage collection.
+    Args:
+        aggressive: If True, performs more thorough memory cleanup operations
+    """
+    # Record memory before cleanup
+    before_ram = psutil.virtual_memory().percent
+    before_process = psutil.Process().memory_info().rss / (1024 * 1024 * 1024)  # GB
+    # Run standard garbage collection first
+    gc.collect()
+    if aggressive:
+        # Force the most thorough collection possible
+        for _ in range(2):  # Multiple passes
+            for i in range(3):  # All generations 0, 1, 2
+                gc.collect(i)
+        # More aggressive memory management for critical situations
+        try:
+            # Clear any traceback objects which can hold references
+            traceback.clear_frames(sys.exc_info()[2])
+            # Emergency measures for severe memory pressure
+            import builtins
+            for name in list(builtins.__dict__.keys()):
+                if name.startswith('__') and name.endswith('__'):
+                    continue  # Skip special builtins
+                if not isinstance(builtins.__dict__[name], type):
+                    continue  # Skip non-types
+                # Clear type caches which can hold memory
+                if hasattr(builtins.__dict__[name], '__dict__') and '__cache__' in builtins.__dict__[name].__dict__:
+                    builtins.__dict__[name].__dict__['__cache__'].clear()
+            # Force a compaction of freed memory back to the system
+            gc.collect()
+            # On Windows, explicitly request memory compaction from OS
+            if sys.platform.startswith('win'):
+                try:
+                    import ctypes
+                    ctypes.windll.kernel32.SetProcessWorkingSetSize(-1, -1)
+                except Exception as e:
+                    logging.debug(f"Failed to compact Windows memory: {e}")
+        except Exception as e:
+            logging.warning(f"Error during aggressive memory cleanup: {e}")
+    # Calculate and log memory freed
+    after_ram = psutil.virtual_memory().percent
+    after_process = psutil.Process().memory_info().rss / (1024 * 1024 * 1024)  # GB
+    freed_gb = before_process - after_process
+    if freed_gb > 0.01:  # If we freed a noticeable amount
+        logging.info(f"Memory cleaned: {freed_gb:.2f} GB freed, RAM usage {before_ram}% → {after_ram}%")
+    # Return True if we successfully freed memory
+    return freed_gb > 0
+def cleanup_cuda(force: bool = False):
+    """Perform CUDA memory cleanup with garbage collection."""
+    # Run RAM cleanup first
+    manage_ram(aggressive=force)
+    # Then handle CUDA if available
+    if not torch.cuda.is_available():
+        return
+    try:
+        # Clear CUDA cache
+        torch.cuda.empty_cache()
+        if force:
+            # Force synchronize CUDA
+            torch.cuda.synchronize()
+            # On aggressive cleanup, try to clear everything
+            for i in range(torch.cuda.device_count()):
+                torch.cuda.synchronize(i)
+    except Exception as e:
+        logging.warning(f"Error during CUDA cleanup: {e}")
+def process_file(file_path):
+    """Process a single file to extract its content."""
+    try:
+        # Get file size for logging
+        file_size = os.path.getsize(file_path)
+        logging.info(f"Processing file: {os.path.basename(file_path)} (Size: {file_size} bytes)")
+        # Read file content
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+            content = f.read()
+        if not content:
+            logging.warning(f"File {file_path} is empty")
+        else:
+            logging.info(f"Successfully read {len(content)} characters from {os.path.basename(file_path)}")
+        return content, file_size, True
+    except Exception as e:
+        logging.error(f"Error processing file {file_path}: {e}", exc_info=True)
+        return "", 0, False
+def write_texts_to_disk(texts, file_path, max_chars_per_text=5000):
+    """Write text data to disk to free up memory.
+    Args:
+        texts (list): List of text entries to save
+        file_path (str): Path to save the data
+        max_chars_per_text (int): Maximum characters to save per text entry
+    Returns:
+        bool: True if successful, False otherwise
+    """
+    try:
+        with open(file_path, 'w', encoding='utf-8', errors='replace') as f:
+            for text in texts:
+                # Limit each text to prevent huge files
+                f.write(text[:max_chars_per_text] + '\n---END_ENTRY---\n')
+        return True
+    except Exception as e:
+        logging.error(f"Error writing texts to disk: {e}")
+        return False
+def read_texts_from_disk(file_path):
+    """Read text data from disk file.
+    Args:
+        file_path (str): Path to read data from
+    Returns:
+        list: List of text entries read from file
+    """
+    try:
+        texts = []
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+            current_text = ""
+            for line in f:
+                if line.strip() == "---END_ENTRY---":
+                    texts.append(current_text)
+                    current_text = ""
+                else:
+                    current_text += line
+            if current_text:  # Add the last entry if file doesn't end with marker
+                texts.append(current_text)
+        return texts
+    except Exception as e:
+        logging.error(f"Error reading texts from disk: {e}")
+        return []
+def build_tokenizer(input_dir, output_path, vocab_size=40000, min_frequency=2, max_files=None, resources=None, temp_dir=None):
+    """Build a tokenizer directly from Python code files with adaptive resource management.
+    This function automatically adapts to the available system resources, scaling its
+    processing based on available RAM, CPU cores, and GPU capabilities. It implements
+    extreme memory conservation strategies to prevent OOM crashes.
+    Features:
+    - Progressive file loading (smallest files first)
+    - Memory monitoring with emergency intervention
+    - Disk offloading for memory pressure relief
+    - Dynamic chunk sizing with retry mechanisms
+    - Text truncation for oversized entries
+    Args:
+        input_dir (str): Directory containing Python code files (*.txt)
+        output_path (str): Path where to save the tokenizer JSON file
+        vocab_size (int, optional): Size of vocabulary to generate. Defaults to 40000.
+        min_frequency (int, optional): Minimum frequency threshold for tokens. Defaults to 2.
+        max_files (int, optional): Maximum number of files to process. If None, determined automatically.
+        resources (SystemResources, optional): Pre-detected system resources. If None, resources
+            will be automatically detected.
+    Returns:
+        bool: True if tokenizer was successfully created and saved, False otherwise
+    """
+    start_time = time.time()
+    # Detect system resources if not provided
+    if resources is None:
+        resources = SystemResources()
+    try:
+        # Monitor system resources
+        log_memory_usage()  # Initial memory benchmark
+        # Get all text files in directory
+        if os.path.isfile(input_dir):
+            # If input is a single file, use it directly
+            files = [input_dir]
+            logging.info(f"Processing single file: {input_dir}")
+        else:
+            # If input is a directory, get all .txt files
+            files = glob.glob(os.path.join(input_dir, "*.txt"))
+            logging.info(f"Found {len(files)} files in {input_dir}")
+        if not files:
+            logging.error(f"No files found in {input_dir}")
+            return False
+        # Sort files by size (smallest first) to allow progressive loading
+        try:
+            files = sorted(files, key=lambda f: os.path.getsize(f))
+            logging.info("Files sorted by size (processing smallest files first)")
+        except Exception as e:
+            logging.warning(f"Unable to sort files by size: {e}")
+        # Adaptive file processing based on available memory
+        process = psutil.Process()
+        # Analyze a few sample files to get a better estimate of average file size
+        sample_count = min(10, len(files))
+        if sample_count > 0:
+            sample_sizes = []
+            for i in range(sample_count):
+                try:
+                    file_size = os.path.getsize(files[i]) / (1024 * 1024)  # MB
+                    sample_sizes.append(file_size)
+                except Exception:
+                    pass
+            avg_file_size_estimate = 5  # Default fallback value in MB
+            if sample_sizes:
+                avg_file_size_estimate = sum(sample_sizes) / len(sample_sizes)
+                logging.info(f"Average file size based on {len(sample_sizes)} samples: {avg_file_size_estimate:.2f} MB")
+        else:
+            avg_file_size_estimate = 5  # MB per file (default estimate)
+        # Calculate safe file count based on resources
+        # Use a portion of available RAM, determined by our resources multiplier
+        safe_file_count = min(
+            len(files),
+            int(resources.available_ram_gb * 1024 / avg_file_size_estimate * resources.max_files_multiplier)
+        )
+        # EXTREME MEMORY CONSERVATION: Much more conservative file limits
+        # Even for high-RAM systems, we'll process fewer files at once after OOM testing
+        if resources.total_ram_gb >= 32:  # Even for very high RAM systems
+            max_files_multiplier = 0.3  # 1/3 of previous value
+        elif resources.total_ram_gb >= 16:
+            max_files_multiplier = 0.2  # Less than half of previous value
+        else:
+            max_files_multiplier = 0.1  # Very conservative for lower RAM
+        max_files_cap = max(3, int(resources.total_ram_gb * max_files_multiplier))
+        safe_file_count = min(safe_file_count, max_files_cap)
+        # Set an absolute maximum number of files regardless of RAM if max_files not specified
+        default_max_files = 10  # Default hard limit to prevent OOM
+        # Apply user-specified max_files if provided, otherwise use calculated safe limit
+        if max_files is not None:
+            if max_files == float('inf'):
+                logging.info("Processing ALL files in dataset (MAX mode)")
+                safe_file_count = len(files)  # Use all available files
+            else:
+                logging.info(f"User specified max_files: {max_files}")
+                safe_file_count = min(len(files), max_files)
+        else:
+            safe_file_count = min(safe_file_count, default_max_files)
+        # Ensure we process at least one file
+        safe_file_count = max(1, safe_file_count)
+        logging.info(f"Processing up to {safe_file_count} files based on available memory of {resources.available_ram_gb:.2f} GB")
+        # Use subset of files to match our determined safe count
+        files = files[:safe_file_count]
+        all_texts = []
+        total_chars = 0
+        # Use smaller batches for initial processing to gauge memory impact
+        initial_batch_size = max(1, resources.batch_size // 2)
+        logging.info(f"Starting with conservative batch size of {initial_batch_size}")
+        # Create batches with adaptive batch size - start with smaller batches
+        batch_size = initial_batch_size
+        batches = [files[i:i+batch_size] for i in range(0, len(files), batch_size)]
+        for batch_idx, batch in enumerate(batches):
+            batch_texts = []
+            # Use optimized worker count
+            with ProcessPoolExecutor(max_workers=resources.max_workers) as executor:
+                results = list(executor.map(process_file, batch))
+            for content, size, success in results:
+                if success and content:
+                    # MEMORY PROTECTION: Limit the size of any individual text entry
+                    # This prevents single massive files from causing OOM
+                    if len(content) > resources.max_text_chunk_size:
+                        logging.warning(f"Truncating oversized text: {len(content)} chars -> {resources.max_text_chunk_size} chars")
+                        content = content[:resources.max_text_chunk_size]
+                    batch_texts.append(content)
+                    total_chars += len(content)
+            logging.info(f"Batch {batch_idx+1}/{len(batches)}: Processed {len(batch)} files - {total_chars:,} total characters")
+            all_texts.extend(batch_texts)
+            # EMERGENCY MEMORY CHECK: Verify we haven't exceeded critical thresholds
+            available_ram_gb = psutil.virtual_memory().available / (1024 * 1024 * 1024)
+            ram_usage = process.memory_info().rss / (1024 * 1024 * 1024)  # in GB
+            ram_percent = psutil.virtual_memory().percent
+            logging.info(f"RAM usage after batch {batch_idx+1}: {ram_usage:.2f} GB ({ram_percent}%)")
+            # EXTREME MEMORY PROTECTION: Emergency intervention if available RAM drops below reserve
+            if available_ram_gb < resources.emergency_reserve_gb:
+                logging.critical(f"EMERGENCY: Available RAM ({available_ram_gb:.2f} GB) below reserve threshold ({resources.emergency_reserve_gb:.2f} GB)")
+                logging.critical("Taking emergency measures to prevent system crash")
+                # Save what we have and proceed with drastically reduced processing
+                emergency_path = os.path.join(temp_dir, f"emergency_tokenizer_data_{int(time.time())}.txt")
+                write_texts_to_disk(all_texts, emergency_path)
+                logging.critical(f"Emergency data saved to {emergency_path}")
+                # Keep only 10% of data or 5 entries, whichever is smaller
+                emergency_keep = min(max(5, len(all_texts) // 10), 20)
+                logging.critical(f"Reducing dataset from {len(all_texts)} entries to {emergency_keep} entries")
+                all_texts = all_texts[:emergency_keep]
+                # Force memory cleanup
+                manage_ram(aggressive=True)
+                cleanup_cuda(force=True)
+                # Stop processing more files
+                break
+            # Always use disk offloading if enabled
+            disk_offload_frequency = 1  # Every batch
+            # Write intermediate results to disk to reduce memory pressure
+            # Do this more aggressively to prevent OOM crashes
+            if resources.use_disk_offload and batch_idx > 0 and batch_idx % disk_offload_frequency == 0:
+                temp_file_path = os.path.join(temp_dir, f"temp_tokenizer_data_{batch_idx}.txt")
+                logging.info(f"Writing intermediate batch results to {temp_file_path}")
+                # Calculate how many entries to offload based on current memory pressure
+                current_ram_percent = psutil.virtual_memory().percent
+                # More aggressive offloading at higher memory pressure
+                if current_ram_percent > 70:
+                    offload_percentage = 0.8  # Offload 80% of data if memory pressure high
+                elif current_ram_percent > 50:
+                    offload_percentage = 0.6  # Offload 60% if moderate pressure
+                else:
+                    offload_percentage = 0.4  # Offload 40% if low pressure
+                entries_to_save = max(1, int(len(all_texts) * offload_percentage))
+                entries_to_save = min(entries_to_save, len(all_texts) - 1)  # Keep at least 1 entry
+                # Write data to disk
+                if write_texts_to_disk(all_texts[:entries_to_save], temp_file_path):
+                    # Remove what we wrote from memory
+                    logging.info(f"Offloaded {entries_to_save} entries ({offload_percentage*100:.0f}%) to disk, {len(all_texts)-entries_to_save} remain in memory")
+                    all_texts = all_texts[entries_to_save:]
+                    # Force RAM cleanup after file write
+                    manage_ram(aggressive=True)
+                    cleanup_cuda(force=True)
+            # Check against adaptive memory thresholds
+            if ram_usage > resources.ram_usage_warning:
+                logging.warning(f"RAM usage high ({ram_usage:.2f} GB), running RAM-focused cleanup")
+                manage_ram()
+                # If still high after cleanup, take more aggressive measures
+                ram_usage = process.memory_info().rss / (1024 * 1024 * 1024)
+                if ram_usage > resources.ram_usage_critical:
+                    logging.warning(f"RAM usage critical ({ram_usage:.2f} GB), performing emergency cleanup")
+                    # Force Python to release memory
+                    batch_texts.clear()
+                    manage_ram(aggressive=True)
+                    # Adaptive batch reduction - if we're processing too many files, reduce remaining batches
+                    if len(batches) - batch_idx > 3:
+                        # For low RAM systems, be more aggressive in reduction
+                        remaining_batch_count = 3 if resources.total_ram_gb >= 8 else 2
+                        logging.warning(f"Reducing remaining batches from {len(batches) - batch_idx} to {remaining_batch_count}")
+                        batches = batches[:batch_idx+remaining_batch_count]
+        if not all_texts:
+            logging.error("No content found in files")
+            return False
+        logging.info(f"Successfully loaded {len(all_texts)} text entries with {total_chars:,} characters")
+        # Python keywords and common tokens to ensure they're in the vocabulary
+        python_tokens = [
+            'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except', 'import',
+            'from', 'as', 'with', 'return', 'yield', 'break', 'continue', 'pass', 'raise',
+            'True', 'False', 'None', 'self', 'and', 'or', 'not', 'is', 'in', 'lambda',
+            # Common Python library imports
+            'import numpy as np', 'import pandas as pd', 'import torch', 'import tensorflow as tf',
+            # Function signatures
+            'def __init__(self):', 'def forward(self, x):',
+        ]
+        # Initialize tokenizer - using BPE model which works well for code
+        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
+        tokenizer.decoder = ByteLevelDecoder()
+        # Special tokens for Python code
+        special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<s>", "</s>", "<pad>", "<unk>", "<mask>"]
+        # Configure trainer with larger vocabulary for code
+        trainer = BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            show_progress=True,
+            initial_alphabet=list("abcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()_+-=[]{}|;:'\",./<>?`~ "),
+            # Add Python keywords as initial tokens
+            initial_tokens=python_tokens
+        )
+        # Train tokenizer in smaller chunks to save memory
+        logging.info(f"Training tokenizer on {len(all_texts):,} texts (target vocab: {vocab_size:,})")
+        # Split texts into smaller chunks for training - chunk size adapted to resources
+        # EXTREME MEMORY CONSERVATION: Start with tiny chunk sizes
+        # Start with just 1 item for the first iteration to gauge memory impact
+        initial_chunk_size = 1  # Start with just 1 item
+        max_chunk_size = max(1, resources.training_chunk_size // 2)  # Half the normal max
+        # Track memory failures to adapt
+        memory_failures = 0
+        current_chunk_size = initial_chunk_size
+        # Process in smaller chunks first
+        for i in range(0, len(all_texts), current_chunk_size):
+            try:
+                # Emergency memory check before processing
+                current_ram_percent = psutil.virtual_memory().percent
+                if current_ram_percent > 85:  # Critical threshold
+                    logging.warning(f"Memory usage critical before training: {current_ram_percent}%")
+                    current_chunk_size = max(1, current_chunk_size // 2)  # Reduce chunk size
+                    logging.info(f"Reducing chunk size to {current_chunk_size} due to memory pressure")
+                    manage_ram(aggressive=True)
+                    cleanup_cuda(force=True)
+                # Get the chunk to process
+                end_idx = min(i + current_chunk_size, len(all_texts))
+                chunk = all_texts[i:end_idx]
+                # Log progress
+                chunks_total = (len(all_texts) + current_chunk_size - 1) // current_chunk_size
+                current_chunk = i // current_chunk_size + 1
+                logging.info(f"Training on chunk {current_chunk}/{chunks_total} with size {len(chunk)}")
+                # Train on this chunk
+                tokenizer.train_from_iterator(
+                    chunk,
+                    trainer=trainer,
+                    length=len(chunk)
+                )
+                # Clean up memory between chunks
+                del chunk
+                manage_ram(aggressive=True)
+                cleanup_cuda(force=True)
+                # If successful and we're still using a reduced chunk size, try increasing it
+                if current_chunk_size < max_chunk_size and memory_failures == 0 and current_chunk > 3:
+                    new_size = min(max_chunk_size, current_chunk_size * 2)
+                    logging.info(f"Increasing chunk size from {current_chunk_size} to {new_size}")
+                    current_chunk_size = new_size
+            except Exception as e:
+                if "memory" in str(e).lower() or "allocation" in str(e).lower():
+                    memory_failures += 1
+                    logging.error(f"Memory error during training: {e}")
+                    # Reduce chunk size and retry
+                    old_size = current_chunk_size
+                    current_chunk_size = max(1, current_chunk_size // 2)
+                    logging.warning(f"Reducing chunk size from {old_size} to {current_chunk_size} and retrying")
+                    # Force cleanup
+                    manage_ram(aggressive=True)
+                    cleanup_cuda(force=True)
+                    # Back up a bit to retry with smaller chunk
+                    i = max(0, i - current_chunk_size)
+                    continue
+                else:
+                    # Non-memory error, re-raise
+                    raise
+        # Ensure output directory exists
+        output_dir = os.path.dirname(output_path) or '.'
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        # Save tokenizer
+        tokenizer.save(output_path)
+        final_vocab_size = len(tokenizer.get_vocab())
+        elapsed = time.time() - start_time
+        logging.info(f"Tokenizer created with {final_vocab_size:,} tokens in {elapsed:.1f} seconds")
+        logging.info(f"Saved to: {output_path}")
+        return True
+    except Exception as e:
+        logging.error(f"Error training tokenizer: {e}")
+        logging.error(traceback.format_exc())
+        # Adaptive retry strategy for memory errors
+        if "memory" in str(e).lower() or "allocation" in str(e).lower():
+            logging.warning("Memory error detected, implementing adaptive sampling strategy...")
+            # Clear as much memory as possible
+            cleanup_cuda(True)
+            # Try progressively smaller samples until success or giving up
+            try:
+                # For very low memory systems, use even smaller sample
+                sample_size = 5 if resources.total_ram_gb < 8 else 10
+                all_texts_backup = all_texts[:sample_size]  # Keep a small sample
+                del all_texts
+                gc.collect()
+                # Release all other large objects and force collection
+                cleanup_cuda(True)
+                logging.info(f"Trying with a smaller sample size: {sample_size} texts")
+                tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+                tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
+                tokenizer.decoder = ByteLevelDecoder()
+                tokenizer.train_from_iterator(all_texts_backup, trainer=trainer)
+                tokenizer.save(output_path)
+                final_vocab_size = len(tokenizer.get_vocab())
+                elapsed = time.time() - start_time
+                logging.info(f"Tokenizer created with {final_vocab_size:,} tokens in {elapsed:.1f} seconds")
+                logging.info(f"Saved to: {output_path}")
+                return True
+            except Exception as e2:
+                logging.error(f"Retry failed: {e2}")
+                return False
+        return False
+if __name__ == "__main__":
+    # Main entry point with command-line argument handling
+    logging.info("Starting EZ-Tokenizer creation script")
+    logging.info(f"EZ-Tokenizer v1.0.0 - Optimized for performance and accuracy")
+    logging.info("Copyright (c) 2025 EZ-Tokenizer Team. All rights reserved.")
+    if len(sys.argv) < 3:
+        print("Usage: python adaptive_tokenizer.py <input_dir> <output_path> [vocab_size] [min_frequency] [max_files]")
+        print("  max_files: Optional maximum number of files to process (default: auto-determined)")
+        print("           Use 'MAX' to process all files in the directory")
+        sys.exit(1)
+    input_dir = sys.argv[1]
+    output_path = sys.argv[2]
+    vocab_size = int(sys.argv[3]) if len(sys.argv) > 3 else 40000
+    min_frequency = int(sys.argv[4]) if len(sys.argv) > 4 else 2
+    # Handle max_files parameter with special 'MAX' keyword
+    max_files = None
+    if len(sys.argv) > 5:
+        if sys.argv[5].upper() == 'MAX':
+            max_files = float('inf')  # Effectively no limit
+            logging.info("MAX keyword detected - will process all available files")
+        else:
+            try:
+                max_files = int(sys.argv[5])
+            except ValueError:
+                logging.warning(f"Invalid max_files value: {sys.argv[5]} - using auto determination")
+                max_files = None
+    # Detect system resources automatically
+    resources = SystemResources()
+    logging.info("Starting tokenizer creation with the following parameters:")
+    logging.info(f"Configuration:")
+    logging.info(f"  Input directory: {input_dir}")
+    logging.info(f"  Output path: {output_path}")
+    logging.info(f"  Vocabulary size: {vocab_size}")
+    logging.info(f"  Minimum frequency: {min_frequency}")
+    if max_files == float('inf'):
+        logging.info(f"  Maximum files: MAX (all files)")
+    else:
+        logging.info(f"  Maximum files: {max_files if max_files is not None else 'auto'}")
+    # Create a temp directory for offloaded data
+    import tempfile
+    import atexit
+    import shutil
+    # Create a temporary directory that will be automatically cleaned up
+    temp_dir = tempfile.mkdtemp(prefix='nexforge_tokenizer_')
+    logging.info(f"Created temporary directory for data offloading: {temp_dir}")
+    # Register cleanup function to remove the temp directory on exit
+    def cleanup_temp():
+        try:
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir, ignore_errors=True)
+                logging.info(f"Cleaned up temporary directory: {temp_dir}")
+        except Exception as e:
+            logging.warning(f"Error cleaning up temporary directory: {e}")
+    atexit.register(cleanup_temp)
+    # Initial memory check
+    log_memory_usage()
+    # Pass the temp_dir to the build_tokenizer function
+    success = build_tokenizer(
+        input_dir=input_dir,
+        output_path=output_path,
+        vocab_size=vocab_size,
+        min_frequency=min_frequency,
+        max_files=max_files,
+        resources=resources,
+        temp_dir=temp_dir  # Pass temp_dir to the function
+    )
+    # Cleanup is now handled by the atexit handler
+    logging.info("Temporary files will be cleaned up on exit")
+    # Final status
+    if success:
+        logging.info("Tokenizer creation completed successfully")
+        sys.exit(0)
+    else:
+        logging.error("Tokenizer creation failed")
+        sys.exit(1)

src/nexforgetokenizer/data/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""Data handling for NexForge Tokenizer."""
+import os
+from pathlib import Path
+from typing import Optional
+def get_data_path() -> Path:
+    """Get the path to the package data directory."""
+    return Path(__file__).parent
+def get_sample_data_path() -> Optional[Path]:
+    """Get the path to the sample Python code file."""
+    data_path = get_data_path() / "python_code_sample.txt"
+    return data_path if data_path.exists() else None
+def load_sample_data() -> Optional[str]:
+    """Load and return the sample Python code as a string."""
+    sample_path = get_sample_data_path()
+    if sample_path is None:
+        return None
+    return sample_path.read_text(encoding='utf-8')

src/nexforgetokenizer/resources.py ADDED Viewed

	@@ -0,0 +1,120 @@

+"""System resource detection and management for adaptive processing."""
+import os
+import psutil
+import torch
+import logging
+from typing import Optional, Dict, Any
+class SystemResources:
+    """Detect and manage system resources for adaptive processing.
+    This class provides a unified interface to system resource detection,
+    handling CPU, RAM, and GPU capabilities. It calculates appropriate
+    thresholds and settings based on the detected hardware configuration.
+    It implements extreme memory conservation strategies to prevent OOM crashes
+    even on large datasets or limited hardware.
+    """
+    def __init__(self):
+        # CPU detection
+        self.cpu_cores = os.cpu_count() or 1
+        self.cpu_threads = self.cpu_cores
+        # Try to get physical cores vs logical cores
+        try:
+            self.cpu_physical_cores = psutil.cpu_count(logical=False) or self.cpu_cores
+        except:
+            self.cpu_physical_cores = self.cpu_cores
+        # RAM detection
+        self.total_ram_gb = psutil.virtual_memory().total / (1024 ** 3)
+        self.available_ram_gb = psutil.virtual_memory().available / (1024 ** 3)
+        # GPU detection
+        self.has_cuda = torch.cuda.is_available()
+        self.cuda_device = None
+        self.cuda_mem_gb = 0
+        if self.has_cuda:
+            try:
+                torch.cuda.empty_cache()
+                self.cuda_device = torch.cuda.get_device_name(0)
+                self.cuda_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
+            except Exception as e:
+                logging.warning(f"Error detecting CUDA properties: {e}")
+                self.has_cuda = False
+        # Calculate resource-based thresholds
+        self._calculate_thresholds()
+        # Log detected resources
+        self._log_resources()
+    def _calculate_thresholds(self):
+        """Calculate adaptive thresholds based on detected system resources."""
+        # Memory thresholds - scaled to available RAM with extreme caution
+        # For all systems, use much more conservative thresholds after OOM testing
+        # Calculate absolute available RAM for emergency protection
+        self.emergency_reserve_gb = max(2.0, self.total_ram_gb * 0.2)  # At least 2GB or 20% reserved
+        if self.total_ram_gb < 8:  # Low RAM (<8GB)
+            self.ram_usage_warning = self.total_ram_gb * 0.45  # 45% of RAM
+            self.ram_usage_critical = self.total_ram_gb * 0.60  # 60% of RAM
+            self.max_files_multiplier = 0.03  # Extremely conservative
+            self.use_disk_offload = True  # Always use disk offloading
+        elif self.total_ram_gb < 16:  # Medium RAM (8-16GB)
+            self.ram_usage_warning = self.total_ram_gb * 0.55  # 55% of RAM
+            self.ram_usage_critical = self.total_ram_gb * 0.70  # 70% of RAM
+            self.max_files_multiplier = 0.05
+            self.use_disk_offload = True  # Always use disk offloading
+        else:  # High RAM (>16GB)
+            self.ram_usage_warning = self.total_ram_gb * 0.60  # 60% of RAM (down from 75%)
+            self.ram_usage_critical = self.total_ram_gb * 0.75  # 75% of RAM (down from 90%)
+            self.max_files_multiplier = 0.1  # Halved from previous 0.2
+            self.use_disk_offload = True  # Use disk offloading even on high-RAM systems
+        # Maximum text chunk size in memory (characters)
+        # This helps prevent individual large chunks from causing OOM
+        self.max_text_chunk_size = min(10_000_000, int(self.total_ram_gb * 1_000_000))
+        # CPU-based settings
+        # For worker count, use physical cores (or half of logical cores if physical detection failed)
+        self.max_workers = max(1, min(self.cpu_physical_cores, 4))  # At most 4 workers
+        # Batch size based on available cores
+        if self.cpu_cores <= 2:
+            self.batch_size = 2
+        elif self.cpu_cores <= 4:
+            self.batch_size = 4
+        else:
+            self.batch_size = min(5, self.cpu_cores // 2)
+        # Training chunk size - how many texts to process in one training iteration
+        if self.total_ram_gb < 8:
+            self.training_chunk_size = 3
+        elif self.total_ram_gb < 16:
+            self.training_chunk_size = 5
+        else:
+            self.training_chunk_size = 10
+    def _log_resources(self):
+        """Log detected system resources and calculated thresholds."""
+        logging.info("===== System Resources =====")
+        logging.info(f"CPU: {self.cpu_cores} cores ({self.cpu_physical_cores} physical)")
+        logging.info(f"RAM: {self.total_ram_gb:.1f} GB total, {self.available_ram_gb:.1f} GB available")
+        if self.has_cuda:
+            logging.info(f"GPU: {self.cuda_device} with {self.cuda_mem_gb:.1f} GB memory")
+        else:
+            logging.info("GPU: Not available")
+        logging.info("===== Adaptive Settings =====")
+        logging.info(f"RAM Warning Threshold: {self.ram_usage_warning:.1f} GB")
+        logging.info(f"RAM Critical Threshold: {self.ram_usage_critical:.1f} GB")
+        logging.info(f"Max Workers: {self.max_workers}")
+        logging.info(f"Batch Size: {self.batch_size}")
+        logging.info(f"Training Chunk Size: {self.training_chunk_size}")
+        logging.info(f"Max Files Multiplier: {self.max_files_multiplier:.2f}")

tests/test_adaptive_tokenizer.py ADDED Viewed

	@@ -0,0 +1,176 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Simple test script for the NexForge Adaptive Tokenizer.
+This script demonstrates the basic usage of the adaptive tokenizer
+by creating a small sample Python file and building a tokenizer from it.
+"""
+import os
+import sys
+import logging
+from pathlib import Path
+import tempfile
+from tokenizers import Tokenizer
+# Add the parent directory to the path so we can import the package
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('tokenizer_test.log')
+    ]
+)
+# Sample Python code for testing
+SAMPLE_CODE = """
+# Comprehensive Python code test for tokenizer
+def factorial(n):
+    \"\"\"Calculate factorial of n.\"\"\"
+    if n <= 1:
+        return 1
+    return n * factorial(n - 1)
+class TestClass:
+    def __init__(self, value):
+        self.value = value
+    def process(self):
+        \"\"\"Process the value and return result.\"\"\"
+        return self.value * 2
+def main():
+    # Test various Python constructs
+    numbers = [1, 2, 3, 4, 5]
+    squares = [x**2 for x in numbers]
+    # Test string formatting
+    name = "NexForge"
+    version = 1.0
+    # Test control flow
+    if version > 0.5:
+        print(f"{name} v{version} is stable!")
+    else:
+        print(f"{name} v{version} is in development")
+    # Test function calls
+    result = factorial(5)
+    print(f"5! = {result}")
+    # Test class usage
+    test = TestClass(21)
+    print(f"Processed value: {test.process()}")
+    return 0
+if __name__ == "__main__":
+    exit(main())
+"""
+def create_test_file(directory):
+    """Create a test Python file in the specified directory."""
+    os.makedirs(directory, exist_ok=True)
+    test_file = os.path.join(directory, 'test_code.py')
+    with open(test_file, 'w', encoding='utf-8') as f:
+        f.write(SAMPLE_CODE)
+    return test_file
+def test_tokenizer():
+    """Test the adaptive tokenizer on a sample Python file."""
+    # Create a temporary directory for our test output
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Use the existing sample data
+        sample_data_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
+                                    'src', 'nexforgetokenizer', 'data', 'python_code_sample.txt')
+        print(f"Using sample data file: {sample_data_path}")
+        # Verify the sample file exists
+        if not os.path.exists(sample_data_path):
+            print(f"ERROR: Sample data file not found at {sample_data_path}")
+            return False
+        print(f"Sample file size: {os.path.getsize(sample_data_path)} bytes")
+        # Directory containing the sample file
+        data_dir = os.path.dirname(sample_data_path)
+        print(f"Data directory: {data_dir}")
+        # Output path for the tokenizer
+        output_path = os.path.join(temp_dir, 'test_tokenizer.json')
+        # Log initial memory usage
+        print("\nInitial memory usage:")
+        log_memory_usage()
+        # Detect system resources
+        resources = SystemResources()
+        print(f"\nDetected system resources:")
+        print(f"CPU Cores: {resources.cpu_cores}")
+        print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
+        if resources.has_cuda:
+            print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
+        else:
+            print("No CUDA GPU detected")
+        # Build the tokenizer using the existing sample data directory
+        print("\nBuilding tokenizer...")
+        success = build_tokenizer(
+            input_dir=data_dir,
+            output_path=output_path,
+            vocab_size=1000,  # Small vocabulary for quick testing
+            min_frequency=1,  # Include all tokens for this test
+            resources=resources
+        )
+        if success:
+            print(f"\nTokenizer successfully created at: {output_path}")
+            # Load the tokenizer and test it
+            tokenizer = Tokenizer.from_file(output_path)
+            vocab_size = len(tokenizer.get_vocab())
+            print(f"Vocabulary size: {vocab_size}")
+            # Test tokenization
+            encoded = tokenizer.encode(SAMPLE_CODE)
+            print(f"\nTokenized sample code:")
+            print(f"Number of tokens: {len(encoded.ids)}")
+            print(f"Average chars per token: {len(SAMPLE_CODE) / len(encoded.ids):.2f}")
+            # Log final memory usage
+            print("\nFinal memory usage:")
+            log_memory_usage()
+            return True
+        else:
+            print("Failed to create tokenizer")
+            return False
+def main():
+    """Main function to run the test."""
+    print("NexForge Adaptive Tokenizer Test")
+    print("==============================\n")
+    result = test_tokenizer()
+    if result:
+        print("\nTest completed successfully!")
+        return 0
+    else:
+        print("\nTest failed!")
+        return 1
+if __name__ == "__main__":
+    sys.exit(main())