Upload 38 files
Browse filesFull Standalone create Tokenizer app. (100% Python code tested)
- .gitattributes +1 -0
- .gitignore +198 -0
- EZ-Tokenizer.exe +3 -0
- INSTALL.md +84 -0
- LICENSE +24 -0
- MANIFEST.in +20 -0
- README.md +276 -70
- Test_tokenizer/README.md +190 -0
- Test_tokenizer/__pycache__/test_tokenizer.cpython-313.pyc +0 -0
- Test_tokenizer/test_tokenizer.py +606 -0
- Test_tokenizer/test_tokenizer_simple.py +209 -0
- dist/ez_tokenizer-1.0.0-py3-none-any.whl +0 -0
- dist/ez_tokenizer-1.0.0.tar.gz +3 -0
- examples/README.md +83 -0
- examples/advanced_usage.py +207 -0
- examples/basic_usage.py +93 -0
- pyproject.toml +81 -0
- requirements-dev.txt +28 -0
- requirements.txt +18 -0
- run_ez_tokenizer.bat +286 -0
- setup.py +43 -0
- src/ez_tokenizer.egg-info/PKG-INFO +293 -0
- src/ez_tokenizer.egg-info/SOURCES.txt +19 -0
- src/ez_tokenizer.egg-info/dependency_links.txt +1 -0
- src/ez_tokenizer.egg-info/requires.txt +15 -0
- src/ez_tokenizer.egg-info/top_level.txt +1 -0
- src/nexforgetokenizer.egg-info/PKG-INFO +286 -0
- src/nexforgetokenizer.egg-info/SOURCES.txt +19 -0
- src/nexforgetokenizer.egg-info/dependency_links.txt +1 -0
- src/nexforgetokenizer.egg-info/requires.txt +15 -0
- src/nexforgetokenizer.egg-info/top_level.txt +1 -0
- src/nexforgetokenizer/__init__.py +33 -0
- src/nexforgetokenizer/__pycache__/__init__.cpython-313.pyc +0 -0
- src/nexforgetokenizer/__pycache__/adaptive_tokenizer.cpython-313.pyc +0 -0
- src/nexforgetokenizer/__pycache__/resources.cpython-313.pyc +0 -0
- src/nexforgetokenizer/adaptive_tokenizer.py +705 -0
- src/nexforgetokenizer/data/__init__.py +20 -0
- src/nexforgetokenizer/resources.py +120 -0
- tests/test_adaptive_tokenizer.py +176 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
EZ-Tokenizer.exe filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Project-specific
|
| 2 |
+
test_result/ # Test output files
|
| 3 |
+
output/ # Tokenizer output files
|
| 4 |
+
*.log # Log files
|
| 5 |
+
|
| 6 |
+
# Dataset directories (large files should not be in version control)
|
| 7 |
+
Dataset/
|
| 8 |
+
*.jsonl
|
| 9 |
+
*.csv
|
| 10 |
+
*.parquet
|
| 11 |
+
|
| 12 |
+
# Byte-compiled / optimized / DLL files
|
| 13 |
+
__pycache__/
|
| 14 |
+
*.py[cod]
|
| 15 |
+
*$py.class
|
| 16 |
+
|
| 17 |
+
# C extensions
|
| 18 |
+
*.so
|
| 19 |
+
|
| 20 |
+
# Distribution / packaging
|
| 21 |
+
.Python
|
| 22 |
+
build/
|
| 23 |
+
develop-eggs/
|
| 24 |
+
dist/
|
| 25 |
+
downloads/
|
| 26 |
+
eggs/
|
| 27 |
+
.eggs/
|
| 28 |
+
lib/
|
| 29 |
+
lib64/
|
| 30 |
+
parts/
|
| 31 |
+
sdist/
|
| 32 |
+
var/
|
| 33 |
+
wheels/
|
| 34 |
+
share/python-wheels/
|
| 35 |
+
*.egg-info/
|
| 36 |
+
.installed.cfg
|
| 37 |
+
*.egg
|
| 38 |
+
MANIFEST
|
| 39 |
+
|
| 40 |
+
# PyInstaller
|
| 41 |
+
*.manifest
|
| 42 |
+
*.spec
|
| 43 |
+
|
| 44 |
+
# Installer logs
|
| 45 |
+
pip-log.txt
|
| 46 |
+
pip-delete-this-directory.txt
|
| 47 |
+
|
| 48 |
+
# Unit test / coverage reports
|
| 49 |
+
htmlcov/
|
| 50 |
+
.tox/
|
| 51 |
+
.nox/
|
| 52 |
+
.coverage
|
| 53 |
+
.coverage.*
|
| 54 |
+
.cache
|
| 55 |
+
nosetests.xml
|
| 56 |
+
coverage.xml
|
| 57 |
+
*.cover
|
| 58 |
+
*.py,cover
|
| 59 |
+
.hypothesis/
|
| 60 |
+
.pytest_cache/
|
| 61 |
+
|
| 62 |
+
# IDE specific files
|
| 63 |
+
.vscode/
|
| 64 |
+
.idea/
|
| 65 |
+
*.swp
|
| 66 |
+
*.swo
|
| 67 |
+
*~
|
| 68 |
+
|
| 69 |
+
# Environment files
|
| 70 |
+
.env
|
| 71 |
+
.venv
|
| 72 |
+
env/
|
| 73 |
+
venv/
|
| 74 |
+
|
| 75 |
+
# Jupyter Notebook checkpoints
|
| 76 |
+
.ipynb_checkpoints/
|
| 77 |
+
|
| 78 |
+
# OS generated files
|
| 79 |
+
.DS_Store
|
| 80 |
+
.DS_Store?
|
| 81 |
+
._*
|
| 82 |
+
.Spotlight-V100
|
| 83 |
+
.Trashes
|
| 84 |
+
ehthumbs.db
|
| 85 |
+
Thumbs.db
|
| 86 |
+
cover/
|
| 87 |
+
|
| 88 |
+
# Translations
|
| 89 |
+
*.mo
|
| 90 |
+
*.pot
|
| 91 |
+
|
| 92 |
+
# Django stuff:
|
| 93 |
+
*.log
|
| 94 |
+
local_settings.py
|
| 95 |
+
db.sqlite3
|
| 96 |
+
db.sqlite3-journal
|
| 97 |
+
|
| 98 |
+
# Flask stuff:
|
| 99 |
+
instance/
|
| 100 |
+
.webassets-cache
|
| 101 |
+
|
| 102 |
+
# Scrapy stuff:
|
| 103 |
+
.scrapy
|
| 104 |
+
|
| 105 |
+
# Sphinx documentation
|
| 106 |
+
docs/_build/
|
| 107 |
+
|
| 108 |
+
# PyBuilder
|
| 109 |
+
.pybuilder/
|
| 110 |
+
target/
|
| 111 |
+
|
| 112 |
+
# Jupyter Notebook
|
| 113 |
+
.ipynb_checkpoints
|
| 114 |
+
|
| 115 |
+
# IPython
|
| 116 |
+
profile_default/
|
| 117 |
+
ipython_config.py
|
| 118 |
+
|
| 119 |
+
# pyenv
|
| 120 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 121 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 122 |
+
# .python-version
|
| 123 |
+
|
| 124 |
+
# pipenv
|
| 125 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 126 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 127 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 128 |
+
# install all needed dependencies.
|
| 129 |
+
#Pipfile.lock
|
| 130 |
+
|
| 131 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
| 132 |
+
__pypackages__/
|
| 133 |
+
|
| 134 |
+
# Celery stuff
|
| 135 |
+
celerybeat-schedule
|
| 136 |
+
celerybeat.pid
|
| 137 |
+
|
| 138 |
+
# SageMath parsed files
|
| 139 |
+
*.sage.py
|
| 140 |
+
|
| 141 |
+
# Environments
|
| 142 |
+
.env
|
| 143 |
+
.venv
|
| 144 |
+
env/
|
| 145 |
+
venv/
|
| 146 |
+
ENV/
|
| 147 |
+
env.bak/
|
| 148 |
+
venv.bak/
|
| 149 |
+
|
| 150 |
+
# Spyder project settings
|
| 151 |
+
.spyderproject
|
| 152 |
+
.spyproject
|
| 153 |
+
|
| 154 |
+
# Rope project settings
|
| 155 |
+
.ropeproject
|
| 156 |
+
|
| 157 |
+
# mkdocs documentation
|
| 158 |
+
/site
|
| 159 |
+
|
| 160 |
+
# mypy
|
| 161 |
+
.mypy_cache/
|
| 162 |
+
.dmypy.json
|
| 163 |
+
dmypy.json
|
| 164 |
+
|
| 165 |
+
# Pyre type checker
|
| 166 |
+
.pyre/
|
| 167 |
+
|
| 168 |
+
# pytype static type analyzer
|
| 169 |
+
.pytype/
|
| 170 |
+
|
| 171 |
+
# Cython debug symbols
|
| 172 |
+
cython_debug/
|
| 173 |
+
|
| 174 |
+
# VS Code
|
| 175 |
+
.vscode/
|
| 176 |
+
|
| 177 |
+
# PyCharm
|
| 178 |
+
.idea/
|
| 179 |
+
|
| 180 |
+
# Logs
|
| 181 |
+
*.log
|
| 182 |
+
|
| 183 |
+
# Tokenizer outputs
|
| 184 |
+
*.json
|
| 185 |
+
|
| 186 |
+
# Sample data
|
| 187 |
+
sample_code/
|
| 188 |
+
sample_data/
|
| 189 |
+
|
| 190 |
+
# Local development
|
| 191 |
+
.env.local
|
| 192 |
+
.env.development.local
|
| 193 |
+
.env.test.local
|
| 194 |
+
.env.production.local
|
| 195 |
+
|
| 196 |
+
# Misc
|
| 197 |
+
.DS_Store
|
| 198 |
+
Thumbs.db
|
EZ-Tokenizer.exe
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ef5c148f2e613895c247151df4f8b1db9e374dfbcc17cbe7174157902c40452
|
| 3 |
+
size 316199
|
INSTALL.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NexForge Tokenizer Builder - Installation Guide
|
| 2 |
+
|
| 3 |
+
## Package Information
|
| 4 |
+
|
| 5 |
+
The NexForge Tokenizer Builder package (`nexforgetokenizer`) provides a high-performance tool for creating Python code tokenizers with adaptive resource management. The package automatically adapts to available system resources, making it suitable for a wide range of hardware configurations.
|
| 6 |
+
|
| 7 |
+
## Installation Options
|
| 8 |
+
|
| 9 |
+
The package is distributed as both a wheel file and a source distribution. Choose the installation method that works best for your environment.
|
| 10 |
+
|
| 11 |
+
### Option 1: Direct Installation from Wheel (Recommended)
|
| 12 |
+
|
| 13 |
+
Copy the `.whl` file to your target system and run:
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
pip install nexforgetokenizer-0.1.0-py3-none-any.whl
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
### Option 2: Installation from Source Distribution
|
| 20 |
+
|
| 21 |
+
Copy the `.tar.gz` file to your target system and run:
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
pip install nexforgetokenizer-0.1.0.tar.gz
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
### Option 3: Development Installation
|
| 28 |
+
|
| 29 |
+
If you want to modify the code while using it:
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
git clone <repository-url>
|
| 33 |
+
cd nexforgetokenizer
|
| 34 |
+
pip install -e .
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## Dependencies
|
| 38 |
+
|
| 39 |
+
The package will automatically install the following dependencies:
|
| 40 |
+
|
| 41 |
+
- torch>=1.9.0
|
| 42 |
+
- tokenizers>=0.12.0
|
| 43 |
+
- tqdm>=4.62.0
|
| 44 |
+
- psutil>=5.9.0
|
| 45 |
+
- numpy>=1.20.0 (recommended for improved performance)
|
| 46 |
+
|
| 47 |
+
## Verifying Installation
|
| 48 |
+
|
| 49 |
+
After installation, you can verify that the package is working correctly by running:
|
| 50 |
+
|
| 51 |
+
```python
|
| 52 |
+
from nexforgetokenizer import SystemResources
|
| 53 |
+
|
| 54 |
+
# This should print information about your system resources
|
| 55 |
+
resources = SystemResources()
|
| 56 |
+
print(f"CPU Cores: {resources.cpu_cores}")
|
| 57 |
+
print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Running Examples
|
| 61 |
+
|
| 62 |
+
The package includes example scripts that demonstrate its functionality:
|
| 63 |
+
|
| 64 |
+
```bash
|
| 65 |
+
# Run the basic usage example
|
| 66 |
+
python -m examples.basic_usage
|
| 67 |
+
|
| 68 |
+
# Run the comprehensive test example
|
| 69 |
+
python -m examples.test_adaptive_tokenizer
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
## Note on Online Availability
|
| 73 |
+
|
| 74 |
+
This package is currently not published on PyPI. It is distributed directly as wheel and source files for installation.
|
| 75 |
+
|
| 76 |
+
## System Requirements
|
| 77 |
+
|
| 78 |
+
- Python 3.8 or higher
|
| 79 |
+
- Minimum 4GB RAM (8GB+ recommended for larger datasets)
|
| 80 |
+
- CUDA-compatible GPU (optional, for acceleration)
|
| 81 |
+
|
| 82 |
+
## Getting Help
|
| 83 |
+
|
| 84 |
+
If you encounter any issues during installation or usage, please report them to the development team.
|
LICENSE
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License with Company Restriction
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 NexForge ([email protected])
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
1. The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
2. Companies with more than 10 employees or annual revenue exceeding $1 million
|
| 16 |
+
must obtain a commercial license from the copyright holder.
|
| 17 |
+
|
| 18 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 19 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 20 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 21 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 22 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 23 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 24 |
+
SOFTWARE.
|
MANIFEST.in
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Include package data files
|
| 2 |
+
recursive-include src/nexforgetokenizer *.py *.json *.md *.txt
|
| 3 |
+
|
| 4 |
+
# Include documentation
|
| 5 |
+
include README.md
|
| 6 |
+
include LICENSE
|
| 7 |
+
include requirements.txt
|
| 8 |
+
include pyproject.toml
|
| 9 |
+
|
| 10 |
+
# Include examples
|
| 11 |
+
recursive-include examples *.*
|
| 12 |
+
|
| 13 |
+
# Include tests
|
| 14 |
+
recursive-include tests *.py
|
| 15 |
+
|
| 16 |
+
# Exclude cache and temporary files
|
| 17 |
+
global-exclude *.py[cod] __pycache__ *.so
|
| 18 |
+
|
| 19 |
+
# Include any VERSION file if it exists
|
| 20 |
+
include src/nexforgetokenizer/VERSION
|
README.md
CHANGED
|
@@ -1,92 +1,298 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
---
|
| 5 |
-
language:
|
| 6 |
-
- code
|
| 7 |
-
- en
|
| 8 |
-
tags:
|
| 9 |
-
- programming
|
| 10 |
-
- tokenizer
|
| 11 |
-
- code-generation
|
| 12 |
-
- nlp
|
| 13 |
-
- machine-learning
|
| 14 |
-
|
| 15 |
-
license: mit
|
| 16 |
-
pipeline_tag: token-classification
|
| 17 |
-
---
|
| 18 |
|
| 19 |
-
|
| 20 |
|
| 21 |
-
##
|
| 22 |
-
EZ-Tokenizer is a state-of-the-art tokenizer specifically designed for processing code and mixed-content datasets. Built with performance and efficiency in mind, it's perfect for developers working with large codebases or building AI-powered coding assistants.
|
| 23 |
|
| 24 |
-
|
| 25 |
|
| 26 |
-
|
| 27 |
-
-
|
| 28 |
-
-
|
| 29 |
-
- Low memory footprint with intelligent resource management
|
| 30 |
|
| 31 |
-
|
| 32 |
-
- Preserves code structure and syntax
|
| 33 |
-
- Handles mixed content (code + comments + strings)
|
| 34 |
-
- Maintains indentation and formatting
|
| 35 |
|
| 36 |
-
###
|
| 37 |
-
-
|
| 38 |
-
-
|
| 39 |
-
-
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
- **Vocabulary Size**: 50,000 tokens
|
| 45 |
-
- **
|
| 46 |
-
- **
|
| 47 |
-
- **
|
|
|
|
| 48 |
|
| 49 |
-
###
|
| 50 |
-
-
|
| 51 |
-
-
|
| 52 |
-
-
|
| 53 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
### Command Line Usage
|
| 63 |
```bash
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
| 65 |
```
|
| 66 |
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
|
| 70 |
-
-
|
| 71 |
-
-
|
| 72 |
-
-
|
| 73 |
-
- Educational coding platforms
|
| 74 |
|
| 75 |
-
|
| 76 |
-
-
|
| 77 |
-
-
|
| 78 |
-
-
|
|
|
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
##
|
| 87 |
-
- **Avg. Processing Speed**: 10,000+ lines/second
|
| 88 |
-
- **Memory Efficiency**: 50% better than standard tokenizers
|
| 89 |
-
- **Accuracy**: 99.9% token reconstruction
|
| 90 |
|
| 91 |
-
|
| 92 |
-
Built by the NexForge team with ❤️ for the developer community.
|
|
|
|
| 1 |
+
# EZ-Tokenizer
|
| 2 |
+
|
| 3 |
+
A high-performance tool for creating custom tokenizers from your code or text datasets. Automatically adapts to your system resources while providing fine-grained control over tokenizer creation.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
> **Note**: This project was previously known as NexForge Tokenizer. All functionality remains the same, only the name has been updated to better reflect its ease of use and efficiency.
|
| 6 |
|
| 7 |
+
## 📄 License
|
|
|
|
| 8 |
|
| 9 |
+
EZ-Tokenizer is released under the MIT License with a company restriction clause. This means:
|
| 10 |
|
| 11 |
+
- 🆓 **Free for everyone**: Individuals and small businesses can use EZ-Tokenizer for free
|
| 12 |
+
- 🏢 **Commercial use**: Companies with more than 10 employees or $1M+ in annual revenue need a commercial license
|
| 13 |
+
- 📝 **Full details**: See [LICENSE](LICENSE) for complete terms
|
|
|
|
| 14 |
|
| 15 |
+
## Quick Start with Batch File (Recommended for Most Users)
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
### Prerequisites
|
| 18 |
+
- Windows OS
|
| 19 |
+
- Python 3.8 or higher installed
|
| 20 |
+
- Administrator privileges
|
| 21 |
+
- At least 4GB RAM (8GB+ recommended)
|
| 22 |
|
| 23 |
+
### Getting Started
|
| 24 |
|
| 25 |
+
1. **Download** the latest release or clone this repository
|
| 26 |
+
2. **Add your dataset**: Place training files in the `Dataset` directory
|
| 27 |
+
- Supported formats: `.txt`, `.py`, and other text files
|
| 28 |
+
- The system will process all compatible files in this directory
|
| 29 |
+
3. **Run as Administrator**: Right-click on `run_ez_tokenizer.bat` and select "Run as administrator"
|
| 30 |
+
4. **Follow the Menu**:
|
| 31 |
+
- Option 1: Install Dependencies (first time only)
|
| 32 |
+
- Option 2: Create Tokenizer (processes all files in Dataset directory)
|
| 33 |
+
- Option 3: Test Tokenizer (after creation)
|
| 34 |
+
- Option 4: Open Dataset Directory (to add/check files)
|
| 35 |
+
- Option 5: Exit
|
| 36 |
+
|
| 37 |
+
### Default Tokenizer Settings
|
| 38 |
- **Vocabulary Size**: 50,000 tokens
|
| 39 |
+
- **Minimum Frequency**: 2 (includes tokens appearing at least twice)
|
| 40 |
+
- **File Processing**: All files in Dataset directory
|
| 41 |
+
- **Output**: `output/tokenizer.json`
|
| 42 |
+
- **Test Results**: `Test_tokenizer/test_results.txt`
|
| 43 |
|
| 44 |
+
### Dependencies
|
| 45 |
+
- Python 3.8+
|
| 46 |
+
- tokenizers >= 0.21.1
|
| 47 |
+
- tqdm >= 4.66.1
|
| 48 |
+
- numpy >= 1.24.0
|
| 49 |
+
- psutil >= 5.9.0
|
| 50 |
+
|
| 51 |
+
### For Advanced Users
|
| 52 |
+
Customize tokenizer creation by running manually:
|
| 53 |
+
```bash
|
| 54 |
+
python -m ez_tokenizer.adaptive_tokenizer [input_dir] [output_path] [vocab_size] [min_frequency] [max_files]
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
Example (matches batch file defaults):
|
| 58 |
+
```bash
|
| 59 |
+
python -m ez_tokenizer.adaptive_tokenizer "Dataset" "output/tokenizer.json" 50000 2
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
### Batch File Menu Options
|
| 63 |
+
1. **Install Dependencies**
|
| 64 |
+
- Installs required Python packages
|
| 65 |
+
- Only needed for first-time setup
|
| 66 |
|
| 67 |
+
2. **Create Tokenizer**
|
| 68 |
+
- Processes all files in the `Dataset` directory
|
| 69 |
+
- Outputs to `output/tokenizer.json`
|
| 70 |
+
- Shows progress and statistics
|
| 71 |
+
|
| 72 |
+
3. **Test Tokenizer**
|
| 73 |
+
- Runs tests on the created tokenizer
|
| 74 |
+
- Saves results to `Test_tokenizer/test_results.txt`
|
| 75 |
+
- Verifies reconstruction accuracy
|
| 76 |
+
|
| 77 |
+
4. **Open Dataset Directory**
|
| 78 |
+
- Opens the Dataset folder for easy file management
|
| 79 |
+
- Add your training files here before creating a tokenizer
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
|
| 83 |
+
## Advanced Usage (Manual Setup)
|
| 84 |
+
|
| 85 |
+
For users who need more control or are using non-Windows systems:
|
| 86 |
+
|
| 87 |
+
## Features
|
| 88 |
+
|
| 89 |
+
- **Adaptive Resource Management**: Automatically detects and utilizes available system resources (CPU, RAM, GPU)
|
| 90 |
+
- **Progressive Processing**: Processes files in chunks to handle datasets larger than available memory
|
| 91 |
+
- **Smart Batching**: Dynamically adjusts batch sizes based on available resources
|
| 92 |
+
- **Efficient Memory Usage**: Implements memory conservation strategies for optimal performance
|
| 93 |
+
- **High Performance**: Processes over 300,000 tokens per second on average hardware
|
| 94 |
+
- **Perfect Reconstruction**: 100% accuracy in round-trip encoding/decoding
|
| 95 |
+
- **Optimal Compression**: Achieves ~3.5 characters per token, exceeding industry standards
|
| 96 |
+
- 🛠️ **Extensible**: Advanced users can customize all parameters
|
| 97 |
+
- ✅ **Tested**: Built-in testing to verify tokenizer quality
|
| 98 |
+
|
| 99 |
+
## Quick Start
|
| 100 |
+
|
| 101 |
+
### Installation
|
| 102 |
|
|
|
|
| 103 |
```bash
|
| 104 |
+
# Install from source
|
| 105 |
+
git clone https://github.com/yourusername/ez_tokenizer.git
|
| 106 |
+
cd ez_tokenizer
|
| 107 |
+
pip install -e .
|
| 108 |
```
|
| 109 |
|
| 110 |
+
### Basic Usage
|
| 111 |
+
|
| 112 |
+
#### Command Line Interface
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
# Basic usage
|
| 116 |
+
python -m ez_tokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json
|
| 117 |
+
|
| 118 |
+
# With custom parameters
|
| 119 |
+
python -m ez_tokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json 50000 2
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
## Complete Usage Guide
|
| 123 |
+
|
| 124 |
+
### Command Line Arguments
|
| 125 |
+
|
| 126 |
+
```bash
|
| 127 |
+
python -m ez_tokenizer.adaptive_tokenizer <input_path> <output_path> [vocab_size] [min_frequency]
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
- **input_path**: Path to file or directory containing training data
|
| 131 |
+
- **output_path**: Where to save the tokenizer (should end with .json)
|
| 132 |
+
- **vocab_size** (optional, default=40000): Target vocabulary size
|
| 133 |
+
- **min_frequency** (optional, default=2): Minimum token occurrence count
|
| 134 |
+
|
| 135 |
+
### Python API
|
| 136 |
+
|
| 137 |
+
```python
|
| 138 |
+
from ez_tokenizer import build_tokenizer
|
| 139 |
+
|
| 140 |
+
# Basic usage
|
| 141 |
+
build_tokenizer(
|
| 142 |
+
input_dir="path/to/your/files",
|
| 143 |
+
output_path="output/tokenizer.json"
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
# Advanced usage
|
| 147 |
+
build_tokenizer(
|
| 148 |
+
input_dir="path/to/your/files",
|
| 149 |
+
output_path="output/tokenizer.json",
|
| 150 |
+
vocab_size=50000, # Larger vocabulary for specialized domains
|
| 151 |
+
min_frequency=2, # Only include tokens appearing at least this many times
|
| 152 |
+
chunk_size=1000000, # Characters to process at once
|
| 153 |
+
n_threads=4 # Number of threads to use
|
| 154 |
+
)
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
## Best Practices
|
| 158 |
+
|
| 159 |
+
### Recommended Settings
|
| 160 |
+
|
| 161 |
+
#### For Most Users
|
| 162 |
+
- **Vocabulary Size**: 40,000 (default)
|
| 163 |
+
- Balanced between coverage and performance
|
| 164 |
+
- Works well for most programming languages and natural language
|
| 165 |
+
- **Minimum Frequency**: 2 (default)
|
| 166 |
+
- Includes tokens that appear at least twice
|
| 167 |
+
- Good balance between vocabulary size and token quality
|
| 168 |
+
|
| 169 |
+
#### For Specialized Use Cases
|
| 170 |
+
- **Larger Vocabularies (50k+)**
|
| 171 |
+
- Only needed for very diverse codebases
|
| 172 |
+
- Requires more system resources
|
| 173 |
+
- **Higher Minimum Frequency**
|
| 174 |
+
- Use 3-5 for smaller vocabularies
|
| 175 |
+
- Reduces vocabulary size while maintaining quality
|
| 176 |
+
|
| 177 |
+
#### Processing Large Datasets
|
| 178 |
+
- The batch file automatically handles large datasets
|
| 179 |
+
- Processes files in memory-efficient chunks
|
| 180 |
+
- Can be interrupted and resumed if needed
|
| 181 |
+
|
| 182 |
+
### Input Data
|
| 183 |
+
|
| 184 |
+
- Supports `.txt`, `.py`, and other text-based formats
|
| 185 |
+
- Handles both files and directories
|
| 186 |
+
- Automatically filters binary files
|
| 187 |
+
|
| 188 |
+
### Performance Tips
|
| 189 |
+
|
| 190 |
+
- For large datasets (>1GB), use chunking
|
| 191 |
+
- On multi-core systems, increase thread count
|
| 192 |
+
- Monitor memory usage with large vocabularies
|
| 193 |
+
|
| 194 |
+
## Testing Your Tokenizer
|
| 195 |
+
|
| 196 |
+
After creating your tokenizer, use the built-in test function:
|
| 197 |
+
|
| 198 |
+
1. From the batch menu, select "Test Tokenizer"
|
| 199 |
+
2. The system will:
|
| 200 |
+
- Test with 10,000 random samples
|
| 201 |
+
- Measure tokenization speed (typically >300k tokens/sec)
|
| 202 |
+
- Verify 100% round-trip accuracy
|
| 203 |
+
- Generate a detailed performance report
|
| 204 |
+
# Custom test with specific sample size
|
| 205 |
+
python Test_tokenizer\test_tokenizer.py \
|
| 206 |
+
--tokenizer output/Nexforge_tokenizer.json \
|
| 207 |
+
--input Dataset \
|
| 208 |
+
--sample 20000 \
|
| 209 |
+
--output test_result/detailed_test.txt
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
### Test Output Includes
|
| 213 |
+
- Tokenization success rate
|
| 214 |
+
- Sample encoded/decoded text
|
| 215 |
+
- Basic statistics (vocab size, special tokens)
|
| 216 |
+
- Any encoding/decoding errors
|
| 217 |
+
|
| 218 |
+
## Troubleshooting
|
| 219 |
+
|
| 220 |
+
### Common Issues
|
| 221 |
+
|
| 222 |
+
1. **Out of Memory**
|
| 223 |
+
- Reduce chunk size
|
| 224 |
+
- Close other memory-intensive applications
|
| 225 |
+
- Use a smaller vocabulary
|
| 226 |
+
|
| 227 |
+
2. **Slow Processing**
|
| 228 |
+
- Increase thread count
|
| 229 |
+
- Process in smaller batches
|
| 230 |
+
- Check for system resource constraints
|
| 231 |
+
|
| 232 |
+
3. **Vocabulary Too Large**
|
| 233 |
+
- Increase min_frequency
|
| 234 |
+
- Use a smaller vocab_size
|
| 235 |
+
- Pre-filter your dataset
|
| 236 |
+
|
| 237 |
+
## Performance & Resource Usage
|
| 238 |
+
|
| 239 |
+
The tokenizer is optimized to work efficiently across different hardware configurations:
|
| 240 |
+
|
| 241 |
+
### System Requirements
|
| 242 |
+
- **Minimum**: 4GB RAM, 2-core CPU
|
| 243 |
+
- **Recommended**: 8GB+ RAM, 4+ core CPU
|
| 244 |
+
- **Disk Space**: At least 1GB free (more for large datasets)
|
| 245 |
+
|
| 246 |
+
### Expected Performance
|
| 247 |
+
- **Memory Usage**: Typically stays under 2GB for most datasets
|
| 248 |
+
- **CPU Utilization**: Deliberately capped to prevent system slowdown
|
| 249 |
+
- **Processing Speed**: Varies by system, but generally processes:
|
| 250 |
+
- Small datasets (100MB): 1-5 minutes
|
| 251 |
+
- Medium datasets (1GB): 10-30 minutes
|
| 252 |
+
- Large datasets (10GB+): 1-3 hours
|
| 253 |
+
|
| 254 |
+
### Monitoring
|
| 255 |
+
- The batch file shows progress updates
|
| 256 |
+
- Check Task Manager for real-time resource usage
|
| 257 |
+
- Process can be safely interrupted (CTRL+C) and resumed
|
| 258 |
+
|
| 259 |
+
## Examples
|
| 260 |
+
|
| 261 |
+
See the `examples/` directory for:
|
| 262 |
+
- Training on specific programming languages
|
| 263 |
+
- Fine-tuning pre-trained tokenizers
|
| 264 |
+
- Batch processing large datasets
|
| 265 |
+
|
| 266 |
+
## Contributing
|
| 267 |
+
|
| 268 |
+
We welcome contributions! To maintain code quality, please follow these guidelines:
|
| 269 |
+
|
| 270 |
+
1. **Code Style**
|
| 271 |
+
- Follow PEP 8 guidelines
|
| 272 |
+
- Use type hints for better code clarity
|
| 273 |
+
- Keep functions focused and modular
|
| 274 |
|
| 275 |
+
2. **Testing**
|
| 276 |
+
- Add tests for new features
|
| 277 |
+
- Run all tests with: `pytest Test_tokenizer/`
|
| 278 |
+
- Ensure 100% test coverage for new code
|
|
|
|
| 279 |
|
| 280 |
+
3. **Pull Requests**
|
| 281 |
+
- Fork the repository
|
| 282 |
+
- Create a feature branch
|
| 283 |
+
- Submit a PR with a clear description
|
| 284 |
+
- Reference any related issues
|
| 285 |
|
| 286 |
+
4. **Issues**
|
| 287 |
+
- Check existing issues before creating new ones
|
| 288 |
+
- Provide detailed reproduction steps
|
| 289 |
+
- Include version information
|
| 290 |
|
| 291 |
+
5. **Documentation**
|
| 292 |
+
- Update README for new features
|
| 293 |
+
- Add docstrings to new functions
|
| 294 |
+
- Keep comments clear and relevant
|
| 295 |
|
| 296 |
+
## License
|
|
|
|
|
|
|
|
|
|
| 297 |
|
| 298 |
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
|
Test_tokenizer/README.md
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NexForge Tokenizer Testing
|
| 2 |
+
|
| 3 |
+
This directory contains tools for testing the NexForge tokenizer on your code or text files.
|
| 4 |
+
|
| 5 |
+
## Quick Start
|
| 6 |
+
|
| 7 |
+
1. **Create a tokenizer** using the main menu (`run_nexforge.bat`)
|
| 8 |
+
2. **Run tests** from the main menu
|
| 9 |
+
- Tests 10,000 random samples by default
|
| 10 |
+
- Results saved to `test_result/test_run.txt`
|
| 11 |
+
|
| 12 |
+
## Advanced Testing
|
| 13 |
+
|
| 14 |
+
### Prerequisites
|
| 15 |
+
- Python 3.8+
|
| 16 |
+
- NexForge tokenizer package installed
|
| 17 |
+
|
| 18 |
+
### Test Scripts
|
| 19 |
+
|
| 20 |
+
1. **test_tokenizer.py** - Comprehensive testing with detailed metrics
|
| 21 |
+
2. **test_tokenizer_simple.py** - Quick testing on a single file
|
| 22 |
+
|
| 23 |
+
## Installation
|
| 24 |
+
|
| 25 |
+
Dependencies are automatically installed when you run the main installer. For manual setup:
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
pip install tokenizers python-Levenshtein
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
## Project Structure
|
| 32 |
+
|
| 33 |
+
```
|
| 34 |
+
NexForge/
|
| 35 |
+
├── Test_tokenizer/
|
| 36 |
+
│ ├── test_tokenizer.py # Main test script (batch processing)
|
| 37 |
+
│ └── test_tokenizer_simple.py # Quick test script (single file)
|
| 38 |
+
├── output/ # Tokenizer output (Nexforge_tokenizer.json)
|
| 39 |
+
├── Dataset/ # Your training/test files
|
| 40 |
+
└── test_result/ # Test outputs and reports
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## test_tokenizer.py
|
| 44 |
+
|
| 45 |
+
Comprehensive testing with detailed metrics and batch processing.
|
| 46 |
+
|
| 47 |
+
### Basic Usage
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
# Run with default settings (uses tokenizer from parent directory)
|
| 51 |
+
python test_tokenizer.py
|
| 52 |
+
|
| 53 |
+
# Or specify custom paths
|
| 54 |
+
python test_tokenizer.py \
|
| 55 |
+
--tokenizer ../output/Nexforge_tokenizer.json \
|
| 56 |
+
--input ../Dataset \
|
| 57 |
+
--output ../test_result/detailed_test.txt
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### What's Tested
|
| 61 |
+
- Tokenization/decoding accuracy
|
| 62 |
+
- Special token handling
|
| 63 |
+
- Performance metrics
|
| 64 |
+
- File format compatibility
|
| 65 |
+
|
| 66 |
+
### Command Line Options
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
# Custom tokenizer, input, and output paths
|
| 70 |
+
python test_tokenizer.py \
|
| 71 |
+
--tokenizer path/to/your/tokenizer.json \
|
| 72 |
+
--input path/to/your/code/directory \
|
| 73 |
+
--output custom_results/custom_test.txt \
|
| 74 |
+
--file-types py,js,json \
|
| 75 |
+
--max-files 20 \
|
| 76 |
+
--sample 50000
|
| 77 |
+
|
| 78 |
+
# Process only specific file types
|
| 79 |
+
python test_tokenizer.py --file-types py,js,json
|
| 80 |
+
|
| 81 |
+
# Process all files but limit to first 20
|
| 82 |
+
python test_tokenizer.py --max-files 20
|
| 83 |
+
|
| 84 |
+
# Process all files of specific types (no limit)
|
| 85 |
+
python test_tokenizer.py --max-files 0 --file-types py,js
|
| 86 |
+
|
| 87 |
+
# Process full content of each file (no sampling)
|
| 88 |
+
python test_tokenizer.py --sample 0
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## test_tokenizer_simple.py
|
| 92 |
+
|
| 93 |
+
Quick verification of tokenizer functionality.
|
| 94 |
+
|
| 95 |
+
### Usage
|
| 96 |
+
|
| 97 |
+
```bash
|
| 98 |
+
# Quick test on a single file
|
| 99 |
+
python test_tokenizer_simple.py --input sample.py
|
| 100 |
+
|
| 101 |
+
# Test with custom tokenizer
|
| 102 |
+
python test_tokenizer_simple.py \
|
| 103 |
+
--tokenizer ../output/Nexforge_tokenizer.json \
|
| 104 |
+
--input sample.py
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
### When to Use
|
| 108 |
+
- Quick validation of tokenizer
|
| 109 |
+
- Debugging specific files
|
| 110 |
+
- Verifying tokenization quality
|
| 111 |
+
- Minimal setup required
|
| 112 |
+
|
| 113 |
+
## Understanding Test Results
|
| 114 |
+
|
| 115 |
+
### Sample Output
|
| 116 |
+
|
| 117 |
+
```
|
| 118 |
+
=== NexForge Tokenizer Test Results ===
|
| 119 |
+
Tested on: 2025-05-25 13:30:00
|
| 120 |
+
Tokenizer: ../output/Nexforge_tokenizer.json
|
| 121 |
+
Files processed: 42
|
| 122 |
+
Total tokens: 1,234,567
|
| 123 |
+
|
| 124 |
+
Success Rate: 99.8%
|
| 125 |
+
Avg. tokens/file: 29,394
|
| 126 |
+
Max memory used: 1.2GB
|
| 127 |
+
|
| 128 |
+
=== Detailed Metrics ===
|
| 129 |
+
- Perfect matches: 98.2%
|
| 130 |
+
- Minor differences: 1.5%
|
| 131 |
+
- Major issues: 0.3%
|
| 132 |
+
|
| 133 |
+
See test_result/test_run.txt for full report
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
### Interpreting Results
|
| 137 |
+
- **Success Rate**: Percentage of files processed without errors
|
| 138 |
+
- **Perfect Matches**: Files that round-trip encode/decode perfectly
|
| 139 |
+
- **Minor Differences**: Small whitespace or formatting differences
|
| 140 |
+
- **Major Issues**: Significant differences requiring attention
|
| 141 |
+
|
| 142 |
+
## Need Help?
|
| 143 |
+
|
| 144 |
+
If you encounter any issues:
|
| 145 |
+
1. Check the test results in `test_result/`
|
| 146 |
+
2. Ensure your tokenizer was created successfully
|
| 147 |
+
3. Verify file encodings (UTF-8 recommended)
|
| 148 |
+
4. Check for corrupted or extremely large files
|
| 149 |
+
|
| 150 |
+
For additional support, please open an issue on our GitHub repository.
|
| 151 |
+
File types: py,js,json
|
| 152 |
+
Max files: 10
|
| 153 |
+
Sample size: 100000 chars/file
|
| 154 |
+
|
| 155 |
+
=== Summary ===
|
| 156 |
+
Processed files: 10
|
| 157 |
+
Skipped files: 0
|
| 158 |
+
avg_chars_per_token: 3.47
|
| 159 |
+
avg_tokens_per_sec: 12500.34
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
### test_tokenizer_simple.py Output
|
| 163 |
+
|
| 164 |
+
```
|
| 165 |
+
=== TOKENIZER TEST SUMMARY ================================================
|
| 166 |
+
Test Script: test_tokenizer_simple.py
|
| 167 |
+
Timestamp: 20250524_154835
|
| 168 |
+
Tokenizer: ../output/tokenizer.json
|
| 169 |
+
Chunk file: example.txt
|
| 170 |
+
--------------------------------------------------------------------------------
|
| 171 |
+
Lines processed: 1000
|
| 172 |
+
Perfect matches: 987 (98.7%)
|
| 173 |
+
Average tokens/line: 15.23
|
| 174 |
+
Total characters: 1,234,567
|
| 175 |
+
Total tokens: 15,230
|
| 176 |
+
Character accuracy: 99.85%
|
| 177 |
+
Character diff: 1,845 chars (0.15%)
|
| 178 |
+
Chars per token: 7.92 (lower is better)
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
## Troubleshooting
|
| 182 |
+
|
| 183 |
+
- **Missing Dependencies**: Install required packages with `pip install -r requirements.txt`
|
| 184 |
+
- **File Not Found**: Ensure the tokenizer and input paths are correct
|
| 185 |
+
- **Empty Results**: Check that your input directory contains files with the specified extensions
|
| 186 |
+
- **Tokenizer Not Found**: By default, looks for tokenizer.json in `../output/` (one level up from Test_tokenizer)
|
| 187 |
+
|
| 188 |
+
## License
|
| 189 |
+
|
| 190 |
+
This tool is part of the Nexforge project. See the main project for licensing information.
|
Test_tokenizer/__pycache__/test_tokenizer.cpython-313.pyc
ADDED
|
Binary file (31.5 kB). View file
|
|
|
Test_tokenizer/test_tokenizer.py
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
import glob
|
| 6 |
+
import logging
|
| 7 |
+
import sys
|
| 8 |
+
import traceback
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 12 |
+
|
| 13 |
+
def get_project_root() -> Path:
|
| 14 |
+
"""Get the project root directory."""
|
| 15 |
+
# Use the current working directory as the project root
|
| 16 |
+
return Path.cwd()
|
| 17 |
+
|
| 18 |
+
def ensure_directory(path: Path) -> None:
|
| 19 |
+
"""Ensure directory exists, create if it doesn't."""
|
| 20 |
+
path.mkdir(parents=True, exist_ok=True)
|
| 21 |
+
|
| 22 |
+
# Configure logging
|
| 23 |
+
log_dir = Path('test_result')
|
| 24 |
+
ensure_directory(log_dir)
|
| 25 |
+
|
| 26 |
+
logging.basicConfig(
|
| 27 |
+
level=logging.INFO,
|
| 28 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
| 29 |
+
handlers=[
|
| 30 |
+
logging.StreamHandler(sys.stdout),
|
| 31 |
+
logging.FileHandler(log_dir / 'tokenizer_test.log')
|
| 32 |
+
]
|
| 33 |
+
)
|
| 34 |
+
logger = logging.getLogger(__name__)
|
| 35 |
+
|
| 36 |
+
class Tokenizer:
|
| 37 |
+
def __init__(self, tokenizer_path: str):
|
| 38 |
+
"""Initialize the EZ-Tokenizer with enhanced error handling and validation."""
|
| 39 |
+
try:
|
| 40 |
+
from tokenizers import Tokenizer as HFTokenizer
|
| 41 |
+
|
| 42 |
+
logger.info(f"Loading EZ-Tokenizer from {tokenizer_path}")
|
| 43 |
+
if not os.path.exists(tokenizer_path):
|
| 44 |
+
raise FileNotFoundError(f"EZ-Tokenizer file not found: {tokenizer_path}")
|
| 45 |
+
|
| 46 |
+
start_time = time.time()
|
| 47 |
+
self.tokenizer = HFTokenizer.from_file(tokenizer_path)
|
| 48 |
+
load_time = time.time() - start_time
|
| 49 |
+
|
| 50 |
+
self.vocab_size = self.tokenizer.get_vocab_size()
|
| 51 |
+
logger.info(f"EZ-Tokenizer loaded in {load_time:.2f} seconds. Vocabulary size: {self.vocab_size:,}")
|
| 52 |
+
|
| 53 |
+
# Run basic smoke tests
|
| 54 |
+
self._run_smoke_tests()
|
| 55 |
+
|
| 56 |
+
except Exception as e:
|
| 57 |
+
logger.error(f"Failed to initialize EZ-Tokenizer: {e}", exc_info=True)
|
| 58 |
+
logger.error(f"Failed to initialize tokenizer: {e}", exc_info=True)
|
| 59 |
+
raise
|
| 60 |
+
|
| 61 |
+
def _run_smoke_tests(self):
|
| 62 |
+
"""Run basic smoke tests to verify tokenizer functionality."""
|
| 63 |
+
test_cases = [
|
| 64 |
+
"Hello, world!",
|
| 65 |
+
"こんにちは世界",
|
| 66 |
+
"안녕하세요",
|
| 67 |
+
"Привет, мир!",
|
| 68 |
+
"12345 !@#$%^&*()_+{}|:<>?",
|
| 69 |
+
""
|
| 70 |
+
]
|
| 71 |
+
|
| 72 |
+
logger.info("Running smoke tests...")
|
| 73 |
+
for text in test_cases:
|
| 74 |
+
try:
|
| 75 |
+
tokens = self.encode(text)
|
| 76 |
+
decoded = self.decode(tokens)
|
| 77 |
+
if text != decoded:
|
| 78 |
+
logger.warning(f"Roundtrip mismatch for {text!r} -> {decoded!r}")
|
| 79 |
+
except Exception as e:
|
| 80 |
+
logger.error(f"Smoke test failed for {text!r}: {e}")
|
| 81 |
+
raise
|
| 82 |
+
logger.info("Smoke tests completed successfully")
|
| 83 |
+
|
| 84 |
+
def encode(self, text: str, chunk_size: int = 10000) -> List[int]:
|
| 85 |
+
"""Encode text to token IDs with chunking for large inputs."""
|
| 86 |
+
try:
|
| 87 |
+
if not isinstance(text, str):
|
| 88 |
+
raise ValueError(f"Expected string, got {type(text).__name__}")
|
| 89 |
+
|
| 90 |
+
# Process in chunks if text is large
|
| 91 |
+
if len(text) <= chunk_size:
|
| 92 |
+
return self.tokenizer.encode(text).ids
|
| 93 |
+
|
| 94 |
+
# Process large text in chunks
|
| 95 |
+
tokens = []
|
| 96 |
+
for i in range(0, len(text), chunk_size):
|
| 97 |
+
chunk = text[i:i + chunk_size]
|
| 98 |
+
tokens.extend(self.tokenizer.encode(chunk).ids)
|
| 99 |
+
return tokens
|
| 100 |
+
|
| 101 |
+
except Exception as e:
|
| 102 |
+
logger.error(f"Encoding failed: {e}")
|
| 103 |
+
raise RuntimeError(f"Failed to encode text (length: {len(text)}): {e}")
|
| 104 |
+
|
| 105 |
+
def decode(self, token_ids: List[int], chunk_size: int = 10000) -> str:
|
| 106 |
+
"""Decode token IDs back to text with memory-efficient chunking."""
|
| 107 |
+
try:
|
| 108 |
+
if not token_ids:
|
| 109 |
+
return ""
|
| 110 |
+
|
| 111 |
+
if not all(isinstance(t, int) for t in token_ids):
|
| 112 |
+
raise ValueError("All token IDs must be integers")
|
| 113 |
+
|
| 114 |
+
# Process in chunks to prevent memory issues
|
| 115 |
+
if len(token_ids) <= chunk_size:
|
| 116 |
+
return self.tokenizer.decode(token_ids)
|
| 117 |
+
|
| 118 |
+
# Process large token sequences in chunks
|
| 119 |
+
chunks = []
|
| 120 |
+
for i in range(0, len(token_ids), chunk_size):
|
| 121 |
+
chunk = token_ids[i:i + chunk_size]
|
| 122 |
+
chunks.append(self.tokenizer.decode(chunk))
|
| 123 |
+
|
| 124 |
+
# Log progress periodically
|
| 125 |
+
if (i // chunk_size) % 10 == 0:
|
| 126 |
+
logger.info(f"Decoded {min(i + chunk_size, len(token_ids)):,}/{len(token_ids):,} tokens")
|
| 127 |
+
|
| 128 |
+
return "".join(chunks)
|
| 129 |
+
|
| 130 |
+
except Exception as e:
|
| 131 |
+
logger.error(f"Decoding failed: {e}")
|
| 132 |
+
raise RuntimeError(f"Failed to decode {len(token_ids)} tokens: {e}")
|
| 133 |
+
|
| 134 |
+
def get_vocab_size(self) -> int:
|
| 135 |
+
"""Return the size of the tokenizer's vocabulary."""
|
| 136 |
+
return self.vocab_size
|
| 137 |
+
|
| 138 |
+
def process_file_in_chunks(file_path: str, chunk_size: int = 1024 * 1024) -> str:
|
| 139 |
+
"""Read a file in chunks to avoid memory issues."""
|
| 140 |
+
chunks = []
|
| 141 |
+
try:
|
| 142 |
+
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
|
| 143 |
+
while True:
|
| 144 |
+
chunk = f.read(chunk_size)
|
| 145 |
+
if not chunk:
|
| 146 |
+
break
|
| 147 |
+
chunks.append(chunk)
|
| 148 |
+
return "".join(chunks)
|
| 149 |
+
except Exception as e:
|
| 150 |
+
logger.error(f"Error reading file {file_path}: {e}")
|
| 151 |
+
raise
|
| 152 |
+
|
| 153 |
+
def normalize_whitespace(text: str) -> str:
|
| 154 |
+
"""Normalize whitespace in code for more meaningful comparison."""
|
| 155 |
+
import re
|
| 156 |
+
# Replace all whitespace sequences with a single space
|
| 157 |
+
text = re.sub(r'\s+', ' ', text)
|
| 158 |
+
# Remove leading/trailing whitespace
|
| 159 |
+
return text.strip()
|
| 160 |
+
|
| 161 |
+
def calculate_token_metrics(original_tokens, decoded_tokens):
|
| 162 |
+
"""Calculate token-level accuracy metrics."""
|
| 163 |
+
min_len = min(len(original_tokens), len(decoded_tokens))
|
| 164 |
+
exact_matches = sum(1 for a, b in zip(original_tokens, decoded_tokens) if a == b)
|
| 165 |
+
|
| 166 |
+
return {
|
| 167 |
+
'token_accuracy': exact_matches / max(len(original_tokens), 1),
|
| 168 |
+
'token_precision': exact_matches / max(len(decoded_tokens), 1),
|
| 169 |
+
'token_recall': exact_matches / max(len(original_tokens), 1),
|
| 170 |
+
'token_f1': 2 * exact_matches / (len(original_tokens) + len(decoded_tokens))
|
| 171 |
+
if (len(original_tokens) + len(decoded_tokens)) > 0 else 0
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
def enhanced_char_metrics(original: str, decoded: str) -> dict:
|
| 175 |
+
"""Calculate enhanced character-level metrics."""
|
| 176 |
+
# Normalize both strings
|
| 177 |
+
norm_original = normalize_whitespace(original)
|
| 178 |
+
norm_decoded = normalize_whitespace(decoded)
|
| 179 |
+
|
| 180 |
+
# Calculate basic metrics
|
| 181 |
+
min_len = min(len(norm_original), len(norm_decoded))
|
| 182 |
+
max_len = max(len(norm_original), len(norm_decoded))
|
| 183 |
+
|
| 184 |
+
if max_len == 0:
|
| 185 |
+
return {
|
| 186 |
+
'char_accuracy': 1.0,
|
| 187 |
+
'char_similarity': 1.0,
|
| 188 |
+
'length_diff_ratio': 0.0
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
# Calculate matches
|
| 192 |
+
matches = sum(1 for a, b in zip(norm_original, norm_decoded) if a == b)
|
| 193 |
+
|
| 194 |
+
# Calculate similarity using Levenshtein distance if available
|
| 195 |
+
try:
|
| 196 |
+
from Levenshtein import ratio
|
| 197 |
+
similarity = ratio(norm_original, norm_decoded)
|
| 198 |
+
except ImportError:
|
| 199 |
+
similarity = matches / max_len if max_len > 0 else 1.0
|
| 200 |
+
|
| 201 |
+
return {
|
| 202 |
+
'char_accuracy': matches / max_len if max_len > 0 else 1.0,
|
| 203 |
+
'char_similarity': similarity,
|
| 204 |
+
'length_diff_ratio': abs(len(norm_original) - len(norm_decoded)) / max_len if max_len > 0 else 0.0
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
def validate_code_integrity(original: str, decoded: str) -> dict:
|
| 208 |
+
"""Validate code-specific integrity metrics."""
|
| 209 |
+
import ast
|
| 210 |
+
|
| 211 |
+
def can_parse(code: str) -> bool:
|
| 212 |
+
try:
|
| 213 |
+
ast.parse(code)
|
| 214 |
+
return True
|
| 215 |
+
except:
|
| 216 |
+
return False
|
| 217 |
+
|
| 218 |
+
original_parses = can_parse(original)
|
| 219 |
+
decoded_parses = can_parse(decoded)
|
| 220 |
+
|
| 221 |
+
return {
|
| 222 |
+
'original_parses': original_parses,
|
| 223 |
+
'decoded_parses': decoded_parses,
|
| 224 |
+
'both_parse': original_parses and decoded_parses
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
def calculate_metrics(original_text: str, decoded_text: str, tokens,
|
| 228 |
+
start_time: float, end_time: float) -> Dict[str, Any]:
|
| 229 |
+
"""Enhanced metrics calculation for tokenizer evaluation."""
|
| 230 |
+
# Basic metrics
|
| 231 |
+
token_count = len(tokens) if tokens else 0
|
| 232 |
+
char_count = len(original_text) if original_text else 0
|
| 233 |
+
process_time = max(end_time - start_time, 0.001) # Avoid division by zero
|
| 234 |
+
|
| 235 |
+
metrics = {
|
| 236 |
+
'tokens': token_count,
|
| 237 |
+
'chars': char_count,
|
| 238 |
+
'processing_time': process_time,
|
| 239 |
+
'tokens_per_second': token_count / process_time,
|
| 240 |
+
'chars_per_token': char_count / (token_count or 1) # Avoid division by zero
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
# Calculate rates
|
| 244 |
+
metrics.update({
|
| 245 |
+
'tokens_per_sec': len(tokens) / metrics['processing_time'],
|
| 246 |
+
'chars_per_sec': len(original_text) / metrics['processing_time']
|
| 247 |
+
})
|
| 248 |
+
|
| 249 |
+
# Enhanced character-level metrics
|
| 250 |
+
metrics.update(enhanced_char_metrics(original_text, decoded_text))
|
| 251 |
+
|
| 252 |
+
# Token-level metrics (if we have the original tokens)
|
| 253 |
+
if hasattr(tokens, 'tokens'): # If using tokenizers' Encoding object
|
| 254 |
+
original_tokens = tokens.tokens
|
| 255 |
+
decoded_tokens = tokenizer.encode(decoded_text).tokens
|
| 256 |
+
metrics.update(calculate_token_metrics(original_tokens, decoded_tokens))
|
| 257 |
+
|
| 258 |
+
# Code-specific validation for Python files
|
| 259 |
+
if original_text.strip().endswith('.py') or 'def ' in original_text or 'import ' in original_text:
|
| 260 |
+
metrics.update(validate_code_integrity(original_text, decoded_text))
|
| 261 |
+
|
| 262 |
+
return metrics
|
| 263 |
+
|
| 264 |
+
def print_metrics_summary(metrics: Dict[str, Any]):
|
| 265 |
+
"""Print a clean summary of the metrics."""
|
| 266 |
+
print("\n=== Tokenizer Test Results ===")
|
| 267 |
+
print(f"Processing Speed: {metrics.get('tokens_per_second', metrics.get('tokens_per_sec', 0)):,.0f} tokens/sec")
|
| 268 |
+
print(f"Characters per Token: {metrics.get('chars_per_token', 0):.2f}")
|
| 269 |
+
print(f"\nCharacter-Level Metrics:")
|
| 270 |
+
print(f" • Accuracy: {metrics.get('char_accuracy', 0)*100:.2f}%")
|
| 271 |
+
print(f" • Similarity: {metrics.get('char_similarity', 0)*100:.2f}%")
|
| 272 |
+
print(f" • Levenshtein Ratio: {metrics.get('levenshtein_ratio', 0)*100:.2f}%")
|
| 273 |
+
|
| 274 |
+
print(f"\nCode Integrity:")
|
| 275 |
+
print(f" • Original parses: {'✓' if metrics.get('original_parses', False) else '✗'}")
|
| 276 |
+
print(f" • Decoded parses: {'✓' if metrics.get('decoded_parses', False) else '✗'}")
|
| 277 |
+
print(f" • Both parse: {'✓' if metrics.get('both_parse', False) else '✗'}")
|
| 278 |
+
|
| 279 |
+
def process_file(file_path: Path, tokenizer: Tokenizer, max_chunk_size: int = 100_000, sample_size: int = 100_000) -> Dict[str, Any]:
|
| 280 |
+
"""Process a single file in chunks and return metrics."""
|
| 281 |
+
try:
|
| 282 |
+
logger.info(f"\nProcessing file: {file_path}")
|
| 283 |
+
file_size = file_path.stat().st_size
|
| 284 |
+
logger.info(f"File size: {file_size / (1024*1024):.2f} MB")
|
| 285 |
+
|
| 286 |
+
# Initialize metrics
|
| 287 |
+
total_tokens = 0
|
| 288 |
+
total_chars = 0
|
| 289 |
+
total_time = 0
|
| 290 |
+
chunk_metrics = []
|
| 291 |
+
|
| 292 |
+
# Process file in chunks
|
| 293 |
+
total_read = 0
|
| 294 |
+
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
|
| 295 |
+
# Only read up to sample_size if specified
|
| 296 |
+
max_to_read = sample_size if sample_size > 0 else float('inf')
|
| 297 |
+
logger.info(f"Processing up to {max_to_read if max_to_read != float('inf') else 'all'} characters")
|
| 298 |
+
|
| 299 |
+
chunk = f.read(min(max_chunk_size, max_to_read - total_read))
|
| 300 |
+
total_read += len(chunk)
|
| 301 |
+
|
| 302 |
+
while chunk and total_read <= max_to_read:
|
| 303 |
+
if not chunk.strip():
|
| 304 |
+
chunk = f.read(max_chunk_size)
|
| 305 |
+
continue
|
| 306 |
+
|
| 307 |
+
# Process chunk
|
| 308 |
+
start_time = time.time()
|
| 309 |
+
try:
|
| 310 |
+
# Handle both tokenizer output formats (object with .ids or raw list)
|
| 311 |
+
tokens = tokenizer.encode(chunk)
|
| 312 |
+
token_ids = tokens.ids if hasattr(tokens, 'ids') else tokens
|
| 313 |
+
decoded_text = tokenizer.decode(token_ids)
|
| 314 |
+
except Exception as e:
|
| 315 |
+
logger.error(f"Error in tokenization: {e}")
|
| 316 |
+
# Skip this chunk if tokenization fails
|
| 317 |
+
chunk = f.read(max_chunk_size)
|
| 318 |
+
continue
|
| 319 |
+
|
| 320 |
+
end_time = time.time()
|
| 321 |
+
|
| 322 |
+
# Skip empty chunks
|
| 323 |
+
if not token_ids:
|
| 324 |
+
chunk = f.read(max_chunk_size)
|
| 325 |
+
continue
|
| 326 |
+
|
| 327 |
+
# Calculate metrics for this chunk
|
| 328 |
+
metrics = calculate_metrics(chunk, decoded_text, token_ids, start_time, end_time)
|
| 329 |
+
chunk_metrics.append(metrics)
|
| 330 |
+
|
| 331 |
+
# Update totals
|
| 332 |
+
total_tokens += len(token_ids)
|
| 333 |
+
total_chars += len(chunk)
|
| 334 |
+
total_time += (end_time - start_time)
|
| 335 |
+
|
| 336 |
+
# Log progress
|
| 337 |
+
if total_tokens % 1_000_000 == 0:
|
| 338 |
+
logger.info(f" Processed {total_tokens:,} tokens ({total_chars/1024/1024:.2f} MB)")
|
| 339 |
+
|
| 340 |
+
# Read next chunk (respecting sample size)
|
| 341 |
+
to_read = min(max_chunk_size, max_to_read - total_read)
|
| 342 |
+
if to_read <= 0:
|
| 343 |
+
# We've reached the sample size limit
|
| 344 |
+
break
|
| 345 |
+
|
| 346 |
+
chunk = f.read(to_read)
|
| 347 |
+
total_read += len(chunk)
|
| 348 |
+
|
| 349 |
+
# Calculate aggregate metrics
|
| 350 |
+
if not chunk_metrics:
|
| 351 |
+
logger.warning(f"No valid content found in file: {file_path}")
|
| 352 |
+
return None
|
| 353 |
+
|
| 354 |
+
# Calculate weighted averages based on token counts
|
| 355 |
+
total_weight = sum(m.get('tokens', 0) for m in chunk_metrics) or 1
|
| 356 |
+
|
| 357 |
+
avg_metrics = {
|
| 358 |
+
'chars_per_token': sum(m.get('chars_per_token', 0) * m.get('tokens', 0) for m in chunk_metrics) / total_weight,
|
| 359 |
+
'tokens_per_second': sum(m.get('tokens', 0) for m in chunk_metrics) / (total_time or 1),
|
| 360 |
+
'char_accuracy': sum(m.get('char_accuracy', 0) * m.get('tokens', 0) for m in chunk_metrics) / total_weight,
|
| 361 |
+
'tokens': total_tokens,
|
| 362 |
+
'chars': total_chars,
|
| 363 |
+
'processing_time': total_time,
|
| 364 |
+
'file_path': str(file_path)
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
# Log final metrics
|
| 368 |
+
logger.info(f" Total tokens: {total_tokens:,}")
|
| 369 |
+
logger.info(f" Total chars: {total_chars:,}")
|
| 370 |
+
logger.info(f" Avg chars/token: {avg_metrics['chars_per_token']:.2f}")
|
| 371 |
+
logger.info(f" Avg tokens/sec: {avg_metrics['tokens_per_second']:,.2f}")
|
| 372 |
+
|
| 373 |
+
return avg_metrics
|
| 374 |
+
|
| 375 |
+
except Exception as e:
|
| 376 |
+
logger.error(f"Error processing {file_path}: {e}")
|
| 377 |
+
logger.error(traceback.format_exc())
|
| 378 |
+
return None
|
| 379 |
+
|
| 380 |
+
def process_single_file(tokenizer: Tokenizer, file_path: str, sample_size: int = 0) -> Dict[str, Any]:
|
| 381 |
+
"""Process a single file and return metrics."""
|
| 382 |
+
logger.info(f"\nProcessing file: {file_path}")
|
| 383 |
+
|
| 384 |
+
try:
|
| 385 |
+
# Process file in chunks with sample size limit
|
| 386 |
+
metrics = process_file(file_path, tokenizer, sample_size=sample_size)
|
| 387 |
+
|
| 388 |
+
if not metrics:
|
| 389 |
+
logger.warning(f"Empty file or no valid content found: {file_path}")
|
| 390 |
+
return {}
|
| 391 |
+
|
| 392 |
+
# Add file info
|
| 393 |
+
metrics['file'] = os.path.basename(file_path)
|
| 394 |
+
metrics['file_size_mb'] = os.path.getsize(file_path) / (1024 * 1024)
|
| 395 |
+
|
| 396 |
+
# Log summary
|
| 397 |
+
logger.info(
|
| 398 |
+
f"Processed {metrics['file_size_mb']:.2f}MB: "
|
| 399 |
+
f"{metrics['tokens']:,} tokens, "
|
| 400 |
+
f"{metrics['chars_per_token']:.2f} chars/token, "
|
| 401 |
+
f"{metrics['tokens_per_second']:,.2f} tokens/sec"
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
# Print detailed metrics summary
|
| 405 |
+
print_metrics_summary(metrics)
|
| 406 |
+
|
| 407 |
+
return metrics
|
| 408 |
+
|
| 409 |
+
except Exception as e:
|
| 410 |
+
logger.error(f"Error processing {file_path}: {e}", exc_info=True)
|
| 411 |
+
return {'file': os.path.basename(file_path), 'error': str(e)}
|
| 412 |
+
|
| 413 |
+
def main():
|
| 414 |
+
# Set up default paths
|
| 415 |
+
project_root = get_project_root()
|
| 416 |
+
# Point to the root directory (one level up from Test_tokenizer)
|
| 417 |
+
root_dir = project_root.parent
|
| 418 |
+
default_tokenizer = root_dir / 'output' / 'tokenizer.json'
|
| 419 |
+
default_input = root_dir / 'Dataset' # Changed to look in root directory
|
| 420 |
+
default_output = root_dir / 'test_result' # Also put test results in root
|
| 421 |
+
|
| 422 |
+
# Ensure output directory exists
|
| 423 |
+
ensure_directory(default_output)
|
| 424 |
+
|
| 425 |
+
# Generate timestamp for output file
|
| 426 |
+
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
| 427 |
+
default_output_file = default_output / f'test_results_{timestamp}.txt'
|
| 428 |
+
|
| 429 |
+
parser = argparse.ArgumentParser(description='Test tokenizer on code files')
|
| 430 |
+
parser.add_argument('--tokenizer', type=str, default=str(default_tokenizer),
|
| 431 |
+
help=f'Path to tokenizer.json file (default: {default_tokenizer})')
|
| 432 |
+
parser.add_argument('--input', type=str, default=str(default_input),
|
| 433 |
+
help=f'Input directory or file (default: {default_input})')
|
| 434 |
+
parser.add_argument('--output', type=str, default=str(default_output_file),
|
| 435 |
+
help=f'Output text file for results (default: {default_output_file})')
|
| 436 |
+
parser.add_argument('--sample', type=int, default=100000, help='Only process this many characters from each file (0 for full file)')
|
| 437 |
+
parser.add_argument('--max-files', type=int, default=10,
|
| 438 |
+
help='Maximum number of files to process (default: 10)')
|
| 439 |
+
parser.add_argument('--file-types', type=str, default='*',
|
| 440 |
+
help='Comma-separated list of file extensions to process (e.g., "py,js,json"). Default: all files')
|
| 441 |
+
|
| 442 |
+
args = parser.parse_args()
|
| 443 |
+
|
| 444 |
+
# Ensure output directory exists
|
| 445 |
+
output_dir = Path(args.output).parent
|
| 446 |
+
ensure_directory(output_dir)
|
| 447 |
+
|
| 448 |
+
# Initialize tokenizer
|
| 449 |
+
logger.info(f"Initializing tokenizer from {args.tokenizer}")
|
| 450 |
+
tokenizer = Tokenizer(args.tokenizer)
|
| 451 |
+
|
| 452 |
+
# Parse file types
|
| 453 |
+
file_extensions = []
|
| 454 |
+
if args.file_types != '*':
|
| 455 |
+
file_extensions = [ext.strip().lower() for ext in args.file_types.split(',')]
|
| 456 |
+
logger.info(f"Filtering by file extensions: {', '.join(file_extensions)}")
|
| 457 |
+
|
| 458 |
+
# Find input files
|
| 459 |
+
input_path = Path(args.input)
|
| 460 |
+
file_paths = []
|
| 461 |
+
|
| 462 |
+
if input_path.is_dir():
|
| 463 |
+
# Find all files in the input directory (recursively)
|
| 464 |
+
if file_extensions:
|
| 465 |
+
# If specific extensions are provided, only include those
|
| 466 |
+
for ext in file_extensions:
|
| 467 |
+
pattern = f'*.{ext.lstrip(".")}'
|
| 468 |
+
file_paths.extend(input_path.rglob(pattern))
|
| 469 |
+
else:
|
| 470 |
+
# Otherwise include all files
|
| 471 |
+
file_paths = list(input_path.rglob('*'))
|
| 472 |
+
|
| 473 |
+
# Filter out directories, hidden files, and ensure files exist
|
| 474 |
+
file_paths = [
|
| 475 |
+
f for f in file_paths
|
| 476 |
+
if f.is_file() and not f.name.startswith(('.', '_'))
|
| 477 |
+
]
|
| 478 |
+
|
| 479 |
+
# Sort files by size (smallest first) to process quicker files first
|
| 480 |
+
file_paths.sort(key=lambda x: x.stat().st_size)
|
| 481 |
+
|
| 482 |
+
logger.info(f"Found {len(file_paths)} files in {input_path}")
|
| 483 |
+
if file_paths:
|
| 484 |
+
logger.info(f"Sample files: {', '.join(f.name for f in file_paths[:min(5, len(file_paths))])}" +
|
| 485 |
+
('...' if len(file_paths) > 5 else ''))
|
| 486 |
+
else:
|
| 487 |
+
# Single file
|
| 488 |
+
file_paths = [input_path] if input_path.exists() else []
|
| 489 |
+
logger.info(f"Processing single file: {input_path}")
|
| 490 |
+
|
| 491 |
+
if not file_paths:
|
| 492 |
+
logger.warning(f"No files found in {input_path}")
|
| 493 |
+
return
|
| 494 |
+
|
| 495 |
+
# Process files
|
| 496 |
+
all_metrics = []
|
| 497 |
+
processed_count = 0
|
| 498 |
+
skipped_files = 0
|
| 499 |
+
|
| 500 |
+
# Get unique file paths (remove duplicates and sort)
|
| 501 |
+
unique_file_paths = []
|
| 502 |
+
seen_paths = set()
|
| 503 |
+
|
| 504 |
+
for file_path in file_paths:
|
| 505 |
+
abs_path = str(file_path.absolute())
|
| 506 |
+
if abs_path not in seen_paths:
|
| 507 |
+
seen_paths.add(abs_path)
|
| 508 |
+
unique_file_paths.append(file_path)
|
| 509 |
+
|
| 510 |
+
if len(unique_file_paths) < len(file_paths):
|
| 511 |
+
logger.info(f"Removed {len(file_paths) - len(unique_file_paths)} duplicate file paths")
|
| 512 |
+
|
| 513 |
+
# Limit to max_files if specified
|
| 514 |
+
if args.max_files > 0:
|
| 515 |
+
unique_file_paths = unique_file_paths[:args.max_files]
|
| 516 |
+
|
| 517 |
+
# Process each file
|
| 518 |
+
for file_path in unique_file_paths:
|
| 519 |
+
try:
|
| 520 |
+
if not file_path.exists():
|
| 521 |
+
logger.warning(f"File not found: {file_path}")
|
| 522 |
+
skipped_files += 1
|
| 523 |
+
continue
|
| 524 |
+
|
| 525 |
+
file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
|
| 526 |
+
logger.info(f"\nProcessing: {file_path.name} ({file_size_mb:.2f} MB)")
|
| 527 |
+
|
| 528 |
+
# Process the file with sample option
|
| 529 |
+
metrics = process_single_file(tokenizer, file_path, args.sample)
|
| 530 |
+
if metrics:
|
| 531 |
+
all_metrics.append(metrics)
|
| 532 |
+
processed_count += 1
|
| 533 |
+
logger.info(f"Processed {processed_count}/{len(unique_file_paths)} files")
|
| 534 |
+
except Exception as e:
|
| 535 |
+
logger.error(f"Error processing {file_path}: {str(e)}")
|
| 536 |
+
skipped_files += 1
|
| 537 |
+
|
| 538 |
+
if skipped_files > 0:
|
| 539 |
+
logger.warning(f"Skipped {skipped_files} files due to errors")
|
| 540 |
+
|
| 541 |
+
# Calculate averages from all metrics
|
| 542 |
+
if all_metrics:
|
| 543 |
+
avg_metrics = {}
|
| 544 |
+
for key in all_metrics[0].keys():
|
| 545 |
+
if isinstance(all_metrics[0][key], (int, float)):
|
| 546 |
+
values = [r[key] for r in all_metrics if key in r]
|
| 547 |
+
if values:
|
| 548 |
+
avg_metrics[f'avg_{key}'] = sum(values) / len(values)
|
| 549 |
+
|
| 550 |
+
# Write results to file
|
| 551 |
+
with open(args.output, 'w', encoding='utf-8') as f:
|
| 552 |
+
f.write("=== Tokenizer Test Results ===\n")
|
| 553 |
+
f.write(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 554 |
+
f.write(f"Tokenizer: {args.tokenizer}\n")
|
| 555 |
+
f.write(f"Input: {args.input}\n")
|
| 556 |
+
f.write(f"Sample size: {args.sample if args.sample > 0 else 'Full file'}\n\n")
|
| 557 |
+
|
| 558 |
+
f.write("=== Summary ===\n")
|
| 559 |
+
if all_metrics:
|
| 560 |
+
# Write aggregate metrics
|
| 561 |
+
for key, value in avg_metrics.items():
|
| 562 |
+
if isinstance(value, float):
|
| 563 |
+
f.write(f"{key}: {value:.4f}\n")
|
| 564 |
+
else:
|
| 565 |
+
f.write(f"{key}: {value}\n")
|
| 566 |
+
else:
|
| 567 |
+
f.write("No files were successfully processed\n")
|
| 568 |
+
|
| 569 |
+
# Write individual file results
|
| 570 |
+
f.write("\n=== File Details ===\n")
|
| 571 |
+
for result in all_metrics:
|
| 572 |
+
f.write(f"\nFile: {result.get('file', 'unknown')}\n")
|
| 573 |
+
for key, value in result.items():
|
| 574 |
+
if key != 'file':
|
| 575 |
+
if isinstance(value, float):
|
| 576 |
+
f.write(f" {key}: {value:.4f}\n")
|
| 577 |
+
else:
|
| 578 |
+
f.write(f" {key}: {value}\n")
|
| 579 |
+
|
| 580 |
+
logger.info(f"Results saved to {args.output}")
|
| 581 |
+
print(f"\nTest results saved to: {args.output}")
|
| 582 |
+
|
| 583 |
+
if all_metrics:
|
| 584 |
+
logger.info(f"\n=== Test Complete ===")
|
| 585 |
+
logger.info(f"Processed {processed_count} files")
|
| 586 |
+
logger.info(f"Average chars/token: {avg_metrics.get('avg_chars_per_token', 0):.2f}")
|
| 587 |
+
logger.info(f"Average tokens/sec: {avg_metrics.get('avg_tokens_per_sec', 0):,.0f}")
|
| 588 |
+
else:
|
| 589 |
+
logger.warning("No files were successfully processed")
|
| 590 |
+
|
| 591 |
+
if __name__ == "__main__":
|
| 592 |
+
try:
|
| 593 |
+
# Check for required dependencies
|
| 594 |
+
try:
|
| 595 |
+
import Levenshtein
|
| 596 |
+
except ImportError:
|
| 597 |
+
logger.warning("python-Levenshtein not found. Install with: pip install python-Levenshtein")
|
| 598 |
+
logger.warning("Falling back to basic similarity metrics")
|
| 599 |
+
|
| 600 |
+
main()
|
| 601 |
+
except KeyboardInterrupt:
|
| 602 |
+
logger.info("\nProcess interrupted by user")
|
| 603 |
+
sys.exit(1)
|
| 604 |
+
except Exception as e:
|
| 605 |
+
logger.error(f"An error occurred: {e}", exc_info=True)
|
| 606 |
+
sys.exit(1)
|
Test_tokenizer/test_tokenizer_simple.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from tokenizers import Tokenizer
|
| 5 |
+
from typing import Optional, Tuple, List, Dict, Any
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
def get_project_root() -> Path:
|
| 9 |
+
"""Get the project root directory."""
|
| 10 |
+
# Use the current working directory as the project root
|
| 11 |
+
return Path.cwd()
|
| 12 |
+
|
| 13 |
+
def setup_paths() -> Tuple[Path, Path, Path]:
|
| 14 |
+
"""Set up and validate required paths.
|
| 15 |
+
|
| 16 |
+
Returns:
|
| 17 |
+
Tuple containing (tokenizer_path, data_dir, output_dir)
|
| 18 |
+
"""
|
| 19 |
+
root = get_project_root()
|
| 20 |
+
|
| 21 |
+
# Define paths - look in root directory (one level up from Test_tokenizer)
|
| 22 |
+
tokenizer_path = root.parent / 'output' / 'tokenizer.json'
|
| 23 |
+
data_dir = root.parent / 'Dataset' # Look in root directory
|
| 24 |
+
output_dir = root.parent / 'test_result' # Output to root directory
|
| 25 |
+
|
| 26 |
+
# Create output directory if it doesn't exist
|
| 27 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 28 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 29 |
+
|
| 30 |
+
# Validate paths
|
| 31 |
+
if not tokenizer_path.exists():
|
| 32 |
+
print(f"Error: Tokenizer not found at {tokenizer_path}")
|
| 33 |
+
sys.exit(1)
|
| 34 |
+
|
| 35 |
+
if not data_dir.exists():
|
| 36 |
+
print(f"Error: Data directory not found at {data_dir}")
|
| 37 |
+
sys.exit(1)
|
| 38 |
+
|
| 39 |
+
return tokenizer_path, data_dir, output_dir
|
| 40 |
+
|
| 41 |
+
def get_first_chunk_file(data_dir: Path) -> Optional[Path]:
|
| 42 |
+
"""Get the first chunk file from the data directory."""
|
| 43 |
+
# Look for .txt files in the data directory
|
| 44 |
+
chunk_files = sorted(list(data_dir.glob('*.txt')))
|
| 45 |
+
if not chunk_files:
|
| 46 |
+
print(f"Error: No .txt files found in {data_dir}")
|
| 47 |
+
return None
|
| 48 |
+
return chunk_files[0] # Return the first chunk file
|
| 49 |
+
|
| 50 |
+
def test_tokenizer_on_chunk(tokenizer: Tokenizer, chunk_path: Path, max_lines: int = 1000) -> Dict[str, Any]:
|
| 51 |
+
"""Test the tokenizer on the first max_lines of a chunk file."""
|
| 52 |
+
results = {
|
| 53 |
+
'total_lines': 0,
|
| 54 |
+
'lines_processed': 0,
|
| 55 |
+
'total_tokens': 0,
|
| 56 |
+
'perfect_matches': 0,
|
| 57 |
+
'total_chars': 0,
|
| 58 |
+
'total_diff_chars': 0,
|
| 59 |
+
'lines': []
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
try:
|
| 63 |
+
with open(chunk_path, 'r', encoding='utf-8') as f:
|
| 64 |
+
for i, line in enumerate(f):
|
| 65 |
+
if i >= max_lines:
|
| 66 |
+
break
|
| 67 |
+
|
| 68 |
+
line = line.strip()
|
| 69 |
+
if not line: # Skip empty lines
|
| 70 |
+
continue
|
| 71 |
+
|
| 72 |
+
# Tokenize and decode
|
| 73 |
+
encoding = tokenizer.encode(line)
|
| 74 |
+
decoded = tokenizer.decode(encoding.ids)
|
| 75 |
+
|
| 76 |
+
# Calculate differences
|
| 77 |
+
diff_chars = sum(1 for a, b in zip(line, decoded) if a != b)
|
| 78 |
+
diff_chars += abs(len(line) - len(decoded))
|
| 79 |
+
is_perfect = diff_chars == 0
|
| 80 |
+
|
| 81 |
+
# Update results
|
| 82 |
+
results['total_lines'] += 1
|
| 83 |
+
results['lines_processed'] += 1
|
| 84 |
+
results['total_tokens'] += len(encoding.tokens)
|
| 85 |
+
results['total_chars'] += len(line)
|
| 86 |
+
results['total_diff_chars'] += diff_chars
|
| 87 |
+
results['perfect_matches'] += 1 if is_perfect else 0
|
| 88 |
+
|
| 89 |
+
# Store detailed results for the first few lines
|
| 90 |
+
if i < 5: # First 5 lines
|
| 91 |
+
results['lines'].append({
|
| 92 |
+
'original': line[:200] + ('...' if len(line) > 200 else ''),
|
| 93 |
+
'decoded': decoded[:200] + ('...' if len(decoded) > 200 else ''),
|
| 94 |
+
'tokens': encoding.tokens[:10], # First 10 tokens
|
| 95 |
+
'is_perfect': is_perfect,
|
| 96 |
+
'diff_chars': diff_chars,
|
| 97 |
+
'similarity': 1 - (diff_chars / max(len(line), 1))
|
| 98 |
+
})
|
| 99 |
+
|
| 100 |
+
# Print progress
|
| 101 |
+
if (i + 1) % 100 == 0:
|
| 102 |
+
print(f"Processed {i+1} lines...")
|
| 103 |
+
|
| 104 |
+
except Exception as e:
|
| 105 |
+
print(f"Error processing file: {e}")
|
| 106 |
+
return results
|
| 107 |
+
|
| 108 |
+
return results
|
| 109 |
+
|
| 110 |
+
def print_summary(results: Dict[str, Any], output_path: Path) -> None:
|
| 111 |
+
"""Print and save test summary in TXT format with script name in the filename."""
|
| 112 |
+
if not results['lines_processed']:
|
| 113 |
+
print("No lines were processed.")
|
| 114 |
+
return
|
| 115 |
+
|
| 116 |
+
# Calculate statistics
|
| 117 |
+
avg_tokens_per_line = results['total_tokens'] / results['lines_processed']
|
| 118 |
+
total_chars = results['total_chars']
|
| 119 |
+
total_diff_chars = results['total_diff_chars']
|
| 120 |
+
accuracy = 1 - (total_diff_chars / total_chars) if total_chars > 0 else 0
|
| 121 |
+
diff_percentage = (total_diff_chars / total_chars * 100) if total_chars > 0 else 0
|
| 122 |
+
|
| 123 |
+
# Get script name without extension
|
| 124 |
+
script_name = Path(__file__).stem
|
| 125 |
+
|
| 126 |
+
# Prepare summary text
|
| 127 |
+
summary = [
|
| 128 |
+
"="*80,
|
| 129 |
+
"TOKENIZER TEST SUMMARY",
|
| 130 |
+
"="*80,
|
| 131 |
+
f"Test Script: {script_name}.py",
|
| 132 |
+
f"Timestamp: {results.get('timestamp', 'N/A')}",
|
| 133 |
+
f"Tokenizer: {results.get('tokenizer_path', 'N/A')}",
|
| 134 |
+
f"Chunk file: {results.get('chunk_file', 'N/A')}",
|
| 135 |
+
"-"*80,
|
| 136 |
+
f"Lines processed: {results['lines_processed']}",
|
| 137 |
+
f"Perfect matches: {results['perfect_matches']} ({results['perfect_matches']/results['lines_processed']*100:.1f}%)",
|
| 138 |
+
f"Average tokens/line: {avg_tokens_per_line:.2f}",
|
| 139 |
+
f"Total characters: {total_chars:,}",
|
| 140 |
+
f"Total tokens: {results['total_tokens']:,}",
|
| 141 |
+
f"Character accuracy: {accuracy*100:.2f}%",
|
| 142 |
+
f"Character diff: {total_diff_chars:,} chars ({diff_percentage:.4f}%)",
|
| 143 |
+
f"Chars per token: {total_chars/results['total_tokens']:.2f} (lower is better)",
|
| 144 |
+
"\nSAMPLE LINES:",
|
| 145 |
+
"-"*40
|
| 146 |
+
]
|
| 147 |
+
|
| 148 |
+
# Add sample lines
|
| 149 |
+
for i, line in enumerate(results.get('lines', [])[:3]):
|
| 150 |
+
summary.extend([
|
| 151 |
+
f"\nSAMPLE {i+1}:",
|
| 152 |
+
f"Original: {line.get('original', '')}",
|
| 153 |
+
f"Decoded: {line.get('decoded', '')}",
|
| 154 |
+
f"Tokens: {', '.join(line.get('tokens', [])[:8])}{'...' if len(line.get('tokens', [])) > 8 else ''}",
|
| 155 |
+
f"Match: {'✓ Perfect' if line.get('is_perfect') else '✗ Different'}",
|
| 156 |
+
"-"*40
|
| 157 |
+
])
|
| 158 |
+
|
| 159 |
+
# Print to console
|
| 160 |
+
print("\n".join(summary))
|
| 161 |
+
|
| 162 |
+
# Save as TXT with script name in filename
|
| 163 |
+
timestamp = results.get('timestamp', '')
|
| 164 |
+
output_file = output_path / f'{script_name}_result_{timestamp}.txt'
|
| 165 |
+
|
| 166 |
+
with open(output_file, 'w', encoding='utf-8') as f:
|
| 167 |
+
f.write("\n".join(summary))
|
| 168 |
+
|
| 169 |
+
print(f"\nResults saved to: {output_file}")
|
| 170 |
+
|
| 171 |
+
def main():
|
| 172 |
+
# Set up paths
|
| 173 |
+
tokenizer_path, data_dir, output_dir = setup_paths()
|
| 174 |
+
|
| 175 |
+
# Get the first chunk file
|
| 176 |
+
chunk_path = get_first_chunk_file(data_dir)
|
| 177 |
+
if not chunk_path:
|
| 178 |
+
print(f"No files found in {data_dir}. Please ensure the Dataset directory contains text files.")
|
| 179 |
+
return
|
| 180 |
+
|
| 181 |
+
print(f"Found data directory: {data_dir}")
|
| 182 |
+
print(f"Output directory: {output_dir}")
|
| 183 |
+
|
| 184 |
+
print(f"Testing tokenizer on first 1000 lines of: {chunk_path.name}")
|
| 185 |
+
|
| 186 |
+
# Load the tokenizer
|
| 187 |
+
print(f"Loading tokenizer from: {tokenizer_path}")
|
| 188 |
+
tokenizer = Tokenizer.from_file(str(tokenizer_path))
|
| 189 |
+
|
| 190 |
+
# Get vocabulary info
|
| 191 |
+
vocab = tokenizer.get_vocab()
|
| 192 |
+
print(f"Vocabulary size: {len(vocab):,} tokens")
|
| 193 |
+
|
| 194 |
+
# Test tokenizer on the chunk
|
| 195 |
+
print("\nTesting tokenizer on chunk...")
|
| 196 |
+
results = test_tokenizer_on_chunk(tokenizer, chunk_path, max_lines=1000)
|
| 197 |
+
|
| 198 |
+
# Add timestamp and tokenizer info to results
|
| 199 |
+
import time
|
| 200 |
+
results['timestamp'] = time.strftime("%Y%m%d_%H%M%S")
|
| 201 |
+
results['tokenizer_path'] = str(tokenizer_path)
|
| 202 |
+
results['chunk_file'] = str(chunk_path.name)
|
| 203 |
+
|
| 204 |
+
# Print and save summary
|
| 205 |
+
print_summary(results, output_dir)
|
| 206 |
+
print("\nTest complete!")
|
| 207 |
+
|
| 208 |
+
if __name__ == "__main__":
|
| 209 |
+
main()
|
dist/ez_tokenizer-1.0.0-py3-none-any.whl
ADDED
|
Binary file (17.8 kB). View file
|
|
|
dist/ez_tokenizer-1.0.0.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea6b4315e4faaa4641ac8d1c3103e0911fc8da8455b5310c8f27bac68332fca7
|
| 3 |
+
size 26831
|
examples/README.md
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NexForge Tokenizer Examples
|
| 2 |
+
|
| 3 |
+
This directory contains example scripts demonstrating advanced usage of the NexForge tokenizer.
|
| 4 |
+
|
| 5 |
+
## Quick Start
|
| 6 |
+
|
| 7 |
+
### Basic Tokenizer Creation
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
from nexforgetokenizer import build_tokenizer
|
| 11 |
+
|
| 12 |
+
# Create a tokenizer with default settings
|
| 13 |
+
build_tokenizer(
|
| 14 |
+
input_dir="path/to/your/files",
|
| 15 |
+
output_path="custom_tokenizer.json",
|
| 16 |
+
vocab_size=40000,
|
| 17 |
+
min_frequency=2
|
| 18 |
+
)
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
### Example Scripts
|
| 22 |
+
|
| 23 |
+
1. **Basic Example** (`basic_usage.py`)
|
| 24 |
+
- Simple tokenizer creation and usage
|
| 25 |
+
- Basic encoding/decoding
|
| 26 |
+
- Vocabulary inspection
|
| 27 |
+
|
| 28 |
+
2. **Advanced Usage** (`advanced_usage.py`)
|
| 29 |
+
- Custom special tokens
|
| 30 |
+
- Batch processing
|
| 31 |
+
- Performance optimization
|
| 32 |
+
- Error handling
|
| 33 |
+
|
| 34 |
+
## Running Examples
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
# Install in development mode
|
| 38 |
+
pip install -e .
|
| 39 |
+
|
| 40 |
+
# Run basic example
|
| 41 |
+
python examples/basic_usage.py
|
| 42 |
+
|
| 43 |
+
# Run advanced example
|
| 44 |
+
python examples/advanced_usage.py --input-dir ../Dataset --output my_tokenizer.json
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
## Example: Creating a Custom Tokenizer
|
| 48 |
+
|
| 49 |
+
```python
|
| 50 |
+
from nexforgetokenizer import build_tokenizer
|
| 51 |
+
|
| 52 |
+
# Create a tokenizer with custom settings
|
| 53 |
+
build_tokenizer(
|
| 54 |
+
input_dir="../Dataset",
|
| 55 |
+
output_path="my_tokenizer.json",
|
| 56 |
+
vocab_size=30000, # Smaller vocabulary for specific domain
|
| 57 |
+
min_frequency=3, # Only include tokens appearing at least 3 times
|
| 58 |
+
max_files=1000, # Limit number of files to process
|
| 59 |
+
special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
|
| 60 |
+
)
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## Best Practices
|
| 64 |
+
|
| 65 |
+
1. **For General Use**
|
| 66 |
+
- Use default settings (40k vocab, min_freq=2)
|
| 67 |
+
- Process all files in your dataset
|
| 68 |
+
- Test with the built-in test suite
|
| 69 |
+
|
| 70 |
+
2. **For Specialized Domains**
|
| 71 |
+
- Adjust vocabulary size based on domain complexity
|
| 72 |
+
- Consider increasing min_frequency for smaller vocabularies
|
| 73 |
+
- Test with domain-specific files
|
| 74 |
+
|
| 75 |
+
## Need Help?
|
| 76 |
+
|
| 77 |
+
- Check the [main README](../README.md) for basic usage
|
| 78 |
+
- Review the test cases in `Test_tokenizer/`
|
| 79 |
+
- Open an issue on GitHub for support
|
| 80 |
+
|
| 81 |
+
## License
|
| 82 |
+
|
| 83 |
+
MIT License - See [LICENSE](../LICENSE) for details.
|
examples/advanced_usage.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Advanced usage example for NexForge Tokenizer Builder.
|
| 3 |
+
|
| 4 |
+
This example demonstrates:
|
| 5 |
+
- Custom special tokens
|
| 6 |
+
- Batch processing with progress tracking
|
| 7 |
+
- Vocabulary inspection and analysis
|
| 8 |
+
- Error handling and recovery
|
| 9 |
+
- Performance optimization
|
| 10 |
+
"""
|
| 11 |
+
import os
|
| 12 |
+
import json
|
| 13 |
+
import time
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Dict, List, Optional
|
| 16 |
+
|
| 17 |
+
from tqdm import tqdm
|
| 18 |
+
|
| 19 |
+
# Import the tokenizer components
|
| 20 |
+
from nexforgetokenizer import (
|
| 21 |
+
build_tokenizer,
|
| 22 |
+
SystemResources,
|
| 23 |
+
log_memory_usage,
|
| 24 |
+
TokenizerError
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
def create_large_sample_dataset(num_files: int = 50, base_dir: str = "sample_data") -> Path:
|
| 28 |
+
"""Create a larger sample dataset with different file types."""
|
| 29 |
+
base_path = Path(base_dir)
|
| 30 |
+
|
| 31 |
+
# Clean up if exists
|
| 32 |
+
if base_path.exists():
|
| 33 |
+
import shutil
|
| 34 |
+
shutil.rmtree(base_path)
|
| 35 |
+
|
| 36 |
+
# Create directories
|
| 37 |
+
base_path.mkdir(exist_ok=True)
|
| 38 |
+
|
| 39 |
+
# Create Python files
|
| 40 |
+
for i in range(num_files // 2):
|
| 41 |
+
module_content = f"""
|
| 42 |
+
# Sample Python module {i}
|
| 43 |
+
|
| 44 |
+
def process_data(data):
|
| 45 |
+
'''Process sample data.'''
|
| 46 |
+
result = []
|
| 47 |
+
for item in data:
|
| 48 |
+
if item % 2 == 0:
|
| 49 |
+
result.append(item * 2)
|
| 50 |
+
return result
|
| 51 |
+
"""
|
| 52 |
+
(base_path / f"module_{i}.py").write_text(module_content)
|
| 53 |
+
|
| 54 |
+
# Create text files
|
| 55 |
+
for i in range(num_files // 2):
|
| 56 |
+
doc_content = f"""
|
| 57 |
+
This is sample text document {i}.
|
| 58 |
+
It contains multiple lines of text with various tokens.
|
| 59 |
+
The quick brown fox jumps over the lazy dog.
|
| 60 |
+
Special characters: !@#$%^&*()_+-=[]{{}}|;':\",./<>?
|
| 61 |
+
"""
|
| 62 |
+
(base_path / f"document_{i}.txt").write_text(doc_content)
|
| 63 |
+
|
| 64 |
+
print(f"Created {num_files} sample files in {base_path}")
|
| 65 |
+
return base_path
|
| 66 |
+
|
| 67 |
+
class DataProcessor:
|
| 68 |
+
"""Example data processor class for demonstration."""
|
| 69 |
+
def __init__(self, config: dict):
|
| 70 |
+
self.config = config
|
| 71 |
+
|
| 72 |
+
def run(self):
|
| 73 |
+
"""Run the processor with the current config."""
|
| 74 |
+
print(f"Processing with config: {self.config}")
|
| 75 |
+
|
| 76 |
+
class TokenizerAnalyzer:
|
| 77 |
+
"""Helper class for analyzing tokenizer performance and vocabulary."""
|
| 78 |
+
|
| 79 |
+
def __init__(self, tokenizer_path: str):
|
| 80 |
+
self.tokenizer_path = tokenizer_path
|
| 81 |
+
self.tokenizer = None
|
| 82 |
+
self.vocab = None
|
| 83 |
+
|
| 84 |
+
def load(self):
|
| 85 |
+
"""Load the tokenizer."""
|
| 86 |
+
from tokenizers import Tokenizer
|
| 87 |
+
self.tokenizer = Tokenizer.from_file(self.tokenizer_path)
|
| 88 |
+
self.vocab = {
|
| 89 |
+
idx: self.tokenizer.id_to_token(idx)
|
| 90 |
+
for idx in range(self.tokenizer.get_vocab_size())
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
def analyze_vocab(self, top_n: int = 20):
|
| 94 |
+
"""Analyze and print vocabulary statistics."""
|
| 95 |
+
if not self.tokenizer:
|
| 96 |
+
self.load()
|
| 97 |
+
|
| 98 |
+
vocab_size = len(self.vocab)
|
| 99 |
+
special_tokens = [
|
| 100 |
+
token for token in self.vocab.values()
|
| 101 |
+
if token.startswith("[") and token.endswith("]")
|
| 102 |
+
]
|
| 103 |
+
|
| 104 |
+
print(f"\n=== Vocabulary Analysis ===")
|
| 105 |
+
print(f"Total vocabulary size: {vocab_size}")
|
| 106 |
+
print(f"Special tokens ({len(special_tokens)}): {', '.join(special_tokens[:10])}" +
|
| 107 |
+
("..." if len(special_tokens) > 10 else ""))
|
| 108 |
+
|
| 109 |
+
# Show sample of vocabulary
|
| 110 |
+
print(f"\nSample vocabulary items:")
|
| 111 |
+
for idx in range(min(top_n, vocab_size)):
|
| 112 |
+
print(f" {idx}: {self.vocab.get(idx, 'N/A')}")
|
| 113 |
+
|
| 114 |
+
if vocab_size > top_n:
|
| 115 |
+
print(f" ... and {vocab_size - top_n} more")
|
| 116 |
+
|
| 117 |
+
def main():
|
| 118 |
+
"""Run the advanced example."""
|
| 119 |
+
print("NexForge Tokenizer Builder - Advanced Example")
|
| 120 |
+
print("=========================================\n")
|
| 121 |
+
|
| 122 |
+
# 1. Setup
|
| 123 |
+
output_dir = Path("advanced_output")
|
| 124 |
+
output_dir.mkdir(exist_ok=True)
|
| 125 |
+
|
| 126 |
+
tokenizer_path = output_dir / "advanced_tokenizer.json"
|
| 127 |
+
|
| 128 |
+
# 2. Check system resources
|
| 129 |
+
resources = SystemResources()
|
| 130 |
+
print(f"\n=== System Resources ===")
|
| 131 |
+
print(f"CPU Cores: {resources.cpu_cores}")
|
| 132 |
+
print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
|
| 133 |
+
if resources.has_cuda:
|
| 134 |
+
print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
|
| 135 |
+
else:
|
| 136 |
+
print("No CUDA GPU detected")
|
| 137 |
+
|
| 138 |
+
# 3. Create sample dataset
|
| 139 |
+
print("\n=== Creating Sample Dataset ===")
|
| 140 |
+
dataset_path = create_large_sample_dataset(num_files=50)
|
| 141 |
+
|
| 142 |
+
# 4. Custom special tokens
|
| 143 |
+
special_tokens = [
|
| 144 |
+
"[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
|
| 145 |
+
"[PYTHON]", "[TEXT]", "[CODE]"
|
| 146 |
+
]
|
| 147 |
+
|
| 148 |
+
# 5. Build the tokenizer with advanced options
|
| 149 |
+
print("\n=== Building Tokenizer ===")
|
| 150 |
+
print(f"Input directory: {dataset_path}")
|
| 151 |
+
print(f"Output path: {tokenizer_path}")
|
| 152 |
+
|
| 153 |
+
start_time = time.time()
|
| 154 |
+
|
| 155 |
+
try:
|
| 156 |
+
success = build_tokenizer(
|
| 157 |
+
input_dir=str(dataset_path),
|
| 158 |
+
output_path=str(tokenizer_path),
|
| 159 |
+
vocab_size=5000, # Larger vocabulary for better coverage
|
| 160 |
+
min_frequency=2, # Only include tokens that appear at least twice
|
| 161 |
+
special_tokens=special_tokens,
|
| 162 |
+
resources=resources,
|
| 163 |
+
max_files=50, # Process all files
|
| 164 |
+
chunk_size=100000, # Process in 100KB chunks
|
| 165 |
+
n_threads=max(1, resources.cpu_cores - 1) # Use all but one CPU core
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
if success:
|
| 169 |
+
duration = time.time() - start_time
|
| 170 |
+
print(f"\nTokenizer created successfully in {duration:.2f} seconds")
|
| 171 |
+
print(f"Tokenizer saved to: {tokenizer_path}")
|
| 172 |
+
|
| 173 |
+
# 6. Analyze the created tokenizer
|
| 174 |
+
print("\n=== Tokenizer Analysis ===")
|
| 175 |
+
analyzer = TokenizerAnalyzer(str(tokenizer_path))
|
| 176 |
+
analyzer.load()
|
| 177 |
+
analyzer.analyze_vocab()
|
| 178 |
+
|
| 179 |
+
# 7. Show example encoding/decoding
|
| 180 |
+
print("\n=== Example Encoding/Decoding ===")
|
| 181 |
+
sample_text = "def hello_world():\n print('Hello, world!') # Sample Python code"
|
| 182 |
+
|
| 183 |
+
encoded = analyzer.tokenizer.encode(sample_text)
|
| 184 |
+
decoded = analyzer.tokenizer.decode(encoded.ids)
|
| 185 |
+
|
| 186 |
+
print(f"Original: {sample_text}")
|
| 187 |
+
print(f"Encoded: {encoded.ids}")
|
| 188 |
+
print(f"Tokens: {encoded.tokens}")
|
| 189 |
+
print(f"Decoded: {decoded}")
|
| 190 |
+
|
| 191 |
+
else:
|
| 192 |
+
print("\nFailed to create tokenizer")
|
| 193 |
+
|
| 194 |
+
except TokenizerError as e:
|
| 195 |
+
print(f"\nError creating tokenizer: {e}")
|
| 196 |
+
except Exception as e:
|
| 197 |
+
print(f"\nUnexpected error: {e}")
|
| 198 |
+
finally:
|
| 199 |
+
# 8. Cleanup (optional)
|
| 200 |
+
# import shutil
|
| 201 |
+
# shutil.rmtree(dataset_path, ignore_errors=True)
|
| 202 |
+
pass
|
| 203 |
+
|
| 204 |
+
print("\nExample completed!")
|
| 205 |
+
|
| 206 |
+
if __name__ == "__main__":
|
| 207 |
+
main()
|
examples/basic_usage.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Basic usage example for NexForge Tokenizer Builder."""
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
|
| 6 |
+
from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage
|
| 7 |
+
|
| 8 |
+
def create_sample_code():
|
| 9 |
+
"""Create a sample directory with Python files for testing."""
|
| 10 |
+
# Create a sample directory with Python files
|
| 11 |
+
sample_dir = Path("sample_code")
|
| 12 |
+
|
| 13 |
+
# Clean up if it exists
|
| 14 |
+
if sample_dir.exists():
|
| 15 |
+
import shutil
|
| 16 |
+
shutil.rmtree(sample_dir)
|
| 17 |
+
|
| 18 |
+
# Create directory
|
| 19 |
+
sample_dir.mkdir(exist_ok=True)
|
| 20 |
+
|
| 21 |
+
# Create some sample Python files
|
| 22 |
+
(sample_dir / "hello.py").write_text("""
|
| 23 |
+
def greet(name):
|
| 24 |
+
print(f"Hello, {name}!")
|
| 25 |
+
|
| 26 |
+
if __name__ == "__main__":
|
| 27 |
+
greet("World")
|
| 28 |
+
""")
|
| 29 |
+
|
| 30 |
+
(sample_dir / "math.py").write_text("""
|
| 31 |
+
def add(a, b):
|
| 32 |
+
return a + b
|
| 33 |
+
|
| 34 |
+
def multiply(a, b):
|
| 35 |
+
return a * b
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
|
| 38 |
+
print(f"2 + 3 = {add(2, 3)}")
|
| 39 |
+
print(f"2 * 3 = {multiply(2, 3)}")
|
| 40 |
+
""")
|
| 41 |
+
|
| 42 |
+
return sample_dir
|
| 43 |
+
|
| 44 |
+
def main():
|
| 45 |
+
"""Run the example."""
|
| 46 |
+
print("NexForge Tokenizer Builder Basic Example")
|
| 47 |
+
print("=======================================\n")
|
| 48 |
+
|
| 49 |
+
# Create sample code
|
| 50 |
+
sample_dir = create_sample_code()
|
| 51 |
+
print(f"Created sample code in: {sample_dir}")
|
| 52 |
+
|
| 53 |
+
# Check system resources
|
| 54 |
+
resources = SystemResources()
|
| 55 |
+
print(f"\nDetected System Resources:")
|
| 56 |
+
print(f"CPU Cores: {resources.cpu_cores}")
|
| 57 |
+
print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
|
| 58 |
+
if resources.has_cuda:
|
| 59 |
+
print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
|
| 60 |
+
else:
|
| 61 |
+
print("No CUDA GPU detected")
|
| 62 |
+
|
| 63 |
+
# Create output path for tokenizer
|
| 64 |
+
output_path = "sample_tokenizer.json"
|
| 65 |
+
|
| 66 |
+
# Check initial memory usage
|
| 67 |
+
print("\nInitial memory usage:")
|
| 68 |
+
log_memory_usage()
|
| 69 |
+
|
| 70 |
+
# Build the tokenizer
|
| 71 |
+
print("\nBuilding tokenizer...")
|
| 72 |
+
success = build_tokenizer(
|
| 73 |
+
input_dir=str(sample_dir),
|
| 74 |
+
output_path=output_path,
|
| 75 |
+
vocab_size=1000, # Small vocabulary for this example
|
| 76 |
+
min_frequency=1, # Include all tokens
|
| 77 |
+
resources=resources
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# Check final memory usage
|
| 81 |
+
print("\nFinal memory usage:")
|
| 82 |
+
log_memory_usage()
|
| 83 |
+
|
| 84 |
+
if success:
|
| 85 |
+
print(f"\nTokenizer successfully created at: {output_path}")
|
| 86 |
+
print(f"You can now use this tokenizer with any library that supports the HuggingFace tokenizers format")
|
| 87 |
+
else:
|
| 88 |
+
print("\nFailed to create tokenizer")
|
| 89 |
+
|
| 90 |
+
print("\nExample completed!")
|
| 91 |
+
|
| 92 |
+
if __name__ == "__main__":
|
| 93 |
+
main()
|
pyproject.toml
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=42.0", "setuptools-scm>=3.4"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "ez-tokenizer"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "High-performance tokenizer builder for code and text datasets with adaptive resource management"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.8"
|
| 11 |
+
license = {text = "MIT with Company Restriction"}
|
| 12 |
+
authors = [
|
| 13 |
+
{name = "NexForge", email = "[email protected]"}
|
| 14 |
+
]
|
| 15 |
+
maintainers = [
|
| 16 |
+
{name = "NexForge", email = "[email protected]"}
|
| 17 |
+
]
|
| 18 |
+
classifiers = [
|
| 19 |
+
"Development Status :: 4 - Beta",
|
| 20 |
+
"Intended Audience :: Developers",
|
| 21 |
+
"Intended Audience :: Science/Research",
|
| 22 |
+
"License :: Other/Proprietary License",
|
| 23 |
+
"Programming Language :: Python :: 3.8",
|
| 24 |
+
"Programming Language :: Python :: 3.9",
|
| 25 |
+
"Programming Language :: Python :: 3.10",
|
| 26 |
+
"Programming Language :: Python :: 3.11",
|
| 27 |
+
"Programming Language :: Python :: 3.12",
|
| 28 |
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
| 29 |
+
"Topic :: Text Processing :: Linguistic"
|
| 30 |
+
]
|
| 31 |
+
dependencies = [
|
| 32 |
+
"torch>=1.9.0",
|
| 33 |
+
"tokenizers>=0.12.0",
|
| 34 |
+
"tqdm>=4.62.0",
|
| 35 |
+
"psutil>=5.9.0",
|
| 36 |
+
"python-dateutil>=2.8.2"
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
[project.optional-dependencies]
|
| 40 |
+
dev = [
|
| 41 |
+
"pytest>=6.0",
|
| 42 |
+
"pytest-cov>=2.12.1",
|
| 43 |
+
"pytest-xdist>=2.4.0",
|
| 44 |
+
"black>=21.7b0",
|
| 45 |
+
"isort>=5.0.0",
|
| 46 |
+
"mypy>=0.910",
|
| 47 |
+
"pylint>=2.11.0",
|
| 48 |
+
"pre-commit>=2.15.0"
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
[tool.setuptools]
|
| 52 |
+
include-package-data = true
|
| 53 |
+
package-dir = { "" = "src" }
|
| 54 |
+
|
| 55 |
+
[tool.setuptools.packages.find]
|
| 56 |
+
where = ["src"]
|
| 57 |
+
namespaces = true
|
| 58 |
+
|
| 59 |
+
[tool.black]
|
| 60 |
+
line-length = 88
|
| 61 |
+
target-version = ['py38']
|
| 62 |
+
|
| 63 |
+
[tool.isort]
|
| 64 |
+
profile = "black"
|
| 65 |
+
multi_line_output = 3
|
| 66 |
+
include_trailing_comma = true
|
| 67 |
+
force_grid_wrap = 0
|
| 68 |
+
use_parentheses = true
|
| 69 |
+
ensure_newline_before_comments = true
|
| 70 |
+
|
| 71 |
+
[tool.mypy]
|
| 72 |
+
ignore_missing_imports = true
|
| 73 |
+
disallow_untyped_defs = true
|
| 74 |
+
disallow_incomplete_defs = true
|
| 75 |
+
check_untyped_defs = true
|
| 76 |
+
no_implicit_optional = true
|
| 77 |
+
warn_redundant_casts = true
|
| 78 |
+
warn_unused_ignores = true
|
| 79 |
+
warn_return_any = true
|
| 80 |
+
warn_unreachable = true
|
| 81 |
+
show_error_context = true
|
requirements-dev.txt
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core development dependencies
|
| 2 |
+
-r requirements.txt
|
| 3 |
+
|
| 4 |
+
# Testing
|
| 5 |
+
pytest>=6.0
|
| 6 |
+
pytest-cov>=2.12.1
|
| 7 |
+
pytest-xdist>=2.4.0
|
| 8 |
+
|
| 9 |
+
# Code formatting
|
| 10 |
+
black>=21.7b0
|
| 11 |
+
isort>=5.0.0
|
| 12 |
+
|
| 13 |
+
# Static type checking
|
| 14 |
+
mypy>=0.910
|
| 15 |
+
|
| 16 |
+
# Linting
|
| 17 |
+
pylint>=2.11.0
|
| 18 |
+
|
| 19 |
+
# Version control hooks
|
| 20 |
+
pre-commit>=2.15.0
|
| 21 |
+
|
| 22 |
+
# Optional: For documentation
|
| 23 |
+
# sphinx>=4.0.0
|
| 24 |
+
# sphinx-rtd-theme>=0.5.0
|
| 25 |
+
|
| 26 |
+
# Optional: For notebook development
|
| 27 |
+
# jupyter>=1.0.0
|
| 28 |
+
# ipykernel>=6.0.0
|
requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core Dependencies
|
| 2 |
+
torch>=1.9.0,<3.0.0 # PyTorch for tensor operations
|
| 3 |
+
tokenizers>=0.12.0,<0.15.0 # HuggingFace tokenizers
|
| 4 |
+
tqdm>=4.62.0,<5.0.0 # Progress bars
|
| 5 |
+
psutil>=5.9.0,<6.0.0 # System monitoring
|
| 6 |
+
python-dateutil>=2.8.2,<3.0.0 # Date/time utilities
|
| 7 |
+
|
| 8 |
+
# Optional Dependencies (uncomment if needed)
|
| 9 |
+
# numpy>=1.20.0,<2.0.0 # Required by some tokenizer components
|
| 10 |
+
# pandas>=1.3.0,<3.0.0 # For data manipulation
|
| 11 |
+
# scikit-learn>=1.0.0,<2.0.0 # For evaluation metrics
|
| 12 |
+
|
| 13 |
+
# Version Pinning Examples (for production)
|
| 14 |
+
# torch==2.0.1
|
| 15 |
+
# tokenizers==0.13.3
|
| 16 |
+
# tqdm==4.65.0
|
| 17 |
+
# psutil==5.9.5
|
| 18 |
+
# python-dateutil==2.8.2
|
run_ez_tokenizer.bat
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@echo off
|
| 2 |
+
|
| 3 |
+
:: Set up directory variables first
|
| 4 |
+
set "SCRIPT_DIR=%~dp0"
|
| 5 |
+
set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
|
| 6 |
+
set "CURRENT_DIR=%CD%"
|
| 7 |
+
cd /d "%SCRIPT_DIR%"
|
| 8 |
+
|
| 9 |
+
:: EZ-Tokenizer Launcher with Banner
|
| 10 |
+
:: This script must be run as administrator
|
| 11 |
+
:: Previous versions were known as NexForge Tokenizer
|
| 12 |
+
:: All functionality remains the same, only the name has been updated
|
| 13 |
+
|
| 14 |
+
cls
|
| 15 |
+
|
| 16 |
+
echo.
|
| 17 |
+
echo =======================================================
|
| 18 |
+
echo EZ-TOKENIZER v1.0.0
|
| 19 |
+
echo (CodeGen-NF Model Pre-Release)
|
| 20 |
+
echo =======================================================
|
| 21 |
+
echo Script running from: %SCRIPT_DIR%
|
| 22 |
+
|
| 23 |
+
:check_admin
|
| 24 |
+
net session >nul 2>&1
|
| 25 |
+
if %errorLevel% == 0 (
|
| 26 |
+
echo Running with administrator privileges...
|
| 27 |
+
) else (
|
| 28 |
+
echo ###########################################################
|
| 29 |
+
echo # #
|
| 30 |
+
echo # EZ-TOKENIZER REQUIRES ADMINISTRATOR PRIVILEGES #
|
| 31 |
+
echo # Please right-click and select 'Run as administrator' #
|
| 32 |
+
echo # #
|
| 33 |
+
echo ###########################################################
|
| 34 |
+
echo.
|
| 35 |
+
echo Please right-click on this file and select "Run as administrator"
|
| 36 |
+
pause
|
| 37 |
+
exit /b
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
:menu
|
| 41 |
+
cls
|
| 42 |
+
:: Display banner
|
| 43 |
+
echo N N EEEEE X X FFFFF OOOOO RRRR GGGG EEEEE
|
| 44 |
+
echo NN N E X X F O O R R G E
|
| 45 |
+
echo N N N EEEE X FFFF O O RRRR G GG EEEE
|
| 46 |
+
echo N NN E X X F O O R R G G E
|
| 47 |
+
echo N N EEEEE X X F OOOOO R R GGGG EEEEE
|
| 48 |
+
echo.
|
| 49 |
+
echo PRESENTS:
|
| 50 |
+
echo =======================================================
|
| 51 |
+
echo EZ-TOKENIZER v1.0.0
|
| 52 |
+
echo =======================================================
|
| 53 |
+
:: Display current directory with error checking
|
| 54 |
+
if defined SCRIPT_DIR (
|
| 55 |
+
echo Current directory: %~dp0
|
| 56 |
+
echo Script directory: %~dp0
|
| 57 |
+
) else (
|
| 58 |
+
echo [WARNING] SCRIPT_DIR not defined. Using current directory: %CD%
|
| 59 |
+
set "SCRIPT_DIR=%CD%"
|
| 60 |
+
)
|
| 61 |
+
echo.
|
| 62 |
+
echo MINIMUM REQUIREMENTS:
|
| 63 |
+
echo - Python 3.8 or higher
|
| 64 |
+
echo - 4GB RAM minimum (8GB+ recommended)
|
| 65 |
+
echo - 1GB free disk space
|
| 66 |
+
|
| 67 |
+
echo.
|
| 68 |
+
echo DATASET INFORMATION:
|
| 69 |
+
echo - Dataset location: %SCRIPT_DIR%\Dataset\
|
| 70 |
+
echo - Please add your dataset files to the directory or use 4. Open Dataset Directory and insert your files.
|
| 71 |
+
|
| 72 |
+
echo.
|
| 73 |
+
echo MENU:
|
| 74 |
+
echo 1. Install Dependencies
|
| 75 |
+
echo 2. Create Tokenizer (50k vocab, min_freq=2)
|
| 76 |
+
echo 3. Test Tokenizer (2 runs with 10,000 samples)
|
| 77 |
+
echo 4. Open Dataset Directory
|
| 78 |
+
echo 5. Exit
|
| 79 |
+
echo.
|
| 80 |
+
set /p choice=Enter your choice (1-5):
|
| 81 |
+
|
| 82 |
+
echo.
|
| 83 |
+
|
| 84 |
+
if "%choice%"=="1" goto install_deps
|
| 85 |
+
if "%choice%"=="2" goto create_tokenizer
|
| 86 |
+
if "%choice%"=="3" goto test_tokenizer
|
| 87 |
+
if "%choice%"=="4" goto open_dataset
|
| 88 |
+
if "%choice%"=="5" goto exit
|
| 89 |
+
|
| 90 |
+
echo Invalid choice. Please enter a number between 1 and 5.
|
| 91 |
+
pause
|
| 92 |
+
goto menu
|
| 93 |
+
|
| 94 |
+
:install_deps
|
| 95 |
+
echo Installing dependencies...
|
| 96 |
+
echo This may take a few minutes...
|
| 97 |
+
echo.
|
| 98 |
+
|
| 99 |
+
:: Create virtual environment if it doesn't exist
|
| 100 |
+
if not exist "%SCRIPT_DIR%\venv" (
|
| 101 |
+
echo Creating virtual environment...
|
| 102 |
+
python -m venv "%SCRIPT_DIR%\venv"
|
| 103 |
+
if errorlevel 1 (
|
| 104 |
+
echo Failed to create virtual environment
|
| 105 |
+
pause
|
| 106 |
+
goto menu
|
| 107 |
+
)
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
:: Activate virtual environment and install dependencies
|
| 111 |
+
call "%SCRIPT_DIR%\venv\Scripts\activate"
|
| 112 |
+
|
| 113 |
+
:: Upgrade pip first
|
| 114 |
+
echo [INFO] Upgrading pip...
|
| 115 |
+
python -m pip install --upgrade pip
|
| 116 |
+
if errorlevel 1 (
|
| 117 |
+
echo [ERROR] Failed to upgrade pip
|
| 118 |
+
pause
|
| 119 |
+
goto menu
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
:: Install PyTorch CPU version
|
| 123 |
+
echo [INFO] Installing PyTorch CPU version...
|
| 124 |
+
pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
|
| 125 |
+
if errorlevel 1 (
|
| 126 |
+
echo [WARNING] Failed to install specific PyTorch version, trying latest compatible version...
|
| 127 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
|
| 128 |
+
if errorlevel 1 (
|
| 129 |
+
echo [ERROR] Failed to install PyTorch
|
| 130 |
+
echo [INFO] Please check your internet connection and try again
|
| 131 |
+
pause
|
| 132 |
+
goto menu
|
| 133 |
+
)
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
:: Install other dependencies one by one
|
| 137 |
+
echo [INFO] Installing additional dependencies...
|
| 138 |
+
pip install tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
|
| 139 |
+
if errorlevel 1 (
|
| 140 |
+
echo [WARNING] Failed to install some dependencies, trying with --no-cache-dir...
|
| 141 |
+
pip install --no-cache-dir tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
|
| 142 |
+
if errorlevel 1 (
|
| 143 |
+
echo [ERROR] Failed to install additional dependencies
|
| 144 |
+
pause
|
| 145 |
+
goto menu
|
| 146 |
+
)
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
:: Install tokenizers with pre-built wheel
|
| 150 |
+
echo [INFO] Installing tokenizers...
|
| 151 |
+
pip install tokenizers==0.21.1 --only-binary :all:
|
| 152 |
+
if errorlevel 1 (
|
| 153 |
+
echo [WARNING] Could not install tokenizers with pre-built wheel
|
| 154 |
+
echo [INFO] Trying alternative installation method...
|
| 155 |
+
pip install tokenizers==0.21.1 --no-deps
|
| 156 |
+
if errorlevel 1 (
|
| 157 |
+
echo [ERROR] Failed to install tokenizers
|
| 158 |
+
echo Note: This package requires a C++ build toolchain or a pre-built wheel.
|
| 159 |
+
echo On Windows, you may need to install Visual Studio Build Tools with C++ workload.
|
| 160 |
+
pause
|
| 161 |
+
goto menu
|
| 162 |
+
)
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
echo.
|
| 166 |
+
echo [INFO] All dependencies installed successfully!
|
| 167 |
+
|
| 168 |
+
echo [INFO] Installing nexforgetokenizer in development mode...
|
| 169 |
+
python -m pip install -e .
|
| 170 |
+
if errorlevel 1 (
|
| 171 |
+
echo [ERROR] Failed to install nexforgetokenizer in development mode
|
| 172 |
+
pause
|
| 173 |
+
goto menu
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
echo [INFO] Package installation complete!
|
| 177 |
+
pause
|
| 178 |
+
goto menu
|
| 179 |
+
|
| 180 |
+
:create_tokenizer
|
| 181 |
+
if not exist "%SCRIPT_DIR%\venv" (
|
| 182 |
+
echo Virtual environment not found. Please install dependencies first.
|
| 183 |
+
pause
|
| 184 |
+
goto menu
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
call "%SCRIPT_DIR%\venv\Scripts\activate"
|
| 188 |
+
|
| 189 |
+
:: Create output directory if it doesn't exist
|
| 190 |
+
if not exist "%SCRIPT_DIR%\output" mkdir "%SCRIPT_DIR%\output"
|
| 191 |
+
|
| 192 |
+
:: Check if dataset directory exists
|
| 193 |
+
if not exist "%SCRIPT_DIR%\Dataset" (
|
| 194 |
+
echo Creating Dataset directory...
|
| 195 |
+
mkdir "%SCRIPT_DIR%\Dataset"
|
| 196 |
+
echo Please add your dataset files to: %SCRIPT_DIR%\Dataset
|
| 197 |
+
pause
|
| 198 |
+
start "" "%SCRIPT_DIR%\Dataset"
|
| 199 |
+
goto menu
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
:: Check if there are any files in the Dataset directory
|
| 203 |
+
dir /b "%SCRIPT_DIR%\Dataset\*.*" >nul 2>&1
|
| 204 |
+
if %ERRORLEVEL% NEQ 0 (
|
| 205 |
+
echo No files found in: %SCRIPT_DIR%\Dataset
|
| 206 |
+
echo Please add your dataset files to this directory.
|
| 207 |
+
pause
|
| 208 |
+
start "" "%SCRIPT_DIR%\Dataset"
|
| 209 |
+
goto menu
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
echo Creating EZ-Tokenizer with 50k vocabulary and min_freq=2 (all files)...
|
| 213 |
+
python -m nexforgetokenizer.adaptive_tokenizer "%SCRIPT_DIR%\Dataset" "%SCRIPT_DIR%\output\tokenizer.json" 50000 2 MAX
|
| 214 |
+
|
| 215 |
+
if errorlevel 1 (
|
| 216 |
+
echo Failed to create tokenizer
|
| 217 |
+
pause
|
| 218 |
+
goto menu
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
echo.
|
| 222 |
+
echo EZ-Tokenizer created successfully at: %SCRIPT_DIR%\output\tokenizer.json
|
| 223 |
+
echo Vocabulary size: 50,000
|
| 224 |
+
echo Minimum frequency: 2
|
| 225 |
+
echo Processed all available files in the dataset
|
| 226 |
+
echo.
|
| 227 |
+
echo You can now use this tokenizer in your projects by loading: output\tokenizer.json
|
| 228 |
+
pause
|
| 229 |
+
goto menu
|
| 230 |
+
|
| 231 |
+
:test_tokenizer
|
| 232 |
+
if not exist "%SCRIPT_DIR%\venv" (
|
| 233 |
+
echo Virtual environment not found. Please install dependencies first.
|
| 234 |
+
pause
|
| 235 |
+
goto menu
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
call "%SCRIPT_DIR%\venv\Scripts\activate"
|
| 239 |
+
|
| 240 |
+
:: Create test_result directory if it doesn't exist
|
| 241 |
+
if not exist "%SCRIPT_DIR%\test_result" mkdir "%SCRIPT_DIR%\test_result"
|
| 242 |
+
|
| 243 |
+
:: Check if tokenizer exists
|
| 244 |
+
if not exist "%SCRIPT_DIR%\output\tokenizer.json" (
|
| 245 |
+
echo EZ-Tokenizer not found. Please create a tokenizer first.
|
| 246 |
+
echo Looking for: %SCRIPT_DIR%\output\tokenizer.json
|
| 247 |
+
pause
|
| 248 |
+
goto menu
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
echo Running test with 10,000 samples...
|
| 252 |
+
echo Testing EZ-Tokenizer with 10,000 samples...
|
| 253 |
+
python "%SCRIPT_DIR%\Test_tokenizer\test_tokenizer.py" --tokenizer "%SCRIPT_DIR%\output\tokenizer.json" --input "%SCRIPT_DIR%\Dataset" --sample 10000 --output "%SCRIPT_DIR%\test_result\test_run.txt"
|
| 254 |
+
|
| 255 |
+
if errorlevel 1 (
|
| 256 |
+
echo Test run failed
|
| 257 |
+
pause
|
| 258 |
+
goto menu
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
+
echo.
|
| 262 |
+
echo Both test runs completed successfully!
|
| 263 |
+
echo Results saved to: %SCRIPT_DIR%\test_result\
|
| 264 |
+
|
| 265 |
+
:: Open the test results directory
|
| 266 |
+
if exist "%SCRIPT_DIR%\test_result\" (
|
| 267 |
+
start "" "%SCRIPT_DIR%\test_result\"
|
| 268 |
+
) else (
|
| 269 |
+
echo Warning: Test results directory not found.
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
pause
|
| 273 |
+
goto menu
|
| 274 |
+
|
| 275 |
+
:open_dataset
|
| 276 |
+
if not exist "%SCRIPT_DIR%\Dataset" (
|
| 277 |
+
mkdir "%SCRIPT_DIR%\Dataset"
|
| 278 |
+
)
|
| 279 |
+
start "" "%SCRIPT_DIR%\Dataset"
|
| 280 |
+
goto menu
|
| 281 |
+
|
| 282 |
+
:exit
|
| 283 |
+
cd /d "%CURRENT_DIR%"
|
| 284 |
+
echo Exiting NexForge Tokenizer Manager...
|
| 285 |
+
timeout /t 2 >nul
|
| 286 |
+
exit
|
setup.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Setup script for NexForge Tokenizer Builder."""
|
| 2 |
+
|
| 3 |
+
from setuptools import setup, find_packages
|
| 4 |
+
|
| 5 |
+
with open("README.md", "r", encoding="utf-8") as fh:
|
| 6 |
+
long_description = fh.read()
|
| 7 |
+
|
| 8 |
+
setup(
|
| 9 |
+
name="nexforgetokenizer",
|
| 10 |
+
version="0.1.0",
|
| 11 |
+
author="NexForge Team",
|
| 12 |
+
description="High-performance tool for creating Python code tokenizers with adaptive resource management",
|
| 13 |
+
long_description=long_description,
|
| 14 |
+
long_description_content_type="text/markdown",
|
| 15 |
+
url="https://github.com/nexforge/nexforgetokenizer",
|
| 16 |
+
package_dir={"": "src"},
|
| 17 |
+
packages=find_packages(where="src"),
|
| 18 |
+
python_requires=">=3.8",
|
| 19 |
+
install_requires=[
|
| 20 |
+
"torch>=1.9.0",
|
| 21 |
+
"tokenizers>=0.12.0",
|
| 22 |
+
"tqdm>=4.62.0",
|
| 23 |
+
"psutil>=5.9.0",
|
| 24 |
+
"numpy>=1.20.0", # Optional but recommended for improved performance
|
| 25 |
+
],
|
| 26 |
+
extras_require={
|
| 27 |
+
"dev": [
|
| 28 |
+
"pytest>=6.0",
|
| 29 |
+
"black>=21.7b0",
|
| 30 |
+
"isort>=5.0.0",
|
| 31 |
+
"mypy>=0.910",
|
| 32 |
+
"pylint>=2.11.0",
|
| 33 |
+
],
|
| 34 |
+
},
|
| 35 |
+
classifiers=[
|
| 36 |
+
"Programming Language :: Python :: 3",
|
| 37 |
+
"License :: Other/Proprietary License",
|
| 38 |
+
"Operating System :: OS Independent",
|
| 39 |
+
"Intended Audience :: Developers",
|
| 40 |
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
| 41 |
+
"Topic :: Text Processing :: Linguistic",
|
| 42 |
+
],
|
| 43 |
+
)
|
src/ez_tokenizer.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: ez-tokenizer
|
| 3 |
+
Version: 1.0.0
|
| 4 |
+
Summary: High-performance tokenizer builder for code and text datasets with adaptive resource management
|
| 5 |
+
Home-page: https://github.com/nexforge/nexforgetokenizer
|
| 6 |
+
Author: NexForge Team
|
| 7 |
+
Author-email: NexForge <[email protected]>
|
| 8 |
+
Maintainer-email: NexForge <[email protected]>
|
| 9 |
+
License: MIT with Company Restriction
|
| 10 |
+
Classifier: Development Status :: 4 - Beta
|
| 11 |
+
Classifier: Intended Audience :: Developers
|
| 12 |
+
Classifier: Intended Audience :: Science/Research
|
| 13 |
+
Classifier: License :: Other/Proprietary License
|
| 14 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 15 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 16 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 17 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 19 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
| 20 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
| 21 |
+
Requires-Python: >=3.8
|
| 22 |
+
Description-Content-Type: text/markdown
|
| 23 |
+
License-File: LICENSE
|
| 24 |
+
Requires-Dist: torch>=1.9.0
|
| 25 |
+
Requires-Dist: tokenizers>=0.12.0
|
| 26 |
+
Requires-Dist: tqdm>=4.62.0
|
| 27 |
+
Requires-Dist: psutil>=5.9.0
|
| 28 |
+
Requires-Dist: python-dateutil>=2.8.2
|
| 29 |
+
Provides-Extra: dev
|
| 30 |
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
| 31 |
+
Requires-Dist: pytest-cov>=2.12.1; extra == "dev"
|
| 32 |
+
Requires-Dist: pytest-xdist>=2.4.0; extra == "dev"
|
| 33 |
+
Requires-Dist: black>=21.7b0; extra == "dev"
|
| 34 |
+
Requires-Dist: isort>=5.0.0; extra == "dev"
|
| 35 |
+
Requires-Dist: mypy>=0.910; extra == "dev"
|
| 36 |
+
Requires-Dist: pylint>=2.11.0; extra == "dev"
|
| 37 |
+
Requires-Dist: pre-commit>=2.15.0; extra == "dev"
|
| 38 |
+
Dynamic: author
|
| 39 |
+
Dynamic: home-page
|
| 40 |
+
Dynamic: license-file
|
| 41 |
+
Dynamic: requires-python
|
| 42 |
+
|
| 43 |
+
# EZ-Tokenizer
|
| 44 |
+
|
| 45 |
+
A high-performance tool for creating custom tokenizers from your code or text datasets. Automatically adapts to your system resources while providing fine-grained control over tokenizer creation.
|
| 46 |
+
|
| 47 |
+
> **Note**: This project was previously known as NexForge Tokenizer. All functionality remains the same, only the name has been updated to better reflect its ease of use and efficiency.
|
| 48 |
+
|
| 49 |
+
## 📄 License
|
| 50 |
+
|
| 51 |
+
EZ-Tokenizer is released under the MIT License with a company restriction clause. This means:
|
| 52 |
+
|
| 53 |
+
- 🆓 **Free for everyone**: Individuals and small businesses can use EZ-Tokenizer for free
|
| 54 |
+
- 🏢 **Commercial use**: Companies with more than 10 employees or $1M+ in annual revenue need a commercial license
|
| 55 |
+
- 📝 **Full details**: See [LICENSE](LICENSE) for complete terms
|
| 56 |
+
|
| 57 |
+
## Quick Start with Batch File (Recommended for Most Users)
|
| 58 |
+
|
| 59 |
+
### Prerequisites
|
| 60 |
+
- Windows OS
|
| 61 |
+
- Python 3.8 or higher installed
|
| 62 |
+
- Administrator privileges
|
| 63 |
+
- At least 4GB RAM (8GB+ recommended)
|
| 64 |
+
|
| 65 |
+
### Getting Started
|
| 66 |
+
|
| 67 |
+
1. **Download** the latest release or clone this repository
|
| 68 |
+
2. **Add your dataset**: Place training files in the `Dataset` directory
|
| 69 |
+
- Supported formats: `.txt`, `.py`, and other text files
|
| 70 |
+
- The system will process all compatible files in this directory
|
| 71 |
+
3. **Run as Administrator**: Right-click on `run_ez_tokenizer.bat` and select "Run as administrator"
|
| 72 |
+
4. **Follow the Menu**:
|
| 73 |
+
- Option 1: Install Dependencies (first time only)
|
| 74 |
+
- Option 2: Create Tokenizer (processes all files in Dataset directory)
|
| 75 |
+
- Option 3: Test Tokenizer (after creation)
|
| 76 |
+
- Option 4: Open Dataset Directory (to add/check files)
|
| 77 |
+
- Option 5: Exit
|
| 78 |
+
|
| 79 |
+
### Default Tokenizer Settings
|
| 80 |
+
- **Vocabulary Size**: 50,000 tokens
|
| 81 |
+
- **Minimum Frequency**: 2 (includes tokens appearing at least twice)
|
| 82 |
+
- **File Processing**: All files in Dataset directory
|
| 83 |
+
- **Output**: `output/ez_tokenizer.json`
|
| 84 |
+
- **Test Results**: `test_result/test_run.txt`
|
| 85 |
+
|
| 86 |
+
### For Advanced Users
|
| 87 |
+
Customize tokenizer creation by running manually:
|
| 88 |
+
```bash
|
| 89 |
+
python -m ez_tokenizer.adaptive_tokenizer [input_dir] [output_path] [vocab_size] [min_frequency] [max_files]
|
| 90 |
+
```
|
| 91 |
+
Example:
|
| 92 |
+
```bash
|
| 93 |
+
python -m ez_tokenizer.adaptive_tokenizer "Dataset" "output/custom_tokenizer.json" 50000 2 1000
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
## Advanced Usage (Manual Setup)
|
| 99 |
+
|
| 100 |
+
For users who need more control or are using non-Windows systems:
|
| 101 |
+
|
| 102 |
+
## Features
|
| 103 |
+
|
| 104 |
+
- **Adaptive Resource Management**: Automatically detects and utilizes available system resources (CPU, RAM, GPU)
|
| 105 |
+
- **Progressive Processing**: Processes files in chunks to handle datasets larger than available memory
|
| 106 |
+
- **Smart Batching**: Dynamically adjusts batch sizes based on available resources
|
| 107 |
+
- **Efficient Memory Usage**: Implements memory conservation strategies for optimal performance
|
| 108 |
+
- **High Performance**: Processes over 300,000 tokens per second on average hardware
|
| 109 |
+
- **Perfect Reconstruction**: 100% accuracy in round-trip encoding/decoding
|
| 110 |
+
- **Optimal Compression**: Achieves ~3.5 characters per token, exceeding industry standards
|
| 111 |
+
- 🛠️ **Extensible**: Advanced users can customize all parameters
|
| 112 |
+
- ✅ **Tested**: Built-in testing to verify tokenizer quality
|
| 113 |
+
|
| 114 |
+
## Quick Start
|
| 115 |
+
|
| 116 |
+
### Installation
|
| 117 |
+
|
| 118 |
+
```bash
|
| 119 |
+
# Install from source
|
| 120 |
+
git clone https://github.com/yourusername/ez_tokenizer.git
|
| 121 |
+
cd ez_tokenizer
|
| 122 |
+
pip install -e .
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
### Basic Usage
|
| 126 |
+
|
| 127 |
+
#### Command Line Interface
|
| 128 |
+
|
| 129 |
+
```bash
|
| 130 |
+
# Basic usage
|
| 131 |
+
python -m ez_tokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json
|
| 132 |
+
|
| 133 |
+
# With custom parameters
|
| 134 |
+
python -m ez_tokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json 50000 2
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
## Complete Usage Guide
|
| 138 |
+
|
| 139 |
+
### Command Line Arguments
|
| 140 |
+
|
| 141 |
+
```bash
|
| 142 |
+
python -m ez_tokenizer.adaptive_tokenizer <input_path> <output_path> [vocab_size] [min_frequency]
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
- **input_path**: Path to file or directory containing training data
|
| 146 |
+
- **output_path**: Where to save the tokenizer (should end with .json)
|
| 147 |
+
- **vocab_size** (optional, default=40000): Target vocabulary size
|
| 148 |
+
- **min_frequency** (optional, default=2): Minimum token occurrence count
|
| 149 |
+
|
| 150 |
+
### Python API
|
| 151 |
+
|
| 152 |
+
```python
|
| 153 |
+
from ez_tokenizer import build_tokenizer
|
| 154 |
+
|
| 155 |
+
# Basic usage
|
| 156 |
+
build_tokenizer(
|
| 157 |
+
input_dir="path/to/your/files",
|
| 158 |
+
output_path="output/tokenizer.json"
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
# Advanced usage
|
| 162 |
+
build_tokenizer(
|
| 163 |
+
input_dir="path/to/your/files",
|
| 164 |
+
output_path="output/tokenizer.json",
|
| 165 |
+
vocab_size=50000, # Larger vocabulary for specialized domains
|
| 166 |
+
min_frequency=2, # Only include tokens appearing at least this many times
|
| 167 |
+
chunk_size=1000000, # Characters to process at once
|
| 168 |
+
n_threads=4 # Number of threads to use
|
| 169 |
+
)
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
## Best Practices
|
| 173 |
+
|
| 174 |
+
### Recommended Settings
|
| 175 |
+
|
| 176 |
+
#### For Most Users
|
| 177 |
+
- **Vocabulary Size**: 40,000 (default)
|
| 178 |
+
- Balanced between coverage and performance
|
| 179 |
+
- Works well for most programming languages and natural language
|
| 180 |
+
- **Minimum Frequency**: 2 (default)
|
| 181 |
+
- Includes tokens that appear at least twice
|
| 182 |
+
- Good balance between vocabulary size and token quality
|
| 183 |
+
|
| 184 |
+
#### For Specialized Use Cases
|
| 185 |
+
- **Larger Vocabularies (50k+)**
|
| 186 |
+
- Only needed for very diverse codebases
|
| 187 |
+
- Requires more system resources
|
| 188 |
+
- **Higher Minimum Frequency**
|
| 189 |
+
- Use 3-5 for smaller vocabularies
|
| 190 |
+
- Reduces vocabulary size while maintaining quality
|
| 191 |
+
|
| 192 |
+
#### Processing Large Datasets
|
| 193 |
+
- The batch file automatically handles large datasets
|
| 194 |
+
- Processes files in memory-efficient chunks
|
| 195 |
+
- Can be interrupted and resumed if needed
|
| 196 |
+
|
| 197 |
+
### Input Data
|
| 198 |
+
|
| 199 |
+
- Supports `.txt`, `.py`, and other text-based formats
|
| 200 |
+
- Handles both files and directories
|
| 201 |
+
- Automatically filters binary files
|
| 202 |
+
|
| 203 |
+
### Performance Tips
|
| 204 |
+
|
| 205 |
+
- For large datasets (>1GB), use chunking
|
| 206 |
+
- On multi-core systems, increase thread count
|
| 207 |
+
- Monitor memory usage with large vocabularies
|
| 208 |
+
|
| 209 |
+
## Testing Your Tokenizer
|
| 210 |
+
|
| 211 |
+
After creating your tokenizer, use the built-in test function:
|
| 212 |
+
|
| 213 |
+
1. From the batch menu, select "Test Tokenizer"
|
| 214 |
+
2. The system will:
|
| 215 |
+
- Test with 10,000 random samples
|
| 216 |
+
- Measure tokenization speed (typically >300k tokens/sec)
|
| 217 |
+
- Verify 100% round-trip accuracy
|
| 218 |
+
- Generate a detailed performance report
|
| 219 |
+
# Custom test with specific sample size
|
| 220 |
+
python Test_tokenizer\test_tokenizer.py \
|
| 221 |
+
--tokenizer output/Nexforge_tokenizer.json \
|
| 222 |
+
--input Dataset \
|
| 223 |
+
--sample 20000 \
|
| 224 |
+
--output test_result/detailed_test.txt
|
| 225 |
+
```
|
| 226 |
+
|
| 227 |
+
### Test Output Includes
|
| 228 |
+
- Tokenization success rate
|
| 229 |
+
- Sample encoded/decoded text
|
| 230 |
+
- Basic statistics (vocab size, special tokens)
|
| 231 |
+
- Any encoding/decoding errors
|
| 232 |
+
|
| 233 |
+
## Troubleshooting
|
| 234 |
+
|
| 235 |
+
### Common Issues
|
| 236 |
+
|
| 237 |
+
1. **Out of Memory**
|
| 238 |
+
- Reduce chunk size
|
| 239 |
+
- Close other memory-intensive applications
|
| 240 |
+
- Use a smaller vocabulary
|
| 241 |
+
|
| 242 |
+
2. **Slow Processing**
|
| 243 |
+
- Increase thread count
|
| 244 |
+
- Process in smaller batches
|
| 245 |
+
- Check for system resource constraints
|
| 246 |
+
|
| 247 |
+
3. **Vocabulary Too Large**
|
| 248 |
+
- Increase min_frequency
|
| 249 |
+
- Use a smaller vocab_size
|
| 250 |
+
- Pre-filter your dataset
|
| 251 |
+
|
| 252 |
+
## Performance & Resource Usage
|
| 253 |
+
|
| 254 |
+
The tokenizer is optimized to work efficiently across different hardware configurations:
|
| 255 |
+
|
| 256 |
+
### System Requirements
|
| 257 |
+
- **Minimum**: 4GB RAM, 2-core CPU
|
| 258 |
+
- **Recommended**: 8GB+ RAM, 4+ core CPU
|
| 259 |
+
- **Disk Space**: At least 1GB free (more for large datasets)
|
| 260 |
+
|
| 261 |
+
### Expected Performance
|
| 262 |
+
- **Memory Usage**: Typically stays under 2GB for most datasets
|
| 263 |
+
- **CPU Utilization**: Deliberately capped to prevent system slowdown
|
| 264 |
+
- **Processing Speed**: Varies by system, but generally processes:
|
| 265 |
+
- Small datasets (100MB): 1-5 minutes
|
| 266 |
+
- Medium datasets (1GB): 10-30 minutes
|
| 267 |
+
- Large datasets (10GB+): 1-3 hours
|
| 268 |
+
|
| 269 |
+
### Monitoring
|
| 270 |
+
- The batch file shows progress updates
|
| 271 |
+
- Check Task Manager for real-time resource usage
|
| 272 |
+
- Process can be safely interrupted (CTRL+C) and resumed
|
| 273 |
+
|
| 274 |
+
## Examples
|
| 275 |
+
|
| 276 |
+
See the `examples/` directory for:
|
| 277 |
+
- Training on specific programming languages
|
| 278 |
+
- Fine-tuning pre-trained tokenizers
|
| 279 |
+
- Batch processing large datasets
|
| 280 |
+
|
| 281 |
+
## Contributing
|
| 282 |
+
|
| 283 |
+
Contributions are welcome! Here's how to get started:
|
| 284 |
+
|
| 285 |
+
1. Fork the repository
|
| 286 |
+
2. Create a new branch
|
| 287 |
+
3. Make your changes
|
| 288 |
+
4. Run tests: `pytest`
|
| 289 |
+
5. Submit a pull request
|
| 290 |
+
|
| 291 |
+
## License
|
| 292 |
+
|
| 293 |
+
MIT License - see [LICENSE](LICENSE) for details.
|
src/ez_tokenizer.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LICENSE
|
| 2 |
+
MANIFEST.in
|
| 3 |
+
README.md
|
| 4 |
+
pyproject.toml
|
| 5 |
+
requirements.txt
|
| 6 |
+
setup.py
|
| 7 |
+
examples/README.md
|
| 8 |
+
examples/advanced_usage.py
|
| 9 |
+
examples/basic_usage.py
|
| 10 |
+
src/ez_tokenizer.egg-info/PKG-INFO
|
| 11 |
+
src/ez_tokenizer.egg-info/SOURCES.txt
|
| 12 |
+
src/ez_tokenizer.egg-info/dependency_links.txt
|
| 13 |
+
src/ez_tokenizer.egg-info/requires.txt
|
| 14 |
+
src/ez_tokenizer.egg-info/top_level.txt
|
| 15 |
+
src/nexforgetokenizer/__init__.py
|
| 16 |
+
src/nexforgetokenizer/adaptive_tokenizer.py
|
| 17 |
+
src/nexforgetokenizer/resources.py
|
| 18 |
+
src/nexforgetokenizer/data/__init__.py
|
| 19 |
+
tests/test_adaptive_tokenizer.py
|
src/ez_tokenizer.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/ez_tokenizer.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=1.9.0
|
| 2 |
+
tokenizers>=0.12.0
|
| 3 |
+
tqdm>=4.62.0
|
| 4 |
+
psutil>=5.9.0
|
| 5 |
+
python-dateutil>=2.8.2
|
| 6 |
+
|
| 7 |
+
[dev]
|
| 8 |
+
pytest>=6.0
|
| 9 |
+
pytest-cov>=2.12.1
|
| 10 |
+
pytest-xdist>=2.4.0
|
| 11 |
+
black>=21.7b0
|
| 12 |
+
isort>=5.0.0
|
| 13 |
+
mypy>=0.910
|
| 14 |
+
pylint>=2.11.0
|
| 15 |
+
pre-commit>=2.15.0
|
src/ez_tokenizer.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
nexforgetokenizer
|
src/nexforgetokenizer.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: nexforgetokenizer
|
| 3 |
+
Version: 0.2.0
|
| 4 |
+
Summary: High-performance tokenizer builder for code and text datasets
|
| 5 |
+
Home-page: https://github.com/nexforge/nexforgetokenizer
|
| 6 |
+
Author: NexForge Team
|
| 7 |
+
Author-email: Jean-Michel Talbot <[email protected]>
|
| 8 |
+
Maintainer-email: NexForge Team <[email protected]>
|
| 9 |
+
License: Proprietary
|
| 10 |
+
Classifier: Development Status :: 4 - Beta
|
| 11 |
+
Classifier: Intended Audience :: Developers
|
| 12 |
+
Classifier: Intended Audience :: Science/Research
|
| 13 |
+
Classifier: License :: Other/Proprietary License
|
| 14 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 15 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 16 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 17 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 19 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
| 20 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
| 21 |
+
Requires-Python: >=3.8
|
| 22 |
+
Description-Content-Type: text/markdown
|
| 23 |
+
License-File: LICENSE
|
| 24 |
+
Requires-Dist: torch>=1.9.0
|
| 25 |
+
Requires-Dist: tokenizers>=0.12.0
|
| 26 |
+
Requires-Dist: tqdm>=4.62.0
|
| 27 |
+
Requires-Dist: psutil>=5.9.0
|
| 28 |
+
Requires-Dist: python-dateutil>=2.8.2
|
| 29 |
+
Provides-Extra: dev
|
| 30 |
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
| 31 |
+
Requires-Dist: pytest-cov>=2.12.1; extra == "dev"
|
| 32 |
+
Requires-Dist: pytest-xdist>=2.4.0; extra == "dev"
|
| 33 |
+
Requires-Dist: black>=21.7b0; extra == "dev"
|
| 34 |
+
Requires-Dist: isort>=5.0.0; extra == "dev"
|
| 35 |
+
Requires-Dist: mypy>=0.910; extra == "dev"
|
| 36 |
+
Requires-Dist: pylint>=2.11.0; extra == "dev"
|
| 37 |
+
Requires-Dist: pre-commit>=2.15.0; extra == "dev"
|
| 38 |
+
Dynamic: author
|
| 39 |
+
Dynamic: home-page
|
| 40 |
+
Dynamic: license-file
|
| 41 |
+
Dynamic: requires-python
|
| 42 |
+
|
| 43 |
+
# NexForge Tokenizer Builder
|
| 44 |
+
|
| 45 |
+
A high-performance tool for creating custom tokenizers from your code or text datasets. Automatically adapts to your system resources while providing fine-grained control over tokenizer creation.
|
| 46 |
+
|
| 47 |
+
## Quick Start with Batch File (Recommended for Most Users)
|
| 48 |
+
|
| 49 |
+
### Prerequisites
|
| 50 |
+
- Windows OS
|
| 51 |
+
- Python 3.8 or higher installed
|
| 52 |
+
- Administrator privileges
|
| 53 |
+
- At least 4GB RAM (8GB+ recommended)
|
| 54 |
+
|
| 55 |
+
### Getting Started
|
| 56 |
+
|
| 57 |
+
1. **Download** the latest release or clone this repository
|
| 58 |
+
2. **Add your dataset**: Place training files in the `Dataset` directory
|
| 59 |
+
- Supported formats: `.txt`, `.py`, and other text files
|
| 60 |
+
- The system will process all compatible files in this directory
|
| 61 |
+
3. **Run as Administrator**: Right-click on `run_nexforge.bat` and select "Run as administrator"
|
| 62 |
+
4. **Follow the Menu**:
|
| 63 |
+
- Option 1: Install Dependencies (first time only)
|
| 64 |
+
- Option 2: Create Tokenizer (processes all files in Dataset directory)
|
| 65 |
+
- Option 3: Test Tokenizer (after creation)
|
| 66 |
+
- Option 4: Open Dataset Directory (to add/check files)
|
| 67 |
+
- Option 5: Exit
|
| 68 |
+
|
| 69 |
+
### Default Tokenizer Settings
|
| 70 |
+
- **Vocabulary Size**: 40,000 tokens
|
| 71 |
+
- **Minimum Frequency**: 2 (includes tokens appearing at least twice)
|
| 72 |
+
- **File Processing**: All files in Dataset directory
|
| 73 |
+
- **Output**: `output/Nexforge_tokenizer.json`
|
| 74 |
+
- **Test Results**: `test_result/test_run.txt`
|
| 75 |
+
|
| 76 |
+
### For Advanced Users
|
| 77 |
+
Customize tokenizer creation by running manually:
|
| 78 |
+
```bash
|
| 79 |
+
python -m nexforgetokenizer.adaptive_tokenizer [input_dir] [output_path] [vocab_size] [min_frequency] [max_files]
|
| 80 |
+
```
|
| 81 |
+
Example:
|
| 82 |
+
```bash
|
| 83 |
+
python -m nexforgetokenizer.adaptive_tokenizer "Dataset" "output/custom_tokenizer.json" 50000 2 1000
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
---
|
| 87 |
+
|
| 88 |
+
## Advanced Usage (Manual Setup)
|
| 89 |
+
|
| 90 |
+
For users who need more control or are using non-Windows systems:
|
| 91 |
+
|
| 92 |
+
## Features
|
| 93 |
+
|
| 94 |
+
- 🚀 **One-Click Setup**: Create optimized tokenizers with a single click
|
| 95 |
+
- ⚡ **Resource Efficient**: Automatically adapts to your system's capabilities
|
| 96 |
+
- 🧠 **Smart Defaults**: 40k vocabulary with min_freq=2 for optimal coverage
|
| 97 |
+
- 🔄 **Batch Processing**: Process all files in your dataset directory
|
| 98 |
+
- 📊 **Memory Safe**: Processes large datasets without memory issues
|
| 99 |
+
- 🛠️ **Extensible**: Advanced users can customize all parameters
|
| 100 |
+
- ✅ **Tested**: Built-in testing to verify tokenizer quality
|
| 101 |
+
|
| 102 |
+
## Quick Start
|
| 103 |
+
|
| 104 |
+
### Installation
|
| 105 |
+
|
| 106 |
+
```bash
|
| 107 |
+
# Install from source
|
| 108 |
+
git clone https://github.com/yourusername/nexforgetokenizer.git
|
| 109 |
+
cd nexforgetokenizer
|
| 110 |
+
pip install -e .
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
### Basic Usage
|
| 114 |
+
|
| 115 |
+
#### Command Line Interface
|
| 116 |
+
|
| 117 |
+
```bash
|
| 118 |
+
# Basic usage
|
| 119 |
+
python -m nexforgetokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json
|
| 120 |
+
|
| 121 |
+
# With custom parameters
|
| 122 |
+
python -m nexforgetokenizer.adaptive_tokenizer path/to/your/files output/tokenizer.json 50000 2
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
## Complete Usage Guide
|
| 126 |
+
|
| 127 |
+
### Command Line Arguments
|
| 128 |
+
|
| 129 |
+
```bash
|
| 130 |
+
python -m nexforgetokenizer.adaptive_tokenizer <input_path> <output_path> [vocab_size] [min_frequency]
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
- **input_path**: Path to file or directory containing training data
|
| 134 |
+
- **output_path**: Where to save the tokenizer (should end with .json)
|
| 135 |
+
- **vocab_size** (optional, default=40000): Target vocabulary size
|
| 136 |
+
- **min_frequency** (optional, default=2): Minimum token occurrence count
|
| 137 |
+
|
| 138 |
+
### Python API
|
| 139 |
+
|
| 140 |
+
```python
|
| 141 |
+
from nexforgetokenizer import build_tokenizer
|
| 142 |
+
|
| 143 |
+
# Basic usage
|
| 144 |
+
build_tokenizer(
|
| 145 |
+
input_dir="path/to/your/files",
|
| 146 |
+
output_path="output/tokenizer.json"
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
# Advanced usage
|
| 150 |
+
build_tokenizer(
|
| 151 |
+
input_dir="path/to/your/files",
|
| 152 |
+
output_path="output/tokenizer.json",
|
| 153 |
+
vocab_size=50000, # Larger vocabulary for specialized domains
|
| 154 |
+
min_frequency=2, # Only include tokens appearing at least this many times
|
| 155 |
+
chunk_size=1000000, # Characters to process at once
|
| 156 |
+
n_threads=4 # Number of threads to use
|
| 157 |
+
)
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
## Best Practices
|
| 161 |
+
|
| 162 |
+
### Recommended Settings
|
| 163 |
+
|
| 164 |
+
#### For Most Users
|
| 165 |
+
- **Vocabulary Size**: 40,000 (default)
|
| 166 |
+
- Balanced between coverage and performance
|
| 167 |
+
- Works well for most programming languages and natural language
|
| 168 |
+
- **Minimum Frequency**: 2 (default)
|
| 169 |
+
- Includes tokens that appear at least twice
|
| 170 |
+
- Good balance between vocabulary size and token quality
|
| 171 |
+
|
| 172 |
+
#### For Specialized Use Cases
|
| 173 |
+
- **Larger Vocabularies (50k+)**
|
| 174 |
+
- Only needed for very diverse codebases
|
| 175 |
+
- Requires more system resources
|
| 176 |
+
- **Higher Minimum Frequency**
|
| 177 |
+
- Use 3-5 for smaller vocabularies
|
| 178 |
+
- Reduces vocabulary size while maintaining quality
|
| 179 |
+
|
| 180 |
+
#### Processing Large Datasets
|
| 181 |
+
- The batch file automatically handles large datasets
|
| 182 |
+
- Processes files in memory-efficient chunks
|
| 183 |
+
- Can be interrupted and resumed if needed
|
| 184 |
+
|
| 185 |
+
### Input Data
|
| 186 |
+
|
| 187 |
+
- Supports `.txt`, `.py`, and other text-based formats
|
| 188 |
+
- Handles both files and directories
|
| 189 |
+
- Automatically filters binary files
|
| 190 |
+
|
| 191 |
+
### Performance Tips
|
| 192 |
+
|
| 193 |
+
- For large datasets (>1GB), use chunking
|
| 194 |
+
- On multi-core systems, increase thread count
|
| 195 |
+
- Monitor memory usage with large vocabularies
|
| 196 |
+
|
| 197 |
+
## Testing Your Tokenizer
|
| 198 |
+
|
| 199 |
+
After creating your tokenizer, use the built-in test function:
|
| 200 |
+
|
| 201 |
+
1. From the batch menu, select "Test Tokenizer"
|
| 202 |
+
2. The system will:
|
| 203 |
+
- Test with 10,000 random samples
|
| 204 |
+
- Generate a test report in `test_result/test_run.txt`
|
| 205 |
+
- Show basic statistics about the tokenizer
|
| 206 |
+
|
| 207 |
+
For advanced testing, run manually:
|
| 208 |
+
```bash
|
| 209 |
+
# Basic test with default settings
|
| 210 |
+
python Test_tokenizer\test_tokenizer.py --tokenizer output/Nexforge_tokenizer.json
|
| 211 |
+
|
| 212 |
+
# Custom test with specific sample size
|
| 213 |
+
python Test_tokenizer\test_tokenizer.py \
|
| 214 |
+
--tokenizer output/Nexforge_tokenizer.json \
|
| 215 |
+
--input Dataset \
|
| 216 |
+
--sample 20000 \
|
| 217 |
+
--output test_result/detailed_test.txt
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
### Test Output Includes
|
| 221 |
+
- Tokenization success rate
|
| 222 |
+
- Sample encoded/decoded text
|
| 223 |
+
- Basic statistics (vocab size, special tokens)
|
| 224 |
+
- Any encoding/decoding errors
|
| 225 |
+
|
| 226 |
+
## Troubleshooting
|
| 227 |
+
|
| 228 |
+
### Common Issues
|
| 229 |
+
|
| 230 |
+
1. **Out of Memory**
|
| 231 |
+
- Reduce chunk size
|
| 232 |
+
- Close other memory-intensive applications
|
| 233 |
+
- Use a smaller vocabulary
|
| 234 |
+
|
| 235 |
+
2. **Slow Processing**
|
| 236 |
+
- Increase thread count
|
| 237 |
+
- Process in smaller batches
|
| 238 |
+
- Check for system resource constraints
|
| 239 |
+
|
| 240 |
+
3. **Vocabulary Too Large**
|
| 241 |
+
- Increase min_frequency
|
| 242 |
+
- Use a smaller vocab_size
|
| 243 |
+
- Pre-filter your dataset
|
| 244 |
+
|
| 245 |
+
## Performance & Resource Usage
|
| 246 |
+
|
| 247 |
+
The tokenizer is optimized to work efficiently across different hardware configurations:
|
| 248 |
+
|
| 249 |
+
### System Requirements
|
| 250 |
+
- **Minimum**: 4GB RAM, 2-core CPU
|
| 251 |
+
- **Recommended**: 8GB+ RAM, 4+ core CPU
|
| 252 |
+
- **Disk Space**: At least 1GB free (more for large datasets)
|
| 253 |
+
|
| 254 |
+
### Expected Performance
|
| 255 |
+
- **Memory Usage**: Typically stays under 2GB for most datasets
|
| 256 |
+
- **CPU Utilization**: Deliberately capped to prevent system slowdown
|
| 257 |
+
- **Processing Speed**: Varies by system, but generally processes:
|
| 258 |
+
- Small datasets (100MB): 1-5 minutes
|
| 259 |
+
- Medium datasets (1GB): 10-30 minutes
|
| 260 |
+
- Large datasets (10GB+): 1-3 hours
|
| 261 |
+
|
| 262 |
+
### Monitoring
|
| 263 |
+
- The batch file shows progress updates
|
| 264 |
+
- Check Task Manager for real-time resource usage
|
| 265 |
+
- Process can be safely interrupted (CTRL+C) and resumed
|
| 266 |
+
|
| 267 |
+
## Examples
|
| 268 |
+
|
| 269 |
+
See the `examples/` directory for:
|
| 270 |
+
- Training on specific programming languages
|
| 271 |
+
- Fine-tuning pre-trained tokenizers
|
| 272 |
+
- Batch processing large datasets
|
| 273 |
+
|
| 274 |
+
## Contributing
|
| 275 |
+
|
| 276 |
+
Contributions are welcome! Here's how to get started:
|
| 277 |
+
|
| 278 |
+
1. Fork the repository
|
| 279 |
+
2. Create a new branch
|
| 280 |
+
3. Make your changes
|
| 281 |
+
4. Run tests: `pytest`
|
| 282 |
+
5. Submit a pull request
|
| 283 |
+
|
| 284 |
+
## License
|
| 285 |
+
|
| 286 |
+
MIT License - see [LICENSE](LICENSE) for details.
|
src/nexforgetokenizer.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LICENSE
|
| 2 |
+
MANIFEST.in
|
| 3 |
+
README.md
|
| 4 |
+
pyproject.toml
|
| 5 |
+
requirements.txt
|
| 6 |
+
setup.py
|
| 7 |
+
examples/README.md
|
| 8 |
+
examples/advanced_usage.py
|
| 9 |
+
examples/basic_usage.py
|
| 10 |
+
src/nexforgetokenizer/__init__.py
|
| 11 |
+
src/nexforgetokenizer/adaptive_tokenizer.py
|
| 12 |
+
src/nexforgetokenizer/resources.py
|
| 13 |
+
src/nexforgetokenizer.egg-info/PKG-INFO
|
| 14 |
+
src/nexforgetokenizer.egg-info/SOURCES.txt
|
| 15 |
+
src/nexforgetokenizer.egg-info/dependency_links.txt
|
| 16 |
+
src/nexforgetokenizer.egg-info/requires.txt
|
| 17 |
+
src/nexforgetokenizer.egg-info/top_level.txt
|
| 18 |
+
src/nexforgetokenizer/data/__init__.py
|
| 19 |
+
tests/test_adaptive_tokenizer.py
|
src/nexforgetokenizer.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/nexforgetokenizer.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=1.9.0
|
| 2 |
+
tokenizers>=0.12.0
|
| 3 |
+
tqdm>=4.62.0
|
| 4 |
+
psutil>=5.9.0
|
| 5 |
+
python-dateutil>=2.8.2
|
| 6 |
+
|
| 7 |
+
[dev]
|
| 8 |
+
pytest>=6.0
|
| 9 |
+
pytest-cov>=2.12.1
|
| 10 |
+
pytest-xdist>=2.4.0
|
| 11 |
+
black>=21.7b0
|
| 12 |
+
isort>=5.0.0
|
| 13 |
+
mypy>=0.910
|
| 14 |
+
pylint>=2.11.0
|
| 15 |
+
pre-commit>=2.15.0
|
src/nexforgetokenizer.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
nexforgetokenizer
|
src/nexforgetokenizer/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""EZ-Tokenizer - High-performance Python code tokenizer with adaptive resource management.
|
| 2 |
+
|
| 3 |
+
Features:
|
| 4 |
+
- Efficient tokenization of code and text
|
| 5 |
+
- Adaptive resource management
|
| 6 |
+
- Support for large datasets
|
| 7 |
+
- Custom vocabulary generation
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
__version__ = "1.0.0"
|
| 11 |
+
__author__ = "EZ-Tokenizer Team"
|
| 12 |
+
__all__ = [
|
| 13 |
+
"SystemResources",
|
| 14 |
+
"log_memory_usage",
|
| 15 |
+
"manage_ram",
|
| 16 |
+
"build_tokenizer"
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
# Lazy imports to prevent circular imports
|
| 20 |
+
def __getattr__(name):
|
| 21 |
+
if name == 'SystemResources':
|
| 22 |
+
from .resources import SystemResources
|
| 23 |
+
return SystemResources
|
| 24 |
+
elif name in ('log_memory_usage', 'manage_ram', 'build_tokenizer'):
|
| 25 |
+
from .adaptive_tokenizer import log_memory_usage, manage_ram, build_tokenizer
|
| 26 |
+
if name == 'log_memory_usage':
|
| 27 |
+
return log_memory_usage
|
| 28 |
+
elif name == 'manage_ram':
|
| 29 |
+
return manage_ram
|
| 30 |
+
elif name == 'build_tokenizer':
|
| 31 |
+
return build_tokenizer
|
| 32 |
+
|
| 33 |
+
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
|
src/nexforgetokenizer/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (1.1 kB). View file
|
|
|
src/nexforgetokenizer/__pycache__/adaptive_tokenizer.cpython-313.pyc
ADDED
|
Binary file (31.4 kB). View file
|
|
|
src/nexforgetokenizer/__pycache__/resources.cpython-313.pyc
ADDED
|
Binary file (6.54 kB). View file
|
|
|
src/nexforgetokenizer/adaptive_tokenizer.py
ADDED
|
@@ -0,0 +1,705 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""EZ-Tokenizer: Adaptive tokenizer creation for Python code with hardware optimization.
|
| 2 |
+
|
| 3 |
+
This script creates a high-performance ByteLevel BPE tokenizer specifically optimized for code,
|
| 4 |
+
with automatic adaptation to available system resources (RAM, CPU, GPU). It efficiently scales
|
| 5 |
+
from low-end systems (2 cores, 4GB RAM) to high-end workstations while maintaining perfect
|
| 6 |
+
reconstruction accuracy and high throughput.
|
| 7 |
+
|
| 8 |
+
Key Features:
|
| 9 |
+
- 100% reconstruction accuracy
|
| 10 |
+
- ~3.5 characters per token (exceeding industry standards)
|
| 11 |
+
- Adaptive resource management
|
| 12 |
+
- Memory-efficient processing of large datasets
|
| 13 |
+
- Support for mixed code and text content
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import time
|
| 18 |
+
import glob
|
| 19 |
+
import logging
|
| 20 |
+
import sys
|
| 21 |
+
import gc
|
| 22 |
+
import traceback
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
from concurrent.futures import ProcessPoolExecutor
|
| 25 |
+
import psutil
|
| 26 |
+
from typing import Dict, List, Optional, Tuple, Union, Any, NamedTuple
|
| 27 |
+
|
| 28 |
+
# Try to use CUDA if available
|
| 29 |
+
import torch
|
| 30 |
+
|
| 31 |
+
# Local imports
|
| 32 |
+
from .resources import SystemResources
|
| 33 |
+
|
| 34 |
+
# Third-party tokenizer dependencies
|
| 35 |
+
from tokenizers import Tokenizer
|
| 36 |
+
from tokenizers.models import BPE
|
| 37 |
+
from tokenizers.trainers import BpeTrainer
|
| 38 |
+
from tokenizers.pre_tokenizers import ByteLevel
|
| 39 |
+
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
|
| 40 |
+
|
| 41 |
+
# Configure logging
|
| 42 |
+
logging.basicConfig(
|
| 43 |
+
level=logging.INFO,
|
| 44 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 45 |
+
handlers=[
|
| 46 |
+
logging.StreamHandler(),
|
| 47 |
+
logging.FileHandler('tokenizer.log')
|
| 48 |
+
]
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
# SystemResources class moved to resources.py to fix circular import warning
|
| 52 |
+
|
| 53 |
+
def log_memory_usage():
|
| 54 |
+
"""Log current RAM and GPU memory usage."""
|
| 55 |
+
process = psutil.Process()
|
| 56 |
+
ram_usage = process.memory_info().rss / (1024 * 1024 * 1024) # GB
|
| 57 |
+
ram_percent = psutil.virtual_memory().percent
|
| 58 |
+
available_ram = psutil.virtual_memory().available / (1024 * 1024 * 1024) # GB
|
| 59 |
+
total_ram = psutil.virtual_memory().total / (1024 * 1024 * 1024) # GB
|
| 60 |
+
logging.info(f"RAM: {ram_usage:.2f} GB used, {available_ram:.2f} GB available ({ram_percent}% used of {total_ram:.1f} GB total)")
|
| 61 |
+
|
| 62 |
+
if torch.cuda.is_available():
|
| 63 |
+
for i in range(torch.cuda.device_count()):
|
| 64 |
+
allocated = torch.cuda.memory_allocated(i) / (1024 * 1024 * 1024) # GB
|
| 65 |
+
cached = torch.cuda.memory_reserved(i) / (1024 * 1024 * 1024) # GB
|
| 66 |
+
logging.info(f"CUDA Device {i}: {allocated:.2f} GB allocated, {cached:.2f} GB cached")
|
| 67 |
+
|
| 68 |
+
def manage_ram(aggressive: bool = False):
|
| 69 |
+
"""Perform RAM-specific memory management and garbage collection.
|
| 70 |
+
|
| 71 |
+
Args:
|
| 72 |
+
aggressive: If True, performs more thorough memory cleanup operations
|
| 73 |
+
"""
|
| 74 |
+
# Record memory before cleanup
|
| 75 |
+
before_ram = psutil.virtual_memory().percent
|
| 76 |
+
before_process = psutil.Process().memory_info().rss / (1024 * 1024 * 1024) # GB
|
| 77 |
+
|
| 78 |
+
# Run standard garbage collection first
|
| 79 |
+
gc.collect()
|
| 80 |
+
|
| 81 |
+
if aggressive:
|
| 82 |
+
# Force the most thorough collection possible
|
| 83 |
+
for _ in range(2): # Multiple passes
|
| 84 |
+
for i in range(3): # All generations 0, 1, 2
|
| 85 |
+
gc.collect(i)
|
| 86 |
+
|
| 87 |
+
# More aggressive memory management for critical situations
|
| 88 |
+
try:
|
| 89 |
+
# Clear any traceback objects which can hold references
|
| 90 |
+
traceback.clear_frames(sys.exc_info()[2])
|
| 91 |
+
|
| 92 |
+
# Emergency measures for severe memory pressure
|
| 93 |
+
import builtins
|
| 94 |
+
for name in list(builtins.__dict__.keys()):
|
| 95 |
+
if name.startswith('__') and name.endswith('__'):
|
| 96 |
+
continue # Skip special builtins
|
| 97 |
+
if not isinstance(builtins.__dict__[name], type):
|
| 98 |
+
continue # Skip non-types
|
| 99 |
+
# Clear type caches which can hold memory
|
| 100 |
+
if hasattr(builtins.__dict__[name], '__dict__') and '__cache__' in builtins.__dict__[name].__dict__:
|
| 101 |
+
builtins.__dict__[name].__dict__['__cache__'].clear()
|
| 102 |
+
|
| 103 |
+
# Force a compaction of freed memory back to the system
|
| 104 |
+
gc.collect()
|
| 105 |
+
|
| 106 |
+
# On Windows, explicitly request memory compaction from OS
|
| 107 |
+
if sys.platform.startswith('win'):
|
| 108 |
+
try:
|
| 109 |
+
import ctypes
|
| 110 |
+
ctypes.windll.kernel32.SetProcessWorkingSetSize(-1, -1)
|
| 111 |
+
except Exception as e:
|
| 112 |
+
logging.debug(f"Failed to compact Windows memory: {e}")
|
| 113 |
+
except Exception as e:
|
| 114 |
+
logging.warning(f"Error during aggressive memory cleanup: {e}")
|
| 115 |
+
|
| 116 |
+
# Calculate and log memory freed
|
| 117 |
+
after_ram = psutil.virtual_memory().percent
|
| 118 |
+
after_process = psutil.Process().memory_info().rss / (1024 * 1024 * 1024) # GB
|
| 119 |
+
freed_gb = before_process - after_process
|
| 120 |
+
|
| 121 |
+
if freed_gb > 0.01: # If we freed a noticeable amount
|
| 122 |
+
logging.info(f"Memory cleaned: {freed_gb:.2f} GB freed, RAM usage {before_ram}% → {after_ram}%")
|
| 123 |
+
|
| 124 |
+
# Return True if we successfully freed memory
|
| 125 |
+
return freed_gb > 0
|
| 126 |
+
|
| 127 |
+
def cleanup_cuda(force: bool = False):
|
| 128 |
+
"""Perform CUDA memory cleanup with garbage collection."""
|
| 129 |
+
# Run RAM cleanup first
|
| 130 |
+
manage_ram(aggressive=force)
|
| 131 |
+
|
| 132 |
+
# Then handle CUDA if available
|
| 133 |
+
if not torch.cuda.is_available():
|
| 134 |
+
return
|
| 135 |
+
|
| 136 |
+
try:
|
| 137 |
+
# Clear CUDA cache
|
| 138 |
+
torch.cuda.empty_cache()
|
| 139 |
+
|
| 140 |
+
if force:
|
| 141 |
+
# Force synchronize CUDA
|
| 142 |
+
torch.cuda.synchronize()
|
| 143 |
+
|
| 144 |
+
# On aggressive cleanup, try to clear everything
|
| 145 |
+
for i in range(torch.cuda.device_count()):
|
| 146 |
+
torch.cuda.synchronize(i)
|
| 147 |
+
except Exception as e:
|
| 148 |
+
logging.warning(f"Error during CUDA cleanup: {e}")
|
| 149 |
+
|
| 150 |
+
def process_file(file_path):
|
| 151 |
+
"""Process a single file to extract its content."""
|
| 152 |
+
try:
|
| 153 |
+
# Get file size for logging
|
| 154 |
+
file_size = os.path.getsize(file_path)
|
| 155 |
+
logging.info(f"Processing file: {os.path.basename(file_path)} (Size: {file_size} bytes)")
|
| 156 |
+
|
| 157 |
+
# Read file content
|
| 158 |
+
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
|
| 159 |
+
content = f.read()
|
| 160 |
+
|
| 161 |
+
if not content:
|
| 162 |
+
logging.warning(f"File {file_path} is empty")
|
| 163 |
+
else:
|
| 164 |
+
logging.info(f"Successfully read {len(content)} characters from {os.path.basename(file_path)}")
|
| 165 |
+
|
| 166 |
+
return content, file_size, True
|
| 167 |
+
except Exception as e:
|
| 168 |
+
logging.error(f"Error processing file {file_path}: {e}", exc_info=True)
|
| 169 |
+
return "", 0, False
|
| 170 |
+
|
| 171 |
+
def write_texts_to_disk(texts, file_path, max_chars_per_text=5000):
|
| 172 |
+
"""Write text data to disk to free up memory.
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
texts (list): List of text entries to save
|
| 176 |
+
file_path (str): Path to save the data
|
| 177 |
+
max_chars_per_text (int): Maximum characters to save per text entry
|
| 178 |
+
|
| 179 |
+
Returns:
|
| 180 |
+
bool: True if successful, False otherwise
|
| 181 |
+
"""
|
| 182 |
+
try:
|
| 183 |
+
with open(file_path, 'w', encoding='utf-8', errors='replace') as f:
|
| 184 |
+
for text in texts:
|
| 185 |
+
# Limit each text to prevent huge files
|
| 186 |
+
f.write(text[:max_chars_per_text] + '\n---END_ENTRY---\n')
|
| 187 |
+
return True
|
| 188 |
+
except Exception as e:
|
| 189 |
+
logging.error(f"Error writing texts to disk: {e}")
|
| 190 |
+
return False
|
| 191 |
+
|
| 192 |
+
def read_texts_from_disk(file_path):
|
| 193 |
+
"""Read text data from disk file.
|
| 194 |
+
|
| 195 |
+
Args:
|
| 196 |
+
file_path (str): Path to read data from
|
| 197 |
+
|
| 198 |
+
Returns:
|
| 199 |
+
list: List of text entries read from file
|
| 200 |
+
"""
|
| 201 |
+
try:
|
| 202 |
+
texts = []
|
| 203 |
+
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
|
| 204 |
+
current_text = ""
|
| 205 |
+
for line in f:
|
| 206 |
+
if line.strip() == "---END_ENTRY---":
|
| 207 |
+
texts.append(current_text)
|
| 208 |
+
current_text = ""
|
| 209 |
+
else:
|
| 210 |
+
current_text += line
|
| 211 |
+
if current_text: # Add the last entry if file doesn't end with marker
|
| 212 |
+
texts.append(current_text)
|
| 213 |
+
return texts
|
| 214 |
+
except Exception as e:
|
| 215 |
+
logging.error(f"Error reading texts from disk: {e}")
|
| 216 |
+
return []
|
| 217 |
+
|
| 218 |
+
def build_tokenizer(input_dir, output_path, vocab_size=40000, min_frequency=2, max_files=None, resources=None, temp_dir=None):
|
| 219 |
+
"""Build a tokenizer directly from Python code files with adaptive resource management.
|
| 220 |
+
|
| 221 |
+
This function automatically adapts to the available system resources, scaling its
|
| 222 |
+
processing based on available RAM, CPU cores, and GPU capabilities. It implements
|
| 223 |
+
extreme memory conservation strategies to prevent OOM crashes.
|
| 224 |
+
|
| 225 |
+
Features:
|
| 226 |
+
- Progressive file loading (smallest files first)
|
| 227 |
+
- Memory monitoring with emergency intervention
|
| 228 |
+
- Disk offloading for memory pressure relief
|
| 229 |
+
- Dynamic chunk sizing with retry mechanisms
|
| 230 |
+
- Text truncation for oversized entries
|
| 231 |
+
|
| 232 |
+
Args:
|
| 233 |
+
input_dir (str): Directory containing Python code files (*.txt)
|
| 234 |
+
output_path (str): Path where to save the tokenizer JSON file
|
| 235 |
+
vocab_size (int, optional): Size of vocabulary to generate. Defaults to 40000.
|
| 236 |
+
min_frequency (int, optional): Minimum frequency threshold for tokens. Defaults to 2.
|
| 237 |
+
max_files (int, optional): Maximum number of files to process. If None, determined automatically.
|
| 238 |
+
resources (SystemResources, optional): Pre-detected system resources. If None, resources
|
| 239 |
+
will be automatically detected.
|
| 240 |
+
|
| 241 |
+
Returns:
|
| 242 |
+
bool: True if tokenizer was successfully created and saved, False otherwise
|
| 243 |
+
"""
|
| 244 |
+
start_time = time.time()
|
| 245 |
+
|
| 246 |
+
# Detect system resources if not provided
|
| 247 |
+
if resources is None:
|
| 248 |
+
resources = SystemResources()
|
| 249 |
+
|
| 250 |
+
try:
|
| 251 |
+
# Monitor system resources
|
| 252 |
+
log_memory_usage() # Initial memory benchmark
|
| 253 |
+
|
| 254 |
+
# Get all text files in directory
|
| 255 |
+
if os.path.isfile(input_dir):
|
| 256 |
+
# If input is a single file, use it directly
|
| 257 |
+
files = [input_dir]
|
| 258 |
+
logging.info(f"Processing single file: {input_dir}")
|
| 259 |
+
else:
|
| 260 |
+
# If input is a directory, get all .txt files
|
| 261 |
+
files = glob.glob(os.path.join(input_dir, "*.txt"))
|
| 262 |
+
logging.info(f"Found {len(files)} files in {input_dir}")
|
| 263 |
+
|
| 264 |
+
if not files:
|
| 265 |
+
logging.error(f"No files found in {input_dir}")
|
| 266 |
+
return False
|
| 267 |
+
|
| 268 |
+
# Sort files by size (smallest first) to allow progressive loading
|
| 269 |
+
try:
|
| 270 |
+
files = sorted(files, key=lambda f: os.path.getsize(f))
|
| 271 |
+
logging.info("Files sorted by size (processing smallest files first)")
|
| 272 |
+
except Exception as e:
|
| 273 |
+
logging.warning(f"Unable to sort files by size: {e}")
|
| 274 |
+
|
| 275 |
+
# Adaptive file processing based on available memory
|
| 276 |
+
process = psutil.Process()
|
| 277 |
+
|
| 278 |
+
# Analyze a few sample files to get a better estimate of average file size
|
| 279 |
+
sample_count = min(10, len(files))
|
| 280 |
+
if sample_count > 0:
|
| 281 |
+
sample_sizes = []
|
| 282 |
+
for i in range(sample_count):
|
| 283 |
+
try:
|
| 284 |
+
file_size = os.path.getsize(files[i]) / (1024 * 1024) # MB
|
| 285 |
+
sample_sizes.append(file_size)
|
| 286 |
+
except Exception:
|
| 287 |
+
pass
|
| 288 |
+
|
| 289 |
+
avg_file_size_estimate = 5 # Default fallback value in MB
|
| 290 |
+
if sample_sizes:
|
| 291 |
+
avg_file_size_estimate = sum(sample_sizes) / len(sample_sizes)
|
| 292 |
+
logging.info(f"Average file size based on {len(sample_sizes)} samples: {avg_file_size_estimate:.2f} MB")
|
| 293 |
+
else:
|
| 294 |
+
avg_file_size_estimate = 5 # MB per file (default estimate)
|
| 295 |
+
|
| 296 |
+
# Calculate safe file count based on resources
|
| 297 |
+
# Use a portion of available RAM, determined by our resources multiplier
|
| 298 |
+
safe_file_count = min(
|
| 299 |
+
len(files),
|
| 300 |
+
int(resources.available_ram_gb * 1024 / avg_file_size_estimate * resources.max_files_multiplier)
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
# EXTREME MEMORY CONSERVATION: Much more conservative file limits
|
| 304 |
+
# Even for high-RAM systems, we'll process fewer files at once after OOM testing
|
| 305 |
+
if resources.total_ram_gb >= 32: # Even for very high RAM systems
|
| 306 |
+
max_files_multiplier = 0.3 # 1/3 of previous value
|
| 307 |
+
elif resources.total_ram_gb >= 16:
|
| 308 |
+
max_files_multiplier = 0.2 # Less than half of previous value
|
| 309 |
+
else:
|
| 310 |
+
max_files_multiplier = 0.1 # Very conservative for lower RAM
|
| 311 |
+
|
| 312 |
+
max_files_cap = max(3, int(resources.total_ram_gb * max_files_multiplier))
|
| 313 |
+
safe_file_count = min(safe_file_count, max_files_cap)
|
| 314 |
+
|
| 315 |
+
# Set an absolute maximum number of files regardless of RAM if max_files not specified
|
| 316 |
+
default_max_files = 10 # Default hard limit to prevent OOM
|
| 317 |
+
|
| 318 |
+
# Apply user-specified max_files if provided, otherwise use calculated safe limit
|
| 319 |
+
if max_files is not None:
|
| 320 |
+
if max_files == float('inf'):
|
| 321 |
+
logging.info("Processing ALL files in dataset (MAX mode)")
|
| 322 |
+
safe_file_count = len(files) # Use all available files
|
| 323 |
+
else:
|
| 324 |
+
logging.info(f"User specified max_files: {max_files}")
|
| 325 |
+
safe_file_count = min(len(files), max_files)
|
| 326 |
+
else:
|
| 327 |
+
safe_file_count = min(safe_file_count, default_max_files)
|
| 328 |
+
|
| 329 |
+
# Ensure we process at least one file
|
| 330 |
+
safe_file_count = max(1, safe_file_count)
|
| 331 |
+
|
| 332 |
+
logging.info(f"Processing up to {safe_file_count} files based on available memory of {resources.available_ram_gb:.2f} GB")
|
| 333 |
+
# Use subset of files to match our determined safe count
|
| 334 |
+
files = files[:safe_file_count]
|
| 335 |
+
|
| 336 |
+
all_texts = []
|
| 337 |
+
total_chars = 0
|
| 338 |
+
|
| 339 |
+
# Use smaller batches for initial processing to gauge memory impact
|
| 340 |
+
initial_batch_size = max(1, resources.batch_size // 2)
|
| 341 |
+
logging.info(f"Starting with conservative batch size of {initial_batch_size}")
|
| 342 |
+
|
| 343 |
+
# Create batches with adaptive batch size - start with smaller batches
|
| 344 |
+
batch_size = initial_batch_size
|
| 345 |
+
batches = [files[i:i+batch_size] for i in range(0, len(files), batch_size)]
|
| 346 |
+
|
| 347 |
+
for batch_idx, batch in enumerate(batches):
|
| 348 |
+
batch_texts = []
|
| 349 |
+
|
| 350 |
+
# Use optimized worker count
|
| 351 |
+
with ProcessPoolExecutor(max_workers=resources.max_workers) as executor:
|
| 352 |
+
results = list(executor.map(process_file, batch))
|
| 353 |
+
|
| 354 |
+
for content, size, success in results:
|
| 355 |
+
if success and content:
|
| 356 |
+
# MEMORY PROTECTION: Limit the size of any individual text entry
|
| 357 |
+
# This prevents single massive files from causing OOM
|
| 358 |
+
if len(content) > resources.max_text_chunk_size:
|
| 359 |
+
logging.warning(f"Truncating oversized text: {len(content)} chars -> {resources.max_text_chunk_size} chars")
|
| 360 |
+
content = content[:resources.max_text_chunk_size]
|
| 361 |
+
|
| 362 |
+
batch_texts.append(content)
|
| 363 |
+
total_chars += len(content)
|
| 364 |
+
|
| 365 |
+
logging.info(f"Batch {batch_idx+1}/{len(batches)}: Processed {len(batch)} files - {total_chars:,} total characters")
|
| 366 |
+
|
| 367 |
+
all_texts.extend(batch_texts)
|
| 368 |
+
|
| 369 |
+
# EMERGENCY MEMORY CHECK: Verify we haven't exceeded critical thresholds
|
| 370 |
+
available_ram_gb = psutil.virtual_memory().available / (1024 * 1024 * 1024)
|
| 371 |
+
ram_usage = process.memory_info().rss / (1024 * 1024 * 1024) # in GB
|
| 372 |
+
ram_percent = psutil.virtual_memory().percent
|
| 373 |
+
logging.info(f"RAM usage after batch {batch_idx+1}: {ram_usage:.2f} GB ({ram_percent}%)")
|
| 374 |
+
|
| 375 |
+
# EXTREME MEMORY PROTECTION: Emergency intervention if available RAM drops below reserve
|
| 376 |
+
if available_ram_gb < resources.emergency_reserve_gb:
|
| 377 |
+
logging.critical(f"EMERGENCY: Available RAM ({available_ram_gb:.2f} GB) below reserve threshold ({resources.emergency_reserve_gb:.2f} GB)")
|
| 378 |
+
logging.critical("Taking emergency measures to prevent system crash")
|
| 379 |
+
|
| 380 |
+
# Save what we have and proceed with drastically reduced processing
|
| 381 |
+
emergency_path = os.path.join(temp_dir, f"emergency_tokenizer_data_{int(time.time())}.txt")
|
| 382 |
+
write_texts_to_disk(all_texts, emergency_path)
|
| 383 |
+
logging.critical(f"Emergency data saved to {emergency_path}")
|
| 384 |
+
|
| 385 |
+
# Keep only 10% of data or 5 entries, whichever is smaller
|
| 386 |
+
emergency_keep = min(max(5, len(all_texts) // 10), 20)
|
| 387 |
+
logging.critical(f"Reducing dataset from {len(all_texts)} entries to {emergency_keep} entries")
|
| 388 |
+
all_texts = all_texts[:emergency_keep]
|
| 389 |
+
|
| 390 |
+
# Force memory cleanup
|
| 391 |
+
manage_ram(aggressive=True)
|
| 392 |
+
cleanup_cuda(force=True)
|
| 393 |
+
|
| 394 |
+
# Stop processing more files
|
| 395 |
+
break
|
| 396 |
+
|
| 397 |
+
# Always use disk offloading if enabled
|
| 398 |
+
disk_offload_frequency = 1 # Every batch
|
| 399 |
+
|
| 400 |
+
# Write intermediate results to disk to reduce memory pressure
|
| 401 |
+
# Do this more aggressively to prevent OOM crashes
|
| 402 |
+
if resources.use_disk_offload and batch_idx > 0 and batch_idx % disk_offload_frequency == 0:
|
| 403 |
+
temp_file_path = os.path.join(temp_dir, f"temp_tokenizer_data_{batch_idx}.txt")
|
| 404 |
+
logging.info(f"Writing intermediate batch results to {temp_file_path}")
|
| 405 |
+
|
| 406 |
+
# Calculate how many entries to offload based on current memory pressure
|
| 407 |
+
current_ram_percent = psutil.virtual_memory().percent
|
| 408 |
+
|
| 409 |
+
# More aggressive offloading at higher memory pressure
|
| 410 |
+
if current_ram_percent > 70:
|
| 411 |
+
offload_percentage = 0.8 # Offload 80% of data if memory pressure high
|
| 412 |
+
elif current_ram_percent > 50:
|
| 413 |
+
offload_percentage = 0.6 # Offload 60% if moderate pressure
|
| 414 |
+
else:
|
| 415 |
+
offload_percentage = 0.4 # Offload 40% if low pressure
|
| 416 |
+
|
| 417 |
+
entries_to_save = max(1, int(len(all_texts) * offload_percentage))
|
| 418 |
+
entries_to_save = min(entries_to_save, len(all_texts) - 1) # Keep at least 1 entry
|
| 419 |
+
|
| 420 |
+
# Write data to disk
|
| 421 |
+
if write_texts_to_disk(all_texts[:entries_to_save], temp_file_path):
|
| 422 |
+
# Remove what we wrote from memory
|
| 423 |
+
logging.info(f"Offloaded {entries_to_save} entries ({offload_percentage*100:.0f}%) to disk, {len(all_texts)-entries_to_save} remain in memory")
|
| 424 |
+
all_texts = all_texts[entries_to_save:]
|
| 425 |
+
|
| 426 |
+
# Force RAM cleanup after file write
|
| 427 |
+
manage_ram(aggressive=True)
|
| 428 |
+
cleanup_cuda(force=True)
|
| 429 |
+
|
| 430 |
+
# Check against adaptive memory thresholds
|
| 431 |
+
if ram_usage > resources.ram_usage_warning:
|
| 432 |
+
logging.warning(f"RAM usage high ({ram_usage:.2f} GB), running RAM-focused cleanup")
|
| 433 |
+
manage_ram()
|
| 434 |
+
|
| 435 |
+
# If still high after cleanup, take more aggressive measures
|
| 436 |
+
ram_usage = process.memory_info().rss / (1024 * 1024 * 1024)
|
| 437 |
+
if ram_usage > resources.ram_usage_critical:
|
| 438 |
+
logging.warning(f"RAM usage critical ({ram_usage:.2f} GB), performing emergency cleanup")
|
| 439 |
+
# Force Python to release memory
|
| 440 |
+
batch_texts.clear()
|
| 441 |
+
manage_ram(aggressive=True)
|
| 442 |
+
|
| 443 |
+
# Adaptive batch reduction - if we're processing too many files, reduce remaining batches
|
| 444 |
+
if len(batches) - batch_idx > 3:
|
| 445 |
+
# For low RAM systems, be more aggressive in reduction
|
| 446 |
+
remaining_batch_count = 3 if resources.total_ram_gb >= 8 else 2
|
| 447 |
+
logging.warning(f"Reducing remaining batches from {len(batches) - batch_idx} to {remaining_batch_count}")
|
| 448 |
+
batches = batches[:batch_idx+remaining_batch_count]
|
| 449 |
+
|
| 450 |
+
if not all_texts:
|
| 451 |
+
logging.error("No content found in files")
|
| 452 |
+
return False
|
| 453 |
+
|
| 454 |
+
logging.info(f"Successfully loaded {len(all_texts)} text entries with {total_chars:,} characters")
|
| 455 |
+
|
| 456 |
+
# Python keywords and common tokens to ensure they're in the vocabulary
|
| 457 |
+
python_tokens = [
|
| 458 |
+
'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except', 'import',
|
| 459 |
+
'from', 'as', 'with', 'return', 'yield', 'break', 'continue', 'pass', 'raise',
|
| 460 |
+
'True', 'False', 'None', 'self', 'and', 'or', 'not', 'is', 'in', 'lambda',
|
| 461 |
+
# Common Python library imports
|
| 462 |
+
'import numpy as np', 'import pandas as pd', 'import torch', 'import tensorflow as tf',
|
| 463 |
+
# Function signatures
|
| 464 |
+
'def __init__(self):', 'def forward(self, x):',
|
| 465 |
+
]
|
| 466 |
+
|
| 467 |
+
# Initialize tokenizer - using BPE model which works well for code
|
| 468 |
+
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
|
| 469 |
+
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
|
| 470 |
+
tokenizer.decoder = ByteLevelDecoder()
|
| 471 |
+
|
| 472 |
+
# Special tokens for Python code
|
| 473 |
+
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<s>", "</s>", "<pad>", "<unk>", "<mask>"]
|
| 474 |
+
|
| 475 |
+
# Configure trainer with larger vocabulary for code
|
| 476 |
+
trainer = BpeTrainer(
|
| 477 |
+
vocab_size=vocab_size,
|
| 478 |
+
min_frequency=min_frequency,
|
| 479 |
+
special_tokens=special_tokens,
|
| 480 |
+
show_progress=True,
|
| 481 |
+
initial_alphabet=list("abcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()_+-=[]{}|;:'\",./<>?`~ "),
|
| 482 |
+
# Add Python keywords as initial tokens
|
| 483 |
+
initial_tokens=python_tokens
|
| 484 |
+
)
|
| 485 |
+
|
| 486 |
+
# Train tokenizer in smaller chunks to save memory
|
| 487 |
+
logging.info(f"Training tokenizer on {len(all_texts):,} texts (target vocab: {vocab_size:,})")
|
| 488 |
+
|
| 489 |
+
# Split texts into smaller chunks for training - chunk size adapted to resources
|
| 490 |
+
# EXTREME MEMORY CONSERVATION: Start with tiny chunk sizes
|
| 491 |
+
# Start with just 1 item for the first iteration to gauge memory impact
|
| 492 |
+
initial_chunk_size = 1 # Start with just 1 item
|
| 493 |
+
max_chunk_size = max(1, resources.training_chunk_size // 2) # Half the normal max
|
| 494 |
+
|
| 495 |
+
# Track memory failures to adapt
|
| 496 |
+
memory_failures = 0
|
| 497 |
+
current_chunk_size = initial_chunk_size
|
| 498 |
+
|
| 499 |
+
# Process in smaller chunks first
|
| 500 |
+
for i in range(0, len(all_texts), current_chunk_size):
|
| 501 |
+
try:
|
| 502 |
+
# Emergency memory check before processing
|
| 503 |
+
current_ram_percent = psutil.virtual_memory().percent
|
| 504 |
+
if current_ram_percent > 85: # Critical threshold
|
| 505 |
+
logging.warning(f"Memory usage critical before training: {current_ram_percent}%")
|
| 506 |
+
current_chunk_size = max(1, current_chunk_size // 2) # Reduce chunk size
|
| 507 |
+
logging.info(f"Reducing chunk size to {current_chunk_size} due to memory pressure")
|
| 508 |
+
manage_ram(aggressive=True)
|
| 509 |
+
cleanup_cuda(force=True)
|
| 510 |
+
|
| 511 |
+
# Get the chunk to process
|
| 512 |
+
end_idx = min(i + current_chunk_size, len(all_texts))
|
| 513 |
+
chunk = all_texts[i:end_idx]
|
| 514 |
+
|
| 515 |
+
# Log progress
|
| 516 |
+
chunks_total = (len(all_texts) + current_chunk_size - 1) // current_chunk_size
|
| 517 |
+
current_chunk = i // current_chunk_size + 1
|
| 518 |
+
logging.info(f"Training on chunk {current_chunk}/{chunks_total} with size {len(chunk)}")
|
| 519 |
+
|
| 520 |
+
# Train on this chunk
|
| 521 |
+
tokenizer.train_from_iterator(
|
| 522 |
+
chunk,
|
| 523 |
+
trainer=trainer,
|
| 524 |
+
length=len(chunk)
|
| 525 |
+
)
|
| 526 |
+
|
| 527 |
+
# Clean up memory between chunks
|
| 528 |
+
del chunk
|
| 529 |
+
manage_ram(aggressive=True)
|
| 530 |
+
cleanup_cuda(force=True)
|
| 531 |
+
|
| 532 |
+
# If successful and we're still using a reduced chunk size, try increasing it
|
| 533 |
+
if current_chunk_size < max_chunk_size and memory_failures == 0 and current_chunk > 3:
|
| 534 |
+
new_size = min(max_chunk_size, current_chunk_size * 2)
|
| 535 |
+
logging.info(f"Increasing chunk size from {current_chunk_size} to {new_size}")
|
| 536 |
+
current_chunk_size = new_size
|
| 537 |
+
|
| 538 |
+
except Exception as e:
|
| 539 |
+
if "memory" in str(e).lower() or "allocation" in str(e).lower():
|
| 540 |
+
memory_failures += 1
|
| 541 |
+
logging.error(f"Memory error during training: {e}")
|
| 542 |
+
|
| 543 |
+
# Reduce chunk size and retry
|
| 544 |
+
old_size = current_chunk_size
|
| 545 |
+
current_chunk_size = max(1, current_chunk_size // 2)
|
| 546 |
+
logging.warning(f"Reducing chunk size from {old_size} to {current_chunk_size} and retrying")
|
| 547 |
+
|
| 548 |
+
# Force cleanup
|
| 549 |
+
manage_ram(aggressive=True)
|
| 550 |
+
cleanup_cuda(force=True)
|
| 551 |
+
|
| 552 |
+
# Back up a bit to retry with smaller chunk
|
| 553 |
+
i = max(0, i - current_chunk_size)
|
| 554 |
+
continue
|
| 555 |
+
else:
|
| 556 |
+
# Non-memory error, re-raise
|
| 557 |
+
raise
|
| 558 |
+
|
| 559 |
+
# Ensure output directory exists
|
| 560 |
+
output_dir = os.path.dirname(output_path) or '.'
|
| 561 |
+
if output_dir:
|
| 562 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 563 |
+
|
| 564 |
+
# Save tokenizer
|
| 565 |
+
tokenizer.save(output_path)
|
| 566 |
+
|
| 567 |
+
final_vocab_size = len(tokenizer.get_vocab())
|
| 568 |
+
elapsed = time.time() - start_time
|
| 569 |
+
logging.info(f"Tokenizer created with {final_vocab_size:,} tokens in {elapsed:.1f} seconds")
|
| 570 |
+
logging.info(f"Saved to: {output_path}")
|
| 571 |
+
|
| 572 |
+
return True
|
| 573 |
+
|
| 574 |
+
except Exception as e:
|
| 575 |
+
logging.error(f"Error training tokenizer: {e}")
|
| 576 |
+
logging.error(traceback.format_exc())
|
| 577 |
+
|
| 578 |
+
# Adaptive retry strategy for memory errors
|
| 579 |
+
if "memory" in str(e).lower() or "allocation" in str(e).lower():
|
| 580 |
+
logging.warning("Memory error detected, implementing adaptive sampling strategy...")
|
| 581 |
+
|
| 582 |
+
# Clear as much memory as possible
|
| 583 |
+
cleanup_cuda(True)
|
| 584 |
+
|
| 585 |
+
# Try progressively smaller samples until success or giving up
|
| 586 |
+
try:
|
| 587 |
+
# For very low memory systems, use even smaller sample
|
| 588 |
+
sample_size = 5 if resources.total_ram_gb < 8 else 10
|
| 589 |
+
all_texts_backup = all_texts[:sample_size] # Keep a small sample
|
| 590 |
+
del all_texts
|
| 591 |
+
gc.collect()
|
| 592 |
+
|
| 593 |
+
# Release all other large objects and force collection
|
| 594 |
+
cleanup_cuda(True)
|
| 595 |
+
|
| 596 |
+
logging.info(f"Trying with a smaller sample size: {sample_size} texts")
|
| 597 |
+
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
|
| 598 |
+
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
|
| 599 |
+
tokenizer.decoder = ByteLevelDecoder()
|
| 600 |
+
|
| 601 |
+
tokenizer.train_from_iterator(all_texts_backup, trainer=trainer)
|
| 602 |
+
tokenizer.save(output_path)
|
| 603 |
+
|
| 604 |
+
final_vocab_size = len(tokenizer.get_vocab())
|
| 605 |
+
elapsed = time.time() - start_time
|
| 606 |
+
logging.info(f"Tokenizer created with {final_vocab_size:,} tokens in {elapsed:.1f} seconds")
|
| 607 |
+
logging.info(f"Saved to: {output_path}")
|
| 608 |
+
return True
|
| 609 |
+
except Exception as e2:
|
| 610 |
+
logging.error(f"Retry failed: {e2}")
|
| 611 |
+
return False
|
| 612 |
+
|
| 613 |
+
return False
|
| 614 |
+
|
| 615 |
+
|
| 616 |
+
if __name__ == "__main__":
|
| 617 |
+
# Main entry point with command-line argument handling
|
| 618 |
+
logging.info("Starting EZ-Tokenizer creation script")
|
| 619 |
+
logging.info(f"EZ-Tokenizer v1.0.0 - Optimized for performance and accuracy")
|
| 620 |
+
logging.info("Copyright (c) 2025 EZ-Tokenizer Team. All rights reserved.")
|
| 621 |
+
|
| 622 |
+
if len(sys.argv) < 3:
|
| 623 |
+
print("Usage: python adaptive_tokenizer.py <input_dir> <output_path> [vocab_size] [min_frequency] [max_files]")
|
| 624 |
+
print(" max_files: Optional maximum number of files to process (default: auto-determined)")
|
| 625 |
+
print(" Use 'MAX' to process all files in the directory")
|
| 626 |
+
sys.exit(1)
|
| 627 |
+
|
| 628 |
+
input_dir = sys.argv[1]
|
| 629 |
+
output_path = sys.argv[2]
|
| 630 |
+
|
| 631 |
+
vocab_size = int(sys.argv[3]) if len(sys.argv) > 3 else 40000
|
| 632 |
+
min_frequency = int(sys.argv[4]) if len(sys.argv) > 4 else 2
|
| 633 |
+
|
| 634 |
+
# Handle max_files parameter with special 'MAX' keyword
|
| 635 |
+
max_files = None
|
| 636 |
+
if len(sys.argv) > 5:
|
| 637 |
+
if sys.argv[5].upper() == 'MAX':
|
| 638 |
+
max_files = float('inf') # Effectively no limit
|
| 639 |
+
logging.info("MAX keyword detected - will process all available files")
|
| 640 |
+
else:
|
| 641 |
+
try:
|
| 642 |
+
max_files = int(sys.argv[5])
|
| 643 |
+
except ValueError:
|
| 644 |
+
logging.warning(f"Invalid max_files value: {sys.argv[5]} - using auto determination")
|
| 645 |
+
max_files = None
|
| 646 |
+
|
| 647 |
+
# Detect system resources automatically
|
| 648 |
+
resources = SystemResources()
|
| 649 |
+
|
| 650 |
+
logging.info("Starting tokenizer creation with the following parameters:")
|
| 651 |
+
logging.info(f"Configuration:")
|
| 652 |
+
logging.info(f" Input directory: {input_dir}")
|
| 653 |
+
logging.info(f" Output path: {output_path}")
|
| 654 |
+
logging.info(f" Vocabulary size: {vocab_size}")
|
| 655 |
+
logging.info(f" Minimum frequency: {min_frequency}")
|
| 656 |
+
if max_files == float('inf'):
|
| 657 |
+
logging.info(f" Maximum files: MAX (all files)")
|
| 658 |
+
else:
|
| 659 |
+
logging.info(f" Maximum files: {max_files if max_files is not None else 'auto'}")
|
| 660 |
+
|
| 661 |
+
|
| 662 |
+
# Create a temp directory for offloaded data
|
| 663 |
+
import tempfile
|
| 664 |
+
import atexit
|
| 665 |
+
import shutil
|
| 666 |
+
|
| 667 |
+
# Create a temporary directory that will be automatically cleaned up
|
| 668 |
+
temp_dir = tempfile.mkdtemp(prefix='nexforge_tokenizer_')
|
| 669 |
+
logging.info(f"Created temporary directory for data offloading: {temp_dir}")
|
| 670 |
+
|
| 671 |
+
# Register cleanup function to remove the temp directory on exit
|
| 672 |
+
def cleanup_temp():
|
| 673 |
+
try:
|
| 674 |
+
if os.path.exists(temp_dir):
|
| 675 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 676 |
+
logging.info(f"Cleaned up temporary directory: {temp_dir}")
|
| 677 |
+
except Exception as e:
|
| 678 |
+
logging.warning(f"Error cleaning up temporary directory: {e}")
|
| 679 |
+
|
| 680 |
+
atexit.register(cleanup_temp)
|
| 681 |
+
|
| 682 |
+
# Initial memory check
|
| 683 |
+
log_memory_usage()
|
| 684 |
+
|
| 685 |
+
# Pass the temp_dir to the build_tokenizer function
|
| 686 |
+
success = build_tokenizer(
|
| 687 |
+
input_dir=input_dir,
|
| 688 |
+
output_path=output_path,
|
| 689 |
+
vocab_size=vocab_size,
|
| 690 |
+
min_frequency=min_frequency,
|
| 691 |
+
max_files=max_files,
|
| 692 |
+
resources=resources,
|
| 693 |
+
temp_dir=temp_dir # Pass temp_dir to the function
|
| 694 |
+
)
|
| 695 |
+
|
| 696 |
+
# Cleanup is now handled by the atexit handler
|
| 697 |
+
logging.info("Temporary files will be cleaned up on exit")
|
| 698 |
+
|
| 699 |
+
# Final status
|
| 700 |
+
if success:
|
| 701 |
+
logging.info("Tokenizer creation completed successfully")
|
| 702 |
+
sys.exit(0)
|
| 703 |
+
else:
|
| 704 |
+
logging.error("Tokenizer creation failed")
|
| 705 |
+
sys.exit(1)
|
src/nexforgetokenizer/data/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data handling for NexForge Tokenizer."""
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
def get_data_path() -> Path:
|
| 7 |
+
"""Get the path to the package data directory."""
|
| 8 |
+
return Path(__file__).parent
|
| 9 |
+
|
| 10 |
+
def get_sample_data_path() -> Optional[Path]:
|
| 11 |
+
"""Get the path to the sample Python code file."""
|
| 12 |
+
data_path = get_data_path() / "python_code_sample.txt"
|
| 13 |
+
return data_path if data_path.exists() else None
|
| 14 |
+
|
| 15 |
+
def load_sample_data() -> Optional[str]:
|
| 16 |
+
"""Load and return the sample Python code as a string."""
|
| 17 |
+
sample_path = get_sample_data_path()
|
| 18 |
+
if sample_path is None:
|
| 19 |
+
return None
|
| 20 |
+
return sample_path.read_text(encoding='utf-8')
|
src/nexforgetokenizer/resources.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""System resource detection and management for adaptive processing."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import psutil
|
| 5 |
+
import torch
|
| 6 |
+
import logging
|
| 7 |
+
from typing import Optional, Dict, Any
|
| 8 |
+
|
| 9 |
+
class SystemResources:
|
| 10 |
+
"""Detect and manage system resources for adaptive processing.
|
| 11 |
+
|
| 12 |
+
This class provides a unified interface to system resource detection,
|
| 13 |
+
handling CPU, RAM, and GPU capabilities. It calculates appropriate
|
| 14 |
+
thresholds and settings based on the detected hardware configuration.
|
| 15 |
+
|
| 16 |
+
It implements extreme memory conservation strategies to prevent OOM crashes
|
| 17 |
+
even on large datasets or limited hardware.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
def __init__(self):
|
| 21 |
+
# CPU detection
|
| 22 |
+
self.cpu_cores = os.cpu_count() or 1
|
| 23 |
+
self.cpu_threads = self.cpu_cores
|
| 24 |
+
|
| 25 |
+
# Try to get physical cores vs logical cores
|
| 26 |
+
try:
|
| 27 |
+
self.cpu_physical_cores = psutil.cpu_count(logical=False) or self.cpu_cores
|
| 28 |
+
except:
|
| 29 |
+
self.cpu_physical_cores = self.cpu_cores
|
| 30 |
+
|
| 31 |
+
# RAM detection
|
| 32 |
+
self.total_ram_gb = psutil.virtual_memory().total / (1024 ** 3)
|
| 33 |
+
self.available_ram_gb = psutil.virtual_memory().available / (1024 ** 3)
|
| 34 |
+
|
| 35 |
+
# GPU detection
|
| 36 |
+
self.has_cuda = torch.cuda.is_available()
|
| 37 |
+
self.cuda_device = None
|
| 38 |
+
self.cuda_mem_gb = 0
|
| 39 |
+
|
| 40 |
+
if self.has_cuda:
|
| 41 |
+
try:
|
| 42 |
+
torch.cuda.empty_cache()
|
| 43 |
+
self.cuda_device = torch.cuda.get_device_name(0)
|
| 44 |
+
self.cuda_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
|
| 45 |
+
except Exception as e:
|
| 46 |
+
logging.warning(f"Error detecting CUDA properties: {e}")
|
| 47 |
+
self.has_cuda = False
|
| 48 |
+
|
| 49 |
+
# Calculate resource-based thresholds
|
| 50 |
+
self._calculate_thresholds()
|
| 51 |
+
|
| 52 |
+
# Log detected resources
|
| 53 |
+
self._log_resources()
|
| 54 |
+
|
| 55 |
+
def _calculate_thresholds(self):
|
| 56 |
+
"""Calculate adaptive thresholds based on detected system resources."""
|
| 57 |
+
# Memory thresholds - scaled to available RAM with extreme caution
|
| 58 |
+
# For all systems, use much more conservative thresholds after OOM testing
|
| 59 |
+
|
| 60 |
+
# Calculate absolute available RAM for emergency protection
|
| 61 |
+
self.emergency_reserve_gb = max(2.0, self.total_ram_gb * 0.2) # At least 2GB or 20% reserved
|
| 62 |
+
|
| 63 |
+
if self.total_ram_gb < 8: # Low RAM (<8GB)
|
| 64 |
+
self.ram_usage_warning = self.total_ram_gb * 0.45 # 45% of RAM
|
| 65 |
+
self.ram_usage_critical = self.total_ram_gb * 0.60 # 60% of RAM
|
| 66 |
+
self.max_files_multiplier = 0.03 # Extremely conservative
|
| 67 |
+
self.use_disk_offload = True # Always use disk offloading
|
| 68 |
+
elif self.total_ram_gb < 16: # Medium RAM (8-16GB)
|
| 69 |
+
self.ram_usage_warning = self.total_ram_gb * 0.55 # 55% of RAM
|
| 70 |
+
self.ram_usage_critical = self.total_ram_gb * 0.70 # 70% of RAM
|
| 71 |
+
self.max_files_multiplier = 0.05
|
| 72 |
+
self.use_disk_offload = True # Always use disk offloading
|
| 73 |
+
else: # High RAM (>16GB)
|
| 74 |
+
self.ram_usage_warning = self.total_ram_gb * 0.60 # 60% of RAM (down from 75%)
|
| 75 |
+
self.ram_usage_critical = self.total_ram_gb * 0.75 # 75% of RAM (down from 90%)
|
| 76 |
+
self.max_files_multiplier = 0.1 # Halved from previous 0.2
|
| 77 |
+
self.use_disk_offload = True # Use disk offloading even on high-RAM systems
|
| 78 |
+
|
| 79 |
+
# Maximum text chunk size in memory (characters)
|
| 80 |
+
# This helps prevent individual large chunks from causing OOM
|
| 81 |
+
self.max_text_chunk_size = min(10_000_000, int(self.total_ram_gb * 1_000_000))
|
| 82 |
+
|
| 83 |
+
# CPU-based settings
|
| 84 |
+
# For worker count, use physical cores (or half of logical cores if physical detection failed)
|
| 85 |
+
self.max_workers = max(1, min(self.cpu_physical_cores, 4)) # At most 4 workers
|
| 86 |
+
|
| 87 |
+
# Batch size based on available cores
|
| 88 |
+
if self.cpu_cores <= 2:
|
| 89 |
+
self.batch_size = 2
|
| 90 |
+
elif self.cpu_cores <= 4:
|
| 91 |
+
self.batch_size = 4
|
| 92 |
+
else:
|
| 93 |
+
self.batch_size = min(5, self.cpu_cores // 2)
|
| 94 |
+
|
| 95 |
+
# Training chunk size - how many texts to process in one training iteration
|
| 96 |
+
if self.total_ram_gb < 8:
|
| 97 |
+
self.training_chunk_size = 3
|
| 98 |
+
elif self.total_ram_gb < 16:
|
| 99 |
+
self.training_chunk_size = 5
|
| 100 |
+
else:
|
| 101 |
+
self.training_chunk_size = 10
|
| 102 |
+
|
| 103 |
+
def _log_resources(self):
|
| 104 |
+
"""Log detected system resources and calculated thresholds."""
|
| 105 |
+
logging.info("===== System Resources =====")
|
| 106 |
+
logging.info(f"CPU: {self.cpu_cores} cores ({self.cpu_physical_cores} physical)")
|
| 107 |
+
logging.info(f"RAM: {self.total_ram_gb:.1f} GB total, {self.available_ram_gb:.1f} GB available")
|
| 108 |
+
|
| 109 |
+
if self.has_cuda:
|
| 110 |
+
logging.info(f"GPU: {self.cuda_device} with {self.cuda_mem_gb:.1f} GB memory")
|
| 111 |
+
else:
|
| 112 |
+
logging.info("GPU: Not available")
|
| 113 |
+
|
| 114 |
+
logging.info("===== Adaptive Settings =====")
|
| 115 |
+
logging.info(f"RAM Warning Threshold: {self.ram_usage_warning:.1f} GB")
|
| 116 |
+
logging.info(f"RAM Critical Threshold: {self.ram_usage_critical:.1f} GB")
|
| 117 |
+
logging.info(f"Max Workers: {self.max_workers}")
|
| 118 |
+
logging.info(f"Batch Size: {self.batch_size}")
|
| 119 |
+
logging.info(f"Training Chunk Size: {self.training_chunk_size}")
|
| 120 |
+
logging.info(f"Max Files Multiplier: {self.max_files_multiplier:.2f}")
|
tests/test_adaptive_tokenizer.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Simple test script for the NexForge Adaptive Tokenizer.
|
| 6 |
+
|
| 7 |
+
This script demonstrates the basic usage of the adaptive tokenizer
|
| 8 |
+
by creating a small sample Python file and building a tokenizer from it.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
import logging
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
import tempfile
|
| 16 |
+
from tokenizers import Tokenizer
|
| 17 |
+
|
| 18 |
+
# Add the parent directory to the path so we can import the package
|
| 19 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
| 20 |
+
|
| 21 |
+
from nexforgetokenizer import SystemResources, build_tokenizer, log_memory_usage
|
| 22 |
+
|
| 23 |
+
# Configure logging
|
| 24 |
+
logging.basicConfig(
|
| 25 |
+
level=logging.INFO,
|
| 26 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 27 |
+
handlers=[
|
| 28 |
+
logging.StreamHandler(),
|
| 29 |
+
logging.FileHandler('tokenizer_test.log')
|
| 30 |
+
]
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
# Sample Python code for testing
|
| 34 |
+
SAMPLE_CODE = """
|
| 35 |
+
# Comprehensive Python code test for tokenizer
|
| 36 |
+
|
| 37 |
+
def factorial(n):
|
| 38 |
+
\"\"\"Calculate factorial of n.\"\"\"
|
| 39 |
+
if n <= 1:
|
| 40 |
+
return 1
|
| 41 |
+
return n * factorial(n - 1)
|
| 42 |
+
|
| 43 |
+
class TestClass:
|
| 44 |
+
def __init__(self, value):
|
| 45 |
+
self.value = value
|
| 46 |
+
|
| 47 |
+
def process(self):
|
| 48 |
+
\"\"\"Process the value and return result.\"\"\"
|
| 49 |
+
return self.value * 2
|
| 50 |
+
|
| 51 |
+
def main():
|
| 52 |
+
# Test various Python constructs
|
| 53 |
+
numbers = [1, 2, 3, 4, 5]
|
| 54 |
+
squares = [x**2 for x in numbers]
|
| 55 |
+
|
| 56 |
+
# Test string formatting
|
| 57 |
+
name = "NexForge"
|
| 58 |
+
version = 1.0
|
| 59 |
+
|
| 60 |
+
# Test control flow
|
| 61 |
+
if version > 0.5:
|
| 62 |
+
print(f"{name} v{version} is stable!")
|
| 63 |
+
else:
|
| 64 |
+
print(f"{name} v{version} is in development")
|
| 65 |
+
|
| 66 |
+
# Test function calls
|
| 67 |
+
result = factorial(5)
|
| 68 |
+
print(f"5! = {result}")
|
| 69 |
+
|
| 70 |
+
# Test class usage
|
| 71 |
+
test = TestClass(21)
|
| 72 |
+
print(f"Processed value: {test.process()}")
|
| 73 |
+
|
| 74 |
+
return 0
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
exit(main())
|
| 78 |
+
"""
|
| 79 |
+
|
| 80 |
+
def create_test_file(directory):
|
| 81 |
+
"""Create a test Python file in the specified directory."""
|
| 82 |
+
os.makedirs(directory, exist_ok=True)
|
| 83 |
+
test_file = os.path.join(directory, 'test_code.py')
|
| 84 |
+
|
| 85 |
+
with open(test_file, 'w', encoding='utf-8') as f:
|
| 86 |
+
f.write(SAMPLE_CODE)
|
| 87 |
+
|
| 88 |
+
return test_file
|
| 89 |
+
|
| 90 |
+
def test_tokenizer():
|
| 91 |
+
"""Test the adaptive tokenizer on a sample Python file."""
|
| 92 |
+
# Create a temporary directory for our test output
|
| 93 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 94 |
+
# Use the existing sample data
|
| 95 |
+
sample_data_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
|
| 96 |
+
'src', 'nexforgetokenizer', 'data', 'python_code_sample.txt')
|
| 97 |
+
|
| 98 |
+
print(f"Using sample data file: {sample_data_path}")
|
| 99 |
+
|
| 100 |
+
# Verify the sample file exists
|
| 101 |
+
if not os.path.exists(sample_data_path):
|
| 102 |
+
print(f"ERROR: Sample data file not found at {sample_data_path}")
|
| 103 |
+
return False
|
| 104 |
+
|
| 105 |
+
print(f"Sample file size: {os.path.getsize(sample_data_path)} bytes")
|
| 106 |
+
|
| 107 |
+
# Directory containing the sample file
|
| 108 |
+
data_dir = os.path.dirname(sample_data_path)
|
| 109 |
+
print(f"Data directory: {data_dir}")
|
| 110 |
+
|
| 111 |
+
# Output path for the tokenizer
|
| 112 |
+
output_path = os.path.join(temp_dir, 'test_tokenizer.json')
|
| 113 |
+
|
| 114 |
+
# Log initial memory usage
|
| 115 |
+
print("\nInitial memory usage:")
|
| 116 |
+
log_memory_usage()
|
| 117 |
+
|
| 118 |
+
# Detect system resources
|
| 119 |
+
resources = SystemResources()
|
| 120 |
+
print(f"\nDetected system resources:")
|
| 121 |
+
print(f"CPU Cores: {resources.cpu_cores}")
|
| 122 |
+
print(f"Available RAM: {resources.available_ram_gb:.2f} GB")
|
| 123 |
+
if resources.has_cuda:
|
| 124 |
+
print(f"GPU: {resources.cuda_device} with {resources.cuda_mem_gb:.2f} GB")
|
| 125 |
+
else:
|
| 126 |
+
print("No CUDA GPU detected")
|
| 127 |
+
|
| 128 |
+
# Build the tokenizer using the existing sample data directory
|
| 129 |
+
print("\nBuilding tokenizer...")
|
| 130 |
+
success = build_tokenizer(
|
| 131 |
+
input_dir=data_dir,
|
| 132 |
+
output_path=output_path,
|
| 133 |
+
vocab_size=1000, # Small vocabulary for quick testing
|
| 134 |
+
min_frequency=1, # Include all tokens for this test
|
| 135 |
+
resources=resources
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
if success:
|
| 139 |
+
print(f"\nTokenizer successfully created at: {output_path}")
|
| 140 |
+
|
| 141 |
+
# Load the tokenizer and test it
|
| 142 |
+
tokenizer = Tokenizer.from_file(output_path)
|
| 143 |
+
vocab_size = len(tokenizer.get_vocab())
|
| 144 |
+
print(f"Vocabulary size: {vocab_size}")
|
| 145 |
+
|
| 146 |
+
# Test tokenization
|
| 147 |
+
encoded = tokenizer.encode(SAMPLE_CODE)
|
| 148 |
+
print(f"\nTokenized sample code:")
|
| 149 |
+
print(f"Number of tokens: {len(encoded.ids)}")
|
| 150 |
+
print(f"Average chars per token: {len(SAMPLE_CODE) / len(encoded.ids):.2f}")
|
| 151 |
+
|
| 152 |
+
# Log final memory usage
|
| 153 |
+
print("\nFinal memory usage:")
|
| 154 |
+
log_memory_usage()
|
| 155 |
+
|
| 156 |
+
return True
|
| 157 |
+
else:
|
| 158 |
+
print("Failed to create tokenizer")
|
| 159 |
+
return False
|
| 160 |
+
|
| 161 |
+
def main():
|
| 162 |
+
"""Main function to run the test."""
|
| 163 |
+
print("NexForge Adaptive Tokenizer Test")
|
| 164 |
+
print("==============================\n")
|
| 165 |
+
|
| 166 |
+
result = test_tokenizer()
|
| 167 |
+
|
| 168 |
+
if result:
|
| 169 |
+
print("\nTest completed successfully!")
|
| 170 |
+
return 0
|
| 171 |
+
else:
|
| 172 |
+
print("\nTest failed!")
|
| 173 |
+
return 1
|
| 174 |
+
|
| 175 |
+
if __name__ == "__main__":
|
| 176 |
+
sys.exit(main())
|