Request for minimal inference script
#2
by NDugar
Hello, I am quite impressed by the results you have achieved with https://huggingface.co/xaskasdf/brandon-tiny-10m-instruct, but I am unable to run the model using the standard Hugging Face methods. Would you be kind enough to share a minimal inference script?
Also, in the training scripts at https://github.com/xaskasdf/brandon-tiny, how can we save the models in a Hugging Face-loadable format, so they work with AutoTokenizer and AutoModel?
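To make that second question concrete, here is the kind of wrapper I imagine would be needed. This is a rough sketch only: `BrandonTinyConfig`, `BrandonTinyModel`, and the way I map the kwargs onto `ModelConfig` are my own guesses, not anything from your repo.

```python
# Hypothetical sketch of making the model AutoModel-loadable; class names and
# the ModelConfig mapping are my assumptions about how this could be wired up.
from transformers import PretrainedConfig, PreTrainedModel

from src.model import TinyLlama, ModelConfig  # from the brandon-tiny repo


class BrandonTinyConfig(PretrainedConfig):
    model_type = "brandon-tiny"

    def __init__(self, dim=256, n_layers=24, n_heads=8, n_kv_heads=2,
                 vocab_size=8192, hidden_dim=720, max_seq_len=512, **kwargs):
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len
        super().__init__(**kwargs)


class BrandonTinyModel(PreTrainedModel):
    config_class = BrandonTinyConfig

    def __init__(self, config):
        super().__init__(config)
        # Wrap the existing project model inside the HF interface
        self.model = TinyLlama(ModelConfig(
            dim=config.dim, n_layers=config.n_layers, n_heads=config.n_heads,
            n_kv_heads=config.n_kv_heads, vocab_size=config.vocab_size,
            hidden_dim=config.hidden_dim, max_seq_len=config.max_seq_len,
        ))

    def forward(self, input_ids, **kwargs):
        return self.model(input_ids)


# Register the custom classes so the saved repo can be loaded with
# AutoModel.from_pretrained(..., trust_remote_code=True)
BrandonTinyConfig.register_for_auto_class()
BrandonTinyModel.register_for_auto_class("AutoModel")

model = BrandonTinyModel(BrandonTinyConfig())
model.save_pretrained("brandon-tiny-hf")
```

I am guessing `save_pretrained` / `push_to_hub` on something like this would produce a repo loadable via `trust_remote_code=True`, but I may be missing the route you actually use.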
```python
import sys
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download

# Import project code (assumes this script lives one level below the repo root)
sys.path.append(str(Path(__file__).resolve().parents[1]))
from src.model import TinyLlama, ModelConfig
from src.tokenizer import Tokenizer
# Model repo and filenames
repo_id = "xaskasdf/brandon-tiny-10m-instruct"
ckpt_file = "model.safetensors"
vocab_file = "tokenizer.model"
# Download files
ckpt_path = hf_hub_download(repo_id, filename=ckpt_file)
vocab_path = hf_hub_download(repo_id, filename=vocab_file)
# Load tokenizer
tokenizer = Tokenizer(vocab_path)
# Model config matching the checkpoint (see README / training script)
model_cfg = ModelConfig(
    dim=256,
    n_layers=24,
    n_heads=8,
    n_kv_heads=2,
    vocab_size=8192,
    hidden_dim=720,
    max_seq_len=512,
    dropout=0.05,
    weight_tying=True,
    norm_eps=1e-5,
    rope_theta=10000.0,
    block_sharing=True,
    dense_former=True,
    value_residual=True,
    n_registers=4,
)
model = TinyLlama(model_cfg)
# Load safetensors checkpoint
from safetensors.torch import load_file  # pip install safetensors

state_dict = load_file(ckpt_path)
# strict=False tolerates keys absent due to weight tying; inspect the returned
# missing/unexpected keys if outputs look wrong
missing, unexpected = model.load_state_dict(state_dict, strict=False)
model.eval()
# Simple generation helper
def generate(prompt: str, max_new_tokens: int = 256):
    # Encode only the raw prompt (no ChatML wrappers) to keep generation simple
    input_ids = torch.tensor([tokenizer.encode(prompt)])
    with torch.no_grad():
        out = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            top_p=0.7,
            top_k=50,
        )
    # Decode only the newly generated tokens; slicing the decoded string by
    # len(prompt) is fragile because decode() does not round-trip text exactly
    new_tokens = out[0].tolist()[input_ids.shape[1]:]
    return tokenizer.decode(new_tokens).strip()
# Demo prompts
if __name__ == "__main__":
    prompts = [
        "hello, my name is",
        "What is the capital of France?",
        "Explain photosynthesis in simple terms.",
        "Write a short poem about the moon.",
    ]
    for p in prompts:
        print(f"Q: {p}\nA: {generate(p)}\n")
```
The script above is how I am currently running the model, but the outputs are very repetitive, so I think something is off.
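One guess I wanted to sanity-check: since this is an instruct model, maybe skipping the ChatML wrapper (as my script deliberately does) is what causes the repetition. Here is a sketch of what I tried instead, where the exact role tags and special-token strings are my assumption about the training format, not something I found in your repo:

```python
# Assumed ChatML-style wrapper; the actual tags used to train brandon-tiny may differ
def chatml_prompt(user_message: str) -> str:
    return (
        "<|im_start|>user\n"
        f"{user_message}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

print(generate(chatml_prompt("What is the capital of France?")))
```

If the model was trained on a different template, could you share the exact prompt format it expects?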