Request for minimal inference script

#2
by NDugar - opened

Hello, I am quite impressed by the results you have achieved with https://huggingface.co/xaskasdf/brandon-tiny-10m-instruct, but I am unable to run the model with the standard Hugging Face methods. Would you be kind enough to share a minimal inference script?

Also, for the training scripts at https://github.com/xaskasdf/brandon-tiny: how can we save the models in a Hugging Face-loadable format, so that they work with AutoTokenizer and AutoModel?
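For what it's worth, here is roughly what I imagined the wrapping would look like, using the transformers custom-code mechanism (PretrainedConfig / PreTrainedModel with register_for_auto_class). The class names, the forward signature, and the subset of config fields below are my own guesses, not anything from your repo:

from transformers import PretrainedConfig, PreTrainedModel
from src.model import TinyLlama, ModelConfig

class BrandonTinyConfig(PretrainedConfig):
    model_type = "brandon-tiny"

    def __init__(self, dim=256, n_layers=24, n_heads=8, n_kv_heads=2,
                 vocab_size=8192, hidden_dim=720, max_seq_len=512, **kwargs):
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len
        super().__init__(**kwargs)

class BrandonTinyModel(PreTrainedModel):
    config_class = BrandonTinyConfig

    def __init__(self, config):
        super().__init__(config)
        # Wrap the existing module so the HF serialization machinery
        # sees its parameters. The remaining ModelConfig fields (dropout,
        # rope_theta, block_sharing, ...) would be threaded through the
        # same way.
        self.model = TinyLlama(ModelConfig(
            dim=config.dim, n_layers=config.n_layers, n_heads=config.n_heads,
            n_kv_heads=config.n_kv_heads, vocab_size=config.vocab_size,
            hidden_dim=config.hidden_dim, max_seq_len=config.max_seq_len))

    def forward(self, input_ids, targets=None):  # signature guessed
        return self.model(input_ids, targets)

# Record the classes in config.json's auto_map so AutoConfig/AutoModel
# can locate them.
BrandonTinyConfig.register_for_auto_class()
BrandonTinyModel.register_for_auto_class("AutoModel")

hf_model = BrandonTinyModel(BrandonTinyConfig())
hf_model.save_pretrained("brandon-tiny-hf")  # config.json + model.safetensors

If I understand the docs correctly, save_pretrained() then records an auto_map in config.json, push_to_hub() uploads the .py file defining the classes, and AutoModel.from_pretrained(..., trust_remote_code=True) should work from there. Is that the right direction?

In the meantime, here is the inference script I am currently using: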

import sys
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file as safe_load  # pip install safetensors

# Make the brandon-tiny repo importable
sys.path.append(str(Path(__file__).resolve().parents[1]))
from src.model import TinyLlama, ModelConfig
from src.tokenizer import Tokenizer

# Model repo and filenames
repo_id = "xaskasdf/brandon-tiny-10m-instruct"
ckpt_file = "model.safetensors"
vocab_file = "tokenizer.model"

# Download files
ckpt_path = hf_hub_download(repo_id, filename=ckpt_file)
vocab_path = hf_hub_download(repo_id, filename=vocab_file)

# Load tokenizer
tokenizer = Tokenizer(vocab_path)

# Model config matching the checkpoint (see README / training script)
model_cfg = ModelConfig(
    dim=256,
    n_layers=24,
    n_heads=8,
    n_kv_heads=2,
    vocab_size=8192,
    hidden_dim=720,
    max_seq_len=512,
    dropout=0.05,
    weight_tying=True,
    norm_eps=1e-5,
    rope_theta=10000.0,
    block_sharing=True,
    dense_former=True,
    value_residual=True,
    n_registers=4,
)

# Build the model and load the safetensors checkpoint
model = TinyLlama(model_cfg)
state_dict = safe_load(ckpt_path)
# strict=False silently skips mismatched keys; if either list is non-empty,
# part of the network is still randomly initialized, which would explain
# degenerate output
missing, unexpected = model.load_state_dict(state_dict, strict=False)
if missing or unexpected:
    print("missing keys:", missing, "\nunexpected keys:", unexpected)
model.eval()

# Simple generation helper

def generate(prompt: str, max_new_tokens: int = 256):
    # Encode only the raw prompt (no ChatML wrappers) to keep generation simple
    input_ids = torch.tensor([tokenizer.encode(prompt)])
    with torch.no_grad():
        out = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            top_p=0.7,
            top_k=50,
        )
    # Decode only the newly generated tokens; slicing the decoded string by
    # len(prompt) can mis-align because decode(encode(p)) != p in general
    new_tokens = out[0].tolist()[input_ids.shape[1]:]
    return tokenizer.decode(new_tokens).strip()

# Demo prompts
if __name__ == "__main__":
    prompts = ["hello, my name is",
        "What is the capital of France?",
        "Explain photosynthesis in simple terms.",
        "Write a short poem about the moon.",
    ]
    for p in prompts:
        print(f"Q: {p}\nA: {generate(p)}\n")

This runs, but the outputs are very repetitive, so I suspect there is some issue with my setup.
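One thing I wondered: since this is an instruct checkpoint, maybe feeding it raw prompts is part of the problem and it expects its training chat template. If it was trained on ChatML-style conversations (the comment in generate() above hints at that), a wrapper along these lines might be closer, though I am only guessing at the special tokens:

def chat_prompt(user_msg: str) -> str:
    # ChatML-style template; the exact special tokens are a guess on my part
    return ("<|im_start|>user\n" + user_msg + "<|im_end|>\n"
            + "<|im_start|>assistant\n")

print(generate(chat_prompt("What is the capital of France?")))

If you can confirm the exact template (and whether a BOS token should be prepended), that would already help a lot.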
