from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Load the tokenizer and the 4-bit (bitsandbytes) base model.
# device_map="auto" already places the weights on GPU when one is available,
# so no manual .to(device) call is needed (and .to() is not supported for
# bitsandbytes-quantized models).
tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3.2-1b-bnb-4bit")
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/llama-3.2-1b-bnb-4bit",
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the fine-tuned LoRA adapter to the base model.
model = PeftModel.from_pretrained(base_model, "MeWan2808/SIT_legalTech_llama3.2")

# Fold the adapter into the base weights. Merging into a 4-bit base is lossy
# (PEFT dequantizes, merges, then requantizes); skip this line to run
# inference with the adapter kept separate instead.
model = model.merge_and_unload()
model.eval()
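
# A minimal generation sanity check, assuming the merged model above.
# The prompt text is illustrative and not part of the original snippet.
prompt = "Summarize the key elements of a valid contract."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))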