Update inference examples to use the correct chat template

#2
by mario-sanz - opened
Files changed (1)
  1. README.md +8 -8
README.md CHANGED
@@ -43,13 +43,13 @@ You can use OLMo with the standard HuggingFace transformers library:
 from transformers import AutoModelForCausalLM, AutoTokenizer
 olmo = AutoModelForCausalLM.from_pretrained("allenai/Olmo-3.1-32B-Think")
 tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3.1-32B-Think")
-message = ["Who would win in a fight - a dinosaur or a cow named Moo Moo?"]
-inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors='pt', return_dict=True)
 # optional verifying cuda
 # inputs = {k: v.to('cuda') for k,v in inputs.items()}
 # olmo = olmo.to('cuda')
 response = olmo.generate(**inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
-print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
+print(tokenizer.decode(response[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
 >> '<think>Okay, so the question is who would win in a fight...'
 ```

@@ -184,8 +184,8 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
-prompt = "Who would win in a fight - a dinosaur or a cow named MooMoo?"
-inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors='pt', return_dict=True).to(model.device)
 
 outputs = model.generate(
     **inputs,
@@ -194,7 +194,7 @@ outputs = model.generate(
     max_new_tokens=32768,
 )
 
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+print(tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
 ```
 
 ### vllm Example
@@ -210,8 +210,8 @@ sampling_params = SamplingParams(
     max_tokens=32768,
 )
 
-prompt = "Who would win in a fight - a dinosaur or a cow named MooMoo?"
-outputs = llm.generate(prompt, sampling_params)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+outputs = llm.chat(message, sampling_params)
 print(outputs[0].outputs[0].text)
 ```
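
For reviewers, here is the new transformers flow from the first hunk assembled into one runnable piece. The model ID and sampling flags are the README's own; only the comments are added here, to spell out what `apply_chat_template` changes:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

olmo = AutoModelForCausalLM.from_pretrained("allenai/Olmo-3.1-32B-Think")
tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3.1-32B-Think")

message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]

# apply_chat_template wraps the user message in the model's role markers and,
# with add_generation_prompt=True, appends the assistant-turn prefix so the
# model answers the question instead of continuing the raw string.
# return_dict=True returns a BatchEncoding (input_ids + attention_mask)
# rather than a bare tensor, which is what lets it be unpacked into
# generate(**inputs) below.
inputs = tokenizer.apply_chat_template(
    message, add_generation_prompt=True, return_tensors="pt", return_dict=True
)

response = olmo.generate(**inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

# generate() returns the prompt tokens followed by the completion, so slicing
# from inputs.input_ids.shape[1] decodes only the newly generated text.
print(tokenizer.decode(response[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
```

The same prompt-length slice is what the later hunks add to the longer transformers example, so the printed output no longer echoes the templated prompt.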
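The vLLM change swaps `llm.generate(prompt, sampling_params)` for `llm.chat(message, sampling_params)`, which applies the model's chat template (including the generation prompt) before sampling. A self-contained sketch of that path; note that the `LLM(...)` construction and the `temperature`/`top_p` values are assumptions here, since only `max_tokens` is visible in the hunk:

```python
from vllm import LLM, SamplingParams

# Assumed setup: the model name mirrors the transformers examples, and the
# sampling values below are placeholders; only max_tokens appears in the diff.
llm = LLM(model="allenai/Olmo-3.1-32B-Think")
sampling_params = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=32768)

message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]

# llm.chat() formats the messages with the tokenizer's chat template before
# generating, so no manual apply_chat_template call is needed on this path.
outputs = llm.chat(message, sampling_params)
print(outputs[0].outputs[0].text)
```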