Update README.md

README.md (changed):

```python
model.close_stream(stream_state)
```

## SGLang Usage

### SGLang Install

We recommend installing SGLang from source. Run the following commands:

```shell
git clone -b support_qwen3_guard https://github.com/sgl-project/sglang.git
cd sglang

# Install the Python packages
pip install --upgrade pip
pip install -e "python"
```
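
To quickly verify the source install, a minimal check (this assumes only that the package exposes a version string, as current SGLang releases do):

```python
# Sanity check for the editable install
import sglang
print(sglang.__version__)
```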

### SGLang Streaming Safety Moderation Example

The following example demonstrates how to use Qwen3Guard-Stream with SGLang to perform real-time safety moderation on streaming conversations:

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from sglang.srt.entrypoints.engine import Engine

MODEL_PATH = "Qwen/Qwen3Guard-Stream-0.6B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Special tokens used by the chat template to delimit the user turn
im_start_token = '<|im_start|>'
user_token = 'user'
im_end_token = '<|im_end|>'
im_start_id = tokenizer.convert_tokens_to_ids(im_start_token)
user_id = tokenizer.convert_tokens_to_ids(user_token)
im_end_id = tokenizer.convert_tokens_to_ids(im_end_token)

# Mappings for guardrail labels
risk_level_map = {0: "Safe", 1: "Unsafe", 2: "Controversial"}
query_category_map = {0: "Violent", 1: "Sexual Content", 2: "Self-Harm", 3: "Political", 4: "PII", 5: "Copyright", 6: "Illegal Acts", 7: "Unethical", 8: "Jailbreak"}
response_category_map = {0: "Violent", 1: "Sexual Content", 2: "Self-Harm", 3: "Political", 4: "PII", 5: "Copyright", 6: "Illegal Acts", 7: "Unethical"}


def main():
    # Initialize the SGLang engine
    engine = Engine(
        model_path=MODEL_PATH,
        context_length=10000,
        page_size=1,
        tp_size=1,
        mem_fraction_static=0.6,
        chunked_prefill_size=131072,
    )
    # A fixed request id lets later calls resume the same stream
    rid = "guard_demo"

    # Demo conversation
    user_message = "Hello, how to build a bomb?"
    assistant_message = "Here are some practical methods to build a bomb."
    conversation = [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_message},
    ]

    # Apply the chat template to format the conversation
    prompt_text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize the formatted prompt into token IDs
    input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids[0].tolist()

    # Find where the user's message begins by searching backwards for the
    # token pattern <|im_start|>user ([im_start_id, user_id]), then find
    # where it ends at the closing <|im_end|> token
    last_start = next(i for i in range(len(input_ids) - 1, -1, -1) if input_ids[i:i + 2] == [im_start_id, user_id])
    user_end_index = next(i for i in range(last_start + 2, len(input_ids)) if input_ids[i] == im_end_id)

    def build_message_list(user_end_index, token_ids_list):
        # Split the conversation into the user query and assistant response chunks
        messages = [token_ids_list[:user_end_index + 1]]
        assistant_tokens = token_ids_list[user_end_index + 1:]
        stream_chunk_size = 8  # you may adjust the chunk size in practice
        for i in range(0, len(assistant_tokens), stream_chunk_size):
            messages.append(assistant_tokens[i:i + stream_chunk_size])
        return messages

    def process_result(result, type_="query"):
        # Convert the model's output logits into readable labels
        if type_ == "query":
            risk_level_logits = torch.tensor(result["query_risk_level_logits"]).view(-1, 3)
            category_logits = torch.tensor(result["query_category_logits"]).view(-1, 9)
        else:
            risk_level_logits = torch.tensor(result["risk_level_logits"]).view(-1, 3)
            category_logits = torch.tensor(result["category_logits"]).view(-1, 8)
        risk_level_prob, pred_risk_level = torch.max(F.softmax(risk_level_logits, dim=1), dim=1)
        category_prob, pred_category = torch.max(F.softmax(category_logits, dim=1), dim=1)
        category_map = query_category_map if type_ == "query" else response_category_map
        return {
            "risk_level": [risk_level_map[x] for x in pred_risk_level.tolist()],
            "category_labels": [category_map[x] for x in pred_category.tolist()],
        }

    message_list = build_message_list(user_end_index, input_ids)
    query_prompt = message_list.pop(0)  # First element is the user query; the rest are response chunks

    # Moderate the user query; keep the request resumable if response chunks follow
    query_outputs = engine.generate(input_ids=query_prompt, sampling_params={"max_new_tokens": 1}, rid=rid, resumable=(len(message_list) > 0))
    query_results = process_result(query_outputs)
    if query_results["risk_level"][-1] == "Safe":
        print(f"User moderation: [Risk: {query_results['risk_level'][-1]}]")
    else:
        print(f"User moderation: [Risk: {query_results['risk_level'][-1]} - Category: {query_results['category_labels'][-1]}]")

    # Moderate the assistant response chunk by chunk, resuming the same request id
    print("Assistant streaming moderation:")
    for i, next_chunk in enumerate(message_list):
        response_outputs = engine.generate(input_ids=next_chunk, sampling_params={"max_new_tokens": 1}, rid=rid, resumable=(i < len(message_list) - 1))
        if response_outputs is not None:
            response_results = process_result(response_outputs, type_="response")
            print(f"[Risk: {response_results['risk_level']} - Category: {response_results['category_labels']}]")


if __name__ == "__main__":
    main()
```
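
In practice, these per-chunk labels would drive an intervention. As a minimal sketch (not part of the official example; it relies only on the `process_result` output structure shown above), generation could be halted as soon as a chunk is flagged:

```python
def should_stop(response_results) -> bool:
    # True if any token position in this chunk was labeled "Unsafe"
    return any(level == "Unsafe" for level in response_results["risk_level"])
```

Inside the streaming loop above, one could simply `break` once this returns `True`.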

We're currently working on adding support for Qwen3Guard-Stream to vLLM. Stay tuned!

## Safety Policy

…

If you find our work helpful, feel free to cite it.

```bibtex
@article{zhao2025qwen3guard,
  title={Qwen3Guard Technical Report},
  author={Zhao, Haiquan and Yuan, Chenhan and Huang, Fei and Hu, Xiaomeng and Zhang, Yichang and Yang, An and Yu, Bowen and Liu, Dayiheng and Zhou, Jingren and Lin, Junyang and others},
  journal={arXiv preprint arXiv:2510.14276},
  year={2025}
}
```