File size: 26,271 Bytes

eb10636

import ctypes
import enum
import os

# CPU core selection bits taken from the header: CPUn is bit n, so the values
# can be OR-ed together into a mask (e.g. CPU0 | CPU1 == 0x03).
# NOTE(review): presumably meant for RKLLMExtendParam.enabled_cpus_mask -- confirm.
CPU0 = 0x01  # 1 << 0
CPU1 = 0x02  # 1 << 1
CPU2 = 0x04  # 1 << 2
CPU3 = 0x08  # 1 << 3
CPU4 = 0x10  # 1 << 4
CPU5 = 0x20  # 1 << 5
CPU6 = 0x40  # 1 << 6
CPU7 = 0x80  # 1 << 7

# --- Enums ---
@enum.unique
class LLMCallState(enum.IntEnum):
    """State value handed to the result callback as its third (int) argument.

    Being an IntEnum, members compare equal to the raw integers coming from C,
    and ``LLMCallState(raw)`` converts a raw value back into a member.
    """

    # NOTE(review): presumably "a regular result while generation is ongoing" -- confirm
    RKLLM_RUN_NORMAL = 0
    # NOTE(review): presumably "runtime is waiting for more data" -- confirm
    RKLLM_RUN_WAITING = 1
    # Reported when the inference has finished
    RKLLM_RUN_FINISH = 2
    # Reported when the inference hit an error
    RKLLM_RUN_ERROR = 3

@enum.unique
class RKLLMInputType(enum.IntEnum):
    """Tag stored in ``RKLLMInput.input_type`` that says which union member is valid."""

    # Plain text prompt (union member ``prompt_input``)
    RKLLM_INPUT_PROMPT = 0
    # Token IDs (union member ``token_input``)
    RKLLM_INPUT_TOKEN = 1
    # Embedding vectors (union member ``embed_input``)
    RKLLM_INPUT_EMBED = 2
    # Prompt plus image embeddings (union member ``multimodal_input``)
    RKLLM_INPUT_MULTIMODAL = 3

@enum.unique
class RKLLMInferMode(enum.IntEnum):
    """Inference mode selector, stored as a plain int in ``RKLLMInferParam.mode``."""

    # Text generation
    RKLLM_INFER_GENERATE = 0
    # NOTE(review): presumably fills RKLLMResult.last_hidden_layer -- confirm
    RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1
    # NOTE(review): presumably fills RKLLMResult.logits -- confirm
    RKLLM_INFER_GET_LOGITS = 2

# --- Structures ---
class RKLLMExtendParam(ctypes.Structure):
    """Extended runtime options, embedded by value in ``RKLLMParam.extend_param``.

    The order and types in ``_fields_`` define the binary layout shared with
    the C library, so they must not be reordered or resized. The class-level
    annotations only mirror ``_fields_`` for readers and type checkers.
    """
    # Base IOMMU domain ID; setting it to 1 is recommended for models larger than 1B
    base_domain_id: ctypes.c_int32
    # Whether to keep the embedding in flash storage
    embed_flash: ctypes.c_int8
    # Number of enabled CPU cores
    enabled_cpus_num: ctypes.c_int8
    # Bitmask of enabled CPU cores
    enabled_cpus_mask: ctypes.c_uint32
    # 106 bytes of padding; NOTE(review): presumably reserved for future fields -- confirm
    reserved: ctypes.c_uint8 * 106

    _fields_ = [
        ("base_domain_id", ctypes.c_int32),
        ("embed_flash", ctypes.c_int8),
        ("enabled_cpus_num", ctypes.c_int8),
        ("enabled_cpus_mask", ctypes.c_uint32),
        ("reserved", ctypes.c_uint8 * 106)
    ]

class RKLLMParam(ctypes.Structure):
    """Model and sampling configuration.

    Instances are returned by ``rkllm_createDefaultParam`` and passed by
    pointer to ``rkllm_init``. The order and types in ``_fields_`` define the
    binary layout shared with the C library and must not be changed; the
    class-level annotations only mirror them for readers and type checkers.
    """
    # Path to the model file
    model_path: ctypes.c_char_p
    # Maximum number of tokens in the context window
    max_context_len: ctypes.c_int32
    # Maximum number of new tokens to generate
    max_new_tokens: ctypes.c_int32
    # Top-K sampling parameter
    top_k: ctypes.c_int32
    # Number of KV-cache entries kept when the context window shifts
    n_keep: ctypes.c_int32
    # Top-P sampling parameter
    top_p: ctypes.c_float
    # Sampling temperature; affects the randomness of token selection
    temperature: ctypes.c_float
    # Penalty for repeated tokens
    repeat_penalty: ctypes.c_float
    # Penalty for frequent tokens
    frequency_penalty: ctypes.c_float
    # Penalty for tokens already present in the input
    presence_penalty: ctypes.c_float
    # Mirostat sampling strategy flag (0 means disabled)
    mirostat: ctypes.c_int32
    # Mirostat sampling Tau parameter
    mirostat_tau: ctypes.c_float
    # Mirostat sampling Eta parameter
    mirostat_eta: ctypes.c_float
    # Whether to skip special tokens
    skip_special_token: ctypes.c_bool
    # Whether to run inference asynchronously
    is_async: ctypes.c_bool
    # Start-of-image token for multimodal input
    img_start: ctypes.c_char_p
    # End-of-image token for multimodal input
    img_end: ctypes.c_char_p
    # Pointer to the image content
    img_content: ctypes.c_char_p
    # Extended parameters
    extend_param: RKLLMExtendParam

    _fields_ = [
        ("model_path", ctypes.c_char_p),         # Path to the model file
        ("max_context_len", ctypes.c_int32),     # Maximum number of tokens in the context window
        ("max_new_tokens", ctypes.c_int32),      # Maximum number of new tokens to generate
        ("top_k", ctypes.c_int32),               # Top-K sampling parameter
        ("n_keep", ctypes.c_int32),              # Number of KV-cache entries kept when the context window shifts
        ("top_p", ctypes.c_float),               # Top-P (nucleus) sampling parameter
        ("temperature", ctypes.c_float),         # Sampling temperature; affects the randomness of token selection
        ("repeat_penalty", ctypes.c_float),      # Penalty for repeated tokens
        ("frequency_penalty", ctypes.c_float),   # Penalty for frequent tokens
        ("presence_penalty", ctypes.c_float),    # Penalty for tokens already present in the input
        ("mirostat", ctypes.c_int32),            # Mirostat sampling strategy flag (0 means disabled)
        ("mirostat_tau", ctypes.c_float),        # Mirostat sampling Tau parameter
        ("mirostat_eta", ctypes.c_float),        # Mirostat sampling Eta parameter
        ("skip_special_token", ctypes.c_bool),   # Whether to skip special tokens
        ("is_async", ctypes.c_bool),             # Whether to run inference asynchronously
        ("img_start", ctypes.c_char_p),          # Start-of-image token for multimodal input
        ("img_end", ctypes.c_char_p),            # End-of-image token for multimodal input
        ("img_content", ctypes.c_char_p),        # Pointer to the image content
        ("extend_param", RKLLMExtendParam)       # Extended parameters
    ]

class RKLLMLoraAdapter(ctypes.Structure):
    """Description of a LoRA adapter, passed by pointer to ``rkllm_load_lora``.

    ``_fields_`` defines the binary layout shared with the C library; the
    class-level annotations only mirror it.
    """
    # Adapter file path as a C string (bytes on the Python side)
    lora_adapter_path: ctypes.c_char_p
    # Adapter name as a C string; RKLLMLoraParam carries a field of the same name
    lora_adapter_name: ctypes.c_char_p
    # NOTE(review): presumably the scaling factor applied to the adapter -- confirm
    scale: ctypes.c_float

    _fields_ = [
        ("lora_adapter_path", ctypes.c_char_p),
        ("lora_adapter_name", ctypes.c_char_p),
        ("scale", ctypes.c_float)
    ]

class RKLLMEmbedInput(ctypes.Structure):
    """Embedding input: a float buffer plus its token count.

    One of the members of ``_RKLLMInputUnion``. ``_fields_`` defines the
    binary layout shared with the C library.
    """
    # Shape: [n_tokens, embed_size]
    embed: ctypes.POINTER(ctypes.c_float)
    # Number of tokens described by ``embed``
    n_tokens: ctypes.c_size_t

    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),  
        ("n_tokens", ctypes.c_size_t)
    ]

class RKLLMTokenInput(ctypes.Structure):
    """Token input: an int32 buffer of token IDs plus its length.

    One of the members of ``_RKLLMInputUnion``. ``_fields_`` defines the
    binary layout shared with the C library.
    """
    # Shape: [n_tokens]
    input_ids: ctypes.POINTER(ctypes.c_int32)
    # Number of entries in ``input_ids``
    n_tokens: ctypes.c_size_t

    _fields_ = [
        ("input_ids", ctypes.POINTER(ctypes.c_int32)),
        ("n_tokens", ctypes.c_size_t)
    ]

class RKLLMMultiModelInput(ctypes.Structure):
    """Multimodal input: a text prompt together with image embeddings.

    One of the members of ``_RKLLMInputUnion``. ``_fields_`` defines the
    binary layout shared with the C library.
    """
    # Text prompt as a C string
    prompt: ctypes.c_char_p
    # Float buffer holding the image embeddings
    # NOTE(review): exact shape is not visible here -- confirm against the vendor header
    image_embed: ctypes.POINTER(ctypes.c_float)
    # NOTE(review): presumably the number of embedding tokens per image -- confirm
    n_image_tokens: ctypes.c_size_t
    # Number of images
    n_image: ctypes.c_size_t
    # Image width; units are not visible here
    image_width: ctypes.c_size_t
    # Image height; units are not visible here
    image_height: ctypes.c_size_t

    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("image_embed", ctypes.POINTER(ctypes.c_float)),
        ("n_image_tokens", ctypes.c_size_t),
        ("n_image", ctypes.c_size_t),
        ("image_width", ctypes.c_size_t),
        ("image_height", ctypes.c_size_t)
    ]

class _RKLLMInputUnion(ctypes.Union):
    """Payload union of ``RKLLMInput``.

    All members share the same storage, so only the one matching
    ``RKLLMInput.input_type`` is meaningful at any time.
    """
    # Valid for RKLLM_INPUT_PROMPT
    prompt_input: ctypes.c_char_p
    # Valid for RKLLM_INPUT_EMBED
    embed_input: RKLLMEmbedInput
    # Valid for RKLLM_INPUT_TOKEN
    token_input: RKLLMTokenInput
    # Valid for RKLLM_INPUT_MULTIMODAL
    multimodal_input: RKLLMMultiModelInput

    _fields_ = [
        ("prompt_input", ctypes.c_char_p),
        ("embed_input", RKLLMEmbedInput),
        ("token_input", RKLLMTokenInput),
        ("multimodal_input", RKLLMMultiModelInput)
    ]

class RKLLMInput(ctypes.Structure):
    """Tagged inference input: an ``input_type`` tag followed by a payload union.

    The four properties below give checked access to the union: each one
    works only while ``input_type`` matches its payload kind and raises
    ``AttributeError`` otherwise, so set ``input_type`` before assigning a
    payload through a property.
    """
    input_type: ctypes.c_int
    _union_data: _RKLLMInputUnion

    _fields_ = [
        ("input_type", ctypes.c_int),  # RKLLMInputType value, carried as a plain C int
        ("_union_data", _RKLLMInputUnion)
    ]

    # --- text prompt payload (c_char_p is exposed as bytes) ---
    @property
    def prompt_input(self) -> bytes:
        if self.input_type != RKLLMInputType.RKLLM_INPUT_PROMPT:
            raise AttributeError("Not a prompt input")
        return self._union_data.prompt_input

    @prompt_input.setter
    def prompt_input(self, value: bytes):
        if self.input_type != RKLLMInputType.RKLLM_INPUT_PROMPT:
            raise AttributeError("Not a prompt input")
        self._union_data.prompt_input = value

    # --- embedding payload ---
    @property
    def embed_input(self) -> RKLLMEmbedInput:
        if self.input_type != RKLLMInputType.RKLLM_INPUT_EMBED:
            raise AttributeError("Not an embed input")
        return self._union_data.embed_input

    @embed_input.setter
    def embed_input(self, value: RKLLMEmbedInput):
        if self.input_type != RKLLMInputType.RKLLM_INPUT_EMBED:
            raise AttributeError("Not an embed input")
        self._union_data.embed_input = value

    # --- token-id payload ---
    @property
    def token_input(self) -> RKLLMTokenInput:
        if self.input_type != RKLLMInputType.RKLLM_INPUT_TOKEN:
            raise AttributeError("Not a token input")
        return self._union_data.token_input

    @token_input.setter
    def token_input(self, value: RKLLMTokenInput):
        if self.input_type != RKLLMInputType.RKLLM_INPUT_TOKEN:
            raise AttributeError("Not a token input")
        self._union_data.token_input = value

    # --- multimodal payload ---
    @property
    def multimodal_input(self) -> RKLLMMultiModelInput:
        if self.input_type != RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
            raise AttributeError("Not a multimodal input")
        return self._union_data.multimodal_input

    @multimodal_input.setter
    def multimodal_input(self, value: RKLLMMultiModelInput):
        if self.input_type != RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
            raise AttributeError("Not a multimodal input")
        self._union_data.multimodal_input = value

class RKLLMLoraParam(ctypes.Structure): # For inference
    """Per-inference LoRA selection, referenced by ``RKLLMInferParam.lora_params``."""
    # Adapter name as a C string
    # NOTE(review): presumably must match a name loaded via rkllm_load_lora -- confirm
    lora_adapter_name: ctypes.c_char_p

    _fields_ = [
        ("lora_adapter_name", ctypes.c_char_p)
    ]

class RKLLMPromptCacheParam(ctypes.Structure): # For inference
    """Per-inference prompt-cache options, referenced by ``RKLLMInferParam.prompt_cache_params``."""
    # Integer used as a boolean flag
    save_prompt_cache: ctypes.c_int # bool-like
    # Prompt-cache file path as a C string
    prompt_cache_path: ctypes.c_char_p

    _fields_ = [
        ("save_prompt_cache", ctypes.c_int), # bool-like
        ("prompt_cache_path", ctypes.c_char_p)
    ]

class RKLLMInferParam(ctypes.Structure):
    """Per-call inference options, passed by pointer to ``rkllm_run`` / ``rkllm_run_async``.

    ``_fields_`` defines the binary layout shared with the C library. A
    freshly constructed instance is zero-initialised by ctypes, so both
    pointer fields start out as NULL.
    """
    # RKLLMInferMode value, carried as a plain C int
    mode: ctypes.c_int
    # Optional pointer to LoRA options
    lora_params: ctypes.POINTER(RKLLMLoraParam)
    # Optional pointer to prompt-cache options
    prompt_cache_params: ctypes.POINTER(RKLLMPromptCacheParam)
    # Integer used as a boolean flag
    keep_history: ctypes.c_int # bool-like

    _fields_ = [
        ("mode", ctypes.c_int), # Enum will be passed as int, changed RKLLMInferMode to ctypes.c_int
        ("lora_params", ctypes.POINTER(RKLLMLoraParam)),
        ("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam)),
        ("keep_history", ctypes.c_int) # bool-like
    ]

class RKLLMResultLastHiddenLayer(ctypes.Structure):
    """Hidden-state output block, embedded in ``RKLLMResult.last_hidden_layer``."""
    # Shape: [num_tokens, embd_size]
    hidden_states: ctypes.POINTER(ctypes.c_float)
    # Size of the hidden layer
    embd_size: ctypes.c_int
    # Number of output tokens
    num_tokens: ctypes.c_int

    _fields_ = [
        ("hidden_states", ctypes.POINTER(ctypes.c_float)),
        ("embd_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int)
    ]

class RKLLMResultLogits(ctypes.Structure):
    """Logits output block, embedded in ``RKLLMResult.logits``."""
    # Shape: [num_tokens, vocab_size]
    logits: ctypes.POINTER(ctypes.c_float)
    # Vocabulary size
    vocab_size: ctypes.c_int
    # Number of output tokens
    num_tokens: ctypes.c_int

    _fields_ = [
        ("logits", ctypes.POINTER(ctypes.c_float)),
        ("vocab_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int)
    ]

class RKLLMResult(ctypes.Structure):
    """Result record handed to the result callback by pointer.

    ``_fields_`` defines the binary layout shared with the C library.
    """
    # Generated text as a C string; may be NULL (reads as None in Python)
    text: ctypes.c_char_p
    # ID of the generated token
    token_id: ctypes.c_int32
    # Hidden-state output
    # NOTE(review): presumably only populated in RKLLM_INFER_GET_LAST_HIDDEN_LAYER mode -- confirm
    last_hidden_layer: RKLLMResultLastHiddenLayer
    # Logits output
    # NOTE(review): presumably only populated in RKLLM_INFER_GET_LOGITS mode -- confirm
    logits: RKLLMResultLogits

    _fields_ = [
        ("text", ctypes.c_char_p),
        ("token_id", ctypes.c_int32),
        ("last_hidden_layer", RKLLMResultLastHiddenLayer),
        ("logits", RKLLMResultLogits)
    ]

# --- Typedefs ---
# Opaque handle to an LLM instance: rkllm_init writes it through a pointer and
# every other library call takes it by value.
LLMHandle = ctypes.c_void_p

# --- Callback Function Type ---
# C prototype of the result callback:
#     void callback(RKLLMResult* result, void* userdata, int state)
# The state arrives as a plain int; convert it with LLMCallState(state) in Python.
# An instance of this type must be kept referenced for as long as the C side
# may call it, otherwise ctypes frees the trampoline.
LLMResultCallback = ctypes.CFUNCTYPE(
    None,  # return type: void
    ctypes.POINTER(RKLLMResult),
    ctypes.c_void_p,  # userdata
    ctypes.c_int      # enum, will be passed as int. Changed LLMCallState to ctypes.c_int
)


class RKLLMRuntime:
    """Object-oriented wrapper around the RKLLM runtime shared library.

    The instance loads the library, declares the C prototypes, and owns one
    ``LLMHandle``. It can be used as a context manager; leaving the ``with``
    block (or garbage collection) calls :meth:`destroy`.

    Every method that wraps a status-returning C function raises
    ``RuntimeError`` on a non-zero status, except :meth:`destroy`, which only
    prints a warning because it also runs from ``__del__``.
    """

    def __init__(self, library_path: str = "./librkllmrt.so"):
        """Load the shared library and declare its function prototypes.

        :param library_path: Path or name handed to ``ctypes.CDLL``.
        :raises OSError: If the library cannot be loaded (original error chained).
        """
        # Everything destroy()/__del__ reads is set up before anything can
        # fail, so a half-constructed object can still be finalised safely.
        self.lib = None
        self.llm_handle = LLMHandle()
        self._c_callback = None      # keeps the ctypes callback trampoline alive
        self._userdata_ref = None    # keeps the user's Python object alive
        self._userdata_holder = None # keeps the py_object box + pointer alive
        self._async_refs = None      # keeps run_async() argument structs alive

        try:
            self.lib = ctypes.CDLL(library_path)
        except OSError as e:
            raise OSError(f"Failed to load RKLLM library from {library_path}. "
                          f"Ensure it's in your LD_LIBRARY_PATH or provide the full path. Error: {e}") from e
        self._setup_functions()

    def _setup_functions(self):
        """Declare ``restype``/``argtypes`` for every library function used here."""
        lib = self.lib

        # RKLLMParam rkllm_createDefaultParam();
        lib.rkllm_createDefaultParam.restype = RKLLMParam
        lib.rkllm_createDefaultParam.argtypes = []

        # int rkllm_init(LLMHandle* handle, RKLLMParam* param, LLMResultCallback callback);
        lib.rkllm_init.restype = ctypes.c_int
        lib.rkllm_init.argtypes = [
            ctypes.POINTER(LLMHandle),
            ctypes.POINTER(RKLLMParam),
            LLMResultCallback
        ]

        # int rkllm_load_lora(LLMHandle handle, RKLLMLoraAdapter* lora_adapter);
        lib.rkllm_load_lora.restype = ctypes.c_int
        lib.rkllm_load_lora.argtypes = [LLMHandle, ctypes.POINTER(RKLLMLoraAdapter)]

        # int rkllm_load_prompt_cache(LLMHandle handle, const char* prompt_cache_path);
        lib.rkllm_load_prompt_cache.restype = ctypes.c_int
        lib.rkllm_load_prompt_cache.argtypes = [LLMHandle, ctypes.c_char_p]

        # int rkllm_release_prompt_cache(LLMHandle handle);
        lib.rkllm_release_prompt_cache.restype = ctypes.c_int
        lib.rkllm_release_prompt_cache.argtypes = [LLMHandle]

        # int rkllm_destroy(LLMHandle handle);
        lib.rkllm_destroy.restype = ctypes.c_int
        lib.rkllm_destroy.argtypes = [LLMHandle]

        # int rkllm_run(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
        lib.rkllm_run.restype = ctypes.c_int
        lib.rkllm_run.argtypes = [
            LLMHandle,
            ctypes.POINTER(RKLLMInput),
            ctypes.POINTER(RKLLMInferParam),
            ctypes.c_void_p  # userdata
        ]

        # int rkllm_run_async(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
        # Assuming async also takes userdata for the callback context
        lib.rkllm_run_async.restype = ctypes.c_int
        lib.rkllm_run_async.argtypes = [
            LLMHandle,
            ctypes.POINTER(RKLLMInput),
            ctypes.POINTER(RKLLMInferParam),
            ctypes.c_void_p  # userdata
        ]

        # int rkllm_abort(LLMHandle handle);
        lib.rkllm_abort.restype = ctypes.c_int
        lib.rkllm_abort.argtypes = [LLMHandle]

        # int rkllm_is_running(LLMHandle handle);
        lib.rkllm_is_running.restype = ctypes.c_int  # 0 if running, non-zero otherwise
        lib.rkllm_is_running.argtypes = [LLMHandle]

        # int rkllm_clear_kv_cache(LLMHandle handle, int keep_system_prompt);
        lib.rkllm_clear_kv_cache.restype = ctypes.c_int
        lib.rkllm_clear_kv_cache.argtypes = [LLMHandle, ctypes.c_int]

        # int rkllm_set_chat_template(LLMHandle handle, const char* system_prompt, const char* prompt_prefix, const char* prompt_postfix);
        lib.rkllm_set_chat_template.restype = ctypes.c_int
        lib.rkllm_set_chat_template.argtypes = [
            LLMHandle,
            ctypes.c_char_p,
            ctypes.c_char_p,
            ctypes.c_char_p
        ]

    def _wrap_userdata(self, userdata):
        """Turn a Python object into the ``void*`` passed as callback userdata.

        The pointer addresses a ``ctypes.py_object`` box holding *userdata*.
        The box and its pointer are stored on the instance, because the native
        side may use the address after the calling method has returned (as
        with :meth:`run_async`). A callback can recover the object with
        ``ctypes.cast(userdata_ptr, ctypes.POINTER(ctypes.py_object)).contents.value``.

        :return: A ``c_void_p`` for the call, or ``None`` (NULL) when
                 *userdata* is ``None``.
        """
        if userdata is None:
            return None
        box = ctypes.py_object(userdata)
        box_ptr = ctypes.pointer(box)
        self._userdata_ref = userdata
        self._userdata_holder = (box, box_ptr)
        return ctypes.cast(box_ptr, ctypes.c_void_p)

    def create_default_param(self) -> RKLLMParam:
        """Return the library's default ``RKLLMParam`` structure."""
        return self.lib.rkllm_createDefaultParam()

    def init(self, param: RKLLMParam, callback_func) -> int:
        """
        Initializes the LLM.
        :param param: RKLLMParam structure.
        :param callback_func: A Python function that matches the signature:
                              def my_callback(result_ptr, userdata_ptr, state_enum):
                                  result = result_ptr.contents # RKLLMResult
                                  # Process result
                                  # userdata_ptr is None unless userdata was passed to run()/run_async()
                                  # state = LLMCallState(state_enum)
        :return: 0 for success.
        :raises ValueError: If callback_func is not callable.
        :raises RuntimeError: If rkllm_init returns a non-zero status.
        """
        if not callable(callback_func):
            raise ValueError("callback_func must be a callable Python function.")

        # The trampoline must outlive the handle: if it were garbage collected
        # the C side would call into freed memory, so it is stored on self.
        self._c_callback = LLMResultCallback(callback_func)

        ret = self.lib.rkllm_init(ctypes.byref(self.llm_handle), ctypes.byref(param), self._c_callback)
        if ret != 0:
            raise RuntimeError(f"rkllm_init failed with error code {ret}")
        return ret

    def load_lora(self, lora_adapter: RKLLMLoraAdapter) -> int:
        """Load a LoRA adapter; raises RuntimeError on a non-zero status."""
        ret = self.lib.rkllm_load_lora(self.llm_handle, ctypes.byref(lora_adapter))
        if ret != 0:
            raise RuntimeError(f"rkllm_load_lora failed with error code {ret}")
        return ret

    def load_prompt_cache(self, prompt_cache_path: str) -> int:
        """Load a prompt cache from a file; the path is sent UTF-8 encoded."""
        c_path = prompt_cache_path.encode('utf-8')
        ret = self.lib.rkllm_load_prompt_cache(self.llm_handle, c_path)
        if ret != 0:
            raise RuntimeError(f"rkllm_load_prompt_cache failed for {prompt_cache_path} with error code {ret}")
        return ret

    def release_prompt_cache(self) -> int:
        """Release the prompt cache; raises RuntimeError on a non-zero status."""
        ret = self.lib.rkllm_release_prompt_cache(self.llm_handle)
        if ret != 0:
            raise RuntimeError(f"rkllm_release_prompt_cache failed with error code {ret}")
        return ret

    def destroy(self) -> int:
        """Destroy the LLM instance and reset the handle.

        Safe to call repeatedly and on an object whose construction failed.
        Never raises on a failing ``rkllm_destroy`` (it may run from
        ``__del__``); the status is printed as a warning and returned.

        :return: Status of ``rkllm_destroy``, or 0 when there was nothing to destroy.
        """
        # getattr: tolerate instances whose __init__ never ran to completion.
        lib = getattr(self, "lib", None)
        handle = getattr(self, "llm_handle", None)
        if lib is None or handle is None or not handle.value:
            return 0  # Already destroyed or not initialized

        ret = lib.rkllm_destroy(handle)
        self.llm_handle = LLMHandle()  # Reset handle so a second call is a no-op
        if ret != 0:
            # Don't raise here as it might be called in __del__
            print(f"Warning: rkllm_destroy failed with error code {ret}") 
        return ret

    def run(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
        """Run an inference task synchronously.

        :param rkllm_input: Input structure, passed by reference.
        :param rkllm_infer_params: Inference options, passed by reference.
        :param userdata: Optional Python object forwarded to the callback as
                         a pointer to a ``py_object`` (see ``_wrap_userdata``).
        :return: 0 on success.
        :raises RuntimeError: If rkllm_run returns a non-zero status.
        """
        c_userdata = self._wrap_userdata(userdata)
        ret = self.lib.rkllm_run(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
        if ret != 0:
            raise RuntimeError(f"rkllm_run failed with error code {ret}")
        return ret

    def run_async(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
        """Run an inference task asynchronously.

        Parameters are as for :meth:`run`. Because the native call may still
        be working after this method returns, the userdata box is kept alive
        on the instance, and so are the two argument structures.

        :return: 0 on success.
        :raises RuntimeError: If rkllm_run_async returns a non-zero status.
        """
        c_userdata = self._wrap_userdata(userdata)
        # NOTE(review): whether the library copies these structs before
        # returning is not visible here; holding references is the safe choice.
        self._async_refs = (rkllm_input, rkllm_infer_params)
        ret = self.lib.rkllm_run_async(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
        if ret != 0:
            raise RuntimeError(f"rkllm_run_async failed with error code {ret}")
        return ret

    def abort(self) -> int:
        """Abort an ongoing task; raises RuntimeError on a non-zero status."""
        ret = self.lib.rkllm_abort(self.llm_handle)
        if ret != 0:
            raise RuntimeError(f"rkllm_abort failed with error code {ret}")
        return ret

    def is_running(self) -> bool:
        """Return True if a task is currently running."""
        # The C API returns 0 if running, non-zero otherwise, which is the
        # opposite of the usual truthiness; hence the explicit comparison.
        return self.lib.rkllm_is_running(self.llm_handle) == 0

    def clear_kv_cache(self, keep_system_prompt: bool) -> int:
        """Clear the key-value cache; the flag is passed to C as 1 or 0."""
        ret = self.lib.rkllm_clear_kv_cache(self.llm_handle, ctypes.c_int(1 if keep_system_prompt else 0))
        if ret != 0:
            raise RuntimeError(f"rkllm_clear_kv_cache failed with error code {ret}")
        return ret

    def set_chat_template(self, system_prompt: str, prompt_prefix: str, prompt_postfix: str) -> int:
        """Set the chat template.

        Each string is sent UTF-8 encoded; ``None`` or an empty string is sent
        as an empty C string rather than NULL.
        """
        c_system = system_prompt.encode('utf-8') if system_prompt else b""
        c_prefix = prompt_prefix.encode('utf-8') if prompt_prefix else b""
        c_postfix = prompt_postfix.encode('utf-8') if prompt_postfix else b""

        ret = self.lib.rkllm_set_chat_template(self.llm_handle, c_system, c_prefix, c_postfix)
        if ret != 0:
            raise RuntimeError(f"rkllm_set_chat_template failed with error code {ret}")
        return ret

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so exceptions from the with-body propagate.
        self.destroy()

    def __del__(self):
        self.destroy()  # Ensure resources are freed if object is garbage collected

# --- Example Usage (Illustrative) ---
if __name__ == "__main__":
    # This is a placeholder for how you might use it.
    # You'll need a valid .rkllm model and librkllmrt.so in your path.

    # Module-level list that collects the text pieces delivered to the callback
    results_buffer = []

    def my_python_callback(result_ptr, userdata_ptr, state_enum):
        """
        Callback function to be called by the C library.

        Prints every result and appends its text to ``results_buffer``.
        """
        state = LLMCallState(state_enum)
        result = result_ptr.contents

        current_text = ""
        if result.text: # Check if the char_p is not NULL
            current_text = result.text.decode('utf-8', errors='ignore')

        print(f"Callback: State={state.name}, TokenID={result.token_id}, Text='{current_text}'")
        results_buffer.append(current_text)

        if state == LLMCallState.RKLLM_RUN_FINISH:
            print("Inference finished.")
        elif state == LLMCallState.RKLLM_RUN_ERROR:
            print("Inference error.")

        # Example: Accessing logits if available (and if mode was set to get logits)
        # if result.logits.logits and result.logits.vocab_size > 0:
        #     print(f"  Logits (first 5 of vocab_size {result.logits.vocab_size}):")
        #     for i in range(min(5, result.logits.vocab_size)):
        #         print(f"    {result.logits.logits[i]:.4f}", end=" ")
        #     print()

    # These are read by the ``finally`` clause, so they must exist even when
    # the very first statement of the ``try`` block (loading the library) fails.
    model_file = "dummy_model.rkllm"
    created_dummy_model = False
    rk_llm = None

    # --- Attempt to use the wrapper ---
    try:
        print("Initializing RKLLMRuntime...")
        # Adjust library_path if librkllmrt.so is not in default search paths
        # e.g., library_path="./path/to/librkllmrt.so"
        rk_llm = RKLLMRuntime()

        print("Creating default parameters...")
        params = rk_llm.create_default_param()

        # --- Configure parameters ---
        # THIS IS CRITICAL: model_path must point to an actual .rkllm file
        # For this example to run, you need a model file.
        # Let's assume a dummy path for now, this will fail at init if not valid.
        if not os.path.exists(model_file):
            print(f"Warning: Model file '{model_file}' does not exist. Init will likely fail.")
            # Create a dummy file for the example to proceed further, though init will still fail
            # with a real library unless it's a valid model.
            with open(model_file, "w", encoding="utf-8") as f:
                f.write("dummy content")
            # Remember that this file is ours, so cleanup never deletes a
            # pre-existing file that happens to have the same name.
            created_dummy_model = True

        params.model_path = model_file.encode('utf-8')
        params.max_context_len = 512
        params.max_new_tokens = 128
        params.top_k = 1 # Greedy
        params.temperature = 0.7
        params.repeat_penalty = 1.1
        # ... set other params as needed

        print(f"Initializing LLM with model: {params.model_path.decode()}...")
        # This will likely fail if dummy_model.rkllm is not a valid model recognized by the library
        try:
            rk_llm.init(params, my_python_callback)
            print("LLM Initialized.")
        except RuntimeError as e:
            print(f"Error during LLM initialization: {e}")
            print("This is expected if 'dummy_model.rkllm' is not a valid model.")
            print("Replace 'dummy_model.rkllm' with a real model path to test further.")
            # Same effect as exit(), without relying on the name that the
            # ``site`` module injects; the ``finally`` clause below still runs.
            raise SystemExit

        # --- Prepare input ---
        print("Preparing input...")
        rk_input = RKLLMInput()
        rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT

        prompt_text = "Translate the following English text to French: 'Hello, world!'"
        c_prompt = prompt_text.encode('utf-8')
        rk_input._union_data.prompt_input = c_prompt # Accessing union member directly

        # --- Prepare inference parameters ---
        print("Preparing inference parameters...")
        infer_params = RKLLMInferParam()
        infer_params.mode = RKLLMInferMode.RKLLM_INFER_GENERATE
        infer_params.keep_history = 1 # True
        # infer_params.lora_params = None # or set up RKLLMLoraParam if using LoRA
        # infer_params.prompt_cache_params = None # or set up RKLLMPromptCacheParam

        # --- Run inference ---
        print(f"Running inference with prompt: '{prompt_text}'")
        results_buffer.clear()
        try:
            rk_llm.run(rk_input, infer_params) # Userdata is None by default
            print("\n--- Full Response ---")
            print("".join(results_buffer))
            print("---------------------\n")
        except RuntimeError as e:
            print(f"Error during LLM run: {e}")

        # --- Example: Set chat template (if model supports it) ---
        # print("Setting chat template...")
        # try:
        #     rk_llm.set_chat_template("You are a helpful assistant.", "<user>: ", "<assistant>: ")
        #     print("Chat template set.")
        # except RuntimeError as e:
        #     print(f"Error setting chat template: {e}")

        # --- Example: Clear KV Cache ---
        # print("Clearing KV cache (keeping system prompt if any)...")
        # try:
        #     rk_llm.clear_kv_cache(keep_system_prompt=True)
        #     print("KV cache cleared.")
        # except RuntimeError as e:
        #     print(f"Error clearing KV cache: {e}")

    except OSError as e:
        print(f"OSError: {e}. Could not load the RKLLM library.")
        print("Please ensure 'librkllmrt.so' is in your LD_LIBRARY_PATH or provide the full path.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        if rk_llm is not None and rk_llm.llm_handle and rk_llm.llm_handle.value:
            print("Destroying LLM instance...")
            rk_llm.destroy()
            print("LLM instance destroyed.")
        if created_dummy_model and os.path.exists(model_file):
            os.remove(model_file) # Clean up the dummy file created above

    print("Example finished.")