# handler.py
import os
import io
import base64
from typing import Any, Dict, List, Union

import torch
from PIL import Image
from transformers import (
    AutoConfig,
    AutoProcessor,
    AutoTokenizer,
    AutoModelForVision2Seq,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

# --- Env defaults ---
os.environ.setdefault("HF_TRUST_REMOTE_CODE", "1")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
USE_4BIT = os.environ.get("USE_4BIT", "1") not in {"0", "false", "False"}
# NOTE(review): torch is imported above this line; CUDA_VISIBLE_DEVICES only
# takes effect if CUDA has not been initialized yet -- confirm ordering.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1")

# mrope-related keys (some model branches index these keys directly on the
# kwargs passed to generate() instead of reading them from generation_config).
MROPE_KEYS = ("mrope_section", "mrope_section_output", "mrope_theta", "mrope_t")


def _b64_to_pil(b64str: str) -> Image.Image:
    """Decode a base64 string into an RGB PIL image."""
    return Image.open(io.BytesIO(base64.b64decode(b64str))).convert("RGB")


def _normalize_images(img_field: Union[str, List[str], None]) -> List[Image.Image]:
    """Normalize the ``image_b64`` payload field into a list of PIL images.

    Accepts ``None`` (no images), a single base64 string, or a list of
    base64 strings. Raises ``ValueError`` for any other type.
    """
    if img_field is None:
        return []
    if isinstance(img_field, str):
        return [_b64_to_pil(img_field)]
    if isinstance(img_field, list):
        return [_b64_to_pil(s) for s in img_field]
    raise ValueError("image_b64 must be a base64 string or a list of base64 strings.")


def _to_device(batch: Dict[str, Any], device: torch.device) -> Dict[str, Any]:
    """Move every tensor-like value (anything with ``.to``) to ``device``."""
    out: Dict[str, Any] = {}
    for k, v in batch.items():
        out[k] = v.to(device) if hasattr(v, "to") else v
    return out


class EndpointHandler:
    """Inference endpoint handler supporting two input shapes.

    1) Inference API shape: ``{"inputs": {...}, "parameters": {...}}``
    2) Flat shape: ``{"prompt": "...", "image_b64": "...|[...]",
       "max_new_tokens": ..., "temperature": ..., "top_p": ...,
       "force_text": ...}``

    Returns: ``{"text": "..."}``
    """

    def __init__(self, path: str = "/repository"):
        self.model_id = path
        self.cfg = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)

        # Decide whether the checkpoint is multimodal (vision-language).
        vl_model_types = {
            "qwen2_5_vl", "qwen2_vl", "mllama", "fuyu", "phi4multimodal",
            "git", "gotocr2", "qwen2_5_vl_moe",
        }
        self.is_vl = getattr(self.cfg, "model_type", "").lower() in vl_model_types

        # Processor / tokenizer. VL models get an AutoProcessor; a tokenizer
        # is always resolved so text-only generation works in either case.
        self.processor = None
        self.tokenizer = None
        if self.is_vl:
            self.processor = AutoProcessor.from_pretrained(
                self.model_id, trust_remote_code=True
            )
            self.tokenizer = getattr(self.processor, "tokenizer", None)
            if self.tokenizer is None:
                self.tokenizer = AutoTokenizer.from_pretrained(
                    self.model_id, trust_remote_code=True, use_fast=True
                )
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_id, trust_remote_code=True, use_fast=True
            )

        # Precision / quantization: bf16 on GPU, fp32 on CPU; 4-bit NF4 via
        # bitsandbytes when enabled (best-effort -- config construction may
        # fail if bitsandbytes is unavailable).
        dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        quant_cfg = None
        if USE_4BIT:
            try:
                quant_cfg = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=dtype,
                    bnb_4bit_use_double_quant=True,
                )
            except Exception:
                quant_cfg = None

        # Per-device memory budget for device_map="auto" sharding.
        self.max_memory = {0: "38GiB", 1: "38GiB", "cpu": "120GiB"}

        ModelClass = AutoModelForVision2Seq if self.is_vl else AutoModelForCausalLM
        self.model = ModelClass.from_pretrained(
            self.model_id,
            trust_remote_code=True,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            device_map="auto",
            max_memory=self.max_memory,
            quantization_config=quant_cfg,
        ).eval()
        torch.backends.cuda.matmul.allow_tf32 = True

        # Defaults are greedy decoding; sampling params are only forwarded to
        # generate() when do_sample is True (see __call__).
        self.default_gen_kwargs = dict(
            max_new_tokens=128,
            do_sample=False,
            temperature=0.0,
            top_p=1.0,
        )

        has_mrope = any(hasattr(self.cfg, k) for k in MROPE_KEYS)
        print(
            f"[handler] transformers={self._tf_version_safe()}, "
            f"model_type={getattr(self.cfg, 'model_type', 'unknown')}, "
            f"has_mrope_in_cfg={has_mrope}"
        )
        # Seed the mrope keys on generation_config up front.
        self._ensure_mrope_on(self.model.generation_config)

    # ---------- helpers ----------
    def _tf_version_safe(self) -> str:
        """Return the installed transformers version, or 'unknown'."""
        try:
            import transformers as _tf  # noqa
            return getattr(_tf, "__version__", "unknown")
        except Exception:
            return "unknown"

    def _ensure_mrope_on(self, gen_cfg: Any) -> None:
        """Make sure every mrope_* attribute exists on ``gen_cfg`` (as None).

        Best-effort: also patches any internal dict representation, since
        different GenerationConfig implementations store state differently.
        """
        if gen_cfg is None:
            return
        for k in MROPE_KEYS:
            if not hasattr(gen_cfg, k):
                try:
                    setattr(gen_cfg, k, None)
                except Exception:
                    pass
            for attr_name in ("__dict__", "_internal_dict"):
                try:
                    d = getattr(gen_cfg, attr_name, None)
                    if isinstance(d, dict) and k not in d:
                        d[k] = None
                except Exception:
                    pass

    def _inject_mrope_into_kwargs(self, gen_kwargs: Dict[str, Any]) -> None:
        """Inject mrope_* keys directly into the generate() kwargs.

        Key workaround: some model branches index ``kwargs['mrope_section']``
        etc. directly; pre-populating the keys (value from generation_config,
        else None) prevents a KeyError inside generate().
        """
        for k in MROPE_KEYS:
            if k not in gen_kwargs:
                gen_kwargs[k] = getattr(self.model.generation_config, k, None)

    def _build_and_merge_payload(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten the Inference-API shape into the flat payload shape.

        ``inputs`` wins over ``parameters`` on key collisions; a flat payload
        (or a string ``inputs``) is returned unchanged.
        """
        if isinstance(data, dict) and "inputs" in data and not isinstance(data["inputs"], str):
            payload = data.get("inputs") or {}
            params = data.get("parameters") or {}
            merged = dict(payload)
            for k, v in params.items():
                merged.setdefault(k, v)
            return merged
        return data

    def _decode_outputs(self, outputs: Any) -> str:
        """Decode generate() output to text: processor first, then tokenizer,
        then a raw string representation as a last resort."""
        if hasattr(self.processor, "batch_decode"):
            try:
                return self.processor.batch_decode(outputs, skip_special_tokens=True)[0]
            except Exception:
                pass
        if self.tokenizer is not None:
            try:
                return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            except Exception:
                pass
        try:
            return str(outputs[0].tolist())
        except Exception:
            return str(outputs)

    # ---------- inference ----------
    @torch.inference_mode()
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        data = self._build_and_merge_payload(data)

        prompt = str(data.get("prompt", "")).strip()
        max_new_tokens = int(data.get("max_new_tokens", self.default_gen_kwargs["max_new_tokens"]))
        temperature = float(data.get("temperature", self.default_gen_kwargs["temperature"]))
        top_p = float(data.get("top_p", self.default_gen_kwargs["top_p"]))
        force_text = bool(data.get("force_text", False))

        do_sample = temperature > 0.0
        gen_kwargs: Dict[str, Any] = dict(
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
        )
        # FIX: only forward sampling params when sampling -- recent
        # transformers versions warn/raise on temperature=0.0 with
        # do_sample=False.
        if do_sample:
            gen_kwargs["temperature"] = temperature
            gen_kwargs["top_p"] = top_p

        # Before generating: belt and braces on both config and kwargs.
        self._ensure_mrope_on(self.model.generation_config)
        self._inject_mrope_into_kwargs(gen_kwargs)

        # Text-only path
        def run_text_path(text: str) -> str:
            tok = self.tokenizer
            if tok is None and getattr(self, "processor", None) is not None:
                tok = getattr(self.processor, "tokenizer", None)
            if tok is None:
                tok = AutoTokenizer.from_pretrained(
                    self.model_id, trust_remote_code=True, use_fast=True
                )
            # Cache whichever tokenizer we resolved for later calls.
            self.tokenizer = tok

            txt = text if text else "Hello"
            inputs = tok(txt, return_tensors="pt")
            if "input_ids" not in inputs or inputs["input_ids"].numel() == 0:
                inputs = tok("Hello", return_tensors="pt")
            inputs = _to_device(inputs, self.model.device)

            # Re-inject in case the outer kwargs were mutated.
            local_gen = dict(gen_kwargs)
            self._inject_mrope_into_kwargs(local_gen)
            try:
                out = self.model.generate(**inputs, **local_gen)
            except KeyError as e:
                if "mrope" in str(e).lower():
                    print("[handler] caught mrope in text path, rebuilding & retry")
                    self._ensure_mrope_on(self.model.generation_config)
                    self._inject_mrope_into_kwargs(local_gen)
                    out = self.model.generate(**inputs, **local_gen)
                else:
                    raise
            # FIX: decode with the locally-resolved tokenizer, not a possibly
            # different self.tokenizer.
            return tok.decode(out[0], skip_special_tokens=True)

        # Non-VL model, or caller forced the text path
        if (not self.is_vl) or force_text:
            return {"text": run_text_path(prompt)}

        # VL path
        images = _normalize_images(data.get("image_b64"))
        if hasattr(self.processor, "apply_chat_template"):
            content = [{"type": "text", "text": prompt or "Describe the image."}]
            for _ in images:
                content.append({"type": "image"})
            msgs = [{"role": "user", "content": content}]
            prompt_text = self.processor.apply_chat_template(
                msgs, tokenize=False, add_generation_prompt=True
            )
        else:
            prompt_text = prompt or "Describe the image."

        proc_inputs = self.processor(
            text=prompt_text,
            images=images if images else None,
            return_tensors="pt",
        )
        # Empty tokenization with no images: fall back to the text path.
        if (("input_ids" in proc_inputs
             and hasattr(proc_inputs["input_ids"], "numel")
             and proc_inputs["input_ids"].numel() == 0)
                and not images):
            return {"text": run_text_path(prompt)}

        proc_inputs = _to_device(proc_inputs, self.model.device)

        # Inject again right before every generate() call.
        local_gen = dict(gen_kwargs)
        self._inject_mrope_into_kwargs(local_gen)
        try:
            outputs = self.model.generate(**proc_inputs, **local_gen)
        except KeyError as e:
            if "mrope" in str(e).lower():
                print("[handler] caught mrope in VL path, rebuilding & retry")
                self._ensure_mrope_on(self.model.generation_config)
                self._inject_mrope_into_kwargs(local_gen)
                outputs = self.model.generate(**proc_inputs, **local_gen)
            else:
                raise

        text = self._decode_outputs(outputs)
        return {"text": text}