Updating source code to support batching
#1 by HV-Khurdula - opened

moondream.py CHANGED (+225 -0)
@@ -828,6 +828,231 @@ class MoondreamModel(nn.Module):

        return {"points": objects}

    # === BEGIN: Batched multi-label detection additions ===
    def _load_encoded_image_batched(self, encoded_image, batch_size: int):
        """
        Clone single-image KV caches into a batch-B cache so we can decode B
        labels in parallel.
        """
        for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
            T = k.size(2)
            # Allocate new [B, n_kv_heads, T_max, head_dim] caches if needed
            if b.kv_cache.k_cache.size(0) != batch_size:
                new_k = b.kv_cache.k_cache.new_zeros((batch_size,) + b.kv_cache.k_cache.shape[1:])
                new_v = b.kv_cache.v_cache.new_zeros((batch_size,) + b.kv_cache.v_cache.shape[1:])
                b.kv_cache.k_cache = new_k
                b.kv_cache.v_cache = new_v
            # Copy the current prefix from the encoded image into all B rows
            b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
            b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)

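    # Illustrative sketch (not part of the diff): what the cloning above is
    # expected to guarantee, assuming each block's kv_cache holds tensors
    # shaped [B, n_kv_heads, T_max, head_dim] and `encoded.caches` holds the
    # single-image (k, v) prefixes with batch dimension 1:
    #
    #     model._load_encoded_image_batched(encoded, batch_size=4)
    #     for blk, (k, v) in zip(model.text.blocks, encoded.caches):
    #         T = k.size(2)
    #         assert blk.kv_cache.k_cache.size(0) == 4
    #         # every batch row starts from the same image prefix
    #         assert torch.equal(blk.kv_cache.k_cache[2, :, :T], k[0])
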
    def _prefill_prompt_batched(self, labels, pos: int, lora=None, temperature: float = 0.0, top_p: float = 0.0):
        """
        Build detect prompts for many labels, pad them to the same length,
        prefill once as a batch, then return (last hidden per row,
        next token per row, position per row).
        """
        import torch
        from .text import text_encoder, lm_head

        tpl = self.config.tokenizer.templates["detect"]
        if tpl is None:
            raise NotImplementedError("Model does not support object detection (no detect template).")

        rows, lens = [], []
        for lab in labels:
            ids = tpl["prefix"] + self.tokenizer.encode(" " + lab).ids + tpl["suffix"]
            rows.append(torch.tensor(ids, device=self.device, dtype=torch.long))
            lens.append(len(ids))
        B = len(rows)
        T = max(lens)
        eos = self.config.tokenizer.eos_id

        # Pad with eos so we can prefill as a single batch
        prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
        for i, ids in enumerate(rows):
            prompt_ids[i, : ids.numel()] = ids

        # Embed & prefill once
        prompt_emb = text_encoder(prompt_ids, self.text)  # (B, T, C)
        torch._dynamo.mark_dynamic(prompt_emb, 1)  # allow variable T

        attn_mask = self.attn_mask
        mask = attn_mask[:, :, pos : pos + T, :].expand(B, -1, -1, -1).contiguous()
        pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)

        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)  # (B, T, C)
        logits_BTV = lm_head(hidden_BTC, self.text)  # (B, T, V)

        # Take the last *real* token per row (ignore padding positions)
        idx = (torch.tensor(lens, device=self.device, dtype=torch.long) - 1).clamp_min(0)
        last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :]  # (B, 1, C)
        last_logits = logits_BTV[torch.arange(B, device=self.device), idx]  # (B, V)

        if temperature == 0.0:
            next_token = last_logits.argmax(dim=-1, keepdim=True)  # (B, 1)
        else:
            probs = torch.softmax(last_logits / temperature, dim=-1)
            probs = self._apply_top_p(probs, top_p)
            next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)

        pos_vec = pos + torch.tensor(lens, device=self.device, dtype=torch.long)

        return last_hidden, next_token, pos_vec  # (B, 1, C), (B, 1), (B,)

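    # A minimal standalone sketch of the pad-and-gather trick used above:
    # rows of different lengths are right-padded, prefilled as one batch, and
    # each row's output is read at index len(row) - 1. (Names here are local
    # to the sketch, not part of moondream.)
    #
    #     import torch
    #     lens = [5, 3]                        # real prompt lengths
    #     hidden = torch.randn(2, 5, 8)        # (B, T_padded, C) from the prefill
    #     idx = torch.tensor(lens) - 1         # last *real* position per row
    #     last = hidden[torch.arange(2), idx]  # (B, C); padding is ignored
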
    def _generate_points_batched(self, hidden, next_token, pos_vec, include_size: bool = True, max_objects: int = 50, lora=None):
        """
        Vectorized version of _generate_points() that decodes x -> y -> size ->
        next-token for all rows in the batch simultaneously.
        Returns: list of lists of dicts, length B.
        """
        import torch
        from .region import decode_coordinate, encode_coordinate, decode_size, encode_size

        B = hidden.size(0)
        device = self.device
        out = [[] for _ in range(B)]
        eos_id = self.config.tokenizer.eos_id

        # Per-row attention/masking state
        max_ctx = self.config.text.max_context
        mask = torch.zeros(B, 1, max_ctx, device=device, dtype=torch.bool)
        for i in range(B):
            mask[i, :, : int(pos_vec[i].item())] = 1
        pos_ids = pos_vec.clone()

        alive = torch.ones(B, dtype=torch.bool, device=device)
        counts = torch.zeros(B, dtype=torch.int32, device=device)

        with torch.inference_mode():
            while alive.any() and (counts < max_objects).any():
                # --- x coordinate (from current hidden) ---
                x_logits = decode_coordinate(hidden, self.region)  # (B, 1, 1024) or (B, 1024)
                if x_logits.dim() == 3:
                    x_logits = x_logits.squeeze(1)  # (B, 1024)
                x_bin = x_logits.argmax(dim=-1).to(torch.float32)  # (B,)
                x_center = x_bin / float(x_logits.size(-1))  # normalize to [0, 1]
                x_emb = encode_coordinate(x_center.to(dtype=x_logits.dtype), self.region).unsqueeze(1)  # (B, 1, C)

                # Step: decode once for the whole batch to get the hidden for y
                for i in range(B):
                    if alive[i]:
                        mask[i, :, pos_ids[i]] = 1
                logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
                pos_ids = pos_ids + alive.to(torch.long)

                # --- y coordinate ---
                y_logits = decode_coordinate(hidden, self.region)
                if y_logits.dim() == 3:
                    y_logits = y_logits.squeeze(1)  # (B, 1024)
                y_bin = y_logits.argmax(dim=-1).to(torch.float32)
                y_center = y_bin / float(y_logits.size(-1))
                y_emb = encode_coordinate(y_center.to(dtype=y_logits.dtype), self.region).unsqueeze(1)

                # Step: decode from y_emb; this yields the hidden for the size
                # head, or the continuation logits directly in points mode
                for i in range(B):
                    if alive[i]:
                        mask[i, :, pos_ids[i]] = 1
                logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
                pos_ids = pos_ids + alive.to(torch.long)

                if include_size:
                    # --- size logits (batched) ---
                    size_logits = decode_size(hidden, self.region)  # [w_logits, h_logits], each (B, 1, 1024)
                    w_logits, h_logits = size_logits[0].squeeze(1), size_logits[1].squeeze(1)  # (B, 1024) each
                    w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                    h_bin = h_logits.argmax(dim=-1).to(torch.float32)
                    # Convert from log-scale bin to size in [0, 1]
                    w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
                    h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
                    size_emb = encode_size(torch.stack([w, h], dim=0), self.region).transpose(0, 1).unsqueeze(1)  # (B, 1, C)

                    # Commit boxes for alive rows
                    for i in range(B):
                        if not alive[i]:
                            continue
                        out[i].append({
                            "x_min": (x_center[i] - w[i] / 2).item(),
                            "y_min": (y_center[i] - h[i] / 2).item(),
                            "x_max": (x_center[i] + w[i] / 2).item(),
                            "y_max": (y_center[i] + h[i] / 2).item(),
                        })

                    # Step: decode the "next token" to decide continuation
                    for i in range(B):
                        if alive[i]:
                            mask[i, :, pos_ids[i]] = 1
                    logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
                    pos_ids = pos_ids + alive.to(torch.long)
                    next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
                else:
                    # Points mode (no size): commit (x, y) and reuse the logits
                    # from the y_emb decode above as the continuation decision,
                    # so y_emb is not fed through the model twice
                    for i in range(B):
                        if not alive[i]:
                            continue
                        out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
                    next_tok = logits.argmax(dim=-1).squeeze(-1)

                # Update which rows are done and count committed objects
                finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
                counts = counts + (~finished_now & alive).to(counts.dtype)
                alive &= ~finished_now

        return out

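    # Worked example of the log-scale size decoding above (a sketch; the 1024
    # bins and the 2 ** (x * 10 - 10) mapping are taken from the code itself):
    # bin 0 maps to 2**-10 ~= 0.001 of the image side, bin 1023 maps to
    # 2**0 = 1.0, i.e. the full image side.
    #
    #     import torch
    #     w_bin = torch.tensor([0.0, 511.5, 1023.0])
    #     w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
    #     # -> tensor([0.0010, 0.0312, 1.0000]) approximately
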
    def detect_multi(self, image, objects, settings=None):
        """
        Parallel multi-label detection.

        Args:
            image: PIL.Image or EncodedImage
            objects: list[str], e.g. ["person", "car"]
            settings: Optional[ObjectSamplingSettings]; honors "max_objects" and "variant"

        Returns:
            {"objects": {label: [box_dict, ...]}}
        """
        if self.config.tokenizer.templates["detect"] is None:
            raise NotImplementedError("Model does not support object detection.")
        settings = settings or {}

        # Encode once; reuse caches
        image = self.encode_image(image, settings)
        B = len(objects)
        self._load_encoded_image_batched(image, B)

        # Optional LoRA variant (same as detect())
        lora = None
        if "variant" in settings:
            from .lora import variant_state_dict
            lora = variant_state_dict(settings["variant"], device=self.device)

        # Prefill all prompts at once
        last_hidden, next_token, pos_vec = self._prefill_prompt_batched(
            objects, image.pos, lora=lora, temperature=0.0, top_p=0.0
        )

        # Batched decode loop
        max_objects = settings.get("max_objects", 50)
        det_lists = self._generate_points_batched(
            last_hidden, next_token, pos_vec,
            include_size=True, max_objects=max_objects, lora=lora
        )

        # Map results back to labels and tag each box with its label
        res = {}
        for lab, lst in zip(objects, det_lists):
            for d in lst:
                d["label"] = lab
            res[lab] = lst
        return {"objects": res}
    # === END: Batched multi-label detection additions ===

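    # Example call (a sketch, not part of the diff; assumes `model` is a loaded
    # MoondreamModel and `img` a PIL.Image):
    #
    #     result = model.detect_multi(img, ["person", "car"], settings={"max_objects": 10})
    #     for label, boxes in result["objects"].items():
    #         print(label, len(boxes), boxes[:1])
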
    def _detect_gaze(
        self,
        image: EncodedImage,
|