Upload full model folder with custom handler
- README.md +0 -1
- handler.py +34 -25
README.md
CHANGED
@@ -2,7 +2,6 @@
 base_model:
 - meta-llama/Llama-3.1-8B-Instruct
 - google/siglip-so400m-patch14-384
-- fancyfeast/llama-joycaption-alpha-two-hf-llava
 tags:
 - captioning
 ---
handler.py
CHANGED
@@ -16,14 +16,21 @@ class EndpointHandler():
         self.model.eval()
 
     def __call__(self, data):
-        image_b64 = data.get("image")
-        prompt = data.get("prompt", "Generate a caption for this image.")
-
-        if not image_b64:
-            return {"error": "No image provided in the payload."}
+        inputs = data.get("inputs", {})
+        prompt = inputs.get("prompt", "Generate a caption for this image.")
+        images_b64 = inputs.get("images")
+
+        # Handle both single image and list of images
+        if isinstance(images_b64, str):
+            images_b64 = [images_b64]
+        if not images_b64:
+            return {"error": "No images provided in the payload."}
+
         try:
-            image = Image.open(io.BytesIO(base64.b64decode(image_b64)))
-            image = image.convert("RGB")
+            images = [
+                Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
+                for img_b64 in images_b64
+            ]
         except Exception as e:
             return {"error": f"Failed to decode image: {str(e)}"}
 
@@ -41,32 +48,34 @@
         if not isinstance(convo_string, str):
             return {"error": "Failed to create conversation string."}
 
-        # Prepare the inputs for the model
-        inputs = self.processor(
+        # Prepare the inputs for the model - process all images at once
+        model_inputs = self.processor(
             text=[convo_string],
-            images=[image],
+            images=images,
             return_tensors="pt"
         )
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        if "pixel_values" in inputs:
-            inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
+        model_inputs = {k: v.to(self.device) for k, v in model_inputs.items()}
+        if "pixel_values" in model_inputs:
+            model_inputs["pixel_values"] = model_inputs["pixel_values"].to(torch.bfloat16)
 
-        # Generate caption tokens
+        # Generate caption tokens for all images at once
         generate_ids = self.model.generate(
-            **inputs,
+            **model_inputs,
             max_new_tokens=300,
             do_sample=True,
             temperature=0.6,
             top_p=0.9
-        )[0]
-
-        # Optionally, trim off the prompt tokens
-        generate_ids = generate_ids[inputs["input_ids"].shape[1]:]
+        )
 
-        caption = self.processor.tokenizer.decode(
-            generate_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        ).strip()
+        # Trim off the prompt tokens and decode all captions
+        generate_ids = generate_ids[:, model_inputs["input_ids"].shape[1]:]
+        captions = [
+            self.processor.tokenizer.decode(
+                ids,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False
+            ).strip()
+            for ids in generate_ids
+        ]
 
-        return {"caption": caption}
+        return {"captions": captions if len(captions) > 1 else captions[0]}