Dkapsis committed on
Commit
ed20377
·
1 Parent(s): 6dd5f4b

manager tools

Browse files
app_agents/manager_agent.py CHANGED
@@ -2,21 +2,13 @@ import os
2
  from smolagents.utils import encode_image_base64, make_image_url
3
  from smolagents import OpenAIServerModel, CodeAgent, InferenceClientModel
4
 
5
- # from gradio_tools import (StableDiffusionTool, ImageCaptioningTool, StableDiffusionPromptGeneratorTool,
6
- # TextToVideoTool)
7
- # from langchain.agents import initialize_agent
8
- # from langchain.memory import ConversationBufferMemory
9
-
10
  import app_agents.web_agent as web_agent
11
- # import app_tools.tools as agent_tools
12
-
13
- # tools = [StableDiffusionTool().langchain, ImageCaptioningTool().langchain,
14
- # StableDiffusionPromptGeneratorTool().langchain, TextToVideoTool().langchain]
15
- # memory = ConversationBufferMemory(memory_key="chat_history")
16
 
17
  manager_agent = CodeAgent(
18
  model=InferenceClientModel("deepseek-ai/DeepSeek-R1", provider="together", max_tokens=8096),
19
- tools=[],
20
  planning_interval=4,
21
  verbosity_level=2,
22
  managed_agents=[web_agent.web_agent],
 
2
  from smolagents.utils import encode_image_base64, make_image_url
3
  from smolagents import OpenAIServerModel, CodeAgent, InferenceClientModel
4
 
 
 
 
 
 
5
  import app_agents.web_agent as web_agent
6
+ import app_tools.text_inspector
7
+ import app_tools.visual_qa
 
 
 
8
 
9
  manager_agent = CodeAgent(
10
  model=InferenceClientModel("deepseek-ai/DeepSeek-R1", provider="together", max_tokens=8096),
11
+ tools=[app_tools.text_inspector.TextInspectorTool(), app_tools.visual_qa.VisualQATool()],
12
  planning_interval=4,
13
  verbosity_level=2,
14
  managed_agents=[web_agent.web_agent],
app_tools/text_inspector.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dotenv import load_dotenv
from typing import Optional

from smolagents import InferenceClientModel, Tool
from smolagents.models import MessageRole

from app_tools.mdconvert import MarkdownConverter
6
+
7
+ load_dotenv()
8
+ text_limit = 70000
9
+ websurfer_llm_engine = InferenceClientModel(
10
+ model="meta-llama/Meta-Llama-3.1-70B-Instruct",
11
+ )
12
+
13
class TextInspectorTool(Tool):
    """Convert a (non-image) file to markdown text and optionally answer a question about it."""

    name = "inspect_file_as_text"
    description = """
You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""

    inputs = {
        "question": {
            "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
            "type": "string",
            "nullable": True,
        },
        "file_path": {
            "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT USE THIS TOOL FOR A WEBPAGE: use the search tool instead!",
            "type": "string",
        },
    }
    output_type = "string"
    md_converter = MarkdownConverter()

    def _convert(self, file_path):
        """Reject image files, then convert `file_path` to markdown.

        Raises:
            Exception: if the path looks like a .png/.jpg image.
        """
        # Check the extension before converting so we don't waste a
        # conversion pass on a file we are going to reject anyway.
        if file_path[-4:] in ['.png', '.jpg']:
            raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
        return self.md_converter.convert(file_path)

    def forward_initial_exam_mode(self, file_path, question):
        """Initial-exam variant: the file content goes in the system turn, the question as the user turn."""
        result = self._convert(file_path)

        # Zip archives and question-less calls return the raw converted text.
        if ".zip" in file_path or not question:
            return result.text_content

        # Bug fix: MessageRole was referenced here but never imported, so
        # reaching this branch raised NameError; it is now imported at the top
        # of the module.
        messages = [
            {
                "role": MessageRole.SYSTEM,
                "content": "Here is a file:\n### "
                + str(result.title)
                + "\n\n"
                + result.text_content[:text_limit],  # truncate to fit context
            },
            {
                "role": MessageRole.USER,
                "content": question,
            },
        ]
        return websurfer_llm_engine(messages)

    def forward(self, file_path, question: Optional[str] = None) -> str:
        """Return the file's text, or a three-heading structured answer when a question is given."""
        result = self._convert(file_path)

        # Zip archives and question-less calls return the raw converted text.
        if ".zip" in file_path or not question:
            return result.text_content

        messages = [
            {
                "role": MessageRole.SYSTEM,
                "content": "You will have to write a short caption for this file, then answer this question:"
                + question,
            },
            {
                "role": MessageRole.USER,
                "content": "Here is the complete file:\n### "
                + str(result.title)
                + "\n\n"
                + result.text_content[:text_limit],  # truncate to fit context
            },
            {
                "role": MessageRole.USER,
                "content": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'."
                + question,
            },
        ]
        return websurfer_llm_engine(messages)
app_tools/visual_qa.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import base64
3
+ from io import BytesIO
4
+ import json
5
+ import os
6
+ import requests
7
+ from typing import Optional
8
+ from huggingface_hub import InferenceClient
9
+ from transformers import AutoProcessor
10
+ from smolagents import Tool
11
+ import uuid
12
+ import mimetypes
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv(override=True)
16
+
17
+ idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
18
+
19
def process_images_and_text(image_path, query, client):
    """Ask the IDEFICS2 endpoint `query` about the local image at `image_path`.

    Builds a chat prompt via the module-level `idefics_processor`, inlines the
    image as a base64 data URL, posts the payload through `client`, and
    returns the first element of the decoded JSON response.
    """
    messages = [
        {
            "role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": query},
            ]
        },
    ]

    prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)

    # encode a local image to the string form required by the endpoint
    def encode_local_image(image_path):
        # load image
        image = Image.open(image_path).convert('RGB')

        # Convert the image to a base64 string
        buffer = BytesIO()
        image.save(buffer, format="JPEG")  # Use the appropriate format (e.g., JPEG, PNG)
        base64_image = base64.b64encode(buffer.getvalue()).decode('utf-8')

        # add string formatting required by the endpoint
        image_string = f"data:image/jpeg;base64,{base64_image}"

        return image_string

    image_string = encode_local_image(image_path)
    # Bug fix: the old `.replace("<image>", "![]({}) ").format(image_string)`
    # ran str.format() over the ENTIRE prompt, which raises KeyError/IndexError
    # whenever the chat template or the user query contains a literal '{'/'}'.
    # Substitute the image string directly instead.
    prompt_with_images = prompt_with_template.replace("<image>", f"![]({image_string}) ")

    payload = {
        "inputs": prompt_with_images,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 200,
        }
    }

    return json.loads(client.post(json=payload).decode())[0]
62
+
63
# Function to encode the image
def encode_image(image_path):
    """Return the base64 string of an image file.

    If `image_path` is an http(s) URL, the image is first downloaded into the
    local ``downloads/`` directory and the downloaded copy is encoded.
    """
    if image_path.startswith("http"):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,
        }

        # Send a HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")

        # Pick a file extension from the Content-Type header.
        extension = mimetypes.guess_extension(content_type)
        if extension is None:
            extension = ".download"

        fname = str(uuid.uuid4()) + extension
        # Bug fix: the original assumed ./downloads already existed and raised
        # FileNotFoundError on a fresh checkout.
        os.makedirs("downloads", exist_ok=True)
        download_path = os.path.abspath(os.path.join("downloads", fname))

        with open(download_path, "wb") as fh:
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)

        image_path = download_path

    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
92
+
93
# HTTP headers for an OpenAI-style JSON endpoint, authenticated via the
# OPENAI_API_KEY environment variable.
# NOTE(review): `headers` is not referenced by any code visible in this file —
# presumably kept for a GPT-4V variant of the tool; confirm before removing.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"
}
97
+
98
+
99
def resize_image(image_path):
    """Save a half-size copy of the image and return the copy's path.

    Used as a retry path when the inference endpoint rejects the original
    image with "Payload Too Large".
    """
    img = Image.open(image_path)
    width, height = img.size
    img = img.resize((int(width / 2), int(height / 2)))
    # Bug fix: prefixing the WHOLE path (e.g. "resized_downloads/x.jpg")
    # pointed into a non-existent directory whenever image_path contained
    # folders, so img.save failed. Prefix only the file name instead.
    new_image_path = os.path.join(
        os.path.dirname(image_path), f"resized_{os.path.basename(image_path)}"
    )
    img.save(new_image_path)
    return new_image_path
106
+
107
+
108
class VisualQATool(Tool):
    """Answer a natural-language question about a local image using IDEFICS2."""

    name = "visualizer"
    description = "A tool that can answer questions about attached images."
    inputs = {
        "question": {
            "description": "the question to answer",
            "type": "string",
            "nullable": True,
        },
        "image_path": {
            "description": "The path to the image on which to answer the question",
            "type": "string",
        },
    }
    output_type = "string"

    # Shared inference client for the IDEFICS2 endpoint.
    client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")

    def forward(self, image_path: str, question: Optional[str] = None) -> str:
        """Return the model's answer; with no question, return a detailed caption."""
        add_note = False
        if not question:
            add_note = True
            question = "Please write a detailed caption for this image."
        try:
            output = process_images_and_text(image_path, question, self.client)
        except Exception as e:
            print(e)
            if "Payload Too Large" in str(e):
                # Retry once with a half-size image.
                new_image_path = resize_image(image_path)
                output = process_images_and_text(new_image_path, question, self.client)
            else:
                # Bug fix: previously any other error was swallowed here and
                # `output` was then referenced while unbound, masking the real
                # exception with an UnboundLocalError.
                raise

        if add_note:
            output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

        return output
143
+
144
+ # ////////////////////////////////////////////////////////////////////////
145
+ # import base64
146
+ # import json
147
+ # import os
148
+ # import uuid
149
+ # import mimetypes
150
+ # from io import BytesIO
151
+ # from typing import Optional
152
+ # from PIL import Image
153
+ # from dotenv import load_dotenv
154
+ # import requests
155
+ # from smolagents import Tool
156
+ # from huggingface_hub import InferenceClient
157
+
158
+ # load_dotenv()
159
+
160
+ # # === UTILS ===
161
+
162
+ # def encode_local_image(image_path):
163
+ # image = Image.open(image_path).convert("RGB")
164
+ # buffer = BytesIO()
165
+ # image.save(buffer, format="JPEG")
166
+ # base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
167
+ # return f"data:image/jpeg;base64,{base64_image}"
168
+
169
+ # def encode_image(image_path):
170
+ # if image_path.startswith("http"):
171
+ # user_agent = (
172
+ # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
173
+ # "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
174
+ # )
175
+ # response = requests.get(image_path, headers={"User-Agent": user_agent}, stream=True)
176
+ # response.raise_for_status()
177
+
178
+ # ext = mimetypes.guess_extension(response.headers.get("content-type", ""))
179
+ # fname = str(uuid.uuid4()) + (ext or ".jpg")
180
+ # os.makedirs("downloads", exist_ok=True)
181
+ # local_path = os.path.join("downloads", fname)
182
+
183
+ # with open(local_path, "wb") as f:
184
+ # for chunk in response.iter_content(chunk_size=1024):
185
+ # f.write(chunk)
186
+
187
+ # image_path = local_path
188
+
189
+ # with open(image_path, "rb") as img:
190
+ # return base64.b64encode(img.read()).decode("utf-8")
191
+
192
+ # def resize_image(image_path):
193
+ # img = Image.open(image_path)
194
+ # width, height = img.size
195
+ # img = img.resize((int(width / 2), int(height / 2)))
196
+ # new_path = f"resized_{os.path.basename(image_path)}"
197
+ # img.save(new_path)
198
+ # return new_path
199
+
200
+ # # === IDEFICS2 Tool ===
201
+
202
+ # class VisualQATool(Tool):
203
+ # name = "visualizer"
204
+ # description = "A tool that can answer questions about attached images using IDEFICS2."
205
+ # inputs = {
206
+ # "question": {
207
+ # "description": "The question to answer",
208
+ # "type": "string",
209
+ # "nullable": True,
210
+ # },
211
+ # "image_path": {
212
+ # "description": "Path to the image (local or downloaded)",
213
+ # "type": "string",
214
+ # },
215
+ # }
216
+ # output_type = "string"
217
+
218
+ # client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
219
+
220
+ # def forward(self, image_path: str, question: Optional[str] = None) -> str:
221
+ # add_note = False
222
+ # if not question:
223
+ # add_note = True
224
+ # question = "Please write a detailed caption for this image."
225
+
226
+ # image_string = encode_local_image(image_path)
227
+ # prompt = f"![]({image_string})\n\n{question}"
228
+
229
+ # payload = {
230
+ # "inputs": prompt,
231
+ # "parameters": {
232
+ # "return_full_text": False,
233
+ # "max_new_tokens": 200,
234
+ # },
235
+ # }
236
+
237
+ # try:
238
+ # result = json.loads(self.client.post(json=payload).decode())[0]
239
+ # except Exception as e:
240
+ # if "Payload Too Large" in str(e):
241
+ # resized = resize_image(image_path)
242
+ # image_string = encode_local_image(resized)
243
+ # prompt = f"![]({image_string})\n\n{question}"
244
+ # payload["inputs"] = prompt
245
+ # result = json.loads(self.client.post(json=payload).decode())[0]
246
+ # else:
247
+ # raise e
248
+
249
+ # return (
250
+ # f"You did not provide a particular question, so here is a detailed caption for the image: {result}"
251
+ # if add_note else result
252
+ # )