Dkapsis committed on
Commit
ed20377
·
1 Parent(s): 6dd5f4b

manager tools

Browse files
app_agents/manager_agent.py CHANGED
@@ -2,21 +2,13 @@ import os
2
  from smolagents.utils import encode_image_base64, make_image_url
3
  from smolagents import OpenAIServerModel, CodeAgent, InferenceClientModel
4
 
5
- # from gradio_tools import (StableDiffusionTool, ImageCaptioningTool, StableDiffusionPromptGeneratorTool,
6
- # TextToVideoTool)
7
- # from langchain.agents import initialize_agent
8
- # from langchain.memory import ConversationBufferMemory
9
-
10
  import app_agents.web_agent as web_agent
11
- # import app_tools.tools as agent_tools
12
-
13
- # tools = [StableDiffusionTool().langchain, ImageCaptioningTool().langchain,
14
- # StableDiffusionPromptGeneratorTool().langchain, TextToVideoTool().langchain]
15
- # memory = ConversationBufferMemory(memory_key="chat_history")
16
 
17
  manager_agent = CodeAgent(
18
  model=InferenceClientModel("deepseek-ai/DeepSeek-R1", provider="together", max_tokens=8096),
19
- tools=[],
20
  planning_interval=4,
21
  verbosity_level=2,
22
  managed_agents=[web_agent.web_agent],
 
2
  from smolagents.utils import encode_image_base64, make_image_url
3
  from smolagents import OpenAIServerModel, CodeAgent, InferenceClientModel
4
 
 
 
 
 
 
5
  import app_agents.web_agent as web_agent
6
+ import app_tools.text_inspector
7
+ import app_tools.visual_qa
 
 
 
8
 
9
  manager_agent = CodeAgent(
10
  model=InferenceClientModel("deepseek-ai/DeepSeek-R1", provider="together", max_tokens=8096),
11
+ tools=[app_tools.text_inspector.TextInspectorTool(), app_tools.visual_qa.VisualQATool()],
12
  planning_interval=4,
13
  verbosity_level=2,
14
  managed_agents=[web_agent.web_agent],
app_tools/text_inspector.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dotenv import load_dotenv
from typing import Optional

from smolagents import InferenceClientModel, Tool
from smolagents.models import MessageRole

from app_tools.mdconvert import MarkdownConverter
6
+
7
+ load_dotenv()
8
+ text_limit = 70000
9
+ websurfer_llm_engine = InferenceClientModel(
10
+ model="meta-llama/Meta-Llama-3.1-70B-Instruct",
11
+ )
12
+
13
class TextInspectorTool(Tool):
    """Convert a (non-image) file to markdown text and optionally answer a question about it."""

    name = "inspect_file_as_text"
    description = """
You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""

    inputs = {
        "question": {
            "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
            "type": "string",
            "nullable": True,
        },
        "file_path": {
            "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT USE THIS TOOL FOR A WEBPAGE: use the search tool instead!",
            "type": "string",
        },
    }
    output_type = "string"
    md_converter = MarkdownConverter()

    def _convert(self, file_path):
        """Reject image files, then convert `file_path` to markdown.

        Raises:
            Exception: if the path looks like a .png/.jpg image.
        """
        # Check the extension before converting so we don't waste a
        # conversion pass on a file we are going to reject anyway.
        if file_path[-4:] in ['.png', '.jpg']:
            raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
        return self.md_converter.convert(file_path)

    def forward_initial_exam_mode(self, file_path, question):
        """Initial-exam variant: the file content goes in the system turn, the question as the user turn."""
        result = self._convert(file_path)

        # Zip archives and question-less calls return the raw converted text.
        if ".zip" in file_path or not question:
            return result.text_content

        # Bug fix: MessageRole was referenced here but never imported, so
        # reaching this branch raised NameError; it is now imported at the top
        # of the module.
        messages = [
            {
                "role": MessageRole.SYSTEM,
                "content": "Here is a file:\n### "
                + str(result.title)
                + "\n\n"
                + result.text_content[:text_limit],  # truncate to fit context
            },
            {
                "role": MessageRole.USER,
                "content": question,
            },
        ]
        return websurfer_llm_engine(messages)

    def forward(self, file_path, question: Optional[str] = None) -> str:
        """Return the file's text, or a three-heading structured answer when a question is given."""
        result = self._convert(file_path)

        # Zip archives and question-less calls return the raw converted text.
        if ".zip" in file_path or not question:
            return result.text_content

        messages = [
            {
                "role": MessageRole.SYSTEM,
                "content": "You will have to write a short caption for this file, then answer this question:"
                + question,
            },
            {
                "role": MessageRole.USER,
                "content": "Here is the complete file:\n### "
                + str(result.title)
                + "\n\n"
                + result.text_content[:text_limit],  # truncate to fit context
            },
            {
                "role": MessageRole.USER,
                "content": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'."
                + question,
            },
        ]
        return websurfer_llm_engine(messages)
app_tools/visual_qa.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import base64
3
+ from io import BytesIO
4
+ import json
5
+ import os
6
+ import requests
7
+ from typing import Optional
8
+ from huggingface_hub import InferenceClient
9
+ from transformers import AutoProcessor
10
+ from smolagents import Tool
11
+ import uuid
12
+ import mimetypes
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv(override=True)
16
+
17
+ idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
18
+
19
def process_images_and_text(image_path, query, client):
    """Ask the IDEFICS2 endpoint `query` about the local image at `image_path`.

    Builds a chat prompt via the module-level `idefics_processor`, inlines the
    image as a base64 data URL, posts the payload through `client`, and
    returns the first element of the decoded JSON response.
    """
    messages = [
        {
            "role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": query},
            ]
        },
    ]

    prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)

    # encode a local image to the string form required by the endpoint
    def encode_local_image(image_path):
        # load image
        image = Image.open(image_path).convert('RGB')

        # Convert the image to a base64 string
        buffer = BytesIO()
        image.save(buffer, format="JPEG")  # Use the appropriate format (e.g., JPEG, PNG)
        base64_image = base64.b64encode(buffer.getvalue()).decode('utf-8')

        # add string formatting required by the endpoint
        image_string = f"data:image/jpeg;base64,{base64_image}"

        return image_string

    image_string = encode_local_image(image_path)
    # Bug fix: the old `.replace("<image>", "![]({}) ").format(image_string)`
    # ran str.format() over the ENTIRE prompt, which raises KeyError/IndexError
    # whenever the chat template or the user query contains a literal '{'/'}'.
    # Substitute the image string directly instead.
    prompt_with_images = prompt_with_template.replace("<image>", f"![]({image_string}) ")

    payload = {
        "inputs": prompt_with_images,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 200,
        }
    }

    return json.loads(client.post(json=payload).decode())[0]
62
+
63
# Function to encode the image
def encode_image(image_path):
    """Return the base64 string of an image file.

    If `image_path` is an http(s) URL, the image is first downloaded into the
    local ``downloads/`` directory and the downloaded copy is encoded.
    """
    if image_path.startswith("http"):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,
        }

        # Send a HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")

        # Pick a file extension from the Content-Type header.
        extension = mimetypes.guess_extension(content_type)
        if extension is None:
            extension = ".download"

        fname = str(uuid.uuid4()) + extension
        # Bug fix: the original assumed ./downloads already existed and raised
        # FileNotFoundError on a fresh checkout.
        os.makedirs("downloads", exist_ok=True)
        download_path = os.path.abspath(os.path.join("downloads", fname))

        with open(download_path, "wb") as fh:
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)

        image_path = download_path

    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
92
+
93
# HTTP headers for an OpenAI-style JSON endpoint, authenticated via the
# OPENAI_API_KEY environment variable.
# NOTE(review): `headers` is not referenced by any code visible in this file —
# presumably kept for a GPT-4V variant of the tool; confirm before removing.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"
}
97
+
98
+
99
def resize_image(image_path):
    """Save a half-size copy of the image and return the copy's path.

    Used as a retry path when the inference endpoint rejects the original
    image with "Payload Too Large".
    """
    img = Image.open(image_path)
    width, height = img.size
    img = img.resize((int(width / 2), int(height / 2)))
    # Bug fix: prefixing the WHOLE path (e.g. "resized_downloads/x.jpg")
    # pointed into a non-existent directory whenever image_path contained
    # folders, so img.save failed. Prefix only the file name instead.
    new_image_path = os.path.join(
        os.path.dirname(image_path), f"resized_{os.path.basename(image_path)}"
    )
    img.save(new_image_path)
    return new_image_path
106
+
107
+
108
class VisualQATool(Tool):
    """Answer a natural-language question about a local image using IDEFICS2."""

    name = "visualizer"
    description = "A tool that can answer questions about attached images."
    inputs = {
        "question": {
            "description": "the question to answer",
            "type": "string",
            "nullable": True,
        },
        "image_path": {
            "description": "The path to the image on which to answer the question",
            "type": "string",
        },
    }
    output_type = "string"

    # Shared inference client for the IDEFICS2 endpoint.
    client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")

    def forward(self, image_path: str, question: Optional[str] = None) -> str:
        """Return the model's answer; with no question, return a detailed caption."""
        add_note = False
        if not question:
            add_note = True
            question = "Please write a detailed caption for this image."
        try:
            output = process_images_and_text(image_path, question, self.client)
        except Exception as e:
            print(e)
            if "Payload Too Large" in str(e):
                # Retry once with a half-size image.
                new_image_path = resize_image(image_path)
                output = process_images_and_text(new_image_path, question, self.client)
            else:
                # Bug fix: previously any other error was swallowed here and
                # `output` was then referenced while unbound, masking the real
                # exception with an UnboundLocalError.
                raise

        if add_note:
            output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

        return output
143
+
144
+ # ////////////////////////////////////////////////////////////////////////
145
+ # import base64
146
+ # import json
147
+ # import os
148
+ # import uuid
149
+ # import mimetypes
150
+ # from io import BytesIO
151
+ # from typing import Optional
152
+ # from PIL import Image
153
+ # from dotenv import load_dotenv
154
+ # import requests
155
+ # from smolagents import Tool
156
+ # from huggingface_hub import InferenceClient
157
+
158
+ # load_dotenv()
159
+
160
+ # # === UTILS ===
161
+
162
+ # def encode_local_image(image_path):
163
+ # image = Image.open(image_path).convert("RGB")
164
+ # buffer = BytesIO()
165
+ # image.save(buffer, format="JPEG")
166
+ # base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
167
+ # return f"data:image/jpeg;base64,{base64_image}"
168
+
169
+ # def encode_image(image_path):
170
+ # if image_path.startswith("http"):
171
+ # user_agent = (
172
+ # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
173
+ # "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
174
+ # )
175
+ # response = requests.get(image_path, headers={"User-Agent": user_agent}, stream=True)
176
+ # response.raise_for_status()
177
+
178
+ # ext = mimetypes.guess_extension(response.headers.get("content-type", ""))
179
+ # fname = str(uuid.uuid4()) + (ext or ".jpg")
180
+ # os.makedirs("downloads", exist_ok=True)
181
+ # local_path = os.path.join("downloads", fname)
182
+
183
+ # with open(local_path, "wb") as f:
184
+ # for chunk in response.iter_content(chunk_size=1024):
185
+ # f.write(chunk)
186
+
187
+ # image_path = local_path
188
+
189
+ # with open(image_path, "rb") as img:
190
+ # return base64.b64encode(img.read()).decode("utf-8")
191
+
192
+ # def resize_image(image_path):
193
+ # img = Image.open(image_path)
194
+ # width, height = img.size
195
+ # img = img.resize((int(width / 2), int(height / 2)))
196
+ # new_path = f"resized_{os.path.basename(image_path)}"
197
+ # img.save(new_path)
198
+ # return new_path
199
+
200
+ # # === IDEFICS2 Tool ===
201
+
202
+ # class VisualQATool(Tool):
203
+ # name = "visualizer"
204
+ # description = "A tool that can answer questions about attached images using IDEFICS2."
205
+ # inputs = {
206
+ # "question": {
207
+ # "description": "The question to answer",
208
+ # "type": "string",
209
+ # "nullable": True,
210
+ # },
211
+ # "image_path": {
212
+ # "description": "Path to the image (local or downloaded)",
213
+ # "type": "string",
214
+ # },
215
+ # }
216
+ # output_type = "string"
217
+
218
+ # client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
219
+
220
+ # def forward(self, image_path: str, question: Optional[str] = None) -> str:
221
+ # add_note = False
222
+ # if not question:
223
+ # add_note = True
224
+ # question = "Please write a detailed caption for this image."
225
+
226
+ # image_string = encode_local_image(image_path)
227
+ # prompt = f"![]({image_string})\n\n{question}"
228
+
229
+ # payload = {
230
+ # "inputs": prompt,
231
+ # "parameters": {
232
+ # "return_full_text": False,
233
+ # "max_new_tokens": 200,
234
+ # },
235
+ # }
236
+
237
+ # try:
238
+ # result = json.loads(self.client.post(json=payload).decode())[0]
239
+ # except Exception as e:
240
+ # if "Payload Too Large" in str(e):
241
+ # resized = resize_image(image_path)
242
+ # image_string = encode_local_image(resized)
243
+ # prompt = f"![]({image_string})\n\n{question}"
244
+ # payload["inputs"] = prompt
245
+ # result = json.loads(self.client.post(json=payload).decode())[0]
246
+ # else:
247
+ # raise e
248
+
249
+ # return (
250
+ # f"You did not provide a particular question, so here is a detailed caption for the image: {result}"
251
+ # if add_note else result
252
+ # )