agent-zero/tools/vision_load.py
Alessandro 1993f6f864 Store vision and computer-use images as path refs
Keep image payloads out of persistent agent history by storing vision and
computer-use captures as file path references instead of inline base64 data.

- update vision_load to attach image paths without compression or JPEG conversion
- update computer_use_remote to attach shared capture artifact paths directly
- serialize local image refs into provider-valid data URLs only at request prep
- reject base64/data URL attachments on the connector WebSocket path
- advertise path_or_url as the connector attachment mode
2026-04-21 18:18:59 +02:00

89 lines
3.8 KiB
Python

from helpers.print_style import PrintStyle
from helpers.tool import Tool, Response
from helpers import runtime, files, plugins
from mimetypes import guess_type
from helpers import history
# image token estimation for context window
TOKENS_ESTIMATE = 1500
class VisionLoad(Tool):
async def execute(self, paths: list[str] = [], **kwargs) -> Response:
self.images_dict = {}
self.loaded_paths: list[str] = []
self.skipped_paths: list[str] = []
max_embeds = self._get_max_embeds()
limited_paths = paths if max_embeds <= 0 else paths[-max_embeds:]
self.skipped_paths = paths[:-max_embeds] if max_embeds > 0 and len(paths) > max_embeds else []
for path in limited_paths:
if not await runtime.call_development_function(files.exists, str(path)):
continue
if path not in self.images_dict:
mime_type, _ = guess_type(str(path))
if mime_type and mime_type.startswith("image/"):
self.images_dict[path] = str(path)
self.loaded_paths.append(path)
return Response(message="dummy", break_loop=False)
def _get_max_embeds(self) -> int:
cfg = plugins.get_plugin_config("_model_config", agent=self.agent) or {}
chat_cfg = cfg.get("chat_model", {})
max_embeds = chat_cfg.get("max_embeds", 10)
return int(max_embeds or 0)
async def after_execution(self, response: Response, **kwargs):
# build image data messages for LLMs, or error message
content = []
loaded_count = len(self.loaded_paths)
skipped_count = len(self.skipped_paths)
loaded_summary = "\n".join(self.loaded_paths) if self.loaded_paths else "none"
skipped_summary = "\n".join(self.skipped_paths) if self.skipped_paths else "none"
summary = (
f"Loaded images: {loaded_count}\n"
f"Loaded images:\n{loaded_summary}\n\n"
f"Skipped images: {skipped_count}\n"
f"Skipped images (max {self._get_max_embeds()} loaded at a time according to model configuration):\n{skipped_summary}"
)
if self.images_dict:
self.agent.hist_add_tool_result(self.name, summary, id=self.log.id if self.log else "")
for path, image_path in self.images_dict.items():
if image_path:
content.append(
{
"type": "image_url",
"image_url": {"url": image_path},
}
)
else:
content.append(
{
"type": "text",
"text": "Error processing image " + path,
}
)
# append as raw message content for LLMs with vision tokens estimate
msg = history.RawMessage(raw_content=content, preview="<Image attachments loaded by path>")
self.agent.hist_add_message(
False, content=msg, tokens=TOKENS_ESTIMATE * len(content)
)
else:
self.agent.hist_add_tool_result(self.name, summary if self.skipped_paths else "No images processed", id=self.log.id if self.log else "")
# print and log short version
message = (
"No images processed"
if not self.images_dict and not self.skipped_paths
else f"{loaded_count} images loaded, {skipped_count} skipped"
)
PrintStyle(
font_color="#1B4F72", background_color="white", padding=True, bold=True
).print(f"{self.agent.agent_name}: Response from tool '{self.name}'")
PrintStyle(font_color="#85C1E9").print(message)
self.log.update(result=message)