More visibility into attached files and duplicate status (#776)

This commit is contained in:
Kerem Yilmaz 2024-09-06 11:08:33 +03:00 committed by GitHub
parent 73227963dd
commit be1c8ba060
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 33 additions and 1 deletions

View file

@ -1,3 +1,4 @@
import hashlib
import os import os
import tempfile import tempfile
import zipfile import zipfile
@ -86,3 +87,12 @@ def get_number_of_files_in_directory(directory: Path, recursive: bool = False) -
break break
count += len(files) count += len(files)
return count return count
def calculate_sha256(file_path: str) -> str:
"""Helper function to calculate SHA256 hash of a file."""
sha256_hash = hashlib.sha256()
with open(file_path, "rb") as f:
for byte_block in iter(lambda: f.read(4096), b""):
sha256_hash.update(byte_block)
return sha256_hash.hexdigest()

View file

@ -6,6 +6,7 @@ import os
import smtplib import smtplib
import textwrap import textwrap
import uuid import uuid
from collections import defaultdict
from dataclasses import dataclass from dataclasses import dataclass
from email.message import EmailMessage from email.message import EmailMessage
from enum import StrEnum from enum import StrEnum
@ -30,7 +31,12 @@ from skyvern.exceptions import (
from skyvern.forge import app from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.aws import AsyncAWSClient from skyvern.forge.sdk.api.aws import AsyncAWSClient
from skyvern.forge.sdk.api.files import download_file, download_from_s3, get_path_for_workflow_download_directory from skyvern.forge.sdk.api.files import (
calculate_sha256,
download_file,
download_from_s3,
get_path_for_workflow_download_directory,
)
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
from skyvern.forge.sdk.schemas.tasks import TaskOutput, TaskStatus from skyvern.forge.sdk.schemas.tasks import TaskOutput, TaskStatus
from skyvern.forge.sdk.settings_manager import SettingsManager from skyvern.forge.sdk.settings_manager import SettingsManager
@ -905,6 +911,8 @@ class SendEmailBlock(Block):
else: else:
msg.set_content(self.body) msg.set_content(self.body)
file_names_by_hash: dict[str, list[str]] = defaultdict(list)
for filename in self._get_file_paths(workflow_run_context, workflow_run_id): for filename in self._get_file_paths(workflow_run_context, workflow_run_id):
path = None path = None
try: try:
@ -961,10 +969,24 @@ class SendEmailBlock(Block):
subtype=subtype, subtype=subtype,
filename=attachment_filename, filename=attachment_filename,
) )
file_hash = calculate_sha256(path)
file_names_by_hash[file_hash].append(path)
finally: finally:
if path: if path:
os.unlink(path) os.unlink(path)
# Calculate file stats based on content hashes
total_files = sum(len(files) for files in file_names_by_hash.values())
unique_files = len(file_names_by_hash)
duplicate_files_list = [files for files in file_names_by_hash.values() if len(files) > 1]
# Log file statistics
LOG.info("SendEmailBlock: Total files attached", total_files=total_files)
LOG.info("SendEmailBlock: Unique files (based on content) attached", unique_files=unique_files)
LOG.info(
"SendEmailBlock: Duplicate files (based on content) attached", duplicate_files_list=duplicate_files_list
)
return msg return msg
async def execute(self, workflow_run_id: str, **kwargs: dict) -> BlockResult: async def execute(self, workflow_run_id: str, **kwargs: dict) -> BlockResult: