More visibility into attached files and duplicate status (#776)

This commit is contained in:
Kerem Yilmaz 2024-09-06 11:08:33 +03:00 committed by GitHub
parent 73227963dd
commit be1c8ba060
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 33 additions and 1 deletions

View file

@ -1,3 +1,4 @@
import hashlib
import os
import tempfile
import zipfile
@ -86,3 +87,12 @@ def get_number_of_files_in_directory(directory: Path, recursive: bool = False) -
break
count += len(files)
return count
def calculate_sha256(file_path: str) -> str:
"""Helper function to calculate SHA256 hash of a file."""
sha256_hash = hashlib.sha256()
with open(file_path, "rb") as f:
for byte_block in iter(lambda: f.read(4096), b""):
sha256_hash.update(byte_block)
return sha256_hash.hexdigest()

View file

@ -6,6 +6,7 @@ import os
import smtplib
import textwrap
import uuid
from collections import defaultdict
from dataclasses import dataclass
from email.message import EmailMessage
from enum import StrEnum
@ -30,7 +31,12 @@ from skyvern.exceptions import (
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.aws import AsyncAWSClient
from skyvern.forge.sdk.api.files import download_file, download_from_s3, get_path_for_workflow_download_directory
from skyvern.forge.sdk.api.files import (
calculate_sha256,
download_file,
download_from_s3,
get_path_for_workflow_download_directory,
)
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
from skyvern.forge.sdk.schemas.tasks import TaskOutput, TaskStatus
from skyvern.forge.sdk.settings_manager import SettingsManager
@ -905,6 +911,8 @@ class SendEmailBlock(Block):
else:
msg.set_content(self.body)
file_names_by_hash: dict[str, list[str]] = defaultdict(list)
for filename in self._get_file_paths(workflow_run_context, workflow_run_id):
path = None
try:
@ -961,10 +969,24 @@ class SendEmailBlock(Block):
subtype=subtype,
filename=attachment_filename,
)
file_hash = calculate_sha256(path)
file_names_by_hash[file_hash].append(path)
finally:
if path:
os.unlink(path)
# Calculate file stats based on content hashes
total_files = sum(len(files) for files in file_names_by_hash.values())
unique_files = len(file_names_by_hash)
duplicate_files_list = [files for files in file_names_by_hash.values() if len(files) > 1]
# Log file statistics
LOG.info("SendEmailBlock: Total files attached", total_files=total_files)
LOG.info("SendEmailBlock: Unique files (based on content) attached", unique_files=unique_files)
LOG.info(
"SendEmailBlock: Duplicate files (based on content) attached", duplicate_files_list=duplicate_files_list
)
return msg
async def execute(self, workflow_run_id: str, **kwargs: dict) -> BlockResult: