# eigent/backend/camel/utils/filename.py

# ========= Copyright 2023-2026 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2026 @ CAMEL-AI.org. All Rights Reserved. =========
import platform
import re
import unicodedata

# Longest filename (in characters) produced by default.
MAX_FILENAME_LENGTH = 255
# Device names that Windows reserves as filenames, compared
# case-insensitively. COM1-COM9 and LPT1-LPT9 are all reserved.
WINDOWS_RESERVED = {
    'CON',
    'PRN',
    'AUX',
    'NUL',
    *(f'COM{i}' for i in range(1, 10)),
    *(f'LPT{i}' for i in range(1, 10)),
}


def sanitize_filename(
    url_name: str,
    default: str = "index",
    max_length: int = MAX_FILENAME_LENGTH,
) -> str:
    r"""Sanitize a URL path into a filename that is safe for most
    platforms.

    The name is reduced to ASCII, every separator, dot, control character
    and other character that is invalid in filenames is replaced by an
    underscore, runs of underscores are collapsed, and the result is
    truncated to :obj:`max_length` characters. Note that dots are replaced
    too, so ``"page.html"`` becomes ``"page_html"``.

    Args:
        url_name (str): The URL path to sanitize.
        default (str): Default name if sanitization results in empty string.
            It is returned as-is, without being sanitized or truncated.
            (default: :obj:`"index"`)
        max_length (int): Maximum length of the filename.
            (default: :obj:`MAX_FILENAME_LENGTH`)

    Returns:
        str: A sanitized filename safe for most platforms.

    Raises:
        ValueError: If :obj:`max_length` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(
            f"`max_length` must be greater than 0, got {max_length}"
        )

    if not url_name:
        return default

    # Decompose accented characters and drop everything that is not ASCII:
    # café☕.txt -> cafe.txt (the dot is replaced in the next step).
    url_name = unicodedata.normalize('NFKD', url_name)
    url_name = url_name.encode('ASCII', 'ignore').decode('ASCII')

    # Replace path separators, dots, characters that are invalid on
    # Windows, and ASCII control characters (an embedded NUL or newline
    # makes a filename unusable): my/file:name*.txt -> my_file_name_txt
    url_name = re.sub(r'[\x00-\x1f\x7f\\/:*?"<>|.]', '_', url_name)
    url_name = re.sub(r'_+', '_', url_name)  # Collapse multiple underscores
    url_name = url_name.strip('_')  # Remove leading/trailing underscores

    # Handle empty result if all characters are invalid:
    if not url_name:
        return default

    # Truncate before the reserved-name check so that cutting a long name
    # cannot produce a reserved name, and drop an underscore that the cut
    # may have left at the end. The first character is never an underscore
    # here, so the result cannot become empty.
    url_name = url_name[:max_length].rstrip('_')

    # Handle Windows reserved names. The prefixed name is cut again to
    # respect `max_length`; the result starts with "_" and is therefore
    # no longer a reserved name.
    if platform.system() == "Windows" and url_name.upper() in WINDOWS_RESERVED:
        url_name = f"_{url_name}"[:max_length]

    return url_name