fix: preserve safe remote fetch compatibility for public sites

Restore remote document fetch compatibility for public sites after the
CVE-2026-4308 SSRF hardening.

The initial security fix correctly blocked non-public destinations, but
it also changed the outbound request fingerprint for `document_query`
remote fetches. Some public sites responded with HTTP 403 to the default
`requests` user agent even though they remained safe and publicly
routable; https://nvd.nist.gov/vuln/detail/CVE-2026-4308, which was used
for testing, is one example.

This change keeps the centralized SSRF protections in place while
restoring the previous request compatibility behavior: a `User-Agent`
header is sent with the value of the `USER_AGENT` (or `user_agent`)
environment variable, falling back to the prior
`@mixedbread-ai/unstructured` value.

What is fixed:
- public URLs such as
  `https://nvd.nist.gov/vuln/detail/CVE-2026-4308`
  no longer fail with site-specific HTTP 403 due to request fingerprint
  changes introduced by the SSRF mitigation
This commit is contained in:
Alessandro 2026-04-12 02:08:13 +02:00
parent 6397acc092
commit 91f43e28b4

View file

@ -2,6 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass
import ipaddress
import os
import socket
import struct
from urllib.parse import urljoin, urlparse
@ -11,6 +12,7 @@ import requests
# NOTE(review): presumably the only URL schemes permitted for remote
# fetches; the enforcing code is outside this view -- confirm.
SAFE_HTTP_SCHEMES = frozenset({"http", "https"})
# NOTE(review): looks like a requests-style (connect, read) timeout pair
# in seconds -- confirm against the call site that consumes it.
DEFAULT_FETCH_TIMEOUT = (3.05, 10.0)
# Fallback User-Agent used by _build_request_headers when the environment
# does not supply one.
DEFAULT_HTTP_USER_AGENT = "@mixedbread-ai/unstructured"
@dataclass(frozen=True)
@ -25,6 +27,15 @@ class UnsafeUrlError(ValueError):
"""Raised when a remote URL resolves to a non-public destination."""
def _build_request_headers() -> dict[str, str]:
    """Build the HTTP headers sent with outbound remote fetches.

    The User-Agent is taken from the first of the ``USER_AGENT`` and
    ``user_agent`` environment variables that holds a non-blank value,
    with surrounding whitespace removed. When neither is usable,
    ``DEFAULT_HTTP_USER_AGENT`` is sent instead.

    Returns:
        A single-entry mapping containing the ``User-Agent`` header.
    """
    # Strip each candidate *before* testing it, so a whitespace-only
    # ``USER_AGENT`` cannot shadow a valid lowercase ``user_agent``.
    for env_name in ("USER_AGENT", "user_agent"):
        candidate = os.getenv(env_name, "").strip()
        if candidate:
            return {"User-Agent": candidate}
    return {"User-Agent": DEFAULT_HTTP_USER_AGENT}
def _normalize_content_type(content_type: str | None) -> str | None:
if not content_type:
return None
@ -104,6 +115,7 @@ def fetch_public_http_resource(
current_url,
stream=True,
allow_redirects=False,
headers=_build_request_headers(),
timeout=timeout,
) as response:
if 300 <= response.status_code < 400: