From c57a97958ab91db725b95a6f8fb295c21a1699c3 Mon Sep 17 00:00:00 2001
From: Roland Tannous <115670425+rolandtannous@users.noreply.github.com>
Date: Fri, 8 May 2026 13:07:18 +0400
Subject: [PATCH 1/2] Studio: stop truncating long log lines as suspected
 base64 (#5335)

* Studio: stop truncating long log lines as suspected base64

filter_sensitive_data carried a heuristic from the original Studio
import that truncated any string >100 chars containing ',' or '/'
to value[:20] + '...'. The block was dormant until #5246 wired
filter_sensitive_data into the structlog processor chain to redact
native-path leases. Once active, the heuristic ate normal log lines
- llama_cpp_backend's GGUF size summary, mmproj selection, the full
llama-server command line, and any traceback containing a path -
all rendered as a 20-char prefix, defeating debugging of llama-server
exceptions and GPU selection.

Drop the base64 truncation. No call site in the codebase logs raw
base64; if one ever does, it should truncate at the source rather
than in a global filter. Native-path lease redaction added by #5246
is preserved.

* Studio: regression test for filter_sensitive_data truncation

Pins two properties in studio/backend/loggers/handlers.py:

1. Long log messages with ',' or '/' (the GGUF size summary, mmproj
   selection, full llama-server command, exception tracebacks) flow
   through filter_sensitive_data unchanged. Exercises the exact call
   sites that regressed when #5246 wired the processor in.

2. Native-path lease redaction still fires for both the inline
   native_path_lease=... regex form and the nativePathLease dict-key
   form, so a future cleanup of the truncation logic can't quietly
   strip #5246's redaction along with it.
---
 studio/backend/loggers/handlers.py            |  10 +-
 .../tests/test_log_filter_no_truncation.py    | 108 ++++++++++++++++++
 2 files changed, 110 insertions(+), 8 deletions(-)
 create mode 100644 studio/backend/tests/test_log_filter_no_truncation.py
diff --git a/studio/backend/loggers/handlers.py b/studio/backend/loggers/handlers.py
index ddd404cdf..e80bd4faf 100644
--- a/studio/backend/loggers/handlers.py
+++ b/studio/backend/loggers/handlers.py
@@ -78,7 +78,7 @@ class LoggingMiddleware(BaseHTTPMiddleware):
 
 
 def filter_sensitive_data(logger, method_name, event_dict):
-    """Structlog processor to filter out base64 data from logs."""
+    """Structlog processor to redact native path leases from logs."""
 
     def filter_value(value):
         if isinstance(value, str):
@@ -87,13 +87,7 @@ def filter_sensitive_data(logger, method_name, event_dict):
             except Exception:
                 pass
             value = _NATIVE_PATH_LEASE_RE.sub(r"\1<redacted native path lease>", value)
-        if (
-            isinstance(value, str)
-            and len(value) > 100
-            and ("," in value or "/" in value)
-        ):
-            # Likely base64 data, truncate it
-            return value[:20] + "..."
+            return value
         elif isinstance(value, dict):
             return {
                 k: "<redacted native path lease>"
diff --git a/studio/backend/tests/test_log_filter_no_truncation.py b/studio/backend/tests/test_log_filter_no_truncation.py
new file mode 100644
index 000000000..d78643f5b
--- /dev/null
+++ b/studio/backend/tests/test_log_filter_no_truncation.py
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""
+Regression tests for studio.backend.loggers.handlers.filter_sensitive_data.
+
+Context: filter_sensitive_data was originally written with a base64-detection
+heuristic that truncated any string >100 chars containing ',' or '/' down to
+20 chars + '...'. The block was dormant until PR #5246 wired the processor
+into the structlog chain to redact native-path leases. Once active, the
+heuristic ate normal log lines emitted by llama_cpp_backend (GGUF size
+summary, mmproj selection, the full llama-server command line) and any
+exception traceback that happened to contain a file path.
+
+These tests pin two properties:
+
+1. Long, comma- or slash-bearing log messages flow through filter_sensitive_data
+   unchanged. The exact strings exercised match the call sites at
+   studio/backend/core/inference/llama_cpp.py:2117, :2283, and :2312 that
+   were truncated in the original bug report.
+
+2. PR #5246's native-path lease redaction still fires for both the inline
+   ``native_path_lease=...`` regex form and the ``nativePathLease`` dict-key
+   form. This guards against future regressions that strip redaction along
+   with the truncation block.
+"""
+
+from loggers.handlers import filter_sensitive_data
+
+
+def _run(event_dict):
+    return filter_sensitive_data(logger = None, method_name = "info", event_dict = event_dict)
+
+
+class TestNoTruncation:
+    def test_gguf_size_summary_survives(self):
+        # Mirrors the f-string at studio/backend/core/inference/llama_cpp.py:2117
+        event = (
+            "GGUF size: 232.9 GB, est. KV cache: 87.0 GB, context: 259072, "
+            "GPUs free: [(0, 80000), (1, 80000)], selected: [0, 1], fit: False"
+        )
+        out = _run({"event": event})
+        assert out["event"] == event
+        assert "..." not in out["event"]
+
+    def test_mmproj_path_survives(self):
+        # Mirrors logger.info at studio/backend/core/inference/llama_cpp.py:2283
+        event = (
+            "Using mmproj for vision: "
+            "/home/user/.cache/unsloth/models/some-vision-model-uncensored-r1-distill/mmproj-F16.gguf"
+        )
+        out = _run({"event": event})
+        assert out["event"] == event
+
+    def test_llama_server_command_survives(self):
+        # Mirrors logger.info at studio/backend/core/inference/llama_cpp.py:2312
+        event = (
+            "Starting llama-server: /home/user/.unsloth/studio/llama.cpp/build/bin/llama-server "
+            "-m /home/user/.cache/unsloth/models/foo.gguf --port 8090 -c 259072 --parallel 1 "
+            "--flash-attn on --mmproj /home/user/.cache/unsloth/models/mmproj-F16.gguf"
+        )
+        out = _run({"event": event})
+        assert out["event"] == event
+
+    def test_traceback_with_paths_survives(self):
+        traceback_str = (
+            "Traceback (most recent call last):\n"
+            '  File "/home/user/.unsloth/studio/unsloth_studio/lib/python3.11/site-packages/'
+            'studio/backend/core/inference/llama_cpp.py", line 2312, in start\n'
+            '    raise RuntimeError("llama-server crashed: bad alloc, /dev/shm full")\n'
+            "RuntimeError: llama-server crashed: bad alloc, /dev/shm full"
+        )
+        out = _run({"event": "llama-server crashed", "exception": traceback_str})
+        assert out["exception"] == traceback_str
+        assert "..." not in out["exception"]
+
+    def test_nested_long_string_in_dict_survives(self):
+        long_value = (
+            "/very/long/path/with,many,commas,and/slashes/that/used/to/get/"
+            "chopped/to/twenty/chars/file.gguf"
+        )
+        out = _run({"event": "load", "details": {"path": long_value}})
+        assert out["details"]["path"] == long_value
+
+
+class TestNativePathLeaseRedactionStillWorks:
+    """Guards PR #5246's redaction from being lost alongside the truncation block."""
+
+    def test_inline_native_path_lease_value_redacted(self):
+        event = (
+            "rejected request: native_path_lease=AAAAAA.BBBBBB extra context "
+            "with /some/path,values"
+        )
+        out = _run({"event": event})
+        assert "AAAAAA.BBBBBB" not in out["event"]
+        assert "<redacted native path lease>" in out["event"]
+
+    def test_camelcase_native_path_lease_dict_key_redacted(self):
+        out = _run({"event": "load", "nativePathLease": "AAAAAA.BBBBBB"})
+        assert out["nativePathLease"] == "<redacted native path lease>"
+
+    def test_snakecase_native_path_lease_dict_key_redacted(self):
+        out = _run({"event": "load", "native_path_lease": "AAAAAA.BBBBBB"})
+        assert out["native_path_lease"] == "<redacted native path lease>"
+
+    def test_nested_native_path_lease_key_redacted(self):
+        out = _run({"event": "load", "payload": {"nativePathLease": "AAAAAA.BBBBBB"}})
+        assert out["payload"]["nativePathLease"] == "<redacted native path lease>"

From b3640802253f64117ee228718be7fab32e47aa5f Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Fri, 8 May 2026 10:57:41 -0700
Subject: [PATCH 2/2] fix(gh_client): fail fast on 401/403 auth errors instead
 of retrying forever (#5325) (#5329)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(gh_client): fail fast on 401/403 auth errors instead of retrying forever (#5325)

Fixes #5325. The Studio data-recipe GitHub Crawler swallows 401 Unauthorized
(and 403 Forbidden without rate-limit headers) into the generic
"network error" retry path, so a job with a stale or wrong-scoped GitHub
token spins indefinitely emitting "Retry." lines until the user cancels.

Changes:

- Add GitHubAuthError. Raised on 401, and on 403 unless the response carries
  a clear rate-limit signal (Retry-After header for secondary limits, or
  X-RateLimit-Remaining: 0 for primary limits).
- Track which token source resolved at construction time: explicit argument
  (recipe-level field), GH_TOKEN, or GITHUB_TOKEN. Surfaced in the error
  message so the user knows which credential to rotate.
- Insert the auth-failure check before the existing 403/429 rate-limit branch
  in both .graphql() and .rest() so auth failures bypass the sleep-and-retry
  loop and abort the recipe immediately.

Genuine rate limiting still retries via the existing path. requests.RequestException
handling is unchanged because GitHubAuthError does not inherit from it.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* style: apply black formatting per pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix GitHub auth failure handling

Preserve GitHub token source through the repo seed scraper and fail fast on non-rate-limit auth errors while keeping genuine rate-limit retries.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Wasim Yousef Said <wasimysdev@gmail.com>
---
 .../data_designer_github_repo_seed/scraper.py |  32 ++++--
 .../scraper_impl/gh_client.py                 | 105 +++++++++++++++---
 2 files changed, 117 insertions(+), 20 deletions(-)

diff --git a/studio/backend/plugins/data-designer-github-repo-seed/src/data_designer_github_repo_seed/scraper.py b/studio/backend/plugins/data-designer-github-repo-seed/src/data_designer_github_repo_seed/scraper.py
index d768fe37b..6acb985b5 100644
--- a/studio/backend/plugins/data-designer-github-repo-seed/src/data_designer_github_repo_seed/scraper.py
+++ b/studio/backend/plugins/data-designer-github-repo-seed/src/data_designer_github_repo_seed/scraper.py
@@ -48,13 +48,31 @@ class ScrapeConfig:
     max_comments_per_item: int
 
 
-def _resolve_token(token: str) -> str:
-    tok = token or os.environ.get("GH_TOKEN", "") or os.environ.get("GITHUB_TOKEN", "")
-    if not tok:
-        raise ValueError(
-            "GitHub token is required. Set it in the recipe config or the GH_TOKEN / GITHUB_TOKEN env var."
+@dataclass(frozen = True)
+class ResolvedToken:
+    value: str
+    source: str
+
+
+def _resolve_token(token: str) -> ResolvedToken:
+    if token:
+        return ResolvedToken(
+            value = token,
+            source = "explicit token argument (recipe-level field)",
         )
-    return tok
+    if os.environ.get("GH_TOKEN"):
+        return ResolvedToken(
+            value = os.environ["GH_TOKEN"],
+            source = "GH_TOKEN environment variable",
+        )
+    if os.environ.get("GITHUB_TOKEN"):
+        return ResolvedToken(
+            value = os.environ["GITHUB_TOKEN"],
+            source = "GITHUB_TOKEN environment variable",
+        )
+    raise ValueError(
+        "GitHub token is required. Set it in the recipe config or the GH_TOKEN / GITHUB_TOKEN env var."
+    )
 
 
 def _read_jsonl(path: Path, max_rows: int | None = None):
@@ -155,7 +173,7 @@ def _flatten_commit_row(r: dict, repo: str) -> dict:
 def scrape(cfg: ScrapeConfig, base_dir: Path):
     token = _resolve_token(cfg.token)
     GitHubClient, RepoScraper = _load_impl()
-    client = GitHubClient(token = token)
+    client = GitHubClient(token = token.value, token_source = token.source)
     base_dir.mkdir(parents = True, exist_ok = True)
 
     # Per-resource trial limits. limit <= 0 means "all": use a very large cap.
diff --git a/studio/backend/plugins/data-designer-github-repo-seed/src/data_designer_github_repo_seed/scraper_impl/gh_client.py b/studio/backend/plugins/data-designer-github-repo-seed/src/data_designer_github_repo_seed/scraper_impl/gh_client.py
index dd2de2f5c..696d0ccb9 100644
--- a/studio/backend/plugins/data-designer-github-repo-seed/src/data_designer_github_repo_seed/scraper_impl/gh_client.py
+++ b/studio/backend/plugins/data-designer-github-repo-seed/src/data_designer_github_repo_seed/scraper_impl/gh_client.py
@@ -9,6 +9,8 @@ import json
 import os
 import time
 import logging
+from datetime import timezone
+from email.utils import parsedate_to_datetime
 from typing import Any, Dict, Iterable, Iterator, List, Optional
 
 import requests
@@ -29,16 +31,46 @@ class RateLimitError(Exception):
     pass
 
 
+class GitHubAuthError(RuntimeError):
+    """Raised when GitHub returns 401/403 due to invalid or insufficient credentials."""
+
+
+def _retry_after_seconds(value: str | None) -> int | None:
+    if not value:
+        return None
+    try:
+        return max(0, int(value))
+    except ValueError:
+        pass
+    try:
+        retry_at = parsedate_to_datetime(value)
+    except (TypeError, ValueError, IndexError, OverflowError):
+        return None
+    if retry_at.tzinfo is None:
+        retry_at = retry_at.replace(tzinfo = timezone.utc)
+    return max(0, int(retry_at.timestamp() - time.time()))
+
+
 class GitHubClient:
     def __init__(
         self,
         min_remaining_graphql: int = 100,
         min_remaining_rest: int = 100,
         token: str | None = None,
+        token_source: str | None = None,
     ):
-        token = token or os.environ.get("GH_TOKEN") or os.environ.get("GITHUB_TOKEN")
-        if not token:
-            raise RuntimeError("GH_TOKEN not set in environment")
+        if token:
+            self._token_source = (
+                token_source or "explicit token argument (recipe-level field)"
+            )
+        elif os.environ.get("GH_TOKEN"):
+            self._token_source = "GH_TOKEN environment variable"
+            token = os.environ["GH_TOKEN"]
+        elif os.environ.get("GITHUB_TOKEN"):
+            self._token_source = "GITHUB_TOKEN environment variable"
+            token = os.environ["GITHUB_TOKEN"]
+        else:
+            raise RuntimeError("GH_TOKEN or GITHUB_TOKEN not set in environment")
         self.session = requests.Session()
         self.session.headers.update(
             {**BASE_HEADERS, "Authorization": f"Bearer {token}"}
@@ -59,6 +91,49 @@ class GitHubClient:
         log.warning("Rate limit hit. Sleeping %ds until reset.", wait)
         time.sleep(wait)
 
+    def _is_rate_limit_response(self, r: "requests.Response") -> bool:
+        if r.headers.get("Retry-After"):
+            return True
+        if r.headers.get("X-RateLimit-Remaining") == "0":
+            return True
+        body = (r.text or "").lower()
+        return any(
+            marker in body
+            for marker in (
+                "api rate limit exceeded",
+                "rate limit exceeded",
+                "secondary rate limit",
+                "secondary limit",
+                "abuse detection mechanism",
+                "abuse detection",
+            )
+        )
+
+    def _is_auth_failure(self, r: "requests.Response") -> bool:
+        """Distinguish auth failures from rate limiting on 401/403 responses.
+
+        - 401: always an auth failure (invalid / expired / wrong-scope token).
+        - 403: an auth failure UNLESS the response carries a clear rate-limit signal
+          (Retry-After header, X-RateLimit-Remaining: 0, or GitHub's secondary /
+          abuse rate-limit response text).
+        """
+        if r.status_code == 401:
+            return True
+        if r.status_code == 403:
+            return not self._is_rate_limit_response(r)
+        return False
+
+    def _raise_auth_error(self, r: "requests.Response", endpoint: str) -> None:
+        snippet = (r.text or "").strip()[:200]
+        request_id = r.headers.get("X-GitHub-Request-Id")
+        request_id_message = f" Request ID: {request_id}." if request_id else ""
+        raise GitHubAuthError(
+            f"GitHub {endpoint} returned {r.status_code} {r.reason}. "
+            f"Token source: {self._token_source}. "
+            f"The token is invalid, expired, or missing required scopes — "
+            f"retrying will not recover.{request_id_message} Response: {snippet}"
+        )
+
     def _check_rate_and_wait(self, kind: str) -> None:
         if kind == "graphql":
             remaining = self.graphql_remaining
@@ -112,13 +187,14 @@ class GitHubClient:
                     time.sleep(backoff)
                     backoff = min(backoff * 2, 60)
                     continue
+                if self._is_auth_failure(r):
+                    self._raise_auth_error(r, "GraphQL")
                 if r.status_code == 403 or r.status_code == 429:
                     # Check for secondary/abuse
-                    retry_after = r.headers.get("Retry-After")
-                    if retry_after:
-                        t = int(retry_after)
-                        log.warning("Secondary rate limit. Sleep %ds.", t)
-                        time.sleep(t + 2)
+                    retry_after = _retry_after_seconds(r.headers.get("Retry-After"))
+                    if retry_after is not None:
+                        log.warning("Secondary rate limit. Sleep %ds.", retry_after)
+                        time.sleep(retry_after + 2)
                         continue
                     if self.graphql_reset:
                         self._sleep_until(self.graphql_reset)
@@ -188,12 +264,15 @@ class GitHubClient:
                     time.sleep(backoff)
                     backoff = min(backoff * 2, 60)
                     continue
+                if self._is_auth_failure(r):
+                    self._raise_auth_error(r, "REST")
                 if r.status_code in (403, 429):
-                    retry_after = r.headers.get("Retry-After")
-                    if retry_after:
-                        t = int(retry_after)
-                        log.warning("Secondary rate limit on REST. Sleep %ds.", t)
-                        time.sleep(t + 2)
+                    retry_after = _retry_after_seconds(r.headers.get("Retry-After"))
+                    if retry_after is not None:
+                        log.warning(
+                            "Secondary rate limit on REST. Sleep %ds.", retry_after
+                        )
+                        time.sleep(retry_after + 2)
                         continue
                     # Check if primary rate
                     if self.rest_remaining == 0 and self.rest_reset: