SurfSense/surfsense_evals/tests/core/test_config.py
DESKTOP-RTLN3BA\$punk 3737118050
Some checks failed
Build and Push Docker Images / tag_release (push) Has been cancelled
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Has been cancelled
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Has been cancelled
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Has been cancelled
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Has been cancelled
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Has been cancelled
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Has been cancelled
chore: evals
2026-05-13 14:02:26 -07:00

160 lines
5.6 KiB
Python

"""Tests for env loading + state.json read/write."""
from __future__ import annotations
import json
from surfsense_evals.core.config import (
DEFAULT_SCENARIO,
SCENARIOS,
SuiteState,
clear_suite_state,
get_suite_state,
load_config,
set_suite_state,
)
def test_load_config_defaults_to_localhost(tmp_env):  # noqa: ARG001
    """Check the defaults ``load_config`` reports under the ``tmp_env`` fixture.

    The fixture is requested only for its side effects (presumably an
    isolated environment -- confirm against conftest); the test expects
    the localhost API base and no credential mode.
    """
    cfg = load_config()

    assert cfg.surfsense_api_base == "http://localhost:8000"

    # Neither credential flavour is configured, so both probes are False
    # and the summarised mode is the literal "none".
    jwt_enabled = cfg.has_jwt_mode()
    assert jwt_enabled is False
    local_enabled = cfg.has_local_mode()
    assert local_enabled is False
    assert cfg.credential_mode() == "none"
def test_load_config_picks_up_jwt_env(tmp_env, monkeypatch):  # noqa: ARG001
    """Setting ``SURFSENSE_JWT`` makes the loaded config report ``jwt`` mode."""
    monkeypatch.setenv("SURFSENSE_JWT", "tok")

    # The variable must be set before load_config() runs for it to be seen.
    mode = load_config().credential_mode()
    assert mode == "jwt"
def test_load_config_picks_up_local_env(tmp_env, monkeypatch):  # noqa: ARG001
    """An email + password pair in the environment selects ``local`` mode."""
    credentials = {
        "SURFSENSE_USER_EMAIL": "u@x.com",
        "SURFSENSE_USER_PASSWORD": "pw",
    }
    # Dict order is insertion order, so email is set before password.
    for env_name, env_value in credentials.items():
        monkeypatch.setenv(env_name, env_value)

    assert load_config().credential_mode() == "local"
def test_state_roundtrip_per_suite(tmp_env):  # noqa: ARG001
    """Suite state is stored, fetched, and cleared independently per suite.

    Writes state for two suites, reads one back, clears it, and then
    checks that the other suite's entry is still present both through
    ``get_suite_state`` and in the JSON file at ``config.state_path``.
    """
    config = load_config()
    # Nothing has been written yet, so the lookup yields None.
    assert get_suite_state(config, "medical") is None

    state = SuiteState(
        search_space_id=1,
        agent_llm_id=-10042,
        provider_model="anthropic/claude-sonnet-4.5",
        created_at="2026-05-11T20-30-00Z",
    )
    set_suite_state(config, "medical", state)

    legal = SuiteState(
        search_space_id=2,
        agent_llm_id=-1,
        provider_model="openai/gpt-5",
        created_at="2026-05-11T21-00-00Z",
    )
    set_suite_state(config, "legal", legal)

    fetched = get_suite_state(config, "medical")
    # Guard first: a missing entry should fail as a clear assertion
    # rather than as an AttributeError on None.
    assert fetched is not None
    assert fetched.search_space_id == 1
    assert fetched.provider_model == "anthropic/claude-sonnet-4.5"

    cleared = clear_suite_state(config, "medical")
    assert cleared is True
    assert get_suite_state(config, "medical") is None

    # Other suite untouched after teardown.
    remaining = get_suite_state(config, "legal")
    assert remaining is not None
    assert remaining.search_space_id == 2

    # The on-disk payload must agree with what the accessors report.
    raw = json.loads(config.state_path.read_text(encoding="utf-8"))
    assert "medical" not in raw["suites"]
    assert "legal" in raw["suites"]
def test_paths_are_per_suite(tmp_env):  # noqa: ARG001
    """Different suites get different data dirs; sub-dirs use fixed names."""
    cfg = load_config()

    medical_data = cfg.suite_data_dir("medical")
    legal_data = cfg.suite_data_dir("legal")
    assert medical_data != legal_data

    # A suite's reports directory sits directly under the shared reports dir.
    medical_reports = cfg.suite_reports_dir("medical")
    assert medical_reports.parent == cfg.reports_dir

    # Leaf directory names are pinned to these literals.
    runs_dir = cfg.suite_runs_dir("medical")
    assert runs_dir.name == "runs"
    maps_dir = cfg.suite_maps_dir("medical")
    assert maps_dir.name == "maps"
# ---------------------------------------------------------------------------
# Scenario state — back-compat + new fields
# ---------------------------------------------------------------------------
def test_legacy_state_back_compat_defaults_to_head_to_head():
    """A pre-scenario ``state.json`` entry must still deserialize.

    The payload below omits ``scenario``, the ``vision_*`` fields, and
    ``native_arm_model``. ``SuiteState.from_dict`` is expected to fill
    them with ``head-to-head`` / ``None`` so that setups created before
    the upgrade keep behaving as before: both arms answer with
    ``provider_model``.
    """
    pre_scenario_payload = {
        "search_space_id": 7,
        "agent_llm_id": -123,
        "provider_model": "anthropic/claude-sonnet-4.5",
        "created_at": "2026-05-11T20-30-00Z",
        "ingestion_maps": {},
    }

    loaded = SuiteState.from_dict(pre_scenario_payload)

    # Scenario falls back to the default, which is itself pinned here.
    assert loaded.scenario == DEFAULT_SCENARIO
    assert DEFAULT_SCENARIO == "head-to-head"

    # Every field introduced alongside scenarios is absent -> None.
    assert loaded.vision_llm_config_id is None
    assert loaded.vision_provider_model is None
    assert loaded.native_arm_model is None

    # With no override, the native arm resolves to the SurfSense slug.
    assert loaded.effective_native_arm_model == loaded.provider_model
def test_unknown_scenario_falls_back_to_default():
    """An unrecognised scenario string loads as the default instead of raising.

    Defensive behaviour: a stale state file should come up with the safe
    head-to-head scenario rather than abort the whole run with a
    KeyError.
    """
    stale_payload = {
        "search_space_id": 1,
        "agent_llm_id": -1,
        "provider_model": "openai/gpt-5",
        "scenario": "unknown-scenario-name",
    }

    loaded = SuiteState.from_dict(stale_payload)

    assert loaded.scenario == DEFAULT_SCENARIO
def test_cost_arbitrage_state_persists_native_arm_model(tmp_env):  # noqa: ARG001
    """Cost-arbitrage fields survive a ``set_suite_state`` / ``get_suite_state`` trip.

    Stores a state whose ``native_arm_model`` differs from
    ``provider_model``, reads it back, and checks the scenario-specific
    fields as well as the ``scenario`` key in the JSON file at
    ``config.state_path``.
    """
    config = load_config()
    state = SuiteState(
        search_space_id=42,
        agent_llm_id=-1,
        provider_model="openai/gpt-5.4-mini",
        created_at="2026-05-11T20-30-00Z",
        scenario="cost-arbitrage",
        vision_llm_config_id=-101,
        vision_provider_model="anthropic/claude-sonnet-4.5",
        native_arm_model="anthropic/claude-sonnet-4.5",
    )
    set_suite_state(config, "medical", state)

    fetched = get_suite_state(config, "medical")
    # Guard first: a missing entry should fail as a clear assertion
    # rather than as an AttributeError on None.
    assert fetched is not None
    assert fetched.scenario == "cost-arbitrage"
    assert fetched.vision_llm_config_id == -101
    assert fetched.vision_provider_model == "anthropic/claude-sonnet-4.5"
    assert fetched.native_arm_model == "anthropic/claude-sonnet-4.5"

    # Cost arbitrage's whole point: native arm slug != surfsense slug.
    assert fetched.effective_native_arm_model != fetched.provider_model
    assert fetched.effective_native_arm_model == "anthropic/claude-sonnet-4.5"

    # The scenario must also be visible in the serialized file.
    raw = json.loads(config.state_path.read_text(encoding="utf-8"))
    assert raw["suites"]["medical"]["scenario"] == "cost-arbitrage"
def test_scenario_constants_are_stable():
    """Freeze the public scenario names; runners and tests key off these strings."""
    expected_scenarios = ("head-to-head", "symmetric-cheap", "cost-arbitrage")
    expected_default = "head-to-head"

    assert SCENARIOS == expected_scenarios
    assert DEFAULT_SCENARIO == expected_default