mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-31 04:49:09 +00:00
Some checks failed
Build and Push Docker Images / tag_release (push) Has been cancelled
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Has been cancelled
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Has been cancelled
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Has been cancelled
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Has been cancelled
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Has been cancelled
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Has been cancelled
160 lines
5.6 KiB
Python
160 lines
5.6 KiB
Python
"""Tests for env loading + state.json read/write."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
|
|
from surfsense_evals.core.config import (
|
|
DEFAULT_SCENARIO,
|
|
SCENARIOS,
|
|
SuiteState,
|
|
clear_suite_state,
|
|
get_suite_state,
|
|
load_config,
|
|
set_suite_state,
|
|
)
|
|
|
|
|
|
def test_load_config_defaults_to_localhost(tmp_env):  # noqa: ARG001
    """Check the values ``load_config()`` reports when this test sets no env vars.

    Expected: the API base is the localhost URL and no credential mode
    (JWT or local) is detected.
    """
    cfg = load_config()

    # Base URL default.
    assert cfg.surfsense_api_base == "http://localhost:8000"

    # Neither credential flavour is reported.
    assert cfg.has_jwt_mode() is False
    assert cfg.has_local_mode() is False
    assert cfg.credential_mode() == "none"
|
|
|
|
|
|
def test_load_config_picks_up_jwt_env(tmp_env, monkeypatch):  # noqa: ARG001
    """With ``SURFSENSE_JWT`` set, ``credential_mode()`` is expected to be ``"jwt"``."""
    monkeypatch.setenv("SURFSENSE_JWT", "tok")

    detected_mode = load_config().credential_mode()

    assert detected_mode == "jwt"
|
|
|
|
|
|
def test_load_config_picks_up_local_env(tmp_env, monkeypatch):  # noqa: ARG001
    """With email + password env vars set, ``credential_mode()`` should be ``"local"``."""
    local_credentials = (
        ("SURFSENSE_USER_EMAIL", "u@x.com"),
        ("SURFSENSE_USER_PASSWORD", "pw"),
    )
    # Same order as before: email first, then password.
    for env_name, env_value in local_credentials:
        monkeypatch.setenv(env_name, env_value)

    detected_mode = load_config().credential_mode()

    assert detected_mode == "local"
|
|
|
|
|
|
def test_state_roundtrip_per_suite(tmp_env):  # noqa: ARG001
    """Round-trip two suites through state.json, then clear one of them.

    Stores state for ``medical`` and ``legal``, reads ``medical`` back,
    clears it, and checks — through both ``get_suite_state`` and the raw
    JSON file at ``config.state_path`` — that only ``legal`` remains.
    """
    config = load_config()
    # Expect no stored state before anything is written.
    assert get_suite_state(config, "medical") is None

    state = SuiteState(
        search_space_id=1,
        agent_llm_id=-10042,
        provider_model="anthropic/claude-sonnet-4.5",
        created_at="2026-05-11T20-30-00Z",
    )
    set_suite_state(config, "medical", state)

    legal = SuiteState(
        search_space_id=2,
        agent_llm_id=-1,
        provider_model="openai/gpt-5",
        created_at="2026-05-11T21-00-00Z",
    )
    set_suite_state(config, "legal", legal)

    fetched = get_suite_state(config, "medical")
    # get_suite_state can return None (see the first assert); check that
    # explicitly so a persistence failure is reported as a plain assertion
    # failure instead of an AttributeError on the attribute access below.
    assert fetched is not None
    assert fetched.search_space_id == 1
    assert fetched.provider_model == "anthropic/claude-sonnet-4.5"

    # Other suite untouched after teardown.
    cleared = clear_suite_state(config, "medical")
    assert cleared is True
    assert get_suite_state(config, "medical") is None

    remaining = get_suite_state(config, "legal")
    # Same guard as above: clearing "medical" must not drop "legal".
    assert remaining is not None
    assert remaining.search_space_id == 2

    # Cross-check the on-disk JSON, not just the accessor functions.
    raw = json.loads(config.state_path.read_text(encoding="utf-8"))
    assert "medical" not in raw["suites"]
    assert "legal" in raw["suites"]
|
|
|
|
|
|
def test_paths_are_per_suite(tmp_env):  # noqa: ARG001
    """Suite directory helpers return distinct, predictably named paths.

    Two different suites must get different data directories; the reports
    directory of a suite sits directly under ``config.reports_dir``; the
    runs and maps directories are named ``runs`` and ``maps``.
    """
    cfg = load_config()

    medical_data = cfg.suite_data_dir("medical")
    legal_data = cfg.suite_data_dir("legal")
    assert medical_data != legal_data

    reports_parent = cfg.suite_reports_dir("medical").parent
    assert reports_parent == cfg.reports_dir

    runs_dir = cfg.suite_runs_dir("medical")
    assert runs_dir.name == "runs"

    maps_dir = cfg.suite_maps_dir("medical")
    assert maps_dir.name == "maps"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Scenario state — back-compat + new fields
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_legacy_state_back_compat_defaults_to_head_to_head():
    """A pre-scenario state payload still loads through ``SuiteState.from_dict``.

    The payload below omits ``scenario``, the ``vision_*`` keys and
    ``native_arm_model``. The test expects the default scenario
    (``head-to-head``) and ``None`` for the other fields, with the
    effective native-arm model resolving to ``provider_model`` so that
    both arms answer with the same slug, as they did before scenarios
    existed.
    """
    pre_scenario_payload = {
        "search_space_id": 7,
        "agent_llm_id": -123,
        "provider_model": "anthropic/claude-sonnet-4.5",
        "created_at": "2026-05-11T20-30-00Z",
        "ingestion_maps": {},
    }

    loaded = SuiteState.from_dict(pre_scenario_payload)

    # Scenario falls back to the default, and the default is head-to-head.
    assert loaded.scenario == DEFAULT_SCENARIO
    assert DEFAULT_SCENARIO == "head-to-head"

    # Fields that did not exist in the legacy format come back unset.
    assert loaded.vision_llm_config_id is None
    assert loaded.vision_provider_model is None
    assert loaded.native_arm_model is None

    # The native arm should still answer with the same slug as SurfSense.
    assert loaded.effective_native_arm_model == loaded.provider_model
|
|
|
|
|
|
def test_unknown_scenario_falls_back_to_default():
    """An unrecognised ``scenario`` string is replaced by the default on load.

    Defensive behaviour: a stale state file should render with the safe
    default scenario instead of breaking the whole run with an error.
    """
    stale_payload = {
        "search_space_id": 1,
        "agent_llm_id": -1,
        "provider_model": "openai/gpt-5",
        "scenario": "unknown-scenario-name",
    }

    loaded_scenario = SuiteState.from_dict(stale_payload).scenario

    assert loaded_scenario == DEFAULT_SCENARIO
|
|
|
|
|
|
def test_cost_arbitrage_state_persists_native_arm_model(tmp_env):  # noqa: ARG001
    """Cost-arbitrage fields survive a write/read cycle through state.json.

    Stores a ``cost-arbitrage`` state whose ``native_arm_model`` differs
    from ``provider_model``, reads it back, and checks the scenario, the
    vision fields and the native-arm model — plus the ``scenario`` key in
    the raw JSON file at ``config.state_path``.
    """
    config = load_config()
    state = SuiteState(
        search_space_id=42,
        agent_llm_id=-1,
        provider_model="openai/gpt-5.4-mini",
        created_at="2026-05-11T20-30-00Z",
        scenario="cost-arbitrage",
        vision_llm_config_id=-101,
        vision_provider_model="anthropic/claude-sonnet-4.5",
        native_arm_model="anthropic/claude-sonnet-4.5",
    )
    set_suite_state(config, "medical", state)

    fetched = get_suite_state(config, "medical")
    # get_suite_state can return None when a suite has no stored state;
    # assert explicitly so a persistence failure shows up as a plain
    # assertion failure rather than an AttributeError further down.
    assert fetched is not None
    assert fetched.scenario == "cost-arbitrage"
    assert fetched.vision_llm_config_id == -101
    assert fetched.vision_provider_model == "anthropic/claude-sonnet-4.5"
    assert fetched.native_arm_model == "anthropic/claude-sonnet-4.5"

    # Cost arbitrage's whole point: native arm slug != surfsense slug.
    assert fetched.effective_native_arm_model != fetched.provider_model
    assert fetched.effective_native_arm_model == "anthropic/claude-sonnet-4.5"

    # The scenario must also be present in the serialized file itself.
    raw = json.loads(config.state_path.read_text(encoding="utf-8"))
    assert raw["suites"]["medical"]["scenario"] == "cost-arbitrage"
|
|
|
|
|
|
def test_scenario_constants_are_stable():
    """Lock down the exported scenario names and the default scenario.

    Runners and tests key off these exact strings, so any change to them
    has to be deliberate.
    """
    expected_scenarios = ("head-to-head", "symmetric-cheap", "cost-arbitrage")
    expected_default = "head-to-head"

    assert SCENARIOS == expected_scenarios
    assert DEFAULT_SCENARIO == expected_default
|