Pulse/tests/integration/scripts/run-evals.mjs
2026-03-18 16:06:30 +00:00

323 lines
9.9 KiB
JavaScript

#!/usr/bin/env node
import { spawn } from 'node:child_process';
import fs from 'node:fs/promises';
import path from 'node:path';
import process from 'node:process';
import { fileURLToPath } from 'node:url';
// Directory that contains this script; the integration root is its parent.
const scriptDirectory = path.dirname(fileURLToPath(import.meta.url));
const INTEGRATION_ROOT = path.resolve(scriptDirectory, '..');

// Scenario definitions live in <integration root>/evals/scenarios.json.
const SCENARIOS_FILE = path.join(INTEGRATION_ROOT, 'evals', 'scenarios.json');

// Eval reports are deliberately written outside Playwright's test-results
// directory: Playwright clears test-results between invocations, which can
// remove report paths in the middle of a run.
const DEFAULT_RESULTS_ROOT = path.join(INTEGRATION_ROOT, 'eval-results');
// Command-line tokens after the node binary and the script path.
const args = process.argv.slice(2);

/**
 * Report whether the exact flag token is present on the command line.
 *
 * @param {string} flag Flag to look for, e.g. `--dry-run`.
 * @returns {boolean} True when the token appears in `args`.
 */
function hasArg(flag) {
  return args.indexOf(flag) !== -1;
}

/**
 * Return the token that immediately follows a flag.
 *
 * @param {string} flag Flag whose value is wanted, e.g. `--mode`.
 * @returns {string|null} The following token, or null when the flag is
 *   absent or is the last token on the command line.
 */
function argValue(flag) {
  const position = args.indexOf(flag);
  const hasFollowingToken = position !== -1 && position + 1 < args.length;
  return hasFollowingToken ? args[position + 1] : null;
}
// Print the usage text and exit successfully when either help flag is given.
if (['--help', '-h'].some((flag) => hasArg(flag))) {
  const usageText = `
Usage: node ./scripts/run-evals.mjs [options]
Options:
--scenario <id[,id2]> Run one or more specific scenario ids
--mode <name> deterministic (default) | agentic
--dry-run Print planned commands without executing
--help Show this help
Environment:
PULSE_EVAL_MODE Default mode when --mode is not provided
PULSE_EVAL_AGENT_COMMAND_TEMPLATE Required for agentic mode. Shell command with placeholders:
{{task_file}}, {{result_json}}, {{scenario_id}}, {{base_url}}
PULSE_BASE_URL Base URL passed to scenarios (default http://localhost:7655)
PULSE_E2E_USERNAME Username context for prompts (default admin)
PULSE_E2E_PASSWORD Password context for prompts (default adminadminadmin)
`.trim();
  console.log(usageText);
  process.exit(0);
}
// Scenario ids requested through --scenario (comma separated, blanks dropped).
const selectedScenarioIDs = new Set();
for (const token of (argValue('--scenario') || '').split(',')) {
  const scenarioID = token.trim();
  if (scenarioID) {
    selectedScenarioIDs.add(scenarioID);
  }
}

// Read an environment variable, falling back when it is unset or empty,
// and strip surrounding whitespace from whichever value is used.
const envOrDefault = (name, fallback) => (process.env[name] || fallback).trim();

// Mode precedence: --mode flag, then PULSE_EVAL_MODE, then "deterministic".
const mode = (argValue('--mode') || envOrDefault('PULSE_EVAL_MODE', 'deterministic')).trim();
const dryRun = hasArg('--dry-run');

// Target instance and credentials handed to scenarios.
const baseURL = envOrDefault('PULSE_BASE_URL', 'http://localhost:7655');
const username = envOrDefault('PULSE_E2E_USERNAME', 'admin');
const password = envOrDefault('PULSE_E2E_PASSWORD', 'adminadminadmin');
/**
 * Build a timestamp for the current instant that contains no ':' or '.'
 * characters (both are replaced with '-').
 *
 * @returns {string} ISO-8601 UTC timestamp, e.g. `2026-03-18T16-06-30-000Z`.
 */
function nowStamp() {
  const iso = new Date().toISOString();
  return iso.replaceAll(':', '-').replaceAll('.', '-');
}
/**
 * Format a duration for display.
 *
 * @param {number} ms Duration in milliseconds.
 * @returns {string} `<n>ms` below one second, otherwise seconds with one
 *   decimal place (e.g. `1.5s`).
 */
function formatMs(ms) {
  return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(1)}s`;
}
/**
 * Spawn a child process in the integration root and wait for it to finish.
 *
 * The child shares this process's stdio and receives `process.env` with
 * `env` merged on top.
 *
 * @param {string} command Executable to run, or a full shell command line
 *   when `useShell` is true.
 * @param {string[]} argsList Arguments passed to the executable.
 * @param {Object<string, string>} [env] Extra environment variables.
 * @param {boolean} [useShell] Run the command through a shell.
 * @returns {Promise<{code: number, durationMs: number}>} Exit code and
 *   wall-clock duration. The code is 1 when the process could not be started
 *   or ended without an exit code (for example, killed by a signal).
 */
function runCommand(command, argsList, env = {}, useShell = false) {
  return new Promise((resolve) => {
    const startedAt = Date.now();
    // A promise settles only once, so whichever of 'error' / 'close' fires
    // first decides the outcome.
    const finish = (code) => resolve({ code, durationMs: Date.now() - startedAt });

    const child = spawn(command, argsList, {
      cwd: INTEGRATION_ROOT,
      env: { ...process.env, ...env },
      stdio: 'inherit',
      shell: useShell,
    });

    // Without this listener a spawn failure (e.g. ENOENT for a missing
    // executable) surfaces as an unhandled 'error' event and crashes the
    // runner instead of failing just this command.
    child.on('error', (err) => {
      console.error(`Failed to run "${command}": ${err.message}`);
      finish(1);
    });

    child.on('close', (code) => {
      finish(code ?? 1);
    });
  });
}
/**
 * Substitute `{{key}}` placeholders in a template.
 *
 * Every occurrence of `{{key}}` is replaced, for each key of `replacements`,
 * with the stringified value. Placeholders without a matching key are left
 * untouched.
 *
 * @param {string} template Text containing `{{key}}` placeholders.
 * @param {Object<string, *>} replacements Placeholder name to value.
 * @returns {string} The template with known placeholders substituted.
 */
function renderTemplate(template, replacements) {
  let output = template;
  for (const [key, value] of Object.entries(replacements)) {
    const literal = String(value);
    // Use a replacer function: a plain replacement string would have `$$`,
    // `$&`, etc. interpreted as special patterns, corrupting values such as
    // passwords or paths that contain `$`.
    output = output.replaceAll(`{{${key}}}`, () => literal);
  }
  return output;
}
/**
 * Render a run summary as a Markdown report.
 *
 * @param {{mode: string, baseURL: string, results: Array<Object>}} summary
 *   Run summary; each result supplies `id`, `status`, `durationMs` and
 *   optionally `summary` and `issues`.
 * @returns {string} Markdown text: a header with totals followed by one
 *   entry per scenario.
 */
function buildMarkdownReport(summary) {
  const { results } = summary;
  const passCount = results.reduce(
    (count, entry) => (entry.status === 'pass' ? count + 1 : count),
    0,
  );

  const header = [
    '# Pulse Agentic Eval Report',
    '',
    `- Mode: \`${summary.mode}\``,
    `- Base URL: \`${summary.baseURL}\``,
    `- Total: ${results.length}`,
    `- Passed: ${passCount}`,
    `- Failed: ${results.length - passCount}`,
    '',
    '## Scenario Results',
    '',
  ];

  const scenarioLines = results.flatMap((entry) => {
    const badge = entry.status === 'pass' ? 'PASS' : 'FAIL';
    const rows = [
      `- [${badge}] ${entry.id} (${formatMs(entry.durationMs)})`,
      ` Summary: ${entry.summary || 'No summary provided'}`,
    ];
    // The issues line is emitted only when there is something to list.
    if (Array.isArray(entry.issues) && entry.issues.length > 0) {
      rows.push(` Issues: ${entry.issues.join(' | ')}`);
    }
    return rows;
  });

  return [...header, ...scenarioLines].join('\n');
}
/**
 * Read and parse the scenarios file.
 *
 * @returns {Promise<Array>} The `scenarios` array from the parsed JSON, or
 *   an empty array when that property is not an array. Read and parse errors
 *   propagate to the caller.
 */
async function loadScenarios() {
  const manifest = JSON.parse(await fs.readFile(SCENARIOS_FILE, 'utf8'));
  const list = manifest.scenarios;
  if (!Array.isArray(list)) {
    return [];
  }
  return list;
}
/**
 * Run a scenario's deterministic command and translate its exit code into a
 * scenario result.
 *
 * @param {Object} scenario Scenario entry; `scenario.deterministic.command`
 *   must be a non-empty array (`[executable, ...args]`), and
 *   `scenario.deterministic.env` may add environment variables.
 * @returns {Promise<{status: string, summary: string, issues: string[], durationMs: number}>}
 *   `pass` when the command exits 0 (or in dry-run mode), otherwise `fail`.
 */
async function runDeterministicScenario(scenario) {
  const commandLine = scenario?.deterministic?.command;
  if (!Array.isArray(commandLine) || commandLine.length === 0) {
    return {
      status: 'fail',
      summary: 'Scenario missing deterministic command',
      issues: ['invalid deterministic configuration'],
      durationMs: 0,
    };
  }

  const [executable, ...executableArgs] = commandLine;

  if (dryRun) {
    console.log(`[dry-run] ${executable} ${executableArgs.join(' ')}`);
    return {
      status: 'pass',
      summary: 'Dry run only',
      issues: [],
      durationMs: 0,
    };
  }

  // Runner-level settings are applied last so they win over scenario env.
  const childEnv = Object.assign({}, scenario.deterministic.env, {
    PULSE_BASE_URL: baseURL,
    PULSE_E2E_USERNAME: username,
    PULSE_E2E_PASSWORD: password,
  });

  const { code, durationMs } = await runCommand(executable, executableArgs, childEnv, false);
  if (code === 0) {
    return {
      status: 'pass',
      summary: 'Deterministic run passed',
      issues: [],
      durationMs,
    };
  }
  return {
    status: 'fail',
    summary: `Deterministic run failed with exit ${code}`,
    issues: [`exit code ${code}`],
    durationMs,
  };
}
/**
 * Run one scenario through an external agent command.
 *
 * Steps: render the scenario's task template into the run directory, render
 * the shell command from PULSE_EVAL_AGENT_COMMAND_TEMPLATE, run it through a
 * shell, then read the JSON result file the agent is expected to write.
 *
 * Problems are reported as a `fail` result rather than thrown, so one broken
 * scenario does not abort the remaining scenarios or the final report.
 *
 * @param {Object} scenario Scenario entry; uses `scenario.id` and
 *   `scenario.agentic.task_file` (path relative to the integration root).
 * @param {string} scenarioRunDir Directory that receives the rendered task
 *   file (`<id>.task.md`) and the agent's result file (`<id>.json`).
 * @returns {Promise<{status: string, summary: string, issues: string[], durationMs: number}>}
 *   `pass` only when the agent exits 0 and its result JSON has
 *   `status: "pass"` (or for dry runs); otherwise `fail`.
 */
async function runAgenticScenario(scenario, scenarioRunDir) {
  const agentTemplate = process.env.PULSE_EVAL_AGENT_COMMAND_TEMPLATE;
  const hasAgentTemplate = Boolean(agentTemplate) && agentTemplate.trim() !== '';

  if (!hasAgentTemplate) {
    // A dry run without a template has nothing to print, but is not an error.
    if (dryRun) {
      return {
        status: 'pass',
        summary:
          'Dry run only (set PULSE_EVAL_AGENT_COMMAND_TEMPLATE to execute agentic scenarios)',
        issues: [],
        durationMs: 0,
      };
    }
    return {
      status: 'fail',
      summary: 'Agentic mode requested but PULSE_EVAL_AGENT_COMMAND_TEMPLATE is unset',
      issues: ['missing PULSE_EVAL_AGENT_COMMAND_TEMPLATE'],
      durationMs: 0,
    };
  }

  const taskRelPath = scenario?.agentic?.task_file;
  if (!taskRelPath) {
    return {
      status: 'fail',
      summary: 'Scenario missing agentic task file',
      issues: ['invalid agentic configuration'],
      durationMs: 0,
    };
  }

  const taskTemplatePath = path.join(INTEGRATION_ROOT, taskRelPath);
  const resultJSONPath = path.join(scenarioRunDir, `${scenario.id}.json`);
  const renderedTaskPath = path.join(scenarioRunDir, `${scenario.id}.task.md`);

  // A missing template or unwritable run directory fails this scenario only;
  // letting the error escape would abort the run and lose earlier results.
  try {
    const taskTemplate = await fs.readFile(taskTemplatePath, 'utf8');
    const renderedTask = renderTemplate(taskTemplate, {
      base_url: baseURL,
      username,
      password,
      result_json: resultJSONPath,
    });
    await fs.writeFile(renderedTaskPath, renderedTask, 'utf8');
  } catch (err) {
    return {
      status: 'fail',
      summary: 'Could not prepare agentic task file',
      issues: [`task file preparation failed: ${err.message}`],
      durationMs: 0,
    };
  }

  // NOTE(review): values are substituted into the shell command verbatim, so
  // any quoting has to be done by the template itself.
  const renderedCommand = renderTemplate(agentTemplate, {
    task_file: renderedTaskPath,
    result_json: resultJSONPath,
    scenario_id: scenario.id,
    base_url: baseURL,
  });

  if (dryRun) {
    console.log(`[dry-run] ${renderedCommand}`);
    return {
      status: 'pass',
      summary: 'Dry run only',
      issues: [],
      durationMs: 0,
    };
  }

  // The same values are also exposed as environment variables for the agent.
  const exec = await runCommand(renderedCommand, [], {
    PULSE_EVAL_SCENARIO: scenario.id,
    PULSE_EVAL_BASE_URL: baseURL,
    PULSE_EVAL_RESULT_JSON: resultJSONPath,
  }, true);

  if (exec.code !== 0) {
    return {
      status: 'fail',
      summary: `Agent command failed with exit ${exec.code}`,
      issues: [`exit code ${exec.code}`],
      durationMs: exec.durationMs,
    };
  }

  try {
    const raw = await fs.readFile(resultJSONPath, 'utf8');
    const parsed = JSON.parse(raw);
    // Anything other than an explicit "pass" counts as a failure.
    const status = parsed?.status === 'pass' ? 'pass' : 'fail';
    const summary = typeof parsed?.summary === 'string' ? parsed.summary : 'Agentic run completed';
    const issues = Array.isArray(parsed?.issues)
      ? parsed.issues.filter((v) => typeof v === 'string')
      : [];
    return {
      status,
      summary,
      issues,
      durationMs: exec.durationMs,
    };
  } catch (err) {
    // Keep the underlying reason (missing file vs. malformed JSON) in the report.
    return {
      status: 'fail',
      summary: 'Agentic command finished but no valid JSON result was produced',
      issues: ['invalid or missing scenario result JSON', String(err?.message ?? err)],
      durationMs: exec.durationMs,
    };
  }
}
/**
 * Entry point: select scenarios, run each one in the configured mode, write
 * `report.json` and `report.md` into a timestamped run directory, and exit
 * with status 1 when any scenario failed.
 *
 * Exits with status 1 before running anything when no scenario matches the
 * selection or when the mode is not `deterministic` or `agentic`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const scenarios = await loadScenarios();
  const selected =
    selectedScenarioIDs.size === 0
      ? scenarios
      : scenarios.filter((scenario) => selectedScenarioIDs.has(scenario.id));

  // A mistyped id would otherwise be dropped silently, letting a run report
  // success without ever executing the scenario the caller asked for.
  if (selectedScenarioIDs.size > 0) {
    const knownIDs = new Set(scenarios.map((scenario) => scenario.id));
    const unknownIDs = [...selectedScenarioIDs].filter((id) => !knownIDs.has(id));
    if (unknownIDs.length > 0) {
      console.warn(`Warning: ignoring unknown scenario id(s): ${unknownIDs.join(', ')}`);
    }
  }

  if (selected.length === 0) {
    console.error('No matching eval scenarios found.');
    process.exit(1);
  }
  if (!['deterministic', 'agentic'].includes(mode)) {
    console.error(`Unsupported mode "${mode}". Use deterministic or agentic.`);
    process.exit(1);
  }

  const runID = nowStamp();
  const runDir = path.join(DEFAULT_RESULTS_ROOT, runID);
  await fs.mkdir(runDir, { recursive: true });

  // Scenarios run one at a time, in file order.
  const results = [];
  for (const scenario of selected) {
    console.log(`\n=== Eval: ${scenario.id} (${mode}) ===`);
    const startedAt = Date.now();
    const result =
      mode === 'deterministic'
        ? await runDeterministicScenario(scenario)
        : await runAgenticScenario(scenario, runDir);
    results.push({
      id: scenario.id,
      name: scenario.name,
      status: result.status,
      summary: result.summary,
      issues: result.issues || [],
      // A reported duration of 0 falls back to the time measured here.
      durationMs: result.durationMs || Date.now() - startedAt,
    });
  }

  const summary = {
    generatedAt: new Date().toISOString(),
    mode,
    baseURL,
    results,
  };
  const jsonPath = path.join(runDir, 'report.json');
  const mdPath = path.join(runDir, 'report.md');
  await fs.writeFile(jsonPath, JSON.stringify(summary, null, 2), 'utf8');
  await fs.writeFile(mdPath, buildMarkdownReport(summary), 'utf8');

  const failed = results.filter((r) => r.status !== 'pass');
  console.log(`\nReport: ${jsonPath}`);
  console.log(`Summary: ${mdPath}`);
  if (failed.length > 0) {
    console.error(`Failed scenarios: ${failed.map((r) => r.id).join(', ')}`);
    process.exit(1);
  }
}
/**
 * Last-resort handler for a rejection escaping main(): print the error and
 * exit with status 1.
 */
function reportFatalError(failure) {
  console.error(failure);
  process.exit(1);
}

main().catch(reportFatalError);